actually explicitly strip "unknown" html tags from fedi notes

This commit is contained in:
jesopo 2019-10-14 10:46:15 +01:00
parent a75386df32
commit af1fd37003

View file

@@ -57,14 +57,19 @@ def find_actor(username, instance):
if link["type"] == ACTIVITY_TYPE: if link["type"] == ACTIVITY_TYPE:
return link["href"] return link["href"]
KNOWN_TAGS = ["p", "br"]
def _normalise_note(content): def _normalise_note(content):
soup = bs4.BeautifulSoup(content, "html.parser") soup = bs4.BeautifulSoup(content, "html.parser")
lines = [] lines = []
for element in soup.find_all(): for element in soup.find_all():
out = ""
if element.text.strip() == "": if element.text.strip() == "":
continue element.decompose()
elif element.name == "p": elif not element.name in KNOWN_TAGS:
element.unwrap()
for element in soup.find_all():
out = ""
if element.name == "p":
for subitem in element.contents: for subitem in element.contents:
if type(subitem) == bs4.element.Tag: if type(subitem) == bs4.element.Tag:
if subitem.name == "br": if subitem.name == "br":
@@ -72,8 +77,6 @@ def _normalise_note(content):
out = "" out = ""
else: else:
out += subitem out += subitem
else:
continue
lines.append(out.replace(" ", " ")) lines.append(out.replace(" ", " "))
return " ".join(lines) return " ".join(lines)