Use lxml for parsing fedi Notes. html.parser is bad and unpredictable

This commit is contained in:
jesopo 2019-10-31 15:17:39 +00:00
parent 8188aeb9b8
commit 5d08a496a4

View file

@ -61,7 +61,7 @@ def find_actor(username, instance):
KNOWN_TAGS = ["p", "br"] KNOWN_TAGS = ["p", "br"]
def _normalise_note(content): def _normalise_note(content):
soup = bs4.BeautifulSoup(content, "html.parser") soup = bs4.BeautifulSoup(content, "lxml").body
lines = [] lines = []
for element in soup.find_all(): for element in soup.find_all():
if element.text.strip() == "": if element.text.strip() == "":