replace lxml usage with html5lib! the future is cool

2020-02-13 21:50:33 +00:00 · 2020-02-13 21:50:33 +00:00 · df38d7a57f
commit df38d7a57f
parent 5c9e965d25
2 changed files with 4 additions and 3 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ cryptography==2.7
 dataclasses==0.6
 dnspython==1.16.0
 feedparser==5.2.1
+html5lib==1.0.1
 lxml==4.4.1
 netifaces==0.10.9
 PySocks==1.7.1
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -127,7 +127,7 @@ class Response(object):
        return self.data.decode(encoding or self.encoding)
    def json(self) -> typing.Any:
        return _json.loads(self.data)
-    def soup(self, parser: str="lxml") -> bs4.BeautifulSoup:
+    def soup(self, parser: str="html5lib") -> bs4.BeautifulSoup:
        return bs4.BeautifulSoup(self.decode(), parser)

 def _split_content(s: str) -> typing.Dict[str, str]:
@@ -144,7 +144,7 @@ def _find_encoding(headers: typing.Dict[str, str], data: bytes
        if "charset" in content_header:
            return content_header["charset"]

-    soup = bs4.BeautifulSoup(data, "lxml")
+    soup = bs4.BeautifulSoup(data, "html5lib")
    if not soup.meta == None:
        meta_charset = soup.meta.get("charset")
        if not meta_charset == None:
@@ -275,7 +275,7 @@ class Client(object):
    request_many = request_many

 def strip_html(s: str) -> str:
-    return bs4.BeautifulSoup(s, "lxml").get_text()
+    return bs4.BeautifulSoup(s, "html5lib").get_text()

 def resolve_hostname(hostname: str) -> typing.List[str]:
    try: