only look for <meta>-related tags when there are meta tags

This commit is contained in:
jesopo 2019-09-09 14:39:19 +01:00
parent 8e824c9277
commit 0a67659637

View file

@@ -66,19 +66,21 @@ def _meta_content(s: str) -> typing.Dict[str, str]:
return out return out
def _find_encoding(soup: bs4.BeautifulSoup) -> typing.Optional[str]:
    """Guess the character encoding declared by a parsed HTML document.

    Sources are checked in this order:

    1. the ``charset`` attribute of ``soup.meta``;
    2. the ``charset`` parameter inside the ``content`` attribute of the
       first ``<meta http-equiv="content-type">`` tag;
    3. a doctype equal to ``"html"``, for which ``"utf8"`` is returned.

    Steps 1 and 2 are skipped entirely when the document has no ``<meta>``
    tag at all.

    Returns the encoding name, or ``None`` when nothing could be determined.
    """
    first_meta = soup.meta
    if first_meta is not None:
        charset = first_meta.get("charset")
        if charset is not None:
            return charset

        # The attribute value can be missing (None), hence the `v or ""`
        # before lowercasing.
        content_type_tags = soup.findAll("meta",
            {"http-equiv": lambda v: (v or "").lower() == "content-type"})
        if content_type_tags:
            content = content_type_tags[0].get("content")
            if content is not None:
                # A content-type without a charset parameter (for example
                # "text/html") must fall through to the doctype check
                # rather than raise KeyError.
                charset = _meta_content(content).get("charset")
                if charset is not None:
                    return charset

    doctypes = [item for item in soup.contents
        if isinstance(item, bs4.Doctype)]
    if doctypes and doctypes[0] == "html":
        return "utf8"
    return None
def request(url: str, method: str="GET", get_params: dict={}, def request(url: str, method: str="GET", get_params: dict={},