From ed775ddbe3d66d1218ce034de97ad4ea641002bf Mon Sep 17 00:00:00 2001
From: jesopo
Date: Tue, 26 Nov 2019 11:35:56 +0000
Subject: [PATCH] remove `parser` from utils.http.Request, add Request.soup()

---
 modules/acronym.py            | 12 +++++++-----
 modules/eval_lua.py           |  7 +++----
 modules/fediverse/ap_utils.py |  5 ++---
 modules/title.py              | 12 +++++++-----
 src/utils/http.py             | 21 ++++++---------------
 5 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/modules/acronym.py b/modules/acronym.py
index 212465d5..116317a1 100644
--- a/modules/acronym.py
+++ b/modules/acronym.py
@@ -9,11 +9,13 @@ class Module(ModuleManager.BaseModule):
     @utils.kwarg("usage", "<acronym>")
     def acronym(self, event):
         query = event["args_split"][0].upper()
-        response = utils.http.request(API % query, parse=True)
-        if response.data:
-            acronyms = []
-            for element in response.data.find_all("acro"):
-                acronyms.append(element.expan.string)
+        response = utils.http.request(API % query)
+
+        acronyms = []
+        for element in response.soup().find_all("acro"):
+            acronyms.append(element.expan.string)
+
+        if acronyms:
             event["stdout"].write("%s: %s" % (query, ", ".join(acronyms)))
         else:
             raise utils.EventResultsError()
diff --git a/modules/eval_lua.py b/modules/eval_lua.py
index 102aadc7..6b34c7c2 100644
--- a/modules/eval_lua.py
+++ b/modules/eval_lua.py
@@ -10,15 +10,14 @@ class Module(ModuleManager.BaseModule):
     @utils.hook("received.command.lua", min_args=1)
     def eval(self, event):
         try:
-            page = utils.http.request(EVAL_URL,
-                post_data={"input": event["args"]},
-                method="POST", parse=True)
+            page = utils.http.request(EVAL_URL,
+                post_data={"input": event["args"]}, method="POST")
         except socket.timeout:
             raise utils.EventError("%s: eval timed out" %
                 event["user"].nickname)
 
         if page:
-            textareas = page.data.find_all("textarea")
+            textareas = page.soup().find_all("textarea")
             if len(textareas) > 1:
                 out = textareas[1].text.strip("\n")
                 event["stdout"].write("%s: %s" % (event["user"].nickname, out))
diff --git a/modules/fediverse/ap_utils.py b/modules/fediverse/ap_utils.py
index fc6ea5fd..686b8850 100644
--- a/modules/fediverse/ap_utils.py
+++ b/modules/fediverse/ap_utils.py
@@ -38,8 +38,7 @@ class FindActorException(Exception):
 
 def find_actor(username, instance):
     hostmeta = HOSTMETA_TEMPLATE % instance
-    hostmeta_request = utils.http.Request(HOSTMETA_TEMPLATE % instance,
-        parse=True, check_content_type=False)
+    hostmeta_request = utils.http.Request(HOSTMETA_TEMPLATE % instance)
     try:
         hostmeta = utils.http.request(hostmeta_request)
     except:
@@ -47,7 +46,7 @@ def find_actor(username, instance):
 
     webfinger_url = None
     if hostmeta.code == 200:
-        for item in hostmeta.data.find_all("link"):
+        for item in hostmeta.soup().find_all("link"):
             if item["rel"] and item["rel"][0] == "lrdd":
                 webfinger_url = item["template"]
                 break
diff --git a/modules/title.py b/modules/title.py
index d02f7249..01425dbe 100644
--- a/modules/title.py
+++ b/modules/title.py
@@ -50,14 +50,16 @@ class Module(ModuleManager.BaseModule):
             return -1, None
         try:
-            page = utils.http.request(url, parse=True)
-        except utils.http.HTTPWrongContentTypeException:
-            return -1, None
+            page = utils.http.request(url)
         except Exception as e:
             self.log.error("failed to get URL title for %s: %s",
                 [url, str(e)])
             return -1, None
 
-        if page.data.title:
-            title = utils.parse.line_normalise(page.data.title.text)
+        if page.content_type not in utils.http.SOUP_CONTENT_TYPES:
+            return -1, None
+        page = page.soup()
+
+        if page.title:
+            title = utils.parse.line_normalise(page.title.text)
             if not title:
                 return -3, None
diff --git a/src/utils/http.py b/src/utils/http.py
index 9e9bf3e6..80bf5eae 100644
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -72,9 +72,7 @@ class Request(object):
     allow_redirects: bool = True
 
     check_content_type: bool = True
-    parse: bool = False
     detect_encoding: bool = True
-    parser: str = "lxml"
     fallback_encoding: typing.Optional[str] = None
     content_type: typing.Optional[str] = None
     proxy: typing.Optional[str] = None
@@ -126,8 +124,12 @@ class Response(object):
         self.encoding = encoding
         self.headers = headers
         self.cookies = cookies
-    def json(self):
+    def decode(self) -> str:
+        return self.data.decode(self.encoding)
+    def json(self) -> typing.Any:
         return _json.loads(self.data)
+    def soup(self, parser: str = "lxml") -> bs4.BeautifulSoup:
+        return bs4.BeautifulSoup(self.decode(), parser)
 
 def _meta_content(s: str) -> typing.Dict[str, str]:
     out = {}
@@ -200,23 +202,12 @@ def _request(request_obj: Request) -> Response:
 
         if (request_obj.detect_encoding and response.content_type and
                 response.content_type in SOUP_CONTENT_TYPES):
-            souped = bs4.BeautifulSoup(response.data, request_obj.parser)
+            souped = bs4.BeautifulSoup(response.data, "lxml")
             encoding = _find_encoding(souped) or encoding
 
         def _decode_data():
             return response.data.decode(encoding)
 
-        if request_obj.parse:
-            if (not request_obj.check_content_type or
-                    response.content_type in SOUP_CONTENT_TYPES):
-                souped = bs4.BeautifulSoup(_decode_data(), request_obj.parser)
-                response.data = souped
-                return response
-            else:
-                raise HTTPWrongContentTypeException(
-                    "Tried to soup non-html/non-xml data (%s)" %
-                    response.content_type)
-
         if request_obj.json and response.data:
             data = _decode_data()
             try:
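
Note for reviewers: a sketch of the call-site migration this patch implies,
not code from the commit. EXAMPLE_URL and page_title() are made up for
illustration; utils.http.request, Response.soup(), Response.content_type and
utils.http.SOUP_CONTENT_TYPES are the APIs changed above, and the import
style assumes the `from src import utils` convention the modules here use.

    from src import utils

    EXAMPLE_URL = "https://example.com/"  # hypothetical endpoint

    def page_title() -> str:
        # old: utils.http.request(EXAMPLE_URL, parse=True) returned a
        # Response whose .data was already a BeautifulSoup tree, and
        # raised HTTPWrongContentTypeException on non-HTML/XML bodies.
        response = utils.http.request(EXAMPLE_URL)

        # new: .data stays raw; callers check the content type themselves
        # (as modules/title.py now does) and soup on demand.
        if response.content_type not in utils.http.SOUP_CONTENT_TYPES:
            return ""
        soup = response.soup()  # "lxml" by default; per-call override
                                # possible, e.g. response.soup("html.parser")
        return soup.title.text if soup.title else ""

Moving the parse decision off the Request object leaves one request path for
JSON, soup, and raw consumers, replaces the per-request `parser` field with a
per-call argument to soup(), and removes the HTTPWrongContentTypeException
raise from _request entirely.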