From 5b9ffe013ddf1f8f6f16284d85ff0d4564663387 Mon Sep 17 00:00:00 2001 From: jesopo Date: Wed, 10 Oct 2018 14:25:44 +0100 Subject: [PATCH] Use signal.alarm to time out utils.http.get_url and throw useful exceptions --- src/utils/http.py | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/utils/http.py b/src/utils/http.py index d47f01c0..5261e955 100644 --- a/src/utils/http.py +++ b/src/utils/http.py @@ -1,4 +1,4 @@ -import re, traceback, urllib.error, urllib.parse +import re, signal, traceback, urllib.error, urllib.parse import json as _json import bs4, requests @@ -8,6 +8,14 @@ REGEX_HTTP = re.compile("https?://", re.I) RESPONSE_MAX = (1024*1024)*100 +class HTTPException: + pass +class HTTPTimeoutException(HTTPException): + pass +class HTTPParsingException(HTTPException): + pass + + def get_url(url, method="GET", get_params={}, post_data=None, headers={}, json_data=None, code=False, json=False, soup=False, parser="lxml"): @@ -19,16 +27,23 @@ def get_url(url, method="GET", get_params={}, post_data=None, headers={}, if not "User-Agent" in headers: headers["User-Agent"] = USER_AGENT - response = requests.request( - method.upper(), - url, - headers=headers, - params=get_params, - data=post_data, - json=json_data, - stream=True - ) - response_content = response.raw.read(RESPONSE_MAX, decode_content=True) + signal.signal(signal.SIGALRM, lambda: raise TimeoutError()) + signal.alarm(5) + try: + response = requests.request( + method.upper(), + url, + headers=headers, + params=get_params, + data=post_data, + json=json_data, + stream=True + ) + response_content = response.raw.read(RESPONSE_MAX, decode_content=True) + except TimeoutError: + raise HTTPTimeoutException() + finally: + signal.signal(signal.SIGALRM, signal.SIG_IGN) if soup: soup = bs4.BeautifulSoup(response_content, parser) @@ -40,11 +55,8 @@ def get_url(url, method="GET", get_params={}, post_data=None, headers={}, if json and data: try: data = 
_json.loads(data) - except _json.decoder.JSONDecodeError: - traceback.print_exc() - if code: - return 0, False - return False + except _json.decoder.JSONDecodeError as e: + raise HTTPParsingException(str(e)) if code: return response.status_code, data