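# Thin HTTP helper built on `requests`: get_url() fetches a URL with sane
# default headers, a hard SIGALRM-based timeout and optional JSON or
# BeautifulSoup decoding of the response body.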
import re, signal, traceback, urllib.error, urllib.parse
import json as _json
import bs4, requests
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
|
|
|
|
"(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36")
REGEX_HTTP = re.compile("https?://", re.I)

# refuse to read more than 100MiB of any response body
RESPONSE_MAX = (1024*1024)*100

# inherit from Exception so these classes can actually be raised and
# constructed with a message
class HTTPException(Exception):
    pass
class HTTPTimeoutException(HTTPException):
    pass
class HTTPParsingException(HTTPException):
    pass

def throw_timeout(signum, frame):
    # signal handlers are invoked with (signum, frame)
    raise HTTPTimeoutException()

def get_url(url, method="GET", get_params=None, post_data=None, headers=None,
        json_data=None, code=False, json=False, soup=False, parser="lxml",
        fallback_encoding="utf8"):
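    """Fetch a URL and return its decoded body.

    code=True also returns the HTTP status code, json=True parses the
    body as JSON and soup=True returns a bs4.BeautifulSoup of the body
    parsed with `parser`.
    """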

    # mutable default arguments are shared between calls; normalise to
    # fresh copies so the caller's dicts are never mutated
    get_params = get_params or {}
    headers = dict(headers or {})

    if not urllib.parse.urlparse(url).scheme:
        url = "http://%s" % url
if not "Accept-Language" in headers:
|
|
|
|
headers["Accept-Language"] = "en-GB"
|
|
|
|
if not "User-Agent" in headers:
|
|
|
|
headers["User-Agent"] = USER_AGENT

    # use SIGALRM (unix-only) as a hard 5 second timeout around the
    # whole request
    signal.signal(signal.SIGALRM, throw_timeout)
    signal.alarm(5)
    try:
        response = requests.request(
            method.upper(),
            url,
            headers=headers,
            params=get_params,
            data=post_data,
            json=json_data,
            stream=True
        )
        # stream the body and read at most RESPONSE_MAX bytes so a huge
        # response cannot exhaust memory
        response_content = response.raw.read(RESPONSE_MAX, decode_content=True)
    except TimeoutError:
        # normalise plain TimeoutErrors into our exception type
        raise HTTPTimeoutException()
    finally:
        signal.alarm(0)  # cancel any pending alarm before restoring
        signal.signal(signal.SIGALRM, signal.SIG_IGN)

    if soup:
        soup = bs4.BeautifulSoup(response_content, parser)
        if code:
            # requests exposes the status as `status_code`, not `code`
            return response.status_code, soup
        return soup

    data = response_content.decode(response.encoding or fallback_encoding)
    if json and data:
        try:
            data = _json.loads(data)
        except _json.decoder.JSONDecodeError as e:
            raise HTTPParsingException(str(e))

    if code:
        return response.status_code, data
    else:
        return data

def strip_html(s):
    return bs4.BeautifulSoup(s, "lxml").get_text()
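
# Illustrative usage (a sketch; the URLs below are placeholders, not part
# of the original module):
#
#   status, data = get_url("https://api.example.com/items",
#       get_params={"page": "1"}, code=True, json=True)
#   page = get_url("example.com", soup=True)  # scheme defaults to http://
#   text = strip_html("<p>hello <b>world</b></p>")  # -> "hello world"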