import asyncio, ipaddress, re, socket, typing
import urllib.parse
import json as _json
import bs4, netifaces, requests
import tornado.httpclient
from src import utils

REGEX_URL = re.compile(r"https?://\S+", re.I)
# e.g. REGEX_URL.findall("see https://a.com and http://b.org")
#   -> ["https://a.com", "http://b.org"]

PAIRED_CHARACTERS = ["<>", "()"]

# best-effort tidying up of URLs
def url_sanitise(url: str) -> str:
    if not urllib.parse.urlparse(url).scheme:
        url = "http://%s" % url

    for pair_start, pair_end in PAIRED_CHARACTERS:
        # trim ")" from the end only if there's not a "(" to match it
        # google.com/) -> google.com/
        # google.com/() -> google.com/()
        # google.com/()) -> google.com/()
        if url.endswith(pair_end):
            if pair_start in url:
                open_index = url.rfind(pair_start)
                other_index = url.rfind(pair_end, 0, len(url)-1)
                # the trailing pair_end is unmatched when the last pair_start
                # was already closed by an earlier pair_end
                if not other_index == -1 and other_index > open_index:
                    url = url[:-1]
            else:
                url = url[:-1]
    return url
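
# illustrative behaviour of the heuristic above (hypothetical inputs):
#   url_sanitise("google.com")        -> "http://google.com"
#   url_sanitise("https://a.com/())") -> "https://a.com/()"
#   url_sanitise("https://a.com/)")   -> "https://a.com/"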

DEFAULT_USERAGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36")

RESPONSE_MAX = (1024*1024)*100 # 100MiB
SOUP_CONTENT_TYPES = ["text/html", "text/xml", "application/xml"]
DECODE_CONTENT_TYPES = ["text/plain"]+SOUP_CONTENT_TYPES
class HTTPException(Exception):
    pass

class HTTPTimeoutException(HTTPException):
    def __init__(self):
        Exception.__init__(self, "HTTP request timed out")

class HTTPParsingException(HTTPException):
    def __init__(self, message: str, data: typing.Optional[str]=None):
        Exception.__init__(self, message or "HTTP parsing failed")
        # keep the payload that failed to parse around for debugging
        self.data = data

class HTTPWrongContentTypeException(HTTPException):
    def __init__(self, message: typing.Optional[str]=None):
        Exception.__init__(self,
            message or "HTTP request gave wrong content type")

def throw_timeout():
    raise HTTPTimeoutException()
class Request(object):
    def __init__(self, url: str, method: str="GET",
            get_params: typing.Optional[typing.Dict[str, str]]=None,
            post_data: typing.Any=None,
            headers: typing.Optional[typing.Dict[str, str]]=None,

            json: bool=False, allow_redirects: bool=True,
            check_content_type: bool=True, parse: bool=False,
            detect_encoding: bool=True,

            parser: str="lxml", fallback_encoding: str="iso-8859-1",
            content_type: typing.Optional[str]=None,
            proxy: typing.Optional[str]=None,
            useragent: typing.Optional[str]=None,

            **kwargs):
        self.set_url(url)
        self.method = method.upper()
        # default to fresh dicts; shared mutable default arguments would be
        # mutated by the kwargs handling below
        self.get_params = get_params or {}
        self.post_data = post_data
        self.headers = headers or {}

        self.json = json
        self.allow_redirects = allow_redirects
        self.check_content_type = check_content_type
        self.parse = parse
        self.detect_encoding = detect_encoding

        self.parser = parser
        self.fallback_encoding = fallback_encoding
        self.content_type = content_type
        self.proxy = proxy
        self.useragent = useragent

        # spare keyword arguments become the POST body or GET parameters
        if kwargs:
            if self.method == "POST":
                self.post_data = kwargs
            else:
                self.get_params.update(kwargs)

    def set_url(self, url: str):
        if not urllib.parse.urlparse(url).scheme:
            url = "http://%s" % url
        self.url = url

    def get_headers(self) -> typing.Dict[str, str]:
        headers = self.headers.copy()
        if not "Accept-Language" in headers:
            headers["Accept-Language"] = "en-GB"
        if not "User-Agent" in headers:
            headers["User-Agent"] = self.useragent or DEFAULT_USERAGENT
        if not "Content-Type" in headers and self.content_type:
            headers["Content-Type"] = self.content_type
        return headers

    def get_body(self) -> typing.Any:
        if self.content_type == "application/json":
            return _json.dumps(self.post_data)
        else:
            return self.post_data
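
# a minimal usage sketch (hypothetical URL and parameters); keyword arguments
# that __init__ doesn't consume become GET parameters (or the POST body):
#   response = request("https://example.com/search", q="term", json=True)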
class Response(object):
    def __init__(self, code: int, data: typing.Any,
            headers: typing.Dict[str, str]):
        self.code = code
        self.data = data
        self.headers = headers
def _meta_content(s: str) -> typing.Dict[str, str]:
    out = {}
    for keyvalue in s.split(";"):
        key, _, value = keyvalue.strip().partition("=")
        out[key] = value
    return out
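
# e.g. for a typical meta content-type value:
#   _meta_content("text/html; charset=utf-8")
#     -> {"text/html": "", "charset": "utf-8"}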
def _find_encoding(soup: bs4.BeautifulSoup) -> typing.Optional[str]:
    if not soup.meta == None:
        meta_charset = soup.meta.get("charset")
        if not meta_charset == None:
            return meta_charset

    meta_content_type = soup.find_all("meta",
        {"http-equiv": lambda v: (v or "").lower() == "content-type"})
    if meta_content_type:
        # "charset" may be missing from the content attribute; fall through
        # to the doctype check rather than raise KeyError
        content = _meta_content(meta_content_type[0].get("content", ""))
        if "charset" in content:
            return content["charset"]

    doctype = [item for item in soup.contents
        if isinstance(item, bs4.Doctype)]
    if doctype and doctype[0] == "html":
        return "utf8"

    return None
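
# e.g. a document beginning "<!DOCTYPE html><html>..." with no charset meta
# tag falls through to the doctype check above and yields "utf8"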
def request(request_obj: typing.Union[str, Request], **kwargs) -> Response:
    if isinstance(request_obj, str):
        request_obj = Request(request_obj, **kwargs)
    return _request(request_obj)
def _request(request_obj: Request) -> Response:
    headers = request_obj.get_headers()

    with utils.deadline(seconds=5):
        try:
            response = requests.request(
                request_obj.method,
                request_obj.url,
                headers=headers,
                params=request_obj.get_params,
                data=request_obj.get_body(),
                allow_redirects=request_obj.allow_redirects,
                stream=True
            )
            response_content = response.raw.read(RESPONSE_MAX,
                decode_content=True)
            if not response.raw.read(1) == b"":
                # data is left after RESPONSE_MAX bytes - the response is too
                # large, so refuse it rather than silently truncate it
                raise HTTPException("Response too large")
        except utils.DeadlineExceededException:
            raise HTTPTimeoutException()
    response_headers = utils.CaseInsensitiveDict(dict(response.headers))
    content_type = response.headers.get("Content-Type", "").split(";", 1)[0]

    encoding = response.encoding or request_obj.fallback_encoding
    if (request_obj.detect_encoding and
            content_type and content_type in SOUP_CONTENT_TYPES):
        souped = bs4.BeautifulSoup(response_content, request_obj.parser)
        encoding = _find_encoding(souped) or encoding

    def _decode_data():
        return response_content.decode(encoding)

    if request_obj.parse:
        if (not request_obj.check_content_type or
                content_type in SOUP_CONTENT_TYPES):
            souped = bs4.BeautifulSoup(_decode_data(), request_obj.parser)
            return Response(response.status_code, souped, response_headers)
        else:
            raise HTTPWrongContentTypeException(
                "Tried to soup non-html/non-xml data (%s)" % content_type)

    if request_obj.json and response_content:
        data = _decode_data()
        try:
            return Response(response.status_code, _json.loads(data),
                response_headers)
        except _json.decoder.JSONDecodeError as e:
            raise HTTPParsingException(str(e), data)

    if content_type in DECODE_CONTENT_TYPES:
        return Response(response.status_code, _decode_data(),
            response_headers)
    else:
        return Response(response.status_code, response_content,
            response_headers)
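
# sketch of the three result types above (hypothetical URLs):
#   request("https://example.com", parse=True).data     # bs4.BeautifulSoup
#   request("https://example.com/api", json=True).data  # decoded JSON
#   request("https://example.com/file.bin").data        # raw bytes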
def request_many(urls: typing.List[str]) -> typing.Dict[str, Response]:
    responses = {}

    async def _request(url):
        client = tornado.httpclient.AsyncHTTPClient()
        request = tornado.httpclient.HTTPRequest(url, method="GET",
            connect_timeout=2, request_timeout=2)

        response = await client.fetch(request)

        headers = utils.CaseInsensitiveDict(dict(response.headers))
        data = response.body.decode("utf8")
        responses[url] = Response(response.code, data, headers)

    loop = asyncio.new_event_loop()
    awaits = []
    for url in urls:
        awaits.append(_request(url))
    if awaits:
        # asyncio.wait() raises ValueError when given nothing to wait on
        task = asyncio.wait(awaits, loop=loop, timeout=5)
        loop.run_until_complete(task)
    loop.close()

    return responses
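
# e.g. (hypothetical URLs; URLs whose fetch fails or times out are simply
# absent from the returned dict):
#   responses = request_many(["https://example.com", "https://example.org"])
#   responses["https://example.com"].code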
class Client(object):
    # staticmethod, so instances don't bind these module-level functions
    # and pass themselves as the first argument
    request = staticmethod(request)
    request_many = staticmethod(request_many)
def strip_html(s: str) -> str:
    return bs4.BeautifulSoup(s, "lxml").get_text()
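
# e.g. strip_html("<p>hello <b>world</b></p>") -> "hello world"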
def resolve_hostname(hostname: str) -> typing.List[str]:
    try:
        addresses = socket.getaddrinfo(hostname, None, 0, socket.SOCK_STREAM)
    except OSError:
        # socket.gaierror subclasses OSError; failed resolution means no
        # addresses, but other exceptions shouldn't be swallowed
        return []
    return [address[-1][0] for address in addresses]
def is_ip(addr: str) -> bool:
    try:
        ipaddress.ip_address(addr)
    except ValueError:
        return False
    return True
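
# e.g. is_ip("127.0.0.1") and is_ip("::1") are True; is_ip("a.com") is False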
def is_localhost(hostname: str) -> bool:
    if is_ip(hostname):
        ips = [ipaddress.ip_address(hostname)]
    else:
        ips = [ipaddress.ip_address(ip) for ip in resolve_hostname(hostname)]

    for interface in netifaces.interfaces():
        links = netifaces.ifaddresses(interface)

        for link in (links.get(netifaces.AF_INET, [])+
                links.get(netifaces.AF_INET6, [])):
            # strip any "%scope" suffix from link-local IPv6 addresses
            address = ipaddress.ip_address(link["addr"].split("%", 1)[0])
            if address in ips:
                return True

    return False