2019-04-25 14:58:58 +00:00
|
|
|
import ipaddress, re, signal, socket, traceback, typing
|
|
|
|
import urllib.error, urllib.parse
|
2018-10-10 12:41:58 +00:00
|
|
|
import json as _json
|
2019-04-25 14:58:58 +00:00
|
|
|
import bs4, netifaces, requests
|
2018-12-11 22:30:05 +00:00
|
|
|
from src import utils
|
2018-10-03 12:22:37 +00:00
|
|
|
|
2019-05-03 03:50:51 +00:00
|
|
|
# Matches http/https URLs: the scheme followed by a run of alphanumerics and
# the listed URL punctuation characters. Matching is case-insensitive.
REGEX_URL = re.compile(
    "https?://[A-Z0-9%s]+" % (re.escape("-._~:/%?#[]@!$&'()*+,;="),),
    re.IGNORECASE,
)
|
2019-04-24 14:46:54 +00:00
|
|
|
|
2018-10-03 12:22:37 +00:00
|
|
|
# Default User-Agent header value sent when the caller does not supply one.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/49.0.2623.87 Safari/537.36"
)
|
|
|
|
|
2018-10-10 13:05:15 +00:00
|
|
|
# Upper bound, in bytes, on how much of a response body is read (100 MiB).
RESPONSE_MAX = 100 * 1024 * 1024
|
2019-02-26 11:18:50 +00:00
|
|
|
# Content types that may be handed to BeautifulSoup for parsing.
SOUP_CONTENT_TYPES = [
    "text/html",
    "text/xml",
    "application/xml",
]
|
2018-10-10 13:05:15 +00:00
|
|
|
|
2018-10-30 17:49:35 +00:00
|
|
|
class HTTPException(Exception):
    """Base class for the HTTP-related errors defined in this module."""
|
|
|
|
class HTTPTimeoutException(HTTPException):
    """Raised when an HTTP request does not finish within its time limit."""
|
|
|
|
class HTTPParsingException(HTTPException):
    """Raised when a response body cannot be parsed as requested."""
|
2019-02-28 23:28:45 +00:00
|
|
|
class HTTPWrongContentTypeException(HTTPException):
    """Raised when a response's content type is unsuitable for parsing."""
|
2018-10-10 13:25:44 +00:00
|
|
|
|
2018-10-10 14:07:04 +00:00
|
|
|
def throw_timeout():
    """Raise HTTPTimeoutException unconditionally.

    Used as the body of the SIGALRM handler installed by request().
    """
    timeout_error = HTTPTimeoutException()
    raise timeout_error
|
2018-10-10 13:25:44 +00:00
|
|
|
|
2018-12-11 22:26:38 +00:00
|
|
|
class Response:
    """Simple container for the outcome of an HTTP request.

    Attributes are stored exactly as passed to the constructor:
    ``code`` (status code), ``data`` (response payload) and ``headers``
    (mapping of response header names to values).
    """

    def __init__(self, code: int, data: typing.Any,
            headers: typing.Dict[str, str]):
        self.code, self.data, self.headers = code, data, headers
|
|
|
|
|
|
|
|
def request(url: str, method: str="GET",
        get_params: typing.Optional[dict]=None,
        post_data: typing.Any=None, headers: typing.Optional[dict]=None,
        json_data: typing.Any=None, code: bool=False, json: bool=False,
        soup: bool=False, parser: str="lxml", fallback_encoding: str="utf8",
        ) -> Response:
    """Perform an HTTP request and return its status, payload and headers.

    Args:
        url: Target URL; "http://" is prepended when it has no scheme.
        method: HTTP method name (upper-cased before use).
        get_params: Query-string parameters; omitted/None means none.
        post_data: Request body, passed to requests as ``data``.
        headers: Extra request headers. "Accept-Language" and "User-Agent"
            defaults are added to a private copy when missing; the caller's
            mapping is never modified.
        json_data: Object sent as a JSON body, passed to requests as ``json``.
        code: Not used by this function; kept for caller compatibility.
        json: When true and the body is non-empty, parse the body as JSON.
        soup: When true, parse the body with BeautifulSoup.
        parser: BeautifulSoup parser name used when ``soup`` is true.
        fallback_encoding: Encoding used to decode the body when the
            response does not declare one.

    Returns:
        A Response whose ``data`` is a BeautifulSoup document (``soup``),
        the decoded JSON value (``json``), or the decoded body text.

    Raises:
        HTTPTimeoutException: The request did not finish within 5 seconds.
        HTTPWrongContentTypeException: ``soup`` was requested but the
            response content type is not in SOUP_CONTENT_TYPES (or absent).
        HTTPParsingException: ``json`` was requested but the body is not
            valid JSON.
    """
    if not urllib.parse.urlparse(url).scheme:
        url = "http://%s" % url

    # Work on a copy so neither a shared default nor the caller's dict is
    # mutated when the default headers are filled in.
    headers = {} if headers is None else dict(headers)
    headers.setdefault("Accept-Language", "en-GB")
    headers.setdefault("User-Agent", USER_AGENT)
    if get_params is None:
        get_params = {}

    # The timeout is enforced with SIGALRM: the handler raises
    # HTTPTimeoutException from inside whatever call is blocking.
    signal.signal(signal.SIGALRM, lambda _1, _2: throw_timeout())
    signal.alarm(5)
    response = None
    try:
        response = requests.request(
            method.upper(),
            url,
            headers=headers,
            params=get_params,
            data=post_data,
            json=json_data,
            stream=True
        )
        # Read at most RESPONSE_MAX bytes so a huge body cannot exhaust
        # memory.
        response_content = response.raw.read(RESPONSE_MAX, decode_content=True)
    except TimeoutError:
        raise HTTPTimeoutException()
    finally:
        # Cancel the pending alarm before disarming the handler so a late
        # SIGALRM cannot fire after this call has finished.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, signal.SIG_IGN)
        # A streamed response holds its connection until explicitly closed.
        if response is not None:
            response.close()

    response_headers = utils.CaseInsensitiveDict(dict(response.headers))
    # A response may lack Content-Type entirely; treat that as "unknown"
    # instead of failing with KeyError.
    content_type = response.headers.get("Content-Type", "").split(";", 1)[0]

    if soup:
        if content_type not in SOUP_CONTENT_TYPES:
            raise HTTPWrongContentTypeException(
                "Tried to soup non-html/non-xml data")
        document = bs4.BeautifulSoup(response_content, parser)
        return Response(response.status_code, document, response_headers)

    data = response_content.decode(response.encoding or fallback_encoding)
    if json and data:
        try:
            return Response(response.status_code, _json.loads(data),
                response_headers)
        except _json.decoder.JSONDecodeError as e:
            raise HTTPParsingException(str(e)) from e

    return Response(response.status_code, data, response_headers)
|
2018-10-03 12:22:37 +00:00
|
|
|
|
2018-10-30 14:58:48 +00:00
|
|
|
def strip_html(s: str) -> str:
    """Return the text content of *s* as extracted by BeautifulSoup (lxml)."""
    document = bs4.BeautifulSoup(s, "lxml")
    return document.get_text()
|
|
|
|
|
2019-04-25 14:58:58 +00:00
|
|
|
def resolve_hostname(hostname: str) -> typing.List[str]:
    """Resolve *hostname* to the list of addresses it maps to.

    Args:
        hostname: Host name or literal IP address to look up.

    Returns:
        The address strings reported by ``socket.getaddrinfo`` (stream
        sockets, any address family), or an empty list when the lookup
        fails.
    """
    try:
        addresses = socket.getaddrinfo(hostname, None, 0, socket.SOCK_STREAM)
    except (OSError, ValueError):
        # OSError covers socket.gaierror (resolution failure); ValueError
        # covers UnicodeError from host names that cannot be IDNA-encoded.
        # Catching only these keeps the best-effort contract without
        # swallowing KeyboardInterrupt/SystemExit like a bare except would.
        return []
    # Each entry is (family, type, proto, canonname, sockaddr); the address
    # string is the first element of sockaddr.
    return [address[-1][0] for address in addresses]
|
|
|
|
|
|
|
|
def is_ip(addr: str) -> bool:
    """Return True when *addr* parses as an IPv4 or IPv6 address."""
    try:
        ipaddress.ip_address(addr)
    except ValueError:
        valid = False
    else:
        valid = True
    return valid
|
|
|
|
|
|
|
|
def is_localhost(hostname: str) -> bool:
    """Return whether *hostname* refers to an address bound on this machine.

    Args:
        hostname: A literal IP address or a host name. Host names are
            resolved with resolve_hostname(); an unresolvable name yields
            False.

    Returns:
        True when the address (or any address the name resolves to) equals
        an IPv4/IPv6 address configured on one of the local network
        interfaces, otherwise False.
    """
    if is_ip(hostname):
        candidates = [hostname]
    else:
        candidates = resolve_hostname(hostname)

    # Drop any IPv6 zone ID ("%eth0") before parsing. Interface addresses
    # below are compared without one, so a scoped candidate would otherwise
    # never match (or fail to parse on interpreters that reject zone IDs).
    ips = {ipaddress.ip_address(ip.split("%", 1)[0]) for ip in candidates}
    if not ips:
        return False

    for interface in netifaces.interfaces():
        links = netifaces.ifaddresses(interface)
        for link in (links.get(netifaces.AF_INET, [])
                + links.get(netifaces.AF_INET6, [])):
            address = ipaddress.ip_address(link["addr"].split("%", 1)[0])
            if address in ips:
                return True

    return False
|