bitbot-3.11-fork/src/utils/http.py

163 lines
5.4 KiB
Python
Raw Normal View History

import ipaddress, re, signal, socket, traceback, typing
import urllib.error, urllib.parse
2018-10-10 12:41:58 +00:00
import json as _json
import bs4, netifaces, requests, tornado.gen, tornado.httpclient, tornado.ioloop
from src import utils
2019-05-03 03:50:51 +00:00
# Matches http:// and https:// URLs case-insensitively: the scheme followed by
# one or more ASCII letters, digits or the punctuation characters listed below.
REGEX_URL = re.compile(
    "https?://[A-Z0-9{}]+".format(
        re.escape("-._~:/%?#[]@!$&'()*+,;=")),
    re.IGNORECASE)
# best-effort tidying up of URLs
def url_sanitise(url: str):
    """Drop a single trailing ")" from *url* when it looks unbalanced.

    The trailing ")" is kept only when the last "(" in the URL has no other
    ")" after it, i.e. the trailing ")" is what closes it:

        google.com/)   -> google.com/
        google.com/()  -> google.com/()
        google.com/()) -> google.com/()

    URLs that do not end in ")" are returned unchanged.
    """
    if not url.endswith(")"):
        return url

    last_open = url.rfind("(")
    if last_open != -1:
        # Position of the nearest ")" before the trailing one (-1 if none).
        previous_close = url.rfind(")", 0, len(url) - 1)
        # last_open is >= 0 here, so this comparison also covers the
        # "no earlier ')' at all" case where previous_close is -1.
        if previous_close < last_open:
            return url

    return url[:-1]
# Default User-Agent header value sent when the caller does not provide one.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/49.0.2623.87 Safari/537.36")

# Maximum number of body bytes read from a single response (100 MiB).
RESPONSE_MAX = 100 * 1024 * 1024

# Content types that may be handed to BeautifulSoup.
SOUP_CONTENT_TYPES = [
    "text/html",
    "text/xml",
    "application/xml",
]
class HTTPException(Exception):
    """Base class for the HTTP errors raised by this module."""
class HTTPTimeoutException(HTTPException):
    """Raised when an HTTP request does not finish in time."""

    def __init__(self):
        super().__init__("HTTP request timed out")
class HTTPParsingException(HTTPException):
    """Raised when a response body cannot be parsed.

    *message* overrides the default text when it is given and non-empty.
    """

    def __init__(self, message: str=None):
        text = message if message else "HTTP parsing failed"
        super().__init__(text)
class HTTPWrongContentTypeException(HTTPException):
    """Raised when a response has a content type the caller cannot use.

    *message* overrides the default text when it is given and non-empty.
    """

    def __init__(self, message: str=None):
        text = message if message else "HTTP request gave wrong content type"
        super().__init__(text)
def throw_timeout():
    """Raise HTTPTimeoutException; takes no arguments and never returns."""
    timeout_error = HTTPTimeoutException()
    raise timeout_error
class Response(object):
    """Plain container for the result of an HTTP request.

    Attributes:
        code: the HTTP status code.
        data: the response payload, in whatever form the producer stored it.
        headers: the response headers.
    """

    def __init__(self, code: int, data: typing.Any,
            headers: typing.Dict[str, str]):
        self.code, self.data, self.headers = code, data, headers
def request(url: str, method: str="GET",
        get_params: typing.Optional[dict]=None,
        post_data: typing.Any=None, headers: typing.Optional[dict]=None,
        json_data: typing.Any=None, code: bool=False, json: bool=False,
        soup: bool=False, parser: str="lxml", fallback_encoding: str="utf8",
        allow_redirects: bool=True
        ) -> Response:
    """Perform a blocking HTTP request and wrap the result in a Response.

    Args:
        url: target URL; "http://" is prepended when it has no scheme.
        method: HTTP method name; upper-cased before use.
        get_params: query-string parameters passed to `requests`.
        post_data: request body passed as `data=` to `requests`.
        headers: extra request headers. "Accept-Language" ("en-GB") and
            "User-Agent" (USER_AGENT) are filled in when absent. The
            caller's dict is copied, never modified.
        json_data: object passed as `json=` to `requests`.
        code: accepted for compatibility; not used by this function.
        json: when true and the body is non-empty, `Response.data` is the
            parsed JSON document.
        soup: when true, `Response.data` is a `bs4.BeautifulSoup` built with
            *parser*; takes precedence over *json*.
        parser: BeautifulSoup parser name used when *soup* is true.
        fallback_encoding: encoding used to decode the body when the
            response does not declare one.
        allow_redirects: passed through to `requests`.

    Returns:
        A Response with the status code, the decoded (or parsed) body and a
        case-insensitive copy of the response headers.

    Raises:
        HTTPTimeoutException: the request did not finish within 5 seconds.
        HTTPWrongContentTypeException: *soup* was requested but the content
            type is not in SOUP_CONTENT_TYPES (or is missing).
        HTTPParsingException: *json* was requested but the body is not
            valid JSON.

    NOTE: the timeout is implemented with SIGALRM, and Python only allows
    signal handlers to be installed from the main thread.
    """
    # Work on a private copy so that neither a shared default nor the
    # caller's own dict picks up the headers added below.
    headers = dict(headers) if headers is not None else {}
    if get_params is None:
        get_params = {}

    if not urllib.parse.urlparse(url).scheme:
        url = "http://%s" % url

    headers.setdefault("Accept-Language", "en-GB")
    headers.setdefault("User-Agent", USER_AGENT)

    signal.signal(signal.SIGALRM, lambda _1, _2: throw_timeout())
    signal.alarm(5)
    response = None
    try:
        response = requests.request(
            method.upper(),
            url,
            headers=headers,
            params=get_params,
            data=post_data,
            json=json_data,
            allow_redirects=allow_redirects,
            stream=True
        )
        # Cap how much of the body is pulled into memory.
        response_content = response.raw.read(RESPONSE_MAX,
            decode_content=True)
    except TimeoutError:
        raise HTTPTimeoutException()
    finally:
        # Cancel the pending alarm before dropping the handler so it cannot
        # fire later, outside this request.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, signal.SIG_IGN)
        if response is not None:
            # A streamed response keeps its connection checked out until it
            # is fully consumed or closed; we may have stopped reading early.
            response.close()

    response_headers = utils.CaseInsensitiveDict(dict(response.headers))
    # Not every response carries a Content-Type header; treat it as empty.
    content_type = response.headers.get("Content-Type", "").split(";", 1)[0]

    def _decode_data():
        return response_content.decode(response.encoding or fallback_encoding)

    if soup:
        if content_type in SOUP_CONTENT_TYPES:
            soup = bs4.BeautifulSoup(_decode_data(), parser)
            return Response(response.status_code, soup, response_headers)
        else:
            raise HTTPWrongContentTypeException(
                "Tried to soup non-html/non-xml data")

    data = _decode_data()
    if json and data:
        try:
            return Response(response.status_code, _json.loads(data),
                response_headers)
        except _json.decoder.JSONDecodeError as e:
            raise HTTPParsingException(str(e))

    return Response(response.status_code, data, response_headers)
def request_many(urls: typing.List[str]) -> typing.Dict[str, Response]:
    """GET every URL in *urls* via tornado and return the results by URL.

    The URLs are fetched one after another inside a single coroutine that is
    run to completion on the current IOLoop. Each fetch uses a 2 second
    connect timeout and a 2 second request timeout, and each body is decoded
    as UTF-8.
    """
    results = {}

    @tornado.gen.coroutine
    def _fetch_all():
        for target in urls:
            http_client = tornado.httpclient.AsyncHTTPClient()
            fetch_request = tornado.httpclient.HTTPRequest(
                target, method="GET", connect_timeout=2, request_timeout=2)

            fetched = yield http_client.fetch(fetch_request)

            results[target] = Response(
                fetched.code,
                fetched.body.decode("utf8"),
                utils.CaseInsensitiveDict(dict(fetched.headers)))

    tornado.ioloop.IOLoop.current().run_sync(_fetch_all)
    return results
def strip_html(s: str) -> str:
    """Parse *s* with BeautifulSoup's "lxml" parser and return its text."""
    document = bs4.BeautifulSoup(s, "lxml")
    return document.get_text()
def resolve_hostname(hostname: str) -> typing.List[str]:
    """Resolve *hostname* to a list of IP address strings.

    Looks the name up with `socket.getaddrinfo` (any address family, stream
    sockets) and returns the host part of each result's socket address.

    Returns:
        The resolved addresses, or an empty list when the lookup fails.
    """
    try:
        addresses = socket.getaddrinfo(hostname, None, 0, socket.SOCK_STREAM)
    except (OSError, ValueError):
        # OSError covers socket.gaierror (resolution failure). ValueError
        # covers names that cannot be encoded for lookup (UnicodeError from
        # IDNA encoding, embedded NUL). Anything else, such as
        # KeyboardInterrupt, is deliberately allowed to propagate.
        return []
    return [address[-1][0] for address in addresses]
def is_ip(addr: str) -> bool:
    """Return True when `ipaddress.ip_address` accepts *addr*, else False."""
    try:
        ipaddress.ip_address(addr)
    except ValueError:
        is_valid = False
    else:
        is_valid = True
    return is_valid
def is_localhost(hostname: str) -> bool:
    """Return True when *hostname* refers to this machine.

    *hostname* may be an IP address literal or a name; names are resolved
    with `resolve_hostname`. The result is True when any resulting address
    is a loopback address or is assigned to one of the local network
    interfaces reported by `netifaces`. A name that does not resolve yields
    False.
    """
    def _parse(addr: str):
        # IPv6 addresses may carry a zone suffix ("fe80::1%eth0"). Drop it so
        # that both sides of the comparison are normalised the same way.
        return ipaddress.ip_address(addr.split("%", 1)[0])

    if is_ip(hostname):
        candidates = [hostname]
    else:
        candidates = resolve_hostname(hostname)

    ips = {_parse(candidate) for candidate in candidates}
    if not ips:
        return False

    # Loopback addresses are local even when not bound to an interface
    # (e.g. 127.0.0.2).
    if any(ip.is_loopback for ip in ips):
        return True

    for interface in netifaces.interfaces():
        links = netifaces.ifaddresses(interface)
        interface_links = (links.get(netifaces.AF_INET, []) +
            links.get(netifaces.AF_INET6, []))
        for link in interface_links:
            if _parse(link["addr"]) in ips:
                return True
    return False