bitbot-3.11-fork/src/utils/http.py
2018-10-08 12:43:31 +01:00

99 lines
3.2 KiB
Python

import re, traceback, urllib.error, urllib.parse, urllib.request
import json, ssl
import bs4
# Browser-like User-Agent string; get_url sends it with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/49.0.2623.87 Safari/537.36"
)

# Matches an "http://" or "https://" scheme prefix, ignoring case.
REGEX_HTTP = re.compile(r"https?://", flags=re.IGNORECASE)
def _http_failure(status, want_code):
    """Build get_url's failure value: ``(status, False)`` or bare ``False``."""
    return (status, False) if want_code else False


def _sniff_charset(soup):
    """Guess a charset from parsed HTML: <meta> tags first, then the doctype.

    Returns the charset name, or a falsy value when nothing usable is found.
    """
    encoding = None
    for meta in soup.find_all("meta"):
        content = meta.get("content", "")
        if "charset=" in content:
            # e.g. content="text/html; charset=utf-8"
            encoding = content.split("charset=", 1)[1].split(";", 1)[0].strip()
        elif meta.get("charset", ""):
            # e.g. <meta charset="utf-8">
            encoding = meta.get("charset")
        else:
            continue
        # Only the first <meta> that mentions a charset is considered.
        break

    if not encoding:
        for item in soup.contents:
            if isinstance(item, bs4.Doctype):
                # A bare "html" doctype is treated as UTF-8, any other
                # doctype as latin-1.
                encoding = "utf8" if item == "html" else "latin-1"
                break
    return encoding


def get_url(url, **kwargs):
    """Fetch *url* and return its body as text, parsed JSON or a soup.

    A URL without a scheme is prefixed with ``http://``. The request is
    sent with a 5 second timeout.

    Keyword arguments:
        method: HTTP method to use (default ``"GET"``).
        get_params: mapping/sequence url-encoded into the query string.
        post_params: mapping/sequence url-encoded into the request body;
            takes precedence over ``post_data``.
        post_data: raw request body, as ``str`` (sent UTF-8 encoded) or
            ``bytes`` (sent as-is).
        headers: mapping of extra request headers.
        code: when truthy, return ``(status_code, data)`` instead of
            ``data``.
        soup: when truthy, return a ``bs4.BeautifulSoup`` of the raw body.
            NOTE: in this mode ``code`` is ignored and only the soup is
            returned.
        parser: parser name handed to BeautifulSoup (default ``"lxml"``).
        json: when truthy, decode a non-empty body as JSON.

    Returns:
        The decoded body (``str``), the decoded JSON value, or a soup.
        On failure ``False`` is returned in place of the data; with
        ``code`` set the accompanying status is ``0`` when the URL cannot
        be encoded, the HTTP status for HTTP errors and undecodable JSON,
        and ``-1`` for any other network error.
    """
    if not urllib.parse.urlparse(url).scheme:
        url = "http://%s" % url
    url_parsed = urllib.parse.urlparse(url)

    want_code = kwargs.get("code")
    method = kwargs.get("method", "GET")
    get_params = kwargs.get("get_params", "")
    post_params = kwargs.get("post_params", None)
    post_data = kwargs.get("post_data", None)
    headers = kwargs.get("headers", {})

    if get_params:
        # Extend an existing query string instead of adding a second "?".
        separator = "&" if url_parsed.query else "?"
        url = "%s%s%s" % (url, separator, urllib.parse.urlencode(get_params))
    if post_params:
        post_data = urllib.parse.urlencode(post_params)

    try:
        # Reject URLs that cannot be encoded as latin-1 up front.
        url.encode("latin-1")
        # Bytes bodies are passed through untouched; only text is encoded.
        if post_data and isinstance(post_data, str):
            post_data = post_data.encode("utf8")
    except UnicodeEncodeError:
        return _http_failure(0, want_code)

    request = urllib.request.Request(url, post_data)
    request.add_header("Accept-Language", "en-US")
    request.add_header("User-Agent", USER_AGENT)
    for header, value in headers.items():
        request.add_header(header, value)
    request.method = method

    try:
        # The context manager guarantees the connection is closed, and
        # reading inside the try means a timeout or reset mid-body is
        # reported like any other network failure.
        with urllib.request.urlopen(request, timeout=5) as response:
            status = response.code
            encoding = response.info().get_content_charset()
            response_content = response.read()
    except urllib.error.HTTPError as e:
        traceback.print_exc()
        return _http_failure(e.code, want_code)
    except (urllib.error.URLError, ssl.CertificateError, OSError):
        traceback.print_exc()
        return _http_failure(-1, want_code)

    parser = kwargs.get("parser", "lxml")
    if kwargs.get("soup"):
        return bs4.BeautifulSoup(response_content, parser)

    if not encoding:
        # No charset in the response headers: look inside the document.
        encoding = _sniff_charset(bs4.BeautifulSoup(response_content, parser))

    try:
        data = response_content.decode(encoding or "utf8")
    except (LookupError, UnicodeDecodeError):
        # The declared charset is unknown to Python or does not match the
        # bytes; degrade to lossy UTF-8 rather than raising.
        data = response_content.decode("utf8", "replace")

    if kwargs.get("json") and data:
        try:
            data = json.loads(data)
        except json.decoder.JSONDecodeError:
            traceback.print_exc()
            # Keep the (code, data) shape so callers can still unpack.
            return _http_failure(status, want_code)

    if want_code:
        return status, data
    return data
def strip_html(s):
    """Return the text content of the HTML string *s*, with markup removed."""
    document = bs4.BeautifulSoup(s, "lxml")
    text = document.get_text()
    return text