use \S+ for url regex (for non-ascii chars), use url_sanitise to catch <>

This commit is contained in:
jesopo 2019-09-02 13:25:48 +01:00
parent 72e1d71a0a
commit 408b89aeb7

View file

@@ -5,25 +5,28 @@ import bs4, netifaces, requests
import tornado.httpclient import tornado.httpclient
from src import utils from src import utils
REGEX_URL = re.compile("https?://[A-Z0-9{}]+".format(re.escape("-._~:/%?#[]@!$&'()*+,;=")), re.I) REGEX_URL = re.compile("https?://\S+", re.I)
PAIRED_CHARACTERS = ["<>", "()"]
# best-effort tidying up of URLs # best-effort tidying up of URLs
def url_sanitise(url: str): def url_sanitise(url: str):
if not urllib.parse.urlparse(url).scheme: if not urllib.parse.urlparse(url).scheme:
url = "http://%s" % url url = "http://%s" % url
if url.endswith(")"): for pair_start, pair_end in PAIRED_CHARACTERS:
# trim ")" from the end only if there's not a "(" to match it # trim ")" from the end only if there's not a "(" to match it
# google.com/) -> google.com/ # google.com/) -> google.com/
# google.com/() -> google.com/() # google.com/() -> google.com/()
# google.com/()) -> google.com/() # google.com/()) -> google.com/()
if url.endswith(pair_end):
if "(" in url: if pair_start in url:
open_index = url.rfind("(") open_index = url.rfind("(")
other_index = url.rfind(")", 0, len(url)-1) other_index = url.rfind(")", 0, len(url)-1)
if other_index == -1 or other_index < open_index: if not other_index == -1 and other_index < open_index:
return url url = url[:-1]
return url[:-1] else:
url = url[:-1]
return url return url
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 " USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "