use \S+ for url regex (for non-ascii chars), use url_sanitise to catch <>
This commit is contained in:
parent
72e1d71a0a
commit
408b89aeb7
1 changed files with 12 additions and 9 deletions
|
@ -5,25 +5,28 @@ import bs4, netifaces, requests
|
||||||
import tornado.httpclient
|
import tornado.httpclient
|
||||||
from src import utils
|
from src import utils
|
||||||
|
|
||||||
REGEX_URL = re.compile("https?://[A-Z0-9{}]+".format(re.escape("-._~:/%?#[]@!$&'()*+,;=")), re.I)
|
REGEX_URL = re.compile("https?://\S+", re.I)
|
||||||
|
|
||||||
|
PAIRED_CHARACTERS = ["<>", "()"]
|
||||||
|
|
||||||
# best-effort tidying up of URLs
|
# best-effort tidying up of URLs
|
||||||
def url_sanitise(url: str):
|
def url_sanitise(url: str):
|
||||||
if not urllib.parse.urlparse(url).scheme:
|
if not urllib.parse.urlparse(url).scheme:
|
||||||
url = "http://%s" % url
|
url = "http://%s" % url
|
||||||
|
|
||||||
if url.endswith(")"):
|
for pair_start, pair_end in PAIRED_CHARACTERS:
|
||||||
# trim ")" from the end only if there's not a "(" to match it
|
# trim ")" from the end only if there's not a "(" to match it
|
||||||
# google.com/) -> google.com/
|
# google.com/) -> google.com/
|
||||||
# google.com/() -> google.com/()
|
# google.com/() -> google.com/()
|
||||||
# google.com/()) -> google.com/()
|
# google.com/()) -> google.com/()
|
||||||
|
if url.endswith(pair_end):
|
||||||
if "(" in url:
|
if pair_start in url:
|
||||||
open_index = url.rfind("(")
|
open_index = url.rfind("(")
|
||||||
other_index = url.rfind(")", 0, len(url)-1)
|
other_index = url.rfind(")", 0, len(url)-1)
|
||||||
if other_index == -1 or other_index < open_index:
|
if not other_index == -1 and other_index < open_index:
|
||||||
return url
|
url = url[:-1]
|
||||||
return url[:-1]
|
else:
|
||||||
|
url = url[:-1]
|
||||||
return url
|
return url
|
||||||
|
|
||||||
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
|
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
|
||||||
|
|
Loading…
Reference in a new issue