#--depends-on commands
#--depends-on config
#--depends-on shorturl

import hashlib, re, urllib.parse

from src import EventManager, ModuleManager, utils

RE_WORDSPLIT = re.compile(r"[\s/]")

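# channel settings: "auto-title" enables automatic titling, "title-shorten"
# appends a shortened URL to the title, "auto-title-first" reports who first
# posted a URL, and "auto-title-difference" (on by default) skips titles that
# are too similar to the URL itself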
@utils.export("channelset", utils.BoolSetting("auto-title",
|
|
|
|
"Disable/Enable automatically getting info titles from URLs"))
|
|
|
|
@utils.export("channelset", utils.BoolSetting("title-shorten",
|
|
|
|
"Enable/disable shortening URLs when getting their title"))
|
|
|
|
@utils.export("channelset", utils.BoolSetting("auto-title-first",
|
|
|
|
"Enable/disable showing who first posted a URL that was auto-titled"))
|
2019-10-04 09:52:07 +00:00
|
|
|
@utils.export("channelset", utils.BoolSetting("auto-title-difference",
|
|
|
|
"Enable/disable checking if a <title> is different enough from the URL"
|
|
|
|
" before showing it"))
|
2018-09-26 17:27:17 +00:00
|
|
|
class Module(ModuleManager.BaseModule):
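    # stable per-URL key (lowercased, sha256-hashed) used to name the
    # "url-last-<hash>" channel setting that records who first posted a URL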
    def _url_hash(self, url):
        return "sha256:%s" % hashlib.sha256(url.lower().encode("utf8")
            ).hexdigest()

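    # decide whether a page title is different enough from its URL to be worth
    # showing (slug-style URLs often already contain most of the title)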
    def _different(self, url, title):
        url = url.lower()
        title_words = []
        for title_word in RE_WORDSPLIT.split(title):
            if len(title_word) > 1 or title_word.isalpha():
                title_word = title_word.lower()
                title_words.append(title_word.strip("'\"<>()"))

        if not title_words:
            # nothing left to compare against (avoids ZeroDivisionError below)
            return True

        present = 0
        for title_word in title_words:
            if title_word in url:
                present += 1

        similarity = present/len(title_words)
        # if at least 80% of words are in the URL, too similar
        if similarity >= 0.8:
            return False
        return True

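    # fetch a URL and return (code, title): the HTTP status code on success,
    # or a negative code when no title should be shown (-1 blocked host,
    # request failure or non-HTML content; -2 title too similar to the URL;
    # -3 empty title)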
    def _get_title(self, server, channel, url):
        if not urllib.parse.urlparse(url).scheme:
            url = "http://%s" % url

        hostname = urllib.parse.urlparse(url).hostname
        if not utils.http.host_permitted(hostname):
            self.log.warn("Attempted to get forbidden host: %s", [url])
            return -1, None

        try:
            page = utils.http.request(url)
        except Exception as e:
            self.log.error("failed to get URL title for %s: %s", [url, str(e)])
            return -1, None

        if page.content_type not in utils.http.SOUP_CONTENT_TYPES:
            return -1, None
        page = page.soup()

        if page.title:
            title = utils.parse.line_normalise(page.title.text)
            if not title:
                return -3, None

            if channel:
                if (channel.get_setting("auto-title-difference", True) and
                        not self._different(url, title)):
                    return -2, title

                if channel.get_setting("title-shorten", False):
                    short_url = self.exports.get_one("shorturl")(server, url,
                        context=channel)
                    return page.code, "%s - %s" % (title, short_url)
            return page.code, title
        else:
            return -1, None

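    # passive hook: watches channel messages for URLs (utils.http.REGEX_URL)
    # and, when the channel has auto-title enabled, replies with the page title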
@utils.hook("command.regex")
|
|
|
|
@utils.kwarg("ignore_action", False)
|
|
|
|
@utils.kwarg("priority", EventManager.PRIORITY_MONITOR)
|
|
|
|
@utils.kwarg("command", "title")
|
|
|
|
@utils.kwarg("pattern", utils.http.REGEX_URL)
|
2019-02-09 10:35:37 +00:00
|
|
|
def channel_message(self, event):
|
2019-05-19 09:44:48 +00:00
|
|
|
if event["target"].get_setting("auto-title", False):
|
2019-05-21 08:54:51 +00:00
|
|
|
event.eat()
|
2019-07-02 13:16:16 +00:00
|
|
|
url = utils.http.url_sanitise(event["match"].group(0))
|
2019-09-20 14:56:14 +00:00
|
|
|
code, title = self._get_title(event["server"], event["target"], url)
|
2019-04-24 13:48:15 +00:00
|
|
|
|
2019-09-20 14:56:14 +00:00
|
|
|
if code == 200 and title:
|
2019-05-19 09:44:48 +00:00
|
|
|
message = title
|
|
|
|
if event["target"].get_setting("auto-title-first", False):
|
|
|
|
setting = "url-last-%s" % self._url_hash(url)
|
|
|
|
first_details = event["target"].get_setting(setting, None)
|
2019-04-24 13:48:15 +00:00
|
|
|
|
2019-05-19 09:44:48 +00:00
|
|
|
if first_details:
|
|
|
|
first_nickname, first_timestamp, _ = first_details
|
2019-11-15 13:59:09 +00:00
|
|
|
timestamp_parsed = utils.datetime.iso8601_parse(
|
|
|
|
first_timestamp)
|
|
|
|
timestamp_human = utils.datetime.datetime_human(
|
|
|
|
timestamp_parsed)
|
|
|
|
|
2019-05-19 09:44:48 +00:00
|
|
|
message = "%s (first posted by %s at %s)" % (title,
|
|
|
|
first_nickname, timestamp_human)
|
|
|
|
else:
|
|
|
|
event["target"].set_setting(setting,
|
2019-11-15 13:59:09 +00:00
|
|
|
[event["user"].nickname,
|
|
|
|
utils.datetime.iso8601_format_now(), url])
|
2019-05-19 09:44:48 +00:00
|
|
|
event["stdout"].write(message)
|
2019-10-04 09:52:07 +00:00
|
|
|
if code == -2:
|
|
|
|
self.log.debug("Not showing title for %s, too similar", [url])
|
2019-02-09 10:35:37 +00:00
|
|
|
|
2018-10-10 09:42:41 +00:00
|
|
|
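    # explicit "title"/"t" command: titles the given URL, or the most recent
    # URL found in the channel buffer when no argument is given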
@utils.hook("received.command.t", alias_of="title")
|
|
|
|
@utils.hook("received.command.title", usage="[URL]")
|
2018-08-31 11:55:52 +00:00
|
|
|
def title(self, event):
|
2018-09-26 17:27:17 +00:00
|
|
|
"""
|
2018-09-30 16:29:09 +00:00
|
|
|
:help: Get the title of a URL
|
|
|
|
:usage: [URL]
|
2018-09-26 17:27:17 +00:00
|
|
|
"""
|
2018-08-31 11:55:52 +00:00
|
|
|
url = None
|
|
|
|
if len(event["args"]) > 0:
|
|
|
|
url = event["args_split"][0]
|
|
|
|
else:
|
2019-08-13 12:48:03 +00:00
|
|
|
match = event["target"].buffer.find(utils.http.REGEX_URL)
|
|
|
|
if match:
|
|
|
|
url = match.match
|
2018-08-31 11:55:52 +00:00
|
|
|
if not url:
|
2018-10-16 14:09:58 +00:00
|
|
|
raise utils.EventError("No URL provided/found.")
|
2018-10-19 09:31:45 +00:00
|
|
|
|
2019-10-31 10:26:00 +00:00
|
|
|
channel = None
|
|
|
|
if event["is_channel"]:
|
|
|
|
channel = event["target"]
|
|
|
|
code, title = self._get_title(event["server"], channel, url)
|
2018-10-19 09:31:45 +00:00
|
|
|
|
2018-08-31 11:55:52 +00:00
|
|
|
if title:
|
|
|
|
event["stdout"].write(title)
|
|
|
|
else:
|
2019-02-09 10:35:37 +00:00
|
|
|
event["stderr"].write("Failed to get title")
|
2019-11-20 17:59:26 +00:00
|
|
|
|
|
|
|
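    # no-op handler: registers "notitle" as a recognised command name but
    # deliberately produces no output (expect_output=False)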
@utils.hook("received.command.notitle")
|
|
|
|
@utils.kwarg("expect_output", False)
|
|
|
|
def notitle(self, event):
|
|
|
|
pass
|