don't auto-title when a URL contains most of its <title>

This commit is contained in:
jesopo 2019-10-04 10:52:07 +01:00
parent 3466a3c43e
commit 8f927afdc9

View file

@ -11,11 +11,27 @@ from src import EventManager, ModuleManager, utils
"Enable/disable shortening URLs when getting their title")) "Enable/disable shortening URLs when getting their title"))
@utils.export("channelset", utils.BoolSetting("auto-title-first", @utils.export("channelset", utils.BoolSetting("auto-title-first",
"Enable/disable showing who first posted a URL that was auto-titled")) "Enable/disable showing who first posted a URL that was auto-titled"))
@utils.export("channelset", utils.BoolSetting("auto-title-difference",
"Enable/disable checking if a <title> is different enough from the URL"
" before showing it"))
class Module(ModuleManager.BaseModule): class Module(ModuleManager.BaseModule):
def _url_hash(self, url): def _url_hash(self, url):
return "sha256:%s" % hashlib.sha256(url.lower().encode("utf8") return "sha256:%s" % hashlib.sha256(url.lower().encode("utf8")
).hexdigest() ).hexdigest()
def _different(self, url, title):
    """Return True when `title` is different enough from `url` to show.

    The title is split on whitespace and each word is checked,
    case-insensitively, for being a substring of the URL. If at least
    80% of the words are found in the URL, the title is considered too
    similar to the URL and False is returned.

    A title with no words (empty or whitespace only) cannot be compared
    and is reported as different, instead of dividing by zero.
    """
    url = url.lower()
    title_words = title.lower().split()
    if not title_words:
        # nothing to compare against the URL; avoid ZeroDivisionError
        return True

    present = sum(1 for title_word in title_words if title_word in url)

    # if at least 80% of words are in the URL, too similar
    return (present / len(title_words)) < 0.8
def _get_title(self, server, channel, url): def _get_title(self, server, channel, url):
if not urllib.parse.urlparse(url).scheme: if not urllib.parse.urlparse(url).scheme:
url = "http://%s" % url url = "http://%s" % url
@ -35,6 +51,9 @@ class Module(ModuleManager.BaseModule):
if page.data.title: if page.data.title:
title = page.data.title.text.replace("\n", " ").replace( title = page.data.title.text.replace("\n", " ").replace(
"\r", "").replace(" ", " ").strip() "\r", "").replace(" ", " ").strip()
if (channel.get_setting("auto-title-difference", True) and
not self._different(url, title)):
return -2, title
if channel.get_setting("title-shorten", False): if channel.get_setting("title-shorten", False):
short_url = self.exports.get_one("shorturl")(server, url, short_url = self.exports.get_one("shorturl")(server, url,
@ -72,6 +91,8 @@ class Module(ModuleManager.BaseModule):
[event["user"].nickname, utils.iso8601_format_now(), [event["user"].nickname, utils.iso8601_format_now(),
url]) url])
event["stdout"].write(message) event["stdout"].write(message)
if code == -2:
self.log.debug("Not showing title for %s, too similar", [url])
@utils.hook("received.command.t", alias_of="title") @utils.hook("received.command.t", alias_of="title")
@utils.hook("received.command.title", usage="[URL]") @utils.hook("received.command.title", usage="[URL]")