bitbot-3.11-fork/modules/rss.py

225 lines
8.5 KiB
Python
Raw Normal View History

2019-07-15 16:45:48 +00:00
#--depends-on config
#--depends-on shorturl
import difflib, hashlib, time
2019-06-23 15:03:15 +00:00
from src import ModuleManager, utils
import feedparser
# Polling period, in seconds, used when no "rss-interval" setting is stored.
RSS_INTERVAL = 60  # 1 minute

# One shared definition of the bind-host setting, so the same object can be
# exported for more than one settings scope.
SETTING_BIND = utils.Setting(
    "rss-bindhost",
    "Which local address to bind to for RSS requests",
    example="127.0.0.1",
)
2019-07-04 10:15:01 +00:00
# RSS module class. The utils.export decorators below declare its settings:
#   botset      rss-interval  - seconds between feed polls
#   channelset  rss-shorten   - whether announced links are shortened
#   channelset  rss-format    - template for the announcement line
#   serverset / channelset rss-bindhost - local address used for feed requests
@utils.export("botset", utils.IntSetting("rss-interval",
"Interval (in seconds) between RSS polls", example="120"))
@utils.export("channelset", utils.BoolSetting("rss-shorten",
"Whether or not to shorten RSS urls"))
@utils.export("channelset", utils.Setting("rss-format", "Format of RSS announcements", example="$longtitle: $title - $link [$author]"))
2020-04-09 14:47:20 +00:00
@utils.export("serverset", SETTING_BIND)
@utils.export("channelset", SETTING_BIND)
2019-06-23 15:03:15 +00:00
class Module(ModuleManager.BaseModule):
2019-06-24 19:23:36 +00:00
# Same string that _timer passes as module_name when announcing entries;
# presumably the module's display name - confirm against BaseModule.
_name = "RSS"
2019-06-23 15:03:15 +00:00
def on_load(self):
    """Register the "rss-feeds" timer that drives feed polling."""
    # A stored "rss-interval" bot setting takes precedence over the
    # module-level default.
    poll_interval = self.bot.get_setting("rss-interval", RSS_INTERVAL)
    self.timers.add("rss-feeds", self._timer, poll_interval)
2019-06-23 15:03:15 +00:00
def _format_entry(self, server, channel, feed_title, entry, shorten):
    """Render one feed entry as the announcement line for a channel.

    The channel's "rss-format" setting is the template. It may contain the
    placeholders $longtitle, $title, $link and $author, plus str.format
    fields that name keys of `entry` (for example "{summary}").

    :param server: server handed to the "shorturl" export.
    :param channel: channel whose "rss-format" setting is read.
    :param feed_title: title of the feed, or None.
    :param entry: mapping describing the entry; must contain "title".
    :param shorten: when true, try to replace the link with a short URL.
    :returns: the formatted announcement string.
    :raises KeyError: if `entry` has no "title".
    """
    def literal(value):
        # Feed-supplied text is substituted into the template *before*
        # str.format runs, so its braces must be escaped; otherwise a title
        # such as "Use {} wisely" is parsed as a format field.
        return value.replace("{", "{{").replace("}", "}}")

    title = utils.parse.line_normalise(utils.http.strip_html(
        entry["title"]))
    author = entry.get("author", "unknown author")
    author = "%s" % author if author else ""

    link = entry.get("link", None)
    if shorten and link:
        try:
            link = self.exports.get("shorturl")(server, link)
        except Exception:
            # Shortening is best-effort: keep the long link, but say so.
            self.log.trace("Failed to shorten RSS link %s", [link])
    link = "%s" % link if link else ""
    feed_title_str = "%s" % feed_title if feed_title else ""

    # just in case the format starts keyerroring and you're not sure why
    self.log.trace("RSS Entry: " + str(entry))

    template = channel.get_setting("rss-format",
        "$longtitle: $title by $author - $link")
    try:
        return (template
            .replace("$longtitle", literal(feed_title_str))
            .replace("$title", literal(title))
            .replace("$link", literal(link))
            .replace("$author", literal(author))
            .format(**entry))
    except (KeyError, IndexError, ValueError, AttributeError):
        # The configured template names a field this entry lacks or is
        # itself malformed; announce with the built-in layout instead.
        self.log.warn(f"Failed to format RSS entry for {channel}. Falling back to default format.")
        return f"{feed_title_str}: {title} by {author} - {link}"
def _timer(self, timer):
2019-07-08 12:46:12 +00:00
start_time = time.monotonic()
self.log.trace("Polling RSS feeds")
timer.redo()
2019-06-23 15:03:15 +00:00
hook_settings = self.bot.database.channel_settings.find_by_setting(
"rss-hooks")
hooks = {}
for server_id, channel_name, urls in hook_settings:
server = self.bot.get_server_by_id(server_id)
if server and channel_name in server.channels:
channel = server.channels.get(channel_name)
for url in urls:
2020-04-09 17:26:54 +00:00
bindhost = channel.get_setting("rss-bindhost",
server.get_setting("rss-bindhost", None))
if url.startswith("www."):
url = url.replace("www.", "", 1)
key = (url, bindhost)
if not key in hooks:
hooks[key] = []
hooks[key].append((server, channel))
2019-06-23 15:03:15 +00:00
if not hooks:
return
requests = []
2020-04-09 17:26:54 +00:00
for url, bindhost in hooks.keys():
requests.append(utils.http.Request(url, id=f"{url} {bindhost}",
bindhost=bindhost))
pages = utils.http.request_many(requests)
2019-06-23 15:03:15 +00:00
2020-04-09 17:26:54 +00:00
for (url, bindhost), channels in hooks.items():
key = f"{url} {bindhost}"
if not key in pages:
# async url get failed
continue
try:
2020-04-09 17:26:54 +00:00
data = pages[key].decode()
except Exception as e:
self.log.error("Failed to decode rss URL %s", [url],
exc_info=True)
continue
feed = feedparser.parse(data)
feed_title = feed["feed"].get("title", None)
max_ids = len(feed["entries"])*10
2019-06-23 15:03:15 +00:00
for server, channel in channels:
seen_ids = channel.get_setting("rss-seen-ids-%s" % url, [])
2019-06-23 15:22:45 +00:00
valid = 0
for entry in feed["entries"][::-1]:
entry_id, entry_id_hash = self._get_id(entry)
if entry_id_hash in seen_ids or entry_id in seen_ids:
2019-06-23 15:03:15 +00:00
continue
2019-06-23 15:22:45 +00:00
if valid == 3:
2019-06-23 15:03:15 +00:00
continue
valid += 1
shorten = channel.get_setting("rss-shorten", False)
output = self._format_entry(server, channel, feed_title, entry,
shorten)
2019-06-23 15:03:15 +00:00
self.events.on("send.stdout").call(target=channel,
module_name="RSS", server=server, message=output)
seen_ids.append(entry_id_hash)
2019-06-23 15:03:15 +00:00
if len(seen_ids) > max_ids:
seen_ids = seen_ids[len(seen_ids)-max_ids:]
channel.set_setting("rss-seen-ids-%s" % url, seen_ids)
2019-06-23 15:03:15 +00:00
2019-07-08 12:46:12 +00:00
total_milliseconds = (time.monotonic() - start_time) * 1000
self.log.trace("Polled RSS feeds in %fms", [total_milliseconds])
2019-08-09 15:48:41 +00:00
def _get_id(self, entry):
entry_id = entry.get("id", entry["link"])
entry_id_hash = hashlib.sha1(entry_id.encode("utf8")).hexdigest()
return entry_id, "sha1:%s" % entry_id_hash
2019-08-09 15:48:41 +00:00
def _get_entries(self, url, max: int=None):
    """Fetch the feed at `url` and return its title and entries.

    :param url: address of the feed to request.
    :param max: upper bound on the number of entries returned; None
        returns all of them.
    :returns: (feed title or None, list of entries). (None, None) when the
        request or parse raised, or when the result has no feed metadata.
    """
    try:
        feed = feedparser.parse(utils.http.request(url).data)
    except Exception:
        self.log.warn("failed to parse RSS %s", [url], exc_info=True)
        return None, None

    if not feed or not feed["feed"]:
        return None, None
    return feed["feed"].get("title", None), feed["entries"][:max]
2019-06-23 15:03:15 +00:00
@utils.hook("received.command.rss", min_args=1, channel_only=True)
def rss(self, event):
    """
    :help: Modify RSS/Atom configuration for the current channel
    :usage: list
    :usage: add <url>
    :usage: remove <url>
    :usage: read [url]
    :permission: rss
    """
    # The docstring above is kept strictly in the file's ":key: value" form;
    # explanatory notes live in comments instead.
    channel = event["target"]
    args = event["args_split"]
    rss_hooks = channel.get_setting("rss-hooks", [])
    subcommand = args[0].lower()

    changed = False
    message = None

    if subcommand == "list":
        event["stdout"].write("RSS hooks: %s" % ", ".join(rss_hooks))

    elif subcommand == "add":
        if not len(args) > 1:
            raise utils.EventError("Please provide a URL")
        url = utils.http.url_sanitise(args[1])
        if url in rss_hooks:
            raise utils.EventError("That URL is already being watched")

        _title, entries = self._get_entries(url)
        # An empty list is a valid (empty) feed; only None means failure.
        if entries is None:
            raise utils.EventError("Failed to read feed")

        # Everything already in the feed counts as seen, so only entries
        # that appear after this point are announced.
        seen_ids = [self._get_id(e)[1] for e in entries]
        channel.set_setting("rss-seen-ids-%s" % url, seen_ids)

        rss_hooks.append(url)
        changed = True
        message = "Added RSS feed"

    elif subcommand == "remove":
        if not len(args) > 1:
            raise utils.EventError("Please provide a URL")
        url = utils.http.url_sanitise(args[1])
        if url not in rss_hooks:
            matches = difflib.get_close_matches(url, rss_hooks, cutoff=0.5)
            if matches:
                raise utils.EventError("Did you mean %s ?" % matches[0])
            raise utils.EventError("I'm not watching that URL")

        rss_hooks.remove(url)
        # Drop the per-feed bookkeeping written by "add" so it does not
        # linger in the channel's settings; re-adding seeds it afresh.
        channel.del_setting("rss-seen-ids-%s" % url)
        changed = True
        message = "Removed RSS feed"

    elif subcommand == "read":
        if len(args) > 1:
            # Normalise the same way "add" and "remove" do.
            url = utils.http.url_sanitise(args[1])
        elif len(rss_hooks) == 1:
            # With a single watched feed the URL may be omitted.
            url = rss_hooks[0]
        else:
            raise utils.EventError("Please provide a url")

        title, entries = self._get_entries(url)
        if not entries:
            raise utils.EventError("Failed to get RSS entries")

        shorten = channel.get_setting("rss-shorten", False)
        out = self._format_entry(event["server"], channel, title,
            entries[0], shorten)
        event["stdout"].write(out)

    else:
        raise utils.EventError("Unknown subcommand '%s'" % subcommand)

    if changed:
        if rss_hooks:
            channel.set_setting("rss-hooks", rss_hooks)
        else:
            # No feeds left: remove the setting rather than store [].
            channel.del_setting("rss-hooks")
        event["stdout"].write(message)