Store more "seen ids", hash seen ids so they take up a constant space

This commit is contained in:
jesopo 2019-08-12 15:08:36 +01:00
parent 8c58d33fa3
commit ff8b81a761

View file

@ -1,7 +1,7 @@
#--depends-on config #--depends-on config
#--depends-on shorturl #--depends-on shorturl
import time import hashlib, time
from src import ModuleManager, utils from src import ModuleManager, utils
import feedparser import feedparser
@ -63,15 +63,14 @@ class Module(ModuleManager.BaseModule):
feed = feedparser.parse(pages[url].data) feed = feedparser.parse(pages[url].data)
feed_title = feed["feed"].get("title", None) feed_title = feed["feed"].get("title", None)
max_ids = len(feed["entries"])*10
for server, channel in channels: for server, channel in channels:
seen_ids = channel.get_setting("rss-seen-ids-%s" % url, []) seen_ids = channel.get_setting("rss-seen-ids-%s" % url, [])
new_ids = []
valid = 0 valid = 0
for entry in feed["entries"][::-1]: for entry in feed["entries"][::-1]:
entry_id = entry.get("id", entry["link"]) entry_id = self._get_id(entry)
if entry_id in seen_ids: if entry_id in seen_ids:
new_ids.append(entry_id)
continue continue
if valid == 3: if valid == 3:
@ -84,15 +83,18 @@ class Module(ModuleManager.BaseModule):
self.events.on("send.stdout").call(target=channel, self.events.on("send.stdout").call(target=channel,
module_name="RSS", server=server, message=output) module_name="RSS", server=server, message=output)
new_ids.append(entry_id) seen_ids.append(entry_id)
channel.set_setting("rss-seen-ids-%s" % url, new_ids) if len(seen_ids) > max_ids:
seen_ids = seen_ids[len(seen_ids)-max_ids:]
channel.set_setting("rss-seen-ids-%s" % url, seen_ids)
total_milliseconds = (time.monotonic() - start_time) * 1000 total_milliseconds = (time.monotonic() - start_time) * 1000
self.log.trace("Polled RSS feeds in %fms", [total_milliseconds]) self.log.trace("Polled RSS feeds in %fms", [total_milliseconds])
def _get_id(self, entry): def _get_id(self, entry):
return entry.get("id", entry["link"]) return "sha1:%s" % hashlib.sha1(entry.get("id", entry["link"]
).encode("utf8")).hexdigest()
def _get_entries(self, url, max: int=None): def _get_entries(self, url, max: int=None):
try: try: