ignore one-char "words" in <title> if they're not a "letter"

This commit is contained in:
jesopo 2019-10-04 11:20:35 +01:00
parent 8f927afdc9
commit 8e4c0f4963

View file

@@ -21,14 +21,19 @@ class Module(ModuleManager.BaseModule):
def _different(self, url, title): def _different(self, url, title):
url = url.lower() url = url.lower()
title_words = [word.lower() for word in title.split()] title_words = []
for title_word in title.split():
if len(title_word) > 1 or title_word.isalpha():
title_words.append(title_word.lower())
present = 0 present = 0
for title_word in title_words: for title_word in title_words:
if title_word in url: if title_word in url:
present += 1 present += 1
similarity = present/len(title_words)
# if at least 80% of words are in the URL, too similar # if at least 80% of words are in the URL, too similar
if (present/len(title_words)) >= 0.8: if similarity >= 0.8:
return False return False
return True return True