ignore one-char "words" in <title> if they're not a "letter"
This commit is contained in:
parent
8f927afdc9
commit
8e4c0f4963
1 changed file with 7 additions and 2 deletions
|
@@ -21,14 +21,19 @@ class Module(ModuleManager.BaseModule):
|
||||||
|
|
||||||
def _different(self, url, title):
|
def _different(self, url, title):
|
||||||
url = url.lower()
|
url = url.lower()
|
||||||
title_words = [word.lower() for word in title.split()]
|
title_words = []
|
||||||
|
for title_word in title.split():
|
||||||
|
if len(title_word) > 1 or title_word.isalpha():
|
||||||
|
title_words.append(title_word.lower())
|
||||||
|
|
||||||
present = 0
|
present = 0
|
||||||
for title_word in title_words:
|
for title_word in title_words:
|
||||||
if title_word in url:
|
if title_word in url:
|
||||||
present += 1
|
present += 1
|
||||||
|
|
||||||
|
similarity = present/len(title_words)
|
||||||
# if at least 80% of words are in the URL, too similar
|
# if at least 80% of words are in the URL, too similar
|
||||||
if (present/len(title_words)) >= 0.8:
|
if similarity >= 0.8:
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue