Add first version of modules/mixed_unicode.py, designed to detect when we get a
message that mixes scripts (Latin, Cyrillic, Greek, etc.) and might be spam.
This commit is contained in:
parent
c2ef675bf0
commit
b19ce0be2f
1 changed files with 42 additions and 0 deletions
42
modules/mixed_unicode.py
Normal file
42
modules/mixed_unicode.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
import enum
|
||||
from src import ModuleManager, utils
|
||||
|
||||
class Script(enum.Enum):
    """Writing systems that a single character can be attributed to."""

    # Anything that is not recognised as one of the scripts below.
    Unknown = 0

    Latin = 1
    Cyrillic = 2
    Greek = 3
|
||||
# Characters treated as boundaries between words when scoring a message.
WORD_SEPERATORS = [
    ",",
    " ",
    "\t",
    ".",
]
|
||||
|
||||
class Module(ModuleManager.BaseModule):
    """Score channel messages by how often they switch between scripts.

    A message that mixes Latin, Greek and Cyrillic letters might be spam.
    For now the score is only computed and written to the trace log.
    """

    def _detect_script(self, char):
        """Return the ``Script`` that the single character ``char`` belongs to.

        Only alphabetic characters are attributed to a script. Digits,
        punctuation, symbols and control characters are used alongside every
        script, so they are reported as ``Script.Unknown``; otherwise a
        message such as "Привет!" would be counted as mixing Cyrillic with
        Latin because "!" lies in the Basic Latin code point range.
        """
        if not char.isalpha():
            return Script.Unknown

        point = ord(char)
        if 0 <= point <= 0x02AF:
            # Basic Latin up to and including IPA Extensions.
            return Script.Latin
        elif 0x0370 <= point <= 0x03FF:
            # Greek and Coptic.
            return Script.Greek
        elif 0x0400 <= point <= 0x052F:
            # Cyrillic and Cyrillic Supplement.
            return Script.Cyrillic
        return Script.Unknown

    @utils.hook("received.message.channel")
    def channel_message(self, event):
        """Compute and log the mixed-script score of a channel message.

        The score grows by one every time a letter belongs to a different
        script than the previous recognised letter, and by one more when the
        character directly before that letter was not a word separator.

        ``event["message"]`` is iterated character by character
        (presumably a ``str`` - confirm against the event source).
        """
        last_script = None
        last_was_separator = False
        score = 0

        for char in event["message"]:
            if char in WORD_SEPERATORS:
                last_was_separator = True
                continue

            script = self._detect_script(char)
            if script != Script.Unknown:
                if last_script is not None and script != last_script:
                    score += 1
                    # A switch that does not directly follow a separator is
                    # weighted double.
                    if not last_was_separator:
                        score += 1

                last_script = script

            # Any non-separator character, recognised or not, ends the
            # "just saw a separator" state.
            last_was_separator = False

        self.log.trace("Message given a mixed-unicode score of %d", [score])
|
Loading…
Reference in a new issue