# NewsBlur/apps/analyzer/tokenizer.py
import re
class Tokenizer:
    """A simple regex-based whitespace tokenizer.

    A document is normalized by collapsing every run of non-letter,
    non-hyphen characters into a single space; each configured phrase
    found in the normalized text is then yielded.  Matching can be
    done lower-cased or in the phrases' existing case.
    """

    # Any run of characters that is NOT a letter or a hyphen acts as a
    # separator between tokens.
    WORD_RE = re.compile(r"[^a-zA-Z-]+")

    def __init__(self, phrases, lower=False):
        # phrases: iterable of phrase strings to look for in documents.
        # lower: when True, phrases are matched case-insensitively.
        self.phrases = phrases
        self.lower = lower

    def tokenize(self, doc):
        """Yield each phrase from ``self.phrases`` found in *doc*.

        *doc* is first normalized: runs of non-letter, non-hyphen
        characters collapse to single spaces, so "Extra, Extra"
        matches the phrase "Extra Extra".
        """
        formatted_doc = " ".join(self.WORD_RE.split(doc))
        if self.lower:
            # Honor the ``lower`` flag (previously stored but unused):
            # compare both sides lower-cased, but yield the phrase in
            # its original case so callers see what they configured.
            formatted_doc = formatted_doc.lower()
            for phrase in self.phrases:
                if phrase.lower() in formatted_doc:
                    yield phrase
        else:
            for phrase in self.phrases:
                if phrase in formatted_doc:
                    yield phrase
if __name__ == "__main__":
    # Smoke test: "Extra, Extra" normalizes to "Extra Extra", which
    # matches the first configured phrase.
    phrases = ["Extra Extra", "Streetlevel", "House of the Day"]
    tokenizer = Tokenizer(phrases)

    doc = "Extra, Extra"
    # tokenize() is a generator; materialize it so the matching work
    # actually runs, and print the result so the demo shows something.
    print(list(tokenizer.tokenize(doc)))