import re


class Tokenizer:
"""A simple regex-based whitespace tokenizer.
|
|
|
|
It expects a string and can return all tokens lower-cased
|
|
|
|
or in their existing case.
|
|
|
|
"""
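    # Splits on runs of characters that are not ASCII letters or
    # hyphens, so punctuation and digits collapse to single spaces.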
    WORD_RE = re.compile(r'[^a-zA-Z-]+')
    def __init__(self, phrases, lower=False):
        self.phrases = phrases
        self.lower = lower
    def tokenize(self, doc):
        print(doc)  # debug output: the raw document
        # Rejoin the regex split so punctuation collapses to single spaces.
        formatted_doc = ' '.join(self.WORD_RE.split(doc))
        print(formatted_doc)  # debug output: the normalized document
        # Honor the lower flag: match and yield phrases lower-cased.
        if self.lower:
            formatted_doc = formatted_doc.lower()
        for phrase in self.phrases:
            if self.lower:
                phrase = phrase.lower()
            if phrase in formatted_doc:
                yield phrase


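# Small demo: report which of the known phrases appear in a document.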
if __name__ == '__main__':
    phrases = ['Extra Extra', 'Streetlevel', 'House of the Day']
    tokenizer = Tokenizer(phrases)
    doc = 'Extra, Extra'
    # tokenize() is a generator, so consume it to run the matching.
    print(list(tokenizer.tokenize(doc)))
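
    # Hypothetical second run showing the lower flag (lower_tokenizer is
    # an illustrative name, not from the original script): matching
    # becomes case-insensitive and phrases are yielded lower-cased.
    lower_tokenizer = Tokenizer(phrases, lower=True)
    print(list(lower_tokenizer.tokenize('EXTRA, EXTRA')))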