import re


class Tokenizer:
    """A simple regex-based phrase matcher over whitespace-normalized text.

    The input document is normalized by collapsing every run of
    non-letter, non-hyphen characters into a single space; each known
    phrase is then matched as a substring of the normalized document.
    Matches can be returned lower-cased or in their existing case.
    """

    # Runs of characters that are not ASCII letters or hyphens act as
    # separators between words.  Raw string avoids escape-sequence surprises.
    WORD_RE = re.compile(r"[^a-zA-Z-]+")

    def __init__(self, phrases, lower=False):
        """Store the phrases to look for.

        Args:
            phrases: iterable of phrase strings to match against documents.
            lower: when True, matching is case-insensitive and matched
                phrases are yielded lower-cased.
        """
        self.phrases = phrases
        self.lower = lower

    def tokenize(self, doc):
        """Yield each known phrase found in the normalized document.

        Args:
            doc: the raw document string to scan.

        Yields:
            Each phrase (lower-cased when ``self.lower`` is True) that
            occurs as a substring of the normalized document.
        """
        formatted_doc = " ".join(self.WORD_RE.split(doc))
        if self.lower:
            # Honor the documented lower-casing option: compare and
            # yield in lower case so matching is case-insensitive.
            formatted_doc = formatted_doc.lower()
        for phrase in self.phrases:
            needle = phrase.lower() if self.lower else phrase
            if needle in formatted_doc:
                yield needle


if __name__ == "__main__":
    phrases = ["Extra Extra", "Streetlevel", "House of the Day"]
    tokenizer = Tokenizer(phrases)
    doc = "Extra, Extra"
    # tokenize() is a generator: it must be consumed for any work to happen.
    for match in tokenizer.tokenize(doc):
        print(match)