from itertools import groupby
# from apps.analyzer.classifier import FisherClassifier
import nltk
from django.core import management
from django.test import TestCase
from django.test.client import Client
from apps.analyzer.phrase_filter import PhraseFilter
from apps.analyzer.tokenizer import Tokenizer
from apps.rss_feeds.models import MStory
from vendor.reverend.thomas import Bayes
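
# These tests exercise NewsBlur's story-title analysis: n-gram collocation finding with
# NLTK and naive-Bayes title classification backed by the vendored reverend library.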


class QuadgramCollocationFinder(nltk.collocations.AbstractCollocationFinder):
    """A tool for finding and ranking quadgram collocations or other association measures.

    It is often useful to use from_words() rather than constructing an instance directly.
    """

    def __init__(self, word_fd, quadgram_fd, trigram_fd, bigram_fd, wildcard_fd):
        """Construct a QuadgramCollocationFinder, given FreqDists for appearances of
        words, quadgrams, trigrams, bigrams, and gapped triples (three of the four
        words with one inner position skipped)."""
        nltk.collocations.AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
        self.trigram_fd = trigram_fd
        self.bigram_fd = bigram_fd
        self.wildcard_fd = wildcard_fd

    @classmethod
    def from_words(cls, words):
        # One pass over all 4-token windows collects every FreqDist the finder needs.
        wfd = nltk.probability.FreqDist()
        qfd = nltk.probability.FreqDist()
        tfd = nltk.probability.FreqDist()
        bfd = nltk.probability.FreqDist()
        wildfd = nltk.probability.FreqDist()

        for w1, w2, w3, w4 in nltk.util.ngrams(words, 4, pad_right=True):
            wfd[w1] += 1
            if w4 is None:
                continue
            qfd[(w1, w2, w3, w4)] += 1
            bfd[(w1, w2)] += 1
            tfd[(w1, w2, w3)] += 1
            wildfd[(w1, w3, w4)] += 1
            wildfd[(w1, w2, w4)] += 1

        return cls(wfd, qfd, tfd, bfd, wildfd)
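
    # For example, from_words(["a", "b", "c", "d", "e"]) counts the quadgrams
    # ("a", "b", "c", "d") and ("b", "c", "d", "e"), their leading bigrams and trigrams,
    # and gapped triples such as ("a", "c", "d") and ("a", "b", "d").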

    def score_ngram(self, score_fn, w1, w2, w3, w4):
        n_all = self.word_fd.N()
        n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
        if not n_iiii:
            return
        # Contingency counts use an i/x naming scheme: "i" marks a position fixed to the
        # given word, "x" marks a wildcard. Only counts that from_words() actually tracks
        # are available; pairs with a gap between them are not collected.
        # One wildcard: leading/trailing trigrams and the gapped triples.
        n_iiix = self.trigram_fd[(w1, w2, w3)]
        n_iixi = self.wildcard_fd[(w1, w2, w4)]
        n_ixii = self.wildcard_fd[(w1, w3, w4)]
        n_xiii = self.trigram_fd[(w2, w3, w4)]
        # Two wildcards: the adjacent pairs.
        n_iixx = self.bigram_fd[(w1, w2)]
        n_xiix = self.bigram_fd[(w2, w3)]
        n_xxii = self.bigram_fd[(w3, w4)]
        # Three wildcards: single-word marginals.
        n_ixxx = self.word_fd[w1]
        n_xixx = self.word_fd[w2]
        n_xxix = self.word_fd[w3]
        n_xxxi = self.word_fd[w4]

        return score_fn(
            n_iiii,
            (n_iiix, n_iixi, n_ixii, n_xiii),
            (n_iixx, n_xiix, n_xxii),
            (n_ixxx, n_xixx, n_xxix, n_xxxi),
            n_all,
        )
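

# A minimal usage sketch (not exercised by the tests below): build the finder from a
# flat token list and rank quadgrams with a caller-supplied score_fn that accepts the
# (n_iiii, trigram counts, bigram counts, word counts, n_all) layout produced by
# score_ngram() above. The raw-frequency lambda is only an illustrative placeholder.
#
#     finder = QuadgramCollocationFinder.from_words(tokens)
#     finder.apply_freq_filter(2)
#     print(finder.nbest(lambda n4, n3, n2, n1, n_all: n4, 10))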


class CollocationTest(TestCase):
    fixtures = ["brownstoner.json"]

    def setUp(self):
        self.client = Client()

    def test_bigrams(self):
        # bigram_measures = nltk.collocations.BigramAssocMeasures()
        trigram_measures = nltk.collocations.TrigramAssocMeasures()

        tokens = [
            "Co-op",
            "of",
            "the",
            "day",
            "House",
            "of",
            "the",
            "day",
            "Condo",
            "of",
            "the",
            "day",
            "Development",
            "Watch",
            "Co-op",
            "of",
            "the",
            "day",
        ]

        finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)

        finder.apply_freq_filter(2)

        # return the 10 n-grams with the highest PMI
        print(finder.nbest(trigram_measures.pmi, 10))
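        # PMI (pointwise mutual information) rewards n-grams such as ("of", "the", "day")
        # whose words co-occur far more often than their individual frequencies predict.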

        titles = [
            "Co-op of the day",
            "Condo of the day",
            "Co-op of the day",
            "House of the day",
            "Development Watch",
            "Streetlevel",
        ]

        tokens = nltk.tokenize.word_tokenize(" ".join(titles))
        ngrams = nltk.ngrams(tokens, 4)
        # Keep only the 4-grams that occur at least twice across the joined titles.
        d = [key for key, group in groupby(sorted(ngrams)) if len(list(group)) >= 2]
        print(d)


class ClassifierTest(TestCase):
    fixtures = ["classifiers.json", "brownstoner.json"]

    def setUp(self):
        self.client = Client()

    #
    # def test_filter(self):
    #     user = User.objects.all()
    #     feed = Feed.objects.all()
    #
    #     management.call_command('loaddata', 'brownstoner.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
    #     management.call_command('loaddata', 'brownstoner2.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
    #     management.call_command('loaddata', 'gothamist1.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
    #     management.call_command('loaddata', 'gothamist2.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
    #
    #     stories = Story.objects.filter(story_feed=feed[1]).order_by('-story_date')[:100]
    #
    #     phrasefilter = PhraseFilter()
    #     for story in stories:
    #         # print story.story_title, story.id
    #         phrasefilter.run(story.story_title, story.id)
    #
    #     phrasefilter.pare_phrases()
    #     phrasefilter.print_phrases()
    #
    def test_train(self):
        # user = User.objects.all()
        # feed = Feed.objects.all()

        management.call_command("loaddata", "brownstoner.json", verbosity=0, commit=False, skip_checks=False)
        management.call_command(
            "refresh_feed", force=1, feed=1, single_threaded=True, daemonize=False, skip_checks=False
        )
        management.call_command("loaddata", "brownstoner2.json", verbosity=0, commit=False, skip_checks=False)
        management.call_command(
            "refresh_feed", force=1, feed=1, single_threaded=True, daemonize=False, skip_checks=False
        )

        stories = MStory.objects(story_feed_id=1)[:53]

        phrasefilter = PhraseFilter()
        for story in stories:
            # print story.story_title, story.id
            phrasefilter.run(story.story_title, story.id)

        phrasefilter.pare_phrases()
        phrases = phrasefilter.get_phrases()
        print(phrases)
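
        # The recurring title phrases become the vocabulary that the Tokenizer below
        # feeds to the Bayes classifier.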

        tokenizer = Tokenizer(phrases)
        classifier = Bayes(tokenizer)  # FisherClassifier(user[0], feed[0], phrases)

        classifier.train("good", "House of the Day: 393 Pacific St.")
        classifier.train("good", "House of the Day: 393 Pacific St.")
        classifier.train("good", "Condo of the Day: 393 Pacific St.")
        classifier.train("good", "Co-op of the Day: 393 Pacific St. #3")
        classifier.train("good", "Co-op of the Day: 393 Pacific St. #3")
        classifier.train("good", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Streetlevel: 393 Pacific St. #3")
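
        # Titles sharing a trained phrase should be pulled strongly toward that tag;
        # titles containing no trained phrase should produce no guess at all.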

        guess = dict(classifier.guess("Co-op of the Day: 413 Atlantic"))
        self.assertTrue(guess["good"] > 0.99)
        self.assertTrue("bad" not in guess)

        guess = dict(classifier.guess("House of the Day: 413 Atlantic"))
        self.assertTrue(guess["good"] > 0.99)
        self.assertTrue("bad" not in guess)

        guess = dict(classifier.guess("Development Watch: Yatta"))
        self.assertTrue(guess["bad"] > 0.7)
        self.assertTrue(guess["good"] < 0.3)

        guess = dict(classifier.guess("Development Watch: 393 Pacific St."))
        self.assertTrue(guess["bad"] > 0.7)
        self.assertTrue(guess["good"] < 0.3)

        guess = dict(classifier.guess("Streetlevel: 123 Carlton St."))
        self.assertTrue(guess["bad"] > 0.99)
        self.assertTrue("good" not in guess)

        guess = dict(classifier.guess("Extra, Extra"))
        self.assertTrue("bad" not in guess)
        self.assertTrue("good" not in guess)

        guess = dict(classifier.guess("Nothing doing: 393 Pacific St."))
        self.assertTrue("bad" not in guess)
        self.assertTrue("good" not in guess)