from itertools import groupby
# from apps.analyzer.classifier import FisherClassifier
import nltk
from django.core import management
from django.test import TestCase
from django.test.client import Client
from apps.analyzer.phrase_filter import PhraseFilter
from apps.analyzer.tokenizer import Tokenizer
from apps.rss_feeds.models import MStory
from vendor.reverend.thomas import Bayes
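
# These tests exercise NewsBlur's story-title analysis: n-gram collocation finding with
# NLTK and naive-Bayes title classification backed by the vendored reverend library.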


class QuadgramCollocationFinder(nltk.collocations.AbstractCollocationFinder):
    """A tool for finding and ranking quadgram collocations or other association measures.

    It is often useful to use from_words() rather than constructing an instance directly.
    """

    def __init__(self, word_fd, quadgram_fd, trigram_fd, bigram_fd, wildcard_fd):
        """Construct a QuadgramCollocationFinder, given FreqDists for appearances of
        words, quadgrams, trigrams, bigrams, and gapped triples (three of the four
        words with one inner position skipped)."""
        nltk.collocations.AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
        self.trigram_fd = trigram_fd
        self.bigram_fd = bigram_fd
        self.wildcard_fd = wildcard_fd

    @classmethod
    def from_words(cls, words):
        # One pass over all 4-token windows collects every FreqDist the finder needs.
        wfd = nltk.probability.FreqDist()
        qfd = nltk.probability.FreqDist()
        tfd = nltk.probability.FreqDist()
        bfd = nltk.probability.FreqDist()
        wildfd = nltk.probability.FreqDist()

        for w1, w2, w3, w4 in nltk.util.ngrams(words, 4, pad_right=True):
            wfd[w1] += 1
            if w4 is None:
                continue
            qfd[(w1, w2, w3, w4)] += 1
            bfd[(w1, w2)] += 1
            tfd[(w1, w2, w3)] += 1
            wildfd[(w1, w3, w4)] += 1
            wildfd[(w1, w2, w4)] += 1

        return cls(wfd, qfd, tfd, bfd, wildfd)
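
    # For example, from_words(["a", "b", "c", "d", "e"]) counts the quadgrams
    # ("a", "b", "c", "d") and ("b", "c", "d", "e"), their leading bigrams and trigrams,
    # and gapped triples such as ("a", "c", "d") and ("a", "b", "d").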

    def score_ngram(self, score_fn, w1, w2, w3, w4):
        n_all = self.word_fd.N()
        n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
        if not n_iiii:
            return
        # Contingency counts use an i/x naming scheme: "i" marks a position fixed to the
        # given word, "x" marks a wildcard. Only counts that from_words() actually tracks
        # are available; pairs with a gap between them are not collected.
        # One wildcard: leading/trailing trigrams and the gapped triples.
        n_iiix = self.trigram_fd[(w1, w2, w3)]
        n_iixi = self.wildcard_fd[(w1, w2, w4)]
        n_ixii = self.wildcard_fd[(w1, w3, w4)]
        n_xiii = self.trigram_fd[(w2, w3, w4)]
        # Two wildcards: the adjacent pairs.
        n_iixx = self.bigram_fd[(w1, w2)]
        n_xiix = self.bigram_fd[(w2, w3)]
        n_xxii = self.bigram_fd[(w3, w4)]
        # Three wildcards: single-word marginals.
        n_ixxx = self.word_fd[w1]
        n_xixx = self.word_fd[w2]
        n_xxix = self.word_fd[w3]
        n_xxxi = self.word_fd[w4]

        return score_fn(
            n_iiii,
            (n_iiix, n_iixi, n_ixii, n_xiii),
            (n_iixx, n_xiix, n_xxii),
            (n_ixxx, n_xixx, n_xxix, n_xxxi),
            n_all,
        )
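

# A minimal usage sketch (not exercised by the tests below): build the finder from a
# flat token list and rank quadgrams with a caller-supplied score_fn that accepts the
# (n_iiii, trigram counts, bigram counts, word counts, n_all) layout produced by
# score_ngram() above. The raw-frequency lambda is only an illustrative placeholder.
#
#     finder = QuadgramCollocationFinder.from_words(tokens)
#     finder.apply_freq_filter(2)
#     print(finder.nbest(lambda n4, n3, n2, n1, n_all: n4, 10))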


class CollocationTest(TestCase):
    fixtures = ["brownstoner.json"]

    def setUp(self):
        self.client = Client()

    def test_bigrams(self):
        # bigram_measures = nltk.collocations.BigramAssocMeasures()
        trigram_measures = nltk.collocations.TrigramAssocMeasures()

        tokens = [
            "Co-op",
            "of",
            "the",
            "day",
            "House",
            "of",
            "the",
            "day",
            "Condo",
            "of",
            "the",
            "day",
            "Development",
            "Watch",
            "Co-op",
            "of",
            "the",
            "day",
        ]

        finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)

        finder.apply_freq_filter(2)

        # return the 10 n-grams with the highest PMI
        print(finder.nbest(trigram_measures.pmi, 10))
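        # PMI (pointwise mutual information) rewards n-grams such as ("of", "the", "day")
        # whose words co-occur far more often than their individual frequencies predict.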

        titles = [
            "Co-op of the day",
            "Condo of the day",
            "Co-op of the day",
            "House of the day",
            "Development Watch",
            "Streetlevel",
        ]

        tokens = nltk.tokenize.word_tokenize(" ".join(titles))
        ngrams = nltk.ngrams(tokens, 4)
        # Keep only the 4-grams that occur at least twice across the joined titles.
        d = [key for key, group in groupby(sorted(ngrams)) if len(list(group)) >= 2]
        print(d)


class ClassifierTest(TestCase):
    fixtures = ["classifiers.json", "brownstoner.json"]

    def setUp(self):
        self.client = Client()

    #
    # def test_filter(self):
    #     user = User.objects.all()
    #     feed = Feed.objects.all()
    #
    #     management.call_command('loaddata', 'brownstoner.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
    #     management.call_command('loaddata', 'brownstoner2.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
    #     management.call_command('loaddata', 'gothamist1.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
    #     management.call_command('loaddata', 'gothamist2.json', verbosity=0)
    #     response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
    #
    #     stories = Story.objects.filter(story_feed=feed[1]).order_by('-story_date')[:100]
    #
    #     phrasefilter = PhraseFilter()
    #     for story in stories:
    #         # print story.story_title, story.id
    #         phrasefilter.run(story.story_title, story.id)
    #
    #     phrasefilter.pare_phrases()
    #     phrasefilter.print_phrases()
    #
    def test_train(self):
        # user = User.objects.all()
        # feed = Feed.objects.all()

        management.call_command("loaddata", "brownstoner.json", verbosity=0, commit=False, skip_checks=False)
        management.call_command(
            "refresh_feed", force=1, feed=1, single_threaded=True, daemonize=False, skip_checks=False
        )
        management.call_command("loaddata", "brownstoner2.json", verbosity=0, commit=False, skip_checks=False)
        management.call_command(
            "refresh_feed", force=1, feed=1, single_threaded=True, daemonize=False, skip_checks=False
        )

        stories = MStory.objects(story_feed_id=1)[:53]

        phrasefilter = PhraseFilter()
        for story in stories:
            # print story.story_title, story.id
            phrasefilter.run(story.story_title, story.id)

        phrasefilter.pare_phrases()
        phrases = phrasefilter.get_phrases()
        print(phrases)
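
        # The recurring title phrases become the vocabulary that the Tokenizer below
        # feeds to the Bayes classifier.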

        tokenizer = Tokenizer(phrases)
        classifier = Bayes(tokenizer)  # FisherClassifier(user[0], feed[0], phrases)

        classifier.train("good", "House of the Day: 393 Pacific St.")
        classifier.train("good", "House of the Day: 393 Pacific St.")
        classifier.train("good", "Condo of the Day: 393 Pacific St.")
        classifier.train("good", "Co-op of the Day: 393 Pacific St. #3")
        classifier.train("good", "Co-op of the Day: 393 Pacific St. #3")
        classifier.train("good", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Development Watch: 393 Pacific St. #3")
        classifier.train("bad", "Streetlevel: 393 Pacific St. #3")
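
        # Titles sharing a trained phrase should be pulled strongly toward that tag;
        # titles containing no trained phrase should produce no guess at all.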

        guess = dict(classifier.guess("Co-op of the Day: 413 Atlantic"))
        self.assertTrue(guess["good"] > 0.99)
        self.assertTrue("bad" not in guess)

        guess = dict(classifier.guess("House of the Day: 413 Atlantic"))
        self.assertTrue(guess["good"] > 0.99)
        self.assertTrue("bad" not in guess)

        guess = dict(classifier.guess("Development Watch: Yatta"))
        self.assertTrue(guess["bad"] > 0.7)
        self.assertTrue(guess["good"] < 0.3)

        guess = dict(classifier.guess("Development Watch: 393 Pacific St."))
        self.assertTrue(guess["bad"] > 0.7)
        self.assertTrue(guess["good"] < 0.3)

        guess = dict(classifier.guess("Streetlevel: 123 Carlton St."))
        self.assertTrue(guess["bad"] > 0.99)
        self.assertTrue("good" not in guess)

        guess = dict(classifier.guess("Extra, Extra"))
        self.assertTrue("bad" not in guess)
        self.assertTrue("good" not in guess)

        guess = dict(classifier.guess("Nothing doing: 393 Pacific St."))
        self.assertTrue("bad" not in guess)
        self.assertTrue("good" not in guess)