NewsBlur/apps/analyzer/lda.py

import zlib
from collections import defaultdict
from functools import reduce
from math import exp, log
from random import random

from bs4 import BeautifulSoup
from nltk import FreqDist

from apps.rss_feeds.models import MStory


def lgammln(xx):
"""
Returns the gamma function of xx.
Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
(Adapted from: Numerical Recipies in C.)
2024-04-24 09:43:56 -04:00
Usage: lgammln(xx)
2024-04-24 09:43:56 -04:00
Copied from stats.py by strang@nmr.mgh.harvard.edu
"""
2024-04-24 09:43:56 -04:00
coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516, 0.120858003e-2, -0.536382e-5]
x = xx - 1.0
tmp = x + 5.5
tmp = tmp - (x + 0.5) * log(tmp)
ser = 1.0
for j in range(len(coeff)):
x = x + 1
ser = ser + coeff[j] / x
return -tmp + log(2.50662827465 * ser)
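# Hypothetical sanity check for lgammln (illustrative only, never called by the
# sampler): for integer z, Gamma(z) = (z - 1)!, so lgammln(5.0) should be close
# to log(4!) = log(24).
def _check_lgammln():
    assert abs(lgammln(5.0) - log(24.0)) < 1e-6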
def log_sum(log_a, log_b):
    """
    Return log(exp(log_a) + exp(log_b)), computed stably by factoring out
    the larger of the two values.
    """
    if log_a < log_b:
        return log_b + log(1 + exp(log_a - log_b))
    else:
        return log_a + log(1 + exp(log_b - log_a))
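# Hypothetical check of log_sum (not used elsewhere): probabilities 0.25 and
# 0.75 sum to 1.0, so their log-space sum should be log(1.0) = 0.0.
def _check_log_sum():
    assert abs(log_sum(log(0.25), log(0.75))) < 1e-9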
def log_normalize(dist):
    """
    Normalize a list of log probabilities in place so that they sum to one
    (i.e. their log_sum is zero).
    """
    normalizer = reduce(log_sum, dist)
    for ii in range(len(dist)):
        dist[ii] -= normalizer
    return dist
def log_sample(dist):
    """
    Sample an index from a list of unnormalized log probabilities.
    """
    cutoff = random()
    dist = log_normalize(dist)

    current = 0.0
    for ii in range(len(dist)):
        current += exp(dist[ii])
        if current >= cutoff:
            return ii
    assert False, "Didn't choose anything: %f %f" % (cutoff, current)
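# Illustrative draw from unnormalized log weights (hypothetical helper, not
# part of the pipeline): with weights [log(1), log(3)], index 1 should be
# sampled roughly 75% of the time.
def _demo_log_sample():
    draws = [log_sample([log(1.0), log(3.0)]) for _ in range(1000)]
    print("Fraction of index 1: %f" % (sum(draws) / float(len(draws))))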
def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords

    stop = stopwords.words(lang)

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        # text = story.story_title
        text = "".join(BeautifulSoup(text, features="lxml").find_all(string=True)).lower()
        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in range(len(sections)):
            # Keep tokens that are not stopwords and consist only of lowercase letters
            docs["%s-%i" % (story.story_title, jj)] = [
                x
                for x in tokenizer.tokenize(sections[jj])
                if (x not in stop) and all(y in ascii_lowercase for y in x)
            ]
    return docs
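# Hypothetical example of calling create_data (assumes the NLTK stopwords
# corpus has been downloaded, e.g. nltk.download("stopwords")):
#
#   stories = MStory.objects(story_feed_id=199)
#   docs = create_data(stories, doc_limit=10)
#   # docs maps "<story title>-<section index>" -> list of lowercase tokens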
class LdaSampler:
    def __init__(self, num_topics, doc_smoothing=0.1, topic_smoothing=0.01):
        self._docs = defaultdict(FreqDist)
        self._topics = defaultdict(FreqDist)
        self._K = num_topics
        self._state = None

        # Dirichlet smoothing: alpha for doc-topic counts, lambda for topic-word counts
        self._alpha = doc_smoothing
        self._lambda = topic_smoothing
    def optimize_hyperparameters(self, samples=5, step=3.0):
        """
        Resample the smoothing hyperparameters with a simple slice sampler,
        using the model log likelihood as the target density.
        """
        rawParam = [log(self._alpha), log(self._lambda)]

        for ii in range(samples):
            lp_old = self.lhood(self._alpha, self._lambda)
            lp_new = log(random()) + lp_old
            print("OLD: %f\tNEW: %f at (%f, %f)" % (lp_old, lp_new, self._alpha, self._lambda))

            l = [x - random() * step for x in rawParam]
            r = [x + step for x in rawParam]

            for jj in range(100):
                rawParamNew = [l[x] + random() * (r[x] - l[x]) for x in range(len(rawParam))]
                trial_alpha, trial_lambda = [exp(x) for x in rawParamNew]
                lp_test = self.lhood(trial_alpha, trial_lambda)
                # print("TRYING: %f (need %f) at (%f, %f)" % (lp_test - lp_old, lp_new - lp_old, trial_alpha, trial_lambda))

                if lp_test > lp_new:
                    print(jj)
                    self._alpha = exp(rawParamNew[0])
                    self._lambda = exp(rawParamNew[1])
                    self._alpha_sum = self._alpha * self._K
                    self._lambda_sum = self._lambda * self._W
                    rawParam = [log(self._alpha), log(self._lambda)]
                    break
                else:
                    # Shrink the bracket around the current point and retry
                    for dd in range(len(rawParamNew)):
                        if rawParamNew[dd] < rawParam[dd]:
                            l[dd] = rawParamNew[dd]
                        else:
                            r[dd] = rawParamNew[dd]
                        assert l[dd] <= rawParam[dd]
                        assert r[dd] >= rawParam[dd]

            print("\nNew hyperparameters (%i): %f %f" % (jj, self._alpha, self._lambda))
    def lhood(self, doc_smoothing, voc_smoothing):
        """
        Log likelihood of the model under the given smoothing values: the
        collapsed Dirichlet-multinomial score of the doc-topic and
        topic-word count tables.
        """
        doc_sum = doc_smoothing * self._K
        voc_sum = voc_smoothing * self._W

        val = 0.0
        val += lgammln(doc_sum) * len(self._docs)
        val -= lgammln(doc_smoothing) * self._K * len(self._docs)
        for ii in self._docs:
            for jj in range(self._K):
                val += lgammln(doc_smoothing + self._docs[ii][jj])
            val -= lgammln(doc_sum + self._docs[ii].N())

        val += lgammln(voc_sum) * self._K
        val -= lgammln(voc_smoothing) * self._W * self._K
        for ii in self._topics:
            for jj in self._vocab:
                val += lgammln(voc_smoothing + self._topics[ii][jj])
            val -= lgammln(voc_sum + self._topics[ii].N())
        return val
    def initialize(self, data):
        """
        Data should be keyed by doc-id, values should be iterable.
        """
        self._alpha_sum = self._alpha * self._K

        self._state = defaultdict(dict)

        self._vocab = set([])
        for dd in data:
            for ww in range(len(data[dd])):
                # Learn all the words we'll see
                self._vocab.add(data[dd][ww])

                # Initialize the state to unassigned
                self._state[dd][ww] = -1

        self._W = len(self._vocab)
        self._lambda_sum = float(self._W) * self._lambda

        self._data = data
        print("Initialized vocab of size %i" % len(self._vocab))
    def prob(self, doc, word, topic):
        """
        Unnormalized log probability of assigning this word in this doc
        to the given topic, conditioned on all other assignments.
        """
        val = log(self._docs[doc][topic] + self._alpha)
        # This is constant across a document, so we don't need to compute this term
        # val -= log(self._docs[doc].N() + self._alpha_sum)
        val += log(self._topics[topic][word] + self._lambda)
        val -= log(self._topics[topic].N() + self._lambda_sum)
        return val
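    # For reference, prob() computes the standard collapsed Gibbs conditional
    # for LDA (up to the per-document constant it deliberately skips):
    #
    #   P(z = k | rest) ∝ (n_dk + alpha) * (n_kw + lambda) / (n_k + W * lambda)
    #
    # where n_dk is the count of words in doc d assigned to topic k, n_kw the
    # count of word w assigned to topic k, and n_k the total count for topic k.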
    def sample_word(self, doc, position):
        word = self._data[doc][position]

        old_topic = self._state[doc][position]
        if old_topic != -1:
            self.change_count(doc, word, old_topic, -1)

        probs = [self.prob(doc, word, x) for x in range(self._K)]
        new_topic = log_sample(probs)

        self.change_count(doc, word, new_topic, 1)
        self._state[doc][position] = new_topic
    def change_count(self, doc, word, topic, delta):
        # FreqDist is a Counter subclass in NLTK 3+, so counts can be adjusted
        # directly (the old FreqDist.inc() method no longer exists).
        self._docs[doc][topic] += delta
        self._topics[topic][word] += delta
    def sample(self, iterations=100, hyper_delay=10):
        assert self._state, "Call initialize() before sampling"
        for ii in range(iterations):
            for dd in self._data:
                for ww in range(len(self._data[dd])):
                    self.sample_word(dd, ww)
            print("Iteration %i %f" % (ii, self.lhood(self._alpha, self._lambda)))
            if hyper_delay > 0 and ii % hyper_delay == 0:
                self.optimize_hyperparameters()
    def print_topics(self, num_words=15):
        for ii in self._topics:
            # most_common() returns (word, count) pairs, highest counts first
            top_words = [word for word, _ in self._topics[ii].most_common(num_words)]
            print("%i:%s\n" % (ii, "\t".join(top_words)))
if __name__ == "__main__":
    stories = MStory.objects(story_feed_id=199)
    d = create_data(stories, doc_limit=250, delimiter="")

    lda = LdaSampler(5)
    lda.initialize(d)

    lda.sample(50)
    lda.print_topics()