mirror of https://github.com/viq/NewsBlur.git, synced 2025-08-05 16:49:45 +00:00
Adding TF/IDF and LDA (semantic analysis) algorithms.
This commit is contained in:
parent bccb8e16bc
commit eeb322f3b4
2 changed files with 275 additions and 0 deletions
234  apps/analyzer/lda.py  Normal file
@@ -0,0 +1,234 @@
from BeautifulSoup import BeautifulSoup
from glob import glob
from collections import defaultdict
from math import log, exp
from random import random
import zlib
from apps.rss_feeds.models import MStory
from nltk import FreqDist


def lgammln(xx):
    """
    Returns the natural log of the gamma function of xx.
    Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
    (Adapted from: Numerical Recipes in C.)

    Usage: lgammln(xx)

    Copied from stats.py by strang@nmr.mgh.harvard.edu
    """

    coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
             0.120858003e-2, -0.536382e-5]
    x = xx - 1.0
    tmp = x + 5.5
    tmp = tmp - (x + 0.5) * log(tmp)
    ser = 1.0
    for j in range(len(coeff)):
        x = x + 1
        ser = ser + coeff[j] / x
    return -tmp + log(2.50662827465 * ser)

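# log_sum and log_normalize work in log space throughout, using the
# log-sum-exp identity log(e^a + e^b) = b + log(1 + e^(a-b)) for b >= a,
# so that normalizing very small probabilities does not underflow.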
def log_sum(log_a, log_b):
    if log_a < log_b:
        return log_b + log(1 + exp(log_a - log_b))
    else:
        return log_a + log(1 + exp(log_b - log_a))


def log_normalize(dist):
    normalizer = reduce(log_sum, dist)
    for ii in xrange(len(dist)):
        dist[ii] -= normalizer
    return dist

def log_sample(dist):
    """
    Sample an index from a list of unnormalized log probabilities
    """
    cutoff = random()
    dist = log_normalize(dist)

    current = 0
    for ii in xrange(len(dist)):
        current += exp(dist[ii])
        if current >= cutoff:
            return ii
    assert False, "Didn't choose anything: %f %f" % (cutoff, current)

def create_data(stories, lang="english", doc_limit=-1, delimiter=""):
    from nltk.tokenize.treebank import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()

    from nltk.corpus import stopwords
    stop = stopwords.words(lang)

    from string import ascii_lowercase

    docs = {}
    print("Found %i stories" % stories.count())
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        # text = story.story_title
        text = ''.join(BeautifulSoup(text).findAll(text=True)).lower()
        if delimiter:
            sections = text.split(delimiter)
        else:
            sections = [text]

        if doc_limit > 0 and len(docs) > doc_limit:
            print("Passed doc limit %i" % len(docs))
            break
        print(story.story_title, len(sections))

        for jj in xrange(len(sections)):
            # Keep tokens that aren't stopwords and consist only of
            # lowercase ascii letters (drops numbers and punctuation).
            docs["%s-%i" % (story.story_title, jj)] = \
                [x for x in tokenizer.tokenize(sections[jj])
                 if x not in stop and all(y in ascii_lowercase for y in x)]
    return docs


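# LdaSampler implements collapsed Gibbs sampling for LDA: the per-document
# topic distributions and per-topic word distributions are integrated out,
# so the sampler only tracks topic counts per document (self._docs), word
# counts per topic (self._topics), and each token's current topic
# assignment (self._state).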
class LdaSampler:
    def __init__(self, num_topics, doc_smoothing=0.1, topic_smoothing=0.01):
        self._docs = defaultdict(FreqDist)
        self._topics = defaultdict(FreqDist)
        self._K = num_topics
        self._state = None

        self._alpha = doc_smoothing
        self._lambda = topic_smoothing

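    # Hyperparameter re-estimation by slice sampling on log(alpha) and
    # log(lambda): pick a log-likelihood threshold lp_new below the current
    # value, then repeatedly shrink a random interval around the current
    # parameters until a point above the threshold is found.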
    def optimize_hyperparameters(self, samples=5, step=3.0):
        rawParam = [log(self._alpha), log(self._lambda)]

        for ii in xrange(samples):
            lp_old = self.lhood(self._alpha, self._lambda)
            lp_new = log(random()) + lp_old
            print("OLD: %f\tNEW: %f at (%f, %f)" % (lp_old, lp_new, self._alpha, self._lambda))

            l = [x - random() * step for x in rawParam]
            r = [x + step for x in rawParam]

            for jj in xrange(100):
                rawParamNew = [l[x] + random() * (r[x] - l[x]) for x in xrange(len(rawParam))]
                trial_alpha, trial_lambda = [exp(x) for x in rawParamNew]
                lp_test = self.lhood(trial_alpha, trial_lambda)
                # print("TRYING: %f (need %f) at (%f, %f)" % (lp_test - lp_old, lp_new - lp_old, trial_alpha, trial_lambda))

                if lp_test > lp_new:
                    self._alpha = exp(rawParamNew[0])
                    self._lambda = exp(rawParamNew[1])
                    self._alpha_sum = self._alpha * self._K
                    self._lambda_sum = self._lambda * self._W
                    rawParam = [log(self._alpha), log(self._lambda)]
                    break
                else:
                    # Shrink the interval around the current point and retry
                    for dd in xrange(len(rawParamNew)):
                        if rawParamNew[dd] < rawParam[dd]:
                            l[dd] = rawParamNew[dd]
                        else:
                            r[dd] = rawParamNew[dd]
                        assert l[dd] <= rawParam[dd]
                        assert r[dd] >= rawParam[dd]

            print("\nNew hyperparameters (%i): %f %f" % (jj, self._alpha, self._lambda))

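    # Log likelihood of the collapsed model: one Dirichlet-multinomial term
    # per document (over topics) plus one per topic (over words), written
    # out with log-gamma functions.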
    def lhood(self, doc_smoothing, voc_smoothing):
        doc_sum = doc_smoothing * self._K
        voc_sum = voc_smoothing * self._W

        val = 0.0
        val += lgammln(doc_sum) * len(self._docs)
        val -= lgammln(doc_smoothing) * self._K * len(self._docs)
        for ii in self._docs:
            for jj in xrange(self._K):
                val += lgammln(doc_smoothing + self._docs[ii][jj])
            val -= lgammln(doc_sum + self._docs[ii].N())

        val += lgammln(voc_sum) * self._K
        val -= lgammln(voc_smoothing) * self._W * self._K
        for ii in self._topics:
            for jj in self._vocab:
                val += lgammln(voc_smoothing + self._topics[ii][jj])
            val -= lgammln(voc_sum + self._topics[ii].N())
        return val

    def initialize(self, data):
        """
        Data should be keyed by doc-id; values should be iterables of tokens.
        """
        self._alpha_sum = self._alpha * self._K
        self._state = defaultdict(dict)

        self._vocab = set([])
        for dd in data:
            for ww in xrange(len(data[dd])):
                # Learn all the words we'll see
                self._vocab.add(data[dd][ww])

                # Initialize the state to unassigned
                self._state[dd][ww] = -1

        self._W = len(self._vocab)
        self._lambda_sum = float(self._W) * self._lambda

        self._data = data

        print("Initialized vocab of size %i" % len(self._vocab))

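    # The (unnormalized) log of the collapsed Gibbs conditional for
    # assigning `word` in `doc` to `topic`:
    #   P(topic | rest) is proportional to
    #   (n_doc_topic + alpha) * (n_topic_word + lambda) / (n_topic + W * lambda)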
    def prob(self, doc, word, topic):
        val = log(self._docs[doc][topic] + self._alpha)
        # This term is constant across a document, so we don't need to compute it:
        # val -= log(self._docs[doc].N() + self._alpha_sum)

        val += log(self._topics[topic][word] + self._lambda)
        val -= log(self._topics[topic].N() + self._lambda_sum)

        return val

    def sample_word(self, doc, position):
        word = self._data[doc][position]

        # Remove the old assignment from the counts before resampling
        old_topic = self._state[doc][position]
        if old_topic != -1:
            self.change_count(doc, word, old_topic, -1)

        probs = [self.prob(doc, word, x) for x in xrange(self._K)]
        new_topic = log_sample(probs)

        self.change_count(doc, word, new_topic, 1)
        self._state[doc][position] = new_topic

    def change_count(self, doc, word, topic, delta):
        # NOTE: FreqDist.inc() is the NLTK 2.x API; NLTK 3 removed it.
        self._docs[doc].inc(topic, delta)
        self._topics[topic].inc(word, delta)

    def sample(self, iterations=100, hyper_delay=10):
        assert self._state, "Must call initialize() before sampling"
        for ii in xrange(iterations):
            for dd in self._data:
                for ww in xrange(len(self._data[dd])):
                    self.sample_word(dd, ww)
            print("Iteration %i %f" % (ii, self.lhood(self._alpha, self._lambda)))
            if hyper_delay >= 0 and ii % hyper_delay == 0:
                self.optimize_hyperparameters()

    def print_topics(self, num_words=15):
        # In NLTK 2.x, FreqDist.keys() is sorted by decreasing frequency,
        # so this prints the most frequent words for each topic.
        for ii in self._topics:
            print("%i:%s\n" % (ii, "\t".join(self._topics[ii].keys()[:num_words])))


if __name__ == "__main__":
|
||||
stories = MStory.objects(story_feed_id=199)
|
||||
d = create_data(stories, doc_limit=250, delimiter="")
|
||||
lda = LdaSampler(5)
|
||||
lda.initialize(d)
|
||||
|
||||
lda.sample(50)
|
||||
lda.print_topics()
|
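
The __main__ block above depends on NewsBlur's MStory model and a live database. As a quick way to exercise the sampler without either, here is a minimal sketch on a hypothetical in-memory corpus, assuming the Python 2 / NLTK 2.x environment this code targets (the toy documents and parameters are illustrative, not from the commit):

    # Toy corpus standing in for create_data() output; keys are doc ids,
    # values are token lists. Purely illustrative.
    docs = {"doc-0": ["apple", "banana", "apple", "fruit"],
            "doc-1": ["python", "code", "python", "bug"],
            "doc-2": ["fruit", "banana", "smoothie"]}

    lda = LdaSampler(2)             # two topics
    lda.initialize(docs)
    lda.sample(10, hyper_delay=-1)  # negative delay skips hyperparameter moves
    lda.print_topics(num_words=3)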
41  apps/analyzer/tfidf.py  Normal file
@@ -0,0 +1,41 @@
import zlib
import math
from operator import itemgetter
from apps.rss_feeds.models import MStory
from BeautifulSoup import BeautifulSoup

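# tf-idf scores a word highly when it is frequent within one document (tf)
# but rare across the document collection (idf):
#   tfidf(w, d, D) = (count(w, d) / len(d)) * log(|D| / |{d in D : w in d}|)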
def freq(word, document):
    return document.split().count(word)


def wordCount(document):
    return len(document.split())


def numDocsContaining(word, documentList):
    count = 0
    for document in documentList:
        if freq(word, document) > 0:
            count += 1
    return count

def tf(word, document):
    return freq(word, document) / float(wordCount(document))


def idf(word, documentList):
    # float() avoids Python 2 integer division truncating the ratio
    return math.log(len(documentList) / float(numDocsContaining(word, documentList)))


def tfidf(word, document, documentList):
    return tf(word, document) * idf(word, documentList)

if __name__ == '__main__':
    stories = MStory.objects(story_feed_id=184)
    documentList = []
    for story in stories:
        text = zlib.decompress(story.story_content_z)
        text = ''.join(BeautifulSoup(text).findAll(text=True)).lower()
        documentList.append(text)
    words = {}
    documentNumber = 0
    for word in documentList[documentNumber].split():
        words[word] = tfidf(word, documentList[documentNumber], documentList)
    for item in sorted(words.items(), key=itemgetter(1), reverse=True):
        print("%f <= %s" % (item[1], item[0]))