mirror of
https://github.com/viq/NewsBlur.git
synced 2025-09-18 21:43:31 +00:00
Trying out a new tfidf matcher for subscriptions. Works OK, not great.
This commit is contained in:
parent
2e429dd293
commit
30117c78d6
3 changed files with 95 additions and 36 deletions
|
@@ -22,7 +22,6 @@ class Category(models.Model):
|
|||
|
||||
def __unicode__(self):
|
||||
return '%s (%s)' % (self.category, self.count)
|
||||
|
||||
|
||||
class MClassifierTitle(mongo.Document):
|
||||
user_id = mongo.IntField()
|
||||
|
|
83
apps/analyzer/tfidf.py
Normal file → Executable file
83
apps/analyzer/tfidf.py
Normal file → Executable file
|
@@ -1,41 +1,56 @@
|
|||
import zlib
|
||||
import math
|
||||
from operator import itemgetter
|
||||
from apps.rss_feeds.models import MStory
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
#!/usr/bin/env python
|
||||
|
||||
def freq(word, document):
    """Return how many whitespace-delimited tokens of `document` equal `word`."""
    return sum(1 for token in document.split(None) if token == word)
|
||||
"""
|
||||
The simplest TF-IDF library imaginable.
|
||||
Add your documents as two-element lists `[docname, [list_of_words_in_the_document]]` with `addDocument(docname, list_of_words)`. Get a list of all the `[docname, similarity_score]` pairs relative to a document by calling `similarities([list_of_words])`.
|
||||
See the README for a usage example.
|
||||
"""
|
||||
|
||||
def wordCount(document):
    """Return the number of whitespace-delimited tokens in `document`."""
    tokens = document.split(None)
    return len(tokens)
|
||||
import sys
|
||||
import os
|
||||
|
||||
def numDocsContaining(word, documentList):
    """Return how many documents in `documentList` contain `word` at least once.

    A document "contains" the word when it appears as a whole
    whitespace-delimited token (the same tokenization `freq` uses).
    """
    return sum(1 for document in documentList if word in document.split(None))
|
||||
class tfidf:
    """A minimal tf-idf corpus.

    Feed it documents with ``addDocument(doc_name, list_of_words)`` and
    query it with ``similarities(list_of_words)``.
    """

    def __init__(self):
        # word -> raw occurrence count across every added document
        self.corpus_dict = {}
        # list of [doc_name, normalized word-frequency dict] entries
        self.documents = []
        # flag is never consulted in the visible code -- kept for compatibility
        self.weighted = False
|
||||
|
||||
def tf(word, document):
    """Term frequency of `word` in `document`: occurrences / total token count.

    Inlines the `freq` and `wordCount` helpers it originally called; like the
    original, raises ZeroDivisionError on a document with no tokens.
    """
    tokens = document.split(None)
    return tokens.count(word) / float(len(tokens))
|
||||
def addDocument(self, doc_name, list_of_words):
    # building a dictionary
    doc_dict = {}
    for w in list_of_words:
        doc_dict[w] = doc_dict.get(w, 0.) + 1.0
        self.corpus_dict[w] = self.corpus_dict.get(w, 0.0) + 1.0

# NOTE(review): diff-capture artifact.  The module-level helpers `idf` and
# `tfidf` below are lines REMOVED by this commit from the old version of the
# file; the capture interleaves them with the body of the new `addDocument`
# method.  The fragments marked "(addDocument continues)" belong to the
# method above, not to the helpers.
def idf(word, documentList):
    # NOTE(review): under Python 2 this is integer division -- the ratio is
    # truncated before math.log, skewing every idf value; a correct idf would
    # divide by float(numDocsContaining(word, documentList)).
    return math.log(len(documentList) / numDocsContaining(word,documentList))

    # (addDocument continues) normalizing the dictionary
    length = float(len(list_of_words))
    for k in doc_dict:
        doc_dict[k] = doc_dict[k] / length

def tfidf(word, document, documentList):
    # NOTE(review): this removed helper shares its name with the new `tfidf`
    # class added by the same commit.
    return (tf(word,document) * idf(word,documentList))

    # (addDocument continues) add the normalized document to the corpus
    self.documents.append([doc_name, doc_dict])
|
||||
|
||||
# Ad-hoc demo removed by this commit: scores every word of the first story of
# feed 184 against that feed's stories and prints them by descending tf-idf.
# Python 2 syntax (`print` statement); uses the removed module-level
# `tfidf(word, document, documentList)` helper, not the new `tfidf` class.
if __name__ == '__main__':
    stories = MStory.objects(story_feed_id=184)  # hard-coded demo feed id
    documentList = []
    for story in stories:
        # story_content_z is zlib-compressed HTML; strip markup and lowercase
        text = zlib.decompress(story.story_content_z)
        text = ''.join(BeautifulSoup(text).findAll(text=True)).lower()
        documentList.append(text)
    words = {}
    documentNumber = 0  # only the first story is scored
    for word in documentList[documentNumber].split(None):
        words[word] = tfidf(word,documentList[documentNumber],documentList)
    for item in sorted(words.items(), key=itemgetter(1), reverse=True):
        print "%f <= %s" % (item[1], item[0])
|
||||
def similarities(self, list_of_words):
    """Return [doc_name, similarity_score] pairs for a query word list.

    The query is turned into a normalized word-frequency dict and scored
    against every stored document: each shared word contributes
    (query freq + doc freq) / corpus-wide count, so words rare in the
    corpus dominate the score.

    Fix: replaces `doc_dict.has_key(k)` -- deprecated in Python 2 and
    removed in Python 3 -- with the equivalent `k in doc_dict` test.
    Behavior is otherwise unchanged (including the ZeroDivisionError on
    an empty query, matching the original).
    """
    # building the query dictionary
    query_dict = {}
    for w in list_of_words:
        query_dict[w] = query_dict.get(w, 0.0) + 1.0

    # normalizing the query
    length = float(len(list_of_words))
    for k in query_dict:
        query_dict[k] = query_dict[k] / length

    # computing the list of similarities
    sims = []
    for doc in self.documents:
        score = 0.0
        doc_dict = doc[1]
        for k in query_dict:
            if k in doc_dict:
                score += (query_dict[k] / self.corpus_dict[k]) + (doc_dict[k] / self.corpus_dict[k])
        sims.append([doc[0], score])

    return sims
|
|
@@ -2,10 +2,14 @@ import datetime
|
|||
import time
|
||||
import re
|
||||
import redis
|
||||
from collections import defaultdict
|
||||
from operator import itemgetter
|
||||
from pprint import pprint
|
||||
from utils import log as logging
|
||||
from utils import json_functions as json
|
||||
from django.db import models, IntegrityError
|
||||
from django.db.models import Q
|
||||
from django.db.models import Q, F
|
||||
from django.db.models import Count
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.core.cache import cache
|
||||
|
@@ -15,6 +19,7 @@ from apps.reader.managers import UserSubscriptionManager
|
|||
from apps.rss_feeds.models import Feed, MStory, DuplicateFeed
|
||||
from apps.analyzer.models import MClassifierFeed, MClassifierAuthor, MClassifierTag, MClassifierTitle
|
||||
from apps.analyzer.models import apply_classifier_titles, apply_classifier_feeds, apply_classifier_authors, apply_classifier_tags
|
||||
from apps.analyzer.tfidf import tfidf
|
||||
from utils.feed_functions import add_object_to_folder, chunks
|
||||
|
||||
class UserSubscription(models.Model):
|
||||
|
@@ -865,7 +870,47 @@ class UserSubscription(models.Model):
|
|||
feed = Feed.get_by_id(feed_id)
|
||||
feed.set_next_scheduled_update()
|
||||
|
||||
@classmethod
def count_subscribers_to_other_subscriptions(cls, feed_id):
    """Experimental: rank feeds similar to `feed_id` by shared subscribers.

    Takes the 25 most engaged subscribers of `feed_id` (by feed_opens),
    collects the 200 feeds those users most commonly subscribe to, then
    scores those feeds with tf-idf where each "document" is a feed's
    subscriber-id list and the "query" is the original feed's subscriber-id
    list.  Prints the top 8 matches and returns the tfidf table.

    NOTE(review): exploratory code -- bare Python 2 `print` debug statements
    and a commented-out earlier ranking attempt are left in place.
    """
    # feeds = defaultdict(int)
    # 25 most active subscribers of this feed, by open count
    subscribing_users = cls.objects.filter(feed=feed_id).values('user', 'feed_opens').order_by('-feed_opens')[:25]
    print "Got subscribing users"
    subscribing_user_ids = [sub['user'] for sub in subscribing_users]
    print "Got subscribing user ids"
    # the 200 feeds most subscribed-to among those users
    cofeeds = cls.objects.filter(user__in=subscribing_user_ids).values('feed').annotate(
        user_count=Count('user')).order_by('-user_count')[:200]
    print "Got cofeeds: %s" % len(cofeeds)
    # Earlier attempt, kept for reference: blend of local vs. global
    # subscriber share instead of tf-idf.
    # feed_subscribers = Feed.objects.filter(pk__in=[f['feed'] for f in cofeeds]).values('pk', 'num_subscribers')
    # max_local_subscribers = float(max([f['user_count'] for f in cofeeds]))
    # max_total_subscribers = float(max([f['num_subscribers'] for f in feed_subscribers]))
    # feed_subscribers = dict([(s['pk'], float(s['num_subscribers'])) for s in feed_subscribers])
    # pctfeeds = [(f['feed'],
    #              f['user_count'],
    #              feed_subscribers[f['feed']],
    #              f['user_count']/max_total_subscribers,
    #              f['user_count']/max_local_subscribers,
    #              max_local_subscribers,
    #              max_total_subscribers) for f in cofeeds]
    # print pctfeeds[:5]
    # orderedpctfeeds = sorted(pctfeeds, key=lambda f: .5*f[3]+.5*f[4], reverse=True)[:8]
    # pprint([(Feed.get_by_id(o[0]), o[1], o[2], o[3], o[4]) for o in orderedpctfeeds])

    # One query per cofeed (N+1 pattern, up to 200 queries): feed id -> the
    # subset of our 25 users who subscribe to that feed.
    users_by_feeds = {}
    for feed in [f['feed'] for f in cofeeds]:
        users_by_feeds[feed] = [u['user'] for u in cls.objects.filter(feed=feed, user__in=subscribing_user_ids).values('user')]
    print "Got users_by_feeds"

    # tf-idf over user-id lists: the "words" are subscriber ids
    # (see apps.analyzer.tfidf).
    table = tfidf()
    for feed in users_by_feeds.keys():
        table.addDocument(feed, users_by_feeds[feed])
    print "Got table"

    # Query with the original feed's subscriber ids; show the 8 closest feeds.
    sorted_table = sorted(table.similarities(subscribing_user_ids), key=itemgetter(1), reverse=True)[:8]
    pprint([(Feed.get_by_id(o[0]), o[1]) for o in sorted_table])

    return table
    # return cofeeds
|
||||
|
||||
class RUserStory:
|
||||
|
||||
@classmethod
|
||||
|
|
Loading…
Add table
Reference in a new issue