Trying out a new tfidf matcher for subscriptions. Works OK, not great.

This commit is contained in:
Samuel Clay 2014-12-17 18:24:40 -08:00
parent 2e429dd293
commit 30117c78d6
3 changed files with 95 additions and 36 deletions

View file

@ -22,7 +22,6 @@ class Category(models.Model):
def __unicode__(self):
    """Return the category followed by its count in parentheses."""
    parts = (self.category, self.count)
    return '%s (%s)' % parts
class MClassifierTitle(mongo.Document):
user_id = mongo.IntField()

83
apps/analyzer/tfidf.py Normal file → Executable file
View file

@ -1,41 +1,56 @@
import zlib
import math
from operator import itemgetter
from apps.rss_feeds.models import MStory
from BeautifulSoup import BeautifulSoup
#!/usr/bin/env python
def freq(word, document):
    """Return how many whitespace-separated tokens of `document` equal `word`."""
    tokens = document.split()
    return tokens.count(word)
"""
The simplest TF-IDF library imaginable.
Add your documents as two-element lists `[docname, [list_of_words_in_the_document]]` with `addDocument(docname, list_of_words)`. Get a list of all the `[docname, similarity_score]` pairs relative to a document by calling `similarities([list_of_words])`.
See the README for a usage example.
"""
def wordCount(document):
    """Return the number of whitespace-separated tokens in `document`."""
    tokens = document.split()
    return len(tokens)
import sys
import os
def numDocsContaining(word, documentList):
    """Return how many documents in `documentList` contain `word` at least once."""
    return sum(1 for document in documentList if freq(word, document) > 0)
class tfidf:
    """Corpus of documents that can be scored against a list of words."""

    def __init__(self):
        """Start with an empty corpus."""
        # Word -> running occurrence total across every added document.
        self.corpus_dict = {}
        # One [doc_name, {word: normalized frequency}] entry per document.
        self.documents = []
        # Not read by any code visible in this file.
        self.weighted = False
def tf(word, document):
    """Return the fraction of tokens in `document` that equal `word`.

    Raises ZeroDivisionError when `document` contains no tokens.
    """
    occurrences = freq(word, document)
    total_tokens = float(wordCount(document))
    return occurrences / total_tokens
def addDocument(self, doc_name, list_of_words):
    """Add a document to the corpus as normalized word frequencies.

    Each word's occurrence count is divided by the number of words in the
    document, and the result is appended to `self.documents` as
    `[doc_name, {word: frequency}]`. Every occurrence also increments that
    word's running total in `self.corpus_dict`.
    """
    # building a dictionary of raw counts (and updating the corpus totals)
    doc_dict = {}
    for w in list_of_words:
        doc_dict[w] = doc_dict.get(w, 0.) + 1.0
        self.corpus_dict[w] = self.corpus_dict.get(w, 0.0) + 1.0

    # normalizing the dictionary; an empty word list leaves doc_dict empty,
    # so the zero length is never used as a divisor
    length = float(len(list_of_words))
    for k in doc_dict:
        doc_dict[k] = doc_dict[k] / length

    # add the normalized document to the corpus
    self.documents.append([doc_name, doc_dict])


def idf(word, documentList):
    """Return the log of (number of documents / documents containing `word`).

    Raises ZeroDivisionError when no document in `documentList` contains
    `word` (including when `documentList` is empty).
    """
    # Divide as float: with two ints Python 2 floor-divides, which collapses
    # ratios such as 3/2 to 1 and makes the logarithm 0.
    return math.log(float(len(documentList)) / numDocsContaining(word, documentList))


def tfidf(word, document, documentList):
    """Return tf(word, document) multiplied by idf(word, documentList)."""
    return tf(word, document) * idf(word, documentList)
if __name__ == '__main__':
    # Gather the lower-cased text nodes of every story in feed 184.
    documentList = []
    for story in MStory.objects(story_feed_id=184):
        markup = zlib.decompress(story.story_content_z)
        text_nodes = BeautifulSoup(markup).findAll(text=True)
        documentList.append(''.join(text_nodes).lower())

    # Score every word of the first story against the whole story list.
    documentNumber = 0
    target = documentList[documentNumber]
    words = {}
    for word in target.split(None):
        words[word] = tfidf(word, target, documentList)

    # Print the words, highest score first.
    ranked = sorted(words.items(), key=itemgetter(1), reverse=True)
    for word, score in ranked:
        print("%f <= %s" % (score, word))
def similarities(self, list_of_words):
    """Returns a list of all the [docname, similarity_score] pairs relative to a list of words.

    The query words are turned into relative frequencies the same way the
    stored documents are. For every stored document, each query word that
    also occurs in the document adds
    `query_freq / corpus_count + doc_freq / corpus_count` to its score.
    Documents sharing no word with the query score 0.0. Results follow the
    order in which documents were added; they are not sorted.
    """
    # building the query dictionary
    query_dict = {}
    for w in list_of_words:
        query_dict[w] = query_dict.get(w, 0.0) + 1.0

    # normalizing the query; an empty query leaves query_dict empty, so the
    # zero length is never used as a divisor
    length = float(len(list_of_words))
    for k in query_dict:
        query_dict[k] = query_dict[k] / length

    # computing the list of similarities
    sims = []
    for doc_name, doc_dict in self.documents:
        score = 0.0
        for k, query_weight in query_dict.items():
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if k in doc_dict:
                corpus_count = self.corpus_dict[k]
                score += (query_weight / corpus_count) + (doc_dict[k] / corpus_count)
        sims.append([doc_name, score])

    return sims

View file

@ -2,10 +2,14 @@ import datetime
import time
import re
import redis
from collections import defaultdict
from operator import itemgetter
from pprint import pprint
from utils import log as logging
from utils import json_functions as json
from django.db import models, IntegrityError
from django.db.models import Q
from django.db.models import Q, F
from django.db.models import Count
from django.conf import settings
from django.contrib.auth.models import User
from django.core.cache import cache
@ -15,6 +19,7 @@ from apps.reader.managers import UserSubscriptionManager
from apps.rss_feeds.models import Feed, MStory, DuplicateFeed
from apps.analyzer.models import MClassifierFeed, MClassifierAuthor, MClassifierTag, MClassifierTitle
from apps.analyzer.models import apply_classifier_titles, apply_classifier_feeds, apply_classifier_authors, apply_classifier_tags
from apps.analyzer.tfidf import tfidf
from utils.feed_functions import add_object_to_folder, chunks
class UserSubscription(models.Model):
@ -865,7 +870,47 @@ class UserSubscription(models.Model):
feed = Feed.get_by_id(feed_id)
feed.set_next_scheduled_update()
@classmethod
def count_subscribers_to_other_subscriptions(cls, feed_id):
    """Print the feeds most similar to `feed_id` by shared subscribers.

    Takes up to 25 subscriptions to `feed_id` (highest `feed_opens` first),
    finds up to 200 feeds those users subscribe to (most shared users
    first), adds each feed to a `tfidf` table as a "document" whose "words"
    are the ids of those users, and pretty-prints the 8 feeds scoring
    highest against the full user id list. Progress is printed along the
    way. Returns the populated `tfidf` table.
    """
    top_subscriptions = cls.objects.filter(feed=feed_id).values(
        'user', 'feed_opens').order_by('-feed_opens')[:25]
    print("Got subscribing users")
    subscribing_user_ids = [sub['user'] for sub in top_subscriptions]
    print("Got subscribing user ids")

    cofeeds = cls.objects.filter(user__in=subscribing_user_ids).values('feed').annotate(
        user_count=Count('user')).order_by('-user_count')[:200]
    print("Got cofeeds: %s" % len(cofeeds))

    # Earlier scoring experiment, left disabled by the author:
    # feeds = defaultdict(int)
    # feed_subscribers = Feed.objects.filter(pk__in=[f['feed'] for f in cofeeds]).values('pk', 'num_subscribers')
    # max_local_subscribers = float(max([f['user_count'] for f in cofeeds]))
    # max_total_subscribers = float(max([f['num_subscribers'] for f in feed_subscribers]))
    # feed_subscribers = dict([(s['pk'], float(s['num_subscribers'])) for s in feed_subscribers])
    # pctfeeds = [(f['feed'],
    #              f['user_count'],
    #              feed_subscribers[f['feed']],
    #              f['user_count']/max_total_subscribers,
    #              f['user_count']/max_local_subscribers,
    #              max_local_subscribers,
    #              max_total_subscribers) for f in cofeeds]
    # print pctfeeds[:5]
    # orderedpctfeeds = sorted(pctfeeds, key=lambda f: .5*f[3]+.5*f[4], reverse=True)[:8]
    # pprint([(Feed.get_by_id(o[0]), o[1], o[2], o[3], o[4]) for o in orderedpctfeeds])

    # For each co-subscribed feed, list which of the selected users subscribe to it.
    users_by_feeds = {}
    for cofeed in cofeeds:
        cofeed_id = cofeed['feed']
        rows = cls.objects.filter(feed=cofeed_id, user__in=subscribing_user_ids).values('user')
        users_by_feeds[cofeed_id] = [row['user'] for row in rows]
    print("Got users_by_feeds")

    table = tfidf()
    for cofeed_id, user_ids in users_by_feeds.items():
        table.addDocument(cofeed_id, user_ids)
    print("Got table")

    scores = table.similarities(subscribing_user_ids)
    sorted_table = sorted(scores, key=itemgetter(1), reverse=True)[:8]
    pprint([(Feed.get_by_id(o[0]), o[1]) for o in sorted_table])

    return table
    # return cofeeds
class RUserStory:
@classmethod