class tfidf:
    """The simplest TF-IDF library imaginable.

    Add documents with ``addDocument(doc_name, list_of_words)``; each is
    stored internally as a two-element list ``[doc_name, norm_freq_dict]``.
    Query with ``similarities(list_of_words)`` to get ``[doc_name, score]``
    pairs for every stored document.
    """

    def __init__(self):
        self.weighted = False   # reserved flag; not consulted anywhere yet
        self.documents = []     # [doc_name, normalized term-frequency dict]
        self.corpus_dict = {}   # raw term counts across the whole corpus

    def addDocument(self, doc_name, list_of_words):
        """Store *doc_name*'s normalized term frequencies.

        Also accumulates raw counts into ``corpus_dict``, which acts as the
        document-frequency denominator in ``similarities``.
        """
        doc_dict = {}
        for w in list_of_words:
            doc_dict[w] = doc_dict.get(w, 0.0) + 1.0
            self.corpus_dict[w] = self.corpus_dict.get(w, 0.0) + 1.0

        # Normalize by document length so long documents don't dominate.
        # Guarded: the original raised ZeroDivisionError on an empty list.
        length = float(len(list_of_words))
        if length:
            for k in doc_dict:
                doc_dict[k] /= length

        self.documents.append([doc_name, doc_dict])

    def similarities(self, list_of_words):
        """Return a list of [doc_name, similarity_score] pairs relative to
        *list_of_words*, one pair per stored document (unsorted)."""
        # Build and normalize the query's term-frequency dictionary.
        query_dict = {}
        for w in list_of_words:
            query_dict[w] = query_dict.get(w, 0.0) + 1.0

        length = float(len(list_of_words))
        if length:  # empty query now yields all-zero scores instead of raising
            for k in query_dict:
                query_dict[k] /= length

        # Score each document: terms shared with the query contribute their
        # query and document frequencies, each damped by corpus frequency.
        sims = []
        for doc_name, doc_dict in self.documents:
            score = 0.0
            for k, q_freq in query_dict.items():
                # dict.has_key() was removed in Python 3; `in` works in 2 and 3.
                if k in doc_dict:
                    score += (q_freq / self.corpus_dict[k]) + (doc_dict[k] / self.corpus_dict[k])
            sims.append([doc_name, score])

        return sims
query_dict[k] / length + + # computing the list of similarities + sims = [] + for doc in self.documents: + score = 0.0 + doc_dict = doc[1] + for k in query_dict: + if doc_dict.has_key(k): + score += (query_dict[k] / self.corpus_dict[k]) + (doc_dict[k] / self.corpus_dict[k]) + sims.append([doc[0], score]) + + return sims \ No newline at end of file diff --git a/apps/reader/models.py b/apps/reader/models.py index cf8434d5a..28bb2658d 100644 --- a/apps/reader/models.py +++ b/apps/reader/models.py @@ -2,10 +2,14 @@ import datetime import time import re import redis +from collections import defaultdict +from operator import itemgetter +from pprint import pprint from utils import log as logging from utils import json_functions as json from django.db import models, IntegrityError -from django.db.models import Q +from django.db.models import Q, F +from django.db.models import Count from django.conf import settings from django.contrib.auth.models import User from django.core.cache import cache @@ -15,6 +19,7 @@ from apps.reader.managers import UserSubscriptionManager from apps.rss_feeds.models import Feed, MStory, DuplicateFeed from apps.analyzer.models import MClassifierFeed, MClassifierAuthor, MClassifierTag, MClassifierTitle from apps.analyzer.models import apply_classifier_titles, apply_classifier_feeds, apply_classifier_authors, apply_classifier_tags +from apps.analyzer.tfidf import tfidf from utils.feed_functions import add_object_to_folder, chunks class UserSubscription(models.Model): @@ -865,7 +870,47 @@ class UserSubscription(models.Model): feed = Feed.get_by_id(feed_id) feed.set_next_scheduled_update() + @classmethod + def count_subscribers_to_other_subscriptions(cls, feed_id): + # feeds = defaultdict(int) + subscribing_users = cls.objects.filter(feed=feed_id).values('user', 'feed_opens').order_by('-feed_opens')[:25] + print "Got subscribing users" + subscribing_user_ids = [sub['user'] for sub in subscribing_users] + print "Got subscribing user ids" + cofeeds = 
cls.objects.filter(user__in=subscribing_user_ids).values('feed').annotate( + user_count=Count('user')).order_by('-user_count')[:200] + print "Got cofeeds: %s" % len(cofeeds) + # feed_subscribers = Feed.objects.filter(pk__in=[f['feed'] for f in cofeeds]).values('pk', 'num_subscribers') + # max_local_subscribers = float(max([f['user_count'] for f in cofeeds])) + # max_total_subscribers = float(max([f['num_subscribers'] for f in feed_subscribers])) + # feed_subscribers = dict([(s['pk'], float(s['num_subscribers'])) for s in feed_subscribers]) + # pctfeeds = [(f['feed'], + # f['user_count'], + # feed_subscribers[f['feed']], + # f['user_count']/max_total_subscribers, + # f['user_count']/max_local_subscribers, + # max_local_subscribers, + # max_total_subscribers) for f in cofeeds] + # print pctfeeds[:5] + # orderedpctfeeds = sorted(pctfeeds, key=lambda f: .5*f[3]+.5*f[4], reverse=True)[:8] + # pprint([(Feed.get_by_id(o[0]), o[1], o[2], o[3], o[4]) for o in orderedpctfeeds]) + users_by_feeds = {} + for feed in [f['feed'] for f in cofeeds]: + users_by_feeds[feed] = [u['user'] for u in cls.objects.filter(feed=feed, user__in=subscribing_user_ids).values('user')] + print "Got users_by_feeds" + + table = tfidf() + for feed in users_by_feeds.keys(): + table.addDocument(feed, users_by_feeds[feed]) + print "Got table" + + sorted_table = sorted(table.similarities(subscribing_user_ids), key=itemgetter(1), reverse=True)[:8] + pprint([(Feed.get_by_id(o[0]), o[1]) for o in sorted_table]) + + return table + # return cofeeds + class RUserStory: @classmethod