mirror of
https://github.com/viq/NewsBlur.git
synced 2025-09-18 21:43:31 +00:00
Trying out a new tfidf matcher for subscriptions. Works OK, not great.
This commit is contained in:
parent
2e429dd293
commit
30117c78d6
3 changed files with 95 additions and 36 deletions
|
@@ -22,7 +22,6 @@ class Category(models.Model):
|
|||
|
||||
def __unicode__(self):
|
||||
return '%s (%s)' % (self.category, self.count)
|
||||
|
||||
|
||||
class MClassifierTitle(mongo.Document):
|
||||
user_id = mongo.IntField()
|
||||
|
|
83
apps/analyzer/tfidf.py
Normal file → Executable file
83
apps/analyzer/tfidf.py
Normal file → Executable file
|
@@ -1,41 +1,56 @@
|
|||
import zlib
|
||||
import math
|
||||
from operator import itemgetter
|
||||
from apps.rss_feeds.models import MStory
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
#!/usr/bin/env python
|
||||
|
||||
def freq(word, document):
    """Return how many whitespace-delimited tokens of `document` equal `word`."""
    return sum(1 for token in document.split(None) if token == word)
|
||||
"""
|
||||
The simplest TF-IDF library imaginable.
|
||||
Add your documents as two-element lists `[docname, [list_of_words_in_the_document]]` with `addDocument(docname, list_of_words)`. Get a list of all the `[docname, similarity_score]` pairs relative to a document by calling `similarities([list_of_words])`.
|
||||
See the README for a usage example.
|
||||
"""
|
||||
|
||||
def wordCount(document):
    """Return the number of whitespace-delimited tokens in `document`."""
    tokens = document.split(None)
    return len(tokens)
|
||||
import sys
|
||||
import os
|
||||
|
||||
def numDocsContaining(word, documentList):
    """Return how many documents in `documentList` contain `word` at least once.

    A document "contains" the word when it appears as a whole
    whitespace-delimited token (the same tokenization `freq` uses).
    """
    return sum(1 for document in documentList if word in document.split(None))
|
||||
class tfidf:
    """A minimal tf-idf corpus.

    Feed it documents with ``addDocument(doc_name, list_of_words)`` and
    query it with ``similarities(list_of_words)``.
    """

    def __init__(self):
        # word -> raw occurrence count across every added document
        self.corpus_dict = {}
        # list of [doc_name, normalized word-frequency dict] entries
        self.documents = []
        # flag is never consulted in the visible code -- kept for compatibility
        self.weighted = False
|
||||
|
||||
def tf(word, document):
    """Term frequency of `word` in `document`: occurrences / total token count.

    Inlines the `freq` and `wordCount` helpers it originally called; like the
    original, raises ZeroDivisionError on a document with no tokens.
    """
    tokens = document.split(None)
    return tokens.count(word) / float(len(tokens))
|
||||
def addDocument(self, doc_name, list_of_words):
    # building a dictionary
    doc_dict = {}
    for w in list_of_words:
        doc_dict[w] = doc_dict.get(w, 0.) + 1.0
        self.corpus_dict[w] = self.corpus_dict.get(w, 0.0) + 1.0

# NOTE(review): diff-capture artifact.  The module-level helpers `idf` and
# `tfidf` below are lines REMOVED by this commit from the old version of the
# file; the capture interleaves them with the body of the new `addDocument`
# method.  The fragments marked "(addDocument continues)" belong to the
# method above, not to the helpers.
def idf(word, documentList):
    # NOTE(review): under Python 2 this is integer division -- the ratio is
    # truncated before math.log, skewing every idf value; a correct idf would
    # divide by float(numDocsContaining(word, documentList)).
    return math.log(len(documentList) / numDocsContaining(word,documentList))

    # (addDocument continues) normalizing the dictionary
    length = float(len(list_of_words))
    for k in doc_dict:
        doc_dict[k] = doc_dict[k] / length

def tfidf(word, document, documentList):
    # NOTE(review): this removed helper shares its name with the new `tfidf`
    # class added by the same commit.
    return (tf(word,document) * idf(word,documentList))

    # (addDocument continues) add the normalized document to the corpus
    self.documents.append([doc_name, doc_dict])
|
||||
|
||||
# Ad-hoc demo removed by this commit: scores every word of the first story of
# feed 184 against that feed's stories and prints them by descending tf-idf.
# Python 2 syntax (`print` statement); uses the removed module-level
# `tfidf(word, document, documentList)` helper, not the new `tfidf` class.
if __name__ == '__main__':
    stories = MStory.objects(story_feed_id=184)  # hard-coded demo feed id
    documentList = []
    for story in stories:
        # story_content_z is zlib-compressed HTML; strip markup and lowercase
        text = zlib.decompress(story.story_content_z)
        text = ''.join(BeautifulSoup(text).findAll(text=True)).lower()
        documentList.append(text)
    words = {}
    documentNumber = 0  # only the first story is scored
    for word in documentList[documentNumber].split(None):
        words[word] = tfidf(word,documentList[documentNumber],documentList)
    for item in sorted(words.items(), key=itemgetter(1), reverse=True):
        print "%f <= %s" % (item[1], item[0])
|
||||
def similarities(self, list_of_words):
    """Return [doc_name, similarity_score] pairs for a query word list.

    The query is turned into a normalized word-frequency dict and scored
    against every stored document: each shared word contributes
    (query freq + doc freq) / corpus-wide count, so words rare in the
    corpus dominate the score.

    Fix: replaces `doc_dict.has_key(k)` -- deprecated in Python 2 and
    removed in Python 3 -- with the equivalent `k in doc_dict` test.
    Behavior is otherwise unchanged (including the ZeroDivisionError on
    an empty query, matching the original).
    """
    # building the query dictionary
    query_dict = {}
    for w in list_of_words:
        query_dict[w] = query_dict.get(w, 0.0) + 1.0

    # normalizing the query
    length = float(len(list_of_words))
    for k in query_dict:
        query_dict[k] = query_dict[k] / length

    # computing the list of similarities
    sims = []
    for doc in self.documents:
        score = 0.0
        doc_dict = doc[1]
        for k in query_dict:
            if k in doc_dict:
                score += (query_dict[k] / self.corpus_dict[k]) + (doc_dict[k] / self.corpus_dict[k])
        sims.append([doc[0], score])

    return sims
|
|
@@ -2,10 +2,14 @@ import datetime
|
|||
import time
|
||||
import re
|
||||
import redis
|
||||
from collections import defaultdict
|
||||
from operator import itemgetter
|
||||
from pprint import pprint
|
||||
from utils import log as logging
|
||||
from utils import json_functions as json
|
||||
from django.db import models, IntegrityError
|
||||
from django.db.models import Q
|
||||
from django.db.models import Q, F
|
||||
from django.db.models import Count
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.core.cache import cache
|
||||
|
@@ -15,6 +19,7 @@ from apps.reader.managers import UserSubscriptionManager
|
|||
from apps.rss_feeds.models import Feed, MStory, DuplicateFeed
|
||||
from apps.analyzer.models import MClassifierFeed, MClassifierAuthor, MClassifierTag, MClassifierTitle
|
||||
from apps.analyzer.models import apply_classifier_titles, apply_classifier_feeds, apply_classifier_authors, apply_classifier_tags
|
||||
from apps.analyzer.tfidf import tfidf
|
||||
from utils.feed_functions import add_object_to_folder, chunks
|
||||
|
||||
class UserSubscription(models.Model):
|
||||
|
@@ -865,7 +870,47 @@ class UserSubscription(models.Model):
|
|||
feed = Feed.get_by_id(feed_id)
|
||||
feed.set_next_scheduled_update()
|
||||
|
||||
@classmethod
def count_subscribers_to_other_subscriptions(cls, feed_id):
    """Experimental: rank feeds similar to `feed_id` by shared subscribers.

    Takes the 25 most engaged subscribers of `feed_id` (by feed_opens),
    collects the 200 feeds those users most commonly subscribe to, then
    scores those feeds with tf-idf where each "document" is a feed's
    subscriber-id list and the "query" is the original feed's subscriber-id
    list.  Prints the top 8 matches and returns the tfidf table.

    NOTE(review): exploratory code -- bare Python 2 `print` debug statements
    and a commented-out earlier ranking attempt are left in place.
    """
    # feeds = defaultdict(int)
    # 25 most active subscribers of this feed, by open count
    subscribing_users = cls.objects.filter(feed=feed_id).values('user', 'feed_opens').order_by('-feed_opens')[:25]
    print "Got subscribing users"
    subscribing_user_ids = [sub['user'] for sub in subscribing_users]
    print "Got subscribing user ids"
    # the 200 feeds most subscribed-to among those users
    cofeeds = cls.objects.filter(user__in=subscribing_user_ids).values('feed').annotate(
        user_count=Count('user')).order_by('-user_count')[:200]
    print "Got cofeeds: %s" % len(cofeeds)
    # Earlier attempt, kept for reference: blend of local vs. global
    # subscriber share instead of tf-idf.
    # feed_subscribers = Feed.objects.filter(pk__in=[f['feed'] for f in cofeeds]).values('pk', 'num_subscribers')
    # max_local_subscribers = float(max([f['user_count'] for f in cofeeds]))
    # max_total_subscribers = float(max([f['num_subscribers'] for f in feed_subscribers]))
    # feed_subscribers = dict([(s['pk'], float(s['num_subscribers'])) for s in feed_subscribers])
    # pctfeeds = [(f['feed'],
    #              f['user_count'],
    #              feed_subscribers[f['feed']],
    #              f['user_count']/max_total_subscribers,
    #              f['user_count']/max_local_subscribers,
    #              max_local_subscribers,
    #              max_total_subscribers) for f in cofeeds]
    # print pctfeeds[:5]
    # orderedpctfeeds = sorted(pctfeeds, key=lambda f: .5*f[3]+.5*f[4], reverse=True)[:8]
    # pprint([(Feed.get_by_id(o[0]), o[1], o[2], o[3], o[4]) for o in orderedpctfeeds])

    # One query per cofeed (N+1 pattern, up to 200 queries): feed id -> the
    # subset of our 25 users who subscribe to that feed.
    users_by_feeds = {}
    for feed in [f['feed'] for f in cofeeds]:
        users_by_feeds[feed] = [u['user'] for u in cls.objects.filter(feed=feed, user__in=subscribing_user_ids).values('user')]
    print "Got users_by_feeds"

    # tf-idf over user-id lists: the "words" are subscriber ids
    # (see apps.analyzer.tfidf).
    table = tfidf()
    for feed in users_by_feeds.keys():
        table.addDocument(feed, users_by_feeds[feed])
    print "Got table"

    # Query with the original feed's subscriber ids; show the 8 closest feeds.
    sorted_table = sorted(table.similarities(subscribing_user_ids), key=itemgetter(1), reverse=True)[:8]
    pprint([(Feed.get_by_id(o[0]), o[1]) for o in sorted_table])

    return table
    # return cofeeds
|
||||
|
||||
class RUserStory:
|
||||
|
||||
@classmethod
|
||||
|
|
Loading…
Add table
Reference in a new issue