Further Mongo work. Stories are now saved correctly, with tags + authors. Still need to do UserStories.

This commit is contained in:
Samuel Clay 2010-08-21 20:42:38 -04:00
parent fa3be28b43
commit ac53f33788
14 changed files with 115 additions and 109 deletions

View file

@ -1,12 +1,6 @@
from django.contrib.auth.models import User
from apps.rss_feeds.models import Feed, Story
from apps.reader.models import UserSubscription, UserStory
from apps.analyzer.models import Category, FeatureCategory
from django.db.models.aggregates import Sum
import datetime
import re
import math
import itertools
class Classifier:

View file

@ -1,41 +1,41 @@
from django.contrib.auth.models import User
from apps.rss_feeds.models import Feed, Story
from apps.reader.models import UserSubscription, UserStory
from apps.reader.models import UserSubscription
from apps.analyzer.models import Category, FeatureCategory
import datetime
import re
import math
def entry_features(self, entry):
    """Extract a bag of classifier features from a feed entry.

    Args:
        entry: mapping with string values under the keys 'title',
            'summary' and 'publisher'.

    Returns:
        dict mapping each feature name to 1. Features are:
        'Title:<word>' for every title word, '<word>' for every summary
        word, '<word> <word>' for every pair of adjacent summary words,
        'Publisher:<publisher>' kept whole, and 'UPPERCASE' when more than
        30% of the summary words are written entirely in capitals.

    Raises:
        KeyError: if 'title', 'summary' or 'publisher' is missing.
    """
    # '\W+' rather than '\W*': a pattern that can match the empty string
    # makes re.split break the text into single characters on Python 3.7+.
    splitter = re.compile(r'\W+')

    def split_words(text):
        # Keep the original casing here; callers lower-case as needed.
        return [s for s in splitter.split(text) if 2 < len(s) < 20]

    features = {}

    # Extract the title words and annotate
    for word in split_words(entry['title']):
        features['Title:' + word.lower()] = 1

    # Extract the summary words
    raw_summary = split_words(entry['summary'])
    summarywords = [word.lower() for word in raw_summary]

    for i, word in enumerate(summarywords):
        features[word] = 1
        # Get word pairs in summary as features (slice of two words)
        if i < len(summarywords) - 1:
            features[' '.join(summarywords[i:i + 2])] = 1

    # Keep creator and publisher whole
    features['Publisher:' + entry['publisher']] = 1

    # UPPERCASE is a virtual word flagging too much shouting. Count on the
    # un-lowered words, and skip the ratio entirely for an empty summary.
    uppercase_count = sum(1 for word in raw_summary if word.isupper())
    if raw_summary and float(uppercase_count) / len(raw_summary) > 0.3:
        features['UPPERCASE'] = 1

    return features

View file

@ -1,7 +1,6 @@
from apps.reader.models import UserSubscription, UserStory, UserSubscriptionFolders, Feature
from apps.reader.models import UserSubscription, UserSubscriptionFolders, Feature
from django.contrib import admin
admin.site.register(UserSubscription)
admin.site.register(UserSubscriptionFolders)
admin.site.register(UserStory)
admin.site.register(Feature)

View file

@ -1,4 +1,5 @@
import datetime
import mongoengine as mongo
from utils import log as logging
from django.db import models
from django.contrib.auth.models import User
@ -154,6 +155,14 @@ class UserStory(models.Model):
verbose_name = "user story"
unique_together = ("user", "feed", "story")
class MUserStory(mongo.Document):
"""
Stories read by the user. These are deleted as the mark_read_date for the
UserSubscription passes the UserStory date.
"""
class UserSubscriptionFolders(models.Model):
"""
A JSON list of folders and feeds to which a user has subscribed. The list

View file

@ -1,10 +1,6 @@
from django.core.management.base import BaseCommand
from django.core.handlers.wsgi import WSGIHandler
from apps.rss_feeds.models import Feed, Story
from django.core.cache import cache
from apps.reader.models import UserSubscription, UserStory
from optparse import OptionParser, make_option
from utils import log as logging
from apps.reader.models import UserSubscription
from optparse import make_option
import os
import errno
import re

View file

@ -1,13 +1,7 @@
from django.core.management.base import BaseCommand
from django.core.handlers.wsgi import WSGIHandler
from apps.rss_feeds.models import Feed, Story
from django.contrib.auth.models import User
from django.core.cache import cache
from apps.reader.models import UserSubscription, UserStory
from optparse import OptionParser, make_option
import os
import errno
import re
from apps.reader.models import UserSubscription
from optparse import make_option
import datetime
class Command(BaseCommand):

View file

@ -1,13 +1,7 @@
from django.core.management.base import BaseCommand
from django.core.handlers.wsgi import WSGIHandler
from apps.rss_feeds.models import Feed, Story
from django.core.cache import cache
from django.db.models import Q
from apps.reader.models import UserSubscription, UserStory
from optparse import OptionParser, make_option
from apps.rss_feeds.models import Feed
from optparse import make_option
from utils.management_functions import daemonize
import os
import errno
class Command(BaseCommand):
option_list = BaseCommand.option_list + (
@ -29,7 +23,4 @@ class Command(BaseCommand):
def _refresh_feeds(self, feeds, force=False):
    """Update each feed in ``feeds`` exactly once.

    Args:
        feeds: iterable of objects exposing
            ``update(force=..., single_threaded=...)``.
        force: forwarded unchanged to every ``update`` call.
    """
    # One update per feed. No subscription lookup is done here: its result
    # was never used.
    for feed in feeds:
        feed.update(force=force, single_threaded=True)

View file

@ -12,6 +12,7 @@ class Command(BaseCommand):
option_list = BaseCommand.option_list + (
make_option("-f", "--feed", default=None),
make_option("-d", "--daemon", dest="daemonize", action="store_true"),
make_option("-F", "--force", dest="force", action="store_true"),
make_option("-s", "--single_threaded", dest="single_threaded", action="store_true"),
make_option('-t', '--timeout', type='int', default=10,
help='Wait timeout in seconds when connecting to feeds.'),
@ -41,7 +42,10 @@ class Command(BaseCommand):
socket.setdefaulttimeout(options['timeout'])
feeds = Feed.objects.filter(next_scheduled_update__lte=now)#.order_by('?')
if options['force']:
feeds = Feed.objects.all()
num_workers = min(len(feeds), options['workerthreads'])
if options['single_threaded']:
num_workers = 1

View file

@ -0,0 +1,35 @@
from pprint import pprint
from django.conf import settings
from apps.rss_feeds.models import Feed, Story, MStory, StoryAuthor
import mongoengine
import sys
from utils import json
# One-off migration script: empties the Mongo `stories` collection, then
# copies every relational Story row into it as an MStory document,
# feed by feed.
MONGO_DB = settings.MONGO_DB
# NOTE(review): assumes connect() returns a database handle exposing
# `.stories` -- confirm against the installed mongoengine version.
db = mongoengine.connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])

# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Mongo DB stories: %s" % MStory.objects().count())
# Start from an empty collection before copying stories in.
db.stories.drop()
print("Dropped! Mongo DB stories: %s" % MStory.objects().count())
print("Stories: %s" % Story.objects.all().count())
pprint(db.stories.index_information())

# Feeds with the most stories per month are migrated first.
feeds = Feed.objects.all().order_by('-average_stories_per_month')
for feed in feeds:
    # A plain count() needs no select_related(): no related rows are read.
    print("%-5s: %s" % (Story.objects.filter(story_feed=feed).count(), feed))
    sys.stdout.flush()

    # .values() yields plain dicts whose keys are passed to MStory(**story).
    stories = Story.objects.filter(story_feed=feed).values()
    for story in stories:
        # Tags are stored JSON-encoded on the relational row. decode()
        # presumably returns a falsy value for untagged stories (TODO
        # confirm); store an empty list in that case.
        story['story_tags'] = json.decode(story['story_tags']) or []
        # Drop the relational keys before building the Mongo document.
        del story['id']
        del story['story_author_id']
        MStory(**story).save()

print("Mongo DB stories: %s" % MStory.objects().count())

View file

@ -119,10 +119,10 @@ class Feed(models.Model):
def count_stories(self, verbose=False, lock=None):
month_ago = datetime.datetime.now() - datetime.timedelta(days=30)
stories_last_month = Story.objects.filter(story_feed=self, story_date__gte=month_ago).count()
stories_last_month = MStory.objects(story_feed=self.pk, story_date__gte=month_ago).count()
self.stories_last_month = stories_last_month
self.recount_feed(lock)
# self.recount_feed(lock)
self.save(lock=lock)
@ -254,8 +254,7 @@ class Feed(models.Model):
story_author_name = story.get('author'),
story_permalink = story.get('link'),
story_guid = story.get('guid') or story.get('id') or story.get('link'),
story_tags = self._shorten_and_encode_story_tags(story_tags),
# tags = story_tags
story_tags = story_tags
)
try:
s.save()
@ -264,8 +263,6 @@ class Feed(models.Model):
except IntegrityError:
ret_values[ENTRY_ERR] += 1
# print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
# for tcat in story_tags:
# Tag.objects.get_or_create(feed=self, tag=tcat)
elif existing_story and story_has_changed:
# update story
# logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))
@ -296,10 +293,7 @@ class Feed(models.Model):
existing_story['story_author_name'] = story.get('author')
existing_story['story_permalink'] = story.get('link')
existing_story['story_guid'] = story.get('guid') or story.get('id') or story.get('link')
existing_story['story_tags'] = self._shorten_and_encode_story_tags(story_tags)
# existing_story['tags'] = story_tags
# s.tags.clear()
# [s.tags.add(tcat) for tcat in story_tags]
existing_story['story_tags'] = story_tags
try:
db.stories.update({'_id': existing_story['_id']}, existing_story)
ret_values[ENTRY_UPDATED] += 1
@ -321,6 +315,7 @@ class Feed(models.Model):
if not feed_tags:
from apps.rss_feeds.models import Tag
from django.db.models.aggregates import Count
# Map Reduce this
all_tags = Tag.objects.filter(feed=self)\
.annotate(stories_count=Count('story'))\
.order_by('-stories_count')[:20]
@ -352,15 +347,7 @@ class Feed(models.Model):
authors_list = json.decode(feed_authors) if feed_authors else []
if len(authors_list) > 1:
self.save_popular_authors(authors_list[:-1])
def _shorten_and_encode_story_tags(self, story_tags):
encoded_tags = json.encode([t.name for t in story_tags])
if len(encoded_tags) < 2000:
return encoded_tags
if len(story_tags) > 1:
return self._shorten_and_encode_story_tags(story_tags[:-1])
def trim_feed(self):
from apps.reader.models import UserStory
stories_deleted_count = 0
@ -389,11 +376,8 @@ class Feed(models.Model):
user_stories_count)
def get_stories(self, offset=0, limit=25, force=False):
if not force:
stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), [])
else:
stories = None
stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), [])
if not stories or force:
stories_db = MStory.objects(story_feed_id=self.pk)[offset:offset+limit]
stories = self.format_stories(stories_db)
@ -407,8 +391,7 @@ class Feed(models.Model):
# print "Formatting Stories: %s" % stories_db.count()
for story_db in stories_db:
story = {}
# story_tags = story_db.tags.all()
story['story_tags'] = (story_db.story_tags and json.decode(story_db.story_tags)) or []
story['story_tags'] = story_db.story_tags # or []
story['short_parsed_date'] = format_story_link_date__short(story_db.story_date)
story['long_parsed_date'] = format_story_link_date__long(story_db.story_date)
story['story_date'] = story_db.story_date
@ -445,10 +428,7 @@ class Feed(models.Model):
tagname = tagname.strip()
if not tagname or tagname == ' ':
continue
if not Tag.objects.filter(name=tagname, feed=self):
cobj = Tag(name=tagname, feed=self)
cobj.save()
fcat.append(Tag.objects.get(name=tagname, feed=self))
fcat.append(tagname)
return fcat
def _exists_story(self, story=None, story_content=None, existing_stories=None):
@ -652,7 +632,7 @@ class Story(models.Model):
class MStory(mongo.Document):
'''A feed item'''
story_feed_id = mongo.IntField()
story_feed_id = mongo.IntField(unique_with='story_guid')
story_date = mongo.DateTimeField()
story_title = mongo.StringField(max_length=255)
story_content = mongo.StringField()
@ -663,12 +643,13 @@ class MStory(mongo.Document):
story_permalink = mongo.StringField()
story_guid = mongo.StringField(primary_key=True)
story_guid_hash = mongo.StringField(max_length=40)
story_tags = mongo.StringField(max_length=2000)
tags = mongo.ListField(mongo.StringField(max_length=100))
story_tags = mongo.ListField(mongo.StringField(max_length=100))
meta = {
'collection': 'stories',
'indexes': ['story_feed_id', 'story_date']
'indexes': ['story_feed_id', 'story_date', ('story_feed_id', '-story_date')],
'ordering': ['-story_date'],
'allow_inheritance': False,
}
class FeedUpdateHistory(models.Model):

View file

@ -239,4 +239,4 @@ DEBUG_TOOLBAR_CONFIG = {
# = Mongo =
# =========
connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])
MONGODB = connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])

View file

@ -22,7 +22,7 @@ import pymongo
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
VERSION = '0.4'
VERSION = '0.8'
URL = 'http://www.newsblur.com/'
USER_AGENT = 'NewsBlur Fetcher %s - %s' % (VERSION, URL)
SLOWFEED_WARNING = 10
@ -77,13 +77,12 @@ class FetchFeed:
return identity
class ProcessFeed:
def __init__(self, feed, fpf, options):
def __init__(self, feed, fpf, db, options):
self.feed = feed
self.options = options
self.fpf = fpf
self.lock = multiprocessing.Lock()
connection = pymongo.Connection(settings.MONGO_DB['HOST'])
self.db = connection[settings.MONGO_DB['NAME']]
self.db = db
def process(self):
""" Downloads and parses a feed.
@ -242,6 +241,9 @@ class Dispatcher:
from django.db import connection
connection.close()
MONGO_DB = settings.MONGO_DB
db = pymongo.Connection(host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])[MONGO_DB['NAME']]
current_process = multiprocessing.current_process()
lock = multiprocessing.Lock()
@ -271,7 +273,7 @@ class Dispatcher:
delta = datetime.datetime.now() - start_time
if fetched_feed and ret_feed == FEED_OK:
pfeed = ProcessFeed(feed, fetched_feed, self.options)
pfeed = ProcessFeed(feed, fetched_feed, db, self.options)
ret_feed, ret_entries = pfeed.process()
if ret_entries.get(ENTRY_NEW):

View file

@ -12,6 +12,8 @@ from django.db.models.query import QuerySet
import sys
def decode(data):
    """Parse a JSON document.

    Falsy input (``None``, an empty string, ...) is handed back unchanged
    instead of being passed to the JSON parser.
    """
    return json.loads(data) if data else data
def encode(data, *args, **kwargs):

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python
from utils.munin.base import MuninGraph
from apps.rss_feeds.models import Story, Tag, StoryAuthor
from apps.rss_feeds.models import MStory, StoryAuthor
from apps.reader.models import UserStory
graph_config = {
@ -15,8 +15,7 @@ graph_config = {
}
metrics = {
'stories': Story.objects.count(),
'tags': Tag.objects.count(),
'stories': MStory.objects().count(),
'authors': StoryAuthor.objects.count(),
'read_stories': UserStory.objects.count(),
}