diff --git a/apps/analyzer/classifier.py b/apps/analyzer/classifier.py index 94e0e48ed..a9d74a7e0 100644 --- a/apps/analyzer/classifier.py +++ b/apps/analyzer/classifier.py @@ -1,12 +1,6 @@ -from django.contrib.auth.models import User -from apps.rss_feeds.models import Feed, Story -from apps.reader.models import UserSubscription, UserStory from apps.analyzer.models import Category, FeatureCategory from django.db.models.aggregates import Sum -import datetime -import re import math -import itertools class Classifier: diff --git a/apps/analyzer/feed_filter.py b/apps/analyzer/feed_filter.py index bff9bfd34..61dae9903 100644 --- a/apps/analyzer/feed_filter.py +++ b/apps/analyzer/feed_filter.py @@ -1,41 +1,41 @@ from django.contrib.auth.models import User from apps.rss_feeds.models import Feed, Story -from apps.reader.models import UserSubscription, UserStory +from apps.reader.models import UserSubscription from apps.analyzer.models import Category, FeatureCategory import datetime import re import math - def entry_features(self, entry): - splitter=re.compile('\\W*') - f={} - - # Extract the title words and annotate - titlewords=[s.lower() for s in splitter.split(entry['title']) - if len(s)>2 and len(s)<20] - - for w in titlewords: f['Title:'+w]=1 - - # Extract the summary words - summarywords=[s.lower() for s in splitter.split(entry['summary']) - if len(s)>2 and len(s)<20] +def entry_features(self, entry): + splitter=re.compile('\\W*') + f={} - # Count uppercase words - uc=0 - for i in range(len(summarywords)): - w=summarywords[i] - f[w]=1 - if w.isupper(): uc+=1 + # Extract the title words and annotate + titlewords=[s.lower() for s in splitter.split(entry['title']) + if len(s)>2 and len(s)<20] - # Get word pairs in summary as features - if i0.3: f['UPPERCASE']=1 - - return f + # Extract the summary words + summarywords=[s.lower() for s in splitter.split(entry['summary']) + if len(s)>2 and len(s)<20] + + # Count uppercase words + uc=0 + for i in range(len(summarywords)): + w=summarywords[i] + f[w]=1 + if w.isupper(): uc+=1 + + # Get word pairs in summary as features + if i0.3: f['UPPERCASE']=1 + + return f diff --git a/apps/reader/admin.py b/apps/reader/admin.py index 139807d9f..03daf35fc 100644 --- a/apps/reader/admin.py +++ b/apps/reader/admin.py @@ -1,7 +1,6 @@ -from apps.reader.models import UserSubscription, UserStory, UserSubscriptionFolders, Feature +from apps.reader.models import UserSubscription, UserSubscriptionFolders, Feature from django.contrib import admin admin.site.register(UserSubscription) admin.site.register(UserSubscriptionFolders) -admin.site.register(UserStory) admin.site.register(Feature) \ No newline at end of file diff --git a/apps/reader/models.py b/apps/reader/models.py index b2c6784ac..734c15afe 100644 --- a/apps/reader/models.py +++ b/apps/reader/models.py @@ -1,4 +1,5 @@ import datetime +import mongoengine as mongo from utils import log as logging from django.db import models from django.contrib.auth.models import User @@ -154,6 +155,14 @@ class UserStory(models.Model): verbose_name = "user story" unique_together = ("user", "feed", "story") +class MUserStory(mongo.Document): + """ + Stories read by the user. These are deleted as the mark_read_date for the + UserSubscription passes the UserStory date. + """ + + + class UserSubscriptionFolders(models.Model): """ A JSON list of folders and feeds for while a user has subscribed. The list diff --git a/apps/rss_feeds/management/commands/calculate_scores.py b/apps/rss_feeds/management/commands/calculate_scores.py index 71b4520ef..4ca0fbe97 100644 --- a/apps/rss_feeds/management/commands/calculate_scores.py +++ b/apps/rss_feeds/management/commands/calculate_scores.py @@ -1,10 +1,6 @@ from django.core.management.base import BaseCommand -from django.core.handlers.wsgi import WSGIHandler -from apps.rss_feeds.models import Feed, Story -from django.core.cache import cache -from apps.reader.models import UserSubscription, UserStory -from optparse import OptionParser, make_option -from utils import log as logging +from apps.reader.models import UserSubscription +from optparse import make_option import os import errno import re diff --git a/apps/rss_feeds/management/commands/mark_read.py b/apps/rss_feeds/management/commands/mark_read.py index 124345e2a..fa39444c0 100644 --- a/apps/rss_feeds/management/commands/mark_read.py +++ b/apps/rss_feeds/management/commands/mark_read.py @@ -1,13 +1,7 @@ from django.core.management.base import BaseCommand -from django.core.handlers.wsgi import WSGIHandler -from apps.rss_feeds.models import Feed, Story from django.contrib.auth.models import User -from django.core.cache import cache -from apps.reader.models import UserSubscription, UserStory -from optparse import OptionParser, make_option -import os -import errno -import re +from apps.reader.models import UserSubscription +from optparse import make_option import datetime class Command(BaseCommand): diff --git a/apps/rss_feeds/management/commands/refresh_feed.py b/apps/rss_feeds/management/commands/refresh_feed.py index d01f067c1..98e2c7a34 100644 --- a/apps/rss_feeds/management/commands/refresh_feed.py +++ b/apps/rss_feeds/management/commands/refresh_feed.py @@ -1,13 +1,7 @@ from django.core.management.base import BaseCommand -from django.core.handlers.wsgi import WSGIHandler -from apps.rss_feeds.models import Feed, Story -from django.core.cache import cache -from django.db.models import Q -from apps.reader.models import UserSubscription, UserStory -from optparse import OptionParser, make_option +from apps.rss_feeds.models import Feed +from optparse import make_option from utils.management_functions import daemonize -import os -import errno class Command(BaseCommand): option_list = BaseCommand.option_list + ( @@ -29,7 +23,4 @@ class Command(BaseCommand): def _refresh_feeds(self, feeds, force=False): for feed in feeds: - feed.update(force=force, single_threaded=True) - usersubs = UserSubscription.objects.filter( - feed=feed.id - ) \ No newline at end of file + feed.update(force=force, single_threaded=True) \ No newline at end of file diff --git a/apps/rss_feeds/management/commands/refresh_feeds.py b/apps/rss_feeds/management/commands/refresh_feeds.py index 377a8c364..2d0f6c0fd 100644 --- a/apps/rss_feeds/management/commands/refresh_feeds.py +++ b/apps/rss_feeds/management/commands/refresh_feeds.py @@ -12,6 +12,7 @@ class Command(BaseCommand): option_list = BaseCommand.option_list + ( make_option("-f", "--feed", default=None), make_option("-d", "--daemon", dest="daemonize", action="store_true"), + make_option("-F", "--force", dest="force", action="store_true"), make_option("-s", "--single_threaded", dest="single_threaded", action="store_true"), make_option('-t', '--timeout', type='int', default=10, help='Wait timeout in seconds when connecting to feeds.'), @@ -41,7 +42,10 @@ class Command(BaseCommand): socket.setdefaulttimeout(options['timeout']) feeds = Feed.objects.filter(next_scheduled_update__lte=now)#.order_by('?') - + + if options['force']: + feeds = Feed.objects.all() + num_workers = min(len(feeds), options['workerthreads']) if options['single_threaded']: num_workers = 1 diff --git a/apps/rss_feeds/migrations/bootstrap_mongo.py b/apps/rss_feeds/migrations/bootstrap_mongo.py new file mode 100644 index 000000000..0200eeb62 --- /dev/null +++ b/apps/rss_feeds/migrations/bootstrap_mongo.py @@ -0,0 +1,35 @@ +from pprint import pprint +from django.conf import settings +from apps.rss_feeds.models import Feed, Story, MStory, StoryAuthor +import mongoengine +import sys +from utils import json + +MONGO_DB = settings.MONGO_DB +db = mongoengine.connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT']) + +print "Mongo DB stories: %s" % MStory.objects().count() + +db.stories.drop() +print "Dropped! Mongo DB stories: %s" % MStory.objects().count() + + +print "Stories: %s" % Story.objects.all().count() + +pprint(db.stories.index_information()) + +feeds = Feed.objects.all().order_by('-average_stories_per_month') +for feed in feeds: + print "%-5s: %s" % (Story.objects.select_related('story_author', 'tags').filter(story_feed=feed).count(), + feed) + sys.stdout.flush() + + stories = Story.objects.filter(story_feed=feed).values() + for story in stories: + # story['story_tags'] = [tag.name for tag in Tag.objects.filter(story=story['id'])] + story['story_tags'] = json.decode(story['story_tags']) + del story['id'] + del story['story_author_id'] + MStory(**story).save() + +print "Mongo DB stories: %s" % MStory.objects().count() diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index c2408fda2..aa7932fed 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -119,10 +119,10 @@ class Feed(models.Model): def count_stories(self, verbose=False, lock=None): month_ago = datetime.datetime.now() - datetime.timedelta(days=30) - stories_last_month = Story.objects.filter(story_feed=self, story_date__gte=month_ago).count() + stories_last_month = MStory.objects(story_feed=self.pk, story_date__gte=month_ago).count() self.stories_last_month = stories_last_month - self.recount_feed(lock) + # self.recount_feed(lock) self.save(lock=lock) @@ -254,8 +254,7 @@ class Feed(models.Model): story_author_name = story.get('author'), story_permalink = story.get('link'), story_guid = story.get('guid') or story.get('id') or story.get('link'), - story_tags = self._shorten_and_encode_story_tags(story_tags), - # tags = story_tags + story_tags = story_tags ) try: s.save() @@ -264,8 +263,6 @@ class Feed(models.Model): except IntegrityError: ret_values[ENTRY_ERR] += 1 # print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e)) - # for tcat in story_tags: - # Tag.objects.get_or_create(feed=self, tag=tcat) elif existing_story and story_has_changed: # update story # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content))) @@ -296,10 +293,7 @@ class Feed(models.Model): existing_story['story_author_name'] = story.get('author') existing_story['story_permalink'] = story.get('link') existing_story['story_guid'] = story.get('guid') or story.get('id') or story.get('link') - existing_story['story_tags'] = self._shorten_and_encode_story_tags(story_tags) - # existing_story['tags'] = story_tags - # s.tags.clear() - # [s.tags.add(tcat) for tcat in story_tags] + existing_story['story_tags'] = story_tags try: db.stories.update({'_id': existing_story['_id']}, existing_story) ret_values[ENTRY_UPDATED] += 1 @@ -321,6 +315,7 @@ class Feed(models.Model): if not feed_tags: from apps.rss_feeds.models import Tag from django.db.models.aggregates import Count + # Map Reduce this all_tags = Tag.objects.filter(feed=self)\ .annotate(stories_count=Count('story'))\ .order_by('-stories_count')[:20] @@ -352,15 +347,7 @@ class Feed(models.Model): authors_list = json.decode(feed_authors) if feed_authors else [] if len(authors_list) > 1: self.save_popular_authors(authors_list[:-1]) - - def _shorten_and_encode_story_tags(self, story_tags): - encoded_tags = json.encode([t.name for t in story_tags]) - if len(encoded_tags) < 2000: - return encoded_tags - - if len(story_tags) > 1: - return self._shorten_and_encode_story_tags(story_tags[:-1]) - + def trim_feed(self): from apps.reader.models import UserStory stories_deleted_count = 0 @@ -389,11 +376,8 @@ class Feed(models.Model): user_stories_count) def get_stories(self, offset=0, limit=25, force=False): - if not force: - stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), []) - else: - stories = None - + stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), []) + if not stories or force: stories_db = MStory.objects(story_feed_id=self.pk)[offset:offset+limit] stories = self.format_stories(stories_db) @@ -407,8 +391,7 @@ class Feed(models.Model): # print "Formatting Stories: %s" % stories_db.count() for story_db in stories_db: story = {} - # story_tags = story_db.tags.all() - story['story_tags'] = (story_db.story_tags and json.decode(story_db.story_tags)) or [] + story['story_tags'] = story_db.story_tags # or [] story['short_parsed_date'] = format_story_link_date__short(story_db.story_date) story['long_parsed_date'] = format_story_link_date__long(story_db.story_date) story['story_date'] = story_db.story_date @@ -445,10 +428,7 @@ class Feed(models.Model): tagname = tagname.strip() if not tagname or tagname == ' ': continue - if not Tag.objects.filter(name=tagname, feed=self): - cobj = Tag(name=tagname, feed=self) - cobj.save() - fcat.append(Tag.objects.get(name=tagname, feed=self)) + fcat.append(tagname) return fcat def _exists_story(self, story=None, story_content=None, existing_stories=None): @@ -652,7 +632,7 @@ class Story(models.Model): class MStory(mongo.Document): '''A feed item''' - story_feed_id = mongo.IntField() + story_feed_id = mongo.IntField(unique_with='story_guid') story_date = mongo.DateTimeField() story_title = mongo.StringField(max_length=255) story_content = mongo.StringField() @@ -663,12 +643,13 @@ class MStory(mongo.Document): story_permalink = mongo.StringField() story_guid = mongo.StringField(primary_key=True) story_guid_hash = mongo.StringField(max_length=40) - story_tags = mongo.StringField(max_length=2000) - tags = mongo.ListField(mongo.StringField(max_length=100)) + story_tags = mongo.ListField(mongo.StringField(max_length=100)) meta = { 'collection': 'stories', - 'indexes': ['story_feed_id', 'story_date'] + 'indexes': ['story_feed_id', 'story_date', ('story_feed_id', '-story_date')], + 'ordering': ['-story_date'], + 'allow_inheritance': False, } class FeedUpdateHistory(models.Model): diff --git a/settings.py b/settings.py index 46f2be50f..bdba5c9eb 100644 --- a/settings.py +++ b/settings.py @@ -239,4 +239,4 @@ DEBUG_TOOLBAR_CONFIG = { # = Mongo = # ========= -connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT']) +MONGODB = connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT']) diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index d04158997..a5f12e2f9 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -22,7 +22,7 @@ import pymongo # Refresh feed code adapted from Feedjack. # http://feedjack.googlecode.com -VERSION = '0.4' +VERSION = '0.8' URL = 'http://www.newsblur.com/' USER_AGENT = 'NewsBlur Fetcher %s - %s' % (VERSION, URL) SLOWFEED_WARNING = 10 @@ -77,13 +77,12 @@ class FetchFeed: return identity class ProcessFeed: - def __init__(self, feed, fpf, options): + def __init__(self, feed, fpf, db, options): self.feed = feed self.options = options self.fpf = fpf self.lock = multiprocessing.Lock() - connection = pymongo.Connection(settings.MONGO_DB['HOST']) - self.db = connection[settings.MONGO_DB['NAME']] + self.db = db def process(self): """ Downloads and parses a feed. @@ -242,6 +241,9 @@ class Dispatcher: from django.db import connection connection.close() + MONGO_DB = settings.MONGO_DB + db = pymongo.Connection(host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])[MONGO_DB['NAME']] + current_process = multiprocessing.current_process() lock = multiprocessing.Lock() @@ -271,7 +273,7 @@ class Dispatcher: delta = datetime.datetime.now() - start_time if fetched_feed and ret_feed == FEED_OK: - pfeed = ProcessFeed(feed, fetched_feed, self.options) + pfeed = ProcessFeed(feed, fetched_feed, db, self.options) ret_feed, ret_entries = pfeed.process() if ret_entries.get(ENTRY_NEW): diff --git a/utils/json.py b/utils/json.py index 36451f12a..1bfc8b7a3 100644 --- a/utils/json.py +++ b/utils/json.py @@ -12,6 +12,8 @@ from django.db.models.query import QuerySet import sys def decode(data): + if not data: + return data return json.loads(data) def encode(data, *args, **kwargs): diff --git a/utils/munin/newsblur_stories.py b/utils/munin/newsblur_stories.py index fb1f36dd2..6e7311318 100755 --- a/utils/munin/newsblur_stories.py +++ b/utils/munin/newsblur_stories.py @@ -1,7 +1,7 @@ #!/usr/bin/env python from utils.munin.base import MuninGraph -from apps.rss_feeds.models import Story, Tag, StoryAuthor +from apps.rss_feeds.models import MStory, StoryAuthor from apps.reader.models import UserStory graph_config = { @@ -15,8 +15,7 @@ graph_config = { } metrics = { - 'stories': Story.objects.count(), - 'tags': Tag.objects.count(), + 'stories': MStory.objects().count(), 'authors': StoryAuthor.objects.count(), 'read_stories': UserStory.objects.count(), }