mirror of
https://github.com/viq/NewsBlur.git
synced 2025-04-13 09:38:09 +00:00
Further Mongo work. Stories are now saved correctly, with tags + authors. Still need to do UserStories.
This commit is contained in:
parent
fa3be28b43
commit
ac53f33788
14 changed files with 115 additions and 109 deletions
|
@ -1,12 +1,6 @@
|
|||
from django.contrib.auth.models import User
|
||||
from apps.rss_feeds.models import Feed, Story
|
||||
from apps.reader.models import UserSubscription, UserStory
|
||||
from apps.analyzer.models import Category, FeatureCategory
|
||||
from django.db.models.aggregates import Sum
|
||||
import datetime
|
||||
import re
|
||||
import math
|
||||
import itertools
|
||||
|
||||
class Classifier:
|
||||
|
||||
|
|
|
@ -1,41 +1,41 @@
|
|||
from django.contrib.auth.models import User
|
||||
from apps.rss_feeds.models import Feed, Story
|
||||
from apps.reader.models import UserSubscription, UserStory
|
||||
from apps.reader.models import UserSubscription
|
||||
from apps.analyzer.models import Category, FeatureCategory
|
||||
import datetime
|
||||
import re
|
||||
import math
|
||||
|
||||
def entry_features(self, entry):
    """Build the feature dictionary used to classify a feed entry.

    ``entry`` is read by key: ``'title'``, ``'summary'`` and ``'publisher'``.

    Returns a dict whose values are all 1 and whose keys are:
      * ``'Title:<word>'`` for every (lowercased) title word,
      * every lowercased summary word,
      * every pair of adjacent summary words joined by a single space,
      * ``'Publisher:<publisher>'``,
      * ``'UPPERCASE'`` when more than 30% of the summary words are
        written entirely in upper case.

    Only words longer than 2 and shorter than 20 characters are used.
    """
    # Use \W+ rather than \W*: a pattern that can match the empty string
    # splits between every character on Python 3.7+.
    splitter = re.compile(r'\W+')

    def words(text):
        # Words of 3..19 characters, original case preserved.
        return [s for s in splitter.split(text) if 2 < len(s) < 20]

    f = {}

    # Extract the title words and annotate
    for w in words(entry['title']):
        f['Title:' + w.lower()] = 1

    # Extract the summary words. The uppercase count must be taken before
    # lowercasing, otherwise isupper() can never be true.
    rawwords = words(entry['summary'])
    uc = sum(1 for w in rawwords if w.isupper())
    summarywords = [w.lower() for w in rawwords]

    for i, w in enumerate(summarywords):
        f[w] = 1

        # Get word pairs in summary as features (slice of two, not one)
        if i < len(summarywords) - 1:
            f[' '.join(summarywords[i:i + 2])] = 1

    # Keep creator and publisher whole
    f['Publisher:' + entry['publisher']] = 1

    # UPPERCASE is a virtual word flagging too much shouting.
    # Guard against an empty summary to avoid dividing by zero.
    if summarywords and float(uc) / len(summarywords) > 0.3:
        f['UPPERCASE'] = 1

    return f
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from apps.reader.models import UserSubscription, UserStory, UserSubscriptionFolders, Feature
|
||||
from apps.reader.models import UserSubscription, UserSubscriptionFolders, Feature
|
||||
from django.contrib import admin
|
||||
|
||||
admin.site.register(UserSubscription)
|
||||
admin.site.register(UserSubscriptionFolders)
|
||||
admin.site.register(UserStory)
|
||||
admin.site.register(Feature)
|
|
@ -1,4 +1,5 @@
|
|||
import datetime
|
||||
import mongoengine as mongo
|
||||
from utils import log as logging
|
||||
from django.db import models
|
||||
from django.contrib.auth.models import User
|
||||
|
@ -154,6 +155,14 @@ class UserStory(models.Model):
|
|||
verbose_name = "user story"
|
||||
unique_together = ("user", "feed", "story")
|
||||
|
||||
class MUserStory(mongo.Document):
|
||||
"""
|
||||
Stories read by the user. These are deleted as the mark_read_date for the
|
||||
UserSubscription passes the UserStory date.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
class UserSubscriptionFolders(models.Model):
|
||||
"""
|
||||
A JSON list of folders and feeds to which a user has subscribed. The list
|
||||
|
|
|
@ -1,10 +1,6 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from django.core.handlers.wsgi import WSGIHandler
|
||||
from apps.rss_feeds.models import Feed, Story
|
||||
from django.core.cache import cache
|
||||
from apps.reader.models import UserSubscription, UserStory
|
||||
from optparse import OptionParser, make_option
|
||||
from utils import log as logging
|
||||
from apps.reader.models import UserSubscription
|
||||
from optparse import make_option
|
||||
import os
|
||||
import errno
|
||||
import re
|
||||
|
|
|
@ -1,13 +1,7 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from django.core.handlers.wsgi import WSGIHandler
|
||||
from apps.rss_feeds.models import Feed, Story
|
||||
from django.contrib.auth.models import User
|
||||
from django.core.cache import cache
|
||||
from apps.reader.models import UserSubscription, UserStory
|
||||
from optparse import OptionParser, make_option
|
||||
import os
|
||||
import errno
|
||||
import re
|
||||
from apps.reader.models import UserSubscription
|
||||
from optparse import make_option
|
||||
import datetime
|
||||
|
||||
class Command(BaseCommand):
|
||||
|
|
|
@ -1,13 +1,7 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from django.core.handlers.wsgi import WSGIHandler
|
||||
from apps.rss_feeds.models import Feed, Story
|
||||
from django.core.cache import cache
|
||||
from django.db.models import Q
|
||||
from apps.reader.models import UserSubscription, UserStory
|
||||
from optparse import OptionParser, make_option
|
||||
from apps.rss_feeds.models import Feed
|
||||
from optparse import make_option
|
||||
from utils.management_functions import daemonize
|
||||
import os
|
||||
import errno
|
||||
|
||||
class Command(BaseCommand):
|
||||
option_list = BaseCommand.option_list + (
|
||||
|
@ -29,7 +23,4 @@ class Command(BaseCommand):
|
|||
|
||||
def _refresh_feeds(self, feeds, force=False):
|
||||
for feed in feeds:
|
||||
feed.update(force=force, single_threaded=True)
|
||||
usersubs = UserSubscription.objects.filter(
|
||||
feed=feed.id
|
||||
)
|
||||
feed.update(force=force, single_threaded=True)
|
|
@ -12,6 +12,7 @@ class Command(BaseCommand):
|
|||
option_list = BaseCommand.option_list + (
|
||||
make_option("-f", "--feed", default=None),
|
||||
make_option("-d", "--daemon", dest="daemonize", action="store_true"),
|
||||
make_option("-F", "--force", dest="force", action="store_true"),
|
||||
make_option("-s", "--single_threaded", dest="single_threaded", action="store_true"),
|
||||
make_option('-t', '--timeout', type='int', default=10,
|
||||
help='Wait timeout in seconds when connecting to feeds.'),
|
||||
|
@ -41,7 +42,10 @@ class Command(BaseCommand):
|
|||
|
||||
socket.setdefaulttimeout(options['timeout'])
|
||||
feeds = Feed.objects.filter(next_scheduled_update__lte=now)#.order_by('?')
|
||||
|
||||
|
||||
if options['force']:
|
||||
feeds = Feed.objects.all()
|
||||
|
||||
num_workers = min(len(feeds), options['workerthreads'])
|
||||
if options['single_threaded']:
|
||||
num_workers = 1
|
||||
|
|
35
apps/rss_feeds/migrations/bootstrap_mongo.py
Normal file
35
apps/rss_feeds/migrations/bootstrap_mongo.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
from pprint import pprint
|
||||
from django.conf import settings
|
||||
from apps.rss_feeds.models import Feed, Story, MStory, StoryAuthor
|
||||
import mongoengine
|
||||
import sys
|
||||
from utils import json
|
||||
|
||||
# One-off bootstrap script: rebuilds the Mongo `stories` collection from the
# relational Story table. It runs at import time and is destructive -- the
# existing Mongo collection is dropped before being repopulated.
MONGO_DB = settings.MONGO_DB
db = mongoengine.connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])

# print is called with one parenthesised argument so the output is the same
# on Python 2 while the script also parses on Python 3.
print("Mongo DB stories: %s" % MStory.objects().count())

db.stories.drop()
print("Dropped! Mongo DB stories: %s" % MStory.objects().count())

print("Stories: %s" % Story.objects.all().count())

pprint(db.stories.index_information())

# Copy feed by feed, busiest feeds first.
feeds = Feed.objects.all().order_by('-average_stories_per_month')
for feed in feeds:
    print("%-5s: %s" % (Story.objects.filter(story_feed=feed).count(), feed))
    sys.stdout.flush()

    stories = Story.objects.filter(story_feed=feed).values()
    for story in stories:
        # json.decode hands falsy input (None / '') back unchanged; the
        # Mongo field is a list, so normalise "no tags" to an empty list.
        story['story_tags'] = json.decode(story['story_tags']) or []
        # Remove the relational keys before passing the row to MStory.
        del story['id']
        del story['story_author_id']
        MStory(**story).save()

print("Mongo DB stories: %s" % MStory.objects().count())
|
|
@ -119,10 +119,10 @@ class Feed(models.Model):
|
|||
|
||||
def count_stories(self, verbose=False, lock=None):
|
||||
month_ago = datetime.datetime.now() - datetime.timedelta(days=30)
|
||||
stories_last_month = Story.objects.filter(story_feed=self, story_date__gte=month_ago).count()
|
||||
stories_last_month = MStory.objects(story_feed=self.pk, story_date__gte=month_ago).count()
|
||||
self.stories_last_month = stories_last_month
|
||||
|
||||
self.recount_feed(lock)
|
||||
# self.recount_feed(lock)
|
||||
|
||||
self.save(lock=lock)
|
||||
|
||||
|
@ -254,8 +254,7 @@ class Feed(models.Model):
|
|||
story_author_name = story.get('author'),
|
||||
story_permalink = story.get('link'),
|
||||
story_guid = story.get('guid') or story.get('id') or story.get('link'),
|
||||
story_tags = self._shorten_and_encode_story_tags(story_tags),
|
||||
# tags = story_tags
|
||||
story_tags = story_tags
|
||||
)
|
||||
try:
|
||||
s.save()
|
||||
|
@ -264,8 +263,6 @@ class Feed(models.Model):
|
|||
except IntegrityError:
|
||||
ret_values[ENTRY_ERR] += 1
|
||||
# print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
|
||||
# for tcat in story_tags:
|
||||
# Tag.objects.get_or_create(feed=self, tag=tcat)
|
||||
elif existing_story and story_has_changed:
|
||||
# update story
|
||||
# logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))
|
||||
|
@ -296,10 +293,7 @@ class Feed(models.Model):
|
|||
existing_story['story_author_name'] = story.get('author')
|
||||
existing_story['story_permalink'] = story.get('link')
|
||||
existing_story['story_guid'] = story.get('guid') or story.get('id') or story.get('link')
|
||||
existing_story['story_tags'] = self._shorten_and_encode_story_tags(story_tags)
|
||||
# existing_story['tags'] = story_tags
|
||||
# s.tags.clear()
|
||||
# [s.tags.add(tcat) for tcat in story_tags]
|
||||
existing_story['story_tags'] = story_tags
|
||||
try:
|
||||
db.stories.update({'_id': existing_story['_id']}, existing_story)
|
||||
ret_values[ENTRY_UPDATED] += 1
|
||||
|
@ -321,6 +315,7 @@ class Feed(models.Model):
|
|||
if not feed_tags:
|
||||
from apps.rss_feeds.models import Tag
|
||||
from django.db.models.aggregates import Count
|
||||
# Map Reduce this
|
||||
all_tags = Tag.objects.filter(feed=self)\
|
||||
.annotate(stories_count=Count('story'))\
|
||||
.order_by('-stories_count')[:20]
|
||||
|
@ -352,15 +347,7 @@ class Feed(models.Model):
|
|||
authors_list = json.decode(feed_authors) if feed_authors else []
|
||||
if len(authors_list) > 1:
|
||||
self.save_popular_authors(authors_list[:-1])
|
||||
|
||||
def _shorten_and_encode_story_tags(self, story_tags):
|
||||
encoded_tags = json.encode([t.name for t in story_tags])
|
||||
if len(encoded_tags) < 2000:
|
||||
return encoded_tags
|
||||
|
||||
if len(story_tags) > 1:
|
||||
return self._shorten_and_encode_story_tags(story_tags[:-1])
|
||||
|
||||
|
||||
def trim_feed(self):
|
||||
from apps.reader.models import UserStory
|
||||
stories_deleted_count = 0
|
||||
|
@ -389,11 +376,8 @@ class Feed(models.Model):
|
|||
user_stories_count)
|
||||
|
||||
def get_stories(self, offset=0, limit=25, force=False):
|
||||
if not force:
|
||||
stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), [])
|
||||
else:
|
||||
stories = None
|
||||
|
||||
stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), [])
|
||||
|
||||
if not stories or force:
|
||||
stories_db = MStory.objects(story_feed_id=self.pk)[offset:offset+limit]
|
||||
stories = self.format_stories(stories_db)
|
||||
|
@ -407,8 +391,7 @@ class Feed(models.Model):
|
|||
# print "Formatting Stories: %s" % stories_db.count()
|
||||
for story_db in stories_db:
|
||||
story = {}
|
||||
# story_tags = story_db.tags.all()
|
||||
story['story_tags'] = (story_db.story_tags and json.decode(story_db.story_tags)) or []
|
||||
story['story_tags'] = story_db.story_tags # or []
|
||||
story['short_parsed_date'] = format_story_link_date__short(story_db.story_date)
|
||||
story['long_parsed_date'] = format_story_link_date__long(story_db.story_date)
|
||||
story['story_date'] = story_db.story_date
|
||||
|
@ -445,10 +428,7 @@ class Feed(models.Model):
|
|||
tagname = tagname.strip()
|
||||
if not tagname or tagname == ' ':
|
||||
continue
|
||||
if not Tag.objects.filter(name=tagname, feed=self):
|
||||
cobj = Tag(name=tagname, feed=self)
|
||||
cobj.save()
|
||||
fcat.append(Tag.objects.get(name=tagname, feed=self))
|
||||
fcat.append(tagname)
|
||||
return fcat
|
||||
|
||||
def _exists_story(self, story=None, story_content=None, existing_stories=None):
|
||||
|
@ -652,7 +632,7 @@ class Story(models.Model):
|
|||
|
||||
class MStory(mongo.Document):
|
||||
'''A feed item'''
|
||||
story_feed_id = mongo.IntField()
|
||||
story_feed_id = mongo.IntField(unique_with='story_guid')
|
||||
story_date = mongo.DateTimeField()
|
||||
story_title = mongo.StringField(max_length=255)
|
||||
story_content = mongo.StringField()
|
||||
|
@ -663,12 +643,13 @@ class MStory(mongo.Document):
|
|||
story_permalink = mongo.StringField()
|
||||
story_guid = mongo.StringField(primary_key=True)
|
||||
story_guid_hash = mongo.StringField(max_length=40)
|
||||
story_tags = mongo.StringField(max_length=2000)
|
||||
tags = mongo.ListField(mongo.StringField(max_length=100))
|
||||
story_tags = mongo.ListField(mongo.StringField(max_length=100))
|
||||
|
||||
meta = {
|
||||
'collection': 'stories',
|
||||
'indexes': ['story_feed_id', 'story_date']
|
||||
'indexes': ['story_feed_id', 'story_date', ('story_feed_id', '-story_date')],
|
||||
'ordering': ['-story_date'],
|
||||
'allow_inheritance': False,
|
||||
}
|
||||
|
||||
class FeedUpdateHistory(models.Model):
|
||||
|
|
|
@ -239,4 +239,4 @@ DEBUG_TOOLBAR_CONFIG = {
|
|||
# = Mongo =
|
||||
# =========
|
||||
|
||||
connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])
|
||||
MONGODB = connect(MONGO_DB['NAME'], host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])
|
||||
|
|
|
@ -22,7 +22,7 @@ import pymongo
|
|||
# Refresh feed code adapted from Feedjack.
|
||||
# http://feedjack.googlecode.com
|
||||
|
||||
VERSION = '0.4'
|
||||
VERSION = '0.8'
|
||||
URL = 'http://www.newsblur.com/'
|
||||
USER_AGENT = 'NewsBlur Fetcher %s - %s' % (VERSION, URL)
|
||||
SLOWFEED_WARNING = 10
|
||||
|
@ -77,13 +77,12 @@ class FetchFeed:
|
|||
return identity
|
||||
|
||||
class ProcessFeed:
|
||||
def __init__(self, feed, fpf, options):
|
||||
def __init__(self, feed, fpf, db, options):
|
||||
self.feed = feed
|
||||
self.options = options
|
||||
self.fpf = fpf
|
||||
self.lock = multiprocessing.Lock()
|
||||
connection = pymongo.Connection(settings.MONGO_DB['HOST'])
|
||||
self.db = connection[settings.MONGO_DB['NAME']]
|
||||
self.db = db
|
||||
|
||||
def process(self):
|
||||
""" Downloads and parses a feed.
|
||||
|
@ -242,6 +241,9 @@ class Dispatcher:
|
|||
from django.db import connection
|
||||
connection.close()
|
||||
|
||||
MONGO_DB = settings.MONGO_DB
|
||||
db = pymongo.Connection(host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])[MONGO_DB['NAME']]
|
||||
|
||||
current_process = multiprocessing.current_process()
|
||||
lock = multiprocessing.Lock()
|
||||
|
||||
|
@ -271,7 +273,7 @@ class Dispatcher:
|
|||
delta = datetime.datetime.now() - start_time
|
||||
|
||||
if fetched_feed and ret_feed == FEED_OK:
|
||||
pfeed = ProcessFeed(feed, fetched_feed, self.options)
|
||||
pfeed = ProcessFeed(feed, fetched_feed, db, self.options)
|
||||
ret_feed, ret_entries = pfeed.process()
|
||||
|
||||
if ret_entries.get(ENTRY_NEW):
|
||||
|
|
|
@ -12,6 +12,8 @@ from django.db.models.query import QuerySet
|
|||
import sys
|
||||
|
||||
def decode(data):
    """Parse a JSON string and return the decoded value.

    Falsy input (for example ``None`` or ``''``) is returned unchanged
    instead of being passed to the JSON parser.
    """
    return json.loads(data) if data else data
|
||||
|
||||
def encode(data, *args, **kwargs):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from utils.munin.base import MuninGraph
|
||||
from apps.rss_feeds.models import Story, Tag, StoryAuthor
|
||||
from apps.rss_feeds.models import MStory, StoryAuthor
|
||||
from apps.reader.models import UserStory
|
||||
|
||||
graph_config = {
|
||||
|
@ -15,8 +15,7 @@ graph_config = {
|
|||
}
|
||||
|
||||
metrics = {
|
||||
'stories': Story.objects.count(),
|
||||
'tags': Tag.objects.count(),
|
||||
'stories': MStory.objects().count(),
|
||||
'authors': StoryAuthor.objects.count(),
|
||||
'read_stories': UserStory.objects.count(),
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue