import difflib
import datetime
import random
import re
import mongoengine as mongo
import zlib
import urllib
from collections import defaultdict
from operator import itemgetter
from BeautifulSoup import BeautifulStoneSoup
from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
from django.db import models
from django.db import IntegrityError
from django.core.cache import cache
from django.conf import settings
from mongoengine.queryset import OperationError
from mongoengine.base import ValidationError
from apps.rss_feeds.tasks import UpdateFeeds
from celery.task import Task
from utils import json_functions as json
from utils import feedfinder
from utils import urlnorm
from utils import log as logging
from utils.fields import AutoOneToOneField
from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit, TimeoutError
from utils.story_functions import pre_process_story
from utils.diff import HTMLDiff

ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)

class Feed(models.Model):
    feed_address = models.URLField(max_length=255, verify_exists=True, unique=True)
    feed_link = models.URLField(max_length=1000, default="", blank=True, null=True)
    feed_title = models.CharField(max_length=255, default="[Untitled]", blank=True, null=True)
    active = models.BooleanField(default=True, db_index=True)
    num_subscribers = models.IntegerField(default=-1)
    active_subscribers = models.IntegerField(default=-1, db_index=True)
    premium_subscribers = models.IntegerField(default=-1)
    last_update = models.DateTimeField(db_index=True)
    fetched_once = models.BooleanField(default=False)
    has_feed_exception = models.BooleanField(default=False, db_index=True)
    has_page_exception = models.BooleanField(default=False, db_index=True)
    exception_code = models.IntegerField(default=0)
    min_to_decay = models.IntegerField(default=0)
    days_to_trim = models.IntegerField(default=90)
    creation = models.DateField(auto_now_add=True)
    etag = models.CharField(max_length=255, blank=True, null=True)
    last_modified = models.DateTimeField(null=True, blank=True)
    stories_last_month = models.IntegerField(default=0)
    average_stories_per_month = models.IntegerField(default=0)
    next_scheduled_update = models.DateTimeField(db_index=True)
    queued_date = models.DateTimeField(db_index=True)
    last_load_time = models.IntegerField(default=0)

    def __unicode__(self):
        if not self.feed_title:
            self.feed_title = "[Untitled]"
            self.save()
        return self.feed_title

    def save(self, *args, **kwargs):
        if not self.last_update:
            self.last_update = datetime.datetime.utcnow()
        if not self.next_scheduled_update:
            self.next_scheduled_update = datetime.datetime.utcnow()
        if not self.queued_date:
            self.queued_date = datetime.datetime.utcnow()

        max_feed_title = Feed._meta.get_field('feed_title').max_length
        if len(self.feed_title) > max_feed_title:
            self.feed_title = self.feed_title[:max_feed_title]

        try:
            super(Feed, self).save(*args, **kwargs)
        except IntegrityError, e:
            duplicate_feed = Feed.objects.filter(feed_address=self.feed_address)
            logging.debug("%s: %s" % (self.feed_address, duplicate_feed))
            logging.debug(' ***> [%-30s] Feed deleted. Could not save: %s' % (self, e))
            if duplicate_feed:
                merge_feeds(self.pk, duplicate_feed[0].pk)
                return duplicate_feed[0].pk
            # Feed has been deleted. Just ignore it.
            pass

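    # Resolve a URL to a Feed: check known duplicate addresses first, then exact
    # feed_address matches, and finally fall back to feed autodiscovery on the page.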
    @classmethod
    def get_feed_from_url(cls, url):
        feed = None

        def by_url(address):
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=address).order_by('pk')
            if duplicate_feed:
                feed = [duplicate_feed[0].feed]
            else:
                feed = cls.objects.filter(feed_address=address).order_by('pk')
            return feed

        url = urlnorm.normalize(url)
        feed = by_url(url)

        if feed:
            feed = feed[0]
        else:
            if feedfinder.isFeed(url):
                feed = cls.objects.create(feed_address=url)
                feed = feed.update()
            else:
                feed_finder_url = feedfinder.feed(url)
                if feed_finder_url:
                    feed = by_url(feed_finder_url)
                    if not feed:
                        feed = cls.objects.create(feed_address=feed_finder_url)
                        feed = feed.update()
                    else:
                        feed = feed[0]

        return feed

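    # Queue feeds for background fetching, `queue_size` feed ids per UpdateFeeds task.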
    @classmethod
    def task_feeds(cls, feeds, queue_size=12):
        logging.debug(" ---> Tasking %s feeds..." % feeds.count())

        publisher = Task.get_publisher()

        feed_queue = []
        for f in feeds:
            f.queued_date = datetime.datetime.utcnow()
            f.set_next_scheduled_update()

        for feed_queue in (feeds[pos:pos + queue_size] for pos in xrange(0, len(feeds), queue_size)):
            feed_ids = [feed.pk for feed in feed_queue]
            UpdateFeeds.apply_async(args=(feed_ids,), queue='update_feeds', publisher=publisher)

        publisher.connection.close()

    def update_all_statistics(self):
        self.count_subscribers()
        self.count_stories()
        self.save_popular_authors()
        self.save_popular_tags()

    def setup_feed_for_premium_subscribers(self):
        self.count_subscribers()
        self.set_next_scheduled_update()

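    # Re-run feed autodiscovery against the stored address and link, swapping in a
    # working feed address (or merging into an existing feed on an address collision).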
    def check_feed_address_for_feed_link(self):
        @timelimit(10)
        def _1():
            feed_address = None
            try:
                is_feed = feedfinder.isFeed(self.feed_address)
            except KeyError:
                is_feed = False
            if not is_feed:
                feed_address = feedfinder.feed(self.feed_address)
                if not feed_address and self.feed_link:
                    feed_address = feedfinder.feed(self.feed_link)
            else:
                feed_address_from_link = feedfinder.feed(self.feed_link)
                if feed_address_from_link != self.feed_address:
                    feed_address = feed_address_from_link

            if feed_address:
                try:
                    self.feed_address = feed_address
                    self.next_scheduled_update = datetime.datetime.utcnow()
                    self.has_feed_exception = False
                    self.active = True
                    self.save()
                except IntegrityError:
                    original_feed = Feed.objects.get(feed_address=feed_address)
                    original_feed.has_feed_exception = False
                    original_feed.active = True
                    original_feed.save()
                    merge_feeds(original_feed.pk, self.pk)
            return feed_address

        try:
            feed_address = _1()
        except TimeoutError:
            logging.debug('   ---> [%-30s] Feed address check timed out...' % (unicode(self.feed_title)[:30]))
            self.save_feed_history(505, 'Timeout', '')
            feed_address = None

        return not not feed_address

    def save_feed_history(self, status_code, message, exception=None):
        MFeedFetchHistory(feed_id=self.pk,
                          status_code=int(status_code),
                          message=message,
                          exception=exception,
                          fetch_date=datetime.datetime.utcnow()).save()
        old_fetch_histories = MFeedFetchHistory.objects(feed_id=self.pk).order_by('-fetch_date')[5:]
        for history in old_fetch_histories:
            history.delete()
        if status_code not in (200, 304):
            fetch_history = map(lambda h: h.status_code,
                                MFeedFetchHistory.objects(feed_id=self.pk))
            self.count_errors_in_history(fetch_history, status_code, 'feed')
        elif self.has_feed_exception:
            self.has_feed_exception = False
            self.active = True
            self.save()

    def save_page_history(self, status_code, message, exception=None):
        MPageFetchHistory(feed_id=self.pk,
                          status_code=int(status_code),
                          message=message,
                          exception=exception,
                          fetch_date=datetime.datetime.utcnow()).save()
        old_fetch_histories = MPageFetchHistory.objects(feed_id=self.pk).order_by('-fetch_date')[5:]
        for history in old_fetch_histories:
            history.delete()

        if status_code not in (200, 304):
            fetch_history = map(lambda h: h.status_code,
                                MPageFetchHistory.objects(feed_id=self.pk))
            self.count_errors_in_history(fetch_history, status_code, 'page')
        elif self.has_page_exception:
            self.has_page_exception = False
            self.active = True
            self.save()

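    # Only mark a feed/page exception when the recent fetch history holds errors
    # and not a single successful (200/304) fetch.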
    def count_errors_in_history(self, fetch_history, status_code, exception_type):
        non_errors = [h for h in fetch_history if int(h) in (200, 304)]
        errors = [h for h in fetch_history if int(h) not in (200, 304)]

        if len(non_errors) == 0 and len(errors) >= 1:
            if exception_type == 'feed':
                self.has_feed_exception = True
                self.active = False
            elif exception_type == 'page':
                self.has_page_exception = True
            self.exception_code = status_code
            self.save()
        elif self.exception_code > 0:
            self.active = True
            self.exception_code = 0
            self.save()

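    # Recount total, active (seen within SUBSCRIBER_EXPIRE days), and premium subscribers.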
    def count_subscribers(self, verbose=False):
        SUBSCRIBER_EXPIRE = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
        from apps.reader.models import UserSubscription

        subs = UserSubscription.objects.filter(feed=self)
        self.num_subscribers = subs.count()

        active_subs = UserSubscription.objects.filter(
            feed=self,
            active=True,
            user__profile__last_seen_on__gte=SUBSCRIBER_EXPIRE
        )
        self.active_subscribers = active_subs.count()

        premium_subs = UserSubscription.objects.filter(
            feed=self,
            active=True,
            user__profile__is_premium=True
        )
        self.premium_subscribers = premium_subs.count()

        self.save()

        if verbose:
            if self.num_subscribers <= 1:
                print '.',
            else:
                print "\n %s> %s subscriber%s: %s" % (
                    '-' * min(self.num_subscribers, 20),
                    self.num_subscribers,
                    '' if self.num_subscribers == 1 else 's',
                    self.feed_title,
                ),

    def count_stories(self, verbose=False):
        self.save_feed_stories_last_month(verbose)
        # self.save_feed_story_history_statistics()

    def save_feed_stories_last_month(self, verbose=False):
        month_ago = datetime.datetime.utcnow() - datetime.timedelta(days=30)
        stories_last_month = MStory.objects(story_feed_id=self.pk,
                                            story_date__gte=month_ago).count()
        self.stories_last_month = stories_last_month

        self.save()

        if verbose:
            print " ---> %s [%s]: %s stories last month" % (self.feed_title, self.pk,
                                                            self.stories_last_month)

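    # Rebuild the per-month story counts with a MongoDB map/reduce over MStory,
    # then derive average_stories_per_month from the filled-in month list.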
    def save_feed_story_history_statistics(self, current_counts=None):
        """
        Fills in missing months between earlier occurrences and now.

        Save format: [('YYYY-MM', #), ...]
        Example output: [('2010-12', 123), ('2011-01', 146)]
        """
        now = datetime.datetime.utcnow()
        min_year = now.year
        total = 0
        month_count = 0
        if not current_counts:
            current_counts = self.data.story_count_history and json.decode(self.data.story_count_history)

        if not current_counts:
            current_counts = []

        # Count stories, aggregate by year and month. Map Reduce!
        map_f = """
            function() {
                var date = (this.story_date.getFullYear()) + "-" + (this.story_date.getMonth()+1);
                emit(date, 1);
            }
        """
        reduce_f = """
            function(key, values) {
                var total = 0;
                for (var i=0; i < values.length; i++) {
                    total += values[i];
                }
                return total;
            }
        """
        dates = {}
        res = MStory.objects(story_feed_id=self.pk).map_reduce(map_f, reduce_f, keep_temp=False)
        for r in res:
            dates[r.key] = r.value
            year = int(re.findall(r"(\d{4})-\d{1,2}", r.key)[0])
            if year < min_year:
                min_year = year

        # Add on to existing months, always amending up, never down. (Current month
        # is guaranteed to be accurate, since trim_feeds won't delete it until after
        # a month. Hacker News can have 1,000+ and still be counted.)
        for current_month, current_count in current_counts:
            year = int(re.findall(r"(\d{4})-\d{1,2}", current_month)[0])
            if current_month not in dates or dates[current_month] < current_count:
                dates[current_month] = current_count
            if year < min_year:
                min_year = year

        # Assemble a list with 0's filled in for missing months,
        # trimming left and right 0's.
        months = []
        start = False
        for year in range(min_year, now.year+1):
            for month in range(1, 12+1):
                if datetime.datetime(year, month, 1) < now:
                    key = u'%s-%s' % (year, month)
                    if dates.get(key) or start:
                        start = True
                        months.append((key, dates.get(key, 0)))
                        total += dates.get(key, 0)
                        month_count += 1

        self.data.story_count_history = json.encode(months)
        self.data.save()
        if not total:
            self.average_stories_per_month = 0
        else:
            self.average_stories_per_month = total / month_count
        self.save()

    def update(self, force=False, single_threaded=True, compute_scores=True):
        from utils import feed_fetcher
        try:
            self.feed_address = self.feed_address % {'NEWSBLUR_DIR': settings.NEWSBLUR_DIR}
        except:
            pass

        self.set_next_scheduled_update()

        options = {
            'verbose': 1 if not force else 2,
            'timeout': 10,
            'single_threaded': single_threaded,
            'force': force,
            'compute_scores': compute_scores,
        }
        disp = feed_fetcher.Dispatcher(options, 1)
        disp.add_jobs([[self.pk]])
        disp.run_jobs()

        return Feed.objects.get(pk=self.pk)

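    # Insert new stories and diff/update changed ones against existing_stories,
    # returning counts keyed by ENTRY_NEW/ENTRY_UPDATED/ENTRY_SAME/ENTRY_ERR.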
    def add_update_stories(self, stories, existing_stories):
        ret_values = {
            ENTRY_NEW: 0,
            ENTRY_UPDATED: 0,
            ENTRY_SAME: 0,
            ENTRY_ERR: 0
        }

        for story in stories:
            story = pre_process_story(story)

            if story.get('title'):
                story_contents = story.get('content')
                story_tags = self.get_tags(story)

                if story_contents is not None:
                    story_content = story_contents[0]['value']
                else:
                    story_content = story.get('summary')

                existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
                if existing_story is None:
                    s = MStory(story_feed_id = self.pk,
                               story_date = story.get('published'),
                               story_title = story.get('title'),
                               story_content = story_content,
                               story_author_name = story.get('author'),
                               story_permalink = story.get('link'),
                               story_guid = story.get('guid') or story.get('id') or story.get('link'),
                               story_tags = story_tags
                    )
                    try:
                        s.save()
                        ret_values[ENTRY_NEW] += 1
                        cache.set('updated_feed:%s' % self.id, 1)
                    except (IntegrityError, OperationError):
                        ret_values[ENTRY_ERR] += 1
                        # logging.info('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
                elif existing_story and story_has_changed:
                    # update story
                    # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))

                    original_content = None
                    if existing_story.story_original_content_z:
                        original_content = zlib.decompress(existing_story.story_original_content_z)
                    elif existing_story.story_content_z:
                        original_content = zlib.decompress(existing_story.story_content_z)
                    # print 'Type: %s %s' % (type(original_content), type(story_content))
                    if story_content and len(story_content) > 10:
                        diff = HTMLDiff(unicode(original_content), story_content)
                        story_content_diff = diff.getDiff()
                    else:
                        story_content_diff = original_content
                    # logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
                    # logging.debug("\t\tDiff content: %s" % diff.getDiff())
                    if existing_story.story_title != story.get('title'):
                        # logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
                        pass

                    existing_story.story_feed = self.pk
                    existing_story.story_date = story.get('published')
                    existing_story.story_title = story.get('title')
                    existing_story.story_content = story_content_diff
                    existing_story.story_original_content = original_content
                    existing_story.story_author_name = story.get('author')
                    existing_story.story_permalink = story.get('link')
                    existing_story.story_guid = story.get('guid') or story.get('id') or story.get('link')
                    existing_story.story_tags = story_tags
                    try:
                        existing_story.save()
                        ret_values[ENTRY_UPDATED] += 1
                        cache.set('updated_feed:%s' % self.id, 1)
                    except (IntegrityError, OperationError):
                        ret_values[ENTRY_ERR] += 1
                        logging.info('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
                    except ValidationError, e:
                        ret_values[ENTRY_ERR] += 1
                        logging.info('Saving updated story, ValidationError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
                else:
                    ret_values[ENTRY_SAME] += 1
                    # logging.debug("Unchanged story: %s " % story.get('title'))

        return ret_values

    def save_popular_tags(self, feed_tags=None, verbose=False):
        if not feed_tags:
            all_tags = MStory.objects(story_feed_id=self.pk, story_tags__exists=True).item_frequencies('story_tags')

            feed_tags = sorted([(k, v) for k, v in all_tags.items() if isinstance(v, float) and int(v) > 1],
                               key=itemgetter(1),
                               reverse=True)[:25]
        popular_tags = json.encode(feed_tags)

        # TODO: This len() bullshit will be gone when feeds move to mongo
        #       On second thought, it might stay, because we don't want
        #       popular tags the size of a small planet. I'm looking at you
        #       Tumblr writers.
        if len(popular_tags) < 1024:
            self.data.popular_tags = popular_tags
            self.data.save()
            return

        tags_list = json.decode(feed_tags) if feed_tags else []
        if len(tags_list) > 1:
            self.save_popular_tags(tags_list[:-1])

    def save_popular_authors(self, feed_authors=None):
        if not feed_authors:
            authors = defaultdict(int)
            for story in MStory.objects(story_feed_id=self.pk).only('story_author_name'):
                authors[story.story_author_name] += 1
            feed_authors = sorted([(k, v) for k, v in authors.items() if k],
                                  key=itemgetter(1),
                                  reverse=True)[:20]

        popular_authors = json.encode(feed_authors)
        if len(popular_authors) < 1023:
            self.data.popular_authors = popular_authors
            self.data.save()
            return

        if len(feed_authors) > 1:
            self.save_popular_authors(feed_authors=feed_authors[:-1])

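    # Keep only the most recent stories; the cutoff scales with active subscribers,
    # from 50 for a single subscriber up to 500 for feeds with more than 25.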
    def trim_feed(self, verbose=False):
        from apps.reader.models import MUserStory
        trim_cutoff = 500
        if self.active_subscribers <= 1:
            trim_cutoff = 50
        elif self.active_subscribers <= 3:
            trim_cutoff = 100
        elif self.active_subscribers <= 5:
            trim_cutoff = 150
        elif self.active_subscribers <= 10:
            trim_cutoff = 250
        elif self.active_subscribers <= 25:
            trim_cutoff = 350
        stories = MStory.objects(
            story_feed_id=self.pk,
        ).order_by('-story_date')
        if stories.count() > trim_cutoff:
            if verbose:
                print 'Found %s stories in %s. Trimming to %s...' % (stories.count(), self, trim_cutoff)
            story_trim_date = stories[trim_cutoff].story_date
            extra_stories = MStory.objects(story_feed_id=self.pk, story_date__lte=story_trim_date)
            extra_stories.delete()
            # print "Deleted stories, %s left." % MStory.objects(story_feed_id=self.pk).count()
            userstories = MUserStory.objects(feed_id=self.pk, read_date__lte=story_trim_date)
            if userstories.count():
                # print "Found %s user stories. Deleting..." % userstories.count()
                userstories.delete()

    def get_stories(self, offset=0, limit=25, force=False):
        stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), [])

        if not stories or force:
            stories_db = MStory.objects(story_feed_id=self.pk)[offset:offset+limit]
            stories = Feed.format_stories(stories_db, self.pk)
            cache.set('feed_stories:%s-%s-%s' % (self.id, offset, limit), stories)

        return stories

    @classmethod
    def format_stories(cls, stories_db, feed_id=None):
        stories = []

        for story_db in stories_db:
            story = {}
            story['story_tags'] = story_db.story_tags or []
            story['story_date'] = story_db.story_date
            story['story_authors'] = story_db.story_author_name
            story['story_title'] = story_db.story_title
            story['story_content'] = story_db.story_content_z and zlib.decompress(story_db.story_content_z)
            story['story_permalink'] = urllib.unquote(urllib.unquote(story_db.story_permalink))
            story['story_feed_id'] = feed_id or story_db.story_feed_id
            story['id'] = story_db.story_guid
            if hasattr(story_db, 'starred_date'):
                story['starred_date'] = story_db.starred_date

            stories.append(story)

        return stories

    def get_tags(self, entry):
        fcat = []
        if entry.has_key('tags'):
            for tcat in entry.tags:
                if hasattr(tcat, 'label') and tcat.label:
                    term = tcat.label
                elif tcat.term:
                    term = tcat.term
                else:
                    continue
                qcat = term.strip()
                if ',' in qcat or '/' in qcat:
                    qcat = qcat.replace(',', '/').split('/')
                else:
                    qcat = [qcat]
                for zcat in qcat:
                    tagname = zcat.lower()
                    # Collapse runs of whitespace down to single spaces.
                    while '  ' in tagname:
                        tagname = tagname.replace('  ', ' ')
                    tagname = tagname.strip()
                    if not tagname or tagname == ' ':
                        continue
                    fcat.append(tagname)
        fcat = [t[:250] for t in fcat]
        return fcat[:12]

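    # Match an incoming story against existing ones (guid, permalink, then fuzzy
    # title/content similarity) and report whether its content has changed.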
    def _exists_story(self, story=None, story_content=None, existing_stories=None):
        story_in_system = None
        story_has_changed = False
        story_pub_date = story.get('published')
        story_published_now = story.get('published_now', False)
        start_date = story_pub_date - datetime.timedelta(hours=8)
        end_date = story_pub_date + datetime.timedelta(hours=8)
        existing_stories.rewind()

        for existing_story in existing_stories:
            content_ratio = 0
            existing_story_pub_date = existing_story.story_date
            # print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
            if (story_published_now or
                (existing_story_pub_date > start_date and existing_story_pub_date < end_date)):
                if isinstance(existing_story.id, unicode):
                    existing_story.story_guid = existing_story.id
                if story.get('guid') and story.get('guid') == existing_story.story_guid:
                    story_in_system = existing_story
                elif story.get('link') and story.get('link') == existing_story.story_permalink:
                    story_in_system = existing_story

                # Title distance + content distance, checking if story changed
                story_title_difference = levenshtein_distance(story.get('title'),
                                                              existing_story.story_title)
                if 'story_content_z' in existing_story:
                    existing_story_content = unicode(zlib.decompress(existing_story.story_content_z))
                elif 'story_content' in existing_story:
                    existing_story_content = existing_story.story_content
                else:
                    existing_story_content = u''

                seq = difflib.SequenceMatcher(None, story_content, existing_story_content)

                if (seq
                    and story_content
                    and existing_story_content
                    and seq.real_quick_ratio() > .9
                    and seq.quick_ratio() > .95):
                    content_ratio = seq.ratio()

                if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98:
                    story_in_system = existing_story
                    if story_title_difference > 0 or content_ratio < 1.0:
                        # print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
                        story_has_changed = True
                        break

                # More restrictive content distance, still no story match
                if not story_in_system and content_ratio > .98:
                    # print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
                    story_in_system = existing_story
                    story_has_changed = True
                    break

                if story_in_system:
                    if story_content != existing_story_content:
                        story_has_changed = True
                    break

        # if story_has_changed or not story_in_system:
        #     print 'New/updated story: %s' % (story),
        return story_in_system, story_has_changed

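    # Fetch interval in minutes: more stories per month and more active subscribers
    # shorten it, slow page loads lengthen it, and premium subscribers cut it further.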
    def get_next_scheduled_update(self, force=False):
        if self.min_to_decay and not force:
            random_factor = random.randint(0, self.min_to_decay) / 4
            return self.min_to_decay, random_factor

        # Use stories per month to calculate next feed update
        updates_per_day = self.stories_last_month / 30.0
        # if updates_per_day < 1 and self.num_subscribers > 2:
        #     updates_per_day = 1
        # 0 updates per day = 24 hours
        # 1 subscriber:
        #   1 update per day = 6 hours
        #   2 updates = 3.5 hours
        #   4 updates = 2 hours
        #   10 updates = 1 hour
        # 2 subscribers:
        #   1 update per day = 4.5 hours
        #   10 updates = 55 minutes
        updates_per_day_delay = 6 * 60 / max(.25, ((max(0, self.active_subscribers)**.20)
                                                   * (updates_per_day**.7)))
        if self.premium_subscribers > 0:
            updates_per_day_delay /= 6
        # Lots of subscribers = lots of updates
        # 144 hours for 0 subscribers.
        # 24 hours for 1 subscriber.
        # 7 hours for 2 subscribers.
        # 3 hours for 3 subscribers.
        # 25 min for 10 subscribers.
        subscriber_bonus = 24 * 60 / max(.167, max(0, self.active_subscribers)**2.35)
        if self.premium_subscribers > 0:
            subscriber_bonus /= 6

        slow_punishment = 0
        if self.num_subscribers <= 1:
            if 30 <= self.last_load_time < 60:
                slow_punishment = self.last_load_time
            elif 60 <= self.last_load_time < 200:
                slow_punishment = 2 * self.last_load_time
            elif self.last_load_time >= 200:
                slow_punishment = 6 * self.last_load_time
        total = max(6, int(updates_per_day_delay + subscriber_bonus + slow_punishment))
        # print "[%s] %s (%s-%s), %s, %s: %s" % (self, updates_per_day_delay, updates_per_day, self.num_subscribers, subscriber_bonus, slow_punishment, total)
        random_factor = random.randint(0, total) / 4

        return total, random_factor

    def set_next_scheduled_update(self):
        total, random_factor = self.get_next_scheduled_update(force=True)

        next_scheduled_update = datetime.datetime.utcnow() + datetime.timedelta(
                                minutes = total + random_factor)

        self.min_to_decay = total
        self.next_scheduled_update = next_scheduled_update

        self.save()

    def schedule_feed_fetch_immediately(self):
        self.next_scheduled_update = datetime.datetime.utcnow()

        self.save()

    def calculate_collocations_story_content(self,
                                             collocation_measures=TrigramAssocMeasures,
                                             collocation_finder=TrigramCollocationFinder):
        stories = MStory.objects.filter(story_feed_id=self.pk)
        story_content = ' '.join([s.story_content for s in stories if s.story_content])
        return self.calculate_collocations(story_content, collocation_measures, collocation_finder)

    def calculate_collocations_story_title(self,
                                           collocation_measures=BigramAssocMeasures,
                                           collocation_finder=BigramCollocationFinder):
        stories = MStory.objects.filter(story_feed_id=self.pk)
        story_titles = ' '.join([s.story_title for s in stories if s.story_title])
        return self.calculate_collocations(story_titles, collocation_measures, collocation_finder)

    def calculate_collocations(self, content,
                               collocation_measures=TrigramAssocMeasures,
                               collocation_finder=TrigramCollocationFinder):
        content = re.sub(r'&#8217;', '\'', content)
        content = re.sub(r'&amp;', '&', content)
        try:
            content = unicode(BeautifulStoneSoup(content,
                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        except ValueError, e:
            print "ValueError, ignoring: %s" % e
        content = re.sub(r'</?\w+\s+[^>]*>', '', content)
        content = re.split(r"[^A-Za-z-'&]+", content)

        finder = collocation_finder.from_words(content)
        finder.apply_freq_filter(3)
        best = finder.nbest(collocation_measures.pmi, 10)
        phrases = [' '.join(phrase) for phrase in best]

        return phrases

    class Meta:
        db_table="feeds"
        ordering=["feed_title"]


# class FeedCollocations(models.Model):
#     feed = models.ForeignKey(Feed)
#     phrase = models.CharField(max_length=500)

class FeedData(models.Model):
    feed = AutoOneToOneField(Feed, related_name='data')
    feed_tagline = models.CharField(max_length=1024, blank=True, null=True)
    story_count_history = models.TextField(blank=True, null=True)
    popular_tags = models.CharField(max_length=1024, blank=True, null=True)
    popular_authors = models.CharField(max_length=2048, blank=True, null=True)

    def save(self, *args, **kwargs):
        if self.feed_tagline and len(self.feed_tagline) >= 1000:
            self.feed_tagline = self.feed_tagline[:1000]

        try:
            super(FeedData, self).save(*args, **kwargs)
        except (IntegrityError, OperationError):
            if hasattr(self, 'id') and self.id: self.delete()

class FeedIcon(models.Model):
    feed = AutoOneToOneField(Feed, primary_key=True, related_name='icon')
    color = models.CharField(max_length=6, blank=True, null=True)
    data = models.TextField(blank=True, null=True)
    icon_url = models.CharField(max_length=2000, blank=True, null=True)
    not_found = models.BooleanField(default=False)

    def save(self, *args, **kwargs):
        if self.icon_url:
            self.icon_url = unicode(self.icon_url)
        try:
            super(FeedIcon, self).save(*args, **kwargs)
        except (IntegrityError, OperationError):
            # print "Error on Icon: %s" % e
            if hasattr(self, 'id'): self.delete()

class MFeedPage(mongo.Document):
    feed_id = mongo.IntField(primary_key=True)
    page_data = mongo.BinaryField()

    meta = {
        'collection': 'feed_pages',
        'allow_inheritance': False,
    }

    def save(self, *args, **kwargs):
        if self.page_data:
            self.page_data = zlib.compress(self.page_data)
        super(MFeedPage, self).save(*args, **kwargs)

    @classmethod
    def get_data(cls, feed_id):
        data = None
        feed_page = cls.objects(feed_id=feed_id)

        if feed_page:
            data = feed_page[0].page_data and zlib.decompress(feed_page[0].page_data)

        if not data:
            dupe_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
            if dupe_feed:
                feed = dupe_feed[0].feed
                feed_page = MFeedPage.objects.filter(feed_id=feed.pk)
                if feed_page:
                    data = feed_page[0].page_data and zlib.decompress(feed_page[0].page_data)

        return data

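# Each fetched story lives in MongoDB; on save, content fields are zlib-compressed
# into their *_z counterparts and the uncompressed originals are cleared.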
class MStory(mongo.Document):
    '''A feed item'''
    story_feed_id = mongo.IntField()
    story_date = mongo.DateTimeField()
    story_title = mongo.StringField(max_length=1024)
    story_content = mongo.StringField()
    story_content_z = mongo.BinaryField()
    story_original_content = mongo.StringField()
    story_original_content_z = mongo.BinaryField()
    story_content_type = mongo.StringField(max_length=255)
    story_author_name = mongo.StringField()
    story_permalink = mongo.StringField()
    story_guid = mongo.StringField()
    story_tags = mongo.ListField(mongo.StringField(max_length=250))

    meta = {
        'collection': 'stories',
        'indexes': [('story_feed_id', '-story_date')],
        'ordering': ['-story_date'],
        'allow_inheritance': False,
    }

    def save(self, *args, **kwargs):
        story_title_max = MStory._fields['story_title'].max_length
        story_content_type_max = MStory._fields['story_content_type'].max_length
        if self.story_content:
            self.story_content_z = zlib.compress(self.story_content)
            self.story_content = None
        if self.story_original_content:
            self.story_original_content_z = zlib.compress(self.story_original_content)
            self.story_original_content = None
        if self.story_title and len(self.story_title) > story_title_max:
            self.story_title = self.story_title[:story_title_max]
        if self.story_content_type and len(self.story_content_type) > story_content_type_max:
            self.story_content_type = self.story_content_type[:story_content_type_max]
        super(MStory, self).save(*args, **kwargs)


class MStarredStory(mongo.Document):
    """Like MStory, but not inherited due to large overhead of _cls and _type in
       mongoengine's inheritance model on every single row."""
    user_id = mongo.IntField()
    starred_date = mongo.DateTimeField()
    story_feed_id = mongo.IntField()
    story_date = mongo.DateTimeField()
    story_title = mongo.StringField(max_length=1024)
    story_content = mongo.StringField()
    story_content_z = mongo.BinaryField()
    story_original_content = mongo.StringField()
    story_original_content_z = mongo.BinaryField()
    story_content_type = mongo.StringField(max_length=255)
    story_author_name = mongo.StringField()
    story_permalink = mongo.StringField()
    story_guid = mongo.StringField(unique_with=('user_id',))
    story_tags = mongo.ListField(mongo.StringField(max_length=250))

    meta = {
        'collection': 'starred_stories',
        'indexes': [('user_id', '-starred_date'), 'story_feed_id'],
        'ordering': ['-starred_date'],
        'allow_inheritance': False,
    }

    def save(self, *args, **kwargs):
        if self.story_content:
            self.story_content_z = zlib.compress(self.story_content)
            self.story_content = None
        if self.story_original_content:
            self.story_original_content_z = zlib.compress(self.story_original_content)
            self.story_original_content = None
        super(MStarredStory, self).save(*args, **kwargs)


class FeedUpdateHistory(models.Model):
    fetch_date = models.DateTimeField(auto_now=True)
    number_of_feeds = models.IntegerField()
    seconds_taken = models.IntegerField()
    average_per_feed = models.DecimalField(decimal_places=1, max_digits=4)

    def __unicode__(self):
        return "[%s] %s feeds: %s seconds" % (
            self.fetch_date.strftime('%F %d'),
            self.number_of_feeds,
            self.seconds_taken,
        )

    def save(self, *args, **kwargs):
        self.average_per_feed = str(self.seconds_taken / float(max(1.0, self.number_of_feeds)))
        super(FeedUpdateHistory, self).save(*args, **kwargs)


class MFeedFetchHistory(mongo.Document):
    feed_id = mongo.IntField()
    status_code = mongo.IntField()
    message = mongo.StringField()
    exception = mongo.StringField()
    fetch_date = mongo.DateTimeField()

    meta = {
        'collection': 'feed_fetch_history',
        'allow_inheritance': False,
        'indexes': [('fetch_date', 'status_code'), ('feed_id', 'status_code'), ('feed_id', 'fetch_date')],
    }

    def save(self, *args, **kwargs):
        if not isinstance(self.exception, basestring):
            self.exception = unicode(self.exception)
        super(MFeedFetchHistory, self).save(*args, **kwargs)

    @classmethod
    def feed_history(cls, feed_id):
        fetches = cls.objects(feed_id=feed_id).order_by('-fetch_date')
        fetch_history = []
        for fetch in fetches:
            history = {}
            history['message'] = fetch.message
            history['fetch_date'] = fetch.fetch_date
            history['status_code'] = fetch.status_code
            history['exception'] = fetch.exception
            fetch_history.append(history)
        return fetch_history


class MPageFetchHistory(mongo.Document):
    feed_id = mongo.IntField()
    status_code = mongo.IntField()
    message = mongo.StringField()
    exception = mongo.StringField()
    fetch_date = mongo.DateTimeField()

    meta = {
        'collection': 'page_fetch_history',
        'allow_inheritance': False,
        'indexes': [('fetch_date', 'status_code'), ('feed_id', 'status_code'), ('feed_id', 'fetch_date')],
    }

    def save(self, *args, **kwargs):
        if not isinstance(self.exception, basestring):
            self.exception = unicode(self.exception)
        super(MPageFetchHistory, self).save(*args, **kwargs)

    @classmethod
    def feed_history(cls, feed_id):
        fetches = cls.objects(feed_id=feed_id).order_by('-fetch_date')
        fetch_history = []
        for fetch in fetches:
            history = {}
            history['message'] = fetch.message
            history['fetch_date'] = fetch.fetch_date
            history['status_code'] = fetch.status_code
            history['exception'] = fetch.exception
            fetch_history.append(history)
        return fetch_history


class FeedLoadtime(models.Model):
    feed = models.ForeignKey(Feed)
    date_accessed = models.DateTimeField(auto_now=True)
    loadtime = models.FloatField()

    def __unicode__(self):
        return "%s: %s sec" % (self.feed, self.loadtime)


class DuplicateFeed(models.Model):
    duplicate_address = models.CharField(max_length=255)
    duplicate_feed_id = models.CharField(max_length=255, null=True)
    feed = models.ForeignKey(Feed, related_name='duplicate_addresses')

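# Merge a duplicate feed into the original: move subscriptions, read stories, and
# classifiers over, record the duplicate address, then delete the duplicate feed.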
def merge_feeds(original_feed_id, duplicate_feed_id, force=False):
    from apps.reader.models import UserSubscription, UserSubscriptionFolders, MUserStory
    from apps.analyzer.models import MClassifierTitle, MClassifierAuthor, MClassifierFeed, MClassifierTag
    if original_feed_id > duplicate_feed_id and not force:
        original_feed_id, duplicate_feed_id = duplicate_feed_id, original_feed_id
    try:
        original_feed = Feed.objects.get(pk=original_feed_id)
        duplicate_feed = Feed.objects.get(pk=duplicate_feed_id)
    except Feed.DoesNotExist:
        logging.info(" ***> Already deleted feed: %s" % duplicate_feed_id)
        return

    logging.info(" ---> Feed: [%s - %s] %s - %s" % (original_feed_id, duplicate_feed_id,
                                                    original_feed, original_feed.feed_link))
    logging.info(" --> %s" % original_feed.feed_address)
    logging.info(" --> %s" % duplicate_feed.feed_address)

    user_subs = UserSubscription.objects.filter(feed=duplicate_feed)
    for user_sub in user_subs:
        # Rewrite feed in subscription folders
        try:
            user_sub_folders = UserSubscriptionFolders.objects.get(user=user_sub.user)
        except Exception, e:
            logging.info(" *** ---> UserSubscriptionFolders error: %s" % e)
            continue

        # Switch to original feed for the user subscription
        logging.info(" ===> %s " % user_sub.user)
        user_sub.feed = original_feed
        user_sub.needs_unread_recalc = True
        try:
            user_sub.save()
            folders = json.decode(user_sub_folders.folders)
            folders = rewrite_folders(folders, original_feed, duplicate_feed)
            user_sub_folders.folders = json.encode(folders)
            user_sub_folders.save()
        except (IntegrityError, OperationError):
            logging.info(" !!!!> %s already subscribed" % user_sub.user)
            user_sub.delete()

    # Switch read stories
    user_stories = MUserStory.objects(feed_id=duplicate_feed.pk)
    logging.info(" ---> %s read stories" % user_stories.count())
    for user_story in user_stories:
        user_story.feed_id = original_feed.pk
        duplicate_story = user_story.story
        story_guid = duplicate_story.story_guid if hasattr(duplicate_story, 'story_guid') else duplicate_story.id
        original_story = MStory.objects(story_feed_id=original_feed.pk,
                                        story_guid=story_guid)

        if original_story:
            user_story.story = original_story[0]
            try:
                user_story.save()
            except OperationError:
                # User read the story in the original feed, too. Ugh, just ignore it.
                pass
        else:
            logging.info(" ***> Can't find original story: %s" % duplicate_story.id)
            user_story.delete()

    def delete_story_feed(model, feed_field='feed_id'):
        duplicate_stories = model.objects(**{feed_field: duplicate_feed.pk})
        # if duplicate_stories.count():
        #     logging.info(" ---> Deleting %s %s" % (duplicate_stories.count(), model))
        duplicate_stories.delete()

    def switch_feed(model):
        duplicates = model.objects(feed_id=duplicate_feed.pk)
        if duplicates.count():
            logging.info(" ---> Switching %s %s" % (duplicates.count(), model))
        for duplicate in duplicates:
            duplicate.feed_id = original_feed.pk
            try:
                duplicate.save()
                pass
            except (IntegrityError, OperationError):
                logging.info(" !!!!> %s already exists" % duplicate)
                duplicate.delete()

    delete_story_feed(MStory, 'story_feed_id')
    delete_story_feed(MFeedPage, 'feed_id')
    switch_feed(MClassifierTitle)
    switch_feed(MClassifierAuthor)
    switch_feed(MClassifierFeed)
    switch_feed(MClassifierTag)

    try:
        DuplicateFeed.objects.create(
            duplicate_address=duplicate_feed.feed_address,
            duplicate_feed_id=duplicate_feed.pk,
            feed=original_feed
        )
    except (IntegrityError, OperationError), e:
        logging.info(" ***> Could not save DuplicateFeed: %s" % e)

    # Switch this dupe feed's dupe feeds over to the new original.
    duplicate_feeds_duplicate_feeds = DuplicateFeed.objects.filter(feed=duplicate_feed)
    for dupe_feed in duplicate_feeds_duplicate_feeds:
        dupe_feed.feed = original_feed
        dupe_feed.duplicate_feed_id = duplicate_feed.pk
        dupe_feed.save()

    duplicate_feed.delete()


def rewrite_folders(folders, original_feed, duplicate_feed):
    new_folders = []

    for k, folder in enumerate(folders):
        if isinstance(folder, int):
            if folder == duplicate_feed.pk:
                # logging.info(" ===> Rewrote %s'th item: %s" % (k+1, folders))
                new_folders.append(original_feed.pk)
            else:
                new_folders.append(folder)
        elif isinstance(folder, dict):
            for f_k, f_v in folder.items():
                new_folders.append({f_k: rewrite_folders(f_v, original_feed, duplicate_feed)})

    return new_folders