import difflib
import datetime
import time
import random
import re
import math
import mongoengine as mongo
import zlib
import hashlib
import redis
import pymongo
from collections import defaultdict
from operator import itemgetter
from bson.objectid import ObjectId
from BeautifulSoup import BeautifulSoup
# from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
from django.db import models
from django.db import IntegrityError
from django.conf import settings
from django.db.models.query import QuerySet
from django.core.urlresolvers import reverse
from django.contrib.auth.models import User
from django.contrib.sites.models import Site
from django.template.defaultfilters import slugify
from mongoengine.queryset import OperationError, Q, NotUniqueError
from mongoengine.base import ValidationError
from vendor.timezones.utilities import localtime_for_timezone
from apps.rss_feeds.tasks import UpdateFeeds, PushFeeds
from apps.rss_feeds.text_importer import TextImporter
from apps.search.models import SearchStarredStory, SearchFeed
from apps.statistics.rstats import RStats
from utils import json_functions as json
from utils import feedfinder, feedparser
from utils import urlnorm
from utils import log as logging
from utils.fields import AutoOneToOneField
from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
from vendor.haystack.query import SearchQuerySet

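# Possible outcomes when a fetched entry is checked against existing stories
# (see add_update_stories below, which reports the same categories).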
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)


class Feed(models.Model):
    feed_address = models.URLField(max_length=764, db_index=True)
    feed_address_locked = models.NullBooleanField(default=False, blank=True, null=True)
    feed_link = models.URLField(max_length=1000, default="", blank=True, null=True)
    feed_link_locked = models.BooleanField(default=False)
    hash_address_and_link = models.CharField(max_length=64, unique=True)
    feed_title = models.CharField(max_length=255, default="[Untitled]", blank=True, null=True)
    is_push = models.NullBooleanField(default=False, blank=True, null=True)
    active = models.BooleanField(default=True, db_index=True)
    num_subscribers = models.IntegerField(default=-1)
    active_subscribers = models.IntegerField(default=-1, db_index=True)
    premium_subscribers = models.IntegerField(default=-1)
    active_premium_subscribers = models.IntegerField(default=-1)
    branch_from_feed = models.ForeignKey('Feed', blank=True, null=True, db_index=True)
    last_update = models.DateTimeField(db_index=True)
    next_scheduled_update = models.DateTimeField()
    last_story_date = models.DateTimeField(null=True, blank=True)
    fetched_once = models.BooleanField(default=False)
    known_good = models.BooleanField(default=False)
    has_feed_exception = models.BooleanField(default=False, db_index=True)
    has_page_exception = models.BooleanField(default=False, db_index=True)
    has_page = models.BooleanField(default=True)
    exception_code = models.IntegerField(default=0)
    errors_since_good = models.IntegerField(default=0)
    min_to_decay = models.IntegerField(default=0)
    days_to_trim = models.IntegerField(default=90)
    creation = models.DateField(auto_now_add=True)
    etag = models.CharField(max_length=255, blank=True, null=True)
    last_modified = models.DateTimeField(null=True, blank=True)
    stories_last_month = models.IntegerField(default=0)
    average_stories_per_month = models.IntegerField(default=0)
    last_load_time = models.IntegerField(default=0)
    favicon_color = models.CharField(max_length=6, null=True, blank=True)
    favicon_not_found = models.BooleanField(default=False)
    s3_page = models.NullBooleanField(default=False, blank=True, null=True)
    s3_icon = models.NullBooleanField(default=False, blank=True, null=True)

    class Meta:
        db_table = "feeds"
        ordering = ["feed_title"]
        # unique_together=[('feed_address', 'feed_link')]

    def __unicode__(self):
        if not self.feed_title:
            self.feed_title = "[Untitled]"
            self.save()
        return "%s (%s - %s/%s/%s)%s" % (
            self.feed_title,
            self.pk,
            self.num_subscribers,
            self.active_subscribers,
            self.active_premium_subscribers,
            (" [B: %s]" % self.branch_from_feed.pk if self.branch_from_feed else ""))

    @property
    def title(self):
        return self.feed_title or "[Untitled]"

    @property
    def permalink(self):
        return "%s/site/%s/%s" % (settings.NEWSBLUR_URL, self.pk, slugify(self.feed_title.lower()[:50]))

    @property
    def favicon_url(self):
        if settings.BACKED_BY_AWS['icons_on_s3'] and self.s3_icon:
            return "http://%s/%s.png" % (settings.S3_ICONS_BUCKET_NAME, self.pk)
        return reverse('feed-favicon', kwargs={'feed_id': self.pk})

    @property
    def favicon_url_fqdn(self):
        if settings.BACKED_BY_AWS['icons_on_s3'] and self.s3_icon:
            return self.favicon_url
        return "http://%s%s" % (
            Site.objects.get_current().domain,
            self.favicon_url
        )

    @property
    def s3_pages_key(self):
        return "%s.gz.html" % self.pk

    @property
    def s3_icons_key(self):
        return "%s.png" % self.pk

    def canonical(self, full=False, include_favicon=True):
        feed = {
            'id': self.pk,
            'feed_title': self.feed_title,
            'feed_address': self.feed_address,
            'feed_link': self.feed_link,
            'num_subscribers': self.num_subscribers,
            'updated': relative_timesince(self.last_update),
            'updated_seconds_ago': seconds_timesince(self.last_update),
            'min_to_decay': self.min_to_decay,
            'subs': self.num_subscribers,
            'is_push': self.is_push,
            'fetched_once': self.fetched_once,
            'not_yet_fetched': not self.fetched_once, # Legacy. Doh.
            'favicon_color': self.favicon_color,
            'favicon_fade': self.favicon_fade(),
            'favicon_border': self.favicon_border(),
            'favicon_text_color': self.favicon_text_color(),
            'favicon_fetching': self.favicon_fetching,
            'favicon_url': self.favicon_url,
            's3_page': self.s3_page,
            's3_icon': self.s3_icon,
        }

        if include_favicon:
            try:
                feed_icon = MFeedIcon.objects.get(feed_id=self.pk)
                feed['favicon'] = feed_icon.data
            except MFeedIcon.DoesNotExist:
                pass
        if self.has_page_exception or self.has_feed_exception:
            feed['has_exception'] = True
            feed['exception_type'] = 'feed' if self.has_feed_exception else 'page'
            feed['exception_code'] = self.exception_code
        elif full:
            feed['has_exception'] = False
            feed['exception_type'] = None
            feed['exception_code'] = self.exception_code

        if not self.has_page:
            feed['disabled_page'] = True
        if full:
            feed['average_stories_per_month'] = self.average_stories_per_month
            feed['tagline'] = self.data.feed_tagline
            feed['feed_tags'] = json.decode(self.data.popular_tags) if self.data.popular_tags else []
            feed['feed_authors'] = json.decode(self.data.popular_authors) if self.data.popular_authors else []

        return feed

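    # save() backfills missing timestamps, truncates over-long fields, and computes
    # hash_address_and_link (SHA-1 of address + link) so duplicate feeds collide on
    # the unique index; on IntegrityError the existing duplicate is found and merged.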
    def save(self, *args, **kwargs):
        if not self.last_update:
            self.last_update = datetime.datetime.utcnow()
        if not self.next_scheduled_update:
            self.next_scheduled_update = datetime.datetime.utcnow()
        self.fix_google_alerts_urls()

        feed_address = self.feed_address or ""
        feed_link = self.feed_link or ""
        self.hash_address_and_link = hashlib.sha1(feed_address+feed_link).hexdigest()

        max_feed_title = Feed._meta.get_field('feed_title').max_length
        if len(self.feed_title) > max_feed_title:
            self.feed_title = self.feed_title[:max_feed_title]
        max_feed_address = Feed._meta.get_field('feed_address').max_length
        if len(feed_address) > max_feed_address:
            self.feed_address = feed_address[:max_feed_address]
        max_feed_link = Feed._meta.get_field('feed_link').max_length
        if len(feed_link) > max_feed_link:
            self.feed_link = feed_link[:max_feed_link]

        try:
            super(Feed, self).save(*args, **kwargs)
        except IntegrityError, e:
            logging.debug(" ---> ~FRFeed save collision (%s), checking dupe..." % e)
            duplicate_feeds = Feed.objects.filter(feed_address=self.feed_address,
                                                  feed_link=self.feed_link)
            if not duplicate_feeds:
                feed_address = self.feed_address or ""
                feed_link = self.feed_link or ""
                hash_address_and_link = hashlib.sha1(feed_address+feed_link).hexdigest()
                duplicate_feeds = Feed.objects.filter(hash_address_and_link=hash_address_and_link)
            if not duplicate_feeds:
                # Feed has been deleted. Just ignore it.
                logging.debug(" ***> Changed to: %s - %s: %s" % (self.feed_address, self.feed_link, duplicate_feeds))
                logging.debug(' ***> [%-30s] Feed deleted (%s).' % (unicode(self)[:30], self.pk))
                return

            if self.pk != duplicate_feeds[0].pk:
                logging.debug(" ---> ~FRFound different feed (%s), merging..." % duplicate_feeds[0])
                feed = Feed.get_by_id(merge_feeds(duplicate_feeds[0].pk, self.pk))
                return feed

        return self

    def index_for_search(self):
        if self.num_subscribers > 1 and not self.branch_from_feed:
            SearchFeed.index(feed_id=self.pk,
                             title=self.feed_title,
                             address=self.feed_address,
                             link=self.feed_link,
                             num_subscribers=self.num_subscribers)

    def sync_redis(self):
        return MStory.sync_feed_redis(self.pk)

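    # Refresh the TTL on this feed's Redis story keys. 'F:<feed_id>' and 'zF:<feed_id>'
    # appear to be the story-hash set and the date-sorted zset kept in step by
    # MStory.sync_feed_redis; both are retained for DAYS_OF_UNREAD_NEW days.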
    def expire_redis(self, r=None):
        if not r:
            r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        # if not r2:
        #     r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)

        r.expire('F:%s' % self.pk, settings.DAYS_OF_UNREAD_NEW*24*60*60)
        # r2.expire('F:%s' % self.pk, settings.DAYS_OF_UNREAD_NEW*24*60*60)
        r.expire('zF:%s' % self.pk, settings.DAYS_OF_UNREAD_NEW*24*60*60)
        # r2.expire('zF:%s' % self.pk, settings.DAYS_OF_UNREAD_NEW*24*60*60)

    @classmethod
    def autocomplete(self, prefix, limit=5):
        results = SearchQuerySet().autocomplete(address=prefix).order_by('-num_subscribers')[:limit]

        if len(results) < limit:
            results += SearchQuerySet().autocomplete(title=prefix).order_by('-num_subscribers')[:limit-len(results)]

        return list(set([int(f.pk) for f in results]))

    @classmethod
    def find_or_create(cls, feed_address, feed_link, *args, **kwargs):
        feeds = cls.objects.filter(feed_address=feed_address, feed_link=feed_link)
        if feeds:
            return feeds[0], False

        if feed_link and feed_link.endswith('/'):
            feeds = cls.objects.filter(feed_address=feed_address, feed_link=feed_link[:-1])
            if feeds:
                return feeds[0], False

        return cls.objects.get_or_create(feed_address=feed_address, feed_link=feed_link, *args, **kwargs)

    @classmethod
    def merge_feeds(cls, *args, **kwargs):
        return merge_feeds(*args, **kwargs)

    def fix_google_alerts_urls(self):
        if (self.feed_address.startswith('http://user/') and
            '/state/com.google/alerts/' in self.feed_address):
            match = re.match(r"http://user/(\d+)/state/com.google/alerts/(\d+)", self.feed_address)
            if match:
                user_id, alert_id = match.groups()
                self.feed_address = "http://www.google.com/alerts/feeds/%s/%s" % (user_id, alert_id)

    @classmethod
    def schedule_feed_fetches_immediately(cls, feed_ids):
        if settings.DEBUG:
            logging.info(" ---> ~SN~FMSkipping the scheduling immediate fetch of ~SB%s~SN feeds (in DEBUG)..." %
                         len(feed_ids))
            return
        logging.info(" ---> ~SN~FMScheduling immediate fetch of ~SB%s~SN feeds..." %
                     len(feed_ids))

        feeds = Feed.objects.filter(pk__in=feed_ids)
        for feed in feeds:
            feed.count_subscribers()
            feed.schedule_feed_fetch_immediately(verbose=False)

    @property
    def favicon_fetching(self):
        return bool(not (self.favicon_not_found or self.favicon_color))

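    # Resolve a user-entered URL to a Feed: normalize the URL, look it up by
    # feed_address (then by known duplicates, then by feed_link when aggressive),
    # and only create a new Feed when feedfinder/feedparser confirm it is a feed.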
    @classmethod
    def get_feed_from_url(cls, url, create=True, aggressive=False, fetch=True, offset=0):
        feed = None

        def criteria(key, value):
            if aggressive:
                return {'%s__icontains' % key: value}
            else:
                return {'%s' % key: value}

        def by_url(address):
            feed = cls.objects.filter(
                branch_from_feed=None
            ).filter(**criteria('feed_address', address)).order_by('-num_subscribers')
            if not feed:
                duplicate_feed = DuplicateFeed.objects.filter(**criteria('duplicate_address', address))
                if duplicate_feed and len(duplicate_feed) > offset:
                    feed = [duplicate_feed[offset].feed]
            if not feed and aggressive:
                feed = cls.objects.filter(
                    branch_from_feed=None
                ).filter(**criteria('feed_link', address)).order_by('-num_subscribers')

            return feed

        # Normalize and check for feed_address, dupes, and feed_link
        url = urlnorm.normalize(url)
        feed = by_url(url)

        # Create if it looks good
        if feed and len(feed) > offset:
            feed = feed[offset]
        elif create:
            create_okay = False
            if feedfinder.isFeed(url):
                create_okay = True
            elif fetch:
                # Could still be a feed. Just check if there are entries
                fp = feedparser.parse(url)
                if len(fp.entries):
                    create_okay = True
            if create_okay:
                feed = cls.objects.create(feed_address=url)
                feed = feed.update()

        # Still nothing? Maybe the URL has some clues.
        if not feed and fetch:
            feed_finder_url = feedfinder.feed(url)
            if feed_finder_url and 'comments' not in feed_finder_url:
                feed = by_url(feed_finder_url)
                if not feed and create:
                    feed = cls.objects.create(feed_address=feed_finder_url)
                    feed = feed.update()
                elif feed and len(feed) > offset:
                    feed = feed[offset]

        # Not created and not within bounds, so toss results.
        if isinstance(feed, QuerySet):
            return

        return feed

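    # Queue feeds for fetching: ids are removed from the 'queued_feeds' Redis set,
    # recorded in the 'tasked_feeds' sorted set with the current timestamp, and an
    # UpdateFeeds celery task is fired for each feed id.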
    @classmethod
    def task_feeds(cls, feeds, queue_size=12, verbose=True):
        if not feeds: return
        r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)

        if isinstance(feeds, Feed):
            if verbose:
                logging.debug(" ---> ~SN~FBTasking feed: ~SB%s" % feeds)
            feeds = [feeds.pk]
        elif verbose:
            logging.debug(" ---> ~SN~FBTasking ~SB%s~SN feeds..." % len(feeds))

        if isinstance(feeds, QuerySet):
            feeds = [f.pk for f in feeds]

        r.srem('queued_feeds', *feeds)
        now = datetime.datetime.now().strftime("%s")
        p = r.pipeline()
        for feed_id in feeds:
            p.zadd('tasked_feeds', feed_id, now)
        p.execute()

        # for feed_ids in (feeds[pos:pos + queue_size] for pos in xrange(0, len(feeds), queue_size)):
        for feed_id in feeds:
            UpdateFeeds.apply_async(args=(feed_id,), queue='update_feeds')

    @classmethod
    def drain_task_feeds(cls, empty=False):
        r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)
        if not empty:
            tasked_feeds = r.zrange('tasked_feeds', 0, -1)
            logging.debug(" ---> ~FRDraining %s feeds..." % len(tasked_feeds))
            r.sadd('queued_feeds', *tasked_feeds)
        r.zremrangebyrank('tasked_feeds', 0, -1)

    def update_all_statistics(self, full=True, force=False):
        self.count_subscribers()
        self.calculate_last_story_date()

        count_extra = False
        if random.random() > .99 or not self.data.popular_tags or not self.data.popular_authors:
            count_extra = True

        if force or full:
            self.save_feed_stories_last_month()

        if force or (full and count_extra):
            self.save_popular_authors()
            self.save_popular_tags()
            self.save_feed_story_history_statistics()

    def calculate_last_story_date(self):
        last_story_date = None

        try:
            latest_story = MStory.objects(
                story_feed_id=self.pk
            ).limit(1).order_by('-story_date').only('story_date').first()
            if latest_story:
                last_story_date = latest_story.story_date
        except MStory.DoesNotExist:
            pass

        if not last_story_date or seconds_timesince(last_story_date) < 0:
            last_story_date = datetime.datetime.now()

        self.last_story_date = last_story_date
        self.save()

    @classmethod
    def setup_feeds_for_premium_subscribers(cls, feed_ids):
        logging.info(" ---> ~SN~FMScheduling immediate premium setup of ~SB%s~SN feeds..." %
                     len(feed_ids))

        feeds = Feed.objects.filter(pk__in=feed_ids)
        for feed in feeds:
            feed.setup_feed_for_premium_subscribers()

    def setup_feed_for_premium_subscribers(self):
        self.count_subscribers()
        self.set_next_scheduled_update()

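    # Try to discover a better feed_address via feedfinder (under a 10-second
    # timelimit), following the feed_link if needed. Returns (found, feed), where
    # found is falsy when nothing better was located or the address is locked.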
    def check_feed_link_for_feed_address(self):
        @timelimit(10)
        def _1():
            feed_address = None
            feed = self
            try:
                is_feed = feedfinder.isFeed(self.feed_address)
            except KeyError:
                is_feed = False
            if not is_feed:
                feed_address = feedfinder.feed(self.feed_address)
                if not feed_address and self.feed_link:
                    feed_address = feedfinder.feed(self.feed_link)
            else:
                feed_address_from_link = feedfinder.feed(self.feed_link)
                if feed_address_from_link != self.feed_address:
                    feed_address = feed_address_from_link

            if feed_address:
                if (feed_address.endswith('feedburner.com/atom.xml') or
                    feed_address.endswith('feedburner.com/feed/')):
                    logging.debug(" ---> Feed points to 'Wierdo', ignoring.")
                    return False, self
                try:
                    self.feed_address = feed_address
                    feed = self.save()
                    feed.schedule_feed_fetch_immediately()
                    feed.has_feed_exception = False
                    feed.active = True
                    feed = feed.save()
                except IntegrityError:
                    original_feed = Feed.objects.get(feed_address=feed_address, feed_link=self.feed_link)
                    original_feed.has_feed_exception = False
                    original_feed.active = True
                    original_feed.save()
                    merge_feeds(original_feed.pk, self.pk)
            return feed_address, feed

        if self.feed_address_locked:
            return False, self

        try:
            feed_address, feed = _1()
        except TimeoutError, e:
            logging.debug(' ---> [%-30s] Feed address check timed out...' % (unicode(self)[:30]))
            self.save_feed_history(505, 'Timeout', e)
            feed = self
            feed_address = None

        return bool(feed_address), feed

    def save_feed_history(self, status_code, message, exception=None):
        fetch_history = MFetchHistory.add(feed_id=self.pk,
                                          fetch_type='feed',
                                          code=int(status_code),
                                          message=message,
                                          exception=exception)

        if status_code not in (200, 304):
            self.errors_since_good += 1
            self.count_errors_in_history('feed', status_code, fetch_history=fetch_history)
            self.set_next_scheduled_update()
        elif self.has_feed_exception or self.errors_since_good:
            self.errors_since_good = 0
            self.has_feed_exception = False
            self.active = True
            self.save()

    def save_page_history(self, status_code, message, exception=None):
        fetch_history = MFetchHistory.add(feed_id=self.pk,
                                          fetch_type='page',
                                          code=int(status_code),
                                          message=message,
                                          exception=exception)

        if status_code not in (200, 304):
            self.count_errors_in_history('page', status_code, fetch_history=fetch_history)
        elif self.has_page_exception or not self.has_page:
            self.has_page_exception = False
            self.has_page = True
            self.active = True
            self.save()

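    # Inspect recent fetch history: if every recorded fetch of this type failed,
    # flag the feed/page exception and record the status code; once a fetch
    # succeeds again, clear the exception flags. Returns (errors, non_errors).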
    def count_errors_in_history(self, exception_type='feed', status_code=None, fetch_history=None):
        logging.debug(' ---> [%-30s] Counting errors in history...' % (unicode(self)[:30]))
        if not fetch_history:
            fetch_history = MFetchHistory.feed(self.pk)
        fh = fetch_history[exception_type + '_fetch_history']
        non_errors = [h for h in fh if h['status_code'] and int(h['status_code']) in (200, 304)]
        errors = [h for h in fh if h['status_code'] and int(h['status_code']) not in (200, 304)]

        if len(non_errors) == 0 and len(errors) > 1:
            self.active = True
            if exception_type == 'feed':
                self.has_feed_exception = True
                # self.active = False # No longer, just geometrically fetch
            elif exception_type == 'page':
                self.has_page_exception = True
            self.exception_code = status_code or int(errors[0])
            self.save()
        elif self.exception_code > 0:
            self.active = True
            self.exception_code = 0
            if exception_type == 'feed':
                self.has_feed_exception = False
            elif exception_type == 'page':
                self.has_page_exception = False
            self.save()

        return errors, non_errors

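    # Subscriber counts are aggregated across this feed and any feeds branched
    # from it, so a branch reports the same totals as its original feed.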
    def count_subscribers(self, verbose=False):
        SUBSCRIBER_EXPIRE = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
        from apps.reader.models import UserSubscription

        if self.branch_from_feed:
            original_feed_id = self.branch_from_feed.pk
        else:
            original_feed_id = self.pk
        feed_ids = [f['id'] for f in Feed.objects.filter(branch_from_feed=original_feed_id).values('id')]
        feed_ids.append(original_feed_id)
        feed_ids = list(set(feed_ids))

        subs = UserSubscription.objects.filter(feed__in=feed_ids)
        self.num_subscribers = subs.count()

        active_subs = UserSubscription.objects.filter(
            feed__in=feed_ids,
            active=True,
            user__profile__last_seen_on__gte=SUBSCRIBER_EXPIRE
        )
        self.active_subscribers = active_subs.count()

        premium_subs = UserSubscription.objects.filter(
            feed__in=feed_ids,
            active=True,
            user__profile__is_premium=True
        )
        self.premium_subscribers = premium_subs.count()

        active_premium_subscribers = UserSubscription.objects.filter(
            feed__in=feed_ids,
            active=True,
            user__profile__is_premium=True,
            user__profile__last_seen_on__gte=SUBSCRIBER_EXPIRE
        )
        self.active_premium_subscribers = active_premium_subscribers.count()

        self.save()

        if verbose:
            if self.num_subscribers <= 1:
                print '.',
            else:
                print "\n %s> %s subscriber%s: %s" % (
                    '-' * min(self.num_subscribers, 20),
                    self.num_subscribers,
                    '' if self.num_subscribers == 1 else 's',
                    self.feed_title,
                ),

    def _split_favicon_color(self):
        color = self.favicon_color
        if color:
            splitter = lambda s, p: [s[i:i+p] for i in range(0, len(s), p)]
            red, green, blue = splitter(color[:6], 2)
            return red, green, blue
        return None, None, None

    def favicon_fade(self):
        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            fade_red = hex(min(int(red, 16) + 35, 255))[2:].zfill(2)
            fade_green = hex(min(int(green, 16) + 35, 255))[2:].zfill(2)
            fade_blue = hex(min(int(blue, 16) + 35, 255))[2:].zfill(2)
            return "%s%s%s" % (fade_red, fade_green, fade_blue)

    def favicon_border(self):
        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            fade_red = hex(min(int(int(red, 16) * .75), 255))[2:].zfill(2)
            fade_green = hex(min(int(int(green, 16) * .75), 255))[2:].zfill(2)
            fade_blue = hex(min(int(int(blue, 16) * .75), 255))[2:].zfill(2)
            return "%s%s%s" % (fade_red, fade_green, fade_blue)

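    # Pick white or black text to overlay on the favicon color, using the WCAG
    # relative-luminance formula (0.2126 R + 0.7152 G + 0.0722 B) and contrast
    # ratios against white and mid-grey.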
    def favicon_text_color(self):
        # Color format: {r: 1, g: .5, b: 0}
        def contrast(color1, color2):
            lum1 = luminosity(color1)
            lum2 = luminosity(color2)
            if lum1 > lum2:
                return (lum1 + 0.05) / (lum2 + 0.05)
            else:
                return (lum2 + 0.05) / (lum1 + 0.05)

        def luminosity(color):
            r = color['red']
            g = color['green']
            b = color['blue']
            val = lambda c: c/12.92 if c <= 0.02928 else math.pow(((c + 0.055)/1.055), 2.4)
            red = val(r)
            green = val(g)
            blue = val(b)
            return 0.2126 * red + 0.7152 * green + 0.0722 * blue

        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            color = {
                'red': int(red, 16) / 256.0,
                'green': int(green, 16) / 256.0,
                'blue': int(blue, 16) / 256.0,
            }
            white = {
                'red': 1,
                'green': 1,
                'blue': 1,
            }
            grey = {
                'red': 0.5,
                'green': 0.5,
                'blue': 0.5,
            }

            if contrast(color, white) > contrast(color, grey):
                return 'white'
            else:
                return 'black'

    def save_feed_stories_last_month(self, verbose=False):
        month_ago = datetime.datetime.utcnow() - datetime.timedelta(days=30)
        stories_last_month = MStory.objects(story_feed_id=self.pk,
                                            story_date__gte=month_ago).count()
        self.stories_last_month = stories_last_month

        self.save()

        if verbose:
            print " ---> %s [%s]: %s stories last month" % (self.feed_title, self.pk,
                                                            self.stories_last_month)

    def save_feed_story_history_statistics(self, current_counts=None):
        """
        Fills in missing months between earlier occurances and now.

        Save format: [('YYYY-MM, #), ...]
        Example output: [(2010-12, 123), (2011-01, 146)]
        """
        now = datetime.datetime.utcnow()
        min_year = now.year
        total = 0
        month_count = 0
        if not current_counts:
            current_counts = self.data.story_count_history and json.decode(self.data.story_count_history)

        if isinstance(current_counts, dict):
            current_counts = current_counts['months']

        if not current_counts:
            current_counts = []

        # Count stories, aggregate by year and month. Map Reduce!
        map_f = """
            function() {
                var date = (this.story_date.getFullYear()) + "-" + (this.story_date.getMonth()+1);
                emit(date, 1);
            }
        """
        reduce_f = """
            function(key, values) {
                var total = 0;
                for (var i=0; i < values.length; i++) {
                    total += values[i];
                }
                return total;
            }
        """
        dates = {}
        res = MStory.objects(story_feed_id=self.pk).map_reduce(map_f, reduce_f, output='inline')
        for r in res:
            dates[r.key] = r.value
            year = int(re.findall(r"(\d{4})-\d{1,2}", r.key)[0])
            if year < min_year and year > 2000:
                min_year = year

        # Add on to existing months, always amending up, never down. (Current month
        # is guaranteed to be accurate, since trim_feeds won't delete it until after
        # a month. Hacker News can have 1,000+ and still be counted.)
        for current_month, current_count in current_counts:
            year = int(re.findall(r"(\d{4})-\d{1,2}", current_month)[0])
            if current_month not in dates or dates[current_month] < current_count:
                dates[current_month] = current_count
            if year < min_year and year > 2000:
                min_year = year

        # Assemble a list with 0's filled in for missing months,
        # trimming left and right 0's.
        months = []
        start = False
        for year in range(min_year, now.year+1):
            for month in range(1, 12+1):
                if datetime.datetime(year, month, 1) < now:
                    key = u'%s-%s' % (year, month)
                    if dates.get(key) or start:
                        start = True
                        months.append((key, dates.get(key, 0)))
                        total += dates.get(key, 0)
                        month_count += 1
        self.data.story_count_history = json.encode(months)
        self.data.save()
        if not total or not month_count:
            self.average_stories_per_month = 0
        else:
            self.average_stories_per_month = int(round(total / float(month_count)))
        self.save()

    def save_classifier_counts(self):
        from apps.analyzer.models import MClassifierTitle, MClassifierAuthor, MClassifierFeed, MClassifierTag

        def calculate_scores(cls, facet):
            map_f = """
                function() {
                    emit(this["%s"], {
                        pos: this.score>0 ? this.score : 0,
                        neg: this.score<0 ? Math.abs(this.score) : 0
                    });
                }
            """ % (facet)
            reduce_f = """
                function(key, values) {
                    var result = {pos: 0, neg: 0};
                    values.forEach(function(value) {
                        result.pos += value.pos;
                        result.neg += value.neg;
                    });
                    return result;
                }
            """
            scores = []
            res = cls.objects(feed_id=self.pk).map_reduce(map_f, reduce_f, output='inline')
            for r in res:
                facet_values = dict([(k, int(v)) for k,v in r.value.iteritems()])
                facet_values[facet] = r.key
                if facet_values['pos'] + facet_values['neg'] > 1:
                    scores.append(facet_values)
            scores = sorted(scores, key=lambda v: v['neg'] - v['pos'])

            return scores

        scores = {}
        for cls, facet in [(MClassifierTitle, 'title'),
                           (MClassifierAuthor, 'author'),
                           (MClassifierTag, 'tag'),
                           (MClassifierFeed, 'feed_id')]:
            scores[facet] = calculate_scores(cls, facet)
            if facet == 'feed_id' and scores[facet]:
                scores['feed'] = scores[facet]
                del scores['feed_id']
            elif not scores[facet]:
                del scores[facet]

        if scores:
            self.data.feed_classifier_counts = json.encode(scores)
            self.data.save()

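    # Run a fetch of this feed through the feed_fetcher Dispatcher, then refresh
    # scheduling state and the 'tasked_feeds'/'error_feeds' Redis bookkeeping.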
    def update(self, **kwargs):
        from utils import feed_fetcher
        r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)
        original_feed_id = int(self.pk)

        if getattr(settings, 'TEST_DEBUG', False):
            self.feed_address = self.feed_address.replace("%(NEWSBLUR_DIR)s", settings.NEWSBLUR_DIR)
            self.feed_link = self.feed_link.replace("%(NEWSBLUR_DIR)s", settings.NEWSBLUR_DIR)
            self.save()

        options = {
            'verbose': kwargs.get('verbose'),
            'timeout': 10,
            'single_threaded': kwargs.get('single_threaded', True),
            'force': kwargs.get('force'),
            'compute_scores': kwargs.get('compute_scores', True),
            'mongodb_replication_lag': kwargs.get('mongodb_replication_lag', None),
            'fake': kwargs.get('fake'),
            'quick': kwargs.get('quick'),
            'debug': kwargs.get('debug'),
            'fpf': kwargs.get('fpf'),
            'feed_xml': kwargs.get('feed_xml'),
        }
        disp = feed_fetcher.Dispatcher(options, 1)
        disp.add_jobs([[self.pk]])
        feed = disp.run_jobs()

        if feed:
            feed = Feed.get_by_id(feed.pk)
            if feed:
                feed.last_update = datetime.datetime.utcnow()
                feed.set_next_scheduled_update()
                r.zadd('fetched_feeds_last_hour', feed.pk, int(datetime.datetime.now().strftime('%s')))

        if not feed or original_feed_id != feed.pk:
            logging.info(" ---> ~FRFeed changed id, removing %s from tasked_feeds queue..." % original_feed_id)
            r.zrem('tasked_feeds', original_feed_id)
            r.zrem('error_feeds', original_feed_id)
        if feed:
            r.zrem('tasked_feeds', feed.pk)
            r.zrem('error_feeds', feed.pk)

        return feed

    @classmethod
    def get_by_id(cls, feed_id, feed_address=None):
        try:
            feed = Feed.objects.get(pk=feed_id)
            return feed
        except Feed.DoesNotExist:
            # Feed has been merged after updating. Find the right feed.
            duplicate_feeds = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
            if duplicate_feeds:
                return duplicate_feeds[0].feed
            if feed_address:
                duplicate_feeds = DuplicateFeed.objects.filter(duplicate_address=feed_address)
                if duplicate_feeds:
                    return duplicate_feeds[0].feed

    @classmethod
    def get_by_name(cls, query, limit=1):
        results = SearchFeed.query(query)
        feed_ids = [result.feed_id for result in results]

        if limit == 1:
            return Feed.get_by_id(feed_ids[0])
        else:
            return [Feed.get_by_id(f) for f in feed_ids][:limit]

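    # Compare freshly fetched stories against the existing stories for this feed:
    # new stories are inserted, changed stories are updated with an htmldiff of
    # their content, and a dict of counts {'new', 'updated', 'same', 'error'} is
    # returned.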
    def add_update_stories(self, stories, existing_stories, verbose=False):
        ret_values = dict(new=0, updated=0, same=0, error=0)
        error_count = self.error_count

        if settings.DEBUG or verbose:
            logging.debug(" ---> [%-30s] ~FBChecking ~SB%s~SN new/updated against ~SB%s~SN stories" % (
                          self.title[:30],
                          len(stories),
                          len(existing_stories.keys())))
        @timelimit(2)
        def _1(story, story_content, existing_stories):
            existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
            return existing_story, story_has_changed

        for story in stories:
            if not story.get('title'):
                continue

            story_content = story.get('story_content')
            if error_count:
                story_content = strip_comments__lxml(story_content)
            else:
                story_content = strip_comments(story_content)
            story_tags = self.get_tags(story)
            story_link = self.get_permalink(story)

            try:
                existing_story, story_has_changed = _1(story, story_content, existing_stories)
            except TimeoutError, e:
                logging.debug(' ---> [%-30s] ~SB~FRExisting story check timed out...' % (unicode(self)[:30]))
                existing_story = None
                story_has_changed = False

            if existing_story is None:
                if settings.DEBUG and False:
                    logging.debug(' ---> New story in feed (%s - %s): %s' % (self.feed_title, story.get('title'), len(story_content)))

                s = MStory(story_feed_id = self.pk,
                           story_date = story.get('published'),
                           story_title = story.get('title'),
                           story_content = story_content,
                           story_author_name = story.get('author'),
                           story_permalink = story_link,
                           story_guid = story.get('guid'),
                           story_tags = story_tags
                )
                s.extract_image_urls()
                try:
                    s.save()
                    ret_values['new'] += 1
                except (IntegrityError, OperationError), e:
                    ret_values['error'] += 1
                    if settings.DEBUG:
                        logging.info(' ---> [%-30s] ~SN~FRIntegrityError on new story: %s - %s' % (self.feed_title[:30], story.get('guid'), e))
            elif existing_story and story_has_changed:
                # update story
                original_content = None
                try:
                    if existing_story and existing_story.id:
                        try:
                            existing_story = MStory.objects.get(id=existing_story.id)
                        except ValidationError:
                            existing_story, _ = MStory.find_story(existing_story.story_feed_id,
                                                                  existing_story.id,
                                                                  original_only=True)
                    elif existing_story and existing_story.story_guid:
                        existing_story, _ = MStory.find_story(existing_story.story_feed_id,
                                                              existing_story.story_guid,
                                                              original_only=True)
                    else:
                        raise MStory.DoesNotExist
                except (MStory.DoesNotExist, OperationError), e:
                    ret_values['error'] += 1
                    if verbose:
                        logging.info(' ---> [%-30s] ~SN~FROperation on existing story: %s - %s' % (self.feed_title[:30], story.get('guid'), e))
                    continue
                if existing_story.story_original_content_z:
                    original_content = zlib.decompress(existing_story.story_original_content_z)
                elif existing_story.story_content_z:
                    original_content = zlib.decompress(existing_story.story_content_z)
                # print 'Type: %s %s' % (type(original_content), type(story_content))
                if story_content and len(story_content) > 10:
                    story_content_diff = htmldiff(unicode(original_content), unicode(story_content))
                else:
                    story_content_diff = original_content
                # logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
                # logging.debug("\t\tDiff content: %s" % diff.getDiff())
                # if existing_story.story_title != story.get('title'):
                #     logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
                if existing_story.story_guid != story.get('guid'):
                    self.update_story_with_new_guid(existing_story, story.get('guid'))

                if settings.DEBUG and False:
                    logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(story_content_diff), len(story_content)))

                existing_story.story_feed = self.pk
                existing_story.story_title = story.get('title')
                existing_story.story_content = story_content_diff
                existing_story.story_latest_content = story_content
                existing_story.story_original_content = original_content
                existing_story.story_author_name = story.get('author')
                existing_story.story_permalink = story_link
                existing_story.story_guid = story.get('guid')
                existing_story.story_tags = story_tags
                # Do not allow publishers to change the story date once a story is published.
                # Leads to incorrect unread story counts.
                # existing_story.story_date = story.get('published') # No, don't
                existing_story.extract_image_urls()

                try:
                    existing_story.save()
                    ret_values['updated'] += 1
                except (IntegrityError, OperationError):
                    ret_values['error'] += 1
                    if verbose:
                        logging.info(' ---> [%-30s] ~SN~FRIntegrityError on updated story: %s' % (self.feed_title[:30], story.get('title')[:30]))
                except ValidationError:
                    ret_values['error'] += 1
                    if verbose:
                        logging.info(' ---> [%-30s] ~SN~FRValidationError on updated story: %s' % (self.feed_title[:30], story.get('title')[:30]))
            else:
                ret_values['same'] += 1
                # logging.debug("Unchanged story: %s " % story.get('title'))

        return ret_values

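    # When a publisher changes a story's GUID, move the old story hash to the new
    # one in Redis (read states) and update any shared copies of the story.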
def update_story_with_new_guid(self, existing_story, new_story_guid):
|
2013-05-10 16:11:30 -07:00
|
|
|
from apps.reader.models import RUserStory
|
2012-08-10 14:22:51 -07:00
|
|
|
from apps.social.models import MSharedStory
|
2013-05-28 10:23:36 -07:00
|
|
|
|
|
|
|
existing_story.remove_from_redis()
|
2013-05-10 16:11:30 -07:00
|
|
|
|
2013-05-28 10:23:36 -07:00
|
|
|
old_hash = existing_story.story_hash
|
2013-06-04 15:34:03 -07:00
|
|
|
new_hash = MStory.ensure_story_hash(new_story_guid, self.pk)
|
2013-05-28 10:23:36 -07:00
|
|
|
RUserStory.switch_hash(feed_id=self.pk, old_hash=old_hash, new_hash=new_hash)
|
2013-05-10 16:11:30 -07:00
|
|
|
|
2012-08-10 14:22:51 -07:00
|
|
|
shared_stories = MSharedStory.objects.filter(story_feed_id=self.pk,
|
2013-05-28 10:23:36 -07:00
|
|
|
story_hash=old_hash)
|
2012-08-10 14:22:51 -07:00
|
|
|
for story in shared_stories:
|
|
|
|
story.story_guid = new_story_guid
|
2013-05-28 10:23:36 -07:00
|
|
|
story.story_hash = new_hash
|
2013-05-29 16:26:04 -07:00
|
|
|
try:
|
|
|
|
story.save()
|
|
|
|
except NotUniqueError:
|
|
|
|
# Story is already shared, skip.
|
|
|
|
pass
|
2012-03-21 13:54:37 -07:00
|
|
|
|
2011-02-05 15:34:43 -05:00
|
|
|
def save_popular_tags(self, feed_tags=None, verbose=False):
|
2010-07-01 15:16:33 -04:00
|
|
|
if not feed_tags:
|
2013-02-20 16:08:14 -08:00
|
|
|
all_tags = MStory.objects(story_feed_id=self.pk,
|
|
|
|
story_tags__exists=True).item_frequencies('story_tags')
|
2012-08-12 20:34:30 -07:00
|
|
|
feed_tags = sorted([(k, v) for k, v in all_tags.items() if int(v) > 0],
|
2010-08-21 23:49:36 -04:00
|
|
|
key=itemgetter(1),
|
2011-02-05 15:34:43 -05:00
|
|
|
reverse=True)[:25]
|
2010-07-01 15:16:33 -04:00
|
|
|
popular_tags = json.encode(feed_tags)
|
2012-08-12 20:34:30 -07:00
|
|
|
if verbose:
|
|
|
|
print "Found %s tags: %s" % (len(feed_tags), popular_tags)
|
2010-08-21 23:49:36 -04:00
|
|
|
|
|
|
|
# TODO: This len() bullshit will be gone when feeds move to mongo
|
|
|
|
# On second thought, it might stay, because we don't want
|
|
|
|
# popular tags the size of a small planet. I'm looking at you
|
|
|
|
# Tumblr writers.
|
2010-07-01 15:16:33 -04:00
|
|
|
if len(popular_tags) < 1024:
|
2011-01-17 22:48:38 -05:00
|
|
|
self.data.popular_tags = popular_tags
|
|
|
|
self.data.save()
|
2010-07-01 15:16:33 -04:00
|
|
|
return
|
|
|
|
|
2011-08-27 14:13:28 -07:00
|
|
|
tags_list = []
|
|
|
|
if feed_tags and isinstance(feed_tags, unicode):
|
2011-08-27 13:22:56 -07:00
|
|
|
tags_list = json.decode(feed_tags)
|
2011-08-27 14:13:28 -07:00
|
|
|
if len(tags_list) >= 1:
|
2010-07-01 15:16:33 -04:00
|
|
|
self.save_popular_tags(tags_list[:-1])
|
|
|
|
|
2010-11-05 20:34:17 -04:00
|
|
|
def save_popular_authors(self, feed_authors=None):
|
2010-07-01 15:16:33 -04:00
|
|
|
if not feed_authors:
|
2010-08-21 23:49:36 -04:00
|
|
|
authors = defaultdict(int)
|
|
|
|
for story in MStory.objects(story_feed_id=self.pk).only('story_author_name'):
|
|
|
|
authors[story.story_author_name] += 1
|
|
|
|
feed_authors = sorted([(k, v) for k, v in authors.items() if k],
|
|
|
|
key=itemgetter(1),
|
|
|
|
reverse=True)[:20]
|
|
|
|
|
2010-07-01 15:16:33 -04:00
|
|
|
popular_authors = json.encode(feed_authors)
|
2011-02-05 22:09:31 -05:00
|
|
|
if len(popular_authors) < 1023:
|
2011-01-17 22:48:38 -05:00
|
|
|
self.data.popular_authors = popular_authors
|
|
|
|
self.data.save()
|
2010-07-01 15:16:33 -04:00
|
|
|
return
|
|
|
|
|
2010-08-25 21:55:22 -04:00
|
|
|
if len(feed_authors) > 1:
|
2010-11-05 20:34:17 -04:00
|
|
|
self.save_popular_authors(feed_authors=feed_authors[:-1])
|
2013-06-03 17:20:36 -07:00
|
|
|
|
|
|
|
@classmethod
|
2013-06-04 11:26:01 -07:00
|
|
|
def trim_old_stories(cls, start=0, verbose=True, dryrun=False):
|
2013-06-03 17:58:27 -07:00
|
|
|
now = datetime.datetime.now()
|
2013-08-02 15:59:03 -07:00
|
|
|
month_ago = now - datetime.timedelta(days=settings.DAYS_OF_UNREAD_NEW)
|
2013-06-04 11:26:01 -07:00
|
|
|
feed_count = Feed.objects.latest('pk').pk
|
2013-08-12 16:48:16 -07:00
|
|
|
total = 0
|
2013-06-04 11:26:01 -07:00
|
|
|
for feed_id in xrange(start, feed_count):
|
|
|
|
if feed_id % 1000 == 0:
|
|
|
|
print "\n\n -------------------------- %s --------------------------\n\n" % feed_id
|
|
|
|
try:
|
|
|
|
feed = Feed.objects.get(pk=feed_id)
|
|
|
|
except Feed.DoesNotExist:
|
|
|
|
continue
|
|
|
|
if feed.active_subscribers > 0:
|
|
|
|
continue
|
|
|
|
if not feed.last_story_date or feed.last_story_date < month_ago:
|
|
|
|
months_ago = 6
|
|
|
|
if feed.last_story_date:
|
|
|
|
months_ago = int((now - feed.last_story_date).days / 30.0)
|
|
|
|
cutoff = max(1, 6 - months_ago)
|
|
|
|
if dryrun:
|
|
|
|
print " DRYRUN: %s cutoff - %s" % (cutoff, feed)
|
|
|
|
else:
|
2013-08-12 16:48:16 -07:00
|
|
|
total += MStory.trim_feed(feed=feed, cutoff=cutoff, verbose=verbose)
|
|
|
|
|
|
|
|
print " ---> Deleted %s stories in total." % total
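# A hypothetical pass over one feed, to make the cutoff arithmetic above
# concrete (numbers invented): a feed with 0 active subscribers whose last
# story is ~120 days old gets months_ago = int(120 / 30.0) = 4, so
# cutoff = max(1, 6 - 4) = 2, and MStory.trim_feed() keeps only its 2 newest
# stories. A feed with no last_story_date at all falls back to months_ago = 6,
# i.e. cutoff = 1.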
|
2013-06-21 12:30:06 -07:00
|
|
|
|
|
|
|
@property
|
|
|
|
def story_cutoff(self):
|
|
|
|
cutoff = 500
|
|
|
|
if self.active_subscribers <= 0:
|
|
|
|
cutoff = 25
|
2013-08-05 10:23:22 -07:00
|
|
|
elif self.active_premium_subscribers < 1:
|
2013-06-21 12:30:06 -07:00
|
|
|
cutoff = 100
|
2013-08-05 10:23:22 -07:00
|
|
|
elif self.active_premium_subscribers <= 2:
|
2013-06-21 12:30:06 -07:00
|
|
|
cutoff = 200
|
2013-08-05 10:23:22 -07:00
|
|
|
elif self.active_premium_subscribers <= 5:
|
2013-06-21 12:30:06 -07:00
|
|
|
cutoff = 300
|
2013-08-05 10:23:22 -07:00
|
|
|
elif self.active_premium_subscribers <= 10:
|
2013-06-21 12:30:06 -07:00
|
|
|
cutoff = 350
|
2013-08-05 10:23:22 -07:00
|
|
|
elif self.active_premium_subscribers <= 15:
|
2013-06-21 12:30:06 -07:00
|
|
|
cutoff = 400
|
2013-08-05 10:23:22 -07:00
|
|
|
elif self.active_premium_subscribers <= 20:
|
2013-06-21 12:30:06 -07:00
|
|
|
cutoff = 450
|
2013-08-05 10:23:22 -07:00
|
|
|
|
|
|
|
if self.active_subscribers and self.average_stories_per_month < 5 and self.stories_last_month < 5:
|
|
|
|
cutoff /= 2
|
|
|
|
if self.active_premium_subscribers <= 1 and self.average_stories_per_month <= 1 and self.stories_last_month <= 1:
|
|
|
|
cutoff /= 2
|
|
|
|
|
2013-06-21 12:30:06 -07:00
|
|
|
return cutoff
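# Worked example of the tiers above (purely illustrative): a feed with 4
# active subscribers, 3 of them premium, averaging 2 stories a month with 1
# story last month, resolves as:
#
#   cutoff = 300      # active_premium_subscribers <= 5
#   cutoff /= 2       # -> 150: active subscribers but fewer than 5 stories/month
#                     # the second halving is skipped (premium subs > 1)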
|
|
|
|
|
2013-06-03 17:20:36 -07:00
|
|
|
def trim_feed(self, verbose=False, cutoff=None):
|
|
|
|
if not cutoff:
|
2013-06-21 12:30:06 -07:00
|
|
|
cutoff = self.story_cutoff
|
2013-06-03 17:48:11 -07:00
|
|
|
MStory.trim_feed(feed=self, cutoff=cutoff, verbose=verbose)
|
2012-10-19 18:33:28 -07:00
|
|
|
|
2013-04-15 14:30:31 -07:00
|
|
|
# @staticmethod
|
|
|
|
# def clean_invalid_ids():
|
|
|
|
# history = MFeedFetchHistory.objects(status_code=500, exception__contains='InvalidId:')
|
|
|
|
# urls = set()
|
|
|
|
# for h in history:
|
|
|
|
# u = re.split('InvalidId: (.*?) is not a valid ObjectId\\n$', h.exception)[1]
|
|
|
|
# urls.add((h.feed_id, u))
|
|
|
|
#
|
|
|
|
# for f, u in urls:
|
|
|
|
# print "db.stories.remove({\"story_feed_id\": %s, \"_id\": \"%s\"})" % (f, u)
|
2012-10-19 18:33:28 -07:00
|
|
|
|
2009-08-01 04:26:57 +00:00
|
|
|
|
2012-01-09 13:55:26 -08:00
|
|
|
def get_stories(self, offset=0, limit=25, force=False):
|
|
|
|
stories_db = MStory.objects(story_feed_id=self.pk)[offset:offset+limit]
|
|
|
|
stories = self.format_stories(stories_db, self.pk)
|
2009-07-28 02:27:27 +00:00
|
|
|
|
|
|
|
return stories
|
|
|
|
|
2013-08-07 15:43:25 -07:00
|
|
|
@classmethod
|
|
|
|
def find_feed_stories(cls, feed_ids, query, offset=0, limit=25):
|
|
|
|
stories_db = MStory.objects(
|
|
|
|
Q(story_feed_id__in=feed_ids) &
|
|
|
|
(Q(story_title__icontains=query) |
|
|
|
|
Q(story_author_name__icontains=query) |
|
|
|
|
Q(story_tags__icontains=query))
|
|
|
|
).order_by('-story_date')[offset:offset+limit]
|
|
|
|
stories = cls.format_stories(stories_db)
|
|
|
|
|
|
|
|
return stories
|
|
|
|
|
2012-12-19 14:21:46 -08:00
|
|
|
def find_stories(self, query, offset=0, limit=25):
|
|
|
|
stories_db = MStory.objects(
|
|
|
|
Q(story_feed_id=self.pk) &
|
|
|
|
(Q(story_title__icontains=query) |
|
2013-07-30 12:01:45 -07:00
|
|
|
Q(story_author_name__icontains=query) |
|
|
|
|
Q(story_tags__icontains=query))
|
|
|
|
).order_by('-story_date')[offset:offset+limit]
|
2012-12-19 14:21:46 -08:00
|
|
|
stories = self.format_stories(stories_db, self.pk)
|
|
|
|
|
|
|
|
return stories
|
|
|
|
|
2010-12-02 20:18:33 -05:00
|
|
|
@classmethod
|
2013-05-14 16:36:03 -07:00
|
|
|
def format_stories(cls, stories_db, feed_id=None, include_permalinks=False):
|
2010-01-21 13:12:29 -05:00
|
|
|
stories = []
|
2010-10-07 19:56:23 -04:00
|
|
|
|
2010-01-21 13:12:29 -05:00
|
|
|
for story_db in stories_db:
|
2013-05-14 16:36:03 -07:00
|
|
|
story = cls.format_story(story_db, feed_id, include_permalinks=include_permalinks)
|
2010-01-21 13:12:29 -05:00
|
|
|
stories.append(story)
|
|
|
|
|
|
|
|
return stories
|
2011-05-08 19:41:50 -04:00
|
|
|
|
|
|
|
@classmethod
|
2013-05-14 16:36:03 -07:00
|
|
|
def format_story(cls, story_db, feed_id=None, text=False, include_permalinks=False):
|
2012-09-04 11:46:41 -07:00
|
|
|
if isinstance(story_db.story_content_z, unicode):
|
|
|
|
story_db.story_content_z = story_db.story_content_z.decode('base64')
|
|
|
|
|
2011-12-24 14:45:19 -08:00
|
|
|
story_content = story_db.story_content_z and zlib.decompress(story_db.story_content_z) or ''
|
2011-05-08 19:41:50 -04:00
|
|
|
story = {}
|
2013-04-29 15:27:22 -07:00
|
|
|
story['story_hash'] = getattr(story_db, 'story_hash', None)
|
2011-05-08 19:41:50 -04:00
|
|
|
story['story_tags'] = story_db.story_tags or []
|
2012-09-04 11:46:41 -07:00
|
|
|
story['story_date'] = story_db.story_date.replace(tzinfo=None)
|
2013-05-26 16:32:48 -07:00
|
|
|
story['story_timestamp'] = story_db.story_date.strftime('%s')
|
2011-05-08 19:41:50 -04:00
|
|
|
story['story_authors'] = story_db.story_author_name
|
|
|
|
story['story_title'] = story_db.story_title
|
2011-12-24 14:45:19 -08:00
|
|
|
story['story_content'] = story_content
|
2012-04-06 13:38:21 -07:00
|
|
|
story['story_permalink'] = story_db.story_permalink
|
2013-06-26 11:38:49 -07:00
|
|
|
story['image_urls'] = story_db.image_urls
|
2011-05-08 19:41:50 -04:00
|
|
|
story['story_feed_id'] = feed_id or story_db.story_feed_id
|
2012-01-15 20:51:48 -08:00
|
|
|
story['comment_count'] = story_db.comment_count if hasattr(story_db, 'comment_count') else 0
|
|
|
|
story['comment_user_ids'] = story_db.comment_user_ids if hasattr(story_db, 'comment_user_ids') else []
|
|
|
|
story['share_count'] = story_db.share_count if hasattr(story_db, 'share_count') else 0
|
|
|
|
story['share_user_ids'] = story_db.share_user_ids if hasattr(story_db, 'share_user_ids') else []
|
|
|
|
story['guid_hash'] = story_db.guid_hash if hasattr(story_db, 'guid_hash') else None
|
2012-04-30 11:52:19 -07:00
|
|
|
if hasattr(story_db, 'source_user_id'):
|
|
|
|
story['source_user_id'] = story_db.source_user_id
|
2011-11-24 15:19:53 -05:00
|
|
|
story['id'] = story_db.story_guid or story_db.story_date
|
2011-05-08 19:41:50 -04:00
|
|
|
if hasattr(story_db, 'starred_date'):
|
|
|
|
story['starred_date'] = story_db.starred_date
|
2012-01-24 09:02:23 -08:00
|
|
|
if hasattr(story_db, 'shared_date'):
|
|
|
|
story['shared_date'] = story_db.shared_date
|
2013-05-14 16:36:03 -07:00
|
|
|
if include_permalinks and hasattr(story_db, 'blurblog_permalink'):
|
2012-11-27 11:59:54 -08:00
|
|
|
story['blurblog_permalink'] = story_db.blurblog_permalink()
|
2011-05-08 19:41:50 -04:00
|
|
|
if text:
|
|
|
|
soup = BeautifulSoup(story['story_content'])
|
|
|
|
text = ''.join(soup.findAll(text=True))
|
2011-05-08 20:21:09 -04:00
|
|
|
text = re.sub(r'\n+', '\n\n', text)
|
2011-05-08 19:41:50 -04:00
|
|
|
text = re.sub(r'\t+', '\t', text)
|
|
|
|
story['text'] = text
|
2012-06-26 19:19:57 -07:00
|
|
|
if '<ins' in story['story_content'] or '<del' in story['story_content']:
|
|
|
|
story['has_modifications'] = True
|
2011-11-24 15:19:53 -05:00
|
|
|
|
2011-05-08 19:41:50 -04:00
|
|
|
return story
|
2012-01-09 13:55:26 -08:00
|
|
|
|
2010-01-04 22:26:53 +00:00
|
|
|
def get_tags(self, entry):
|
|
|
|
fcat = []
|
|
|
|
if entry.has_key('tags'):
|
|
|
|
for tcat in entry.tags:
|
2012-07-30 06:32:34 -07:00
|
|
|
term = None
|
2011-02-15 21:08:40 -05:00
|
|
|
if hasattr(tcat, 'label') and tcat.label:
|
2010-01-04 22:26:53 +00:00
|
|
|
term = tcat.label
|
2012-07-25 19:11:59 -07:00
|
|
|
elif hasattr(tcat, 'term') and tcat.term:
|
2010-01-04 22:26:53 +00:00
|
|
|
term = tcat.term
|
2012-07-30 06:32:34 -07:00
|
|
|
if not term:
|
2010-07-06 18:16:41 -04:00
|
|
|
continue
|
2010-01-04 22:26:53 +00:00
|
|
|
qcat = term.strip()
|
|
|
|
if ',' in qcat or '/' in qcat:
|
|
|
|
qcat = qcat.replace(',', '/').split('/')
|
|
|
|
else:
|
|
|
|
qcat = [qcat]
|
|
|
|
for zcat in qcat:
|
|
|
|
tagname = zcat.lower()
|
|
|
|
while ' ' in tagname:
|
|
|
|
tagname = tagname.replace(' ', ' ')
|
|
|
|
tagname = tagname.strip()
|
|
|
|
if not tagname or tagname == ' ':
|
|
|
|
continue
|
2010-08-21 20:42:38 -04:00
|
|
|
fcat.append(tagname)
|
2012-07-21 16:38:37 -07:00
|
|
|
fcat = [strip_tags(t)[:250] for t in fcat[:12]]
|
|
|
|
return fcat
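# Example of the normalization above (hypothetical feedparser entry): a tag
# term of "Python, Django/Web Dev" is split on commas and slashes, lowercased,
# has runs of spaces collapsed, then stripped of markup and truncated, so
# get_tags() would return ['python', 'django', 'web dev'] for that entry.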
|
2011-12-08 11:19:04 -08:00
|
|
|
|
|
|
|
def get_permalink(self, entry):
|
|
|
|
link = entry.get('link')
|
|
|
|
if not link:
|
|
|
|
links = entry.get('links')
|
|
|
|
if links:
|
2011-12-08 14:51:52 -08:00
|
|
|
link = links[0].get('href')
|
|
|
|
if not link:
|
|
|
|
link = entry.get('id')
|
2011-12-08 11:19:04 -08:00
|
|
|
return link
|
|
|
|
|
2009-08-01 04:26:57 +00:00
|
|
|
def _exists_story(self, story=None, story_content=None, existing_stories=None):
|
2009-08-30 00:43:13 +00:00
|
|
|
story_in_system = None
|
|
|
|
story_has_changed = False
|
2011-12-14 23:26:07 -08:00
|
|
|
story_link = self.get_permalink(story)
|
2013-01-28 16:45:48 -08:00
|
|
|
existing_stories_guids = existing_stories.keys()
|
2012-12-24 00:10:40 -08:00
|
|
|
# story_pub_date = story.get('published')
|
|
|
|
# story_published_now = story.get('published_now', False)
|
|
|
|
# start_date = story_pub_date - datetime.timedelta(hours=8)
|
|
|
|
# end_date = story_pub_date + datetime.timedelta(hours=8)
|
|
|
|
|
2013-01-28 16:45:48 -08:00
|
|
|
for existing_story in existing_stories.values():
|
2009-08-30 00:43:13 +00:00
|
|
|
content_ratio = 0
|
2012-12-24 00:10:40 -08:00
|
|
|
# existing_story_pub_date = existing_story.story_date
|
2010-01-28 13:28:27 -05:00
|
|
|
# print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
|
2012-12-24 00:10:40 -08:00
|
|
|
|
|
|
|
if 'story_latest_content_z' in existing_story:
|
|
|
|
existing_story_content = unicode(zlib.decompress(existing_story.story_latest_content_z))
|
|
|
|
elif 'story_latest_content' in existing_story:
|
|
|
|
existing_story_content = existing_story.story_latest_content
|
|
|
|
elif 'story_content_z' in existing_story:
|
|
|
|
existing_story_content = unicode(zlib.decompress(existing_story.story_content_z))
|
|
|
|
elif 'story_content' in existing_story:
|
|
|
|
existing_story_content = existing_story.story_content
|
|
|
|
else:
|
|
|
|
existing_story_content = u''
|
2009-08-30 00:43:13 +00:00
|
|
|
|
2012-12-24 00:10:40 -08:00
|
|
|
if isinstance(existing_story.id, unicode):
|
|
|
|
existing_story.story_guid = existing_story.id
|
2013-01-28 16:45:48 -08:00
|
|
|
if (story.get('guid') in existing_stories_guids and
|
|
|
|
story.get('guid') != existing_story.story_guid):
|
|
|
|
continue
|
|
|
|
elif story.get('guid') == existing_story.story_guid:
|
2012-12-24 00:10:40 -08:00
|
|
|
story_in_system = existing_story
|
|
|
|
|
|
|
|
# Title distance + content distance, checking if story changed
|
|
|
|
story_title_difference = abs(levenshtein_distance(story.get('title'),
|
|
|
|
existing_story.story_title))
|
|
|
|
|
|
|
|
seq = difflib.SequenceMatcher(None, story_content, existing_story_content)
|
|
|
|
|
|
|
|
if (seq
|
|
|
|
and story_content
|
2013-07-05 17:49:06 -07:00
|
|
|
and len(story_content) > 1000
|
2012-12-24 00:10:40 -08:00
|
|
|
and existing_story_content
|
|
|
|
and seq.real_quick_ratio() > .9
|
|
|
|
and seq.quick_ratio() > .95):
|
|
|
|
content_ratio = seq.ratio()
|
2009-08-30 00:43:13 +00:00
|
|
|
|
2012-12-24 00:10:40 -08:00
|
|
|
if story_title_difference > 0 and content_ratio > .98:
|
|
|
|
story_in_system = existing_story
|
|
|
|
if story_title_difference > 0 or content_ratio < 1.0:
|
2013-06-04 11:26:01 -07:00
|
|
|
if settings.DEBUG and False:
|
2013-05-29 15:31:26 -07:00
|
|
|
logging.debug(" ---> Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio))
|
2009-08-30 00:43:13 +00:00
|
|
|
story_has_changed = True
|
|
|
|
break
|
2012-12-24 00:10:40 -08:00
|
|
|
|
|
|
|
# More restrictive content distance, still no story match
|
|
|
|
if not story_in_system and content_ratio > .98:
|
2013-06-04 11:26:01 -07:00
|
|
|
if settings.DEBUG and False:
|
2013-05-29 15:31:26 -07:00
|
|
|
logging.debug(" ---> Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio))
|
2012-12-24 00:10:40 -08:00
|
|
|
story_in_system = existing_story
|
|
|
|
story_has_changed = True
|
|
|
|
break
|
|
|
|
|
|
|
|
if story_in_system and not story_has_changed:
|
|
|
|
if story_content != existing_story_content:
|
2013-06-04 11:26:01 -07:00
|
|
|
if settings.DEBUG and False:
|
2013-05-29 15:31:26 -07:00
|
|
|
logging.debug(" ---> Content difference - %s/%s" % (story_content, existing_story_content))
|
2012-12-24 00:10:40 -08:00
|
|
|
story_has_changed = True
|
|
|
|
if story_link != existing_story.story_permalink:
|
2013-06-04 11:26:01 -07:00
|
|
|
if settings.DEBUG and False:
|
2013-05-29 15:31:26 -07:00
|
|
|
logging.debug(" ---> Permalink difference - %s/%s" % (story_link, existing_story.story_permalink))
|
2012-12-24 00:10:40 -08:00
|
|
|
story_has_changed = True
|
|
|
|
# if story_pub_date != existing_story.story_date:
|
|
|
|
# story_has_changed = True
|
|
|
|
break
|
2011-12-08 11:19:04 -08:00
|
|
|
|
2010-01-28 13:28:27 -05:00
|
|
|
|
2010-02-02 18:01:02 -05:00
|
|
|
# if story_has_changed or not story_in_system:
|
2012-07-22 12:25:09 -07:00
|
|
|
# print 'New/updated story: %s' % (story),
|
2009-08-30 00:43:13 +00:00
|
|
|
return story_in_system, story_has_changed
|
2013-08-06 13:54:06 -07:00
|
|
|
|
2012-01-09 19:08:22 -08:00
|
|
|
def get_next_scheduled_update(self, force=False, verbose=True):
|
2010-12-23 13:29:31 -05:00
|
|
|
if self.min_to_decay and not force:
|
2013-04-18 16:56:54 -07:00
|
|
|
return self.min_to_decay
|
2013-04-23 15:44:31 -07:00
|
|
|
|
2013-04-23 17:04:21 -07:00
|
|
|
upd = self.stories_last_month / 30.0
|
2013-04-23 16:03:45 -07:00
|
|
|
subs = (self.active_premium_subscribers +
|
|
|
|
((self.active_subscribers - self.active_premium_subscribers) / 10.0))
|
2013-04-23 15:44:31 -07:00
|
|
|
# UPD = 1 Subs > 1: t = 5 # 11625 * 1440/5 = 3348000
|
|
|
|
# UPD = 1 Subs = 1: t = 60 # 17231 * 1440/60 = 413544
|
|
|
|
# UPD < 1 Subs > 1: t = 60 # 37904 * 1440/60 = 909696
|
|
|
|
# UPD < 1 Subs = 1: t = 60 * 12 # 143012 * 1440/(60*12) = 286024
|
|
|
|
# UPD = 0 Subs > 1: t = 60 * 3 # 28351 * 1440/(60*3) = 226808
|
|
|
|
# UPD = 0 Subs = 1: t = 60 * 24 # 807690 * 1440/(60*24) = 807690
# (The t values in this table are historical; the branches below use the
# current constants, e.g. 10 min and 60*6.)
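# Illustrative walk-through of the branches below (numbers invented): a feed
# with 15 stories last month and 3 active premium subscribers has
# upd = 15 / 30.0 = 0.5 and subs > 1, so total = 60 - (0.5 * 60) = 30 minutes.
# If its last story is recent, the months-since-last-story multiplier stays at
# 1 and the feed is fetched roughly every half hour, before the push and cap
# adjustments further down.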
|
|
|
|
if upd >= 1:
|
|
|
|
if subs > 1:
|
2013-04-23 17:04:21 -07:00
|
|
|
total = 10
|
2013-04-23 15:44:31 -07:00
|
|
|
else:
|
|
|
|
total = 60
|
|
|
|
elif upd > 0:
|
|
|
|
if subs > 1:
|
|
|
|
total = 60 - (upd * 60)
|
|
|
|
else:
|
|
|
|
total = 60*12 - (upd * 60*12)
|
|
|
|
elif upd == 0:
|
|
|
|
if subs > 1:
|
2013-04-23 17:04:21 -07:00
|
|
|
total = 60 * 6
|
2013-04-23 15:44:31 -07:00
|
|
|
else:
|
|
|
|
total = 60 * 24
|
2013-07-02 10:36:16 -04:00
|
|
|
months_since_last_story = seconds_timesince(self.last_story_date) / (60*60*24*30)
|
2013-04-23 17:04:21 -07:00
|
|
|
total *= max(1, months_since_last_story)
|
2013-04-23 15:44:31 -07:00
|
|
|
# updates_per_day_delay = 3 * 60 / max(.25, ((max(0, self.active_subscribers)**.2)
|
|
|
|
# * (self.stories_last_month**0.25)))
|
|
|
|
# if self.active_premium_subscribers > 0:
|
|
|
|
# updates_per_day_delay /= min(self.active_subscribers+self.active_premium_subscribers, 4)
|
|
|
|
# updates_per_day_delay = int(updates_per_day_delay)
|
2013-03-28 12:17:30 -07:00
|
|
|
|
2010-07-02 15:49:08 -04:00
|
|
|
# Lots of subscribers = lots of updates
|
2011-04-02 00:17:59 -04:00
|
|
|
# 24 hours for 0 subscribers.
|
|
|
|
# 4 hours for 1 subscriber.
|
|
|
|
# .5 hours for 2 subscribers.
|
|
|
|
# .25 hours for 3 subscribers.
|
|
|
|
# 1 min for 10 subscribers.
|
2013-04-23 15:44:31 -07:00
|
|
|
# subscriber_bonus = 6 * 60 / max(.167, max(0, self.active_subscribers)**3)
|
|
|
|
# if self.premium_subscribers > 0:
|
|
|
|
# subscriber_bonus /= min(self.active_subscribers+self.premium_subscribers, 5)
|
|
|
|
# subscriber_bonus = int(subscriber_bonus)
|
2013-02-07 15:30:35 -08:00
|
|
|
|
2012-03-28 15:49:21 -07:00
|
|
|
if self.is_push:
|
2013-08-14 18:01:12 -07:00
|
|
|
fetch_history = MFetchHistory.feed(self.pk)
|
|
|
|
if len(fetch_history['push_history']):
|
|
|
|
total = total * 12
|
2012-12-21 16:48:47 -08:00
|
|
|
|
2013-04-23 17:04:21 -07:00
|
|
|
# 2 day max
|
2013-08-15 12:20:37 -07:00
|
|
|
total = min(total, 60*24*2)
|
2013-02-13 12:54:14 -08:00
|
|
|
|
2012-01-09 19:08:22 -08:00
|
|
|
if verbose:
|
2013-04-23 15:44:31 -07:00
|
|
|
logging.debug(" ---> [%-30s] Fetched every %s min - Subs: %s/%s/%s Stories: %s" % (
|
2013-03-28 12:17:30 -07:00
|
|
|
unicode(self)[:30], total,
|
|
|
|
self.num_subscribers,
|
2013-04-23 15:44:31 -07:00
|
|
|
self.active_subscribers,
|
2013-03-28 12:17:30 -07:00
|
|
|
self.active_premium_subscribers,
|
2013-04-23 16:03:45 -07:00
|
|
|
upd))
|
2013-04-18 16:56:54 -07:00
|
|
|
return total
|
2010-07-25 23:13:27 -04:00
|
|
|
|
2013-03-28 12:17:30 -07:00
|
|
|
def set_next_scheduled_update(self, verbose=False, skip_scheduling=False):
|
2013-03-30 19:05:13 -07:00
|
|
|
r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)
|
2013-04-18 16:56:54 -07:00
|
|
|
total = self.get_next_scheduled_update(force=True, verbose=verbose)
|
2013-04-08 10:50:50 -07:00
|
|
|
error_count = self.error_count
|
2010-12-23 13:29:31 -05:00
|
|
|
|
2013-04-08 10:50:50 -07:00
|
|
|
if error_count:
|
|
|
|
total = total * error_count
|
2013-08-15 12:20:37 -07:00
|
|
|
total = min(total, 60*24*7)
|
2012-12-25 12:08:17 -08:00
|
|
|
if verbose:
|
|
|
|
logging.debug(' ---> [%-30s] ~FBScheduling feed fetch geometrically: '
|
|
|
|
'~SB%s errors. Time: %s min' % (
|
|
|
|
unicode(self)[:30], self.errors_since_good, total))
|
2013-04-18 16:56:54 -07:00
|
|
|
|
|
|
|
random_factor = random.randint(0, total) / 4
|
2010-10-10 23:55:00 -04:00
|
|
|
next_scheduled_update = datetime.datetime.utcnow() + datetime.timedelta(
|
2010-09-07 14:02:48 -07:00
|
|
|
minutes = total + random_factor)
|
2013-03-30 19:05:13 -07:00
|
|
|
|
2010-12-23 13:29:31 -05:00
|
|
|
self.min_to_decay = total
|
2013-04-23 13:46:07 -07:00
|
|
|
delta = self.next_scheduled_update - datetime.datetime.now()
|
|
|
|
minutes_to_next_fetch = delta.total_seconds() / 60
|
|
|
|
if minutes_to_next_fetch > self.min_to_decay or not skip_scheduling:
|
2013-03-28 12:17:30 -07:00
|
|
|
self.next_scheduled_update = next_scheduled_update
|
2013-04-18 16:47:55 -07:00
|
|
|
if self.active_subscribers >= 1:
|
|
|
|
r.zadd('scheduled_updates', self.pk, self.next_scheduled_update.strftime('%s'))
|
2013-04-03 17:22:45 -07:00
|
|
|
r.zrem('tasked_feeds', self.pk)
|
2013-04-08 10:50:50 -07:00
|
|
|
r.srem('queued_feeds', self.pk)
|
2013-04-03 17:22:45 -07:00
|
|
|
|
2010-11-05 20:34:17 -04:00
|
|
|
self.save()
|
2013-03-30 19:05:13 -07:00
|
|
|
|
2013-04-08 10:50:50 -07:00
|
|
|
|
|
|
|
@property
|
|
|
|
def error_count(self):
|
|
|
|
r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)
|
|
|
|
fetch_errors = int(r.zscore('error_feeds', self.pk) or 0)
|
|
|
|
|
|
|
|
return fetch_errors + self.errors_since_good
|
|
|
|
|
2013-01-02 12:27:08 -08:00
|
|
|
def schedule_feed_fetch_immediately(self, verbose=True):
|
2013-03-30 19:05:13 -07:00
|
|
|
r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)
|
2013-01-02 12:27:08 -08:00
|
|
|
if verbose:
|
|
|
|
logging.debug(' ---> [%-30s] Scheduling feed fetch immediately...' % (unicode(self)[:30]))
|
|
|
|
|
2010-10-10 23:55:00 -04:00
|
|
|
self.next_scheduled_update = datetime.datetime.utcnow()
|
2013-03-30 19:05:13 -07:00
|
|
|
r.zadd('scheduled_updates', self.pk, self.next_scheduled_update.strftime('%s'))
|
2010-08-25 19:10:55 -04:00
|
|
|
|
2012-03-27 17:34:39 -07:00
|
|
|
return self.save()
|
2010-07-27 23:29:04 -04:00
|
|
|
|
2012-03-27 18:37:04 -07:00
|
|
|
def setup_push(self):
|
|
|
|
from apps.push.models import PushSubscription
|
2012-03-28 16:49:15 -07:00
|
|
|
try:
|
|
|
|
push = self.push
|
|
|
|
except PushSubscription.DoesNotExist:
|
|
|
|
self.is_push = False
|
|
|
|
else:
|
2012-03-27 18:37:04 -07:00
|
|
|
self.is_push = push.verified
|
2010-11-05 20:34:17 -04:00
|
|
|
self.save()
|
2012-03-28 15:49:21 -07:00
|
|
|
|
|
|
|
def queue_pushed_feed_xml(self, xml):
|
2013-06-18 12:21:27 -07:00
|
|
|
r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)
|
2012-12-26 02:41:13 -08:00
|
|
|
queue_size = r.llen("push_feeds")
|
2010-07-27 23:29:04 -04:00
|
|
|
|
2012-12-26 02:41:13 -08:00
|
|
|
if queue_size > 1000:
|
|
|
|
self.schedule_feed_fetch_immediately()
|
|
|
|
else:
|
|
|
|
logging.debug(' ---> [%-30s] [%s] ~FBQueuing pushed stories...' % (unicode(self)[:30], self.pk))
|
|
|
|
self.set_next_scheduled_update()
|
|
|
|
PushFeeds.apply_async(args=(self.pk, xml), queue='push_feeds')
|
|
|
|
|
2011-09-19 08:56:16 -07:00
|
|
|
# def calculate_collocations_story_content(self,
|
|
|
|
# collocation_measures=TrigramAssocMeasures,
|
|
|
|
# collocation_finder=TrigramCollocationFinder):
|
|
|
|
# stories = MStory.objects.filter(story_feed_id=self.pk)
|
|
|
|
# story_content = ' '.join([s.story_content for s in stories if s.story_content])
|
|
|
|
# return self.calculate_collocations(story_content, collocation_measures, collocation_finder)
|
|
|
|
#
|
|
|
|
# def calculate_collocations_story_title(self,
|
|
|
|
# collocation_measures=BigramAssocMeasures,
|
|
|
|
# collocation_finder=BigramCollocationFinder):
|
|
|
|
# stories = MStory.objects.filter(story_feed_id=self.pk)
|
|
|
|
# story_titles = ' '.join([s.story_title for s in stories if s.story_title])
|
|
|
|
# return self.calculate_collocations(story_titles, collocation_measures, collocation_finder)
|
|
|
|
#
|
|
|
|
# def calculate_collocations(self, content,
|
|
|
|
# collocation_measures=TrigramAssocMeasures,
|
|
|
|
# collocation_finder=TrigramCollocationFinder):
|
|
|
|
# content = re.sub(r'’', '\'', content)
|
|
|
|
# content = re.sub(r'&amp;', '&', content)
|
|
|
|
# try:
|
|
|
|
# content = unicode(BeautifulStoneSoup(content,
|
|
|
|
# convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
|
|
|
# except ValueError, e:
|
|
|
|
# print "ValueError, ignoring: %s" % e
|
|
|
|
# content = re.sub(r'</?\w+\s+[^>]*>', '', content)
|
|
|
|
# content = re.split(r"[^A-Za-z-'&]+", content)
|
|
|
|
#
|
|
|
|
# finder = collocation_finder.from_words(content)
|
|
|
|
# finder.apply_freq_filter(3)
|
|
|
|
# best = finder.nbest(collocation_measures.pmi, 10)
|
|
|
|
# phrases = [' '.join(phrase) for phrase in best]
|
|
|
|
#
|
|
|
|
# return phrases
|
2010-07-27 22:11:23 -04:00
|
|
|
|
2010-07-27 22:37:52 -04:00
|
|
|
# class FeedCollocations(models.Model):
|
|
|
|
# feed = models.ForeignKey(Feed)
|
|
|
|
# phrase = models.CharField(max_length=500)
|
2009-06-16 03:08:55 +00:00
|
|
|
|
2011-01-17 20:23:29 -05:00
|
|
|
class FeedData(models.Model):
|
2011-01-17 22:48:38 -05:00
|
|
|
feed = AutoOneToOneField(Feed, related_name='data')
|
2011-02-06 15:43:13 -05:00
|
|
|
feed_tagline = models.CharField(max_length=1024, blank=True, null=True)
|
2011-01-17 20:23:29 -05:00
|
|
|
story_count_history = models.TextField(blank=True, null=True)
|
2011-04-07 17:00:28 -04:00
|
|
|
feed_classifier_counts = models.TextField(blank=True, null=True)
|
2011-01-17 20:23:29 -05:00
|
|
|
popular_tags = models.CharField(max_length=1024, blank=True, null=True)
|
|
|
|
popular_authors = models.CharField(max_length=2048, blank=True, null=True)
|
2009-06-16 03:08:55 +00:00
|
|
|
|
2011-01-17 22:48:38 -05:00
|
|
|
def save(self, *args, **kwargs):
|
2011-02-05 22:15:03 -05:00
|
|
|
if self.feed_tagline and len(self.feed_tagline) >= 1000:
|
|
|
|
self.feed_tagline = self.feed_tagline[:1000]
|
2009-12-18 20:47:44 +00:00
|
|
|
|
2011-01-21 20:29:19 -05:00
|
|
|
try:
|
|
|
|
super(FeedData, self).save(*args, **kwargs)
|
|
|
|
except (IntegrityError, OperationError):
|
2011-02-05 22:09:31 -05:00
|
|
|
if hasattr(self, 'id') and self.id: self.delete()
|
2010-05-20 15:13:25 -04:00
|
|
|
|
2011-01-27 19:05:50 -05:00
|
|
|
|
2011-04-21 23:10:43 -04:00
|
|
|
class MFeedIcon(mongo.Document):
|
2012-03-29 16:03:06 -07:00
|
|
|
feed_id = mongo.IntField(primary_key=True)
|
|
|
|
color = mongo.StringField(max_length=6)
|
|
|
|
data = mongo.StringField()
|
|
|
|
icon_url = mongo.StringField()
|
|
|
|
not_found = mongo.BooleanField(default=False)
|
2011-04-21 23:10:43 -04:00
|
|
|
|
|
|
|
meta = {
|
|
|
|
'collection' : 'feed_icons',
|
|
|
|
'allow_inheritance' : False,
|
|
|
|
}
|
|
|
|
|
|
|
|
def save(self, *args, **kwargs):
|
|
|
|
if self.icon_url:
|
|
|
|
self.icon_url = unicode(self.icon_url)
|
|
|
|
try:
|
2013-05-29 19:37:50 -07:00
|
|
|
return super(MFeedIcon, self).save(*args, **kwargs)
|
2011-04-21 23:10:43 -04:00
|
|
|
except (IntegrityError, OperationError):
|
|
|
|
# print "Error on Icon: %s" % e
|
|
|
|
if hasattr(self, '_id'): self.delete()
|
|
|
|
|
|
|
|
|
2010-08-27 18:35:33 -04:00
|
|
|
class MFeedPage(mongo.Document):
|
|
|
|
feed_id = mongo.IntField(primary_key=True)
|
2010-08-29 12:35:09 -04:00
|
|
|
page_data = mongo.BinaryField()
|
2010-08-27 18:35:33 -04:00
|
|
|
|
|
|
|
meta = {
|
2010-08-29 12:35:09 -04:00
|
|
|
'collection': 'feed_pages',
|
2010-08-27 18:35:33 -04:00
|
|
|
'allow_inheritance': False,
|
|
|
|
}
|
|
|
|
|
|
|
|
def save(self, *args, **kwargs):
|
2010-08-29 12:35:09 -04:00
|
|
|
if self.page_data:
|
|
|
|
self.page_data = zlib.compress(self.page_data)
|
2013-05-29 19:37:50 -07:00
|
|
|
return super(MFeedPage, self).save(*args, **kwargs)
|
2011-01-29 19:16:40 -05:00
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def get_data(cls, feed_id):
|
|
|
|
data = None
|
|
|
|
feed_page = cls.objects(feed_id=feed_id)
|
|
|
|
if feed_page:
|
2012-04-24 17:40:34 -07:00
|
|
|
page_data_z = feed_page[0].page_data
|
|
|
|
if page_data_z:
|
|
|
|
data = zlib.decompress(page_data_z)
|
2011-01-29 19:16:40 -05:00
|
|
|
|
|
|
|
if not data:
|
|
|
|
dupe_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
|
|
|
|
if dupe_feed:
|
|
|
|
feed = dupe_feed[0].feed
|
|
|
|
feed_page = MFeedPage.objects.filter(feed_id=feed.pk)
|
|
|
|
if feed_page:
|
2012-04-24 17:40:34 -07:00
|
|
|
page_data_z = feed_page[0].page_data
|
|
|
|
if page_data_z:
|
|
|
|
data = zlib.decompress(feed_page[0].page_data)
|
2012-03-29 14:45:19 -07:00
|
|
|
|
2011-01-29 19:16:40 -05:00
|
|
|
return data
|
2010-05-20 15:13:25 -04:00
|
|
|
|
2010-08-21 13:57:39 -04:00
|
|
|
class MStory(mongo.Document):
|
|
|
|
'''A feed item'''
|
2013-02-20 16:08:14 -08:00
|
|
|
story_feed_id = mongo.IntField()
|
2010-11-30 10:30:18 -05:00
|
|
|
story_date = mongo.DateTimeField()
|
|
|
|
story_title = mongo.StringField(max_length=1024)
|
|
|
|
story_content = mongo.StringField()
|
|
|
|
story_content_z = mongo.BinaryField()
|
|
|
|
story_original_content = mongo.StringField()
|
2010-08-29 13:23:50 -04:00
|
|
|
story_original_content_z = mongo.BinaryField()
|
2012-07-22 12:25:09 -07:00
|
|
|
story_latest_content = mongo.StringField()
|
|
|
|
story_latest_content_z = mongo.BinaryField()
|
2013-01-08 18:33:30 -08:00
|
|
|
original_text_z = mongo.BinaryField()
|
2010-11-30 10:30:18 -05:00
|
|
|
story_content_type = mongo.StringField(max_length=255)
|
|
|
|
story_author_name = mongo.StringField()
|
|
|
|
story_permalink = mongo.StringField()
|
|
|
|
story_guid = mongo.StringField()
|
2013-01-08 14:11:59 -08:00
|
|
|
story_hash = mongo.StringField()
|
2013-06-26 11:38:49 -07:00
|
|
|
image_urls = mongo.ListField(mongo.StringField(max_length=1024))
|
2010-11-30 10:30:18 -05:00
|
|
|
story_tags = mongo.ListField(mongo.StringField(max_length=250))
|
2012-01-09 13:55:26 -08:00
|
|
|
comment_count = mongo.IntField()
|
|
|
|
comment_user_ids = mongo.ListField(mongo.IntField())
|
|
|
|
share_count = mongo.IntField()
|
|
|
|
share_user_ids = mongo.ListField(mongo.IntField())
|
2010-11-30 10:30:18 -05:00
|
|
|
|
2010-08-21 13:57:39 -04:00
|
|
|
meta = {
|
|
|
|
'collection': 'stories',
|
2013-02-20 15:42:40 -08:00
|
|
|
'indexes': [('story_feed_id', '-story_date'),
|
|
|
|
{'fields': ['story_hash'],
|
|
|
|
'unique': True,
|
2013-05-29 18:00:09 -07:00
|
|
|
'types': False, }],
|
2011-11-29 09:43:16 -08:00
|
|
|
'index_drop_dups': True,
|
2010-08-21 20:42:38 -04:00
|
|
|
'ordering': ['-story_date'],
|
|
|
|
'allow_inheritance': False,
|
2012-09-17 17:01:56 -07:00
|
|
|
'cascade': False,
|
2010-08-21 13:57:39 -04:00
|
|
|
}
|
2010-08-29 13:23:50 -04:00
|
|
|
|
2013-06-04 15:34:03 -07:00
|
|
|
RE_STORY_HASH = re.compile(r"^(\d{1,10}):(\w{6})$")
|
|
|
|
RE_RS_KEY = re.compile(r"^RS:(\d+):(\d+)$")
|
|
|
|
|
2012-01-09 13:55:26 -08:00
|
|
|
@property
|
|
|
|
def guid_hash(self):
|
2012-11-27 11:59:54 -08:00
|
|
|
return hashlib.sha1(self.story_guid).hexdigest()[:6]
|
2013-01-08 14:11:59 -08:00
|
|
|
|
|
|
|
@property
|
|
|
|
def feed_guid_hash(self):
|
2013-02-20 15:42:40 -08:00
|
|
|
return "%s:%s" % (self.story_feed_id, self.guid_hash)
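# Illustrative only (the digest value is a placeholder): for a story in feed
# 42 whose guid is "http://example.com/post/1",
#
#   guid_hash      == hashlib.sha1("http://example.com/post/1").hexdigest()[:6]
#                     # six hex characters, e.g. "a1b2c3"
#   feed_guid_hash == "42:a1b2c3"
#
# which is exactly the "feed_id:guid_hash" shape that RE_STORY_HASH matches
# and that save() copies into story_hash.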
|
2012-01-09 13:55:26 -08:00
|
|
|
|
2010-08-29 13:23:50 -04:00
|
|
|
def save(self, *args, **kwargs):
|
2011-02-15 21:08:40 -05:00
|
|
|
story_title_max = MStory._fields['story_title'].max_length
|
|
|
|
story_content_type_max = MStory._fields['story_content_type'].max_length
|
2013-02-20 15:42:40 -08:00
|
|
|
self.story_hash = self.feed_guid_hash
|
|
|
|
|
2010-08-29 13:23:50 -04:00
|
|
|
if self.story_content:
|
|
|
|
self.story_content_z = zlib.compress(self.story_content)
|
|
|
|
self.story_content = None
|
|
|
|
if self.story_original_content:
|
|
|
|
self.story_original_content_z = zlib.compress(self.story_original_content)
|
|
|
|
self.story_original_content = None
|
2012-07-22 12:25:09 -07:00
|
|
|
if self.story_latest_content:
|
|
|
|
self.story_latest_content_z = zlib.compress(self.story_latest_content)
|
|
|
|
self.story_latest_content = None
|
2011-02-15 21:16:34 -05:00
|
|
|
if self.story_title and len(self.story_title) > story_title_max:
|
2011-02-15 21:08:40 -05:00
|
|
|
self.story_title = self.story_title[:story_title_max]
|
2011-02-15 21:16:34 -05:00
|
|
|
if self.story_content_type and len(self.story_content_type) > story_content_type_max:
|
2011-02-15 21:08:40 -05:00
|
|
|
self.story_content_type = self.story_content_type[:story_content_type_max]
|
2013-01-08 14:11:59 -08:00
|
|
|
|
2010-08-29 13:23:50 -04:00
|
|
|
super(MStory, self).save(*args, **kwargs)
|
2012-07-25 17:55:23 -07:00
|
|
|
|
|
|
|
self.sync_redis()
|
2013-05-29 19:37:50 -07:00
|
|
|
|
|
|
|
return self
|
2012-01-09 13:55:26 -08:00
|
|
|
|
2012-07-16 20:49:43 -07:00
|
|
|
def delete(self, *args, **kwargs):
|
|
|
|
self.remove_from_redis()
|
|
|
|
|
|
|
|
super(MStory, self).delete(*args, **kwargs)
|
2012-01-09 13:55:26 -08:00
|
|
|
|
2013-06-03 17:36:57 -07:00
|
|
|
@classmethod
|
2013-06-03 17:48:11 -07:00
|
|
|
def trim_feed(cls, cutoff, feed_id=None, feed=None, verbose=True):
|
2013-08-12 16:48:16 -07:00
|
|
|
extra_stories_count = 0
|
2013-06-03 17:48:11 -07:00
|
|
|
if not feed_id and not feed:
|
2013-08-12 16:48:16 -07:00
|
|
|
return extra_stories_count
|
2013-06-03 17:36:57 -07:00
|
|
|
|
2013-06-03 17:48:11 -07:00
|
|
|
if not feed_id:
|
|
|
|
feed_id = feed.pk
|
|
|
|
if not feed:
|
|
|
|
feed = feed_id
|
|
|
|
|
2013-06-03 17:36:57 -07:00
|
|
|
stories = cls.objects(
|
|
|
|
story_feed_id=feed_id,
|
|
|
|
).order_by('-story_date')
|
|
|
|
|
|
|
|
if stories.count() > cutoff:
|
2013-08-05 10:23:22 -07:00
|
|
|
logging.debug(' ---> [%-30s] ~FMFound %s stories. Trimming to ~SB%s~SN...' %
|
2013-06-03 17:48:11 -07:00
|
|
|
(unicode(feed)[:30], stories.count(), cutoff))
|
2013-06-03 17:36:57 -07:00
|
|
|
try:
|
|
|
|
story_trim_date = stories[cutoff].story_date
|
|
|
|
except IndexError, e:
|
2013-06-03 17:48:11 -07:00
|
|
|
logging.debug(' ***> [%-30s] ~BRError trimming feed: %s' % (unicode(feed)[:30], e))
|
2013-08-12 16:48:16 -07:00
|
|
|
return extra_stories_count
|
2013-06-03 17:36:57 -07:00
|
|
|
|
|
|
|
extra_stories = MStory.objects(story_feed_id=feed_id,
|
|
|
|
story_date__lte=story_trim_date)
|
|
|
|
extra_stories_count = extra_stories.count()
|
|
|
|
for story in extra_stories:
|
|
|
|
story.delete()
|
|
|
|
if verbose:
|
|
|
|
existing_story_count = MStory.objects(story_feed_id=feed_id).count()
|
|
|
|
logging.debug(" ---> Deleted %s stories, %s left." % (
|
|
|
|
extra_stories_count,
|
|
|
|
existing_story_count))
|
2013-08-12 16:48:16 -07:00
|
|
|
|
|
|
|
return extra_stories_count
|
2013-06-03 17:36:57 -07:00
|
|
|
|
2012-07-26 22:12:48 -07:00
|
|
|
@classmethod
|
2013-01-08 18:33:30 -08:00
|
|
|
def find_story(cls, story_feed_id, story_id, original_only=False):
|
2012-07-26 22:12:48 -07:00
|
|
|
from apps.social.models import MSharedStory
|
2013-06-19 13:22:11 -07:00
|
|
|
original_found = False
|
2013-06-04 15:34:03 -07:00
|
|
|
story_hash = cls.ensure_story_hash(story_id, story_feed_id)
|
|
|
|
|
2013-03-20 15:43:35 -07:00
|
|
|
if isinstance(story_id, ObjectId):
|
|
|
|
story = cls.objects(id=story_id).limit(1).first()
|
|
|
|
else:
|
|
|
|
story = cls.objects(story_hash=story_hash).limit(1).first()
|
2013-01-08 18:33:30 -08:00
|
|
|
|
2013-06-19 13:22:11 -07:00
|
|
|
if story:
|
|
|
|
original_found = True
|
2013-01-08 18:33:30 -08:00
|
|
|
if not story and not original_only:
|
2012-07-26 22:12:48 -07:00
|
|
|
story = MSharedStory.objects.filter(story_feed_id=story_feed_id,
|
2013-06-04 15:34:03 -07:00
|
|
|
story_hash=story_hash).limit(1).first()
|
2013-01-08 18:33:30 -08:00
|
|
|
if not story and not original_only:
|
2012-07-26 22:12:48 -07:00
|
|
|
story = MStarredStory.objects.filter(story_feed_id=story_feed_id,
|
2013-06-04 15:34:03 -07:00
|
|
|
story_hash=story_hash).limit(1).first()
|
2012-07-26 22:12:48 -07:00
|
|
|
|
|
|
|
return story, original_found
|
2012-12-13 17:49:07 -08:00
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def find_by_id(cls, story_ids):
|
|
|
|
from apps.social.models import MSharedStory
|
|
|
|
count = len(story_ids)
|
|
|
|
multiple = isinstance(story_ids, list) or isinstance(story_ids, tuple)
|
|
|
|
|
|
|
|
stories = list(cls.objects(id__in=story_ids))
|
|
|
|
if len(stories) < count:
|
|
|
|
shared_stories = list(MSharedStory.objects(id__in=story_ids))
|
|
|
|
stories.extend(shared_stories)
|
2013-04-29 15:27:22 -07:00
|
|
|
|
|
|
|
if not multiple:
|
|
|
|
stories = stories[0]
|
|
|
|
|
|
|
|
return stories
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def find_by_story_hashes(cls, story_hashes):
|
|
|
|
from apps.social.models import MSharedStory
|
|
|
|
count = len(story_hashes)
|
|
|
|
multiple = isinstance(story_hashes, list) or isinstance(story_hashes, tuple)
|
|
|
|
|
|
|
|
stories = list(cls.objects(story_hash__in=story_hashes))
|
|
|
|
if len(stories) < count:
|
2013-04-30 15:49:44 -07:00
|
|
|
hashes_found = [s.story_hash for s in stories]
|
2013-04-30 16:59:02 -07:00
|
|
|
remaining_hashes = list(set(story_hashes) - set(hashes_found))
|
2013-04-30 15:49:44 -07:00
|
|
|
story_feed_ids = [h.split(':')[0] for h in remaining_hashes]
|
2013-04-30 15:28:00 -07:00
|
|
|
shared_stories = list(MSharedStory.objects(story_feed_id__in=story_feed_ids,
|
2013-04-30 15:49:44 -07:00
|
|
|
story_hash__in=remaining_hashes))
|
2013-04-29 15:27:22 -07:00
|
|
|
stories.extend(shared_stories)
|
|
|
|
|
2012-12-13 17:49:07 -08:00
|
|
|
if not multiple:
|
|
|
|
stories = stories[0]
|
|
|
|
|
|
|
|
return stories
|
2013-06-04 15:34:03 -07:00
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def ensure_story_hash(cls, story_id, story_feed_id):
|
|
|
|
if not cls.RE_STORY_HASH.match(story_id):
|
|
|
|
story_id = "%s:%s" % (story_feed_id, hashlib.sha1(story_id).hexdigest()[:6])
|
2012-07-16 20:49:43 -07:00
|
|
|
|
2013-06-04 15:34:03 -07:00
|
|
|
return story_id
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def split_story_hash(cls, story_hash):
|
|
|
|
matches = cls.RE_STORY_HASH.match(story_hash)
|
|
|
|
if matches:
|
|
|
|
groups = matches.groups()
|
|
|
|
return groups[0], groups[1]
|
|
|
|
return None, None
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def split_rs_key(cls, rs_key):
|
|
|
|
matches = cls.RE_RS_KEY.match(rs_key)
|
|
|
|
if matches:
|
|
|
|
groups = matches.groups()
|
|
|
|
return groups[0], groups[1]
|
|
|
|
return None, None
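# Both splitters return string tuples, or (None, None) on a non-matching key
# (values here are hypothetical):
#
#   MStory.split_story_hash("42:a1b2c3")   # -> ("42", "a1b2c3")
#   MStory.split_rs_key("RS:1:42")         # -> ("1", "42")
#   MStory.split_rs_key("bogus")           # -> (None, None)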
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def story_hashes(cls, story_ids):
|
|
|
|
story_hashes = []
|
|
|
|
for story_id in story_ids:
|
|
|
|
# NOTE: ensure_story_hash() above takes a required story_feed_id; as written
# this call needs that argument threaded through (or defaulted) to run.
story_hash = cls.ensure_story_hash(story_id)
|
|
|
|
if not story_hash: continue
|
|
|
|
story_hashes.append(story_hash)
|
|
|
|
|
|
|
|
return story_hashes
|
|
|
|
|
2013-08-14 14:32:50 -07:00
|
|
|
def sync_redis(self, r=None):
|
2012-07-23 13:06:12 -07:00
|
|
|
if not r:
|
2013-05-02 12:27:37 -07:00
|
|
|
r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
|
2013-08-14 14:32:50 -07:00
|
|
|
# if not r2:
|
|
|
|
# r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
|
2013-08-02 15:59:03 -07:00
|
|
|
UNREAD_CUTOFF = datetime.datetime.now() - datetime.timedelta(days=settings.DAYS_OF_UNREAD_NEW)
|
2012-07-25 17:55:23 -07:00
|
|
|
|
2013-04-13 22:31:05 -07:00
|
|
|
if self.id and self.story_date > UNREAD_CUTOFF:
|
2013-07-01 22:19:22 -07:00
|
|
|
feed_key = 'F:%s' % self.story_feed_id
|
|
|
|
r.sadd(feed_key, self.story_hash)
|
2013-08-02 15:59:03 -07:00
|
|
|
r.expire(feed_key, settings.DAYS_OF_UNREAD_NEW*24*60*60)
|
2013-08-14 14:32:50 -07:00
|
|
|
# r2.sadd(feed_key, self.story_hash)
|
|
|
|
# r2.expire(feed_key, settings.DAYS_OF_UNREAD_NEW*24*60*60)
|
2013-07-01 22:19:22 -07:00
|
|
|
|
|
|
|
r.zadd('z' + feed_key, self.story_hash, time.mktime(self.story_date.timetuple()))
|
2013-08-02 15:59:03 -07:00
|
|
|
r.expire('z' + feed_key, settings.DAYS_OF_UNREAD_NEW*24*60*60)
|
2013-08-14 14:32:50 -07:00
|
|
|
# r2.zadd('z' + feed_key, self.story_hash, time.mktime(self.story_date.timetuple()))
|
|
|
|
# r2.expire('z' + feed_key, settings.DAYS_OF_UNREAD_NEW*24*60*60)
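# Sketch of the keys sync_redis() writes for a recent story "42:a1b2c3" in
# feed 42 (hash is a placeholder), expressed as raw Redis commands:
#
#   SADD   F:42  42:a1b2c3                          # story hashes per feed
#   ZADD   zF:42 <epoch of story_date> 42:a1b2c3    # same hashes, scored by date
#   EXPIRE F:42  <DAYS_OF_UNREAD_NEW days>          # and likewise for zF:42
#
# Stories older than the unread cutoff are skipped entirely.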
|
2012-07-16 18:11:18 -07:00
|
|
|
|
2013-08-14 14:32:50 -07:00
|
|
|
def remove_from_redis(self, r=None):
|
2012-07-23 13:06:12 -07:00
|
|
|
if not r:
|
2013-05-02 12:27:37 -07:00
|
|
|
r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
|
2013-08-14 14:32:50 -07:00
|
|
|
# if not r2:
|
|
|
|
# r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
|
2012-07-23 10:44:32 -07:00
|
|
|
if self.id:
|
2013-05-02 12:27:37 -07:00
|
|
|
r.srem('F:%s' % self.story_feed_id, self.story_hash)
|
2013-08-14 14:32:50 -07:00
|
|
|
# r2.srem('F:%s' % self.story_feed_id, self.story_hash)
|
2013-05-02 12:27:37 -07:00
|
|
|
r.zrem('zF:%s' % self.story_feed_id, self.story_hash)
|
2013-08-14 14:32:50 -07:00
|
|
|
# r2.zrem('zF:%s' % self.story_feed_id, self.story_hash)
|
2012-07-16 20:49:43 -07:00
|
|
|
|
2012-07-16 18:11:18 -07:00
|
|
|
@classmethod
|
2013-05-02 12:27:37 -07:00
|
|
|
def sync_feed_redis(cls, story_feed_id):
|
|
|
|
r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
|
2013-08-14 14:32:50 -07:00
|
|
|
# r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
|
2013-08-02 15:59:03 -07:00
|
|
|
UNREAD_CUTOFF = datetime.datetime.now() - datetime.timedelta(days=settings.DAYS_OF_UNREAD_NEW)
|
2013-05-02 12:27:37 -07:00
|
|
|
feed = Feed.get_by_id(story_feed_id)
|
|
|
|
stories = cls.objects.filter(story_feed_id=story_feed_id, story_date__gte=UNREAD_CUTOFF)
|
|
|
|
r.delete('F:%s' % story_feed_id)
|
2013-08-14 14:32:50 -07:00
|
|
|
# r2.delete('F:%s' % story_feed_id)
|
2013-05-02 12:27:37 -07:00
|
|
|
r.delete('zF:%s' % story_feed_id)
|
2013-08-14 14:32:50 -07:00
|
|
|
# r2.delete('zF:%s' % story_feed_id)
|
2012-10-29 14:58:43 -07:00
|
|
|
|
2013-07-05 18:50:57 -07:00
|
|
|
logging.info(" ---> [%-30s] ~FMSyncing ~SB%s~SN stories to redis" % (feed and feed.title[:30] or story_feed_id, stories.count()))
|
2013-05-02 12:27:37 -07:00
|
|
|
p = r.pipeline()
|
2013-08-14 14:32:50 -07:00
|
|
|
# p2 = r2.pipeline()
|
2012-07-16 18:11:18 -07:00
|
|
|
for story in stories:
|
2013-08-14 14:32:50 -07:00
|
|
|
story.sync_redis(r=p)
|
2013-05-02 12:27:37 -07:00
|
|
|
p.execute()
|
2013-08-14 14:32:50 -07:00
|
|
|
# p2.execute()
|
2012-07-16 18:11:18 -07:00
|
|
|
|
2012-01-09 13:55:26 -08:00
|
|
|
def count_comments(self):
|
|
|
|
from apps.social.models import MSharedStory
|
|
|
|
params = {
|
|
|
|
'story_guid': self.story_guid,
|
|
|
|
'story_feed_id': self.story_feed_id,
|
|
|
|
}
|
|
|
|
comments = MSharedStory.objects.filter(has_comments=True, **params).only('user_id')
|
2012-01-15 20:51:48 -08:00
|
|
|
shares = MSharedStory.objects.filter(**params).only('user_id')
|
2012-01-09 13:55:26 -08:00
|
|
|
self.comment_count = comments.count()
|
|
|
|
self.comment_user_ids = [c['user_id'] for c in comments]
|
|
|
|
self.share_count = shares.count()
|
|
|
|
self.share_user_ids = [s['user_id'] for s in shares]
|
|
|
|
self.save()
|
2013-06-20 13:41:37 -07:00
|
|
|
|
2013-06-26 11:38:49 -07:00
|
|
|
def extract_image_urls(self, force=False):
|
|
|
|
if self.image_urls and not force:
|
|
|
|
return self.image_urls
|
2013-06-20 13:44:10 -07:00
|
|
|
|
2013-06-20 13:46:11 -07:00
|
|
|
story_content = self.story_content
|
|
|
|
if not story_content and self.story_content_z:
|
|
|
|
story_content = zlib.decompress(self.story_content_z)
|
2013-06-20 13:44:10 -07:00
|
|
|
if not story_content:
|
|
|
|
return
|
|
|
|
|
2013-08-06 13:18:55 -07:00
|
|
|
try:
|
|
|
|
soup = BeautifulSoup(story_content)
|
|
|
|
except ValueError:
|
|
|
|
return
|
|
|
|
|
2013-06-26 11:38:49 -07:00
|
|
|
images = soup.findAll('img')
|
|
|
|
if not images:
|
|
|
|
return
|
|
|
|
|
|
|
|
image_urls = []
|
|
|
|
for image in images:
|
2013-06-20 14:45:05 -07:00
|
|
|
image_url = image.get('src')
|
2013-06-26 16:26:14 -07:00
|
|
|
if not image_url:
|
|
|
|
continue
|
2013-06-20 14:45:05 -07:00
|
|
|
if image_url and len(image_url) >= 1024:
|
2013-06-26 11:38:49 -07:00
|
|
|
continue
|
|
|
|
image_urls.append(image_url)
|
|
|
|
|
2013-06-26 16:26:14 -07:00
|
|
|
if not image_urls:
|
|
|
|
return
|
|
|
|
|
2013-06-26 11:38:49 -07:00
|
|
|
self.image_urls = image_urls
|
|
|
|
return self.image_urls
|
2013-06-20 13:41:37 -07:00
|
|
|
|
2013-01-08 18:33:30 -08:00
|
|
|
def fetch_original_text(self, force=False, request=None):
|
|
|
|
original_text_z = self.original_text_z
|
2013-07-15 11:06:50 -07:00
|
|
|
feed = Feed.get_by_id(self.story_feed_id)
|
2013-01-08 18:33:30 -08:00
|
|
|
|
|
|
|
if not original_text_z or force:
|
2013-07-15 11:06:50 -07:00
|
|
|
ti = TextImporter(self, feed=feed, request=request)
|
2013-01-08 18:33:30 -08:00
|
|
|
original_text = ti.fetch()
|
|
|
|
else:
|
|
|
|
logging.user(request, "~FYFetching ~FGoriginal~FY story text, ~SBfound.")
|
|
|
|
original_text = zlib.decompress(original_text_z)
|
|
|
|
|
|
|
|
return original_text
|
2012-09-04 11:46:41 -07:00
|
|
|
|
2010-11-30 10:30:18 -05:00
|
|
|
|
|
|
|
class MStarredStory(mongo.Document):
|
|
|
|
"""Like MStory, but not inherited due to large overhead of _cls and _type in
|
|
|
|
mongoengine's inheritance model on every single row."""
|
2012-03-26 13:14:02 -07:00
|
|
|
user_id = mongo.IntField(unique_with=('story_guid',))
|
2010-12-02 20:18:33 -05:00
|
|
|
starred_date = mongo.DateTimeField()
|
2010-11-30 10:30:18 -05:00
|
|
|
story_feed_id = mongo.IntField()
|
|
|
|
story_date = mongo.DateTimeField()
|
|
|
|
story_title = mongo.StringField(max_length=1024)
|
|
|
|
story_content = mongo.StringField()
|
|
|
|
story_content_z = mongo.BinaryField()
|
|
|
|
story_original_content = mongo.StringField()
|
|
|
|
story_original_content_z = mongo.BinaryField()
|
2013-01-28 15:43:00 -08:00
|
|
|
original_text_z = mongo.BinaryField()
|
2010-11-30 10:30:18 -05:00
|
|
|
story_content_type = mongo.StringField(max_length=255)
|
|
|
|
story_author_name = mongo.StringField()
|
|
|
|
story_permalink = mongo.StringField()
|
2012-03-26 13:14:02 -07:00
|
|
|
story_guid = mongo.StringField()
|
2013-04-29 16:07:08 -07:00
|
|
|
story_hash = mongo.StringField()
|
2010-11-30 10:30:18 -05:00
|
|
|
story_tags = mongo.ListField(mongo.StringField(max_length=250))
|
2013-06-26 11:38:49 -07:00
|
|
|
image_urls = mongo.ListField(mongo.StringField(max_length=1024))
|
2010-11-30 10:30:18 -05:00
|
|
|
|
|
|
|
meta = {
|
|
|
|
'collection': 'starred_stories',
|
2012-03-26 13:14:02 -07:00
|
|
|
'indexes': [('user_id', '-starred_date'), ('user_id', 'story_feed_id'), 'story_feed_id'],
|
2011-11-29 17:57:20 -08:00
|
|
|
'index_drop_dups': True,
|
2010-12-02 20:18:33 -05:00
|
|
|
'ordering': ['-starred_date'],
|
2010-11-30 10:30:18 -05:00
|
|
|
'allow_inheritance': False,
|
|
|
|
}
|
|
|
|
|
|
|
|
def save(self, *args, **kwargs):
|
|
|
|
if self.story_content:
|
|
|
|
self.story_content_z = zlib.compress(self.story_content)
|
|
|
|
self.story_content = None
|
|
|
|
if self.story_original_content:
|
|
|
|
self.story_original_content_z = zlib.compress(self.story_original_content)
|
|
|
|
self.story_original_content = None
|
2013-04-29 16:07:08 -07:00
|
|
|
self.story_hash = self.feed_guid_hash
|
|
|
|
|
2013-05-29 19:37:50 -07:00
|
|
|
return super(MStarredStory, self).save(*args, **kwargs)
|
2012-12-20 16:07:22 -08:00
|
|
|
|
2012-12-20 16:20:28 -08:00
|
|
|
# self.index_for_search()
|
2012-07-13 14:33:16 -07:00
|
|
|
|
|
|
|
def index_for_search(self):
|
|
|
|
story_content = zlib.decompress(self.story_content_z)
|
|
|
|
SearchStarredStory.index(user_id=self.user_id,
|
|
|
|
story_id=self.story_guid,
|
|
|
|
story_title=self.story_title,
|
|
|
|
story_content=story_content,
|
|
|
|
story_author=self.story_author_name,
|
2012-12-20 16:07:22 -08:00
|
|
|
story_date=self.story_date,
|
|
|
|
db_id=str(self.id))
|
2010-11-30 10:30:18 -05:00
|
|
|
|
2013-07-30 12:01:45 -07:00
|
|
|
@classmethod
|
|
|
|
def find_stories(cls, query, user_id, offset=0, limit=25):
|
|
|
|
stories_db = cls.objects(
|
|
|
|
Q(user_id=user_id) &
|
|
|
|
(Q(story_title__icontains=query) |
|
|
|
|
Q(story_author_name__icontains=query) |
|
|
|
|
Q(story_tags__icontains=query))
|
|
|
|
).order_by('-starred_date')[offset:offset+limit]
|
|
|
|
stories = Feed.format_stories(stories_db)
|
|
|
|
|
|
|
|
return stories
|
|
|
|
|
2013-07-11 12:08:21 -07:00
|
|
|
@classmethod
|
|
|
|
def trim_old_stories(cls, stories=10, days=30, dryrun=False):
|
|
|
|
print " ---> Fetching starred story counts..."
|
|
|
|
stats = settings.MONGODB.newsblur.starred_stories.aggregate([{
|
|
|
|
"$group": {
|
|
|
|
"_id": "$user_id",
|
|
|
|
"stories": {"$sum": 1},
|
|
|
|
},
|
|
|
|
}, {
|
|
|
|
"$match": {
|
|
|
|
"stories": {"$gte": stories}
|
|
|
|
},
|
|
|
|
}])
|
|
|
|
month_ago = datetime.datetime.now() - datetime.timedelta(days=days)
|
|
|
|
user_ids = stats['result']
|
|
|
|
user_ids = sorted(user_ids, key=lambda x:x['stories'], reverse=True)
|
|
|
|
print " ---> Found %s users with more than %s starred stories" % (len(user_ids), stories)
|
|
|
|
|
2013-07-11 12:24:48 -07:00
|
|
|
total = 0
|
2013-07-11 12:08:21 -07:00
|
|
|
for stat in user_ids:
|
|
|
|
try:
|
|
|
|
user = User.objects.select_related('profile').get(pk=stat['_id'])
|
|
|
|
except User.DoesNotExist:
|
|
|
|
user = None
|
2013-07-11 12:24:48 -07:00
|
|
|
|
|
|
|
if user and (user.profile.is_premium or user.profile.last_seen_on > month_ago):
|
|
|
|
continue
|
|
|
|
|
|
|
|
total += stat['stories']
|
|
|
|
print " ---> %20.20s: %-20.20s %s stories" % (user and user.profile.last_seen_on or "Deleted",
|
2013-07-11 15:09:00 -07:00
|
|
|
user and user.username or " - ",
|
2013-07-11 12:24:48 -07:00
|
|
|
stat['stories'])
|
2013-07-11 15:09:00 -07:00
|
|
|
if not dryrun and stat['_id']:
|
2013-07-11 12:24:48 -07:00
|
|
|
cls.objects.filter(user_id=stat['_id']).delete()
|
2013-07-11 12:08:21 -07:00
|
|
|
|
|
|
|
print " ---> Deleted %s stories in total." % total
|
|
|
|
|
2012-11-16 15:43:39 -08:00
|
|
|
@property
|
|
|
|
def guid_hash(self):
|
2012-11-27 11:59:54 -08:00
|
|
|
return hashlib.sha1(self.story_guid).hexdigest()[:6]
|
2013-01-28 15:43:00 -08:00
|
|
|
|
2013-04-29 16:07:08 -07:00
|
|
|
@property
|
|
|
|
def feed_guid_hash(self):
|
|
|
|
return "%s:%s" % (self.story_feed_id or "0", self.guid_hash)
|
|
|
|
|
2013-01-28 15:43:00 -08:00
|
|
|
def fetch_original_text(self, force=False, request=None):
|
|
|
|
original_text_z = self.original_text_z
|
2013-07-15 11:06:50 -07:00
|
|
|
feed = Feed.get_by_id(self.story_feed_id)
|
2013-01-28 15:43:00 -08:00
|
|
|
|
|
|
|
if not original_text_z or force:
|
2013-07-15 11:06:50 -07:00
|
|
|
ti = TextImporter(self, feed, request=request)
|
2013-01-28 15:43:00 -08:00
|
|
|
original_text = ti.fetch()
|
|
|
|
else:
|
|
|
|
logging.user(request, "~FYFetching ~FGoriginal~FY story text, ~SBfound.")
|
|
|
|
original_text = zlib.decompress(original_text_z)
|
|
|
|
|
|
|
|
return original_text
|
|
|
|
|
2012-11-16 15:43:39 -08:00
|
|
|
|
2013-06-18 13:22:31 -07:00
|
|
|
class MFetchHistory(mongo.Document):
|
|
|
|
feed_id = mongo.IntField(unique=True)
|
|
|
|
feed_fetch_history = mongo.DynamicField()
|
|
|
|
page_fetch_history = mongo.DynamicField()
|
|
|
|
push_history = mongo.DynamicField()
|
2010-08-31 16:34:34 -04:00
|
|
|
|
2013-06-18 13:22:31 -07:00
|
|
|
meta = {
|
|
|
|
'db_alias': 'nbanalytics',
|
|
|
|
'collection': 'fetch_history',
|
|
|
|
'allow_inheritance': False,
|
|
|
|
}
|
|
|
|
|
2011-02-13 14:47:58 -05:00
|
|
|
@classmethod
|
2013-05-31 17:17:20 -07:00
|
|
|
def feed(cls, feed_id, timezone=None, fetch_history=None):
|
|
|
|
if not fetch_history:
|
2013-05-31 23:03:58 -07:00
|
|
|
try:
|
2013-06-03 10:20:23 -07:00
|
|
|
fetch_history = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
|
|
|
|
.get(feed_id=feed_id)
|
2013-05-31 23:03:58 -07:00
|
|
|
except cls.DoesNotExist:
|
2013-06-03 10:20:23 -07:00
|
|
|
fetch_history = cls.objects.create(feed_id=feed_id)
|
2013-04-15 14:30:31 -07:00
|
|
|
history = {}
|
2010-09-23 10:29:18 -04:00
|
|
|
|
2013-04-15 14:30:31 -07:00
|
|
|
for fetch_type in ['feed_fetch_history', 'page_fetch_history', 'push_history']:
|
|
|
|
history[fetch_type] = getattr(fetch_history, fetch_type)
|
|
|
|
if not history[fetch_type]:
|
|
|
|
history[fetch_type] = []
|
|
|
|
for f, fetch in enumerate(history[fetch_type]):
|
|
|
|
date_key = 'push_date' if fetch_type == 'push_history' else 'fetch_date'
|
|
|
|
history[fetch_type][f] = {
|
|
|
|
date_key: localtime_for_timezone(fetch[0],
|
|
|
|
timezone).strftime("%Y-%m-%d %H:%M:%S"),
|
|
|
|
'status_code': fetch[1],
|
|
|
|
'message': fetch[2]
|
|
|
|
}
|
|
|
|
return history
|
2010-09-01 08:19:58 -04:00
|
|
|
|
2011-02-13 14:47:58 -05:00
|
|
|
@classmethod
|
2013-04-15 14:30:31 -07:00
|
|
|
def add(cls, feed_id, fetch_type, date=None, message=None, code=None, exception=None):
|
|
|
|
if not date:
|
|
|
|
date = datetime.datetime.now()
|
2013-05-31 23:03:58 -07:00
|
|
|
try:
|
2013-06-03 10:20:23 -07:00
|
|
|
fetch_history = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
|
|
|
|
.get(feed_id=feed_id)
|
2013-05-31 23:03:58 -07:00
|
|
|
except cls.DoesNotExist:
|
2013-06-03 10:20:23 -07:00
|
|
|
fetch_history = cls.objects.create(feed_id=feed_id)
|
|
|
|
|
2013-04-15 14:30:31 -07:00
|
|
|
if fetch_type == 'feed':
|
|
|
|
history = fetch_history.feed_fetch_history or []
|
|
|
|
elif fetch_type == 'page':
|
|
|
|
history = fetch_history.page_fetch_history or []
|
|
|
|
elif fetch_type == 'push':
|
|
|
|
history = fetch_history.push_history or []
|
|
|
|
|
2013-06-28 20:51:31 -07:00
|
|
|
history = [[date, code, message]] + history
|
2013-05-29 16:05:56 -07:00
|
|
|
if code and code >= 400:
|
|
|
|
history = history[:50]
|
|
|
|
else:
|
|
|
|
history = history[:5]
|
2013-04-15 14:30:31 -07:00
|
|
|
|
|
|
|
if fetch_type == 'feed':
|
|
|
|
fetch_history.feed_fetch_history = history
|
|
|
|
elif fetch_type == 'page':
|
|
|
|
fetch_history.page_fetch_history = history
|
|
|
|
elif fetch_type == 'push':
|
|
|
|
fetch_history.push_history = history
|
|
|
|
|
|
|
|
fetch_history.save()
|
|
|
|
|
|
|
|
if fetch_type == 'feed':
|
|
|
|
RStats.add('feed_fetch')
|
2013-05-31 17:14:17 -07:00
|
|
|
|
2013-05-31 17:17:20 -07:00
|
|
|
return cls.feed(feed_id, fetch_history=fetch_history)
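# Hypothetical usage of the method above: record a successful feed fetch and
# get back the same per-type history dict that MFetchHistory.feed() builds:
#
#   MFetchHistory.add(feed_id=42, fetch_type='feed', code=200, message='OK')
#
# Entries are prepended, and the list is kept to the 5 most recent unless the
# newest code is an error (>= 400), in which case up to 50 are retained.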
|
2013-04-15 14:30:31 -07:00
|
|
|
|
2013-01-03 13:47:38 -08:00
|
|
|
|
2010-08-19 10:43:07 -04:00
|
|
|
class DuplicateFeed(models.Model):
|
2013-01-07 16:35:29 -08:00
|
|
|
duplicate_address = models.CharField(max_length=764, db_index=True)
|
|
|
|
duplicate_link = models.CharField(max_length=764, null=True, db_index=True)
|
2012-03-12 18:11:13 -07:00
|
|
|
duplicate_feed_id = models.CharField(max_length=255, null=True, db_index=True)
|
2010-08-19 10:43:07 -04:00
|
|
|
feed = models.ForeignKey(Feed, related_name='duplicate_addresses')
|
2011-03-09 18:52:06 -05:00
|
|
|
|
|
|
|
def __unicode__(self):
|
2012-03-12 18:11:13 -07:00
|
|
|
return "%s: %s / %s" % (self.feed, self.duplicate_address, self.duplicate_link)
|
2011-12-25 12:45:07 -08:00
|
|
|
|
2013-06-12 13:52:43 -07:00
|
|
|
def canonical(self):
|
2011-12-25 12:45:07 -08:00
|
|
|
return {
|
|
|
|
'duplicate_address': self.duplicate_address,
|
2012-03-12 18:11:13 -07:00
|
|
|
'duplicate_link': self.duplicate_link,
|
2011-12-25 12:45:07 -08:00
|
|
|
'duplicate_feed_id': self.duplicate_feed_id,
|
2012-01-26 09:32:24 -08:00
|
|
|
'feed_id': self.feed_id
|
2011-12-25 12:45:07 -08:00
|
|
|
}
|
2013-05-15 15:40:38 -07:00
|
|
|
|
|
|
|
def save(self, *args, **kwargs):
|
2013-05-15 15:41:39 -07:00
|
|
|
max_address = DuplicateFeed._meta.get_field('duplicate_address').max_length
|
2013-05-15 15:40:38 -07:00
|
|
|
if len(self.duplicate_address) > max_address:
|
|
|
|
self.duplicate_address = self.duplicate_address[:max_address]
|
2013-05-15 15:41:39 -07:00
|
|
|
max_link = DuplicateFeed._meta.get_field('duplicate_link').max_length
|
2013-05-15 17:21:10 -07:00
|
|
|
if self.duplicate_link and len(self.duplicate_link) > max_link:
|
2013-05-15 15:40:38 -07:00
|
|
|
self.duplicate_link = self.duplicate_link[:max_link]
|
|
|
|
|
2013-05-15 15:43:07 -07:00
|
|
|
super(DuplicateFeed, self).save(*args, **kwargs)
|
2010-08-25 19:10:55 -04:00
|
|
|
|
2010-12-23 17:09:08 -05:00
|
|
|
def merge_feeds(original_feed_id, duplicate_feed_id, force=False):
|
2011-11-29 09:43:16 -08:00
|
|
|
from apps.reader.models import UserSubscription
|
2012-03-30 14:56:16 -07:00
|
|
|
from apps.social.models import MSharedStory
|
|
|
|
|
2011-11-29 09:43:16 -08:00
|
|
|
if original_feed_id == duplicate_feed_id:
|
|
|
|
logging.info(" ***> Merging the same feed. Ignoring...")
|
2012-12-28 22:03:48 -08:00
|
|
|
return original_feed_id
|
2010-08-25 19:10:55 -04:00
|
|
|
try:
|
|
|
|
original_feed = Feed.objects.get(pk=original_feed_id)
|
|
|
|
duplicate_feed = Feed.objects.get(pk=duplicate_feed_id)
|
|
|
|
except Feed.DoesNotExist:
|
|
|
|
logging.info(" ***> Already deleted feed: %s" % duplicate_feed_id)
|
2012-12-28 22:03:48 -08:00
|
|
|
return original_feed_id
|
2013-01-29 14:37:01 -08:00
|
|
|
|
|
|
|
heavier_dupe = original_feed.num_subscribers < duplicate_feed.num_subscribers
|
|
|
|
branched_original = original_feed.branch_from_feed
    if (heavier_dupe or branched_original) and not force:
        original_feed, duplicate_feed = duplicate_feed, original_feed
        original_feed_id, duplicate_feed_id = duplicate_feed_id, original_feed_id
        if branched_original:
            original_feed.feed_address = duplicate_feed.feed_address

    logging.info(" ---> Feed: [%s - %s] %s - %s" % (original_feed_id, duplicate_feed_id,
                                                    original_feed, original_feed.feed_link))
    logging.info(" Orig ++> %s: (%s subs) %s / %s %s" % (original_feed.pk,
                 original_feed.num_subscribers,
                 original_feed.feed_address,
                 original_feed.feed_link,
                 " [B: %s]" % original_feed.branch_from_feed.pk if original_feed.branch_from_feed else ""))
    logging.info(" Dupe --> %s: (%s subs) %s / %s %s" % (duplicate_feed.pk,
                 duplicate_feed.num_subscribers,
                 duplicate_feed.feed_address,
                 duplicate_feed.feed_link,
                 " [B: %s]" % duplicate_feed.branch_from_feed.pk if duplicate_feed.branch_from_feed else ""))

    original_feed.branch_from_feed = None
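
    # Move every subscription off the duplicate and onto the surviving feed.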
    user_subs = UserSubscription.objects.filter(feed=duplicate_feed).order_by('-pk')
    for user_sub in user_subs:
        user_sub.switch_feed(original_feed, duplicate_feed)

    def delete_story_feed(model, feed_field='feed_id'):
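        """Delete every document the duplicate feed owns in the given Mongo collection."""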
        duplicate_stories = model.objects(**{feed_field: duplicate_feed.pk})
        # if duplicate_stories.count():
        #     logging.info(" ---> Deleting %s %s" % (duplicate_stories.count(), model))
        duplicate_stories.delete()

    delete_story_feed(MStory, 'story_feed_id')
    delete_story_feed(MFeedPage, 'feed_id')

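    # Record the duplicate's address and link against the surviving feed so the
    # mapping from the dead feed to the survivor is preserved.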
    try:
        DuplicateFeed.objects.create(
            duplicate_address=duplicate_feed.feed_address,
            duplicate_link=duplicate_feed.feed_link,
            duplicate_feed_id=duplicate_feed.pk,
            feed=original_feed
        )
    except (IntegrityError, OperationError), e:
        logging.info(" ***> Could not save DuplicateFeed: %s" % e)

    # Switch this dupe feed's dupe feeds over to the new original.
    duplicate_feeds_duplicate_feeds = DuplicateFeed.objects.filter(feed=duplicate_feed)
    for dupe_feed in duplicate_feeds_duplicate_feeds:
        dupe_feed.feed = original_feed
        dupe_feed.duplicate_feed_id = duplicate_feed.pk
        dupe_feed.save()

    logging.debug(' ---> Dupe subscribers (%s): %s, Original subscribers (%s): %s' %
                  (duplicate_feed.pk, duplicate_feed.num_subscribers,
                   original_feed.pk, original_feed.num_subscribers))
    if duplicate_feed.pk != original_feed.pk:
        duplicate_feed.delete()
    else:
        logging.debug(" ***> Duplicate feed is the same as original feed. Panic!")
    logging.debug(' ---> Deleted duplicate feed: %s/%s' % (duplicate_feed, duplicate_feed_id))
    original_feed.branch_from_feed = None
    original_feed.count_subscribers()
    original_feed.save()
    logging.debug(' ---> Now original subscribers: %s' %
                  (original_feed.num_subscribers))

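    # Shared stories keep their own copy of the feed id, so repoint them as well.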
    MSharedStory.switch_feed(original_feed_id, duplicate_feed_id)

    return original_feed_id


def rewrite_folders(folders, original_feed, duplicate_feed):
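    """Return a copy of a nested folder structure with the duplicate feed's id
    swapped for the original feed's id.

    Folders are the mixed list of feed ids and {folder_name: subfolder} dicts
    used for a user's folder layout. A minimal sketch (ids are hypothetical,
    with original pk 101 and duplicate pk 202):

        rewrite_folders([202, {'Tech': [202, 303]}], original_feed, duplicate_feed)
        # -> [101, {'Tech': [101, 303]}]
    """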
    new_folders = []

    for k, folder in enumerate(folders):
        if isinstance(folder, int):
            if folder == duplicate_feed.pk:
                # logging.info(" ===> Rewrote %s'th item: %s" % (k+1, folders))
                new_folders.append(original_feed.pk)
            else:
                new_folders.append(folder)
        elif isinstance(folder, dict):
            for f_k, f_v in folder.items():
                new_folders.append({f_k: rewrite_folders(f_v, original_feed, duplicate_feed)})

    return new_folders