import difflib
import datetime
import time
import random
import re
import math
import mongoengine as mongo
import zlib
import hashlib
import redis
import pymongo
import HTMLParser
from collections import defaultdict
from operator import itemgetter
from bson.objectid import ObjectId
from BeautifulSoup import BeautifulSoup
from pyes.exceptions import NotFoundException
# from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
from django.db import models
from django.db import IntegrityError
from django.conf import settings
from django.db.models.query import QuerySet
from django.db.utils import DatabaseError
from django.core.urlresolvers import reverse
from django.contrib.auth.models import User
from django.contrib.sites.models import Site
from django.template.defaultfilters import slugify
from django.utils.encoding import smart_str, smart_unicode
from mongoengine.queryset import OperationError, Q, NotUniqueError
from mongoengine.base import ValidationError
from vendor.timezones.utilities import localtime_for_timezone
from apps.rss_feeds.tasks import UpdateFeeds, PushFeeds, ScheduleCountTagsForUser
from apps.rss_feeds.text_importer import TextImporter
from apps.search.models import SearchStory, SearchFeed
from apps.statistics.rstats import RStats
from utils import json_functions as json
from utils import feedfinder2 as feedfinder, feedparser
from utils import urlnorm
from utils import log as logging
from utils.fields import AutoOneToOneField
from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
from utils.story_functions import prep_for_search

ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)

class Feed(models.Model):
    feed_address = models.URLField(max_length=764, db_index=True)
    feed_address_locked = models.NullBooleanField(default=False, blank=True, null=True)
    feed_link = models.URLField(max_length=1000, default="", blank=True, null=True)
    feed_link_locked = models.BooleanField(default=False)
    hash_address_and_link = models.CharField(max_length=64, unique=True)
    feed_title = models.CharField(max_length=255, default="[Untitled]", blank=True, null=True)
    is_push = models.NullBooleanField(default=False, blank=True, null=True)
    active = models.BooleanField(default=True, db_index=True)
    num_subscribers = models.IntegerField(default=-1)
    active_subscribers = models.IntegerField(default=-1, db_index=True)
    premium_subscribers = models.IntegerField(default=-1)
    active_premium_subscribers = models.IntegerField(default=-1)
    branch_from_feed = models.ForeignKey('Feed', blank=True, null=True, db_index=True)
    last_update = models.DateTimeField(db_index=True)
    next_scheduled_update = models.DateTimeField()
    last_story_date = models.DateTimeField(null=True, blank=True)
    fetched_once = models.BooleanField(default=False)
    known_good = models.BooleanField(default=False)
    has_feed_exception = models.BooleanField(default=False, db_index=True)
    has_page_exception = models.BooleanField(default=False, db_index=True)
    has_page = models.BooleanField(default=True)
    exception_code = models.IntegerField(default=0)
    errors_since_good = models.IntegerField(default=0)
    min_to_decay = models.IntegerField(default=0)
    days_to_trim = models.IntegerField(default=90)
    creation = models.DateField(auto_now_add=True)
    etag = models.CharField(max_length=255, blank=True, null=True)
    last_modified = models.DateTimeField(null=True, blank=True)
    stories_last_month = models.IntegerField(default=0)
    average_stories_per_month = models.IntegerField(default=0)
    last_load_time = models.IntegerField(default=0)
    favicon_color = models.CharField(max_length=6, null=True, blank=True)
    favicon_not_found = models.BooleanField(default=False)
    s3_page = models.NullBooleanField(default=False, blank=True, null=True)
    s3_icon = models.NullBooleanField(default=False, blank=True, null=True)
    search_indexed = models.NullBooleanField(default=None, null=True, blank=True)

    class Meta:
        db_table = "feeds"
        ordering = ["feed_title"]
        # unique_together=[('feed_address', 'feed_link')]

    def __unicode__(self):
        if not self.feed_title:
            self.feed_title = "[Untitled]"
            self.save()
        return "%s (%s - %s/%s/%s)%s" % (
            self.feed_title,
            self.pk,
            self.num_subscribers,
            self.active_subscribers,
            self.active_premium_subscribers,
            (" [B: %s]" % self.branch_from_feed.pk if self.branch_from_feed else ""))

    @property
    def title(self):
        title = self.feed_title or "[Untitled]"
        if self.active_premium_subscribers >= 1:
            title = "%s*" % title[:29]
        return title

    @property
    def permalink(self):
        return "%s/site/%s/%s" % (settings.NEWSBLUR_URL, self.pk, slugify(self.feed_title.lower()[:50]))

    @property
    def favicon_url(self):
        if settings.BACKED_BY_AWS['icons_on_s3'] and self.s3_icon:
            return "https://s3.amazonaws.com/%s/%s.png" % (settings.S3_ICONS_BUCKET_NAME, self.pk)
        return reverse('feed-favicon', kwargs={'feed_id': self.pk})

    @property
    def favicon_url_fqdn(self):
        if settings.BACKED_BY_AWS['icons_on_s3'] and self.s3_icon:
            return self.favicon_url
        return "http://%s%s" % (
            Site.objects.get_current().domain,
            self.favicon_url
        )

    @property
    def s3_pages_key(self):
        return "%s.gz.html" % self.pk

    @property
    def s3_icons_key(self):
        return "%s.png" % self.pk

    @property
    def unread_cutoff(self):
        if self.active_premium_subscribers > 0:
            return datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
        return datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD_FREE)

    @classmethod
    def generate_hash_address_and_link(cls, feed_address, feed_link):
        if not feed_address: feed_address = ""
        if not feed_link: feed_link = ""
        return hashlib.sha1(feed_address + feed_link).hexdigest()

    @property
    def is_newsletter(self):
        return self.feed_address.startswith('newsletter:')
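
    # Note: hash_address_and_link is a SHA-1 of feed_address + feed_link and backs the
    # unique constraint on the feeds table, so duplicate detection (and the merge logic
    # in save() below) keys off the combination of both URLs rather than either one alone.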

    def canonical(self, full=False, include_favicon=True):
        feed = {
            'id': self.pk,
            'feed_title': self.feed_title,
            'feed_address': self.feed_address,
            'feed_link': self.feed_link,
            'num_subscribers': self.num_subscribers,
            'updated': relative_timesince(self.last_update),
            'updated_seconds_ago': seconds_timesince(self.last_update),
            'last_story_date': self.last_story_date,
            'last_story_seconds_ago': seconds_timesince(self.last_story_date),
            'stories_last_month': self.stories_last_month,
            'average_stories_per_month': self.average_stories_per_month,
            'min_to_decay': self.min_to_decay,
            'subs': self.num_subscribers,
            'is_push': self.is_push,
            'is_newsletter': self.is_newsletter,
            'fetched_once': self.fetched_once,
            'search_indexed': self.search_indexed,
            'not_yet_fetched': not self.fetched_once,  # Legacy. Doh.
            'favicon_color': self.favicon_color,
            'favicon_fade': self.favicon_fade(),
            'favicon_border': self.favicon_border(),
            'favicon_text_color': self.favicon_text_color(),
            'favicon_fetching': self.favicon_fetching,
            'favicon_url': self.favicon_url,
            's3_page': self.s3_page,
            's3_icon': self.s3_icon,
        }

        if include_favicon:
            try:
                feed_icon = MFeedIcon.objects.get(feed_id=self.pk)
                feed['favicon'] = feed_icon.data
            except MFeedIcon.DoesNotExist:
                pass
        if self.has_page_exception or self.has_feed_exception:
            feed['has_exception'] = True
            feed['exception_type'] = 'feed' if self.has_feed_exception else 'page'
            feed['exception_code'] = self.exception_code
        elif full:
            feed['has_exception'] = False
            feed['exception_type'] = None
            feed['exception_code'] = self.exception_code

        if not self.has_page:
            feed['disabled_page'] = True
        if full:
            feed['average_stories_per_month'] = self.average_stories_per_month
            feed['tagline'] = self.data.feed_tagline
            feed['feed_tags'] = json.decode(self.data.popular_tags) if self.data.popular_tags else []
            feed['feed_authors'] = json.decode(self.data.popular_authors) if self.data.popular_authors else []

        return feed
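
    # canonical() assembles the plain-dict representation of a feed (counts, favicon data,
    # exception state) that callers can serialize to JSON; `full` and `include_favicon`
    # only add optional payload on top of the base dict.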

    def save(self, *args, **kwargs):
        if not self.last_update:
            self.last_update = datetime.datetime.utcnow()
        if not self.next_scheduled_update:
            self.next_scheduled_update = datetime.datetime.utcnow()
        self.fix_google_alerts_urls()

        feed_address = self.feed_address or ""
        feed_link = self.feed_link or ""
        self.hash_address_and_link = self.generate_hash_address_and_link(feed_address, feed_link)

        max_feed_title = Feed._meta.get_field('feed_title').max_length
        if len(self.feed_title) > max_feed_title:
            self.feed_title = self.feed_title[:max_feed_title]
        max_feed_address = Feed._meta.get_field('feed_address').max_length
        if len(feed_address) > max_feed_address:
            self.feed_address = feed_address[:max_feed_address]
        max_feed_link = Feed._meta.get_field('feed_link').max_length
        if len(feed_link) > max_feed_link:
            self.feed_link = feed_link[:max_feed_link]

        try:
            super(Feed, self).save(*args, **kwargs)
        except DatabaseError, e:
            logging.debug(" ---> ~FBFeed update failed, no change: %s / %s..." % (kwargs.get('update_fields', None), e))
            pass
        except IntegrityError, e:
            logging.debug(" ---> ~FRFeed save collision (%s), checking dupe..." % e)
            duplicate_feeds = Feed.objects.filter(feed_address=self.feed_address,
                                                  feed_link=self.feed_link)
            if not duplicate_feeds:
                feed_address = self.feed_address or ""
                feed_link = self.feed_link or ""
                hash_address_and_link = self.generate_hash_address_and_link(feed_address, feed_link)
                duplicate_feeds = Feed.objects.filter(hash_address_and_link=hash_address_and_link)
            if not duplicate_feeds:
                # Feed has been deleted. Just ignore it.
                logging.debug(" ***> Changed to: %s - %s: %s" % (self.feed_address, self.feed_link, duplicate_feeds))
                logging.debug(' ***> [%-30s] Feed deleted (%s).' % (unicode(self)[:30], self.pk))
                return

            if self.pk != duplicate_feeds[0].pk:
                logging.debug(" ---> ~FRFound different feed (%s), merging %s in..." % (duplicate_feeds[0], self.pk))
                feed = Feed.get_by_id(merge_feeds(duplicate_feeds[0].pk, self.pk))
                return feed

        return self
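
    # save() deliberately swallows the unique-constraint collision: if another row already
    # owns this address/link hash, the two feeds are merged via merge_feeds() and the
    # surviving Feed is returned instead of self.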

    @classmethod
    def index_all_for_search(cls, offset=0, subscribers=2):
        if not offset:
            SearchFeed.create_elasticsearch_mapping(delete=True)

        last_pk = cls.objects.latest('pk').pk
        for f in xrange(offset, last_pk, 1000):
            print " ---> %s / %s (%.2s%%)" % (f, last_pk, float(f) / last_pk * 100)
            feeds = Feed.objects.filter(pk__in=range(f, f + 1000),
                                        active=True,
                                        active_subscribers__gte=subscribers)\
                                .values_list('pk')
            for feed_id, in feeds:
                Feed.objects.get(pk=feed_id).index_feed_for_search()

    def index_feed_for_search(self):
        if self.num_subscribers > 1 and not self.branch_from_feed and not self.is_newsletter:
            SearchFeed.index(feed_id=self.pk,
                             title=self.feed_title,
                             address=self.feed_address,
                             link=self.feed_link,
                             num_subscribers=self.num_subscribers)

    def index_stories_for_search(self):
        if self.search_indexed: return
        self.search_indexed = True
        self.save()
        stories = MStory.objects(story_feed_id=self.pk)
        for story in stories:
            story.index_story_for_search()
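
    # Feeds and stories are indexed separately: SearchFeed holds feed metadata for
    # search/autocomplete lookups, while index_stories_for_search() backfills every MStory
    # of the feed and flips search_indexed so the backfill only happens once per feed.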

    def sync_redis(self):
        return MStory.sync_feed_redis(self.pk)

    def expire_redis(self, r=None):
        if not r:
            r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        # if not r2:
        #     r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)

        r.expire('F:%s' % self.pk, settings.DAYS_OF_STORY_HASHES * 24 * 60 * 60)
        # r2.expire('F:%s' % self.pk, settings.DAYS_OF_STORY_HASHES*24*60*60)
        r.expire('zF:%s' % self.pk, settings.DAYS_OF_STORY_HASHES * 24 * 60 * 60)
        # r2.expire('zF:%s' % self.pk, settings.DAYS_OF_STORY_HASHES*24*60*60)

    @classmethod
    def autocomplete(self, prefix, limit=5):
        results = SearchFeed.query(prefix)
        feed_ids = [result.feed_id for result in results[:5]]

        # results = SearchQuerySet().autocomplete(address=prefix).order_by('-num_subscribers')[:limit]
        #
        # if len(results) < limit:
        #     results += SearchQuerySet().autocomplete(title=prefix).order_by('-num_subscribers')[:limit-len(results)]
        #
        return feed_ids

    @classmethod
    def find_or_create(cls, feed_address, feed_link, *args, **kwargs):
        feeds = cls.objects.filter(feed_address=feed_address, feed_link=feed_link)
        if feeds:
            return feeds[0], False

        if feed_link and feed_link.endswith('/'):
            feeds = cls.objects.filter(feed_address=feed_address, feed_link=feed_link[:-1])
            if feeds:
                return feeds[0], False

        return cls.objects.get_or_create(feed_address=feed_address, feed_link=feed_link, *args, **kwargs)

    @classmethod
    def merge_feeds(cls, *args, **kwargs):
        return merge_feeds(*args, **kwargs)

    def fix_google_alerts_urls(self):
        if (self.feed_address.startswith('http://user/') and
            '/state/com.google/alerts/' in self.feed_address):
            match = re.match(r"http://user/(\d+)/state/com.google/alerts/(\d+)", self.feed_address)
            if match:
                user_id, alert_id = match.groups()
                self.feed_address = "http://www.google.com/alerts/feeds/%s/%s" % (user_id, alert_id)

    @classmethod
    def schedule_feed_fetches_immediately(cls, feed_ids, user_id=None):
        if settings.DEBUG:
            logging.info(" ---> ~SN~FMSkipping the scheduling immediate fetch of ~SB%s~SN feeds (in DEBUG)..." %
                         len(feed_ids))
            return

        if user_id:
            user = User.objects.get(pk=user_id)
            logging.user(user, "~SN~FMScheduling immediate fetch of ~SB%s~SN feeds..." %
                         len(feed_ids))
        else:
            logging.debug(" ---> ~SN~FMScheduling immediate fetch of ~SB%s~SN feeds..." %
                          len(feed_ids))

        if len(feed_ids) > 100:
            logging.debug(" ---> ~SN~FMFeeds scheduled: %s" % feed_ids)
        day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
        feeds = Feed.objects.filter(pk__in=feed_ids)
        for feed in feeds:
            if feed.active_subscribers <= 0:
                feed.count_subscribers()
            if not feed.active or feed.next_scheduled_update < day_ago:
                feed.schedule_feed_fetch_immediately(verbose=False)

    @property
    def favicon_fetching(self):
        return bool(not (self.favicon_not_found or self.favicon_color))

    @classmethod
    def get_feed_from_url(cls, url, create=True, aggressive=False, fetch=True, offset=0):
        feed = None

        if url and url.startswith('newsletter:'):
            return cls.objects.get(feed_address=url)
        if url and 'youtube.com/user/' in url:
            username = re.search('youtube.com/user/(\w+)', url).group(1)
            url = "http://gdata.youtube.com/feeds/base/users/%s/uploads" % username
        if url and 'youtube.com/channel/' in url:
            channel_id = re.search('youtube.com/channel/([-_\w]+)', url).group(1)
            url = "https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id

        def criteria(key, value):
            if aggressive:
                return {'%s__icontains' % key: value}
            else:
                return {'%s' % key: value}

        def by_url(address):
            feed = cls.objects.filter(
                branch_from_feed=None
            ).filter(**criteria('feed_address', address)).order_by('-num_subscribers')
            if not feed:
                duplicate_feed = DuplicateFeed.objects.filter(**criteria('duplicate_address', address))
                if duplicate_feed and len(duplicate_feed) > offset:
                    feed = [duplicate_feed[offset].feed]
            if not feed and aggressive:
                feed = cls.objects.filter(
                    branch_from_feed=None
                ).filter(**criteria('feed_link', address)).order_by('-num_subscribers')

            return feed

        # Normalize and check for feed_address, dupes, and feed_link
        url = urlnorm.normalize(url)
        if not url:
            return
        feed = by_url(url)
        found_feed_urls = []

        # Create if it looks good
        if feed and len(feed) > offset:
            feed = feed[offset]
        else:
            found_feed_urls = feedfinder.find_feeds(url)
            if len(found_feed_urls):
                feed_finder_url = found_feed_urls[0]
                logging.debug(" ---> Found feed URLs for %s: %s" % (url, found_feed_urls))
                feed = by_url(feed_finder_url)
                if feed and len(feed) > offset:
                    feed = feed[offset]
                    logging.debug(" ---> Feed exists (%s), updating..." % (feed))
                    feed = feed.update()
                elif create:
                    logging.debug(" ---> Feed doesn't exist, creating: %s" % (feed_finder_url))
                    feed = cls.objects.create(feed_address=feed_finder_url)
                    feed = feed.update()

        # Still nothing? Maybe the URL has some clues.
        if not feed and fetch and len(found_feed_urls):
            feed_finder_url = found_feed_urls[0]
            feed = by_url(feed_finder_url)
            if not feed and create:
                feed = cls.objects.create(feed_address=feed_finder_url)
                feed = feed.update()
            elif feed and len(feed) > offset:
                feed = feed[offset]

        # Not created and not within bounds, so toss results.
        if isinstance(feed, QuerySet):
            return

        return feed
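
    # Lookup order above: match on feed_address (icontains when aggressive), then known
    # DuplicateFeed addresses, then (aggressively) feed_link, and only after that does
    # feedfinder get to suggest candidate feed URLs from the page itself.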

    @classmethod
    def task_feeds(cls, feeds, queue_size=12, verbose=True):
        if not feeds: return
        r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)

        if isinstance(feeds, Feed):
            if verbose:
                logging.debug(" ---> ~SN~FBTasking feed: ~SB%s" % feeds)
            feeds = [feeds.pk]
        elif verbose:
            logging.debug(" ---> ~SN~FBTasking ~SB~FC%s~FB~SN feeds..." % len(feeds))

        if isinstance(feeds, QuerySet):
            feeds = [f.pk for f in feeds]

        r.srem('queued_feeds', *feeds)
        now = datetime.datetime.now().strftime("%s")
        p = r.pipeline()
        for feed_id in feeds:
            p.zadd('tasked_feeds', feed_id, now)
        p.execute()

        # for feed_ids in (feeds[pos:pos + queue_size] for pos in xrange(0, len(feeds), queue_size)):
        for feed_id in feeds:
            UpdateFeeds.apply_async(args=(feed_id,), queue='update_feeds')

    @classmethod
    def drain_task_feeds(cls):
        r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)

        tasked_feeds = r.zrange('tasked_feeds', 0, -1)
        logging.debug(" ---> ~FRDraining %s tasked feeds..." % len(tasked_feeds))
        r.sadd('queued_feeds', *tasked_feeds)
        r.zremrangebyrank('tasked_feeds', 0, -1)

        errored_feeds = r.zrange('error_feeds', 0, -1)
        logging.debug(" ---> ~FRDraining %s errored feeds..." % len(errored_feeds))
        r.sadd('queued_feeds', *errored_feeds)
        r.zremrangebyrank('error_feeds', 0, -1)
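
    # Task bookkeeping lives in Redis: 'queued_feeds' is a set of feeds waiting to be
    # tasked, while 'tasked_feeds' and 'error_feeds' are sorted sets scored by timestamp;
    # drain_task_feeds() pushes everything back into the queue and clears both zsets.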

    def update_all_statistics(self, has_new_stories=False, force=False):
        recount = not self.counts_converted_to_redis
        count_extra = False
        if random.random() < 0.01 or not self.data.popular_tags or not self.data.popular_authors:
            count_extra = True

        self.count_subscribers(recount=recount)
        self.calculate_last_story_date()

        if force or has_new_stories or count_extra:
            self.save_feed_stories_last_month()

        if force or (has_new_stories and count_extra):
            self.save_popular_authors()
            self.save_popular_tags()
            self.save_feed_story_history_statistics()

    def calculate_last_story_date(self):
        last_story_date = None

        try:
            latest_story = MStory.objects(
                story_feed_id=self.pk
            ).limit(1).order_by('-story_date').only('story_date').first()
            if latest_story:
                last_story_date = latest_story.story_date
        except MStory.DoesNotExist:
            pass

        if not last_story_date or seconds_timesince(last_story_date) < 0:
            last_story_date = datetime.datetime.now()

        if last_story_date != self.last_story_date:
            self.last_story_date = last_story_date
            self.save(update_fields=['last_story_date'])

    @classmethod
    def setup_feeds_for_premium_subscribers(cls, feed_ids):
        logging.info(" ---> ~SN~FMScheduling immediate premium setup of ~SB%s~SN feeds..." %
                     len(feed_ids))

        feeds = Feed.objects.filter(pk__in=feed_ids)
        for feed in feeds:
            feed.setup_feed_for_premium_subscribers()

    def setup_feed_for_premium_subscribers(self):
        self.count_subscribers()
        self.set_next_scheduled_update()

    def check_feed_link_for_feed_address(self):
        @timelimit(10)
        def _1():
            feed_address = None
            feed = self
            found_feed_urls = []
            try:
                logging.debug(" ---> Checking: %s" % self.feed_address)
                found_feed_urls = feedfinder.find_feeds(self.feed_address)
                if found_feed_urls:
                    feed_address = found_feed_urls[0]
            except KeyError:
                is_feed = False
            if not len(found_feed_urls) and self.feed_link:
                found_feed_urls = feedfinder.find_feeds(self.feed_link)
                if len(found_feed_urls) and found_feed_urls[0] != self.feed_address:
                    feed_address = found_feed_urls[0]

            if feed_address:
                if (feed_address.endswith('feedburner.com/atom.xml') or
                    feed_address.endswith('feedburner.com/feed/')):
                    logging.debug("  ---> Feed points to 'Wierdo', ignoring.")
                    return False, self
                try:
                    self.feed_address = feed_address
                    feed = self.save()
                    feed.count_subscribers()
                    feed.schedule_feed_fetch_immediately()
                    feed.has_feed_exception = False
                    feed.active = True
                    feed = feed.save()
                except IntegrityError:
                    original_feed = Feed.objects.get(feed_address=feed_address, feed_link=self.feed_link)
                    original_feed.has_feed_exception = False
                    original_feed.active = True
                    original_feed.save()
                    merge_feeds(original_feed.pk, self.pk)
            return feed_address, feed

        if self.feed_address_locked:
            return False, self

        try:
            feed_address, feed = _1()
        except TimeoutError, e:
            logging.debug('   ---> [%-30s] Feed address check timed out...' % (unicode(self)[:30]))
            self.save_feed_history(505, 'Timeout', e)
            feed = self
            feed_address = None

        return bool(feed_address), feed

    def save_feed_history(self, status_code, message, exception=None):
        fetch_history = MFetchHistory.add(feed_id=self.pk,
                                          fetch_type='feed',
                                          code=int(status_code),
                                          message=message,
                                          exception=exception)

        if status_code not in (200, 304):
            self.errors_since_good += 1
            self.count_errors_in_history('feed', status_code, fetch_history=fetch_history)
            self.set_next_scheduled_update()
        elif self.has_feed_exception or self.errors_since_good:
            self.errors_since_good = 0
            self.has_feed_exception = False
            self.active = True
            self.save()

    def save_page_history(self, status_code, message, exception=None):
        fetch_history = MFetchHistory.add(feed_id=self.pk,
                                          fetch_type='page',
                                          code=int(status_code),
                                          message=message,
                                          exception=exception)

        if status_code not in (200, 304):
            self.count_errors_in_history('page', status_code, fetch_history=fetch_history)
        elif self.has_page_exception or not self.has_page:
            self.has_page_exception = False
            self.has_page = True
            self.active = True
            self.save()

    def count_errors_in_history(self, exception_type='feed', status_code=None, fetch_history=None):
        if not fetch_history:
            fetch_history = MFetchHistory.feed(self.pk)
        fh = fetch_history[exception_type + '_fetch_history']
        non_errors = [h for h in fh if h['status_code'] and int(h['status_code']) in (200, 304)]
        errors = [h for h in fh if h['status_code'] and int(h['status_code']) not in (200, 304)]

        if len(non_errors) == 0 and len(errors) > 1:
            self.active = True
            if exception_type == 'feed':
                self.has_feed_exception = True
                # self.active = False # No longer, just geometrically fetch
            elif exception_type == 'page':
                self.has_page_exception = True
            self.exception_code = status_code or int(errors[0])
            self.save()
        elif self.exception_code > 0:
            self.active = True
            self.exception_code = 0
            if exception_type == 'feed':
                self.has_feed_exception = False
            elif exception_type == 'page':
                self.has_page_exception = False
            self.save()

        logging.debug('   ---> [%-30s] ~FBCounting any errors in history: %s (%s non errors)' %
                      (unicode(self)[:30], len(errors), len(non_errors)))

        return errors, non_errors

    def count_redirects_in_history(self, fetch_type='feed', fetch_history=None):
        logging.debug('   ---> [%-30s] Counting redirects in history...' % (unicode(self)[:30]))
        if not fetch_history:
            fetch_history = MFetchHistory.feed(self.pk)
        fh = fetch_history[fetch_type + '_fetch_history']
        redirects = [h for h in fh if h['status_code'] and int(h['status_code']) in (301, 302)]
        non_redirects = [h for h in fh if h['status_code'] and int(h['status_code']) not in (301, 302)]

        return redirects, non_redirects

    @property
    def original_feed_id(self):
        if self.branch_from_feed:
            return self.branch_from_feed.pk
        else:
            return self.pk

    @property
    def counts_converted_to_redis(self):
        SUBSCRIBER_EXPIRE_DATE = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
        subscriber_expire = int(SUBSCRIBER_EXPIRE_DATE.strftime('%s'))
        r = redis.Redis(connection_pool=settings.REDIS_FEED_SUB_POOL)
        total_key = "s:%s" % self.original_feed_id
        premium_key = "sp:%s" % self.original_feed_id
        last_recount = r.zscore(total_key, -1)

        # Check for expired feeds with no active users who would have triggered a cleanup
        if last_recount and last_recount > subscriber_expire:
            return True
        elif last_recount:
            logging.info("   ---> [%-30s] ~SN~FBFeed has expired redis subscriber counts (%s < %s), clearing..." % (
                         unicode(self)[:30], last_recount, subscriber_expire))
            r.delete(total_key, -1)
            r.delete(premium_key, -1)

        return False
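
    # The redis subscriber zsets ("s:<feed_id>" / "sp:<feed_id>") carry a sentinel member
    # "-1" whose score is the time of the last full recount; its presence marks a feed as
    # converted to redis counts, and count_subscribers() below subtracts it back out of
    # every zcard/zcount result.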

    def count_subscribers(self, recount=True, verbose=False):
        if recount or not self.counts_converted_to_redis:
            from apps.profile.models import Profile
            Profile.count_feed_subscribers(feed_id=self.pk)
        SUBSCRIBER_EXPIRE_DATE = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
        subscriber_expire = int(SUBSCRIBER_EXPIRE_DATE.strftime('%s'))
        now = int(datetime.datetime.now().strftime('%s'))
        r = redis.Redis(connection_pool=settings.REDIS_FEED_SUB_POOL)
        total = 0
        active = 0
        premium = 0
        active_premium = 0

        # Include all branched feeds in counts
        feed_ids = [f['id'] for f in Feed.objects.filter(branch_from_feed=self.original_feed_id).values('id')]
        feed_ids.append(self.original_feed_id)
        feed_ids = list(set(feed_ids))

        if self.counts_converted_to_redis:
            # For each branched feed, count different subscribers
            for feed_id in feed_ids:
                pipeline = r.pipeline()

                # now+1 keeps the `-1` sentinel inside the zcount range so it can be
                # subtracted back out below
                total_key = "s:%s" % feed_id
                premium_key = "sp:%s" % feed_id
                pipeline.zcard(total_key)
                pipeline.zcount(total_key, subscriber_expire, now + 1)
                pipeline.zcard(premium_key)
                pipeline.zcount(premium_key, subscriber_expire, now + 1)

                results = pipeline.execute()

                # -1 due to key=-1 signaling counts_converted_to_redis
                total += results[0] - 1
                active += results[1] - 1
                premium += results[2] - 1
                active_premium += results[3] - 1

            original_num_subscribers = self.num_subscribers
            original_active_subs = self.active_subscribers
            original_premium_subscribers = self.premium_subscribers
            original_active_premium_subscribers = self.active_premium_subscribers
            logging.info("   ---> [%-30s] ~SN~FBCounting subscribers from ~FCredis~FB: ~FMt:~SB~FM%s~SN a:~SB%s~SN p:~SB%s~SN ap:~SB%s" %
                         (self.title[:30], total, active, premium, active_premium))
        else:
            from apps.reader.models import UserSubscription

            subs = UserSubscription.objects.filter(feed__in=feed_ids)
            original_num_subscribers = self.num_subscribers
            total = subs.count()

            active_subs = UserSubscription.objects.filter(
                feed__in=feed_ids,
                active=True,
                user__profile__last_seen_on__gte=SUBSCRIBER_EXPIRE_DATE
            )
            original_active_subs = self.active_subscribers
            active = active_subs.count()

            premium_subs = UserSubscription.objects.filter(
                feed__in=feed_ids,
                active=True,
                user__profile__is_premium=True
            )
            original_premium_subscribers = self.premium_subscribers
            premium = premium_subs.count()

            active_premium_subscribers = UserSubscription.objects.filter(
                feed__in=feed_ids,
                active=True,
                user__profile__is_premium=True,
                user__profile__last_seen_on__gte=SUBSCRIBER_EXPIRE_DATE
            )
            original_active_premium_subscribers = self.active_premium_subscribers
            active_premium = active_premium_subscribers.count()
            logging.debug("   ---> [%-30s] ~SN~FBCounting subscribers from ~FYpostgres~FB: ~FMt:~SB~FM%s~SN a:~SB%s~SN p:~SB%s~SN ap:~SB%s" %
                          (self.title[:30], total, active, premium, active_premium))

        # If any counts have changed, save them
        self.num_subscribers = total
        self.active_subscribers = active
        self.premium_subscribers = premium
        self.active_premium_subscribers = active_premium
        if (self.num_subscribers != original_num_subscribers or
            self.active_subscribers != original_active_subs or
            self.premium_subscribers != original_premium_subscribers or
            self.active_premium_subscribers != original_active_premium_subscribers):
            if original_premium_subscribers == -1 or original_active_premium_subscribers == -1:
                self.save()
            else:
                self.save(update_fields=['num_subscribers', 'active_subscribers',
                                         'premium_subscribers', 'active_premium_subscribers'])

        if verbose:
            if self.num_subscribers <= 1:
                print '.',
            else:
                print "\n %s> %s subscriber%s: %s" % (
                    '-' * min(self.num_subscribers, 20),
                    self.num_subscribers,
                    '' if self.num_subscribers == 1 else 's',
                    self.feed_title,
                ),

    def _split_favicon_color(self):
        color = self.favicon_color
        if color:
            splitter = lambda s, p: [s[i:i + p] for i in range(0, len(s), p)]
            red, green, blue = splitter(color[:6], 2)
            return red, green, blue
        return None, None, None

    def favicon_fade(self):
        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            fade_red = hex(min(int(red, 16) + 35, 255))[2:].zfill(2)
            fade_green = hex(min(int(green, 16) + 35, 255))[2:].zfill(2)
            fade_blue = hex(min(int(blue, 16) + 35, 255))[2:].zfill(2)
            return "%s%s%s" % (fade_red, fade_green, fade_blue)

    def favicon_border(self):
        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            fade_red = hex(min(int(int(red, 16) * .75), 255))[2:].zfill(2)
            fade_green = hex(min(int(int(green, 16) * .75), 255))[2:].zfill(2)
            fade_blue = hex(min(int(int(blue, 16) * .75), 255))[2:].zfill(2)
            return "%s%s%s" % (fade_red, fade_green, fade_blue)

    def favicon_text_color(self):
        # Color format: {r: 1, g: .5, b: 0}
        def contrast(color1, color2):
            lum1 = luminosity(color1)
            lum2 = luminosity(color2)
            if lum1 > lum2:
                return (lum1 + 0.05) / (lum2 + 0.05)
            else:
                return (lum2 + 0.05) / (lum1 + 0.05)

        def luminosity(color):
            r = color['red']
            g = color['green']
            b = color['blue']
            val = lambda c: c / 12.92 if c <= 0.02928 else math.pow(((c + 0.055) / 1.055), 2.4)
            red = val(r)
            green = val(g)
            blue = val(b)
            return 0.2126 * red + 0.7152 * green + 0.0722 * blue

        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            color = {
                'red': int(red, 16) / 256.0,
                'green': int(green, 16) / 256.0,
                'blue': int(blue, 16) / 256.0,
            }
            white = {
                'red': 1,
                'green': 1,
                'blue': 1,
            }
            grey = {
                'red': 0.5,
                'green': 0.5,
                'blue': 0.5,
            }

            if contrast(color, white) > contrast(color, grey):
                return 'white'
            else:
                return 'black'
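
    # luminosity() above linearizes each sRGB channel and weights them with the Rec. 709
    # coefficients (0.2126 / 0.7152 / 0.0722), i.e. the same relative-luminance and
    # contrast-ratio math the WCAG guidelines use, to pick white or black favicon text.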

    def save_feed_stories_last_month(self, verbose=False):
        month_ago = datetime.datetime.utcnow() - datetime.timedelta(days=30)
        stories_last_month = MStory.objects(story_feed_id=self.pk,
                                            story_date__gte=month_ago).count()
        if self.stories_last_month != stories_last_month:
            self.stories_last_month = stories_last_month
            self.save(update_fields=['stories_last_month'])

        if verbose:
            print "  ---> %s [%s]: %s stories last month" % (self.feed_title, self.pk,
                                                             self.stories_last_month)

    def save_feed_story_history_statistics(self, current_counts=None):
        """
        Fills in missing months between earlier occurrences and now.

        Save format: [('YYYY-MM', #), ...]
        Example output: [('2010-12', 123), ('2011-01', 146)]
        """
        now = datetime.datetime.utcnow()
        min_year = now.year
        total = 0
        month_count = 0
        if not current_counts:
            current_counts = self.data.story_count_history and json.decode(self.data.story_count_history)

        if isinstance(current_counts, dict):
            current_counts = current_counts['months']

        if not current_counts:
            current_counts = []

        # Count stories, aggregate by year and month. Map Reduce!
        map_f = """
            function() {
                var date = (this.story_date.getFullYear()) + "-" + (this.story_date.getMonth()+1);
                var hour = this.story_date.getUTCHours();
                var day = this.story_date.getDay();
                emit(this.story_hash, {'month': date, 'hour': hour, 'day': day});
            }
        """
        reduce_f = """
            function(key, values) {
                return values;
            }
        """
        dates = defaultdict(int)
        hours = defaultdict(int)
        days = defaultdict(int)
        results = MStory.objects(story_feed_id=self.pk).map_reduce(map_f, reduce_f, output='inline')
        for result in results:
            dates[result.value['month']] += 1
            hours[int(result.value['hour'])] += 1
            days[int(result.value['day'])] += 1
            year = int(re.findall(r"(\d{4})-\d{1,2}", result.value['month'])[0])
            if year < min_year and year > 2000:
                min_year = year

        # Add on to existing months, always amending up, never down. (Current month
        # is guaranteed to be accurate, since trim_feeds won't delete it until after
        # a month. Hacker News can have 1,000+ and still be counted.)
        for current_month, current_count in current_counts:
            year = int(re.findall(r"(\d{4})-\d{1,2}", current_month)[0])
            if current_month not in dates or dates[current_month] < current_count:
                dates[current_month] = current_count
            if year < min_year and year > 2000:
                min_year = year

        # Assemble a list with 0's filled in for missing months,
        # trimming left and right 0's.
        months = []
        start = False
        for year in range(min_year, now.year + 1):
            for month in range(1, 12 + 1):
                if datetime.datetime(year, month, 1) < now:
                    key = u'%s-%s' % (year, month)
                    if dates.get(key) or start:
                        start = True
                        months.append((key, dates.get(key, 0)))
                        total += dates.get(key, 0)
                        month_count += 1

        original_story_count_history = self.data.story_count_history
        self.data.story_count_history = json.encode({'months': months, 'hours': hours, 'days': days})
        if self.data.story_count_history != original_story_count_history:
            self.data.save(update_fields=['story_count_history'])

        original_average_stories_per_month = self.average_stories_per_month
        if not total or not month_count:
            self.average_stories_per_month = 0
        else:
            self.average_stories_per_month = int(round(total / float(month_count)))
        if self.average_stories_per_month != original_average_stories_per_month:
            self.save(update_fields=['average_stories_per_month'])
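
    # The map/reduce above emits one record per story_hash keyed by month, hour-of-day and
    # day-of-week; the Python side then tallies those into the months/hours/days histograms
    # that get stored on self.data.story_count_history.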

    def save_classifier_counts(self):
        from apps.analyzer.models import MClassifierTitle, MClassifierAuthor, MClassifierFeed, MClassifierTag

        def calculate_scores(cls, facet):
            map_f = """
                function() {
                    emit(this["%s"], {
                        pos: this.score>0 ? this.score : 0,
                        neg: this.score<0 ? Math.abs(this.score) : 0
                    });
                }
            """ % (facet)
            reduce_f = """
                function(key, values) {
                    var result = {pos: 0, neg: 0};
                    values.forEach(function(value) {
                        result.pos += value.pos;
                        result.neg += value.neg;
                    });
                    return result;
                }
            """
            scores = []
            res = cls.objects(feed_id=self.pk).map_reduce(map_f, reduce_f, output='inline')
            for r in res:
                facet_values = dict([(k, int(v)) for k, v in r.value.iteritems()])
                facet_values[facet] = r.key
                if facet_values['pos'] + facet_values['neg'] > 1:
                    scores.append(facet_values)
            scores = sorted(scores, key=lambda v: v['neg'] - v['pos'])

            return scores

        scores = {}
        for cls, facet in [(MClassifierTitle, 'title'),
                           (MClassifierAuthor, 'author'),
                           (MClassifierTag, 'tag'),
                           (MClassifierFeed, 'feed_id')]:
            scores[facet] = calculate_scores(cls, facet)
            if facet == 'feed_id' and scores[facet]:
                scores['feed'] = scores[facet]
                del scores['feed_id']
            elif not scores[facet]:
                del scores[facet]

        if scores:
            self.data.feed_classifier_counts = json.encode(scores)
            self.data.save()

    def update(self, **kwargs):
        from utils import feed_fetcher
        r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)
        original_feed_id = int(self.pk)

        if getattr(settings, 'TEST_DEBUG', False):
            original_feed_address = self.feed_address
            original_feed_link = self.feed_link
            self.feed_address = self.feed_address.replace("%(NEWSBLUR_DIR)s", settings.NEWSBLUR_DIR)
            if self.feed_link:
                self.feed_link = self.feed_link.replace("%(NEWSBLUR_DIR)s", settings.NEWSBLUR_DIR)
            if self.feed_address != original_feed_address or self.feed_link != original_feed_link:
                self.save(update_fields=['feed_address', 'feed_link'])

        options = {
            'verbose': kwargs.get('verbose'),
            'timeout': 10,
            'single_threaded': kwargs.get('single_threaded', True),
            'force': kwargs.get('force'),
            'compute_scores': kwargs.get('compute_scores', True),
            'mongodb_replication_lag': kwargs.get('mongodb_replication_lag', None),
            'fake': kwargs.get('fake'),
            'quick': kwargs.get('quick'),
            'updates_off': kwargs.get('updates_off'),
            'debug': kwargs.get('debug'),
            'fpf': kwargs.get('fpf'),
            'feed_xml': kwargs.get('feed_xml'),
        }

        if self.is_newsletter:
            feed = self.update_newsletter_icon()
        else:
            disp = feed_fetcher.Dispatcher(options, 1)
            disp.add_jobs([[self.pk]])
            feed = disp.run_jobs()

        if feed:
            feed = Feed.get_by_id(feed.pk)
        if feed:
            feed.last_update = datetime.datetime.utcnow()
            feed.set_next_scheduled_update()
            r.zadd('fetched_feeds_last_hour', feed.pk, int(datetime.datetime.now().strftime('%s')))

        if not feed or original_feed_id != feed.pk:
            logging.info(" ---> ~FRFeed changed id, removing %s from tasked_feeds queue..." % original_feed_id)
            r.zrem('tasked_feeds', original_feed_id)
            r.zrem('error_feeds', original_feed_id)
        if feed:
            r.zrem('tasked_feeds', feed.pk)
            r.zrem('error_feeds', feed.pk)

        return feed

    def update_newsletter_icon(self):
        from apps.rss_feeds.icon_importer import IconImporter
        icon_importer = IconImporter(self)
        icon_importer.save()

        return self
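
    # update() is the single entry point for a fetch: newsletter feeds only refresh their
    # icon, everything else goes through feed_fetcher.Dispatcher, and either way the feed
    # is re-read by id afterwards in case a merge changed its primary key.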

    @classmethod
    def get_by_id(cls, feed_id, feed_address=None):
        try:
            feed = Feed.objects.get(pk=feed_id)
            return feed
        except Feed.DoesNotExist:
            # Feed has been merged after updating. Find the right feed.
            duplicate_feeds = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
            if duplicate_feeds:
                return duplicate_feeds[0].feed
            if feed_address:
                duplicate_feeds = DuplicateFeed.objects.filter(duplicate_address=feed_address)
                if duplicate_feeds:
                    return duplicate_feeds[0].feed

    @classmethod
    def get_by_name(cls, query, limit=1):
        results = SearchFeed.query(query)
        feed_ids = [result.feed_id for result in results]

        if limit == 1:
            return Feed.get_by_id(feed_ids[0])
        else:
            return [Feed.get_by_id(f) for f in feed_ids][:limit]

    def add_update_stories(self, stories, existing_stories, verbose=False, updates_off=False):
        ret_values = dict(new=0, updated=0, same=0, error=0)
        error_count = self.error_count
        new_story_hashes = [s.get('story_hash') for s in stories]

        if settings.DEBUG or verbose:
            logging.debug("   ---> [%-30s] ~FBChecking ~SB%s~SN new/updated against ~SB%s~SN stories" % (
                          self.title[:30],
                          len(stories),
                          len(existing_stories.keys())))

        @timelimit(2)
        def _1(story, story_content, existing_stories, new_story_hashes):
            existing_story, story_has_changed = self._exists_story(story, story_content,
                                                                   existing_stories, new_story_hashes)
            return existing_story, story_has_changed

        for story in stories:
            if verbose:
                logging.debug("   ---> [%-30s] ~FBChecking ~SB%s~SN / ~SB%s" % (
                              self.title[:30],
                              story.get('title'),
                              story.get('guid')))

            if not story.get('title'):
                continue

            story_content = story.get('story_content')
            if error_count:
                story_content = strip_comments__lxml(story_content)
            else:
                story_content = strip_comments(story_content)
            story_tags = self.get_tags(story)
            story_link = self.get_permalink(story)
            replace_story_date = False

            try:
                existing_story, story_has_changed = _1(story, story_content,
                                                       existing_stories, new_story_hashes)
            except TimeoutError, e:
                logging.debug('   ---> [%-30s] ~SB~FRExisting story check timed out...' % (unicode(self)[:30]))
                existing_story = None
                story_has_changed = False

            if existing_story is None:
                if settings.DEBUG and False:
                    logging.debug('   ---> New story in feed (%s - %s): %s' % (self.feed_title, story.get('title'), len(story_content)))

                s = MStory(story_feed_id=self.pk,
                           story_date=story.get('published'),
                           story_title=story.get('title'),
                           story_content=story_content,
                           story_author_name=story.get('author'),
                           story_permalink=story_link,
                           story_guid=story.get('guid'),
                           story_tags=story_tags
                )
                s.extract_image_urls()
                try:
                    s.save()
                    ret_values['new'] += 1
                except (IntegrityError, OperationError), e:
                    ret_values['error'] += 1
                    if settings.DEBUG:
                        logging.info('   ---> [%-30s] ~SN~FRIntegrityError on new story: %s - %s' % (self.feed_title[:30], story.get('guid'), e))
                if self.search_indexed:
                    s.index_story_for_search()
            elif existing_story and story_has_changed and not updates_off and ret_values['updated'] < 3:
                # update story
                original_content = None
                try:
                    if existing_story and existing_story.id:
                        try:
                            existing_story = MStory.objects.get(id=existing_story.id)
                        except ValidationError:
                            existing_story, _ = MStory.find_story(existing_story.story_feed_id,
                                                                  existing_story.id,
                                                                  original_only=True)
                    elif existing_story and existing_story.story_hash:
                        existing_story, _ = MStory.find_story(existing_story.story_feed_id,
                                                              existing_story.story_hash,
                                                              original_only=True)
                    else:
                        raise MStory.DoesNotExist
                except (MStory.DoesNotExist, OperationError), e:
                    ret_values['error'] += 1
                    if verbose:
                        logging.info('   ---> [%-30s] ~SN~FROperation on existing story: %s - %s' % (self.feed_title[:30], story.get('guid'), e))
                    continue
                if existing_story.story_original_content_z:
                    original_content = zlib.decompress(existing_story.story_original_content_z)
                elif existing_story.story_content_z:
                    original_content = zlib.decompress(existing_story.story_content_z)
                # print 'Type: %s %s' % (type(original_content), type(story_content))
                if story_content and len(story_content) > 10:
                    if "<code" in story_content:
                        # Don't mangle stories with code, just use new
                        story_content_diff = story_content
                    else:
                        story_content_diff = htmldiff(smart_unicode(original_content), smart_unicode(story_content))
                else:
                    story_content_diff = original_content
                # logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
                # logging.debug("\t\tDiff content: %s" % diff.getDiff())
                # if existing_story.story_title != story.get('title'):
                #     logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
if existing_story . story_hash != story . get ( ' story_hash ' ) :
2013-05-28 10:23:36 -07:00
self . update_story_with_new_guid ( existing_story , story . get ( ' guid ' ) )
2012-12-24 00:10:40 -08:00
2014-09-05 10:38:29 -07:00
if verbose :
2012-12-24 00:10:40 -08:00
logging . debug ( ' - Updated story in feed ( %s - %s ): %s / %s ' % ( self . feed_title , story . get ( ' title ' ) , len ( story_content_diff ) , len ( story_content ) ) )
2012-03-06 16:11:27 -08:00
existing_story . story_feed = self . pk
existing_story . story_title = story . get ( ' title ' )
existing_story . story_content = story_content_diff
2012-07-22 12:25:09 -07:00
existing_story . story_latest_content = story_content
2012-03-06 16:11:27 -08:00
existing_story . story_original_content = original_content
existing_story . story_author_name = story . get ( ' author ' )
existing_story . story_permalink = story_link
existing_story . story_guid = story . get ( ' guid ' )
existing_story . story_tags = story_tags
2016-02-25 15:45:32 -08:00
existing_story . original_text_z = None # Reset Text view cache
2012-12-24 00:10:40 -08:00
# Do not allow publishers to change the story date once a story is published.
# Leads to incorrect unread story counts.
2013-11-15 10:48:26 -08:00
if replace_story_date :
existing_story . story_date = story . get ( ' published ' ) # Really shouldn't do this.
2014-04-23 14:35:56 -07:00
existing_story . extract_image_urls ( )
2012-03-06 16:11:27 -08:00
try :
existing_story . save ( )
2012-09-04 11:46:41 -07:00
ret_values [ ' updated ' ] + = 1
2012-03-06 16:11:27 -08:00
except ( IntegrityError , OperationError ) :
2012-09-04 11:46:41 -07:00
ret_values [ ' error ' ] + = 1
2012-03-06 16:11:27 -08:00
if verbose :
2012-03-27 11:19:53 -07:00
logging . info ( ' ---> [ %-30s ] ~SN~FRIntegrityError on updated story: %s ' % ( self . feed_title [ : 30 ] , story . get ( ' title ' ) [ : 30 ] ) )
except ValidationError :
2012-09-04 11:46:41 -07:00
ret_values [ ' error ' ] + = 1
2012-03-06 16:11:27 -08:00
if verbose :
2012-03-27 11:19:53 -07:00
logging . info ( ' ---> [ %-30s ] ~SN~FRValidationError on updated story: %s ' % ( self . feed_title [ : 30 ] , story . get ( ' title ' ) [ : 30 ] ) )
2014-04-23 14:35:56 -07:00
if self . search_indexed :
existing_story . index_story_for_search ( )
2012-03-06 16:11:27 -08:00
else :
2012-09-04 11:46:41 -07:00
ret_values [ ' same ' ] + = 1
2014-09-05 10:38:29 -07:00
if verbose :
2014-10-08 16:43:48 -07:00
logging . debug ( " Unchanged story ( %s ): %s / %s " % ( story . get ( ' story_hash ' ) , story . get ( ' guid ' ) , story . get ( ' title ' ) ) )
2012-03-06 16:11:27 -08:00
2009-08-20 02:43:01 +00:00
return ret_values
2011-10-10 20:46:13 -07:00
2013-05-28 10:23:36 -07:00
def update_story_with_new_guid ( self , existing_story , new_story_guid ) :
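# A story's guid (and therefore its story_hash) changed between fetches.
# Drop the old hash from Redis and the search index, migrate per-user read
# state to the new hash, and re-point any shared copies of the story.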
2013-05-10 16:11:30 -07:00
from apps . reader . models import RUserStory
2012-08-10 14:22:51 -07:00
from apps . social . models import MSharedStory
2013-05-28 10:23:36 -07:00
existing_story . remove_from_redis ( )
2014-04-15 14:59:00 -07:00
existing_story . remove_from_search_index ( )
2013-05-10 16:11:30 -07:00
2013-05-28 10:23:36 -07:00
old_hash = existing_story . story_hash
2013-06-04 15:34:03 -07:00
new_hash = MStory . ensure_story_hash ( new_story_guid , self . pk )
2013-05-28 10:23:36 -07:00
RUserStory . switch_hash ( feed_id = self . pk , old_hash = old_hash , new_hash = new_hash )
2013-05-10 16:11:30 -07:00
2012-08-10 14:22:51 -07:00
shared_stories = MSharedStory . objects . filter ( story_feed_id = self . pk ,
2013-05-28 10:23:36 -07:00
story_hash = old_hash )
2012-08-10 14:22:51 -07:00
for story in shared_stories :
story . story_guid = new_story_guid
2013-05-28 10:23:36 -07:00
story . story_hash = new_hash
2013-05-29 16:26:04 -07:00
try :
story . save ( )
except NotUniqueError :
# Story is already shared, skip.
pass
2012-03-21 13:54:37 -07:00
2011-02-05 15:34:43 -05:00
def save_popular_tags ( self , feed_tags = None , verbose = False ) :
2010-07-01 15:16:33 -04:00
if not feed_tags :
2013-02-20 16:08:14 -08:00
all_tags = MStory . objects ( story_feed_id = self . pk ,
story_tags__exists = True ) . item_frequencies ( ' story_tags ' )
2012-08-12 20:34:30 -07:00
feed_tags = sorted ( [ ( k , v ) for k , v in all_tags . items ( ) if int ( v ) > 0 ] ,
2010-08-21 23:49:36 -04:00
key = itemgetter ( 1 ) ,
2011-02-05 15:34:43 -05:00
reverse = True ) [ : 25 ]
2010-07-01 15:16:33 -04:00
popular_tags = json . encode ( feed_tags )
2012-08-12 20:34:30 -07:00
if verbose :
print " Found %s tags: %s " % ( len ( feed_tags ) , popular_tags )
2010-08-21 23:49:36 -04:00
# TODO: This len() bullshit will be gone when feeds move to mongo
# On second thought, it might stay, because we don't want
# popular tags the size of a small planet. I'm looking at you
# Tumblr writers.
2010-07-01 15:16:33 -04:00
if len ( popular_tags ) < 1024 :
2015-07-22 13:53:20 -07:00
if self . data . popular_tags != popular_tags :
self . data . popular_tags = popular_tags
self . data . save ( update_fields = [ ' popular_tags ' ] )
2010-07-01 15:16:33 -04:00
return
2011-08-27 14:13:28 -07:00
tags_list = [ ]
if feed_tags and isinstance ( feed_tags , unicode ) :
2011-08-27 13:22:56 -07:00
tags_list = json . decode ( feed_tags )
2011-08-27 14:13:28 -07:00
if len ( tags_list ) > = 1 :
2010-07-01 15:16:33 -04:00
self . save_popular_tags ( tags_list [ : - 1 ] )
2010-11-05 20:34:17 -04:00
def save_popular_authors ( self , feed_authors = None ) :
2010-07-01 15:16:33 -04:00
if not feed_authors :
2010-08-21 23:49:36 -04:00
authors = defaultdict ( int )
for story in MStory . objects ( story_feed_id = self . pk ) . only ( ' story_author_name ' ) :
authors [ story . story_author_name ] + = 1
feed_authors = sorted ( [ ( k , v ) for k , v in authors . items ( ) if k ] ,
key = itemgetter ( 1 ) ,
reverse = True ) [ : 20 ]
2010-07-01 15:16:33 -04:00
popular_authors = json . encode ( feed_authors )
2011-02-05 22:09:31 -05:00
if len ( popular_authors ) < 1023 :
2015-07-22 13:53:20 -07:00
if self . data . popular_authors != popular_authors :
self . data . popular_authors = popular_authors
self . data . save ( update_fields = [ ' popular_authors ' ] )
2010-07-01 15:16:33 -04:00
return
2010-08-25 21:55:22 -04:00
if len ( feed_authors ) > 1 :
2010-11-05 20:34:17 -04:00
self . save_popular_authors ( feed_authors = feed_authors [ : - 1 ] )
2013-06-03 17:20:36 -07:00
@classmethod
2016-01-20 13:32:49 -08:00
def trim_old_stories ( cls , start = 0 , verbose = True , dryrun = False , total = 0 ) :
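# Walk every feed id and trim story backlogs. Feeds with no active subscribers
# and no stories in the last month get an aggressive cutoff that shrinks the
# longer the feed has been silent; everything else is trimmed to its normal
# story_cutoff. Pass dryrun=True to only print the would-be cutoffs.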
2013-06-03 17:58:27 -07:00
now = datetime . datetime . now ( )
2013-09-16 16:42:49 -07:00
month_ago = now - datetime . timedelta ( days = settings . DAYS_OF_STORY_HASHES )
2013-06-04 11:26:01 -07:00
feed_count = Feed . objects . latest ( ' pk ' ) . pk
2016-01-20 13:32:49 -08:00
2013-06-04 11:26:01 -07:00
for feed_id in xrange ( start , feed_count ) :
if feed_id % 1000 == 0 :
2015-02-11 15:11:56 -08:00
print " \n \n -------------------------- %s ( %s deleted so far) -------------------------- \n \n " % ( feed_id , total )
2013-06-04 11:26:01 -07:00
try :
feed = Feed . objects . get ( pk = feed_id )
except Feed . DoesNotExist :
continue
2016-01-19 11:28:27 -08:00
if feed . active_subscribers < = 0 and ( not feed . last_story_date or feed . last_story_date < month_ago ) :
2013-06-04 11:26:01 -07:00
months_ago = 6
if feed . last_story_date :
months_ago = int ( ( now - feed . last_story_date ) . days / 30.0 )
cutoff = max ( 1 , 6 - months_ago )
if dryrun :
print " DRYRUN: %s cutoff - %s " % ( cutoff , feed )
else :
2013-08-12 16:48:16 -07:00
total + = MStory . trim_feed ( feed = feed , cutoff = cutoff , verbose = verbose )
2016-01-19 11:28:27 -08:00
else :
if dryrun :
print " DRYRUN: %s / %s cutoff - %s " % ( cutoff , feed . story_cutoff , feed )
else :
total + = feed . trim_feed ( verbose = verbose )
2013-08-12 16:48:16 -07:00
print " ---> Deleted %s stories in total. " % total
2013-06-21 12:30:06 -07:00
@property
def story_cutoff ( self ) :
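# Number of stories to keep for this feed. Starts at 500, drops sharply when
# there are no active subscribers, steps down with fewer active premium
# subscribers, is halved for low-volume feeds, and falls to at most 10 when
# the weekly Redis read counters (fR:<feed_id>:<week>) show no recent reads.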
cutoff = 500
if self . active_subscribers < = 0 :
cutoff = 25
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < 1 :
2013-06-21 12:30:06 -07:00
cutoff = 100
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 2 :
2013-06-21 12:30:06 -07:00
cutoff = 200
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 5 :
2013-06-21 12:30:06 -07:00
cutoff = 300
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 10 :
2013-06-21 12:30:06 -07:00
cutoff = 350
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 15 :
2013-06-21 12:30:06 -07:00
cutoff = 400
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 20 :
2013-06-21 12:30:06 -07:00
cutoff = 450
2015-12-29 12:41:40 -08:00
2013-08-05 10:23:22 -07:00
if self . active_subscribers and self . average_stories_per_month < 5 and self . stories_last_month < 5 :
cutoff / = 2
if self . active_premium_subscribers < = 1 and self . average_stories_per_month < = 1 and self . stories_last_month < = 1 :
cutoff / = 2
2015-12-29 12:41:40 -08:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_READ_POOL )
pipeline = r . pipeline ( )
read_stories_per_week = [ ]
now = datetime . datetime . now ( )
2015-12-29 12:45:47 -08:00
for weeks_back in range ( 2 * int ( math . floor ( settings . DAYS_OF_STORY_HASHES / 7 ) ) ) :
2015-12-29 12:41:40 -08:00
weeks_ago = now - datetime . timedelta ( days = 7 * weeks_back )
week_of_year = weeks_ago . strftime ( ' % Y- % U ' )
feed_read_key = " fR: %s : %s " % ( self . pk , week_of_year )
pipeline . get ( feed_read_key )
read_stories_per_week = pipeline . execute ( )
read_stories_last_month = sum ( [ int ( rs ) for rs in read_stories_per_week if rs ] )
if read_stories_last_month == 0 :
2015-12-29 12:56:49 -08:00
original_cutoff = cutoff
2016-02-25 11:50:14 -08:00
cutoff = min ( cutoff , 10 )
2016-01-20 13:32:49 -08:00
try :
logging . debug ( " ---> [ %-30s ] ~FBTrimming down to ~SB %s (instead of %s )~SN stories (~FM %s ~FB) " % ( self , cutoff , original_cutoff , self . last_story_date . strftime ( " % Y- % m- %d " ) if self . last_story_date else " No last story date " ) )
except ValueError , e :
logging . debug ( " ***> [ %-30s ] Error trimming: %s " % ( self , e ) )
pass
2015-12-29 12:41:40 -08:00
2013-06-21 12:30:06 -07:00
return cutoff
2013-06-03 17:20:36 -07:00
def trim_feed ( self , verbose = False , cutoff = None ) :
if not cutoff :
2013-06-21 12:30:06 -07:00
cutoff = self . story_cutoff
2016-01-19 11:28:27 -08:00
return MStory . trim_feed ( feed = self , cutoff = cutoff , verbose = verbose )
2015-12-26 12:53:32 -08:00
def purge_feed_stories ( self , update = True ) :
MStory . purge_feed_stories ( feed = self , cutoff = self . story_cutoff )
if update :
self . update ( )
2015-12-26 12:59:44 -08:00
def purge_author ( self , author ) :
all_stories = MStory . objects . filter ( story_feed_id = self . pk )
author_stories = MStory . objects . filter ( story_feed_id = self . pk , story_author_name__iexact = author )
logging . debug ( " ---> Deleting %s of %s stories in %s by ' %s ' . " % ( author_stories . count ( ) , all_stories . count ( ) , self , author ) )
author_stories . delete ( )
def purge_tag ( self , tag ) :
all_stories = MStory . objects . filter ( story_feed_id = self . pk )
tagged_stories = MStory . objects . filter ( story_feed_id = self . pk , story_tags__icontains = tag )
logging . debug ( " ---> Deleting %s of %s stories in %s by ' %s ' . " % ( tagged_stories . count ( ) , all_stories . count ( ) , self , tag ) )
tagged_stories . delete ( )
2013-04-15 14:30:31 -07:00
# @staticmethod
# def clean_invalid_ids():
# history = MFeedFetchHistory.objects(status_code=500, exception__contains='InvalidId:')
# urls = set()
# for h in history:
# u = re.split('InvalidId: (.*?) is not a valid ObjectId\\n$', h.exception)[1]
# urls.add((h.feed_id, u))
#
# for f, u in urls:
# print "db.stories.remove({\"story_feed_id\": %s, \"_id\": \"%s\"})" % (f, u)
2012-10-19 18:33:28 -07:00
2009-08-01 04:26:57 +00:00
2012-01-09 13:55:26 -08:00
def get_stories ( self , offset = 0 , limit = 25 , force = False ) :
stories_db = MStory . objects ( story_feed_id = self . pk ) [ offset : offset + limit ]
stories = self . format_stories ( stories_db , self . pk )
2009-07-28 02:27:27 +00:00
return stories
2013-08-07 15:43:25 -07:00
@classmethod
2014-04-16 17:21:53 -07:00
def find_feed_stories ( cls , feed_ids , query , order = " newest " , offset = 0 , limit = 25 ) :
story_ids = SearchStory . query ( feed_ids = feed_ids , query = query , order = order ,
offset = offset , limit = limit )
2013-08-07 15:43:25 -07:00
stories_db = MStory . objects (
2014-04-15 14:17:15 -07:00
story_hash__in = story_ids
2014-04-16 17:23:27 -07:00
) . order_by ( ' -story_date ' if order == " newest " else ' story_date ' )
2013-08-07 15:43:25 -07:00
stories = cls . format_stories ( stories_db )
return stories
2014-04-23 15:05:47 -07:00
def find_stories ( self , query , order = " newest " , offset = 0 , limit = 25 ) :
story_ids = SearchStory . query ( feed_ids = [ self . pk ] , query = query , order = order ,
offset = offset , limit = limit )
2012-12-19 14:21:46 -08:00
stories_db = MStory . objects (
2014-04-23 15:05:47 -07:00
story_hash__in = story_ids
) . order_by ( ' -story_date ' if order == " newest " else ' story_date ' )
2014-04-22 12:39:09 -07:00
2012-12-19 14:21:46 -08:00
stories = self . format_stories ( stories_db , self . pk )
return stories
2010-12-02 20:18:33 -05:00
@classmethod
2013-05-14 16:36:03 -07:00
def format_stories ( cls , stories_db , feed_id = None , include_permalinks = False ) :
2010-01-21 13:12:29 -05:00
stories = [ ]
2010-10-07 19:56:23 -04:00
2010-01-21 13:12:29 -05:00
for story_db in stories_db :
2013-05-14 16:36:03 -07:00
story = cls . format_story ( story_db , feed_id , include_permalinks = include_permalinks )
2010-01-21 13:12:29 -05:00
stories . append ( story )
return stories
2011-05-08 19:41:50 -04:00
@classmethod
2016-02-26 20:01:41 -08:00
def format_story ( cls , story_db , feed_id = None , text = False , include_permalinks = False ,
show_changes = False ) :
2012-09-04 11:46:41 -07:00
if isinstance ( story_db . story_content_z , unicode ) :
story_db . story_content_z = story_db . story_content_z . decode ( ' base64 ' )
2016-02-26 20:01:41 -08:00
story_content = ' '
has_changes = False
if story_db . story_latest_content_z :
if not show_changes :
story_content = smart_unicode ( zlib . decompress ( story_db . story_latest_content_z ) )
has_changes = True
if not story_content and story_db . story_content_z :
story_content = smart_unicode ( zlib . decompress ( story_db . story_content_z ) )
2012-09-04 11:46:41 -07:00
2011-05-08 19:41:50 -04:00
story = { }
2013-04-29 15:27:22 -07:00
story [ ' story_hash ' ] = getattr ( story_db , ' story_hash ' , None )
2011-05-08 19:41:50 -04:00
story [ ' story_tags ' ] = story_db . story_tags or [ ]
2012-09-04 11:46:41 -07:00
story [ ' story_date ' ] = story_db . story_date . replace ( tzinfo = None )
2013-05-26 16:32:48 -07:00
story [ ' story_timestamp ' ] = story_db . story_date . strftime ( ' %s ' )
2014-02-10 12:42:55 -08:00
story [ ' story_authors ' ] = story_db . story_author_name or " "
2011-05-08 19:41:50 -04:00
story [ ' story_title ' ] = story_db . story_title
2011-12-24 14:45:19 -08:00
story [ ' story_content ' ] = story_content
2012-04-06 13:38:21 -07:00
story [ ' story_permalink ' ] = story_db . story_permalink
2013-06-26 11:38:49 -07:00
story [ ' image_urls ' ] = story_db . image_urls
2011-05-08 19:41:50 -04:00
story [ ' story_feed_id ' ] = feed_id or story_db . story_feed_id
2016-02-26 20:01:41 -08:00
story [ ' has_modifications ' ] = has_changes
2012-01-15 20:51:48 -08:00
story [ ' comment_count ' ] = story_db . comment_count if hasattr ( story_db , ' comment_count ' ) else 0
story [ ' comment_user_ids ' ] = story_db . comment_user_ids if hasattr ( story_db , ' comment_user_ids ' ) else [ ]
story [ ' share_count ' ] = story_db . share_count if hasattr ( story_db , ' share_count ' ) else 0
story [ ' share_user_ids ' ] = story_db . share_user_ids if hasattr ( story_db , ' share_user_ids ' ) else [ ]
story [ ' guid_hash ' ] = story_db . guid_hash if hasattr ( story_db , ' guid_hash ' ) else None
2012-04-30 11:52:19 -07:00
if hasattr ( story_db , ' source_user_id ' ) :
story [ ' source_user_id ' ] = story_db . source_user_id
2011-11-24 15:19:53 -05:00
story [ ' id ' ] = story_db . story_guid or story_db . story_date
2011-05-08 19:41:50 -04:00
if hasattr ( story_db , ' starred_date ' ) :
story [ ' starred_date ' ] = story_db . starred_date
2013-08-13 17:21:41 -07:00
if hasattr ( story_db , ' user_tags ' ) :
story [ ' user_tags ' ] = story_db . user_tags
2012-01-24 09:02:23 -08:00
if hasattr ( story_db , ' shared_date ' ) :
story [ ' shared_date ' ] = story_db . shared_date
2014-01-29 12:43:57 -08:00
if hasattr ( story_db , ' comments ' ) :
story [ ' comments ' ] = story_db . comments
if hasattr ( story_db , ' user_id ' ) :
story [ ' user_id ' ] = story_db . user_id
2013-05-14 16:36:03 -07:00
if include_permalinks and hasattr ( story_db , ' blurblog_permalink ' ) :
2012-11-27 11:59:54 -08:00
story [ ' blurblog_permalink ' ] = story_db . blurblog_permalink ( )
2011-05-08 19:41:50 -04:00
if text :
soup = BeautifulSoup ( story [ ' story_content ' ] )
text = ' ' . join ( soup . findAll ( text = True ) )
2011-05-08 20:21:09 -04:00
text = re . sub ( r ' \ n+ ' , ' \n \n ' , text )
2011-05-08 19:41:50 -04:00
text = re . sub ( r ' \ t+ ' , ' \t ' , text )
story [ ' text ' ] = text
2011-11-24 15:19:53 -05:00
2011-05-08 19:41:50 -04:00
return story
2012-01-09 13:55:26 -08:00
2010-01-04 22:26:53 +00:00
def get_tags ( self , entry ) :
fcat = [ ]
if entry . has_key ( ' tags ' ) :
for tcat in entry . tags :
2012-07-30 06:32:34 -07:00
term = None
2011-02-15 21:08:40 -05:00
if hasattr ( tcat , ' label ' ) and tcat . label :
2010-01-04 22:26:53 +00:00
term = tcat . label
2012-07-25 19:11:59 -07:00
elif hasattr ( tcat , ' term ' ) and tcat . term :
2010-01-04 22:26:53 +00:00
term = tcat . term
2012-07-30 06:32:34 -07:00
if not term :
2010-07-06 18:16:41 -04:00
continue
2010-01-04 22:26:53 +00:00
qcat = term . strip ( )
if ' , ' in qcat or ' / ' in qcat :
qcat = qcat . replace ( ' , ' , ' / ' ) . split ( ' / ' )
else :
qcat = [ qcat ]
for zcat in qcat :
tagname = zcat . lower ( )
while '  ' in tagname :
tagname = tagname . replace ( '  ' , ' ' )
tagname = tagname . strip ( )
if not tagname or tagname == ' ' :
continue
2010-08-21 20:42:38 -04:00
fcat . append ( tagname )
2012-07-21 16:38:37 -07:00
fcat = [ strip_tags ( t ) [ : 250 ] for t in fcat [ : 12 ] ]
return fcat
2011-12-08 11:19:04 -08:00
2015-02-19 10:39:10 -08:00
@classmethod
def get_permalink ( cls , entry ) :
2011-12-08 11:19:04 -08:00
link = entry . get ( ' link ' )
if not link :
links = entry . get ( ' links ' )
if links :
2011-12-08 14:51:52 -08:00
link = links [ 0 ] . get ( ' href ' )
if not link :
link = entry . get ( ' id ' )
2011-12-08 11:19:04 -08:00
return link
2014-04-17 12:10:04 -07:00
def _exists_story ( self , story , story_content , existing_stories , new_story_hashes ) :
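# Heuristic duplicate detection: match on story_hash first; otherwise only
# consider existing stories whose title is at least 75% similar and whose
# date is within a day, then compare content with difflib. A content ratio
# above .98 counts as the same story; any remaining title, content, or
# permalink difference marks it as changed so the caller updates it in place.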
2009-08-30 00:43:13 +00:00
story_in_system = None
story_has_changed = False
2011-12-14 23:26:07 -08:00
story_link = self . get_permalink ( story )
2014-04-17 12:10:04 -07:00
existing_stories_hashes = existing_stories . keys ( )
2014-03-13 16:32:13 -07:00
story_pub_date = story . get ( ' published ' )
2012-12-24 00:10:40 -08:00
# story_published_now = story.get('published_now', False)
# start_date = story_pub_date - datetime.timedelta(hours=8)
# end_date = story_pub_date + datetime.timedelta(hours=8)
2014-10-08 16:43:48 -07:00
2013-01-28 16:45:48 -08:00
for existing_story in existing_stories . values ( ) :
2009-08-30 00:43:13 +00:00
content_ratio = 0
2012-12-24 00:10:40 -08:00
# existing_story_pub_date = existing_story.story_date
2010-01-28 13:28:27 -05:00
# print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
2014-04-17 12:10:04 -07:00
if isinstance ( existing_story . id , unicode ) :
# Correcting a MongoDB bug
existing_story . story_guid = existing_story . id
2012-12-24 00:10:40 -08:00
2014-04-17 12:10:04 -07:00
if story . get ( ' story_hash ' ) == existing_story . story_hash :
story_in_system = existing_story
elif ( story . get ( ' story_hash ' ) in existing_stories_hashes and
story . get ( ' story_hash ' ) != existing_story . story_hash ) :
# Story already exists but is not this one
continue
elif ( existing_story . story_hash in new_story_hashes and
story . get ( ' story_hash ' ) != existing_story . story_hash ) :
# Story coming up later
continue
2012-12-24 00:10:40 -08:00
if ' story_latest_content_z ' in existing_story :
2015-12-02 07:46:25 +01:00
existing_story_content = smart_unicode ( zlib . decompress ( existing_story . story_latest_content_z ) )
2012-12-24 00:10:40 -08:00
elif ' story_latest_content ' in existing_story :
existing_story_content = existing_story . story_latest_content
elif ' story_content_z ' in existing_story :
2015-12-02 07:46:25 +01:00
existing_story_content = smart_unicode ( zlib . decompress ( existing_story . story_content_z ) )
2012-12-24 00:10:40 -08:00
elif ' story_content ' in existing_story :
existing_story_content = existing_story . story_content
else :
existing_story_content = u ' '
2009-08-30 00:43:13 +00:00
2014-03-13 16:32:13 -07:00
2012-12-24 00:10:40 -08:00
# Title distance + content distance, checking if story changed
story_title_difference = abs ( levenshtein_distance ( story . get ( ' title ' ) ,
existing_story . story_title ) )
2014-03-13 16:32:13 -07:00
title_ratio = difflib . SequenceMatcher ( None , story . get ( ' title ' , " " ) ,
existing_story . story_title ) . ratio ( )
if title_ratio < .75 : continue
story_timedelta = existing_story . story_date - story_pub_date
if abs ( story_timedelta . days ) > = 1 : continue
2012-12-24 00:10:40 -08:00
seq = difflib . SequenceMatcher ( None , story_content , existing_story_content )
2014-10-03 14:59:26 -07:00
similiar_length_min = 1000
2014-10-08 16:43:48 -07:00
if ( existing_story . story_permalink == story_link and
existing_story . story_title == story . get ( ' title ' ) ) :
2014-10-03 14:59:26 -07:00
similiar_length_min = 20
2012-12-24 00:10:40 -08:00
if ( seq
and story_content
2014-10-03 14:59:26 -07:00
and len ( story_content ) > similiar_length_min
2012-12-24 00:10:40 -08:00
and existing_story_content
and seq . real_quick_ratio ( ) > .9
and seq . quick_ratio ( ) > .95 ) :
content_ratio = seq . ratio ( )
2009-08-30 00:43:13 +00:00
2012-12-24 00:10:40 -08:00
if story_title_difference > 0 and content_ratio > .98 :
story_in_system = existing_story
if story_title_difference > 0 or content_ratio < 1.0 :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Title difference - %s / %s ( %s ): %s " % ( story . get ( ' title ' ) , existing_story . story_title , story_title_difference , content_ratio ) )
2009-08-30 00:43:13 +00:00
story_has_changed = True
break
2012-12-24 00:10:40 -08:00
# More restrictive content distance, still no story match
if not story_in_system and content_ratio > .98 :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Content difference - %s / %s ( %s ): %s " % ( story . get ( ' title ' ) , existing_story . story_title , story_title_difference , content_ratio ) )
2012-12-24 00:10:40 -08:00
story_in_system = existing_story
story_has_changed = True
break
if story_in_system and not story_has_changed :
if story_content != existing_story_content :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Content difference - %s ( %s )/ %s ( %s ) " % ( story . get ( ' title ' ) , len ( story_content ) , existing_story . story_title , len ( existing_story_content ) ) )
2012-12-24 00:10:40 -08:00
story_has_changed = True
if story_link != existing_story . story_permalink :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Permalink difference - %s / %s " % ( story_link , existing_story . story_permalink ) )
2012-12-24 00:10:40 -08:00
story_has_changed = True
# if story_pub_date != existing_story.story_date:
# story_has_changed = True
break
2011-12-08 11:19:04 -08:00
2010-01-28 13:28:27 -05:00
2010-02-02 18:01:02 -05:00
# if story_has_changed or not story_in_system:
2012-07-22 12:25:09 -07:00
# print 'New/updated story: %s' % (story),
2009-08-30 00:43:13 +00:00
return story_in_system , story_has_changed
2013-08-06 13:54:06 -07:00
2013-09-11 10:25:46 -07:00
def get_next_scheduled_update ( self , force = False , verbose = True , premium_speed = False ) :
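# Compute the fetch interval in minutes from stories-per-day (spd) and a
# weighted subscriber count (premiums count fully, free subscribers at 1/10).
# The cached min_to_decay is reused unless force or premium_speed is set.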
if self . min_to_decay and not force and not premium_speed :
2013-04-18 16:56:54 -07:00
return self . min_to_decay
2013-04-23 15:44:31 -07:00
2013-09-11 10:25:46 -07:00
if premium_speed :
self . active_premium_subscribers + = 1
2015-07-29 16:34:48 -07:00
spd = self . stories_last_month / 30.0
2013-04-23 16:03:45 -07:00
subs = ( self . active_premium_subscribers +
( ( self . active_subscribers - self . active_premium_subscribers ) / 10.0 ) )
2015-07-29 16:34:48 -07:00
# Calculate sub counts:
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 10 AND stories_last_month >= 30;
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 1 AND active_premium_subscribers < 10 AND stories_last_month >= 30;
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers = 1 AND stories_last_month >= 30;
# SpD > 1 Subs > 10: t = 6 # 4267 * 1440/6 = 1024080
# SpD > 1 Subs > 1: t = 15 # 18973 * 1440/15 = 1821408
# SpD > 1 Subs = 1: t = 60 # 65503 * 1440/60 = 1572072
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 1 AND stories_last_month < 30 AND stories_last_month > 0;
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers = 1 AND stories_last_month < 30 AND stories_last_month > 0;
# SpD < 1 Subs > 1: t = 60 # 77618 * 1440/60 = 1862832
# SpD < 1 Subs = 1: t = 60 * 12 # 282186 * 1440/(60*12) = 564372
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 1 AND stories_last_month = 0;
# SELECT COUNT(*) FROM feeds WHERE active_subscribers > 0 AND active_premium_subscribers <= 1 AND stories_last_month = 0;
# SpD = 0 Subs > 1: t = 60 * 3 # 30158 * 1440/(60*3) = 241264
# SpD = 0 Subs = 1: t = 60 * 24 # 514131 * 1440/(60*24) = 514131
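# Worked example with hypothetical numbers: a feed with 3 active premium subs,
# 40 active subs, and 60 stories last month gives spd = 60 / 30.0 = 2.0 and
# subs = 3 + (40 - 3) / 10.0 = 6.7, so the branches below land on total = 15
# minutes between fetches (before the push, staleness, and error adjustments).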
if spd > = 1 :
if subs > 10 :
total = 6
elif subs > 1 :
total = 15
2013-04-23 15:44:31 -07:00
else :
total = 60
2015-07-29 16:34:48 -07:00
elif spd > 0 :
2013-04-23 15:44:31 -07:00
if subs > 1 :
2015-07-29 16:34:48 -07:00
total = 60 - ( spd * 60 )
2013-04-23 15:44:31 -07:00
else :
2015-07-29 16:34:48 -07:00
total = 60 * 12 - ( spd * 60 * 12 )
elif spd == 0 :
2013-04-23 15:44:31 -07:00
if subs > 1 :
2013-04-23 17:04:21 -07:00
total = 60 * 6
2016-01-08 10:31:49 -08:00
elif subs == 1 :
total = 60 * 12
2013-04-23 15:44:31 -07:00
else :
total = 60 * 24
2013-07-02 10:36:16 -04:00
months_since_last_story = seconds_timesince ( self . last_story_date ) / ( 60 * 60 * 24 * 30 )
2013-04-23 17:04:21 -07:00
total * = max ( 1 , months_since_last_story )
2013-04-23 15:44:31 -07:00
# updates_per_day_delay = 3 * 60 / max(.25, ((max(0, self.active_subscribers)**.2)
# * (self.stories_last_month**0.25)))
# if self.active_premium_subscribers > 0:
# updates_per_day_delay /= min(self.active_subscribers+self.active_premium_subscribers, 4)
# updates_per_day_delay = int(updates_per_day_delay)
2013-03-28 12:17:30 -07:00
2010-07-02 15:49:08 -04:00
# Lots of subscribers = lots of updates
2011-04-02 00:17:59 -04:00
# 24 hours for 0 subscribers.
# 4 hours for 1 subscriber.
# .5 hours for 2 subscribers.
# .25 hours for 3 subscribers.
# 1 min for 10 subscribers.
2013-04-23 15:44:31 -07:00
# subscriber_bonus = 6 * 60 / max(.167, max(0, self.active_subscribers)**3)
# if self.premium_subscribers > 0:
# subscriber_bonus /= min(self.active_subscribers+self.premium_subscribers, 5)
# subscriber_bonus = int(subscriber_bonus)
2013-02-07 15:30:35 -08:00
2012-03-28 15:49:21 -07:00
if self . is_push :
2013-08-14 18:01:12 -07:00
fetch_history = MFetchHistory . feed ( self . pk )
if len ( fetch_history [ ' push_history ' ] ) :
total = total * 12
2012-12-21 16:48:47 -08:00
2016-01-23 15:04:05 -08:00
# 12 hour max for premiums, 48 hour max for free
2016-01-08 10:35:34 -08:00
if subs > = 1 :
2016-01-23 15:04:05 -08:00
total = min ( total , 60 * 12 * 1 )
2016-01-08 10:35:34 -08:00
else :
total = min ( total , 60 * 24 * 2 )
2013-02-13 12:54:14 -08:00
2012-01-09 19:08:22 -08:00
if verbose :
2015-07-29 16:34:48 -07:00
logging . debug ( " ---> [ %-30s ] Fetched every %s min - Subs: %s / %s / %s Stories/day: %s " % (
2013-03-28 12:17:30 -07:00
unicode ( self ) [ : 30 ] , total ,
self . num_subscribers ,
2013-04-23 15:44:31 -07:00
self . active_subscribers ,
2013-03-28 12:17:30 -07:00
self . active_premium_subscribers ,
2015-07-29 16:34:48 -07:00
spd ) )
2013-04-18 16:56:54 -07:00
return total
2010-07-25 23:13:27 -04:00
2013-03-28 12:17:30 -07:00
def set_next_scheduled_update ( self , verbose = False , skip_scheduling = False ) :
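# Recompute the fetch interval, multiply it by the feed's error count, cap it
# at a week, add random jitter of up to a quarter of the interval, and
# (unless skipped) record the feed in the scheduled_updates sorted set.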
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2013-04-18 16:56:54 -07:00
total = self . get_next_scheduled_update ( force = True , verbose = verbose )
2013-04-08 10:50:50 -07:00
error_count = self . error_count
2010-12-23 13:29:31 -05:00
2013-04-08 10:50:50 -07:00
if error_count :
total = total * error_count
2013-08-15 12:20:37 -07:00
total = min ( total , 60 * 24 * 7 )
2012-12-25 12:08:17 -08:00
if verbose :
logging . debug ( ' ---> [ %-30s ] ~FBScheduling feed fetch geometrically: '
' ~SB %s errors. Time: %s min ' % (
unicode ( self ) [ : 30 ] , self . errors_since_good , total ) )
2013-04-18 16:56:54 -07:00
random_factor = random . randint ( 0 , total ) / 4
2010-10-10 23:55:00 -04:00
next_scheduled_update = datetime . datetime . utcnow ( ) + datetime . timedelta (
2010-09-07 14:02:48 -07:00
minutes = total + random_factor )
2015-07-22 15:08:15 -07:00
original_min_to_decay = self . min_to_decay
2010-12-23 13:29:31 -05:00
self . min_to_decay = total
2015-07-22 15:08:15 -07:00
2013-04-23 13:46:07 -07:00
delta = self . next_scheduled_update - datetime . datetime . now ( )
2014-05-18 10:42:35 +08:00
minutes_to_next_fetch = ( delta . seconds + ( delta . days * 24 * 3600 ) ) / 60
2013-04-23 13:46:07 -07:00
if minutes_to_next_fetch > self . min_to_decay or not skip_scheduling :
2013-03-28 12:17:30 -07:00
self . next_scheduled_update = next_scheduled_update
2013-04-18 16:47:55 -07:00
if self . active_subscribers > = 1 :
r . zadd ( ' scheduled_updates ' , self . pk , self . next_scheduled_update . strftime ( ' %s ' ) )
2013-04-03 17:22:45 -07:00
r . zrem ( ' tasked_feeds ' , self . pk )
2013-04-08 10:50:50 -07:00
r . srem ( ' queued_feeds ' , self . pk )
2013-03-30 19:05:13 -07:00
2015-07-22 15:08:15 -07:00
updated_fields = [ ' last_update ' , ' next_scheduled_update ' ]
if self . min_to_decay != original_min_to_decay :
updated_fields . append ( ' min_to_decay ' )
self . save ( update_fields = updated_fields )
2013-04-08 10:50:50 -07:00
@property
def error_count ( self ) :
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2013-04-08 10:50:50 -07:00
fetch_errors = int ( r . zscore ( ' error_feeds ' , self . pk ) or 0 )
return fetch_errors + self . errors_since_good
2013-01-02 12:27:08 -08:00
def schedule_feed_fetch_immediately ( self , verbose = True ) :
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2016-02-09 16:34:59 -08:00
if not self . num_subscribers :
logging . debug ( ' ---> [ %-30s ] Not scheduling feed fetch immediately, no subs. ' % ( unicode ( self ) [ : 30 ] ) )
return
2013-01-02 12:27:08 -08:00
if verbose :
logging . debug ( ' ---> [ %-30s ] Scheduling feed fetch immediately... ' % ( unicode ( self ) [ : 30 ] ) )
2010-10-10 23:55:00 -04:00
self . next_scheduled_update = datetime . datetime . utcnow ( )
2013-03-30 19:05:13 -07:00
r . zadd ( ' scheduled_updates ' , self . pk , self . next_scheduled_update . strftime ( ' %s ' ) )
2010-08-25 19:10:55 -04:00
2012-03-27 17:34:39 -07:00
return self . save ( )
2010-07-27 23:29:04 -04:00
2012-03-27 18:37:04 -07:00
def setup_push ( self ) :
from apps . push . models import PushSubscription
2012-03-28 16:49:15 -07:00
try :
push = self . push
except PushSubscription . DoesNotExist :
self . is_push = False
else :
2012-03-27 18:37:04 -07:00
self . is_push = push . verified
2010-11-05 20:34:17 -04:00
self . save ( )
2012-03-28 15:49:21 -07:00
2015-02-19 13:41:09 -08:00
def queue_pushed_feed_xml ( self , xml , latest_push_date_delta = None ) :
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2012-12-26 02:41:13 -08:00
queue_size = r . llen ( " push_feeds " )
2010-07-27 23:29:04 -04:00
2015-02-19 14:20:08 -08:00
if latest_push_date_delta :
2015-02-19 14:21:08 -08:00
latest_push_date_delta = " %s " % str ( latest_push_date_delta ) . split ( ' . ' , 2 ) [ 0 ]
2015-02-19 14:20:08 -08:00
2012-12-26 02:41:13 -08:00
if queue_size > 1000 :
self . schedule_feed_fetch_immediately ( )
else :
2015-02-23 13:31:42 -08:00
logging . debug ( ' ---> [ %-30s ] [ %s ] ~FB~SBQueuing pushed stories, last pushed %s ... ' % ( unicode ( self ) [ : 30 ] , self . pk , latest_push_date_delta ) )
2012-12-26 02:41:13 -08:00
self . set_next_scheduled_update ( )
PushFeeds . apply_async ( args = ( self . pk , xml ) , queue = ' push_feeds ' )
2011-09-19 08:56:16 -07:00
# def calculate_collocations_story_content(self,
# collocation_measures=TrigramAssocMeasures,
# collocation_finder=TrigramCollocationFinder):
# stories = MStory.objects.filter(story_feed_id=self.pk)
# story_content = ' '.join([s.story_content for s in stories if s.story_content])
# return self.calculate_collocations(story_content, collocation_measures, collocation_finder)
#
# def calculate_collocations_story_title(self,
# collocation_measures=BigramAssocMeasures,
# collocation_finder=BigramCollocationFinder):
# stories = MStory.objects.filter(story_feed_id=self.pk)
# story_titles = ' '.join([s.story_title for s in stories if s.story_title])
# return self.calculate_collocations(story_titles, collocation_measures, collocation_finder)
#
# def calculate_collocations(self, content,
# collocation_measures=TrigramAssocMeasures,
# collocation_finder=TrigramCollocationFinder):
# content = re.sub(r'’', '\'', content)
# content = re.sub(r'&', '&', content)
# try:
# content = unicode(BeautifulStoneSoup(content,
# convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# except ValueError, e:
# print "ValueError, ignoring: %s" % e
# content = re.sub(r'</?\w+\s+[^>]*>', '', content)
# content = re.split(r"[^A-Za-z-'&]+", content)
#
# finder = collocation_finder.from_words(content)
# finder.apply_freq_filter(3)
# best = finder.nbest(collocation_measures.pmi, 10)
# phrases = [' '.join(phrase) for phrase in best]
#
# return phrases
2010-07-27 22:11:23 -04:00
2010-07-27 22:37:52 -04:00
# class FeedCollocations(models.Model):
# feed = models.ForeignKey(Feed)
# phrase = models.CharField(max_length=500)
2009-06-16 03:08:55 +00:00
2011-01-17 20:23:29 -05:00
class FeedData ( models . Model ) :
2011-01-17 22:48:38 -05:00
feed = AutoOneToOneField ( Feed , related_name = ' data ' )
2011-02-06 15:43:13 -05:00
feed_tagline = models . CharField ( max_length = 1024 , blank = True , null = True )
2011-01-17 20:23:29 -05:00
story_count_history = models . TextField ( blank = True , null = True )
2011-04-07 17:00:28 -04:00
feed_classifier_counts = models . TextField ( blank = True , null = True )
2011-01-17 20:23:29 -05:00
popular_tags = models . CharField ( max_length = 1024 , blank = True , null = True )
popular_authors = models . CharField ( max_length = 2048 , blank = True , null = True )
2009-06-16 03:08:55 +00:00
2011-01-17 22:48:38 -05:00
def save ( self , * args , * * kwargs ) :
2011-02-05 22:15:03 -05:00
if self . feed_tagline and len ( self . feed_tagline ) > = 1000 :
self . feed_tagline = self . feed_tagline [ : 1000 ]
2009-12-18 20:47:44 +00:00
2011-01-21 20:29:19 -05:00
try :
super ( FeedData , self ) . save ( * args , * * kwargs )
except ( IntegrityError , OperationError ) :
2011-02-05 22:09:31 -05:00
if hasattr ( self , ' id ' ) and self . id : self . delete ( )
2010-05-20 15:13:25 -04:00
2011-01-27 19:05:50 -05:00
2011-04-21 23:10:43 -04:00
class MFeedIcon ( mongo . Document ) :
2012-03-29 16:03:06 -07:00
feed_id = mongo . IntField ( primary_key = True )
color = mongo . StringField ( max_length = 6 )
data = mongo . StringField ( )
icon_url = mongo . StringField ( )
not_found = mongo . BooleanField ( default = False )
2011-04-21 23:10:43 -04:00
meta = {
' collection ' : ' feed_icons ' ,
' allow_inheritance ' : False ,
}
2014-05-16 12:18:59 -07:00
@classmethod
def get_feed ( cls , feed_id , create = True ) :
try :
feed_icon = cls . objects . read_preference ( pymongo . ReadPreference . PRIMARY ) \
. get ( feed_id = feed_id )
except cls . DoesNotExist :
if create :
feed_icon = cls . objects . create ( feed_id = feed_id )
else :
feed_icon = None
return feed_icon
2011-04-21 23:10:43 -04:00
def save ( self , * args , * * kwargs ) :
if self . icon_url :
self . icon_url = unicode ( self . icon_url )
try :
2013-05-29 19:37:50 -07:00
return super ( MFeedIcon , self ) . save ( * args , * * kwargs )
2011-04-21 23:10:43 -04:00
except ( IntegrityError , OperationError ) :
# print "Error on Icon: %s" % e
if hasattr ( self , ' _id ' ) : self . delete ( )
2010-08-27 18:35:33 -04:00
class MFeedPage ( mongo . Document ) :
feed_id = mongo . IntField ( primary_key = True )
2010-08-29 12:35:09 -04:00
page_data = mongo . BinaryField ( )
2010-08-27 18:35:33 -04:00
meta = {
2010-08-29 12:35:09 -04:00
' collection ' : ' feed_pages ' ,
2010-08-27 18:35:33 -04:00
' allow_inheritance ' : False ,
}
def save ( self , * args , * * kwargs ) :
2010-08-29 12:35:09 -04:00
if self . page_data :
2016-02-05 14:43:31 -08:00
self . page_data = zlib . compress ( self . page_data )
2013-05-29 19:37:50 -07:00
return super ( MFeedPage , self ) . save ( * args , * * kwargs )
2011-01-29 19:16:40 -05:00
2016-02-05 14:43:31 -08:00
def page ( self ) :
return zlib . decompress ( self . page_data )
2011-01-29 19:16:40 -05:00
@classmethod
def get_data ( cls , feed_id ) :
data = None
feed_page = cls . objects ( feed_id = feed_id )
if feed_page :
2012-04-24 17:40:34 -07:00
page_data_z = feed_page [ 0 ] . page_data
if page_data_z :
data = zlib . decompress ( page_data_z )
2011-01-29 19:16:40 -05:00
if not data :
dupe_feed = DuplicateFeed . objects . filter ( duplicate_feed_id = feed_id )
if dupe_feed :
feed = dupe_feed [ 0 ] . feed
feed_page = MFeedPage . objects . filter ( feed_id = feed . pk )
if feed_page :
2012-04-24 17:40:34 -07:00
page_data_z = feed_page [ 0 ] . page_data
if page_data_z :
data = zlib . decompress ( feed_page [ 0 ] . page_data )
2012-03-29 14:45:19 -07:00
2011-01-29 19:16:40 -05:00
return data
2010-05-20 15:13:25 -04:00
2010-08-21 13:57:39 -04:00
class MStory ( mongo . Document ) :
''' A feed item '''
2013-02-20 16:08:14 -08:00
story_feed_id = mongo . IntField ( )
2010-11-30 10:30:18 -05:00
story_date = mongo . DateTimeField ( )
story_title = mongo . StringField ( max_length = 1024 )
story_content = mongo . StringField ( )
story_content_z = mongo . BinaryField ( )
story_original_content = mongo . StringField ( )
2010-08-29 13:23:50 -04:00
story_original_content_z = mongo . BinaryField ( )
2012-07-22 12:25:09 -07:00
story_latest_content = mongo . StringField ( )
story_latest_content_z = mongo . BinaryField ( )
2013-01-08 18:33:30 -08:00
original_text_z = mongo . BinaryField ( )
2014-10-29 16:16:50 -07:00
original_page_z = mongo . BinaryField ( )
2010-11-30 10:30:18 -05:00
story_content_type = mongo . StringField ( max_length = 255 )
story_author_name = mongo . StringField ( )
story_permalink = mongo . StringField ( )
story_guid = mongo . StringField ( )
2013-01-08 14:11:59 -08:00
story_hash = mongo . StringField ( )
2013-06-26 11:38:49 -07:00
image_urls = mongo . ListField ( mongo . StringField ( max_length = 1024 ) )
2010-11-30 10:30:18 -05:00
story_tags = mongo . ListField ( mongo . StringField ( max_length = 250 ) )
2012-01-09 13:55:26 -08:00
comment_count = mongo . IntField ( )
comment_user_ids = mongo . ListField ( mongo . IntField ( ) )
share_count = mongo . IntField ( )
share_user_ids = mongo . ListField ( mongo . IntField ( ) )
2010-11-30 10:30:18 -05:00
2010-08-21 13:57:39 -04:00
meta = {
' collection ' : ' stories ' ,
2013-02-20 15:42:40 -08:00
' indexes ' : [ ( ' story_feed_id ' , ' -story_date ' ) ,
{ ' fields ' : [ ' story_hash ' ] ,
' unique ' : True ,
2013-05-29 18:00:09 -07:00
' types ' : False , } ] ,
2011-11-29 09:43:16 -08:00
' index_drop_dups ' : True ,
2010-08-21 20:42:38 -04:00
' ordering ' : [ ' -story_date ' ] ,
' allow_inheritance ' : False ,
2012-09-17 17:01:56 -07:00
' cascade ' : False ,
2010-08-21 13:57:39 -04:00
}
2010-08-29 13:23:50 -04:00
2013-06-04 15:34:03 -07:00
RE_STORY_HASH = re . compile ( r " ^( \ d { 1,10}):( \ w {6} )$ " )
RE_RS_KEY = re . compile ( r " ^RS:( \ d+):( \ d+)$ " )
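# story_hash keys look like "<feed_id>:<first 6 hex chars of sha1(guid)>";
# RS keys ("RS:<user_id>:<feed_id>") are the per-user read-story sets used
# elsewhere in the reader app.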
2012-01-09 13:55:26 -08:00
@property
def guid_hash ( self ) :
2012-11-27 11:59:54 -08:00
return hashlib . sha1 ( self . story_guid ) . hexdigest ( ) [ : 6 ]
2013-01-08 14:11:59 -08:00
2014-04-17 12:10:04 -07:00
@classmethod
def guid_hash_unsaved ( self , guid ) :
return hashlib . sha1 ( guid ) . hexdigest ( ) [ : 6 ]
2013-01-08 14:11:59 -08:00
@property
def feed_guid_hash ( self ) :
2013-02-20 15:42:40 -08:00
return " %s : %s " % ( self . story_feed_id , self . guid_hash )
2014-04-17 12:10:04 -07:00
@classmethod
def feed_guid_hash_unsaved ( cls , feed_id , guid ) :
return " %s : %s " % ( feed_id , cls . guid_hash_unsaved ( guid ) )
2013-09-10 11:59:31 -07:00
@property
def decoded_story_title ( self ) :
h = HTMLParser . HTMLParser ( )
return h . unescape ( self . story_title )
2010-08-29 13:23:50 -04:00
def save ( self , * args , * * kwargs ) :
2011-02-15 21:08:40 -05:00
story_title_max = MStory . _fields [ ' story_title ' ] . max_length
story_content_type_max = MStory . _fields [ ' story_content_type ' ] . max_length
2013-02-20 15:42:40 -08:00
self . story_hash = self . feed_guid_hash
2010-08-29 13:23:50 -04:00
if self . story_content :
2015-11-27 12:26:58 +01:00
self . story_content_z = zlib . compress ( smart_str ( self . story_content ) )
2010-08-29 13:23:50 -04:00
self . story_content = None
if self . story_original_content :
2015-11-27 12:26:58 +01:00
self . story_original_content_z = zlib . compress ( smart_str ( self . story_original_content ) )
2010-08-29 13:23:50 -04:00
self . story_original_content = None
2012-07-22 12:25:09 -07:00
if self . story_latest_content :
2015-11-27 12:26:58 +01:00
self . story_latest_content_z = zlib . compress ( smart_str ( self . story_latest_content ) )
2012-07-22 12:25:09 -07:00
self . story_latest_content = None
2011-02-15 21:16:34 -05:00
if self . story_title and len ( self . story_title ) > story_title_max :
2011-02-15 21:08:40 -05:00
self . story_title = self . story_title [ : story_title_max ]
2011-02-15 21:16:34 -05:00
if self . story_content_type and len ( self . story_content_type ) > story_content_type_max :
2011-02-15 21:08:40 -05:00
self . story_content_type = self . story_content_type [ : story_content_type_max ]
2013-01-08 14:11:59 -08:00
2010-08-29 13:23:50 -04:00
super ( MStory , self ) . save ( * args , * * kwargs )
2012-07-25 17:55:23 -07:00
self . sync_redis ( )
2013-05-29 19:37:50 -07:00
return self
2012-01-09 13:55:26 -08:00
2012-07-16 20:49:43 -07:00
def delete ( self , * args , * * kwargs ) :
self . remove_from_redis ( )
2014-04-15 14:59:00 -07:00
self . remove_from_search_index ( )
2012-07-16 20:49:43 -07:00
super ( MStory , self ) . delete ( * args , * * kwargs )
2015-12-26 12:53:32 -08:00
@classmethod
def purge_feed_stories ( cls , feed , cutoff , verbose = True ) :
stories = cls . objects ( story_feed_id = feed . pk )
logging . debug ( " ---> Deleting %s stories from %s " % ( stories . count ( ) , feed ) )
if stories . count ( ) > cutoff * 1.25 :
logging . debug ( " ***> ~FRToo many stories in %s , not purging... " % ( feed ) )
return
stories . delete ( )
2014-04-11 17:25:13 -07:00
@classmethod
def index_all_for_search ( cls , offset = 0 ) :
2014-04-15 14:17:15 -07:00
if not offset :
2014-04-16 15:31:44 -07:00
SearchStory . create_elasticsearch_mapping ( delete = True )
2014-04-11 17:25:13 -07:00
last_pk = Feed . objects . latest ( ' pk ' ) . pk
for f in xrange ( offset , last_pk , 1000 ) :
print " ---> %s / %s ( %.2s %% ) " % ( f , last_pk , float ( f ) / last_pk * 100 )
feeds = Feed . objects . filter ( pk__in = range ( f , f + 1000 ) ,
active = True ,
active_subscribers__gte = 1 ) \
. values_list ( ' pk ' )
for feed_id , in feeds :
stories = cls . objects . filter ( story_feed_id = feed_id )
for story in stories :
2014-04-22 12:00:20 -07:00
story . index_story_for_search ( )
2014-04-11 17:25:13 -07:00
2014-04-22 12:00:20 -07:00
def index_story_for_search ( self ) :
2014-04-23 14:25:07 -07:00
story_content = self . story_content or " "
2014-04-15 16:52:25 -07:00
if self . story_content_z :
story_content = zlib . decompress ( self . story_content_z )
2014-04-11 17:25:13 -07:00
SearchStory . index ( story_hash = self . story_hash ,
2014-04-11 15:40:58 -07:00
story_title = self . story_title ,
2014-04-15 14:17:15 -07:00
story_content = prep_for_search ( story_content ) ,
2014-04-22 15:15:42 -07:00
story_tags = self . story_tags ,
2014-04-11 15:40:58 -07:00
story_author = self . story_author_name ,
2014-04-15 14:17:15 -07:00
story_feed_id = self . story_feed_id ,
2014-04-11 17:25:13 -07:00
story_date = self . story_date )
2012-01-09 13:55:26 -08:00
2014-04-15 14:59:00 -07:00
def remove_from_search_index ( self ) :
2014-04-22 12:00:20 -07:00
try :
SearchStory . remove ( self . story_hash )
except NotFoundException :
2014-04-22 12:39:09 -07:00
pass
2014-04-15 14:59:00 -07:00
2013-06-03 17:36:57 -07:00
@classmethod
2013-06-03 17:48:11 -07:00
def trim_feed ( cls , cutoff , feed_id = None , feed = None , verbose = True ) :
2013-08-12 16:48:16 -07:00
extra_stories_count = 0
2013-06-03 17:48:11 -07:00
if not feed_id and not feed :
2013-08-12 16:48:16 -07:00
return extra_stories_count
2013-06-03 17:36:57 -07:00
2013-06-03 17:48:11 -07:00
if not feed_id :
feed_id = feed . pk
if not feed :
feed = feed_id
2013-06-03 17:36:57 -07:00
stories = cls . objects (
2014-04-03 16:25:18 -07:00
story_feed_id = feed_id
2014-04-02 12:10:34 -07:00
) . only ( ' story_date ' ) . order_by ( ' -story_date ' )
2016-01-19 11:28:27 -08:00
2013-06-03 17:36:57 -07:00
if stories . count ( ) > cutoff :
2013-08-05 10:23:22 -07:00
logging . debug ( ' ---> [ %-30s ] ~FMFound %s stories. Trimming to ~SB %s ~SN... ' %
2013-06-03 17:48:11 -07:00
( unicode ( feed ) [ : 30 ] , stories . count ( ) , cutoff ) )
2013-06-03 17:36:57 -07:00
try :
story_trim_date = stories [ cutoff ] . story_date
except IndexError , e :
2013-06-03 17:48:11 -07:00
logging . debug ( ' ***> [ %-30s ] ~BRError trimming feed: %s ' % ( unicode ( feed ) [ : 30 ] , e ) )
2013-08-12 16:48:16 -07:00
return extra_stories_count
2013-06-03 17:36:57 -07:00
2014-04-03 16:25:18 -07:00
extra_stories = cls . objects ( story_feed_id = feed_id ,
story_date__lte = story_trim_date )
2013-06-03 17:36:57 -07:00
extra_stories_count = extra_stories . count ( )
2014-04-03 16:25:18 -07:00
shared_story_count = 0
2013-06-03 17:36:57 -07:00
for story in extra_stories :
2014-04-03 16:25:18 -07:00
if story . share_count :
shared_story_count + = 1
2016-01-19 11:30:13 -08:00
extra_stories_count - = 1
2014-04-03 16:25:18 -07:00
continue
2013-06-03 17:36:57 -07:00
story . delete ( )
if verbose :
2014-04-03 16:25:18 -07:00
existing_story_count = cls . objects ( story_feed_id = feed_id ) . count ( )
2014-04-02 12:10:34 -07:00
logging . debug ( " ---> Deleted %s stories, %s ( %s shared) left. " % (
2013-06-03 17:36:57 -07:00
extra_stories_count ,
2014-04-02 12:10:34 -07:00
existing_story_count ,
shared_story_count ) )
2013-08-12 16:48:16 -07:00
return extra_stories_count
2013-06-03 17:36:57 -07:00
2012-07-26 22:12:48 -07:00
@classmethod
2014-03-28 15:11:58 -07:00
def find_story ( cls , story_feed_id = None , story_id = None , story_hash = None , original_only = False ) :
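# Look up a story by hash or ObjectId in the main collection, then fall back
# to shared and starred copies unless original_only is set. Returns the story
# and whether the original (non-shared) copy was found.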
2012-07-26 22:12:48 -07:00
from apps . social . models import MSharedStory
2013-06-19 13:22:11 -07:00
original_found = False
2014-03-28 15:11:58 -07:00
if story_hash :
story_id = story_hash
2013-06-04 15:34:03 -07:00
story_hash = cls . ensure_story_hash ( story_id , story_feed_id )
2014-03-28 15:11:58 -07:00
if not story_feed_id :
2014-03-28 15:13:41 -07:00
story_feed_id , _ = cls . split_story_hash ( story_hash )
2013-03-20 15:43:35 -07:00
if isinstance ( story_id , ObjectId ) :
story = cls . objects ( id = story_id ) . limit ( 1 ) . first ( )
else :
story = cls . objects ( story_hash = story_hash ) . limit ( 1 ) . first ( )
2013-01-08 18:33:30 -08:00
2013-06-19 13:22:11 -07:00
if story :
original_found = True
2013-01-08 18:33:30 -08:00
if not story and not original_only :
2012-07-26 22:12:48 -07:00
story = MSharedStory . objects . filter ( story_feed_id = story_feed_id ,
2013-06-04 15:34:03 -07:00
story_hash = story_hash ) . limit ( 1 ) . first ( )
2013-01-08 18:33:30 -08:00
if not story and not original_only :
2012-07-26 22:12:48 -07:00
story = MStarredStory . objects . filter ( story_feed_id = story_feed_id ,
2013-06-04 15:34:03 -07:00
story_hash = story_hash ) . limit ( 1 ) . first ( )
2012-07-26 22:12:48 -07:00
return story , original_found
2012-12-13 17:49:07 -08:00
@classmethod
def find_by_id ( cls , story_ids ) :
from apps . social . models import MSharedStory
count = len ( story_ids )
multiple = isinstance ( story_ids , list ) or isinstance ( story_ids , tuple )
stories = list ( cls . objects ( id__in = story_ids ) )
if len ( stories ) < count :
shared_stories = list ( MSharedStory . objects ( id__in = story_ids ) )
stories . extend ( shared_stories )
2013-04-29 15:27:22 -07:00
if not multiple :
stories = stories [ 0 ]
return stories
@classmethod
def find_by_story_hashes ( cls , story_hashes ) :
from apps . social . models import MSharedStory
count = len ( story_hashes )
multiple = isinstance ( story_hashes , list ) or isinstance ( story_hashes , tuple )
stories = list ( cls . objects ( story_hash__in = story_hashes ) )
if len ( stories ) < count :
2013-04-30 15:49:44 -07:00
hashes_found = [ s . story_hash for s in stories ]
2013-04-30 16:59:02 -07:00
remaining_hashes = list ( set ( story_hashes ) - set ( hashes_found ) )
2013-04-30 15:49:44 -07:00
story_feed_ids = [ h . split ( ' : ' ) [ 0 ] for h in remaining_hashes ]
2013-04-30 15:28:00 -07:00
shared_stories = list ( MSharedStory . objects ( story_feed_id__in = story_feed_ids ,
2013-04-30 15:49:44 -07:00
story_hash__in = remaining_hashes ) )
2013-04-29 15:27:22 -07:00
stories . extend ( shared_stories )
2012-12-13 17:49:07 -08:00
if not multiple :
stories = stories [ 0 ]
return stories
2013-06-04 15:34:03 -07:00
@classmethod
def ensure_story_hash ( cls , story_id , story_feed_id ) :
if not cls . RE_STORY_HASH . match ( story_id ) :
story_id = " %s : %s " % ( story_feed_id , hashlib . sha1 ( story_id ) . hexdigest ( ) [ : 6 ] )
2012-07-16 20:49:43 -07:00
2013-06-04 15:34:03 -07:00
return story_id
@classmethod
def split_story_hash ( cls , story_hash ) :
matches = cls . RE_STORY_HASH . match ( story_hash )
if matches :
groups = matches . groups ( )
return groups [ 0 ] , groups [ 1 ]
return None , None
@classmethod
def split_rs_key ( cls , rs_key ) :
matches = cls . RE_RS_KEY . match ( rs_key )
if matches :
groups = matches . groups ( )
return groups [ 0 ] , groups [ 1 ]
return None , None
@classmethod
def story_hashes ( cls , story_ids ) :
story_hashes = [ ]
for story_id in story_ids :
story_hash = cls . ensure_story_hash ( story_id )
if not story_hash : continue
story_hashes . append ( story_hash )
return story_hashes
2013-08-14 14:32:50 -07:00
def sync_redis ( self , r = None ) :
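# Record this story's hash in the per-feed Redis unread structures: the
# F:<feed_id> set and the zF:<feed_id> sorted set scored by story timestamp,
# both expiring after DAYS_OF_STORY_HASHES. Only stories newer than the
# unread cutoff are written.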
2012-07-23 13:06:12 -07:00
if not r :
2013-05-02 12:27:37 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_STORY_HASH_POOL )
2013-08-14 14:32:50 -07:00
# if not r2:
# r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
2013-09-16 16:42:49 -07:00
UNREAD_CUTOFF = datetime . datetime . now ( ) - datetime . timedelta ( days = settings . DAYS_OF_STORY_HASHES )
2012-07-25 17:55:23 -07:00
2013-04-13 22:31:05 -07:00
if self . id and self . story_date > UNREAD_CUTOFF :
2013-07-01 22:19:22 -07:00
feed_key = ' F: %s ' % self . story_feed_id
r . sadd ( feed_key , self . story_hash )
2013-09-16 16:42:49 -07:00
r . expire ( feed_key , settings . DAYS_OF_STORY_HASHES * 24 * 60 * 60 )
2013-08-14 14:32:50 -07:00
# r2.sadd(feed_key, self.story_hash)
2013-09-16 16:42:49 -07:00
# r2.expire(feed_key, settings.DAYS_OF_STORY_HASHES*24*60*60)
2013-07-01 22:19:22 -07:00
r . zadd ( ' z ' + feed_key , self . story_hash , time . mktime ( self . story_date . timetuple ( ) ) )
2013-09-16 16:42:49 -07:00
r . expire ( ' z ' + feed_key , settings . DAYS_OF_STORY_HASHES * 24 * 60 * 60 )
2013-08-14 14:32:50 -07:00
# r2.zadd('z' + feed_key, self.story_hash, time.mktime(self.story_date.timetuple()))
2013-09-16 16:42:49 -07:00
# r2.expire('z' + feed_key, settings.DAYS_OF_STORY_HASHES*24*60*60)
2012-07-16 18:11:18 -07:00
2013-08-14 14:32:50 -07:00
    def remove_from_redis(self, r=None):
        if not r:
            r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        # if not r2:
        #     r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
        if self.id:
            r.srem('F:%s' % self.story_feed_id, self.story_hash)
            # r2.srem('F:%s' % self.story_feed_id, self.story_hash)
            r.zrem('zF:%s' % self.story_feed_id, self.story_hash)
            # r2.zrem('zF:%s' % self.story_feed_id, self.story_hash)
    
    @classmethod
    def sync_feed_redis(cls, story_feed_id):
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        # r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
        UNREAD_CUTOFF = datetime.datetime.now() - datetime.timedelta(days=settings.DAYS_OF_STORY_HASHES)
        feed = Feed.get_by_id(story_feed_id)
        stories = cls.objects.filter(story_feed_id=story_feed_id, story_date__gte=UNREAD_CUTOFF)
        r.delete('F:%s' % story_feed_id)
        # r2.delete('F:%s' % story_feed_id)
        r.delete('zF:%s' % story_feed_id)
        # r2.delete('zF:%s' % story_feed_id)
        
        logging.info("   ---> [%-30s] ~FMSyncing ~SB%s~SN stories to redis" % (
                     feed and feed.title[:30] or story_feed_id, stories.count()))
        p = r.pipeline()
        # p2 = r2.pipeline()
        for story in stories:
            story.sync_redis(r=p)
        p.execute()
        # p2.execute()
    
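    # Redis layout used by sync_redis() / remove_from_redis() / sync_feed_redis() above
    # (inferred from the calls in this class): "F:<feed_id>" holds a set of story hashes
    # inside the unread window and "zF:<feed_id>" holds the same hashes in a sorted set
    # scored by story timestamp; both keys expire after DAYS_OF_STORY_HASHES days. A
    # minimal read-back sketch, assuming the same REDIS_STORY_HASH_POOL connection and a
    # local feed_id variable:
    #
    #   >>> r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
    #   >>> r.zrevrange('zF:%s' % feed_id, 0, 9)   # ten newest story hashes for feed_id
    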
    def count_comments(self):
        from apps.social.models import MSharedStory
        params = {
            'story_guid': self.story_guid,
            'story_feed_id': self.story_feed_id,
        }
        comments = MSharedStory.objects.filter(has_comments=True, **params).only('user_id')
        shares = MSharedStory.objects.filter(**params).only('user_id')
        self.comment_count = comments.count()
        self.comment_user_ids = [c['user_id'] for c in comments]
        self.share_count = shares.count()
        self.share_user_ids = [s['user_id'] for s in shares]
        self.save()
    
    def extract_image_urls(self, force=False):
        if self.image_urls and not force:
            return self.image_urls
        
        story_content = self.story_content
        if not story_content and self.story_content_z:
            story_content = zlib.decompress(self.story_content_z)
        if not story_content:
            return
        
        try:
            soup = BeautifulSoup(story_content)
        except ValueError:
            return
        
        images = soup.findAll('img')
        if not images:
            return
        
        image_urls = []
        for image in images:
            image_url = image.get('src')
            if not image_url:
                continue
            if image_url and len(image_url) >= 1024:
                continue
            image_urls.append(image_url)
        
        if not image_urls:
            return
        
        self.image_urls = image_urls
        return self.image_urls
    
    def fetch_original_text(self, force=False, request=None, debug=False):
        original_text_z = self.original_text_z
        
        if not original_text_z or force:
            feed = Feed.get_by_id(self.story_feed_id)
            ti = TextImporter(self, feed=feed, request=request, debug=debug)
            original_text = ti.fetch()
        else:
            logging.user(request, "~FYFetching ~FGoriginal~FY story text, ~SBfound.")
            original_text = zlib.decompress(original_text_z)
        
        return original_text
    
    def fetch_original_page(self, force=False, request=None, debug=False):
        from apps.rss_feeds.page_importer import PageImporter
        if not self.original_page_z or force:
            feed = Feed.get_by_id(self.story_feed_id)
            importer = PageImporter(request=request, feed=feed, story=self)
            original_page = importer.fetch_story()
        else:
            logging.user(request, "~FYFetching ~FGoriginal~FY story page, ~SBfound.")
            original_page = zlib.decompress(self.original_page_z)
        
        return original_page


class MStarredStory(mongo.Document):
    """Like MStory, but not inherited due to large overhead of _cls and _type in
       mongoengine's inheritance model on every single row."""
    user_id                  = mongo.IntField(unique_with=('story_guid',))
    starred_date             = mongo.DateTimeField()
    story_feed_id            = mongo.IntField()
    story_date               = mongo.DateTimeField()
    story_title              = mongo.StringField(max_length=1024)
    story_content            = mongo.StringField()
    story_content_z          = mongo.BinaryField()
    story_original_content   = mongo.StringField()
    story_original_content_z = mongo.BinaryField()
    original_text_z          = mongo.BinaryField()
    story_content_type       = mongo.StringField(max_length=255)
    story_author_name        = mongo.StringField()
    story_permalink          = mongo.StringField()
    story_guid               = mongo.StringField()
    story_hash               = mongo.StringField()
    story_tags               = mongo.ListField(mongo.StringField(max_length=250))
    user_tags                = mongo.ListField(mongo.StringField(max_length=128))
    image_urls               = mongo.ListField(mongo.StringField(max_length=1024))
    
    meta = {
        'collection': 'starred_stories',
        'indexes': [('user_id', '-starred_date'), ('user_id', 'story_feed_id'), 'story_feed_id'],
        'index_drop_dups': True,
        'ordering': ['-starred_date'],
        'allow_inheritance': False,
    }
    
    def save(self, *args, **kwargs):
        if self.story_content:
            self.story_content_z = zlib.compress(self.story_content)
            self.story_content = None
        if self.story_original_content:
            self.story_original_content_z = zlib.compress(self.story_original_content)
            self.story_original_content = None
        self.story_hash = self.feed_guid_hash
        
        return super(MStarredStory, self).save(*args, **kwargs)
    
    @classmethod
    def find_stories(cls, query, user_id, tag=None, offset=0, limit=25, order="newest"):
        stories_db = cls.objects(
            Q(user_id=user_id) &
            (Q(story_title__icontains=query) |
             Q(story_author_name__icontains=query) |
             Q(story_tags__icontains=query))
        )
        if tag:
            stories_db = stories_db.filter(user_tags__contains=tag)
        
        stories_db = stories_db.order_by('%sstarred_date' %
                                         ('-' if order == "newest" else ""))[offset:offset+limit]
        stories = Feed.format_stories(stories_db)
        
        return stories
    
    @classmethod
    def find_stories_by_user_tag(cls, user_tag, user_id, offset=0, limit=25):
        stories_db = cls.objects(
            Q(user_id=user_id),
            Q(user_tags__icontains=user_tag)
        ).order_by('-starred_date')[offset:offset+limit]
        stories = Feed.format_stories(stories_db)
        
        return stories
    
    @classmethod
    def trim_old_stories(cls, stories=10, days=90, dryrun=False):
        print " ---> Fetching starred story counts..."
        stats = settings.MONGODB.newsblur.starred_stories.aggregate([{
            "$group": {
                "_id": "$user_id",
                "stories": {"$sum": 1},
            },
        }, {
            "$match": {
                "stories": {"$gte": stories}
            },
        }])
        month_ago = datetime.datetime.now() - datetime.timedelta(days=days)
        user_ids = stats['result']
        user_ids = sorted(user_ids, key=lambda x: x['stories'], reverse=True)
        print " ---> Found %s users with more than %s starred stories" % (len(user_ids), stories)
        
        total = 0
        for stat in user_ids:
            try:
                user = User.objects.select_related('profile').get(pk=stat['_id'])
            except User.DoesNotExist:
                user = None
            
            if user and (user.profile.is_premium or user.profile.last_seen_on > month_ago):
                continue
            
            total += stat['stories']
            username = "%s (%s)" % (user and user.username or "-", stat['_id'])
            print " ---> %19.19s: %-20.20s %s stories" % (user and user.profile.last_seen_on or "Deleted",
                                                          username,
                                                          stat['stories'])
            if not dryrun and stat['_id']:
                cls.objects.filter(user_id=stat['_id']).delete()
            elif not dryrun and stat['_id'] == 0:
                print " ---> Deleting unstarred stories (user_id = 0)"
                cls.objects.filter(user_id=stat['_id']).delete()
        
        print " ---> Deleted %s stories in total." % total
    
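    # Hypothetical maintenance usage for trim_old_stories() above (it is not called from
    # within this module): preview with dryrun=True, then delete for real.
    #
    #   >>> MStarredStory.trim_old_stories(stories=10, days=90, dryrun=True)
    #   >>> MStarredStory.trim_old_stories(stories=10, days=90)
    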
    @property
    def guid_hash(self):
        return hashlib.sha1(self.story_guid).hexdigest()[:6]
    
    @property
    def feed_guid_hash(self):
        return "%s:%s" % (self.story_feed_id or "0", self.guid_hash)
    
    def fetch_original_text(self, force=False, request=None, debug=False):
        original_text_z = self.original_text_z
        feed = Feed.get_by_id(self.story_feed_id)
        
        if not original_text_z or force:
            ti = TextImporter(self, feed=feed, request=request, debug=debug)
            original_text = ti.fetch()
        else:
            logging.user(request, "~FYFetching ~FGoriginal~FY story text, ~SBfound.")
            original_text = zlib.decompress(original_text_z)
        
        return original_text


class MStarredStoryCounts(mongo.Document):
    user_id = mongo.IntField()
    tag = mongo.StringField(max_length=128)
    feed_id = mongo.IntField()
    slug = mongo.StringField(max_length=128)
    count = mongo.IntField(default=0)
    
    meta = {
        'collection': 'starred_stories_counts',
        'indexes': ['user_id'],
        'ordering': ['tag'],
        'allow_inheritance': False,
    }
    
    @property
    def rss_url(self, secret_token=None):
        if self.feed_id:
            return
        
        if not secret_token:
            user = User.objects.select_related('profile').get(pk=self.user_id)
            secret_token = user.profile.secret_token
        
        slug = self.slug if self.slug else ""
        
        return "%s/reader/starred_rss/%s/%s/%s" % (settings.NEWSBLUR_URL, self.user_id,
                                                   secret_token, slug)
    
    @classmethod
    def user_counts(cls, user_id, include_total=False, try_counting=True):
        counts = cls.objects.filter(user_id=user_id)
        counts = sorted([{'tag': c.tag,
                          'count': c.count,
                          'feed_address': c.rss_url,
                          'feed_id': c.feed_id}
                         for c in counts],
                        key=lambda x: (x.get('tag', '') or '').lower())
        
        total = 0
        feed_total = 0
        for c in counts:
            if not c['tag'] and not c['feed_id']:
                total = c['count']
            if c['feed_id']:
                feed_total += c['count']
        
        if try_counting and (total != feed_total or not len(counts)):
            user = User.objects.get(pk=user_id)
            logging.user(user, "~FC~SBCounting~SN saved stories (%s total vs. %s counted)..." %
                               (total, feed_total))
            cls.count_for_user(user_id)
            return cls.user_counts(user_id, include_total=include_total,
                                   try_counting=False)
        
        if include_total:
            return counts, total
        return counts
    
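    # Shape returned by user_counts() above: a list of dicts like
    # {'tag': ..., 'count': ..., 'feed_address': ..., 'feed_id': ...}, where the row with
    # neither tag nor feed_id carries the user's total; with include_total=True the method
    # returns (counts, total) instead.
    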
    @classmethod
    def schedule_count_tags_for_user(cls, user_id):
        ScheduleCountTagsForUser.apply_async(kwargs=dict(user_id=user_id))
    
    @classmethod
    def count_for_user(cls, user_id, total_only=False):
        user_tags = []
        user_feeds = []
        
        if not total_only:
            cls.objects(user_id=user_id).delete()
            try:
                user_tags = cls.count_tags_for_user(user_id)
                user_feeds = cls.count_feeds_for_user(user_id)
            except pymongo.errors.OperationFailure, e:
                logging.debug(" ---> ~FBOperationError on mongo: ~SB%s" % e)
        
        total_stories_count = MStarredStory.objects(user_id=user_id).count()
        cls.objects(user_id=user_id, tag=None, feed_id=None).update_one(set__count=total_stories_count,
                                                                        upsert=True)
        
        return dict(total=total_stories_count, tags=user_tags, feeds=user_feeds)
    
    @classmethod
    def count_tags_for_user(cls, user_id):
        all_tags = MStarredStory.objects(user_id=user_id,
                                         user_tags__exists=True).item_frequencies('user_tags')
        user_tags = sorted([(k, v) for k, v in all_tags.items() if int(v) > 0 and k],
                           key=lambda x: x[0].lower(),
                           reverse=True)
        
        for tag, count in dict(user_tags).items():
            cls.objects(user_id=user_id, tag=tag, slug=slugify(tag)).update_one(set__count=count,
                                                                                upsert=True)
        
        return user_tags
    
    @classmethod
    def count_feeds_for_user(cls, user_id):
        all_feeds = MStarredStory.objects(user_id=user_id).item_frequencies('story_feed_id')
        user_feeds = dict([(k, v) for k, v in all_feeds.items() if v])
        
        # Clean up None'd and 0'd feed_ids, so they can be counted against the total
        if user_feeds.get(None, False):
            user_feeds[0] = user_feeds.get(0, 0)
            user_feeds[0] += user_feeds.get(None)
            del user_feeds[None]
        if user_feeds.get(0, False):
            user_feeds[-1] = user_feeds.get(0, 0)
            del user_feeds[0]
        
        for feed_id, count in user_feeds.items():
            cls.objects(user_id=user_id,
                        feed_id=feed_id,
                        slug="feed:%s" % feed_id).update_one(set__count=count,
                                                             upsert=True)
        
        return user_feeds
    
    @classmethod
    def adjust_count(cls, user_id, feed_id=None, tag=None, amount=0):
        params = dict(user_id=user_id)
        if feed_id:
            params['feed_id'] = feed_id
        if tag:
            params['tag'] = tag
        
        cls.objects(**params).update_one(inc__count=amount, upsert=True)
        
        try:
            story_count = cls.objects.get(**params)
        except cls.MultipleObjectsReturned:
            story_count = cls.objects(**params).first()
        if story_count and story_count.count <= 0:
            story_count.delete()
    
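    # Hedged usage sketch for adjust_count() above: callers that star or unstar a story
    # would presumably bump the per-feed and per-tag rows by +1/-1 (the user_id, story and
    # tag names below are stand-ins, not values defined in this module); rows that reach
    # zero are deleted.
    #
    #   >>> MStarredStoryCounts.adjust_count(user_id, feed_id=story.story_feed_id, amount=1)
    #   >>> MStarredStoryCounts.adjust_count(user_id, tag='python', amount=-1)

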
class MFetchHistory(mongo.Document):
    feed_id = mongo.IntField(unique=True)
    feed_fetch_history = mongo.DynamicField()
    page_fetch_history = mongo.DynamicField()
    push_history = mongo.DynamicField()
    
    meta = {
        'db_alias': 'nbanalytics',
        'collection': 'fetch_history',
        'allow_inheritance': False,
    }
    
    @classmethod
    def feed(cls, feed_id, timezone=None, fetch_history=None):
        if not fetch_history:
            try:
                fetch_history = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
                                           .get(feed_id=feed_id)
            except cls.DoesNotExist:
                fetch_history = cls.objects.create(feed_id=feed_id)
        history = {}
        
        for fetch_type in ['feed_fetch_history', 'page_fetch_history', 'push_history']:
            history[fetch_type] = getattr(fetch_history, fetch_type)
            if not history[fetch_type]:
                history[fetch_type] = []
            for f, fetch in enumerate(history[fetch_type]):
                date_key = 'push_date' if fetch_type == 'push_history' else 'fetch_date'
                history[fetch_type][f] = {
                    date_key: localtime_for_timezone(fetch[0],
                                                     timezone).strftime("%Y-%m-%d %H:%M:%S"),
                    'status_code': fetch[1],
                    'message': fetch[2],
                }
        return history
    
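    # Shape returned by feed() above: a dict keyed by 'feed_fetch_history',
    # 'page_fetch_history' and 'push_history', each a list of
    # {'fetch_date'/'push_date': ..., 'status_code': ..., 'message': ...} entries
    # rendered in the requested timezone.
    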
    @classmethod
    def add(cls, feed_id, fetch_type, date=None, message=None, code=None, exception=None):
        if not date:
            date = datetime.datetime.now()
        try:
            fetch_history = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
                                       .get(feed_id=feed_id)
        except cls.DoesNotExist:
            fetch_history = cls.objects.create(feed_id=feed_id)
        
        if fetch_type == 'feed':
            history = fetch_history.feed_fetch_history or []
        elif fetch_type == 'page':
            history = fetch_history.page_fetch_history or []
        elif fetch_type == 'push':
            history = fetch_history.push_history or []
        
        history = [[date, code, message]] + history
        any_exceptions = any([c for d, c, m in history if c not in [200, 304]])
        if any_exceptions:
            history = history[:25]
        else:
            history = history[:5]
        
        if fetch_type == 'feed':
            fetch_history.feed_fetch_history = history
        elif fetch_type == 'page':
            fetch_history.page_fetch_history = history
        elif fetch_type == 'push':
            fetch_history.push_history = history
        
        fetch_history.save()
        
        if fetch_type == 'feed':
            RStats.add('feed_fetch')
        
        return cls.feed(feed_id, fetch_history=fetch_history)


class DuplicateFeed(models.Model):
    duplicate_address = models.CharField(max_length=764, db_index=True)
    duplicate_link = models.CharField(max_length=764, null=True, db_index=True)
    duplicate_feed_id = models.CharField(max_length=255, null=True, db_index=True)
    feed = models.ForeignKey(Feed, related_name='duplicate_addresses')
    
    def __unicode__(self):
        return "%s: %s / %s" % (self.feed, self.duplicate_address, self.duplicate_link)
    
    def canonical(self):
        return {
            'duplicate_address': self.duplicate_address,
            'duplicate_link': self.duplicate_link,
            'duplicate_feed_id': self.duplicate_feed_id,
            'feed_id': self.feed_id,
        }
    
    def save(self, *args, **kwargs):
        max_address = DuplicateFeed._meta.get_field('duplicate_address').max_length
        if len(self.duplicate_address) > max_address:
            self.duplicate_address = self.duplicate_address[:max_address]
        max_link = DuplicateFeed._meta.get_field('duplicate_link').max_length
        if self.duplicate_link and len(self.duplicate_link) > max_link:
            self.duplicate_link = self.duplicate_link[:max_link]
        
        super(DuplicateFeed, self).save(*args, **kwargs)


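# merge_feeds() below folds a duplicate Feed into the surviving one: unless force=True it
# keeps whichever side has more subscribers (and avoids keeping a branched feed), moves
# user subscriptions over, deletes the duplicate's stories and pages, records a
# DuplicateFeed row, and returns the surviving feed's id. A hedged call sketch (the ids
# are placeholders; which id survives depends on subscriber counts):
#
#   >>> merge_feeds(original_feed_id=1, duplicate_feed_id=2)
#   1
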
def merge_feeds(original_feed_id, duplicate_feed_id, force=False):
    from apps.reader.models import UserSubscription
    from apps.social.models import MSharedStory
    
    if original_feed_id == duplicate_feed_id:
        logging.info(" ***> Merging the same feed. Ignoring...")
        return original_feed_id
    
    try:
        original_feed = Feed.objects.get(pk=original_feed_id)
        duplicate_feed = Feed.objects.get(pk=duplicate_feed_id)
    except Feed.DoesNotExist:
        logging.info(" ***> Already deleted feed: %s" % duplicate_feed_id)
        return original_feed_id
    
    heavier_dupe = original_feed.num_subscribers < duplicate_feed.num_subscribers
    branched_original = original_feed.branch_from_feed
    if (heavier_dupe or branched_original) and not force:
        original_feed, duplicate_feed = duplicate_feed, original_feed
        original_feed_id, duplicate_feed_id = duplicate_feed_id, original_feed_id
        if branched_original:
            original_feed.feed_address = duplicate_feed.feed_address
    
    logging.info(" ---> Feed: [%s - %s] %s - %s" % (original_feed_id, duplicate_feed_id,
                                                    original_feed, original_feed.feed_link))
    logging.info("      Orig ++> %s: (%s subs) %s / %s %s" % (
        original_feed.pk,
        original_feed.num_subscribers,
        original_feed.feed_address,
        original_feed.feed_link,
        "[B: %s]" % original_feed.branch_from_feed.pk if original_feed.branch_from_feed else ""))
    logging.info("      Dupe --> %s: (%s subs) %s / %s %s" % (
        duplicate_feed.pk,
        duplicate_feed.num_subscribers,
        duplicate_feed.feed_address,
        duplicate_feed.feed_link,
        "[B: %s]" % duplicate_feed.branch_from_feed.pk if duplicate_feed.branch_from_feed else ""))
    
    original_feed.branch_from_feed = None
    
    user_subs = UserSubscription.objects.filter(feed=duplicate_feed).order_by('-pk')
    for user_sub in user_subs:
        user_sub.switch_feed(original_feed, duplicate_feed)
    
    def delete_story_feed(model, feed_field='feed_id'):
        duplicate_stories = model.objects(**{feed_field: duplicate_feed.pk})
        # if duplicate_stories.count():
        #     logging.info(" ---> Deleting %s %s" % (duplicate_stories.count(), model))
        duplicate_stories.delete()
    
    delete_story_feed(MStory, 'story_feed_id')
    delete_story_feed(MFeedPage, 'feed_id')
    
    try:
        DuplicateFeed.objects.create(
            duplicate_address=duplicate_feed.feed_address,
            duplicate_link=duplicate_feed.feed_link,
            duplicate_feed_id=duplicate_feed.pk,
            feed=original_feed
        )
    except (IntegrityError, OperationError), e:
        logging.info(" ***> Could not save DuplicateFeed: %s" % e)
    
    # Switch this dupe feed's dupe feeds over to the new original.
    duplicate_feeds_duplicate_feeds = DuplicateFeed.objects.filter(feed=duplicate_feed)
    for dupe_feed in duplicate_feeds_duplicate_feeds:
        dupe_feed.feed = original_feed
        dupe_feed.duplicate_feed_id = duplicate_feed.pk
        dupe_feed.save()
    
    logging.debug(' ---> Dupe subscribers (%s): %s, Original subscribers (%s): %s' %
                  (duplicate_feed.pk, duplicate_feed.num_subscribers,
                   original_feed.pk, original_feed.num_subscribers))
    
    if duplicate_feed.pk != original_feed.pk:
        duplicate_feed.delete()
    else:
        logging.debug(" ***> Duplicate feed is the same as original feed. Panic!")
    logging.debug(' ---> Deleted duplicate feed: %s/%s' % (duplicate_feed, duplicate_feed_id))
    
    original_feed.branch_from_feed = None
    original_feed.count_subscribers()
    original_feed.save()
    logging.debug(' ---> Now original subscribers: %s' %
                  (original_feed.num_subscribers))
    
    MSharedStory.switch_feed(original_feed_id, duplicate_feed_id)
    
    return original_feed_id

def rewrite_folders(folders, original_feed, duplicate_feed):
    new_folders = []
    
    for k, folder in enumerate(folders):
        if isinstance(folder, int):
            if folder == duplicate_feed.pk:
                # logging.info(" ===> Rewrote %s'th item: %s" % (k+1, folders))
                new_folders.append(original_feed.pk)
            else:
                new_folders.append(folder)
        elif isinstance(folder, dict):
            for f_k, f_v in folder.items():
                new_folders.append({f_k: rewrite_folders(f_v, original_feed, duplicate_feed)})
    
    return new_folders
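
# Hedged example of the folder structure rewrite_folders() walks (inferred from the
# isinstance checks above): folders are nested lists of feed ids and {folder_name: [...]}
# dicts, and every occurrence of the duplicate feed's pk is swapped for the original's.
# Assuming original_feed.pk == 1 and duplicate_feed.pk == 2:
#
#   >>> rewrite_folders([2, {'Tech': [2, 3]}], original_feed, duplicate_feed)
#   [1, {'Tech': [1, 3]}]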