import difflib
import requests
import datetime
import time
import random
import re
import math
import mongoengine as mongo
import zlib
import hashlib
import redis
import base64
import pymongo
import html
import urllib.parse
from collections import defaultdict
from operator import itemgetter
from bson.objectid import ObjectId
from bs4 import BeautifulSoup
# from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
from django.db import models
from django.db import IntegrityError
from django.conf import settings
from django.db.models.query import QuerySet
from django.db.utils import DatabaseError
from django.urls import reverse
from django.contrib.auth.models import User
from django.contrib.sites.models import Site
from django.template.defaultfilters import slugify
from django.utils.encoding import smart_bytes, smart_str
from django.utils.encoding import DjangoUnicodeDecodeError
from mongoengine.queryset import OperationError, Q, NotUniqueError
from mongoengine.errors import ValidationError
from vendor.timezones.utilities import localtime_for_timezone
from apps.rss_feeds.tasks import UpdateFeeds, PushFeeds, ScheduleCountTagsForUser
from apps.rss_feeds.text_importer import TextImporter
from apps.search.models import SearchStory, SearchFeed
from apps.statistics.rstats import RStats
from utils import json_functions as json
from utils import feedfinder_forman
from utils import feedfinder_pilgrim
from utils import urlnorm
from utils import log as logging
from utils.fields import AutoOneToOneField
from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
from utils.story_functions import prep_for_search
from utils.story_functions import create_imageproxy_signed_url

ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = list(range(4))

class Feed(models.Model):
    feed_address = models.URLField(max_length=764, db_index=True)
    feed_address_locked = models.BooleanField(default=False, blank=True, null=True)
    feed_link = models.URLField(max_length=1000, default="", blank=True, null=True)
    feed_link_locked = models.BooleanField(default=False)
    hash_address_and_link = models.CharField(max_length=64, unique=True)
    feed_title = models.CharField(max_length=255, default="[Untitled]", blank=True, null=True)
    is_push = models.BooleanField(default=False, blank=True, null=True)
    active = models.BooleanField(default=True, db_index=True)
    num_subscribers = models.IntegerField(default=-1)
    active_subscribers = models.IntegerField(default=-1, db_index=True)
    premium_subscribers = models.IntegerField(default=-1)
    pro_subscribers = models.IntegerField(default=0, null=True, blank=True)
    active_premium_subscribers = models.IntegerField(default=-1)
    branch_from_feed = models.ForeignKey('Feed', blank=True, null=True, db_index=True, on_delete=models.CASCADE)
    last_update = models.DateTimeField(db_index=True)
    next_scheduled_update = models.DateTimeField()
    last_story_date = models.DateTimeField(null=True, blank=True)
    fetched_once = models.BooleanField(default=False)
    known_good = models.BooleanField(default=False)
    has_feed_exception = models.BooleanField(default=False, db_index=True)
    has_page_exception = models.BooleanField(default=False, db_index=True)
    has_page = models.BooleanField(default=True)
    exception_code = models.IntegerField(default=0)
    errors_since_good = models.IntegerField(default=0)
    min_to_decay = models.IntegerField(default=0)
    days_to_trim = models.IntegerField(default=90)
    creation = models.DateField(auto_now_add=True)
    etag = models.CharField(max_length=255, blank=True, null=True)
    last_modified = models.DateTimeField(null=True, blank=True)
    stories_last_month = models.IntegerField(default=0)
    average_stories_per_month = models.IntegerField(default=0)
    last_load_time = models.IntegerField(default=0)
    favicon_color = models.CharField(max_length=6, null=True, blank=True)
    favicon_not_found = models.BooleanField(default=False)
    s3_page = models.BooleanField(default=False, blank=True, null=True)
    s3_icon = models.BooleanField(default=False, blank=True, null=True)
    search_indexed = models.BooleanField(default=None, null=True, blank=True)

    class Meta:
        db_table = "feeds"
        ordering = ["feed_title"]
        # unique_together=[('feed_address', 'feed_link')]

    def __str__(self):
        if not self.feed_title:
            self.feed_title = "[Untitled]"
            self.save()
        return "%s%s: %s - %s/%s/%s/%s" % (
            self.pk,
            (" [B: %s]" % self.branch_from_feed.pk if self.branch_from_feed else ""),
            self.feed_title,
            self.num_subscribers,
            self.active_subscribers,
            self.active_premium_subscribers,
            self.pro_subscribers,
        )

    @property
    def title(self):
        title = self.feed_title or "[Untitled]"
        if self.active_premium_subscribers >= 1:
            title = "%s*" % title[:29]
        return title

    @property
    def log_title(self):
        return self.__str__()

    @property
    def permalink(self):
        return "%s/site/%s/%s" % (settings.NEWSBLUR_URL, self.pk, slugify(self.feed_title.lower()[:50]))

    @property
    def favicon_url(self):
        if settings.BACKED_BY_AWS['icons_on_s3'] and self.s3_icon:
            return "https://s3.amazonaws.com/%s/%s.png" % (settings.S3_ICONS_BUCKET_NAME, self.pk)
        return reverse('feed-favicon', kwargs={'feed_id': self.pk})

    @property
    def favicon_url_fqdn(self):
        if settings.BACKED_BY_AWS['icons_on_s3'] and self.s3_icon:
            return self.favicon_url
        return "http://%s%s" % (
            Site.objects.get_current().domain,
            self.favicon_url
        )

    @property
    def s3_pages_key(self):
        return "%s.gz.html" % self.pk

    @property
    def s3_icons_key(self):
        return "%s.png" % self.pk

    @property
    def unread_cutoff(self):
        if self.pro_subscribers > 0:
            return datetime.datetime.utcnow() - datetime.timedelta(days=9999)
        if self.active_premium_subscribers > 0:
            return datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
        return datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD_FREE)

    @property
    def story_hashes_in_unread_cutoff(self):
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        current_time = int(time.time() + 60*60*24)
        unread_cutoff = self.unread_cutoff.strftime('%s')
        story_hashes = r.zrevrangebyscore('zF:%s' % self.pk, current_time, unread_cutoff)

        return story_hashes

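    # Story hashes for this feed live in the Redis sorted set "zF:<feed_id>", scored
    # by story timestamp, so the property above is a reverse range query from roughly
    # a day in the future (the 24-hour pad on current_time) down to the unread cutoff.
    # A rough equivalent by hand, assuming a feed with pk=42:
    #
    #   r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
    #   hashes = r.zrevrangebyscore('zF:42', int(time.time() + 24*60*60),
    #                               feed.unread_cutoff.strftime('%s'))
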
    @classmethod
    def generate_hash_address_and_link(cls, feed_address, feed_link):
        if not feed_address: feed_address = ""
        if not feed_link: feed_link = ""
        return hashlib.sha1((feed_address + feed_link).encode(encoding='utf-8')).hexdigest()

    @property
    def is_newsletter(self):
        return self.feed_address.startswith('newsletter:') or self.feed_address.startswith('http://newsletter:')

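    # The hash backing the unique `hash_address_and_link` column is simply SHA-1 over
    # the concatenated address and link. For example (URLs illustrative):
    #
    #   Feed.generate_hash_address_and_link("http://example.com/rss", "http://example.com")
    #   == hashlib.sha1(b"http://example.com/rsshttp://example.com").hexdigest()
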
    def canonical(self, full=False, include_favicon=True):
        feed = {
            'id': self.pk,
            'feed_title': self.feed_title,
            'feed_address': self.feed_address,
            'feed_link': self.feed_link,
            'num_subscribers': self.num_subscribers,
            'updated': relative_timesince(self.last_update),
            'updated_seconds_ago': seconds_timesince(self.last_update),
            'last_story_date': self.last_story_date,
            'last_story_seconds_ago': seconds_timesince(self.last_story_date),
            'stories_last_month': self.stories_last_month,
            'average_stories_per_month': self.average_stories_per_month,
            'min_to_decay': self.min_to_decay,
            'subs': self.num_subscribers,
            'is_push': self.is_push,
            'is_newsletter': self.is_newsletter,
            'fetched_once': self.fetched_once,
            'search_indexed': self.search_indexed,
            'not_yet_fetched': not self.fetched_once,  # Legacy. Doh.
            'favicon_color': self.favicon_color,
            'favicon_fade': self.favicon_fade(),
            'favicon_border': self.favicon_border(),
            'favicon_text_color': self.favicon_text_color(),
            'favicon_fetching': self.favicon_fetching,
            'favicon_url': self.favicon_url,
            's3_page': self.s3_page,
            's3_icon': self.s3_icon,
            'disabled_page': not self.has_page,
        }

        if include_favicon:
            try:
                feed_icon = MFeedIcon.objects.get(feed_id=self.pk)
                feed['favicon'] = feed_icon.data
            except MFeedIcon.DoesNotExist:
                pass
        if self.has_page_exception or self.has_feed_exception:
            feed['has_exception'] = True
            feed['exception_type'] = 'feed' if self.has_feed_exception else 'page'
            feed['exception_code'] = self.exception_code
        elif full:
            feed['has_exception'] = False
            feed['exception_type'] = None
            feed['exception_code'] = self.exception_code

        if full:
            feed['average_stories_per_month'] = self.average_stories_per_month
            feed['tagline'] = self.data.feed_tagline
            feed['feed_tags'] = json.decode(self.data.popular_tags) if self.data.popular_tags else []
            feed['feed_authors'] = json.decode(self.data.popular_authors) if self.data.popular_authors else []

        return feed

    def save(self, *args, **kwargs):
        if not self.last_update:
            self.last_update = datetime.datetime.utcnow()
        if not self.next_scheduled_update:
            self.next_scheduled_update = datetime.datetime.utcnow()
        self.fix_google_alerts_urls()

        feed_address = self.feed_address or ""
        feed_link = self.feed_link or ""
        self.hash_address_and_link = self.generate_hash_address_and_link(feed_address, feed_link)

        max_feed_title = Feed._meta.get_field('feed_title').max_length
        if len(self.feed_title) > max_feed_title:
            self.feed_title = self.feed_title[:max_feed_title]
        max_feed_address = Feed._meta.get_field('feed_address').max_length
        if len(feed_address) > max_feed_address:
            self.feed_address = feed_address[:max_feed_address]
        max_feed_link = Feed._meta.get_field('feed_link').max_length
        if len(feed_link) > max_feed_link:
            self.feed_link = feed_link[:max_feed_link]

        try:
            super(Feed, self).save(*args, **kwargs)
        except IntegrityError as e:
            logging.debug(" ---> ~FRFeed save collision (%s), checking dupe hash..." % e)
            feed_address = self.feed_address or ""
            feed_link = self.feed_link or ""
            hash_address_and_link = self.generate_hash_address_and_link(feed_address, feed_link)
            logging.debug(" ---> ~FRNo dupes, checking hash collision: %s" % hash_address_and_link)
            duplicate_feeds = Feed.objects.filter(hash_address_and_link=hash_address_and_link)

            if not duplicate_feeds:
                duplicate_feeds = Feed.objects.filter(feed_address=self.feed_address,
                                                      feed_link=self.feed_link)
            if not duplicate_feeds:
                # Feed has been deleted. Just ignore it.
                logging.debug(" ***> Changed to: %s - %s: %s" % (self.feed_address, self.feed_link, duplicate_feeds))
                logging.debug(' ***> [%-30s] Feed deleted (%s).' % (self.log_title[:30], self.pk))
                return

            for duplicate_feed in duplicate_feeds:
                if duplicate_feed.pk != self.pk:
                    logging.debug(" ---> ~FRFound different feed (%s), merging %s in..." % (duplicate_feeds[0], self.pk))
                    feed = Feed.get_by_id(merge_feeds(duplicate_feeds[0].pk, self.pk))
                    return feed
            else:
                logging.debug(" ---> ~FRFeed is its own dupe? %s == %s" % (self, duplicate_feeds))
        except DatabaseError as e:
            logging.debug(" ---> ~FBFeed update failed, no change: %s / %s..." % (kwargs.get('update_fields', None), e))
            pass

        return self

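    # save() funnels every write through the same dedupe path: an IntegrityError on the
    # unique hash_address_and_link triggers a lookup of the colliding feed(s), and any
    # true duplicate is folded into this one via merge_feeds(). Callers therefore use
    # the return value rather than assuming the same instance survived, e.g.:
    #
    #   feed = feed.save()  # may return the merged/winning Feed, or None if deleted
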
    @classmethod
    def index_all_for_search(cls, offset=0, subscribers=2):
        if not offset:
            SearchFeed.create_elasticsearch_mapping(delete=True)

        last_pk = cls.objects.latest('pk').pk
        for f in range(offset, last_pk, 1000):
            print(" ---> {f} / {last_pk} ({pct}%)".format(f=f, last_pk=last_pk, pct=str(float(f)/last_pk*100)[:2]))
            feeds = Feed.objects.filter(pk__in=range(f, f+1000),
                                        active=True,
                                        active_subscribers__gte=subscribers)\
                                .values_list('pk')
            for feed_id, in feeds:
                Feed.objects.get(pk=feed_id).index_feed_for_search()

    def index_feed_for_search(self):
        min_subscribers = 1
        if settings.DEBUG:
            min_subscribers = 0

        if self.num_subscribers > min_subscribers and not self.branch_from_feed and not self.is_newsletter:
            SearchFeed.index(feed_id=self.pk,
                             title=self.feed_title,
                             address=self.feed_address,
                             link=self.feed_link,
                             num_subscribers=self.num_subscribers)

    def index_stories_for_search(self):
        if self.search_indexed: return

        stories = MStory.objects(story_feed_id=self.pk)
        for story in stories:
            story.index_story_for_search()

        self.search_indexed = True
        self.save()

    def sync_redis(self):
        return MStory.sync_feed_redis(self.pk)

    def expire_redis(self, r=None):
        if not r:
            r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        # if not r2:
        #     r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)

        r.expire('F:%s' % self.pk, settings.DAYS_OF_STORY_HASHES*24*60*60)
        # r2.expire('F:%s' % self.pk, settings.DAYS_OF_STORY_HASHES*24*60*60)
        r.expire('zF:%s' % self.pk, settings.DAYS_OF_STORY_HASHES*24*60*60)
        # r2.expire('zF:%s' % self.pk, settings.DAYS_OF_STORY_HASHES*24*60*60)

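    # Two Redis keys are refreshed per feed: "zF:<feed_id>", the sorted set of story
    # hashes queried by story_hashes_in_unread_cutoff above, and "F:<feed_id>", its
    # companion key of story hashes. Both share the same TTL of
    # settings.DAYS_OF_STORY_HASHES days, expressed in seconds.
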
    @classmethod
    def low_volume_feeds(cls, feed_ids, stories_per_month=30):
        try:
            stories_per_month = int(stories_per_month)
        except ValueError:
            stories_per_month = 30
        feeds = Feed.objects.filter(pk__in=feed_ids, average_stories_per_month__lte=stories_per_month).only('pk')

        return [f.pk for f in feeds]

    @classmethod
    def autocomplete(self, prefix, limit=5):
        results = SearchFeed.query(prefix)
        feed_ids = [result['_source']['feed_id'] for result in results[:5]]

        # results = SearchQuerySet().autocomplete(address=prefix).order_by('-num_subscribers')[:limit]
        #
        # if len(results) < limit:
        #     results += SearchQuerySet().autocomplete(title=prefix).order_by('-num_subscribers')[:limit-len(results)]
        #
        return feed_ids

    @classmethod
    def find_or_create(cls, feed_address, feed_link, defaults=None, **kwargs):
        feeds = cls.objects.filter(feed_address=feed_address, feed_link=feed_link)
        if feeds:
            return feeds[0], False

        if feed_link and feed_link.endswith('/'):
            feeds = cls.objects.filter(feed_address=feed_address, feed_link=feed_link[:-1])
            if feeds:
                return feeds[0], False

        try:
            feed = cls.objects.get(feed_address=feed_address, feed_link=feed_link)
            return feed, False
        except cls.DoesNotExist:
            feed = cls(**defaults)
            feed = feed.save()
            return feed, True

    @classmethod
    def merge_feeds(cls, *args, **kwargs):
        return merge_feeds(*args, **kwargs)

    def fix_google_alerts_urls(self):
        if (self.feed_address.startswith('http://user/') and
            '/state/com.google/alerts/' in self.feed_address):
            match = re.match(r"http://user/(\d+)/state/com.google/alerts/(\d+)", self.feed_address)
            if match:
                user_id, alert_id = match.groups()
                self.feed_address = "http://www.google.com/alerts/feeds/%s/%s" % (user_id, alert_id)

    @classmethod
    def schedule_feed_fetches_immediately(cls, feed_ids, user_id=None):
        if settings.DEBUG:
            logging.info(" ---> ~SN~FMSkipping the scheduling immediate fetch of ~SB%s~SN feeds (in DEBUG)..." %
                         len(feed_ids))
            return

        if user_id:
            user = User.objects.get(pk=user_id)
            logging.user(user, "~SN~FMScheduling immediate fetch of ~SB%s~SN feeds..." %
                         len(feed_ids))
        else:
            logging.debug(" ---> ~SN~FMScheduling immediate fetch of ~SB%s~SN feeds..." %
                          len(feed_ids))

        if len(feed_ids) > 100:
            logging.debug(" ---> ~SN~FMFeeds scheduled: %s" % feed_ids)

        day_ago = datetime.datetime.now() - datetime.timedelta(days=1)

        feeds = Feed.objects.filter(pk__in=feed_ids)
        for feed in feeds:
            if feed.active_subscribers <= 0:
                feed.count_subscribers()
            if not feed.active or feed.next_scheduled_update < day_ago:
                feed.schedule_feed_fetch_immediately(verbose=False)

    @property
    def favicon_fetching(self):
        return bool(not (self.favicon_not_found or self.favicon_color))

    @classmethod
    def get_feed_by_url(self, *args, **kwargs):
        return self.get_feed_from_url(*args, **kwargs)

    @classmethod
    def get_feed_from_url(cls, url, create=True, aggressive=False, fetch=True, offset=0, user=None, interactive=False):
        feed = None
        without_rss = False
        original_url = url

        if url and url.startswith('newsletter:'):
            try:
                return cls.objects.get(feed_address=url)
            except cls.MultipleObjectsReturned:
                return cls.objects.filter(feed_address=url)[0]
        if url and re.match(r'(https?://)?twitter.com/\w+/?', url):
            without_rss = True
        if url and re.match(r'(https?://)?(www\.)?facebook.com/\w+/?$', url):
            without_rss = True
        if url and 'youtube.com/user/' in url:
            username = re.search(r'youtube.com/user/(\w+)', url).group(1)
            url = "http://gdata.youtube.com/feeds/base/users/%s/uploads" % username
            without_rss = True
        if url and 'youtube.com/channel/' in url:
            channel_id = re.search(r'youtube.com/channel/([-_\w]+)', url).group(1)
            url = "https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id
            without_rss = True
        if url and 'youtube.com/feeds' in url:
            without_rss = True
        if url and 'youtube.com/playlist' in url:
            without_rss = True

        def criteria(key, value):
            if aggressive:
                return {'%s__icontains' % key: value}
            else:
                return {'%s' % key: value}

        def by_url(address):
            feed = cls.objects.filter(
                branch_from_feed=None
            ).filter(**criteria('feed_address', address)).order_by('-num_subscribers')
            if not feed:
                duplicate_feed = DuplicateFeed.objects.filter(**criteria('duplicate_address', address))
                if duplicate_feed and len(duplicate_feed) > offset:
                    feed = [duplicate_feed[offset].feed]
            if not feed and aggressive:
                feed = cls.objects.filter(
                    branch_from_feed=None
                ).filter(**criteria('feed_link', address)).order_by('-num_subscribers')

            return feed

        @timelimit(10)
        def _feedfinder_forman(url):
            found_feed_urls = feedfinder_forman.find_feeds(url)
            return found_feed_urls

        @timelimit(10)
        def _feedfinder_pilgrim(url):
            found_feed_urls = feedfinder_pilgrim.feeds(url)
            return found_feed_urls

        # Normalize and check for feed_address, dupes, and feed_link
        url = urlnorm.normalize(url)
        if not url:
            logging.debug(" ---> ~FRCouldn't normalize url: ~SB%s" % url)
            return

        feed = by_url(url)
        found_feed_urls = []

        if interactive:
            import pdb; pdb.set_trace()

        # Create if it looks good
        if feed and len(feed) > offset:
            feed = feed[offset]
        else:
            try:
                found_feed_urls = _feedfinder_forman(url)
            except TimeoutError:
                logging.debug('   ---> Feed finder timed out...')
                found_feed_urls = []
            if not found_feed_urls:
                try:
                    found_feed_urls = _feedfinder_pilgrim(url)
                except TimeoutError:
                    logging.debug('   ---> Feed finder old timed out...')
                    found_feed_urls = []

            if len(found_feed_urls):
                feed_finder_url = found_feed_urls[0]
                logging.debug(" ---> Found feed URLs for %s: %s" % (url, found_feed_urls))
                feed = by_url(feed_finder_url)
                if feed and len(feed) > offset:
                    feed = feed[offset]
                    logging.debug(" ---> Feed exists (%s), updating..." % (feed))
                    feed = feed.update()
                elif create:
                    logging.debug(" ---> Feed doesn't exist, creating: %s" % (feed_finder_url))
                    feed = cls.objects.create(feed_address=feed_finder_url)
                    feed = feed.update()
            elif without_rss:
                logging.debug(" ---> Found without_rss feed: %s / %s" % (url, original_url))
                feed = cls.objects.create(feed_address=url, feed_link=original_url)
                feed = feed.update(requesting_user_id=user.pk if user else None)

        # Check for JSON feed
        if not feed and fetch and create:
            try:
                r = requests.get(url)
            except (requests.ConnectionError, requests.models.InvalidURL):
                r = None
            if r and 'application/json' in r.headers.get('Content-Type', ''):
                feed = cls.objects.create(feed_address=url)
                feed = feed.update()

        # Still nothing? Maybe the URL has some clues.
        if not feed and fetch and len(found_feed_urls):
            feed_finder_url = found_feed_urls[0]
            feed = by_url(feed_finder_url)
            if not feed and create:
                feed = cls.objects.create(feed_address=feed_finder_url)
                feed = feed.update()
            elif feed and len(feed) > offset:
                feed = feed[offset]

        # Not created and not within bounds, so toss results.
        if isinstance(feed, QuerySet):
            logging.debug(" ---> ~FRNot created and not within bounds, tossing: ~SB%s" % feed)
            return

        return feed

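    # Resolution order in get_feed_from_url(), roughly: special-case newsletter:
    # addresses, flag known non-RSS sites (Twitter, Facebook, YouTube) so they can be
    # created without a discovered feed, normalize the URL, look it up by address (and
    # DuplicateFeed history), then fall back to the two feed finders, a JSON feed
    # probe, and finally the first discovered URL. A typical call looks like:
    #
    #   feed = Feed.get_feed_from_url("https://example.com/blog", create=True)
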
    @classmethod
    def task_feeds(cls, feeds, queue_size=12, verbose=True):
        if not feeds: return
        r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)

        if isinstance(feeds, Feed):
            if verbose:
                logging.debug(" ---> ~SN~FBTasking feed: ~SB%s" % feeds)
            feeds = [feeds.pk]
        elif verbose:
            logging.debug(" ---> ~SN~FBTasking ~SB~FC%s~FB~SN feeds..." % len(feeds))

        if isinstance(feeds, QuerySet):
            feeds = [f.pk for f in feeds]

        r.srem('queued_feeds', *feeds)
        now = datetime.datetime.now().strftime("%s")
        p = r.pipeline()
        for feed_id in feeds:
            p.zadd('tasked_feeds', {feed_id: now})
        p.execute()

        # for feed_ids in (feeds[pos:pos + queue_size] for pos in xrange(0, len(feeds), queue_size)):
        for feed_id in feeds:
            UpdateFeeds.apply_async(args=(feed_id,), queue='update_feeds')

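    # Tasking moves feed ids out of the "queued_feeds" set and records them in the
    # "tasked_feeds" sorted set (scored by the current epoch time) before handing each
    # id to the UpdateFeeds task on the "update_feeds" queue. drain_task_feeds() below
    # is the inverse: it dumps tasked and errored ids back into "queued_feeds".
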
    @classmethod
    def drain_task_feeds(cls):
        r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)

        tasked_feeds = r.zrange('tasked_feeds', 0, -1)
        if tasked_feeds:
            logging.debug(" ---> ~FRDraining %s tasked feeds..." % len(tasked_feeds))
            r.sadd('queued_feeds', *tasked_feeds)
            r.zremrangebyrank('tasked_feeds', 0, -1)
        else:
            logging.debug(" ---> No tasked feeds to drain")

        errored_feeds = r.zrange('error_feeds', 0, -1)
        if errored_feeds:
            logging.debug(" ---> ~FRDraining %s errored feeds..." % len(errored_feeds))
            r.sadd('queued_feeds', *errored_feeds)
            r.zremrangebyrank('error_feeds', 0, -1)
        else:
            logging.debug(" ---> No errored feeds to drain")

    def update_all_statistics(self, has_new_stories=False, force=False):
        recount = not self.counts_converted_to_redis
        count_extra = False
        if random.random() < 0.01 or not self.data.popular_tags or not self.data.popular_authors:
            count_extra = True

        self.count_subscribers(recount=recount)
        self.calculate_last_story_date()

        if force or has_new_stories or count_extra:
            self.save_feed_stories_last_month()

        if force or (has_new_stories and count_extra):
            self.save_popular_authors()
            self.save_popular_tags()
            self.save_feed_story_history_statistics()

    def calculate_last_story_date(self):
        last_story_date = None

        try:
            latest_story = MStory.objects(
                story_feed_id=self.pk
            ).limit(1).order_by('-story_date').only('story_date').first()
            if latest_story:
                last_story_date = latest_story.story_date
        except MStory.DoesNotExist:
            pass

        if not last_story_date or seconds_timesince(last_story_date) < 0:
            last_story_date = datetime.datetime.now()

        if last_story_date != self.last_story_date:
            self.last_story_date = last_story_date
            self.save(update_fields=['last_story_date'])

    @classmethod
    def setup_feeds_for_premium_subscribers(cls, feed_ids):
        logging.info(" ---> ~SN~FMScheduling immediate premium setup of ~SB%s~SN feeds..." %
                     len(feed_ids))

        feeds = Feed.objects.filter(pk__in=feed_ids)
        for feed in feeds:
            feed.setup_feed_for_premium_subscribers()

    def setup_feed_for_premium_subscribers(self):
        self.count_subscribers()
        self.set_next_scheduled_update()

    def check_feed_link_for_feed_address(self):
        @timelimit(10)
        def _1():
            feed_address = None
            feed = self
            found_feed_urls = []
            try:
                logging.debug(" ---> Checking: %s" % self.feed_address)
                found_feed_urls = feedfinder_forman.find_feeds(self.feed_address)
                if found_feed_urls:
                    feed_address = found_feed_urls[0]
            except KeyError:
                pass
            if not len(found_feed_urls) and self.feed_link:
                found_feed_urls = feedfinder_forman.find_feeds(self.feed_link)
                if len(found_feed_urls) and found_feed_urls[0] != self.feed_address:
                    feed_address = found_feed_urls[0]

            if feed_address:
                if any(ignored_domain in feed_address for ignored_domain in [
                        'feedburner.com/atom.xml',
                        'feedburner.com/feed/',
                        'feedsportal.com',
                    ]):
                    logging.debug("  ---> Feed points to 'Wierdo' or 'feedsportal', ignoring.")
                    return False, self
                try:
                    self.feed_address = feed_address
                    feed = self.save()
                    feed.count_subscribers()
                    # feed.schedule_feed_fetch_immediately() # Don't fetch as it can get stuck in a loop
                    feed.has_feed_exception = False
                    feed.active = True
                    feed = feed.save()
                except IntegrityError:
                    original_feed = Feed.objects.get(feed_address=feed_address, feed_link=self.feed_link)
                    original_feed.has_feed_exception = False
                    original_feed.active = True
                    original_feed.save()
                    merge_feeds(original_feed.pk, self.pk)
            return feed_address, feed

        if self.feed_address_locked:
            return False, self

        try:
            feed_address, feed = _1()
        except TimeoutError as e:
            logging.debug('   ---> [%-30s] Feed address check timed out...' % (self.log_title[:30]))
            self.save_feed_history(505, 'Timeout', e)
            feed = self
            feed_address = None

        return bool(feed_address), feed

    def save_feed_history(self, status_code, message, exception=None, date=None):
        fetch_history = MFetchHistory.add(feed_id=self.pk,
                                          fetch_type='feed',
                                          code=int(status_code),
                                          date=date,
                                          message=message,
                                          exception=exception)

        if status_code not in (200, 304):
            self.errors_since_good += 1
            self.count_errors_in_history('feed', status_code, fetch_history=fetch_history)
            self.set_next_scheduled_update()
        elif self.has_feed_exception or self.errors_since_good:
            self.errors_since_good = 0
            self.has_feed_exception = False
            self.active = True
            self.save()

    def save_page_history(self, status_code, message, exception=None, date=None):
        fetch_history = MFetchHistory.add(feed_id=self.pk,
                                          fetch_type='page',
                                          code=int(status_code),
                                          date=date,
                                          message=message,
                                          exception=exception)

        if status_code not in (200, 304):
            self.count_errors_in_history('page', status_code, fetch_history=fetch_history)
        elif self.has_page_exception or not self.has_page:
            self.has_page_exception = False
            self.has_page = True
            self.active = True
            self.save()

    def save_raw_feed(self, raw_feed, fetch_date):
        MFetchHistory.add(feed_id=self.pk,
                          fetch_type='raw_feed',
                          code=200,
                          message=raw_feed,
                          date=fetch_date)

    def count_errors_in_history(self, exception_type='feed', status_code=None, fetch_history=None):
        if not fetch_history:
            fetch_history = MFetchHistory.feed(self.pk)
        fh = fetch_history[exception_type + '_fetch_history']
        non_errors = [h for h in fh if h['status_code'] and int(h['status_code']) in (200, 304)]
        errors = [h for h in fh if h['status_code'] and int(h['status_code']) not in (200, 304)]

        if len(non_errors) == 0 and len(errors) > 1:
            self.active = True
            if exception_type == 'feed':
                self.has_feed_exception = True
                # self.active = False # No longer, just geometrically fetch
            elif exception_type == 'page':
                self.has_page_exception = True
            self.exception_code = status_code or int(errors[0]['status_code'])
            self.save()
        elif self.exception_code > 0:
            self.active = True
            self.exception_code = 0
            if exception_type == 'feed':
                self.has_feed_exception = False
            elif exception_type == 'page':
                self.has_page_exception = False
            self.save()

        logging.debug('   ---> [%-30s] ~FBCounting any errors in history: %s (%s non errors)' %
                      (self.log_title[:30], len(errors), len(non_errors)))

        return errors, non_errors

    def count_redirects_in_history(self, fetch_type='feed', fetch_history=None):
        logging.debug('   ---> [%-30s] Counting redirects in history...' % (self.log_title[:30]))
        if not fetch_history:
            fetch_history = MFetchHistory.feed(self.pk)
        fh = fetch_history[fetch_type + '_fetch_history']
        redirects = [h for h in fh if h['status_code'] and int(h['status_code']) in (301, 302)]
        non_redirects = [h for h in fh if h['status_code'] and int(h['status_code']) not in (301, 302)]

        return redirects, non_redirects

    @property
    def original_feed_id(self):
        if self.branch_from_feed:
            return self.branch_from_feed.pk
        else:
            return self.pk

    @property
    def counts_converted_to_redis(self):
        SUBSCRIBER_EXPIRE_DATE = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
        subscriber_expire = int(SUBSCRIBER_EXPIRE_DATE.strftime('%s'))
        r = redis.Redis(connection_pool=settings.REDIS_FEED_SUB_POOL)
        total_key = "s:%s" % self.original_feed_id
        premium_key = "sp:%s" % self.original_feed_id
        last_recount = r.zscore(total_key, -1)    # Need to subtract this extra when counting subs
        last_recount = r.zscore(premium_key, -1)  # Need to subtract this extra when counting subs

        # Check for expired feeds with no active users who would have triggered a cleanup
        if last_recount and last_recount > subscriber_expire:
            return True
        elif last_recount:
            logging.info("   ---> [%-30s] ~SN~FBFeed has expired redis subscriber counts (%s < %s), clearing..." % (
                         self.log_title[:30], last_recount, subscriber_expire))
            r.delete(total_key, -1)
            r.delete(premium_key, -1)

        return False

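    # Subscriber counts live in the Redis sorted sets "s:<feed_id>" (all subscribers)
    # and "sp:<feed_id>" (premium), where the member -1 is a sentinel whose score is
    # the time of the last full recount. That is why count_subscribers() below
    # subtracts one from every zcard/zcount result, and why this property treats a
    # stale sentinel (older than SUBSCRIBER_EXPIRE days) as "not converted".
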
    def count_subscribers(self, recount=True, verbose=False):
        if recount or not self.counts_converted_to_redis:
            from apps.profile.models import Profile
            Profile.count_feed_subscribers(feed_id=self.pk)
        SUBSCRIBER_EXPIRE_DATE = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
        subscriber_expire = int(SUBSCRIBER_EXPIRE_DATE.strftime('%s'))
        now = int(datetime.datetime.now().strftime('%s'))
        r = redis.Redis(connection_pool=settings.REDIS_FEED_SUB_POOL)
        total = 0
        active = 0
        premium = 0
        pro = 0
        active_premium = 0

        # Include all branched feeds in counts
        feed_ids = [f['id'] for f in Feed.objects.filter(branch_from_feed=self.original_feed_id).values('id')]
        feed_ids.append(self.original_feed_id)
        feed_ids = list(set(feed_ids))

        if self.counts_converted_to_redis:
            # For each branched feed, count different subscribers
            for feed_id in feed_ids:
                pipeline = r.pipeline()

                # now+1 ensures `-1` flag will be corrected for later with - 1
                total_key = "s:%s" % feed_id
                premium_key = "sp:%s" % feed_id
                pro_key = "spro:%s" % feed_id
                pipeline.zcard(total_key)
                pipeline.zcount(total_key, subscriber_expire, now+1)
                pipeline.zcard(premium_key)
                pipeline.zcount(premium_key, subscriber_expire, now+1)
                pipeline.zcard(pro_key)

                results = pipeline.execute()

                # -1 due to counts_converted_to_redis using key=-1 for last_recount date
                total += max(0, results[0] - 1)
                active += max(0, results[1] - 1)
                premium += max(0, results[2] - 1)
                active_premium += max(0, results[3] - 1)
                pro += max(0, results[4] - 1)

            original_num_subscribers = self.num_subscribers
            original_active_subs = self.active_subscribers
            original_premium_subscribers = self.premium_subscribers
            original_active_premium_subscribers = self.active_premium_subscribers
            original_pro_subscribers = self.pro_subscribers
            logging.info("   ---> [%-30s] ~SN~FBCounting subscribers from ~FCredis~FB: ~FMt:~SB~FM%s~SN a:~SB%s~SN p:~SB%s~SN ap:~SB%s pro:~SB%s ~SN~FC%s" %
                         (self.log_title[:30], total, active, premium, active_premium, pro,
                          "(%s branches)" % (len(feed_ids)-1) if len(feed_ids) > 1 else ""))
        else:
            from apps.reader.models import UserSubscription

            subs = UserSubscription.objects.filter(feed__in=feed_ids)
            original_num_subscribers = self.num_subscribers
            total = subs.count()

            active_subs = UserSubscription.objects.filter(
                feed__in=feed_ids,
                active=True,
                user__profile__last_seen_on__gte=SUBSCRIBER_EXPIRE_DATE
            )
            original_active_subs = self.active_subscribers
            active = active_subs.count()

            premium_subs = UserSubscription.objects.filter(
                feed__in=feed_ids,
                active=True,
                user__profile__is_premium=True
            )
            original_premium_subscribers = self.premium_subscribers
            premium = premium_subs.count()

            pro_subs = UserSubscription.objects.filter(
                feed__in=feed_ids,
                active=True,
                user__profile__is_pro=True
            )
            original_pro_subscribers = self.pro_subscribers
            pro = pro_subs.count()

            active_premium_subscribers = UserSubscription.objects.filter(
                feed__in=feed_ids,
                active=True,
                user__profile__is_premium=True,
                user__profile__last_seen_on__gte=SUBSCRIBER_EXPIRE_DATE
            )
            original_active_premium_subscribers = self.active_premium_subscribers
            active_premium = active_premium_subscribers.count()

            logging.debug("   ---> [%-30s] ~SN~FBCounting subscribers from ~FYpostgres~FB: ~FMt:~SB~FM%s~SN a:~SB%s~SN p:~SB%s~SN ap:~SB%s~SN pro:~SB%s" %
                          (self.log_title[:30], total, active, premium, active_premium, pro))

        if settings.DOCKERBUILD:
            # Local installs enjoy 100% active feeds
            active = total

        # If any counts have changed, save them
        self.num_subscribers = total
        self.active_subscribers = active
        self.premium_subscribers = premium
        self.active_premium_subscribers = active_premium
        self.pro_subscribers = pro
        if (self.num_subscribers != original_num_subscribers or
            self.active_subscribers != original_active_subs or
            self.premium_subscribers != original_premium_subscribers or
            self.active_premium_subscribers != original_active_premium_subscribers or
            self.pro_subscribers != original_pro_subscribers):
            if original_premium_subscribers == -1 or original_active_premium_subscribers == -1:
                self.save()
            else:
                self.save(update_fields=['num_subscribers', 'active_subscribers',
                                         'premium_subscribers', 'active_premium_subscribers',
                                         'pro_subscribers'])

        if verbose:
            if self.num_subscribers <= 1:
                print(".", end=" ")
            else:
                print("\n  %s> %s subscriber%s: %s" % (
                    '-' * min(self.num_subscribers, 20),
                    self.num_subscribers,
                    '' if self.num_subscribers == 1 else 's',
                    self.feed_title,
                ), end=' ')

    def _split_favicon_color(self, color=None):
        if not color:
            color = self.favicon_color
        if not color:
            return None, None, None
        splitter = lambda s, p: [s[i:i+p] for i in range(0, len(s), p)]
        red, green, blue = splitter(color[:6], 2)
        return red, green, blue

    def favicon_fade(self):
        return self.adjust_color(adjust=30)

    def adjust_color(self, color=None, adjust=0):
        red, green, blue = self._split_favicon_color(color=color)
        if red and green and blue:
            fade_red = hex(min(int(red, 16) + adjust, 255))[2:].zfill(2)
            fade_green = hex(min(int(green, 16) + adjust, 255))[2:].zfill(2)
            fade_blue = hex(min(int(blue, 16) + adjust, 255))[2:].zfill(2)
            return "%s%s%s" % (fade_red, fade_green, fade_blue)

    def favicon_border(self):
        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            fade_red = hex(min(int(int(red, 16) * .75), 255))[2:].zfill(2)
            fade_green = hex(min(int(int(green, 16) * .75), 255))[2:].zfill(2)
            fade_blue = hex(min(int(int(blue, 16) * .75), 255))[2:].zfill(2)
            return "%s%s%s" % (fade_red, fade_green, fade_blue)

    def favicon_text_color(self):
        # Color format: {r: 1, g: .5, b: 0}
        def contrast(color1, color2):
            lum1 = luminosity(color1)
            lum2 = luminosity(color2)
            if lum1 > lum2:
                return (lum1 + 0.05) / (lum2 + 0.05)
            else:
                return (lum2 + 0.05) / (lum1 + 0.05)

        def luminosity(color):
            r = color['red']
            g = color['green']
            b = color['blue']
            val = lambda c: c/12.92 if c <= 0.02928 else math.pow(((c + 0.055)/1.055), 2.4)
            red = val(r)
            green = val(g)
            blue = val(b)
            return 0.2126 * red + 0.7152 * green + 0.0722 * blue

        red, green, blue = self._split_favicon_color()
        if red and green and blue:
            color = {
                'red': int(red, 16) / 256.0,
                'green': int(green, 16) / 256.0,
                'blue': int(blue, 16) / 256.0,
            }
            white = {
                'red': 1,
                'green': 1,
                'blue': 1,
            }
            grey = {
                'red': 0.5,
                'green': 0.5,
                'blue': 0.5,
            }

            if contrast(color, white) > contrast(color, grey):
                return 'white'
            else:
                return 'black'

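    # luminosity() and contrast() follow the WCAG relative-luminance definition: each
    # sRGB channel is linearized (c/12.92 below the low-light threshold,
    # ((c+0.055)/1.055)^2.4 otherwise) and weighted 0.2126/0.7152/0.0722, and the
    # contrast ratio is (L_lighter + 0.05) / (L_darker + 0.05). The favicon text is
    # drawn white when the icon color contrasts more against white than against
    # mid-grey, black otherwise.
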
    def save_feed_stories_last_month(self, verbose=False):
        month_ago = datetime.datetime.utcnow() - datetime.timedelta(days=30)
        stories_last_month = MStory.objects(story_feed_id=self.pk,
                                            story_date__gte=month_ago).count()
        if self.stories_last_month != stories_last_month:
            self.stories_last_month = stories_last_month
            self.save(update_fields=['stories_last_month'])

        if verbose:
            print(f"  ---> {self.feed_title} [{self.pk}]: {self.stories_last_month} stories last month")

    def save_feed_story_history_statistics(self, current_counts=None):
        """
        Fills in missing months between earlier occurrences and now.

        Save format: [('YYYY-MM', #), ...]
        Example output: [(2010-12, 123), (2011-01, 146)]
        """
        now = datetime.datetime.utcnow()
        min_year = now.year
        total = 0
        month_count = 0
        if not current_counts:
            current_counts = self.data.story_count_history and json.decode(self.data.story_count_history)

        if isinstance(current_counts, dict):
            current_counts = current_counts['months']

        if not current_counts:
            current_counts = []

        # Count stories, aggregate by year and month. Map Reduce!
        map_f = """
            function() {
                var date = (this.story_date.getFullYear()) + "-" + (this.story_date.getMonth()+1);
                var hour = this.story_date.getUTCHours();
                var day = this.story_date.getDay();
                emit(this.story_hash, {'month': date, 'hour': hour, 'day': day});
            }
        """
        reduce_f = """
            function(key, values) {
                return values;
            }
        """
        dates = defaultdict(int)
        hours = defaultdict(int)
        days = defaultdict(int)
        results = MStory.objects(story_feed_id=self.pk).map_reduce(map_f, reduce_f, output='inline')
        for result in results:
            dates[result.value['month']] += 1
            hours[int(result.value['hour'])] += 1
            days[int(result.value['day'])] += 1
            year = int(re.findall(r"(\d{4})-\d{1,2}", result.value['month'])[0])
            if year < min_year and year > 2000:
                min_year = year

        # Add on to existing months, always amending up, never down. (Current month
        # is guaranteed to be accurate, since trim_feeds won't delete it until after
        # a month. Hacker News can have 1,000+ and still be counted.)
        for current_month, current_count in current_counts:
            year = int(re.findall(r"(\d{4})-\d{1,2}", current_month)[0])
            if current_month not in dates or dates[current_month] < current_count:
                dates[current_month] = current_count
            if year < min_year and year > 2000:
                min_year = year

        # Assemble a list with 0's filled in for missing months,
        # trimming left and right 0's.
        months = []
        start = False
        for year in range(min_year, now.year+1):
            for month in range(1, 12+1):
                if datetime.datetime(year, month, 1) < now:
                    key = '%s-%s' % (year, month)
                    if dates.get(key) or start:
                        start = True
                        months.append((key, dates.get(key, 0)))
                        total += dates.get(key, 0)
                        if dates.get(key, 0) > 0:
                            month_count += 1  # Only count months that have stories for the average

        original_story_count_history = self.data.story_count_history
        self.data.story_count_history = json.encode({'months': months, 'hours': hours, 'days': days})
        if self.data.story_count_history != original_story_count_history:
            self.data.save(update_fields=['story_count_history'])

        original_average_stories_per_month = self.average_stories_per_month
        if not total or not month_count:
            self.average_stories_per_month = 0
        else:
            self.average_stories_per_month = int(round(total / float(month_count)))

        if self.average_stories_per_month != original_average_stories_per_month:
            self.save(update_fields=['average_stories_per_month'])

2011-04-09 11:06:36 -04:00
def save_classifier_counts ( self ) :
2011-04-07 17:00:28 -04:00
from apps . analyzer . models import MClassifierTitle , MClassifierAuthor , MClassifierFeed , MClassifierTag
def calculate_scores ( cls , facet ) :
map_f = """
function ( ) {
emit ( this [ " %s " ] , {
pos : this . score > 0 ? this . score : 0 ,
neg : this . score < 0 ? Math . abs ( this . score ) : 0
} ) ;
}
""" % (facet)
reduce_f = """
function ( key , values ) {
var result = { pos : 0 , neg : 0 } ;
values . forEach ( function ( value ) {
result . pos + = value . pos ;
result . neg + = value . neg ;
} ) ;
return result ;
}
"""
2011-04-09 14:37:07 -04:00
scores = [ ]
2011-08-27 17:14:31 -07:00
res = cls . objects ( feed_id = self . pk ) . map_reduce ( map_f , reduce_f , output = ' inline ' )
2011-04-07 17:00:28 -04:00
for r in res :
2020-06-15 02:54:37 -04:00
facet_values = dict ( [ ( k , int ( v ) ) for k , v in r . value . items ( ) ] )
2011-04-09 14:37:07 -04:00
facet_values [ facet ] = r . key
2017-01-05 16:44:10 -08:00
if facet_values [ ' pos ' ] + facet_values [ ' neg ' ] > = 1 :
2012-12-12 16:36:36 -08:00
scores . append ( facet_values )
2011-04-09 14:37:07 -04:00
scores = sorted ( scores , key = lambda v : v [ ' neg ' ] - v [ ' pos ' ] )
2011-04-07 17:00:28 -04:00
return scores
scores = { }
for cls , facet in [ ( MClassifierTitle , ' title ' ) ,
( MClassifierAuthor , ' author ' ) ,
( MClassifierTag , ' tag ' ) ,
( MClassifierFeed , ' feed_id ' ) ] :
scores [ facet ] = calculate_scores ( cls , facet )
2011-04-09 14:37:07 -04:00
if facet == ' feed_id ' and scores [ facet ] :
scores [ ' feed ' ] = scores [ facet ]
2011-04-07 17:00:28 -04:00
del scores [ ' feed_id ' ]
elif not scores [ facet ] :
del scores [ facet ]
if scores :
self . data . feed_classifier_counts = json . encode ( scores )
self . data . save ( )
2017-01-05 16:44:10 -08:00
return scores
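# The saved feed_classifier_counts JSON ends up shaped roughly like
# {'title': [{'title': ..., 'pos': 2, 'neg': 0}, ...], 'author': [...],
#  'tag': [...], 'feed': [{'feed_id': ..., 'pos': ..., 'neg': ...}]},
# with each facet list sorted by (neg - pos) so the most positively trained
# values come first.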
2017-02-17 12:05:27 -08:00
@property
def user_agent ( self ) :
2020-06-15 02:54:37 -04:00
feed_parts = urllib . parse . urlparse ( self . feed_address )
2018-10-26 09:12:00 +00:00
if feed_parts . netloc . find ( ' .tumblr.com ' ) != - 1 :
# Certain tumblr feeds will redirect to tumblr's login page when fetching.
# A known workaround is using facebook's user agent.
return ' facebookexternalhit/1.0 (+http://www.facebook.com/externalhit_uatext.php) '
2020-11-30 15:48:59 -05:00
ua = ( ' NewsBlur Feed Fetcher - %s subscriber %s - %s %s ' % (
2017-02-17 12:05:27 -08:00
self . num_subscribers ,
' s ' if self . num_subscribers != 1 else ' ' ,
self . permalink ,
2020-11-30 15:48:59 -05:00
self . fake_user_agent ,
2017-02-17 12:05:27 -08:00
) )
return ua
@property
2017-02-17 12:19:55 -08:00
def fake_user_agent ( self ) :
2020-11-30 15:48:59 -05:00
ua = ( ' ( " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
' AppleWebKit/605.1.15 (KHTML, like Gecko) '
' Version/14.0.1 Safari/605.1.15 " ) ' )
2017-02-17 12:19:55 -08:00
return ua
def fetch_headers ( self , fake = False ) :
2017-02-17 12:05:27 -08:00
headers = {
2017-02-17 12:19:55 -08:00
' User-Agent ' : self . user_agent if not fake else self . fake_user_agent ,
2017-02-17 12:05:27 -08:00
' Accept ' : ' application/atom+xml, application/rss+xml, application/xml;q=0.8, text/xml;q=0.6, */*;q=0.2 ' ,
' Accept-Encoding ' : ' gzip, deflate ' ,
}
return headers
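# Sketch of how a fetcher might consume these headers (illustrative only; the
# real fetch path lives in utils.feed_fetcher):
#
#     headers = feed.fetch_headers()
#     response = requests.get(feed.feed_address, headers=headers, timeout=10)
#
# Passing fake=True swaps in the Safari string from fake_user_agent for sites
# that block the NewsBlur fetcher's user agent.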
2017-01-05 16:44:10 -08:00
2012-03-26 17:04:35 -07:00
def update ( self , * * kwargs ) :
2019-10-13 09:31:45 -04:00
try :
from utils import feed_fetcher
2020-06-15 02:54:37 -04:00
except ImportError as e :
2019-10-13 09:31:45 -04:00
logging . info ( " ***> ~BR~FRImportError: %s " % e )
2019-12-21 09:49:41 -05:00
return
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2013-04-04 20:51:56 -07:00
original_feed_id = int ( self . pk )
2015-07-22 13:53:20 -07:00
2012-03-26 17:04:35 -07:00
options = {
' verbose ' : kwargs . get ( ' verbose ' ) ,
2010-04-09 16:37:19 -04:00
' timeout ' : 10 ,
2012-03-26 17:04:35 -07:00
' single_threaded ' : kwargs . get ( ' single_threaded ' , True ) ,
' force ' : kwargs . get ( ' force ' ) ,
2017-02-17 11:45:43 -08:00
' force_fp ' : kwargs . get ( ' force_fp ' ) ,
2012-03-26 17:04:35 -07:00
' compute_scores ' : kwargs . get ( ' compute_scores ' , True ) ,
2012-08-17 00:10:17 -07:00
' mongodb_replication_lag ' : kwargs . get ( ' mongodb_replication_lag ' , None ) ,
2012-03-26 17:04:35 -07:00
' fake ' : kwargs . get ( ' fake ' ) ,
' quick ' : kwargs . get ( ' quick ' ) ,
2014-05-29 17:59:00 -07:00
' updates_off ' : kwargs . get ( ' updates_off ' ) ,
2012-03-26 17:04:35 -07:00
' debug ' : kwargs . get ( ' debug ' ) ,
2012-03-27 18:37:04 -07:00
' fpf ' : kwargs . get ( ' fpf ' ) ,
2012-03-28 15:49:21 -07:00
' feed_xml ' : kwargs . get ( ' feed_xml ' ) ,
2016-05-26 14:30:26 -07:00
' requesting_user_id ' : kwargs . get ( ' requesting_user_id ' , None )
2012-03-26 17:04:35 -07:00
}
2017-04-30 18:47:10 -07:00
2020-06-29 17:39:55 -04:00
if getattr ( settings , ' TEST_DEBUG ' , False ) and " NEWSBLUR_DIR " in self . feed_address :
2020-06-15 02:54:37 -04:00
print ( " ---> Testing feed fetch: %s " % self . log_title )
2017-05-22 16:46:56 -07:00
# options['force_fp'] = True # No, why would this be needed?
2017-04-30 18:47:10 -07:00
original_feed_address = self . feed_address
original_feed_link = self . feed_link
self . feed_address = self . feed_address . replace ( " %(NEWSBLUR_DIR)s " , settings . NEWSBLUR_DIR )
if self . feed_link :
self . feed_link = self . feed_link . replace ( " %(NEWSBLUR_DIR)s " , settings . NEWSBLUR_DIR )
if self . feed_address != original_feed_address or self . feed_link != original_feed_link :
self . save ( update_fields = [ ' feed_address ' , ' feed_link ' ] )
2016-02-09 17:14:59 -08:00
if self . is_newsletter :
feed = self . update_newsletter_icon ( )
else :
disp = feed_fetcher . Dispatcher ( options , 1 )
disp . add_jobs ( [ [ self . pk ] ] )
feed = disp . run_jobs ( )
2011-02-24 15:48:00 -05:00
2013-04-03 21:11:40 -07:00
if feed :
feed = Feed . get_by_id ( feed . pk )
2013-04-03 21:10:37 -07:00
if feed :
feed . last_update = datetime . datetime . utcnow ( )
feed . set_next_scheduled_update ( )
2020-06-20 00:08:18 -04:00
r . zadd ( ' fetched_feeds_last_hour ' , { feed . pk : int ( datetime . datetime . now ( ) . strftime ( ' %s ' ) ) } )
2013-04-03 21:11:40 -07:00
2013-04-04 20:47:41 -07:00
if not feed or original_feed_id != feed . pk :
logging . info ( " ---> ~FRFeed changed id, removing %s from tasked_feeds queue... " % original_feed_id )
r . zrem ( ' tasked_feeds ' , original_feed_id )
2013-04-08 10:50:50 -07:00
r . zrem ( ' error_feeds ' , original_feed_id )
2013-04-04 20:47:41 -07:00
if feed :
r . zrem ( ' tasked_feeds ' , feed . pk )
2013-04-08 10:50:50 -07:00
r . zrem ( ' error_feeds ' , feed . pk )
2012-03-21 16:05:52 -07:00
2011-05-19 19:04:10 -04:00
return feed
2016-02-09 16:59:58 -08:00
def update_newsletter_icon ( self ) :
from apps . rss_feeds . icon_importer import IconImporter
icon_importer = IconImporter ( self )
icon_importer . save ( )
return self
2012-07-17 14:18:26 -07:00
@classmethod
2012-07-18 18:34:19 -07:00
def get_by_id ( cls , feed_id , feed_address = None ) :
2012-07-17 14:18:26 -07:00
try :
feed = Feed . objects . get ( pk = feed_id )
2012-07-18 18:34:19 -07:00
return feed
2012-07-17 14:18:26 -07:00
except Feed . DoesNotExist :
# Feed has been merged after updating. Find the right feed.
duplicate_feeds = DuplicateFeed . objects . filter ( duplicate_feed_id = feed_id )
if duplicate_feeds :
2012-07-18 18:34:19 -07:00
return duplicate_feeds [ 0 ] . feed
if feed_address :
duplicate_feeds = DuplicateFeed . objects . filter ( duplicate_address = feed_address )
if duplicate_feeds :
return duplicate_feeds [ 0 ] . feed
2013-01-04 16:34:27 -08:00
@classmethod
def get_by_name ( cls , query , limit = 1 ) :
results = SearchFeed . query ( query )
feed_ids = [ result . feed_id for result in results ]
if limit == 1 :
return Feed . get_by_id ( feed_ids [ 0 ] )
else :
return [ Feed . get_by_id ( f ) for f in feed_ids ] [ : limit ]
2014-05-29 17:53:16 -07:00
def add_update_stories ( self , stories , existing_stories , verbose = False , updates_off = False ) :
2012-09-04 11:46:41 -07:00
ret_values = dict ( new = 0 , updated = 0 , same = 0 , error = 0 )
2013-04-08 16:14:33 -07:00
error_count = self . error_count
2014-04-17 12:10:04 -07:00
new_story_hashes = [ s . get ( ' story_hash ' ) for s in stories ]
2013-04-08 16:14:33 -07:00
2013-04-03 18:11:23 -07:00
if settings . DEBUG or verbose :
2012-12-24 00:39:52 -08:00
logging . debug ( " ---> [ %-30s ] ~FBChecking ~SB %s ~SN new/updated against ~SB %s ~SN stories " % (
2017-03-31 19:52:24 -07:00
self . log_title [ : 30 ] ,
2012-12-24 00:10:40 -08:00
len ( stories ) ,
2020-06-15 02:54:37 -04:00
len ( list ( existing_stories . keys ( ) ) ) ) )
2021-03-25 12:23:19 -04:00
@timelimit ( 5 )
2014-04-17 12:10:04 -07:00
def _1 ( story , story_content , existing_stories , new_story_hashes ) :
2014-03-13 16:02:27 -07:00
existing_story , story_has_changed = self . _exists_story ( story , story_content ,
2014-04-17 12:10:04 -07:00
existing_stories , new_story_hashes )
2013-08-06 13:54:06 -07:00
return existing_story , story_has_changed
2012-12-24 00:10:40 -08:00
2009-06-16 03:08:55 +00:00
for story in stories :
2014-09-05 10:38:29 -07:00
if verbose :
logging . debug ( " ---> [ %-30s ] ~FBChecking ~SB %s ~SN / ~SB %s " % (
2017-03-31 19:52:24 -07:00
self . log_title [ : 30 ] ,
2014-09-05 10:38:29 -07:00
story . get ( ' title ' ) ,
story . get ( ' guid ' ) ) )
2012-03-06 16:11:27 -08:00
story_content = story . get ( ' story_content ' )
2013-04-08 16:14:33 -07:00
if error_count :
story_content = strip_comments__lxml ( story_content )
else :
story_content = strip_comments ( story_content )
2012-03-06 16:11:27 -08:00
story_tags = self . get_tags ( story )
story_link = self . get_permalink ( story )
2013-11-15 10:48:26 -08:00
replace_story_date = False
2013-05-29 15:31:26 -07:00
2013-08-06 13:54:06 -07:00
try :
2014-03-13 16:02:27 -07:00
existing_story , story_has_changed = _1 ( story , story_content ,
2014-04-17 12:10:04 -07:00
existing_stories , new_story_hashes )
2021-02-25 19:13:21 -05:00
except TimeoutError :
2017-03-31 19:52:24 -07:00
logging . debug ( ' ---> [ %-30s ] ~SB~FRExisting story check timed out... ' % ( self . log_title [ : 30 ] ) )
2013-08-06 13:54:06 -07:00
existing_story = None
story_has_changed = False
2012-03-06 16:11:27 -08:00
if existing_story is None :
2012-12-24 00:39:52 -08:00
if settings . DEBUG and False :
logging . debug ( ' ---> New story in feed ( %s - %s ): %s ' % ( self . feed_title , story . get ( ' title ' ) , len ( story_content ) ) )
2012-12-24 00:10:40 -08:00
2012-03-06 16:11:27 -08:00
s = MStory ( story_feed_id = self . pk ,
story_date = story . get ( ' published ' ) ,
story_title = story . get ( ' title ' ) ,
story_content = story_content ,
story_author_name = story . get ( ' author ' ) ,
story_permalink = story_link ,
story_guid = story . get ( ' guid ' ) ,
story_tags = story_tags
)
try :
s . save ( )
2012-09-04 11:46:41 -07:00
ret_values [ ' new ' ] + = 1
2016-12-13 16:29:42 -08:00
s . publish_to_subscribers ( )
2020-06-15 02:54:37 -04:00
except ( IntegrityError , OperationError ) as e :
2012-09-04 11:46:41 -07:00
ret_values [ ' error ' ] + = 1
2012-12-24 00:10:40 -08:00
if settings . DEBUG :
2013-04-05 12:09:32 -07:00
logging . info ( ' ---> [ %-30s ] ~SN~FRIntegrityError on new story: %s - %s ' % ( self . feed_title [ : 30 ] , story . get ( ' guid ' ) , e ) )
2014-04-23 14:35:56 -07:00
if self . search_indexed :
s . index_story_for_search ( )
2014-05-30 13:41:57 -07:00
elif existing_story and story_has_changed and not updates_off and ret_values [ ' updated ' ] < 3 :
2012-03-06 16:11:27 -08:00
# update story
original_content = None
try :
if existing_story and existing_story . id :
try :
2012-03-26 12:40:13 -07:00
existing_story = MStory . objects . get ( id = existing_story . id )
2012-03-06 16:11:27 -08:00
except ValidationError :
2013-02-20 16:08:14 -08:00
existing_story , _ = MStory . find_story ( existing_story . story_feed_id ,
existing_story . id ,
original_only = True )
2014-04-17 12:10:04 -07:00
elif existing_story and existing_story . story_hash :
2013-02-20 16:08:14 -08:00
existing_story , _ = MStory . find_story ( existing_story . story_feed_id ,
2014-04-17 12:10:04 -07:00
existing_story . story_hash ,
2013-02-20 16:08:14 -08:00
original_only = True )
2010-08-01 23:47:40 -04:00
else :
2012-03-06 16:11:27 -08:00
raise MStory . DoesNotExist
2020-06-15 02:54:37 -04:00
except ( MStory . DoesNotExist , OperationError ) as e :
2012-09-04 11:46:41 -07:00
ret_values [ ' error ' ] + = 1
2012-03-06 16:11:27 -08:00
if verbose :
2013-04-05 12:09:32 -07:00
logging . info ( ' ---> [ %-30s ] ~SN~FROperation on existing story: %s - %s ' % ( self . feed_title [ : 30 ] , story . get ( ' guid ' ) , e ) )
2012-03-06 16:11:27 -08:00
continue
if existing_story . story_original_content_z :
original_content = zlib . decompress ( existing_story . story_original_content_z )
elif existing_story . story_content_z :
original_content = zlib . decompress ( existing_story . story_content_z )
if story_content and len ( story_content ) > 10 :
2021-02-25 19:23:43 -05:00
if " <code " in story_content :
2014-10-15 17:31:00 -07:00
# Don't mangle stories with code, just use new
story_content_diff = story_content
else :
2020-11-06 14:46:40 +07:00
story_content_diff = htmldiff ( smart_str ( original_content ) , smart_str ( story_content ) )
2009-08-20 02:43:01 +00:00
else :
2012-03-06 16:11:27 -08:00
story_content_diff = original_content
# logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
# logging.debug("\t\tDiff content: %s" % diff.getDiff())
# if existing_story.story_title != story.get('title'):
# logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
2014-04-17 12:10:04 -07:00
if existing_story . story_hash != story . get ( ' story_hash ' ) :
2013-05-28 10:23:36 -07:00
self . update_story_with_new_guid ( existing_story , story . get ( ' guid ' ) )
2012-12-24 00:10:40 -08:00
2014-09-05 10:38:29 -07:00
if verbose :
2012-12-24 00:10:40 -08:00
logging . debug ( ' - Updated story in feed ( %s - %s ): %s / %s ' % ( self . feed_title , story . get ( ' title ' ) , len ( story_content_diff ) , len ( story_content ) ) )
2012-03-06 16:11:27 -08:00
existing_story . story_feed = self . pk
existing_story . story_title = story . get ( ' title ' )
existing_story . story_content = story_content_diff
2012-07-22 12:25:09 -07:00
existing_story . story_latest_content = story_content
2012-03-06 16:11:27 -08:00
existing_story . story_original_content = original_content
existing_story . story_author_name = story . get ( ' author ' )
existing_story . story_permalink = story_link
existing_story . story_guid = story . get ( ' guid ' )
existing_story . story_tags = story_tags
2016-02-25 15:45:32 -08:00
existing_story . original_text_z = None # Reset Text view cache
2012-12-24 00:10:40 -08:00
# Do not allow publishers to change the story date once a story is published.
# Leads to incorrect unread story counts.
2013-11-15 10:48:26 -08:00
if replace_story_date :
existing_story . story_date = story . get ( ' published ' ) # Really shouldn't do this.
2019-02-01 11:55:10 -05:00
existing_story . extract_image_urls ( force = True )
2012-03-06 16:11:27 -08:00
try :
existing_story . save ( )
2012-09-04 11:46:41 -07:00
ret_values [ ' updated ' ] + = 1
2012-03-06 16:11:27 -08:00
except ( IntegrityError , OperationError ) :
2012-09-04 11:46:41 -07:00
ret_values [ ' error ' ] + = 1
2012-03-06 16:11:27 -08:00
if verbose :
2012-03-27 11:19:53 -07:00
logging . info ( ' ---> [ %-30s ] ~SN~FRIntegrityError on updated story: %s ' % ( self . feed_title [ : 30 ] , story . get ( ' title ' ) [ : 30 ] ) )
except ValidationError :
2012-09-04 11:46:41 -07:00
ret_values [ ' error ' ] + = 1
2012-03-06 16:11:27 -08:00
if verbose :
2012-03-27 11:19:53 -07:00
logging . info ( ' ---> [ %-30s ] ~SN~FRValidationError on updated story: %s ' % ( self . feed_title [ : 30 ] , story . get ( ' title ' ) [ : 30 ] ) )
2014-04-23 14:35:56 -07:00
if self . search_indexed :
existing_story . index_story_for_search ( )
2012-03-06 16:11:27 -08:00
else :
2012-09-04 11:46:41 -07:00
ret_values [ ' same ' ] + = 1
2014-09-05 10:38:29 -07:00
if verbose :
2014-10-08 16:43:48 -07:00
logging . debug ( " Unchanged story ( %s ): %s / %s " % ( story . get ( ' story_hash ' ) , story . get ( ' guid ' ) , story . get ( ' title ' ) ) )
2012-03-06 16:11:27 -08:00
2009-08-20 02:43:01 +00:00
return ret_values
2011-10-10 20:46:13 -07:00
2013-05-28 10:23:36 -07:00
def update_story_with_new_guid ( self , existing_story , new_story_guid ) :
2013-05-10 16:11:30 -07:00
from apps . reader . models import RUserStory
2012-08-10 14:22:51 -07:00
from apps . social . models import MSharedStory
2013-05-28 10:23:36 -07:00
existing_story . remove_from_redis ( )
2014-04-15 14:59:00 -07:00
existing_story . remove_from_search_index ( )
2013-05-10 16:11:30 -07:00
2013-05-28 10:23:36 -07:00
old_hash = existing_story . story_hash
2013-06-04 15:34:03 -07:00
new_hash = MStory . ensure_story_hash ( new_story_guid , self . pk )
2017-05-01 11:39:24 -07:00
RUserStory . switch_hash ( feed = self , old_hash = old_hash , new_hash = new_hash )
2013-05-10 16:11:30 -07:00
2012-08-10 14:22:51 -07:00
shared_stories = MSharedStory . objects . filter ( story_feed_id = self . pk ,
2013-05-28 10:23:36 -07:00
story_hash = old_hash )
2012-08-10 14:22:51 -07:00
for story in shared_stories :
story . story_guid = new_story_guid
2013-05-28 10:23:36 -07:00
story . story_hash = new_hash
2013-05-29 16:26:04 -07:00
try :
story . save ( )
except NotUniqueError :
# Story is already shared, skip.
pass
2012-03-21 13:54:37 -07:00
2011-02-05 15:34:43 -05:00
def save_popular_tags ( self , feed_tags = None , verbose = False ) :
2010-07-01 15:16:33 -04:00
if not feed_tags :
2013-02-20 16:08:14 -08:00
all_tags = MStory . objects ( story_feed_id = self . pk ,
story_tags__exists = True ) . item_frequencies ( ' story_tags ' )
2020-06-15 02:54:37 -04:00
feed_tags = sorted ( [ ( k , v ) for k , v in list ( all_tags . items ( ) ) if int ( v ) > 0 ] ,
2010-08-21 23:49:36 -04:00
key = itemgetter ( 1 ) ,
2011-02-05 15:34:43 -05:00
reverse = True ) [ : 25 ]
2010-07-01 15:16:33 -04:00
popular_tags = json . encode ( feed_tags )
2012-08-12 20:34:30 -07:00
if verbose :
2020-06-15 02:54:37 -04:00
print ( " Found %s tags: %s " % ( len ( feed_tags ) , popular_tags ) )
2010-08-21 23:49:36 -04:00
# TODO: This len() bullshit will be gone when feeds move to mongo
# On second thought, it might stay, because we don't want
# popular tags the size of a small planet. I'm looking at you
# Tumblr writers.
2010-07-01 15:16:33 -04:00
if len ( popular_tags ) < 1024 :
2015-07-22 13:53:20 -07:00
if self . data . popular_tags != popular_tags :
self . data . popular_tags = popular_tags
self . data . save ( update_fields = [ ' popular_tags ' ] )
2010-07-01 15:16:33 -04:00
return
2011-08-27 14:13:28 -07:00
tags_list = [ ]
2020-06-15 02:54:37 -04:00
if feed_tags and isinstance ( feed_tags , str ) :
2011-08-27 13:22:56 -07:00
tags_list = json . decode ( feed_tags )
2011-08-27 14:13:28 -07:00
if len ( tags_list ) > = 1 :
2010-07-01 15:16:33 -04:00
self . save_popular_tags ( tags_list [ : - 1 ] )
2010-11-05 20:34:17 -04:00
def save_popular_authors ( self , feed_authors = None ) :
2010-07-01 15:16:33 -04:00
if not feed_authors :
2010-08-21 23:49:36 -04:00
authors = defaultdict ( int )
for story in MStory . objects ( story_feed_id = self . pk ) . only ( ' story_author_name ' ) :
authors [ story . story_author_name ] + = 1
2020-06-15 02:54:37 -04:00
feed_authors = sorted ( [ ( k , v ) for k , v in list ( authors . items ( ) ) if k ] ,
2010-08-21 23:49:36 -04:00
key = itemgetter ( 1 ) ,
reverse = True ) [ : 20 ]
2010-07-01 15:16:33 -04:00
popular_authors = json . encode ( feed_authors )
2011-02-05 22:09:31 -05:00
if len ( popular_authors ) < 1023 :
2015-07-22 13:53:20 -07:00
if self . data . popular_authors != popular_authors :
self . data . popular_authors = popular_authors
self . data . save ( update_fields = [ ' popular_authors ' ] )
2010-07-01 15:16:33 -04:00
return
2010-08-25 21:55:22 -04:00
if len ( feed_authors ) > 1 :
2010-11-05 20:34:17 -04:00
self . save_popular_authors ( feed_authors = feed_authors [ : - 1 ] )
2013-06-03 17:20:36 -07:00
@classmethod
2016-01-20 13:32:49 -08:00
def trim_old_stories ( cls , start = 0 , verbose = True , dryrun = False , total = 0 ) :
2013-06-03 17:58:27 -07:00
now = datetime . datetime . now ( )
2013-09-16 16:42:49 -07:00
month_ago = now - datetime . timedelta ( days = settings . DAYS_OF_STORY_HASHES )
2013-06-04 11:26:01 -07:00
feed_count = Feed . objects . latest ( ' pk ' ) . pk
2016-01-20 13:32:49 -08:00
2020-06-15 02:54:37 -04:00
for feed_id in range ( start , feed_count ) :
2013-06-04 11:26:01 -07:00
if feed_id % 1000 == 0 :
2020-06-15 02:54:37 -04:00
print ( " \n \n -------------------------- %s ( %s deleted so far) -------------------------- \n \n " % ( feed_id , total ) )
2013-06-04 11:26:01 -07:00
try :
feed = Feed . objects . get ( pk = feed_id )
except Feed . DoesNotExist :
continue
2016-01-19 11:28:27 -08:00
if feed . active_subscribers < = 0 and ( not feed . last_story_date or feed . last_story_date < month_ago ) :
2013-06-04 11:26:01 -07:00
months_ago = 6
if feed . last_story_date :
months_ago = int ( ( now - feed . last_story_date ) . days / 30.0 )
cutoff = max ( 1 , 6 - months_ago )
if dryrun :
2020-06-15 02:54:37 -04:00
print ( " DRYRUN: %s cutoff - %s " % ( cutoff , feed ) )
2013-06-04 11:26:01 -07:00
else :
2013-08-12 16:48:16 -07:00
total + = MStory . trim_feed ( feed = feed , cutoff = cutoff , verbose = verbose )
2016-01-19 11:28:27 -08:00
else :
if dryrun :
2020-06-15 02:54:37 -04:00
print ( " DRYRUN: %s / %s cutoff - %s " % ( cutoff , feed . story_cutoff , feed ) )
2016-01-19 11:28:27 -08:00
else :
total + = feed . trim_feed ( verbose = verbose )
2013-08-12 16:48:16 -07:00
2020-06-15 02:54:37 -04:00
print ( " ---> Deleted %s stories in total. " % total )
2013-06-21 12:30:06 -07:00
@property
def story_cutoff ( self ) :
2022-01-10 17:00:27 -05:00
if self . pro_subscribers > = 1 :
return 10000
2013-06-21 12:30:06 -07:00
cutoff = 500
if self . active_subscribers < = 0 :
cutoff = 25
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < 1 :
2013-06-21 12:30:06 -07:00
cutoff = 100
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 2 :
2013-06-21 12:30:06 -07:00
cutoff = 200
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 5 :
2013-06-21 12:30:06 -07:00
cutoff = 300
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 10 :
2013-06-21 12:30:06 -07:00
cutoff = 350
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 15 :
2013-06-21 12:30:06 -07:00
cutoff = 400
2013-08-05 10:23:22 -07:00
elif self . active_premium_subscribers < = 20 :
2013-06-21 12:30:06 -07:00
cutoff = 450
2015-12-29 12:41:40 -08:00
2013-08-05 10:23:22 -07:00
if self . active_subscribers and self . average_stories_per_month < 5 and self . stories_last_month < 5 :
cutoff / = 2
if self . active_premium_subscribers < = 1 and self . average_stories_per_month < = 1 and self . stories_last_month < = 1 :
cutoff / = 2
2015-12-29 12:41:40 -08:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_READ_POOL )
pipeline = r . pipeline ( )
read_stories_per_week = [ ]
now = datetime . datetime . now ( )
2015-12-29 12:45:47 -08:00
for weeks_back in range ( 2 * int ( math . floor ( settings . DAYS_OF_STORY_HASHES / 7 ) ) ) :
2015-12-29 12:41:40 -08:00
weeks_ago = now - datetime . timedelta ( days = 7 * weeks_back )
week_of_year = weeks_ago . strftime ( '%Y-%U' )
feed_read_key = " fR: %s : %s " % ( self . pk , week_of_year )
pipeline . get ( feed_read_key )
read_stories_per_week = pipeline . execute ( )
read_stories_last_month = sum ( [ int ( rs ) for rs in read_stories_per_week if rs ] )
if read_stories_last_month == 0 :
2015-12-29 12:56:49 -08:00
original_cutoff = cutoff
2016-02-25 11:50:14 -08:00
cutoff = min ( cutoff , 10 )
2016-01-20 13:32:49 -08:00
try :
2017-03-31 19:52:24 -07:00
logging . debug ( " ---> [ %-30s ] ~FBTrimming down to ~SB %s (instead of %s )~SN stories (~FM %s ~FB) " % ( self . log_title [ : 30 ] , cutoff , original_cutoff , self . last_story_date . strftime ( " % Y- % m- %d " ) if self . last_story_date else " No last story date " ) )
2020-06-15 02:54:37 -04:00
except ValueError as e :
2017-03-31 19:52:24 -07:00
logging . debug ( " ***> [ %-30s ] Error trimming: %s " % ( self . log_title [ : 30 ] , e ) )
2016-01-20 13:32:49 -08:00
pass
2015-12-29 12:41:40 -08:00
2018-11-04 12:00:14 -05:00
if getattr ( settings , ' OVERRIDE_STORY_COUNT_MAX ' , None ) :
cutoff = settings . OVERRIDE_STORY_COUNT_MAX
2013-06-21 12:30:06 -07:00
return cutoff
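# Worked example (hypothetical numbers): a feed with 3 active premium
# subscribers starts at a 300-story cutoff; if it has active subscribers but
# averages fewer than 5 stories a month it halves to 150, and if no subscriber
# has read a story from it in the last month it drops to min(cutoff, 10).
# Pro-subscribed feeds and OVERRIDE_STORY_COUNT_MAX bypass the tiers entirely.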
2013-06-03 17:20:36 -07:00
def trim_feed ( self , verbose = False , cutoff = None ) :
if not cutoff :
2013-06-21 12:30:06 -07:00
cutoff = self . story_cutoff
2016-01-19 11:28:27 -08:00
return MStory . trim_feed ( feed = self , cutoff = cutoff , verbose = verbose )
2015-12-26 12:53:32 -08:00
def purge_feed_stories ( self , update = True ) :
MStory . purge_feed_stories ( feed = self , cutoff = self . story_cutoff )
if update :
self . update ( )
2015-12-26 12:59:44 -08:00
def purge_author ( self , author ) :
all_stories = MStory . objects . filter ( story_feed_id = self . pk )
author_stories = MStory . objects . filter ( story_feed_id = self . pk , story_author_name__iexact = author )
logging . debug ( " ---> Deleting %s of %s stories in %s by ' %s ' . " % ( author_stories . count ( ) , all_stories . count ( ) , self , author ) )
author_stories . delete ( )
def purge_tag ( self , tag ) :
all_stories = MStory . objects . filter ( story_feed_id = self . pk )
tagged_stories = MStory . objects . filter ( story_feed_id = self . pk , story_tags__icontains = tag )
logging . debug ( " ---> Deleting %s of %s stories in %s by ' %s ' . " % ( tagged_stories . count ( ) , all_stories . count ( ) , self , tag ) )
tagged_stories . delete ( )
2013-04-15 14:30:31 -07:00
# @staticmethod
# def clean_invalid_ids():
# history = MFeedFetchHistory.objects(status_code=500, exception__contains='InvalidId:')
# urls = set()
# for h in history:
# u = re.split('InvalidId: (.*?) is not a valid ObjectId\\n$', h.exception)[1]
# urls.add((h.feed_id, u))
#
# for f, u in urls:
# print "db.stories.remove({\"story_feed_id\": %s, \"_id\": \"%s\"})" % (f, u)
2012-10-19 18:33:28 -07:00
2009-08-01 04:26:57 +00:00
2012-01-09 13:55:26 -08:00
def get_stories ( self , offset = 0 , limit = 25 , force = False ) :
stories_db = MStory . objects ( story_feed_id = self . pk ) [ offset : offset + limit ]
stories = self . format_stories ( stories_db , self . pk )
2009-07-28 02:27:27 +00:00
return stories
2013-08-07 15:43:25 -07:00
@classmethod
2014-04-16 17:21:53 -07:00
def find_feed_stories ( cls , feed_ids , query , order = " newest " , offset = 0 , limit = 25 ) :
story_ids = SearchStory . query ( feed_ids = feed_ids , query = query , order = order ,
offset = offset , limit = limit )
2013-08-07 15:43:25 -07:00
stories_db = MStory . objects (
2014-04-15 14:17:15 -07:00
story_hash__in = story_ids
2014-04-16 17:23:27 -07:00
) . order_by ( ' -story_date ' if order == " newest " else ' story_date ' )
2013-08-07 15:43:25 -07:00
stories = cls . format_stories ( stories_db )
return stories
2016-10-06 15:24:53 -07:00
@classmethod
2016-10-06 17:20:30 -07:00
def query_popularity ( cls , query , limit , order = ' newest ' ) :
2016-10-06 15:24:53 -07:00
popularity = { }
2016-10-06 17:26:50 -07:00
seen_feeds = set ( )
2016-10-06 18:06:58 -07:00
feed_title_to_id = dict ( )
2016-10-06 15:24:53 -07:00
# Collect stories, sort by feed
story_ids = SearchStory . global_query ( query , order = order , offset = 0 , limit = limit )
for story_hash in story_ids :
feed_id , story_id = MStory . split_story_hash ( story_hash )
2016-10-06 17:26:50 -07:00
feed = Feed . get_by_id ( feed_id )
if not feed : continue
if feed . feed_title in seen_feeds :
2016-10-06 18:06:58 -07:00
feed_id = feed_title_to_id [ feed . feed_title ]
else :
feed_title_to_id [ feed . feed_title ] = feed_id
2016-10-06 17:26:50 -07:00
seen_feeds . add ( feed . feed_title )
2016-10-06 15:24:53 -07:00
if feed_id not in popularity :
2017-01-05 18:39:01 -08:00
# feed.update_all_statistics()
# classifiers = feed.save_classifier_counts()
2016-10-06 20:33:52 -07:00
well_read_score = feed . well_read_score ( )
2016-10-06 15:24:53 -07:00
popularity [ feed_id ] = {
' feed_title ' : feed . feed_title ,
2016-10-06 17:20:30 -07:00
' feed_url ' : feed . feed_link ,
2016-10-06 15:24:53 -07:00
' num_subscribers ' : feed . num_subscribers ,
' feed_id ' : feed . pk ,
' story_ids ' : [ ] ,
' authors ' : { } ,
2016-10-06 20:33:52 -07:00
' read_pct ' : well_read_score [ ' read_pct ' ] ,
' reader_count ' : well_read_score [ ' reader_count ' ] ,
' story_count ' : well_read_score [ ' story_count ' ] ,
2017-01-05 16:44:10 -08:00
' reach_score ' : well_read_score [ ' reach_score ' ] ,
' share_count ' : well_read_score [ ' share_count ' ] ,
' ps ' : 0 ,
' ng ' : 0 ,
2017-01-05 18:39:01 -08:00
' classifiers ' : json . decode ( feed . data . feed_classifier_counts ) ,
2016-10-06 15:24:53 -07:00
}
2017-01-05 16:44:10 -08:00
if popularity [ feed_id ] [ ' classifiers ' ] :
for classifier in popularity [ feed_id ] [ ' classifiers ' ] . get ( ' feed ' , [ ] ) :
if int ( classifier [ ' feed_id ' ] ) == int ( feed_id ) :
popularity [ feed_id ] [ ' ps ' ] = classifier [ ' pos ' ]
popularity [ feed_id ] [ ' ng ' ] = - 1 * classifier [ ' neg ' ]
2016-10-06 15:24:53 -07:00
popularity [ feed_id ] [ ' story_ids ' ] . append ( story_hash )
2020-06-15 02:54:37 -04:00
sorted_popularity = sorted ( list ( popularity . values ( ) ) , key = lambda x : x [ ' reach_score ' ] ,
2016-10-06 15:24:53 -07:00
reverse = True )
# Extract story authors from feeds
for feed in sorted_popularity :
story_ids = feed [ ' story_ids ' ]
stories_db = MStory . objects ( story_hash__in = story_ids )
stories = cls . format_stories ( stories_db )
for story in stories :
2016-10-28 17:04:12 -07:00
story [ ' story_permalink ' ] = story [ ' story_permalink ' ] [ : 250 ]
2016-10-06 15:24:53 -07:00
if story [ ' story_authors ' ] not in feed [ ' authors ' ] :
feed [ ' authors ' ] [ story [ ' story_authors ' ] ] = {
2016-10-06 17:20:30 -07:00
' name ' : story [ ' story_authors ' ] ,
2016-10-06 15:24:53 -07:00
' count ' : 0 ,
2017-01-05 16:44:10 -08:00
' ps ' : 0 ,
' ng ' : 0 ,
2016-10-06 15:34:53 -07:00
' tags ' : { } ,
2016-10-06 17:20:30 -07:00
' stories ' : [ ] ,
2016-10-06 15:24:53 -07:00
}
2017-01-05 16:44:10 -08:00
author = feed [ ' authors ' ] [ story [ ' story_authors ' ] ]
2016-10-06 18:10:52 -07:00
seen = False
2017-01-05 16:44:10 -08:00
for seen_story in author [ ' stories ' ] :
2016-10-06 18:09:46 -07:00
if seen_story [ ' url ' ] == story [ ' story_permalink ' ] :
2016-10-06 18:10:52 -07:00
seen = True
2016-10-06 18:09:46 -07:00
break
else :
2017-01-05 16:44:10 -08:00
author [ ' stories ' ] . append ( {
2016-10-06 18:09:46 -07:00
' title ' : story [ ' story_title ' ] ,
' url ' : story [ ' story_permalink ' ] ,
' date ' : story [ ' story_date ' ] ,
} )
2017-01-05 16:44:10 -08:00
author [ ' count ' ] + = 1
2016-10-06 18:10:52 -07:00
if seen : continue # Don't recount tags
2017-01-05 16:44:10 -08:00
if feed [ ' classifiers ' ] :
for classifier in feed [ ' classifiers ' ] . get ( ' author ' , [ ] ) :
if classifier [ ' author ' ] == author [ ' name ' ] :
author [ ' ps ' ] = classifier [ ' pos ' ]
author [ ' ng ' ] = - 1 * classifier [ ' neg ' ]
2016-10-06 15:24:53 -07:00
for tag in story [ ' story_tags ' ] :
2017-01-05 16:44:10 -08:00
if tag not in author [ ' tags ' ] :
author [ ' tags ' ] [ tag ] = { ' name ' : tag , ' count ' : 0 , ' ps ' : 0 , ' ng ' : 0 }
author [ ' tags ' ] [ tag ] [ ' count ' ] + = 1
if feed [ ' classifiers ' ] :
for classifier in feed [ ' classifiers ' ] . get ( ' tag ' , [ ] ) :
if classifier [ ' tag ' ] == tag :
author [ ' tags ' ] [ tag ] [ ' ps ' ] = classifier [ ' pos ' ]
author [ ' tags ' ] [ tag ] [ ' ng ' ] = - 1 * classifier [ ' neg ' ]
2020-06-15 02:54:37 -04:00
sorted_authors = sorted ( list ( feed [ ' authors ' ] . values ( ) ) , key = lambda x : x [ ' count ' ] )
2016-10-06 17:20:30 -07:00
feed [ ' authors ' ] = sorted_authors
2016-10-06 15:34:53 -07:00
2016-10-06 20:33:52 -07:00
# pprint(sorted_popularity)
2016-10-06 17:20:30 -07:00
return sorted_popularity
def well_read_score ( self ) :
2021-03-16 19:34:11 -04:00
""" Average percentage of stories read vs published across recently active subscribers """
2016-10-06 20:13:53 -07:00
from apps . reader . models import UserSubscription
2017-01-05 16:44:10 -08:00
from apps . social . models import MSharedStory
2016-10-06 17:20:30 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_STORY_HASH_POOL )
2016-10-06 19:33:09 -07:00
p = r . pipeline ( )
2016-10-06 20:13:53 -07:00
2017-01-05 16:44:10 -08:00
shared_stories = MSharedStory . objects ( story_feed_id = self . pk ) . count ( )
2016-10-06 20:13:53 -07:00
subscribing_users = UserSubscription . objects . filter ( feed_id = self . pk ) . values ( ' user_id ' )
subscribing_user_ids = [ sub [ ' user_id ' ] for sub in subscribing_users ]
for user_id in subscribing_user_ids :
user_rs = " RS: %s : %s " % ( user_id , self . pk )
2016-10-06 19:33:09 -07:00
p . scard ( user_rs )
counts = p . execute ( )
2016-10-06 20:13:53 -07:00
counts = [ c for c in counts if c > 0 ]
reader_count = len ( counts )
2016-10-06 19:33:09 -07:00
2021-04-13 16:24:49 -04:00
now = datetime . datetime . now ( ) . strftime ( ' %s ' )
unread_cutoff = self . unread_cutoff . strftime ( ' %s ' )
2021-04-13 16:36:00 -04:00
story_count = len ( r . zrangebyscore ( " zF: %s " % self . pk , max = now , min = unread_cutoff ) )
2016-10-06 20:13:53 -07:00
if reader_count and story_count :
average_pct = ( sum ( counts ) / float ( reader_count ) ) / float ( story_count )
2016-10-06 19:33:09 -07:00
else :
average_pct = 0
2016-10-06 20:33:52 -07:00
reach_score = average_pct * reader_count * story_count
return { ' read_pct ' : average_pct , ' reader_count ' : reader_count ,
2017-01-05 16:44:10 -08:00
' reach_score ' : reach_score , ' story_count ' : story_count ,
' share_count ' : shared_stories }
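# Worked example (hypothetical numbers): 40 active readers who have read, on
# average, 25% of the 80 stories published inside the unread cutoff yield
# reach_score = 0.25 * 40 * 80 = 800.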
2016-10-06 17:20:30 -07:00
@classmethod
def xls_query_popularity ( cls , queries , limit ) :
import xlsxwriter
2017-01-05 18:26:50 -08:00
from xlsxwriter . utility import xl_rowcol_to_cell
2020-06-15 02:54:37 -04:00
if isinstance ( queries , str ) :
2017-01-05 18:26:50 -08:00
queries = [ q . strip ( ) for q in queries . split ( ' , ' ) ]
title = ' NewsBlur- %s .xlsx ' % slugify ( ' - ' . join ( queries ) )
workbook = xlsxwriter . Workbook ( title )
2016-10-06 17:20:30 -07:00
bold = workbook . add_format ( { ' bold ' : 1 } )
date_format = workbook . add_format ( { ' num_format ' : ' mmm d yyyy ' } )
2016-10-06 19:33:09 -07:00
unread_format = workbook . add_format ( { ' font_color ' : ' #E0E0E0 ' } )
2016-10-06 17:20:30 -07:00
for query in queries :
worksheet = workbook . add_worksheet ( query )
row = 1
col = 0
2017-01-05 18:26:50 -08:00
worksheet . write ( 0 , col , ' Publisher ' , bold )
2017-01-05 16:44:10 -08:00
worksheet . set_column ( col , col , 15 ) ; col + = 1
worksheet . write ( 0 , col , ' Feed URL ' , bold )
worksheet . set_column ( col , col , 20 ) ; col + = 1
worksheet . write ( 0 , col , ' Reach score ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Feeds are sorted based on this score. It \' s simply the # of readers * # of stories in the past 30 days * the percentage of stories that are actually read. ' )
2017-01-05 18:26:50 -08:00
worksheet . set_column ( col , col , 9 ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( 0 , col , ' # subs ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Total number of subscribers on NewsBlur, not necessarily active ' )
2017-01-05 18:26:50 -08:00
worksheet . set_column ( col , col , 5 ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( 0 , col , ' # readers ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Total number of active subscribers who have read a story from the feed in the past 30 days. ' )
2017-01-05 16:44:10 -08:00
worksheet . set_column ( col , col , 8 ) ; col + = 1
2017-01-05 19:13:49 -08:00
worksheet . write ( 0 , col , " read pct " , bold )
2017-01-05 19:20:03 -08:00
worksheet . write_comment ( 0 , col , " Of the active subscribers reading this feed in the past 30 days, this is the percentage of stories the average subscriber reads. Values over 100 pct signify that the feed has many shared stories, which throws off the number slightly but not significantly. " )
2017-01-05 19:13:49 -08:00
worksheet . set_column ( col , col , 8 ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( 0 , col , ' # stories 30d ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , " It ' s important to ignore feeds that haven ' t published anything in the last 30 days, which is why this is part of the Reach Score. " )
2017-01-05 16:44:10 -08:00
worksheet . set_column ( col , col , 10 ) ; col + = 1
worksheet . write ( 0 , col , ' # shared ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Number of stories from this feed that were shared on NewsBlur. This is a strong signal of interest although it is not included in the Reach Score. ' )
2017-01-05 18:26:50 -08:00
worksheet . set_column ( col , col , 7 ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( 0 , col , ' # feed pos ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Number of times this feed was trained with a thumbs up. Users use training to hide stories they don \' t want to see while highlighting those that they do. ' )
2017-01-05 16:44:10 -08:00
worksheet . set_column ( col , col , 8 ) ; col + = 1
worksheet . write ( 0 , col , ' # feed neg ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Number of times this feed was trained with a thumbs down. Users use training to hide stories they don \' t want to see while highlighting those that they do. ' )
2017-01-05 16:44:10 -08:00
worksheet . set_column ( col , col , 8 ) ; col + = 1
worksheet . write ( 0 , col , ' Author ' , bold )
worksheet . set_column ( col , col , 15 ) ; col + = 1
worksheet . write ( 0 , col , ' # author pos ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Number of times this author was trained with a thumbs up. Users use training to hide stories they don \' t want to see while highlighting those that they do. ' )
2017-01-05 18:26:50 -08:00
worksheet . set_column ( col , col , 10 ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( 0 , col , ' # author neg ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Number of times this author was trained with a thumbs down. Users use training to hide stories they don \' t want to see while highlighting those that they do. ' )
2017-01-05 18:26:50 -08:00
worksheet . set_column ( col , col , 10 ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( 0 , col , ' Story title ' , bold )
worksheet . set_column ( col , col , 30 ) ; col + = 1
worksheet . write ( 0 , col , ' Story URL ' , bold )
worksheet . set_column ( col , col , 20 ) ; col + = 1
worksheet . write ( 0 , col , ' Story date ' , bold )
worksheet . set_column ( col , col , 10 ) ; col + = 1
worksheet . write ( 0 , col , ' Tag ' , bold )
worksheet . set_column ( col , col , 15 ) ; col + = 1
2017-01-05 19:06:45 -08:00
worksheet . write ( 0 , col , ' Tag count ' , bold )
worksheet . write_comment ( 0 , col , ' Number of times this tag is used in other stories that also contain the search query. ' )
2017-01-05 16:44:10 -08:00
worksheet . set_column ( col , col , 8 ) ; col + = 1
worksheet . write ( 0 , col , ' # tag pos ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Number of times this tag was trained with a thumbs up. Users use training to hide stories they don \' t want to see while highlighting those that they do. ' )
2017-01-05 18:26:50 -08:00
worksheet . set_column ( col , col , 7 ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( 0 , col , ' # tag neg ' , bold )
2017-01-05 19:06:45 -08:00
worksheet . write_comment ( 0 , col , ' Number of times this tag was trained with a thumbs down. Users use training to hide stories they don \' t want to see while highlighting those that they do. ' )
2017-01-05 18:26:50 -08:00
worksheet . set_column ( col , col , 7 ) ; col + = 1
2016-10-06 17:20:30 -07:00
popularity = cls . query_popularity ( query , limit = limit )
for feed in popularity :
2017-01-05 16:44:10 -08:00
col = 0
worksheet . write ( row , col , feed [ ' feed_title ' ] ) ; col + = 1
2017-01-08 18:14:48 -08:00
worksheet . write_url ( row , col , feed . get ( ' feed_url ' ) or " " ) ; col + = 1
2017-01-08 18:13:00 -08:00
worksheet . conditional_format ( row , col , row , col + 8 , { ' type ' : ' cell ' ,
' criteria ' : ' == ' ,
' value ' : 0 ,
' format ' : unread_format } )
2017-01-05 18:26:50 -08:00
worksheet . write ( row , col , " = %s * %s * %s " % (
xl_rowcol_to_cell ( row , col + 2 ) ,
xl_rowcol_to_cell ( row , col + 3 ) ,
xl_rowcol_to_cell ( row , col + 4 ) ,
) ) ; col + = 1
2017-01-05 16:44:10 -08:00
worksheet . write ( row , col , feed [ ' num_subscribers ' ] ) ; col + = 1
worksheet . write ( row , col , feed [ ' reader_count ' ] ) ; col + = 1
worksheet . write ( row , col , feed [ ' read_pct ' ] ) ; col + = 1
worksheet . write ( row , col , feed [ ' story_count ' ] ) ; col + = 1
worksheet . write ( row , col , feed [ ' share_count ' ] ) ; col + = 1
worksheet . write ( row , col , feed [ ' ps ' ] ) ; col + = 1
worksheet . write ( row , col , feed [ ' ng ' ] ) ; col + = 1
2016-10-06 17:20:30 -07:00
for author in feed [ ' authors ' ] :
2017-01-05 16:44:10 -08:00
row + = 1
2017-01-08 18:13:00 -08:00
worksheet . conditional_format ( row , col , row , col + 2 , { ' type ' : ' cell ' ,
2017-01-05 16:44:10 -08:00
' criteria ' : ' == ' ,
' value ' : 0 ,
' format ' : unread_format } )
2017-01-08 18:13:00 -08:00
worksheet . write ( row , col , author [ ' name ' ] )
worksheet . write ( row , col + 1 , author [ ' ps ' ] )
worksheet . write ( row , col + 2 , author [ ' ng ' ] )
2016-10-06 17:20:30 -07:00
for story in author [ ' stories ' ] :
2017-01-05 16:44:10 -08:00
worksheet . write ( row , col + 3 , story [ ' title ' ] )
worksheet . write_url ( row , col + 4 , story [ ' url ' ] )
worksheet . write_datetime ( row , col + 5 , story [ ' date ' ] , date_format )
2016-10-06 17:20:30 -07:00
row + = 1
2020-06-15 02:54:37 -04:00
for tag in list ( author [ ' tags ' ] . values ( ) ) :
2017-01-08 18:13:00 -08:00
worksheet . conditional_format ( row , col + 7 , row , col + 9 , { ' type ' : ' cell ' ,
2017-01-05 16:44:10 -08:00
' criteria ' : ' == ' ,
' value ' : 0 ,
2017-01-08 18:13:00 -08:00
' format ' : unread_format } )
2017-01-05 16:44:10 -08:00
worksheet . write ( row , col + 6 , tag [ ' name ' ] )
worksheet . write ( row , col + 7 , tag [ ' count ' ] )
worksheet . write ( row , col + 8 , tag [ ' ps ' ] )
worksheet . write ( row , col + 9 , tag [ ' ng ' ] )
2016-10-06 17:20:30 -07:00
row + = 1
workbook . close ( )
2017-01-05 18:26:50 -08:00
return title
2013-08-07 15:43:25 -07:00
2014-04-23 15:05:47 -07:00
def find_stories ( self , query , order = " newest " , offset = 0 , limit = 25 ) :
story_ids = SearchStory . query ( feed_ids = [ self . pk ] , query = query , order = order ,
offset = offset , limit = limit )
2012-12-19 14:21:46 -08:00
stories_db = MStory . objects (
2014-04-23 15:05:47 -07:00
story_hash__in = story_ids
) . order_by ( ' -story_date ' if order == " newest " else ' story_date ' )
2014-04-22 12:39:09 -07:00
2012-12-19 14:21:46 -08:00
stories = self . format_stories ( stories_db , self . pk )
return stories
2010-12-02 20:18:33 -05:00
@classmethod
2013-05-14 16:36:03 -07:00
def format_stories ( cls , stories_db , feed_id = None , include_permalinks = False ) :
2010-01-21 13:12:29 -05:00
stories = [ ]
2010-10-07 19:56:23 -04:00
2010-01-21 13:12:29 -05:00
for story_db in stories_db :
2013-05-14 16:36:03 -07:00
story = cls . format_story ( story_db , feed_id , include_permalinks = include_permalinks )
2010-01-21 13:12:29 -05:00
stories . append ( story )
return stories
2011-05-08 19:41:50 -04:00
@classmethod
2016-02-26 20:01:41 -08:00
def format_story ( cls , story_db , feed_id = None , text = False , include_permalinks = False ,
show_changes = False ) :
2020-06-15 02:54:37 -04:00
if isinstance ( story_db . story_content_z , str ) :
2020-07-01 16:59:21 -04:00
story_db . story_content_z = base64 . b64decode ( story_db . story_content_z )
2016-02-26 20:01:41 -08:00
story_content = ' '
2016-02-26 20:16:26 -08:00
latest_story_content = None
2016-02-26 20:01:41 -08:00
has_changes = False
2016-03-01 10:13:33 -05:00
if ( not show_changes and
hasattr ( story_db , ' story_latest_content_z ' ) and
story_db . story_latest_content_z ) :
2021-03-25 14:54:49 -04:00
try :
latest_story_content = smart_str ( zlib . decompress ( story_db . story_latest_content_z ) )
except DjangoUnicodeDecodeError :
latest_story_content = zlib . decompress ( story_db . story_latest_content_z )
2016-02-26 20:16:26 -08:00
if story_db . story_content_z :
2020-11-06 14:46:40 +07:00
story_content = smart_str ( zlib . decompress ( story_db . story_content_z ) )
2016-02-26 20:16:26 -08:00
if ' <ins ' in story_content or ' <del ' in story_content :
has_changes = True
if not show_changes and latest_story_content :
story_content = latest_story_content
2018-07-12 14:19:14 -04:00
story_title = story_db . story_title
blank_story_title = False
if not story_title :
blank_story_title = True
if story_content :
story_title = strip_tags ( story_content )
if not story_title and story_db . story_permalink :
story_title = story_db . story_permalink
2018-08-07 18:37:58 -04:00
if story_title and len ( story_title ) > 80 :
2018-07-12 14:19:14 -04:00
story_title = story_title [ : 80 ] + ' ... '
2011-05-08 19:41:50 -04:00
story = { }
2013-04-29 15:27:22 -07:00
story [ ' story_hash ' ] = getattr ( story_db , ' story_hash ' , None )
2011-05-08 19:41:50 -04:00
story [ ' story_tags ' ] = story_db . story_tags or [ ]
2012-09-04 11:46:41 -07:00
story [ ' story_date ' ] = story_db . story_date . replace ( tzinfo = None )
2013-05-26 16:32:48 -07:00
story [ ' story_timestamp ' ] = story_db . story_date . strftime ( ' %s ' )
2014-02-10 12:42:55 -08:00
story [ ' story_authors ' ] = story_db . story_author_name or " "
2018-07-12 14:19:14 -04:00
story [ ' story_title ' ] = story_title
if blank_story_title :
story [ ' story_title_blank ' ] = True
2011-12-24 14:45:19 -08:00
story [ ' story_content ' ] = story_content
2012-04-06 13:38:21 -07:00
story [ ' story_permalink ' ] = story_db . story_permalink
2013-06-26 11:38:49 -07:00
story [ ' image_urls ' ] = story_db . image_urls
2019-01-19 15:37:20 -05:00
story [ ' secure_image_urls ' ] = cls . secure_image_urls ( story_db . image_urls )
2019-12-25 18:13:29 -05:00
story [ ' secure_image_thumbnails ' ] = cls . secure_image_thumbnails ( story_db . image_urls )
2011-05-08 19:41:50 -04:00
story [ ' story_feed_id ' ] = feed_id or story_db . story_feed_id
2016-02-26 20:01:41 -08:00
story [ ' has_modifications ' ] = has_changes
2012-01-15 20:51:48 -08:00
story [ ' comment_count ' ] = story_db . comment_count if hasattr ( story_db , ' comment_count ' ) else 0
story [ ' comment_user_ids ' ] = story_db . comment_user_ids if hasattr ( story_db , ' comment_user_ids ' ) else [ ]
story [ ' share_count ' ] = story_db . share_count if hasattr ( story_db , ' share_count ' ) else 0
story [ ' share_user_ids ' ] = story_db . share_user_ids if hasattr ( story_db , ' share_user_ids ' ) else [ ]
story [ ' guid_hash ' ] = story_db . guid_hash if hasattr ( story_db , ' guid_hash ' ) else None
2012-04-30 11:52:19 -07:00
if hasattr ( story_db , ' source_user_id ' ) :
story [ ' source_user_id ' ] = story_db . source_user_id
2011-11-24 15:19:53 -05:00
story [ ' id ' ] = story_db . story_guid or story_db . story_date
2011-05-08 19:41:50 -04:00
if hasattr ( story_db , ' starred_date ' ) :
story [ ' starred_date ' ] = story_db . starred_date
2013-08-13 17:21:41 -07:00
if hasattr ( story_db , ' user_tags ' ) :
story [ ' user_tags ' ] = story_db . user_tags
2020-07-15 21:25:00 -04:00
if hasattr ( story_db , ' user_notes ' ) :
story [ ' user_notes ' ] = story_db . user_notes
2018-08-07 18:37:58 -04:00
if hasattr ( story_db , ' highlights ' ) :
2018-08-09 16:25:28 -04:00
story [ ' highlights ' ] = story_db . highlights
2012-01-24 09:02:23 -08:00
if hasattr ( story_db , ' shared_date ' ) :
story [ ' shared_date ' ] = story_db . shared_date
2014-01-29 12:43:57 -08:00
if hasattr ( story_db , ' comments ' ) :
story [ ' comments ' ] = story_db . comments
if hasattr ( story_db , ' user_id ' ) :
story [ ' user_id ' ] = story_db . user_id
2013-05-14 16:36:03 -07:00
if include_permalinks and hasattr ( story_db , ' blurblog_permalink ' ) :
2012-11-27 11:59:54 -08:00
story [ ' blurblog_permalink ' ] = story_db . blurblog_permalink ( )
2011-05-08 19:41:50 -04:00
if text :
2020-06-30 17:22:47 -04:00
soup = BeautifulSoup ( story [ ' story_content ' ] , features = " lxml " )
2011-05-08 19:41:50 -04:00
text = ' ' . join ( soup . findAll ( text = True ) )
2011-05-08 20:21:09 -04:00
text = re . sub ( r ' \ n+ ' , ' \n \n ' , text )
2011-05-08 19:41:50 -04:00
text = re . sub ( r ' \ t+ ' , ' \t ' , text )
story [ ' text ' ] = text
2011-11-24 15:19:53 -05:00
2011-05-08 19:41:50 -04:00
return story
2012-01-09 13:55:26 -08:00
2019-01-19 15:37:20 -05:00
@classmethod
def secure_image_urls ( cls , urls ) :
2019-12-25 18:13:29 -05:00
signed_urls = [ create_imageproxy_signed_url ( settings . IMAGES_URL ,
settings . IMAGES_SECRET_KEY ,
url ) for url in urls ]
2020-06-19 02:35:30 -04:00
return dict ( zip ( urls , signed_urls ) )
2019-12-25 18:13:29 -05:00
@classmethod
2020-01-23 20:34:32 -05:00
def secure_image_thumbnails ( cls , urls , size = 192 ) :
2019-12-25 18:13:29 -05:00
signed_urls = [ create_imageproxy_signed_url ( settings . IMAGES_URL ,
settings . IMAGES_SECRET_KEY ,
url ,
size ) for url in urls ]
2020-06-19 02:35:30 -04:00
return dict ( zip ( urls , signed_urls ) )
2019-01-19 15:37:20 -05:00
2010-01-04 22:26:53 +00:00
def get_tags ( self , entry ) :
fcat = [ ]
2020-06-15 02:54:37 -04:00
if ' tags ' in entry :
2010-01-04 22:26:53 +00:00
for tcat in entry . tags :
2012-07-30 06:32:34 -07:00
term = None
2011-02-15 21:08:40 -05:00
if hasattr ( tcat , ' label ' ) and tcat . label :
2010-01-04 22:26:53 +00:00
term = tcat . label
2012-07-25 19:11:59 -07:00
elif hasattr ( tcat , ' term ' ) and tcat . term :
2010-01-04 22:26:53 +00:00
term = tcat . term
2019-11-06 16:10:56 -05:00
if not term or " CDATA " in term :
2010-07-06 18:16:41 -04:00
continue
2010-01-04 22:26:53 +00:00
qcat = term . strip ( )
if ' , ' in qcat or ' / ' in qcat :
qcat = qcat . replace ( ' , ' , ' / ' ) . split ( ' / ' )
else :
qcat = [ qcat ]
for zcat in qcat :
tagname = zcat . lower ( )
while '  ' in tagname :
tagname = tagname . replace ( '  ' , ' ' )
tagname = tagname . strip ( )
if not tagname or tagname == ' ' :
continue
2010-08-21 20:42:38 -04:00
fcat . append ( tagname )
2012-07-21 16:38:37 -07:00
fcat = [ strip_tags ( t ) [ : 250 ] for t in fcat [ : 12 ] ]
return fcat
2011-12-08 11:19:04 -08:00
2015-02-19 10:39:10 -08:00
@classmethod
def get_permalink ( cls , entry ) :
2011-12-08 11:19:04 -08:00
link = entry . get ( ' link ' )
if not link :
links = entry . get ( ' links ' )
if links :
2011-12-08 14:51:52 -08:00
link = links [ 0 ] . get ( ' href ' )
if not link :
link = entry . get ( ' id ' )
2011-12-08 11:19:04 -08:00
return link
2021-03-25 14:35:14 -04:00
def _exists_story ( self , story , story_content , existing_stories , new_story_hashes , lightweight = False ) :
2009-08-30 00:43:13 +00:00
story_in_system = None
story_has_changed = False
2011-12-14 23:26:07 -08:00
story_link = self . get_permalink ( story )
2020-06-15 02:54:37 -04:00
existing_stories_hashes = list ( existing_stories . keys ( ) )
2014-03-13 16:32:13 -07:00
story_pub_date = story . get ( ' published ' )
2012-12-24 00:10:40 -08:00
# story_published_now = story.get('published_now', False)
# start_date = story_pub_date - datetime.timedelta(hours=8)
# end_date = story_pub_date + datetime.timedelta(hours=8)
2014-10-08 16:43:48 -07:00
2020-06-15 02:54:37 -04:00
for existing_story in list ( existing_stories . values ( ) ) :
2009-08-30 00:43:13 +00:00
content_ratio = 0
2012-12-24 00:10:40 -08:00
# existing_story_pub_date = existing_story.story_date
2014-04-17 12:10:04 -07:00
2020-06-15 02:54:37 -04:00
if isinstance ( existing_story . id , str ) :
2014-04-17 12:10:04 -07:00
# Correcting a MongoDB bug
existing_story . story_guid = existing_story . id
2012-12-24 00:10:40 -08:00
2014-04-17 12:10:04 -07:00
if story . get ( ' story_hash ' ) == existing_story . story_hash :
story_in_system = existing_story
elif ( story . get ( ' story_hash ' ) in existing_stories_hashes and
story . get ( ' story_hash ' ) != existing_story . story_hash ) :
# Story already exists but is not this one
continue
elif ( existing_story . story_hash in new_story_hashes and
story . get ( ' story_hash ' ) != existing_story . story_hash ) :
# Story coming up later
continue
2012-12-24 00:10:40 -08:00
if ' story_latest_content_z ' in existing_story :
2020-11-06 14:46:40 +07:00
existing_story_content = smart_str ( zlib . decompress ( existing_story . story_latest_content_z ) )
2012-12-24 00:10:40 -08:00
elif ' story_latest_content ' in existing_story :
existing_story_content = existing_story . story_latest_content
elif ' story_content_z ' in existing_story :
2020-11-06 14:46:40 +07:00
existing_story_content = smart_str ( zlib . decompress ( existing_story . story_content_z ) )
2012-12-24 00:10:40 -08:00
elif ' story_content ' in existing_story :
existing_story_content = existing_story . story_content
else :
2020-06-15 02:54:37 -04:00
existing_story_content = ' '
2009-08-30 00:43:13 +00:00
2014-03-13 16:32:13 -07:00
2012-12-24 00:10:40 -08:00
# Title distance + content distance, checking if story changed
story_title_difference = abs ( levenshtein_distance ( story . get ( ' title ' ) ,
existing_story . story_title ) )
2014-03-13 16:32:13 -07:00
title_ratio = difflib . SequenceMatcher ( None , story . get ( ' title ' , " " ) ,
existing_story . story_title ) . ratio ( )
if title_ratio < .75 : continue
story_timedelta = existing_story . story_date - story_pub_date
2021-05-05 14:56:08 -04:00
# logging.debug('Story pub date: %s %s (%s, %s)' % (existing_story.story_date, story_pub_date, title_ratio, story_timedelta))
if abs ( story_timedelta . days ) > = 2 : continue
2014-03-13 16:32:13 -07:00
2012-12-24 00:10:40 -08:00
seq = difflib . SequenceMatcher ( None , story_content , existing_story_content )
2014-10-03 14:59:26 -07:00
similar_length_min = 1000
2014-10-08 16:43:48 -07:00
if ( existing_story . story_permalink == story_link and
existing_story . story_title == story . get ( ' title ' ) ) :
2014-10-03 14:59:26 -07:00
similar_length_min = 20
2021-03-25 14:35:14 -04:00
# Skip content check if already failed due to a timeout. This way we catch titles
if lightweight : continue
2012-12-24 00:10:40 -08:00
if ( seq
and story_content
2014-10-03 14:59:26 -07:00
and len ( story_content ) > similar_length_min
2012-12-24 00:10:40 -08:00
and existing_story_content
and seq . real_quick_ratio ( ) > .9
and seq . quick_ratio ( ) > .95 ) :
content_ratio = seq . ratio ( )
2021-05-05 14:56:08 -04:00
2012-12-24 00:10:40 -08:00
if story_title_difference > 0 and content_ratio > .98 :
story_in_system = existing_story
if story_title_difference > 0 or content_ratio < 1.0 :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Title difference - %s / %s ( %s ): %s " % ( story . get ( ' title ' ) , existing_story . story_title , story_title_difference , content_ratio ) )
2009-08-30 00:43:13 +00:00
story_has_changed = True
break
2012-12-24 00:10:40 -08:00
# More restrictive content distance, still no story match
if not story_in_system and content_ratio > .98 :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Content difference - %s / %s ( %s ): %s " % ( story . get ( ' title ' ) , existing_story . story_title , story_title_difference , content_ratio ) )
2012-12-24 00:10:40 -08:00
story_in_system = existing_story
story_has_changed = True
break
if story_in_system and not story_has_changed :
if story_content != existing_story_content :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Content difference - %s ( %s )/ %s ( %s ) " % ( story . get ( ' title ' ) , len ( story_content ) , existing_story . story_title , len ( existing_story_content ) ) )
2012-12-24 00:10:40 -08:00
story_has_changed = True
if story_link != existing_story . story_permalink :
2014-03-13 15:39:49 -07:00
if settings . DEBUG :
2015-08-24 14:26:49 -07:00
logging . debug ( " ---> Permalink difference - %s / %s " % ( story_link , existing_story . story_permalink ) )
2012-12-24 00:10:40 -08:00
story_has_changed = True
# if story_pub_date != existing_story.story_date:
# story_has_changed = True
break
2011-12-08 11:19:04 -08:00
2010-01-28 13:28:27 -05:00
2010-02-02 18:01:02 -05:00
# if story_has_changed or not story_in_system:
2012-07-22 12:25:09 -07:00
# print 'New/updated story: %s' % (story),
2009-08-30 00:43:13 +00:00
return story_in_system , story_has_changed
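# Rough sketch of the similarity thresholds used above (illustrative only; the names
# story_content / existing_story_content mirror the locals in this method):
#
# import difflib
# title_ratio = difflib.SequenceMatcher(None, "New headline", "Old headline").ratio()
# if title_ratio < .75:
#     pass  # titles too dissimilar -- treated as a different story
# seq = difflib.SequenceMatcher(None, story_content, existing_story_content)
# if seq.real_quick_ratio() > .9 and seq.quick_ratio() > .95 and seq.ratio() > .98:
#     pass  # near-identical content -- same story, possibly updated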
2013-08-06 13:54:06 -07:00
2022-01-10 17:00:27 -05:00
def get_next_scheduled_update ( self , force = False , verbose = True , premium_speed = False , pro_speed = False ) :
2013-09-11 10:25:46 -07:00
if self . min_to_decay and not force and not premium_speed :
2013-04-18 16:56:54 -07:00
return self . min_to_decay
2013-04-23 15:44:31 -07:00
2021-01-23 14:20:19 -05:00
from apps . notifications . models import MUserFeedNotification
2022-01-10 17:00:27 -05:00
2013-09-11 10:25:46 -07:00
if premium_speed :
self . active_premium_subscribers + = 1
2015-07-29 16:34:48 -07:00
spd = self . stories_last_month / 30.0
2013-04-23 16:03:45 -07:00
subs = ( self . active_premium_subscribers +
( ( self . active_subscribers - self . active_premium_subscribers ) / 10.0 ) )
2021-01-23 14:20:19 -05:00
notification_count = MUserFeedNotification . objects . filter ( feed_id = self . pk ) . count ( )
2015-07-29 16:34:48 -07:00
# Calculate sub counts:
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 10 AND stories_last_month >= 30;
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 1 AND active_premium_subscribers < 10 AND stories_last_month >= 30;
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers = 1 AND stories_last_month >= 30;
# SpD > 1 Subs > 10: t = 6 # 4267 * 1440/6 = 1024080
# SpD > 1 Subs > 1: t = 15 # 18973 * 1440/15 = 1821408
# SpD > 1 Subs = 1: t = 60 # 65503 * 1440/60 = 1572072
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 1 AND stories_last_month < 30 AND stories_last_month > 0;
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers = 1 AND stories_last_month < 30 AND stories_last_month > 0;
# SpD < 1 Subs > 1: t = 60 # 77618 * 1440/60 = 1862832
# SpD < 1 Subs = 1: t = 60 * 12 # 282186 * 1440/(60*12) = 564372
# SELECT COUNT(*) FROM feeds WHERE active_premium_subscribers > 1 AND stories_last_month = 0;
# SELECT COUNT(*) FROM feeds WHERE active_subscribers > 0 AND active_premium_subscribers <= 1 AND stories_last_month = 0;
# SpD = 0 Subs > 1: t = 60 * 3 # 30158 * 1440/(60*3) = 241264
# SpD = 0 Subs = 1: t = 60 * 24 # 514131 * 1440/(60*24) = 514131
if spd > = 1 :
2018-08-02 09:59:43 -04:00
if subs > = 10 :
2015-07-29 16:34:48 -07:00
total = 6
elif subs > 1 :
total = 15
2013-04-23 15:44:31 -07:00
else :
2018-08-02 09:59:43 -04:00
total = 45
2015-07-29 16:34:48 -07:00
elif spd > 0 :
2013-04-23 15:44:31 -07:00
if subs > 1 :
2015-07-29 16:34:48 -07:00
total = 60 - ( spd * 60 )
2013-04-23 15:44:31 -07:00
else :
2021-04-20 18:46:16 -04:00
total = 60 * 6 - ( spd * 60 * 6 )
2015-07-29 16:34:48 -07:00
elif spd == 0 :
2013-04-23 15:44:31 -07:00
if subs > 1 :
2013-04-23 17:04:21 -07:00
total = 60 * 6
2016-01-08 10:31:49 -08:00
elif subs == 1 :
total = 60 * 12
2013-04-23 15:44:31 -07:00
else :
total = 60 * 24
2013-07-02 10:36:16 -04:00
months_since_last_story = seconds_timesince ( self . last_story_date ) / ( 60 * 60 * 24 * 30 )
2013-04-23 17:04:21 -07:00
total * = max ( 1 , months_since_last_story )
2013-04-23 15:44:31 -07:00
# updates_per_day_delay = 3 * 60 / max(.25, ((max(0, self.active_subscribers)**.2)
# * (self.stories_last_month**0.25)))
# if self.active_premium_subscribers > 0:
# updates_per_day_delay /= min(self.active_subscribers+self.active_premium_subscribers, 4)
# updates_per_day_delay = int(updates_per_day_delay)
2013-03-28 12:17:30 -07:00
2010-07-02 15:49:08 -04:00
# Lots of subscribers = lots of updates
2011-04-02 00:17:59 -04:00
# 24 hours for 0 subscribers.
# 4 hours for 1 subscriber.
# .5 hours for 2 subscribers.
# .25 hours for 3 subscribers.
# 1 min for 10 subscribers.
2013-04-23 15:44:31 -07:00
# subscriber_bonus = 6 * 60 / max(.167, max(0, self.active_subscribers)**3)
# if self.premium_subscribers > 0:
# subscriber_bonus /= min(self.active_subscribers+self.premium_subscribers, 5)
# subscriber_bonus = int(subscriber_bonus)
2013-02-07 15:30:35 -08:00
2012-03-28 15:49:21 -07:00
if self . is_push :
2013-08-14 18:01:12 -07:00
fetch_history = MFetchHistory . feed ( self . pk )
if len ( fetch_history [ ' push_history ' ] ) :
total = total * 12
2012-12-21 16:48:47 -08:00
2021-01-23 14:20:19 -05:00
# Any notifications mean the fetch interval is capped at 30 min
if notification_count > 0 :
total = min ( total , 30 )
2020-04-06 09:42:11 -04:00
# 4 hour max for premiums, 48 hour max for free
2016-01-08 10:35:34 -08:00
if subs > = 1 :
2019-06-07 14:45:34 -04:00
total = min ( total , 60 * 4 * 1 )
2016-01-08 10:35:34 -08:00
else :
total = min ( total , 60 * 24 * 2 )
2020-04-06 09:43:18 -04:00
# Craigslist feeds get 6 hours minimum
if ' craigslist ' in self . feed_address :
total = max ( total , 60 * 6 )
2021-08-24 17:25:31 -04:00
# Twitter feeds get 2 hours minimum
if ' twitter ' in self . feed_address :
total = max ( total , 60 * 2 )
2022-01-10 17:00:27 -05:00
# Pro subscribers get absolute minimum
if pro_speed or self . pro_subscribers > = 1 :
total = min ( total , 5 )
2012-01-09 19:08:22 -08:00
if verbose :
2022-01-10 17:00:27 -05:00
logging . debug ( " ---> [ %-30s ] Fetched every %s min - Subs: %s / %s / %s / %s Stories/day: %s " % (
2017-03-31 19:52:24 -07:00
self . log_title [ : 30 ] , total ,
2013-03-28 12:17:30 -07:00
self . num_subscribers ,
2013-04-23 15:44:31 -07:00
self . active_subscribers ,
2013-03-28 12:17:30 -07:00
self . active_premium_subscribers ,
2022-01-10 17:00:27 -05:00
self . pro_subscribers ,
2015-07-29 16:34:48 -07:00
spd ) )
2013-04-18 16:56:54 -07:00
return total
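# Worked example of the schedule above (illustrative numbers only): a feed with
# stories_last_month=60 and 5 active premium subscribers gives spd = 60 / 30.0 = 2.0
# and subs = 5, so the base interval is 15 minutes; if the feed is push-verified with
# push history it stretches to 15 * 12 = 180 minutes, and a single notification
# subscriber caps it back down to min(total, 30) = 30 minutes.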
2010-07-25 23:13:27 -04:00
2013-03-28 12:17:30 -07:00
def set_next_scheduled_update ( self , verbose = False , skip_scheduling = False ) :
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2013-04-18 16:56:54 -07:00
total = self . get_next_scheduled_update ( force = True , verbose = verbose )
2013-04-08 10:50:50 -07:00
error_count = self . error_count
2010-12-23 13:29:31 -05:00
2013-04-08 10:50:50 -07:00
if error_count :
total = total * error_count
2013-08-15 12:20:37 -07:00
total = min ( total , 60 * 24 * 7 )
2012-12-25 12:08:17 -08:00
if verbose :
logging . debug ( ' ---> [ %-30s ] ~FBScheduling feed fetch geometrically: '
' ~SB %s errors. Time: %s min ' % (
2017-03-31 19:52:24 -07:00
self . log_title [ : 30 ] , self . errors_since_good , total ) )
2013-04-18 16:56:54 -07:00
2021-02-25 19:34:21 -05:00
random_factor = random . randint ( 0 , int ( total ) ) / 4
2010-10-10 23:55:00 -04:00
next_scheduled_update = datetime . datetime . utcnow ( ) + datetime . timedelta (
2010-09-07 14:02:48 -07:00
minutes = total + random_factor )
2015-07-22 15:08:15 -07:00
original_min_to_decay = self . min_to_decay
2010-12-23 13:29:31 -05:00
self . min_to_decay = total
2015-07-22 15:08:15 -07:00
2013-04-23 13:46:07 -07:00
delta = self . next_scheduled_update - datetime . datetime . now ( )
2014-05-18 10:42:35 +08:00
minutes_to_next_fetch = ( delta . seconds + ( delta . days * 24 * 3600 ) ) / 60
2013-04-23 13:46:07 -07:00
if minutes_to_next_fetch > self . min_to_decay or not skip_scheduling :
2013-03-28 12:17:30 -07:00
self . next_scheduled_update = next_scheduled_update
2013-04-18 16:47:55 -07:00
if self . active_subscribers > = 1 :
2020-06-20 00:08:18 -04:00
r . zadd ( ' scheduled_updates ' , { self . pk : self . next_scheduled_update . strftime ( ' %s ' ) } )
2013-04-03 17:22:45 -07:00
r . zrem ( ' tasked_feeds ' , self . pk )
2013-04-08 10:50:50 -07:00
r . srem ( ' queued_feeds ' , self . pk )
2013-03-30 19:05:13 -07:00
2015-07-22 15:08:15 -07:00
updated_fields = [ ' last_update ' , ' next_scheduled_update ' ]
if self . min_to_decay != original_min_to_decay :
updated_fields . append ( ' min_to_decay ' )
self . save ( update_fields = updated_fields )
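# Sketch of the error backoff above (illustrative): with a base interval of 60 minutes
# and error_count = 4, total becomes 60 * 4 = 240 minutes, capped at a week
# (60 * 24 * 7), then jittered by a random factor of up to total / 4 minutes before
# next_scheduled_update is written to the scheduled_updates sorted set in Redis.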
2013-04-08 10:50:50 -07:00
@property
def error_count ( self ) :
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2013-04-08 10:50:50 -07:00
fetch_errors = int ( r . zscore ( ' error_feeds ' , self . pk ) or 0 )
return fetch_errors + self . errors_since_good
2013-01-02 12:27:08 -08:00
def schedule_feed_fetch_immediately ( self , verbose = True ) :
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2016-02-09 16:34:59 -08:00
if not self . num_subscribers :
2017-03-31 19:52:24 -07:00
logging . debug ( ' ---> [ %-30s ] Not scheduling feed fetch immediately, no subs. ' % ( self . log_title [ : 30 ] ) )
2016-02-09 16:34:59 -08:00
return
2013-01-02 12:27:08 -08:00
if verbose :
2017-03-31 19:52:24 -07:00
logging . debug ( ' ---> [ %-30s ] Scheduling feed fetch immediately... ' % ( self . log_title [ : 30 ] ) )
2013-01-02 12:27:08 -08:00
2010-10-10 23:55:00 -04:00
self . next_scheduled_update = datetime . datetime . utcnow ( )
2020-06-20 00:08:18 -04:00
r . zadd ( ' scheduled_updates ' , { self . pk : self . next_scheduled_update . strftime ( ' %s ' ) } )
2010-08-25 19:10:55 -04:00
2012-03-27 17:34:39 -07:00
return self . save ( )
2010-07-27 23:29:04 -04:00
2012-03-27 18:37:04 -07:00
def setup_push ( self ) :
from apps . push . models import PushSubscription
2012-03-28 16:49:15 -07:00
try :
push = self . push
except PushSubscription . DoesNotExist :
self . is_push = False
else :
2012-03-27 18:37:04 -07:00
self . is_push = push . verified
2010-11-05 20:34:17 -04:00
self . save ( )
2012-03-28 15:49:21 -07:00
2015-02-19 13:41:09 -08:00
def queue_pushed_feed_xml ( self , xml , latest_push_date_delta = None ) :
2015-07-27 18:35:25 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_FEED_UPDATE_POOL )
2012-12-26 02:41:13 -08:00
queue_size = r . llen ( " push_feeds " )
2010-07-27 23:29:04 -04:00
2015-02-19 14:20:08 -08:00
if latest_push_date_delta :
2015-02-19 14:21:08 -08:00
latest_push_date_delta = " %s " % str ( latest_push_date_delta ) . split ( ' . ' , 2 ) [ 0 ]
2015-02-19 14:20:08 -08:00
2012-12-26 02:41:13 -08:00
if queue_size > 1000 :
self . schedule_feed_fetch_immediately ( )
else :
2017-03-31 19:52:24 -07:00
logging . debug ( ' ---> [ %-30s ] [ %s ] ~FB~SBQueuing pushed stories, last pushed %s ... ' % ( self . log_title [ : 30 ] , self . pk , latest_push_date_delta ) )
2012-12-26 02:41:13 -08:00
self . set_next_scheduled_update ( )
2021-01-06 14:42:24 -05:00
PushFeeds . apply_async ( args = ( self . pk , xml ) , queue = ' push_feeds ' )
2012-12-26 02:41:13 -08:00
2011-09-19 08:56:16 -07:00
# def calculate_collocations_story_content(self,
# collocation_measures=TrigramAssocMeasures,
# collocation_finder=TrigramCollocationFinder):
# stories = MStory.objects.filter(story_feed_id=self.pk)
# story_content = ' '.join([s.story_content for s in stories if s.story_content])
# return self.calculate_collocations(story_content, collocation_measures, collocation_finder)
#
# def calculate_collocations_story_title(self,
# collocation_measures=BigramAssocMeasures,
# collocation_finder=BigramCollocationFinder):
# stories = MStory.objects.filter(story_feed_id=self.pk)
# story_titles = ' '.join([s.story_title for s in stories if s.story_title])
# return self.calculate_collocations(story_titles, collocation_measures, collocation_finder)
#
# def calculate_collocations(self, content,
# collocation_measures=TrigramAssocMeasures,
# collocation_finder=TrigramCollocationFinder):
# content = re.sub(r'’', '\'', content)
# content = re.sub(r'&', '&', content)
# try:
# content = unicode(BeautifulStoneSoup(content,
# convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# except ValueError, e:
# print "ValueError, ignoring: %s" % e
# content = re.sub(r'</?\w+\s+[^>]*>', '', content)
# content = re.split(r"[^A-Za-z-'&]+", content)
#
# finder = collocation_finder.from_words(content)
# finder.apply_freq_filter(3)
# best = finder.nbest(collocation_measures.pmi, 10)
# phrases = [' '.join(phrase) for phrase in best]
#
# return phrases
2010-07-27 22:11:23 -04:00
2017-05-06 19:38:36 -07:00
2010-07-27 22:37:52 -04:00
# class FeedCollocations(models.Model):
# feed = models.ForeignKey(Feed)
# phrase = models.CharField(max_length=500)
2009-06-16 03:08:55 +00:00
2011-01-17 20:23:29 -05:00
class FeedData ( models . Model ) :
2020-06-17 00:17:32 -04:00
feed = AutoOneToOneField ( Feed , related_name = ' data ' , on_delete = models . CASCADE )
2011-02-06 15:43:13 -05:00
feed_tagline = models . CharField ( max_length = 1024 , blank = True , null = True )
2011-01-17 20:23:29 -05:00
story_count_history = models . TextField ( blank = True , null = True )
2011-04-07 17:00:28 -04:00
feed_classifier_counts = models . TextField ( blank = True , null = True )
2011-01-17 20:23:29 -05:00
popular_tags = models . CharField ( max_length = 1024 , blank = True , null = True )
popular_authors = models . CharField ( max_length = 2048 , blank = True , null = True )
2009-06-16 03:08:55 +00:00
2011-01-17 22:48:38 -05:00
def save ( self , * args , * * kwargs ) :
2011-02-05 22:15:03 -05:00
if self . feed_tagline and len ( self . feed_tagline ) > = 1000 :
self . feed_tagline = self . feed_tagline [ : 1000 ]
2009-12-18 20:47:44 +00:00
2011-01-21 20:29:19 -05:00
try :
super ( FeedData , self ) . save ( * args , * * kwargs )
except ( IntegrityError , OperationError ) :
2011-02-05 22:09:31 -05:00
if hasattr ( self , ' id ' ) and self . id : self . delete ( )
2020-06-15 02:54:37 -04:00
except DatabaseError as e :
2016-05-20 16:22:47 -07:00
# Nothing updated
2016-05-20 16:40:46 -07:00
logging . debug ( " ---> ~FRNothing updated in FeedData ( %s ): %s " % ( self . feed , e ) )
2016-05-20 16:22:47 -07:00
pass
2010-05-20 15:13:25 -04:00
2011-01-27 19:05:50 -05:00
2011-04-21 23:10:43 -04:00
class MFeedIcon ( mongo . Document ) :
2012-03-29 16:03:06 -07:00
feed_id = mongo . IntField ( primary_key = True )
color = mongo . StringField ( max_length = 6 )
data = mongo . StringField ( )
icon_url = mongo . StringField ( )
not_found = mongo . BooleanField ( default = False )
2011-04-21 23:10:43 -04:00
meta = {
' collection ' : ' feed_icons ' ,
' allow_inheritance ' : False ,
}
2014-05-16 12:18:59 -07:00
@classmethod
def get_feed ( cls , feed_id , create = True ) :
try :
feed_icon = cls . objects . read_preference ( pymongo . ReadPreference . PRIMARY ) \
. get ( feed_id = feed_id )
except cls . DoesNotExist :
if create :
feed_icon = cls . objects . create ( feed_id = feed_id )
else :
feed_icon = None
return feed_icon
2011-04-21 23:10:43 -04:00
def save ( self , * args , * * kwargs ) :
if self . icon_url :
2020-06-15 02:54:37 -04:00
self . icon_url = str ( self . icon_url )
2011-04-21 23:10:43 -04:00
try :
2013-05-29 19:37:50 -07:00
return super ( MFeedIcon , self ) . save ( * args , * * kwargs )
2011-04-21 23:10:43 -04:00
except ( IntegrityError , OperationError ) :
# print "Error on Icon: %s" % e
if hasattr ( self , ' _id ' ) : self . delete ( )
2010-08-27 18:35:33 -04:00
class MFeedPage ( mongo . Document ) :
feed_id = mongo . IntField ( primary_key = True )
2010-08-29 12:35:09 -04:00
page_data = mongo . BinaryField ( )
2010-08-27 18:35:33 -04:00
meta = {
2010-08-29 12:35:09 -04:00
' collection ' : ' feed_pages ' ,
2010-08-27 18:35:33 -04:00
' allow_inheritance ' : False ,
}
2016-02-05 14:43:31 -08:00
def page ( self ) :
2021-04-02 14:46:52 -04:00
try :
return zlib . decompress ( self . page_data )
except zlib . error as e :
logging . debug ( " ***> Zlib decompress error: %s " % e )
self . page_data = None
self . save ( )
return
2016-02-05 14:43:31 -08:00
2011-01-29 19:16:40 -05:00
@classmethod
def get_data ( cls , feed_id ) :
data = None
feed_page = cls . objects ( feed_id = feed_id )
if feed_page :
2012-04-24 17:40:34 -07:00
page_data_z = feed_page [ 0 ] . page_data
if page_data_z :
2021-04-02 14:46:52 -04:00
try :
data = zlib . decompress ( page_data_z )
except zlib . error as e :
logging . debug ( " ***> Zlib decompress error: %s " % e )
feed_page [ 0 ] . page_data = None
feed_page [ 0 ] . save ( )
return
2011-01-29 19:16:40 -05:00
if not data :
dupe_feed = DuplicateFeed . objects . filter ( duplicate_feed_id = feed_id )
if dupe_feed :
feed = dupe_feed [ 0 ] . feed
feed_page = MFeedPage . objects . filter ( feed_id = feed . pk )
if feed_page :
2012-04-24 17:40:34 -07:00
page_data_z = feed_page [ 0 ] . page_data
if page_data_z :
data = zlib . decompress ( feed_page [ 0 ] . page_data )
2012-03-29 14:45:19 -07:00
2011-01-29 19:16:40 -05:00
return data
2010-05-20 15:13:25 -04:00
2010-08-21 13:57:39 -04:00
class MStory ( mongo . Document ) :
''' A feed item '''
2013-02-20 16:08:14 -08:00
story_feed_id = mongo . IntField ( )
2010-11-30 10:30:18 -05:00
story_date = mongo . DateTimeField ( )
story_title = mongo . StringField ( max_length = 1024 )
story_content = mongo . StringField ( )
story_content_z = mongo . BinaryField ( )
story_original_content = mongo . StringField ( )
2010-08-29 13:23:50 -04:00
story_original_content_z = mongo . BinaryField ( )
2012-07-22 12:25:09 -07:00
story_latest_content = mongo . StringField ( )
story_latest_content_z = mongo . BinaryField ( )
2013-01-08 18:33:30 -08:00
original_text_z = mongo . BinaryField ( )
2014-10-29 16:16:50 -07:00
original_page_z = mongo . BinaryField ( )
2010-11-30 10:30:18 -05:00
story_content_type = mongo . StringField ( max_length = 255 )
story_author_name = mongo . StringField ( )
story_permalink = mongo . StringField ( )
story_guid = mongo . StringField ( )
2013-01-08 14:11:59 -08:00
story_hash = mongo . StringField ( )
2013-06-26 11:38:49 -07:00
image_urls = mongo . ListField ( mongo . StringField ( max_length = 1024 ) )
2010-11-30 10:30:18 -05:00
story_tags = mongo . ListField ( mongo . StringField ( max_length = 250 ) )
2012-01-09 13:55:26 -08:00
comment_count = mongo . IntField ( )
comment_user_ids = mongo . ListField ( mongo . IntField ( ) )
share_count = mongo . IntField ( )
share_user_ids = mongo . ListField ( mongo . IntField ( ) )
2010-11-30 10:30:18 -05:00
2010-08-21 13:57:39 -04:00
meta = {
' collection ' : ' stories ' ,
2013-02-20 15:42:40 -08:00
' indexes ' : [ ( ' story_feed_id ' , ' -story_date ' ) ,
{ ' fields ' : [ ' story_hash ' ] ,
2019-12-25 16:06:34 -05:00
' unique ' : True ,
2020-06-20 00:05:32 -04:00
} ] ,
2010-08-21 20:42:38 -04:00
' ordering ' : [ ' -story_date ' ] ,
' allow_inheritance ' : False ,
2012-09-17 17:01:56 -07:00
' cascade ' : False ,
2016-11-10 17:56:08 -08:00
' strict ' : False ,
2010-08-21 13:57:39 -04:00
}
2010-08-29 13:23:50 -04:00
2013-06-04 15:34:03 -07:00
RE_STORY_HASH = re . compile ( r " ^( \ d { 1,10}):( \ w {6} )$ " )
RE_RS_KEY = re . compile ( r " ^RS:( \ d+):( \ d+)$ " )
2021-03-25 14:35:14 -04:00
def __str__ ( self ) :
return f " { self . story_hash } : { self . story_title [ : 20 ] } ( { len ( self . story_content_z ) } bytes) "
2012-01-09 13:55:26 -08:00
@property
def guid_hash ( self ) :
2020-06-20 00:27:01 -04:00
return hashlib . sha1 ( ( self . story_guid ) . encode ( encoding = ' utf-8 ' ) ) . hexdigest ( ) [ : 6 ]
2013-01-08 14:11:59 -08:00
2014-04-17 12:10:04 -07:00
@classmethod
def guid_hash_unsaved ( cls , guid ) :
2020-06-20 00:27:01 -04:00
return hashlib . sha1 ( guid . encode ( encoding = ' utf-8 ' ) ) . hexdigest ( ) [ : 6 ]
2014-04-17 12:10:04 -07:00
2013-01-08 14:11:59 -08:00
@property
def feed_guid_hash ( self ) :
2013-02-20 15:42:40 -08:00
return " %s : %s " % ( self . story_feed_id , self . guid_hash )
2014-04-17 12:10:04 -07:00
@classmethod
def feed_guid_hash_unsaved ( cls , feed_id , guid ) :
return " %s : %s " % ( feed_id , cls . guid_hash_unsaved ( guid ) )
2013-09-10 11:59:31 -07:00
@property
def decoded_story_title ( self ) :
2021-03-02 09:46:52 -05:00
return html . unescape ( self . story_title )
2013-09-10 11:59:31 -07:00
2010-08-29 13:23:50 -04:00
def save ( self , * args , * * kwargs ) :
2011-02-15 21:08:40 -05:00
story_title_max = MStory . _fields [ ' story_title ' ] . max_length
story_content_type_max = MStory . _fields [ ' story_content_type ' ] . max_length
2013-02-20 15:42:40 -08:00
self . story_hash = self . feed_guid_hash
2019-02-04 11:49:01 -05:00
self . extract_image_urls ( )
2010-08-29 13:23:50 -04:00
if self . story_content :
2020-06-15 02:54:37 -04:00
self . story_content_z = zlib . compress ( smart_bytes ( self . story_content ) )
2010-08-29 13:23:50 -04:00
self . story_content = None
if self . story_original_content :
2020-06-15 02:54:37 -04:00
self . story_original_content_z = zlib . compress ( smart_bytes ( self . story_original_content ) )
2010-08-29 13:23:50 -04:00
self . story_original_content = None
2012-07-22 12:25:09 -07:00
if self . story_latest_content :
2020-06-15 02:54:37 -04:00
self . story_latest_content_z = zlib . compress ( smart_bytes ( self . story_latest_content ) )
2012-07-22 12:25:09 -07:00
self . story_latest_content = None
2011-02-15 21:16:34 -05:00
if self . story_title and len ( self . story_title ) > story_title_max :
2011-02-15 21:08:40 -05:00
self . story_title = self . story_title [ : story_title_max ]
2011-02-15 21:16:34 -05:00
if self . story_content_type and len ( self . story_content_type ) > story_content_type_max :
2011-02-15 21:08:40 -05:00
self . story_content_type = self . story_content_type [ : story_content_type_max ]
2013-01-08 14:11:59 -08:00
2010-08-29 13:23:50 -04:00
super ( MStory , self ) . save ( * args , * * kwargs )
2012-07-25 17:55:23 -07:00
self . sync_redis ( )
2013-05-29 19:37:50 -07:00
return self
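# Round-trip sketch of the compression in save() above (illustrative; only needs
# zlib plus Django's smart_bytes/smart_str):
#
# compressed = zlib.compress(smart_bytes("<p>story body</p>"))
# assert smart_str(zlib.decompress(compressed)) == "<p>story body</p>"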
2012-01-09 13:55:26 -08:00
2012-07-16 20:49:43 -07:00
def delete ( self , * args , * * kwargs ) :
self . remove_from_redis ( )
2014-04-15 14:59:00 -07:00
self . remove_from_search_index ( )
2012-07-16 20:49:43 -07:00
super ( MStory , self ) . delete ( * args , * * kwargs )
2015-12-26 12:53:32 -08:00
2016-12-13 16:29:42 -08:00
def publish_to_subscribers ( self ) :
try :
r = redis . Redis ( connection_pool = settings . REDIS_PUBSUB_POOL )
r . publish ( " %s :story " % ( self . story_feed_id ) , ' %s , %s ' % ( self . story_hash , self . story_date . strftime ( ' %s ' ) ) )
except redis . ConnectionError :
logging . debug ( " ***> [ %-30s ] ~BMRedis is unavailable for real-time. " % ( Feed . get_by_id ( self . story_feed_id ) . title [ : 30 ] , ) )
2015-12-26 12:53:32 -08:00
@classmethod
def purge_feed_stories ( cls , feed , cutoff , verbose = True ) :
stories = cls . objects ( story_feed_id = feed . pk )
logging . debug ( " ---> Deleting %s stories from %s " % ( stories . count ( ) , feed ) )
if stories . count ( ) > cutoff * 1.25 :
logging . debug ( " ***> ~FRToo many stories in %s , not purging... " % ( feed ) )
return
stories . delete ( )
2014-04-11 17:25:13 -07:00
@classmethod
def index_all_for_search ( cls , offset = 0 ) :
2014-04-15 14:17:15 -07:00
if not offset :
2014-04-16 15:31:44 -07:00
SearchStory . create_elasticsearch_mapping ( delete = True )
2014-04-11 17:25:13 -07:00
last_pk = Feed . objects . latest ( ' pk ' ) . pk
2020-06-15 02:54:37 -04:00
for f in range ( offset , last_pk , 1000 ) :
print ( " ---> %s / %s ( %.2s %% ) " % ( f , last_pk , float ( f ) / last_pk * 100 ) )
feeds = Feed . objects . filter ( pk__in = list ( range ( f , f + 1000 ) ) ,
2014-04-11 17:25:13 -07:00
active = True ,
active_subscribers__gte = 1 ) \
. values_list ( ' pk ' )
2021-04-27 20:49:10 -04:00
for f , in feeds :
stories = cls . objects . filter ( story_feed_id = f )
if not len ( stories ) :
continue
print ( f " Indexing { len ( stories ) } stories in feed { f } " )
2014-04-11 17:25:13 -07:00
for story in stories :
2014-04-22 12:00:20 -07:00
story . index_story_for_search ( )
2014-04-11 17:25:13 -07:00
2014-04-22 12:00:20 -07:00
def index_story_for_search ( self ) :
2014-04-23 14:25:07 -07:00
story_content = self . story_content or " "
2014-04-15 16:52:25 -07:00
if self . story_content_z :
story_content = zlib . decompress ( self . story_content_z )
2014-04-11 17:25:13 -07:00
SearchStory . index ( story_hash = self . story_hash ,
2014-04-11 15:40:58 -07:00
story_title = self . story_title ,
2014-04-15 14:17:15 -07:00
story_content = prep_for_search ( story_content ) ,
2014-04-22 15:15:42 -07:00
story_tags = self . story_tags ,
2014-04-11 15:40:58 -07:00
story_author = self . story_author_name ,
2014-04-15 14:17:15 -07:00
story_feed_id = self . story_feed_id ,
2014-04-11 17:25:13 -07:00
story_date = self . story_date )
2012-01-09 13:55:26 -08:00
2014-04-15 14:59:00 -07:00
def remove_from_search_index ( self ) :
2014-04-22 12:00:20 -07:00
try :
SearchStory . remove ( self . story_hash )
2020-06-15 02:54:37 -04:00
except Exception :
2014-04-22 12:39:09 -07:00
pass
2016-11-15 20:45:59 -08:00
2013-06-03 17:36:57 -07:00
@classmethod
2013-06-03 17:48:11 -07:00
def trim_feed ( cls , cutoff , feed_id = None , feed = None , verbose = True ) :
2013-08-12 16:48:16 -07:00
extra_stories_count = 0
2021-02-25 19:54:00 -05:00
cutoff = int ( cutoff )
2013-06-03 17:48:11 -07:00
if not feed_id and not feed :
2013-08-12 16:48:16 -07:00
return extra_stories_count
2013-06-03 17:36:57 -07:00
2013-06-03 17:48:11 -07:00
if not feed_id :
feed_id = feed . pk
if not feed :
feed = feed_id
2013-06-03 17:36:57 -07:00
stories = cls . objects (
2014-04-03 16:25:18 -07:00
story_feed_id = feed_id
2014-04-02 12:10:34 -07:00
) . only ( ' story_date ' ) . order_by ( ' -story_date ' )
2016-01-19 11:28:27 -08:00
2013-06-03 17:36:57 -07:00
if stories . count ( ) > cutoff :
2013-08-05 10:23:22 -07:00
logging . debug ( ' ---> [ %-30s ] ~FMFound %s stories. Trimming to ~SB %s ~SN... ' %
2020-06-15 02:54:37 -04:00
( str ( feed ) [ : 30 ] , stories . count ( ) , cutoff ) )
2013-06-03 17:36:57 -07:00
try :
story_trim_date = stories [ cutoff ] . story_date
2017-06-28 17:19:54 -07:00
if story_trim_date == stories [ 0 ] . story_date :
# Handle case where every story is the same time
story_trim_date = story_trim_date - datetime . timedelta ( seconds = 1 )
2020-06-15 02:54:37 -04:00
except IndexError as e :
logging . debug ( ' ***> [ %-30s ] ~BRError trimming feed: %s ' % ( str ( feed ) [ : 30 ] , e ) )
2013-08-12 16:48:16 -07:00
return extra_stories_count
2013-06-03 17:36:57 -07:00
2014-04-03 16:25:18 -07:00
extra_stories = cls . objects ( story_feed_id = feed_id ,
story_date__lte = story_trim_date )
2013-06-03 17:36:57 -07:00
extra_stories_count = extra_stories . count ( )
2014-04-03 16:25:18 -07:00
shared_story_count = 0
2013-06-03 17:36:57 -07:00
for story in extra_stories :
2014-04-03 16:25:18 -07:00
if story . share_count :
shared_story_count + = 1
2016-01-19 11:30:13 -08:00
extra_stories_count - = 1
2014-04-03 16:25:18 -07:00
continue
2013-06-03 17:36:57 -07:00
story . delete ( )
if verbose :
2014-04-03 16:25:18 -07:00
existing_story_count = cls . objects ( story_feed_id = feed_id ) . count ( )
2014-04-02 12:10:34 -07:00
logging . debug ( " ---> Deleted %s stories, %s ( %s shared) left. " % (
2013-06-03 17:36:57 -07:00
extra_stories_count ,
2014-04-02 12:10:34 -07:00
existing_story_count ,
shared_story_count ) )
2013-08-12 16:48:16 -07:00
return extra_stories_count
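# Sketch of trim_feed above (illustrative): with cutoff=500 and 600 stories in the
# feed, story_trim_date is the date of the 501st-newest story; every story at or
# before that date is deleted unless it has a share_count, and each shared story
# that is kept is subtracted from extra_stories_count before returning.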
2013-06-03 17:36:57 -07:00
2012-07-26 22:12:48 -07:00
@classmethod
2014-03-28 15:11:58 -07:00
def find_story ( cls , story_feed_id = None , story_id = None , story_hash = None , original_only = False ) :
2012-07-26 22:12:48 -07:00
from apps . social . models import MSharedStory
2013-06-19 13:22:11 -07:00
original_found = False
2014-03-28 15:11:58 -07:00
if story_hash :
story_id = story_hash
2013-06-04 15:34:03 -07:00
story_hash = cls . ensure_story_hash ( story_id , story_feed_id )
2014-03-28 15:11:58 -07:00
if not story_feed_id :
2014-03-28 15:13:41 -07:00
story_feed_id , _ = cls . split_story_hash ( story_hash )
2013-03-20 15:43:35 -07:00
if isinstance ( story_id , ObjectId ) :
story = cls . objects ( id = story_id ) . limit ( 1 ) . first ( )
else :
story = cls . objects ( story_hash = story_hash ) . limit ( 1 ) . first ( )
2013-01-08 18:33:30 -08:00
2013-06-19 13:22:11 -07:00
if story :
original_found = True
2013-01-08 18:33:30 -08:00
if not story and not original_only :
2012-07-26 22:12:48 -07:00
story = MSharedStory . objects . filter ( story_feed_id = story_feed_id ,
2013-06-04 15:34:03 -07:00
story_hash = story_hash ) . limit ( 1 ) . first ( )
2013-01-08 18:33:30 -08:00
if not story and not original_only :
2012-07-26 22:12:48 -07:00
story = MStarredStory . objects . filter ( story_feed_id = story_feed_id ,
2013-06-04 15:34:03 -07:00
story_hash = story_hash ) . limit ( 1 ) . first ( )
2012-07-26 22:12:48 -07:00
return story , original_found
2012-12-13 17:49:07 -08:00
@classmethod
def find_by_id ( cls , story_ids ) :
from apps . social . models import MSharedStory
count = len ( story_ids )
multiple = isinstance ( story_ids , list ) or isinstance ( story_ids , tuple )
stories = list ( cls . objects ( id__in = story_ids ) )
if len ( stories ) < count :
shared_stories = list ( MSharedStory . objects ( id__in = story_ids ) )
stories . extend ( shared_stories )
2013-04-29 15:27:22 -07:00
if not multiple :
stories = stories [ 0 ]
return stories
@classmethod
def find_by_story_hashes ( cls , story_hashes ) :
from apps . social . models import MSharedStory
count = len ( story_hashes )
multiple = isinstance ( story_hashes , list ) or isinstance ( story_hashes , tuple )
stories = list ( cls . objects ( story_hash__in = story_hashes ) )
if len ( stories ) < count :
2013-04-30 15:49:44 -07:00
hashes_found = [ s . story_hash for s in stories ]
2013-04-30 16:59:02 -07:00
remaining_hashes = list ( set ( story_hashes ) - set ( hashes_found ) )
2013-04-30 15:49:44 -07:00
story_feed_ids = [ h . split ( ' : ' ) [ 0 ] for h in remaining_hashes ]
2013-04-30 15:28:00 -07:00
shared_stories = list ( MSharedStory . objects ( story_feed_id__in = story_feed_ids ,
2013-04-30 15:49:44 -07:00
story_hash__in = remaining_hashes ) )
2013-04-29 15:27:22 -07:00
stories . extend ( shared_stories )
2012-12-13 17:49:07 -08:00
if not multiple :
stories = stories [ 0 ]
return stories
2013-06-04 15:34:03 -07:00
@classmethod
def ensure_story_hash ( cls , story_id , story_feed_id ) :
if not cls . RE_STORY_HASH . match ( story_id ) :
2020-06-20 00:27:01 -04:00
story_id = " %s : %s " % ( story_feed_id , hashlib . sha1 ( story_id . encode ( encoding = ' utf-8 ' ) ) . hexdigest ( ) [ : 6 ] )
2012-07-16 20:49:43 -07:00
2013-06-04 15:34:03 -07:00
return story_id
@classmethod
def split_story_hash ( cls , story_hash ) :
matches = cls . RE_STORY_HASH . match ( story_hash )
if matches :
groups = matches . groups ( )
return groups [ 0 ] , groups [ 1 ]
return None , None
@classmethod
def split_rs_key ( cls , rs_key ) :
matches = cls . RE_RS_KEY . match ( rs_key )
if matches :
groups = matches . groups ( )
return groups [ 0 ] , groups [ 1 ]
return None , None
@classmethod
def story_hashes ( cls , story_ids ) :
story_hashes = [ ]
for story_id in story_ids :
story_hash = cls . ensure_story_hash ( story_id )
if not story_hash : continue
story_hashes . append ( story_hash )
return story_hashes
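# Usage sketch for the hash helpers above (illustrative; ensure_story_hash leaves a
# well-formed hash untouched):
#
# MStory.ensure_story_hash("42:abc123", story_feed_id=42)   # -> "42:abc123"
# MStory.split_story_hash("42:abc123")                      # -> ("42", "abc123")
# MStory.split_rs_key("RS:1:42")                            # -> ("1", "42")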
2013-08-14 14:32:50 -07:00
def sync_redis ( self , r = None ) :
2012-07-23 13:06:12 -07:00
if not r :
2013-05-02 12:27:37 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_STORY_HASH_POOL )
2013-08-14 14:32:50 -07:00
# if not r2:
# r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
2013-09-16 16:42:49 -07:00
UNREAD_CUTOFF = datetime . datetime . now ( ) - datetime . timedelta ( days = settings . DAYS_OF_STORY_HASHES )
2012-07-25 17:55:23 -07:00
2013-04-13 22:31:05 -07:00
if self . id and self . story_date > UNREAD_CUTOFF :
2013-07-01 22:19:22 -07:00
feed_key = ' F: %s ' % self . story_feed_id
r . sadd ( feed_key , self . story_hash )
2013-09-16 16:42:49 -07:00
r . expire ( feed_key , settings . DAYS_OF_STORY_HASHES * 24 * 60 * 60 )
2013-08-14 14:32:50 -07:00
# r2.sadd(feed_key, self.story_hash)
2013-09-16 16:42:49 -07:00
# r2.expire(feed_key, settings.DAYS_OF_STORY_HASHES*24*60*60)
2013-07-01 22:19:22 -07:00
2020-06-20 00:08:18 -04:00
r . zadd ( ' z ' + feed_key , { self . story_hash : time . mktime ( self . story_date . timetuple ( ) ) } )
2013-09-16 16:42:49 -07:00
r . expire ( ' z ' + feed_key , settings . DAYS_OF_STORY_HASHES * 24 * 60 * 60 )
2013-08-14 14:32:50 -07:00
# r2.zadd('z' + feed_key, self.story_hash, time.mktime(self.story_date.timetuple()))
2013-09-16 16:42:49 -07:00
# r2.expire('z' + feed_key, settings.DAYS_OF_STORY_HASHES*24*60*60)
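# Redis layout sketch for sync_redis above (illustrative): each story newer than the
# unread cutoff lands in two keys, both expiring after DAYS_OF_STORY_HASHES days:
#
# r.sadd("F:%s" % story_feed_id, story_hash)                       # membership set
# r.zadd("zF:%s" % story_feed_id, {story_hash: epoch_story_date})  # sorted by date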
2012-07-16 18:11:18 -07:00
2013-08-14 14:32:50 -07:00
def remove_from_redis ( self , r = None ) :
2012-07-23 13:06:12 -07:00
if not r :
2013-05-02 12:27:37 -07:00
r = redis . Redis ( connection_pool = settings . REDIS_STORY_HASH_POOL )
2013-08-14 14:32:50 -07:00
# if not r2:
# r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
2012-07-23 10:44:32 -07:00
if self . id :
2013-05-02 12:27:37 -07:00
r . srem ( ' F: %s ' % self . story_feed_id , self . story_hash )
2013-08-14 14:32:50 -07:00
# r2.srem('F:%s' % self.story_feed_id, self.story_hash)
2013-05-02 12:27:37 -07:00
r . zrem ( ' zF: %s ' % self . story_feed_id , self . story_hash )
2013-08-14 14:32:50 -07:00
# r2.zrem('zF:%s' % self.story_feed_id, self.story_hash)
2012-07-16 20:49:43 -07:00
2012-07-16 18:11:18 -07:00
@classmethod
2013-05-02 12:27:37 -07:00
def sync_feed_redis ( cls , story_feed_id ) :
r = redis . Redis ( connection_pool = settings . REDIS_STORY_HASH_POOL )
2013-08-14 14:32:50 -07:00
# r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
2013-09-16 16:42:49 -07:00
UNREAD_CUTOFF = datetime . datetime . now ( ) - datetime . timedelta ( days = settings . DAYS_OF_STORY_HASHES )
2013-05-02 12:27:37 -07:00
feed = Feed . get_by_id ( story_feed_id )
stories = cls . objects . filter ( story_feed_id = story_feed_id , story_date__gte = UNREAD_CUTOFF )
r . delete ( ' F: %s ' % story_feed_id )
2013-08-14 14:32:50 -07:00
# r2.delete('F:%s' % story_feed_id)
2013-05-02 12:27:37 -07:00
r . delete ( ' zF: %s ' % story_feed_id )
2013-08-14 14:32:50 -07:00
# r2.delete('zF:%s' % story_feed_id)
2012-10-29 14:58:43 -07:00
2017-03-31 19:52:24 -07:00
logging . info ( " ---> [ %-30s ] ~FMSyncing ~SB %s ~SN stories to redis " % ( feed and feed . log_title [ : 30 ] or story_feed_id , stories . count ( ) ) )
2013-05-02 12:27:37 -07:00
p = r . pipeline ( )
2013-08-14 14:32:50 -07:00
# p2 = r2.pipeline()
2012-07-16 18:11:18 -07:00
for story in stories :
2013-08-14 14:32:50 -07:00
story . sync_redis ( r = p )
2013-05-02 12:27:37 -07:00
p . execute ( )
2013-08-14 14:32:50 -07:00
# p2.execute()
2012-07-16 18:11:18 -07:00
2012-01-09 13:55:26 -08:00
def count_comments ( self ) :
from apps . social . models import MSharedStory
params = {
' story_guid ' : self . story_guid ,
' story_feed_id ' : self . story_feed_id ,
}
comments = MSharedStory . objects . filter ( has_comments = True , * * params ) . only ( ' user_id ' )
2012-01-15 20:51:48 -08:00
shares = MSharedStory . objects . filter ( * * params ) . only ( ' user_id ' )
2012-01-09 13:55:26 -08:00
self . comment_count = comments . count ( )
self . comment_user_ids = [ c [ ' user_id ' ] for c in comments ]
self . share_count = shares . count ( )
self . share_user_ids = [ s [ ' user_id ' ] for s in shares ]
self . save ( )
2013-06-20 13:41:37 -07:00
2017-03-23 16:06:06 -07:00
def extract_image_urls ( self , force = False , text = False ) :
2017-03-23 16:28:47 -07:00
if self . image_urls and not force and not text :
2013-06-26 11:38:49 -07:00
return self . image_urls
2013-06-20 13:44:10 -07:00
2017-03-23 16:06:06 -07:00
story_content = None
if not text :
story_content = self . story_content
if not story_content and self . story_content_z :
story_content = zlib . decompress ( self . story_content_z )
elif text :
if self . original_text_z :
story_content = zlib . decompress ( self . original_text_z )
2013-06-20 13:44:10 -07:00
if not story_content :
return
2013-08-06 13:18:55 -07:00
try :
2020-06-30 15:29:28 -04:00
soup = BeautifulSoup ( story_content , features = " lxml " )
2021-01-12 15:52:10 -05:00
except UserWarning as e :
logging . debug ( " ---> ~FBWarning on BS4: ~SB %s " % str ( e ) [ : 100 ] )
return
2013-08-06 13:18:55 -07:00
except ValueError :
2017-03-23 16:06:06 -07:00
if not text :
return self . extract_image_urls ( force = force , text = True )
else :
return
2013-06-26 11:38:49 -07:00
images = soup . findAll ( ' img ' )
if not images :
2017-03-23 16:06:06 -07:00
if not text :
return self . extract_image_urls ( force = force , text = True )
else :
return
2013-06-26 11:38:49 -07:00
2019-01-20 13:55:34 -05:00
image_urls = self . image_urls
if not image_urls :
image_urls = [ ]
2013-06-26 11:38:49 -07:00
for image in images :
2013-06-20 14:45:05 -07:00
image_url = image . get ( ' src ' )
2013-06-26 16:26:14 -07:00
if not image_url :
continue
2013-06-20 14:45:05 -07:00
if image_url and len ( image_url ) > = 1024 :
2013-06-26 11:38:49 -07:00
continue
2021-07-08 13:55:56 -04:00
if ' feedburner.com ' in image_url :
continue
2021-03-25 18:47:15 -04:00
image_url = urllib . parse . urljoin ( self . story_permalink , image_url )
2013-06-26 11:38:49 -07:00
image_urls . append ( image_url )
2019-01-20 14:53:39 -05:00
2013-06-26 16:26:14 -07:00
if not image_urls :
2017-03-23 16:06:06 -07:00
if not text :
return self . extract_image_urls ( force = force , text = True )
else :
return
2017-03-23 16:28:47 -07:00
2018-01-17 16:51:06 -08:00
if text :
urls = [ ]
for url in image_urls :
if ' http:// ' in url [ 1 : ] or ' https:// ' in url [ 1 : ] :
continue
urls . append ( url )
image_urls = urls
2019-01-20 14:53:39 -05:00
ordered_image_urls = [ ]
for image_url in list ( set ( image_urls ) ) :
if ' feedburner ' in image_url :
ordered_image_urls . append ( image_url )
else :
ordered_image_urls . insert ( 0 , image_url )
image_urls = ordered_image_urls
2018-01-17 16:51:06 -08:00
if len ( image_urls ) :
2019-02-18 15:38:40 -05:00
self . image_urls = [ u for u in image_urls if u ]
2020-12-06 12:30:24 -05:00
else :
return
2018-01-17 16:51:06 -08:00
2020-11-30 18:53:52 -05:00
max_length = MStory . image_urls . field . max_length
while len ( ' ' . join ( self . image_urls ) ) > max_length :
if len ( self . image_urls ) < = 1 :
self . image_urls [ 0 ] = self . image_urls [ 0 ] [ : max_length - 1 ]
break
else :
self . image_urls . pop ( )
2013-06-26 11:38:49 -07:00
return self . image_urls
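# Sketch of extract_image_urls above (illustrative): given story content of
# '<p><img src="/images/a.png"></p>' and a story_permalink of
# "https://example.com/post/1", the relative src resolves via urljoin to
# "https://example.com/images/a.png"; images served from feedburner.com are skipped
# outright, and any remaining URL containing "feedburner" is moved to the back of
# the list before the joined length is trimmed to the field's max_length.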
2013-06-20 13:41:37 -07:00
2014-07-21 14:22:07 -07:00
def fetch_original_text ( self , force = False , request = None , debug = False ) :
2013-01-08 18:33:30 -08:00
original_text_z = self . original_text_z
if not original_text_z or force :
2014-10-29 16:16:50 -07:00
feed = Feed . get_by_id ( self . story_feed_id )
2021-02-26 12:10:30 -05:00
self . extract_image_urls ( force = force , text = False )
2014-07-21 14:22:07 -07:00
ti = TextImporter ( self , feed = feed , request = request , debug = debug )
2017-11-02 22:09:37 -07:00
original_doc = ti . fetch ( return_document = True )
2017-11-03 13:48:44 -07:00
original_text = original_doc . get ( ' content ' ) if original_doc else None
2019-02-01 11:55:10 -05:00
self . extract_image_urls ( force = force , text = True )
2017-03-23 16:06:06 -07:00
self . save ( )
2013-01-08 18:33:30 -08:00
else :
logging . user ( request , " ~FYFetching ~FGoriginal~FY story text, ~SBfound. " )
original_text = zlib . decompress ( original_text_z )
return original_text
2012-09-04 11:46:41 -07:00
2014-10-29 16:16:50 -07:00
def fetch_original_page ( self , force = False , request = None , debug = False ) :
from apps . rss_feeds . page_importer import PageImporter
if not self . original_page_z or force :
feed = Feed . get_by_id ( self . story_feed_id )
importer = PageImporter ( request = request , feed = feed , story = self )
original_page = importer . fetch_story ( )
else :
logging . user ( request , " ~FYFetching ~FGoriginal~FY story page, ~SBfound. " )
original_page = zlib . decompress ( self . original_page_z )
return original_page
2010-11-30 10:30:18 -05:00
2016-11-10 18:02:34 -08:00
class MStarredStory ( mongo . DynamicDocument ) :
2010-11-30 10:30:18 -05:00
""" Like MStory, but not inherited due to large overhead of _cls and _type in
mongoengine's inheritance model on every single row. """
2012-03-26 13:14:02 -07:00
user_id = mongo . IntField ( unique_with = ( ' story_guid ' , ) )
2010-12-02 20:18:33 -05:00
starred_date = mongo . DateTimeField ( )
2020-08-21 11:48:57 -04:00
starred_updated = mongo . DateTimeField ( )
2010-11-30 10:30:18 -05:00
story_feed_id = mongo . IntField ( )
story_date = mongo . DateTimeField ( )
story_title = mongo . StringField ( max_length = 1024 )
story_content = mongo . StringField ( )
story_content_z = mongo . BinaryField ( )
story_original_content = mongo . StringField ( )
story_original_content_z = mongo . BinaryField ( )
2013-01-28 15:43:00 -08:00
original_text_z = mongo . BinaryField ( )
2010-11-30 10:30:18 -05:00
story_content_type = mongo . StringField ( max_length = 255 )
story_author_name = mongo . StringField ( )
story_permalink = mongo . StringField ( )
2012-03-26 13:14:02 -07:00
story_guid = mongo . StringField ( )
2013-04-29 16:07:08 -07:00
story_hash = mongo . StringField ( )
2010-11-30 10:30:18 -05:00
story_tags = mongo . ListField ( mongo . StringField ( max_length = 250 ) )
2020-07-15 21:25:00 -04:00
user_notes = mongo . StringField ( )
2013-08-07 10:56:51 -07:00
user_tags = mongo . ListField ( mongo . StringField ( max_length = 128 ) )
2020-07-08 19:00:09 -04:00
highlights = mongo . ListField ( mongo . StringField ( max_length = 1024 ) )
2013-06-26 11:38:49 -07:00
image_urls = mongo . ListField ( mongo . StringField ( max_length = 1024 ) )
2010-11-30 10:30:18 -05:00
meta = {
' collection ' : ' starred_stories ' ,
2016-11-10 17:24:31 -08:00
' indexes ' : [ ( ' user_id ' , ' -starred_date ' ) , ( ' user_id ' , ' story_feed_id ' ) ,
( ' user_id ' , ' story_hash ' ) , ' story_feed_id ' ] ,
2010-12-02 20:18:33 -05:00
' ordering ' : [ ' -starred_date ' ] ,
2010-11-30 10:30:18 -05:00
' allow_inheritance ' : False ,
2016-11-10 17:56:08 -08:00
' strict ' : False ,
2010-11-30 10:30:18 -05:00
}
2018-08-24 09:33:39 -04:00
def __str__ ( self ) :
2020-07-10 17:59:11 -04:00
try :
user = User . objects . get ( pk = self . user_id )
username = user . username
except User . DoesNotExist :
username = ' [deleted] '
return " %s : %s ( %s ) " % ( username ,
2018-08-24 09:33:39 -04:00
self . story_title [ : 20 ] ,
self . story_feed_id )
2010-11-30 10:30:18 -05:00
def save ( self , * args , * * kwargs ) :
if self . story_content :
2021-05-12 21:19:09 -04:00
self . story_content_z = zlib . compress ( smart_bytes ( self . story_content ) )
2010-11-30 10:30:18 -05:00
self . story_content = None
if self . story_original_content :
2021-05-12 21:19:09 -04:00
self . story_original_content_z = zlib . compress ( smart_bytes ( self . story_original_content ) )
2010-11-30 10:30:18 -05:00
self . story_original_content = None
2013-04-29 16:07:08 -07:00
self . story_hash = self . feed_guid_hash
2020-08-21 11:48:57 -04:00
self . starred_updated = datetime . datetime . now ( )
2013-05-29 19:37:50 -07:00
return super ( MStarredStory , self ) . save ( * args , * * kwargs )
2012-07-13 14:33:16 -07:00
2013-07-30 12:01:45 -07:00
@classmethod
2014-06-11 15:20:59 -07:00
def find_stories ( cls , query , user_id , tag = None , offset = 0 , limit = 25 , order = " newest " ) :
2013-07-30 12:01:45 -07:00
stories_db = cls . objects (
Q ( user_id = user_id ) &
( Q ( story_title__icontains = query ) |
Q ( story_author_name__icontains = query ) |
Q ( story_tags__icontains = query ) )
2013-09-09 18:18:13 -07:00
)
if tag :
stories_db = stories_db . filter ( user_tags__contains = tag )
2014-06-11 15:20:59 -07:00
stories_db = stories_db . order_by ( ' %s starred_date ' %
( ' - ' if order == " newest " else " " ) ) [ offset : offset + limit ]
2013-07-30 12:01:45 -07:00
stories = Feed . format_stories ( stories_db )
return stories
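# Usage sketch for find_stories above (illustrative arguments):
#
# stories = MStarredStory.find_stories("python", user_id=42, tag="programming",
#                                      offset=0, limit=10, order="newest")
#
# which matches the query against title, author and tags, narrows to the
# "programming" user tag, and returns the newest 10 as Feed.format_stories dicts.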
2013-08-15 18:22:22 -07:00
@classmethod
def find_stories_by_user_tag ( cls , user_tag , user_id , offset = 0 , limit = 25 ) :
stories_db = cls . objects (
Q ( user_id = user_id ) ,
Q ( user_tags__icontains = user_tag )
2013-07-30 12:01:45 -07:00
) . order_by ( ' -starred_date ' ) [ offset : offset + limit ]
stories = Feed . format_stories ( stories_db )
return stories
2013-07-11 12:08:21 -07:00
@classmethod
2015-07-09 18:03:30 -07:00
def trim_old_stories ( cls , stories = 10 , days = 90 , dryrun = False ) :
2020-06-15 02:54:37 -04:00
print ( " ---> Fetching starred story counts... " )
2013-07-11 12:08:21 -07:00
stats = settings . MONGODB . newsblur . starred_stories . aggregate ( [ {
" $group " : {
" _id " : " $user_id " ,
" stories " : { " $sum " : 1 } ,
} ,
} , {
" $match " : {
" stories " : { " $gte " : stories }
} ,
} ] )
month_ago = datetime . datetime . now ( ) - datetime . timedelta ( days = days )
2016-11-20 13:11:56 -08:00
user_ids = list ( stats )
2013-07-11 12:08:21 -07:00
user_ids = sorted ( user_ids , key = lambda x : x [ ' stories ' ] , reverse = True )
2020-06-15 02:54:37 -04:00
print ( " ---> Found %s users with more than %s starred stories " % ( len ( user_ids ) , stories ) )
2013-07-11 12:08:21 -07:00
2013-07-11 12:24:48 -07:00
total = 0
2013-07-11 12:08:21 -07:00
for stat in user_ids :
try :
user = User . objects . select_related ( ' profile ' ) . get ( pk = stat [ ' _id ' ] )
except User . DoesNotExist :
user = None
2013-07-11 12:24:48 -07:00
if user and ( user . profile . is_premium or user . profile . last_seen_on > month_ago ) :
continue
total + = stat [ ' stories ' ]
2014-03-28 12:53:01 -07:00
username = " %s ( %s ) " % ( user and user . username or " - " , stat [ ' _id ' ] )
2020-06-15 02:54:37 -04:00
print ( " ---> %19.19s : %-20.20s %s stories " % ( user and user . profile . last_seen_on or " Deleted " ,
2014-03-28 12:53:01 -07:00
username ,
2020-06-15 02:54:37 -04:00
stat [ ' stories ' ] ) )
2013-07-11 15:09:00 -07:00
if not dryrun and stat [ ' _id ' ] :
2013-07-11 12:24:48 -07:00
cls . objects . filter ( user_id = stat [ ' _id ' ] ) . delete ( )
2015-02-11 15:55:18 -08:00
elif not dryrun and stat [ ' _id ' ] == 0 :
2020-06-15 02:54:37 -04:00
print ( " ---> Deleting unstarred stories (user_id = 0) " )
2015-02-11 15:55:18 -08:00
cls . objects . filter ( user_id = stat [ ' _id ' ] ) . delete ( )
2013-07-11 12:08:21 -07:00
2020-06-15 02:54:37 -04:00
print ( " ---> Deleted %s stories in total. " % total )
2013-07-11 12:08:21 -07:00
2012-11-16 15:43:39 -08:00
@property
def guid_hash ( self ) :
2020-06-20 00:27:01 -04:00
return hashlib . sha1 ( self . story_guid . encode ( encoding = ' utf-8 ' ) ) . hexdigest ( ) [ : 6 ]
2013-01-28 15:43:00 -08:00
2013-04-29 16:07:08 -07:00
@property
def feed_guid_hash ( self ) :
return " %s : %s " % ( self . story_feed_id or " 0 " , self . guid_hash )
2014-07-21 21:13:52 -07:00
def fetch_original_text ( self , force = False , request = None , debug = False ) :
2013-01-28 15:43:00 -08:00
original_text_z = self . original_text_z
2013-07-15 11:06:50 -07:00
feed = Feed . get_by_id ( self . story_feed_id )
2013-01-28 15:43:00 -08:00
if not original_text_z or force :
2014-07-21 21:13:52 -07:00
ti = TextImporter ( self , feed = feed , request = request , debug = debug )
2013-01-28 15:43:00 -08:00
original_text = ti . fetch ( )
else :
logging . user ( request , " ~FYFetching ~FGoriginal~FY story text, ~SBfound. " )
original_text = zlib . decompress ( original_text_z )
return original_text
2019-01-18 13:44:32 -05:00
def fetch_original_page ( self , force = False , request = None , debug = False ) :
2019-01-18 13:45:32 -05:00
return None
2013-01-28 15:43:00 -08:00
2013-08-07 10:56:51 -07:00
class MStarredStoryCounts ( mongo . Document ) :
user_id = mongo . IntField ( )
2014-05-28 15:30:30 -07:00
tag = mongo . StringField ( max_length = 128 )
feed_id = mongo . IntField ( )
2020-07-13 13:05:13 -04:00
is_highlights = mongo . BooleanField ( )
2013-12-19 11:08:22 -08:00
slug = mongo . StringField ( max_length = 128 )
2014-05-28 20:12:35 -07:00
count = mongo . IntField ( default = 0 )
2012-11-16 15:43:39 -08:00
2013-08-07 10:56:51 -07:00
meta = {
' collection ' : ' starred_stories_counts ' ,
' indexes ' : [ ' user_id ' ] ,
' ordering ' : [ ' tag ' ] ,
' allow_inheritance ' : False ,
}
2020-07-10 17:59:11 -04:00
def __str__ ( self ) :
if self . tag :
return " Tag: %s ( %s ) " % ( self . tag , self . count )
elif self . feed_id :
return " Feed: %s ( %s ) " % ( self . feed_id , self . count )
2020-07-13 13:05:13 -04:00
elif self . is_highlights :
return " Highlights: %s ( %s ) " % ( self . is_highlights , self . count )
2020-07-10 17:59:11 -04:00
2020-07-13 13:05:13 -04:00
return " %s / %s / %s " % ( self . tag , self . feed_id , self . is_highlights )
2013-12-19 11:08:22 -08:00
@property
def rss_url ( self , secret_token = None ) :
2014-05-29 15:43:44 -07:00
if self . feed_id :
return
2013-12-19 11:08:22 -08:00
if not secret_token :
user = User . objects . select_related ( ' profile ' ) . get ( pk = self . user_id )
secret_token = user . profile . secret_token
2014-05-28 15:30:30 -07:00
slug = self . slug if self . slug else " "
2020-10-22 20:12:56 -04:00
if not self . slug and self . tag :
slug = slugify ( self . tag )
2020-10-22 20:14:11 -04:00
self . slug = slug
self . save ( )
2020-10-22 20:12:56 -04:00
2013-12-19 11:08:22 -08:00
return " %s /reader/starred_rss/ %s / %s / %s " % ( settings . NEWSBLUR_URL , self . user_id ,
2014-05-28 15:30:30 -07:00
secret_token , slug )
2013-08-07 10:56:51 -07:00
@classmethod
2013-09-09 18:49:21 -07:00
def user_counts ( cls , user_id , include_total = False , try_counting = True ) :
2013-12-19 11:08:22 -08:00
counts = cls . objects . filter ( user_id = user_id )
2014-05-28 15:30:30 -07:00
counts = sorted ( [ { ' tag ' : c . tag ,
' count ' : c . count ,
2020-07-13 13:05:13 -04:00
' is_highlights ' : c . is_highlights ,
2014-05-28 15:30:30 -07:00
' feed_address ' : c . rss_url ,
2021-03-04 10:24:17 -05:00
' active ' : True ,
2014-05-28 15:30:30 -07:00
' feed_id ' : c . feed_id }
for c in counts ] ,
key = lambda x : ( x . get ( ' tag ' , ' ' ) or ' ' ) . lower ( ) )
2013-09-09 18:49:21 -07:00
2014-05-29 13:30:41 -07:00
total = 0
feed_total = 0
for c in counts :
2020-07-13 13:05:13 -04:00
if not c [ ' tag ' ] and not c [ ' feed_id ' ] and not c [ ' is_highlights ' ] :
2014-05-29 13:30:41 -07:00
total = c [ ' count ' ]
if c [ ' feed_id ' ] :
feed_total + = c [ ' count ' ]
2014-05-29 13:37:46 -07:00
if try_counting and ( total != feed_total or not len ( counts ) ) :
2014-05-29 13:30:41 -07:00
user = User . objects . get ( pk = user_id )
logging . user ( user , " ~FC~SBCounting~SN saved stories ( %s total vs. %s counted)... " %
( total , feed_total ) )
2014-05-28 15:30:30 -07:00
cls . count_for_user ( user_id )
2013-12-19 11:08:22 -08:00
return cls . user_counts ( user_id , include_total = include_total ,
try_counting = False )
2013-09-09 18:49:21 -07:00
2013-08-16 16:02:45 -07:00
if include_total :
2014-05-29 13:30:41 -07:00
return counts , total
2013-08-07 10:56:51 -07:00
return counts
2014-04-29 12:36:42 -07:00
@classmethod
def schedule_count_tags_for_user ( cls , user_id ) :
2021-01-06 14:42:24 -05:00
ScheduleCountTagsForUser . apply_async ( kwargs = dict ( user_id = user_id ) )
2014-05-28 15:30:30 -07:00
2013-08-07 10:56:51 -07:00
@classmethod
2014-05-28 17:35:51 -07:00
def count_for_user ( cls , user_id , total_only = False ) :
user_tags = [ ]
user_feeds = [ ]
2020-07-10 17:59:11 -04:00
highlights = 0
2013-08-15 18:22:22 -07:00
2014-05-28 17:35:51 -07:00
if not total_only :
cls . objects ( user_id = user_id ) . delete ( )
2016-01-28 12:30:51 -08:00
try :
user_tags = cls . count_tags_for_user ( user_id )
2020-07-10 17:59:11 -04:00
highlights = cls . count_highlights_for_user ( user_id )
2016-01-28 12:30:51 -08:00
user_feeds = cls . count_feeds_for_user ( user_id )
2020-06-15 02:54:37 -04:00
except pymongo . errors . OperationFailure as e :
2016-01-28 12:30:51 -08:00
logging . debug ( " ---> ~FBOperationError on mongo: ~SB %s " % e )
2014-05-28 17:35:51 -07:00
2013-08-15 18:22:22 -07:00
total_stories_count = MStarredStory . objects ( user_id = user_id ) . count ( )
2020-07-13 13:05:13 -04:00
cls . objects ( user_id = user_id , tag = None , feed_id = None , is_highlights = None ) . update_one ( set__count = total_stories_count ,
2014-05-29 13:30:41 -07:00
upsert = True )
2014-05-28 15:30:30 -07:00
2020-07-10 17:59:11 -04:00
return dict ( total = total_stories_count , tags = user_tags , feeds = user_feeds , highlights = highlights )
2014-05-28 15:30:30 -07:00
2013-08-07 10:56:51 -07:00
@classmethod
def count_tags_for_user ( cls , user_id ) :
all_tags = MStarredStory . objects ( user_id = user_id ,
user_tags__exists = True ) . item_frequencies ( ' user_tags ' )
2020-06-15 02:54:37 -04:00
user_tags = sorted ( [ ( k , v ) for k , v in list ( all_tags . items ( ) ) if int ( v ) > 0 and k ] ,
2014-05-28 15:30:30 -07:00
key = lambda x : x [ 0 ] . lower ( ) ,
2013-08-07 10:56:51 -07:00
reverse = True )
2013-08-15 18:22:22 -07:00
2020-06-15 02:54:37 -04:00
for tag , count in list ( dict ( user_tags ) . items ( ) ) :
2014-05-29 12:19:26 -07:00
cls . objects ( user_id = user_id , tag = tag , slug = slugify ( tag ) ) . update_one ( set__count = count ,
upsert = True )
2020-07-10 17:59:11 -04:00
2014-05-28 15:30:30 -07:00
return user_tags
2013-12-19 11:08:22 -08:00
2020-07-10 17:59:11 -04:00
@classmethod
def count_highlights_for_user ( cls , user_id ) :
2020-07-10 18:58:01 -04:00
highlighted_count = MStarredStory . objects ( user_id = user_id ,
highlights__exists = True ,
__raw__ = { " $where " : " this.highlights.length > 0 " } ) . count ( )
2020-08-04 08:56:07 -04:00
if highlighted_count > 0 :
cls . objects ( user_id = user_id ,
is_highlights = True ,
slug = " highlights "
) . update_one ( set__count = highlighted_count , upsert = True )
else :
cls . objects ( user_id = user_id , is_highlights = True , slug = " highlights " ) . delete ( )
2020-07-10 17:59:11 -04:00
return highlighted_count
2014-05-28 15:30:30 -07:00
@classmethod
def count_feeds_for_user ( cls , user_id ) :
all_feeds = MStarredStory . objects ( user_id = user_id ) . item_frequencies ( ' story_feed_id ' )
2020-06-15 02:54:37 -04:00
user_feeds = dict ( [ ( k , v ) for k , v in list ( all_feeds . items ( ) ) if v ] )
2014-05-29 13:30:41 -07:00
# Clean up None'd and 0'd feed_ids, so they can be counted against the total
if user_feeds . get ( None , False ) :
user_feeds [ 0 ] = user_feeds . get ( 0 , 0 )
user_feeds [ 0 ] + = user_feeds . get ( None )
del user_feeds [ None ]
if user_feeds . get ( 0 , False ) :
user_feeds [ - 1 ] = user_feeds . get ( 0 , 0 )
del user_feeds [ 0 ]
2019-06-10 15:14:09 -04:00
too_many_feeds = len ( user_feeds ) > = 1000
2020-06-15 02:54:37 -04:00
for feed_id , count in list ( user_feeds . items ( ) ) :
2019-06-10 15:14:09 -04:00
if too_many_feeds and count < = 1 : continue
2014-05-29 12:19:26 -07:00
cls . objects ( user_id = user_id ,
feed_id = feed_id ,
slug = " feed: %s " % feed_id ) . update_one ( set__count = count ,
upsert = True )
2014-05-28 15:30:30 -07:00
return user_feeds
2013-12-19 11:08:22 -08:00
2014-05-28 20:12:35 -07:00
@classmethod
2020-07-10 17:59:11 -04:00
def adjust_count ( cls , user_id , feed_id = None , tag = None , highlights = None , amount = 0 ) :
2014-05-28 20:12:35 -07:00
params = dict ( user_id = user_id )
if feed_id :
params [ ' feed_id ' ] = feed_id
if tag :
params [ ' tag ' ] = tag
2020-07-10 17:59:11 -04:00
if highlights :
2020-07-13 13:05:13 -04:00
params [ ' is_highlights ' ] = True
2014-05-28 20:12:35 -07:00
2014-05-29 12:19:26 -07:00
cls . objects ( * * params ) . update_one ( inc__count = amount , upsert = True )
2014-07-13 16:07:32 -07:00
try :
story_count = cls . objects . get ( * * params )
except cls . MultipleObjectsReturned :
story_count = cls . objects ( * * params ) . first ( )
if story_count and story_count . count < = 0 :
2014-05-28 20:12:35 -07:00
story_count . delete ( )
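# Plausible call pattern for adjust_count above (an assumption, not taken from call
# sites in this file): starring a story in feed 42 tagged "python" might be mirrored with
#
# MStarredStoryCounts.adjust_count(user_id, feed_id=42, amount=1)
# MStarredStoryCounts.adjust_count(user_id, tag="python", amount=1)
#
# and unstarring uses amount=-1; the upserted counter is deleted once it drops to
# zero or below.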
2016-02-27 15:51:42 -08:00

class MSavedSearch(mongo.Document):
    user_id = mongo.IntField()
    query = mongo.StringField(max_length=1024)
    feed_id = mongo.StringField()
    slug = mongo.StringField(max_length=128)

    meta = {
        'collection': 'saved_searches',
        'indexes': ['user_id',
                    {'fields': ['user_id', 'feed_id', 'query'],
                     'unique': True,
                     }],
        'ordering': ['query'],
        'allow_inheritance': False,
    }

    @property
    def rss_url(self, secret_token=None):
        if not secret_token:
            user = User.objects.select_related('profile').get(pk=self.user_id)
            secret_token = user.profile.secret_token
        slug = self.slug if self.slug else ""
        return "%s/reader/saved_search/%s/%s/%s" % (settings.NEWSBLUR_URL, self.user_id,
                                                    secret_token, slug)

    @classmethod
    def user_searches(cls, user_id):
        searches = cls.objects.filter(user_id=user_id)
        searches = sorted([{'query': s.query,
                            'feed_address': s.rss_url,
                            'feed_id': s.feed_id,
                            'active': True,
                            } for s in searches],
                          key=lambda x: (x.get('query', '') or '').lower())
        return searches

    @classmethod
    def save_search(cls, user_id, feed_id, query):
        user = User.objects.get(pk=user_id)
        params = dict(user_id=user_id,
                      feed_id=feed_id,
                      query=query,
                      slug=slugify(query))
        try:
            saved_search = cls.objects.get(**params)
            logging.user(user, "~FRSaved search already exists: ~SB%s" % query)
        except cls.DoesNotExist:
            logging.user(user, "~FCCreating a saved search: ~SB%s~SN/~SB%s" % (feed_id, query))
            saved_search = cls.objects.create(**params)

        return saved_search

    @classmethod
    def delete_search(cls, user_id, feed_id, query):
        user = User.objects.get(pk=user_id)
        params = dict(user_id=user_id,
                      feed_id=feed_id,
                      query=query)
        try:
            saved_search = cls.objects.get(**params)
            logging.user(user, "~FCDeleting saved search: ~SB%s" % query)
            saved_search.delete()
        except cls.DoesNotExist:
            logging.user(user, "~FRCan't delete saved search, missing: ~SB%s~SN/~SB%s" % (feed_id, query))
        except cls.MultipleObjectsReturned:
            logging.user(user, "~FRFound multiple saved searches, deleting: ~SB%s~SN/~SB%s" % (feed_id, query))
            cls.objects(**params).delete()


class MFetchHistory(mongo.Document):
    feed_id = mongo.IntField(unique=True)
    feed_fetch_history = mongo.DynamicField()
    page_fetch_history = mongo.DynamicField()
    push_history = mongo.DynamicField()
    raw_feed_history = mongo.DynamicField()

    meta = {
        'db_alias': 'nbanalytics',
        'collection': 'fetch_history',
        'allow_inheritance': False,
    }

    @classmethod
    def feed(cls, feed_id, timezone=None, fetch_history=None):
        if not fetch_history:
            try:
                fetch_history = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
                                           .get(feed_id=feed_id)
            except cls.DoesNotExist:
                fetch_history = cls.objects.create(feed_id=feed_id)
        history = {}

        for fetch_type in ['feed_fetch_history', 'page_fetch_history', 'push_history']:
            history[fetch_type] = getattr(fetch_history, fetch_type)
            if not history[fetch_type]:
                history[fetch_type] = []
            for f, fetch in enumerate(history[fetch_type]):
                date_key = 'push_date' if fetch_type == 'push_history' else 'fetch_date'
                history[fetch_type][f] = {
                    date_key: localtime_for_timezone(fetch[0],
                                                     timezone).strftime("%Y-%m-%d %H:%M:%S"),
                    'status_code': fetch[1],
                    'message': fetch[2]
                }
        return history

    @classmethod
    def add(cls, feed_id, fetch_type, date=None, message=None, code=None, exception=None):
        if not date:
            date = datetime.datetime.now()
        try:
            fetch_history = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
                                       .get(feed_id=feed_id)
        except cls.DoesNotExist:
            fetch_history = cls.objects.create(feed_id=feed_id)

        if fetch_type == 'feed':
            history = fetch_history.feed_fetch_history or []
        elif fetch_type == 'page':
            history = fetch_history.page_fetch_history or []
        elif fetch_type == 'push':
            history = fetch_history.push_history or []
        elif fetch_type == 'raw_feed':
            history = fetch_history.raw_feed_history or []

        history = [[date, code, message]] + history
        any_exceptions = any([c for d, c, m in history if c not in [200, 304]])
        if any_exceptions:
            history = history[:25]
        elif fetch_type == 'raw_feed':
            history = history[:10]
        else:
            history = history[:5]

        if fetch_type == 'feed':
            fetch_history.feed_fetch_history = history
        elif fetch_type == 'page':
            fetch_history.page_fetch_history = history
        elif fetch_type == 'push':
            fetch_history.push_history = history
        elif fetch_type == 'raw_feed':
            fetch_history.raw_feed_history = history

        fetch_history.save()

        if fetch_type == 'feed':
            RStats.add('feed_fetch')

        return cls.feed(feed_id, fetch_history=fetch_history)

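# Usage sketch (illustrative only): record a fetch result and get back the formatted
# history that MFetchHistory.feed returns. The feed id and message are hypothetical.
#
#   history = MFetchHistory.add(feed_id=123, fetch_type='feed', code=200, message='OK')
#   history['feed_fetch_history'][0]['status_code']   # -> 200, newest entry first
#   # If any retained entry has a code other than 200/304, the window stretches
#   # from 5 entries to 25 so the errors stay visible.
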

class DuplicateFeed(models.Model):
    duplicate_address = models.CharField(max_length=764, db_index=True)
    duplicate_link = models.CharField(max_length=764, null=True, db_index=True)
    duplicate_feed_id = models.CharField(max_length=255, null=True, db_index=True)
    feed = models.ForeignKey(Feed, related_name='duplicate_addresses', on_delete=models.CASCADE)

    def __str__(self):
        return "%s: %s / %s" % (self.feed, self.duplicate_address, self.duplicate_link)

    def canonical(self):
        return {
            'duplicate_address': self.duplicate_address,
            'duplicate_link': self.duplicate_link,
            'duplicate_feed_id': self.duplicate_feed_id,
            'feed_id': self.feed_id
        }

    def save(self, *args, **kwargs):
        max_address = DuplicateFeed._meta.get_field('duplicate_address').max_length
        if len(self.duplicate_address) > max_address:
            self.duplicate_address = self.duplicate_address[:max_address]
        max_link = DuplicateFeed._meta.get_field('duplicate_link').max_length
        if self.duplicate_link and len(self.duplicate_link) > max_link:
            self.duplicate_link = self.duplicate_link[:max_link]
        super(DuplicateFeed, self).save(*args, **kwargs)


def merge_feeds(original_feed_id, duplicate_feed_id, force=False):
    from apps.reader.models import UserSubscription
    from apps.social.models import MSharedStory

    if original_feed_id == duplicate_feed_id:
        logging.info(" ***> Merging the same feed. Ignoring...")
        return original_feed_id
    try:
        original_feed = Feed.objects.get(pk=original_feed_id)
        duplicate_feed = Feed.objects.get(pk=duplicate_feed_id)
    except Feed.DoesNotExist:
        logging.info(" ***> Already deleted feed: %s" % duplicate_feed_id)
        return original_feed_id

    # Keep the feed with more subscribers (or the non-branched feed), unless forced.
    heavier_dupe = original_feed.num_subscribers < duplicate_feed.num_subscribers
    branched_original = original_feed.branch_from_feed and not duplicate_feed.branch_from_feed
    if (heavier_dupe or branched_original) and not force:
        original_feed, duplicate_feed = duplicate_feed, original_feed
        original_feed_id, duplicate_feed_id = duplicate_feed_id, original_feed_id
        if branched_original:
            original_feed.feed_address = duplicate_feed.feed_address

    logging.info(" ---> Feed: [%s - %s] %s - %s" % (original_feed_id, duplicate_feed_id,
                                                    original_feed, original_feed.feed_link))
    logging.info(" Orig ++> %s: (%s subs) %s / %s %s" % (
        original_feed.pk,
        original_feed.num_subscribers,
        original_feed.feed_address,
        original_feed.feed_link,
        "[B: %s]" % original_feed.branch_from_feed.pk if original_feed.branch_from_feed else ""))
    logging.info(" Dupe --> %s: (%s subs) %s / %s %s" % (
        duplicate_feed.pk,
        duplicate_feed.num_subscribers,
        duplicate_feed.feed_address,
        duplicate_feed.feed_link,
        "[B: %s]" % duplicate_feed.branch_from_feed.pk if duplicate_feed.branch_from_feed else ""))

    original_feed.branch_from_feed = None

    user_subs = UserSubscription.objects.filter(feed=duplicate_feed).order_by('-pk')
    for user_sub in user_subs:
        user_sub.switch_feed(original_feed, duplicate_feed)

    def delete_story_feed(model, feed_field='feed_id'):
        duplicate_stories = model.objects(**{feed_field: duplicate_feed.pk})
        # if duplicate_stories.count():
        #     logging.info(" ---> Deleting %s %s" % (duplicate_stories.count(), model))
        duplicate_stories.delete()

    delete_story_feed(MStory, 'story_feed_id')
    delete_story_feed(MFeedPage, 'feed_id')

    try:
        DuplicateFeed.objects.create(
            duplicate_address=duplicate_feed.feed_address,
            duplicate_link=duplicate_feed.feed_link,
            duplicate_feed_id=duplicate_feed.pk,
            feed=original_feed
        )
    except (IntegrityError, OperationError) as e:
        logging.info(" ***> Could not save DuplicateFeed: %s" % e)

    # Switch this dupe feed's dupe feeds over to the new original.
    duplicate_feeds_duplicate_feeds = DuplicateFeed.objects.filter(feed=duplicate_feed)
    for dupe_feed in duplicate_feeds_duplicate_feeds:
        dupe_feed.feed = original_feed
        dupe_feed.duplicate_feed_id = duplicate_feed.pk
        dupe_feed.save()

    logging.debug(' ---> Dupe subscribers (%s): %s, Original subscribers (%s): %s' %
                  (duplicate_feed.pk, duplicate_feed.num_subscribers,
                   original_feed.pk, original_feed.num_subscribers))

    if duplicate_feed.pk != original_feed.pk:
        duplicate_feed.delete()
    else:
        logging.debug(" ***> Duplicate feed is the same as original feed. Panic!")
    logging.debug(' ---> Deleted duplicate feed: %s/%s' % (duplicate_feed, duplicate_feed_id))
    original_feed.branch_from_feed = None
    original_feed.count_subscribers()
    original_feed.save()
    logging.debug(' ---> Now original subscribers: %s' %
                  (original_feed.num_subscribers))

    MSharedStory.switch_feed(original_feed_id, duplicate_feed_id)

    return original_feed_id

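# Usage sketch (illustrative only): merging is destructive on the duplicate side --
# subscriptions move to the surviving feed and the duplicate's stories are deleted.
# The feed ids below are hypothetical.
#
#   surviving_feed_id = merge_feeds(original_feed_id=101, duplicate_feed_id=202)
#   # Unless force=True, the feed with more subscribers (or the un-branched feed)
#   # survives, so the returned id may be 202 rather than 101.
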

def rewrite_folders(folders, original_feed, duplicate_feed):
    new_folders = []

    for k, folder in enumerate(folders):
        if isinstance(folder, int):
            if folder == duplicate_feed.pk:
                # logging.info(" ===> Rewrote %s'th item: %s" % (k+1, folders))
                new_folders.append(original_feed.pk)
            else:
                new_folders.append(folder)
        elif isinstance(folder, dict):
            for f_k, f_v in list(folder.items()):
                new_folders.append({f_k: rewrite_folders(f_v, original_feed, duplicate_feed)})

    return new_folders
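
# Usage sketch (illustrative only): rewrite_folders walks a nested folder structure of
# feed ids and {folder_name: [...]} dicts, swapping the duplicate feed's id for the
# original's. The feed objects below are hypothetical stand-ins with just a `pk`.
#
#   class _FakeFeed:
#       def __init__(self, pk):
#           self.pk = pk
#
#   folders = [1, 2, {"Tech": [3, 202, {"Linux": [202]}]}]
#   rewrite_folders(folders, _FakeFeed(101), _FakeFeed(202))
#   # -> [1, 2, {"Tech": [3, 101, {"Linux": [101]}]}]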