import time
import settings
import difflib
import datetime
import hashlib
import random
import re
from collections import defaultdict
from BeautifulSoup import BeautifulStoneSoup
from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
from django.db import models
from django.db import IntegrityError
from django.core.cache import cache
from utils import json
from utils.feed_functions import levenshtein_distance
from utils.story_functions import format_story_link_date__short
from utils.story_functions import format_story_link_date__long
from utils.story_functions import pre_process_story
from utils.compressed_textfield import CompressedTextField, StoryField
from utils.diff import HTMLDiff

USER_AGENT = 'NewsBlur v1.0 - newsblur.com'

ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)


class Feed(models.Model):
    feed_address = models.URLField(max_length=255, verify_exists=True, unique=True)
    feed_link = models.URLField(max_length=1000, default="", blank=True, null=True)
    feed_title = models.CharField(max_length=255, default="", blank=True, null=True)
    feed_tagline = models.CharField(max_length=1024, default="", blank=True, null=True)
    active = models.BooleanField(default=True)
    num_subscribers = models.IntegerField(default=0)
    last_update = models.DateTimeField(auto_now=True, default=0)
    min_to_decay = models.IntegerField(default=15)
    days_to_trim = models.IntegerField(default=90)
    creation = models.DateField(auto_now_add=True)
    etag = models.CharField(max_length=50, blank=True, null=True)
    last_modified = models.DateTimeField(null=True, blank=True)
    stories_last_month = models.IntegerField(default=0)
    average_stories_per_month = models.IntegerField(default=0)
    stories_last_year = models.CharField(max_length=1024, blank=True, null=True)
    next_scheduled_update = models.DateTimeField(default=datetime.datetime.now)
    last_load_time = models.IntegerField(default=0)
    popular_tags = models.CharField(max_length=1024, blank=True, null=True)
    popular_authors = models.CharField(max_length=2048, blank=True, null=True)

    def __unicode__(self):
        return self.feed_title

    def save(self, lock=None, *args, **kwargs):
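        """Truncate an over-long tagline, then save. If a ``lock`` is passed
        in, the write is wrapped in lock.acquire()/lock.release()."""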
        if self.feed_tagline and len(self.feed_tagline) > 1024:
            self.feed_tagline = self.feed_tagline[:1024]

        if lock:
            lock.acquire()
            try:
                super(Feed, self).save(*args, **kwargs)
            finally:
                lock.release()
        else:
            super(Feed, self).save(*args, **kwargs)

    def save_feed_history(self, status_code, message, exception=None):
        FeedFetchHistory.objects.create(feed=self,
                                        status_code=status_code,
                                        message=message,
                                        exception=exception)
        old_fetch_histories = self.feed_fetch_history.all()[10:]
        for history in old_fetch_histories:
            history.delete()

    def save_page_history(self, status_code, message, exception=None):
        PageFetchHistory.objects.create(feed=self,
                                        status_code=status_code,
                                        message=message,
                                        exception=exception)
        old_fetch_histories = self.page_fetch_history.all()[10:]
        for history in old_fetch_histories:
            history.delete()

    def count_subscribers(self, verbose=False, lock=None):
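        """Recount this feed's UserSubscription rows and store the total in
        num_subscribers."""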
        from apps.reader.models import UserSubscription
        subs = UserSubscription.objects.filter(feed=self)
        self.num_subscribers = subs.count()

        self.save(lock=lock)

        if verbose:
            if self.num_subscribers <= 1:
                print '.',
            else:
                print "\n%s> %s subscriber%s: %s" % (
                    '-' * min(self.num_subscribers, 20),
                    self.num_subscribers,
                    '' if self.num_subscribers == 1 else 's',
                    self.feed_title,
                ),

    def count_stories(self, verbose=False, lock=None):
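        """Recompute story counts: stories from the last 30 days, the granular
        StoriesPerMonth row for the current month, and the trailing-year
        history and monthly average."""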
        month_ago = datetime.datetime.now() - datetime.timedelta(days=30)
        stories_last_month = Story.objects.filter(story_feed=self, story_date__gte=month_ago).count()
        self.stories_last_month = stories_last_month

        # Save stories for this month count in granular StoriesPerMonth model
        today = datetime.datetime.now()
        beginning_of_month = datetime.datetime(today.year, today.month, 1)
        stories_this_month = Story.objects.filter(story_feed=self,
                                                  story_date__gte=beginning_of_month).count()
        stories_per_month, created = StoriesPerMonth.objects.get_or_create(
            feed=self,
            year=today.year,
            month=today.month,
            defaults={
                'story_count': stories_this_month,
                'beginning_of_month': beginning_of_month,
            })
        if not created:
            stories_per_month.story_count = stories_this_month
            stories_per_month.save()

        stories_last_year, average_stories_per_month = StoriesPerMonth.past_year(self)
        self.stories_last_year = json.encode(stories_last_year)
        self.average_stories_per_month = average_stories_per_month

        self.save(lock=lock)

        if verbose:
            print "---> %s [%s]: %s stories" % (self.feed_title, self.pk, self.stories_last_month)

    def last_updated(self):
        return time.time() - time.mktime(self.last_update.timetuple())

    def new_stories_since_date(self, date):
        stories = Story.objects.filter(story_date__gte=date,
                                       story_feed=self)
        return stories

    def add_feed(self, feed_address, feed_link, feed_title):
        print locals()

    def update(self, force=False, feed=None, single_threaded=False):
        from utils import feed_fetcher
        try:
            self.feed_address = self.feed_address % {'NEWSBLUR_DIR': settings.NEWSBLUR_DIR}
        except:
            pass

        options = {
            'verbose': 2,
            'timeout': 10,
            'single_threaded': single_threaded,
            'force': force,
        }
        disp = feed_fetcher.Dispatcher(options, 1)
        disp.add_jobs([[self]])
        disp.run_jobs()
        disp.poll()

        return

    def add_update_stories(self, stories, existing_stories):
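        """Insert new stories and update changed ones, comparing parsed
        entries against ``existing_stories``. Returns a dict of counts keyed
        by ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, and ENTRY_ERR."""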
        ret_values = {
            ENTRY_NEW: 0,
            ENTRY_UPDATED: 0,
            ENTRY_SAME: 0,
            ENTRY_ERR: 0
        }

        for story in stories:
            story = pre_process_story(story)

            if story.get('title'):
                story_contents = story.get('content')
                story_tags = self.get_tags(story)

                if story_contents is not None:
                    story_content = story_contents[0]['value']
                else:
                    story_content = story.get('summary')

                existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
                story_author, _ = self._save_story_author(story.get('author'))

                if existing_story is None:
                    # pub_date = datetime.datetime.timetuple(story.get('published'))
                    # logging.debug('- New story: %s %s' % (pub_date, story.get('title')))

                    s = Story(story_feed=self,
                              story_date=story.get('published'),
                              story_title=story.get('title'),
                              story_content=story_content,
                              story_author=story_author,
                              story_author_name=story.get('author'),
                              story_permalink=story.get('link'),
                              story_guid=story.get('guid') or story.get('id') or story.get('link'),
                              story_tags=self._shorten_story_tags(story_tags))
                    try:
                        s.save(force_insert=True)
                        ret_values[ENTRY_NEW] += 1
                        cache.set('updated_feed:%s' % self.id, 1)
                    except IntegrityError, e:
                        ret_values[ENTRY_ERR] += 1
                        print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
                    [s.tags.add(tcat) for tcat in story_tags]
                elif existing_story and story_has_changed:
                    # update story
                    # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))

                    original_content = None
                    if existing_story.story_original_content:
                        original_content = existing_story.story_original_content
                    else:
                        original_content = existing_story.story_content
                    # print 'Type: %s %s' % (type(original_content), type(story_content))
                    if len(story_content) > 10:
                        diff = HTMLDiff(unicode(original_content), story_content)
                        story_content_diff = diff.getDiff()
                    else:
                        story_content_diff = original_content
                    # logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
                    # logging.debug("\t\tDiff content: %s" % diff.getDiff())
                    if existing_story.story_title != story.get('title'):
                        # logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
                        pass

                    s = Story(id=existing_story.id,
                              story_feed=self,
                              story_date=story.get('published'),
                              story_title=story.get('title'),
                              story_content=story_content_diff,
                              story_original_content=original_content,
                              story_author=story_author,
                              story_author_name=story.get('author'),
                              story_permalink=story.get('link'),
                              story_guid=story.get('guid') or story.get('id') or story.get('link'),
                              story_tags=self._shorten_story_tags(story_tags))
                    s.tags.clear()
                    [s.tags.add(tcat) for tcat in story_tags]
                    try:
                        s.save(force_update=True)
                        ret_values[ENTRY_UPDATED] += 1
                        cache.set('updated_feed:%s' % self.id, 1)
                    except IntegrityError, e:
                        ret_values[ENTRY_ERR] += 1
                        print('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
                else:
                    ret_values[ENTRY_SAME] += 1
                    # logging.debug("Unchanged story: %s " % story.get('title'))

        return ret_values

    def _save_story_author(self, author):
        author, created = StoryAuthor.objects.get_or_create(feed=self, author_name=author)
        return author, created

    def save_popular_tags(self, feed_tags=None, lock=None):
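        """Encode the feed's most-used tags (those on more than one story) as
        JSON. If the result exceeds the 1024-char column, recurse with the
        least popular tag dropped until it fits."""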
        if not feed_tags:
            from apps.rss_feeds.models import Tag
            from django.db.models.aggregates import Count
            all_tags = Tag.objects.filter(feed=self) \
                          .annotate(stories_count=Count('story')) \
                          .order_by('-stories_count')[:20]
            feed_tags = [(tag.name, tag.stories_count) for tag in all_tags if tag.stories_count > 1]

        popular_tags = json.encode(feed_tags)
        if len(popular_tags) < 1024:
            self.popular_tags = popular_tags
            self.save(lock=lock)
            return

        tags_list = json.decode(feed_tags) if feed_tags else []
        if len(tags_list) > 1:
            self.save_popular_tags(tags_list[:-1])

    def save_popular_authors(self, feed_authors=None, lock=None):
        if not feed_authors:
            from django.db.models.aggregates import Count
            all_authors = StoryAuthor.objects.filter(feed=self, author_name__isnull=False) \
                                     .annotate(stories_count=Count('story')) \
                                     .order_by('-stories_count')[:20]
            feed_authors = [(author.author_name, author.stories_count) for author in all_authors
                            if author.stories_count > 1]

        popular_authors = json.encode(feed_authors)
        if len(popular_authors) < 1024:
            self.popular_authors = popular_authors
            self.save(lock=lock)
            return

        authors_list = json.decode(feed_authors) if feed_authors else []
        if len(authors_list) > 1:
            self.save_popular_authors(authors_list[:-1])

    def _shorten_story_tags(self, story_tags):
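        """JSON-encode the tag names, recursively dropping the last tag until
        the encoded list fits the 2000-char story_tags column."""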
        encoded_tags = json.encode([t.name for t in story_tags])
        if len(encoded_tags) < 2000:
            return encoded_tags

        if len(story_tags) > 1:
            return self._shorten_story_tags(story_tags[:-1])

    def trim_feed(self):
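        """Trim old stories: of the stories older than a month, keep only the
        1,000 most recent. Everything past that cutoff is deleted, along with
        this feed's UserStory read records dated at or before the cutoff."""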
        from apps.reader.models import UserStory
        stories_deleted_count = 0
        user_stories_count = 0
        month_ago = datetime.datetime.now() - datetime.timedelta(days=30)
        stories = Story.objects.filter(
            story_feed=self,
            story_date__lte=month_ago
        ).order_by('-story_date')
        print 'Found %s stories in %s. Trimming...' % (stories.count(), self)

        if stories.count() > 1000:
            old_story = stories[1000]
            user_stories = UserStory.objects.filter(feed=self,
                                                    read_date__lte=old_story.story_date)
            user_stories_count = user_stories.count()
            user_stories.delete()

            old_stories = Story.objects.filter(story_feed=self,
                                               story_date__lte=old_story.story_date)
            stories_deleted_count = old_stories.count()
            old_stories.delete()

        if stories_deleted_count:
            print "Trimming %s stories from %s. %s user stories." % (
                stories_deleted_count,
                self,
                user_stories_count)

    def get_stories(self, offset=0, limit=25, force=False):
        if not force:
            stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), [])
        else:
            stories = None

        if not stories or force:
            stories_db = Story.objects.filter(story_feed=self)[offset:offset + limit]
            stories = self.format_stories(stories_db)
            cache.set('feed_stories:%s-%s-%s' % (self.id, offset, limit), stories)

        return stories

    def format_stories(self, stories_db):
        stories = []
        # from django.db import connection
        # print "Formatting Stories: %s" % stories_db.count()
        for story_db in stories_db:
            story = {}
            # story_tags = story_db.tags.all()
            story['story_tags'] = (story_db.story_tags and json.decode(story_db.story_tags)) or []
            story['short_parsed_date'] = format_story_link_date__short(story_db.story_date)
            story['long_parsed_date'] = format_story_link_date__long(story_db.story_date)
            story['story_date'] = story_db.story_date
            story['story_authors'] = story_db.story_author_name
            story['story_title'] = story_db.story_title
            story['story_content'] = story_db.story_content
            story['story_permalink'] = story_db.story_permalink
            story['story_feed_id'] = self.pk
            story['id'] = story_db.id
            stories.append(story)

        return stories

    def get_tags(self, entry):
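        """Pull tag names out of a feedparser entry, splitting on commas and
        slashes and collapsing whitespace, and return the matching Tag rows,
        creating any that don't exist yet."""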
        fcat = []
        if entry.has_key('tags'):
            for tcat in entry.tags:
                if tcat.label:
                    term = tcat.label
                elif tcat.term:
                    term = tcat.term
                else:
                    continue
                qcat = term.strip()
                if ',' in qcat or '/' in qcat:
                    qcat = qcat.replace(',', '/').split('/')
                else:
                    qcat = [qcat]
                for zcat in qcat:
                    tagname = zcat.lower()
                    while '  ' in tagname:
                        tagname = tagname.replace('  ', ' ')
                    tagname = tagname.strip()
                    if not tagname or tagname == ' ':
                        continue
                    if not Tag.objects.filter(name=tagname, feed=self):
                        cobj = Tag(name=tagname, feed=self)
                        cobj.save()
                    fcat.append(Tag.objects.get(name=tagname, feed=self))
        return fcat

    def _exists_story(self, story=None, story_content=None, existing_stories=None):
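        """Match a parsed entry against ``existing_stories``: first by
        guid/permalink within an 8-hour window of the published date, then by
        fuzzy title (Levenshtein) and content (SequenceMatcher) similarity.
        Returns (existing_story_or_None, story_has_changed)."""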
        story_in_system = None
        story_has_changed = False
        story_pub_date = story.get('published')
        story_published_now = story.get('published_now', False)
        start_date = story_pub_date - datetime.timedelta(hours=8)
        end_date = story_pub_date + datetime.timedelta(hours=8)

        for existing_story in existing_stories:
            content_ratio = 0
            # print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
            if story_published_now or \
               (story_pub_date > start_date and story_pub_date < end_date):
                if story.get('guid') and story.get('guid') == existing_story.story_guid:
                    story_in_system = existing_story
                elif story.get('link') and story.get('link') == existing_story.story_permalink:
                    story_in_system = existing_story

                # import pdb
                # pdb.set_trace()

                # Title distance + content distance, checking if story changed
                story_title_difference = levenshtein_distance(story.get('title'),
                                                              existing_story.story_title)
                seq = difflib.SequenceMatcher(None, story_content, existing_story.story_content)

                if (seq
                    and story_content
                    and existing_story.story_content
                    and seq.real_quick_ratio() > .9
                    and seq.quick_ratio() > .95):
                    content_ratio = seq.ratio()

                if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98:
                    story_in_system = existing_story
                    if story_title_difference > 0 or content_ratio < 1.0:
                        # print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
                        story_has_changed = True
                    break

                # More restrictive content distance, still no story match
                if not story_in_system and content_ratio > .98:
                    # print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
                    story_in_system = existing_story
                    story_has_changed = True
                    break

                if story_in_system:
                    if story_content != existing_story.story_content:
                        story_has_changed = True
                    break

        # if story_has_changed or not story_in_system:
        #     print 'New/updated story: %s' % (story),

        return story_in_system, story_has_changed

    def get_next_scheduled_update(self):
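        """Compute the fetch delay in minutes plus a random jitter bound.

        Illustrative arithmetic only: a feed with 60 stories last month
        (2/day), 2 subscribers, and a fast load time works out to roughly
        60*12 / 2**1.55 ~= 246 minutes, plus a 6*60 / 2**3 = 45 minute
        subscriber bonus, i.e. around 290 minutes before jitter."""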
        # Use stories per month to calculate next feed update
        updates_per_day = max(30, self.stories_last_month) / 30.0
        # 1 update per day = 12 hours
        # > 1 update per day:
        #   2 updates = 3 hours
        #   4 updates = 1 hour
        #   10 updates = 20 minutes
        updates_per_day_delay = 60 * 12 / (updates_per_day ** 1.55)

        # Lots of subscribers = lots of updates
        # 72 hours for 0 subscribers.
        # 6 hours for 1 subscriber.
        # 45 min for 2 subscribers.
        # 10 min for 3 subscribers.
        subscriber_bonus = 6 * 60 / max(.083, self.num_subscribers ** 3)

        slow_punishment = 0
        if 30 <= self.last_load_time < 60:
            slow_punishment = self.last_load_time
        elif 60 <= self.last_load_time < 100:
            slow_punishment = 4 * self.last_load_time
        elif self.last_load_time >= 100:
            slow_punishment = 12 * self.last_load_time

        total = int(updates_per_day_delay + subscriber_bonus + slow_punishment)
        random_factor = random.randint(0, total) / 4

        return total, random_factor

    def set_next_scheduled_update(self, lock=None):
        total, random_factor = self.get_next_scheduled_update()
        next_scheduled_update = datetime.datetime.now() + datetime.timedelta(
            minutes=total + random_factor)

        self.next_scheduled_update = next_scheduled_update

        self.save(lock=lock)

    def calculate_collocations_story_content(self,
                                             collocation_measures=TrigramAssocMeasures,
                                             collocation_finder=TrigramCollocationFinder):
        stories = Story.objects.filter(story_feed=self)
        story_content = ' '.join([s.story_content for s in stories if s.story_content])
        return self.calculate_collocations(story_content, collocation_measures, collocation_finder)

    def calculate_collocations_story_title(self,
                                           collocation_measures=BigramAssocMeasures,
                                           collocation_finder=BigramCollocationFinder):
        stories = Story.objects.filter(story_feed=self)
        story_titles = ' '.join([s.story_title for s in stories if s.story_title])
        return self.calculate_collocations(story_titles, collocation_measures, collocation_finder)

    def calculate_collocations(self, content,
                               collocation_measures=TrigramAssocMeasures,
                               collocation_finder=TrigramCollocationFinder):
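        """Strip entities and HTML tags from ``content``, tokenize it, and
        return the 10 highest-PMI collocations (trigrams by default, counting
        only phrases that appear at least 3 times) as joined strings."""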
        content = re.sub(r'’', '\'', content)
        content = re.sub(r'&amp;', '&', content)
        try:
            content = unicode(BeautifulStoneSoup(content,
                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        except ValueError, e:
            print "ValueError, ignoring: %s" % e
        content = re.sub(r'</?\w+\s+[^>]*>', '', content)
        content = re.split(r"[^A-Za-z-'&]+", content)

        finder = collocation_finder.from_words(content)
        finder.apply_freq_filter(3)
        best = finder.nbest(collocation_measures.pmi, 10)
        phrases = [' '.join(phrase) for phrase in best]

        return phrases

2009-06-16 03:08:55 +00:00
class Meta :
db_table = " feeds "
ordering = [ " feed_title " ]
2010-07-27 22:11:23 -04:00
2010-07-27 22:37:52 -04:00
# class FeedCollocations(models.Model):
# feed = models.ForeignKey(Feed)
# phrase = models.CharField(max_length=500)
2009-06-16 03:08:55 +00:00
class Tag(models.Model):
    feed = models.ForeignKey(Feed)
    name = models.CharField(max_length=255)

    def __unicode__(self):
        return '%s - %s' % (self.feed, self.name)

    def save(self, *args, **kwargs):
        super(Tag, self).save(*args, **kwargs)


class StoryAuthor(models.Model):
    feed = models.ForeignKey(Feed)
    author_name = models.CharField(max_length=255, null=True, blank=True)

    def __unicode__(self):
        return '%s - %s' % (self.feed, self.author_name)


class FeedPage(models.Model):
    feed = models.OneToOneField(Feed, related_name="feed_page")
    page_data = StoryField(null=True, blank=True)


class FeedXML(models.Model):
    feed = models.OneToOneField(Feed, related_name="feed_xml")
    rss_xml = StoryField(null=True, blank=True)


class Story(models.Model):
    '''A feed item'''
    story_feed = models.ForeignKey(Feed, related_name="stories")
    story_date = models.DateTimeField()
    story_title = models.CharField(max_length=255)
    story_content = StoryField(null=True, blank=True)
    story_original_content = StoryField(null=True, blank=True)
    story_content_type = models.CharField(max_length=255, null=True,
                                          blank=True)
    story_author = models.ForeignKey(StoryAuthor)
    story_author_name = models.CharField(max_length=500, null=True, blank=True)
    story_permalink = models.CharField(max_length=1000)
    story_guid = models.CharField(max_length=1000)
    story_guid_hash = models.CharField(max_length=40)
    story_past_trim_date = models.BooleanField(default=False)
    story_tags = models.CharField(max_length=2000, null=True, blank=True)
    tags = models.ManyToManyField('Tag')

    def __unicode__(self):
        return self.story_title

    class Meta:
        verbose_name_plural = "stories"
        verbose_name = "story"
        db_table = "stories"
        ordering = ["-story_date"]
        unique_together = (("story_feed", "story_guid_hash"),)

    def save(self, *args, **kwargs):
        if not self.story_guid_hash and self.story_guid:
            self.story_guid_hash = hashlib.md5(self.story_guid).hexdigest()
        if len(self.story_title) > 255:
            self.story_title = self.story_title[:255]
        super(Story, self).save(*args, **kwargs)


class FeedUpdateHistory(models.Model):
    fetch_date = models.DateTimeField(default=datetime.datetime.now)
    number_of_feeds = models.IntegerField()
    seconds_taken = models.IntegerField()
    average_per_feed = models.DecimalField(decimal_places=1, max_digits=4)

    def __unicode__(self):
        return "[%s] %s feeds: %s seconds" % (
            self.fetch_date.strftime('%F %d'),
            self.number_of_feeds,
            self.seconds_taken,
        )

    def save(self, *args, **kwargs):
        self.average_per_feed = str(self.seconds_taken / float(max(1.0, self.number_of_feeds)))
        super(FeedUpdateHistory, self).save(*args, **kwargs)


class FeedFetchHistory(models.Model):
    feed = models.ForeignKey(Feed, related_name='feed_fetch_history')
    status_code = models.CharField(max_length=10, null=True, blank=True)
    message = models.CharField(max_length=255, null=True, blank=True)
    exception = models.TextField(null=True, blank=True)
    fetch_date = models.DateTimeField(default=datetime.datetime.now)

    def __unicode__(self):
        return "[%s] %s (%s): %s %s: %s" % (
            self.feed.id,
            self.feed,
            self.fetch_date,
            self.status_code,
            self.message,
            self.exception[:50]
        )


class PageFetchHistory(models.Model):
    feed = models.ForeignKey(Feed, related_name='page_fetch_history')
    status_code = models.CharField(max_length=10, null=True, blank=True)
    message = models.CharField(max_length=255, null=True, blank=True)
    exception = models.TextField(null=True, blank=True)
    fetch_date = models.DateTimeField(default=datetime.datetime.now)

    def __unicode__(self):
        return "[%s] %s (%s): %s %s: %s" % (
            self.feed.id,
            self.feed,
            self.fetch_date,
            self.status_code,
            self.message,
            self.exception[:50]
        )


class StoriesPerMonth(models.Model):
    feed = models.ForeignKey(Feed, related_name='stories_per_month')
    year = models.IntegerField()
    month = models.IntegerField()
    story_count = models.IntegerField()
    beginning_of_month = models.DateTimeField(default=datetime.datetime.now)

    @classmethod
    def past_year(cls, feed):
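        """Return (month_counts, average_per_month) for the trailing year,
        ordered oldest to newest."""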
        year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
        story_counts = StoriesPerMonth.objects.filter(
            feed=feed,
            beginning_of_month__gte=year_ago
        ).order_by('beginning_of_month')
        month_counts = [m.story_count for m in story_counts]
        average_per_month = sum(month_counts) / max(len(month_counts), 1)
        return month_counts, average_per_month

    @classmethod
    def recount_feed(cls, feed):
        d = defaultdict(int)
        stories = Story.objects.filter(story_feed=feed).extra(
            select={'year': "EXTRACT(year FROM story_date)",
                    'month': "EXTRACT(month from story_date)"}
        ).values('year', 'month')
        for story in stories:
            pass