2010-05-20 15:13:25 -04:00
|
|
|
import time
|
|
|
|
import settings
|
|
|
|
import difflib
|
|
|
|
import datetime
|
|
|
|
import hashlib
|
2010-06-24 15:27:25 -04:00
|
|
|
import random
|
2010-07-27 22:11:23 -04:00
|
|
|
import re
|
2010-08-04 18:30:51 -04:00
|
|
|
from collections import defaultdict
|
2010-07-27 22:11:23 -04:00
|
|
|
from BeautifulSoup import BeautifulStoneSoup
|
2010-07-27 22:27:32 -04:00
|
|
|
from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
|
2009-06-16 03:08:55 +00:00
|
|
|
from django.db import models
|
2009-09-08 00:13:49 +00:00
|
|
|
from django.db import IntegrityError
|
2009-06-16 03:08:55 +00:00
|
|
|
from django.core.cache import cache
|
2010-05-20 15:13:25 -04:00
|
|
|
from utils import json
|
|
|
|
from utils.feed_functions import levenshtein_distance
|
2009-07-28 02:27:27 +00:00
|
|
|
from utils.story_functions import format_story_link_date__short
|
|
|
|
from utils.story_functions import format_story_link_date__long
|
2009-12-18 20:47:44 +00:00
|
|
|
from utils.story_functions import pre_process_story
|
2010-07-01 15:16:33 -04:00
|
|
|
from utils.compressed_textfield import CompressedTextField, StoryField
|
2009-08-29 19:34:42 +00:00
|
|
|
from utils.diff import HTMLDiff
|
2009-06-16 03:08:55 +00:00
|
|
|
|
# User-Agent header sent on every feed/page fetch.
USER_AGENT = 'NewsBlur v1.0 - newsblur.com'

# Outcome codes used by Feed.add_update_stories() to tally what happened
# to each parsed entry: brand new, changed, unchanged, or failed to save.
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
2009-06-16 03:08:55 +00:00
|
|
|
class Feed(models.Model):
    """A single RSS/Atom feed tracked by NewsBlur, plus fetch bookkeeping."""
    # Canonical fetch URL; unique so the same feed is never stored twice.
    # NOTE(review): verify_exists=True makes Django hit the URL on validation.
    feed_address = models.URLField(max_length=255, verify_exists=True, unique=True)
    # Human-facing site link advertised by the feed.
    feed_link = models.URLField(max_length=1000, default="", blank=True, null=True)
    feed_title = models.CharField(max_length=255, default="", blank=True, null=True)
    # Truncated to 1024 chars in save().
    feed_tagline = models.CharField(max_length=1024, default="", blank=True, null=True)
    # Inactive feeds are skipped by the fetchers; cleared by
    # count_errors_in_history() when every recent fetch failed.
    active = models.BooleanField(default=True)
    # Denormalized count, refreshed by count_subscribers().
    num_subscribers = models.IntegerField(default=0)
    last_update = models.DateTimeField(auto_now=True)
    fetched_once = models.BooleanField(default=False)
    has_exception = models.BooleanField(default=False)
    # Minimum minutes between scheduled fetches.
    min_to_decay = models.IntegerField(default=15)
    days_to_trim = models.IntegerField(default=90)
    creation = models.DateField(auto_now_add=True)
    # HTTP conditional-GET state from the last fetch.
    etag = models.CharField(max_length=50, blank=True, null=True)
    last_modified = models.DateTimeField(null=True, blank=True)
    # Story-volume stats maintained by count_stories()/recount_feed().
    stories_last_month = models.IntegerField(default=0)
    average_stories_per_month = models.IntegerField(default=0)
    # JSON list of (YYYY-M, count) pairs — see recount_feed().
    story_count_history = models.TextField(blank=True, null=True)
    next_scheduled_update = models.DateTimeField(default=datetime.datetime.now)
    # Seconds the last fetch took; feeds that load slowly get fetched less often.
    last_load_time = models.IntegerField(default=0)
    # JSON-encoded (name, count) pairs — see save_popular_tags()/_authors().
    popular_tags = models.CharField(max_length=1024, blank=True, null=True)
    popular_authors = models.CharField(max_length=2048, blank=True, null=True)
|
2009-06-16 03:08:55 +00:00
|
|
|
|
|
|
|
|
|
|
|
    def __unicode__(self):
        # Display the feed by its title (may be None/empty for never-fetched feeds).
        return self.feed_title
|
2010-07-19 14:29:27 -04:00
|
|
|
|
2010-07-25 23:13:27 -04:00
|
|
|
def save(self, lock=None, *args, **kwargs):
|
2010-07-20 22:57:18 -04:00
|
|
|
if self.feed_tagline and len(self.feed_tagline) > 1024:
|
2010-07-19 14:29:27 -04:00
|
|
|
self.feed_tagline = self.feed_tagline[:1024]
|
|
|
|
|
2010-07-25 23:13:27 -04:00
|
|
|
if lock:
|
|
|
|
lock.acquire()
|
|
|
|
try:
|
|
|
|
super(Feed, self).save(*args, **kwargs)
|
|
|
|
finally:
|
|
|
|
lock.release()
|
|
|
|
else:
|
|
|
|
super(Feed, self).save(*args, **kwargs)
|
2010-07-19 14:29:27 -04:00
|
|
|
|
2010-07-08 11:37:54 -04:00
|
|
|
def save_feed_history(self, status_code, message, exception=None):
|
2010-07-06 13:21:12 -04:00
|
|
|
FeedFetchHistory.objects.create(feed=self,
|
|
|
|
status_code=status_code,
|
|
|
|
message=message,
|
|
|
|
exception=exception)
|
2010-07-08 11:37:54 -04:00
|
|
|
old_fetch_histories = self.feed_fetch_history.all()[10:]
|
|
|
|
for history in old_fetch_histories:
|
|
|
|
history.delete()
|
2010-08-18 21:54:33 -04:00
|
|
|
|
2010-08-18 20:35:45 -04:00
|
|
|
if status_code >= 400:
|
2010-08-18 21:54:33 -04:00
|
|
|
fetch_history = self.feed_fetch_history.all().values('status_code')
|
|
|
|
self.count_errors_in_history(fetch_history)
|
2010-08-18 20:35:45 -04:00
|
|
|
|
2010-07-08 11:37:54 -04:00
|
|
|
def save_page_history(self, status_code, message, exception=None):
|
|
|
|
PageFetchHistory.objects.create(feed=self,
|
|
|
|
status_code=status_code,
|
|
|
|
message=message,
|
|
|
|
exception=exception)
|
|
|
|
old_fetch_histories = self.page_fetch_history.all()[10:]
|
2010-07-06 13:21:12 -04:00
|
|
|
for history in old_fetch_histories:
|
|
|
|
history.delete()
|
2010-08-18 20:35:45 -04:00
|
|
|
|
|
|
|
if status_code >= 400:
|
2010-08-18 21:54:33 -04:00
|
|
|
fetch_history = self.page_fetch_history.all().values('status_code')
|
|
|
|
self.count_errors_in_history(fetch_history)
|
2010-07-06 13:21:12 -04:00
|
|
|
|
2010-08-18 21:54:33 -04:00
|
|
|
def count_errors_in_history(self, fetch_history):
|
2010-08-18 22:06:56 -04:00
|
|
|
non_errors = [h for h in fetch_history if int(h['status_code']) < 400]
|
|
|
|
errors = [h for h in fetch_history if int(h['status_code']) >= 400]
|
2010-08-18 21:54:33 -04:00
|
|
|
|
2010-08-18 20:35:45 -04:00
|
|
|
if len(non_errors) == 0 and len(errors) >= 1:
|
2010-08-18 21:54:33 -04:00
|
|
|
self.has_exception = True
|
|
|
|
self.active = False
|
2010-08-18 20:35:45 -04:00
|
|
|
self.save()
|
|
|
|
|
2010-07-05 22:53:49 -04:00
|
|
|
    def count_subscribers(self, verbose=False, lock=None):
        """Refresh the denormalized subscriber count from UserSubscription rows."""
        # Imported locally to avoid a circular import with apps.reader.
        from apps.reader.models import UserSubscription
        subs = UserSubscription.objects.filter(feed=self)
        self.num_subscribers = subs.count()

        self.save(lock=lock)

        if verbose:
            if self.num_subscribers <= 1:
                # Progress dot for unpopular feeds (trailing comma = no newline).
                print '.',
            else:
                # Bar of dashes scaled (capped at 20) to the subscriber count.
                print "\n %s> %s subscriber%s: %s" % (
                    '-' * min(self.num_subscribers, 20),
                    self.num_subscribers,
                    '' if self.num_subscribers == 1 else 's',
                    self.feed_title,
                ),
|
2010-08-13 10:43:48 -04:00
|
|
|
|
2010-07-25 23:13:27 -04:00
|
|
|
def count_stories(self, verbose=False, lock=None):
|
2010-07-02 15:49:08 -04:00
|
|
|
month_ago = datetime.datetime.now() - datetime.timedelta(days=30)
|
2010-07-25 23:13:27 -04:00
|
|
|
stories_last_month = Story.objects.filter(story_feed=self, story_date__gte=month_ago).count()
|
|
|
|
self.stories_last_month = stories_last_month
|
|
|
|
|
2010-08-13 10:43:48 -04:00
|
|
|
self.recount_feed(lock)
|
2010-07-25 23:13:27 -04:00
|
|
|
|
|
|
|
self.save(lock=lock)
|
2010-07-05 22:53:49 -04:00
|
|
|
|
2010-07-02 15:49:08 -04:00
|
|
|
if verbose:
|
2010-07-25 23:13:27 -04:00
|
|
|
print " ---> %s [%s]: %s stories" % (self.feed_title, self.pk, self.stories_last_month)
|
2010-08-13 10:43:48 -04:00
|
|
|
|
|
|
|
def recount_feed(self, lock=None):
|
|
|
|
"""
|
|
|
|
Fills in missing months between earlier occurances and now.
|
|
|
|
|
|
|
|
Save format: [('YYYY-MM, #), ...]
|
|
|
|
Example output: [(2010-12, 123), (2011-01, 146)]
|
|
|
|
"""
|
|
|
|
d = defaultdict(int)
|
|
|
|
now = datetime.datetime.now()
|
|
|
|
min_year = now.year
|
|
|
|
total = 0
|
|
|
|
month_count = 0
|
2010-08-13 11:03:07 -04:00
|
|
|
current_counts = self.story_count_history and json.decode(self.story_count_history)
|
2010-08-13 10:43:48 -04:00
|
|
|
|
|
|
|
if not current_counts:
|
|
|
|
current_counts = []
|
|
|
|
|
|
|
|
# Count stories, aggregate by year and month
|
|
|
|
stories = Story.objects.filter(story_feed=self).extra(select={
|
|
|
|
'year': "EXTRACT(year FROM story_date)",
|
|
|
|
'month': "EXTRACT(month from story_date)"
|
|
|
|
}).values('year', 'month')
|
|
|
|
for story in stories:
|
|
|
|
year = int(story['year'])
|
|
|
|
d['%s-%s' % (year, int(story['month']))] += 1
|
|
|
|
if year < min_year:
|
|
|
|
min_year = year
|
|
|
|
|
|
|
|
# Add on to existing months, always amending up, never down. (Current month
|
|
|
|
# is guaranteed to be accurate, since trim_feeds won't delete it until after
|
|
|
|
# a month. Hacker News can have 1,000+ and still be counted.)
|
|
|
|
for current_month, current_count in current_counts:
|
|
|
|
if current_month not in d or d[current_month] < current_count:
|
|
|
|
d[current_month] = current_count
|
|
|
|
year = re.findall(r"(\d{4})-\d{1,2}", current_month)[0]
|
|
|
|
if year < min_year:
|
|
|
|
min_year = year
|
|
|
|
|
|
|
|
# Assemble a list with 0's filled in for missing months,
|
|
|
|
# trimming left and right 0's.
|
|
|
|
months = []
|
|
|
|
start = False
|
|
|
|
for year in range(min_year, now.year+1):
|
|
|
|
for month in range(1, 12+1):
|
|
|
|
if datetime.datetime(year, month, 1) < now:
|
|
|
|
key = '%s-%s' % (year, month)
|
|
|
|
if d.get(key) or start:
|
|
|
|
start = True
|
|
|
|
months.append((key, d.get(key, 0)))
|
|
|
|
total += d.get(key, 0)
|
|
|
|
month_count += 1
|
|
|
|
|
2010-08-13 11:03:07 -04:00
|
|
|
self.story_count_history = json.encode(months)
|
2010-08-13 10:43:48 -04:00
|
|
|
if not total:
|
|
|
|
self.average_stories_per_month = 0
|
|
|
|
else:
|
|
|
|
self.average_stories_per_month = total / month_count
|
|
|
|
self.save(lock)
|
|
|
|
|
|
|
|
|
2009-06-16 03:08:55 +00:00
|
|
|
def last_updated(self):
|
|
|
|
return time.time() - time.mktime(self.last_update.timetuple())
|
|
|
|
|
|
|
|
def new_stories_since_date(self, date):
|
2010-01-21 13:12:29 -05:00
|
|
|
stories = Story.objects.filter(story_date__gte=date,
|
|
|
|
story_feed=self)
|
|
|
|
return stories
|
2009-06-16 03:08:55 +00:00
|
|
|
|
|
|
|
    def add_feed(self, feed_address, feed_link, feed_title):
        # Debug stub: only dumps its arguments — no feed is actually created.
        print locals()
|
|
|
|
|
2010-04-09 16:37:19 -04:00
|
|
|
def update(self, force=False, feed=None, single_threaded=False):
|
2009-08-29 19:34:42 +00:00
|
|
|
from utils import feed_fetcher
|
2009-08-14 01:54:22 +00:00
|
|
|
try:
|
2010-06-08 11:19:07 -04:00
|
|
|
self.feed_address = self.feed_address % {'NEWSBLUR_DIR': settings.NEWSBLUR_DIR}
|
2009-08-14 01:54:22 +00:00
|
|
|
except:
|
|
|
|
pass
|
2009-08-01 04:26:57 +00:00
|
|
|
|
2009-08-29 19:34:42 +00:00
|
|
|
options = {
|
2009-11-15 18:57:53 +00:00
|
|
|
'verbose': 2,
|
2010-04-09 16:37:19 -04:00
|
|
|
'timeout': 10,
|
|
|
|
'single_threaded': single_threaded,
|
2010-04-29 11:18:49 -04:00
|
|
|
'force': force,
|
2009-08-29 19:34:42 +00:00
|
|
|
}
|
|
|
|
disp = feed_fetcher.Dispatcher(options, 1)
|
2009-11-15 18:57:53 +00:00
|
|
|
disp.add_jobs([[self]])
|
|
|
|
disp.run_jobs()
|
2009-08-29 19:34:42 +00:00
|
|
|
disp.poll()
|
2009-06-16 03:08:55 +00:00
|
|
|
|
|
|
|
return
|
|
|
|
|
2009-08-01 04:26:57 +00:00
|
|
|
    def add_update_stories(self, stories, existing_stories):
        """Insert new parsed stories and update changed ones.

        :param stories: parsed entries from the feed parser.
        :param existing_stories: recent Story rows used for dedup matching.
        :returns: dict mapping ENTRY_NEW/UPDATED/SAME/ERR to counts.
        """
        ret_values = {
            ENTRY_NEW:0,
            ENTRY_UPDATED:0,
            ENTRY_SAME:0,
            ENTRY_ERR:0
        }

        for story in stories:
            story = pre_process_story(story)
            # Entries without a title are skipped entirely.
            if story.get('title'):
                story_contents = story.get('content')
                story_tags = self.get_tags(story)
                # Prefer full content; fall back to the summary element.
                if story_contents is not None:
                    story_content = story_contents[0]['value']
                else:
                    story_content = story.get('summary')
                # Fuzzy-match against recent stories to detect edits vs. new posts.
                existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
                story_author, _ = self._save_story_author(story.get('author'))
                if existing_story is None:
                    # Brand-new story: insert.
                    # pub_date = datetime.datetime.timetuple(story.get('published'))
                    # logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
                    s = Story(story_feed = self,
                              story_date = story.get('published'),
                              story_title = story.get('title'),
                              story_content = story_content,
                              story_author = story_author,
                              story_author_name = story.get('author'),
                              story_permalink = story.get('link'),
                              story_guid = story.get('guid') or story.get('id') or story.get('link'),
                              story_tags = self._shorten_story_tags(story_tags)
                    )
                    try:
                        s.save(force_insert=True)
                        ret_values[ENTRY_NEW] += 1
                        # Signal readers that this feed has fresh content.
                        cache.set('updated_feed:%s' % self.id, 1)
                    except IntegrityError:
                        # Duplicate guid raced in; count it as an error.
                        ret_values[ENTRY_ERR] += 1
                        # print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
                    [s.tags.add(tcat) for tcat in story_tags]
                elif existing_story and story_has_changed:
                    # update story
                    # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))

                    # Keep the first-ever version of the content so the diff
                    # is always computed against the original text.
                    original_content = None
                    if existing_story.story_original_content:
                        original_content = existing_story.story_original_content
                    else:
                        original_content = existing_story.story_content
                    # print 'Type: %s %s' % (type(original_content), type(story_content))
                    # Only diff substantive content; trivial content keeps the original.
                    if len(story_content) > 10:
                        diff = HTMLDiff(unicode(original_content), story_content)
                        story_content_diff = diff.getDiff()
                    else:
                        story_content_diff = original_content
                    # logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
                    # logging.debug("\t\tDiff content: %s" % diff.getDiff())
                    if existing_story.story_title != story.get('title'):
                        # logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
                        pass

                    # Re-save over the existing row (same primary key).
                    s = Story(id = existing_story.id,
                              story_feed = self,
                              story_date = story.get('published'),
                              story_title = story.get('title'),
                              story_content = story_content_diff,
                              story_original_content = original_content,
                              story_author = story_author,
                              story_author_name = story.get('author'),
                              story_permalink = story.get('link'),
                              story_guid = story.get('guid') or story.get('id') or story.get('link'),
                              story_tags = self._shorten_story_tags(story_tags)
                    )
                    s.tags.clear()
                    [s.tags.add(tcat) for tcat in story_tags]
                    try:
                        s.save(force_update=True)
                        ret_values[ENTRY_UPDATED] += 1
                        cache.set('updated_feed:%s' % self.id, 1)
                    except IntegrityError:
                        ret_values[ENTRY_ERR] += 1
                        # print('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
                else:
                    ret_values[ENTRY_SAME] += 1
                    # logging.debug("Unchanged story: %s " % story.get('title'))

        return ret_values
|
2009-06-16 03:08:55 +00:00
|
|
|
|
2009-12-18 20:47:44 +00:00
|
|
|
def _save_story_author(self, author):
|
|
|
|
author, created = StoryAuthor.objects.get_or_create(feed=self, author_name=author)
|
|
|
|
return author, created
|
2010-06-29 09:01:30 -04:00
|
|
|
|
2010-07-05 22:53:49 -04:00
|
|
|
def save_popular_tags(self, feed_tags=None, lock=None):
|
2010-07-01 15:16:33 -04:00
|
|
|
if not feed_tags:
|
|
|
|
from apps.rss_feeds.models import Tag
|
|
|
|
from django.db.models.aggregates import Count
|
|
|
|
all_tags = Tag.objects.filter(feed=self)\
|
|
|
|
.annotate(stories_count=Count('story'))\
|
|
|
|
.order_by('-stories_count')[:20]
|
|
|
|
feed_tags = [(tag.name, tag.stories_count) for tag in all_tags if tag.stories_count > 1]
|
|
|
|
popular_tags = json.encode(feed_tags)
|
|
|
|
if len(popular_tags) < 1024:
|
|
|
|
self.popular_tags = popular_tags
|
2010-07-25 23:13:27 -04:00
|
|
|
self.save(lock=lock)
|
2010-07-01 15:16:33 -04:00
|
|
|
return
|
|
|
|
|
2010-07-01 15:20:38 -04:00
|
|
|
tags_list = json.decode(feed_tags) if feed_tags else []
|
2010-07-01 15:16:33 -04:00
|
|
|
if len(tags_list) > 1:
|
|
|
|
self.save_popular_tags(tags_list[:-1])
|
|
|
|
|
2010-07-05 22:53:49 -04:00
|
|
|
def save_popular_authors(self, feed_authors=None, lock=None):
|
2010-07-01 15:16:33 -04:00
|
|
|
if not feed_authors:
|
|
|
|
from django.db.models.aggregates import Count
|
|
|
|
all_authors = StoryAuthor.objects.filter(feed=self, author_name__isnull=False)\
|
|
|
|
.annotate(stories_count=Count('story'))\
|
|
|
|
.order_by('-stories_count')[:20]
|
|
|
|
feed_authors = [(author.author_name, author.stories_count) for author in all_authors\
|
|
|
|
if author.stories_count > 1]
|
|
|
|
popular_authors = json.encode(feed_authors)
|
|
|
|
if len(popular_authors) < 1024:
|
|
|
|
self.popular_authors = popular_authors
|
2010-07-25 23:13:27 -04:00
|
|
|
self.save(lock=lock)
|
2010-07-01 15:16:33 -04:00
|
|
|
return
|
|
|
|
|
2010-07-01 15:20:38 -04:00
|
|
|
authors_list = json.decode(feed_authors) if feed_authors else []
|
2010-07-01 15:16:33 -04:00
|
|
|
if len(authors_list) > 1:
|
|
|
|
self.save_popular_authors(authors_list[:-1])
|
2010-07-05 22:53:49 -04:00
|
|
|
|
2010-06-29 09:01:30 -04:00
|
|
|
def _shorten_story_tags(self, story_tags):
|
|
|
|
encoded_tags = json.encode([t.name for t in story_tags])
|
|
|
|
if len(encoded_tags) < 2000:
|
|
|
|
return encoded_tags
|
|
|
|
|
2010-07-01 15:16:33 -04:00
|
|
|
if len(story_tags) > 1:
|
|
|
|
return self._shorten_story_tags(story_tags[:-1])
|
2009-12-18 20:47:44 +00:00
|
|
|
|
2009-08-01 04:26:57 +00:00
|
|
|
    def trim_feed(self):
        """Delete stories (and their read-state rows) beyond the newest 1000
        that are more than a month old."""
        # Imported locally to avoid a circular import with apps.reader.
        from apps.reader.models import UserStory
        stories_deleted_count = 0
        user_stories_count = 0
        month_ago = datetime.datetime.now() - datetime.timedelta(days=30)
        stories = Story.objects.filter(
            story_feed=self,
            story_date__lte=month_ago
        ).order_by('-story_date')
        print 'Found %s stories in %s. Trimming...' % (stories.count(), self)
        if stories.count() > 1000:
            # The 1001st-newest story marks the cutoff date.
            old_story = stories[1000]
            # Remove users' read-state rows first so no orphans remain.
            user_stories = UserStory.objects.filter(feed=self,
                                                    read_date__lte=old_story.story_date)
            user_stories_count = user_stories.count()
            user_stories.delete()
            old_stories = Story.objects.filter(story_feed=self,
                                               story_date__lte=old_story.story_date)
            stories_deleted_count = old_stories.count()
            old_stories.delete()

        if stories_deleted_count:
            print "Trimming %s stories from %s. %s user stories." % (
                stories_deleted_count,
                self,
                user_stories_count)
|
|
|
|
|
2010-06-24 15:10:15 -04:00
|
|
|
def get_stories(self, offset=0, limit=25, force=False):
|
|
|
|
if not force:
|
2010-05-11 12:27:39 -04:00
|
|
|
stories = cache.get('feed_stories:%s-%s-%s' % (self.id, offset, limit), [])
|
2010-06-24 15:10:15 -04:00
|
|
|
else:
|
|
|
|
stories = None
|
2010-01-21 13:12:29 -05:00
|
|
|
|
2010-06-24 15:10:15 -04:00
|
|
|
if not stories or force:
|
2010-07-08 11:37:54 -04:00
|
|
|
stories_db = Story.objects.filter(story_feed=self)[offset:offset+limit]
|
2010-01-21 13:12:29 -05:00
|
|
|
stories = self.format_stories(stories_db)
|
2010-06-24 15:10:15 -04:00
|
|
|
cache.set('feed_stories:%s-%s-%s' % (self.id, offset, limit), stories)
|
2009-07-28 02:27:27 +00:00
|
|
|
|
|
|
|
return stories
|
|
|
|
|
2010-01-21 13:12:29 -05:00
|
|
|
def format_stories(self, stories_db):
|
|
|
|
stories = []
|
2010-04-05 02:42:43 -04:00
|
|
|
# from django.db import connection
|
|
|
|
# print "Formatting Stories: %s" % stories_db.count()
|
2010-01-21 13:12:29 -05:00
|
|
|
for story_db in stories_db:
|
2010-02-17 03:22:45 -05:00
|
|
|
story = {}
|
2010-04-05 02:55:18 -04:00
|
|
|
# story_tags = story_db.tags.all()
|
2010-04-05 03:20:44 -04:00
|
|
|
story['story_tags'] = (story_db.story_tags and json.decode(story_db.story_tags)) or []
|
2010-02-17 03:22:45 -05:00
|
|
|
story['short_parsed_date'] = format_story_link_date__short(story_db.story_date)
|
|
|
|
story['long_parsed_date'] = format_story_link_date__long(story_db.story_date)
|
|
|
|
story['story_date'] = story_db.story_date
|
2010-07-08 11:37:54 -04:00
|
|
|
story['story_authors'] = story_db.story_author_name
|
2010-02-17 03:22:45 -05:00
|
|
|
story['story_title'] = story_db.story_title
|
|
|
|
story['story_content'] = story_db.story_content
|
|
|
|
story['story_permalink'] = story_db.story_permalink
|
2010-04-05 03:20:44 -04:00
|
|
|
story['story_feed_id'] = self.pk
|
2010-02-17 03:22:45 -05:00
|
|
|
story['id'] = story_db.id
|
|
|
|
|
2010-01-21 13:12:29 -05:00
|
|
|
stories.append(story)
|
|
|
|
|
|
|
|
return stories
|
|
|
|
|
2010-01-04 22:26:53 +00:00
|
|
|
    def get_tags(self, entry):
        """Extract, normalize, and persist tags from a parsed feed entry.

        Returns the list of Tag model instances for this feed.
        """
        fcat = []
        if entry.has_key('tags'):
            for tcat in entry.tags:
                # Prefer the human label; fall back to the machine term.
                if tcat.label:
                    term = tcat.label
                elif tcat.term:
                    term = tcat.term
                else:
                    continue
                qcat = term.strip()
                # Split compound tags like "a, b" or "a/b" into separate tags.
                if ',' in qcat or '/' in qcat:
                    qcat = qcat.replace(',', '/').split('/')
                else:
                    qcat = [qcat]
                for zcat in qcat:
                    tagname = zcat.lower()
                    # Collapse runs of spaces to a single space.
                    while '  ' in tagname:
                        tagname = tagname.replace('  ', ' ')
                    tagname = tagname.strip()
                    if not tagname or tagname == ' ':
                        continue
                    # Create the tag row on first sighting for this feed.
                    if not Tag.objects.filter(name=tagname, feed=self):
                        cobj = Tag(name=tagname, feed=self)
                        cobj.save()
                    fcat.append(Tag.objects.get(name=tagname, feed=self))
        return fcat
|
|
|
|
|
2009-08-01 04:26:57 +00:00
|
|
|
    def _exists_story(self, story=None, story_content=None, existing_stories=None):
        """Fuzzy-match a parsed entry against recent stories.

        Matches by guid/permalink within an 8-hour window around the entry's
        publish date, then by title/content similarity. Returns
        (matching Story or None, whether its content/title has changed).
        """
        story_in_system = None
        story_has_changed = False
        story_pub_date = story.get('published')
        story_published_now = story.get('published_now', False)
        # Accept date skew of up to 8 hours either way.
        start_date = story_pub_date - datetime.timedelta(hours=8)
        end_date = story_pub_date + datetime.timedelta(hours=8)

        for existing_story in existing_stories:
            content_ratio = 0
            # print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
            if story_published_now or\
               (story_pub_date > start_date and story_pub_date < end_date):
                # Exact identifier matches first: guid, then permalink.
                if story.get('guid') and story.get('guid') == existing_story.story_guid:
                    story_in_system = existing_story
                elif story.get('link') and story.get('link') == existing_story.story_permalink:
                    story_in_system = existing_story

                # import pdb
                # pdb.set_trace()

                # Title distance + content distance, checking if story changed
                story_title_difference = levenshtein_distance(story.get('title'),
                                                              existing_story.story_title)
                seq = difflib.SequenceMatcher(None, story_content, existing_story.story_content)

                # Cheap ratio bounds first; only compute the exact ratio when
                # the quick estimates already look like a near-match.
                if (seq
                    and story_content
                    and existing_story.story_content
                    and seq.real_quick_ratio() > .9
                    and seq.quick_ratio() > .95):
                    content_ratio = seq.ratio()

                # Slightly different title but near-identical content: same story.
                if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98:
                    story_in_system = existing_story
                    if story_title_difference > 0 or content_ratio < 1.0:
                        # print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
                        story_has_changed = True
                        break

                # More restrictive content distance, still no story match
                if not story_in_system and content_ratio > .98:
                    # print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story.story_title, story_title_difference, content_ratio)
                    story_in_system = existing_story
                    story_has_changed = True
                    break

                if story_in_system:
                    if story_content != existing_story.story_content:
                        story_has_changed = True
                    break

        # if story_has_changed or not story_in_system:
        #     print 'New/updated story: %s' % (story),
        return story_in_system, story_has_changed
|
2010-07-25 23:13:27 -04:00
|
|
|
|
|
|
|
def get_next_scheduled_update(self):
|
2010-06-24 15:27:25 -04:00
|
|
|
# Use stories per month to calculate next feed update
|
2010-08-16 17:51:15 -04:00
|
|
|
updates_per_day = self.stories_last_month / 30.0
|
|
|
|
# 0 updates per day = 24 hours
|
|
|
|
# 1 update per day = 6 hours
|
2010-06-28 11:37:54 -04:00
|
|
|
# > 1 update per day:
|
2010-07-05 22:53:49 -04:00
|
|
|
# 2 updates = 3 hours
|
|
|
|
# 4 updates = 1 hour
|
|
|
|
# 10 updates = 20 minutes
|
2010-08-16 17:51:15 -04:00
|
|
|
updates_per_day_delay = 6 * 60 / max(.25, updates_per_day ** 1.55)
|
2010-07-02 15:49:08 -04:00
|
|
|
|
|
|
|
# Lots of subscribers = lots of updates
|
2010-08-16 17:51:15 -04:00
|
|
|
# 144 hours for 0 subscribers.
|
|
|
|
# 24 hours for 1 subscriber.
|
|
|
|
# 3 hours for 2 subscribers.
|
|
|
|
# ~53 min for 3 subscribers.
|
|
|
|
subscriber_bonus = 24 * 60 / max(.167, self.num_subscribers**3)
|
2010-06-27 21:44:35 -04:00
|
|
|
|
2010-06-28 11:37:54 -04:00
|
|
|
slow_punishment = 0
|
2010-06-24 15:27:25 -04:00
|
|
|
if 30 <= self.last_load_time < 60:
|
|
|
|
slow_punishment = self.last_load_time
|
|
|
|
elif 60 <= self.last_load_time < 100:
|
|
|
|
slow_punishment = 4 * self.last_load_time
|
2010-07-05 14:26:35 -04:00
|
|
|
elif self.last_load_time >= 100:
|
|
|
|
slow_punishment = 12 * self.last_load_time
|
2010-07-25 23:13:27 -04:00
|
|
|
|
|
|
|
total = int(updates_per_day_delay + subscriber_bonus + slow_punishment)
|
|
|
|
random_factor = random.randint(0, total) / 4
|
2010-07-05 22:53:49 -04:00
|
|
|
|
2010-07-26 22:21:58 -04:00
|
|
|
return total, random_factor
|
2010-07-25 23:13:27 -04:00
|
|
|
|
|
|
|
def set_next_scheduled_update(self, lock=None):
|
2010-07-26 22:21:58 -04:00
|
|
|
total, random_factor = self.get_next_scheduled_update()
|
|
|
|
|
|
|
|
next_scheduled_update = datetime.datetime.now() + datetime.timedelta(
|
|
|
|
minutes = total + random_factor)
|
|
|
|
|
2010-06-24 15:27:25 -04:00
|
|
|
self.next_scheduled_update = next_scheduled_update
|
2010-07-05 23:17:36 -04:00
|
|
|
|
2010-07-25 23:13:27 -04:00
|
|
|
self.save(lock=lock)
|
2010-07-27 23:29:04 -04:00
|
|
|
|
|
|
|
def calculate_collocations_story_content(self,
|
|
|
|
collocation_measures=TrigramAssocMeasures,
|
|
|
|
collocation_finder=TrigramCollocationFinder):
|
2010-07-27 22:11:23 -04:00
|
|
|
stories = Story.objects.filter(story_feed=self)
|
2010-07-27 22:53:30 -04:00
|
|
|
story_content = ' '.join([s.story_content for s in stories if s.story_content])
|
2010-07-27 23:29:04 -04:00
|
|
|
return self.calculate_collocations(story_content, collocation_measures, collocation_finder)
|
|
|
|
|
|
|
|
def calculate_collocations_story_title(self,
|
|
|
|
collocation_measures=BigramAssocMeasures,
|
|
|
|
collocation_finder=BigramCollocationFinder):
|
|
|
|
stories = Story.objects.filter(story_feed=self)
|
|
|
|
story_titles = ' '.join([s.story_title for s in stories if s.story_title])
|
|
|
|
return self.calculate_collocations(story_titles, collocation_measures, collocation_finder)
|
|
|
|
|
|
|
|
def calculate_collocations(self, content,
|
|
|
|
collocation_measures=TrigramAssocMeasures,
|
|
|
|
collocation_finder=TrigramCollocationFinder):
|
|
|
|
content = re.sub(r'’', '\'', content)
|
|
|
|
content = re.sub(r'&', '&', content)
|
|
|
|
try:
|
|
|
|
content = unicode(BeautifulStoneSoup(content,
|
|
|
|
convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
|
|
|
except ValueError, e:
|
|
|
|
print "ValueError, ignoring: %s" % e
|
|
|
|
content = re.sub(r'</?\w+\s+[^>]*>', '', content)
|
|
|
|
content = re.split(r"[^A-Za-z-'&]+", content)
|
2010-07-27 22:11:23 -04:00
|
|
|
|
2010-07-27 23:29:04 -04:00
|
|
|
finder = collocation_finder.from_words(content)
|
2010-07-27 22:11:23 -04:00
|
|
|
finder.apply_freq_filter(3)
|
2010-07-27 22:27:32 -04:00
|
|
|
best = finder.nbest(collocation_measures.pmi, 10)
|
2010-07-27 22:11:23 -04:00
|
|
|
phrases = [' '.join(phrase) for phrase in best]
|
|
|
|
|
2010-07-27 22:27:32 -04:00
|
|
|
return phrases
|
2010-07-27 22:11:23 -04:00
|
|
|
|
2009-06-16 03:08:55 +00:00
|
|
|
    class Meta:
        # Legacy table name; feeds are listed alphabetically by title.
        db_table="feeds"
        ordering=["feed_title"]
|
2010-07-27 22:11:23 -04:00
|
|
|
|
2010-07-27 22:37:52 -04:00
|
|
|
# class FeedCollocations(models.Model):
|
|
|
|
# feed = models.ForeignKey(Feed)
|
|
|
|
# phrase = models.CharField(max_length=500)
|
2009-06-16 03:08:55 +00:00
|
|
|
|
|
|
|
class Tag(models.Model):
    """A tag/category attached to stories of a single feed."""
    feed = models.ForeignKey(Feed)
    name = models.CharField(max_length=255)

    def __unicode__(self):
        return '%s - %s' % (self.feed, self.name)

    def save(self, *args, **kwargs):
        # BUGFIX: the old zero-argument signature rejected Django's standard
        # save options (force_insert=, using=, ...); forward them instead.
        super(Tag, self).save(*args, **kwargs)
|
|
|
|
|
2009-12-18 20:47:44 +00:00
|
|
|
class StoryAuthor(models.Model):
    """An author byline for a feed's stories, stored once per feed+name."""
    feed = models.ForeignKey(Feed)
    # Nullable: many feeds omit author metadata entirely.
    author_name = models.CharField(max_length=255, null=True, blank=True)

    def __unicode__(self):
        return '%s - %s' % (self.feed, self.author_name)
|
2010-05-20 15:13:25 -04:00
|
|
|
|
|
|
|
class FeedPage(models.Model):
    """Cached HTML of a feed's original website page (one row per feed)."""
    feed = models.OneToOneField(Feed, related_name="feed_page")
    # Compressed blob; see utils.compressed_textfield.StoryField.
    page_data = StoryField(null=True, blank=True)
|
|
|
|
|
|
|
|
class FeedXML(models.Model):
    """The raw RSS/Atom XML most recently fetched for a feed (one row per feed)."""
    feed = models.OneToOneField(Feed, related_name="feed_xml")
    # Compressed blob; see utils.compressed_textfield.StoryField.
    rss_xml = StoryField(null=True, blank=True)
|
|
|
|
|
2009-06-16 03:08:55 +00:00
|
|
|
class Story(models.Model):
    '''A feed item'''
    story_feed = models.ForeignKey(Feed, related_name="stories")
    story_date = models.DateTimeField()
    story_title = models.CharField(max_length=255)
    # Processed content vs. the original as fetched, both compressed blobs.
    story_content = StoryField(null=True, blank=True)
    story_original_content = StoryField(null=True, blank=True)
    story_content_type = models.CharField(max_length=255, null=True,
                                          blank=True)
    story_author = models.ForeignKey(StoryAuthor)
    story_author_name = models.CharField(max_length=500, null=True, blank=True)
    story_permalink = models.CharField(max_length=1000)
    story_guid = models.CharField(max_length=1000)
    # md5 hexdigest of story_guid; used for the per-feed uniqueness check.
    story_guid_hash = models.CharField(max_length=40)
    story_past_trim_date = models.BooleanField(default=False)
    story_tags = models.CharField(max_length=2000, null=True, blank=True)
    tags = models.ManyToManyField('Tag')

    def __unicode__(self):
        return self.story_title

    class Meta:
        verbose_name_plural = "stories"
        verbose_name = "story"
        db_table="stories"
        ordering=["-story_date"]
        unique_together = (("story_feed", "story_guid_hash"),)

    def save(self, *args, **kwargs):
        """Fill in the guid hash and clamp the title before saving."""
        if not self.story_guid_hash and self.story_guid:
            guid = self.story_guid
            # hashlib.md5 needs bytes: a unicode guid with non-ASCII chars
            # would raise UnicodeEncodeError under Py2's implicit ascii
            # encode, so encode explicitly as UTF-8.
            if isinstance(guid, unicode):
                guid = guid.encode('utf-8')
            self.story_guid_hash = hashlib.md5(guid).hexdigest()
        # Truncate over-long titles to the column width so the INSERT
        # doesn't fail; guard against a missing title (len(None) raises).
        if self.story_title and len(self.story_title) > 255:
            self.story_title = self.story_title[:255]
        super(Story, self).save(*args, **kwargs)
|
|
|
|
|
2010-04-23 21:19:19 -04:00
|
|
|
class FeedUpdateHistory(models.Model):
    """One row per feed-update batch: how many feeds were refreshed and
    how long the whole pass took."""
    fetch_date = models.DateTimeField(default=datetime.datetime.now)
    number_of_feeds = models.IntegerField()
    seconds_taken = models.IntegerField()
    average_per_feed = models.DecimalField(decimal_places=1, max_digits=4)

    def __unicode__(self):
        return "[%s] %s feeds: %s seconds" % (
            # '%F' is a glibc-only strftime directive and not portable in
            # Python; '%Y-%m-%d' is its documented equivalent.
            # NOTE(review): the trailing ' %d' repeats the day-of-month --
            # looks unintended, preserved as-is; confirm the desired format.
            self.fetch_date.strftime('%Y-%m-%d %d'),
            self.number_of_feeds,
            self.seconds_taken,
        )

    def save(self, *args, **kwargs):
        # Quantize to the column's precision (max_digits=4, 1 decimal);
        # str() of a raw float can overflow max_digits and fail the INSERT.
        self.average_per_feed = "%.1f" % (
            self.seconds_taken / float(max(1.0, self.number_of_feeds)))
        super(FeedUpdateHistory, self).save(*args, **kwargs)
|
2010-07-06 13:21:12 -04:00
|
|
|
|
|
|
|
class FeedFetchHistory(models.Model):
    """Log entry for one attempt to fetch a feed's RSS/XML."""
    feed = models.ForeignKey(Feed, related_name='feed_fetch_history')
    status_code = models.CharField(max_length=10, null=True, blank=True)
    message = models.CharField(max_length=255, null=True, blank=True)
    exception = models.TextField(null=True, blank=True)
    fetch_date = models.DateTimeField(default=datetime.datetime.now)

    def __unicode__(self):
        return "[%s] %s (%s): %s %s: %s" % (
            self.feed.id,
            self.feed,
            self.fetch_date,
            self.status_code,
            self.message,
            # exception is nullable; slicing None raised TypeError on
            # successful fetches.
            (self.exception or '')[:50]
        )
|
|
|
|
|
|
|
|
class PageFetchHistory(models.Model):
    """Log entry for one attempt to fetch a feed's website page."""
    feed = models.ForeignKey(Feed, related_name='page_fetch_history')
    status_code = models.CharField(max_length=10, null=True, blank=True)
    message = models.CharField(max_length=255, null=True, blank=True)
    exception = models.TextField(null=True, blank=True)
    fetch_date = models.DateTimeField(default=datetime.datetime.now)

    def __unicode__(self):
        return "[%s] %s (%s): %s %s: %s" % (
            self.feed.id,
            self.feed,
            self.fetch_date,
            self.status_code,
            self.message,
            # exception is nullable; slicing None raised TypeError on
            # successful fetches.
            (self.exception or '')[:50]
        )
|
2010-08-19 10:43:07 -04:00
|
|
|
|
|
|
|
class DuplicateFeed(models.Model):
    """Maps a feed address found to be a duplicate onto the canonical
    Feed it was merged into, so re-subscriptions resolve correctly."""
    duplicate_address = models.CharField(max_length=255, unique=True)
    feed = models.ForeignKey(Feed, related_name='duplicate_addresses')
|
|
|
|
|