Feed update cascading. Feeds update relative to how often they have new content. This should save a LOT of processing power. Whew.

This commit is contained in:
Samuel Clay 2010-04-19 12:09:04 -04:00
parent 67cc20c272
commit 71b746a48d
3 changed files with 28 additions and 9 deletions

View file

@@ -8,6 +8,7 @@ import logging
import socket
import os
import math
import datetime
class Command(BaseCommand):
@@ -26,23 +27,21 @@ class Command(BaseCommand):
def handle(self, *args, **options):
if options['daemonize']:
daemonize()
feeds = Feed.objects.all().order_by('?')
socket.setdefaulttimeout(options['timeout'])
now = datetime.datetime.now()
feeds = Feed.objects.filter(next_scheduled_update__lte=now).order_by('?')
num_workers = min(len(feeds), options['workerthreads'])
if options['single_threaded']:
num_workers = 1
# settting socket timeout (default= 10 seconds)
socket.setdefaulttimeout(options['timeout'])
disp = feed_fetcher.Dispatcher(options, num_workers)
feeds_queue = []
for _ in range(num_workers):
feeds_queue.append([])
i = 0
for feed in feeds:
feeds_queue[i%num_workers].append(feed)

View file

@@ -18,6 +18,7 @@ from django.db.models import Q
import settings
import logging
import difflib
import datetime
from utils.diff import HTMLDiff
USER_AGENT = 'NewsBlur v1.0 - newsblur.com'
@@ -38,6 +39,8 @@ class Feed(models.Model):
etag = models.CharField(max_length=50, blank=True, null=True)
last_modified = models.DateTimeField(null=True, blank=True)
page_data = StoryField(null=True, blank=True)
stories_per_month = models.IntegerField(default=0)
next_scheduled_update = models.DateTimeField(default=datetime.datetime.now)
def __unicode__(self):

View file

@@ -15,6 +15,8 @@ import datetime
import traceback
import multiprocessing
import Queue
import datetime
import random
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
@@ -107,6 +109,19 @@ class ProcessFeed:
logging.debug(u'[%d] Processing %s' % (self.feed.id,
self.feed.feed_title))
# Count stories in past month to calculate next scheduled update
month_ago = datetime.datetime.now() - datetime.timedelta(days=30)
stories_count = Story.objects.filter(story_feed=self.feed, story_date__gte=month_ago).count()
stories_count = stories_count
self.feed.stories_per_month = stories_count
updates_per_day = max(30, stories_count) / 30.0 * 12
minutes_to_next_update = 60 * 24 / updates_per_day
random_factor = random.randint(0,int(minutes_to_next_update/4))
next_scheduled_update = datetime.datetime.now() + datetime.timedelta(
minutes=minutes_to_next_update+random_factor
)
self.feed.next_scheduled_update = next_scheduled_update
if hasattr(self.fpf, 'status'):
if self.options['verbose']:
logging.debug(u'[%d] HTTP status %d: %s' % (self.feed.id,
@@ -118,6 +133,7 @@ class ProcessFeed:
logging.debug('[%d] Feed has not changed since ' \
'last check: %s' % (self.feed.id,
self.feed.feed_address))
self.feed.save()
return FEED_SAME, ret_values
if self.fpf.status >= 400:
@@ -125,6 +141,7 @@ class ProcessFeed:
logging.error('[%d] !HTTP_ERROR! %d: %s' % (self.feed.id,
self.fpf.status,
self.feed.feed_address))
self.feed.save()
return FEED_ERRHTTP, ret_values
if hasattr(self.fpf, 'bozo') and self.fpf.bozo:
@@ -147,6 +164,7 @@ class ProcessFeed:
self.feed.feed_tagline = self.fpf.feed.get('tagline', self.feed.feed_tagline)
self.feed.feed_link = self.fpf.feed.get('link', self.feed.feed_link)
self.feed.last_update = datetime.datetime.now()
if False and self.options['verbose']:
logging.debug(u'[%d] Feed info for: %s\n' \
@@ -242,7 +260,6 @@ class Dispatcher:
identity = "X"
if current_process._identity:
identity = current_process._identity[0]
# print feed_queue
for feed in feed_queue:
# print "Process Feed: [%s] %s" % (current_process.name, feed)