diff --git a/apps/rss_feeds/management/commands/task_feeds.py b/apps/rss_feeds/management/commands/task_feeds.py index 15fcaa76c..fb5e758c2 100644 --- a/apps/rss_feeds/management/commands/task_feeds.py +++ b/apps/rss_feeds/management/commands/task_feeds.py @@ -46,4 +46,12 @@ class Command(BaseCommand): active_subscribers__gte=1, active=True ).order_by('?') + if feeds: Feed.task_feeds(feeds) + + feeds = Feed.objects.filter( + last_update__lte=day, + active_subscribers__gte=1, + active=False, + known_good=True + ).order_by('?') if feeds: Feed.task_feeds(feeds) \ No newline at end of file diff --git a/apps/rss_feeds/migrations/0053_known_good.py b/apps/rss_feeds/migrations/0053_known_good.py new file mode 100644 index 000000000..8c96f41bf --- /dev/null +++ b/apps/rss_feeds/migrations/0053_known_good.py @@ -0,0 +1,83 @@ +# encoding: utf-8 +import datetime +from south.db import db +from south.v2 import SchemaMigration +from django.db import models + +class Migration(SchemaMigration): + + def forwards(self, orm): + + # Adding field 'Feed.known_good' + db.add_column('feeds', 'known_good', self.gf('django.db.models.fields.BooleanField')(default=False, db_index=True), keep_default=False) + + + def backwards(self, orm): + + # Deleting field 'Feed.known_good' + db.delete_column('feeds', 'known_good') + + + models = { + 'rss_feeds.duplicatefeed': { + 'Meta': {'object_name': 'DuplicateFeed'}, + 'duplicate_address': ('django.db.models.fields.CharField', [], {'max_length': '255'}), + 'duplicate_feed_id': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True'}), + 'feed': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'duplicate_addresses'", 'to': "orm['rss_feeds.Feed']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}) + }, + 'rss_feeds.feed': { + 'Meta': {'ordering': "['feed_title']", 'object_name': 'Feed', 'db_table': "'feeds'"}, + 'active': ('django.db.models.fields.BooleanField', [], {'default': 'True', 
'db_index': 'True'}), + 'active_premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}), + 'active_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}), + 'average_stories_per_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'branch_from_feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']", 'null': 'True', 'blank': 'True'}), + 'creation': ('django.db.models.fields.DateField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'days_to_trim': ('django.db.models.fields.IntegerField', [], {'default': '90'}), + 'etag': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}), + 'exception_code': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'favicon_color': ('django.db.models.fields.CharField', [], {'max_length': '6', 'null': 'True', 'blank': 'True'}), + 'favicon_not_found': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'feed_address': ('django.db.models.fields.URLField', [], {'max_length': '255'}), + 'feed_address_locked': ('django.db.models.fields.NullBooleanField', [], {'default': 'False', 'null': 'True', 'blank': 'True'}), + 'feed_link': ('django.db.models.fields.URLField', [], {'default': "''", 'max_length': '1000', 'null': 'True', 'blank': 'True'}), + 'feed_link_locked': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'feed_title': ('django.db.models.fields.CharField', [], {'default': "'[Untitled]'", 'max_length': '255', 'null': 'True', 'blank': 'True'}), + 'fetched_once': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'has_feed_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}), + 'has_page': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'has_page_exception': ('django.db.models.fields.BooleanField', [], 
{'default': 'False', 'db_index': 'True'}), + 'hash_address_and_link': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '64', 'db_index': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'known_good': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}), + 'last_load_time': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'last_modified': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'last_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}), + 'min_to_decay': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'next_scheduled_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}), + 'num_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}), + 'premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}), + 'queued_date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}), + 'stories_last_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}) + }, + 'rss_feeds.feeddata': { + 'Meta': {'object_name': 'FeedData'}, + 'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'data'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}), + 'feed_classifier_counts': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'feed_tagline': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'popular_authors': ('django.db.models.fields.CharField', [], {'max_length': '2048', 'null': 'True', 'blank': 'True'}), + 'popular_tags': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}), + 'story_count_history': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}) + }, + 
'rss_feeds.feedloadtime': { + 'Meta': {'object_name': 'FeedLoadtime'}, + 'date_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}), + 'feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'loadtime': ('django.db.models.fields.FloatField', [], {}) + } + } + + complete_apps = ['rss_feeds'] diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index 11f3a3887..853416f89 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -50,6 +50,7 @@ class Feed(models.Model): branch_from_feed = models.ForeignKey('Feed', blank=True, null=True, db_index=True) last_update = models.DateTimeField(db_index=True) fetched_once = models.BooleanField(default=False) + known_good = models.BooleanField(default=False, db_index=True) has_feed_exception = models.BooleanField(default=False, db_index=True) has_page_exception = models.BooleanField(default=False, db_index=True) has_page = models.BooleanField(default=True) diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index aa81a6ead..32060dd40 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -1,8 +1,13 @@ -# from apps.rss_feeds.models import FeedXML +import time +import datetime +import traceback +import multiprocessing +import urllib2 +import xml.sax +import redis from django.core.cache import cache from django.conf import settings from django.db import IntegrityError -# from mongoengine.queryset import Q from apps.reader.models import UserSubscription, MUserStory from apps.rss_feeds.models import Feed, MStory from apps.rss_feeds.page_importer import PageImporter @@ -11,18 +16,10 @@ from utils import feedparser from utils.story_functions import pre_process_story from utils import log as logging from utils.feed_functions import timelimit, TimeoutError, mail_feed_error_to_admin, utf8encode -import time -import datetime -import 
traceback -import multiprocessing -import urllib2 -import xml.sax -import redis # Refresh feed code adapted from Feedjack. # http://feedjack.googlecode.com -SLOWFEED_WARNING = 10 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4) FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5) @@ -132,6 +129,8 @@ class ProcessFeed: if not self.feed.fetched_once: self.feed.has_feed_exception = True self.feed.fetched_once = True + self.feed.known_good = True + logging.debug(" ---> [%-30s] Feed is 302'ing, but it's not new. Refetching..." % (unicode(self.feed)[:30])) self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed.save() @@ -139,9 +138,9 @@ class ProcessFeed: return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: - logging.debug(" ---> [%-30s] HTTP Status code: %s.%s Checking address..." % (unicode(self.feed)[:30], self.fpf.status, ' Not' if self.feed.fetched_once else '')) + logging.debug(" ---> [%-30s] HTTP Status code: %s.%s Checking address..." % (unicode(self.feed)[:30], self.fpf.status, ' Not' if self.feed.known_good else '')) fixed_feed = None - if not self.feed.fetched_once: + if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") @@ -152,10 +151,10 @@ class ProcessFeed: return FEED_ERRHTTP, ret_values if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): - logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else '')) + logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." 
% (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.feed.known_good or self.fpf.entries else '')) if not self.fpf.entries: fixed_feed = None - if not self.feed.fetched_once: + if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception) @@ -168,7 +167,7 @@ class ProcessFeed: logging.debug(" ---> [%-30s] Feed has SAX/XML parsing issues. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else '')) if not self.fpf.entries: fixed_feed = None - if not self.feed.fetched_once: + if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception) @@ -306,9 +305,9 @@ class Dispatcher: feed = self.refresh_feed(feed_id) - if ret_entries.get(ENTRY_NEW) or self.options['force'] or not feed.fetched_once: - if not feed.fetched_once: - feed.fetched_once = True + if ret_entries.get(ENTRY_NEW) or self.options['force']: + if not feed.known_good: + feed.known_good = True feed.save() MUserStory.delete_old_stories(feed_id=feed.pk) try: