Merge branch 'master' into social

* master:
  Adding build/ back to .gitignore.
  Updating .gitignore.
  Using dsa pem for retrieving rabbit update counts.
  Preserving feed fetch error histories for 2 weeks.
  Adding munin graph for known good feeds.
  Turning down verbosity on feed tasking.
  Adding known good feeds to munin to chart their progress.
  Fixing headers overflow in Chrome.
  Adding new feeds db column 'known_good', for forcing updates of feeds that were once good but are now throwing 500s.
  Adding blogging OT press story.
This commit is contained in:
Samuel Clay 2012-02-02 17:22:32 -08:00
commit b3fffab05a
10 changed files with 125 additions and 32 deletions

8
.gitignore vendored
View file

@ -1,9 +1,5 @@
logs/*.log
*.pycmedia/release/
*.pyc
media/*/*-compressed-*.*
media/css/*/*-compressed-*.*
media/release
static/*
local_settings.py
media/iphone/NewsBlur/build
@ -12,10 +8,7 @@ build/
.DS_Store
**/*.perspectivev*
data/
logs
mongo/
**/*.xcuserstate
media/iphone/NewsBlur.xcodeproj/project.xcworkspace/xcuserdata/conesus.xcuserdatad/UserInterfaceState.xcuserstate
UserInterfaceState.xcuserstate
UserInterfaceState\.xcuserstate
*.xcuserstate
@ -23,3 +16,4 @@ xcuserdata
.xcodeproj/ push.xcodeproj/project.pbxproj
*.mode1v3
*.pbxuser
media/maintenance.html

View file

@ -46,4 +46,12 @@ class Command(BaseCommand):
active_subscribers__gte=1,
active=True
).order_by('?')
if feeds: Feed.task_feeds(feeds)
feeds = Feed.objects.filter(
last_update__lte=day,
active_subscribers__gte=1,
active=False,
known_good=True
).order_by('?')
if feeds: Feed.task_feeds(feeds)

View file

@ -0,0 +1,83 @@
# encoding: utf-8
import datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models
class Migration(SchemaMigration):
    """South schema migration that adds the 'known_good' column to 'feeds'.

    The column backs the ``Feed.known_good`` boolean field and is created
    with an index (``db_index=True``) and a default of ``False``.
    """

    def forwards(self, orm):
        """Apply the migration: add the indexed boolean column 'known_good'.

        ``orm`` is the frozen ORM that South passes in; it is not used here.
        """
        # Adding field 'Feed.known_good'
        # NOTE(review): keep_default=False presumably means the default is only
        # used while adding the column and is not kept on the column afterwards
        # -- confirm against South's db.add_column documentation.
        db.add_column('feeds', 'known_good', self.gf('django.db.models.fields.BooleanField')(default=False, db_index=True), keep_default=False)

    def backwards(self, orm):
        """Revert the migration: drop the 'known_good' column from 'feeds'.

        ``orm`` is the frozen ORM that South passes in; it is not used here.
        """
        # Deleting field 'Feed.known_good'
        db.delete_column('feeds', 'known_good')

    # Frozen model definitions for the rss_feeds app as of this migration.
    # Each entry maps a field name to a (field class path, positional args,
    # keyword args) triple, with every value stored as a string. This is
    # generated data; 'known_good' on 'rss_feeds.feed' is the entry that
    # corresponds to the column added in forwards().
    models = {
        'rss_feeds.duplicatefeed': {
            'Meta': {'object_name': 'DuplicateFeed'},
            'duplicate_address': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
            'duplicate_feed_id': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'duplicate_addresses'", 'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        'rss_feeds.feed': {
            'Meta': {'ordering': "['feed_title']", 'object_name': 'Feed', 'db_table': "'feeds'"},
            'active': ('django.db.models.fields.BooleanField', [], {'default': 'True', 'db_index': 'True'}),
            'active_premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}),
            'active_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}),
            'average_stories_per_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'branch_from_feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']", 'null': 'True', 'blank': 'True'}),
            'creation': ('django.db.models.fields.DateField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'days_to_trim': ('django.db.models.fields.IntegerField', [], {'default': '90'}),
            'etag': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'exception_code': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'favicon_color': ('django.db.models.fields.CharField', [], {'max_length': '6', 'null': 'True', 'blank': 'True'}),
            'favicon_not_found': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'feed_address': ('django.db.models.fields.URLField', [], {'max_length': '255'}),
            'feed_address_locked': ('django.db.models.fields.NullBooleanField', [], {'default': 'False', 'null': 'True', 'blank': 'True'}),
            'feed_link': ('django.db.models.fields.URLField', [], {'default': "''", 'max_length': '1000', 'null': 'True', 'blank': 'True'}),
            'feed_link_locked': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'feed_title': ('django.db.models.fields.CharField', [], {'default': "'[Untitled]'", 'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'fetched_once': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'has_feed_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'has_page': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'has_page_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'hash_address_and_link': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '64', 'db_index': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'known_good': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'last_load_time': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'last_modified': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
            'last_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'min_to_decay': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'next_scheduled_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'num_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'queued_date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'stories_last_month': ('django.db.models.fields.IntegerField', [], {'default': '0'})
        },
        'rss_feeds.feeddata': {
            'Meta': {'object_name': 'FeedData'},
            'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'data'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
            'feed_classifier_counts': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'feed_tagline': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'popular_authors': ('django.db.models.fields.CharField', [], {'max_length': '2048', 'null': 'True', 'blank': 'True'}),
            'popular_tags': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'story_count_history': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'})
        },
        'rss_feeds.feedloadtime': {
            'Meta': {'object_name': 'FeedLoadtime'},
            'date_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'loadtime': ('django.db.models.fields.FloatField', [], {})
        }
    }

    # Apps whose models are fully described by the frozen dict above.
    complete_apps = ['rss_feeds']

View file

@ -52,6 +52,7 @@ class Feed(models.Model):
branch_from_feed = models.ForeignKey('Feed', blank=True, null=True, db_index=True)
last_update = models.DateTimeField(db_index=True)
fetched_once = models.BooleanField(default=False)
known_good = models.BooleanField(default=False, db_index=True)
has_feed_exception = models.BooleanField(default=False, db_index=True)
has_page_exception = models.BooleanField(default=False, db_index=True)
has_page = models.BooleanField(default=True)
@ -1013,7 +1014,7 @@ class Feed(models.Model):
return total, random_factor*2
def set_next_scheduled_update(self):
total, random_factor = self.get_next_scheduled_update(force=True)
total, random_factor = self.get_next_scheduled_update(force=True, verbose=False)
next_scheduled_update = datetime.datetime.utcnow() + datetime.timedelta(
minutes = total + random_factor)

View file

@ -52,17 +52,20 @@ class MStatistics(mongo.Document):
def collect_statistics_feeds_fetched(cls, last_day=None):
if not last_day:
last_day = datetime.datetime.now() - datetime.timedelta(hours=24)
last_biweek = datetime.datetime.now() - datetime.timedelta(days=14)
feeds_fetched = MFeedFetchHistory.objects.count()
feeds_fetched = MFeedFetchHistory.objects.filter(fetch_date__lt=last_day).count()
cls.objects(key='feeds_fetched').update_one(upsert=True, key='feeds_fetched', value=feeds_fetched)
pages_fetched = MPageFetchHistory.objects.count()
pages_fetched = MPageFetchHistory.objects.filter(fetch_date__lt=last_day).count()
cls.objects(key='pages_fetched').update_one(upsert=True, key='pages_fetched', value=pages_fetched)
from utils.feed_functions import timelimit, TimeoutError
@timelimit(60)
def delete_old_history():
MFeedFetchHistory.objects(fetch_date__lt=last_day).delete()
MPageFetchHistory.objects(fetch_date__lt=last_day).delete()
MFeedFetchHistory.objects(fetch_date__lt=last_day, status_code__in=[200, 304]).delete()
MPageFetchHistory.objects(fetch_date__lt=last_day, status_code__in=[200, 304]).delete()
MFeedFetchHistory.objects(fetch_date__lt=last_biweek).delete()
MPageFetchHistory.objects(fetch_date__lt=last_biweek).delete()
try:
delete_old_history()
except TimeoutError:

View file

@ -3785,6 +3785,7 @@ background: transparent;
.NB-module h5 {
margin: 0 0 12px;
padding: 8px 12px 6px;
overflow: hidden;
}
.NB-module .NB-module-header-left {

View file

@ -132,6 +132,10 @@
Mar 8, 2011
</span>
</li>
<li>
<a href="http://www.bloggingot.com/blogging-tools/newsblur-rss-feed-client-for-feed-junkies/">NewsBlur: RSS Feed Client for Feed Junkies</a>
<span class="NB-press-publisher"><img src="http://www.bloggingot.com/favicon.ico"> Blogging OT</span>, <span class="NB-press-author">Panah</span>, <span class="NB-press-date">Jan 21, 2012</span>
</li>
<li>
<a href="http://www.genbeta.com/web/newsblur-una-excelente-alternativa-a-google-reader-que-filtra-los-posts-mas-relevantes">
NewsBlur, una excelente alternativa a Google Reader que filtra los posts más relevantes

View file

@ -1,8 +1,13 @@
# from apps.rss_feeds.models import FeedXML
import time
import datetime
import traceback
import multiprocessing
import urllib2
import xml.sax
import redis
from django.core.cache import cache
from django.conf import settings
from django.db import IntegrityError
# from mongoengine.queryset import Q
from apps.reader.models import UserSubscription, MUserStory
from apps.rss_feeds.models import Feed, MStory
from apps.rss_feeds.page_importer import PageImporter
@ -11,18 +16,10 @@ from utils import feedparser
from utils.story_functions import pre_process_story
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError, mail_feed_error_to_admin, utf8encode
import time
import datetime
import traceback
import multiprocessing
import urllib2
import xml.sax
import redis
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
SLOWFEED_WARNING = 10
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
@ -132,16 +129,16 @@ class ProcessFeed:
if not self.feed.fetched_once:
self.feed.has_feed_exception = True
self.feed.fetched_once = True
logging.debug(" ---> [%-30s] Feed is 302'ing, but it's not new. Refetching..." % (unicode(self.feed)[:30]))
self.feed.schedule_feed_fetch_immediately()
if not self.fpf.entries:
self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
return FEED_ERRHTTP, ret_values
if self.fpf.status >= 400:
logging.debug(" ---> [%-30s] HTTP Status code: %s.%s Checking address..." % (unicode(self.feed)[:30], self.fpf.status, ' Not' if self.feed.fetched_once else ''))
logging.debug(" ---> [%-30s] HTTP Status code: %s.%s Checking address..." % (unicode(self.feed)[:30], self.fpf.status, ' Not' if self.feed.known_good else ''))
fixed_feed = None
if not self.feed.fetched_once:
if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
@ -152,10 +149,10 @@ class ProcessFeed:
return FEED_ERRHTTP, ret_values
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else ''))
logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.feed.known_good and self.fpf.entries else ''))
if not self.fpf.entries:
fixed_feed = None
if not self.feed.fetched_once:
if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
@ -168,7 +165,7 @@ class ProcessFeed:
logging.debug(" ---> [%-30s] Feed has SAX/XML parsing issues. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else ''))
if not self.fpf.entries:
fixed_feed = None
if not self.feed.fetched_once:
if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
@ -306,9 +303,9 @@ class Dispatcher:
feed = self.refresh_feed(feed_id)
if ret_entries.get(ENTRY_NEW) or self.options['force'] or not feed.fetched_once:
if not feed.fetched_once:
feed.fetched_once = True
if ret_entries.get(ENTRY_NEW) or self.options['force']:
if not feed.known_good:
feed.known_good = True
feed.save()
MUserStory.delete_old_stories(feed_id=feed.pk)
try:

View file

@ -10,6 +10,7 @@ graph_config = {
'inactive_feeds.label': 'inactive_feeds',
'duplicate_feeds.label': 'duplicate_feeds',
'active_feeds.label': 'active_feeds',
'known_good_feeds.label': 'known_good',
}
def calculate_metrics():
from apps.rss_feeds.models import Feed, DuplicateFeed
@ -20,6 +21,7 @@ def calculate_metrics():
'inactive_feeds': Feed.objects.filter(active=False).count(),
'duplicate_feeds': DuplicateFeed.objects.count(),
'active_feeds': Feed.objects.filter(active_subscribers__gt=0).count(),
'known_good_feeds': Feed.objects.filter(known_good=True).count(),
}
if __name__ == '__main__':

View file

@ -18,7 +18,7 @@ def calculate_metrics():
from apps.rss_feeds.models import Feed
hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
update_feeds_query = "ssh sclay@db01 \"sudo rabbitmqctl list_queues -p newsblurvhost | grep %s\" | awk '{print $2}'"
update_feeds_query = "ssh -i ~sclay/.ssh/id_dsa sclay@db01 \"sudo rabbitmqctl list_queues -p newsblurvhost | grep %s\" | awk '{print $2}'"
return {
'update_queue': Feed.objects.filter(queued_date__gte=hour_ago).count(),