Merge branch 'master' into social

* master:
  Adding build/ back to .gitignore.
  Updating .gitignore.
  Using dsa pem for retrieving rabbit update counts.
  Preserving feed fetch error histories for 2 weeks.
  Adding munin graph for known good feeds.
  Turning down verbosity on feed tasking.
  Adding known good feeds to munin to chart their progress.
  Fixing headers overflow in Chrome.
  Adding new feeds db column 'known_good', for forcing updates of feeds that were once good but are now throwing 500s.
  Adding blogging OT press story.
This commit is contained in:
Samuel Clay 2012-02-02 17:22:32 -08:00
commit b3fffab05a
10 changed files with 125 additions and 32 deletions

8
.gitignore vendored
View file

@ -1,9 +1,5 @@
logs/*.log logs/*.log
*.pycmedia/release/
*.pyc *.pyc
media/*/*-compressed-*.*
media/css/*/*-compressed-*.*
media/release
static/* static/*
local_settings.py local_settings.py
media/iphone/NewsBlur/build media/iphone/NewsBlur/build
@ -12,10 +8,7 @@ build/
.DS_Store .DS_Store
**/*.perspectivev* **/*.perspectivev*
data/ data/
logs
mongo/
**/*.xcuserstate **/*.xcuserstate
media/iphone/NewsBlur.xcodeproj/project.xcworkspace/xcuserdata/conesus.xcuserdatad/UserInterfaceState.xcuserstate
UserInterfaceState.xcuserstate UserInterfaceState.xcuserstate
UserInterfaceState\.xcuserstate UserInterfaceState\.xcuserstate
*.xcuserstate *.xcuserstate
@ -23,3 +16,4 @@ xcuserdata
.xcodeproj/ push.xcodeproj/project.pbxproj .xcodeproj/ push.xcodeproj/project.pbxproj
*.mode1v3 *.mode1v3
*.pbxuser *.pbxuser
media/maintenance.html

View file

@ -46,4 +46,12 @@ class Command(BaseCommand):
active_subscribers__gte=1, active_subscribers__gte=1,
active=True active=True
).order_by('?') ).order_by('?')
if feeds: Feed.task_feeds(feeds)
feeds = Feed.objects.filter(
last_update__lte=day,
active_subscribers__gte=1,
active=False,
known_good=True
).order_by('?')
if feeds: Feed.task_feeds(feeds) if feeds: Feed.task_feeds(feeds)

View file

@ -0,0 +1,83 @@
# encoding: utf-8
import datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models


class Migration(SchemaMigration):
    """South schema migration for the rss_feeds app.

    Adds the boolean ``known_good`` column to the ``feeds`` table, used to
    force updates of feeds that were once good but are now throwing 500s
    (see the accompanying ``Feed.known_good`` model field in this commit).
    """

    def forwards(self, orm):
        # Adding field 'Feed.known_good'
        # Indexed because the fetch scheduler and munin stats filter on it.
        db.add_column('feeds', 'known_good', self.gf('django.db.models.fields.BooleanField')(default=False, db_index=True), keep_default=False)

    def backwards(self, orm):
        # Deleting field 'Feed.known_good'
        db.delete_column('feeds', 'known_good')

    # Frozen ORM snapshot of the rss_feeds app at the time of this migration.
    # Auto-generated by South; do not edit by hand — regenerate instead.
    models = {
        'rss_feeds.duplicatefeed': {
            'Meta': {'object_name': 'DuplicateFeed'},
            'duplicate_address': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
            'duplicate_feed_id': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'duplicate_addresses'", 'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        'rss_feeds.feed': {
            'Meta': {'ordering': "['feed_title']", 'object_name': 'Feed', 'db_table': "'feeds'"},
            'active': ('django.db.models.fields.BooleanField', [], {'default': 'True', 'db_index': 'True'}),
            'active_premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}),
            'active_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}),
            'average_stories_per_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'branch_from_feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']", 'null': 'True', 'blank': 'True'}),
            'creation': ('django.db.models.fields.DateField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'days_to_trim': ('django.db.models.fields.IntegerField', [], {'default': '90'}),
            'etag': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'exception_code': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'favicon_color': ('django.db.models.fields.CharField', [], {'max_length': '6', 'null': 'True', 'blank': 'True'}),
            'favicon_not_found': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'feed_address': ('django.db.models.fields.URLField', [], {'max_length': '255'}),
            'feed_address_locked': ('django.db.models.fields.NullBooleanField', [], {'default': 'False', 'null': 'True', 'blank': 'True'}),
            'feed_link': ('django.db.models.fields.URLField', [], {'default': "''", 'max_length': '1000', 'null': 'True', 'blank': 'True'}),
            'feed_link_locked': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'feed_title': ('django.db.models.fields.CharField', [], {'default': "'[Untitled]'", 'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'fetched_once': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'has_feed_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'has_page': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'has_page_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'hash_address_and_link': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '64', 'db_index': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            # The new field introduced by this migration.
            'known_good': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'last_load_time': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'last_modified': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
            'last_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'min_to_decay': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'next_scheduled_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'num_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'queued_date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'stories_last_month': ('django.db.models.fields.IntegerField', [], {'default': '0'})
        },
        'rss_feeds.feeddata': {
            'Meta': {'object_name': 'FeedData'},
            'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'data'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
            'feed_classifier_counts': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'feed_tagline': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'popular_authors': ('django.db.models.fields.CharField', [], {'max_length': '2048', 'null': 'True', 'blank': 'True'}),
            'popular_tags': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'story_count_history': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'})
        },
        'rss_feeds.feedloadtime': {
            'Meta': {'object_name': 'FeedLoadtime'},
            'date_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'loadtime': ('django.db.models.fields.FloatField', [], {})
        }
    }

    # Apps whose frozen models are fully represented above.
    complete_apps = ['rss_feeds']

View file

@ -52,6 +52,7 @@ class Feed(models.Model):
branch_from_feed = models.ForeignKey('Feed', blank=True, null=True, db_index=True) branch_from_feed = models.ForeignKey('Feed', blank=True, null=True, db_index=True)
last_update = models.DateTimeField(db_index=True) last_update = models.DateTimeField(db_index=True)
fetched_once = models.BooleanField(default=False) fetched_once = models.BooleanField(default=False)
known_good = models.BooleanField(default=False, db_index=True)
has_feed_exception = models.BooleanField(default=False, db_index=True) has_feed_exception = models.BooleanField(default=False, db_index=True)
has_page_exception = models.BooleanField(default=False, db_index=True) has_page_exception = models.BooleanField(default=False, db_index=True)
has_page = models.BooleanField(default=True) has_page = models.BooleanField(default=True)
@ -1013,7 +1014,7 @@ class Feed(models.Model):
return total, random_factor*2 return total, random_factor*2
def set_next_scheduled_update(self): def set_next_scheduled_update(self):
total, random_factor = self.get_next_scheduled_update(force=True) total, random_factor = self.get_next_scheduled_update(force=True, verbose=False)
next_scheduled_update = datetime.datetime.utcnow() + datetime.timedelta( next_scheduled_update = datetime.datetime.utcnow() + datetime.timedelta(
minutes = total + random_factor) minutes = total + random_factor)

View file

@ -52,17 +52,20 @@ class MStatistics(mongo.Document):
def collect_statistics_feeds_fetched(cls, last_day=None): def collect_statistics_feeds_fetched(cls, last_day=None):
if not last_day: if not last_day:
last_day = datetime.datetime.now() - datetime.timedelta(hours=24) last_day = datetime.datetime.now() - datetime.timedelta(hours=24)
last_biweek = datetime.datetime.now() - datetime.timedelta(days=14)
feeds_fetched = MFeedFetchHistory.objects.count() feeds_fetched = MFeedFetchHistory.objects.filter(fetch_date__lt=last_day).count()
cls.objects(key='feeds_fetched').update_one(upsert=True, key='feeds_fetched', value=feeds_fetched) cls.objects(key='feeds_fetched').update_one(upsert=True, key='feeds_fetched', value=feeds_fetched)
pages_fetched = MPageFetchHistory.objects.count() pages_fetched = MPageFetchHistory.objects.filter(fetch_date__lt=last_day).count()
cls.objects(key='pages_fetched').update_one(upsert=True, key='pages_fetched', value=pages_fetched) cls.objects(key='pages_fetched').update_one(upsert=True, key='pages_fetched', value=pages_fetched)
from utils.feed_functions import timelimit, TimeoutError from utils.feed_functions import timelimit, TimeoutError
@timelimit(60) @timelimit(60)
def delete_old_history(): def delete_old_history():
MFeedFetchHistory.objects(fetch_date__lt=last_day).delete() MFeedFetchHistory.objects(fetch_date__lt=last_day, status_code__in=[200, 304]).delete()
MPageFetchHistory.objects(fetch_date__lt=last_day).delete() MPageFetchHistory.objects(fetch_date__lt=last_day, status_code__in=[200, 304]).delete()
MFeedFetchHistory.objects(fetch_date__lt=last_biweek).delete()
MPageFetchHistory.objects(fetch_date__lt=last_biweek).delete()
try: try:
delete_old_history() delete_old_history()
except TimeoutError: except TimeoutError:

View file

@ -3785,6 +3785,7 @@ background: transparent;
.NB-module h5 { .NB-module h5 {
margin: 0 0 12px; margin: 0 0 12px;
padding: 8px 12px 6px; padding: 8px 12px 6px;
overflow: hidden;
} }
.NB-module .NB-module-header-left { .NB-module .NB-module-header-left {

View file

@ -132,6 +132,10 @@
Mar 8, 2011 Mar 8, 2011
</span> </span>
</li> </li>
<li>
<a href="http://www.bloggingot.com/blogging-tools/newsblur-rss-feed-client-for-feed-junkies/">NewsBlur: RSS Feed Client for Feed Junkies</a>
<span class="NB-press-publisher"><img src="http://www.bloggingot.com/favicon.ico"> Blogging OT</span>, <span class="NB-press-author">Panah</span>, <span class="NB-press-date">Jan 21, 2012</span>
</li>
<li> <li>
<a href="http://www.genbeta.com/web/newsblur-una-excelente-alternativa-a-google-reader-que-filtra-los-posts-mas-relevantes"> <a href="http://www.genbeta.com/web/newsblur-una-excelente-alternativa-a-google-reader-que-filtra-los-posts-mas-relevantes">
NewsBlur, una excelente alternativa a Google Reader que filtra los posts más relevantes NewsBlur, una excelente alternativa a Google Reader que filtra los posts más relevantes

View file

@ -1,8 +1,13 @@
# from apps.rss_feeds.models import FeedXML import time
import datetime
import traceback
import multiprocessing
import urllib2
import xml.sax
import redis
from django.core.cache import cache from django.core.cache import cache
from django.conf import settings from django.conf import settings
from django.db import IntegrityError from django.db import IntegrityError
# from mongoengine.queryset import Q
from apps.reader.models import UserSubscription, MUserStory from apps.reader.models import UserSubscription, MUserStory
from apps.rss_feeds.models import Feed, MStory from apps.rss_feeds.models import Feed, MStory
from apps.rss_feeds.page_importer import PageImporter from apps.rss_feeds.page_importer import PageImporter
@ -11,18 +16,10 @@ from utils import feedparser
from utils.story_functions import pre_process_story from utils.story_functions import pre_process_story
from utils import log as logging from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError, mail_feed_error_to_admin, utf8encode from utils.feed_functions import timelimit, TimeoutError, mail_feed_error_to_admin, utf8encode
import time
import datetime
import traceback
import multiprocessing
import urllib2
import xml.sax
import redis
# Refresh feed code adapted from Feedjack. # Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com # http://feedjack.googlecode.com
SLOWFEED_WARNING = 10
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4) ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5) FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
@ -132,16 +129,16 @@ class ProcessFeed:
if not self.feed.fetched_once: if not self.feed.fetched_once:
self.feed.has_feed_exception = True self.feed.has_feed_exception = True
self.feed.fetched_once = True self.feed.fetched_once = True
logging.debug(" ---> [%-30s] Feed is 302'ing, but it's not new. Refetching..." % (unicode(self.feed)[:30]))
self.feed.schedule_feed_fetch_immediately() self.feed.schedule_feed_fetch_immediately()
if not self.fpf.entries: if not self.fpf.entries:
self.feed.save() self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
return FEED_ERRHTTP, ret_values return FEED_ERRHTTP, ret_values
if self.fpf.status >= 400: if self.fpf.status >= 400:
logging.debug(" ---> [%-30s] HTTP Status code: %s.%s Checking address..." % (unicode(self.feed)[:30], self.fpf.status, ' Not' if self.feed.fetched_once else '')) logging.debug(" ---> [%-30s] HTTP Status code: %s.%s Checking address..." % (unicode(self.feed)[:30], self.fpf.status, ' Not' if self.feed.known_good else ''))
fixed_feed = None fixed_feed = None
if not self.feed.fetched_once: if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address() fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed: if not fixed_feed:
self.feed.save_feed_history(self.fpf.status, "HTTP Error") self.feed.save_feed_history(self.fpf.status, "HTTP Error")
@ -152,10 +149,10 @@ class ProcessFeed:
return FEED_ERRHTTP, ret_values return FEED_ERRHTTP, ret_values
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else '')) logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.feed.known_good and self.fpf.entries else ''))
if not self.fpf.entries: if not self.fpf.entries:
fixed_feed = None fixed_feed = None
if not self.feed.fetched_once: if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address() fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed: if not fixed_feed:
self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception) self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
@ -168,7 +165,7 @@ class ProcessFeed:
logging.debug(" ---> [%-30s] Feed has SAX/XML parsing issues. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else '')) logging.debug(" ---> [%-30s] Feed has SAX/XML parsing issues. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else ''))
if not self.fpf.entries: if not self.fpf.entries:
fixed_feed = None fixed_feed = None
if not self.feed.fetched_once: if not self.feed.known_good:
fixed_feed = self.feed.check_feed_link_for_feed_address() fixed_feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed: if not fixed_feed:
self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception) self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
@ -306,9 +303,9 @@ class Dispatcher:
feed = self.refresh_feed(feed_id) feed = self.refresh_feed(feed_id)
if ret_entries.get(ENTRY_NEW) or self.options['force'] or not feed.fetched_once: if ret_entries.get(ENTRY_NEW) or self.options['force']:
if not feed.fetched_once: if not feed.known_good:
feed.fetched_once = True feed.known_good = True
feed.save() feed.save()
MUserStory.delete_old_stories(feed_id=feed.pk) MUserStory.delete_old_stories(feed_id=feed.pk)
try: try:

View file

@ -10,6 +10,7 @@ graph_config = {
'inactive_feeds.label': 'inactive_feeds', 'inactive_feeds.label': 'inactive_feeds',
'duplicate_feeds.label': 'duplicate_feeds', 'duplicate_feeds.label': 'duplicate_feeds',
'active_feeds.label': 'active_feeds', 'active_feeds.label': 'active_feeds',
'known_good_feeds.label': 'known_good',
} }
def calculate_metrics(): def calculate_metrics():
from apps.rss_feeds.models import Feed, DuplicateFeed from apps.rss_feeds.models import Feed, DuplicateFeed
@ -20,6 +21,7 @@ def calculate_metrics():
'inactive_feeds': Feed.objects.filter(active=False).count(), 'inactive_feeds': Feed.objects.filter(active=False).count(),
'duplicate_feeds': DuplicateFeed.objects.count(), 'duplicate_feeds': DuplicateFeed.objects.count(),
'active_feeds': Feed.objects.filter(active_subscribers__gt=0).count(), 'active_feeds': Feed.objects.filter(active_subscribers__gt=0).count(),
'known_good_feeds': Feed.objects.filter(known_good=True).count(),
} }
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -18,7 +18,7 @@ def calculate_metrics():
from apps.rss_feeds.models import Feed from apps.rss_feeds.models import Feed
hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1) hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
update_feeds_query = "ssh sclay@db01 \"sudo rabbitmqctl list_queues -p newsblurvhost | grep %s\" | awk '{print $2}'" update_feeds_query = "ssh -i ~sclay/.ssh/id_dsa sclay@db01 \"sudo rabbitmqctl list_queues -p newsblurvhost | grep %s\" | awk '{print $2}'"
return { return {
'update_queue': Feed.objects.filter(queued_date__gte=hour_ago).count(), 'update_queue': Feed.objects.filter(queued_date__gte=hour_ago).count(),