Cache-bust feed URLs on forced fetches and on a random 1% of all other fetches.

This commit is contained in:
Samuel Clay 2013-03-24 15:50:57 -07:00
parent 0bee01036a
commit 9e2fa3115d
3 changed files with 34 additions and 8 deletions

9
fabfile.py vendored
View file

@ -79,10 +79,15 @@ env.roledefs ={
'db20.newsblur.com',
'db21.newsblur.com',
'db22.newsblur.com',
'db23.newsblur.com',
],
'dbdo':['198.211.115.113',
'dbdo':['198.211.109.225',
'198.211.109.224',
'198.211.110.164',
'198.211.115.113',
'198.211.115.153',
'198.211.115.8',
'198.211.117.116',
],
'task': ['task01.newsblur.com',
'task02.newsblur.com',
@ -521,7 +526,7 @@ def setup_imaging():
def setup_supervisor():
    """Install the `supervisor` package with apt-get, run through `sudo`."""
    # -y answers apt-get's confirmation prompt so the task runs unattended.
    install_command = 'apt-get -y install supervisor'
    sudo(install_command)
@parallel
# @parallel
# Copies the hosts file kept in the sibling secrets-newsblur checkout over
# /etc/hosts; use_sudo=True is passed because the destination is a system file.
# NOTE(review): only these two lines of the task are visible in this hunk.
def setup_hosts():
put('../secrets-newsblur/configs/hosts', '/etc/hosts', use_sudo=True)

View file

@ -18,7 +18,7 @@ from apps.statistics.models import MAnalyticsFetcher
from utils import feedparser
from utils.story_functions import pre_process_story
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError, utf8encode
from utils.feed_functions import timelimit, TimeoutError, utf8encode, cache_bust_url
# from utils.feed_functions import mail_feed_error_to_admin
@ -54,11 +54,18 @@ class FetchFeed:
etag=self.feed.etag
modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
address = self.feed.feed_address
if self.options.get('force') or not self.feed.fetched_once or not self.feed.known_good:
if (self.options.get('force') or random.random() <= .01):
modified = None
etag = None
address = cache_bust_url(address)
logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (
self.feed.title[:30], address))
elif (not self.feed.fetched_once or not self.feed.known_good):
modified = None
etag = None
USER_AGENT = 'NewsBlur Feed Fetcher - %s subscriber%s - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2)' % (
self.feed.num_subscribers,
's' if self.feed.num_subscribers != 1 else '',
@ -75,7 +82,7 @@ class FetchFeed:
return FEED_OK, self.fpf
try:
self.fpf = feedparser.parse(self.feed.feed_address,
self.fpf = feedparser.parse(address,
agent=USER_AGENT,
etag=etag,
modified=modified)
@ -83,7 +90,7 @@ class FetchFeed:
logging.debug(u' ***> [%-30s] ~FR%s, turning off microformats.' %
(self.feed.title[:30], e))
feedparser.PARSE_MICROFORMATS = False
self.fpf = feedparser.parse(self.feed.feed_address,
self.fpf = feedparser.parse(address,
agent=USER_AGENT,
etag=etag,
modified=modified)

View file

@ -3,9 +3,11 @@ import threading
import sys
import traceback
import pprint
import urllib
import urlparse
import random
from django.core.mail import mail_admins
from django.utils.translation import ungettext
from django.conf import settings
from utils import log as logging
class TimeoutError(Exception):
    """Exception type defined by this module for timed-out operations.

    NOTE(review): presumably raised by the `timelimit` helper that callers
    import alongside it -- confirm against that helper.
    """
@ -56,6 +58,18 @@ def utf8encode(tstr):
except UnicodeDecodeError:
return u''
def append_query_string_to_url(url, **kwargs):
    """Return `url` with the given keyword arguments set as query parameters.

    Existing query parameters keep their original order, including repeated
    keys and blank values. Any existing parameter whose name matches a keyword
    argument is dropped and the new value is appended at the end. The other
    URL components are reassembled unchanged by urlunparse().
    """
    url_parts = list(urlparse.urlparse(url))
    # Index 4 of the urlparse() result is the query string. Work on the list
    # of pairs rather than a dict so repeated keys (?tag=a&tag=b) and blank
    # values (?x=) are not silently lost when the URL is rebuilt.
    existing = urlparse.parse_qsl(url_parts[4], keep_blank_values=True)
    query = [(key, value) for key, value in existing if key not in kwargs]
    query.extend(kwargs.items())
    url_parts[4] = urllib.urlencode(query)
    return urlparse.urlunparse(url_parts)
def cache_bust_url(url):
    """Return `url` with a random integer set as its `_` query parameter."""
    # randint is inclusive on both ends, so the value is in 0..10000.
    nonce = random.randint(0, 10000)
    return append_query_string_to_url(url, _=nonce)
# From: http://www.poromenos.org/node/87
def levenshtein_distance(first, second):
"""Find the Levenshtein distance between two strings."""