Cleaning up logging, adding several error and timeout checks on page and icon fetching. Standardizing fetcher headers.

Samuel Clay 2011-01-29 22:01:09 -05:00
parent 235f88fb50
commit 75fead0a27
6 changed files with 42 additions and 28 deletions


@@ -25,6 +25,9 @@ class IconImporter(object):
         if not self.force and self.feed.icon.not_found:
             print 'Not found, skipping...'
             return
+        if not self.force and not self.feed.icon.not_found and self.feed.icon.icon_url:
+            print 'Found, but skipping...'
+            return
         image, image_file, icon_url = self.fetch_image_from_page_data()
         if not image:
             image, image_file, icon_url = self.fetch(force=self.force)
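The new guard keeps save() from re-fetching icons that were already resolved one way or the other: force always fetches, a recorded miss (not_found) skips, and a recorded hit (icon_url set) skips. A minimal sketch of that decision, with a hypothetical stand-in for the icon record:

    # Sketch of the skip logic above; IconStub is a hypothetical stand-in
    # for feed.icon, not NewsBlur's actual model.
    class IconStub(object):
        def __init__(self, not_found=False, icon_url=None):
            self.not_found = not_found
            self.icon_url = icon_url

    def should_fetch(icon, force=False):
        if force:
            return True   # --force re-fetches even cached results
        if icon.not_found:
            return False  # an earlier fetch already failed; don't retry
        if icon.icon_url:
            return False  # icon already on record; nothing to do
        return True

    assert should_fetch(IconStub(not_found=True)) is False
    assert should_fetch(IconStub(icon_url='http://example.com/favicon.ico')) is False
    assert should_fetch(IconStub(not_found=True), force=True) is True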
@@ -59,12 +62,10 @@ class IconImporter(object):
             image_file.seek(0)
             header = struct.unpack('<3H', image_file.read(6))
         except Exception, e:
-            print 'No on struct: %s'% e
             return
 
         # Check magic
         if header[:2] != (0, 1):
-            print 'No on header', header
             return
 
         # Collect icon directories
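The header check rests on the ICO layout: the first six bytes are three little-endian uint16 fields (reserved, image type, image count), and a real favicon starts with reserved 0 and type 1, which is what header[:2] != (0, 1) rejects. A standalone sketch of the same check, in the repo's Python 2 style:

    # The first six bytes of a .ico file are three little-endian uint16s:
    # reserved (always 0), type (1 for icons, 2 for cursors), image count.
    import struct
    from StringIO import StringIO

    def looks_like_ico(image_file):
        image_file.seek(0)
        try:
            reserved, ico_type, count = struct.unpack('<3H', image_file.read(6))
        except struct.error:
            return False  # fewer than 6 bytes: cannot be a valid .ico
        return (reserved, ico_type) == (0, 1) and count > 0

    print looks_like_ico(StringIO('\x00\x00\x01\x00\x01\x00' + '\x00' * 16))  # True
    print looks_like_ico(StringIO('GIF89a'))                                  # False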
@@ -157,11 +158,11 @@ class IconImporter(object):
             image, image_file = self.get_image_from_url(url)
         except(urllib2.HTTPError, urllib2.URLError):
             return None, None, None
-        print 'Found: %s - %s' % (url, image)
+        # print 'Found: %s - %s' % (url, image)
        return image, image_file, url
 
     def get_image_from_url(self, url):
-        print 'Requesting: %s' % url
+        # print 'Requesting: %s' % url
         try:
             request = urllib2.Request(url, headers=HEADERS)
             icon = urllib2.urlopen(request).read()
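get_image_from_url now sends a shared HEADERS dict instead of urllib2's default Python-urllib agent, matching the page fetcher below. A hedged sketch of that pattern; the explicit timeout argument is an illustration of the commit's timeout theme, not necessarily what the repo passes:

    import urllib2

    HEADERS = {
        'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
        'Connection': 'close',
    }

    def fetch_icon_bytes(url, timeout=10):
        try:
            request = urllib2.Request(url, headers=HEADERS)
            return urllib2.urlopen(request, timeout=timeout).read()
        except (urllib2.HTTPError, urllib2.URLError), e:
            print 'Fetch failed for %s: %s' % (url, e)
            return None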
@@ -186,7 +187,6 @@ class IconImporter(object):
     def normalize_image(self, image):
         # if image.size != (16, 16):
         #     image = image.resize((16, 16), Image.BICUBIC)
-        print image
         if image.mode != 'RGBA':
             image = image.convert('RGBA')
 
@@ -201,7 +201,7 @@ class IconImporter(object):
         ar = ar.reshape(scipy.product(shape[:2]), shape[2])
 
         codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
-        print "Before: %s" % codes
+        # print "Before: %s" % codes
         original_codes = codes
         for low, hi in [(60, 200), (35, 230), (10, 250)]:
             codes = scipy.array([code for code in codes
@@ -209,18 +209,17 @@ class IconImporter(object):
                                  (code[0] > hi and code[1] > hi and code[2] > hi))])
             if not len(codes): codes = original_codes
             else: break
-        print "After: %s" % codes
-        colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
+        # print "After: %s" % codes
 
         vecs, _ = scipy.cluster.vq.vq(ar, codes) # assign codes
         counts, bins = scipy.histogram(vecs, len(codes)) # count occurrences
-        print counts
-        total = scipy.sum(counts)
-        print dict(zip(colors, [count/float(total) for count in counts]))
+        # colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
+        # total = scipy.sum(counts)
+        # print dict(zip(colors, [count/float(total) for count in counts]))
 
         index_max = scipy.argmax(counts) # find most frequent
         peak = codes[index_max]
         color = ''.join(chr(c) for c in peak).encode('hex')
-        print 'most frequent is %s (#%s)' % (peak, color)
+        # print 'most frequent is %s (#%s)' % (peak, color)
 
         return color[:6]
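The surviving code is a k-means dominant-color pass: pixels are clustered, near-black and near-white centroids are filtered out with progressively looser bounds, and the centroid owning the most pixels becomes the feed's color. A self-contained sketch of the core idea using the same scipy-era, Python 2 calls as the diff (the black/white filtering loop is omitted, and the sample image is synthetic):

    import scipy
    import scipy.cluster.vq
    from PIL import Image

    NUM_CLUSTERS = 5

    def dominant_color(image):
        # One RGBA row per pixel
        image = image.convert('RGBA')
        ar = scipy.array(image).astype(float)
        shape = ar.shape
        ar = ar.reshape(scipy.product(shape[:2]), shape[2])

        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)  # cluster centroids
        vecs, _ = scipy.cluster.vq.vq(ar, codes)              # nearest centroid per pixel
        counts, _ = scipy.histogram(vecs, len(codes))         # pixels per centroid

        peak = codes[scipy.argmax(counts)]                    # most frequent centroid
        color = ''.join(chr(int(c)) for c in peak).encode('hex')
        return color[:6]                                      # 'rrggbb', alpha byte dropped

    print dominant_color(Image.new('RGB', (16, 16), (76, 76, 76)))  # '4c4c4c'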


@@ -727,7 +727,7 @@ class FeedData(models.Model):
 
 class FeedIcon(models.Model):
     feed = AutoOneToOneField(Feed, primary_key=True, related_name='icon')
-    color = models.CharField(max_length=6, default="000000")
+    color = models.CharField(max_length=6, blank=True, null=True)
     data = models.TextField()
     icon_url = models.CharField(max_length=2000, blank=True, null=True)
     not_found = models.BooleanField(default=False)
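color loses its "000000" default and becomes nullable, so a feed whose icon yields no usable color now stores NULL rather than a fake black. Anything reading the field has to guard for that, which the client-side hunk further down does with if (!color) return ''. A one-line Python equivalent of the guard; the accessor name is hypothetical:

    # blank=True also allows '', so test truthiness, not just None
    def favicon_color(feed, default='000000'):
        color = feed.icon.color
        return color if color else default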


@@ -5,6 +5,12 @@ import traceback
 import feedparser
 from utils import log as logging
 from apps.rss_feeds.models import MFeedPage
+from utils.feed_functions import timelimit
+
+HEADERS = {
+    'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
+    'Connection': 'close',
+}
 
 
 class PageImporter(object):
@@ -12,12 +18,13 @@ class PageImporter(object):
         self.url = url
         self.feed = feed
 
+    @timelimit(30)
     def fetch_page(self):
         if not self.url:
             return
 
         try:
-            request = urllib2.Request(self.url)
+            request = urllib2.Request(self.url, headers=HEADERS)
             response = urllib2.urlopen(request)
             data = response.read()
             html = self.rewrite_page(data)
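fetch_page is now wrapped in @timelimit(30), so a hung page fetch surfaces as a TimeoutError that the dispatcher catches (see the feed_fetcher hunk below, which records it as status 555). The real decorator lives in utils/feed_functions; what follows is only a sketch of the classic thread-based recipe such a decorator is typically built on, in the repo's Python 2 style:

    import threading

    class TimeoutError(Exception):
        pass

    def timelimit(timeout):
        def decorator(func):
            def wrapper(*args, **kwargs):
                result = {}
                def worker():
                    try:
                        result['value'] = func(*args, **kwargs)
                    except Exception, e:
                        result['error'] = e
                thread = threading.Thread(target=worker)
                thread.setDaemon(True)
                thread.start()
                thread.join(timeout)
                if thread.isAlive():
                    raise TimeoutError('%s timed out after %ss' % (func.__name__, timeout))
                if 'error' in result:
                    raise result['error']
                return result.get('value')
            return wrapper
        return decorator

Note the worker thread is abandoned rather than killed when time runs out; the daemon flag only keeps it from blocking interpreter shutdown.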


@@ -1497,8 +1497,8 @@ background: transparent;
     linear,
     left bottom,
     left top,
-    color-stop(0.36, rgba(248, 221,105, 250)),
-    color-stop(0.84, rgba(268, 241, 125, 250))
+    color-stop(0.36, rgba(76, 76, 76, 250)),
+    color-stop(0.84, rgba(55, 55, 55, 250))
   );
   background-image: -moz-linear-gradient(
     center bottom,


@@ -2173,6 +2173,9 @@
     generate_gradient: function(feed, type) {
         var color = feed.favicon_color;
+        NEWSBLUR.log(['generate_gradient', feed.feed_title, color]);
+        if (!color) return '';
+
         var r = parseInt(color.substr(0, 2), 16);
         var g = parseInt(color.substr(2, 2), 16);
         var b = parseInt(color.substr(4, 2), 16);
 
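The guard returns an empty gradient when favicon_color is NULL (the nullable model field above); otherwise the 'rrggbb' string is split into channels. The same computation sketched in Python, with the 0.36/0.84 stop positions borrowed from the stylesheet hunk purely for illustration:

    def gradient_stops(favicon_color):
        if not favicon_color:
            return ''  # no stored color: fall back to the stylesheet default
        r = int(favicon_color[0:2], 16)
        g = int(favicon_color[2:4], 16)
        b = int(favicon_color[4:6], 16)
        return ('color-stop(0.36, rgba(%d, %d, %d, 250)), '
                'color-stop(0.84, rgba(%d, %d, %d, 250))' % (r, g, b, r, g, b))

    print gradient_stops('4c4c4c')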


@@ -22,14 +22,11 @@ import xml.sax
 # Refresh feed code adapted from Feedjack.
 # http://feedjack.googlecode.com
 
-VERSION = '1.0'
 URL = 'http://www.newsblur.com/'
-USER_AGENT = 'NewsBlur Fetcher %s - %s' % (VERSION, URL)
-
 SLOWFEED_WARNING = 10
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
 FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
 
 def mtime(ttime):
     """ datetime auxiliar function.
     """
@@ -61,6 +58,12 @@ class FetchFeed:
         modified = None
         etag = None
 
+        USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
+            self.feed.num_subscribers,
+            's' if self.feed.num_subscribers != 1 else '',
+            URL
+        )
+        print USER_AGENT
         self.fpf = feedparser.parse(self.feed.feed_address,
                                     agent=USER_AGENT,
                                     etag=etag,
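The static module-level agent is replaced by a per-feed string that advertises the subscriber count, so publishers can read circulation straight out of their access logs. The same construction shown standalone; URL and the format string mirror the diff:

    URL = 'http://www.newsblur.com/'

    def user_agent(num_subscribers):
        return 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
            num_subscribers,
            's' if num_subscribers != 1 else '',
            URL,
        )

    print user_agent(1)   # NewsBlur Feed Fetcher (1 subscriber) - http://www.newsblur.com/
    print user_agent(12)  # NewsBlur Feed Fetcher (12 subscribers) - http://www.newsblur.com/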
@@ -210,9 +213,6 @@ class ProcessFeed:
             story_feed_id=self.feed.pk
         ).limit(len(story_guids))
 
-        logging.info(u' ---> [%-30s] Parsing: %s existing stories' % (
-            unicode(self.feed)[:30],
-            len(existing_stories)))
         # MStory.objects(
         #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
         #     | (Q(story_guid__in=story_guids)),
@@ -328,19 +328,23 @@ class Dispatcher:
                       (ret_feed == FEED_OK or
                        (ret_feed == FEED_SAME and feed.stories_last_month > 10)))):
-                logging.debug(u' ---> [%-30s] Fetching page' % (unicode(feed)[:30]))
+                logging.debug(u' ---> [%-30s] Fetching page: %s' % (unicode(feed)[:30], feed.feed_link))
                 page_importer = PageImporter(feed.feed_link, feed)
                 try:
                     page_importer.fetch_page()
+                except TimeoutError, e:
+                    logging.debug(' ---> [%-30s] Page fetch timed out...' % (unicode(feed)[:30]))
+                    feed.save_page_history(555, 'Timeout', '')
                 except Exception, e:
                     logging.debug('[%d] ! -------------------------' % (feed_id,))
                     tb = traceback.format_exc()
                     logging.error(tb)
                     logging.debug('[%d] ! -------------------------' % (feed_id,))
                     ret_feed = FEED_ERREXC
-                    feed.save_feed_history(550, "Page Error", tb)
+                    feed.save_page_history(550, "Page Error", tb)
                     fetched_feed = None
 
 
+            logging.debug(u' ---> [%-30s] Fetching icon: %s' % (unicode(feed)[:30], feed.feed_link))
             icon_importer = IconImporter(feed, force=self.options['force'])
             try:
                 icon_importer.save()
@@ -361,9 +365,9 @@ class Dispatcher:
             except IntegrityError:
                 logging.debug(" ---> [%-30s] IntegrityError on feed: %s" % (unicode(feed)[:30], feed.feed_address,))
 
-            done_msg = (u'%2s ---> [%-30s] Processed in %s [%s]' % (
+            done_msg = (u'%2s ---> [%-30s] Processed in %s (%s) [%s]' % (
                 identity, feed.feed_title[:30], unicode(delta),
-                self.feed_trans[ret_feed],))
+                feed.pk, self.feed_trans[ret_feed],))
             logging.debug(done_msg)
 
             self.feed_stats[ret_feed] += 1
@@ -384,8 +388,9 @@ class Dispatcher:
                 active=True,
                 user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                 .order_by('-last_read_date')
-        logging.debug(u' ---> [%-30s] Computing scores for all feed subscribers: %s subscribers' % (
-            unicode(feed)[:30], user_subs.count()))
+        logging.debug(u' ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
+            unicode(feed)[:30], user_subs.count(),
+            feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
 
         stories_db = MStory.objects(story_feed_id=feed.pk,
                                     story_date__gte=UNREAD_CUTOFF)