Mirror of https://github.com/samuelclay/NewsBlur.git
Cleaning up logging, adding several error and timeout checks on page and icon fetching. Standardizing fetcher headers.
parent 235f88fb50
commit 75fead0a27
6 changed files with 42 additions and 28 deletions
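In practice, the "standardizing fetcher headers" part of this change boils down to a module-level HEADERS dict passed to every urllib2.Request, as added to the page fetcher below. A minimal sketch of the shared pattern (the fetch() helper itself is illustrative, not code from this commit):

import urllib2

HEADERS = {
    'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
    'Connection': 'close',
}

def fetch(url):
    # Same request pattern both the page and icon fetchers use after this commit.
    request = urllib2.Request(url, headers=HEADERS)
    return urllib2.urlopen(request).read()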
@@ -25,6 +25,9 @@ class IconImporter(object):
        if not self.force and self.feed.icon.not_found:
            print 'Not found, skipping...'
            return
        if not self.force and not self.feed.icon.not_found and self.feed.icon.icon_url:
            print 'Found, but skipping...'
            return
        image, image_file, icon_url = self.fetch_image_from_page_data()
        if not image:
            image, image_file, icon_url = self.fetch(force=self.force)
@@ -59,12 +62,10 @@ class IconImporter(object):
            image_file.seek(0)
            header = struct.unpack('<3H', image_file.read(6))
        except Exception, e:
            print 'No on struct: %s'% e
            return

        # Check magic
        if header[:2] != (0, 1):
            print 'No on header', header
            return

        # Collect icon directories
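The struct check above relies on the ICO header layout: the first six bytes are three little-endian unsigned shorts (reserved, type, count), where reserved must be 0 and type must be 1 for an icon, which is exactly what header[:2] != (0, 1) verifies. A self-contained sketch of the same validation:

import struct

def read_ico_header(image_file):
    """Return the number of images in an ICO file, or None if it is not one."""
    image_file.seek(0)
    reserved, ico_type, count = struct.unpack('<3H', image_file.read(6))
    if (reserved, ico_type) != (0, 1):  # same magic check as header[:2] != (0, 1)
        return None
    return count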
@@ -157,11 +158,11 @@ class IconImporter(object):
            image, image_file = self.get_image_from_url(url)
        except(urllib2.HTTPError, urllib2.URLError):
            return None, None, None
        print 'Found: %s - %s' % (url, image)
        # print 'Found: %s - %s' % (url, image)
        return image, image_file, url

    def get_image_from_url(self, url):
        print 'Requesting: %s' % url
        # print 'Requesting: %s' % url
        try:
            request = urllib2.Request(url, headers=HEADERS)
            icon = urllib2.urlopen(request).read()
@@ -186,7 +187,6 @@ class IconImporter(object):
    def normalize_image(self, image):
        # if image.size != (16, 16):
        #     image = image.resize((16, 16), Image.BICUBIC)
        print image
        if image.mode != 'RGBA':
            image = image.convert('RGBA')

@@ -201,7 +201,7 @@ class IconImporter(object):
        ar = ar.reshape(scipy.product(shape[:2]), shape[2])

        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        print "Before: %s" % codes
        # print "Before: %s" % codes
        original_codes = codes
        for low, hi in [(60, 200), (35, 230), (10, 250)]:
            codes = scipy.array([code for code in codes
@@ -209,18 +209,17 @@ class IconImporter(object):
                                 (code[0] > hi and code[1] > hi and code[2] > hi))])
            if not len(codes): codes = original_codes
            else: break
        print "After: %s" % codes
        colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
        # print "After: %s" % codes

        vecs, _ = scipy.cluster.vq.vq(ar, codes) # assign codes
        counts, bins = scipy.histogram(vecs, len(codes)) # count occurrences
        print counts
        total = scipy.sum(counts)
        print dict(zip(colors, [count/float(total) for count in counts]))
        # colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
        # total = scipy.sum(counts)
        # print dict(zip(colors, [count/float(total) for count in counts]))
        index_max = scipy.argmax(counts) # find most frequent
        peak = codes[index_max]
        color = ''.join(chr(c) for c in peak).encode('hex')
        print 'most frequent is %s (#%s)' % (peak, color)
        # print 'most frequent is %s (#%s)' % (peak, color)

        return color[:6]
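For reference, the dominant-color routine above reduces to: k-means over the flattened RGBA pixels, drop clusters that are nearly black or nearly white, assign every pixel to its nearest remaining cluster, and hex-encode the most common one. A condensed sketch using current numpy/scipy names (the original code uses the old scipy.* aliases and Python 2 hex encoding; the black/white filtering is omitted here for brevity):

import numpy as np
from scipy.cluster.vq import kmeans, vq

def dominant_color(image, num_clusters=5):
    """Return the most common cluster color of a Pillow RGBA image as a hex string."""
    ar = np.asarray(image, dtype=float)         # H x W x 4 pixel array
    ar = ar.reshape(-1, ar.shape[-1])           # flatten to (pixels, channels)
    codes, _ = kmeans(ar, num_clusters)         # cluster the pixel colors
    vecs, _ = vq(ar, codes)                     # assign each pixel to a cluster
    counts, _ = np.histogram(vecs, len(codes))  # count pixels per cluster
    peak = codes[np.argmax(counts)]             # most frequent cluster center
    return ''.join('%02x' % int(c) for c in peak)[:6]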
@@ -727,7 +727,7 @@ class FeedData(models.Model):

class FeedIcon(models.Model):
    feed = AutoOneToOneField(Feed, primary_key=True, related_name='icon')
    color = models.CharField(max_length=6, default="000000")
    color = models.CharField(max_length=6, blank=True, null=True)
    data = models.TextField()
    icon_url = models.CharField(max_length=2000, blank=True, null=True)
    not_found = models.BooleanField(default=False)
@@ -5,6 +5,12 @@ import traceback
import feedparser
from utils import log as logging
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit

HEADERS = {
    'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
    'Connection': 'close',
}

class PageImporter(object):

@@ -12,12 +18,13 @@ class PageImporter(object):
        self.url = url
        self.feed = feed

    @timelimit(30)
    def fetch_page(self):
        if not self.url:
            return

        try:
            request = urllib2.Request(self.url)
            request = urllib2.Request(self.url, headers=HEADERS)
            response = urllib2.urlopen(request)
            data = response.read()
            html = self.rewrite_page(data)
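The new @timelimit(30) guard on fetch_page comes from utils.feed_functions, whose implementation is not part of this diff. As a purely illustrative sketch (not the actual NewsBlur helper), a thread-based decorator of this kind can look roughly like this:

import functools
import threading

class TimeoutError(Exception):
    """Raised when the wrapped call exceeds its time budget."""
    pass

def timelimit(timeout):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = {}

            def target():
                try:
                    result['value'] = func(*args, **kwargs)
                except Exception as e:
                    result['error'] = e

            worker = threading.Thread(target=target)
            worker.daemon = True  # do not block interpreter exit on a hung fetch
            worker.start()
            worker.join(timeout)
            if worker.is_alive():
                raise TimeoutError('%s timed out after %ss' % (func.__name__, timeout))
            if 'error' in result:
                raise result['error']
            return result.get('value')
        return wrapper
    return decorator

One caveat of this recipe: a timed-out worker thread keeps running in the background; the decorator only stops waiting for it.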
@@ -1497,8 +1497,8 @@ background: transparent;
    linear,
    left bottom,
    left top,
    color-stop(0.36, rgba(248, 221,105, 250)),
    color-stop(0.84, rgba(268, 241, 125, 250))
    color-stop(0.36, rgba(76, 76, 76, 250)),
    color-stop(0.84, rgba(55, 55, 55, 250))
  );
  background-image: -moz-linear-gradient(
    center bottom,
@@ -2173,6 +2173,9 @@

    generate_gradient: function(feed, type) {
        var color = feed.favicon_color;
        NEWSBLUR.log(['generate_gradient', feed.feed_title, color]);
        if (!color) return '';

        var r = parseInt(color.substr(0, 2), 16);
        var g = parseInt(color.substr(2, 2), 16);
        var b = parseInt(color.substr(4, 2), 16);
@@ -22,14 +22,11 @@ import xml.sax
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com

VERSION = '1.0'
URL = 'http://www.newsblur.com/'
USER_AGENT = 'NewsBlur Fetcher %s - %s' % (VERSION, URL)
SLOWFEED_WARNING = 10
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)


def mtime(ttime):
    """ datetime auxiliar function.
    """
@@ -61,6 +58,12 @@ class FetchFeed:
        modified = None
        etag = None

        USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
            self.feed.num_subscribers,
            's' if self.feed.num_subscribers != 1 else '',
            URL
        )
        print USER_AGENT
        self.fpf = feedparser.parse(self.feed.feed_address,
                                    agent=USER_AGENT,
                                    etag=etag,
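The per-feed User-Agent above is handed straight to feedparser, together with the stored etag and modified values so that unchanged feeds can come back as HTTP 304 instead of a full download. A minimal sketch of that call, with feed_address, num_subscribers, etag and modified standing in for the Feed model fields used above:

import feedparser

URL = 'http://www.newsblur.com/'

def fetch_feed(feed_address, num_subscribers, etag=None, modified=None):
    user_agent = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
        num_subscribers, 's' if num_subscribers != 1 else '', URL)
    # agent, etag and modified are standard feedparser.parse() keyword arguments;
    # a conditional-GET hit shows up as result.status == 304 with no new entries.
    return feedparser.parse(feed_address, agent=user_agent,
                            etag=etag, modified=modified)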
@@ -210,9 +213,6 @@ class ProcessFeed:
            story_feed_id=self.feed.pk
        ).limit(len(story_guids))

        logging.info(u' ---> [%-30s] Parsing: %s existing stories' % (
            unicode(self.feed)[:30],
            len(existing_stories)))
        # MStory.objects(
        #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
        #     | (Q(story_guid__in=story_guids)),
@@ -328,19 +328,23 @@ class Dispatcher:
                    (ret_feed == FEED_OK or
                     (ret_feed == FEED_SAME and feed.stories_last_month > 10)))):

                logging.debug(u' ---> [%-30s] Fetching page' % (unicode(feed)[:30]))
                logging.debug(u' ---> [%-30s] Fetching page: %s' % (unicode(feed)[:30], feed.feed_link))
                page_importer = PageImporter(feed.feed_link, feed)
                try:
                    page_importer.fetch_page()
                except TimeoutError, e:
                    logging.debug(' ---> [%-30s] Page fetch timed out...' % (unicode(feed)[:30]))
                    feed.save_page_history(555, 'Timeout', '')
                except Exception, e:
                    logging.debug('[%d] ! -------------------------' % (feed_id,))
                    tb = traceback.format_exc()
                    logging.error(tb)
                    logging.debug('[%d] ! -------------------------' % (feed_id,))
                    ret_feed = FEED_ERREXC
                    feed.save_feed_history(550, "Page Error", tb)
                    feed.save_page_history(550, "Page Error", tb)
                    fetched_feed = None

                logging.debug(u' ---> [%-30s] Fetching icon: %s' % (unicode(feed)[:30], feed.feed_link))
                icon_importer = IconImporter(feed, force=self.options['force'])
                try:
                    icon_importer.save()
@@ -361,9 +365,9 @@ class Dispatcher:
            except IntegrityError:
                logging.debug(" ---> [%-30s] IntegrityError on feed: %s" % (unicode(feed)[:30], feed.feed_address,))

            done_msg = (u'%2s ---> [%-30s] Processed in %s [%s]' % (
            done_msg = (u'%2s ---> [%-30s] Processed in %s (%s) [%s]' % (
                identity, feed.feed_title[:30], unicode(delta),
                self.feed_trans[ret_feed],))
                feed.pk, self.feed_trans[ret_feed],))
            logging.debug(done_msg)

            self.feed_stats[ret_feed] += 1
@@ -384,8 +388,9 @@ class Dispatcher:
                active=True,
                user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                .order_by('-last_read_date')
        logging.debug(u' ---> [%-30s] Computing scores for all feed subscribers: %s subscribers' % (
            unicode(feed)[:30], user_subs.count()))
        logging.debug(u' ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
            unicode(feed)[:30], user_subs.count(),
            feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))

        stories_db = MStory.objects(story_feed_id=feed.pk,
                                    story_date__gte=UNREAD_CUTOFF)