mirror of https://github.com/samuelclay/NewsBlur.git (synced 2025-08-31 21:41:33 +00:00)
Cleaning up logging, adding several error and timeout checks on page and icon fetching. Standardizing fetcher headers.
Commit 75fead0a27 (parent 235f88fb50)
6 changed files with 42 additions and 28 deletions
@@ -25,6 +25,9 @@ class IconImporter(object):
         if not self.force and self.feed.icon.not_found:
             print 'Not found, skipping...'
             return
+        if not self.force and not self.feed.icon.not_found and self.feed.icon.icon_url:
+            print 'Found, but skipping...'
+            return
         image, image_file, icon_url = self.fetch_image_from_page_data()
         if not image:
             image, image_file, icon_url = self.fetch(force=self.force)

@@ -59,12 +62,10 @@ class IconImporter(object):
             image_file.seek(0)
             header = struct.unpack('<3H', image_file.read(6))
         except Exception, e:
-            print 'No on struct: %s'% e
             return
 
         # Check magic
         if header[:2] != (0, 1):
-            print 'No on header', header
             return
 
         # Collect icon directories

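For reference, the magic check kept in this hunk follows the ICO header layout: the file opens with three little-endian uint16s (reserved, always 0; image type, 1 for icons; image count), which is exactly what struct.unpack('<3H', ...) reads and header[:2] != (0, 1) rejects. A minimal standalone sketch of the same check (parse_ico_header is a hypothetical name, not part of this codebase):

    import struct

    def parse_ico_header(fileobj):
        # First three little-endian uint16s of an .ico file:
        # reserved (must be 0), image type (1 for icons), image count.
        reserved, image_type, count = struct.unpack('<3H', fileobj.read(6))
        if (reserved, image_type) != (0, 1):
            raise ValueError('not an ICO file')
        return count
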
@@ -157,11 +158,11 @@ class IconImporter(object):
             image, image_file = self.get_image_from_url(url)
         except(urllib2.HTTPError, urllib2.URLError):
             return None, None, None
-        print 'Found: %s - %s' % (url, image)
+        # print 'Found: %s - %s' % (url, image)
         return image, image_file, url
 
     def get_image_from_url(self, url):
-        print 'Requesting: %s' % url
+        # print 'Requesting: %s' % url
         try:
             request = urllib2.Request(url, headers=HEADERS)
             icon = urllib2.urlopen(request).read()

@@ -186,7 +187,6 @@ class IconImporter(object):
     def normalize_image(self, image):
         # if image.size != (16, 16):
         #     image = image.resize((16, 16), Image.BICUBIC)
-        print image
         if image.mode != 'RGBA':
             image = image.convert('RGBA')
 
@@ -201,7 +201,7 @@ class IconImporter(object):
         ar = ar.reshape(scipy.product(shape[:2]), shape[2])
 
         codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
-        print "Before: %s" % codes
+        # print "Before: %s" % codes
         original_codes = codes
         for low, hi in [(60, 200), (35, 230), (10, 250)]:
             codes = scipy.array([code for code in codes

@@ -209,18 +209,17 @@ class IconImporter(object):
                                  (code[0] > hi and code[1] > hi and code[2] > hi))])
             if not len(codes): codes = original_codes
             else: break
-        print "After: %s" % codes
-        colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
+        # print "After: %s" % codes
 
         vecs, _ = scipy.cluster.vq.vq(ar, codes)            # assign codes
         counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
-        print counts
-        total = scipy.sum(counts)
-        print dict(zip(colors, [count/float(total) for count in counts]))
+        # colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
+        # total = scipy.sum(counts)
+        # print dict(zip(colors, [count/float(total) for count in counts]))
         index_max = scipy.argmax(counts)                    # find most frequent
         peak = codes[index_max]
         color = ''.join(chr(c) for c in peak).encode('hex')
-        print 'most frequent is %s (#%s)' % (peak, color)
+        # print 'most frequent is %s (#%s)' % (peak, color)
 
         return color[:6]

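For context, the routine thinned out above picks a favicon's dominant color: k-means clusters the pixels, progressively widened thresholds drop near-black and near-white clusters, and the hex code of the most populated remaining cluster wins. A rough standalone sketch of the core idea, using numpy in place of the long-deprecated scipy.product/scipy.histogram aliases (dominant_color is a hypothetical name, and the threshold filtering is omitted):

    import numpy as np
    from scipy.cluster.vq import kmeans, vq

    def dominant_color(image, num_clusters=5):
        # Flatten RGBA pixels into an (N, 4) float array for k-means.
        ar = np.asarray(image.convert('RGBA'), dtype=float).reshape(-1, 4)
        codes, _ = kmeans(ar, num_clusters)          # cluster centers
        vecs, _ = vq(ar, codes)                      # nearest center per pixel
        counts, _ = np.histogram(vecs, len(codes))   # population of each cluster
        peak = codes[np.argmax(counts)]              # most frequent center
        return ''.join('%02x' % int(c) for c in peak[:3])

    # usage (with Pillow): dominant_color(Image.open('favicon.png'))
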
@@ -727,7 +727,7 @@ class FeedData(models.Model):
 
 class FeedIcon(models.Model):
     feed = AutoOneToOneField(Feed, primary_key=True, related_name='icon')
-    color = models.CharField(max_length=6, default="000000")
+    color = models.CharField(max_length=6, blank=True, null=True)
     data = models.TextField()
     icon_url = models.CharField(max_length=2000, blank=True, null=True)
     not_found = models.BooleanField(default=False)

@@ -5,6 +5,12 @@ import traceback
 import feedparser
 from utils import log as logging
 from apps.rss_feeds.models import MFeedPage
+from utils.feed_functions import timelimit
 
+HEADERS = {
+    'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
+    'Connection': 'close',
+}
+
 class PageImporter(object):
 
@@ -12,12 +18,13 @@ class PageImporter(object):
         self.url = url
         self.feed = feed
 
+    @timelimit(30)
     def fetch_page(self):
         if not self.url:
             return
 
         try:
-            request = urllib2.Request(self.url)
+            request = urllib2.Request(self.url, headers=HEADERS)
             response = urllib2.urlopen(request)
             data = response.read()
             html = self.rewrite_page(data)

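The @timelimit(30) decorator added above is imported from utils.feed_functions, which this diff does not show. A minimal sketch of how a thread-based time limit can work (assuming the project defines a TimeoutError for the dispatcher below to catch; the real implementation may differ):

    import threading

    def timelimit(timeout):
        # Run the wrapped function in a daemon thread and stop waiting
        # after `timeout` seconds; the worker thread itself keeps running,
        # only the caller gives up on it.
        def decorator(func):
            def wrapper(*args, **kwargs):
                result = {}
                def worker():
                    try:
                        result['value'] = func(*args, **kwargs)
                    except Exception as e:
                        result['error'] = e
                thread = threading.Thread(target=worker)
                thread.daemon = True
                thread.start()
                thread.join(timeout)
                if thread.is_alive():
                    raise TimeoutError('%s timed out after %ss' % (func.__name__, timeout))
                if 'error' in result:
                    raise result['error']
                return result.get('value')
            return wrapper
        return decorator
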
@@ -1497,8 +1497,8 @@ background: transparent;
         linear,
         left bottom,
         left top,
-        color-stop(0.36, rgba(248, 221,105, 250)),
-        color-stop(0.84, rgba(268, 241, 125, 250))
+        color-stop(0.36, rgba(76, 76, 76, 250)),
+        color-stop(0.84, rgba(55, 55, 55, 250))
     );
     background-image: -moz-linear-gradient(
         center bottom,

@@ -2173,6 +2173,9 @@
 
         generate_gradient: function(feed, type) {
             var color = feed.favicon_color;
+            NEWSBLUR.log(['generate_gradient', feed.feed_title, color]);
+            if (!color) return '';
+
             var r = parseInt(color.substr(0, 2), 16);
             var g = parseInt(color.substr(2, 2), 16);
             var b = parseInt(color.substr(4, 2), 16);

@@ -22,14 +22,11 @@ import xml.sax
 # Refresh feed code adapted from Feedjack.
 # http://feedjack.googlecode.com
 
-VERSION = '1.0'
 URL = 'http://www.newsblur.com/'
-USER_AGENT = 'NewsBlur Fetcher %s - %s' % (VERSION, URL)
 SLOWFEED_WARNING = 10
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
 FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
 
 
 def mtime(ttime):
     """ datetime auxiliar function.
     """

@@ -61,6 +58,12 @@ class FetchFeed:
         modified = None
         etag = None
 
+        USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
+            self.feed.num_subscribers,
+            's' if self.feed.num_subscribers != 1 else '',
+            URL
+        )
+        print USER_AGENT
         self.fpf = feedparser.parse(self.feed.feed_address,
                                     agent=USER_AGENT,
                                     etag=etag,

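The user agent is now built per feed, replacing the module-level USER_AGENT removed above, so publishers can see how many NewsBlur subscribers each fetch represents. A feed with 12 subscribers, for example, would send:

    NewsBlur Feed Fetcher (12 subscribers) - http://www.newsblur.com/
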
@@ -210,9 +213,6 @@ class ProcessFeed:
             story_feed_id=self.feed.pk
         ).limit(len(story_guids))
 
-        logging.info(u'   ---> [%-30s] Parsing: %s existing stories' % (
-            unicode(self.feed)[:30],
-            len(existing_stories)))
         # MStory.objects(
         #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
         #     | (Q(story_guid__in=story_guids)),

@@ -328,19 +328,23 @@ class Dispatcher:
                 (ret_feed == FEED_OK or
                  (ret_feed == FEED_SAME and feed.stories_last_month > 10)))):
 
-                logging.debug(u'   ---> [%-30s] Fetching page' % (unicode(feed)[:30]))
+                logging.debug(u'   ---> [%-30s] Fetching page: %s' % (unicode(feed)[:30], feed.feed_link))
                 page_importer = PageImporter(feed.feed_link, feed)
                 try:
                     page_importer.fetch_page()
+                except TimeoutError, e:
+                    logging.debug('   ---> [%-30s] Page fetch timed out...' % (unicode(feed)[:30]))
+                    feed.save_page_history(555, 'Timeout', '')
                 except Exception, e:
                     logging.debug('[%d] ! -------------------------' % (feed_id,))
                     tb = traceback.format_exc()
                     logging.error(tb)
                     logging.debug('[%d] ! -------------------------' % (feed_id,))
                     ret_feed = FEED_ERREXC
-                    feed.save_feed_history(550, "Page Error", tb)
+                    feed.save_page_history(550, "Page Error", tb)
                     fetched_feed = None
 
+            logging.debug(u'   ---> [%-30s] Fetching icon: %s' % (unicode(feed)[:30], feed.feed_link))
             icon_importer = IconImporter(feed, force=self.options['force'])
             try:
                 icon_importer.save()

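Note the two fixes in this hunk: timeouts raised by the new @timelimit(30) decorator are recorded as their own 555 entry in the page fetch history, distinct from the generic 550 page errors, and the 550 branch now writes to save_page_history rather than save_feed_history, so page failures are logged against the page instead of the feed.
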
@@ -361,9 +365,9 @@ class Dispatcher:
             except IntegrityError:
                 logging.debug("   ---> [%-30s] IntegrityError on feed: %s" % (unicode(feed)[:30], feed.feed_address,))
 
-        done_msg = (u'%2s ---> [%-30s] Processed in %s [%s]' % (
+        done_msg = (u'%2s ---> [%-30s] Processed in %s (%s) [%s]' % (
             identity, feed.feed_title[:30], unicode(delta),
-            self.feed_trans[ret_feed],))
+            feed.pk, self.feed_trans[ret_feed],))
         logging.debug(done_msg)
 
         self.feed_stats[ret_feed] += 1

@@ -384,8 +388,9 @@ class Dispatcher:
                                     active=True,
                                     user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                             .order_by('-last_read_date')
-        logging.debug(u'   ---> [%-30s] Computing scores for all feed subscribers: %s subscribers' % (
-            unicode(feed)[:30], user_subs.count()))
+        logging.debug(u'   ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
+            unicode(feed)[:30], user_subs.count(),
+            feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
 
         stories_db = MStory.objects(story_feed_id=feed.pk,
                                     story_date__gte=UNREAD_CUTOFF)