class IconImporter(object):
    """Download a feed's favicon, find its dominant color and store both.

    The results are written to ``feed.icon`` (``data``, ``icon_url`` and
    ``color``) by :meth:`save`.
    """

    # Headers sent with every favicon / page request.
    HEADERS = {
        'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
        'Connection': 'close',
    }

    def __init__(self, feed):
        """Remember the feed whose icon is going to be imported."""
        self.feed = feed

    def save(self):
        """Fetch the icon and persist its data, URL and dominant color.

        Does nothing when no icon could be fetched.
        """
        image, icon_url = self.fetch()
        if image is None:
            return
        color = self.determine_dominant_color_in_image(image)
        image_str = self.string_from_image(image)
        self.feed.icon.data = image_str
        self.feed.icon.icon_url = icon_url
        self.feed.icon.color = color
        self.feed.icon.save()

    def fetch(self, path='favicon.ico'):
        """Fetch the feed's icon and return ``(image, url)``.

        The stored ``feed.icon.icon_url`` is tried first, exactly as stored.
        Without a stored URL, ``path`` is appended to ``feed.feed_link``.
        If that download fails, the page at ``feed.feed_link`` is searched
        for a ``<link rel="icon">`` / ``<link rel="shortcut icon">`` tag.

        Args:
            path: File name appended to the feed link when no icon URL has
                been stored yet.

        Returns:
            ``(image, url)`` where ``image`` is the icon resized to 16x16 and
            ``url`` is the address it was downloaded from, or
            ``(None, None)`` when no icon could be fetched. A pair is always
            returned so callers can unpack the result unconditionally.
        """
        stored_url = self.feed.icon.icon_url
        page_url = self.feed.feed_link

        if stored_url:
            # A stored URL may point at any file name (e.g. one discovered
            # through a <link> tag), so it must not be rewritten.
            url = stored_url
        elif page_url:
            url = page_url
            if not url.endswith(path):
                if not url.endswith('/'):
                    url += '/'
                url += path
        else:
            # Neither a stored icon URL nor a feed link: nothing to fetch.
            return None, None

        # ImageFile.Parser.close() raises IOError when the payload is not a
        # decodable image (e.g. an HTML error page served with status 200).
        fetch_errors = (urllib2.HTTPError, urllib2.URLError, IOError)

        try:
            image = self._request_image(url)
        except fetch_errors:
            image = None

        if image is None:
            if not page_url:
                return None, None
            try:
                url = self._discover_icon_url(page_url)
                if not url:
                    return None, None
                image = self._request_image(url)
            except fetch_errors:
                return None, None

        return image.resize((16, 16)), url

    def _request_image(self, url):
        """Download ``url`` and return the payload parsed as a PIL image."""
        request = urllib2.Request(url, headers=self.HEADERS)
        response = urllib2.urlopen(request)
        try:
            parser = ImageFile.Parser()
            while True:
                chunk = response.read(1024)
                if not chunk:
                    break
                parser.feed(chunk)
            return parser.close()
        finally:
            response.close()

    def _discover_icon_url(self, page_url):
        """Return the icon URL advertised by the page, or None if absent."""
        # Local import: only this fallback path needs it.
        import urlparse

        request = urllib2.Request(page_url, headers=self.HEADERS)
        response = urllib2.urlopen(request)
        try:
            # 2048 bytes should be enough for most of websites
            content = response.read(2048)
        finally:
            response.close()
        if not content:
            return None

        hrefs = lxml.html.fromstring(content).xpath(
            '//link[@rel="icon" or @rel="shortcut icon"]/@href'
        )
        if not hrefs:
            return None
        # urljoin resolves relative, root-relative and absolute hrefs alike;
        # plain concatenation only works for the first kind.
        return urlparse.urljoin(page_url, hrefs[0])

    def determine_dominant_color_in_image(self, image):
        """Return the most frequent color of ``image`` as a hex string.

        The pixels are grouped into ``NUM_CLUSTERS`` k-means clusters and the
        center of the cluster with the most pixels is returned.
        """
        NUM_CLUSTERS = 5

        if image.mode == 'P':
            # NOTE(review): presumably done so palette images become
            # multi-channel arrays below -- confirm.
            image.putalpha(0)

        ar = scipy.misc.fromimage(image)
        shape = ar.shape
        if len(shape) > 2:
            # Flatten rows x columns into one list of pixels.
            ar = ar.reshape(scipy.product(shape[:2]), shape[2])

        codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
        counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
        peak = codes[scipy.argmax(counts)]                  # most frequent

        # Only the first three channels are kept: an alpha channel would
        # yield eight hex digits, and FeedIcon.color holds six characters.
        return ''.join('%02x' % int(c) for c in peak[:3])

    def string_from_image(self, image):
        """Return ``image`` serialized as PNG and base64-encoded."""
        output = StringIO.StringIO()
        try:
            image.save(output, format="PNG")
            return output.getvalue().encode('base64')
        finally:
            output.close()
self.gf('django.db.models.fields.CharField')(max_length=2000, null=True, blank=True)), + )) + db.send_create_signal('rss_feeds', ['FeedIcon']) + + + def backwards(self, orm): + + # Deleting model 'FeedIcon' + db.delete_table('rss_feeds_feedicon') + + + models = { + 'rss_feeds.duplicatefeed': { + 'Meta': {'object_name': 'DuplicateFeed'}, + 'duplicate_address': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255'}), + 'duplicate_feed_id': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True'}), + 'feed': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'duplicate_addresses'", 'to': "orm['rss_feeds.Feed']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}) + }, + 'rss_feeds.feed': { + 'Meta': {'ordering': "['feed_title']", 'object_name': 'Feed', 'db_table': "'feeds'"}, + 'active': ('django.db.models.fields.BooleanField', [], {'default': 'True', 'db_index': 'True'}), + 'active_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}), + 'average_stories_per_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'creation': ('django.db.models.fields.DateField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'days_to_trim': ('django.db.models.fields.IntegerField', [], {'default': '90'}), + 'etag': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}), + 'exception_code': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'feed_address': ('django.db.models.fields.URLField', [], {'unique': 'True', 'max_length': '255'}), + 'feed_link': ('django.db.models.fields.URLField', [], {'default': "''", 'max_length': '1000', 'null': 'True', 'blank': 'True'}), + 'feed_title': ('django.db.models.fields.CharField', [], {'default': "''", 'max_length': '255', 'null': 'True', 'blank': 'True'}), + 'fetched_once': ('django.db.models.fields.BooleanField', [], {'default': 
'False'}), + 'has_feed_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}), + 'has_page_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'last_load_time': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'last_modified': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}), + 'last_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}), + 'min_to_decay': ('django.db.models.fields.IntegerField', [], {'default': '0'}), + 'next_scheduled_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}), + 'num_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}), + 'premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}), + 'queued_date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}), + 'stories_last_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}) + }, + 'rss_feeds.feeddata': { + 'Meta': {'object_name': 'FeedData'}, + 'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'data'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}), + 'feed_tagline': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'popular_authors': ('django.db.models.fields.CharField', [], {'max_length': '2048', 'null': 'True', 'blank': 'True'}), + 'popular_tags': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}), + 'story_count_history': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}) + }, + 'rss_feeds.feedicon': { + 'Meta': {'object_name': 'FeedIcon'}, + 'color': ('django.db.models.fields.CharField', [], {'default': "'000000'", 
'max_length': '6'}), + 'data': ('django.db.models.fields.TextField', [], {}), + 'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'icon'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}), + 'icon_url': ('django.db.models.fields.CharField', [], {'max_length': '2000', 'null': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}) + }, + 'rss_feeds.feedloadtime': { + 'Meta': {'object_name': 'FeedLoadtime'}, + 'date_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}), + 'feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'loadtime': ('django.db.models.fields.FloatField', [], {}) + }, + 'rss_feeds.feedupdatehistory': { + 'Meta': {'object_name': 'FeedUpdateHistory'}, + 'average_per_feed': ('django.db.models.fields.DecimalField', [], {'max_digits': '4', 'decimal_places': '1'}), + 'fetch_date': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'number_of_feeds': ('django.db.models.fields.IntegerField', [], {}), + 'seconds_taken': ('django.db.models.fields.IntegerField', [], {}) + } + } + + complete_apps = ['rss_feeds'] diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index 68099741b..4919238db 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -22,7 +22,6 @@ from utils import feedfinder from utils.fields import AutoOneToOneField from utils.feed_functions import levenshtein_distance from utils.feed_functions import timelimit -from utils.feed_functions import fetch_site_favicon, determine_dominant_color_in_image from utils.story_functions import pre_process_story from utils.diff import HTMLDiff from utils import log as logging @@ -701,12 +700,6 @@ class Feed(models.Model): return phrases - def 
class FeedIcon(models.Model):
    """Icon record attached one-to-one to a Feed, reachable as ``feed.icon``."""

    feed = AutoOneToOneField(Feed, related_name='icon')
    # Hex color string; "000000" unless another value is saved.
    color = models.CharField(max_length=6, default="000000")
    # Icon payload as text.
    data = models.TextField()
    # Address the icon was downloaded from, if known.
    icon_url = models.CharField(max_length=2000, blank=True, null=True)

    def save(self, *args, **kwargs):
        """Save the row; on a database error, delete it if it already exists.

        IntegrityError and OperationError raised by the underlying save are
        swallowed rather than propagated.
        """
        try:
            super(FeedIcon, self).save(*args, **kwargs)
        except (IntegrityError, OperationError):
            if not self.id:
                return
            self.delete()
@@ from django.db import IntegrityError # from mongoengine.queryset import Q from apps.reader.models import UserSubscription, MUserStory from apps.rss_feeds.models import Feed, MStory -from apps.rss_feeds.importer import PageImporter +from apps.rss_feeds.page_importer import PageImporter +from apps.rss_feeds.icon_importer import IconImporter from utils import feedparser from utils.story_functions import pre_process_story from utils import log as logging @@ -339,6 +340,16 @@ class Dispatcher: ret_feed = FEED_ERREXC feed.save_feed_history(550, "Page Error", tb) fetched_feed = None + + icon_importer = IconImporter(feed) + try: + icon_importer.save() + except Exception, e: + logging.debug('[%d] ! -------------------------' % (feed_id,)) + tb = traceback.format_exc() + logging.error(tb) + logging.debug('[%d] ! -------------------------' % (feed_id,)) + # feed.save_feed_history(560, "Icon Error", tb) feed = self.refresh_feed(feed_id) delta = datetime.datetime.utcnow() - start_time diff --git a/utils/feed_functions.py b/utils/feed_functions.py index 78db9f7e1..334a31814 100644 --- a/utils/feed_functions.py +++ b/utils/feed_functions.py @@ -1,12 +1,6 @@ import datetime import threading import sys -import urllib2 -import lxml.html -from PIL import ImageFile -import scipy -import scipy.misc -import scipy.cluster from django.utils.translation import ungettext from utils import feedfinder @@ -176,76 +170,6 @@ def format_relative_date(date, future=False): return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60), '' if future else 'ago') - -def fetch_site_favicon(url, path='favicon.ico'): - HEADERS = { - 'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com', - 'Connection': 'close', - } - image = None - - if not url.endswith('/'): - url += '/' - - def request_image(request): - icon = urllib2.urlopen(request) - parser = ImageFile.Parser() - while True: - s = icon.read(1024) - if not s: - break - parser.feed(s) - image = parser.close() - return image - - request = 
urllib2.Request(url + 'favicon.ico', headers=HEADERS) - try: - image = request_image(request) - except(urllib2.HTTPError, urllib2.URLError): - request = urllib2.Request(url, headers=HEADERS) - try: - content = urllib2.urlopen(request).read(2048) # 2048 bytes should be enough for most of websites - except(urllib2.HTTPError, urllib2.URLError): - return - icon_path = lxml.html.fromstring(content).xpath( - '//link[@rel="icon" or @rel="shortcut icon"]/@href' - ) - if icon_path: - request = urllib2.Request(url + icon_path[0], headers=HEADERS) - try: - image = request_image(request) - except(urllib2.HTTPError, urllib2.URLError): - return - - image = image.resize((16, 16)) - - return image - -def determine_dominant_color_in_image(image): - - NUM_CLUSTERS = 5 - - if image.mode == 'P': - image.putalpha(0) - ar = scipy.misc.fromimage(image) - shape = ar.shape - if len(shape) > 2: - ar = ar.reshape(scipy.product(shape[:2]), shape[2]) - - codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS) - colors = [''.join(chr(c) for c in code).encode('hex') for code in codes] - - vecs, dist = scipy.cluster.vq.vq(ar, codes) # assign codes - counts, bins = scipy.histogram(vecs, len(codes)) # count occurrences - print counts - total = scipy.sum(counts) - print colors - print dict(zip(colors, [count/float(total) for count in counts])) - index_max = scipy.argmax(counts) # find most frequent - peak = codes[index_max] - colour = ''.join(chr(c) for c in peak).encode('hex') - print 'most frequent is %s (#%s)' % (peak, colour) - def add_object_to_folder(obj, folder, folders): if not folder: folders.append(obj)