Finally saving proper base64-encoded favicons, along with their URLs, autodetecting when they're not found at /favicon.ico, and now all that's left is serving icons and their colors. Woohoo!

Samuel Clay 2011-01-27 19:05:50 -05:00
parent 36a960b13b
commit a182bb1216
7 changed files with 231 additions and 89 deletions

apps/rss_feeds/icon_importer.py (new file)

@@ -0,0 +1,108 @@
import urllib2
import lxml.html
import scipy
import scipy.misc
import scipy.cluster
import StringIO
from PIL import ImageFile
class IconImporter(object):
def __init__(self, feed):
self.feed = feed
def save(self):
image, icon_url = self.fetch()
if not image: return
color = self.determine_dominant_color_in_image(image)
image_str = self.string_from_image(image)
self.feed.icon.data = image_str
self.feed.icon.icon_url = icon_url
self.feed.icon.color = color
self.feed.icon.save()
def fetch(self, path='favicon.ico'):
HEADERS = {
'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
'Connection': 'close',
}
image = None
url = self.feed.icon.icon_url
if not url:
url = self.feed.feed_link
if not url.endswith('/') and not url.endswith('favicon.ico'):
url += '/favicon.ico'
if url.endswith('/'):
url += 'favicon.ico'
def request_image(request):
icon = urllib2.urlopen(request)
parser = ImageFile.Parser()
while True:
s = icon.read(1024)
if not s:
break
parser.feed(s)
image = parser.close()
return image
request = urllib2.Request(url, headers=HEADERS)
try:
image = request_image(request)
except(urllib2.HTTPError, urllib2.URLError):
request = urllib2.Request(self.feed.feed_link, headers=HEADERS)
try:
# 2048 bytes should be enough for most websites
content = urllib2.urlopen(request).read(2048)
except(urllib2.HTTPError, urllib2.URLError):
return
icon_path = lxml.html.fromstring(content).xpath(
'//link[@rel="icon" or @rel="shortcut icon"]/@href'
)
if icon_path:
url = self.feed.feed_link + icon_path[0]
request = urllib2.Request(url, headers=HEADERS)
try:
image = request_image(request)
except(urllib2.HTTPError, urllib2.URLError):
return
image = image.resize((16, 16))
return image, url
def determine_dominant_color_in_image(self, image):
NUM_CLUSTERS = 5
if image.mode == 'P':
image.putalpha(0)
ar = scipy.misc.fromimage(image)
shape = ar.shape
if len(shape) > 2:
ar = ar.reshape(scipy.product(shape[:2]), shape[2])
codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
vecs, dist = scipy.cluster.vq.vq(ar, codes) # assign codes
counts, bins = scipy.histogram(vecs, len(codes)) # count occurrences
total = scipy.sum(counts)
print dict(zip(colors, [count/float(total) for count in counts]))
index_max = scipy.argmax(counts) # find most frequent
peak = codes[index_max]
color = ''.join(chr(c) for c in peak).encode('hex')
print 'most frequent is %s (#%s)' % (peak, color)
return color
def string_from_image(self, image):
output = StringIO.StringIO()
image.save(output, format="PNG")
contents = output.getvalue()
output.close()
print contents.encode('base64')
return contents.encode('base64')
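
For context, the feed fetcher change later in this commit drives the importer roughly like this (a trimmed sketch, where feed is assumed to be a saved Feed instance):

from apps.rss_feeds.icon_importer import IconImporter

icon_importer = IconImporter(feed)
icon_importer.save()  # fetches the favicon (falling back to the page's <link rel="icon">),
                      # resizes it to 16x16, and stores the base64 PNG, its URL, and the
                      # dominant color on feed.icon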

New South migration under apps/rss_feeds/migrations/ (adds the FeedIcon table)

@@ -0,0 +1,96 @@
# encoding: utf-8
import datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models
class Migration(SchemaMigration):
def forwards(self, orm):
# Adding model 'FeedIcon'
db.create_table('rss_feeds_feedicon', (
('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
('feed', self.gf('utils.fields.AutoOneToOneField')(related_name='icon', unique=True, to=orm['rss_feeds.Feed'])),
('color', self.gf('django.db.models.fields.CharField')(default='000000', max_length=6)),
('data', self.gf('django.db.models.fields.TextField')()),
('icon_url', self.gf('django.db.models.fields.CharField')(max_length=2000, null=True, blank=True)),
))
db.send_create_signal('rss_feeds', ['FeedIcon'])
def backwards(self, orm):
# Deleting model 'FeedIcon'
db.delete_table('rss_feeds_feedicon')
models = {
'rss_feeds.duplicatefeed': {
'Meta': {'object_name': 'DuplicateFeed'},
'duplicate_address': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255'}),
'duplicate_feed_id': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True'}),
'feed': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'duplicate_addresses'", 'to': "orm['rss_feeds.Feed']"}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
},
'rss_feeds.feed': {
'Meta': {'ordering': "['feed_title']", 'object_name': 'Feed', 'db_table': "'feeds'"},
'active': ('django.db.models.fields.BooleanField', [], {'default': 'True', 'db_index': 'True'}),
'active_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}),
'average_stories_per_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
'creation': ('django.db.models.fields.DateField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'days_to_trim': ('django.db.models.fields.IntegerField', [], {'default': '90'}),
'etag': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
'exception_code': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
'feed_address': ('django.db.models.fields.URLField', [], {'unique': 'True', 'max_length': '255'}),
'feed_link': ('django.db.models.fields.URLField', [], {'default': "''", 'max_length': '1000', 'null': 'True', 'blank': 'True'}),
'feed_title': ('django.db.models.fields.CharField', [], {'default': "''", 'max_length': '255', 'null': 'True', 'blank': 'True'}),
'fetched_once': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
'has_feed_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
'has_page_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'last_load_time': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
'last_modified': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
'last_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
'min_to_decay': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
'next_scheduled_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
'num_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
'premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
'queued_date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
'stories_last_month': ('django.db.models.fields.IntegerField', [], {'default': '0'})
},
'rss_feeds.feeddata': {
'Meta': {'object_name': 'FeedData'},
'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'data'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
'feed_tagline': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'popular_authors': ('django.db.models.fields.CharField', [], {'max_length': '2048', 'null': 'True', 'blank': 'True'}),
'popular_tags': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
'story_count_history': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'})
},
'rss_feeds.feedicon': {
'Meta': {'object_name': 'FeedIcon'},
'color': ('django.db.models.fields.CharField', [], {'default': "'000000'", 'max_length': '6'}),
'data': ('django.db.models.fields.TextField', [], {}),
'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'icon'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
'icon_url': ('django.db.models.fields.CharField', [], {'max_length': '2000', 'null': 'True', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
},
'rss_feeds.feedloadtime': {
'Meta': {'object_name': 'FeedLoadtime'},
'date_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
'feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']"}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'loadtime': ('django.db.models.fields.FloatField', [], {})
},
'rss_feeds.feedupdatehistory': {
'Meta': {'object_name': 'FeedUpdateHistory'},
'average_per_feed': ('django.db.models.fields.DecimalField', [], {'max_digits': '4', 'decimal_places': '1'}),
'fetch_date': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'number_of_feeds': ('django.db.models.fields.IntegerField', [], {}),
'seconds_taken': ('django.db.models.fields.IntegerField', [], {})
}
}
complete_apps = ['rss_feeds']
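
Both the migration and the FeedIcon model further down use utils.fields.AutoOneToOneField for the feed column. Assuming that field follows the usual auto-one-to-one pattern of creating the related row on first access, this is what lets IconImporter assign to self.feed.icon without ever instantiating FeedIcon itself. A rough sketch of that access pattern (the feed id is hypothetical):

from apps.rss_feeds.models import Feed

feed = Feed.objects.get(pk=1)  # hypothetical id
icon = feed.icon               # assumed to be created on demand if it does not exist yet
icon.color = '000000'
icon.save()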

apps/rss_feeds/models.py

@@ -22,7 +22,6 @@ from utils import feedfinder
from utils.fields import AutoOneToOneField
from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit
from utils.feed_functions import fetch_site_favicon, determine_dominant_color_in_image
from utils.story_functions import pre_process_story
from utils.diff import HTMLDiff
from utils import log as logging
@@ -701,12 +700,6 @@ class Feed(models.Model):
return phrases
def download_favicon(self):
icon = fetch_site_favicon(self.feed_link)
if icon:
determine_dominant_color_in_image(icon)
class Meta:
db_table="feeds"
ordering=["feed_title"]
@@ -731,6 +724,20 @@ class FeedData(models.Model):
except (IntegrityError, OperationError):
if self.id: self.delete()
class FeedIcon(models.Model):
feed = AutoOneToOneField(Feed, related_name='icon')
color = models.CharField(max_length=6, default="000000")
data = models.TextField()
icon_url = models.CharField(max_length=2000, blank=True, null=True)
def save(self, *args, **kwargs):
try:
super(FeedIcon, self).save(*args, **kwargs)
except (IntegrityError, OperationError):
if self.id: self.delete()
class MFeedPage(mongo.Document):
feed_id = mongo.IntField(primary_key=True)
page_data = mongo.BinaryField()
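
Serving is not part of this commit yet, but with the base64 PNG stored on FeedIcon, a view could look roughly like the sketch below. The view function and its URL wiring are hypothetical, written against the Python 2 / Django APIs this codebase already uses:

from django.http import HttpResponse
from apps.rss_feeds.models import Feed

def feed_icon(request, feed_id):
    # Hypothetical view: decode the stored base64 PNG and return it.
    # feed.icon.color holds the dominant color as a hex string for the client.
    feed = Feed.objects.get(pk=feed_id)
    return HttpResponse(feed.icon.data.decode('base64'), mimetype='image/png')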

apps/rss_feeds/page_importer.py

@@ -1,9 +1,6 @@
import socket
socket.setdefaulttimeout(15)
import urllib2
import re
import urlparse
import multiprocessing
import traceback
import feedparser
from utils import log as logging
@@ -14,7 +11,6 @@ class PageImporter(object):
def __init__(self, url, feed):
self.url = url
self.feed = feed
self.lock = multiprocessing.Lock()
def fetch_page(self):
if not self.url:

manifest.json (browser extension)

@@ -1,7 +1,7 @@
{
"name": "NewsBlur",
"description": "RSS feed reading with intelligence.",
"version": "1",
"version": "1.0",
"icons": {
"48": "48.png",
"128": "128.png"

Feed fetcher (class Dispatcher)

@@ -6,7 +6,8 @@ from django.db import IntegrityError
# from mongoengine.queryset import Q
from apps.reader.models import UserSubscription, MUserStory
from apps.rss_feeds.models import Feed, MStory
from apps.rss_feeds.importer import PageImporter
from apps.rss_feeds.page_importer import PageImporter
from apps.rss_feeds.icon_importer import IconImporter
from utils import feedparser
from utils.story_functions import pre_process_story
from utils import log as logging
@@ -340,6 +341,16 @@ class Dispatcher:
feed.save_feed_history(550, "Page Error", tb)
fetched_feed = None
icon_importer = IconImporter(feed)
try:
icon_importer.save()
except Exception, e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
# feed.save_feed_history(560, "Icon Error", tb)
feed = self.refresh_feed(feed_id)
delta = datetime.datetime.utcnow() - start_time

utils/feed_functions.py

@@ -1,12 +1,6 @@
import datetime
import threading
import sys
import urllib2
import lxml.html
from PIL import ImageFile
import scipy
import scipy.misc
import scipy.cluster
from django.utils.translation import ungettext
from utils import feedfinder
@@ -176,76 +170,6 @@ def format_relative_date(date, future=False):
return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60),
'' if future else 'ago')
def fetch_site_favicon(url, path='favicon.ico'):
HEADERS = {
'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
'Connection': 'close',
}
image = None
if not url.endswith('/'):
url += '/'
def request_image(request):
icon = urllib2.urlopen(request)
parser = ImageFile.Parser()
while True:
s = icon.read(1024)
if not s:
break
parser.feed(s)
image = parser.close()
return image
request = urllib2.Request(url + 'favicon.ico', headers=HEADERS)
try:
image = request_image(request)
except(urllib2.HTTPError, urllib2.URLError):
request = urllib2.Request(url, headers=HEADERS)
try:
content = urllib2.urlopen(request).read(2048) # 2048 bytes should be enough for most of websites
except(urllib2.HTTPError, urllib2.URLError):
return
icon_path = lxml.html.fromstring(content).xpath(
'//link[@rel="icon" or @rel="shortcut icon"]/@href'
)
if icon_path:
request = urllib2.Request(url + icon_path[0], headers=HEADERS)
try:
image = request_image(request)
except(urllib2.HTTPError, urllib2.URLError):
return
image = image.resize((16, 16))
return image
def determine_dominant_color_in_image(image):
NUM_CLUSTERS = 5
if image.mode == 'P':
image.putalpha(0)
ar = scipy.misc.fromimage(image)
shape = ar.shape
if len(shape) > 2:
ar = ar.reshape(scipy.product(shape[:2]), shape[2])
codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
vecs, dist = scipy.cluster.vq.vq(ar, codes) # assign codes
counts, bins = scipy.histogram(vecs, len(codes)) # count occurrences
print counts
total = scipy.sum(counts)
print colors
print dict(zip(colors, [count/float(total) for count in counts]))
index_max = scipy.argmax(counts) # find most frequent
peak = codes[index_max]
colour = ''.join(chr(c) for c in peak).encode('hex')
print 'most frequent is %s (#%s)' % (peak, colour)
def add_object_to_folder(obj, folder, folders):
if not folder:
folders.append(obj)