Finally saving proper base64-encoded favicons, along with their URLs, autodetecting them when not found at /favicon.ico; now all that's left is serving the icons and their colors. Woohoo!

This commit is contained in:
Samuel Clay 2011-01-27 19:05:50 -05:00
parent 36a960b13b
commit a182bb1216
7 changed files with 231 additions and 89 deletions

View file

@ -0,0 +1,108 @@
import StringIO
import urllib2
import urlparse

import lxml.html
import scipy
import scipy.cluster
import scipy.misc
from PIL import ImageFile
class IconImporter(object):
    """Fetches a feed's favicon, determines its dominant color, and stores
    both (icon as a base64-encoded PNG, color as a hex string) on the
    feed's associated icon record."""

    def __init__(self, feed):
        self.feed = feed

    def save(self):
        """Fetch the favicon and persist its data, source URL, and dominant
        color onto ``self.feed.icon``. Returns silently when no icon could
        be fetched."""
        image, icon_url = self.fetch()
        if not image:
            return
        color = self.determine_dominant_color_in_image(image)
        image_str = self.string_from_image(image)
        self.feed.icon.data = image_str
        self.feed.icon.icon_url = icon_url
        self.feed.icon.color = color
        self.feed.icon.save()

    def fetch(self, path='favicon.ico'):
        """Locate and download the site's favicon.

        Tries the stored icon URL (or ``<feed_link>/favicon.ico``) first,
        then falls back to the ``<link rel="icon">`` declared in the page
        head.  Returns ``(16x16 PIL image, url)`` on success, or
        ``(None, None)`` on any failure so callers can tuple-unpack safely.
        """
        HEADERS = {
            'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
            'Connection': 'close',
        }
        image = None
        url = self.feed.icon.icon_url
        if not url:
            url = self.feed.feed_link
        if not url.endswith('/') and not url.endswith('favicon.ico'):
            url += '/favicon.ico'
        if url.endswith('/'):
            url += 'favicon.ico'

        def request_image(request):
            # Stream the response through PIL's incremental parser so we
            # never hold more than 1KB chunks in flight.
            icon = urllib2.urlopen(request)
            parser = ImageFile.Parser()
            while True:
                s = icon.read(1024)
                if not s:
                    break
                parser.feed(s)
            return parser.close()

        request = urllib2.Request(url, headers=HEADERS)
        try:
            image = request_image(request)
        except (urllib2.HTTPError, urllib2.URLError):
            # /favicon.ico failed -- look for an explicit <link rel="icon">
            # in the page's head instead.
            request = urllib2.Request(self.feed.feed_link, headers=HEADERS)
            try:
                # 2048 bytes should be enough to cover most sites' <head>.
                content = urllib2.urlopen(request).read(2048)
            except (urllib2.HTTPError, urllib2.URLError):
                return None, None
            icon_path = lxml.html.fromstring(content).xpath(
                '//link[@rel="icon" or @rel="shortcut icon"]/@href'
            )
            if not icon_path:
                return None, None
            # urljoin handles absolute, root-relative, and relative hrefs,
            # unlike naive string concatenation.
            url = urlparse.urljoin(self.feed.feed_link, icon_path[0])
            request = urllib2.Request(url, headers=HEADERS)
            try:
                image = request_image(request)
            except (urllib2.HTTPError, urllib2.URLError):
                return None, None
        if not image:
            # ImageFile.Parser.close() can fail on non-image responses.
            return None, None
        image = image.resize((16, 16))
        return image, url

    def determine_dominant_color_in_image(self, image):
        """Return the image's dominant color as a 6-character hex string,
        found by k-means clustering the pixel values and taking the
        centroid of the most populated cluster."""
        NUM_CLUSTERS = 5
        if image.mode == 'P':
            # Palette images need an alpha band so fromimage yields
            # per-channel pixel arrays.
            image.putalpha(0)
        ar = scipy.misc.fromimage(image)
        shape = ar.shape
        if len(shape) > 2:
            # Flatten (w, h, channels) into a (pixels, channels)
            # observation matrix for kmeans.
            ar = ar.reshape(scipy.product(shape[:2]), shape[2])
        codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        vecs, dist = scipy.cluster.vq.vq(ar, codes)       # assign pixels to clusters
        counts, bins = scipy.histogram(vecs, len(codes))  # count cluster occupancy
        index_max = scipy.argmax(counts)                  # most frequent cluster
        peak = codes[index_max]
        color = ''.join(chr(c) for c in peak).encode('hex')
        return color

    def string_from_image(self, image):
        """Serialize the image to a base64-encoded PNG string."""
        output = StringIO.StringIO()
        image.save(output, format="PNG")
        contents = output.getvalue()
        output.close()
        return contents.encode('base64')

View file

@ -0,0 +1,96 @@
# encoding: utf-8
import datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models
class Migration(SchemaMigration):
    # South schema migration: introduces the FeedIcon model, a one-to-one
    # companion to Feed that stores the favicon data, its source URL, and
    # its dominant color.  Auto-generated; only comments/indentation added.

    def forwards(self, orm):
        # Adding model 'FeedIcon'
        db.create_table('rss_feeds_feedicon', (
            ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
            ('feed', self.gf('utils.fields.AutoOneToOneField')(related_name='icon', unique=True, to=orm['rss_feeds.Feed'])),
            ('color', self.gf('django.db.models.fields.CharField')(default='000000', max_length=6)),
            ('data', self.gf('django.db.models.fields.TextField')()),
            ('icon_url', self.gf('django.db.models.fields.CharField')(max_length=2000, null=True, blank=True)),
        ))
        db.send_create_signal('rss_feeds', ['FeedIcon'])

    def backwards(self, orm):
        # Deleting model 'FeedIcon'
        db.delete_table('rss_feeds_feedicon')

    # Frozen ORM snapshot of the rss_feeds app at the time of this
    # migration (South uses this, not the live models, when running it).
    models = {
        'rss_feeds.duplicatefeed': {
            'Meta': {'object_name': 'DuplicateFeed'},
            'duplicate_address': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255'}),
            'duplicate_feed_id': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'duplicate_addresses'", 'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        'rss_feeds.feed': {
            'Meta': {'ordering': "['feed_title']", 'object_name': 'Feed', 'db_table': "'feeds'"},
            'active': ('django.db.models.fields.BooleanField', [], {'default': 'True', 'db_index': 'True'}),
            'active_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}),
            'average_stories_per_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'creation': ('django.db.models.fields.DateField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'days_to_trim': ('django.db.models.fields.IntegerField', [], {'default': '90'}),
            'etag': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'exception_code': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'feed_address': ('django.db.models.fields.URLField', [], {'unique': 'True', 'max_length': '255'}),
            'feed_link': ('django.db.models.fields.URLField', [], {'default': "''", 'max_length': '1000', 'null': 'True', 'blank': 'True'}),
            'feed_title': ('django.db.models.fields.CharField', [], {'default': "''", 'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'fetched_once': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'has_feed_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'has_page_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'last_load_time': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'last_modified': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
            'last_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'min_to_decay': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'next_scheduled_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'num_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'queued_date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'stories_last_month': ('django.db.models.fields.IntegerField', [], {'default': '0'})
        },
        'rss_feeds.feeddata': {
            'Meta': {'object_name': 'FeedData'},
            'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'data'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
            'feed_tagline': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'popular_authors': ('django.db.models.fields.CharField', [], {'max_length': '2048', 'null': 'True', 'blank': 'True'}),
            'popular_tags': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'story_count_history': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'})
        },
        'rss_feeds.feedicon': {
            'Meta': {'object_name': 'FeedIcon'},
            'color': ('django.db.models.fields.CharField', [], {'default': "'000000'", 'max_length': '6'}),
            'data': ('django.db.models.fields.TextField', [], {}),
            'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'icon'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
            'icon_url': ('django.db.models.fields.CharField', [], {'max_length': '2000', 'null': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        'rss_feeds.feedloadtime': {
            'Meta': {'object_name': 'FeedLoadtime'},
            'date_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'loadtime': ('django.db.models.fields.FloatField', [], {})
        },
        'rss_feeds.feedupdatehistory': {
            'Meta': {'object_name': 'FeedUpdateHistory'},
            'average_per_feed': ('django.db.models.fields.DecimalField', [], {'max_digits': '4', 'decimal_places': '1'}),
            'fetch_date': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'number_of_feeds': ('django.db.models.fields.IntegerField', [], {}),
            'seconds_taken': ('django.db.models.fields.IntegerField', [], {})
        }
    }

    complete_apps = ['rss_feeds']

View file

@ -22,7 +22,6 @@ from utils import feedfinder
from utils.fields import AutoOneToOneField from utils.fields import AutoOneToOneField
from utils.feed_functions import levenshtein_distance from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit from utils.feed_functions import timelimit
from utils.feed_functions import fetch_site_favicon, determine_dominant_color_in_image
from utils.story_functions import pre_process_story from utils.story_functions import pre_process_story
from utils.diff import HTMLDiff from utils.diff import HTMLDiff
from utils import log as logging from utils import log as logging
@ -701,12 +700,6 @@ class Feed(models.Model):
return phrases return phrases
def download_favicon(self):
icon = fetch_site_favicon(self.feed_link)
if icon:
determine_dominant_color_in_image(icon)
class Meta: class Meta:
db_table="feeds" db_table="feeds"
ordering=["feed_title"] ordering=["feed_title"]
@ -731,6 +724,20 @@ class FeedData(models.Model):
except (IntegrityError, OperationError): except (IntegrityError, OperationError):
if self.id: self.delete() if self.id: self.delete()
class FeedIcon(models.Model):
    """One-to-one companion record to Feed holding its favicon."""
    feed = AutoOneToOneField(Feed, related_name='icon')
    # Dominant icon color as a 6-char hex string (no leading '#').
    color = models.CharField(max_length=6, default="000000")
    # Base64-encoded PNG bytes of the icon (written by IconImporter).
    data = models.TextField()
    # Where the icon was fetched from; may be absent until first fetch.
    icon_url = models.CharField(max_length=2000, blank=True, null=True)
    def save(self, *args, **kwargs):
        # Mirrors FeedData.save(): on integrity/operation errors, delete the
        # row rather than leave a half-saved duplicate record behind.
        try:
            super(FeedIcon, self).save(*args, **kwargs)
        except (IntegrityError, OperationError):
            if self.id: self.delete()
class MFeedPage(mongo.Document): class MFeedPage(mongo.Document):
feed_id = mongo.IntField(primary_key=True) feed_id = mongo.IntField(primary_key=True)
page_data = mongo.BinaryField() page_data = mongo.BinaryField()

View file

@ -1,9 +1,6 @@
import socket
socket.setdefaulttimeout(15)
import urllib2 import urllib2
import re import re
import urlparse import urlparse
import multiprocessing
import traceback import traceback
import feedparser import feedparser
from utils import log as logging from utils import log as logging
@ -14,7 +11,6 @@ class PageImporter(object):
def __init__(self, url, feed): def __init__(self, url, feed):
self.url = url self.url = url
self.feed = feed self.feed = feed
self.lock = multiprocessing.Lock()
def fetch_page(self): def fetch_page(self):
if not self.url: if not self.url:

View file

@ -1,7 +1,7 @@
{ {
"name": "NewsBlur", "name": "NewsBlur",
"description": "RSS feed reading with intelligence.", "description": "RSS feed reading with intelligence.",
"version": "1", "version": "1.0",
"icons": { "icons": {
"48": "48.png", "48": "48.png",
"128": "128.png" "128": "128.png"

View file

@ -6,7 +6,8 @@ from django.db import IntegrityError
# from mongoengine.queryset import Q # from mongoengine.queryset import Q
from apps.reader.models import UserSubscription, MUserStory from apps.reader.models import UserSubscription, MUserStory
from apps.rss_feeds.models import Feed, MStory from apps.rss_feeds.models import Feed, MStory
from apps.rss_feeds.importer import PageImporter from apps.rss_feeds.page_importer import PageImporter
from apps.rss_feeds.icon_importer import IconImporter
from utils import feedparser from utils import feedparser
from utils.story_functions import pre_process_story from utils.story_functions import pre_process_story
from utils import log as logging from utils import log as logging
@ -340,6 +341,16 @@ class Dispatcher:
feed.save_feed_history(550, "Page Error", tb) feed.save_feed_history(550, "Page Error", tb)
fetched_feed = None fetched_feed = None
icon_importer = IconImporter(feed)
try:
icon_importer.save()
except Exception, e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
# feed.save_feed_history(560, "Icon Error", tb)
feed = self.refresh_feed(feed_id) feed = self.refresh_feed(feed_id)
delta = datetime.datetime.utcnow() - start_time delta = datetime.datetime.utcnow() - start_time

View file

@ -1,12 +1,6 @@
import datetime import datetime
import threading import threading
import sys import sys
import urllib2
import lxml.html
from PIL import ImageFile
import scipy
import scipy.misc
import scipy.cluster
from django.utils.translation import ungettext from django.utils.translation import ungettext
from utils import feedfinder from utils import feedfinder
@ -176,76 +170,6 @@ def format_relative_date(date, future=False):
return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60), return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60),
'' if future else 'ago') '' if future else 'ago')
def fetch_site_favicon(url, path='favicon.ico'):
    """Fetch a site's favicon as a 16x16 PIL image.

    Tries ``<url>/favicon.ico`` first, then falls back to the
    ``<link rel="icon">`` declared in the page's head.  Returns the
    resized image, or None when nothing could be fetched.  (``path`` is
    kept for interface compatibility; the filename is currently fixed.)
    """
    HEADERS = {
        'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
        'Connection': 'close',
    }
    image = None
    if not url.endswith('/'):
        url += '/'

    def request_image(request):
        # Stream the response through PIL's incremental parser.
        icon = urllib2.urlopen(request)
        parser = ImageFile.Parser()
        while True:
            s = icon.read(1024)
            if not s:
                break
            parser.feed(s)
        return parser.close()

    request = urllib2.Request(url + 'favicon.ico', headers=HEADERS)
    try:
        image = request_image(request)
    except (urllib2.HTTPError, urllib2.URLError):
        request = urllib2.Request(url, headers=HEADERS)
        try:
            # 2048 bytes should be enough to cover most sites' <head>.
            content = urllib2.urlopen(request).read(2048)
        except (urllib2.HTTPError, urllib2.URLError):
            return None
        icon_path = lxml.html.fromstring(content).xpath(
            '//link[@rel="icon" or @rel="shortcut icon"]/@href'
        )
        if not icon_path:
            # No declared icon: bail out instead of calling .resize() on
            # None (the original crashed with AttributeError here).
            return None
        request = urllib2.Request(url + icon_path[0], headers=HEADERS)
        try:
            image = request_image(request)
        except (urllib2.HTTPError, urllib2.URLError):
            return None
    if image is None:
        # ImageFile.Parser.close() can fail on non-image responses.
        return None
    return image.resize((16, 16))
def determine_dominant_color_in_image(image):
    """Return the dominant color of a PIL image as a 6-char hex string.

    Clusters the pixel values with k-means (NUM_CLUSTERS groups) and
    returns the centroid of the most populated cluster.  The original
    computed ``colour`` but never returned it, so callers always got
    None; debug prints are removed as well.
    """
    NUM_CLUSTERS = 5
    if image.mode == 'P':
        # Palette images need an alpha band so fromimage yields
        # per-channel pixel arrays.
        image.putalpha(0)
    ar = scipy.misc.fromimage(image)
    shape = ar.shape
    if len(shape) > 2:
        # Flatten (w, h, channels) into a (pixels, channels)
        # observation matrix for kmeans.
        ar = ar.reshape(scipy.product(shape[:2]), shape[2])
    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
    vecs, dist = scipy.cluster.vq.vq(ar, codes)       # assign pixels to clusters
    counts, bins = scipy.histogram(vecs, len(codes))  # count cluster occupancy
    index_max = scipy.argmax(counts)                  # most frequent cluster
    peak = codes[index_max]
    colour = ''.join(chr(c) for c in peak).encode('hex')
    return colour
def add_object_to_folder(obj, folder, folders): def add_object_to_folder(obj, folder, folders):
if not folder: if not folder:
folders.append(obj) folders.append(obj)