mirror of
https://github.com/viq/NewsBlur.git
synced 2025-04-13 09:38:09 +00:00
Finally saving proper base64-encoded favicons, along with their URLs, autodetecting them when not found at /favicon.ico. All that's left is serving the icons and their colors. Woohoo!
This commit is contained in:
parent
36a960b13b
commit
a182bb1216
7 changed files with 231 additions and 89 deletions
108
apps/rss_feeds/icon_importer.py
Normal file
108
apps/rss_feeds/icon_importer.py
Normal file
|
@ -0,0 +1,108 @@
|
|||
import StringIO
import urllib2
import urlparse

import lxml.html
import scipy
import scipy.cluster
import scipy.misc
from PIL import ImageFile
|
||||
|
||||
class IconImporter(object):
    """Fetches a site's favicon, computes its dominant color, and stores
    the base64-encoded PNG, the hex color, and the source URL on the
    feed's icon record."""

    def __init__(self, feed):
        self.feed = feed

    def save(self):
        """Fetch the favicon and persist its data, URL, and dominant color.

        A no-op when no icon could be retrieved.
        """
        image, icon_url = self.fetch()
        if not image:
            return
        color = self.determine_dominant_color_in_image(image)
        image_str = self.string_from_image(image)
        self.feed.icon.data = image_str
        self.feed.icon.icon_url = icon_url
        self.feed.icon.color = color
        self.feed.icon.save()

    def fetch(self, path='favicon.ico'):
        """Return ``(image, url)`` for the feed's favicon as a 16x16 PIL
        image, or ``(None, None)`` on failure.

        Tries, in order: the previously stored icon URL, ``/<path>`` under
        the feed link, and finally any ``<link rel="icon">`` declared in
        the page head.
        """
        HEADERS = {
            'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
            'Connection': 'close',
        }
        image = None
        url = self.feed.icon.icon_url

        if not url:
            url = self.feed.feed_link
        if not url:
            # Nothing to try at all (feed has no link and no stored icon URL).
            return None, None

        # Point the URL at /<path> unless it already names an icon file.
        # Bug fix: the icon filename was hard-coded, ignoring `path`.
        if not url.endswith('/') and not url.endswith(path):
            url += '/' + path
        if url.endswith('/'):
            url += path

        def request_image(request):
            # Stream the icon in 1KB chunks through PIL's incremental parser
            # so a single oversized response cannot be slurped at once.
            icon = urllib2.urlopen(request)
            parser = ImageFile.Parser()
            while True:
                s = icon.read(1024)
                if not s:
                    break
                parser.feed(s)
            return parser.close()

        request = urllib2.Request(url, headers=HEADERS)
        try:
            image = request_image(request)
        except (urllib2.HTTPError, urllib2.URLError):
            # /<path> failed: scrape the page head for a declared icon link.
            request = urllib2.Request(self.feed.feed_link, headers=HEADERS)
            try:
                # 2048 bytes should be enough to reach the <link> tags on
                # most websites.
                content = urllib2.urlopen(request).read(2048)
            except (urllib2.HTTPError, urllib2.URLError):
                return None, None
            icon_path = lxml.html.fromstring(content).xpath(
                '//link[@rel="icon" or @rel="shortcut icon"]/@href'
            )
            if icon_path:
                # Bug fix: naive string concatenation broke absolute and
                # root-relative hrefs; urljoin handles all three forms.
                url = urlparse.urljoin(self.feed.feed_link, icon_path[0])
                request = urllib2.Request(url, headers=HEADERS)
                try:
                    image = request_image(request)
                except (urllib2.HTTPError, urllib2.URLError):
                    return None, None

        if not image:
            # Bug fix: the fallback can complete without an image (no icon
            # link declared); previously image.resize raised AttributeError,
            # and bare `return` made save()'s tuple unpack crash.
            return None, None
        image = image.resize((16, 16))

        return image, url

    def determine_dominant_color_in_image(self, image):
        """Return the image's most frequent color as a 6-char hex string,
        found by k-means clustering over the pixel values."""
        NUM_CLUSTERS = 5

        # Palette-mode images need an alpha band so fromimage yields
        # per-pixel value rows rather than palette indices.
        if image.mode == 'P':
            image.putalpha(0)

        ar = scipy.misc.fromimage(image)
        shape = ar.shape
        if len(shape) > 2:
            # Flatten (w, h, bands) into a (pixels, bands) observation matrix.
            ar = ar.reshape(scipy.product(shape[:2]), shape[2])

        codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
        counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
        index_max = scipy.argmax(counts)                    # find most frequent
        peak = codes[index_max]
        color = ''.join(chr(c) for c in peak).encode('hex')

        return color

    def string_from_image(self, image):
        """Return the image serialized as a base64-encoded PNG string."""
        output = StringIO.StringIO()
        image.save(output, format="PNG")
        contents = output.getvalue()
        output.close()
        # Encode once; previously the full base64 payload was also printed
        # to stdout on every fetch and encoded a second time.
        return contents.encode('base64')
|
||||
|
96
apps/rss_feeds/migrations/0039_feedicon.py
Normal file
96
apps/rss_feeds/migrations/0039_feedicon.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
# encoding: utf-8
|
||||
import datetime
|
||||
from south.db import db
|
||||
from south.v2 import SchemaMigration
|
||||
from django.db import models
|
||||
|
||||
class Migration(SchemaMigration):
    """South schema migration: adds the FeedIcon table — one row per Feed,
    holding the favicon's base64 PNG data, dominant color, and source URL."""

    def forwards(self, orm):

        # Adding model 'FeedIcon'
        db.create_table('rss_feeds_feedicon', (
            ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
            ('feed', self.gf('utils.fields.AutoOneToOneField')(related_name='icon', unique=True, to=orm['rss_feeds.Feed'])),
            ('color', self.gf('django.db.models.fields.CharField')(default='000000', max_length=6)),
            ('data', self.gf('django.db.models.fields.TextField')()),
            ('icon_url', self.gf('django.db.models.fields.CharField')(max_length=2000, null=True, blank=True)),
        ))
        db.send_create_signal('rss_feeds', ['FeedIcon'])


    def backwards(self, orm):

        # Deleting model 'FeedIcon'
        db.delete_table('rss_feeds_feedicon')


    # Frozen ORM snapshot used by South at migration time.
    # Auto-generated — do not hand-edit; regenerate with `schemamigration`.
    models = {
        'rss_feeds.duplicatefeed': {
            'Meta': {'object_name': 'DuplicateFeed'},
            'duplicate_address': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255'}),
            'duplicate_feed_id': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'duplicate_addresses'", 'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        'rss_feeds.feed': {
            'Meta': {'ordering': "['feed_title']", 'object_name': 'Feed', 'db_table': "'feeds'"},
            'active': ('django.db.models.fields.BooleanField', [], {'default': 'True', 'db_index': 'True'}),
            'active_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1', 'db_index': 'True'}),
            'average_stories_per_month': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'creation': ('django.db.models.fields.DateField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'days_to_trim': ('django.db.models.fields.IntegerField', [], {'default': '90'}),
            'etag': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'exception_code': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'feed_address': ('django.db.models.fields.URLField', [], {'unique': 'True', 'max_length': '255'}),
            'feed_link': ('django.db.models.fields.URLField', [], {'default': "''", 'max_length': '1000', 'null': 'True', 'blank': 'True'}),
            'feed_title': ('django.db.models.fields.CharField', [], {'default': "''", 'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'fetched_once': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'has_feed_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'has_page_exception': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'last_load_time': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'last_modified': ('django.db.models.fields.DateTimeField', [], {'null': 'True', 'blank': 'True'}),
            'last_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'min_to_decay': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'next_scheduled_update': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'num_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'premium_subscribers': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'queued_date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'stories_last_month': ('django.db.models.fields.IntegerField', [], {'default': '0'})
        },
        'rss_feeds.feeddata': {
            'Meta': {'object_name': 'FeedData'},
            'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'data'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
            'feed_tagline': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'popular_authors': ('django.db.models.fields.CharField', [], {'max_length': '2048', 'null': 'True', 'blank': 'True'}),
            'popular_tags': ('django.db.models.fields.CharField', [], {'max_length': '1024', 'null': 'True', 'blank': 'True'}),
            'story_count_history': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'})
        },
        'rss_feeds.feedicon': {
            'Meta': {'object_name': 'FeedIcon'},
            'color': ('django.db.models.fields.CharField', [], {'default': "'000000'", 'max_length': '6'}),
            'data': ('django.db.models.fields.TextField', [], {}),
            'feed': ('utils.fields.AutoOneToOneField', [], {'related_name': "'icon'", 'unique': 'True', 'to': "orm['rss_feeds.Feed']"}),
            'icon_url': ('django.db.models.fields.CharField', [], {'max_length': '2000', 'null': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        'rss_feeds.feedloadtime': {
            'Meta': {'object_name': 'FeedLoadtime'},
            'date_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'feed': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['rss_feeds.Feed']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'loadtime': ('django.db.models.fields.FloatField', [], {})
        },
        'rss_feeds.feedupdatehistory': {
            'Meta': {'object_name': 'FeedUpdateHistory'},
            'average_per_feed': ('django.db.models.fields.DecimalField', [], {'max_digits': '4', 'decimal_places': '1'}),
            'fetch_date': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'number_of_feeds': ('django.db.models.fields.IntegerField', [], {}),
            'seconds_taken': ('django.db.models.fields.IntegerField', [], {})
        }
    }

    complete_apps = ['rss_feeds']
|
|
@ -22,7 +22,6 @@ from utils import feedfinder
|
|||
from utils.fields import AutoOneToOneField
|
||||
from utils.feed_functions import levenshtein_distance
|
||||
from utils.feed_functions import timelimit
|
||||
from utils.feed_functions import fetch_site_favicon, determine_dominant_color_in_image
|
||||
from utils.story_functions import pre_process_story
|
||||
from utils.diff import HTMLDiff
|
||||
from utils import log as logging
|
||||
|
@ -701,12 +700,6 @@ class Feed(models.Model):
|
|||
|
||||
return phrases
|
||||
|
||||
def download_favicon(self):
    # NOTE(review): appears superseded by apps.rss_feeds.icon_importer.
    # IconImporter (this same change removes the helpers' import), and the
    # computed dominant color is discarded here — confirm before keeping.
    icon = fetch_site_favicon(self.feed_link)

    if icon:
        determine_dominant_color_in_image(icon)
|
||||
|
||||
class Meta:
    # Legacy table name and default ordering kept from the original schema.
    db_table="feeds"
    ordering=["feed_title"]
|
||||
|
@ -731,6 +724,20 @@ class FeedData(models.Model):
|
|||
except (IntegrityError, OperationError):
|
||||
if self.id: self.delete()
|
||||
|
||||
|
||||
class FeedIcon(models.Model):
    # One icon record per feed; AutoOneToOneField creates the row lazily
    # on first access from feed.icon.
    feed = AutoOneToOneField(Feed, related_name='icon')
    # Dominant icon color as a 6-char hex string (no leading '#').
    color = models.CharField(max_length=6, default="000000")
    # Base64-encoded PNG bytes of the 16x16 favicon.
    data = models.TextField()
    # Where the icon was fetched from, so refreshes can retry the same URL.
    icon_url = models.CharField(max_length=2000, blank=True, null=True)

    def save(self, *args, **kwargs):
        # Mirrors FeedData.save(): on a duplicate-row race, delete this copy
        # instead of raising.  NOTE(review): this silently drops the icon.
        try:
            super(FeedIcon, self).save(*args, **kwargs)
        except (IntegrityError, OperationError):
            if self.id: self.delete()
|
||||
|
||||
|
||||
class MFeedPage(mongo.Document):
|
||||
feed_id = mongo.IntField(primary_key=True)
|
||||
page_data = mongo.BinaryField()
|
||||
|
|
|
@ -1,9 +1,6 @@
|
|||
import socket
|
||||
socket.setdefaulttimeout(15)
|
||||
import urllib2
|
||||
import re
|
||||
import urlparse
|
||||
import multiprocessing
|
||||
import traceback
|
||||
import feedparser
|
||||
from utils import log as logging
|
||||
|
@ -14,7 +11,6 @@ class PageImporter(object):
|
|||
def __init__(self, url, feed):
    # Page URL to fetch and the Feed the fetched page belongs to.
    self.url = url
    self.feed = feed
    # NOTE(review): lock is created per-instance here, so it only guards
    # callers sharing this importer object — confirm intended scope.
    self.lock = multiprocessing.Lock()
|
||||
|
||||
def fetch_page(self):
|
||||
if not self.url:
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"name": "NewsBlur",
|
||||
"description": "RSS feed reading with intelligence.",
|
||||
"version": "1",
|
||||
"version": "1.0",
|
||||
"icons": {
|
||||
"48": "48.png",
|
||||
"128": "128.png"
|
||||
|
|
|
@ -6,7 +6,8 @@ from django.db import IntegrityError
|
|||
# from mongoengine.queryset import Q
|
||||
from apps.reader.models import UserSubscription, MUserStory
|
||||
from apps.rss_feeds.models import Feed, MStory
|
||||
from apps.rss_feeds.importer import PageImporter
|
||||
from apps.rss_feeds.page_importer import PageImporter
|
||||
from apps.rss_feeds.icon_importer import IconImporter
|
||||
from utils import feedparser
|
||||
from utils.story_functions import pre_process_story
|
||||
from utils import log as logging
|
||||
|
@ -339,6 +340,16 @@ class Dispatcher:
|
|||
ret_feed = FEED_ERREXC
|
||||
feed.save_feed_history(550, "Page Error", tb)
|
||||
fetched_feed = None
|
||||
|
||||
icon_importer = IconImporter(feed)
|
||||
try:
|
||||
icon_importer.save()
|
||||
except Exception, e:
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
tb = traceback.format_exc()
|
||||
logging.error(tb)
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
# feed.save_feed_history(560, "Icon Error", tb)
|
||||
|
||||
feed = self.refresh_feed(feed_id)
|
||||
delta = datetime.datetime.utcnow() - start_time
|
||||
|
|
|
@ -1,12 +1,6 @@
|
|||
import datetime
|
||||
import threading
|
||||
import sys
|
||||
import urllib2
|
||||
import lxml.html
|
||||
from PIL import ImageFile
|
||||
import scipy
|
||||
import scipy.misc
|
||||
import scipy.cluster
|
||||
from django.utils.translation import ungettext
|
||||
from utils import feedfinder
|
||||
|
||||
|
@ -176,76 +170,6 @@ def format_relative_date(date, future=False):
|
|||
return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60),
|
||||
'' if future else 'ago')
|
||||
|
||||
|
||||
def fetch_site_favicon(url, path='favicon.ico'):
    """Return the site's favicon as a 16x16 PIL image, or None on failure.

    Tries ``url``/``path`` first, then falls back to any
    ``<link rel="icon">`` declared in the page head.
    """
    HEADERS = {
        'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
        'Connection': 'close',
    }
    image = None

    if not url.endswith('/'):
        url += '/'

    def request_image(request):
        # Stream the icon in 1KB chunks through PIL's incremental parser.
        icon = urllib2.urlopen(request)
        parser = ImageFile.Parser()
        while True:
            s = icon.read(1024)
            if not s:
                break
            parser.feed(s)
        image = parser.close()
        return image

    # Bug fix: the icon filename was hard-coded, ignoring the `path` param.
    request = urllib2.Request(url + path, headers=HEADERS)
    try:
        image = request_image(request)
    except (urllib2.HTTPError, urllib2.URLError):
        request = urllib2.Request(url, headers=HEADERS)
        try:
            content = urllib2.urlopen(request).read(2048) # 2048 bytes should be enough for most of websites
        except (urllib2.HTTPError, urllib2.URLError):
            return
        icon_path = lxml.html.fromstring(content).xpath(
            '//link[@rel="icon" or @rel="shortcut icon"]/@href'
        )
        if icon_path:
            request = urllib2.Request(url + icon_path[0], headers=HEADERS)
            try:
                image = request_image(request)
            except (urllib2.HTTPError, urllib2.URLError):
                return

    if not image:
        # Bug fix: when the page declares no icon link the fallback exits
        # with image still None; image.resize would raise AttributeError.
        return
    image = image.resize((16, 16))

    return image
|
||||
|
||||
def determine_dominant_color_in_image(image):
    """Return the most frequent color in *image* as a 6-char hex string,
    found by k-means clustering over the pixel values.

    Bug fix: the hex color was computed but never returned (the caller got
    None); also removes the debug prints left in from development.
    """
    NUM_CLUSTERS = 5

    # Palette-mode images need an alpha band so fromimage yields per-pixel
    # value rows rather than palette indices.
    if image.mode == 'P':
        image.putalpha(0)
    ar = scipy.misc.fromimage(image)
    shape = ar.shape
    if len(shape) > 2:
        # Flatten (w, h, bands) into a (pixels, bands) observation matrix.
        ar = ar.reshape(scipy.product(shape[:2]), shape[2])

    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
    index_max = scipy.argmax(counts)                    # find most frequent
    peak = codes[index_max]
    colour = ''.join(chr(c) for c in peak).encode('hex')
    return colour
|
||||
|
||||
def add_object_to_folder(obj, folder, folders):
|
||||
if not folder:
|
||||
folders.append(obj)
|
||||
|
|
Loading…
Add table
Reference in a new issue