2011-01-27 19:05:50 -05:00
|
|
|
import urllib2
|
|
|
|
import lxml.html
|
|
|
|
import scipy
|
|
|
|
import scipy.misc
|
|
|
|
import scipy.cluster
|
2011-01-29 19:16:40 -05:00
|
|
|
import urlparse
|
|
|
|
import struct
|
2011-01-29 20:20:35 -05:00
|
|
|
import operator
|
2011-01-29 11:24:27 -05:00
|
|
|
from StringIO import StringIO
|
2011-01-29 19:16:40 -05:00
|
|
|
from apps.rss_feeds.models import MFeedPage
|
2011-01-29 20:20:35 -05:00
|
|
|
from PIL import BmpImagePlugin, PngImagePlugin, Image
|
2011-01-29 11:24:27 -05:00
|
|
|
|
2011-01-29 19:16:40 -05:00
|
|
|
# HTTP headers sent with every favicon/page request below.
# The custom User-Agent identifies the fetcher to site operators;
# 'Connection: close' tells the server not to hold the socket open
# for these one-shot requests.
HEADERS = {
    'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
    'Connection': 'close',
}
|
2011-01-27 19:05:50 -05:00
|
|
|
|
|
|
|
class IconImporter(object):
|
|
|
|
|
2011-01-29 11:24:27 -05:00
|
|
|
def __init__(self, feed, force=False):
|
2011-01-27 19:05:50 -05:00
|
|
|
self.feed = feed
|
2011-01-29 11:24:27 -05:00
|
|
|
self.force = force
|
2011-01-27 19:05:50 -05:00
|
|
|
|
|
|
|
def save(self):
|
2011-01-29 11:24:27 -05:00
|
|
|
if not self.force and self.feed.icon.not_found:
|
|
|
|
print 'Not found, skipping...'
|
|
|
|
return
|
2011-01-29 20:20:35 -05:00
|
|
|
image, image_file, icon_url = self.fetch_image_from_page_data()
|
2011-01-29 19:16:40 -05:00
|
|
|
if not image:
|
2011-01-29 20:20:35 -05:00
|
|
|
image, image_file, icon_url = self.fetch(force=self.force)
|
2011-01-29 11:24:27 -05:00
|
|
|
|
|
|
|
if image:
|
2011-01-29 20:20:35 -05:00
|
|
|
ico_image = self.load_icon(image_file)
|
|
|
|
if ico_image: image = ico_image
|
2011-01-29 11:24:27 -05:00
|
|
|
image = self.normalize_image(image)
|
|
|
|
color = self.determine_dominant_color_in_image(image)
|
|
|
|
image_str = self.string_from_image(image)
|
|
|
|
|
2011-01-29 19:16:40 -05:00
|
|
|
self.feed.icon.save()
|
2011-01-29 11:24:27 -05:00
|
|
|
self.feed.icon.data = image_str
|
|
|
|
self.feed.icon.icon_url = icon_url
|
|
|
|
self.feed.icon.color = color
|
|
|
|
self.feed.icon.not_found = False
|
|
|
|
else:
|
2011-01-29 19:16:40 -05:00
|
|
|
self.feed.icon.save()
|
2011-01-29 11:24:27 -05:00
|
|
|
self.feed.icon.not_found = True
|
|
|
|
|
2011-01-27 19:05:50 -05:00
|
|
|
self.feed.icon.save()
|
2011-01-29 11:24:27 -05:00
|
|
|
return not self.feed.icon.not_found
|
2011-01-29 19:16:40 -05:00
|
|
|
|
2011-01-29 20:20:35 -05:00
|
|
|
def load_icon(self, image_file, index=None):
|
|
|
|
'''
|
|
|
|
Load Windows ICO image.
|
|
|
|
|
|
|
|
See http://en.wikipedia.org/w/index.php?oldid=264332061 for file format
|
|
|
|
description.
|
|
|
|
'''
|
|
|
|
try:
|
|
|
|
image_file.seek(0)
|
|
|
|
header = struct.unpack('<3H', image_file.read(6))
|
|
|
|
except Exception, e:
|
|
|
|
print 'No on struct: %s'% e
|
|
|
|
return
|
|
|
|
|
|
|
|
# Check magic
|
|
|
|
if header[:2] != (0, 1):
|
|
|
|
print 'No on header', header
|
|
|
|
return
|
|
|
|
|
|
|
|
# Collect icon directories
|
|
|
|
directories = []
|
|
|
|
for i in xrange(header[2]):
|
|
|
|
directory = list(struct.unpack('<4B2H2I', image_file.read(16)))
|
|
|
|
for j in xrange(3):
|
|
|
|
if not directory[j]:
|
|
|
|
directory[j] = 256
|
|
|
|
|
|
|
|
directories.append(directory)
|
|
|
|
|
|
|
|
if index is None:
|
|
|
|
# Select best icon
|
|
|
|
directory = max(directories, key=operator.itemgetter(slice(0, 3)))
|
|
|
|
else:
|
|
|
|
directory = directories[index]
|
|
|
|
|
|
|
|
# Seek to the bitmap data
|
|
|
|
image_file.seek(directory[7])
|
|
|
|
|
|
|
|
prefix = image_file.read(16)
|
|
|
|
image_file.seek(-16, 1)
|
|
|
|
|
|
|
|
if PngImagePlugin._accept(prefix):
|
|
|
|
# Windows Vista icon with PNG inside
|
|
|
|
image = PngImagePlugin.PngImageFile(image_file)
|
|
|
|
else:
|
|
|
|
# Load XOR bitmap
|
|
|
|
image = BmpImagePlugin.DibImageFile(image_file)
|
|
|
|
if image.mode == 'RGBA':
|
|
|
|
# Windows XP 32-bit color depth icon without AND bitmap
|
|
|
|
pass
|
|
|
|
else:
|
|
|
|
# Patch up the bitmap height
|
|
|
|
image.size = image.size[0], image.size[1] >> 1
|
|
|
|
d, e, o, a = image.tile[0]
|
|
|
|
image.tile[0] = d, (0, 0) + image.size, o, a
|
|
|
|
|
|
|
|
# Calculate AND bitmap dimensions. See
|
|
|
|
# http://en.wikipedia.org/w/index.php?oldid=264236948#Pixel_storage
|
|
|
|
# for description
|
|
|
|
offset = o + a[1] * image.size[1]
|
|
|
|
stride = ((image.size[0] + 31) >> 5) << 2
|
|
|
|
size = stride * image.size[1]
|
|
|
|
|
|
|
|
# Load AND bitmap
|
|
|
|
image_file.seek(offset)
|
|
|
|
string = image_file.read(size)
|
|
|
|
mask = Image.fromstring('1', image.size, string, 'raw',
|
|
|
|
('1;I', stride, -1))
|
|
|
|
|
|
|
|
image = image.convert('RGBA')
|
|
|
|
image.putalpha(mask)
|
|
|
|
|
|
|
|
return image
|
|
|
|
|
2011-01-29 19:16:40 -05:00
|
|
|
def fetch_image_from_page_data(self):
|
|
|
|
image = None
|
2011-01-29 20:20:35 -05:00
|
|
|
image_file = None
|
2011-01-29 19:16:40 -05:00
|
|
|
content = MFeedPage.get_data(feed_id=self.feed.pk)
|
|
|
|
url = self._url_from_html(content)
|
|
|
|
if url:
|
2011-01-29 20:20:35 -05:00
|
|
|
image, image_file = self.get_image_from_url(url)
|
|
|
|
return image, image_file, url
|
2011-01-29 19:16:40 -05:00
|
|
|
|
2011-01-29 11:24:27 -05:00
|
|
|
def fetch(self, path='favicon.ico', force=False):
|
2011-01-27 19:05:50 -05:00
|
|
|
image = None
|
2011-01-29 11:24:27 -05:00
|
|
|
url = None
|
|
|
|
|
|
|
|
if not force:
|
|
|
|
url = self.feed.icon.icon_url
|
2011-01-27 19:05:50 -05:00
|
|
|
if not url:
|
2011-01-29 19:16:40 -05:00
|
|
|
url = urlparse.urljoin(self.feed.feed_link, 'favicon.ico')
|
2011-01-27 19:05:50 -05:00
|
|
|
|
2011-01-29 20:20:35 -05:00
|
|
|
image, image_file = self.get_image_from_url(url)
|
2011-01-29 19:16:40 -05:00
|
|
|
if not image:
|
|
|
|
url = urlparse.urljoin(self.feed.feed_link, '/favicon.ico')
|
2011-01-29 20:20:35 -05:00
|
|
|
image, image_file = self.get_image_from_url(url)
|
2011-01-29 19:16:40 -05:00
|
|
|
if not image:
|
|
|
|
request = urllib2.Request(self.feed.feed_link, headers=HEADERS)
|
2011-01-27 19:05:50 -05:00
|
|
|
try:
|
2011-01-29 19:16:40 -05:00
|
|
|
# 2048 bytes should be enough for most of websites
|
|
|
|
content = urllib2.urlopen(request).read(2048)
|
|
|
|
except(urllib2.HTTPError, urllib2.URLError):
|
2011-01-29 20:20:35 -05:00
|
|
|
return None, None, None
|
2011-01-29 19:16:40 -05:00
|
|
|
url = self._url_from_html(content)
|
|
|
|
if url:
|
|
|
|
try:
|
2011-01-29 20:20:35 -05:00
|
|
|
image, image_file = self.get_image_from_url(url)
|
2011-01-29 19:16:40 -05:00
|
|
|
except(urllib2.HTTPError, urllib2.URLError):
|
2011-01-29 20:20:35 -05:00
|
|
|
return None, None, None
|
2011-01-29 11:24:27 -05:00
|
|
|
print 'Found: %s - %s' % (url, image)
|
2011-01-29 20:20:35 -05:00
|
|
|
return image, image_file, url
|
2011-01-29 11:24:27 -05:00
|
|
|
|
2011-01-29 19:16:40 -05:00
|
|
|
def get_image_from_url(self, url):
|
|
|
|
print 'Requesting: %s' % url
|
|
|
|
try:
|
|
|
|
request = urllib2.Request(url, headers=HEADERS)
|
2011-01-29 20:20:35 -05:00
|
|
|
icon = urllib2.urlopen(request).read()
|
|
|
|
icon_file = StringIO(icon)
|
|
|
|
image = Image.open(icon_file)
|
|
|
|
except (urllib2.HTTPError, urllib2.URLError, IOError):
|
|
|
|
return None, None
|
|
|
|
return image, icon_file
|
2011-01-29 19:16:40 -05:00
|
|
|
|
|
|
|
def _url_from_html(self, content):
|
|
|
|
url = None
|
|
|
|
icon_path = lxml.html.fromstring(content).xpath(
|
|
|
|
'//link[@rel="icon" or @rel="shortcut icon"]/@href'
|
|
|
|
)
|
|
|
|
if icon_path:
|
|
|
|
if str(icon_path[0]).startswith('http'):
|
|
|
|
url = icon_path[0]
|
|
|
|
else:
|
|
|
|
url = urlparse.urljoin(self.feed.feed_link, icon_path[0])
|
|
|
|
return url
|
|
|
|
|
2011-01-29 11:24:27 -05:00
|
|
|
def normalize_image(self, image):
|
2011-01-29 19:16:40 -05:00
|
|
|
# if image.size != (16, 16):
|
|
|
|
# image = image.resize((16, 16), Image.BICUBIC)
|
|
|
|
print image
|
2011-01-29 11:24:27 -05:00
|
|
|
if image.mode != 'RGBA':
|
|
|
|
image = image.convert('RGBA')
|
|
|
|
|
|
|
|
return image
|
2011-01-27 19:05:50 -05:00
|
|
|
|
|
|
|
def determine_dominant_color_in_image(self, image):
|
|
|
|
NUM_CLUSTERS = 5
|
|
|
|
|
|
|
|
ar = scipy.misc.fromimage(image)
|
|
|
|
shape = ar.shape
|
|
|
|
if len(shape) > 2:
|
|
|
|
ar = ar.reshape(scipy.product(shape[:2]), shape[2])
|
|
|
|
|
2011-01-29 19:16:40 -05:00
|
|
|
codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
|
|
|
|
print "Before: %s" % codes
|
|
|
|
original_codes = codes
|
|
|
|
for low, hi in [(60, 200), (35, 230), (10, 250)]:
|
|
|
|
codes = scipy.array([code for code in codes
|
|
|
|
if not ((code[0] < low and code[1] < low and code[2] < low) or
|
|
|
|
(code[0] > hi and code[1] > hi and code[2] > hi))])
|
|
|
|
if not len(codes): codes = original_codes
|
|
|
|
else: break
|
|
|
|
print "After: %s" % codes
|
2011-01-27 19:05:50 -05:00
|
|
|
colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
|
|
|
|
|
2011-01-29 19:16:40 -05:00
|
|
|
vecs, _ = scipy.cluster.vq.vq(ar, codes) # assign codes
|
2011-01-27 19:05:50 -05:00
|
|
|
counts, bins = scipy.histogram(vecs, len(codes)) # count occurrences
|
2011-01-29 19:16:40 -05:00
|
|
|
print counts
|
2011-01-27 19:05:50 -05:00
|
|
|
total = scipy.sum(counts)
|
|
|
|
print dict(zip(colors, [count/float(total) for count in counts]))
|
|
|
|
index_max = scipy.argmax(counts) # find most frequent
|
|
|
|
peak = codes[index_max]
|
|
|
|
color = ''.join(chr(c) for c in peak).encode('hex')
|
|
|
|
print 'most frequent is %s (#%s)' % (peak, color)
|
|
|
|
|
2011-01-29 11:24:27 -05:00
|
|
|
return color[:6]
|
2011-01-27 19:05:50 -05:00
|
|
|
|
|
|
|
def string_from_image(self, image):
|
2011-01-29 11:24:27 -05:00
|
|
|
output = StringIO()
|
|
|
|
image.save(output, 'png', quality=95)
|
2011-01-27 19:05:50 -05:00
|
|
|
contents = output.getvalue()
|
|
|
|
output.close()
|
|
|
|
return contents.encode('base64')
|
|
|
|
|