import urllib2
import lxml.html
import scipy
import scipy.misc
import scipy.cluster
import urlparse
import struct
import operator
import BmpImagePlugin, PngImagePlugin, Image
from StringIO import StringIO
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit, TimeoutError

HEADERS = {
    'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
    'Connection': 'close',
}


class IconImporter(object):

    def __init__(self, feed, force=False):
        self.feed = feed
        self.force = force

    def save(self):
        if not self.force and self.feed.icon.not_found:
            # print 'Not found, skipping...'
            return
        if not self.force and not self.feed.icon.not_found and self.feed.icon.icon_url:
            # print 'Found, but skipping...'
            return

        image, image_file, icon_url = self.fetch_image_from_page_data()
        if not image:
            image, image_file, icon_url = self.fetch(force=self.force)

        if image:
            try:
                ico_image = self.load_icon(image_file)
                if ico_image:
                    image = ico_image
            except ValueError:
                # print "Bad .ICO"
                pass
            image = self.normalize_image(image)
            color = self.determine_dominant_color_in_image(image)
            image_str = self.string_from_image(image)

            self.feed.icon.save()
            self.feed.icon.data = image_str
            self.feed.icon.icon_url = icon_url
            self.feed.icon.color = color
            self.feed.icon.not_found = False
        else:
            self.feed.icon.save()
            self.feed.icon.not_found = True

        self.feed.icon.save()
        return not self.feed.icon.not_found

    def load_icon(self, image_file, index=None):
        '''
        Load Windows ICO image.

        See http://en.wikipedia.org/w/index.php?oldid=264332061 for file
        format description.
        '''
        try:
            image_file.seek(0)
            header = struct.unpack('<3H', image_file.read(6))
        except Exception:
            return

        # Check magic
        if header[:2] != (0, 1):
            return

        # Collect icon directories
        directories = []
        for i in xrange(header[2]):
            directory = list(struct.unpack('<4B2H2I', image_file.read(16)))
            for j in xrange(3):
                if not directory[j]:
                    directory[j] = 256
            directories.append(directory)

        if index is None:
            # Select best icon
            directory = max(directories, key=operator.itemgetter(slice(0, 3)))
        else:
            directory = directories[index]

        # Seek to the bitmap data
        image_file.seek(directory[7])

        prefix = image_file.read(16)
        image_file.seek(-16, 1)

        if PngImagePlugin._accept(prefix):
            # Windows Vista icon with PNG inside
            image = PngImagePlugin.PngImageFile(image_file)
        else:
            # Load XOR bitmap
            image = BmpImagePlugin.DibImageFile(image_file)
            if image.mode == 'RGBA':
                # Windows XP 32-bit color depth icon without AND bitmap
                pass
            else:
                # Patch up the bitmap height
                image.size = image.size[0], image.size[1] >> 1
                d, e, o, a = image.tile[0]
                image.tile[0] = d, (0, 0) + image.size, o, a
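
                # The AND (transparency) mask stored after the XOR bitmap is
                # 1 bit per pixel, with each row padded to a 32-bit boundary;
                # that padding is where the ((width + 31) >> 5) << 2 stride
                # below comes from (e.g. a 16px-wide icon needs 2 mask bytes
                # per row, padded up to 4).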
                # Calculate AND bitmap dimensions. See
                # http://en.wikipedia.org/w/index.php?oldid=264236948#Pixel_storage
                # for description
                offset = o + a[1] * image.size[1]
                stride = ((image.size[0] + 31) >> 5) << 2
                size = stride * image.size[1]

                # Load AND bitmap
                image_file.seek(offset)
                string = image_file.read(size)
                mask = Image.fromstring('1', image.size, string, 'raw',
                                        ('1;I', stride, -1))

                image = image.convert('RGBA')
                image.putalpha(mask)

        return image

    def fetch_image_from_page_data(self):
        image = None
        image_file = None
        content = MFeedPage.get_data(feed_id=self.feed.pk)
        url = self._url_from_html(content)
        if url:
            image, image_file = self.get_image_from_url(url)
        return image, image_file, url

    def fetch(self, path='favicon.ico', force=False):
        image = None
        url = None

        if not force:
            url = self.feed.icon.icon_url
        if not url and self.feed.feed_link and len(self.feed.feed_link) > 6:
            url = urlparse.urljoin(self.feed.feed_link, 'favicon.ico')
        if not url:
            return None, None, None

        image, image_file = self.get_image_from_url(url)
        if not image:
            url = urlparse.urljoin(self.feed.feed_link, '/favicon.ico')
            image, image_file = self.get_image_from_url(url)
        if not image:
            request = urllib2.Request(self.feed.feed_link, headers=HEADERS)
            try:
                # 2048 bytes should be enough for most websites
                content = urllib2.urlopen(request).read(2048)
            except (urllib2.HTTPError, urllib2.URLError):
                return None, None, None
            url = self._url_from_html(content)
        if url:
            try:
                image, image_file = self.get_image_from_url(url)
            except (urllib2.HTTPError, urllib2.URLError):
                return None, None, None

        # print 'Found: %s - %s' % (url, image)
        return image, image_file, url

    def get_image_from_url(self, url):
        # print 'Requesting: %s' % url
        @timelimit(30)
        def _1(url):
            request = urllib2.Request(url, headers=HEADERS)
            icon = urllib2.urlopen(request).read()
            icon_file = StringIO(icon)
            image = Image.open(icon_file)
            return image, icon_file

        try:
            image, icon_file = _1(url)
        except (urllib2.HTTPError, urllib2.URLError, IOError, TimeoutError):
            return None, None
        return image, icon_file

    def _url_from_html(self, content):
        url = None
        if not content:
            return url
        icon_path = lxml.html.fromstring(content).xpath(
            '//link[@rel="icon" or @rel="shortcut icon"]/@href'
        )
        if icon_path:
            if str(icon_path[0]).startswith('http'):
                url = icon_path[0]
            else:
                url = urlparse.urljoin(self.feed.feed_link, icon_path[0])
        return url

    def normalize_image(self, image):
        # if image.size != (16, 16):
        #     image = image.resize((16, 16), Image.BICUBIC)
        if image.mode != 'RGBA':
            image = image.convert('RGBA')
        return image

    def determine_dominant_color_in_image(self, image):
        NUM_CLUSTERS = 5

        ar = scipy.misc.fromimage(image)
        shape = ar.shape
        if len(shape) > 2:
            ar = ar.reshape(scipy.product(shape[:2]), shape[2])

        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        # print "Before: %s" % codes

        # Drop clusters that are nearly black or nearly white, loosening the
        # thresholds until at least one cluster survives.
        original_codes = codes
        for low, hi in [(60, 200), (35, 230), (10, 250)]:
            codes = scipy.array([code for code in codes
                                 if not ((code[0] < low and code[1] < low and code[2] < low) or
                                         (code[0] > hi and code[1] > hi and code[2] > hi))])
            if not len(codes):
                codes = original_codes
            else:
                break
        # print "After: %s" % codes

        vecs, _ = scipy.cluster.vq.vq(ar, codes)          # assign codes
        counts, bins = scipy.histogram(vecs, len(codes))  # count occurrences

        # colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
        # total = scipy.sum(counts)
        # print dict(zip(colors, [count/float(total) for count in counts]))
        index_max = scipy.argmax(counts)                  # find most frequent
        peak = codes[index_max]
        color = ''.join(chr(int(c)) for c in peak).encode('hex')
        # print 'most frequent is %s (#%s)' % (peak, color)
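        # peak has four components because the image was normalized to RGBA,
        # so color is eight hex digits; keep only the RGB portion.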
        return color[:6]

    def string_from_image(self, image):
        output = StringIO()
        image.save(output, 'png', quality=95)
        contents = output.getvalue()
        output.close()
        return contents.encode('base64')
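
# Example usage (a sketch; assumes a Feed model in apps.rss_feeds.models that
# exposes the feed_link and icon attributes used above):
#
#     from apps.rss_feeds.models import Feed
#
#     feed = Feed.objects.get(pk=42)
#     importer = IconImporter(feed, force=True)
#     if importer.save():
#         print feed.icon.icon_url, feed.icon.color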