import urllib2
import urlparse
import lxml.html
import scipy
import scipy.misc
import scipy.cluster
import scipy.cluster.vq
import Image
from StringIO import StringIO
from apps.rss_feeds.models import MFeedPage
# BmpImagePlugin and PngImagePlugin are imported for their side effect of
# registering the BMP and PNG decoders with PIL, so ImageFile.Parser can
# recognize those formats.
from PIL import BmpImagePlugin, PngImagePlugin, ImageFile

HEADERS = {
    'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
    'Connection': 'close',
}

class IconImporter(object):
    """Fetches, normalizes, and stores a favicon for a feed."""

    def __init__(self, feed, force=False):
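        """Prepare to import an icon for `feed`. With `force`, refetch
        even if a previous attempt found nothing."""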
        self.feed = feed
        self.force = force

    def save(self):
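        """Find the feed's favicon, normalize it, and store it on
        `feed.icon`. Returns True if an icon was saved."""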
        if not self.force and self.feed.icon.not_found:
            print 'Not found, skipping...'
            return False
|
2011-01-29 19:16:40 -05:00
|
|
|
image, icon_url = self.fetch_image_from_page_data()
|
|
|
|
if not image:
|
|
|
|
image, icon_url = self.fetch(force=self.force)

        if image:
            image = self.normalize_image(image)
            color = self.determine_dominant_color_in_image(image)
            image_str = self.string_from_image(image)

            self.feed.icon.data = image_str
            self.feed.icon.icon_url = icon_url
            self.feed.icon.color = color
            self.feed.icon.not_found = False
        else:
            self.feed.icon.not_found = True

        self.feed.icon.save()
        return not self.feed.icon.not_found

    def fetch_image_from_page_data(self):
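        """Look for a favicon <link> in the feed's stored page HTML."""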
        image = None
        content = MFeedPage.get_data(feed_id=self.feed.pk)
        url = self._url_from_html(content)
        if url:
            image = self.get_image_from_url(url)
        return image, url

    def fetch(self, path='favicon.ico', force=False):
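        """Fetch the favicon directly: try the stored icon URL, then
        `path` relative to the feed link, then the site root, and
        finally scrape the page for an icon <link>."""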
        image = None
        url = None

        if not force:
            url = self.feed.icon.icon_url
        if not url:
            url = urlparse.urljoin(self.feed.feed_link, path)

        image = self.get_image_from_url(url)
        if not image:
            # Fall back to a favicon at the site root.
            url = urlparse.urljoin(self.feed.feed_link, '/favicon.ico')
            image = self.get_image_from_url(url)
            if not image:
                # As a last resort, scrape the page itself for an icon <link>.
                request = urllib2.Request(self.feed.feed_link, headers=HEADERS)
                try:
                    # 2048 bytes should be enough for most websites.
                    content = urllib2.urlopen(request).read(2048)
                except (urllib2.HTTPError, urllib2.URLError):
                    return None, None
                url = self._url_from_html(content)
                if url:
                    try:
                        image = self.get_image_from_url(url)
                    except (urllib2.HTTPError, urllib2.URLError):
                        return None, None
        print 'Found: %s - %s' % (url, image)
        return image, url

    def get_image_from_url(self, url):
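        """Download `url` and decode it incrementally with PIL.
        Returns an Image, or None on any network or decode error."""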
        print 'Requesting: %s' % url
        try:
            request = urllib2.Request(url, headers=HEADERS)
            icon = urllib2.urlopen(request)
        except (urllib2.HTTPError, urllib2.URLError):
            return None
        parser = ImageFile.Parser()
        data = icon.read()
        if not data:
            return None
        parser.feed(data)
        try:
            # close() finishes the parse and returns the decoded image, or
            # raises IOError if the data was not a recognizable image.
            return parser.close()
        except IOError:
            return None

    def _url_from_html(self, content):
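        """Extract a favicon URL from page HTML, resolving relative
        paths against the feed link."""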
        url = None
        if not content:
            return url
        icon_path = lxml.html.fromstring(content).xpath(
            '//link[@rel="icon" or @rel="shortcut icon"]/@href'
        )
        if icon_path:
            if str(icon_path[0]).startswith('http'):
                url = icon_path[0]
            else:
                url = urlparse.urljoin(self.feed.feed_link, icon_path[0])
        return url

    def normalize_image(self, image):
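        """Convert the icon to RGBA so color analysis and PNG output
        behave consistently."""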
        print image.size
        # if image.size != (16, 16):
        #     image = image.resize((16, 16), Image.BICUBIC)
        print image
        if image.mode != 'RGBA':
            image = image.convert('RGBA')
        return image

    def determine_dominant_color_in_image(self, image):
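        """K-means cluster the icon's pixels and return the dominant
        cluster's color as an RRGGBB hex string, preferring clusters
        that are neither near-black nor near-white."""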
        NUM_CLUSTERS = 5

        # Flatten the image into an N x 4 array of RGBA values; kmeans
        # expects floating-point observations.
        ar = scipy.misc.fromimage(image)
        shape = ar.shape
        if len(shape) > 2:
            ar = ar.reshape(scipy.product(shape[:2]), shape[2])
        ar = ar.astype(float)
        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        print "Before: %s" % codes

        # Throw out clusters that are close to black or white, widening the
        # definition of "close" until at least one cluster survives.
        original_codes = codes
        for low, hi in [(60, 200), (35, 230), (10, 250)]:
            codes = scipy.array([code for code in codes
                                 if not ((code[0] < low and code[1] < low and code[2] < low) or
                                         (code[0] > hi and code[1] > hi and code[2] > hi))])
            if not len(codes):
                codes = original_codes
            else:
                break
        print "After: %s" % codes
        colors = [''.join(chr(int(c)) for c in code).encode('hex') for code in codes]
        # Assign each pixel to a cluster and count cluster sizes.
        vecs, _ = scipy.cluster.vq.vq(ar, codes)
        counts, bins = scipy.histogram(vecs, len(codes))
        print counts
        total = scipy.sum(counts)
        print dict(zip(colors, [count / float(total) for count in counts]))

        # The most frequent cluster is the dominant color.
        index_max = scipy.argmax(counts)
        peak = codes[index_max]
        color = ''.join(chr(int(c)) for c in peak).encode('hex')
        print 'most frequent is %s (#%s)' % (peak, color)
        # `peak` is RGBA, so `color` has eight hex digits; keep only RRGGBB.
        return color[:6]

    def string_from_image(self, image):
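        """Serialize the icon as a base64-encoded PNG string."""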
        output = StringIO()
        image.save(output, 'png', quality=95)
        contents = output.getvalue()
        output.close()
        return contents.encode('base64')