NewsBlur-viq/apps/rss_feeds/icon_importer.py


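"""Favicon importer: fetch a feed's favicon, normalize it to a 16x16 RGBA
image, compute its dominant color with k-means, and store the result on the
feed's icon record."""
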
import urllib2
import lxml.html
import scipy
import scipy.misc
import scipy.cluster
from StringIO import StringIO
from PIL import ImageFile
import ImageChops, Image
from django.conf import settings

class BadImage(Exception):
    """Raised when downloaded favicon data cannot be parsed as an image."""
    pass

class IconImporter(object):
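    """Fetch a feed's favicon, normalize it to a 16x16 RGBA image, determine
    its dominant color, and save the result on the feed's icon record."""
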
    def __init__(self, feed, force=False):
        self.feed = feed
        self.force = force

    def save(self):
        if not self.force and self.feed.icon.not_found:
            print 'Not found, skipping...'
            return
        image, icon_url = self.fetch(force=self.force)
        if image:
            image = self.normalize_image(image)
            color = self.determine_dominant_color_in_image(image)
            image_str = self.string_from_image(image)
            self.feed.icon.data = image_str
            self.feed.icon.icon_url = icon_url
            self.feed.icon.color = color
            self.feed.icon.not_found = False
        else:
            self.feed.icon.not_found = True
        self.feed.icon.save()
        return not self.feed.icon.not_found

    def fetch(self, path='favicon.ico', force=False):
        HEADERS = {
            'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
            'Connection': 'close',
        }
        image = None
        url = None

        # Reuse a previously discovered icon URL unless a refetch is forced.
        if not force:
            url = self.feed.icon.icon_url
        if not url:
            url = self.feed.feed_link
            if not url.endswith('/') and not url.endswith('favicon.ico'):
                url += '/favicon.ico'
            if url.endswith('/'):
                url += 'favicon.ico'

        def request_image(url):
            print 'Requesting: %s' % url
            request = urllib2.Request(url, headers=HEADERS)
            icon = urllib2.urlopen(request)
            parser = ImageFile.Parser()
            s = icon.read()
            if s:
                parser.feed(s)
            try:
                image = parser.close()
                return image
            except IOError:
                raise BadImage

        try:
            image = request_image(url)
        except (urllib2.HTTPError, urllib2.URLError, BadImage):
            # /favicon.ico failed; fall back to the <link rel="icon"> tag
            # declared in the page's HTML head.
            request = urllib2.Request(self.feed.feed_link, headers=HEADERS)
            try:
                # 2048 bytes should be enough for most websites
                content = urllib2.urlopen(request).read(2048)
            except (urllib2.HTTPError, urllib2.URLError):
                return None, None
            icon_path = lxml.html.fromstring(content).xpath(
                '//link[@rel="icon" or @rel="shortcut icon"]/@href'
            )
            if icon_path:
                if str(icon_path[0]).startswith('http'):
                    url = icon_path[0]
                else:
                    url = self.feed.feed_link + icon_path[0]
                try:
                    image = request_image(url)
                except (urllib2.HTTPError, urllib2.URLError, BadImage):
                    return None, None

        print 'Found: %s - %s' % (url, image)
        return image, url

    def normalize_image(self, image):
        image = image.resize((16, 16), Image.ANTIALIAS)
        if image.mode != 'RGBA':
            image = image.convert('RGBA')
        # mask = Image.open(settings.IMAGE_MASK)
        print image
        print image.mode
        print image.size
        # mask = mask.convert('L')
        # print mask
        # image.paste(Image.new('RGBA', image.size, '#FFFFFF'), (0, 0), ImageChops.invert(mask))
        # image.putalpha(mask)
        return image

    def determine_dominant_color_in_image(self, image):
        NUM_CLUSTERS = 5
        # if image.mode == 'P':
        #     image.putalpha(0)
        ar = scipy.misc.fromimage(image)
        shape = ar.shape
        if len(shape) > 2:
            ar = ar.reshape(scipy.product(shape[:2]), shape[2])
        # k-means expects floating point observations; its centroids come back
        # as floats, so cast them to ints before building the hex color string.
        codes, dist = scipy.cluster.vq.kmeans(ar.astype(float), NUM_CLUSTERS)
        colors = [''.join(chr(int(c)) for c in code).encode('hex') for code in codes]
        vecs, dist = scipy.cluster.vq.vq(ar.astype(float), codes)  # assign pixels to clusters
        counts, bins = scipy.histogram(vecs, len(codes))            # count occurrences
        total = scipy.sum(counts)
        print dict(zip(colors, [count / float(total) for count in counts]))
        index_max = scipy.argmax(counts)                            # find most frequent
        peak = codes[index_max]
        color = ''.join(chr(int(c)) for c in peak).encode('hex')
        print 'most frequent is %s (#%s)' % (peak, color)
        return color[:6]

    def string_from_image(self, image):
        output = StringIO()
        image.save(output, 'png', quality=95)
        contents = output.getvalue()
        output.close()
        print contents.encode('base64')
        return contents.encode('base64')
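

# A minimal usage sketch (not part of the original module). It assumes a Django
# feed object exposing the attributes used above: `feed_link` and an `icon`
# record with `data`, `icon_url`, `color`, and `not_found` fields.
#
#     importer = IconImporter(feed, force=True)
#     found = importer.save()  # True if a favicon was fetched and stored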