NewsBlur/utils/feed_functions.py

import datetime
import threading
import sys
import urllib2
import urlparse
import lxml.html
from PIL import ImageFile
from django.utils.translation import ungettext
from utils import feedfinder

class TimeoutError(Exception): pass

def timelimit(timeout):
    """borrowed from web.py"""
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None
                    self.setDaemon(True)
                    self.start()
                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except:
                        self.error = sys.exc_info()
            c = Dispatch()
            c.join(timeout)
            if c.isAlive():
                raise TimeoutError, 'took too long'
            if c.error:
                raise c.error[0], c.error[1]
            return c.result
        return _2
    return _1
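
# Usage sketch for timelimit(): the decorated call runs in a daemon thread,
# and TimeoutError is raised if it hasn't finished within `timeout` seconds.
# The URL below is only illustrative.
#
#   @timelimit(5)
#   def fetch_page(url):
#       return urllib2.urlopen(url).read()
#
#   try:
#       html = fetch_page('http://www.newsblur.com/')
#   except TimeoutError:
#       html = None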

def encode(tstr):
    """Encodes a unicode string in utf-8."""
    if not tstr:
        return ''
    # this is _not_ pretty, but it works
    try:
        return tstr.encode('utf-8', "xmlcharrefreplace")
    except UnicodeDecodeError:
        # it's already UTF8.. sigh
        return tstr.decode('utf-8').encode('utf-8')
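
# Usage sketch for encode(): both unicode and already-encoded UTF-8 byte
# strings come back as UTF-8 byte strings (Python 2 semantics).
#
#   >>> encode(u'caf\xe9')
#   'caf\xc3\xa9'
#   >>> encode('caf\xc3\xa9')
#   'caf\xc3\xa9'
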
# From: http://www.poromenos.org/node/87
def levenshtein_distance(first, second):
    """Find the Levenshtein distance between two strings."""
    if len(first) > len(second):
        first, second = second, first
    if len(second) == 0:
        return len(first)
    first_length = len(first) + 1
    second_length = len(second) + 1
    distance_matrix = [[0] * second_length for x in range(first_length)]
    for i in range(first_length):
        distance_matrix[i][0] = i
    for j in range(second_length):
        distance_matrix[0][j] = j
    for i in xrange(1, first_length):
        for j in range(1, second_length):
            deletion = distance_matrix[i-1][j] + 1
            insertion = distance_matrix[i][j-1] + 1
            substitution = distance_matrix[i-1][j-1]
            if first[i-1] != second[j-1]:
                substitution += 1
            distance_matrix[i][j] = min(insertion, deletion, substitution)
    return distance_matrix[first_length-1][second_length-1]
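
# Usage sketch: the classic example needs three single-character edits.
#
#   >>> levenshtein_distance('kitten', 'sitting')
#   3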

def fetch_address_from_page(url, existing_feed=None):
    from apps.rss_feeds.models import Feed, DuplicateFeed
    feed_finder_url = feedfinder.feed(url)
    if feed_finder_url:
        if existing_feed:
            # Another feed already lives at the discovered address; bail out
            # rather than creating a duplicate.
            if Feed.objects.filter(feed_address=feed_finder_url):
                return None
            existing_feed.feed_address = feed_finder_url
            existing_feed.save()
            feed = existing_feed
        else:
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_finder_url)
            if duplicate_feed:
                # Wrapped in a list so the feed = feed[0] below works for
                # both the duplicate hit and the queryset case.
                feed = [duplicate_feed[0].feed]
            else:
                feed = Feed.objects.filter(feed_address=feed_finder_url)
            if not feed:
                feed = Feed(feed_address=feed_finder_url)
                feed.save()
                feed.update()
            else:
                feed = feed[0]
        return feed
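
# Usage sketch (assumes a configured Django environment so the
# apps.rss_feeds models are importable): feedfinder discovers the feed
# address on the page, and an existing or freshly created Feed is returned.
#
#   feed = fetch_address_from_page('http://www.newsblur.com/')
#   if feed:
#       print feed.feed_address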

def _do_timesince(d, chunks, now=None):
    """
    Started as a copy of django.utils.timesince.timesince, but modified to
    only output one time unit, and use months as the maximum unit of measure.

    Takes two datetime objects and returns the time between d and now
    as a nicely formatted string, e.g. "10 minutes". If d occurs after now,
    then "0 minutes" is returned.

    Units used are months, weeks, days, hours, and minutes.
    Seconds and microseconds are ignored.
    """
    # Convert datetime.date to datetime.datetime for comparison
    if d.__class__ is not datetime.datetime:
        d = datetime.datetime(d.year, d.month, d.day)
    if not now:
        now = datetime.datetime.utcnow()

    # ignore microsecond part of 'd' since we removed it from 'now'
    delta = now - (d - datetime.timedelta(0, 0, d.microsecond))
    since = delta.days * 24 * 60 * 60 + delta.seconds
    if since <= 0:
        # d is in the future compared to now, as the docstring promises
        return '0 minutes'
    for i, (seconds, name) in enumerate(chunks):
        count = since // seconds
        if count != 0:
            break
    s = '%(number)d %(type)s' % {'number': count, 'type': name(count)}
    return s

def relative_timesince(value):
    if not value:
        return u''

    chunks = (
        (60 * 60, lambda n: ungettext('hour', 'hours', n)),
        (60, lambda n: ungettext('minute', 'minutes', n))
    )
    return _do_timesince(value, chunks)
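
# Usage sketch: only the largest non-zero unit is reported.
#
#   five_min_ago = datetime.datetime.utcnow() - datetime.timedelta(minutes=5)
#   print relative_timesince(five_min_ago)    # -> 5 minutes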

def relative_timeuntil(value):
    if not value:
        return u''

    chunks = (
        (60 * 60, lambda n: ungettext('hour', 'hours', n)),
        (60, lambda n: ungettext('minute', 'minutes', n))
    )
    now = datetime.datetime.utcnow()
    return _do_timesince(now, chunks, value)

def format_relative_date(date, future=False):
    if not date or date < datetime.datetime(2010, 1, 1):
        return "Soon"

    now = datetime.datetime.utcnow()
    diff = abs(now - date)
    if diff < datetime.timedelta(minutes=60):
        minutes = diff.seconds / 60
        return "%s minute%s %s" % (minutes,
                                   '' if minutes == 1 else 's',
                                   '' if future else 'ago')
    elif datetime.timedelta(minutes=60) <= diff < datetime.timedelta(minutes=90):
        return "1 hour %s" % ('' if future else 'ago')
    elif diff >= datetime.timedelta(minutes=90):
        dec = (diff.seconds / 60 + 15) % 60
        if dec >= 30:
            return "%s.5 hours %s" % ((((diff.seconds / 60) + 15) / 60),
                                      '' if future else 'ago')
        else:
            return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60),
                                    '' if future else 'ago')
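
# Usage sketch: differences are bucketed into minutes, "1 hour", then
# half-hour increments of hours; future=True drops the "ago" suffix.
#
#   now = datetime.datetime.utcnow()
#   print format_relative_date(now - datetime.timedelta(minutes=5))    # -> 5 minutes ago
#   print format_relative_date(now - datetime.timedelta(hours=2))      # -> 2 hours ago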

def fetch_site_favicon(url, path='favicon.ico'):
    HEADERS = {
        'User-Agent': 'NewsBlur Favicon Fetcher - http://www.newsblur.com',
        'Connection': 'close',
    }
    image = None
    if not url.endswith('/'):
        url += '/'
    request = urllib2.Request(url + path, headers=HEADERS)
    try:
        icon = urllib2.urlopen(request)
        parser = ImageFile.Parser()
        while True:
            s = icon.read(1024)
            if not s:
                break
            parser.feed(s)
        image = parser.close()
    except (urllib2.HTTPError, urllib2.URLError):
        # No favicon at the default location; scrape the page for a
        # <link rel="icon"> or <link rel="shortcut icon"> tag instead.
        request = urllib2.Request(url, headers=HEADERS)
        try:
            content = urllib2.urlopen(request).read(2048)  # 2048 bytes should be enough for most websites
        except (urllib2.HTTPError, urllib2.URLError):
            return
        icon_path = lxml.html.fromstring(content).xpath(
            '//link[@rel="icon" or @rel="shortcut icon"]/@href'
        )
        if icon_path:
            # Resolve the href against the page URL, so both relative and
            # absolute icon paths work.
            request = urllib2.Request(urlparse.urljoin(url, icon_path[0]), headers=HEADERS)
            try:
                icon = urllib2.urlopen(request)
                parser = ImageFile.Parser()
                while True:
                    s = icon.read(1024)
                    if not s:
                        break
                    parser.feed(s)
                image = parser.close()
            except (urllib2.HTTPError, urllib2.URLError):
                return
    return image
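
# Usage sketch: returns a PIL image for the site's favicon (or None),
# falling back to any <link rel="icon"> declared in the page head.
#
#   favicon = fetch_site_favicon('http://www.newsblur.com')
#   if favicon:
#       favicon.save('/tmp/favicon.png')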

def determine_dominant_color_in_image(image):
    import scipy
    import scipy.misc
    import scipy.cluster

    NUM_CLUSTERS = 5

    print 'reading image'
    # im = image.resize((150, 150))  # optional, to reduce time
    ar = scipy.misc.fromimage(image)
    shape = ar.shape
    print shape
    ar = ar.reshape(scipy.product(shape[:2]), shape[2])

    print 'finding clusters'
    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
    print 'cluster centres:\n', codes
    print 'cluster centres:\n', '--'.join([''.join(chr(c) for c in code).encode('hex') for code in codes])

    vecs, dist = scipy.cluster.vq.vq(ar, codes)       # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))  # count occurrences
    print counts

    index_max = scipy.argmax(counts)                  # find most frequent
    peak = codes[index_max]
    colour = ''.join(chr(c) for c in peak).encode('hex')
    print 'most frequent is %s (#%s)' % (peak, colour)
    return colour
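
# Usage sketch, assuming an older scipy that still ships
# scipy.misc.fromimage: pairs with fetch_site_favicon() above to get a
# site's dominant favicon color as a hex string. convert('RGB') guards
# against paletted icons, which lack the third array dimension.
#
#   favicon = fetch_site_favicon('http://www.newsblur.com')
#   if favicon:
#       print determine_dominant_color_in_image(favicon.convert('RGB'))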