NewsBlur/utils/story_functions.py

import datetime
from HTMLParser import HTMLParser
from itertools import chain
from django.utils.dateformat import DateFormat
from django.conf import settings

def story_score(story, bottom_delta=None):
    """Return a weighted score for `story` built from three sub-scores.

    The sub-scores are weighted 30% date, 55% statistics and 15%
    intelligence. Statistics and intelligence are currently constants
    (0 and 1), so only the date component varies between stories.

    story        -- mapping with a naive UTC datetime under 'story_date'.
    bottom_delta -- timedelta after which a story's date score reaches 0.
                    Defaults to settings.DAYS_OF_UNREAD days.
    """
    # A) Date - Assumes story is unread and within unread range
    if not bottom_delta:
        bottom_delta = datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    def whole_seconds(delta):
        # Whole seconds in a timedelta (microseconds are ignored).
        return delta.seconds + (delta.days * 86400)

    age = datetime.datetime.utcnow() - story['story_date']
    freshness = 1 - (whole_seconds(age) / float(whole_seconds(bottom_delta)))
    # Clamp at both ends: stories older than bottom_delta bottom out at 0, and
    # stories dated in the future (negative age) must not score above 1.
    date_score = min(1, max(0, freshness))

    # B) Statistics
    statistics_score = 0

    # C) Intelligence
    intelligence_score = 1
    # intelligence_score = feed_counts[int(story['story_feed_id'])] / float(max_feed_count)

    return (30/100. * date_score) + (55/100. * statistics_score) + (15/100. * intelligence_score)

def format_story_link_date__short(date, now=None):
    """Format `date` compactly, relative to `now`.

    Returns the bare time ('9:05am') when `date` falls on the same calendar
    day as `now`, 'Yesterday, 9:30pm' for the previous calendar day, and
    '17 Apr 2012, 9:30pm' for any other day.

    Only the calendar dates are compared, so `date` and `now` are expected to
    be expressed in the same timezone. `now` defaults to
    datetime.datetime.now().
    """
    if not now:
        now = datetime.datetime.now()
    # Days elapsed since the story, positive for the past. Subtracting the
    # other way round labelled tomorrow's stories "Yesterday" and never
    # matched actual yesterday.
    days_ago = (now.date() - date.date()).days
    time_of_day = date.strftime('%I:%M%p').lstrip('0').lower()
    if days_ago == 0:
        return time_of_day
    elif days_ago == 1:
        return 'Yesterday, ' + time_of_day
    else:
        return date.strftime('%d %b %Y, ') + time_of_day

def format_story_link_date__long(date, now=None):
    """Format `date` verbosely, relative to `now`.

    The result is prefixed with 'Today, ' or 'Yesterday, ' for the current
    and previous calendar day. Older dates are prefixed with the weekday
    name, and the year is appended only when `date` falls in a different
    year than `now`.

    Only the calendar dates are compared, so `date` and `now` are expected to
    be expressed in the same timezone. `now` defaults to
    datetime.datetime.utcnow().
    """
    if not now:
        now = datetime.datetime.utcnow()
    days_ago = (now.date() - date.date()).days
    parsed_date = DateFormat(date)
    if days_ago == 0:
        return 'Today, ' + parsed_date.format('F jS ') + date.strftime('%I:%M%p').lstrip('0').lower()
    elif days_ago == 1:
        # The 'a' format yields 'a.m.'/'p.m.'; the dots are stripped.
        return 'Yesterday, ' + parsed_date.format('F jS g:ia').replace('.', '')
    elif date.year == now.year:
        # Compare the year itself: timetuple()[7] is tm_yday (day of year),
        # which made this branch fire for the wrong dates.
        return parsed_date.format('l, F jS g:ia').replace('.', '')
    else:
        return parsed_date.format('l, F jS, Y g:ia').replace('.', '')

def _extract_date_tuples(date):
    """Return (DateFormat(date), date_ymd, today_ymd, yesterday_ymd).

    The three tuples are (year, month, day). "Today" and "yesterday" are both
    derived from a single utcnow() sample so they always describe consecutive
    UTC days; previously "yesterday" was taken from the local clock and could
    disagree with the UTC "today".
    """
    now = datetime.datetime.utcnow()
    parsed_date = DateFormat(date)
    date_tuple = date.timetuple()[:3]
    today_tuple = now.timetuple()[:3]
    yesterday_tuple = (now - datetime.timedelta(days=1)).timetuple()[:3]

    return parsed_date, date_tuple, today_tuple, yesterday_tuple
    
def pre_process_story(entry):
    """Normalize a parsed feed entry in place and return it.

    Keys written on `entry`:
      'published'     -- datetime built from 'published_parsed', falling back
                         to 'updated_parsed', then to the current UTC time.
      'story_content' -- stripped value of the first 'content' item, or the
                         stripped 'summary'; media enclosures found in
                         'media_content' / 'links' (first five of each) are
                         appended as HTML.
      'guid'          -- first truthy value of guid, id, link, or the string
                         form of 'published'.
      'title'         -- when missing, the tag-stripped story content
                         truncated to 80 characters plus '...'.
    """
    # Use `or` rather than a .get() default: a 'published_parsed' key that is
    # present but None must still fall back to 'updated_parsed'.
    publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
    if publish_date:
        entry['published'] = datetime.datetime(*publish_date[:6])
    else:
        entry['published'] = datetime.datetime.utcnow()

    # NOTE: entry['link'] is stored as received; the URL-quoting that used to
    # happen here is disabled.
    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary (tolerating explicit None values)
    if entry.get('content'):
        entry['story_content'] = (entry['content'][0].get('value') or '').strip()
    else:
        entry['story_content'] = (entry.get('summary') or '').strip()

    # Add each media enclosure as a Download link
    media_items = chain((entry.get('media_content') or [])[:5],
                        (entry.get('links') or [])[:5])
    for media_content in media_items:
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', '')
        if not (media_url and media_type and entry['story_content']):
            continue
        # Checked against the content as it grows, so a URL that was already
        # embedded (by the feed or by an earlier item) is not added twice.
        if media_url in entry['story_content']:
            continue

        media_type_name = media_type.split('/')[0]
        if 'audio' in media_type:
            # Audio gets a player and then also falls through to the
            # Download link below.
            entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>"""  % {
                        'media_url': media_url,
                        'media_type': media_type
                    }
        elif 'image' in media_type:
            # Images are embedded directly, without a Download link.
            entry['story_content'] += """<br><br><img src="%s" />"""  % media_url
            continue
        elif media_content.get('rel') == 'alternative' or 'text' in media_type:
            # NOTE(review): feed parsers usually report rel="alternate";
            # confirm whether 'alternative' is the intended value.
            continue
        elif media_type_name in ['application']:
            continue
        entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>"""  % {
            'media_type': media_type_name,
            'media_url': media_url,
        }

    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get('link') or str(entry.get('published'))

    # Stories without a title borrow the start of their text content.
    if not entry.get('title') and entry.get('story_content'):
        story_title = strip_tags(entry['story_content'])
        if len(story_title) > 80:
            story_title = story_title[:80] + '...'
        entry['title'] = story_title

    return entry
    
class bunch(dict):
    """Dictionary whose items can also be read and written as attributes.

    Example of overloading __getattr__ and __setattr__.
    Reading a missing attribute returns None instead of raising.
    """
    def __init__(self, indict=None, attribute=None):
        if indict is None:
            indict = {}
        # set any attributes here - before initialisation
        # these remain as normal attributes
        self.attribute = attribute
        dict.__init__(self, indict)
        # Name-mangled to '_bunch__initialised'; __setattr__ looks for it.
        self.__initialised = True
        # after initialisation, setting attributes is the same as setting an item

    def __getattr__(self, item):
        """Maps values to attributes.
        Only called if there *isn't* an attribute with this name
        """
        try:
            return self.__getitem__(item)
        except KeyError:
            return None

    def __setattr__(self, item, value):
        """Maps attributes to values.
        Only if we are initialised
        """
        # `in` replaces dict.has_key(), which no longer exists on Python 3.
        if '_bunch__initialised' not in self.__dict__:  # this test allows attributes to be set in the __init__ method
            return dict.__setattr__(self, item, value)
        elif item in self.__dict__:       # any normal attributes are handled normally
            dict.__setattr__(self, item, value)
        else:
            self.__setitem__(item, value)
            

class MLStripper(HTMLParser):
    """HTML parser that collects only the text between tags."""
    def __init__(self):
        # Run the base initialiser instead of calling reset() directly, so
        # any parser state the base class sets up beyond reset() exists.
        # Called explicitly rather than through super() so it works whether
        # or not HTMLParser is a new-style class.
        HTMLParser.__init__(self)
        self.fed = []
    def handle_data(self, d):
        # Called by the parser for each run of text outside markup.
        self.fed.append(d)
    def get_data(self):
        # Text runs are joined with a single space.
        return ' '.join(self.fed)

def strip_tags(html):
    """Return the text of `html` with its markup removed.

    The text is collected by feeding `html` through an MLStripper.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
Initial 2009-06-16 03:08:55 +00:00			`import datetime`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`from HTMLParser import HTMLParser`
FIXING THE WORST BUG OF MY LIFE -- finally figured out what was causing the story-shows-as-unread bug. Also fixed enclosures on certain types of feeds. 2011-12-14 23:26:07 -08:00			`from itertools import chain`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`from django.utils.dateformat import DateFormat`
			`from django.conf import settings`
Retooling River of News to use three hueristics: Story date, statistics, and intelligence. So far, this is story date, with the necessary changes to ensure unread stories are within the correct date bounds at all times. 2011-01-15 18:41:41 -05:00
			`def story_score(story, bottom_delta=None):`
			`# A) Date - Assumes story is unread and within unread range`
			`if not bottom_delta:`
			`bottom_delta = datetime.timedelta(days=settings.DAYS_OF_UNREAD)`
			`now = datetime.datetime.utcnow()`
			`date_delta = now - story['story_date']`
			`seconds = lambda td: td.seconds + (td.days * 86400)`
Merging in river of news but hiding the river functions. 2011-01-16 15:48:27 -05:00			`date_score = max(0, 1 - (seconds(date_delta) / float(seconds(bottom_delta))))`
Retooling River of News to use three hueristics: Story date, statistics, and intelligence. So far, this is story date, with the necessary changes to ensure unread stories are within the correct date bounds at all times. 2011-01-15 18:41:41 -05:00
			`# B) Statistics`
			`statistics_score = 0`

			`# C) Intelligence`
Final cleanup before sending icons to staging. 2011-01-30 10:46:53 -05:00			`intelligence_score = 1`
			`# intelligence_score = feed_counts[int(story['story_feed_id'])] / float(max_feed_count)`
Retooling River of News to use three hueristics: Story date, statistics, and intelligence. So far, this is story date, with the necessary changes to ensure unread stories are within the correct date bounds at all times. 2011-01-15 18:41:41 -05:00
			`# print "%s - %s" % (story['story_date'], date_score)`
Final cleanup before sending icons to staging. 2011-01-30 10:46:53 -05:00			`return (30/100. * date_score) + (55/100. * statistics_score) + (15/100. * intelligence_score)`
Initial 2009-06-16 03:08:55 +00:00
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`def format_story_link_date__short(date, now=None):`
			`if not now: now = datetime.datetime.now()`
			`diff = date.date() - now.date()`
			`if diff.days == 0:`
			`return date.strftime('%I:%M%p').lstrip('0').lower()`
			`elif diff.days == 1:`
			`return 'Yesterday, ' + date.strftime('%I:%M%p').lstrip('0').lower()`
Initial 2009-06-16 03:08:55 +00:00			`else:`
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`return date.strftime('%d %b %Y, ') + date.strftime('%I:%M%p').lstrip('0').lower()`
Initial 2009-06-16 03:08:55 +00:00
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`def format_story_link_date__long(date, now=None):`
			`if not now: now = datetime.datetime.utcnow()`
			`diff = now.date() - date.date()`
			`parsed_date = DateFormat(date)`
			`if diff.days == 0:`
			`return 'Today, ' + parsed_date.format('F jS ') + date.strftime('%I:%M%p').lstrip('0').lower()`
			`elif diff.days == 1:`
Pretty story cells in the feed detail table. Also prettier story details, but still needs some title love. Not to mention buttons everywhere. 2010-07-15 23:32:37 -04:00			`return 'Yesterday, ' + parsed_date.format('F jS g:ia').replace('.','')`
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`elif date.date().timetuple()[7] == now.date().timetuple()[7]:`
Initial 2009-06-16 03:08:55 +00:00			`return parsed_date.format('l, F jS g:ia').replace('.','')`
			`else:`
			`return parsed_date.format('l, F jS, Y g:ia').replace('.','')`

			`def _extract_date_tuples(date):`
			`parsed_date = DateFormat(date)`
			`date_tuple = datetime.datetime.timetuple(date)[:3]`
Changing all datetime.datetime.now() to datetime.datetime.utcnow(). 2010-10-10 23:55:00 -04:00			`today_tuple = datetime.datetime.timetuple(datetime.datetime.utcnow())[:3]`
Initial 2009-06-16 03:08:55 +00:00			`today = datetime.datetime.today()`
			`yesterday_tuple = datetime.datetime.timetuple(today - datetime.timedelta(1))[:3]`

Fixing the out-of-date dupe bug. 2009-12-18 20:47:44 +00:00			`return parsed_date, date_tuple, today_tuple, yesterday_tuple`

			`def pre_process_story(entry):`
Handling broken dates in feeds. 2010-09-17 13:33:11 -04:00			`publish_date = entry.get('published_parsed', entry.get('updated_parsed'))`
			`entry['published'] = datetime.datetime(*publish_date[:6]) if publish_date else datetime.datetime.utcnow()`
Fixing the out-of-date dupe bug. 2009-12-18 20:47:44 +00:00
Correcting a double-encoding bug for story permalinks that was from way back when. 2012-03-27 18:37:04 -07:00			`# entry_link = entry.get('link') or ''`
			`# protocol_index = entry_link.find("://")`
			`# if protocol_index != -1:`
			`# entry['link'] = (entry_link[:protocol_index+3]`
			`# + urlquote(entry_link[protocol_index+3:]))`
			`# else:`
			`# entry['link'] = urlquote(entry_link)`
Fixing feed parsing for stories with dict ids. This includes all Google Reader feeds and Google News Alerts. Thanks to Richard <richard@lbrc.org>. 2010-12-10 15:58:21 -05:00			`if isinstance(entry.get('guid'), dict):`
			`entry['guid'] = unicode(entry['guid'])`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00
			`# Normalize story content/summary`
Correcting for feeds that have incorrect media content. 2011-11-25 00:08:17 -05:00			`if entry.get('content'):`
Correcting a double-encoding bug for story permalinks that was from way back when. 2012-03-27 18:37:04 -07:00			`entry['story_content'] = entry['content'][0].get('value', '').strip()`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`else:`
Correcting a double-encoding bug for story permalinks that was from way back when. 2012-03-27 18:37:04 -07:00			`entry['story_content'] = entry.get('summary', '').strip()`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00
			`# Add each media enclosure as a Download link`
Small cleanup to parameters of feed update method. 2012-03-26 17:04:35 -07:00			`for media_content in chain(entry.get('media_content', [])[:5], entry.get('links', [])[:5]):`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`media_url = media_content.get('url', '')`
			`media_type = media_content.get('type', '')`
Fixing numerous feed fetching errors. 2012-02-24 14:39:23 -08:00			`if media_url and media_type and entry['story_content'] and media_url not in entry['story_content']:`
Removing application media types from enclosures. Nobody needs that. 2012-01-14 17:13:05 -08:00			`media_type_name = media_type.split('/')[0]`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`if 'audio' in media_type and media_url:`
			`entry['story_content'] += """<br><br>`
New audio control, with Flash fallback for browsers like Firefox/IE. 2012-03-05 12:57:47 -08:00			`<audio controls="controls" preload="none">`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`<source src="%(media_url)s" type="%(media_type)s" />`
			`</audio>""" % {`
			`'media_url': media_url,`
			`'media_type': media_type`
			`}`
			`elif 'image' in media_type and media_url:`
			`entry['story_content'] += """<br><br><img src="%s" />""" % media_url`
Don't bother showing 'Download Image' enclosure links. Just embed them. 2012-01-15 19:17:17 -08:00			`continue`
FIXING THE WORST BUG OF MY LIFE -- finally figured out what was causing the story-shows-as-unread bug. Also fixed enclosures on certain types of feeds. 2011-12-14 23:26:07 -08:00			`elif media_content.get('rel') == 'alternative' or 'text' in media_content.get('type'):`
			`continue`
Removing application media types from enclosures. Nobody needs that. 2012-01-14 17:13:05 -08:00			`elif media_type_name in ['application']:`
			`continue`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`entry['story_content'] += """<br><br>`
			`Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>""" % {`
Removing application media types from enclosures. Nobody needs that. 2012-01-14 17:13:05 -08:00			`'media_type': media_type_name,`
Refining media enclosure detection. 2011-11-25 00:26:30 -05:00			`'media_url': media_url,`
			`}`
Adding media enclosures in an audio tag. 2011-11-24 15:58:04 -05:00
			`entry['guid'] = entry.get('guid') or entry.get('id') or entry.get('link') or str(entry.get('published'))`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00
			`if not entry.get('title') and entry.get('story_content'):`
			`story_title = strip_tags(entry['story_content'])`
			`if len(story_title) > 80:`
			`story_title = story_title[:80] + '...'`
			`entry['title'] = story_title`
Adding media enclosures in an audio tag. 2011-11-24 15:58:04 -05:00
Removing items before the mark_read_date. Paging is broken though. 2011-01-07 16:26:17 -05:00			`return entry`

			`class bunch(dict):`
			`"""Example of overloading __getatr__ and __setattr__`
			`This example creates a dictionary where members can be accessed as attributes`
			`"""`
			`def __init__(self, indict=None, attribute=None):`
			`if indict is None:`
			`indict = {}`
			`# set any attributes here - before initialisation`
			`# these remain as normal attributes`
			`self.attribute = attribute`
			`dict.__init__(self, indict)`
			`self.__initialised = True`
			`# after initialisation, setting attributes is the same as setting an item`

			`def __getattr__(self, item):`
			`"""Maps values to attributes.`
			`Only called if there isn't an attribute with this name`
			`"""`
			`try:`
			`return self.__getitem__(item)`
			`except KeyError:`
			`return None`

			`def __setattr__(self, item, value):`
			`"""Maps attributes to values.`
			`Only if we are initialised`
			`"""`
			`if not self.__dict__.has_key('_bunch__initialised'): # this test allows attributes to be set in the __init__ method`
			`return dict.__setattr__(self, item, value)`
			`elif self.__dict__.has_key(item): # any normal attributes are handled normally`
			`dict.__setattr__(self, item, value)`
			`else:`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`self.__setitem__(item, value)`


			`class MLStripper(HTMLParser):`
			`def __init__(self):`
			`self.reset()`
			`self.fed = []`
			`def handle_data(self, d):`
			`self.fed.append(d)`
			`def get_data(self):`
			`return ' '.join(self.fed)`

			`def strip_tags(html):`
			`s = MLStripper()`
			`s.feed(html)`
			`return s.get_data()`