NewsBlur-viq/utils/story_functions.py

import datetime
import struct
from HTMLParser import HTMLParser
from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens
from lxml.etree import ParserError
from itertools import chain
from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django
from django.conf import settings
from utils.tornado_escape import linkify as linkify_tornado
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
from vendor import reseekfile

def story_score(story, bottom_delta=None):
    """Return a weighted relevance score for an unread story.

    The score blends three components: recency (30%), statistics (55%) and
    intelligence (15%). Statistics and intelligence are currently fixed
    placeholders (0 and 1), so only the story's age varies the result.

    `story` must provide a 'story_date' datetime comparable with utcnow().
    `bottom_delta` is the age at which the recency component reaches zero;
    it defaults to settings.DAYS_OF_UNREAD days.
    """
    def whole_seconds(delta):
        # Days and seconds only; the microsecond part is ignored.
        return delta.seconds + (delta.days * 86400)

    bottom_delta = bottom_delta or datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    # A) Date - assumes the story is unread and within the unread range
    age = datetime.datetime.utcnow() - story['story_date']
    elapsed_fraction = whole_seconds(age) / float(whole_seconds(bottom_delta))
    date_score = max(0, 1 - elapsed_fraction)

    # B) Statistics - placeholder
    statistics_score = 0

    # C) Intelligence - placeholder
    intelligence_score = 1

    date_weight = 30/100.
    statistics_weight = 55/100.
    intelligence_weight = 15/100.
    return ((date_weight * date_score)
            + (statistics_weight * statistics_score)
            + (intelligence_weight * intelligence_score))

def format_story_link_date__short(date, now=None):
    """Format a story date compactly, relative to `now`.

    Returns the bare time ("3:30pm") for a date on the same calendar day as
    `now`, "Yesterday, 3:30pm" for the previous calendar day, and
    "01 Jan 2012, 3:30pm" for anything else (including future dates).

    `now` defaults to the current local time.
    NOTE(review): the long formatter defaults to utcnow(); confirm which
    clock callers expect here.
    """
    if now is None:
        now = datetime.datetime.now()
    # Days elapsed from the story date to now; positive means in the past.
    diff = now.date() - date.date()
    time_of_day = date.strftime('%I:%M%p').lstrip('0').lower()
    if diff.days == 0:
        return time_of_day
    elif diff.days == 1:
        return 'Yesterday, ' + time_of_day
    else:
        return date.strftime('%d %b %Y, ') + time_of_day

def format_story_link_date__long(date, now=None):
    """Format a story date verbosely, relative to `now`.

    Produces "Today, <month> <day> <time>" for the same calendar day,
    "Yesterday, ..." for the previous day, a weekday-prefixed form without
    the year for other dates in the same year as `now`, and the same form
    with the year for everything else.

    `now` defaults to the current UTC time.
    """
    if now is None:
        now = datetime.datetime.utcnow()
    diff = now.date() - date.date()
    parsed_date = DateFormat(date)
    if diff.days == 0:
        return 'Today, ' + parsed_date.format('F jS ') + date.strftime('%I:%M%p').lstrip('0').lower()
    elif diff.days == 1:
        return 'Yesterday, ' + parsed_date.format('F jS g:ia').replace('.','')
    elif date.year == now.year:
        # Compare years directly: timetuple()[7] is the day of the year,
        # which only matches for the same day in a different year.
        return parsed_date.format('l, F jS g:ia').replace('.','')
    else:
        return parsed_date.format('l, F jS, Y g:ia').replace('.','')

def _extract_date_tuples(date):
    """Return a DateFormat for `date` plus (year, month, day) tuples.

    Returns:
        (parsed_date, date_tuple, today_tuple, yesterday_tuple), where the
        last three are (year, month, day) tuples for `date`, for the current
        UTC day and for the UTC day before it.
    """
    parsed_date = DateFormat(date)
    date_tuple = date.timetuple()[:3]
    # Take "today" and "yesterday" from one UTC reading so the two tuples are
    # always exactly one day apart, whatever the server's local timezone is.
    now = datetime.datetime.utcnow()
    today_tuple = now.timetuple()[:3]
    yesterday_tuple = (now - datetime.timedelta(days=1)).timetuple()[:3]

    return parsed_date, date_tuple, today_tuple, yesterday_tuple
    
def pre_process_story(entry):
    """Normalise a parsed feed entry in place and return it.

    Sets these keys on `entry`:
      * 'published'     - datetime built from 'published_parsed' or
                          'updated_parsed', or the current UTC time if neither
                          is usable.
      * 'guid'          - stringified if it was a dict; falls back to 'id',
                          'link', then the published date.
      * 'story_content' - the first 'content' value, else 'summary', stripped,
                          with markup appended for media enclosures.
      * 'title'         - tag-stripped; derived from the first 80 characters
                          of the content when the entry has no title.
      * 'author'        - tag-stripped.

    entry['link'] is left exactly as provided.
    """
    # Use `or` rather than a .get() default so that a 'published_parsed' key
    # that is present but empty still falls back to 'updated_parsed'.
    publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
    if publish_date:
        entry['published'] = datetime.datetime(*publish_date[:6])
    else:
        entry['published'] = datetime.datetime.utcnow()

    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary
    if entry.get('content'):
        entry['story_content'] = entry['content'][0].get('value', '').strip()
    else:
        summary = entry.get('summary') or ''
        entry['story_content'] = summary.strip()

    # Add each media enclosure (first five media items and first five links).
    # Entries with no content, or whose content already mentions the URL,
    # are left alone.
    for media_content in chain(entry.get('media_content', [])[:5], entry.get('links', [])[:5]):
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', '')
        if media_url and media_type and entry['story_content'] and media_url not in entry['story_content']:
            media_type_name = media_type.split('/')[0]
            if 'audio' in media_type:
                # Audio gets an inline player and, below, a download link too.
                entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>"""  % {
                        'media_url': media_url, 
                        'media_type': media_type
                    }
            elif 'image' in media_type:
                # Images are embedded directly, with no download link.
                entry['story_content'] += """<br><br><img src="%s" />"""  % media_url
                continue
            elif media_content.get('rel') == 'alternative' or 'text' in media_type:
                # NOTE(review): feeds normally use rel="alternate"; confirm
                # whether 'alternative' is intentional.
                continue
            elif media_type_name in ['application']:
                continue
            entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>"""  % {
                'media_type': media_type_name,
                'media_url': media_url, 
            }

    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get('link') or str(entry.get('published'))

    # Entries without a title borrow a truncated, tag-free copy of the content.
    if not entry.get('title') and entry.get('story_content'):
        story_title = strip_tags(entry['story_content'])
        if len(story_title) > 80:
            story_title = story_title[:80] + '...'
        entry['title'] = story_title

    entry['title'] = strip_tags(entry.get('title'))
    entry['author'] = strip_tags(entry.get('author'))

    return entry
    
class bunch(dict):
    """Dictionary whose items can also be read and written as attributes.

    Overloads __getattr__ and __setattr__ so that `b.key` reads `b['key']`
    (yielding None when the key is missing) and `b.key = value` stores
    `b['key']`. Names assigned inside __init__ stay ordinary instance
    attributes and are not stored as dictionary items.
    """
    def __init__(self, indict=None, attribute=None):
        if indict is None:
            indict = {}
        # Set any attributes here, before initialisation completes; these
        # remain normal attributes.
        self.attribute = attribute
        dict.__init__(self, indict)
        # Name-mangled to _bunch__initialised. Once it is set, assigning an
        # unknown attribute is the same as setting an item.
        self.__initialised = True

    def __getattr__(self, item):
        """Map dictionary values to attributes.

        Only called when normal attribute lookup fails. Missing keys yield
        None instead of raising AttributeError.
        """
        try:
            return self[item]
        except KeyError:
            return None

    def __setattr__(self, item, value):
        """Map attribute assignment to item assignment once initialised."""
        if '_bunch__initialised' not in self.__dict__:
            # Still inside __init__: store as a real attribute.
            return dict.__setattr__(self, item, value)
        elif item in self.__dict__:
            # Existing real attributes are handled normally.
            dict.__setattr__(self, item, value)
        else:
            self[item] = value
            
class MLStripper(HTMLParser):
    """HTML parser that collects only the text between tags."""

    def __init__(self):
        # Run the base initialiser (which itself calls reset()) rather than
        # calling reset() alone, so any other parser state it sets up exists.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return the collected text runs joined by single spaces."""
        return ' '.join(self.fed)

def strip_tags(html):
    """Return `html` with its tags removed, or '' for empty/None input.

    Delegates to Django's strip_tags for any non-empty value.
    """
    if not html:
        return ''
    return strip_tags_django(html)

def linkify(*args, **kwargs):
    """Run Tornado's linkify on the arguments, then XHTML-unescape the result.

    All positional and keyword arguments are forwarded unchanged to the
    Tornado linkify helper.
    """
    linked = linkify_tornado(*args, **kwargs)
    return xhtml_unescape_tornado(linked)
    
def truncate_chars(value, max_length):
    """Shorten `value` to at most `max_length` characters plus "...".

    Values that already fit are returned untouched. When the cut would land
    in the middle of a word, the text is trimmed back to the last space
    inside the kept portion (if there is one) before the ellipsis is added.
    """
    if len(value) > max_length:
        kept = value[:max_length]
        if value[max_length] != " ":
            # The cut splits a word: drop the partial word when possible.
            before, space, _ = kept.rpartition(" ")
            if space:
                kept = before
        return kept + "..."

    return value

def image_size(datastream):
    """Sniff an image's content type and pixel size from its leading bytes.

    Wraps `datastream` in a ReseekFile, reads the first 30 bytes and checks
    them against the GIF, PNG and JPEG signatures. For JPEG the stream is
    rewound and scanned marker by marker for a start-of-frame segment.

    Returns:
        A (content_type, width, height) tuple. content_type is '' and both
        dimensions are -1 when no signature matches. For a JPEG whose frame
        header is missing, truncated or unreadable, content_type is
        'image/jpeg' and the dimensions stay -1.
    """
    datastream = reseekfile.ReseekFile(datastream)
    data = str(datastream.read(30))
    size = len(data)
    height = -1
    width = -1
    content_type = ''

    # handle GIFs
    if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):
        # Check to see if content_type is correct
        content_type = 'image/gif'
        w, h = struct.unpack("<HH", data[6:10])
        width = int(w)
        height = int(h)

    # See PNG 2. Edition spec (http://www.w3.org/TR/PNG/)
    # Bytes 0-7 are below, 4-byte chunk length, then 'IHDR'
    # and finally the 4-byte width, height
    elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')
          and (data[12:16] == 'IHDR')):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[16:24])
        width = int(w)
        height = int(h)

    # Maybe this is for an older PNG version.
    elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'):
        # Check to see if we have the right content type
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[8:16])
        width = int(w)
        height = int(h)

    # handle JPEGs
    elif (size >= 2) and data.startswith('\377\330'):
        content_type = 'image/jpeg'
        datastream.seek(0)
        datastream.read(2)
        b = datastream.read(1)
        # Stay None unless a start-of-frame segment is actually decoded.
        w = h = None
        try:
            while (b and ord(b) != 0xDA):
                # Advance to the next marker. Every read is guarded against
                # end of stream, where read(1) returns an empty string that
                # ord() cannot handle.
                while (b and ord(b) != 0xFF): b = datastream.read(1)
                while (b and ord(b) == 0xFF): b = datastream.read(1)
                if not b:
                    break
                if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
                    # Markers 0xC0-0xC3: skip 3 bytes, then height and width.
                    datastream.read(3)
                    h, w = struct.unpack(">HH", datastream.read(4))
                    break
                else:
                    # Skip this segment using its 2-byte big-endian length.
                    datastream.read(int(struct.unpack(">H", datastream.read(2))[0])-2)
                b = datastream.read(1)
            if w is not None and h is not None:
                width = int(w)
                height = int(h)
        except (struct.error, ValueError):
            # Best effort: a malformed header leaves the size unknown.
            pass

    return content_type, width, height

def htmldiff(old_html, new_html):
    """Return `new_html` annotated with its differences from `old_html`.

    Both documents are tokenized with hrefs excluded, diffed token by token,
    and the joined, stripped result is passed through fixup_ins_del_tags.
    If tokenizing either document raises KeyError or ParserError, `new_html`
    is returned unchanged.
    """
    try:
        before_tokens = tokenize(old_html, include_hrefs=False)
        after_tokens = tokenize(new_html, include_hrefs=False)
    except (KeyError, ParserError):
        return new_html

    merged = ''.join(htmldiff_tokens(before_tokens, after_tokens))
    return fixup_ins_del_tags(merged.strip())
Initial 2009-06-16 03:08:55 +00:00			`import datetime`
Adding two new functions for stories: collect image sizes and grab original article text. Useful for future features. 2012-09-10 17:41:01 -07:00			`import struct`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`from HTMLParser import HTMLParser`
Fixing these damn Link: text urls in story modifications. 2012-10-01 19:04:09 -07:00			`from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens`
Fixing a bunch of feed fetch errors. 2012-10-01 19:31:33 -07:00			`from lxml.etree import ParserError`
FIXING THE WORST BUG OF MY LIFE -- finally figured out what was causing the story-shows-as-unread bug. Also fixed enclosures on certain types of feeds. 2011-12-14 23:26:07 -08:00			`from itertools import chain`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`from django.utils.dateformat import DateFormat`
Auto-linkifying comments and replies, and stripping html from comments, replies, and story titles, tags, and authors. 2012-07-21 16:38:37 -07:00			`from django.utils.html import strip_tags as strip_tags_django`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`from django.conf import settings`
Auto-linkifying comments and replies, and stripping html from comments, replies, and story titles, tags, and authors. 2012-07-21 16:38:37 -07:00			`from utils.tornado_escape import linkify as linkify_tornado`
Unescaping incorrectly escaped and linkified comments and replies. 2012-07-23 23:23:34 -07:00			`from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado`
Adding two new functions for stories: collect image sizes and grab original article text. Useful for future features. 2012-09-10 17:41:01 -07:00			`from vendor import reseekfile`
Retooling River of News to use three heuristics: Story date, statistics, and intelligence. So far, this is story date, with the necessary changes to ensure unread stories are within the correct date bounds at all times. 2011-01-15 18:41:41 -05:00
			`def story_score(story, bottom_delta=None):`
			`# A) Date - Assumes story is unread and within unread range`
			`if not bottom_delta:`
			`bottom_delta = datetime.timedelta(days=settings.DAYS_OF_UNREAD)`
			`now = datetime.datetime.utcnow()`
			`date_delta = now - story['story_date']`
			`seconds = lambda td: td.seconds + (td.days * 86400)`
Merging in river of news but hiding the river functions. 2011-01-16 15:48:27 -05:00			`date_score = max(0, 1 - (seconds(date_delta) / float(seconds(bottom_delta))))`
Retooling River of News to use three heuristics: Story date, statistics, and intelligence. So far, this is story date, with the necessary changes to ensure unread stories are within the correct date bounds at all times. 2011-01-15 18:41:41 -05:00
			`# B) Statistics`
			`statistics_score = 0`

			`# C) Intelligence`
Final cleanup before sending icons to staging. 2011-01-30 10:46:53 -05:00			`intelligence_score = 1`
			`# intelligence_score = feed_counts[int(story['story_feed_id'])] / float(max_feed_count)`
Retooling River of News to use three heuristics: Story date, statistics, and intelligence. So far, this is story date, with the necessary changes to ensure unread stories are within the correct date bounds at all times. 2011-01-15 18:41:41 -05:00
			`# print "%s - %s" % (story['story_date'], date_score)`
Final cleanup before sending icons to staging. 2011-01-30 10:46:53 -05:00			`return (30/100. * date_score) + (55/100. * statistics_score) + (15/100. * intelligence_score)`
Initial 2009-06-16 03:08:55 +00:00
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`def format_story_link_date__short(date, now=None):`
			`if not now: now = datetime.datetime.now()`
			`diff = date.date() - now.date()`
			`if diff.days == 0:`
			`return date.strftime('%I:%M%p').lstrip('0').lower()`
			`elif diff.days == 1:`
			`return 'Yesterday, ' + date.strftime('%I:%M%p').lstrip('0').lower()`
Initial 2009-06-16 03:08:55 +00:00			`else:`
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`return date.strftime('%d %b %Y, ') + date.strftime('%I:%M%p').lstrip('0').lower()`
Initial 2009-06-16 03:08:55 +00:00
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`def format_story_link_date__long(date, now=None):`
			`if not now: now = datetime.datetime.utcnow()`
			`diff = now.date() - date.date()`
			`parsed_date = DateFormat(date)`
			`if diff.days == 0:`
			`return 'Today, ' + parsed_date.format('F jS ') + date.strftime('%I:%M%p').lstrip('0').lower()`
			`elif diff.days == 1:`
Pretty story cells in the feed detail table. Also prettier story details, but still needs some title love. Not to mention buttons everywhere. 2010-07-15 23:32:37 -04:00			`return 'Yesterday, ' + parsed_date.format('F jS g:ia').replace('.','')`
Fixing timezones! No more "Yesterday, 11:30pm" when it's really today. GMT is 5 hours ahead of EST, so with it goes the Today/Yesterday. But that's FIXED. 2011-01-12 23:30:38 -05:00			`elif date.date().timetuple()[7] == now.date().timetuple()[7]:`
Initial 2009-06-16 03:08:55 +00:00			`return parsed_date.format('l, F jS g:ia').replace('.','')`
			`else:`
			`return parsed_date.format('l, F jS, Y g:ia').replace('.','')`

			`def _extract_date_tuples(date):`
			`parsed_date = DateFormat(date)`
			`date_tuple = datetime.datetime.timetuple(date)[:3]`
Changing all datetime.datetime.now() to datetime.datetime.utcnow(). 2010-10-10 23:55:00 -04:00			`today_tuple = datetime.datetime.timetuple(datetime.datetime.utcnow())[:3]`
Initial 2009-06-16 03:08:55 +00:00			`today = datetime.datetime.today()`
			`yesterday_tuple = datetime.datetime.timetuple(today - datetime.timedelta(1))[:3]`

Fixing the out-of-date dupe bug. 2009-12-18 20:47:44 +00:00			`return parsed_date, date_tuple, today_tuple, yesterday_tuple`

			`def pre_process_story(entry):`
Handling broken dates in feeds. 2010-09-17 13:33:11 -04:00			`publish_date = entry.get('published_parsed', entry.get('updated_parsed'))`
			`entry['published'] = datetime.datetime(*publish_date[:6]) if publish_date else datetime.datetime.utcnow()`
Fixing the out-of-date dupe bug. 2009-12-18 20:47:44 +00:00
Correcting a double-encoding bug for story permalinks that was from way back when. 2012-03-27 18:37:04 -07:00			`# entry_link = entry.get('link') or ''`
			`# protocol_index = entry_link.find("://")`
			`# if protocol_index != -1:`
			`# entry['link'] = (entry_link[:protocol_index+3]`
			`# + urlquote(entry_link[protocol_index+3:]))`
			`# else:`
			`# entry['link'] = urlquote(entry_link)`
Fixing feed parsing for stories with dict ids. This includes all Google Reader feeds and Google News Alerts. Thanks to Richard <richard@lbrc.org>. 2010-12-10 15:58:21 -05:00			`if isinstance(entry.get('guid'), dict):`
			`entry['guid'] = unicode(entry['guid'])`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00
			`# Normalize story content/summary`
Correcting for feeds that have incorrect media content. 2011-11-25 00:08:17 -05:00			`if entry.get('content'):`
Correcting a double-encoding bug for story permalinks that was from way back when. 2012-03-27 18:37:04 -07:00			`entry['story_content'] = entry['content'][0].get('value', '').strip()`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`else:`
Fixing small bugs: feed sort order, unread/oldest not respecting mark as read date. 2012-08-02 11:04:11 -07:00			`summary = entry.get('summary') or ''`
			`entry['story_content'] = summary.strip()`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00
			`# Add each media enclosure as a Download link`
Small cleanup to parameters of feed update method. 2012-03-26 17:04:35 -07:00			`for media_content in chain(entry.get('media_content', [])[:5], entry.get('links', [])[:5]):`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`media_url = media_content.get('url', '')`
			`media_type = media_content.get('type', '')`
Fixing numerous feed fetching errors. 2012-02-24 14:39:23 -08:00			`if media_url and media_type and entry['story_content'] and media_url not in entry['story_content']:`
Removing application media types from enclosures. Nobody needs that. 2012-01-14 17:13:05 -08:00			`media_type_name = media_type.split('/')[0]`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`if 'audio' in media_type and media_url:`
			`entry['story_content'] += """<br><br>`
New audio control, with Flash fallback for browsers like Firefox/IE. 2012-03-05 12:57:47 -08:00			`<audio controls="controls" preload="none">`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`<source src="%(media_url)s" type="%(media_type)s" />`
			`</audio>""" % {`
			`'media_url': media_url,`
			`'media_type': media_type`
			`}`
			`elif 'image' in media_type and media_url:`
			`entry['story_content'] += """<br><br><img src="%s" />""" % media_url`
Don't bother showing 'Download Image' enclosure links. Just embed them. 2012-01-15 19:17:17 -08:00			`continue`
FIXING THE WORST BUG OF MY LIFE -- finally figured out what was causing the story-shows-as-unread bug. Also fixed enclosures on certain types of feeds. 2011-12-14 23:26:07 -08:00			`elif media_content.get('rel') == 'alternative' or 'text' in media_content.get('type'):`
			`continue`
Removing application media types from enclosures. Nobody needs that. 2012-01-14 17:13:05 -08:00			`elif media_type_name in ['application']:`
			`continue`
Refining media enclosures by adding all types and linking to their source. 2011-11-25 10:38:39 -05:00			`entry['story_content'] += """<br><br>`
			`Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>""" % {`
Removing application media types from enclosures. Nobody needs that. 2012-01-14 17:13:05 -08:00			`'media_type': media_type_name,`
Refining media enclosure detection. 2011-11-25 00:26:30 -05:00			`'media_url': media_url,`
			`}`
Adding media enclosures in an audio tag. 2011-11-24 15:58:04 -05:00
			`entry['guid'] = entry.get('guid') or entry.get('id') or entry.get('link') or str(entry.get('published'))`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00
			`if not entry.get('title') and entry.get('story_content'):`
			`story_title = strip_tags(entry['story_content'])`
			`if len(story_title) > 80:`
			`story_title = story_title[:80] + '...'`
			`entry['title'] = story_title`
Adding media enclosures in an audio tag. 2011-11-24 15:58:04 -05:00
Auto-linkifying comments and replies, and stripping html from comments, replies, and story titles, tags, and authors. 2012-07-21 16:38:37 -07:00			`entry['title'] = strip_tags(entry.get('title'))`
			`entry['author'] = strip_tags(entry.get('author'))`

Removing items before the mark_read_date. Paging is broken though. 2011-01-07 16:26:17 -05:00			`return entry`

			`class bunch(dict):`
			`"""Example of overloading __getatr__ and __setattr__`
			`This example creates a dictionary where members can be accessed as attributes`
			`"""`
			`def __init__(self, indict=None, attribute=None):`
			`if indict is None:`
			`indict = {}`
			`# set any attributes here - before initialisation`
			`# these remain as normal attributes`
			`self.attribute = attribute`
			`dict.__init__(self, indict)`
			`self.__initialised = True`
			`# after initialisation, setting attributes is the same as setting an item`

			`def __getattr__(self, item):`
			`"""Maps values to attributes.`
			`Only called if there isn't an attribute with this name`
			`"""`
			`try:`
			`return self.__getitem__(item)`
			`except KeyError:`
			`return None`

			`def __setattr__(self, item, value):`
			`"""Maps attributes to values.`
			`Only if we are initialised`
			`"""`
			`if not self.__dict__.has_key('_bunch__initialised'): # this test allows attributes to be set in the __init__ method`
			`return dict.__setattr__(self, item, value)`
			`elif self.__dict__.has_key(item): # any normal attributes are handled normally`
			`dict.__setattr__(self, item, value)`
			`else:`
Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`self.__setitem__(item, value)`

			`class MLStripper(HTMLParser):`
			`def __init__(self):`
			`self.reset()`
			`self.fed = []`
			`def handle_data(self, d):`
			`self.fed.append(d)`
			`def get_data(self):`
			`return ' '.join(self.fed)`

			`def strip_tags(html):`
Auto-linkifying comments and replies, and stripping html from comments, replies, and story titles, tags, and authors. 2012-07-21 16:38:37 -07:00			`if not html:`
			`return ''`
			`return strip_tags_django(html)`

Correctly handling story that lack titles but have content. Using a parsed and shucked truncation of content in place of missing titles. 2012-04-17 11:48:28 -07:00			`s = MLStripper()`
			`s.feed(html)`
Adding post to twitter/facebook during share. Also adding email for new followers, which includes common followers and common followings. 2012-06-27 23:57:57 -07:00			`return s.get_data()`
Auto-linkifying comments and replies, and stripping html from comments, replies, and story titles, tags, and authors. 2012-07-21 16:38:37 -07:00
			`def linkify(*args, **kwargs):`
Unescaping incorrectly escaped and linkified comments and replies. 2012-07-23 23:23:34 -07:00			`return xhtml_unescape_tornado(linkify_tornado(*args, **kwargs))`
Adding post to twitter/facebook during share. Also adding email for new followers, which includes common followers and common followings. 2012-06-27 23:57:57 -07:00
			`def truncate_chars(value, max_length):`
			`if len(value) <= max_length:`
			`return value`

			`truncd_val = value[:max_length]`
			`if value[max_length] != " ":`
			`rightmost_space = truncd_val.rfind(" ")`
			`if rightmost_space != -1:`
			`truncd_val = truncd_val[:rightmost_space]`

Adding two new functions for stories: collect image sizes and grab original article text. Useful for future features. 2012-09-10 17:41:01 -07:00			`return truncd_val + "..."`

			`def image_size(datastream):`
			`datastream = reseekfile.ReseekFile(datastream)`
			`data = str(datastream.read(30))`
			`size = len(data)`
			`height = -1`
			`width = -1`
			`content_type = ''`

			`# handle GIFs`
			`if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):`
			`# Check to see if content_type is correct`
			`content_type = 'image/gif'`
			`w, h = struct.unpack("<HH", data[6:10])`
			`width = int(w)`
			`height = int(h)`

			`# See PNG 2. Edition spec (http://www.w3.org/TR/PNG/)`
			`# Bytes 0-7 are below, 4-byte chunk length, then 'IHDR'`
			`# and finally the 4-byte width, height`
			`elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')`
			`and (data[12:16] == 'IHDR')):`
			`content_type = 'image/png'`
			`w, h = struct.unpack(">LL", data[16:24])`
			`width = int(w)`
			`height = int(h)`

			`# Maybe this is for an older PNG version.`
			`elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'):`
			`# Check to see if we have the right content type`
			`content_type = 'image/png'`
			`w, h = struct.unpack(">LL", data[8:16])`
			`width = int(w)`
			`height = int(h)`

			`# handle JPEGs`
			`elif (size >= 2) and data.startswith('\377\330'):`
			`content_type = 'image/jpeg'`
			`datastream.seek(0)`
			`datastream.read(2)`
			`b = datastream.read(1)`
			`try:`
			`while (b and ord(b) != 0xDA):`
			`while (ord(b) != 0xFF): b = datastream.read(1)`
			`while (ord(b) == 0xFF): b = datastream.read(1)`
			`if (ord(b) >= 0xC0 and ord(b) <= 0xC3):`
			`datastream.read(3)`
			`h, w = struct.unpack(">HH", datastream.read(4))`
			`break`
			`else:`
			`datastream.read(int(struct.unpack(">H", datastream.read(2))[0])-2)`
			`b = datastream.read(1)`
			`width = int(w)`
			`height = int(h)`
			`except struct.error:`
			`pass`
			`except ValueError:`
			`pass`

			`return content_type, width, height`

Fixing these damn Link: text urls in story modifications. 2012-10-01 19:04:09 -07:00			`def htmldiff(old_html, new_html):`
Fixing a bunch of feed fetch errors. 2012-10-01 19:31:33 -07:00			`try:`
			`old_html_tokens = tokenize(old_html, include_hrefs=False)`
			`new_html_tokens = tokenize(new_html, include_hrefs=False)`
			`except (KeyError, ParserError):`
			`return new_html`

Fixing these damn Link: text urls in story modifications. 2012-10-01 19:04:09 -07:00			`result = htmldiff_tokens(old_html_tokens, new_html_tokens)`
			`result = ''.join(result).strip()`

			`return fixup_ins_del_tags(result)`