# NewsBlur/utils/story_functions.py

import re
import datetime
import struct
import dateutil.parser
from HTMLParser import HTMLParser
from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens
from lxml.etree import ParserError, XMLSyntaxError
import lxml.html, lxml.etree
from lxml.html.clean import Cleaner
from itertools import chain
from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django
from utils.tornado_escape import linkify as linkify_tornado
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
from vendor import reseekfile
# COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
COMMENTS_RE = re.compile(r'<!--.*?-->')

def midnight_today(now=None):
    if not now:
        now = datetime.datetime.now()
    return now.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

def midnight_yesterday(midnight=None):
    if not midnight:
        midnight = midnight_today()
    return midnight - datetime.timedelta(days=1)

def beginning_of_this_month():
    return datetime.datetime.now().replace(day=1, hour=0, minute=0, second=0, microsecond=0)

def format_story_link_date__short(date, now=None):
    if not now:
        now = datetime.datetime.now()
    date = date.replace(tzinfo=None)
    midnight = midnight_today(now)
    if date >= midnight:
        return date.strftime('%I:%M%p').lstrip('0').lower()
    elif date >= midnight_yesterday(midnight):
        return 'Yesterday, ' + date.strftime('%I:%M%p').lstrip('0').lower()
    else:
        return date.strftime('%d %b %Y, ') + date.strftime('%I:%M%p').lstrip('0').lower()
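
# Rough usage sketch (illustrative values, assumed interpreter session):
#   >>> now = datetime.datetime(2013, 11, 21, 17, 0)
#   >>> format_story_link_date__short(now - datetime.timedelta(hours=2), now)
#   '3:00pm'
#   >>> format_story_link_date__short(now - datetime.timedelta(days=1), now)
#   'Yesterday, 5:00pm'
#   >>> format_story_link_date__short(now - datetime.timedelta(days=30), now)
#   '22 Oct 2013, 5:00pm'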

def format_story_link_date__long(date, now=None):
    if not now:
        now = datetime.datetime.now()
    date = date.replace(tzinfo=None)
    midnight = midnight_today(now)
    parsed_date = DateFormat(date)
    if date >= midnight:
        return 'Today, ' + parsed_date.format('F jS ') + date.strftime('%I:%M%p').lstrip('0').lower()
    elif date >= midnight_yesterday(midnight):
        return 'Yesterday, ' + parsed_date.format('F jS g:ia').replace('.', '')
    elif date >= beginning_of_this_month():
        return parsed_date.format('l, F jS g:ia').replace('.', '')
    else:
        return parsed_date.format('l, F jS, Y g:ia').replace('.', '')
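
# The long form spells out the weekday and month once a story leaves "today"
# territory; roughly (same assumed `now` as above, and assuming the story is
# older than the current real month, so the year-bearing branch fires):
#   >>> format_story_link_date__long(now - datetime.timedelta(days=30), now)
#   'Tuesday, October 22nd, 2013 5:00pm'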

def _extract_date_tuples(date):
    parsed_date = DateFormat(date)
    date_tuple = datetime.datetime.timetuple(date)[:3]
    today_tuple = datetime.datetime.timetuple(datetime.datetime.utcnow())[:3]
    today = datetime.datetime.today()
    yesterday_tuple = datetime.datetime.timetuple(today - datetime.timedelta(1))[:3]
    return parsed_date, date_tuple, today_tuple, yesterday_tuple

def pre_process_story(entry):
    publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
    if publish_date:
        publish_date = datetime.datetime(*publish_date[:6])
    if not publish_date and entry.get('published'):
        try:
            publish_date = dateutil.parser.parse(entry.get('published')).replace(tzinfo=None)
        except (ValueError, TypeError):
            pass

    if publish_date:
        entry['published'] = publish_date
    else:
        entry['published'] = datetime.datetime.utcnow()

    # Clamp future-dated stories to now so they can't pin the top of a feed.
    if entry['published'] > datetime.datetime.now() + datetime.timedelta(days=1):
        entry['published'] = datetime.datetime.now()

    # entry_link = entry.get('link') or ''
    # protocol_index = entry_link.find("://")
    # if protocol_index != -1:
    #     entry['link'] = (entry_link[:protocol_index+3]
    #                      + urlquote(entry_link[protocol_index+3:]))
    # else:
    #     entry['link'] = urlquote(entry_link)

    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary: use whichever of content/summary is longer.
    summary = entry.get('summary') or ""
    content = ""
    if not summary and 'summary_detail' in entry:
        summary = entry['summary_detail'].get('value', '')
    if entry.get('content'):
        content = entry['content'][0].get('value', '')
    if len(content) > len(summary):
        entry['story_content'] = content.strip()
    else:
        entry['story_content'] = summary.strip()

    # Add each media enclosure as a Download link, inlining audio and images.
    for media_content in chain(entry.get('media_content', [])[:5], entry.get('links', [])[:5]):
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', '')
        if media_url and media_type and entry['story_content'] and media_url not in entry['story_content']:
            media_type_name = media_type.split('/')[0]
            if 'audio' in media_type and media_url:
                entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>""" % {
                    'media_url': media_url,
                    'media_type': media_type
                }
            elif 'image' in media_type and media_url:
                entry['story_content'] += """<br><br><img src="%s" />""" % media_url
                continue
            elif media_content.get('rel') == 'alternative' or 'text' in media_content.get('type'):
                continue
            elif media_type_name in ['application']:
                continue
            entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>""" % {
                'media_type': media_type_name,
                'media_url': media_url,
            }

    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get('link') or str(entry.get('published'))

    if not entry.get('title') and entry.get('story_content'):
        story_title = strip_tags(entry['story_content'])
        if len(story_title) > 80:
            story_title = story_title[:80] + '...'
        entry['title'] = story_title

    entry['title'] = strip_tags(entry.get('title'))
    entry['author'] = strip_tags(entry.get('author'))
    return entry
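
# A minimal sketch of how a feedparser-style entry flows through; the field
# values here are hypothetical:
#   >>> entry = pre_process_story({
#   ...     'title': '<b>Hello</b>',
#   ...     'summary': '<p>Body</p>',
#   ...     'link': 'http://example.com/1',
#   ...     'published_parsed': (2013, 11, 21, 17, 0, 0, 3, 325, -1),
#   ... })
#   >>> entry['title'], entry['story_content'], entry['guid']
#   ('Hello', '<p>Body</p>', 'http://example.com/1')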

class bunch(dict):
    """A dict whose members can also be read and set as attributes."""
    def __init__(self, indict=None, attribute=None):
        if indict is None:
            indict = {}
        # Set any attributes here, before initialisation; these remain
        # normal attributes rather than dict items.
        self.attribute = attribute
        dict.__init__(self, indict)
        self.__initialised = True
        # After initialisation, setting an attribute is the same as setting an item.

    def __getattr__(self, item):
        """Maps values to attributes.
        Only called if there *isn't* an attribute with this name.
        """
        try:
            return self.__getitem__(item)
        except KeyError:
            return None

    def __setattr__(self, item, value):
        """Maps attributes to values, but only once we are initialised."""
        if '_bunch__initialised' not in self.__dict__:
            # This test allows attributes to be set in the __init__ method.
            return dict.__setattr__(self, item, value)
        elif item in self.__dict__:
            # Any normal attributes are handled normally.
            dict.__setattr__(self, item, value)
        else:
            self.__setitem__(item, value)
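
# Usage sketch: items double as attributes, and missing keys read as None.
#   >>> b = bunch({'title': 'foo'})
#   >>> b.title
#   'foo'
#   >>> b.subtitle is None
#   True
#   >>> b.subtitle = 'bar'   # stored as a dict item, not an instance attribute
#   >>> b['subtitle']
#   'bar'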

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ' '.join(self.fed)

def strip_tags(html):
    if not html:
        return ''
    return strip_tags_django(html)
    # Unreachable legacy fallback, kept for reference:
    # s = MLStripper()
    # s.feed(html)
    # return s.get_data()
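
# e.g. strip_tags('<p>Hi <b>there</b></p>') should yield 'Hi there'
# (delegating to Django's strip_tags).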

def strip_comments(html_string):
    return COMMENTS_RE.sub('', html_string)
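
# e.g. strip_comments('one <!-- hidden --> two') -> 'one  two'. Note that
# without re.DOTALL the regex will not match a comment that spans lines;
# the lxml-based variants below handle those.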

def strip_comments__lxml2(html_string=""):
    if not html_string:
        return html_string
    tree = lxml.html.fromstring(html_string)
    comments = tree.xpath('//comment()')
    for c in comments:
        p = c.getparent()
        p.remove(c)
    return lxml.etree.tostring(tree)

def strip_comments__lxml(html_string=""):
    if not html_string:
        return html_string
    # Only `comments` is enabled; every other Cleaner filter is switched off
    # so scripts, styles, embeds, and forms pass through untouched.
    params = {
        'comments': True,
        'scripts': False,
        'javascript': False,
        'style': False,
        'links': False,
        'meta': False,
        'page_structure': False,
        'processing_instructions': False,
        'embedded': False,
        'frames': False,
        'forms': False,
        'annoying_tags': False,
        'remove_tags': None,
        'allow_tags': None,
        'remove_unknown_tags': True,
        'safe_attrs_only': False,
    }
    try:
        cleaner = Cleaner(**params)
        html = lxml.html.fromstring(html_string)
        clean_html = cleaner.clean_html(html)
        return lxml.etree.tostring(clean_html)
    except (XMLSyntaxError, ParserError):
        return html_string
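
# Of the three strippers, the regex version avoids a parse/serialize round
# trip; the lxml variants re-serialize the document, which can normalize
# markup or add a wrapping element. (Observation from reading this code,
# not a guarantee from upstream docs.)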

def linkify(*args, **kwargs):
    return xhtml_unescape_tornado(linkify_tornado(*args, **kwargs))

def truncate_chars(value, max_length):
    if len(value) <= max_length:
        return value
    truncd_val = value[:max_length]
    if value[max_length] != " ":
        # Truncation landed mid-word; back up to the last space, if any.
        rightmost_space = truncd_val.rfind(" ")
        if rightmost_space != -1:
            truncd_val = truncd_val[:rightmost_space]
    return truncd_val + "..."
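
# e.g. truncate_chars('The quick brown fox', 12) -> 'The quick...'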

def image_size(datastream):
    datastream = reseekfile.ReseekFile(datastream)
    data = str(datastream.read(30))
    size = len(data)
    height = -1
    width = -1
    content_type = ''

    # Handle GIFs: width and height are little-endian shorts at bytes 6-9.
    if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):
        content_type = 'image/gif'
        w, h = struct.unpack("<HH", data[6:10])
        width = int(w)
        height = int(h)

    # See the PNG 2nd Edition spec (http://www.w3.org/TR/PNG/).
    # Bytes 0-7 are the signature, then a 4-byte chunk length, then 'IHDR'
    # and finally the 4-byte width and height.
    elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')
          and (data[12:16] == 'IHDR')):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[16:24])
        width = int(w)
        height = int(h)

    # Maybe this is an older PNG version.
    elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[8:16])
        width = int(w)
        height = int(h)

    # Handle JPEGs: walk the segment markers until a Start-Of-Frame
    # (0xC0-0xC3) segment, which carries the image height and width.
    elif (size >= 2) and data.startswith('\377\330'):
        content_type = 'image/jpeg'
        datastream.seek(0)
        datastream.read(2)
        b = datastream.read(1)
        try:
            w = 0
            h = 0
            while (b and ord(b) != 0xDA):
                while (ord(b) != 0xFF):
                    b = datastream.read(1)
                while (ord(b) == 0xFF):
                    b = datastream.read(1)
                if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
                    datastream.read(3)
                    h, w = struct.unpack(">HH", datastream.read(4))
                    break
                else:
                    # Skip this segment using its declared 2-byte length.
                    datastream.read(int(struct.unpack(">H", datastream.read(2))[0]) - 2)
                b = datastream.read(1)
            width = int(w)
            height = int(h)
        except (struct.error, ValueError):
            pass
    return content_type, width, height
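
# Usage sketch (hypothetical path; any file-like stream of raw image bytes works):
#   >>> content_type, width, height = image_size(open('logo.png', 'rb'))
#   >>> content_type
#   'image/png'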

def htmldiff(old_html, new_html):
    try:
        old_html_tokens = tokenize(old_html, include_hrefs=False)
        new_html_tokens = tokenize(new_html, include_hrefs=False)
    except (KeyError, ParserError):
        return new_html
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
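
# Usage sketch: changed text comes back wrapped in <ins>/<del> tags, e.g.
#   >>> htmldiff('<p>old text</p>', '<p>new text</p>')
#   '<p><ins>new</ins> <del>old</del> text</p>'
# (exact markup can vary between lxml versions)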