import re
import datetime
import struct
import dateutil.parser
from HTMLParser import HTMLParser
from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens
from lxml.etree import ParserError, XMLSyntaxError
import lxml.html
import lxml.etree
from lxml.html.clean import Cleaner
from itertools import chain
from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django
from utils.tornado_escape import linkify as linkify_tornado
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
from vendor import reseekfile
from utils import feedparser

# COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
# Match HTML comments; DOTALL lets a single comment span multiple lines.
COMMENTS_RE = re.compile(r'<!--.*?-->', re.DOTALL)

def midnight_today(now=None):
    if not now:
        now = datetime.datetime.now()
    return now.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

def midnight_yesterday(midnight=None):
    if not midnight:
        midnight = midnight_today()
    return midnight - datetime.timedelta(days=1)

def beginning_of_this_month():
    return datetime.datetime.now().replace(day=1, hour=0, minute=0, second=0, microsecond=0)

def format_story_link_date__short(date, now=None):
    if not now:
        now = datetime.datetime.now()
    date = date.replace(tzinfo=None)
    midnight = midnight_today(now)
    if date >= midnight:
        return date.strftime('%I:%M%p').lstrip('0').lower()
    elif date >= midnight_yesterday(midnight):
        return 'Yesterday, ' + date.strftime('%I:%M%p').lstrip('0').lower()
    else:
        return date.strftime('%d %b %Y, ') + date.strftime('%I:%M%p').lstrip('0').lower()
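
# Illustrative examples, assuming now = datetime.datetime(2013, 12, 27, 18, 0):
#   format_story_link_date__short(datetime.datetime(2013, 12, 27, 15, 45), now)  -> '3:45pm'
#   format_story_link_date__short(datetime.datetime(2013, 12, 26, 15, 45), now)  -> 'Yesterday, 3:45pm'
#   format_story_link_date__short(datetime.datetime(2013, 11, 1, 15, 45), now)   -> '01 Nov 2013, 3:45pm'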

def format_story_link_date__long(date, now=None):
    if not now:
        now = datetime.datetime.now()
    date = date.replace(tzinfo=None)
    midnight = midnight_today(now)
    parsed_date = DateFormat(date)

    if date >= midnight:
        return 'Today, ' + parsed_date.format('F jS ') + date.strftime('%I:%M%p').lstrip('0').lower()
    elif date >= midnight_yesterday(midnight):
        return 'Yesterday, ' + parsed_date.format('F jS g:ia').replace('.', '')
    elif date >= beginning_of_this_month():
        return parsed_date.format('l, F jS g:ia').replace('.', '')
    else:
        return parsed_date.format('l, F jS, Y g:ia').replace('.', '')
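
# Illustrative output shapes, assuming now = datetime.datetime(2013, 12, 27, 18, 0):
#   today's story      -> 'Today, December 27th 3:45pm'
#   yesterday's story  -> 'Yesterday, December 26th 3:45pm'
#   earlier this month -> 'Friday, December 6th 3:45pm'
#   anything older     -> 'Wednesday, July 4th, 2012 3:45pm'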

def _extract_date_tuples(date):
    parsed_date = DateFormat(date)
    date_tuple = datetime.datetime.timetuple(date)[:3]
    today_tuple = datetime.datetime.timetuple(datetime.datetime.utcnow())[:3]
    today = datetime.datetime.today()
    yesterday_tuple = datetime.datetime.timetuple(today - datetime.timedelta(1))[:3]

    return parsed_date, date_tuple, today_tuple, yesterday_tuple

def pre_process_story(entry, encoding):
    # Parse the publish date, then sanity-check it into a usable range.
    publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
    if publish_date:
        publish_date = datetime.datetime(*publish_date[:6])
    if not publish_date and entry.get('published'):
        try:
            publish_date = dateutil.parser.parse(entry.get('published')).replace(tzinfo=None)
        except (ValueError, TypeError, OverflowError):
            pass

    if publish_date:
        entry['published'] = publish_date
    else:
        entry['published'] = datetime.datetime.utcnow()

    if entry['published'] < datetime.datetime(2000, 1, 1):
        entry['published'] = datetime.datetime.utcnow()

    if entry['published'] > datetime.datetime.now() + datetime.timedelta(days=1):
        entry['published'] = datetime.datetime.now()

    # entry_link = entry.get('link') or ''
    # protocol_index = entry_link.find("://")
    # if protocol_index != -1:
    #     entry['link'] = (entry_link[:protocol_index+3]
    #                      + urlquote(entry_link[protocol_index+3:]))
    # else:
    #     entry['link'] = urlquote(entry_link)
    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary
    summary = entry.get('summary') or ""
    content = ""
    if not summary and 'summary_detail' in entry:
        summary = entry['summary_detail'].get('value', '')
    if entry.get('content'):
        content = entry['content'][0].get('value', '')
    if len(content) > len(summary):
        entry['story_content'] = content.strip()
    else:
        entry['story_content'] = summary.strip()

    if 'summary_detail' in entry and entry['summary_detail'].get('type', None) == 'text/plain':
        try:
            entry['story_content'] = feedparser._sanitizeHTML(entry['story_content'], encoding, 'text/plain')
            if encoding and not isinstance(entry['story_content'], unicode):
                entry['story_content'] = entry['story_content'].decode(encoding, 'ignore')
        except UnicodeEncodeError:
            pass

    # Add each media enclosure as a Download link
    for media_content in chain(entry.get('media_content', [])[:15], entry.get('links', [])[:15]):
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', media_content.get('medium', ''))
        if media_url and media_type and entry['story_content'] and media_url not in entry['story_content']:
            media_type_name = media_type.split('/')[0]
            if 'audio' in media_type and media_url:
                entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>""" % {
                    'media_url': media_url,
                    'media_type': media_type
                }
            elif 'image' in media_type and media_url:
                entry['story_content'] += """<br><br><img src="%s" />""" % media_url
                continue
            elif media_content.get('rel') == 'alternative' or 'text' in media_content.get('type', ''):
                continue
            elif media_type_name in ['application']:
                continue
            entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>""" % {
                'media_type': media_type_name,
                'media_url': media_url,
            }

    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get('link') or str(entry.get('published'))

    if not entry.get('title') and entry.get('story_content'):
        story_title = strip_tags(entry['story_content'])
        if len(story_title) > 80:
            story_title = story_title[:80] + '...'
        entry['title'] = story_title
    if not entry.get('title') and entry.get('link'):
        entry['title'] = entry['link']

    entry['title'] = strip_tags(entry.get('title'))
    entry['author'] = strip_tags(entry.get('author'))

    entry['story_content'] = attach_media_scripts(entry['story_content'])

    return entry
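
# A rough usage sketch (the feed URL is made up; feedparser's parse() and
# .encoding are the real API):
#
#   parsed = feedparser.parse('http://example.com/feed.xml')
#   for entry in parsed.entries:
#       entry = pre_process_story(entry, parsed.encoding)
#       # entry['published']     -> datetime, clamped between 2000-01-01 and now+1 day
#       # entry['story_content'] -> longer of content/summary, with enclosures appended
#       # entry['title']         -> tag-stripped, falling back to an excerpt or the link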

def attach_media_scripts(content):
    if 'instagram-media' in content and '<script' not in content:
        content += '<script async defer src="https://platform.instagram.com/en_US/embeds.js"></script><script>(function(){if(window.instgrm)window.instgrm.Embeds.process()})()</script>'
    if 'twitter-tweet' in content and '<script' not in content:
        content += '<script id="twitter-wjs" type="text/javascript" async defer src="https://platform.twitter.com/widgets.js"></script>'
    if 'imgur-embed-pub' in content and '<script' not in content:
        content += '<script async src="https://s.imgur.com/min/embed.js" charset="utf-8"></script>'
    return content
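
# Illustrative: each embed type gets its loader script appended at most once, e.g.
#   attach_media_scripts('<blockquote class="twitter-tweet">...</blockquote>')
#   -> the original HTML plus the platform.twitter.com/widgets.js <script> tag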

class MLStripper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ' '.join(self.fed)
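
# Illustrative usage of the parser-based stripper:
#   s = MLStripper()
#   s.feed('<p>Hello <b>world</b></p>')
#   s.get_data()  -> 'Hello  world'  (data chunks joined with spaces)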

def strip_tags(html):
    if not html:
        return ''
    return strip_tags_django(html)

def strip_comments(html_string):
    return COMMENTS_RE.sub('', html_string)
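
# Illustrative (DOTALL lets the pattern span newlines inside a comment):
#   strip_comments('<p>a<!-- hidden\nnote -->b</p>')  -> '<p>ab</p>'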

def strip_comments__lxml2(html_string=""):
    if not html_string: return html_string
    tree = lxml.html.fromstring(html_string)
    comments = tree.xpath('//comment()')

    for c in comments:
        p = c.getparent()
        p.remove(c)

    return lxml.etree.tostring(tree)

def strip_comments__lxml(html_string=""):
    if not html_string: return html_string

    # Remove only comments; every other Cleaner filter is switched off.
    params = {
        'comments': True,
        'scripts': False,
        'javascript': False,
        'style': False,
        'links': False,
        'meta': False,
        'page_structure': False,
        'processing_instructions': False,
        'embedded': False,
        'frames': False,
        'forms': False,
        'annoying_tags': False,
        'remove_tags': None,
        'allow_tags': None,
        'remove_unknown_tags': True,
        'safe_attrs_only': False,
    }
    try:
        cleaner = Cleaner(**params)
        html = lxml.html.fromstring(html_string)
        clean_html = cleaner.clean_html(html)

        return lxml.etree.tostring(clean_html)
    except (XMLSyntaxError, ParserError):
        return html_string
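
# Illustrative (the exact serialization can vary by lxml version):
#   strip_comments__lxml('<p>a<!-- hidden -->b</p>')  -> '<p>ab</p>'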

def prep_for_search(html):
    html = strip_tags_django(html)
    html = html.lower()
    html = xhtml_unescape_tornado(html)

    return html[:100000]

def linkify(*args, **kwargs):
    return xhtml_unescape_tornado(linkify_tornado(*args, **kwargs))

def truncate_chars(value, max_length):
    if len(value) <= max_length:
        return value

    truncd_val = value[:max_length]
    if value[max_length] != " ":
        rightmost_space = truncd_val.rfind(" ")
        if rightmost_space != -1:
            truncd_val = truncd_val[:rightmost_space]

    return truncd_val + "..."
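
# Illustrative: truncation backs up to the last word boundary, e.g.
#   truncate_chars("The quick brown fox", 10)  -> 'The quick...'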

def image_size(datastream):
    datastream = reseekfile.ReseekFile(datastream)
    data = str(datastream.read(30))
    size = len(data)
    height = -1
    width = -1
    content_type = ''

    # handle GIFs
    if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):
        # GIF: 6-byte signature, then 16-bit little-endian width and height
        content_type = 'image/gif'
        w, h = struct.unpack("<HH", data[6:10])
        width = int(w)
        height = int(h)

    # See PNG 2. Edition spec (http://www.w3.org/TR/PNG/)
    # Bytes 0-7 are below, 4-byte chunk length, then 'IHDR'
    # and finally the 4-byte width, height
    elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')
          and (data[12:16] == 'IHDR')):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[16:24])
        width = int(w)
        height = int(h)

    # Maybe this is for an older PNG version.
    elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'):
        # No IHDR chunk name; width and height follow the signature directly
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[8:16])
        width = int(w)
        height = int(h)

    # handle JPEGs
    elif (size >= 2) and data.startswith('\377\330'):
        content_type = 'image/jpeg'
        datastream.seek(0)
        datastream.read(2)
        b = datastream.read(1)
        try:
            w = 0
            h = 0
            while (b and ord(b) != 0xDA):
                while (ord(b) != 0xFF): b = datastream.read(1)
                while (ord(b) == 0xFF): b = datastream.read(1)
                if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
                    # SOF0-SOF3 frame header: skip length + precision, read height, width
                    datastream.read(3)
                    h, w = struct.unpack(">HH", datastream.read(4))
                    break
                else:
                    # skip this marker segment (its length includes the 2 length bytes)
                    datastream.read(int(struct.unpack(">H", datastream.read(2))[0]) - 2)
                b = datastream.read(1)
            width = int(w)
            height = int(h)
        except struct.error:
            pass
        except ValueError:
            pass

    return content_type, width, height
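
# A minimal sketch: a handcrafted 24-byte PNG header, purely for illustration.
#   import StringIO
#   png = '\211PNG\r\n\032\n' + '\x00\x00\x00\rIHDR' + struct.pack('>LL', 1, 1)
#   image_size(StringIO.StringIO(png))  -> ('image/png', 1, 1)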

def htmldiff(old_html, new_html):
    try:
        old_html_tokens = tokenize(old_html, include_hrefs=False)
        new_html_tokens = tokenize(new_html, include_hrefs=False)
    except (KeyError, ParserError):
        return new_html

    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()

    return fixup_ins_del_tags(result)
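
# Illustrative (typical shape of lxml's htmldiff output):
#   htmldiff('<p>one</p>', '<p>one two</p>')  -> '<p>one <ins>two</ins></p>'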