NewsBlur-viq/utils/story_functions.py

373 lines
13 KiB
Python
Raw Permalink Normal View History

2024-04-24 09:50:42 -04:00
import base64
2009-06-16 03:08:55 +00:00
import datetime
import hashlib
2024-04-24 09:50:42 -04:00
import hmac
2021-03-02 09:46:52 -05:00
import html
2024-04-24 09:50:42 -04:00
import re
import struct
import sys
2024-04-24 09:50:42 -04:00
from binascii import hexlify
from hashlib import sha1
from itertools import chain
2024-04-24 09:50:42 -04:00
from random import randint
import dateutil
import feedparser
import lxml.etree
import lxml.html
from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django
2024-04-24 09:50:42 -04:00
from lxml.etree import ParserError, SerialisationError, XMLSyntaxError
from lxml.html.clean import Cleaner
from lxml.html.diff import fixup_ins_del_tags, htmldiff_tokens, tokenize
from utils.tornado_escape import linkify as linkify_tornado
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
2019-01-17 15:05:32 -05:00
# COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
2024-04-24 09:43:56 -04:00
# Matches one HTML comment, non-greedily. No DOTALL flag is set, so a comment
# that spans a line break is not matched. A raw string is used so the pattern
# carries no invalid string escapes ("\<" / "\>").
COMMENTS_RE = re.compile(r"<!--.*?-->")
2012-10-19 13:53:11 -07:00
def midnight_today(now=None):
    """Return ``now`` truncated to 00:00:00.000000 with tzinfo removed.

    When ``now`` is omitted (or falsy) the current local time from
    ``datetime.datetime.now()`` is used as the reference.
    """
    reference = now if now else datetime.datetime.now()
    return reference.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
2024-04-24 09:43:56 -04:00
def midnight_yesterday(midnight=None):
    """Return the datetime exactly one day before ``midnight``.

    ``midnight`` defaults to the value of ``midnight_today()``.
    """
    one_day = datetime.timedelta(days=1)
    base = midnight if midnight else midnight_today()
    return base - one_day
2024-04-24 09:43:56 -04:00
def beginning_of_this_month():
    """Return midnight on the first day of the current local month."""
    current = datetime.datetime.now()
    return current.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
2024-04-24 09:43:56 -04:00
def format_story_link_date__short(date, now=None):
    """Format ``date`` compactly relative to ``now``.

    Output shapes:
      * on or after today's midnight:      "3:07pm"
      * on or after yesterday's midnight:  "Yesterday, 3:07pm"
      * anything older:                    "05 Jan 2020, 3:07pm"

    ``date`` has its tzinfo dropped before comparison; ``now`` defaults to the
    current local time.
    """
    if not now:
        now = datetime.datetime.now()
    naive_date = date.replace(tzinfo=None)
    today_start = midnight_today(now)

    # 12-hour clock without a leading zero and with a lowercase am/pm suffix.
    clock = naive_date.strftime("%I:%M%p").lstrip("0").lower()

    if naive_date >= today_start:
        return clock
    if naive_date >= midnight_yesterday(today_start):
        return "Yesterday, " + clock
    return naive_date.strftime("%d %b %Y, ") + clock
2009-06-16 03:08:55 +00:00
def format_story_link_date__long(date, now=None):
    """Format ``date`` verbosely relative to ``now``.

    Four tiers are used: "Today, ...", "Yesterday, ...", weekday + month/day
    for dates on or after ``beginning_of_this_month()``, and weekday +
    month/day + year for everything older. ``date`` has its tzinfo dropped
    before comparison; ``now`` defaults to the current local time.
    """
    if not now:
        now = datetime.datetime.now()
    naive_date = date.replace(tzinfo=None)
    today_start = midnight_today(now)
    formatter = DateFormat(naive_date)

    if naive_date >= today_start:
        return (
            "Today, "
            + formatter.format("F jS ")
            + naive_date.strftime("%I:%M%p").lstrip("0").lower()
        )

    # The remaining tiers share one rendering step; only prefix/pattern differ.
    if naive_date >= midnight_yesterday(today_start):
        prefix, pattern = "Yesterday, ", "F jS g:ia"
    elif naive_date >= beginning_of_this_month():
        prefix, pattern = "", "l, F jS g:ia"
    else:
        prefix, pattern = "", "l, F jS, Y g:ia"

    # Dots are stripped from the formatted text (e.g. turning "p.m." into "pm").
    return prefix + formatter.format(pattern).replace(".", "")
2009-06-16 03:08:55 +00:00
def relative_date(d):
    """Describe how long ago ``d`` was, relative to the current UTC time.

    Args:
        d: A naive datetime; it is subtracted from ``datetime.utcnow()``.

    Returns:
        str: "just now", "1 minute ago", "N minutes ago", "1 hour ago",
        "N hours ago", "1 day ago" or "N days ago".
    """
    diff = datetime.datetime.utcnow() - d

    # A future ``d`` yields a negative timedelta, which Python normalizes to
    # days=-1 plus a large positive ``seconds``. Without this guard such a
    # value would be reported as many "hours ago".
    if diff < datetime.timedelta(0):
        return "just now"

    if diff.days == 1:
        return "1 day ago"
    if diff.days > 1:
        return "{} days ago".format(diff.days)

    s = diff.seconds
    if s < 60:
        return "just now"
    if s < 120:
        return "1 minute ago"
    if s < 3600:
        return "{} minutes ago".format(s // 60)
    if s < 7200:
        return "1 hour ago"
    return "{} hours ago".format(s // 3600)
2009-06-16 03:08:55 +00:00
def _extract_date_tuples(date):
    """Return ``(DateFormat(date), date_ymd, today_ymd, yesterday_ymd)``.

    Each ``*_ymd`` value is the first three fields (year, month, day) of a
    time tuple.
    """

    def year_month_day(moment):
        return datetime.datetime.timetuple(moment)[:3]

    parsed_date = DateFormat(date)
    date_tuple = year_month_day(date)
    today_tuple = year_month_day(datetime.datetime.utcnow())
    # NOTE(review): "today" above is taken from utcnow() while "yesterday" is
    # derived from the local-time today(); confirm the mix is intentional.
    local_today = datetime.datetime.today()
    yesterday_tuple = year_month_day(local_today - datetime.timedelta(1))

    return parsed_date, date_tuple, today_tuple, yesterday_tuple
2024-04-24 09:43:56 -04:00
2016-05-11 12:01:20 -07:00
def pre_process_story(entry, encoding):
    """Normalize a parsed feed entry in place and return it.

    The entry is accessed with ``.get`` and item assignment (presumably a
    feedparser entry -- confirm against callers). The following keys are set:

      * ``published``: a naive datetime resolved from the parsed date fields
        or the raw ``published`` string, clamped so that values before
        2000-01-01 or in the future are replaced with the current time.
      * ``guid``: stringified when it is a dict; falls back to ``id``,
        ``link`` or the published date when empty.
      * ``story_content``: the longer of the summary and the first content
        value, with audio/video/image/download markup appended for media
        enclosures and links, plus embed loader scripts.
      * ``title`` / ``author``: HTML tags stripped.

    Args:
        entry: The feed entry to normalize (mutated).
        encoding: Passed to feedparser's sanitizer for ``text/plain``
            summaries and used to decode a bytes result.

    Returns:
        The same ``entry`` object.
    """
    # Do not switch to published_parsed or every story will be dated the fetch time
    publish_date = entry.get("g_parsed") or entry.get("updated_parsed")
    if publish_date:
        publish_date = datetime.datetime(*publish_date[:6])
    if not publish_date and entry.get("published"):
        # Import the submodule explicitly: a bare ``import dateutil`` does not
        # guarantee that ``dateutil.parser`` has been loaded.
        from dateutil.parser import parse as parse_date_string

        try:
            publish_date = parse_date_string(entry.get("published")).replace(tzinfo=None)
        except (ValueError, TypeError, OverflowError):
            pass

    if publish_date:
        entry["published"] = publish_date
    else:
        # No usable date: use the current time plus a random sub-minute offset.
        entry["published"] = datetime.datetime.utcnow() + datetime.timedelta(seconds=randint(0, 59))

    if entry["published"] < datetime.datetime(2000, 1, 1):
        entry["published"] = datetime.datetime.utcnow()

    # Future dated stories get forced to current date.
    # NOTE(review): the checks above use utcnow() while this one uses local
    # now(); confirm the mix is intentional.
    if entry["published"] > datetime.datetime.now():
        entry["published"] = datetime.datetime.now() + datetime.timedelta(seconds=randint(0, 59))

    if isinstance(entry.get("guid"), dict):
        entry["guid"] = str(entry["guid"])

    # Normalize story content/summary
    summary = entry.get("summary") or ""
    content = ""
    if not summary and "summary_detail" in entry:
        # ``or ""`` guards against an explicit None value, which would break len().
        summary = entry["summary_detail"].get("value", "") or ""
    if entry.get("content"):
        content = entry["content"][0].get("value", "") or ""
    if len(content) > len(summary):
        entry["story_content"] = content.strip()
    else:
        entry["story_content"] = summary.strip()
    if not entry["story_content"] and entry.get("subtitle"):
        entry["story_content"] = entry.get("subtitle")

    if "summary_detail" in entry and entry["summary_detail"].get("type", None) == "text/plain":
        try:
            entry["story_content"] = feedparser.sanitizer._sanitize_html(
                entry["story_content"], encoding, "text/plain"
            )
            if encoding and not isinstance(entry["story_content"], str):
                entry["story_content"] = entry["story_content"].decode(encoding, "ignore")
        except UnicodeEncodeError:
            pass

    # Add each media enclosure as a Download link (first 15 of each source).
    for media_content in chain(entry.get("media_content", [])[:15], entry.get("links", [])[:15]):
        media_url = media_content.get("url", media_content.get("href", ""))
        media_type = media_content.get("type", media_content.get("medium", ""))
        # Skip items without a URL/type and URLs already present in the content.
        if not (media_url and media_type) or media_url in entry["story_content"]:
            continue

        media_type_name = media_type.split("/")[0]
        if "audio" in media_type:
            entry["story_content"] += (
                "<br><br>\n"
                '<audio controls="controls" preload="none">\n'
                '<source src="%(media_url)s" type="%(media_type)s" />\n'
                "</audio>"
            ) % {
                "media_url": media_url,
                "media_type": media_type,
            }
        elif "video" in media_type:
            entry["story_content"] += (
                "<br><br>\n"
                '<video controls="controls" preload="none">\n'
                '<source src="%(media_url)s" type="%(media_type)s" />\n'
                "</video>"
            ) % {
                "media_url": media_url,
                "media_type": media_type,
            }
        elif "image" in media_type:
            # Images are shown inline and get no download link.
            entry["story_content"] += """<br><br><img src="%s" />""" % media_url
            continue
        elif media_content.get("rel", "") == "alternative" or "text" in media_content.get("type", ""):
            continue
        elif media_type_name == "application":
            continue

        # Reached for audio/video (after the player) and for any other type
        # that was not skipped above.
        entry["story_content"] += (
            "<br><br>\n"
            'Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>'
        ) % {
            "media_type": media_type_name,
            "media_url": media_url,
        }

    entry["guid"] = entry.get("guid") or entry.get("id") or entry.get("link") or str(entry.get("published"))

    if not entry.get("title"):
        entry["title"] = ""
    entry["title"] = strip_tags(entry.get("title"))
    entry["author"] = strip_tags(entry.get("author"))
    if not entry["author"]:
        entry["author"] = strip_tags(entry.get("credit"))

    entry["story_content"] = attach_media_scripts(entry["story_content"])

    return entry
2024-04-24 09:43:56 -04:00
def attach_media_scripts(content):
    """Append embed loader scripts for Instagram, Twitter and Imgur markup.

    Content that already contains a ``<script`` tag is returned unchanged.
    Otherwise one loader is appended for each embed marker class found.

    Args:
        content: Story HTML as a string.

    Returns:
        str: ``content`` with any needed loader scripts appended.
    """
    # Decide once, against the incoming content. Re-checking after each append
    # would let the first appended loader suppress the others whenever a story
    # mixes embed types.
    if "<script" in content:
        return content

    if "instagram-media" in content:
        content += '<script async defer src="https://platform.instagram.com/en_US/embeds.js"></script><script>(function(){if(window.instgrm)window.instgrm.Embeds.process()})()</script>'
    if "twitter-tweet" in content:
        content += '<script id="twitter-wjs" type="text/javascript" async defer src="https://platform.twitter.com/widgets.js"></script>'
    if "imgur-embed-pub" in content:
        content += '<script async src="https://s.imgur.com/min/embed.js" charset="utf-8"></script>'
    return content
2024-04-24 09:43:56 -04:00
def strip_tags(html):
    """Strip HTML tags via Django's ``strip_tags``; falsy input yields ``""``."""
    return strip_tags_django(html) if html else ""
2024-04-24 09:43:56 -04:00
def strip_comments(html_string):
    """Return ``html_string`` with every ``COMMENTS_RE`` match removed."""
    return re.sub(COMMENTS_RE, "", html_string)
def strip_comments__lxml2(html_string=""):
    """Remove HTML comment nodes from ``html_string`` using an lxml tree.

    Args:
        html_string: HTML markup. Falsy input is returned unchanged.

    Returns:
        The output of ``lxml.etree.tostring`` for the cleaned tree (bytes, as
        no encoding is requested), or the original falsy input.
    """
    if not html_string:
        return html_string

    tree = lxml.html.fromstring(html_string)
    for comment in tree.xpath("//comment()"):
        parent = comment.getparent()
        if parent is None:
            # A parentless comment cannot be detached from anything.
            continue

        # lxml stores the text that follows a node in that node's ``tail``,
        # and ``remove()`` discards it with the node. Re-attach the text to
        # the preceding sibling (or the parent) so only the comment is lost.
        tail = comment.tail
        if tail:
            previous = comment.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + tail
            else:
                parent.text = (parent.text or "") + tail
        parent.remove(comment)

    return lxml.etree.tostring(tree)
2024-04-24 09:43:56 -04:00
def strip_comments__lxml(html_string=""):
    """Strip comments from ``html_string`` with an lxml ``Cleaner``.

    The cleaner is configured with ``comments`` and ``remove_unknown_tags``
    enabled and every other listed option disabled or unset. Falsy input is
    returned unchanged, as is any input that raises a parse or serialization
    error.
    """
    if not html_string:
        return html_string

    switched_off = (
        "scripts",
        "javascript",
        "style",
        "links",
        "meta",
        "page_structure",
        "processing_instructions",
        "embedded",
        "frames",
        "forms",
        "annoying_tags",
        "safe_attrs_only",
    )
    cleaner_options = dict.fromkeys(switched_off, False)
    cleaner_options.update(
        comments=True,
        remove_tags=None,
        allow_tags=None,
        remove_unknown_tags=True,
    )

    try:
        cleaner = Cleaner(**cleaner_options)
        document = lxml.html.fromstring(html_string)
        cleaned = cleaner.clean_html(document)
        return lxml.etree.tostring(cleaned).decode()
    except (XMLSyntaxError, ParserError, SerialisationError):
        return html_string
2024-04-24 09:43:56 -04:00
def prep_for_search(html):
    """Reduce ``html`` to search text.

    Tags are stripped, the text is lowercased, entities are unescaped, and the
    result is capped at 100000 characters.
    """
    lowered = strip_tags_django(html).lower()
    return xhtml_unescape_tornado(lowered)[:100000]
2024-04-24 09:43:56 -04:00
def linkify(*args, **kwargs):
    """Run the tornado ``linkify`` helper, then unescape its output.

    All arguments are forwarded unchanged to ``linkify_tornado``.
    """
    linked = linkify_tornado(*args, **kwargs)
    return xhtml_unescape_tornado(linked)
2024-04-24 09:43:56 -04:00
def truncate_chars(value, max_length):
    """Truncate ``value`` to at most ``max_length`` UTF-8 bytes, on a word boundary.

    Args:
        value: Text to shorten (``str``, or already-encoded ``bytes``).
        max_length: Maximum length, measured in UTF-8 encoded bytes.

    Returns:
        str: ``value`` unchanged if it fits. Otherwise the longest prefix that
        fits, cut back to the last space when the cut would split a word,
        followed by ``"..."``. A multi-byte character split by the cut is
        dropped.
    """
    if not isinstance(value, bytes):
        value = value.encode("utf-8")

    if len(value) <= max_length:
        return value.decode("utf-8", "ignore")

    truncd_val = value[:max_length]
    # Slice rather than index: indexing bytes yields an int, which never equals
    # b" ", so an index comparison would treat every cut as mid-word.
    if value[max_length:max_length + 1] != b" ":
        rightmost_space = truncd_val.rfind(b" ")
        if rightmost_space != -1:
            truncd_val = truncd_val[:rightmost_space]

    return truncd_val.decode("utf-8", "ignore") + "..."
def htmldiff(old_html, new_html):
    """Return diff markup between ``old_html`` and ``new_html``.

    Both documents are tokenized with ``include_hrefs=False``. If tokenizing
    either one raises ``KeyError`` or ``ParserError``, ``new_html`` is
    returned unchanged.
    """
    try:
        before_tokens = tokenize(old_html, include_hrefs=False)
        after_tokens = tokenize(new_html, include_hrefs=False)
    except (KeyError, ParserError):
        return new_html

    merged = "".join(htmldiff_tokens(before_tokens, after_tokens))
    return fixup_ins_del_tags(merged.strip())
2019-01-17 15:05:32 -05:00
2019-08-11 14:44:15 -07:00
def create_camo_signed_url(base_url, hmac_key, url):
    """Create a camo signed URL for the specified image URL

    Args:
        base_url: Base URL of the camo installation
        hmac_key: HMAC shared key to be used for signing (``bytes`` or ``str``)
        url: URL of the destination image

    Returns:
        str: A full url that can be used to serve the proxied image, in the
        form ``<base>/<hex HMAC-SHA1 of url>/<hex-encoded url>``
    """
    # hmac requires a bytes key; accept str as well, matching how
    # create_imageproxy_signed_url treats its key.
    if isinstance(hmac_key, str):
        hmac_key = hmac_key.encode()

    url_bytes = url.encode()
    signature = hmac.new(hmac_key, url_bytes, digestmod=sha1).hexdigest()
    hex_url = hexlify(url_bytes).decode()

    return "{base}/{signature}/{hex_url}".format(
        base=base_url.rstrip("/"), signature=signature, hex_url=hex_url
    )
def create_imageproxy_signed_url(base_url, hmac_key, url, options=None):
    """Create a imageproxy signed URL for the specified image URL

    Args:
        base_url: Base URL of the imageproxy installation
        hmac_key: HMAC shared key to be used for signing (``str`` or ``bytes``)
        url: URL of the destination image
        options: Optional imageproxy option(s): a single value (e.g. an int
            or a string) or a list/tuple of values. The caller's sequence is
            never modified.

    Returns:
        str: A full url that can be used to serve the proxied image, in the
        form ``<base>/<options>,sc,s<signature>/<url>``. ``data:`` URLs are
        returned unchanged.
    """
    # Always build a fresh list: "sc" and the signature are appended below and
    # must not leak into a list owned by the caller.
    if not options:
        options = []
    elif isinstance(options, (list, tuple)):
        options = [str(option) for option in options]
    else:
        options = [str(options)]

    if url.startswith("data:"):
        return url

    if isinstance(hmac_key, str):
        hmac_key = hmac_key.encode()

    base_url = base_url.rstrip("/")
    # URL-safe base64 of the HMAC-SHA256 digest of the destination URL.
    signature = base64.urlsafe_b64encode(
        hmac.new(hmac_key, msg=url.encode(), digestmod=hashlib.sha256).digest()
    )
    options.append("sc")
    options.append("s" + signature.decode())

    return "{base}/{options}/{url}".format(base=base_url, options=",".join(options), url=url)