Auto-linkifying comments and replies, and stripping HTML from comments, replies, story titles, tags, and authors.

Samuel Clay 2012-07-21 16:38:37 -07:00
parent 1872b0a091
commit 18428a32a1
9 changed files with 396 additions and 12 deletions
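
In short, shared-story comments and replies now pass through strip_tags and then linkify before being stored or returned. A minimal sketch of that pipeline, using the helpers this commit wires up in utils/story_functions.py (the sample comment is made up):

from utils.story_functions import strip_tags, linkify

raw = 'Nice post! <script>alert(1)</script> See http://tornadoweb.org'
clean = strip_tags(raw)
# -> 'Nice post! alert(1) See http://tornadoweb.org'
html = linkify(clean)
# -> 'Nice post! alert(1) See <a href="http://tornadoweb.org">http://tornadoweb.org</a>'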


@@ -44,6 +44,7 @@ from utils.story_functions import format_story_link_date__short
from utils.story_functions import format_story_link_date__long
from utils.story_functions import bunch
from utils.story_functions import story_score
+ from utils.story_functions import strip_tags
from utils import log as logging
from utils.view_functions import get_argument_or_404, render_to, is_true
from utils.ratelimit import ratelimit
@@ -478,7 +479,7 @@ def load_single_feed(request, feed_id):
story['shared'] = True
shared_date = localtime_for_timezone(shared_stories[story['id']]['shared_date'], user.profile.timezone)
story['shared_date'] = format_story_link_date__long(shared_date, now)
- story['shared_comments'] = shared_stories[story['id']]['comments']
+ story['shared_comments'] = strip_tags(shared_stories[story['id']]['comments'])
else:
story['read_status'] = 1
story['intelligence'] = {


@@ -27,7 +27,7 @@ from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
- from utils.story_functions import pre_process_story
+ from utils.story_functions import strip_tags
from utils.diff import HTMLDiff
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -724,8 +724,6 @@ class Feed(models.Model):
}
for story in stories:
- story = pre_process_story(story)
if not story.get('title'):
continue
@@ -979,8 +977,8 @@ class Feed(models.Model):
if not tagname or tagname == ' ':
continue
fcat.append(tagname)
- fcat = [t[:250] for t in fcat]
- return fcat[:12]
+ fcat = [strip_tags(t)[:250] for t in fcat[:12]]
+ return fcat
def get_permalink(self, entry):
link = entry.get('link')
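
As an aside, a hypothetical run of the reworked tag handling above: the cut to twelve tags now happens before each tag is stripped of HTML and clipped to 250 characters.

tags = ['<b>python</b>', 'django'] + ['tag%d' % i for i in range(20)]
fcat = [strip_tags(t)[:250] for t in tags[:12]]
# fcat[0] == 'python', len(fcat) == 12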


@@ -24,7 +24,7 @@ from vendor import facebook
from vendor import tweepy
from utils import log as logging
from utils.feed_functions import relative_timesince
- from utils.story_functions import truncate_chars
+ from utils.story_functions import truncate_chars, strip_tags, linkify
from utils import json_functions as json
RECOMMENDATIONS_LIMIT = 5
@@ -475,6 +475,10 @@ class MSocialProfile(mongo.Document):
def send_email_for_new_follower(self, follower_user_id):
user = User.objects.get(pk=self.user_id)
if not user.email or not user.profile.send_emails or self.user_id == follower_user_id:
+ if not user.email:
+ logging.user(user, "~BB~FMNo email to send to, skipping.")
+ elif not user.profile.send_emails:
+ logging.user(user, "~BB~FMDisabled emails, skipping.")
return
emails_sent = MSentEmail.objects.filter(receiver_user_id=user.pk,
@@ -988,6 +992,10 @@ class MSharedStory(mongo.Document):
self.story_original_content_z = zlib.compress(self.story_original_content)
self.story_original_content = None
+ self.comments = linkify(strip_tags(self.comments))
+ for reply in self.replies:
+ reply.comments = linkify(strip_tags(reply.comments))
r = redis.Redis(connection_pool=settings.REDIS_POOL)
share_key = "S:%s:%s" % (self.story_feed_id, self.guid_hash)
r.sadd(share_key, self.user_id)
@@ -1363,6 +1371,10 @@ class MSharedStory(mongo.Document):
user = User.objects.get(pk=user_id)
if not user.email or not user.profile.send_emails:
+ if not user.email:
+ logging.user(user, "~BB~FMNo email to send to, skipping.")
+ elif not user.profile.send_emails:
+ logging.user(user, "~BB~FMDisabled emails, skipping.")
continue
mute_url = "http://%s%s" % (
@@ -1403,6 +1415,10 @@ class MSharedStory(mongo.Document):
story_guid=self.story_guid)
if not original_user.email or not original_user.profile.send_emails:
+ if not original_user.email:
+ logging.user(original_user, "~BB~FMNo email to send to, skipping.")
+ elif not original_user.profile.send_emails:
+ logging.user(original_user, "~BB~FMDisabled emails, skipping.")
return
story_feed = Feed.objects.get(pk=self.story_feed_id)


@@ -25,6 +25,7 @@ from utils.user_functions import get_user, ajax_login_required
from utils.view_functions import render_to
from utils.story_functions import format_story_link_date__short
from utils.story_functions import format_story_link_date__long
+ from utils.story_functions import strip_tags, linkify
from utils import jennyholzer
from vendor.timezones.utilities import localtime_for_timezone
@@ -139,7 +140,7 @@ def load_social_stories(request, user_id, username=None):
shared_date = localtime_for_timezone(shared_stories[story['id']]['shared_date'],
user.profile.timezone)
story['shared_date'] = format_story_link_date__long(shared_date, now)
- story['shared_comments'] = shared_stories[story['id']]['comments']
+ story['shared_comments'] = strip_tags(shared_stories[story['id']]['comments'])
story['intelligence'] = {
'feed': apply_classifier_feeds(classifier_feeds, story['story_feed_id'],
@@ -343,7 +344,7 @@ def mark_story_as_shared(request):
stories, profiles = MSharedStory.stories_with_comments_and_profiles([story], request.user.pk,
check_all=check_all)
story = stories[0]
- story['shared_comments'] = shared_story['comments'] or ""
+ story['shared_comments'] = strip_tags(shared_story['comments'] or "")
if post_to_services:
for service in post_to_services:
@@ -438,7 +439,7 @@ def save_comment_reply(request):
replies = []
for story_reply in shared_story.replies:
if (story_reply.user_id == reply.user_id and
- story_reply.comments == original_message):
+ strip_tags(story_reply.comments) == original_message):
reply.publish_date = story_reply.publish_date
replies.append(reply)
else:
@@ -900,7 +901,7 @@ def load_activities(request):
public = user_id != request.user.pk
page = max(1, int(request.REQUEST.get('page', 1)))
- limit = request.REQUEST.get('limit')
+ limit = request.REQUEST.get('limit', 4)
activities, has_next_page = MActivity.user(user_id, page=page, limit=limit, public=public)
format = request.REQUEST.get('format', None)


@@ -2,6 +2,7 @@ NEWSBLUR.Models.Story = Backbone.Model.extend({
initialize: function() {
this.bind('change:selected', this.change_selected);
+ this.bind('change:shared_comments', this.populate_comments);
this.bind('change:comments', this.populate_comments);
this.bind('change:comment_count', this.populate_comments);
this.populate_comments();


@@ -215,6 +215,7 @@ NEWSBLUR.Views.StoryShareView = Backbone.View.extend({
$share_button.removeClass('NB-saving').removeClass('NB-disabled').text('Share');
$unshare_button.removeClass('NB-saving').removeClass('NB-disabled').text('Delete Share');
$share_sideoption.text(shared_text).closest('.NB-sideoption');
+ $comments_sideoptions.val(this.model.get('shared_comments'));
if (this.options.on_social_page) {
this.model.social_page_story.$el.toggleClass('NB-story-shared', this.model.get('shared'));


@@ -217,12 +217,14 @@ class ProcessFeed:
start_date = datetime.datetime.utcnow()
# end_date = datetime.datetime.utcnow()
story_guids = []
+ stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get('published') < start_date:
start_date = story.get('published')
# if story.get('published') > end_date:
# end_date = story.get('published')
+ stories.append(story)
story_guids.append(story.get('guid') or story.get('link'))
existing_stories = list(MStory.objects(
@@ -236,7 +238,8 @@ class ProcessFeed:
# | (Q(story_guid__in=story_guids)),
# story_feed=self.feed
# ).order_by('-story_date')
- ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories, verbose=self.options['verbose'])
+ ret_values = self.feed.add_update_stories(stories, existing_stories,
+ verbose=self.options['verbose'])
if ((not self.feed.is_push or self.options.get('force'))
and hasattr(self.fpf, 'feed') and


@@ -2,7 +2,9 @@ import datetime
from HTMLParser import HTMLParser
from itertools import chain
from django.utils.dateformat import DateFormat
+ from django.utils.html import strip_tags as strip_tags_django
from django.conf import settings
+ from utils.tornado_escape import linkify as linkify_tornado
def story_score(story, bottom_delta=None):
# A) Date - Assumes story is unread and within unread range
@@ -110,6 +112,9 @@ def pre_process_story(entry):
story_title = story_title[:80] + '...'
entry['title'] = story_title
+ entry['title'] = strip_tags(entry.get('title'))
+ entry['author'] = strip_tags(entry.get('author'))
return entry
class bunch(dict):
@@ -156,9 +161,16 @@ class MLStripper(HTMLParser):
return ' '.join(self.fed)
def strip_tags(html):
+ if not html:
+ return ''
+ return strip_tags_django(html)
- s = MLStripper()
- s.feed(html)
- return s.get_data()
+ def linkify(*args, **kwargs):
+ return linkify_tornado(*args, **kwargs)
def truncate_chars(value, max_length):
if len(value) <= max_length:
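
A quick sketch of the rewritten strip_tags above, assuming django.utils.html.strip_tags behaves as documented (sample inputs are made up):

from utils.story_functions import strip_tags

strip_tags(None)                      # -> '' (the guard short-circuits before Django is called)
strip_tags('<em>Breaking</em> news')  # -> 'Breaking news'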

utils/tornado_escape.py (new file, +351 lines)

@@ -0,0 +1,351 @@
#!/usr/bin/env python
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Escaping/unescaping methods for HTML, JSON, URLs, and others.
Also includes a few other miscellaneous string manipulation functions that
have crept in over time.
"""
from __future__ import absolute_import, division, with_statement
import htmlentitydefs
import re
import sys
import urllib
# Python3 compatibility: On python2.5, introduce the bytes alias from 2.6
try:
bytes
except Exception:
bytes = str
try:
from urlparse import parse_qs # Python 2.6+
except ImportError:
from cgi import parse_qs
# json module is in the standard library as of python 2.6; fall back to
# simplejson if present for older versions.
try:
import json
assert hasattr(json, "loads") and hasattr(json, "dumps")
_json_decode = json.loads
_json_encode = json.dumps
except Exception:
try:
import simplejson
_json_decode = lambda s: simplejson.loads(_unicode(s))
_json_encode = lambda v: simplejson.dumps(v)
except ImportError:
try:
# For Google AppEngine
from django.utils import simplejson
_json_decode = lambda s: simplejson.loads(_unicode(s))
_json_encode = lambda v: simplejson.dumps(v)
except ImportError:
def _json_decode(s):
raise NotImplementedError(
"A JSON parser is required, e.g., simplejson at "
"http://pypi.python.org/pypi/simplejson/")
_json_encode = _json_decode
_XHTML_ESCAPE_RE = re.compile('[&<>"]')
_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}
def xhtml_escape(value):
"""Escapes a string so it is valid within XML or XHTML."""
return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
to_basestring(value))
def xhtml_unescape(value):
"""Un-escapes an XML-escaped string."""
return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
def json_encode(value):
"""JSON-encodes the given Python object."""
# JSON permits but does not require forward slashes to be escaped.
# This is useful when json data is emitted in a <script> tag
# in HTML, as it prevents </script> tags from prematurely terminating
# the javascript. Some json libraries do this escaping by default,
# although python's standard library does not, so we do it here.
# http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
return _json_encode(recursive_unicode(value)).replace("</", "<\\/")
def json_decode(value):
"""Returns Python objects for the given JSON string."""
return _json_decode(to_basestring(value))
def squeeze(value):
"""Replace all sequences of whitespace chars with a single space."""
return re.sub(r"[\x00-\x20]+", " ", value).strip()
def url_escape(value):
"""Returns a valid URL-encoded version of the given value."""
return urllib.quote_plus(utf8(value))
# python 3 changed things around enough that we need two separate
# implementations of url_unescape. We also need our own implementation
# of parse_qs since python 3's version insists on decoding everything.
if sys.version_info[0] < 3:
def url_unescape(value, encoding='utf-8'):
"""Decodes the given value from a URL.
The argument may be either a byte or unicode string.
If encoding is None, the result will be a byte string. Otherwise,
the result is a unicode string in the specified encoding.
"""
if encoding is None:
return urllib.unquote_plus(utf8(value))
else:
return unicode(urllib.unquote_plus(utf8(value)), encoding)
parse_qs_bytes = parse_qs
else:
def url_unescape(value, encoding='utf-8'):
"""Decodes the given value from a URL.
The argument may be either a byte or unicode string.
If encoding is None, the result will be a byte string. Otherwise,
the result is a unicode string in the specified encoding.
"""
if encoding is None:
return urllib.parse.unquote_to_bytes(value)
else:
return urllib.parse.unquote_plus(to_basestring(value), encoding=encoding)
def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
"""Parses a query string like urlparse.parse_qs, but returns the
values as byte strings.
Keys still become type str (interpreted as latin1 in python3!)
because it's too painful to keep them as byte strings in
python3 and in practice they're nearly always ascii anyway.
"""
# This is gross, but python3 doesn't give us another way.
# Latin1 is the universal donor of character encodings.
result = parse_qs(qs, keep_blank_values, strict_parsing,
encoding='latin1', errors='strict')
encoded = {}
for k, v in result.iteritems():
encoded[k] = [i.encode('latin1') for i in v]
return encoded
_UTF8_TYPES = (bytes, type(None))
def utf8(value):
"""Converts a string argument to a byte string.
If the argument is already a byte string or None, it is returned unchanged.
Otherwise it must be a unicode string and is encoded as utf8.
"""
if isinstance(value, _UTF8_TYPES):
return value
assert isinstance(value, unicode)
return value.encode("utf-8")
_TO_UNICODE_TYPES = (unicode, type(None))
def to_unicode(value):
"""Converts a string argument to a unicode string.
If the argument is already a unicode string or None, it is returned
unchanged. Otherwise it must be a byte string and is decoded as utf8.
"""
if isinstance(value, _TO_UNICODE_TYPES):
return value
assert isinstance(value, bytes)
return value.decode("utf-8")
# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode
# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
if str is unicode:
native_str = to_unicode
else:
native_str = utf8
_BASESTRING_TYPES = (basestring, type(None))
def to_basestring(value):
"""Converts a string argument to a subclass of basestring.
In python2, byte and unicode strings are mostly interchangeable,
so functions that deal with a user-supplied argument in combination
with ascii string constants can use either and should return the type
the user supplied. In python3, the two types are not interchangeable,
so this method is needed to convert byte strings to unicode.
"""
if isinstance(value, _BASESTRING_TYPES):
return value
assert isinstance(value, bytes)
return value.decode("utf-8")
def recursive_unicode(obj):
"""Walks a simple data structure, converting byte strings to unicode.
Supports lists, tuples, and dictionaries.
"""
if isinstance(obj, dict):
return dict((recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.iteritems())
elif isinstance(obj, list):
return list(recursive_unicode(i) for i in obj)
elif isinstance(obj, tuple):
return tuple(recursive_unicode(i) for i in obj)
elif isinstance(obj, bytes):
return to_unicode(obj)
else:
return obj
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")
def linkify(text, shorten=False, extra_params="",
require_protocol=False, permitted_protocols=["http", "https"]):
"""Converts plain text into HTML with links.
For example: ``linkify("Hello http://tornadoweb.org!")`` would return
``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``
Parameters:
shorten: Long urls will be shortened for display.
extra_params: Extra text to include in the link tag, or a callable
taking the link as an argument and returning the extra text
e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
or::
def extra_params_cb(url):
if url.startswith("http://example.com"):
return 'class="internal"'
else:
return 'class="external" rel="nofollow"'
linkify(text, extra_params=extra_params_cb)
require_protocol: Only linkify urls which include a protocol. If this is
False, urls such as www.facebook.com will also be linkified.
permitted_protocols: List (or set) of protocols which should be linkified,
e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
It is very unsafe to include protocols such as "javascript".
"""
if extra_params and not callable(extra_params):
extra_params = " " + extra_params.strip()
def make_link(m):
url = m.group(1)
proto = m.group(2)
if require_protocol and not proto:
return url # no protocol, no linkify
if proto and proto not in permitted_protocols:
return url # bad protocol, no linkify
href = m.group(1)
if not proto:
href = "http://" + href # no proto specified, use http
if callable(extra_params):
params = " " + extra_params(href).strip()
else:
params = extra_params
# clip long urls. max_len is just an approximation
max_len = 30
if shorten and len(url) > max_len:
before_clip = url
if proto:
proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for :
else:
proto_len = 0
parts = url[proto_len:].split("/")
if len(parts) > 1:
# Grab the whole host part plus the first bit of the path
# The path is usually not that interesting once shortened
# (no more slug, etc), so it really just provides a little
# extra indication of shortening.
url = url[:proto_len] + parts[0] + "/" + \
parts[1][:8].split('?')[0].split('.')[0]
if len(url) > max_len * 1.5: # still too long
url = url[:max_len]
if url != before_clip:
amp = url.rfind('&')
# avoid splitting html char entities
if amp > max_len - 5:
url = url[:amp]
url += "..."
if len(url) >= len(before_clip):
url = before_clip
else:
# full url is visible on mouse-over (for those who don't
# have a status bar, such as Safari by default)
params += ' title="%s"' % href
return u'<a href="%s"%s>%s</a>' % (href, params, url)
# First HTML-escape so that our strings are all safe.
# The regex is modified to avoid character entities other than &amp; so
# that we won't pick up &quot;, etc.
text = _unicode(xhtml_escape(text))
return _URL_RE.sub(make_link, text)
def _convert_entity(m):
if m.group(1) == "#":
try:
return unichr(int(m.group(2)))
except ValueError:
return "&#%s;" % m.group(2)
try:
return _HTML_UNICODE_MAP[m.group(2)]
except KeyError:
return "&%s;" % m.group(2)
def _build_unicode_map():
unicode_map = {}
for name, value in htmlentitydefs.name2codepoint.iteritems():
unicode_map[name] = unichr(value)
return unicode_map
_HTML_UNICODE_MAP = _build_unicode_map()
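
A usage sketch for the vendored linkify, exercising the parameters its docstring describes (outputs assume the defaults shown above):

from utils.tornado_escape import linkify

linkify(u'Go to www.facebook.com')
# -> u'Go to <a href="http://www.facebook.com">www.facebook.com</a>'
# (no protocol required by default, so http:// is assumed for the href)

linkify(u'Get it at ftp://example.com/file')
# -> u'Get it at ftp://example.com/file' (ftp is not in permitted_protocols)

linkify(u'Hello http://tornadoweb.org!', extra_params='rel="nofollow"')
# -> u'Hello <a href="http://tornadoweb.org" rel="nofollow">http://tornadoweb.org</a>!'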