From 18428a32a1964d5d4d57ca813eb6cdc024b61af7 Mon Sep 17 00:00:00 2001 From: Samuel Clay Date: Sat, 21 Jul 2012 16:38:37 -0700 Subject: [PATCH] Auto-linkifying comments and replies, and stripping html from comments, replies, and story titles, tags, and authors. --- apps/reader/views.py | 3 +- apps/rss_feeds/models.py | 8 +- apps/social/models.py | 18 +- apps/social/views.py | 9 +- media/js/newsblur/models/stories.js | 1 + media/js/newsblur/views/story_share_view.js | 1 + utils/feed_fetcher.py | 5 +- utils/story_functions.py | 12 + utils/tornado_escape.py | 351 ++++++++++++++++++++ 9 files changed, 396 insertions(+), 12 deletions(-) create mode 100644 utils/tornado_escape.py diff --git a/apps/reader/views.py b/apps/reader/views.py index 9d26f71f3..cc3fde07c 100644 --- a/apps/reader/views.py +++ b/apps/reader/views.py @@ -44,6 +44,7 @@ from utils.story_functions import format_story_link_date__short from utils.story_functions import format_story_link_date__long from utils.story_functions import bunch from utils.story_functions import story_score +from utils.story_functions import strip_tags from utils import log as logging from utils.view_functions import get_argument_or_404, render_to, is_true from utils.ratelimit import ratelimit @@ -478,7 +479,7 @@ def load_single_feed(request, feed_id): story['shared'] = True shared_date = localtime_for_timezone(shared_stories[story['id']]['shared_date'], user.profile.timezone) story['shared_date'] = format_story_link_date__long(shared_date, now) - story['shared_comments'] = shared_stories[story['id']]['comments'] + story['shared_comments'] = strip_tags(shared_stories[story['id']]['comments']) else: story['read_status'] = 1 story['intelligence'] = { diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index a2b6c0c11..5736cb222 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -27,7 +27,7 @@ from utils.feed_functions import levenshtein_distance from utils.feed_functions import timelimit, TimeoutError from utils.feed_functions import relative_timesince from utils.feed_functions import seconds_timesince -from utils.story_functions import pre_process_story +from utils.story_functions import strip_tags from utils.diff import HTMLDiff ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4) @@ -724,8 +724,6 @@ class Feed(models.Model): } for story in stories: - story = pre_process_story(story) - if not story.get('title'): continue @@ -979,8 +977,8 @@ class Feed(models.Model): if not tagname or tagname == ' ': continue fcat.append(tagname) - fcat = [t[:250] for t in fcat] - return fcat[:12] + fcat = [strip_tags(t)[:250] for t in fcat[:12]] + return fcat def get_permalink(self, entry): link = entry.get('link') diff --git a/apps/social/models.py b/apps/social/models.py index e9f1ea37c..8a87329c2 100644 --- a/apps/social/models.py +++ b/apps/social/models.py @@ -24,7 +24,7 @@ from vendor import facebook from vendor import tweepy from utils import log as logging from utils.feed_functions import relative_timesince -from utils.story_functions import truncate_chars +from utils.story_functions import truncate_chars, strip_tags, linkify from utils import json_functions as json RECOMMENDATIONS_LIMIT = 5 @@ -475,6 +475,10 @@ class MSocialProfile(mongo.Document): def send_email_for_new_follower(self, follower_user_id): user = User.objects.get(pk=self.user_id) if not user.email or not user.profile.send_emails or self.user_id == follower_user_id: + if not user.email: + logging.user(user, "~BB~FMNo email to send to, skipping.") + elif not user.profile.send_emails: + logging.user(user, "~BB~FMDisabled emails, skipping.") return emails_sent = MSentEmail.objects.filter(receiver_user_id=user.pk, @@ -988,6 +992,10 @@ class MSharedStory(mongo.Document): self.story_original_content_z = zlib.compress(self.story_original_content) self.story_original_content = None + self.comments = linkify(strip_tags(self.comments)) + for reply in self.replies: + reply.comments = linkify(strip_tags(reply.comments)) + r = redis.Redis(connection_pool=settings.REDIS_POOL) share_key = "S:%s:%s" % (self.story_feed_id, self.guid_hash) r.sadd(share_key, self.user_id) @@ -1363,6 +1371,10 @@ class MSharedStory(mongo.Document): user = User.objects.get(pk=user_id) if not user.email or not user.profile.send_emails: + if not user.email: + logging.user(user, "~BB~FMNo email to send to, skipping.") + elif not user.profile.send_emails: + logging.user(user, "~BB~FMDisabled emails, skipping.") continue mute_url = "http://%s%s" % ( @@ -1403,6 +1415,10 @@ class MSharedStory(mongo.Document): story_guid=self.story_guid) if not original_user.email or not original_user.profile.send_emails: + if not original_user.email: + logging.user(original_user, "~BB~FMNo email to send to, skipping.") + elif not original_user.profile.send_emails: + logging.user(original_user, "~BB~FMDisabled emails, skipping.") return story_feed = Feed.objects.get(pk=self.story_feed_id) diff --git a/apps/social/views.py b/apps/social/views.py index 5facd1ed5..d9392916b 100644 --- a/apps/social/views.py +++ b/apps/social/views.py @@ -25,6 +25,7 @@ from utils.user_functions import get_user, ajax_login_required from utils.view_functions import render_to from utils.story_functions import format_story_link_date__short from utils.story_functions import format_story_link_date__long +from utils.story_functions import strip_tags, linkify from utils import jennyholzer from vendor.timezones.utilities import localtime_for_timezone @@ -139,7 +140,7 @@ def load_social_stories(request, user_id, username=None): shared_date = localtime_for_timezone(shared_stories[story['id']]['shared_date'], user.profile.timezone) story['shared_date'] = format_story_link_date__long(shared_date, now) - story['shared_comments'] = shared_stories[story['id']]['comments'] + story['shared_comments'] = strip_tags(shared_stories[story['id']]['comments']) story['intelligence'] = { 'feed': apply_classifier_feeds(classifier_feeds, story['story_feed_id'], @@ -343,7 +344,7 @@ def mark_story_as_shared(request): stories, profiles = MSharedStory.stories_with_comments_and_profiles([story], request.user.pk, check_all=check_all) story = stories[0] - story['shared_comments'] = shared_story['comments'] or "" + story['shared_comments'] = strip_tags(shared_story['comments'] or "") if post_to_services: for service in post_to_services: @@ -438,7 +439,7 @@ def save_comment_reply(request): replies = [] for story_reply in shared_story.replies: if (story_reply.user_id == reply.user_id and - story_reply.comments == original_message): + strip_tags(story_reply.comments) == original_message): reply.publish_date = story_reply.publish_date replies.append(reply) else: @@ -900,7 +901,7 @@ def load_activities(request): public = user_id != request.user.pk page = max(1, int(request.REQUEST.get('page', 1))) - limit = request.REQUEST.get('limit') + limit = request.REQUEST.get('limit', 4) activities, has_next_page = MActivity.user(user_id, page=page, limit=limit, public=public) format = request.REQUEST.get('format', None) diff --git a/media/js/newsblur/models/stories.js b/media/js/newsblur/models/stories.js index f9fd0a80e..4ca119707 100644 --- a/media/js/newsblur/models/stories.js +++ b/media/js/newsblur/models/stories.js @@ -2,6 +2,7 @@ NEWSBLUR.Models.Story = Backbone.Model.extend({ initialize: function() { this.bind('change:selected', this.change_selected); + this.bind('change:shared_comments', this.populate_comments); this.bind('change:comments', this.populate_comments); this.bind('change:comment_count', this.populate_comments); this.populate_comments(); diff --git a/media/js/newsblur/views/story_share_view.js b/media/js/newsblur/views/story_share_view.js index ed635dec9..c227924e1 100644 --- a/media/js/newsblur/views/story_share_view.js +++ b/media/js/newsblur/views/story_share_view.js @@ -215,6 +215,7 @@ NEWSBLUR.Views.StoryShareView = Backbone.View.extend({ $share_button.removeClass('NB-saving').removeClass('NB-disabled').text('Share'); $unshare_button.removeClass('NB-saving').removeClass('NB-disabled').text('Delete Share'); $share_sideoption.text(shared_text).closest('.NB-sideoption'); + $comments_sideoptions.val(this.model.get('shared_comments')); if (this.options.on_social_page) { this.model.social_page_story.$el.toggleClass('NB-story-shared', this.model.get('shared')); diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index 438135725..8e0eee278 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -217,12 +217,14 @@ class ProcessFeed: start_date = datetime.datetime.utcnow() # end_date = datetime.datetime.utcnow() story_guids = [] + stories = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get('published') < start_date: start_date = story.get('published') # if story.get('published') > end_date: # end_date = story.get('published') + stories.append(story) story_guids.append(story.get('guid') or story.get('link')) existing_stories = list(MStory.objects( @@ -236,7 +238,8 @@ class ProcessFeed: # | (Q(story_guid__in=story_guids)), # story_feed=self.feed # ).order_by('-story_date') - ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories, verbose=self.options['verbose']) + ret_values = self.feed.add_update_stories(stories, existing_stories, + verbose=self.options['verbose']) if ((not self.feed.is_push or self.options.get('force')) and hasattr(self.fpf, 'feed') and diff --git a/utils/story_functions.py b/utils/story_functions.py index 3f1690c4d..d4638b7e8 100644 --- a/utils/story_functions.py +++ b/utils/story_functions.py @@ -2,7 +2,9 @@ import datetime from HTMLParser import HTMLParser from itertools import chain from django.utils.dateformat import DateFormat +from django.utils.html import strip_tags as strip_tags_django from django.conf import settings +from utils.tornado_escape import linkify as linkify_tornado def story_score(story, bottom_delta=None): # A) Date - Assumes story is unread and within unread range @@ -110,6 +112,9 @@ def pre_process_story(entry): story_title = story_title[:80] + '...' entry['title'] = story_title + entry['title'] = strip_tags(entry.get('title')) + entry['author'] = strip_tags(entry.get('author')) + return entry class bunch(dict): @@ -156,9 +161,16 @@ class MLStripper(HTMLParser): return ' '.join(self.fed) def strip_tags(html): + if not html: + return '' + return strip_tags_django(html) + s = MLStripper() s.feed(html) return s.get_data() + +def linkify(*args, **kwargs): + return linkify_tornado(*args, **kwargs) def truncate_chars(value, max_length): if len(value) <= max_length: diff --git a/utils/tornado_escape.py b/utils/tornado_escape.py new file mode 100644 index 000000000..14eb43e3c --- /dev/null +++ b/utils/tornado_escape.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python +# +# Copyright 2009 Facebook +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Escaping/unescaping methods for HTML, JSON, URLs, and others. + +Also includes a few other miscellaneous string manipulation functions that +have crept in over time. +""" + +from __future__ import absolute_import, division, with_statement + +import htmlentitydefs +import re +import sys +import urllib + +# Python3 compatibility: On python2.5, introduce the bytes alias from 2.6 +try: + bytes +except Exception: + bytes = str + +try: + from urlparse import parse_qs # Python 2.6+ +except ImportError: + from cgi import parse_qs + +# json module is in the standard library as of python 2.6; fall back to +# simplejson if present for older versions. +try: + import json + assert hasattr(json, "loads") and hasattr(json, "dumps") + _json_decode = json.loads + _json_encode = json.dumps +except Exception: + try: + import simplejson + _json_decode = lambda s: simplejson.loads(_unicode(s)) + _json_encode = lambda v: simplejson.dumps(v) + except ImportError: + try: + # For Google AppEngine + from django.utils import simplejson + _json_decode = lambda s: simplejson.loads(_unicode(s)) + _json_encode = lambda v: simplejson.dumps(v) + except ImportError: + def _json_decode(s): + raise NotImplementedError( + "A JSON parser is required, e.g., simplejson at " + "http://pypi.python.org/pypi/simplejson/") + _json_encode = _json_decode + + +_XHTML_ESCAPE_RE = re.compile('[&<>"]') +_XHTML_ESCAPE_DICT = {'&': '&', '<': '<', '>': '>', '"': '"'} + + +def xhtml_escape(value): + """Escapes a string so it is valid within XML or XHTML.""" + return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)], + to_basestring(value)) + + +def xhtml_unescape(value): + """Un-escapes an XML-escaped string.""" + return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value)) + + +def json_encode(value): + """JSON-encodes the given Python object.""" + # JSON permits but does not require forward slashes to be escaped. + # This is useful when json data is emitted in a tags from prematurely terminating + # the javscript. Some json libraries do this escaping by default, + # although python's standard library does not, so we do it here. + # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped + return _json_encode(recursive_unicode(value)).replace("?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&|")*\)))+)""") + + +def linkify(text, shorten=False, extra_params="", + require_protocol=False, permitted_protocols=["http", "https"]): + """Converts plain text into HTML with links. + + For example: ``linkify("Hello http://tornadoweb.org!")`` would return + ``Hello http://tornadoweb.org!`` + + Parameters: + + shorten: Long urls will be shortened for display. + + extra_params: Extra text to include in the link tag, or a callable + taking the link as an argument and returning the extra text + e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``, + or:: + + def extra_params_cb(url): + if url.startswith("http://example.com"): + return 'class="internal"' + else: + return 'class="external" rel="nofollow"' + linkify(text, extra_params=extra_params_cb) + + require_protocol: Only linkify urls which include a protocol. If this is + False, urls such as www.facebook.com will also be linkified. + + permitted_protocols: List (or set) of protocols which should be linkified, + e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]). + It is very unsafe to include protocols such as "javascript". + """ + if extra_params and not callable(extra_params): + extra_params = " " + extra_params.strip() + + def make_link(m): + url = m.group(1) + proto = m.group(2) + if require_protocol and not proto: + return url # not protocol, no linkify + + if proto and proto not in permitted_protocols: + return url # bad protocol, no linkify + + href = m.group(1) + if not proto: + href = "http://" + href # no proto specified, use http + + if callable(extra_params): + params = " " + extra_params(href).strip() + else: + params = extra_params + + # clip long urls. max_len is just an approximation + max_len = 30 + if shorten and len(url) > max_len: + before_clip = url + if proto: + proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for : + else: + proto_len = 0 + + parts = url[proto_len:].split("/") + if len(parts) > 1: + # Grab the whole host part plus the first bit of the path + # The path is usually not that interesting once shortened + # (no more slug, etc), so it really just provides a little + # extra indication of shortening. + url = url[:proto_len] + parts[0] + "/" + \ + parts[1][:8].split('?')[0].split('.')[0] + + if len(url) > max_len * 1.5: # still too long + url = url[:max_len] + + if url != before_clip: + amp = url.rfind('&') + # avoid splitting html char entities + if amp > max_len - 5: + url = url[:amp] + url += "..." + + if len(url) >= len(before_clip): + url = before_clip + else: + # full url is visible on mouse-over (for those who don't + # have a status bar, such as Safari by default) + params += ' title="%s"' % href + + return u'%s' % (href, params, url) + + # First HTML-escape so that our strings are all safe. + # The regex is modified to avoid character entites other than & so + # that we won't pick up ", etc. + text = _unicode(xhtml_escape(text)) + return _URL_RE.sub(make_link, text) + + +def _convert_entity(m): + if m.group(1) == "#": + try: + return unichr(int(m.group(2))) + except ValueError: + return "&#%s;" % m.group(2) + try: + return _HTML_UNICODE_MAP[m.group(2)] + except KeyError: + return "&%s;" % m.group(2) + + +def _build_unicode_map(): + unicode_map = {} + for name, value in htmlentitydefs.name2codepoint.iteritems(): + unicode_map[name] = unichr(value) + return unicode_map + +_HTML_UNICODE_MAP = _build_unicode_map() \ No newline at end of file