From 18428a32a1964d5d4d57ca813eb6cdc024b61af7 Mon Sep 17 00:00:00 2001
From: Samuel Clay <samuel@ofbrooklyn.com>
Date: Sat, 21 Jul 2012 16:38:37 -0700
Subject: [PATCH] Auto-linkifying comments and replies, and stripping html from
 comments, replies, and story titles, tags, and authors.

---
 apps/reader/views.py                        |   3 +-
 apps/rss_feeds/models.py                    |   8 +-
 apps/social/models.py                       |  18 +-
 apps/social/views.py                        |   9 +-
 media/js/newsblur/models/stories.js         |   1 +
 media/js/newsblur/views/story_share_view.js |   1 +
 utils/feed_fetcher.py                       |   5 +-
 utils/story_functions.py                    |  12 +
 utils/tornado_escape.py                     | 351 ++++++++++++++++++++
 9 files changed, 396 insertions(+), 12 deletions(-)
 create mode 100644 utils/tornado_escape.py

diff --git a/apps/reader/views.py b/apps/reader/views.py
index 9d26f71f3..cc3fde07c 100644
--- a/apps/reader/views.py
+++ b/apps/reader/views.py
@@ -44,6 +44,7 @@ from utils.story_functions import format_story_link_date__short
 from utils.story_functions import format_story_link_date__long
 from utils.story_functions import bunch
 from utils.story_functions import story_score
+from utils.story_functions import strip_tags
 from utils import log as logging
 from utils.view_functions import get_argument_or_404, render_to, is_true
 from utils.ratelimit import ratelimit
@@ -478,7 +479,7 @@ def load_single_feed(request, feed_id):
                 story['shared'] = True
                 shared_date = localtime_for_timezone(shared_stories[story['id']]['shared_date'], user.profile.timezone)
                 story['shared_date'] = format_story_link_date__long(shared_date, now)
-                story['shared_comments'] = shared_stories[story['id']]['comments']
+                story['shared_comments'] = strip_tags(shared_stories[story['id']]['comments'])
         else:
             story['read_status'] = 1
         story['intelligence'] = {
diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index a2b6c0c11..5736cb222 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -27,7 +27,7 @@ from utils.feed_functions import levenshtein_distance
 from utils.feed_functions import timelimit, TimeoutError
 from utils.feed_functions import relative_timesince
 from utils.feed_functions import seconds_timesince
-from utils.story_functions import pre_process_story
+from utils.story_functions import strip_tags
 from utils.diff import HTMLDiff
 
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -724,8 +724,6 @@ class Feed(models.Model):
         }
 
         for story in stories:
-            story = pre_process_story(story)
-            
             if not story.get('title'):
                 continue
                 
@@ -979,8 +977,8 @@ class Feed(models.Model):
                     if not tagname or tagname == ' ':
                         continue
                     fcat.append(tagname)
-        fcat = [t[:250] for t in fcat]
-        return fcat[:12]
+        fcat = [strip_tags(t)[:250] for t in fcat[:12]]
+        return fcat
     
     def get_permalink(self, entry):
         link = entry.get('link')
diff --git a/apps/social/models.py b/apps/social/models.py
index e9f1ea37c..8a87329c2 100644
--- a/apps/social/models.py
+++ b/apps/social/models.py
@@ -24,7 +24,7 @@ from vendor import facebook
 from vendor import tweepy
 from utils import log as logging
 from utils.feed_functions import relative_timesince
-from utils.story_functions import truncate_chars
+from utils.story_functions import truncate_chars, strip_tags, linkify
 from utils import json_functions as json
 
 RECOMMENDATIONS_LIMIT = 5
@@ -475,6 +475,10 @@ class MSocialProfile(mongo.Document):
     def send_email_for_new_follower(self, follower_user_id):
         user = User.objects.get(pk=self.user_id)
         if not user.email or not user.profile.send_emails or self.user_id == follower_user_id:
+            if not user.email:
+                logging.user(user, "~BB~FMNo email to send to, skipping.")
+            elif not user.profile.send_emails:
+                logging.user(user, "~BB~FMDisabled emails, skipping.")
             return
         
         emails_sent = MSentEmail.objects.filter(receiver_user_id=user.pk,
@@ -988,6 +992,10 @@ class MSharedStory(mongo.Document):
             self.story_original_content_z = zlib.compress(self.story_original_content)
             self.story_original_content = None
         
+        self.comments = linkify(strip_tags(self.comments))
+        for reply in self.replies:
+            reply.comments = linkify(strip_tags(reply.comments))
+        
         r = redis.Redis(connection_pool=settings.REDIS_POOL)
         share_key = "S:%s:%s" % (self.story_feed_id, self.guid_hash)
         r.sadd(share_key, self.user_id)
@@ -1363,6 +1371,10 @@ class MSharedStory(mongo.Document):
             user = User.objects.get(pk=user_id)
 
             if not user.email or not user.profile.send_emails:
+                if not user.email:
+                    logging.user(user, "~BB~FMNo email to send to, skipping.")
+                elif not user.profile.send_emails:
+                    logging.user(user, "~BB~FMDisabled emails, skipping.")
                 continue
             
             mute_url = "http://%s%s" % (
@@ -1403,6 +1415,10 @@ class MSharedStory(mongo.Document):
                                                          story_guid=self.story_guid)
                                                          
         if not original_user.email or not original_user.profile.send_emails:
+            if not original_user.email:
+                logging.user(original_user, "~BB~FMNo email to send to, skipping.")
+            elif not original_user.profile.send_emails:
+                logging.user(original_user, "~BB~FMDisabled emails, skipping.")
             return
             
         story_feed = Feed.objects.get(pk=self.story_feed_id)
diff --git a/apps/social/views.py b/apps/social/views.py
index 5facd1ed5..d9392916b 100644
--- a/apps/social/views.py
+++ b/apps/social/views.py
@@ -25,6 +25,7 @@ from utils.user_functions import get_user, ajax_login_required
 from utils.view_functions import render_to
 from utils.story_functions import format_story_link_date__short
 from utils.story_functions import format_story_link_date__long
+from utils.story_functions import strip_tags, linkify
 from utils import jennyholzer
 from vendor.timezones.utilities import localtime_for_timezone
 
@@ -139,7 +140,7 @@ def load_social_stories(request, user_id, username=None):
             shared_date = localtime_for_timezone(shared_stories[story['id']]['shared_date'],
                                                  user.profile.timezone)
             story['shared_date'] = format_story_link_date__long(shared_date, now)
-            story['shared_comments'] = shared_stories[story['id']]['comments']
+            story['shared_comments'] = strip_tags(shared_stories[story['id']]['comments'])
 
         story['intelligence'] = {
             'feed': apply_classifier_feeds(classifier_feeds, story['story_feed_id'],
@@ -343,7 +344,7 @@ def mark_story_as_shared(request):
     stories, profiles = MSharedStory.stories_with_comments_and_profiles([story], request.user.pk,
                                                                         check_all=check_all)
     story = stories[0]
-    story['shared_comments'] = shared_story['comments'] or ""
+    story['shared_comments'] = strip_tags(shared_story['comments'] or "")
     
     if post_to_services:
         for service in post_to_services:
@@ -438,7 +439,7 @@ def save_comment_reply(request):
         replies = []
         for story_reply in shared_story.replies:
             if (story_reply.user_id == reply.user_id and 
-                story_reply.comments == original_message):
+                strip_tags(story_reply.comments) == original_message):
                 reply.publish_date = story_reply.publish_date
                 replies.append(reply)
             else:
@@ -900,7 +901,7 @@ def load_activities(request):
         
     public = user_id != request.user.pk
     page = max(1, int(request.REQUEST.get('page', 1)))
-    limit = request.REQUEST.get('limit')
+    limit = request.REQUEST.get('limit', 4)
     activities, has_next_page = MActivity.user(user_id, page=page, limit=limit, public=public)
     format = request.REQUEST.get('format', None)
     
diff --git a/media/js/newsblur/models/stories.js b/media/js/newsblur/models/stories.js
index f9fd0a80e..4ca119707 100644
--- a/media/js/newsblur/models/stories.js
+++ b/media/js/newsblur/models/stories.js
@@ -2,6 +2,7 @@ NEWSBLUR.Models.Story = Backbone.Model.extend({
     
     initialize: function() {
         this.bind('change:selected', this.change_selected);
+        this.bind('change:shared_comments', this.populate_comments);
         this.bind('change:comments', this.populate_comments);
         this.bind('change:comment_count', this.populate_comments);
         this.populate_comments();
diff --git a/media/js/newsblur/views/story_share_view.js b/media/js/newsblur/views/story_share_view.js
index ed635dec9..c227924e1 100644
--- a/media/js/newsblur/views/story_share_view.js
+++ b/media/js/newsblur/views/story_share_view.js
@@ -215,6 +215,7 @@ NEWSBLUR.Views.StoryShareView = Backbone.View.extend({
         $share_button.removeClass('NB-saving').removeClass('NB-disabled').text('Share');
         $unshare_button.removeClass('NB-saving').removeClass('NB-disabled').text('Delete Share');
         $share_sideoption.text(shared_text).closest('.NB-sideoption');
+        $comments_sideoptions.val(this.model.get('shared_comments'));
         
         if (this.options.on_social_page) {
             this.model.social_page_story.$el.toggleClass('NB-story-shared', this.model.get('shared'));
diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py
index 438135725..8e0eee278 100644
--- a/utils/feed_fetcher.py
+++ b/utils/feed_fetcher.py
@@ -217,12 +217,14 @@ class ProcessFeed:
         start_date = datetime.datetime.utcnow()
         # end_date = datetime.datetime.utcnow()
         story_guids = []
+        stories = []
         for entry in self.fpf.entries:
             story = pre_process_story(entry)
             if story.get('published') < start_date:
                 start_date = story.get('published')
             # if story.get('published') > end_date:
             #     end_date = story.get('published')
+            stories.append(story)
             story_guids.append(story.get('guid') or story.get('link'))
 
         existing_stories = list(MStory.objects(
@@ -236,7 +238,8 @@ class ProcessFeed:
         #     | (Q(story_guid__in=story_guids)),
         #     story_feed=self.feed
         # ).order_by('-story_date')
-        ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories, verbose=self.options['verbose'])
+        ret_values = self.feed.add_update_stories(stories, existing_stories,
+                                                  verbose=self.options['verbose'])
 
         if ((not self.feed.is_push or self.options.get('force'))
             and hasattr(self.fpf, 'feed') and 
diff --git a/utils/story_functions.py b/utils/story_functions.py
index 3f1690c4d..d4638b7e8 100644
--- a/utils/story_functions.py
+++ b/utils/story_functions.py
@@ -2,7 +2,9 @@ import datetime
 from HTMLParser import HTMLParser
 from itertools import chain
 from django.utils.dateformat import DateFormat
+from django.utils.html import strip_tags as strip_tags_django
 from django.conf import settings
+from utils.tornado_escape import linkify as linkify_tornado
 
 def story_score(story, bottom_delta=None):
     # A) Date - Assumes story is unread and within unread range
@@ -110,6 +112,9 @@ def pre_process_story(entry):
             story_title = story_title[:80] + '...'
         entry['title'] = story_title
     
+    entry['title'] = strip_tags(entry.get('title'))
+    entry['author'] = strip_tags(entry.get('author'))
+    
     return entry
     
 class bunch(dict):
@@ -156,9 +161,16 @@ class MLStripper(HTMLParser):
         return ' '.join(self.fed)
 
 def strip_tags(html):
+    if not html:
+        return ''
+    return strip_tags_django(html)
+    
     s = MLStripper()
     s.feed(html)
     return s.get_data()
+
+def linkify(*args, **kwargs):
+    return linkify_tornado(*args, **kwargs)
     
 def truncate_chars(value, max_length):
     if len(value) <= max_length:
diff --git a/utils/tornado_escape.py b/utils/tornado_escape.py
new file mode 100644
index 000000000..14eb43e3c
--- /dev/null
+++ b/utils/tornado_escape.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python
+#
+# Copyright 2009 Facebook
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""Escaping/unescaping methods for HTML, JSON, URLs, and others.
+
+Also includes a few other miscellaneous string manipulation functions that
+have crept in over time.
+"""
+
+from __future__ import absolute_import, division, with_statement
+
+import htmlentitydefs
+import re
+import sys
+import urllib
+
+# Python3 compatibility:  On python2.5, introduce the bytes alias from 2.6
+try:
+    bytes
+except Exception:
+    bytes = str
+
+try:
+    from urlparse import parse_qs  # Python 2.6+
+except ImportError:
+    from cgi import parse_qs
+
+# json module is in the standard library as of python 2.6; fall back to
+# simplejson if present for older versions.
+try:
+    import json
+    assert hasattr(json, "loads") and hasattr(json, "dumps")
+    _json_decode = json.loads
+    _json_encode = json.dumps
+except Exception:
+    try:
+        import simplejson
+        _json_decode = lambda s: simplejson.loads(_unicode(s))
+        _json_encode = lambda v: simplejson.dumps(v)
+    except ImportError:
+        try:
+            # For Google AppEngine
+            from django.utils import simplejson
+            _json_decode = lambda s: simplejson.loads(_unicode(s))
+            _json_encode = lambda v: simplejson.dumps(v)
+        except ImportError:
+            def _json_decode(s):
+                raise NotImplementedError(
+                    "A JSON parser is required, e.g., simplejson at "
+                    "http://pypi.python.org/pypi/simplejson/")
+            _json_encode = _json_decode
+
+
+_XHTML_ESCAPE_RE = re.compile('[&<>"]')
+_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}
+
+
+def xhtml_escape(value):
+    """Replaces ``&``, ``<``, ``>`` and ``"`` with their XML entities."""
+    text = to_basestring(value)
+    return _XHTML_ESCAPE_RE.sub(lambda hit: _XHTML_ESCAPE_DICT[hit.group(0)], text)
+
+
+def xhtml_unescape(value):
+    """Un-escapes an XML-escaped string; unconvertible entities are kept as-is."""
+    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
+
+
+def json_encode(value):
+    """JSON-encodes the given Python object."""
+    # JSON permits but does not require forward slashes to be escaped.
+    # This is useful when json data is emitted in a <script> tag
+    # in HTML, as it prevents </script> tags from prematurely terminating
+    # the javascript.  Some json libraries do this escaping by default,
+    # although python's standard library does not, so we do it here.
+    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
+    return _json_encode(recursive_unicode(value)).replace("</", "<\\/")
+
+
+def json_decode(value):
+    """Returns Python objects for the given JSON string (bytes or unicode)."""
+    return _json_decode(to_basestring(value))
+
+
+def squeeze(value):
+    """Replace each run of 0x00-0x20 chars with one space and strip the ends."""
+    return re.sub(r"[\x00-\x20]+", " ", value).strip()
+
+
+def url_escape(value):
+    """Returns a URL-encoded version of the given value (via ``quote_plus``)."""
+    return urllib.quote_plus(utf8(value))
+
+# python 3 changed things around enough that we need two separate
+# implementations of url_unescape.  We also need our own implementation
+# of parse_qs since python 3's version insists on decoding everything.
+if sys.version_info[0] < 3:
+    def url_unescape(value, encoding='utf-8'):
+        """Decodes the given value from a URL.
+
+        The argument may be either a byte or unicode string.
+
+        If encoding is None, the result will be a byte string.  Otherwise,
+        the result is a unicode string in the specified encoding.
+        """
+        if encoding is None:
+            return urllib.unquote_plus(utf8(value))
+        else:
+            return unicode(urllib.unquote_plus(utf8(value)), encoding)
+
+    parse_qs_bytes = parse_qs
+else:
+    def url_unescape(value, encoding='utf-8'):
+        """Decodes the given value from a URL.
+
+        The argument may be either a byte or unicode string.
+
+        If encoding is None, the result will be a byte string.  Otherwise,
+        the result is a unicode string in the specified encoding.
+        """
+        if encoding is None:
+            return urllib.parse.unquote_to_bytes(value)
+        else:
+            return urllib.unquote_plus(to_basestring(value), encoding=encoding)
+
+    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
+        """Parses a query string like urlparse.parse_qs, but returns the
+        values as byte strings.
+
+        Keys still become type str (interpreted as latin1 in python3!)
+        because it's too painful to keep them as byte strings in
+        python3 and in practice they're nearly always ascii anyway.
+        """
+        # This is gross, but python3 doesn't give us another way.
+        # Latin1 is the universal donor of character encodings.
+        result = parse_qs(qs, keep_blank_values, strict_parsing,
+                          encoding='latin1', errors='strict')
+        encoded = {}
+        for k, v in result.iteritems():
+            encoded[k] = [i.encode('latin1') for i in v]
+        return encoded
+
+
+_UTF8_TYPES = (bytes, type(None))
+
+
+def utf8(value):
+    """Returns ``value`` as a byte string.
+
+    Byte strings and None pass through untouched; anything else is asserted
+    to be a unicode string and is encoded as utf8.
+    """
+    if not isinstance(value, _UTF8_TYPES):
+        assert isinstance(value, unicode)
+        return value.encode("utf-8")
+    return value
+
+_TO_UNICODE_TYPES = (unicode, type(None))
+
+
+def to_unicode(value):
+    """Returns ``value`` as a unicode string.
+
+    Unicode strings and None pass through untouched; anything else is
+    asserted to be a byte string and is decoded as utf8.
+    """
+    if not isinstance(value, _TO_UNICODE_TYPES):
+        assert isinstance(value, bytes)
+        return value.decode("utf-8")
+    return value
+
+# to_unicode was previously named _unicode not because it was private,
+# but to avoid conflicts with the built-in unicode() function/type
+_unicode = to_unicode
+
+# When dealing with the standard library across python 2 and 3 it is
+# sometimes useful to have a direct conversion to the native string type
+if str is unicode:
+    native_str = to_unicode
+else:
+    native_str = utf8
+
+_BASESTRING_TYPES = (basestring, type(None))
+
+
+def to_basestring(value):
+    """Returns ``value`` as a subclass of basestring.
+
+    On python2 byte and unicode strings mix freely, so code combining a
+    user-supplied argument with ascii constants can accept either and hand
+    back whatever type it was given.  On python3 they do not mix, so this
+    helper decodes byte strings (as utf8) to unicode.  Values that are
+    already basestring instances, or None, are returned untouched.
+    """
+    if not isinstance(value, _BASESTRING_TYPES):
+        assert isinstance(value, bytes)
+        return value.decode("utf-8")
+    return value
+
+
+def recursive_unicode(obj):
+    """Return a simple data structure with its byte strings made unicode.
+
+    Recurses through dicts (keys and values), lists, and tuples.
+    """
+    if isinstance(obj, bytes):
+        return to_unicode(obj)
+    if isinstance(obj, dict):
+        return dict((recursive_unicode(key), recursive_unicode(val))
+                    for (key, val) in obj.iteritems())
+    if isinstance(obj, list):
+        return [recursive_unicode(item) for item in obj]
+    if isinstance(obj, tuple):
+        return tuple(recursive_unicode(item) for item in obj)
+    return obj
+
+# I originally used the regex from
+# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
+# but it gets all exponential on certain patterns (such as too many trailing
+# dots), causing the regex matcher to never return.
+# This regex should avoid those problems.
+_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")
+
+
+def linkify(text, shorten=False, extra_params="",
+            require_protocol=False, permitted_protocols=("http", "https")):
+    """Converts plain text into HTML with links.
+
+    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
+    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``
+
+    Parameters:
+
+    shorten: Long urls will be shortened for display.
+
+    extra_params: Extra text to include in the link tag, or a callable
+        taking the link as an argument and returning the extra text
+        e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
+        or::
+
+            def extra_params_cb(url):
+                if url.startswith("http://example.com"):
+                    return 'class="internal"'
+                else:
+                    return 'class="external" rel="nofollow"'
+            linkify(text, extra_params=extra_params_cb)
+
+    require_protocol: Only linkify urls which include a protocol. If this is
+        False, urls such as www.facebook.com will also be linkified.
+
+    permitted_protocols: Container (list, tuple or set) of linkable protocols,
+        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
+        It is very unsafe to include protocols such as "javascript".
+    """
+    if extra_params and not callable(extra_params):
+        extra_params = " " + extra_params.strip()
+
+    def make_link(m):
+        url = m.group(1)
+        proto = m.group(2)
+        if require_protocol and not proto:
+            return url  # not protocol, no linkify
+
+        if proto and proto not in permitted_protocols:
+            return url  # bad protocol, no linkify
+
+        href = m.group(1)
+        if not proto:
+            href = "http://" + href   # no proto specified, use http
+
+        if callable(extra_params):
+            params = " " + extra_params(href).strip()
+        else:
+            params = extra_params
+
+        # clip long urls. max_len is just an approximation
+        max_len = 30
+        if shorten and len(url) > max_len:
+            before_clip = url
+            if proto:
+                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
+            else:
+                proto_len = 0
+
+            parts = url[proto_len:].split("/")
+            if len(parts) > 1:
+                # Grab the whole host part plus the first bit of the path
+                # The path is usually not that interesting once shortened
+                # (no more slug, etc), so it really just provides a little
+                # extra indication of shortening.
+                url = url[:proto_len] + parts[0] + "/" + \
+                        parts[1][:8].split('?')[0].split('.')[0]
+
+            if len(url) > max_len * 1.5:  # still too long
+                url = url[:max_len]
+
+            if url != before_clip:
+                amp = url.rfind('&')
+                # avoid splitting html char entities
+                if amp > max_len - 5:
+                    url = url[:amp]
+                url += "..."
+
+                if len(url) >= len(before_clip):
+                    url = before_clip
+                else:
+                    # full url is visible on mouse-over (for those who don't
+                    # have a status bar, such as Safari by default)
+                    params += ' title="%s"' % href
+
+        return u'<a href="%s"%s>%s</a>' % (href, params, url)
+
+    # First HTML-escape so that our strings are all safe.
+    # The regex is modified to avoid character entities other than &amp; so
+    # that we won't pick up &quot;, etc.
+    text = _unicode(xhtml_escape(text))
+    return _URL_RE.sub(make_link, text)
+
+
+def _convert_entity(m):
+    name = m.group(2)
+    if m.group(1) != "#":
+        # named entity; unknown names are returned exactly as written
+        return _HTML_UNICODE_MAP.get(name, "&%s;" % name)
+    try:
+        return unichr(int(name))
+    except ValueError:
+        # int()/unichr() rejected the number: return the entity as written
+        return "&#%s;" % name
+
+
+def _build_unicode_map():
+    # Map every HTML entity name to the unicode character it stands for.
+    return dict(
+        (name, unichr(codepoint))
+        for name, codepoint in htmlentitydefs.name2codepoint.iteritems())
+
+_HTML_UNICODE_MAP = _build_unicode_map()
\ No newline at end of file