Using regular expressions for comment stripping, unless the feed has an error count, in which case switch to lxml.

2025-09-18 21:43:31 +00:00 · 2013-04-08 16:14:33 -07:00 · 2013-04-08 16:14:33 -07:00 · 47c2257ef7
commit 47c2257ef7
parent 33e605875e
2 changed files with 24 additions and 6 deletions
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -36,7 +36,7 @@ from utils.feed_functions import levenshtein_distance
 from utils.feed_functions import timelimit, TimeoutError
 from utils.feed_functions import relative_timesince
 from utils.feed_functions import seconds_timesince
-from utils.story_functions import strip_tags, htmldiff, strip_comments__lxml
+from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
 from vendor.redis_completion.engine import RedisEngine

 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -840,7 +840,8 @@ class Feed(models.Model):
        
    def add_update_stories(self, stories, existing_stories, verbose=False):
        ret_values = dict(new=0, updated=0, same=0, error=0)
-
+        error_count = self.error_count
+        
        if settings.DEBUG or verbose:
            logging.debug("   ---> [%-30s] ~FBChecking ~SB%s~SN new/updated against ~SB%s~SN stories" % (
                          self.title[:30],
@@ -852,7 +853,10 @@ class Feed(models.Model):
                continue
                
            story_content = story.get('story_content')
-            story_content = strip_comments__lxml(story_content)
+            if error_count:
+                story_content = strip_comments__lxml(story_content)
+            else:
+                story_content = strip_comments(story_content)
            story_tags = self.get_tags(story)
            story_link = self.get_permalink(story)
                
--- a/utils/story_functions.py
+++ b/utils/story_functions.py
@@ -15,7 +15,8 @@ from utils.tornado_escape import linkify as linkify_tornado
 from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
 from vendor import reseekfile

-COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
+# COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
+COMMENTS_RE = re.compile('\<!--.*?--\>')

 def story_score(story, bottom_delta=None):
    # A) Date - Assumes story is unread and within unread range
@@ -197,8 +198,21 @@ def strip_tags(html):

 def strip_comments(html_string):
    return COMMENTS_RE.sub('', html_string)
+
+def strip_comments__lxml2(html_string=""):
+    if not html_string: return html_string
+    tree = lxml.html.fromstring(html_string)
+    comments = tree.xpath('//comment()')
+
+    for c in comments:
+        p = c.getparent()
+        p.remove(c)
+
+    return lxml.etree.tostring(tree)
+        
+def strip_comments__lxml(html_string=""):
+    if not html_string: return html_string
    
-def strip_comments__lxml(html_string):
    params = {
        'comments': True,
        'scripts': False,
@@ -225,7 +239,7 @@ def strip_comments__lxml(html_string):
        return lxml.etree.tostring(clean_html)
    except XMLSyntaxError:
        return html_string
-
+        
 def linkify(*args, **kwargs):
    return xhtml_unescape_tornado(linkify_tornado(*args, **kwargs))