Use regular expressions to strip HTML comments from story content by default; switch to the lxml-based stripper only when the feed has a non-zero error count.

This commit is contained in:
Samuel Clay 2013-04-08 16:14:33 -07:00
parent 33e605875e
commit 47c2257ef7
2 changed files with 24 additions and 6 deletions

View file

@ -36,7 +36,7 @@ from utils.feed_functions import levenshtein_distance
from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
from utils.story_functions import strip_tags, htmldiff, strip_comments__lxml
from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
from vendor.redis_completion.engine import RedisEngine
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@ -840,7 +840,8 @@ class Feed(models.Model):
def add_update_stories(self, stories, existing_stories, verbose=False):
ret_values = dict(new=0, updated=0, same=0, error=0)
error_count = self.error_count
if settings.DEBUG or verbose:
logging.debug(" ---> [%-30s] ~FBChecking ~SB%s~SN new/updated against ~SB%s~SN stories" % (
self.title[:30],
@ -852,7 +853,10 @@ class Feed(models.Model):
continue
story_content = story.get('story_content')
story_content = strip_comments__lxml(story_content)
if error_count:
story_content = strip_comments__lxml(story_content)
else:
story_content = strip_comments(story_content)
story_tags = self.get_tags(story)
story_link = self.get_permalink(story)

View file

@ -15,7 +15,8 @@ from utils.tornado_escape import linkify as linkify_tornado
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
from vendor import reseekfile
COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
# COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
# Matches one HTML comment (``<!-- ... -->``). re.DOTALL lets ``.`` cross
# newlines so comments spanning several lines are matched as well; the
# non-greedy ``.*?`` stops at the first closing ``-->`` so that text between
# two separate comments is preserved.
COMMENTS_RE = re.compile(r'<!--.*?-->', re.DOTALL)
def story_score(story, bottom_delta=None):
# A) Date - Assumes story is unread and within unread range
@ -197,8 +198,21 @@ def strip_tags(html):
def strip_comments(html_string):
    """Return ``html_string`` with every ``COMMENTS_RE`` match removed.

    Falsy input (``None`` or an empty string) is returned unchanged instead
    of being handed to the regex, which raises ``TypeError`` on ``None``.
    This is the same guard ``strip_comments__lxml`` applies, so the two
    strippers can be swapped for one another by callers.
    """
    if not html_string:
        return html_string
    return COMMENTS_RE.sub('', html_string)
def strip_comments__lxml2(html_string=""):
    """Strip HTML comments by parsing with lxml and detaching comment nodes.

    Falsy input is returned unchanged. Input that lxml cannot parse is also
    returned unchanged, matching the best-effort behaviour of
    ``strip_comments__lxml``. Otherwise the parsed tree, minus its comment
    nodes, is serialized with ``lxml.etree.tostring`` and returned.
    """
    if not html_string:
        return html_string

    try:
        tree = lxml.html.fromstring(html_string)
    except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
        return html_string

    for comment in tree.xpath('//comment()'):
        parent = comment.getparent()
        if parent is None:
            # A comment that sits outside the root element has no parent to
            # be removed from; leave it rather than crash.
            continue

        # In lxml the text that follows a node is stored as that node's
        # ``tail`` and is dropped together with the node on remove(). Move
        # it onto the preceding sibling (or the parent's leading text) first
        # so only the comment itself disappears.
        tail = comment.tail
        if tail:
            previous = comment.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail

        parent.remove(comment)

    return lxml.etree.tostring(tree)
def strip_comments__lxml(html_string=""):
if not html_string: return html_string
def strip_comments__lxml(html_string):
params = {
'comments': True,
'scripts': False,
@ -225,7 +239,7 @@ def strip_comments__lxml(html_string):
return lxml.etree.tostring(clean_html)
except XMLSyntaxError:
return html_string
def linkify(*args, **kwargs):
    """Run ``linkify_tornado`` and XHTML-unescape its result.

    All positional and keyword arguments are forwarded untouched to
    ``linkify_tornado``; its return value is then passed through
    ``xhtml_unescape_tornado`` and returned.
    """
    linked = linkify_tornado(*args, **kwargs)
    return xhtml_unescape_tornado(linked)