mirror of
https://github.com/viq/NewsBlur.git
synced 2025-09-18 21:43:31 +00:00
Using regular expressions for comment stripping, unless the feed has an error count, in which case switch to lxml.
This commit is contained in:
parent
33e605875e
commit
47c2257ef7
2 changed files with 24 additions and 6 deletions
|
@ -36,7 +36,7 @@ from utils.feed_functions import levenshtein_distance
|
|||
from utils.feed_functions import timelimit, TimeoutError
|
||||
from utils.feed_functions import relative_timesince
|
||||
from utils.feed_functions import seconds_timesince
|
||||
from utils.story_functions import strip_tags, htmldiff, strip_comments__lxml
|
||||
from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
|
||||
from vendor.redis_completion.engine import RedisEngine
|
||||
|
||||
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
|
||||
|
@ -840,7 +840,8 @@ class Feed(models.Model):
|
|||
|
||||
def add_update_stories(self, stories, existing_stories, verbose=False):
|
||||
ret_values = dict(new=0, updated=0, same=0, error=0)
|
||||
|
||||
error_count = self.error_count
|
||||
|
||||
if settings.DEBUG or verbose:
|
||||
logging.debug(" ---> [%-30s] ~FBChecking ~SB%s~SN new/updated against ~SB%s~SN stories" % (
|
||||
self.title[:30],
|
||||
|
@ -852,7 +853,10 @@ class Feed(models.Model):
|
|||
continue
|
||||
|
||||
story_content = story.get('story_content')
|
||||
story_content = strip_comments__lxml(story_content)
|
||||
if error_count:
|
||||
story_content = strip_comments__lxml(story_content)
|
||||
else:
|
||||
story_content = strip_comments(story_content)
|
||||
story_tags = self.get_tags(story)
|
||||
story_link = self.get_permalink(story)
|
||||
|
||||
|
|
|
@ -15,7 +15,8 @@ from utils.tornado_escape import linkify as linkify_tornado
|
|||
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
|
||||
from vendor import reseekfile
|
||||
|
||||
COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
|
||||
# COMMENTS_RE = re.compile('\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
|
||||
# Matches one HTML comment, from "<!--" to the nearest "-->".
# re.DOTALL lets "." cross newlines so comments spanning several lines are
# stripped too; the non-greedy ".*?" keeps two separate comments from being
# merged into a single match that would swallow the content between them.
COMMENTS_RE = re.compile(r'<!--.*?-->', re.DOTALL)
|
||||
|
||||
def story_score(story, bottom_delta=None):
|
||||
# A) Date - Assumes story is unread and within unread range
|
||||
|
@ -197,8 +198,21 @@ def strip_tags(html):
|
|||
|
||||
def strip_comments(html_string):
    """Remove HTML comments from *html_string* using ``COMMENTS_RE``.

    Empty or missing content (``""`` / ``None``) is returned unchanged, the
    same way the lxml-based variants behave, so callers may pass a story
    body that was never set without triggering a ``TypeError`` in
    ``re.sub``.
    """
    if not html_string:
        return html_string
    return COMMENTS_RE.sub('', html_string)
|
||||
|
||||
def strip_comments__lxml2(html_string=""):
    """Strip HTML comments by parsing *html_string* with lxml.

    Empty or missing input is returned unchanged. Otherwise the markup is
    parsed with ``lxml.html.fromstring``, every comment node is removed, and
    the tree is serialized with ``lxml.etree.tostring``.
    """
    if not html_string:
        return html_string

    tree = lxml.html.fromstring(html_string)

    for comment in tree.xpath('//comment()'):
        parent = comment.getparent()
        if parent is None:
            # A comment with no parent cannot be detached through
            # parent.remove(); skip it instead of raising AttributeError.
            continue

        # lxml stores the text that follows a node in that node's ``tail``,
        # and remove() discards it together with the node. Re-attach the
        # tail first so visible text after a comment is not lost.
        tail = comment.tail
        if tail:
            previous = comment.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail

        parent.remove(comment)

    return lxml.etree.tostring(tree)
|
||||
|
||||
def strip_comments__lxml(html_string=""):
|
||||
if not html_string: return html_string
|
||||
|
||||
def strip_comments__lxml(html_string):
|
||||
params = {
|
||||
'comments': True,
|
||||
'scripts': False,
|
||||
|
@ -225,7 +239,7 @@ def strip_comments__lxml(html_string):
|
|||
return lxml.etree.tostring(clean_html)
|
||||
except XMLSyntaxError:
|
||||
return html_string
|
||||
|
||||
|
||||
def linkify(*args, **kwargs):
    """Run tornado's ``linkify`` and XHTML-unescape its result.

    All positional and keyword arguments are forwarded untouched to
    ``linkify_tornado``; its return value is then passed through
    ``xhtml_unescape_tornado`` and returned.
    """
    linked = linkify_tornado(*args, **kwargs)
    return xhtml_unescape_tornado(linked)
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue