From 82cdae1e4d9f905b28346f4fedafd72dcb4aee7d Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Thu, 23 Mar 2017 16:28:47 -0700
Subject: [PATCH] Extracting images from original text's noscript.

---
 apps/rss_feeds/models.py        |  6 +++---
 apps/rss_feeds/text_importer.py | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index d23226d49..750fd11fe 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -2598,7 +2598,7 @@ class MStory(mongo.Document):
         self.save()
 
     def extract_image_urls(self, force=False, text=False):
-        if self.image_urls and not force:
+        if self.image_urls and not force and not text:
             return self.image_urls
 
         story_content = None
@@ -2641,14 +2641,14 @@ class MStory(mongo.Document):
                 return self.extract_image_urls(force=force, text=True)
             else:
                 return
-        
+
         self.image_urls = image_urls
         return self.image_urls
 
     def fetch_original_text(self, force=False, request=None, debug=False):
         original_text_z = self.original_text_z
 
-        if not original_text_z or force:
+        if not original_text_z or force or True:
             feed = Feed.get_by_id(self.story_feed_id)
             ti = TextImporter(self, feed=feed, request=request, debug=debug)
             original_text = ti.fetch()
diff --git a/apps/rss_feeds/text_importer.py b/apps/rss_feeds/text_importer.py
index 0871f3a48..051a3bcd9 100644
--- a/apps/rss_feeds/text_importer.py
+++ b/apps/rss_feeds/text_importer.py
@@ -10,6 +10,7 @@ from utils.feed_functions import timelimit, TimeoutError
 from OpenSSL.SSL import Error as OpenSSLError
 from pyasn1.error import PyAsn1Error
 from django.utils.encoding import smart_str
+from BeautifulSoup import BeautifulSoup
 
 BROKEN_URLS = [
     "gamespot.com",
@@ -87,7 +88,10 @@ class TextImporter:
         except TypeError:
             title = ""
         url = resp.url
-        
+
+        if content:
+            content = self.rewrite_content(content)
+
         if content:
             if self.story and not skip_save:
                 self.story.original_text_z = zlib.compress(smart_str(content))
@@ -110,6 +114,15 @@
 
         return content
 
+    def rewrite_content(self, content):
+        soup = BeautifulSoup(content)
+
+        for noscript in soup.findAll('noscript'):
+            if len(noscript.contents) > 0:
+                noscript.replaceWith(noscript.contents[0])
+
+        return unicode(soup)
+
     @timelimit(10)
     def fetch_request(self):
         url = self.story_url
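
A quick illustration of what the new rewrite_content() hook does (not part of the patch, just a sketch assuming the same Python 2 / BeautifulSoup 3 stack the patch imports): lazy-loading sites ship a placeholder <img> plus the real <img> inside a <noscript> fallback, so promoting the fallback into the markup lets extract_image_urls(text=True) find the real image in the fetched original text. The sample HTML and image URL below are made up.

    # Sketch only: mirrors the noscript unwrapping in TextImporter.rewrite_content.
    from BeautifulSoup import BeautifulSoup

    html = ('<p><img class="lazy" src="spacer.gif">'
            '<noscript><img src="http://example.com/photo.jpg"></noscript></p>')

    soup = BeautifulSoup(html)
    for noscript in soup.findAll('noscript'):
        if len(noscript.contents) > 0:
            # Replace the <noscript> wrapper with its first child,
            # which is usually the real <img> tag.
            noscript.replaceWith(noscript.contents[0])

    print unicode(soup)
    # The real <img src="http://example.com/photo.jpg"> now sits directly in the
    # markup, where the image extractor can pick it up.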