Unwrap <noscript> elements in fetched original text, and let
extract_image_urls re-scan that text.

* MStory.extract_image_urls: the cached-URL shortcut is skipped when
  text=True, so the recursive text=True fallback is not short-circuited
  by already-stored image_urls.
* TextImporter.rewrite_content (new): replaces each non-empty <noscript>
  element with its first child; fetch() runs it on the content before
  the content is compressed and stored on the story.
* MStory.fetch_original_text: the guard stays `not original_text_z or force`.
  An `or True` debugging override is deliberately NOT part of this patch,
  because it would make the condition always true and refetch through
  TextImporter on every call.

NOTE(review): line breaks, indentation and blank context lines were
reconstructed from a copy that had been flattened onto one line; hunk
line counts match the @@ headers, but verify the context against the
tree before applying. The index hashes are those of the unmodified patch.

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index d23226d49..750fd11fe 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -2598,7 +2598,7 @@ class MStory(mongo.Document):
         self.save()
 
     def extract_image_urls(self, force=False, text=False):
-        if self.image_urls and not force:
+        if self.image_urls and not force and not text:
             return self.image_urls
 
         story_content = None
@@ -2641,14 +2641,14 @@ class MStory(mongo.Document):
                 return self.extract_image_urls(force=force, text=True)
             else:
                 return
-        
+
         self.image_urls = image_urls
         return self.image_urls
 
     def fetch_original_text(self, force=False, request=None, debug=False):
         original_text_z = self.original_text_z
 
         if not original_text_z or force:
             feed = Feed.get_by_id(self.story_feed_id)
             ti = TextImporter(self, feed=feed, request=request, debug=debug)
             original_text = ti.fetch()
diff --git a/apps/rss_feeds/text_importer.py b/apps/rss_feeds/text_importer.py
index 0871f3a48..051a3bcd9 100644
--- a/apps/rss_feeds/text_importer.py
+++ b/apps/rss_feeds/text_importer.py
@@ -10,6 +10,7 @@ from utils.feed_functions import timelimit, TimeoutError
 from OpenSSL.SSL import Error as OpenSSLError
 from pyasn1.error import PyAsn1Error
 from django.utils.encoding import smart_str
+from BeautifulSoup import BeautifulSoup
 
 BROKEN_URLS = [
     "gamespot.com",
@@ -87,7 +88,10 @@ class TextImporter:
             except TypeError:
                 title = ""
             url = resp.url
-        
+
+        if content:
+            content = self.rewrite_content(content)
+
         if content:
             if self.story and not skip_save:
                 self.story.original_text_z = zlib.compress(smart_str(content))
@@ -110,6 +114,15 @@ class TextImporter:
 
         return content
 
+    def rewrite_content(self, content):
+        soup = BeautifulSoup(content)
+
+        for noscript in soup.findAll('noscript'):
+            if len(noscript.contents) > 0:
+                noscript.replaceWith(noscript.contents[0])
+
+        return unicode(soup)
+
     @timelimit(10)
     def fetch_request(self):
         url = self.story_url