From 82cdae1e4d9f905b28346f4fedafd72dcb4aee7d Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Thu, 23 Mar 2017 16:28:47 -0700
Subject: [PATCH] Extracting images from original text's noscript.

---
 apps/rss_feeds/models.py        |  6 +++---
 apps/rss_feeds/text_importer.py | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index d23226d49..750fd11fe 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -2598,7 +2598,7 @@ class MStory(mongo.Document):
         self.save()
 
     def extract_image_urls(self, force=False, text=False):
-        if self.image_urls and not force:
+        if self.image_urls and not force and not text:
             return self.image_urls
 
         story_content = None
@@ -2641,14 +2641,14 @@ class MStory(mongo.Document):
                 return self.extract_image_urls(force=force, text=True)
             else:
                 return
-        
+
         self.image_urls = image_urls
         return self.image_urls
 
     def fetch_original_text(self, force=False, request=None, debug=False):
         original_text_z = self.original_text_z
 
-        if not original_text_z or force:
+        if not original_text_z or force or True:
             feed = Feed.get_by_id(self.story_feed_id)
             ti = TextImporter(self, feed=feed, request=request, debug=debug)
             original_text = ti.fetch()
diff --git a/apps/rss_feeds/text_importer.py b/apps/rss_feeds/text_importer.py
index 0871f3a48..051a3bcd9 100644
--- a/apps/rss_feeds/text_importer.py
+++ b/apps/rss_feeds/text_importer.py
@@ -10,6 +10,7 @@ from utils.feed_functions import timelimit, TimeoutError
 from OpenSSL.SSL import Error as OpenSSLError
 from pyasn1.error import PyAsn1Error
 from django.utils.encoding import smart_str
+from BeautifulSoup import BeautifulSoup
 
 BROKEN_URLS = [
     "gamespot.com",
@@ -87,7 +88,10 @@ class TextImporter:
         except TypeError:
             title = ""
         url = resp.url
-        
+
+        if content:
+            content = self.rewrite_content(content)
+
         if content:
             if self.story and not skip_save:
                 self.story.original_text_z = zlib.compress(smart_str(content))
@@ -110,6 +114,15 @@
 
         return content
 
+    def rewrite_content(self, content):
+        soup = BeautifulSoup(content)
+
+        for noscript in soup.findAll('noscript'):
+            if len(noscript.contents) > 0:
+                noscript.replaceWith(noscript.contents[0])
+
+        return unicode(soup)
+
     @timelimit(10)
     def fetch_request(self):
         url = self.story_url
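
A quick illustration of what the new rewrite_content() hook does (not part of the patch, just a sketch assuming the same Python 2 / BeautifulSoup 3 stack the patch imports): lazy-loading sites ship a placeholder <img> plus the real <img> inside a <noscript> fallback, so promoting the fallback into the markup lets extract_image_urls(text=True) find the real image in the fetched original text. The sample HTML and image URL below are made up.

    # Sketch only: mirrors the noscript unwrapping in TextImporter.rewrite_content.
    from BeautifulSoup import BeautifulSoup

    html = ('<p><img class="lazy" src="spacer.gif">'
            '<noscript><img src="http://example.com/photo.jpg"></noscript></p>')

    soup = BeautifulSoup(html)
    for noscript in soup.findAll('noscript'):
        if len(noscript.contents) > 0:
            # Replace the <noscript> wrapper with its first child,
            # which is usually the real <img> tag.
            noscript.replaceWith(noscript.contents[0])

    print unicode(soup)
    # The real <img src="http://example.com/photo.jpg"> now sits directly in the
    # markup, where the image extractor can pick it up.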