Extracting images from original text's noscript.

2025-09-18 21:50:56 +00:00 · 2017-03-23 16:28:47 -07:00 · 2017-03-23 16:28:47 -07:00 · 82cdae1e4d
commit 82cdae1e4d
parent 2c195cde2a
2 changed files with 17 additions and 4 deletions
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -2598,7 +2598,7 @@ class MStory(mongo.Document):
        self.save()
    
    def extract_image_urls(self, force=False, text=False):
-        if self.image_urls and not force:
+        if self.image_urls and not force and not text:
            return self.image_urls
        
        story_content = None
@@ -2641,14 +2641,14 @@ class MStory(mongo.Document):
                return self.extract_image_urls(force=force, text=True)
            else:
                return
-            
+        
        self.image_urls = image_urls
        return self.image_urls

    def fetch_original_text(self, force=False, request=None, debug=False):
        original_text_z = self.original_text_z
        
-        if not original_text_z or force:
+        if not original_text_z or force or True:
            feed = Feed.get_by_id(self.story_feed_id)
            ti = TextImporter(self, feed=feed, request=request, debug=debug)
            original_text = ti.fetch()
--- a/apps/rss_feeds/text_importer.py
+++ b/apps/rss_feeds/text_importer.py
@@ -10,6 +10,7 @@ from utils.feed_functions import timelimit, TimeoutError
 from OpenSSL.SSL import Error as OpenSSLError
 from pyasn1.error import PyAsn1Error
 from django.utils.encoding import smart_str
+from BeautifulSoup import BeautifulSoup

 BROKEN_URLS = [
    "gamespot.com",
@@ -87,7 +88,10 @@ class TextImporter:
        except TypeError:
            title = ""
        url = resp.url
-
+        
+        if content:
+            content = self.rewrite_content(content)
+        
        if content:
            if self.story and not skip_save:
                self.story.original_text_z = zlib.compress(smart_str(content))
@@ -110,6 +114,15 @@ class TextImporter:

        return content

+    def rewrite_content(self, content):
+        soup = BeautifulSoup(content)
+        
+        for noscript in soup.findAll('noscript'):
+            if len(noscript.contents) > 0:
+                noscript.replaceWith(noscript.contents[0])
+        
+        return unicode(soup)
+    
    @timelimit(10)
    def fetch_request(self):
        url = self.story_url