Extracting images from original text's noscript.

This commit is contained in:
Samuel Clay 2017-03-23 16:28:47 -07:00
parent 2c195cde2a
commit 82cdae1e4d
2 changed files with 17 additions and 4 deletions

View file

@ -2598,7 +2598,7 @@ class MStory(mongo.Document):
self.save()
def extract_image_urls(self, force=False, text=False):
if self.image_urls and not force:
if self.image_urls and not force and not text:
return self.image_urls
story_content = None
@ -2641,14 +2641,14 @@ class MStory(mongo.Document):
return self.extract_image_urls(force=force, text=True)
else:
return
self.image_urls = image_urls
return self.image_urls
def fetch_original_text(self, force=False, request=None, debug=False):
original_text_z = self.original_text_z
if not original_text_z or force:
if not original_text_z or force or True:
feed = Feed.get_by_id(self.story_feed_id)
ti = TextImporter(self, feed=feed, request=request, debug=debug)
original_text = ti.fetch()

View file

@ -10,6 +10,7 @@ from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
from django.utils.encoding import smart_str
from BeautifulSoup import BeautifulSoup
BROKEN_URLS = [
"gamespot.com",
@ -87,7 +88,10 @@ class TextImporter:
except TypeError:
title = ""
url = resp.url
if content:
content = self.rewrite_content(content)
if content:
if self.story and not skip_save:
self.story.original_text_z = zlib.compress(smart_str(content))
@ -110,6 +114,15 @@ class TextImporter:
return content
def rewrite_content(self, content):
    """Return `content` with each non-empty <noscript> replaced by its first child.

    The markup is parsed with BeautifulSoup, every <noscript> tag that has
    at least one child node is swapped for that first child, and the
    resulting tree is serialized back to a unicode string.

    Only the first child is promoted; any further children stay with the
    replaced <noscript> tag and are not carried over. Empty <noscript>
    tags are left in place.

    NOTE(review): presumably this exposes <img> fallbacks wrapped in
    <noscript> to later image extraction -- confirm against the caller.
    """
    document = BeautifulSoup(content)
    for fallback in document.findAll('noscript'):
        children = fallback.contents
        if not children:
            # Nothing to promote; keep the empty tag as it is.
            continue
        fallback.replaceWith(children[0])
    return unicode(document)
@timelimit(10)
def fetch_request(self):
url = self.story_url