mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Extracting images from original text's noscript.
This commit is contained in:
parent
2c195cde2a
commit
82cdae1e4d
2 changed files with 17 additions and 4 deletions
|
@ -2598,7 +2598,7 @@ class MStory(mongo.Document):
|
|||
self.save()
|
||||
|
||||
def extract_image_urls(self, force=False, text=False):
|
||||
if self.image_urls and not force:
|
||||
if self.image_urls and not force and not text:
|
||||
return self.image_urls
|
||||
|
||||
story_content = None
|
||||
|
@ -2641,14 +2641,14 @@ class MStory(mongo.Document):
|
|||
return self.extract_image_urls(force=force, text=True)
|
||||
else:
|
||||
return
|
||||
|
||||
|
||||
self.image_urls = image_urls
|
||||
return self.image_urls
|
||||
|
||||
def fetch_original_text(self, force=False, request=None, debug=False):
|
||||
original_text_z = self.original_text_z
|
||||
|
||||
if not original_text_z or force:
|
||||
if not original_text_z or force or True:
|
||||
feed = Feed.get_by_id(self.story_feed_id)
|
||||
ti = TextImporter(self, feed=feed, request=request, debug=debug)
|
||||
original_text = ti.fetch()
|
||||
|
|
|
@ -10,6 +10,7 @@ from utils.feed_functions import timelimit, TimeoutError
|
|||
from OpenSSL.SSL import Error as OpenSSLError
|
||||
from pyasn1.error import PyAsn1Error
|
||||
from django.utils.encoding import smart_str
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
BROKEN_URLS = [
|
||||
"gamespot.com",
|
||||
|
@ -87,7 +88,10 @@ class TextImporter:
|
|||
except TypeError:
|
||||
title = ""
|
||||
url = resp.url
|
||||
|
||||
|
||||
if content:
|
||||
content = self.rewrite_content(content)
|
||||
|
||||
if content:
|
||||
if self.story and not skip_save:
|
||||
self.story.original_text_z = zlib.compress(smart_str(content))
|
||||
|
@ -110,6 +114,15 @@ class TextImporter:
|
|||
|
||||
return content
|
||||
|
||||
def rewrite_content(self, content):
    """Unwrap ``<noscript>`` fallbacks in fetched article markup.

    Each non-empty ``<noscript>`` element is replaced by one of its own
    children, so that fallback markup (typically a plain ``<img>`` used
    by lazy-loading pages) becomes part of the regular document.

    The child that gets promoted is the first *meaningful* one: the first
    tag, or the first text node that is not pure whitespace.  Markup such
    as ``<noscript>\\n<img ...></noscript>`` starts with a whitespace-only
    text node; promoting ``contents[0]`` blindly would keep that
    whitespace and throw the image away.  If every child is blank text,
    the first child is used.

    Args:
        content: HTML markup of the article, as a string.

    Returns:
        The rewritten markup serialized back to a ``unicode`` string.
    """
    soup = BeautifulSoup(content)

    for noscript in soup.findAll('noscript'):
        children = noscript.contents
        if not children:
            # Empty <noscript>: nothing to promote, leave the tag as-is.
            continue

        # Default to the first child, used when every child turns out to
        # be whitespace-only text.
        replacement = children[0]
        for child in children:
            # Text nodes are string subclasses; anything else is a tag.
            if not isinstance(child, basestring) or child.strip():
                replacement = child
                break

        noscript.replaceWith(replacement)

    return unicode(soup)
|
||||
|
||||
@timelimit(10)
|
||||
def fetch_request(self):
|
||||
url = self.story_url
|
||||
|
|
Loading…
Add table
Reference in a new issue