Mirror of https://github.com/viq/NewsBlur.git, synced 2025-11-01 09:09:16 +00:00
Refactoring image size fetcher to be much simpler, albeit slower.
Commit eac7e71b81 (parent 72791b8e33)
4 changed files with 22 additions and 508 deletions
@@ -34,11 +34,10 @@ from vendor import pynliner
 from utils import log as logging
 from utils import json_functions as json
 from utils.feed_functions import relative_timesince, chunks
-from utils.story_functions import truncate_chars, strip_tags, linkify, image_size
+from utils.story_functions import truncate_chars, strip_tags, linkify
 from utils.image_functions import ImageOps
 from utils.scrubber import SelectiveScriptScrubber
 from utils import s3_utils
-from io import BytesIO
 
 try:
     from apps.social.spam import detect_spammers
@@ -2340,14 +2339,7 @@ class MSharedStory(mongo.DynamicDocument):
         for image_source in self.image_urls[:10]:
             if any(ignore in image_source for ignore in IGNORE_IMAGE_SOURCES):
                 continue
-            req = requests.get(image_source, headers=headers, stream=True, timeout=10)
-            try:
-                datastream = BytesIO(req.content)
-                width, height = ImageOps.image_size(datastream)
-            except IOError as e:
-                logging.debug(" ***> Couldn't read image: %s / %s" % (e, image_source))
-                datastream = BytesIO(req.content[:100])
-                _, width, height = image_size(datastream)
+            width, height = ImageOps.image_size(image_source, headers=headers)
             # if width <= 16 or height <= 16:
             #     continue
             image_sizes.append({'src': image_source, 'size': (width, height)})
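With the old requests/BytesIO block gone, the loop above no longer guards the fetch itself: the new ImageOps.image_size (changed further down in this diff) opens the URL with urllib and lets any error propagate. If a caller wanted to keep the old tolerance for unreadable images, a small wrapper along these lines would do it; safe_image_size is hypothetical and not part of the commit.

import logging  # stdlib logging stands in for the repo's utils.log wrapper

from utils.image_functions import ImageOps  # repository module, updated below in this diff


def safe_image_size(url, headers=None):
    """Return (width, height), or (None, None) when the image cannot be fetched or parsed."""
    try:
        return ImageOps.image_size(url, headers=headers)
    except OSError as e:  # urllib's URLError/HTTPError are OSError subclasses
        logging.debug("Couldn't read image: %s / %s", e, url)
        return (None, None)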
@@ -1,10 +1,11 @@
 """Operations for images through the PIL."""
 
+import urllib.request
 from PIL import Image
+from PIL import ImageFile
 from PIL import ImageOps as PILOps
 from PIL.ExifTags import TAGS
 from io import BytesIO
-from vendor import reseekfile
 
 PROFILE_PICTURE_SIZES = {
     'fullsize': (256, 256),
@@ -71,7 +72,21 @@ class ImageOps:
         return image
 
     @classmethod
-    def image_size(cls, datastream):
-        datastream = reseekfile.ReseekFile(datastream)
-        image = Image.open(datastream)
-        return image.size
+    def image_size(cls, url, headers=None):
+        if not headers: headers = {}
+        req = urllib.request.Request(url, data=None, headers=headers)
+        file = urllib.request.urlopen(req)
+        size = file.headers.get("content-length")
+        if size:
+            size = int(size)
+        p = ImageFile.Parser()
+        while True:
+            data = file.read(1024)
+            if not data:
+                break
+            p.feed(data)
+            if p.image:
+                return p.image.size
+                break
+        file.close()
+        return None, None
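The new classmethod leans on PIL's ImageFile.Parser, which can report an image's dimensions as soon as the header has been parsed, so the request usually stops after a kilobyte or two rather than downloading the whole file; if the parser never yields an image, the method falls back to (None, None). A self-contained sketch of that incremental-parsing idea, using an in-memory PNG so it runs without a network request (illustrative only, not code from the repository):

from io import BytesIO

from PIL import Image, ImageFile


def size_from_stream(stream, chunk_size=1024):
    """Feed chunks to ImageFile.Parser until the image header has been parsed."""
    parser = ImageFile.Parser()
    while True:
        data = stream.read(chunk_size)
        if not data:
            break
        parser.feed(data)
        if parser.image:  # header parsed; dimensions are now known
            return parser.image.size
    return (None, None)


if __name__ == "__main__":
    buf = BytesIO()
    Image.new("RGB", (640, 480)).save(buf, format="PNG")
    buf.seek(0)
    print(size_from_stream(buf))  # (640, 480)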
@@ -16,7 +16,6 @@ from django.utils.dateformat import DateFormat
 from django.utils.html import strip_tags as strip_tags_django
 from utils.tornado_escape import linkify as linkify_tornado
 from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
-from vendor import reseekfile
 import feedparser
 
 import hmac
@@ -284,68 +283,6 @@ def truncate_chars(value, max_length):
 
     return truncd_val.decode('utf-8', 'ignore') + "..."
 
-def image_size(datastream):
-    datastream = reseekfile.ReseekFile(datastream)
-    data = str(datastream.read(30))
-    size = len(data)
-    height = -1
-    width = -1
-    content_type = ''
-
-    # handle GIFs
-    if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):
-        # Check to see if content_type is correct
-        content_type = 'image/gif'
-        w, h = struct.unpack("<HH", data[6:10])
-        width = int(w)
-        height = int(h)
-
-    # See PNG 2. Edition spec (http://www.w3.org/TR/PNG/)
-    # Bytes 0-7 are below, 4-byte chunk length, then 'IHDR'
-    # and finally the 4-byte width, height
-    elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')
-          and (data[12:16] == 'IHDR')):
-        content_type = 'image/png'
-        w, h = struct.unpack(">LL", data[16:24])
-        width = int(w)
-        height = int(h)
-
-    # Maybe this is for an older PNG version.
-    elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'):
-        # Check to see if we have the right content type
-        content_type = 'image/png'
-        w, h = struct.unpack(">LL", data[8:16])
-        width = int(w)
-        height = int(h)
-
-    # handle JPEGs
-    elif (size >= 2) and data.startswith('\377\330'):
-        content_type = 'image/jpeg'
-        datastream.seek(0)
-        datastream.read(2)
-        b = datastream.read(1)
-        try:
-            w = 0
-            h = 0
-            while (b and ord(b) != 0xDA):
-                while (ord(b) != 0xFF): b = datastream.read(1)
-                while (ord(b) == 0xFF): b = datastream.read(1)
-                if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
-                    datastream.read(3)
-                    h, w = struct.unpack(">HH", datastream.read(4))
-                    break
-                else:
-                    datastream.read(int(struct.unpack(">H", datastream.read(2))[0])-2)
-                b = datastream.read(1)
-            width = int(w)
-            height = int(h)
-        except struct.error:
-            pass
-        except ValueError:
-            pass
-
-    return content_type, width, height
-
 def htmldiff(old_html, new_html):
     try:
         old_html_tokens = tokenize(old_html, include_hrefs=False)
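For contrast, the helper removed above sniffed dimensions straight out of the first bytes of the file with struct. A minimal sketch of that idea for the PNG case (illustrative, not taken from the repository): a PNG begins with an 8-byte signature, and the IHDR chunk stores the big-endian width and height at bytes 16-24.

import struct
from io import BytesIO

from PIL import Image

PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"  # same signature the removed code spelled in octal escapes


def png_size(data):
    """Return (width, height) for PNG bytes, or (None, None) if it isn't a PNG."""
    if len(data) >= 24 and data.startswith(PNG_SIGNATURE) and data[12:16] == b"IHDR":
        return struct.unpack(">LL", data[16:24])
    return (None, None)


if __name__ == "__main__":
    buf = BytesIO()
    Image.new("RGB", (320, 200)).save(buf, format="PNG")
    print(png_size(buf.getvalue()))  # (320, 200)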
vendor/reseekfile.py (vendored, 430 lines, entire file removed)
@ -1,430 +0,0 @@
|
|||
"""Wrap a file handle to allow seeks back to the beginning
|
||||
|
||||
Sometimes data coming from a socket or other input file handle isn't
|
||||
what it was supposed to be. For example, suppose you are reading from
|
||||
a buggy server which is supposed to return an XML stream but can also
|
||||
return an unformatted error message. (This often happens because the
|
||||
server doesn't handle incorrect input very well.)
|
||||
|
||||
A ReseekFile helps solve this problem. It is a wrapper to the
|
||||
original input stream but provides a buffer. Read requests to the
|
||||
ReseekFile get forwarded to the input stream, appended to a buffer,
|
||||
then returned to the caller. The buffer contains all the data read so
|
||||
far.
|
||||
|
||||
The ReseekFile can be told to reseek to the start position. The next
|
||||
read request will come from the buffer, until the buffer has been
|
||||
read, in which case it gets the data from the input stream. This
|
||||
newly read data is also appended to the buffer.
|
||||
|
||||
When buffering is no longer needed, use the 'nobuffer()' method. This
|
||||
tells the ReseekFile that once it has read from the buffer it should
|
||||
throw the buffer away. After nobuffer is called, the behaviour of
|
||||
'seek' is no longer defined.
|
||||
|
||||
For example, suppose you have the server as above which either
|
||||
gives an error message is of the form:
|
||||
|
||||
ERROR: cannot do that
|
||||
|
||||
or an XML data stream, starting with "<?xml".
|
||||
|
||||
infile = urllib2.urlopen("http://somewhere/")
|
||||
infile = ReseekFile.ReseekFile(infile)
|
||||
s = infile.readline()
|
||||
if s.startswith("ERROR:"):
|
||||
raise Exception(s[:-1])
|
||||
infile.seek(0)
|
||||
infile.nobuffer() # Don't buffer the data
|
||||
... process the XML from infile ...
|
||||
|
||||
|
||||
This module also implements 'prepare_input_source(source)' modeled on
|
||||
xml.sax.saxutils.prepare_input_source. This opens a URL and if the
|
||||
input stream is not already seekable, wraps it in a ReseekFile.
|
||||
|
||||
|
||||
NOTE:
|
||||
Don't use bound methods for the ReseekFile. When the buffer is
|
||||
empty, the ReseekFile reassigns the input file's read/readlines/etc.
|
||||
method as instance variable. This gives slightly better performance
|
||||
at the cost of not allowing an infrequently used idiom.
|
||||
|
||||
Use tell() to get the beginning byte location. ReseekFile will
|
||||
attempt to get the real position from the wrapped file and use that as
|
||||
the beginning location. If the wrapped file does not support tell(),
|
||||
ReseekFile.tell() will return 0.
|
||||
|
||||
readlines does not yet support a sizehint. Want to implement it?
|
||||
|
||||
The latest version of this code can be found at
|
||||
http://www.dalkescientific.com/Python/
|
||||
"""
|
||||
# Started in 2003 by Andrew Dalke, Dalke Scientific Software, LLC.
|
||||
# This software has been released to the public domain. No
|
||||
# copyright is asserted.
|
||||
|
||||
## Changelog:
|
||||
# 2005-11-06
|
||||
# Use StringIO if cStringIO doesn't exist. Suggested by Howard Golden
|
||||
# for use with non-CPython implementations.
|
||||
# 2005-05-18
|
||||
# Can specify a factory to specify how to create the temporary file.
|
||||
# Factories for memory-based (cStringIO) and file-based storages
|
||||
# Track the buffer file size so I don't depend on getvalue()
|
||||
# Fixed a few typos
|
||||
|
||||
def memory_backed_tempfile():
|
||||
try:
|
||||
from io import StringIO
|
||||
except ImportError:
|
||||
from io import StringIO
|
||||
return StringIO()
|
||||
|
||||
def file_backed_tempfile():
|
||||
import tempfile
|
||||
return tempfile.NamedTemporaryFile(mode="r+b")
|
||||
|
||||
class ReseekFile:
|
||||
"""wrap a file handle to allow seeks back to the beginning
|
||||
|
||||
Takes a file handle in the constructor.
|
||||
|
||||
See the module docstring for more documentation.
|
||||
"""
|
||||
def __init__(self, file, tempfile_factory = memory_backed_tempfile):
|
||||
self.file = file
|
||||
self.buffer_file = tempfile_factory()
|
||||
self.at_beginning = 1
|
||||
try:
|
||||
self.beginning = file.tell()
|
||||
except (IOError, AttributeError):
|
||||
self.beginning = 0
|
||||
self._use_buffer = 1
|
||||
self._buffer_size = 0
|
||||
|
||||
def seek(self, offset, whence = 0):
|
||||
"""offset, whence = 0
|
||||
|
||||
Seek to a given byte position. Only supports whence == 0
|
||||
and offset == the initial value of ReseekFile.tell() (which
|
||||
is usually 0, but not always.)
|
||||
"""
|
||||
if whence != 0:
|
||||
raise TypeError("Unexpected whence value of %s; expecting 0" % \
|
||||
(whence,))
|
||||
if offset != self.beginning:
|
||||
raise TypeError("Unexpected offset value of %r; expecting '%s'" % \
|
||||
(offset, self.beginning))
|
||||
self.buffer_file.seek(0)
|
||||
self.at_beginning = 1
|
||||
|
||||
def tell(self):
|
||||
"""the current position of the file
|
||||
|
||||
The initial position may not be 0 if the underlying input
|
||||
file supports tell and it not at position 0.
|
||||
"""
|
||||
if not self.at_beginning:
|
||||
raise TypeError("ReseekFile cannot tell except at the beginning of file")
|
||||
return self.beginning
|
||||
|
||||
def _read(self, size):
|
||||
if size < 0:
|
||||
y = self.file.read()
|
||||
z = self.buffer_file.read() + y
|
||||
if self._use_buffer:
|
||||
self.buffer_file.write(y)
|
||||
self._buffer_size += len(y)
|
||||
return z
|
||||
if size == 0:
|
||||
return ""
|
||||
x = self.buffer_file.read(size)
|
||||
if len(x) < size:
|
||||
y = self.file.read(size - len(x))
|
||||
if self._use_buffer:
|
||||
self.buffer_file.write(y)
|
||||
self._buffer_size += len(y)
|
||||
return x + y
|
||||
return x
|
||||
|
||||
def read(self, size = -1):
|
||||
"""read up to 'size' bytes from the file
|
||||
|
||||
Default is -1, which means to read to end of file.
|
||||
"""
|
||||
x = self._read(size)
|
||||
if self.at_beginning and x:
|
||||
self.at_beginning = 0
|
||||
self._check_no_buffer()
|
||||
return x
|
||||
|
||||
def readline(self):
|
||||
"""read a line from the file"""
|
||||
|
||||
# Can we get it out of the buffer_file?
|
||||
s = self.buffer_file.readline()
|
||||
if s[-1:] == "\n":
|
||||
return s
|
||||
# No, so now we read a line from the input file
|
||||
t = self.file.readline()
|
||||
|
||||
# Append the new data to the buffer, if still buffering
|
||||
if self._use_buffer:
|
||||
self.buffer_file.write(t)
|
||||
self._buffer_size += len(t)
|
||||
self._check_no_buffer()
|
||||
|
||||
return s + t
|
||||
|
||||
def readlines(self):
|
||||
"""read all remaining lines from the file"""
|
||||
s = self.read()
|
||||
lines = []
|
||||
i, j = 0, s.find("\n")
|
||||
while j > -1:
|
||||
lines.append(s[i:j+1])
|
||||
i = j+1
|
||||
j = s.find("\n", i)
|
||||
if i < len(s):
|
||||
# Only get here if the last line doesn't have a newline
|
||||
lines.append(s[i:])
|
||||
return lines
|
||||
|
||||
def _check_no_buffer(self):
|
||||
# If 'nobuffer' called and finished with the buffer file
|
||||
# then get rid of the buffer and redirect everything to
|
||||
# the original input file.
|
||||
if (self._use_buffer == 0 and
|
||||
(self.buffer_file.tell() == self._buffer_size)):
|
||||
# I'm doing this for the slightly better performance
|
||||
self.seek = getattr(self.file, "seek", None)
|
||||
self.tell = getattr(self.file, "tell", None)
|
||||
self.read = self.file.read
|
||||
self.readline = self.file.readline
|
||||
self.readlines = self.file.readlines
|
||||
del self.buffer_file
|
||||
|
||||
def nobuffer(self):
|
||||
"""tell the ReseekFile to stop using the buffer once it's exhausted"""
|
||||
self._use_buffer = 0
|
||||
|
||||
def prepare_input_source(source):
|
||||
"""given a URL, returns a xml.sax.xmlreader.InputSource
|
||||
|
||||
Works like xml.sax.saxutils.prepare_input_source. Wraps the
|
||||
InputSource in a ReseekFile if the URL returns a non-seekable
|
||||
file.
|
||||
|
||||
To turn the buffer off if that happens, you'll need to do
|
||||
something like
|
||||
|
||||
f = source.getCharacterStream()
|
||||
...
|
||||
try:
|
||||
f.nobuffer()
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
or
|
||||
|
||||
if isinstance(f, ReseekFile):
|
||||
f.nobuffer()
|
||||
|
||||
"""
|
||||
from xml.sax import saxutils
|
||||
source = saxutils.prepare_input_source(source)
|
||||
# Is this correct? Don't know - don't have Unicode experience
|
||||
f = source.getCharacterStream() or source.getByteStream()
|
||||
try:
|
||||
f.tell()
|
||||
except (AttributeError, IOError):
|
||||
f = ReseekFile.ReseekFile(f)
|
||||
source.setByteStream(f)
|
||||
source.setCharacterStream(None)
|
||||
return source
|
||||
|
||||
def test_reads(test_s, file, seek0):
|
||||
assert file.read(2) == "Th"
|
||||
assert file.read(3) == "is "
|
||||
assert file.read(4) == "is a"
|
||||
assert file.read(0) == ""
|
||||
assert file.read(0) == ""
|
||||
assert file.read(6) == " test."
|
||||
file.seek(seek0)
|
||||
assert file.read(2) == "Th"
|
||||
assert file.read(3) == "is "
|
||||
assert file.read(4) == "is a"
|
||||
assert file.read(0) == ""
|
||||
assert file.read(0) == ""
|
||||
assert file.read(6) == " test."
|
||||
assert file.read(1) == "\n"
|
||||
assert file.read(5) == "12345"
|
||||
assert file.read() == "67890\n"
|
||||
file.seek(seek0)
|
||||
assert file.read() == test_s
|
||||
file.seek(seek0)
|
||||
|
||||
|
||||
def _test(ReseekFileFactory):
|
||||
from io import StringIO
|
||||
s = "This is a test.\n1234567890\n"
|
||||
file = StringIO(s)
|
||||
# Test with a normal file
|
||||
x = file.tell()
|
||||
test_reads(s, file, x)
|
||||
test_reads(s, file, x)
|
||||
|
||||
# Test with a ReseekFile wrapper
|
||||
rf = ReseekFileFactory(file)
|
||||
y = rf.tell()
|
||||
rf.seek(y)
|
||||
test_reads(s, rf, y)
|
||||
assert rf.read() == s
|
||||
assert rf.read() == ""
|
||||
|
||||
# Make sure the tell offset is correct (may not be 0)
|
||||
file = StringIO("X" + s)
|
||||
file.read(1)
|
||||
rf = ReseekFileFactory(file)
|
||||
y = rf.tell()
|
||||
test_reads(s, rf, y)
|
||||
rf.seek(y)
|
||||
test_reads(s, rf, y)
|
||||
assert rf.read() == s
|
||||
assert rf.read() == ""
|
||||
|
||||
# Test the ability to turn off buffering and have changes
|
||||
# propogate correctly
|
||||
file = StringIO("X" + s)
|
||||
file.read(1)
|
||||
rf = ReseekFileFactory(file)
|
||||
y = rf.tell()
|
||||
assert y == 1
|
||||
rf.read(1000)
|
||||
rf.seek(y)
|
||||
rf.nobuffer()
|
||||
assert rf.tell() == y
|
||||
test_reads(s, rf, y)
|
||||
rf.seek(y)
|
||||
test_reads(s, rf, y)
|
||||
assert rf.read() == s
|
||||
assert rf.read() == ""
|
||||
|
||||
# turn off buffering after partial reads
|
||||
file = StringIO("X" + s)
|
||||
file.read(1)
|
||||
rf = ReseekFileFactory(file)
|
||||
y = rf.tell()
|
||||
rf.read(5)
|
||||
rf.seek(y)
|
||||
rf.nobuffer()
|
||||
assert rf.read() == s
|
||||
|
||||
file = StringIO("X" + s)
|
||||
file.read(1)
|
||||
rf = ReseekFileFactory(file)
|
||||
y = rf.tell()
|
||||
t = rf.read(5)
|
||||
rf.seek(y)
|
||||
rf.nobuffer()
|
||||
assert rf.read(5) == t
|
||||
|
||||
file = StringIO("X" + s)
|
||||
file.read(1)
|
||||
rf = ReseekFileFactory(file)
|
||||
y = rf.tell()
|
||||
t = rf.read(5)
|
||||
assert t == s[:5]
|
||||
rf.seek(y)
|
||||
rf.nobuffer()
|
||||
assert rf.read(8) == s[:8]
|
||||
|
||||
file = StringIO("X" + s)
|
||||
file.read(1)
|
||||
rf = ReseekFileFactory(file)
|
||||
y = rf.tell()
|
||||
t = rf.read(5)
|
||||
assert t == s[:5]
|
||||
rf.nobuffer()
|
||||
assert rf.read(8) == s[5:5+8]
|
||||
|
||||
# Should only do this test on Unix systems
|
||||
import os
|
||||
infile = os.popen("echo HELLO_THERE")
|
||||
infile.read(1)
|
||||
rf = ReseekFileFactory(infile)
|
||||
y = rf.tell()
|
||||
assert rf.read(1) == "E"
|
||||
assert rf.read(2) == "LL"
|
||||
rf.seek(y)
|
||||
assert rf.read(4) == "ELLO"
|
||||
rf.seek(y)
|
||||
assert rf.read(1) == "E"
|
||||
rf.nobuffer()
|
||||
assert rf.read(1) == "L"
|
||||
assert rf.read(4) == "LO_T"
|
||||
assert rf.read(4) == "HERE"
|
||||
try:
|
||||
rf.seek(y)
|
||||
raise AssertionError("Cannot seek here!")
|
||||
except IOError:
|
||||
pass
|
||||
try:
|
||||
rf.tell()
|
||||
raise AssertionError("Cannot tell here!")
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
# Check if readline/readlines works
|
||||
s = "This is line 1.\nAnd line 2.\nAnd now, page 3!"
|
||||
file = StringIO(s)
|
||||
rf = ReseekFileFactory(file)
|
||||
rf.read(1)
|
||||
assert rf.readline() == "his is line 1.\n"
|
||||
rf.seek(0)
|
||||
assert rf.readline() == "This is line 1.\n"
|
||||
rf.read(2)
|
||||
assert rf.readline() == "d line 2.\n"
|
||||
rf.seek(0)
|
||||
assert rf.readlines() == ["This is line 1.\n",
|
||||
"And line 2.\n",
|
||||
"And now, page 3!"]
|
||||
|
||||
rf.seek(0)
|
||||
rf.read(len(s))
|
||||
assert rf.readlines() == []
|
||||
rf.seek(0)
|
||||
|
||||
# Now there is a final newline
|
||||
s = "This is line 1.\nAnd line 2.\nAnd now, page 3!\n"
|
||||
rf = ReseekFileFactory(StringIO(s))
|
||||
rf.read(1)
|
||||
rf.seek(0)
|
||||
rf.nobuffer()
|
||||
assert rf.readlines() == ["This is line 1.\n",
|
||||
"And line 2.\n",
|
||||
"And now, page 3!\n"]
|
||||
|
||||
def test():
|
||||
_test(ReseekFile)
|
||||
|
||||
# Test with a different backing store. Make sure that I'm
|
||||
# using the backing store.
|
||||
was_called = [0]
|
||||
def file_backed(infile):
|
||||
was_called[0] = 1
|
||||
return ReseekFile(infile, file_backed_tempfile)
|
||||
_test(file_backed)
|
||||
if not was_called[0]:
|
||||
raise AssertionError("file_backed_tempfile was not called")
|
||||
|
||||
import io
|
||||
f = io.StringIO("Andrew")
|
||||
g = ReseekFile(f, file_backed_tempfile)
|
||||
if not hasattr(g.buffer_file, "name"):
|
||||
raise AssertionError("backend file not created")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test()
|
||||
print("All tests passed.")
|
||||
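The deleted module's docstring describes the pattern it existed for: peek at the start of a non-seekable stream (for example, to spot an error banner) and then re-read it from the beginning. A rough modern sketch of that peek-then-parse pattern using io.BufferedReader follows; this is an assumption on my part rather than a drop-in replacement, since peeking is limited to one buffer's worth of data, unlike ReseekFile's unbounded buffer.

import io


def sniff_then_parse(raw_stream):
    """Peek at the first bytes for an error banner, then hand back a stream still at the start."""
    buffered = io.BufferedReader(raw_stream)
    head = buffered.peek(6)[:6]  # look ahead without consuming
    if head == b"ERROR:":
        raise RuntimeError(buffered.readline().decode().strip())
    return buffered


if __name__ == "__main__":
    doc = sniff_then_parse(io.BytesIO(b"<?xml version='1.0'?><doc/>"))
    print(doc.read())
    try:
        sniff_then_parse(io.BytesIO(b"ERROR: cannot do that\n"))
    except RuntimeError as exc:
        print("server said:", exc)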