Refactoring image size fetcher to be much simpler, albeit slower.

Samuel Clay 2021-03-03 13:50:22 -05:00
parent 72791b8e33
commit eac7e71b81
4 changed files with 22 additions and 508 deletions


@@ -34,11 +34,10 @@ from vendor import pynliner
from utils import log as logging
from utils import json_functions as json
from utils.feed_functions import relative_timesince, chunks
from utils.story_functions import truncate_chars, strip_tags, linkify, image_size
from utils.story_functions import truncate_chars, strip_tags, linkify
from utils.image_functions import ImageOps
from utils.scrubber import SelectiveScriptScrubber
from utils import s3_utils
from io import BytesIO
try:
    from apps.social.spam import detect_spammers
@@ -2340,14 +2339,7 @@ class MSharedStory(mongo.DynamicDocument):
        for image_source in self.image_urls[:10]:
            if any(ignore in image_source for ignore in IGNORE_IMAGE_SOURCES):
                continue
            req = requests.get(image_source, headers=headers, stream=True, timeout=10)
            try:
                datastream = BytesIO(req.content)
                width, height = ImageOps.image_size(datastream)
            except IOError as e:
                logging.debug(" ***> Couldn't read image: %s / %s" % (e, image_source))
                datastream = BytesIO(req.content[:100])
                _, width, height = image_size(datastream)
            width, height = ImageOps.image_size(image_source, headers=headers)
            # if width <= 16 or height <= 16:
            # continue
            image_sizes.append({'src': image_source, 'size': (width, height)})
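
After this change the caller no longer downloads the full image body itself; it hands the URL and request headers to ImageOps.image_size() and lets the helper fetch just enough data to read the dimensions. A rough sketch of exercising the new helper on its own (the URL and User-Agent below are made-up values, purely for illustration):

    from utils.image_functions import ImageOps

    # Placeholder values for illustration only.
    headers = {'User-Agent': 'NewsBlur image fetcher'}
    width, height = ImageOps.image_size('https://example.com/photo.jpg', headers=headers)
    if width is None:
        # The helper returns (None, None) when it can't determine the size.
        print('Could not determine image size')
    else:
        print('Image is %sx%s' % (width, height))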


@@ -1,10 +1,11 @@
"""Operations for images through the PIL."""
import urllib.request
from PIL import Image
from PIL import ImageFile
from PIL import ImageOps as PILOps
from PIL.ExifTags import TAGS
from io import BytesIO
from vendor import reseekfile
PROFILE_PICTURE_SIZES = {
'fullsize': (256, 256),
@@ -71,7 +72,21 @@ class ImageOps:
        return image

    @classmethod
    def image_size(cls, datastream):
        datastream = reseekfile.ReseekFile(datastream)
        image = Image.open(datastream)
        return image.size
    def image_size(cls, url, headers=None):
        if not headers: headers = {}
        req = urllib.request.Request(url, data=None, headers=headers)
        file = urllib.request.urlopen(req)
        size = file.headers.get("content-length")
        if size:
            size = int(size)
        p = ImageFile.Parser()
        while True:
            data = file.read(1024)
            if not data:
                break
            p.feed(data)
            if p.image:
                return p.image.size
                break
        file.close()
        return None, None
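
The replacement reads the remote image in 1 KB chunks and stops as soon as PIL's incremental parser can report the dimensions. A self-contained sketch of the same technique outside the NewsBlur helpers (probe_image_size, chunk_size, and max_bytes are illustrative names, and the URL in the trailing comment is a placeholder):

    import urllib.request
    from PIL import ImageFile

    def probe_image_size(url, chunk_size=1024, max_bytes=64 * 1024):
        # Feed the response to PIL a chunk at a time until it can report
        # (width, height); give up after max_bytes and return (None, None).
        parser = ImageFile.Parser()
        read = 0
        with urllib.request.urlopen(url) as response:
            while read < max_bytes:
                data = response.read(chunk_size)
                if not data:
                    break
                read += len(data)
                parser.feed(data)
                if parser.image:
                    return parser.image.size
        return None, None

    # width, height = probe_image_size('https://example.com/photo.png')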


@@ -16,7 +16,6 @@ from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django
from utils.tornado_escape import linkify as linkify_tornado
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
from vendor import reseekfile
import feedparser
import hmac
@@ -284,68 +283,6 @@ def truncate_chars(value, max_length):
    return truncd_val.decode('utf-8', 'ignore') + "..."

def image_size(datastream):
    datastream = reseekfile.ReseekFile(datastream)
    data = str(datastream.read(30))
    size = len(data)
    height = -1
    width = -1
    content_type = ''
    # handle GIFs
    if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):
        # Check to see if content_type is correct
        content_type = 'image/gif'
        w, h = struct.unpack("<HH", data[6:10])
        width = int(w)
        height = int(h)
    # See PNG 2. Edition spec (http://www.w3.org/TR/PNG/)
    # Bytes 0-7 are below, 4-byte chunk length, then 'IHDR'
    # and finally the 4-byte width, height
    elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')
          and (data[12:16] == 'IHDR')):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[16:24])
        width = int(w)
        height = int(h)
    # Maybe this is for an older PNG version.
    elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'):
        # Check to see if we have the right content type
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[8:16])
        width = int(w)
        height = int(h)
    # handle JPEGs
    elif (size >= 2) and data.startswith('\377\330'):
        content_type = 'image/jpeg'
        datastream.seek(0)
        datastream.read(2)
        b = datastream.read(1)
        try:
            w = 0
            h = 0
            while (b and ord(b) != 0xDA):
                while (ord(b) != 0xFF): b = datastream.read(1)
                while (ord(b) == 0xFF): b = datastream.read(1)
                if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
                    datastream.read(3)
                    h, w = struct.unpack(">HH", datastream.read(4))
                    break
                else:
                    datastream.read(int(struct.unpack(">H", datastream.read(2))[0])-2)
                b = datastream.read(1)
            width = int(w)
            height = int(h)
        except struct.error:
            pass
        except ValueError:
            pass
    return content_type, width, height

def htmldiff(old_html, new_html):
    try:
        old_html_tokens = tokenize(old_html, include_hrefs=False)
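
The helper removed here sniffed the magic bytes at the start of the stream and pulled width and height straight out of the GIF/PNG/JPEG headers with struct; the ImageFile.Parser approach above replaces all of that format-specific logic. A minimal illustration of the idea it implemented, using a fabricated GIF header (the sample bytes and the 640x480 dimensions are made up for the example):

    import struct

    # A GIF starts with a 6-byte signature followed by the logical screen
    # descriptor: little-endian 16-bit width and height.
    sample = b'GIF89a' + struct.pack('<HH', 640, 480)
    if sample[:6] in (b'GIF87a', b'GIF89a'):
        width, height = struct.unpack('<HH', sample[6:10])
        print(width, height)  # 640 480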

vendor/reseekfile.py (vendored, 430 lines deleted)

@@ -1,430 +0,0 @@
"""Wrap a file handle to allow seeks back to the beginning
Sometimes data coming from a socket or other input file handle isn't
what it was supposed to be. For example, suppose you are reading from
a buggy server which is supposed to return an XML stream but can also
return an unformatted error message. (This often happens because the
server doesn't handle incorrect input very well.)
A ReseekFile helps solve this problem. It is a wrapper to the
original input stream but provides a buffer. Read requests to the
ReseekFile get forwarded to the input stream, appended to a buffer,
then returned to the caller. The buffer contains all the data read so
far.
The ReseekFile can be told to reseek to the start position. The next
read request will come from the buffer, until the buffer has been
read, in which case it gets the data from the input stream. This
newly read data is also appended to the buffer.
When buffering is no longer needed, use the 'nobuffer()' method. This
tells the ReseekFile that once it has read from the buffer it should
throw the buffer away. After nobuffer is called, the behaviour of
'seek' is no longer defined.
For example, suppose you have the server as above which either
gives an error message is of the form:
ERROR: cannot do that
or an XML data stream, starting with "<?xml".
infile = urllib2.urlopen("http://somewhere/")
infile = ReseekFile.ReseekFile(infile)
s = infile.readline()
if s.startswith("ERROR:"):
raise Exception(s[:-1])
infile.seek(0)
infile.nobuffer() # Don't buffer the data
... process the XML from infile ...
This module also implements 'prepare_input_source(source)' modeled on
xml.sax.saxutils.prepare_input_source. This opens a URL and if the
input stream is not already seekable, wraps it in a ReseekFile.
NOTE:
Don't use bound methods for the ReseekFile. When the buffer is
empty, the ReseekFile reassigns the input file's read/readlines/etc.
method as instance variable. This gives slightly better performance
at the cost of not allowing an infrequently used idiom.
Use tell() to get the beginning byte location. ReseekFile will
attempt to get the real position from the wrapped file and use that as
the beginning location. If the wrapped file does not support tell(),
ReseekFile.tell() will return 0.
readlines does not yet support a sizehint. Want to implement it?
The latest version of this code can be found at
http://www.dalkescientific.com/Python/
"""
# Started in 2003 by Andrew Dalke, Dalke Scientific Software, LLC.
# This software has been released to the public domain. No
# copyright is asserted.
## Changelog:
# 2005-11-06
# Use StringIO if cStringIO doesn't exist. Suggested by Howard Golden
# for use with non-CPython implementations.
# 2005-05-18
# Can specify a factory to specify how to create the temporary file.
# Factories for memory-based (cStringIO) and file-based storages
# Track the buffer file size so I don't depend on getvalue()
# Fixed a few typos
def memory_backed_tempfile():
try:
from io import StringIO
except ImportError:
from io import StringIO
return StringIO()
def file_backed_tempfile():
import tempfile
return tempfile.NamedTemporaryFile(mode="r+b")
class ReseekFile:
"""wrap a file handle to allow seeks back to the beginning
Takes a file handle in the constructor.
See the module docstring for more documentation.
"""
def __init__(self, file, tempfile_factory = memory_backed_tempfile):
self.file = file
self.buffer_file = tempfile_factory()
self.at_beginning = 1
try:
self.beginning = file.tell()
except (IOError, AttributeError):
self.beginning = 0
self._use_buffer = 1
self._buffer_size = 0
def seek(self, offset, whence = 0):
"""offset, whence = 0
Seek to a given byte position. Only supports whence == 0
and offset == the initial value of ReseekFile.tell() (which
is usually 0, but not always.)
"""
if whence != 0:
raise TypeError("Unexpected whence value of %s; expecting 0" % \
(whence,))
if offset != self.beginning:
raise TypeError("Unexpected offset value of %r; expecting '%s'" % \
(offset, self.beginning))
self.buffer_file.seek(0)
self.at_beginning = 1
def tell(self):
"""the current position of the file
The initial position may not be 0 if the underlying input
file supports tell and it not at position 0.
"""
if not self.at_beginning:
raise TypeError("ReseekFile cannot tell except at the beginning of file")
return self.beginning
def _read(self, size):
if size < 0:
y = self.file.read()
z = self.buffer_file.read() + y
if self._use_buffer:
self.buffer_file.write(y)
self._buffer_size += len(y)
return z
if size == 0:
return ""
x = self.buffer_file.read(size)
if len(x) < size:
y = self.file.read(size - len(x))
if self._use_buffer:
self.buffer_file.write(y)
self._buffer_size += len(y)
return x + y
return x
def read(self, size = -1):
"""read up to 'size' bytes from the file
Default is -1, which means to read to end of file.
"""
x = self._read(size)
if self.at_beginning and x:
self.at_beginning = 0
self._check_no_buffer()
return x
def readline(self):
"""read a line from the file"""
# Can we get it out of the buffer_file?
s = self.buffer_file.readline()
if s[-1:] == "\n":
return s
# No, so now we read a line from the input file
t = self.file.readline()
# Append the new data to the buffer, if still buffering
if self._use_buffer:
self.buffer_file.write(t)
self._buffer_size += len(t)
self._check_no_buffer()
return s + t
def readlines(self):
"""read all remaining lines from the file"""
s = self.read()
lines = []
i, j = 0, s.find("\n")
while j > -1:
lines.append(s[i:j+1])
i = j+1
j = s.find("\n", i)
if i < len(s):
# Only get here if the last line doesn't have a newline
lines.append(s[i:])
return lines
def _check_no_buffer(self):
# If 'nobuffer' called and finished with the buffer file
# then get rid of the buffer and redirect everything to
# the original input file.
if (self._use_buffer == 0 and
(self.buffer_file.tell() == self._buffer_size)):
# I'm doing this for the slightly better performance
self.seek = getattr(self.file, "seek", None)
self.tell = getattr(self.file, "tell", None)
self.read = self.file.read
self.readline = self.file.readline
self.readlines = self.file.readlines
del self.buffer_file
def nobuffer(self):
"""tell the ReseekFile to stop using the buffer once it's exhausted"""
self._use_buffer = 0
def prepare_input_source(source):
"""given a URL, returns a xml.sax.xmlreader.InputSource
Works like xml.sax.saxutils.prepare_input_source. Wraps the
InputSource in a ReseekFile if the URL returns a non-seekable
file.
To turn the buffer off if that happens, you'll need to do
something like
f = source.getCharacterStream()
...
try:
f.nobuffer()
except AttributeError:
pass
or
if isinstance(f, ReseekFile):
f.nobuffer()
"""
from xml.sax import saxutils
source = saxutils.prepare_input_source(source)
# Is this correct? Don't know - don't have Unicode experience
f = source.getCharacterStream() or source.getByteStream()
try:
f.tell()
except (AttributeError, IOError):
f = ReseekFile.ReseekFile(f)
source.setByteStream(f)
source.setCharacterStream(None)
return source
def test_reads(test_s, file, seek0):
assert file.read(2) == "Th"
assert file.read(3) == "is "
assert file.read(4) == "is a"
assert file.read(0) == ""
assert file.read(0) == ""
assert file.read(6) == " test."
file.seek(seek0)
assert file.read(2) == "Th"
assert file.read(3) == "is "
assert file.read(4) == "is a"
assert file.read(0) == ""
assert file.read(0) == ""
assert file.read(6) == " test."
assert file.read(1) == "\n"
assert file.read(5) == "12345"
assert file.read() == "67890\n"
file.seek(seek0)
assert file.read() == test_s
file.seek(seek0)
def _test(ReseekFileFactory):
from io import StringIO
s = "This is a test.\n1234567890\n"
file = StringIO(s)
# Test with a normal file
x = file.tell()
test_reads(s, file, x)
test_reads(s, file, x)
# Test with a ReseekFile wrapper
rf = ReseekFileFactory(file)
y = rf.tell()
rf.seek(y)
test_reads(s, rf, y)
assert rf.read() == s
assert rf.read() == ""
# Make sure the tell offset is correct (may not be 0)
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
test_reads(s, rf, y)
rf.seek(y)
test_reads(s, rf, y)
assert rf.read() == s
assert rf.read() == ""
# Test the ability to turn off buffering and have changes
# propogate correctly
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
assert y == 1
rf.read(1000)
rf.seek(y)
rf.nobuffer()
assert rf.tell() == y
test_reads(s, rf, y)
rf.seek(y)
test_reads(s, rf, y)
assert rf.read() == s
assert rf.read() == ""
# turn off buffering after partial reads
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
rf.read(5)
rf.seek(y)
rf.nobuffer()
assert rf.read() == s
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
t = rf.read(5)
rf.seek(y)
rf.nobuffer()
assert rf.read(5) == t
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
t = rf.read(5)
assert t == s[:5]
rf.seek(y)
rf.nobuffer()
assert rf.read(8) == s[:8]
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
t = rf.read(5)
assert t == s[:5]
rf.nobuffer()
assert rf.read(8) == s[5:5+8]
# Should only do this test on Unix systems
import os
infile = os.popen("echo HELLO_THERE")
infile.read(1)
rf = ReseekFileFactory(infile)
y = rf.tell()
assert rf.read(1) == "E"
assert rf.read(2) == "LL"
rf.seek(y)
assert rf.read(4) == "ELLO"
rf.seek(y)
assert rf.read(1) == "E"
rf.nobuffer()
assert rf.read(1) == "L"
assert rf.read(4) == "LO_T"
assert rf.read(4) == "HERE"
try:
rf.seek(y)
raise AssertionError("Cannot seek here!")
except IOError:
pass
try:
rf.tell()
raise AssertionError("Cannot tell here!")
except IOError:
pass
# Check if readline/readlines works
s = "This is line 1.\nAnd line 2.\nAnd now, page 3!"
file = StringIO(s)
rf = ReseekFileFactory(file)
rf.read(1)
assert rf.readline() == "his is line 1.\n"
rf.seek(0)
assert rf.readline() == "This is line 1.\n"
rf.read(2)
assert rf.readline() == "d line 2.\n"
rf.seek(0)
assert rf.readlines() == ["This is line 1.\n",
"And line 2.\n",
"And now, page 3!"]
rf.seek(0)
rf.read(len(s))
assert rf.readlines() == []
rf.seek(0)
# Now there is a final newline
s = "This is line 1.\nAnd line 2.\nAnd now, page 3!\n"
rf = ReseekFileFactory(StringIO(s))
rf.read(1)
rf.seek(0)
rf.nobuffer()
assert rf.readlines() == ["This is line 1.\n",
"And line 2.\n",
"And now, page 3!\n"]
def test():
_test(ReseekFile)
# Test with a different backing store. Make sure that I'm
# using the backing store.
was_called = [0]
def file_backed(infile):
was_called[0] = 1
return ReseekFile(infile, file_backed_tempfile)
_test(file_backed)
if not was_called[0]:
raise AssertionError("file_backed_tempfile was not called")
import io
f = io.StringIO("Andrew")
g = ReseekFile(f, file_backed_tempfile)
if not hasattr(g.buffer_file, "name"):
raise AssertionError("backend file not created")
if __name__ == "__main__":
test()
print("All tests passed.")