Refactoring image size fetcher to be much simpler, albeit slower.

Samuel Clay 2021-03-03 13:50:22 -05:00
parent 72791b8e33
commit eac7e71b81
4 changed files with 22 additions and 508 deletions


@@ -34,11 +34,10 @@ from vendor import pynliner
from utils import log as logging
from utils import json_functions as json
from utils.feed_functions import relative_timesince, chunks
from utils.story_functions import truncate_chars, strip_tags, linkify, image_size
from utils.story_functions import truncate_chars, strip_tags, linkify
from utils.image_functions import ImageOps
from utils.scrubber import SelectiveScriptScrubber
from utils import s3_utils
from io import BytesIO
try:
    from apps.social.spam import detect_spammers
@@ -2340,14 +2339,7 @@ class MSharedStory(mongo.DynamicDocument):
        for image_source in self.image_urls[:10]:
            if any(ignore in image_source for ignore in IGNORE_IMAGE_SOURCES):
                continue
            req = requests.get(image_source, headers=headers, stream=True, timeout=10)
            try:
                datastream = BytesIO(req.content)
                width, height = ImageOps.image_size(datastream)
            except IOError as e:
                logging.debug(" ***> Couldn't read image: %s / %s" % (e, image_source))
                datastream = BytesIO(req.content[:100])
                _, width, height = image_size(datastream)
            width, height = ImageOps.image_size(image_source, headers=headers)
            # if width <= 16 or height <= 16:
            # continue
            image_sizes.append({'src': image_source, 'size': (width, height)})
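
After this change the caller no longer downloads the full image body itself; it hands the URL and request headers to ImageOps.image_size() and lets the helper fetch just enough data to read the dimensions. A rough sketch of exercising the new helper on its own (the URL and User-Agent below are made-up values, purely for illustration):

    from utils.image_functions import ImageOps

    # Placeholder values for illustration only.
    headers = {'User-Agent': 'NewsBlur image fetcher'}
    width, height = ImageOps.image_size('https://example.com/photo.jpg', headers=headers)
    if width is None:
        # The helper returns (None, None) when it can't determine the size.
        print('Could not determine image size')
    else:
        print('Image is %sx%s' % (width, height))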


@@ -1,10 +1,11 @@
"""Operations for images through the PIL."""
import urllib.request
from PIL import Image
from PIL import ImageFile
from PIL import ImageOps as PILOps
from PIL.ExifTags import TAGS
from io import BytesIO
from vendor import reseekfile
PROFILE_PICTURE_SIZES = {
'fullsize': (256, 256),
@@ -71,7 +72,21 @@ class ImageOps:
        return image

    @classmethod
    def image_size(cls, datastream):
        datastream = reseekfile.ReseekFile(datastream)
        image = Image.open(datastream)
        return image.size
    def image_size(cls, url, headers=None):
        if not headers: headers = {}
        req = urllib.request.Request(url, data=None, headers=headers)
        file = urllib.request.urlopen(req)
        size = file.headers.get("content-length")
        if size:
            size = int(size)
        p = ImageFile.Parser()
        while True:
            data = file.read(1024)
            if not data:
                break
            p.feed(data)
            if p.image:
                return p.image.size
                break
        file.close()
        return None, None
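
The replacement reads the remote image in 1 KB chunks and stops as soon as PIL's incremental parser can report the dimensions. A self-contained sketch of the same technique outside the NewsBlur helpers (probe_image_size, chunk_size, and max_bytes are illustrative names, and the URL in the trailing comment is a placeholder):

    import urllib.request
    from PIL import ImageFile

    def probe_image_size(url, chunk_size=1024, max_bytes=64 * 1024):
        # Feed the response to PIL a chunk at a time until it can report
        # (width, height); give up after max_bytes and return (None, None).
        parser = ImageFile.Parser()
        read = 0
        with urllib.request.urlopen(url) as response:
            while read < max_bytes:
                data = response.read(chunk_size)
                if not data:
                    break
                read += len(data)
                parser.feed(data)
                if parser.image:
                    return parser.image.size
        return None, None

    # width, height = probe_image_size('https://example.com/photo.png')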


@@ -16,7 +16,6 @@ from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django
from utils.tornado_escape import linkify as linkify_tornado
from utils.tornado_escape import xhtml_unescape as xhtml_unescape_tornado
from vendor import reseekfile
import feedparser
import hmac
@@ -284,68 +283,6 @@ def truncate_chars(value, max_length):
    return truncd_val.decode('utf-8', 'ignore') + "..."

def image_size(datastream):
    datastream = reseekfile.ReseekFile(datastream)
    data = str(datastream.read(30))
    size = len(data)
    height = -1
    width = -1
    content_type = ''
    # handle GIFs
    if (size >= 10) and data[:6] in ('GIF87a', 'GIF89a'):
        # Check to see if content_type is correct
        content_type = 'image/gif'
        w, h = struct.unpack("<HH", data[6:10])
        width = int(w)
        height = int(h)
    # See PNG 2. Edition spec (http://www.w3.org/TR/PNG/)
    # Bytes 0-7 are below, 4-byte chunk length, then 'IHDR'
    # and finally the 4-byte width, height
    elif ((size >= 24) and data.startswith('\211PNG\r\n\032\n')
          and (data[12:16] == 'IHDR')):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[16:24])
        width = int(w)
        height = int(h)
    # Maybe this is for an older PNG version.
    elif (size >= 16) and data.startswith('\211PNG\r\n\032\n'):
        # Check to see if we have the right content type
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[8:16])
        width = int(w)
        height = int(h)
    # handle JPEGs
    elif (size >= 2) and data.startswith('\377\330'):
        content_type = 'image/jpeg'
        datastream.seek(0)
        datastream.read(2)
        b = datastream.read(1)
        try:
            w = 0
            h = 0
            while (b and ord(b) != 0xDA):
                while (ord(b) != 0xFF): b = datastream.read(1)
                while (ord(b) == 0xFF): b = datastream.read(1)
                if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
                    datastream.read(3)
                    h, w = struct.unpack(">HH", datastream.read(4))
                    break
                else:
                    datastream.read(int(struct.unpack(">H", datastream.read(2))[0])-2)
                b = datastream.read(1)
            width = int(w)
            height = int(h)
        except struct.error:
            pass
        except ValueError:
            pass
    return content_type, width, height

def htmldiff(old_html, new_html):
    try:
        old_html_tokens = tokenize(old_html, include_hrefs=False)
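
The helper removed here sniffed the magic bytes at the start of the stream and pulled width and height straight out of the GIF/PNG/JPEG headers with struct; the ImageFile.Parser approach above replaces all of that format-specific logic. A minimal illustration of the idea it implemented, using a fabricated GIF header (the sample bytes and the 640x480 dimensions are made up for the example):

    import struct

    # A GIF starts with a 6-byte signature followed by the logical screen
    # descriptor: little-endian 16-bit width and height.
    sample = b'GIF89a' + struct.pack('<HH', 640, 480)
    if sample[:6] in (b'GIF87a', b'GIF89a'):
        width, height = struct.unpack('<HH', sample[6:10])
        print(width, height)  # 640 480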

vendor/reseekfile.py (vendored, 430 lines deleted)

@@ -1,430 +0,0 @@
"""Wrap a file handle to allow seeks back to the beginning
Sometimes data coming from a socket or other input file handle isn't
what it was supposed to be. For example, suppose you are reading from
a buggy server which is supposed to return an XML stream but can also
return an unformatted error message. (This often happens because the
server doesn't handle incorrect input very well.)
A ReseekFile helps solve this problem. It is a wrapper to the
original input stream but provides a buffer. Read requests to the
ReseekFile get forwarded to the input stream, appended to a buffer,
then returned to the caller. The buffer contains all the data read so
far.
The ReseekFile can be told to reseek to the start position. The next
read request will come from the buffer, until the buffer has been
read, in which case it gets the data from the input stream. This
newly read data is also appended to the buffer.
When buffering is no longer needed, use the 'nobuffer()' method. This
tells the ReseekFile that once it has read from the buffer it should
throw the buffer away. After nobuffer is called, the behaviour of
'seek' is no longer defined.
For example, suppose you have the server as above which either
gives an error message is of the form:
ERROR: cannot do that
or an XML data stream, starting with "<?xml".
infile = urllib2.urlopen("http://somewhere/")
infile = ReseekFile.ReseekFile(infile)
s = infile.readline()
if s.startswith("ERROR:"):
raise Exception(s[:-1])
infile.seek(0)
infile.nobuffer() # Don't buffer the data
... process the XML from infile ...
This module also implements 'prepare_input_source(source)' modeled on
xml.sax.saxutils.prepare_input_source. This opens a URL and if the
input stream is not already seekable, wraps it in a ReseekFile.
NOTE:
Don't use bound methods for the ReseekFile. When the buffer is
empty, the ReseekFile reassigns the input file's read/readlines/etc.
method as instance variable. This gives slightly better performance
at the cost of not allowing an infrequently used idiom.
Use tell() to get the beginning byte location. ReseekFile will
attempt to get the real position from the wrapped file and use that as
the beginning location. If the wrapped file does not support tell(),
ReseekFile.tell() will return 0.
readlines does not yet support a sizehint. Want to implement it?
The latest version of this code can be found at
http://www.dalkescientific.com/Python/
"""
# Started in 2003 by Andrew Dalke, Dalke Scientific Software, LLC.
# This software has been released to the public domain. No
# copyright is asserted.
## Changelog:
# 2005-11-06
# Use StringIO if cStringIO doesn't exist. Suggested by Howard Golden
# for use with non-CPython implementations.
# 2005-05-18
# Can specify a factory to specify how to create the temporary file.
# Factories for memory-based (cStringIO) and file-based storages
# Track the buffer file size so I don't depend on getvalue()
# Fixed a few typos
def memory_backed_tempfile():
try:
from io import StringIO
except ImportError:
from io import StringIO
return StringIO()
def file_backed_tempfile():
import tempfile
return tempfile.NamedTemporaryFile(mode="r+b")
class ReseekFile:
"""wrap a file handle to allow seeks back to the beginning
Takes a file handle in the constructor.
See the module docstring for more documentation.
"""
def __init__(self, file, tempfile_factory = memory_backed_tempfile):
self.file = file
self.buffer_file = tempfile_factory()
self.at_beginning = 1
try:
self.beginning = file.tell()
except (IOError, AttributeError):
self.beginning = 0
self._use_buffer = 1
self._buffer_size = 0
def seek(self, offset, whence = 0):
"""offset, whence = 0
Seek to a given byte position. Only supports whence == 0
and offset == the initial value of ReseekFile.tell() (which
is usually 0, but not always.)
"""
if whence != 0:
raise TypeError("Unexpected whence value of %s; expecting 0" % \
(whence,))
if offset != self.beginning:
raise TypeError("Unexpected offset value of %r; expecting '%s'" % \
(offset, self.beginning))
self.buffer_file.seek(0)
self.at_beginning = 1
def tell(self):
"""the current position of the file
The initial position may not be 0 if the underlying input
file supports tell and it not at position 0.
"""
if not self.at_beginning:
raise TypeError("ReseekFile cannot tell except at the beginning of file")
return self.beginning
def _read(self, size):
if size < 0:
y = self.file.read()
z = self.buffer_file.read() + y
if self._use_buffer:
self.buffer_file.write(y)
self._buffer_size += len(y)
return z
if size == 0:
return ""
x = self.buffer_file.read(size)
if len(x) < size:
y = self.file.read(size - len(x))
if self._use_buffer:
self.buffer_file.write(y)
self._buffer_size += len(y)
return x + y
return x
def read(self, size = -1):
"""read up to 'size' bytes from the file
Default is -1, which means to read to end of file.
"""
x = self._read(size)
if self.at_beginning and x:
self.at_beginning = 0
self._check_no_buffer()
return x
def readline(self):
"""read a line from the file"""
# Can we get it out of the buffer_file?
s = self.buffer_file.readline()
if s[-1:] == "\n":
return s
# No, so now we read a line from the input file
t = self.file.readline()
# Append the new data to the buffer, if still buffering
if self._use_buffer:
self.buffer_file.write(t)
self._buffer_size += len(t)
self._check_no_buffer()
return s + t
def readlines(self):
"""read all remaining lines from the file"""
s = self.read()
lines = []
i, j = 0, s.find("\n")
while j > -1:
lines.append(s[i:j+1])
i = j+1
j = s.find("\n", i)
if i < len(s):
# Only get here if the last line doesn't have a newline
lines.append(s[i:])
return lines
def _check_no_buffer(self):
# If 'nobuffer' called and finished with the buffer file
# then get rid of the buffer and redirect everything to
# the original input file.
if (self._use_buffer == 0 and
(self.buffer_file.tell() == self._buffer_size)):
# I'm doing this for the slightly better performance
self.seek = getattr(self.file, "seek", None)
self.tell = getattr(self.file, "tell", None)
self.read = self.file.read
self.readline = self.file.readline
self.readlines = self.file.readlines
del self.buffer_file
def nobuffer(self):
"""tell the ReseekFile to stop using the buffer once it's exhausted"""
self._use_buffer = 0
def prepare_input_source(source):
"""given a URL, returns a xml.sax.xmlreader.InputSource
Works like xml.sax.saxutils.prepare_input_source. Wraps the
InputSource in a ReseekFile if the URL returns a non-seekable
file.
To turn the buffer off if that happens, you'll need to do
something like
f = source.getCharacterStream()
...
try:
f.nobuffer()
except AttributeError:
pass
or
if isinstance(f, ReseekFile):
f.nobuffer()
"""
from xml.sax import saxutils
source = saxutils.prepare_input_source(source)
# Is this correct? Don't know - don't have Unicode experience
f = source.getCharacterStream() or source.getByteStream()
try:
f.tell()
except (AttributeError, IOError):
f = ReseekFile.ReseekFile(f)
source.setByteStream(f)
source.setCharacterStream(None)
return source
def test_reads(test_s, file, seek0):
assert file.read(2) == "Th"
assert file.read(3) == "is "
assert file.read(4) == "is a"
assert file.read(0) == ""
assert file.read(0) == ""
assert file.read(6) == " test."
file.seek(seek0)
assert file.read(2) == "Th"
assert file.read(3) == "is "
assert file.read(4) == "is a"
assert file.read(0) == ""
assert file.read(0) == ""
assert file.read(6) == " test."
assert file.read(1) == "\n"
assert file.read(5) == "12345"
assert file.read() == "67890\n"
file.seek(seek0)
assert file.read() == test_s
file.seek(seek0)
def _test(ReseekFileFactory):
from io import StringIO
s = "This is a test.\n1234567890\n"
file = StringIO(s)
# Test with a normal file
x = file.tell()
test_reads(s, file, x)
test_reads(s, file, x)
# Test with a ReseekFile wrapper
rf = ReseekFileFactory(file)
y = rf.tell()
rf.seek(y)
test_reads(s, rf, y)
assert rf.read() == s
assert rf.read() == ""
# Make sure the tell offset is correct (may not be 0)
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
test_reads(s, rf, y)
rf.seek(y)
test_reads(s, rf, y)
assert rf.read() == s
assert rf.read() == ""
# Test the ability to turn off buffering and have changes
# propogate correctly
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
assert y == 1
rf.read(1000)
rf.seek(y)
rf.nobuffer()
assert rf.tell() == y
test_reads(s, rf, y)
rf.seek(y)
test_reads(s, rf, y)
assert rf.read() == s
assert rf.read() == ""
# turn off buffering after partial reads
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
rf.read(5)
rf.seek(y)
rf.nobuffer()
assert rf.read() == s
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
t = rf.read(5)
rf.seek(y)
rf.nobuffer()
assert rf.read(5) == t
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
t = rf.read(5)
assert t == s[:5]
rf.seek(y)
rf.nobuffer()
assert rf.read(8) == s[:8]
file = StringIO("X" + s)
file.read(1)
rf = ReseekFileFactory(file)
y = rf.tell()
t = rf.read(5)
assert t == s[:5]
rf.nobuffer()
assert rf.read(8) == s[5:5+8]
# Should only do this test on Unix systems
import os
infile = os.popen("echo HELLO_THERE")
infile.read(1)
rf = ReseekFileFactory(infile)
y = rf.tell()
assert rf.read(1) == "E"
assert rf.read(2) == "LL"
rf.seek(y)
assert rf.read(4) == "ELLO"
rf.seek(y)
assert rf.read(1) == "E"
rf.nobuffer()
assert rf.read(1) == "L"
assert rf.read(4) == "LO_T"
assert rf.read(4) == "HERE"
try:
rf.seek(y)
raise AssertionError("Cannot seek here!")
except IOError:
pass
try:
rf.tell()
raise AssertionError("Cannot tell here!")
except IOError:
pass
# Check if readline/readlines works
s = "This is line 1.\nAnd line 2.\nAnd now, page 3!"
file = StringIO(s)
rf = ReseekFileFactory(file)
rf.read(1)
assert rf.readline() == "his is line 1.\n"
rf.seek(0)
assert rf.readline() == "This is line 1.\n"
rf.read(2)
assert rf.readline() == "d line 2.\n"
rf.seek(0)
assert rf.readlines() == ["This is line 1.\n",
"And line 2.\n",
"And now, page 3!"]
rf.seek(0)
rf.read(len(s))
assert rf.readlines() == []
rf.seek(0)
# Now there is a final newline
s = "This is line 1.\nAnd line 2.\nAnd now, page 3!\n"
rf = ReseekFileFactory(StringIO(s))
rf.read(1)
rf.seek(0)
rf.nobuffer()
assert rf.readlines() == ["This is line 1.\n",
"And line 2.\n",
"And now, page 3!\n"]
def test():
_test(ReseekFile)
# Test with a different backing store. Make sure that I'm
# using the backing store.
was_called = [0]
def file_backed(infile):
was_called[0] = 1
return ReseekFile(infile, file_backed_tempfile)
_test(file_backed)
if not was_called[0]:
raise AssertionError("file_backed_tempfile was not called")
import io
f = io.StringIO("Andrew")
g = ReseekFile(f, file_backed_tempfile)
if not hasattr(g.buffer_file, "name"):
raise AssertionError("backend file not created")
if __name__ == "__main__":
test()
print("All tests passed.")