"""Wrap a file handle to allow seeks back to the beginning

Sometimes data coming from a socket or other input file handle isn't
what it was supposed to be.  For example, suppose you are reading from
a buggy server which is supposed to return an XML stream but can also
return an unformatted error message.  (This often happens because the
server doesn't handle incorrect input very well.)

A ReseekFile helps solve this problem.  It is a wrapper to the
original input stream but provides a buffer.  Read requests to the
ReseekFile get forwarded to the input stream, appended to a buffer,
then returned to the caller.  The buffer contains all the data read so
far.

The ReseekFile can be told to reseek to the start position.  The next
read request will come from the buffer, until the buffer has been
read, in which case it gets the data from the input stream.  This
newly read data is also appended to the buffer.

When buffering is no longer needed, use the 'nobuffer()' method.  This
tells the ReseekFile that once it has read from the buffer it should
throw the buffer away.  After nobuffer is called, the behaviour of
'seek' is no longer defined.

For example, suppose you have the server as above which either gives
an error message of the form:

  ERROR: cannot do that

or an XML data stream, starting with "

[The remainder of this docstring is missing from this copy of the file.]
"""

# NOTE(review): this copy of the file is truncated.  Everything between the
# point marked in the module docstring above and the tail of
# ReseekFile.readlines() below is missing.  The code that follows refers to
# names that are not defined in the surviving text: ReseekFile's constructor
# and its seek/tell/read/readline methods, the `_buffer_size`, `buffer_file`,
# `file` and `_use_buffer` attributes, and `file_backed_tempfile`.  Restore
# that section from the upstream source before using this module.


class ReseekFile:
    """Wrap a file handle to allow seeks back to the beginning.

    See the module docstring for the intended behaviour.
    """
    # NOTE(review): the class header above is reconstructed only so that the
    # surviving methods have a home; the original header, constructor and
    # earlier methods are in the missing section and must be restored.

    # NOTE(review): surviving tail of readlines(), kept verbatim.  The
    # method's `def` line and the start of its body are missing, so it is
    # left as a comment rather than guessed at:
    #
    #         ... -1:
    #             lines.append(s[i:j+1])
    #             i = j+1
    #             j = s.find("\n", i)
    #         if i < len(s):
    #             # Only get here if the last line doesn't have a newline
    #             lines.append(s[i:])
    #         return lines

    def _check_no_buffer(self):
        """Bypass the buffer once it is no longer needed.

        When nobuffer() has been called (``_use_buffer == 0``) and the
        buffer file's position equals ``_buffer_size``, rebind this
        instance's seek/tell/read/readline/readlines to the wrapped
        file's own methods and delete ``buffer_file``.  ``seek`` and
        ``tell`` become None if the wrapped file does not have them.
        """
        # If 'nobuffer' called and finished with the buffer file
        # then get rid of the buffer and redirect everything to
        # the original input file.
        if (self._use_buffer == 0 and
                (self.buffer_file.tell() == self._buffer_size)):
            # I'm doing this for the slightly better performance
            self.seek = getattr(self.file, "seek", None)
            self.tell = getattr(self.file, "tell", None)
            self.read = self.file.read
            self.readline = self.file.readline
            self.readlines = self.file.readlines
            del self.buffer_file

    def nobuffer(self):
        """tell the ReseekFile to stop using the buffer once it's exhausted"""
        self._use_buffer = 0


def prepare_input_source(source):
    """given a URL, returns a xml.sax.xmlreader.InputSource

    Works like xml.sax.saxutils.prepare_input_source.  Wraps the
    InputSource in a ReseekFile if the URL returns a non-seekable
    file.

    When the stream is wrapped, the ReseekFile is installed as the
    source's byte stream and the character stream is cleared.  To turn
    the buffer off if that happens, you'll need to do something like

        f = source.getCharacterStream() or source.getByteStream()
        ...
        try:
            f.nobuffer()
        except AttributeError:
            pass

    or

        if isinstance(f, ReseekFile):
            f.nobuffer()
    """
    from xml.sax import saxutils
    source = saxutils.prepare_input_source(source)
    # Is this correct?  Don't know - don't have Unicode experience
    f = source.getCharacterStream() or source.getByteStream()
    try:
        f.tell()
    except (AttributeError, IOError):
        # Inside this module `ReseekFile` is the class itself, not the
        # module, so it must be called directly; the former
        # `ReseekFile.ReseekFile(f)` raised AttributeError here.
        f = ReseekFile(f)
        source.setByteStream(f)
        source.setCharacterStream(None)
    return source


def test_reads(test_s, file, seek0):
    """Check read() and seek() on `file` against fixed expected chunks.

    The assertions expect the stream to yield
    "This is a test.\\n1234567890\\n" starting at position `seek0`, and
    a full read() from `seek0` to equal `test_s`.  The file is left
    positioned at `seek0`.
    """
    assert file.read(2) == "Th"
    assert file.read(3) == "is "
    assert file.read(4) == "is a"
    assert file.read(0) == ""
    assert file.read(0) == ""
    assert file.read(6) == " test."
    file.seek(seek0)
    assert file.read(2) == "Th"
    assert file.read(3) == "is "
    assert file.read(4) == "is a"
    assert file.read(0) == ""
    assert file.read(0) == ""
    assert file.read(6) == " test."
    assert file.read(1) == "\n"
    assert file.read(5) == "12345"
    assert file.read() == "67890\n"
    file.seek(seek0)
    assert file.read() == test_s
    file.seek(seek0)


def _test(ReseekFileFactory):
    """Run the read/seek/nobuffer/readline checks against a factory.

    `ReseekFileFactory` is called with an input file object and must
    return the wrapper to be tested.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        # cStringIO is Python 2 only
        from io import StringIO
    s = "This is a test.\n1234567890\n"
    file = StringIO(s)
    # Test with a normal file
    x = file.tell()
    test_reads(s, file, x)
    test_reads(s, file, x)

    # Test with a ReseekFile wrapper
    rf = ReseekFileFactory(file)
    y = rf.tell()
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # Make sure the tell offset is correct (may not be 0)
    file = StringIO("X" + s)
    file.read(1)
    rf = ReseekFileFactory(file)
    y = rf.tell()
    test_reads(s, rf, y)
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # Test the ability to turn off buffering and have changes
    # propagate correctly
    file = StringIO("X" + s)
    file.read(1)
    rf = ReseekFileFactory(file)
    y = rf.tell()
    assert y == 1
    rf.read(1000)
    rf.seek(y)
    rf.nobuffer()
    assert rf.tell() == y
    test_reads(s, rf, y)
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # turn off buffering after partial reads
    file = StringIO("X" + s)
    file.read(1)
    rf = ReseekFileFactory(file)
    y = rf.tell()
    rf.read(5)
    rf.seek(y)
    rf.nobuffer()
    assert rf.read() == s

    file = StringIO("X" + s)
    file.read(1)
    rf = ReseekFileFactory(file)
    y = rf.tell()
    t = rf.read(5)
    rf.seek(y)
    rf.nobuffer()
    assert rf.read(5) == t

    file = StringIO("X" + s)
    file.read(1)
    rf = ReseekFileFactory(file)
    y = rf.tell()
    t = rf.read(5)
    assert t == s[:5]
    rf.seek(y)
    rf.nobuffer()
    assert rf.read(8) == s[:8]

    file = StringIO("X" + s)
    file.read(1)
    rf = ReseekFileFactory(file)
    y = rf.tell()
    t = rf.read(5)
    assert t == s[:5]
    rf.nobuffer()
    assert rf.read(8) == s[5:5+8]

    # Should only do this test on Unix systems
    import os
    infile = os.popen("echo HELLO_THERE")
    infile.read(1)
    rf = ReseekFileFactory(infile)
    y = rf.tell()
    assert rf.read(1) == "E"
    assert rf.read(2) == "LL"
    rf.seek(y)
    assert rf.read(4) == "ELLO"
    rf.seek(y)
    assert rf.read(1) == "E"
    rf.nobuffer()
    assert rf.read(1) == "L"
    assert rf.read(4) == "LO_T"
    assert rf.read(4) == "HERE"
    # Once the buffer is gone, seek/tell go to the pipe, which must refuse.
    try:
        rf.seek(y)
        raise AssertionError("Cannot seek here!")
    except IOError:
        pass
    try:
        rf.tell()
        raise AssertionError("Cannot tell here!")
    except IOError:
        pass

    # Check if readline/readlines works
    s = "This is line 1.\nAnd line 2.\nAnd now, page 3!"
    file = StringIO(s)
    rf = ReseekFileFactory(file)
    rf.read(1)
    assert rf.readline() == "his is line 1.\n"
    rf.seek(0)
    assert rf.readline() == "This is line 1.\n"
    rf.read(2)
    assert rf.readline() == "d line 2.\n"
    rf.seek(0)
    assert rf.readlines() == ["This is line 1.\n",
                              "And line 2.\n",
                              "And now, page 3!"]

    rf.seek(0)
    rf.read(len(s))
    assert rf.readlines() == []
    rf.seek(0)

    # Now there is a final newline
    s = "This is line 1.\nAnd line 2.\nAnd now, page 3!\n"
    rf = ReseekFileFactory(StringIO(s))
    rf.read(1)
    rf.seek(0)
    rf.nobuffer()
    assert rf.readlines() == ["This is line 1.\n",
                              "And line 2.\n",
                              "And now, page 3!\n"]


def test():
    """Run the self-tests with the default and the tempfile backing store."""
    _test(ReseekFile)

    # Test with a different backing store.  Make sure that I'm
    # using the backing store.
    was_called = [0]

    def file_backed(infile):
        was_called[0] = 1
        return ReseekFile(infile, file_backed_tempfile)

    _test(file_backed)
    if not was_called[0]:
        raise AssertionError("file_backed_tempfile was not called")

    try:
        from cStringIO import StringIO
    except ImportError:
        # cStringIO is Python 2 only
        from io import StringIO
    f = StringIO("Andrew")
    g = ReseekFile(f, file_backed_tempfile)
    if not hasattr(g.buffer_file, "name"):
        raise AssertionError("backend file not created")


if __name__ == "__main__":
    test()
    # Parenthesised form prints the same text on Python 2 and also parses
    # on Python 3.
    print("All tests passed.")