"""Functions that read and write gzipped files. The user of the file doesn't have to worry about the compression, but random access is not allowed.""" # based on Andrew Kuchling's minigzip.py distributed with the zlib module import struct, sys, time, os import zlib import io import __builtin__ __all__ = ["GzipFile","open"] FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 READ, WRITE = 1, 2 def write32u(output, value): # The L format writes the bit pattern correctly whether signed # or unsigned. output.write(struct.pack("' def _check_closed(self): """Raises a ValueError if the underlying file object has been closed. """ if self.closed: raise ValueError('I/O operation on closed file.') def _init_write(self, filename): self.name = filename self.crc = zlib.crc32("") & 0xffffffffL self.size = 0 self.writebuf = [] self.bufsize = 0 def _write_gzip_header(self): self.fileobj.write('\037\213') # magic header self.fileobj.write('\010') # compression method fname = os.path.basename(self.name) if fname.endswith(".gz"): fname = fname[:-3] flags = 0 if fname: flags = FNAME self.fileobj.write(chr(flags)) mtime = self.mtime if mtime is None: mtime = time.time() write32u(self.fileobj, long(mtime)) self.fileobj.write('\002') self.fileobj.write('\377') if fname: self.fileobj.write(fname + '\000') def _init_read(self): self.crc = zlib.crc32("") & 0xffffffffL self.size = 0 def _read_exact(self, n): data = self.fileobj.read(n) while len(data) < n: b = self.fileobj.read(n - len(data)) if not b: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") data += b return data def _read_gzip_header(self): magic = self.fileobj.read(2) if magic != '\037\213': raise IOError, 'Not a gzipped file' method, flag, self.mtime = struct.unpack(" 0: self.size = self.size + len(data) self.crc = zlib.crc32(data, self.crc) & 0xffffffffL self.fileobj.write( self.compress.compress(data) ) self.offset += len(data) return len(data) def read(self, size=-1): self._check_closed() if self.mode != READ: import errno raise IOError(errno.EBADF, "read() on write-only GzipFile object") if self.extrasize <= 0 and self.fileobj is None: return '' readsize = 1024 if size < 0: # get the whole thing while self._read(readsize): readsize = min(self.max_read_chunk, readsize * 2) size = self.extrasize else: # just get some more of it while size > self.extrasize: if not self._read(readsize): if size > self.extrasize: size = self.extrasize break readsize = min(self.max_read_chunk, readsize * 2) offset = self.offset - self.extrastart chunk = self.extrabuf[offset: offset + size] self.extrasize = self.extrasize - size self.offset += size return chunk def _unread(self, buf): self.extrasize = len(buf) + self.extrasize self.offset -= len(buf) def _read(self, size=1024): if self.fileobj is None: return False if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. # # First, check if we're at the end of the file; # if so, it's time to stop; no more members to read. pos = self.fileobj.tell() # Save current position self.fileobj.seek(0, 2) # Seek to end of file if pos == self.fileobj.tell(): return False else: self.fileobj.seek( pos ) # Return to original position self._init_read() self._read_gzip_header() self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) self._new_member = False # Read a chunk of data from the file buf = self.fileobj.read(size) # If the EOF has been reached, flush the decompression object # and mark this object as finished. if buf == "": uncompress = self.decompress.flush() self.fileobj.seek(-len(self.decompress.unused_data), 1) self._read_eof() self._add_read_data( uncompress ) return False uncompress = self.decompress.decompress(buf) self._add_read_data( uncompress ) if self.decompress.unused_data != "": # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. # (The number of bytes to seek back is the length of the unused # data) self.fileobj.seek(-len(self.decompress.unused_data), 1) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() self._new_member = True return True def _add_read_data(self, data): self.crc = zlib.crc32(data, self.crc) & 0xffffffffL offset = self.offset - self.extrastart self.extrabuf = self.extrabuf[offset:] + data self.extrasize = self.extrasize + len(data) self.extrastart = self.offset self.size = self.size + len(data) def _read_eof(self): # We've read to the end of the file. # We check the that the computed CRC and size of the # uncompressed data matches the stored values. Note that the size # stored is the true file size mod 2**32. crc32, isize = struct.unpack(" 0: self.extrasize -= i - offset self.offset += i - offset return self.extrabuf[offset: i] size = sys.maxint readsize = self.min_readsize else: readsize = size bufs = [] while size != 0: c = self.read(readsize) i = c.find('\n') # We set i=size to break out of the loop under two # conditions: 1) there's no newline, and the chunk is # larger than size, or 2) there is a newline, but the # resulting line would be longer than 'size'. if (size <= i) or (i == -1 and len(c) > size): i = size - 1 if i >= 0 or c == '': bufs.append(c[:i + 1]) # Add portion of last chunk self._unread(c[i + 1:]) # Push back rest of chunk break # Append chunk to list, decrease 'size', bufs.append(c) size = size - len(c) readsize = min(size, readsize * 2) if readsize > self.min_readsize: self.min_readsize = min(readsize, self.min_readsize * 2, 512) return ''.join(bufs) # Return resulting line def _test(): # Act like gzip; with -d, act like gunzip. # The input file is not deleted, however, nor are any other gzip # options or features supported. args = sys.argv[1:] decompress = args and args[0] == "-d" if decompress: args = args[1:] if not args: args = ["-"] for arg in args: if decompress: if arg == "-": f = GzipFile(filename="", mode="rb", fileobj=sys.stdin) g = sys.stdout else: if arg[-3:] != ".gz": print "filename doesn't end in .gz:", repr(arg) continue f = open(arg, "rb") g = __builtin__.open(arg[:-3], "wb") else: if arg == "-": f = sys.stdin g = GzipFile(filename="", mode="wb", fileobj=sys.stdout) else: f = __builtin__.open(arg, "rb") g = open(arg + ".gz", "wb") while True: chunk = f.read(1024) if not chunk: break g.write(chunk) if g is not sys.stdout: g.close() if f is not sys.stdin: f.close() if __name__ == '__main__': _test()