| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.io.compress.zlib; |
| |
| import java.io.IOException; |
| import java.util.zip.Checksum; |
| import java.util.zip.DataFormatException; |
| import java.util.zip.Inflater; |
| |
| import org.apache.hadoop.io.compress.Decompressor; |
| import org.apache.hadoop.io.compress.DoNotPool; |
| import org.apache.hadoop.util.DataChecksum; |
| |
| /** |
| * A {@link Decompressor} based on the popular gzip compressed file format. |
| * http://www.gzip.org/ |
| * |
| */ |
| @DoNotPool |
| public class BuiltInGzipDecompressor implements Decompressor { |
| private static final int GZIP_MAGIC_ID = 0x8b1f; // if read as LE short int |
| private static final int GZIP_DEFLATE_METHOD = 8; |
| private static final int GZIP_FLAGBIT_HEADER_CRC = 0x02; |
| private static final int GZIP_FLAGBIT_EXTRA_FIELD = 0x04; |
| private static final int GZIP_FLAGBIT_FILENAME = 0x08; |
| private static final int GZIP_FLAGBIT_COMMENT = 0x10; |
| private static final int GZIP_FLAGBITS_RESERVED = 0xe0; |
| |
| // 'true' (nowrap) => Inflater will handle raw deflate stream only |
| private Inflater inflater = new Inflater(true); |
| |
| private byte[] userBuf = null; |
| private int userBufOff = 0; |
| private int userBufLen = 0; |
| |
| private byte[] localBuf = new byte[256]; |
| private int localBufOff = 0; |
| |
| private int headerBytesRead = 0; |
| private int trailerBytesRead = 0; |
| private int numExtraFieldBytesRemaining = -1; |
| private Checksum crc = DataChecksum.newCrc32(); |
| private boolean hasExtraField = false; |
| private boolean hasFilename = false; |
| private boolean hasComment = false; |
| private boolean hasHeaderCRC = false; |
| |
| private GzipStateLabel state; |
| |
| /** |
| * The current state of the gzip decoder, external to the Inflater context. |
| * (Technically, the private variables localBuf through hasHeaderCRC are |
| * also part of the state, so this enum is merely the label for it.) |
| */ |
| private static enum GzipStateLabel { |
| /** |
| * Immediately prior to or (strictly) within the 10-byte basic gzip header. |
| */ |
| HEADER_BASIC, |
| /** |
| * Immediately prior to or within the optional "extra field." |
| */ |
| HEADER_EXTRA_FIELD, |
| /** |
| * Immediately prior to or within the optional filename field. |
| */ |
| HEADER_FILENAME, |
| /** |
| * Immediately prior to or within the optional comment field. |
| */ |
| HEADER_COMMENT, |
| /** |
| * Immediately prior to or within the optional 2-byte header CRC value. |
| */ |
| HEADER_CRC, |
| /** |
| * Immediately prior to or within the main compressed (deflate) data stream. |
| */ |
| DEFLATE_STREAM, |
| /** |
| * Immediately prior to or (strictly) within the 4-byte uncompressed CRC. |
| */ |
| TRAILER_CRC, |
| /** |
| * Immediately prior to or (strictly) within the 4-byte uncompressed size. |
| */ |
| TRAILER_SIZE, |
| /** |
| * Immediately after the trailer (and potentially prior to the next gzip |
| * member/substream header), without reset() having been called. |
| */ |
| FINISHED; |
| } |
| |
| /** |
| * Creates a new (pure Java) gzip decompressor. |
| */ |
| public BuiltInGzipDecompressor() { |
| state = GzipStateLabel.HEADER_BASIC; |
| crc.reset(); |
| // FIXME? Inflater docs say: 'it is also necessary to provide an extra |
| // "dummy" byte as input. This is required by the ZLIB native |
| // library in order to support certain optimizations.' However, |
| // this does not appear to be true, and in any case, it's not |
| // entirely clear where the byte should go or what its value |
| // should be. Perhaps it suffices to have some deflated bytes |
| // in the first buffer load? (But how else would one do it?) |
| } |
| |
| @Override |
| public synchronized boolean needsInput() { |
| if (state == GzipStateLabel.DEFLATE_STREAM) { // most common case |
| return inflater.needsInput(); |
| } |
| // see userBufLen comment at top of decompress(); currently no need to |
| // verify userBufLen <= 0 |
| return (state != GzipStateLabel.FINISHED); |
| } |
| |
| /** {@inheritDoc} */ |
| /* |
| * In our case, the input data includes both gzip header/trailer bytes (which |
| * we handle in executeState()) and deflate-stream bytes (which we hand off |
| * to Inflater). |
| * |
| * NOTE: This code assumes the data passed in via b[] remains unmodified |
| * until _we_ signal that it's safe to modify it (via needsInput()). |
| * The alternative would require an additional buffer-copy even for |
| * the bulk deflate stream, which is a performance hit we don't want |
| * to absorb. (Decompressor now documents this requirement.) |
| */ |
| @Override |
| public synchronized void setInput(byte[] b, int off, int len) { |
| if (b == null) { |
| throw new NullPointerException(); |
| } |
| if (off < 0 || len < 0 || off > b.length - len) { |
| throw new ArrayIndexOutOfBoundsException(); |
| } |
| |
| userBuf = b; |
| userBufOff = off; |
| userBufLen = len; // note: might be zero |
| } |
| |
| /** |
| * Decompress the data (gzip header, deflate stream, gzip trailer) in the |
| * provided buffer. |
| * |
| * @return the number of decompressed bytes placed into b |
| */ |
| /* From the caller's perspective, this is where the state machine lives. |
| * The code is written such that we never return from decompress() with |
| * data remaining in userBuf unless we're in FINISHED state and there was |
| * data beyond the current gzip member (e.g., we're within a concatenated |
| * gzip stream). If this ever changes, {@link #needsInput()} will also |
| * need to be modified (i.e., uncomment the userBufLen condition). |
| * |
| * The actual deflate-stream processing (decompression) is handled by |
| * Java's Inflater class. Unlike the gzip header/trailer code (execute* |
| * methods below), the deflate stream is never copied; Inflater operates |
| * directly on the user's buffer. |
| */ |
| @Override |
| public synchronized int decompress(byte[] b, int off, int len) |
| throws IOException { |
| int numAvailBytes = 0; |
| |
| if (state != GzipStateLabel.DEFLATE_STREAM) { |
| executeHeaderState(); |
| |
| if (userBufLen <= 0) { |
| return numAvailBytes; |
| } |
| } |
| |
| // "executeDeflateStreamState()" |
| if (state == GzipStateLabel.DEFLATE_STREAM) { |
| // hand off user data (or what's left of it) to Inflater--but note that |
| // Inflater may not have consumed all of previous bufferload (e.g., if |
| // data highly compressed or output buffer very small), in which case |
| // userBufLen will be zero |
| if (userBufLen > 0) { |
| inflater.setInput(userBuf, userBufOff, userBufLen); |
| userBufOff += userBufLen; |
| userBufLen = 0; |
| } |
| |
| // now decompress it into b[] |
| try { |
| numAvailBytes = inflater.inflate(b, off, len); |
| } catch (DataFormatException dfe) { |
| throw new IOException(dfe.getMessage()); |
| } |
| crc.update(b, off, numAvailBytes); // CRC-32 is on _uncompressed_ data |
| if (inflater.finished()) { |
| state = GzipStateLabel.TRAILER_CRC; |
| int bytesRemaining = inflater.getRemaining(); |
| assert (bytesRemaining >= 0) : |
| "logic error: Inflater finished; byte-count is inconsistent"; |
| // could save a copy of userBufLen at call to inflater.setInput() and |
| // verify that bytesRemaining <= origUserBufLen, but would have to |
| // be a (class) member variable...seems excessive for a sanity check |
| userBufOff -= bytesRemaining; |
| userBufLen = bytesRemaining; // or "+=", but guaranteed 0 coming in |
| } else { |
| return numAvailBytes; // minor optimization |
| } |
| } |
| |
| executeTrailerState(); |
| |
| return numAvailBytes; |
| } |
| |
| /** |
| * Parse the gzip header (assuming we're in the appropriate state). |
| * In order to deal with degenerate cases (e.g., user buffer is one byte |
| * long), we copy (some) header bytes to another buffer. (Filename, |
| * comment, and extra-field bytes are simply skipped.)</p> |
| * |
| * See http://www.ietf.org/rfc/rfc1952.txt for the gzip spec. Note that |
| * no version of gzip to date (at least through 1.4.0, 2010-01-20) supports |
| * the FHCRC header-CRC16 flagbit; instead, the implementation treats it |
| * as a multi-file continuation flag (which it also doesn't support). :-( |
| * Sun's JDK v6 (1.6) supports the header CRC, however, and so do we. |
| */ |
| private void executeHeaderState() throws IOException { |
| |
| // this can happen because DecompressorStream's decompress() is written |
| // to call decompress() first, setInput() second: |
| if (userBufLen <= 0) { |
| return; |
| } |
| |
| // "basic"/required header: somewhere in first 10 bytes |
| if (state == GzipStateLabel.HEADER_BASIC) { |
| int n = Math.min(userBufLen, 10-localBufOff); // (or 10-headerBytesRead) |
| checkAndCopyBytesToLocal(n); // modifies userBufLen, etc. |
| if (localBufOff >= 10) { // should be strictly == |
| processBasicHeader(); // sig, compression method, flagbits |
| localBufOff = 0; // no further need for basic header |
| state = GzipStateLabel.HEADER_EXTRA_FIELD; |
| } |
| } |
| |
| if (userBufLen <= 0) { |
| return; |
| } |
| |
| // optional header stuff (extra field, filename, comment, header CRC) |
| |
| if (state == GzipStateLabel.HEADER_EXTRA_FIELD) { |
| if (hasExtraField) { |
| // 2 substates: waiting for 2 bytes => get numExtraFieldBytesRemaining, |
| // or already have 2 bytes & waiting to finish skipping specified length |
| if (numExtraFieldBytesRemaining < 0) { |
| int n = Math.min(userBufLen, 2-localBufOff); |
| checkAndCopyBytesToLocal(n); |
| if (localBufOff >= 2) { |
| numExtraFieldBytesRemaining = readUShortLE(localBuf, 0); |
| localBufOff = 0; |
| } |
| } |
| if (numExtraFieldBytesRemaining > 0 && userBufLen > 0) { |
| int n = Math.min(userBufLen, numExtraFieldBytesRemaining); |
| checkAndSkipBytes(n); // modifies userBufLen, etc. |
| numExtraFieldBytesRemaining -= n; |
| } |
| if (numExtraFieldBytesRemaining == 0) { |
| state = GzipStateLabel.HEADER_FILENAME; |
| } |
| } else { |
| state = GzipStateLabel.HEADER_FILENAME; |
| } |
| } |
| |
| if (userBufLen <= 0) { |
| return; |
| } |
| |
| if (state == GzipStateLabel.HEADER_FILENAME) { |
| if (hasFilename) { |
| boolean doneWithFilename = checkAndSkipBytesUntilNull(); |
| if (!doneWithFilename) { |
| return; // exit early: used up entire buffer without hitting NULL |
| } |
| } |
| state = GzipStateLabel.HEADER_COMMENT; |
| } |
| |
| if (userBufLen <= 0) { |
| return; |
| } |
| |
| if (state == GzipStateLabel.HEADER_COMMENT) { |
| if (hasComment) { |
| boolean doneWithComment = checkAndSkipBytesUntilNull(); |
| if (!doneWithComment) { |
| return; // exit early: used up entire buffer |
| } |
| } |
| state = GzipStateLabel.HEADER_CRC; |
| } |
| |
| if (userBufLen <= 0) { |
| return; |
| } |
| |
| if (state == GzipStateLabel.HEADER_CRC) { |
| if (hasHeaderCRC) { |
| assert (localBufOff < 2); |
| int n = Math.min(userBufLen, 2-localBufOff); |
| copyBytesToLocal(n); |
| if (localBufOff >= 2) { |
| long headerCRC = readUShortLE(localBuf, 0); |
| if (headerCRC != (crc.getValue() & 0xffff)) { |
| throw new IOException("gzip header CRC failure"); |
| } |
| localBufOff = 0; |
| crc.reset(); |
| state = GzipStateLabel.DEFLATE_STREAM; |
| } |
| } else { |
| crc.reset(); // will reuse for CRC-32 of uncompressed data |
| state = GzipStateLabel.DEFLATE_STREAM; // switching to Inflater now |
| } |
| } |
| } |
| |
| /** |
| * Parse the gzip trailer (assuming we're in the appropriate state). |
| * In order to deal with degenerate cases (e.g., user buffer is one byte |
| * long), we copy trailer bytes (all 8 of 'em) to a local buffer.</p> |
| * |
| * See http://www.ietf.org/rfc/rfc1952.txt for the gzip spec. |
| */ |
| private void executeTrailerState() throws IOException { |
| |
| if (userBufLen <= 0) { |
| return; |
| } |
| |
| // verify that the CRC-32 of the decompressed stream matches the value |
| // stored in the gzip trailer |
| if (state == GzipStateLabel.TRAILER_CRC) { |
| // localBuf was empty before we handed off to Inflater, so we handle this |
| // exactly like header fields |
| assert (localBufOff < 4); // initially 0, but may need multiple calls |
| int n = Math.min(userBufLen, 4-localBufOff); |
| copyBytesToLocal(n); |
| if (localBufOff >= 4) { |
| long streamCRC = readUIntLE(localBuf, 0); |
| if (streamCRC != crc.getValue()) { |
| throw new IOException("gzip stream CRC failure"); |
| } |
| localBufOff = 0; |
| crc.reset(); |
| state = GzipStateLabel.TRAILER_SIZE; |
| } |
| } |
| |
| if (userBufLen <= 0) { |
| return; |
| } |
| |
| // verify that the mod-2^32 decompressed stream size matches the value |
| // stored in the gzip trailer |
| if (state == GzipStateLabel.TRAILER_SIZE) { |
| assert (localBufOff < 4); // initially 0, but may need multiple calls |
| int n = Math.min(userBufLen, 4-localBufOff); |
| copyBytesToLocal(n); // modifies userBufLen, etc. |
| if (localBufOff >= 4) { // should be strictly == |
| long inputSize = readUIntLE(localBuf, 0); |
| if (inputSize != (inflater.getBytesWritten() & 0xffffffffL)) { |
| throw new IOException( |
| "stored gzip size doesn't match decompressed size"); |
| } |
| localBufOff = 0; |
| state = GzipStateLabel.FINISHED; |
| } |
| } |
| |
| if (state == GzipStateLabel.FINISHED) { |
| return; |
| } |
| } |
| |
| /** |
| * Returns the total number of compressed bytes input so far, including |
| * gzip header/trailer bytes.</p> |
| * |
| * @return the total (non-negative) number of compressed bytes read so far |
| */ |
| public synchronized long getBytesRead() { |
| return headerBytesRead + inflater.getBytesRead() + trailerBytesRead; |
| } |
| |
| /** |
| * Returns the number of bytes remaining in the input buffer; normally |
| * called when finished() is true to determine amount of post-gzip-stream |
| * data. Note that, other than the finished state with concatenated data |
| * after the end of the current gzip stream, this will never return a |
| * non-zero value unless called after {@link #setInput(byte[] b, int off, |
| * int len)} and before {@link #decompress(byte[] b, int off, int len)}. |
| * (That is, after {@link #decompress(byte[] b, int off, int len)} it |
| * always returns zero, except in finished state with concatenated data.)</p> |
| * |
| * @return the total (non-negative) number of unprocessed bytes in input |
| */ |
| @Override |
| public synchronized int getRemaining() { |
| return userBufLen; |
| } |
| |
| @Override |
| public synchronized boolean needsDictionary() { |
| return inflater.needsDictionary(); |
| } |
| |
| @Override |
| public synchronized void setDictionary(byte[] b, int off, int len) { |
| inflater.setDictionary(b, off, len); |
| } |
| |
| /** |
| * Returns true if the end of the gzip substream (single "member") has been |
| * reached.</p> |
| */ |
| @Override |
| public synchronized boolean finished() { |
| return (state == GzipStateLabel.FINISHED); |
| } |
| |
| /** |
| * Resets everything, including the input buffer, regardless of whether the |
| * current gzip substream is finished.</p> |
| */ |
| @Override |
| public synchronized void reset() { |
| // could optionally emit INFO message if state != GzipStateLabel.FINISHED |
| inflater.reset(); |
| state = GzipStateLabel.HEADER_BASIC; |
| crc.reset(); |
| userBufOff = userBufLen = 0; |
| localBufOff = 0; |
| headerBytesRead = 0; |
| trailerBytesRead = 0; |
| numExtraFieldBytesRemaining = -1; |
| hasExtraField = false; |
| hasFilename = false; |
| hasComment = false; |
| hasHeaderCRC = false; |
| } |
| |
| @Override |
| public synchronized void end() { |
| inflater.end(); |
| } |
| |
| /** |
| * Check ID bytes (throw if necessary), compression method (throw if not 8), |
| * and flag bits (set hasExtraField, hasFilename, hasComment, hasHeaderCRC). |
| * Ignore MTIME, XFL, OS. Caller must ensure we have at least 10 bytes (at |
| * the start of localBuf).</p> |
| */ |
| /* |
| * Flag bits (remainder are reserved and must be zero): |
| * bit 0 FTEXT |
| * bit 1 FHCRC (never implemented in gzip, at least through version |
| * 1.4.0; instead interpreted as "continuation of multi- |
| * part gzip file," which is unsupported through 1.4.0) |
| * bit 2 FEXTRA |
| * bit 3 FNAME |
| * bit 4 FCOMMENT |
| * [bit 5 encrypted] |
| */ |
| private void processBasicHeader() throws IOException { |
| if (readUShortLE(localBuf, 0) != GZIP_MAGIC_ID) { |
| throw new IOException("not a gzip file"); |
| } |
| if (readUByte(localBuf, 2) != GZIP_DEFLATE_METHOD) { |
| throw new IOException("gzip data not compressed with deflate method"); |
| } |
| int flg = readUByte(localBuf, 3); |
| if ((flg & GZIP_FLAGBITS_RESERVED) != 0) { |
| throw new IOException("unknown gzip format (reserved flagbits set)"); |
| } |
| hasExtraField = ((flg & GZIP_FLAGBIT_EXTRA_FIELD) != 0); |
| hasFilename = ((flg & GZIP_FLAGBIT_FILENAME) != 0); |
| hasComment = ((flg & GZIP_FLAGBIT_COMMENT) != 0); |
| hasHeaderCRC = ((flg & GZIP_FLAGBIT_HEADER_CRC) != 0); |
| } |
| |
| private void checkAndCopyBytesToLocal(int len) { |
| System.arraycopy(userBuf, userBufOff, localBuf, localBufOff, len); |
| localBufOff += len; |
| // alternatively, could call checkAndSkipBytes(len) for rest... |
| crc.update(userBuf, userBufOff, len); |
| userBufOff += len; |
| userBufLen -= len; |
| headerBytesRead += len; |
| } |
| |
| private void checkAndSkipBytes(int len) { |
| crc.update(userBuf, userBufOff, len); |
| userBufOff += len; |
| userBufLen -= len; |
| headerBytesRead += len; |
| } |
| |
| // returns true if saw NULL, false if ran out of buffer first; called _only_ |
| // during gzip-header processing (not trailer) |
| // (caller can check before/after state of userBufLen to compute num bytes) |
| private boolean checkAndSkipBytesUntilNull() { |
| boolean hitNull = false; |
| if (userBufLen > 0) { |
| do { |
| hitNull = (userBuf[userBufOff] == 0); |
| crc.update(userBuf[userBufOff]); |
| ++userBufOff; |
| --userBufLen; |
| ++headerBytesRead; |
| } while (userBufLen > 0 && !hitNull); |
| } |
| return hitNull; |
| } |
| |
| // this one doesn't update the CRC and does support trailer processing but |
| // otherwise is same as its "checkAnd" sibling |
| private void copyBytesToLocal(int len) { |
| System.arraycopy(userBuf, userBufOff, localBuf, localBufOff, len); |
| localBufOff += len; |
| userBufOff += len; |
| userBufLen -= len; |
| if (state == GzipStateLabel.TRAILER_CRC || |
| state == GzipStateLabel.TRAILER_SIZE) { |
| trailerBytesRead += len; |
| } else { |
| headerBytesRead += len; |
| } |
| } |
| |
| private int readUByte(byte[] b, int off) { |
| return ((int)b[off] & 0xff); |
| } |
| |
| // caller is responsible for not overrunning buffer |
| private int readUShortLE(byte[] b, int off) { |
| return ((((b[off+1] & 0xff) << 8) | |
| ((b[off] & 0xff) )) & 0xffff); |
| } |
| |
| // caller is responsible for not overrunning buffer |
| private long readUIntLE(byte[] b, int off) { |
| return ((((long)(b[off+3] & 0xff) << 24) | |
| ((long)(b[off+2] & 0xff) << 16) | |
| ((long)(b[off+1] & 0xff) << 8) | |
| ((long)(b[off] & 0xff) )) & 0xffffffffL); |
| } |
| |
| } |