src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorInputStream.java - commons-compress - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.commons.compress.compressors.gzip;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.EOFException;
 import java.io.InputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
 import java.io.BufferedInputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.zip.DataFormatException;
 import java.util.zip.Deflater;
 import java.util.zip.Inflater;
 import java.util.zip.CRC32;

 import org.apache.commons.compress.compressors.CompressorInputStream;
 import org.apache.commons.compress.utils.ByteUtils;
 import org.apache.commons.compress.utils.CountingInputStream;
 import org.apache.commons.compress.utils.IOUtils;
 import org.apache.commons.compress.utils.InputStreamStatistics;

 /**
  * Input stream that decompresses .gz files.
  *
  * <p>This supports decompressing concatenated .gz files which is important
  * when decompressing standalone .gz files.</p>
  *
  * <p>
  * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
  * files: it stops after the first member and silently ignores the rest.
  * It doesn't leave the read position pointing to the beginning of the next
  * member, which makes it difficult to work around the lack of concatenation
  * support.
  * </p>
  *
  * <p>
  * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
  * container format decoder. The actual decompression is done with
  * {@link java.util.zip.Inflater}.
  * </p>
  *
  * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
  * or {@code GzipCompressorInputStream(in, false)} with some {@code
  * InputStream} {@code in} then {@link #read} will return -1 as soon
  * as the first internal member has been read completely. The stream
  * {@code in} will be positioned at the start of the second gzip
  * member if there is one.</p>
  *
  * <p>If you use the constructor {@code GzipCompressorInputStream(in,
  * true)} with some {@code InputStream} {@code in} then {@link #read}
  * will return -1 once the stream {@code in} has been exhausted. The
  * data read from a stream constructed this way will consist of the
  * concatenated data of all gzip members contained inside {@code
  * in}.</p>
  *
  * @see "https://tools.ietf.org/html/rfc1952"
  */
 public class GzipCompressorInputStream extends CompressorInputStream
     implements InputStreamStatistics {

     // The two magic bytes every .gz member starts with
     private static final int MAGIC_0 = 31;
     private static final int MAGIC_1 = 139;

     // Header flags
     // private static final int FTEXT = 0x01; // Uninteresting for us
     private static final int FHCRC = 0x02;
     private static final int FEXTRA = 0x04;
     private static final int FNAME = 0x08;
     private static final int FCOMMENT = 0x10;
     private static final int FRESERVED = 0xE0;

     // The stream handed to the constructor. It is only kept so that
     // close() can tell whether the caller gave us System.in; all reading
     // goes through "in" below.
     private final InputStream originalStream;

     private final CountingInputStream countingStream;

     // Compressed input stream: either countingStream itself or a
     // BufferedInputStream wrapped around it
     private final InputStream in;

     // True if decompressing multi member streams.
     private final boolean decompressConcatenated;

     // Buffer to hold the input data
     private final byte[] buf = new byte[8192];

     // Amount of data in buf.
     private int bufUsed;

     // Decompressor; set to null once it has been released
     private Inflater inf = new Inflater(true);

     // CRC32 from uncompressed data
     private final CRC32 crc = new CRC32();

     // True once everything has been decompressed
     private boolean endReached;

     // used in no-arg read method
     private final byte[] oneByte = new byte[1];

     private final GzipParameters parameters = new GzipParameters();

     /**
      * Constructs a new input stream that decompresses gzip-compressed data
      * from the specified input stream.
      * <p>
      * This is equivalent to
      * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
      * will not decompress concatenated .gz files.
      *
      * @param inputStream  the InputStream to read the compressed data from
      *
      * @throws IOException if the stream could not be created
      */
     public GzipCompressorInputStream(final InputStream inputStream)
             throws IOException {
         this(inputStream, false);
     }

     /**
      * Constructs a new input stream that decompresses gzip-compressed data
      * from the specified input stream.
      * <p>
      * If <code>decompressConcatenated</code> is {@code false}:
      * This decompressor might read more input than it will actually use.
      * If <code>inputStream</code> supports <code>mark</code> and
      * <code>reset</code>, then the input position will be adjusted
      * so that it is right after the last byte of the compressed stream.
      * If <code>mark</code> isn't supported, the input position will be
      * undefined.
      *
      * @param inputStream  the InputStream to read the compressed data from
      * @param decompressConcatenated
      *                     if true, decompress until the end of the input;
      *                     if false, stop after the first .gz member
      *
      * @throws IOException if the stream could not be created, in
      *                     particular if the header of the first member
      *                     is missing or invalid
      */
     public GzipCompressorInputStream(final InputStream inputStream,
                                      final boolean decompressConcatenated)
             throws IOException {
         originalStream = inputStream;
         countingStream = new CountingInputStream(inputStream);
         // Mark support is strictly needed for concatenated files only,
         // but it's simpler if it is always available.
         if (countingStream.markSupported()) {
             in = countingStream;
         } else {
             in = new BufferedInputStream(countingStream);
         }

         this.decompressConcatenated = decompressConcatenated;
         try {
             init(true);
         } catch (final IOException | RuntimeException e) {
             // No instance is handed out when the constructor fails, so
             // nobody could ever call close(): release the Inflater here.
             inf.end();
             inf = null;
             throw e;
         }
     }

     /**
      * Provides the stream's meta data - may change with each stream
      * when decompressing concatenated streams.
      * @return the stream's meta data
      * @since 1.8
      */
     public GzipParameters getMetaData() {
         return parameters;
     }

     /**
      * Reads the header of the next .gz member, stores its meta data in
      * {@link #parameters} and resets the Inflater and the CRC32 so that
      * the member's data can be decompressed.
      *
      * @param isFirstMember whether this is the very first member of the
      *                      stream; end of input is an error in this case
      * @return false if the end of the input has been reached cleanly
      *         before a further member, true if a header has been read
      * @throws IOException if the header is invalid or truncated
      */
     private boolean init(final boolean isFirstMember) throws IOException {
         assert isFirstMember || decompressConcatenated;

         // Check the magic bytes without a possibility of EOFException.
         final int magic0 = in.read();

         // If end of input was reached after decompressing at least
         // one .gz member, we have reached the end of the file successfully.
         if (magic0 == -1 && !isFirstMember) {
             return false;
         }

         if (magic0 != MAGIC_0 || in.read() != MAGIC_1) {
             throw new IOException(isFirstMember
                                   ? "Input is not in the .gz format"
                                   : "Garbage after a valid .gz stream");
         }

         // Parsing the rest of the header may throw EOFException.
         final DataInput inData = new DataInputStream(in);
         final int method = inData.readUnsignedByte();
         if (method != Deflater.DEFLATED) {
             throw new IOException("Unsupported compression method "
                                   + method + " in the .gz header");
         }

         final int flg = inData.readUnsignedByte();
         if ((flg & FRESERVED) != 0) {
             throw new IOException(
                     "Reserved flags are set in the .gz header");
         }

         // The header stores seconds, the parameters expect milliseconds
         parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
         switch (inData.readUnsignedByte()) { // extra flags
         case 2:
             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
             break;
         case 4:
             parameters.setCompressionLevel(Deflater.BEST_SPEED);
             break;
         default:
             // ignored for now
             break;
         }
         parameters.setOperatingSystem(inData.readUnsignedByte());

         // Extra field, ignored
         if ((flg & FEXTRA) != 0) {
             int xlen = inData.readUnsignedByte();
             xlen |= inData.readUnsignedByte() << 8;

             // This isn't as efficient as calling in.skip would be,
             // but it's lazier to handle unexpected end of input this way.
             // Most files don't have an extra field anyway.
             while (xlen-- > 0) {
                 inData.readUnsignedByte();
             }
         }

         // Original file name
         if ((flg & FNAME) != 0) {
             parameters.setFilename(new String(readToNull(inData),
                     StandardCharsets.ISO_8859_1));
         }

         // Comment
         if ((flg & FCOMMENT) != 0) {
             parameters.setComment(new String(readToNull(inData),
                     StandardCharsets.ISO_8859_1));
         }

         // Header "CRC16" which is actually a truncated CRC32 (which isn't
         // as good as real CRC16). I don't know if any encoder implementation
         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
         // doesn't support this field, but zlib seems to be able to at least
         // skip over it.
         if ((flg & FHCRC) != 0) {
             inData.readShort();
         }

         // Reset
         inf.reset();
         crc.reset();

         return true;
     }

     /**
      * Reads bytes up to, but not including, the next zero byte. The
      * zero byte itself is consumed.
      *
      * @param inData the input to read from
      * @return the bytes read before the terminating zero byte
      * @throws IOException if reading fails or the input ends before a
      *                     zero byte has been found
      */
     private static byte[] readToNull(final DataInput inData) throws IOException {
         try (final ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
             int b;
             while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD NOSONAR
                 bos.write(b);
             }
             return bos.toByteArray();
         }
     }

     @Override
     public int read() throws IOException {
         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
     }

     /**
      * {@inheritDoc}
      *
      * @throws IOException if the compressed data is corrupt or truncated,
      *                     or if this stream has been closed before the
      *                     end of the data has been reached
      * @since 1.1
      */
     @Override
     public int read(final byte[] b, int off, int len) throws IOException {
         if (len == 0) {
             return 0;
         }
         if (endReached) {
             return -1;
         }
         if (inf == null) {
             // close() has released the Inflater before the end of the
             // data was reached; fail cleanly instead of with an NPE.
             throw new IOException("Stream closed");
         }

         int size = 0;

         while (len > 0) {
             if (inf.needsInput()) {
                 // Remember the current position because we may need to
                 // rewind after reading too much input.
                 in.mark(buf.length);

                 bufUsed = in.read(buf);
                 if (bufUsed == -1) {
                     throw new EOFException();
                 }

                 inf.setInput(buf, 0, bufUsed);
             }

             final int ret;
             try {
                 ret = inf.inflate(b, off, len);
             } catch (final DataFormatException e) { // NOSONAR
                 // Keep the cause so the actual inflate error isn't lost
                 throw new IOException("Gzip-compressed data is corrupt", e);
             }

             crc.update(b, off, ret);
             off += ret;
             len -= ret;
             size += ret;
             count(ret);

             if (inf.finished()) {
                 // We may have read too many bytes. Rewind the read
                 // position to match the actual amount used.
                 in.reset();

                 final int skipAmount = bufUsed - inf.getRemaining();
                 if (IOUtils.skip(in, skipAmount) != skipAmount) {
                     throw new IOException("Could not reposition the input"
                                           + " to the end of the compressed data");
                 }

                 bufUsed = 0;

                 final DataInput inData = new DataInputStream(in);

                 // CRC32
                 final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

                 if (crcStored != crc.getValue()) {
                     throw new IOException("Gzip-compressed data is corrupt "
                                           + "(CRC32 error)");
                 }

                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
                 final long isize = ByteUtils.fromLittleEndian(inData, 4);

                 if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
                     throw new IOException("Gzip-compressed data is corrupt "
                                           + "(uncompressed size mismatch)");
                 }

                 // See if this is the end of the file.
                 if (!decompressConcatenated || !init(false)) {
                     inf.end();
                     inf = null;
                     endReached = true;
                     return size == 0 ? -1 : size;
                 }
             }
         }

         return size;
     }

     /**
      * Checks if the signature matches what is expected for a .gz file.
      *
      * @param signature the bytes to check
      * @param length    the number of bytes to check
      * @return          true if this is a .gz stream, false otherwise -
      *                  including the case where fewer than two bytes
      *                  are available
      *
      * @since 1.1
      */
     public static boolean matches(final byte[] signature, final int length) {
         return signature != null && length >= 2 && signature.length >= 2
             && signature[0] == MAGIC_0 && signature[1] == (byte) MAGIC_1;
     }

     /**
      * Closes the input stream (unless it is System.in).
      *
      * @since 1.2
      */
     @Override
     public void close() throws IOException {
         if (inf != null) {
             inf.end();
             inf = null;
         }

         // "in" is always a wrapper created by the constructor, so the
         // stream supplied by the caller is what must be compared.
         if (originalStream != System.in) {
             in.close();
         }
     }

     /**
      * Returns the number of bytes the counting wrapper around the
      * underlying stream reports as read.
      *
      * @return the value of {@code countingStream.getBytesRead()}
      * @since 1.17
      */
     @Override
     public long getCompressedCount() {
         return countingStream.getBytesRead();
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.commons.compress.compressors.gzip;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.EOFException;
	import java.io.InputStream;
	import java.io.DataInput;
	import java.io.DataInputStream;
	import java.io.BufferedInputStream;
	import java.nio.charset.StandardCharsets;
	import java.util.zip.DataFormatException;
	import java.util.zip.Deflater;
	import java.util.zip.Inflater;
	import java.util.zip.CRC32;

	import org.apache.commons.compress.compressors.CompressorInputStream;
	import org.apache.commons.compress.utils.ByteUtils;
	import org.apache.commons.compress.utils.CountingInputStream;
	import org.apache.commons.compress.utils.IOUtils;
	import org.apache.commons.compress.utils.InputStreamStatistics;

	/**
	* Input stream that decompresses .gz files.
	*
	* <p>This supports decompressing concatenated .gz files which is important
	* when decompressing standalone .gz files.</p>
	*
	* <p>
	* {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
	* files: it stops after the first member and silently ignores the rest.
	* It doesn't leave the read position pointing to the beginning of the next
	* member, which makes it difficult to work around the lack of concatenation
	* support.
	* </p>
	*
	* <p>
	* Instead of using <code>GZIPInputStream</code>, this class has its own .gz
	* container format decoder. The actual decompression is done with
	* {@link java.util.zip.Inflater}.
	* </p>
	*
	* <p>If you use the constructor {@code GzipCompressorInputStream(in)}
	* or {@code GzipCompressorInputStream(in, false)} with some {@code
	* InputStream} {@code in} then {@link #read} will return -1 as soon
	* as the first internal member has been read completely. The stream
	* {@code in} will be positioned at the start of the second gzip
	* member if there is one.</p>
	*
	* <p>If you use the constructor {@code GzipCompressorInputStream(in,
	* true)} with some {@code InputStream} {@code in} then {@link #read}
	* will return -1 once the stream {@code in} has been exhausted. The
	* data read from a stream constructed this way will consist of the
	* concatenated data of all gzip members contained inside {@code
	* in}.</p>
	*
	* @see "https://tools.ietf.org/html/rfc1952"
	*/
	public class GzipCompressorInputStream extends CompressorInputStream
	implements InputStreamStatistics {

	// Header flags
	// private static final int FTEXT = 0x01; // Uninteresting for us
	private static final int FHCRC = 0x02;
	private static final int FEXTRA = 0x04;
	private static final int FNAME = 0x08;
	private static final int FCOMMENT = 0x10;
	private static final int FRESERVED = 0xE0;

	private final CountingInputStream countingStream;

	// Compressed input stream, possibly wrapped in a
	// BufferedInputStream, always wrapped in countingStream above
	private final InputStream in;

	// True if decompressing multi member streams.
	private final boolean decompressConcatenated;

	// Buffer to hold the input data
	private final byte[] buf = new byte[8192];

	// Amount of data in buf.
	private int bufUsed;

	// Decompressor
	private Inflater inf = new Inflater(true);

	// CRC32 from uncompressed data
	private final CRC32 crc = new CRC32();

	// True once everything has been decompressed
	private boolean endReached;

	// used in no-arg read method
	private final byte[] oneByte = new byte[1];

	private final GzipParameters parameters = new GzipParameters();

	/**
	* Constructs a new input stream that decompresses gzip-compressed data
	* from the specified input stream.
	* <p>
	* This is equivalent to
	* <code>GzipCompressorInputStream(inputStream, false)</code> and thus
	* will not decompress concatenated .gz files.
	*
	* @param inputStream the InputStream from which this object should
	* be created of
	*
	* @throws IOException if the stream could not be created
	*/
	public GzipCompressorInputStream(final InputStream inputStream)
	throws IOException {
	this(inputStream, false);
	}

	/**
	* Constructs a new input stream that decompresses gzip-compressed data
	* from the specified input stream.
	* <p>
	* If <code>decompressConcatenated</code> is {@code false}:
	* This decompressor might read more input than it will actually use.
	* If <code>inputStream</code> supports <code>mark</code> and
	* <code>reset</code>, then the input position will be adjusted
	* so that it is right after the last byte of the compressed stream.
	* If <code>mark</code> isn't supported, the input position will be
	* undefined.
	*
	* @param inputStream the InputStream from which this object should
	* be created of
	* @param decompressConcatenated
	* if true, decompress until the end of the input;
	* if false, stop after the first .gz member
	*
	* @throws IOException if the stream could not be created
	*/
	public GzipCompressorInputStream(final InputStream inputStream,
	final boolean decompressConcatenated)
	throws IOException {
	countingStream = new CountingInputStream(inputStream);
	// Mark support is strictly needed for concatenated files only,
	// but it's simpler if it is always available.
	if (countingStream.markSupported()) {
	in = countingStream;
	} else {
	in = new BufferedInputStream(countingStream);
	}

	this.decompressConcatenated = decompressConcatenated;
	init(true);
	}

	/**
	* Provides the stream's meta data - may change with each stream
	* when decompressing concatenated streams.
	* @return the stream's meta data
	* @since 1.8
	*/
	public GzipParameters getMetaData() {
	return parameters;
	}

	private boolean init(final boolean isFirstMember) throws IOException {
	assert isFirstMember \|\| decompressConcatenated;

	// Check the magic bytes without a possibility of EOFException.
	final int magic0 = in.read();

	// If end of input was reached after decompressing at least
	// one .gz member, we have reached the end of the file successfully.
	if (magic0 == -1 && !isFirstMember) {
	return false;
	}

	if (magic0 != 31 \|\| in.read() != 139) {
	throw new IOException(isFirstMember
	? "Input is not in the .gz format"
	: "Garbage after a valid .gz stream");
	}

	// Parsing the rest of the header may throw EOFException.
	final DataInput inData = new DataInputStream(in);
	final int method = inData.readUnsignedByte();
	if (method != Deflater.DEFLATED) {
	throw new IOException("Unsupported compression method "
	+ method + " in the .gz header");
	}

	final int flg = inData.readUnsignedByte();
	if ((flg & FRESERVED) != 0) {
	throw new IOException(
	"Reserved flags are set in the .gz header");
	}

	parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
	switch (inData.readUnsignedByte()) { // extra flags
	case 2:
	parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
	break;
	case 4:
	parameters.setCompressionLevel(Deflater.BEST_SPEED);
	break;
	default:
	// ignored for now
	break;
	}
	parameters.setOperatingSystem(inData.readUnsignedByte());

	// Extra field, ignored
	if ((flg & FEXTRA) != 0) {
	int xlen = inData.readUnsignedByte();
	xlen \|= inData.readUnsignedByte() << 8;

	// This isn't as efficient as calling in.skip would be,
	// but it's lazier to handle unexpected end of input this way.
	// Most files don't have an extra field anyway.
	while (xlen-- > 0) {
	inData.readUnsignedByte();
	}
	}

	// Original file name
	if ((flg & FNAME) != 0) {
	parameters.setFilename(new String(readToNull(inData),
	StandardCharsets.ISO_8859_1));
	}

	// Comment
	if ((flg & FCOMMENT) != 0) {
	parameters.setComment(new String(readToNull(inData),
	StandardCharsets.ISO_8859_1));
	}

	// Header "CRC16" which is actually a truncated CRC32 (which isn't
	// as good as real CRC16). I don't know if any encoder implementation
	// sets this, so it's not worth trying to verify it. GNU gzip 1.4
	// doesn't support this field, but zlib seems to be able to at least
	// skip over it.
	if ((flg & FHCRC) != 0) {
	inData.readShort();
	}

	// Reset
	inf.reset();
	crc.reset();

	return true;
	}

	private static byte[] readToNull(final DataInput inData) throws IOException {
	try (final ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
	int b = 0;
	while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD NOSONAR
	bos.write(b);
	}
	return bos.toByteArray();
	}
	}

	@Override
	public int read() throws IOException {
	return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
	}

	/**
	* {@inheritDoc}
	*
	* @since 1.1
	*/
	@Override
	public int read(final byte[] b, int off, int len) throws IOException {
	if (len == 0) {
	return 0;
	}
	if (endReached) {
	return -1;
	}

	int size = 0;

	while (len > 0) {
	if (inf.needsInput()) {
	// Remember the current position because we may need to
	// rewind after reading too much input.
	in.mark(buf.length);

	bufUsed = in.read(buf);
	if (bufUsed == -1) {
	throw new EOFException();
	}

	inf.setInput(buf, 0, bufUsed);
	}

	final int ret;
	try {
	ret = inf.inflate(b, off, len);
	} catch (final DataFormatException e) { // NOSONAR
	throw new IOException("Gzip-compressed data is corrupt");
	}

	crc.update(b, off, ret);
	off += ret;
	len -= ret;
	size += ret;
	count(ret);

	if (inf.finished()) {
	// We may have read too many bytes. Rewind the read
	// position to match the actual amount used.
	in.reset();

	final int skipAmount = bufUsed - inf.getRemaining();
	if (IOUtils.skip(in, skipAmount) != skipAmount) {
	throw new IOException();
	}

	bufUsed = 0;

	final DataInput inData = new DataInputStream(in);

	// CRC32
	final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

	if (crcStored != crc.getValue()) {
	throw new IOException("Gzip-compressed data is corrupt "
	+ "(CRC32 error)");
	}

	// Uncompressed size modulo 2^32 (ISIZE in the spec)
	final long isize = ByteUtils.fromLittleEndian(inData, 4);

	if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
	throw new IOException("Gzip-compressed data is corrupt"
	+ "(uncompressed size mismatch)");
	}

	// See if this is the end of the file.
	if (!decompressConcatenated \|\| !init(false)) {
	inf.end();
	inf = null;
	endReached = true;
	return size == 0 ? -1 : size;
	}
	}
	}

	return size;
	}

	/**
	* Checks if the signature matches what is expected for a .gz file.
	*
	* @param signature the bytes to check
	* @param length the number of bytes to check
	* @return true if this is a .gz stream, false otherwise
	*
	* @since 1.1
	*/
	public static boolean matches(final byte[] signature, final int length) {
	return length >= 2 && signature[0] == 31 && signature[1] == -117;
	}

	/**
	* Closes the input stream (unless it is System.in).
	*
	* @since 1.2
	*/
	@Override
	public void close() throws IOException {
	if (inf != null) {
	inf.end();
	inf = null;
	}

	if (this.in != System.in) {
	this.in.close();
	}
	}

	/**
	* @since 1.17
	*/
	@Override
	public long getCompressedCount() {
	return countingStream.getBytesRead();
	}
	}