blob: 9a8c879ec34a1827617cbca7d038c5354ddbf3d4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.compress.compressors.gzip;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.InputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.CRC32;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CountingInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;
/**
* Input stream that decompresses .gz files.
*
* <p>This supports decompressing concatenated .gz files which is important
* when decompressing standalone .gz files.</p>
*
* <p>
* {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
* files: it stops after the first member and silently ignores the rest.
* It doesn't leave the read position to point to the beginning of the next
* member, which makes it difficult to work around the lack of concatenation
* support.
* </p>
*
* <p>
* Instead of using <code>GZIPInputStream</code>, this class has its own .gz
* container format decoder. The actual decompression is done with
* {@link java.util.zip.Inflater}.
* </p>
*
* <p>If you use the constructor {@code GzipCompressorInputStream(in)}
* or {@code GzipCompressorInputStream(in, false)} with some {@code
* InputStream} {@code in} then {@link #read} will return -1 as soon
* as the first internal member has been read completely. The stream
* {@code in} will be positioned at the start of the second gzip
* member if there is one.</p>
*
* <p>If you use the constructor {@code GzipCompressorInputStream(in,
* true)} with some {@code InputStream} {@code in} then {@link #read}
* will return -1 once the stream {@code in} has been exhausted. The
* data read from a stream constructed this way will consist of the
* concatenated data of all gzip members contained inside {@code
* in}.</p>
*
* @see "https://tools.ietf.org/html/rfc1952"
*/
public class GzipCompressorInputStream extends CompressorInputStream
    implements InputStreamStatistics {

    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    // The stream handed to the constructor. It is only kept so that close()
    // can recognize System.in; all reads go through the wrappers below.
    private final InputStream originalInput;

    private final CountingInputStream countingStream;

    // Compressed input stream, possibly wrapped in a
    // BufferedInputStream, always wrapped in countingStream above
    private final InputStream in;

    // True if decompressing multi member streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed;

    // Decompressor; set to null once it has been released
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // True once everything has been decompressed
    private boolean endReached;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private final GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * This is equivalent to
     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
     * will not decompress concatenated .gz files.
     *
     * @param inputStream the InputStream to read the compressed data from
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream)
        throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * If <code>decompressConcatenated</code> is {@code false}:
     * This decompressor might read more input than it will actually use.
     * If <code>inputStream</code> supports <code>mark</code> and
     * <code>reset</code>, then the input position will be adjusted
     * so that it is right after the last byte of the compressed stream.
     * If <code>mark</code> isn't supported, the input position will be
     * undefined.
     *
     * @param inputStream the InputStream to read the compressed data from
     * @param decompressConcatenated
     *                     if true, decompress until the end of the input;
     *                     if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream,
                                     final boolean decompressConcatenated)
        throws IOException {
        originalInput = inputStream;
        countingStream = new CountingInputStream(inputStream);
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (countingStream.markSupported()) {
            in = countingStream;
        } else {
            in = new BufferedInputStream(countingStream);
        }

        this.decompressConcatenated = decompressConcatenated;
        init(true);
    }

    /**
     * Provides the stream's meta data - may change with each stream
     * when decompressing concatenated streams.
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Parses the header of a .gz member, stores its meta data in
     * {@link #parameters} and resets the inflater and the CRC so that the
     * member's compressed data can be read next.
     *
     * @param isFirstMember whether this is the first member of the input
     * @return {@code false} if the end of the input was reached before the
     *         first header byte of a member other than the first one,
     *         {@code true} if a header has been parsed
     * @throws IOException if the magic bytes don't match, the compression
     *         method is unsupported, reserved flags are set or the header
     *         is truncated
     */
    private boolean init(final boolean isFirstMember) throws IOException {
        assert isFirstMember || decompressConcatenated;

        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        if (magic0 != 31 || in.read() != 139) {
            throw new IOException(isFirstMember
                                  ? "Input is not in the .gz format"
                                  : "Garbage after a valid .gz stream");
        }

        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method "
                                  + method + " in the .gz header");
        }

        final int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException(
                "Reserved flags are set in the .gz header");
        }

        // The header stores the time in seconds, the parameters want milliseconds.
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            // Two byte little-endian length followed by that many bytes.
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFilename(new String(readToNull(inData),
                                              StandardCharsets.ISO_8859_1));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData),
                                             StandardCharsets.ISO_8859_1));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();

        return true;
    }

    /**
     * Reads bytes up to the next zero byte.
     *
     * @param inData the input to read from
     * @return the bytes read, without the terminating zero byte
     * @throws IOException if reading fails, including the end of input
     *         being reached before a zero byte has been seen
     */
    private static byte[] readToNull(final DataInput inData) throws IOException {
        try (final ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            int b;
            while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD NOSONAR
                bos.write(b);
            }
            return bos.toByteArray();
        }
    }

    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }
        if (inf == null) {
            // close() has released the inflater before the end was reached;
            // fail with a meaningful exception rather than a NullPointerException.
            throw new IOException("Stream closed");
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inf.setInput(buf, 0, bufUsed);
            }

            final int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (final DataFormatException e) {
                // Keep the inflater's diagnosis available as the cause.
                throw new IOException("Gzip-compressed data is corrupt", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                in.reset();

                final int skipAmount = bufUsed - inf.getRemaining();
                if (IOUtils.skip(in, skipAmount) != skipAmount) {
                    throw new IOException("Failed to reposition the input"
                                          + " stream to the end of the"
                                          + " compressed data");
                }

                bufUsed = 0;

                final DataInput inData = new DataInputStream(in);

                // CRC32
                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

                if (crcStored != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(CRC32 error)");
                }

                // Uncompressed size modulo 2^32 (ISIZE in the spec)
                final long isize = ByteUtils.fromLittleEndian(inData, 4);

                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(uncompressed size mismatch)");
                }

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return          true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        // -117 is the magic byte 139 (0x8b) seen as a signed byte.
        return length >= 2 && signature[0] == 31 && signature[1] == -117;
    }

    /**
     * Releases the inflater and closes the input stream (unless the stream
     * passed to the constructor is System.in).
     *
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        // "in" is always a wrapper created by the constructor and can never
        // be identical to System.in, so the stream supplied by the caller
        // has to be compared instead.
        if (originalInput != System.in) {
            this.in.close();
        }
    }

    /**
     * Returns the number of bytes read as reported by the counting stream
     * that wraps the stream passed to the constructor.
     *
     * @return the value of the counting stream's {@code getBytesRead()}
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getBytesRead();
    }
}