| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.commons.compress.compressors.snappy; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| |
| import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream; |
| import org.apache.commons.compress.utils.ByteUtils; |
| |
| /** |
| * CompressorInputStream for the raw Snappy format. |
| * |
| * <p>This implementation uses an internal buffer in order to handle |
| * the back-references that are at the heart of the LZ77 algorithm. |
| * The size of the buffer must be at least as big as the biggest |
| * offset used in the compressed stream. The current version of the |
| * Snappy algorithm as defined by Google works on 32k blocks and |
| * doesn't contain offsets bigger than 32k which is the default block |
| * size used by this class.</p> |
| * |
| * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> |
| * @since 1.7 |
| */ |
| public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream { |
| |
| /** Mask used to determine the type of "tag" is being processed */ |
| private static final int TAG_MASK = 0x03; |
| |
| /** Default block size */ |
| public static final int DEFAULT_BLOCK_SIZE = 32768; |
| |
| /** The size of the uncompressed data */ |
| private final int size; |
| |
| /** Number of uncompressed bytes still to be read. */ |
| private int uncompressedBytesRemaining; |
| |
| /** Current state of the stream */ |
| private State state = State.NO_BLOCK; |
| |
| private boolean endReached; |
| |
| /** |
| * Constructor using the default buffer size of 32k. |
| * |
| * @param is |
| * An InputStream to read compressed data from |
| * |
| * @throws IOException if reading fails |
| */ |
| public SnappyCompressorInputStream(final InputStream is) throws IOException { |
| this(is, DEFAULT_BLOCK_SIZE); |
| } |
| |
| /** |
| * Constructor using a configurable buffer size. |
| * |
| * @param is |
| * An InputStream to read compressed data from |
| * @param blockSize |
| * The block size used in compression |
| * |
| * @throws IOException if reading fails |
| * @throws IllegalArgumentException if blockSize is not bigger than 0 |
| */ |
| public SnappyCompressorInputStream(final InputStream is, final int blockSize) |
| throws IOException { |
| super(is, blockSize); |
| uncompressedBytesRemaining = size = (int) readSize(); |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public int read(final byte[] b, final int off, final int len) throws IOException { |
| if (len == 0) { |
| return 0; |
| } |
| if (endReached) { |
| return -1; |
| } |
| switch (state) { |
| case NO_BLOCK: |
| fill(); |
| return read(b, off, len); |
| case IN_LITERAL: |
| final int litLen = readLiteral(b, off, len); |
| if (!hasMoreDataInBlock()) { |
| state = State.NO_BLOCK; |
| } |
| return litLen > 0 ? litLen : read(b, off, len); |
| case IN_BACK_REFERENCE: |
| final int backReferenceLen = readBackReference(b, off, len); |
| if (!hasMoreDataInBlock()) { |
| state = State.NO_BLOCK; |
| } |
| return backReferenceLen > 0 ? backReferenceLen : read(b, off, len); |
| default: |
| throw new IOException("Unknown stream state " + state); |
| } |
| } |
| |
| /** |
| * Try to fill the buffer with the next block of data. |
| */ |
| private void fill() throws IOException { |
| if (uncompressedBytesRemaining == 0) { |
| endReached = true; |
| return; |
| } |
| |
| int b = readOneByte(); |
| if (b == -1) { |
| throw new IOException("Premature end of stream reading block start"); |
| } |
| int length = 0; |
| int offset = 0; |
| |
| switch (b & TAG_MASK) { |
| |
| case 0x00: |
| |
| length = readLiteralLength(b); |
| if (length < 0) { |
| throw new IOException("Illegal block with a negative literal size found"); |
| } |
| uncompressedBytesRemaining -= length; |
| startLiteral(length); |
| state = State.IN_LITERAL; |
| break; |
| |
| case 0x01: |
| |
| /* |
| * These elements can encode lengths between [4..11] bytes and |
| * offsets between [0..2047] bytes. (len-4) occupies three bits |
| * and is stored in bits [2..4] of the tag byte. The offset |
| * occupies 11 bits, of which the upper three are stored in the |
| * upper three bits ([5..7]) of the tag byte, and the lower |
| * eight are stored in a byte following the tag byte. |
| */ |
| |
| length = 4 + ((b >> 2) & 0x07); |
| if (length < 0) { |
| throw new IOException("Illegal block with a negative match length found"); |
| } |
| uncompressedBytesRemaining -= length; |
| offset = (b & 0xE0) << 3; |
| b = readOneByte(); |
| if (b == -1) { |
| throw new IOException("Premature end of stream reading back-reference length"); |
| } |
| offset |= b; |
| |
| try { |
| startBackReference(offset, length); |
| } catch (final IllegalArgumentException ex) { |
| throw new IOException("Illegal block with bad offset found", ex); |
| } |
| state = State.IN_BACK_REFERENCE; |
| break; |
| |
| case 0x02: |
| |
| /* |
| * These elements can encode lengths between [1..64] and offsets |
| * from [0..65535]. (len-1) occupies six bits and is stored in |
| * the upper six bits ([2..7]) of the tag byte. The offset is |
| * stored as a little-endian 16-bit integer in the two bytes |
| * following the tag byte. |
| */ |
| |
| length = (b >> 2) + 1; |
| if (length < 0) { |
| throw new IOException("Illegal block with a negative match length found"); |
| } |
| uncompressedBytesRemaining -= length; |
| |
| offset = (int) ByteUtils.fromLittleEndian(supplier, 2); |
| |
| try { |
| startBackReference(offset, length); |
| } catch (final IllegalArgumentException ex) { |
| throw new IOException("Illegal block with bad offset found", ex); |
| } |
| state = State.IN_BACK_REFERENCE; |
| break; |
| |
| case 0x03: |
| |
| /* |
| * These are like the copies with 2-byte offsets (see previous |
| * subsection), except that the offset is stored as a 32-bit |
| * integer instead of a 16-bit integer (and thus will occupy |
| * four bytes). |
| */ |
| |
| length = (b >> 2) + 1; |
| if (length < 0) { |
| throw new IOException("Illegal block with a negative match length found"); |
| } |
| uncompressedBytesRemaining -= length; |
| |
| offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff; |
| |
| try { |
| startBackReference(offset, length); |
| } catch (final IllegalArgumentException ex) { |
| throw new IOException("Illegal block with bad offset found", ex); |
| } |
| state = State.IN_BACK_REFERENCE; |
| break; |
| default: |
| // impossible as TAG_MASK is two bits and all four possible cases have been covered |
| break; |
| } |
| } |
| |
| /* |
| * For literals up to and including 60 bytes in length, the |
| * upper six bits of the tag byte contain (len-1). The literal |
| * follows immediately thereafter in the bytestream. - For |
| * longer literals, the (len-1) value is stored after the tag |
| * byte, little-endian. The upper six bits of the tag byte |
| * describe how many bytes are used for the length; 60, 61, 62 |
| * or 63 for 1-4 bytes, respectively. The literal itself follows |
| * after the length. |
| */ |
| private int readLiteralLength(final int b) throws IOException { |
| final int length; |
| switch (b >> 2) { |
| case 60: |
| length = readOneByte(); |
| if (length == -1) { |
| throw new IOException("Premature end of stream reading literal length"); |
| } |
| break; |
| case 61: |
| length = (int) ByteUtils.fromLittleEndian(supplier, 2); |
| break; |
| case 62: |
| length = (int) ByteUtils.fromLittleEndian(supplier, 3); |
| break; |
| case 63: |
| length = (int) ByteUtils.fromLittleEndian(supplier, 4); |
| break; |
| default: |
| length = b >> 2; |
| break; |
| } |
| |
| return length + 1; |
| } |
| |
| /** |
| * The stream starts with the uncompressed length (up to a maximum of 2^32 - |
| * 1), stored as a little-endian varint. Varints consist of a series of |
| * bytes, where the lower 7 bits are data and the upper bit is set iff there |
| * are more bytes to be read. In other words, an uncompressed length of 64 |
| * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) |
| * would be stored as 0xFE 0xFF 0x7F. |
| * |
| * @return The size of the uncompressed data |
| * |
| * @throws IOException |
| * Could not read a byte |
| */ |
| private long readSize() throws IOException { |
| int index = 0; |
| long sz = 0; |
| int b = 0; |
| |
| do { |
| b = readOneByte(); |
| if (b == -1) { |
| throw new IOException("Premature end of stream reading size"); |
| } |
| sz |= (b & 0x7f) << (index++ * 7); |
| } while (0 != (b & 0x80)); |
| return sz; |
| } |
| |
| /** |
| * Get the uncompressed size of the stream |
| * |
| * @return the uncompressed size |
| */ |
| @Override |
| public int getSize() { |
| return size; |
| } |
| |
| private enum State { |
| NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE |
| } |
| } |