| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.commons.compress.compressors.snappy; |
| |
| import java.io.IOException; |
| import java.io.OutputStream; |
| |
| import org.apache.commons.compress.compressors.CompressorOutputStream; |
| import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; |
| import org.apache.commons.compress.compressors.lz77support.Parameters; |
| import org.apache.commons.compress.utils.ByteUtils; |
| |
| /** |
| * CompressorOutputStream for the raw Snappy format. |
| * |
| * <p>This implementation uses an internal buffer in order to handle |
| * the back-references that are at the heart of the LZ77 algorithm. |
| * The size of the buffer must be at least as big as the biggest |
| * offset used in the compressed stream. The current version of the |
| * Snappy algorithm as defined by Google works on 32k blocks and |
| * doesn't contain offsets bigger than 32k which is the default block |
| * size used by this class.</p> |
| * |
| * <p>The raw Snappy format requires the uncompressed size to be |
| * written at the beginning of the stream using a varint |
| * representation, i.e. the number of bytes needed to write the |
| * information is not known before the uncompressed size is |
| * known. We've chosen to make the uncompressedSize a parameter of the |
| * constructor in favor of buffering the whole output until the size |
| * is known. When using the {@link FramedSnappyCompressorOutputStream} |
| * this limitation is taken care of by the warpping framing |
| * format.</p> |
| * |
| * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> |
| * @since 1.14 |
| * @NotThreadSafe |
| */ |
| public class SnappyCompressorOutputStream extends CompressorOutputStream { |
| private final LZ77Compressor compressor; |
| private final OutputStream os; |
| private final ByteUtils.ByteConsumer consumer; |
| |
| // used in one-arg write method |
| private final byte[] oneByte = new byte[1]; |
| |
| private boolean finished; |
| |
| /** |
| * Constructor using the default block size of 32k. |
| * |
| * @param os the outputstream to write compressed data to |
| * @param uncompressedSize the uncompressed size of data |
| * @throws IOException if writing of the size fails |
| */ |
| public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException { |
| this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE); |
| } |
| |
| /** |
| * Constructor using a configurable block size. |
| * |
| * @param os the outputstream to write compressed data to |
| * @param uncompressedSize the uncompressed size of data |
| * @param blockSize the block size used - must be a power of two |
| * @throws IOException if writing of the size fails |
| */ |
| public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize) |
| throws IOException { |
| this(os, uncompressedSize, createParameterBuilder(blockSize).build()); |
| } |
| |
| /** |
| * Constructor providing full control over the underlying LZ77 compressor. |
| * |
| * @param os the outputstream to write compressed data to |
| * @param uncompressedSize the uncompressed size of data |
| * @param params the parameters to use by the compressor - note |
| * that the format itself imposes some limits like a maximum match |
| * length of 64 bytes |
| * @throws IOException if writing of the size fails |
| */ |
| public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final Parameters params) |
| throws IOException { |
| this.os = os; |
| consumer = new ByteUtils.OutputStreamByteConsumer(os); |
| compressor = new LZ77Compressor(params, block -> { |
| switch (block.getType()) { |
| case LITERAL: |
| writeLiteralBlock((LZ77Compressor.LiteralBlock) block); |
| break; |
| case BACK_REFERENCE: |
| writeBackReference((LZ77Compressor.BackReference) block); |
| break; |
| case EOD: |
| break; |
| } |
| }); |
| writeUncompressedSize(uncompressedSize); |
| } |
| |
| @Override |
| public void write(final int b) throws IOException { |
| oneByte[0] = (byte) (b & 0xff); |
| write(oneByte); |
| } |
| |
| @Override |
| public void write(final byte[] data, final int off, final int len) throws IOException { |
| compressor.compress(data, off, len); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| try { |
| finish(); |
| } finally { |
| os.close(); |
| } |
| } |
| |
| /** |
| * Compresses all remaining data and writes it to the stream, |
| * doesn't close the underlying stream. |
| * @throws IOException if an error occurs |
| */ |
| public void finish() throws IOException { |
| if (!finished) { |
| compressor.finish(); |
| finished = true; |
| } |
| } |
| |
| private void writeUncompressedSize(long uncompressedSize) throws IOException { |
| boolean more = false; |
| do { |
| int currentByte = (int) (uncompressedSize & 0x7F); |
| more = uncompressedSize > currentByte; |
| if (more) { |
| currentByte |= 0x80; |
| } |
| os.write(currentByte); |
| uncompressedSize >>= 7; |
| } while (more); |
| } |
| |
| // literal length is stored as (len - 1) either inside the tag |
| // (six bits minus four flags) or in 1 to 4 bytes after the tag |
| private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60; |
| private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8; |
| private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16; |
| private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24; |
| |
| private static final int ONE_SIZE_BYTE_MARKER = 60 << 2; |
| private static final int TWO_SIZE_BYTE_MARKER = 61 << 2; |
| private static final int THREE_SIZE_BYTE_MARKER = 62 << 2; |
| private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2; |
| |
| private void writeLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { |
| final int len = block.getLength(); |
| if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) { |
| writeLiteralBlockNoSizeBytes(block, len); |
| } else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) { |
| writeLiteralBlockOneSizeByte(block, len); |
| } else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) { |
| writeLiteralBlockTwoSizeBytes(block, len); |
| } else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) { |
| writeLiteralBlockThreeSizeBytes(block, len); |
| } else { |
| writeLiteralBlockFourSizeBytes(block, len); |
| } |
| } |
| |
| private void writeLiteralBlockNoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { |
| writeLiteralBlockWithSize(len - 1 << 2, 0, len, block); |
| } |
| |
| private void writeLiteralBlockOneSizeByte(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { |
| writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block); |
| } |
| |
| private void writeLiteralBlockTwoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { |
| writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block); |
| } |
| |
| private void writeLiteralBlockThreeSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { |
| writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block); |
| } |
| |
| private void writeLiteralBlockFourSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException { |
| writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block); |
| } |
| |
| private void writeLiteralBlockWithSize(final int tagByte, final int sizeBytes, final int len, final LZ77Compressor.LiteralBlock block) |
| throws IOException { |
| os.write(tagByte); |
| writeLittleEndian(sizeBytes, len - 1); |
| os.write(block.getData(), block.getOffset(), len); |
| } |
| |
| private void writeLittleEndian(final int numBytes, final int num) throws IOException { |
| ByteUtils.toLittleEndian(consumer, num, numBytes); |
| } |
| |
| // Back-references ("copies") have their offset/size information |
| // in two, three or five bytes. |
| private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4; |
| private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11; |
| private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1; |
| private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1; |
| |
| private static final int ONE_BYTE_COPY_TAG = 1; |
| private static final int TWO_BYTE_COPY_TAG = 2; |
| private static final int FOUR_BYTE_COPY_TAG = 3; |
| |
| private void writeBackReference(final LZ77Compressor.BackReference block) throws IOException { |
| final int len = block.getLength(); |
| final int offset = block.getOffset(); |
| if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE |
| && offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) { |
| writeBackReferenceWithOneOffsetByte(len, offset); |
| } else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) { |
| writeBackReferenceWithTwoOffsetBytes(len, offset); |
| } else { |
| writeBackReferenceWithFourOffsetBytes(len, offset); |
| } |
| } |
| |
| private void writeBackReferenceWithOneOffsetByte(final int len, final int offset) throws IOException { |
| os.write(ONE_BYTE_COPY_TAG | ((len - 4) << 2) | ((offset & 0x700) >> 3)); |
| os.write(offset & 0xff); |
| } |
| |
| private void writeBackReferenceWithTwoOffsetBytes(final int len, final int offset) throws IOException { |
| writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset); |
| } |
| |
| private void writeBackReferenceWithFourOffsetBytes(final int len, final int offset) throws IOException { |
| writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset); |
| } |
| |
| private void writeBackReferenceWithLittleEndianOffset(final int tag, final int offsetBytes, final int len, final int offset) |
| throws IOException { |
| os.write(tag | ((len - 1) << 2)); |
| writeLittleEndian(offsetBytes, offset); |
| } |
| |
| // technically the format could use shorter matches but with a |
| // length of three the offset would be encoded as at least two |
| // bytes in addition to the tag, so yield no compression at all |
| private static final int MIN_MATCH_LENGTH = 4; |
| // Snappy stores the match length in six bits of the tag |
| private static final int MAX_MATCH_LENGTH = 64; |
| |
| /** |
| * Returns a builder correctly configured for the Snappy algorithm using the gven block size. |
| * @param blockSize the block size. |
| * @return a builder correctly configured for the Snappy algorithm using the gven block size |
| */ |
| public static Parameters.Builder createParameterBuilder(final int blockSize) { |
| // the max offset and max literal length defined by the format |
| // are 2^32 - 1 and 2^32 respectively - with blockSize being |
| // an integer we will never exceed that |
| return Parameters.builder(blockSize) |
| .withMinBackReferenceLength(MIN_MATCH_LENGTH) |
| .withMaxBackReferenceLength(MAX_MATCH_LENGTH) |
| .withMaxOffset(blockSize) |
| .withMaxLiteralLength(blockSize); |
| } |
| } |