blob: 1b9c47cb728f68a28fd2be9bd3d1edf2e73cdc63 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.compress.compressors.snappy;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.commons.compress.compressors.CompressorOutputStream;
import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
import org.apache.commons.compress.compressors.lz77support.Parameters;
import org.apache.commons.compress.utils.ByteUtils;
/**
* CompressorOutputStream for the raw Snappy format.
*
* <p>This implementation uses an internal buffer in order to handle
* the back-references that are at the heart of the LZ77 algorithm.
* The size of the buffer must be at least as big as the biggest
* offset used in the compressed stream. The current version of the
* Snappy algorithm as defined by Google works on 32k blocks and
* doesn't contain offsets bigger than 32k which is the default block
* size used by this class.</p>
*
* <p>The raw Snappy format requires the uncompressed size to be
* written at the beginning of the stream using a varint
* representation, i.e. the number of bytes needed to write the
* information is not known before the uncompressed size is
* known. We've chosen to make the uncompressedSize a parameter of the
* constructor in favor of buffering the whole output until the size
* is known. When using the {@link FramedSnappyCompressorOutputStream}
* this limitation is taken care of by the warpping framing
* format.</p>
*
* @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
* @since 1.14
* @NotThreadSafe
*/
public class SnappyCompressorOutputStream extends CompressorOutputStream {
private final LZ77Compressor compressor;
private final OutputStream os;
private final ByteUtils.ByteConsumer consumer;
// used in one-arg write method
private final byte[] oneByte = new byte[1];
private boolean finished;
/**
* Constructor using the default block size of 32k.
*
* @param os the outputstream to write compressed data to
* @param uncompressedSize the uncompressed size of data
* @throws IOException if writing of the size fails
*/
public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize) throws IOException {
this(os, uncompressedSize, SnappyCompressorInputStream.DEFAULT_BLOCK_SIZE);
}
/**
* Constructor using a configurable block size.
*
* @param os the outputstream to write compressed data to
* @param uncompressedSize the uncompressed size of data
* @param blockSize the block size used - must be a power of two
* @throws IOException if writing of the size fails
*/
public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final int blockSize)
throws IOException {
this(os, uncompressedSize, createParameterBuilder(blockSize).build());
}
/**
* Constructor providing full control over the underlying LZ77 compressor.
*
* @param os the outputstream to write compressed data to
* @param uncompressedSize the uncompressed size of data
* @param params the parameters to use by the compressor - note
* that the format itself imposes some limits like a maximum match
* length of 64 bytes
* @throws IOException if writing of the size fails
*/
public SnappyCompressorOutputStream(final OutputStream os, final long uncompressedSize, final Parameters params)
throws IOException {
this.os = os;
consumer = new ByteUtils.OutputStreamByteConsumer(os);
compressor = new LZ77Compressor(params, block -> {
switch (block.getType()) {
case LITERAL:
writeLiteralBlock((LZ77Compressor.LiteralBlock) block);
break;
case BACK_REFERENCE:
writeBackReference((LZ77Compressor.BackReference) block);
break;
case EOD:
break;
}
});
writeUncompressedSize(uncompressedSize);
}
@Override
public void write(final int b) throws IOException {
oneByte[0] = (byte) (b & 0xff);
write(oneByte);
}
@Override
public void write(final byte[] data, final int off, final int len) throws IOException {
compressor.compress(data, off, len);
}
@Override
public void close() throws IOException {
try {
finish();
} finally {
os.close();
}
}
/**
* Compresses all remaining data and writes it to the stream,
* doesn't close the underlying stream.
* @throws IOException if an error occurs
*/
public void finish() throws IOException {
if (!finished) {
compressor.finish();
finished = true;
}
}
private void writeUncompressedSize(long uncompressedSize) throws IOException {
boolean more = false;
do {
int currentByte = (int) (uncompressedSize & 0x7F);
more = uncompressedSize > currentByte;
if (more) {
currentByte |= 0x80;
}
os.write(currentByte);
uncompressedSize >>= 7;
} while (more);
}
// literal length is stored as (len - 1) either inside the tag
// (six bits minus four flags) or in 1 to 4 bytes after the tag
private static final int MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES = 60;
private static final int MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE = 1 << 8;
private static final int MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES = 1 << 16;
private static final int MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES = 1 << 24;
private static final int ONE_SIZE_BYTE_MARKER = 60 << 2;
private static final int TWO_SIZE_BYTE_MARKER = 61 << 2;
private static final int THREE_SIZE_BYTE_MARKER = 62 << 2;
private static final int FOUR_SIZE_BYTE_MARKER = 63 << 2;
private void writeLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
final int len = block.getLength();
if (len <= MAX_LITERAL_SIZE_WITHOUT_SIZE_BYTES) {
writeLiteralBlockNoSizeBytes(block, len);
} else if (len <= MAX_LITERAL_SIZE_WITH_ONE_SIZE_BYTE) {
writeLiteralBlockOneSizeByte(block, len);
} else if (len <= MAX_LITERAL_SIZE_WITH_TWO_SIZE_BYTES) {
writeLiteralBlockTwoSizeBytes(block, len);
} else if (len <= MAX_LITERAL_SIZE_WITH_THREE_SIZE_BYTES) {
writeLiteralBlockThreeSizeBytes(block, len);
} else {
writeLiteralBlockFourSizeBytes(block, len);
}
}
private void writeLiteralBlockNoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
writeLiteralBlockWithSize(len - 1 << 2, 0, len, block);
}
private void writeLiteralBlockOneSizeByte(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
writeLiteralBlockWithSize(ONE_SIZE_BYTE_MARKER, 1, len, block);
}
private void writeLiteralBlockTwoSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
writeLiteralBlockWithSize(TWO_SIZE_BYTE_MARKER, 2, len, block);
}
private void writeLiteralBlockThreeSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
writeLiteralBlockWithSize(THREE_SIZE_BYTE_MARKER, 3, len, block);
}
private void writeLiteralBlockFourSizeBytes(final LZ77Compressor.LiteralBlock block, final int len) throws IOException {
writeLiteralBlockWithSize(FOUR_SIZE_BYTE_MARKER, 4, len, block);
}
private void writeLiteralBlockWithSize(final int tagByte, final int sizeBytes, final int len, final LZ77Compressor.LiteralBlock block)
throws IOException {
os.write(tagByte);
writeLittleEndian(sizeBytes, len - 1);
os.write(block.getData(), block.getOffset(), len);
}
private void writeLittleEndian(final int numBytes, final int num) throws IOException {
ByteUtils.toLittleEndian(consumer, num, numBytes);
}
// Back-references ("copies") have their offset/size information
// in two, three or five bytes.
private static final int MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 4;
private static final int MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE = 11;
private static final int MAX_OFFSET_WITH_ONE_OFFSET_BYTE = 1 << 11 - 1;
private static final int MAX_OFFSET_WITH_TWO_OFFSET_BYTES = 1 << 16 - 1;
private static final int ONE_BYTE_COPY_TAG = 1;
private static final int TWO_BYTE_COPY_TAG = 2;
private static final int FOUR_BYTE_COPY_TAG = 3;
private void writeBackReference(final LZ77Compressor.BackReference block) throws IOException {
final int len = block.getLength();
final int offset = block.getOffset();
if (len >= MIN_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE && len <= MAX_MATCH_LENGTH_WITH_ONE_OFFSET_BYTE
&& offset <= MAX_OFFSET_WITH_ONE_OFFSET_BYTE) {
writeBackReferenceWithOneOffsetByte(len, offset);
} else if (offset < MAX_OFFSET_WITH_TWO_OFFSET_BYTES) {
writeBackReferenceWithTwoOffsetBytes(len, offset);
} else {
writeBackReferenceWithFourOffsetBytes(len, offset);
}
}
private void writeBackReferenceWithOneOffsetByte(final int len, final int offset) throws IOException {
os.write(ONE_BYTE_COPY_TAG | ((len - 4) << 2) | ((offset & 0x700) >> 3));
os.write(offset & 0xff);
}
private void writeBackReferenceWithTwoOffsetBytes(final int len, final int offset) throws IOException {
writeBackReferenceWithLittleEndianOffset(TWO_BYTE_COPY_TAG, 2, len, offset);
}
private void writeBackReferenceWithFourOffsetBytes(final int len, final int offset) throws IOException {
writeBackReferenceWithLittleEndianOffset(FOUR_BYTE_COPY_TAG, 4, len, offset);
}
private void writeBackReferenceWithLittleEndianOffset(final int tag, final int offsetBytes, final int len, final int offset)
throws IOException {
os.write(tag | ((len - 1) << 2));
writeLittleEndian(offsetBytes, offset);
}
// technically the format could use shorter matches but with a
// length of three the offset would be encoded as at least two
// bytes in addition to the tag, so yield no compression at all
private static final int MIN_MATCH_LENGTH = 4;
// Snappy stores the match length in six bits of the tag
private static final int MAX_MATCH_LENGTH = 64;
/**
* Returns a builder correctly configured for the Snappy algorithm using the gven block size.
* @param blockSize the block size.
* @return a builder correctly configured for the Snappy algorithm using the gven block size
*/
public static Parameters.Builder createParameterBuilder(final int blockSize) {
// the max offset and max literal length defined by the format
// are 2^32 - 1 and 2^32 respectively - with blockSize being
// an integer we will never exceed that
return Parameters.builder(blockSize)
.withMinBackReferenceLength(MIN_MATCH_LENGTH)
.withMaxBackReferenceLength(MAX_MATCH_LENGTH)
.withMaxOffset(blockSize)
.withMaxLiteralLength(blockSize);
}
}