blob: a0550f6ac7f43b8f7e2fd0a5be1326b1aa762766 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.compress.compressors.lz4;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.commons.compress.compressors.CompressorOutputStream;
import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
import org.apache.commons.compress.compressors.lz77support.Parameters;
import org.apache.commons.compress.utils.ByteUtils;
/**
* CompressorOutputStream for the LZ4 block format.
*
* @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
* @since 1.14
* @NotThreadSafe
*/
public class BlockLZ4CompressorOutputStream extends CompressorOutputStream {
private static final int MIN_BACK_REFERENCE_LENGTH = 4;
private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
/*
The LZ4 block format has a few properties that make it less
straight-forward than one would hope:
* literal blocks and back-references must come in pairs (except
for the very last literal block), so consecutive literal
blocks created by the compressor must be merged into a single
block.
* the start of a literal/back-reference pair contains the length
of the back-reference (at least some part of it) so we can't
start writing the literal before we know how long the next
back-reference is going to be.
* there are special rules for the final blocks
> There are specific parsing rules to respect in order to remain
> compatible with assumptions made by the decoder :
>
> 1. The last 5 bytes are always literals
>
> 2. The last match must start at least 12 bytes before end of
> block. Consequently, a block with less than 13 bytes cannot be
> compressed.
which means any back-reference may need to get rewritten as a
literal block unless we know the next block is at least of
length 5 and the sum of this block's length and offset and the
next block's length is at least twelve.
*/
private final LZ77Compressor compressor;
private final OutputStream os;
// used in one-arg write method
private final byte[] oneByte = new byte[1];
private boolean finished;
private final Deque<Pair> pairs = new LinkedList<>();
// keeps track of the last window-size bytes (64k) in order to be
// able to expand back-references when needed
private final Deque<byte[]> expandedBlocks = new LinkedList<>();
/**
* Creates a new LZ4 output stream.
*
* @param os
* An OutputStream to read compressed data from
*
* @throws IOException if reading fails
*/
public BlockLZ4CompressorOutputStream(final OutputStream os) throws IOException {
this(os, createParameterBuilder().build());
}
/**
* Creates a new LZ4 output stream.
*
* @param os
* An OutputStream to read compressed data from
* @param params
* The parameters to use for LZ77 compression.
*
* @throws IOException if reading fails
*/
public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) throws IOException {
this.os = os;
compressor = new LZ77Compressor(params,
block -> {
switch (block.getType()) {
case LITERAL:
addLiteralBlock((LZ77Compressor.LiteralBlock) block);
break;
case BACK_REFERENCE:
addBackReference((LZ77Compressor.BackReference) block);
break;
case EOD:
writeFinalLiteralBlock();
break;
}
});
}
@Override
public void write(final int b) throws IOException {
oneByte[0] = (byte) (b & 0xff);
write(oneByte);
}
@Override
public void write(final byte[] data, final int off, final int len) throws IOException {
compressor.compress(data, off, len);
}
@Override
public void close() throws IOException {
try {
finish();
} finally {
os.close();
}
}
/**
* Compresses all remaining data and writes it to the stream,
* doesn't close the underlying stream.
* @throws IOException if an error occurs
*/
public void finish() throws IOException {
if (!finished) {
compressor.finish();
finished = true;
}
}
/**
* Adds some initial data to fill the window with.
*
* @param data the data to fill the window with.
* @param off offset of real data into the array
* @param len amount of data
* @throws IllegalStateException if the stream has already started to write data
* @see LZ77Compressor#prefill
*/
public void prefill(final byte[] data, final int off, final int len) {
if (len > 0) {
final byte[] b = Arrays.copyOfRange(data, off, off + len);
compressor.prefill(b);
recordLiteral(b);
}
}
private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
recordLiteral(last.addLiteral(block));
clearUnusedBlocksAndPairs();
}
private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
last.setBackReference(block);
recordBackReference(block);
clearUnusedBlocksAndPairs();
}
private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
writeWritablePairs(length);
Pair last = pairs.peekLast();
if (last == null || last.hasBackReference()) {
last = new Pair();
pairs.addLast(last);
}
return last;
}
private void recordLiteral(final byte[] b) {
expandedBlocks.addFirst(b);
}
private void clearUnusedBlocksAndPairs() {
clearUnusedBlocks();
clearUnusedPairs();
}
private void clearUnusedBlocks() {
int blockLengths = 0;
int blocksToKeep = 0;
for (final byte[] b : expandedBlocks) {
blocksToKeep++;
blockLengths += b.length;
if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
break;
}
}
final int size = expandedBlocks.size();
for (int i = blocksToKeep; i < size; i++) {
expandedBlocks.removeLast();
}
}
private void recordBackReference(final LZ77Compressor.BackReference block) {
expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
}
private byte[] expand(final int offset, final int length) {
final byte[] expanded = new byte[length];
if (offset == 1) { // surprisingly common special case
final byte[] block = expandedBlocks.peekFirst();
final byte b = block[block.length - 1];
if (b != 0) { // the fresh array contains 0s anyway
Arrays.fill(expanded, b);
}
} else {
expandFromList(expanded, offset, length);
}
return expanded;
}
private void expandFromList(final byte[] expanded, final int offset, final int length) {
int offsetRemaining = offset;
int lengthRemaining = length;
int writeOffset = 0;
while (lengthRemaining > 0) {
// find block that contains offsetRemaining
byte[] block = null;
final int copyLen;
final int copyOffset;
if (offsetRemaining > 0) {
int blockOffset = 0;
for (final byte[] b : expandedBlocks) {
if (b.length + blockOffset >= offsetRemaining) {
block = b;
break;
}
blockOffset += b.length;
}
if (block == null) {
// should not be possible
throw new IllegalStateException("Failed to find a block containing offset " + offset);
}
copyOffset = blockOffset + block.length - offsetRemaining;
copyLen = Math.min(lengthRemaining, block.length - copyOffset);
} else {
// offsetRemaining is negative or 0 and points into the expanded bytes
block = expanded;
copyOffset = -offsetRemaining;
copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
}
System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
offsetRemaining -= copyLen;
lengthRemaining -= copyLen;
writeOffset += copyLen;
}
}
private void clearUnusedPairs() {
int pairLengths = 0;
int pairsToKeep = 0;
for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
final Pair p = it.next();
pairsToKeep++;
pairLengths += p.length();
if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
break;
}
}
final int size = pairs.size();
for (int i = pairsToKeep; i < size; i++) {
final Pair p = pairs.peekFirst();
if (!p.hasBeenWritten()) {
break;
}
pairs.removeFirst();
}
}
private void writeFinalLiteralBlock() throws IOException {
rewriteLastPairs();
for (final Pair p : pairs) {
if (!p.hasBeenWritten()) {
p.writeTo(os);
}
}
pairs.clear();
}
private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
int unwrittenLength = lengthOfBlocksAfterLastPair;
for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
final Pair p = it.next();
if (p.hasBeenWritten()) {
break;
}
unwrittenLength += p.length();
}
for (final Pair p : pairs) {
if (p.hasBeenWritten()) {
continue;
}
unwrittenLength -= p.length();
if (!p.canBeWritten(unwrittenLength)) {
break;
}
p.writeTo(os);
}
}
private void rewriteLastPairs() {
final LinkedList<Pair> lastPairs = new LinkedList<>();
final LinkedList<Integer> pairLength = new LinkedList<>();
int offset = 0;
for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
final Pair p = it.next();
if (p.hasBeenWritten()) {
break;
}
final int len = p.length();
pairLength.addFirst(len);
lastPairs.addFirst(p);
offset += len;
if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
break;
}
}
for (final Pair p : lastPairs) {
pairs.remove(p);
}
// lastPairs may contain between one and four Pairs:
// * the last pair may be a one byte literal
// * all other Pairs contain a back-reference which must be four bytes long at minimum
// we could merge them all into a single literal block but
// this may harm compression. For example compressing
// "bla.tar" from our tests yields a last block containing a
// back-reference of length > 2k and we'd end up with a last
// literal of that size rather than a 2k back-reference and a
// 12 byte literal at the end.
// Instead we merge all but the first of lastPairs into a new
// literal-only Pair "replacement" and look at the
// back-reference in the first of lastPairs and see if we can
// split it. We can split it if it is longer than 16 -
// replacement.length (i.e. the minimal length of four is kept
// while making sure the last literal is at least twelve bytes
// long). If we can't split it, we expand the first of the pairs
// as well.
// this is not optimal, we could get better compression
// results with more complex approaches as the last literal
// only needs to be five bytes long if the previous
// back-reference has an offset big enough
final int lastPairsSize = lastPairs.size();
int toExpand = 0;
for (int i = 1; i < lastPairsSize; i++) {
toExpand += pairLength.get(i);
}
final Pair replacement = new Pair();
if (toExpand > 0) {
replacement.prependLiteral(expand(toExpand, toExpand));
}
final Pair splitCandidate = lastPairs.get(0);
final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
} else {
if (splitCandidate.hasBackReference()) {
replacement.prependLiteral(expand(toExpand + brLen, brLen));
}
splitCandidate.prependTo(replacement);
}
pairs.add(replacement);
}
/**
* Returns a builder correctly configured for the LZ4 algorithm.
* @return a builder correctly configured for the LZ4 algorithm
*/
public static Parameters.Builder createParameterBuilder() {
final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE)
.withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
.withMaxBackReferenceLength(maxLen)
.withMaxOffset(maxLen)
.withMaxLiteralLength(maxLen);
}
final static class Pair {
private final Deque<byte[]> literals = new LinkedList<>();
private int brOffset, brLength;
private boolean written;
private void prependLiteral(final byte[] data) {
literals.addFirst(data);
}
byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(),
block.getOffset() + block.getLength());
literals.add(copy);
return copy;
}
void setBackReference(final LZ77Compressor.BackReference block) {
if (hasBackReference()) {
throw new IllegalStateException();
}
brOffset = block.getOffset();
brLength = block.getLength();
}
boolean hasBackReference() {
return brOffset > 0;
}
boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
return hasBackReference()
&& lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
}
int length() {
return literalLength() + brLength;
}
private boolean hasBeenWritten() {
return written;
}
void writeTo(final OutputStream out) throws IOException {
final int litLength = literalLength();
out.write(lengths(litLength, brLength));
if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
}
for (final byte[] b : literals) {
out.write(b);
}
if (hasBackReference()) {
ByteUtils.toLittleEndian(out, brOffset, 2);
if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
writeLength(brLength - MIN_BACK_REFERENCE_LENGTH
- BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
}
}
written = true;
}
private int literalLength() {
int length = 0;
for (final byte[] b : literals) {
length += b.length;
}
return length;
}
private static int lengths(final int litLength, final int brLength) {
final int l = litLength < 15 ? litLength : 15;
final int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15);
return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br;
}
private static void writeLength(int length, final OutputStream out) throws IOException {
while (length >= 255) {
out.write(255);
length -= 255;
}
out.write(length);
}
private int backReferenceLength() {
return brLength;
}
private void prependTo(final Pair other) {
final Iterator<byte[]> listBackwards = literals.descendingIterator();
while (listBackwards.hasNext()) {
other.prependLiteral(listBackwards.next());
}
}
private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
final Pair p = new Pair();
p.literals.addAll(literals);
p.brOffset = brOffset;
p.brLength = newBackReferenceLength;
return p;
}
}
}