| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.commons.compress.compressors.lz4; |
| |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.util.Arrays; |
| import java.util.Deque; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| |
| import org.apache.commons.compress.compressors.CompressorOutputStream; |
| import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; |
| import org.apache.commons.compress.compressors.lz77support.Parameters; |
| import org.apache.commons.compress.utils.ByteUtils; |
| |
| /** |
| * CompressorOutputStream for the LZ4 block format. |
| * |
| * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> |
| * @since 1.14 |
| * @NotThreadSafe |
| */ |
| public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { |
| |
| private static final int MIN_BACK_REFERENCE_LENGTH = 4; |
| private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; |
| |
| /* |
| |
| The LZ4 block format has a few properties that make it less |
| straight-forward than one would hope: |
| |
| * literal blocks and back-references must come in pairs (except |
| for the very last literal block), so consecutive literal |
| blocks created by the compressor must be merged into a single |
| block. |
| |
| * the start of a literal/back-reference pair contains the length |
| of the back-reference (at least some part of it) so we can't |
| start writing the literal before we know how long the next |
| back-reference is going to be. |
| |
| * there are special rules for the final blocks |
| |
| > There are specific parsing rules to respect in order to remain |
| > compatible with assumptions made by the decoder : |
| > |
| > 1. The last 5 bytes are always literals |
| > |
| > 2. The last match must start at least 12 bytes before end of |
| > block. Consequently, a block with less than 13 bytes cannot be |
| > compressed. |
| |
| which means any back-reference may need to get rewritten as a |
| literal block unless we know the next block is at least of |
| length 5 and the sum of this block's length and offset and the |
| next block's length is at least twelve. |
| |
| */ |
| |
| private final LZ77Compressor compressor; |
| private final OutputStream os; |
| |
| // used in one-arg write method |
| private final byte[] oneByte = new byte[1]; |
| |
| private boolean finished; |
| |
| private final Deque<Pair> pairs = new LinkedList<>(); |
| // keeps track of the last window-size bytes (64k) in order to be |
| // able to expand back-references when needed |
| private final Deque<byte[]> expandedBlocks = new LinkedList<>(); |
| |
| /** |
| * Creates a new LZ4 output stream. |
| * |
| * @param os |
| * An OutputStream to read compressed data from |
| * |
| * @throws IOException if reading fails |
| */ |
| public BlockLZ4CompressorOutputStream(final OutputStream os) throws IOException { |
| this(os, createParameterBuilder().build()); |
| } |
| |
| /** |
| * Creates a new LZ4 output stream. |
| * |
| * @param os |
| * An OutputStream to read compressed data from |
| * @param params |
| * The parameters to use for LZ77 compression. |
| * |
| * @throws IOException if reading fails |
| */ |
| public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) throws IOException { |
| this.os = os; |
| compressor = new LZ77Compressor(params, |
| block -> { |
| switch (block.getType()) { |
| case LITERAL: |
| addLiteralBlock((LZ77Compressor.LiteralBlock) block); |
| break; |
| case BACK_REFERENCE: |
| addBackReference((LZ77Compressor.BackReference) block); |
| break; |
| case EOD: |
| writeFinalLiteralBlock(); |
| break; |
| } |
| }); |
| } |
| |
| @Override |
| public void write(final int b) throws IOException { |
| oneByte[0] = (byte) (b & 0xff); |
| write(oneByte); |
| } |
| |
| @Override |
| public void write(final byte[] data, final int off, final int len) throws IOException { |
| compressor.compress(data, off, len); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| try { |
| finish(); |
| } finally { |
| os.close(); |
| } |
| } |
| |
| /** |
| * Compresses all remaining data and writes it to the stream, |
| * doesn't close the underlying stream. |
| * @throws IOException if an error occurs |
| */ |
| public void finish() throws IOException { |
| if (!finished) { |
| compressor.finish(); |
| finished = true; |
| } |
| } |
| |
| /** |
| * Adds some initial data to fill the window with. |
| * |
| * @param data the data to fill the window with. |
| * @param off offset of real data into the array |
| * @param len amount of data |
| * @throws IllegalStateException if the stream has already started to write data |
| * @see LZ77Compressor#prefill |
| */ |
| public void prefill(final byte[] data, final int off, final int len) { |
| if (len > 0) { |
| final byte[] b = Arrays.copyOfRange(data, off, off + len); |
| compressor.prefill(b); |
| recordLiteral(b); |
| } |
| } |
| |
| private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { |
| final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); |
| recordLiteral(last.addLiteral(block)); |
| clearUnusedBlocksAndPairs(); |
| } |
| |
| private void addBackReference(final LZ77Compressor.BackReference block) throws IOException { |
| final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); |
| last.setBackReference(block); |
| recordBackReference(block); |
| clearUnusedBlocksAndPairs(); |
| } |
| |
| private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException { |
| writeWritablePairs(length); |
| Pair last = pairs.peekLast(); |
| if (last == null || last.hasBackReference()) { |
| last = new Pair(); |
| pairs.addLast(last); |
| } |
| return last; |
| } |
| |
| private void recordLiteral(final byte[] b) { |
| expandedBlocks.addFirst(b); |
| } |
| |
| private void clearUnusedBlocksAndPairs() { |
| clearUnusedBlocks(); |
| clearUnusedPairs(); |
| } |
| |
| private void clearUnusedBlocks() { |
| int blockLengths = 0; |
| int blocksToKeep = 0; |
| for (final byte[] b : expandedBlocks) { |
| blocksToKeep++; |
| blockLengths += b.length; |
| if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { |
| break; |
| } |
| } |
| final int size = expandedBlocks.size(); |
| for (int i = blocksToKeep; i < size; i++) { |
| expandedBlocks.removeLast(); |
| } |
| } |
| |
| private void recordBackReference(final LZ77Compressor.BackReference block) { |
| expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); |
| } |
| |
| private byte[] expand(final int offset, final int length) { |
| final byte[] expanded = new byte[length]; |
| if (offset == 1) { // surprisingly common special case |
| final byte[] block = expandedBlocks.peekFirst(); |
| final byte b = block[block.length - 1]; |
| if (b != 0) { // the fresh array contains 0s anyway |
| Arrays.fill(expanded, b); |
| } |
| } else { |
| expandFromList(expanded, offset, length); |
| } |
| return expanded; |
| } |
| |
| private void expandFromList(final byte[] expanded, final int offset, final int length) { |
| int offsetRemaining = offset; |
| int lengthRemaining = length; |
| int writeOffset = 0; |
| while (lengthRemaining > 0) { |
| // find block that contains offsetRemaining |
| byte[] block = null; |
| final int copyLen; |
| final int copyOffset; |
| if (offsetRemaining > 0) { |
| int blockOffset = 0; |
| for (final byte[] b : expandedBlocks) { |
| if (b.length + blockOffset >= offsetRemaining) { |
| block = b; |
| break; |
| } |
| blockOffset += b.length; |
| } |
| if (block == null) { |
| // should not be possible |
| throw new IllegalStateException("Failed to find a block containing offset " + offset); |
| } |
| copyOffset = blockOffset + block.length - offsetRemaining; |
| copyLen = Math.min(lengthRemaining, block.length - copyOffset); |
| } else { |
| // offsetRemaining is negative or 0 and points into the expanded bytes |
| block = expanded; |
| copyOffset = -offsetRemaining; |
| copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); |
| } |
| System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); |
| offsetRemaining -= copyLen; |
| lengthRemaining -= copyLen; |
| writeOffset += copyLen; |
| } |
| } |
| |
| private void clearUnusedPairs() { |
| int pairLengths = 0; |
| int pairsToKeep = 0; |
| for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { |
| final Pair p = it.next(); |
| pairsToKeep++; |
| pairLengths += p.length(); |
| if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { |
| break; |
| } |
| } |
| final int size = pairs.size(); |
| for (int i = pairsToKeep; i < size; i++) { |
| final Pair p = pairs.peekFirst(); |
| if (!p.hasBeenWritten()) { |
| break; |
| } |
| pairs.removeFirst(); |
| } |
| } |
| |
| private void writeFinalLiteralBlock() throws IOException { |
| rewriteLastPairs(); |
| for (final Pair p : pairs) { |
| if (!p.hasBeenWritten()) { |
| p.writeTo(os); |
| } |
| } |
| pairs.clear(); |
| } |
| |
| private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException { |
| int unwrittenLength = lengthOfBlocksAfterLastPair; |
| for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { |
| final Pair p = it.next(); |
| if (p.hasBeenWritten()) { |
| break; |
| } |
| unwrittenLength += p.length(); |
| } |
| for (final Pair p : pairs) { |
| if (p.hasBeenWritten()) { |
| continue; |
| } |
| unwrittenLength -= p.length(); |
| if (!p.canBeWritten(unwrittenLength)) { |
| break; |
| } |
| p.writeTo(os); |
| } |
| } |
| |
| private void rewriteLastPairs() { |
| final LinkedList<Pair> lastPairs = new LinkedList<>(); |
| final LinkedList<Integer> pairLength = new LinkedList<>(); |
| int offset = 0; |
| for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { |
| final Pair p = it.next(); |
| if (p.hasBeenWritten()) { |
| break; |
| } |
| final int len = p.length(); |
| pairLength.addFirst(len); |
| lastPairs.addFirst(p); |
| offset += len; |
| if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { |
| break; |
| } |
| } |
| for (final Pair p : lastPairs) { |
| pairs.remove(p); |
| } |
| // lastPairs may contain between one and four Pairs: |
| // * the last pair may be a one byte literal |
| // * all other Pairs contain a back-reference which must be four bytes long at minimum |
| // we could merge them all into a single literal block but |
| // this may harm compression. For example compressing |
| // "bla.tar" from our tests yields a last block containing a |
| // back-reference of length > 2k and we'd end up with a last |
| // literal of that size rather than a 2k back-reference and a |
| // 12 byte literal at the end. |
| |
| // Instead we merge all but the first of lastPairs into a new |
| // literal-only Pair "replacement" and look at the |
| // back-reference in the first of lastPairs and see if we can |
| // split it. We can split it if it is longer than 16 - |
| // replacement.length (i.e. the minimal length of four is kept |
| // while making sure the last literal is at least twelve bytes |
| // long). If we can't split it, we expand the first of the pairs |
| // as well. |
| |
| // this is not optimal, we could get better compression |
| // results with more complex approaches as the last literal |
| // only needs to be five bytes long if the previous |
| // back-reference has an offset big enough |
| |
| final int lastPairsSize = lastPairs.size(); |
| int toExpand = 0; |
| for (int i = 1; i < lastPairsSize; i++) { |
| toExpand += pairLength.get(i); |
| } |
| final Pair replacement = new Pair(); |
| if (toExpand > 0) { |
| replacement.prependLiteral(expand(toExpand, toExpand)); |
| } |
| final Pair splitCandidate = lastPairs.get(0); |
| final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; |
| final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0; |
| if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { |
| replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); |
| pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded)); |
| } else { |
| if (splitCandidate.hasBackReference()) { |
| replacement.prependLiteral(expand(toExpand + brLen, brLen)); |
| } |
| splitCandidate.prependTo(replacement); |
| } |
| pairs.add(replacement); |
| } |
| |
| /** |
| * Returns a builder correctly configured for the LZ4 algorithm. |
| * @return a builder correctly configured for the LZ4 algorithm |
| */ |
| public static Parameters.Builder createParameterBuilder() { |
| final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; |
| return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE) |
| .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH) |
| .withMaxBackReferenceLength(maxLen) |
| .withMaxOffset(maxLen) |
| .withMaxLiteralLength(maxLen); |
| } |
| |
| final static class Pair { |
| private final Deque<byte[]> literals = new LinkedList<>(); |
| private int brOffset, brLength; |
| private boolean written; |
| |
| private void prependLiteral(final byte[] data) { |
| literals.addFirst(data); |
| } |
| byte[] addLiteral(final LZ77Compressor.LiteralBlock block) { |
| final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), |
| block.getOffset() + block.getLength()); |
| literals.add(copy); |
| return copy; |
| } |
| void setBackReference(final LZ77Compressor.BackReference block) { |
| if (hasBackReference()) { |
| throw new IllegalStateException(); |
| } |
| brOffset = block.getOffset(); |
| brLength = block.getLength(); |
| } |
| boolean hasBackReference() { |
| return brOffset > 0; |
| } |
| boolean canBeWritten(final int lengthOfBlocksAfterThisPair) { |
| return hasBackReference() |
| && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; |
| } |
| int length() { |
| return literalLength() + brLength; |
| } |
| private boolean hasBeenWritten() { |
| return written; |
| } |
| void writeTo(final OutputStream out) throws IOException { |
| final int litLength = literalLength(); |
| out.write(lengths(litLength, brLength)); |
| if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { |
| writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); |
| } |
| for (final byte[] b : literals) { |
| out.write(b); |
| } |
| if (hasBackReference()) { |
| ByteUtils.toLittleEndian(out, brOffset, 2); |
| if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { |
| writeLength(brLength - MIN_BACK_REFERENCE_LENGTH |
| - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); |
| } |
| } |
| written = true; |
| } |
| private int literalLength() { |
| int length = 0; |
| for (final byte[] b : literals) { |
| length += b.length; |
| } |
| return length; |
| } |
| private static int lengths(final int litLength, final int brLength) { |
| final int l = litLength < 15 ? litLength : 15; |
| final int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15); |
| return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br; |
| } |
| private static void writeLength(int length, final OutputStream out) throws IOException { |
| while (length >= 255) { |
| out.write(255); |
| length -= 255; |
| } |
| out.write(length); |
| } |
| private int backReferenceLength() { |
| return brLength; |
| } |
| private void prependTo(final Pair other) { |
| final Iterator<byte[]> listBackwards = literals.descendingIterator(); |
| while (listBackwards.hasNext()) { |
| other.prependLiteral(listBackwards.next()); |
| } |
| } |
| private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) { |
| final Pair p = new Pair(); |
| p.literals.addAll(literals); |
| p.brOffset = brOffset; |
| p.brLength = newBackReferenceLength; |
| return p; |
| } |
| } |
| } |