| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.commons.compress.compressors.lz77support; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.Objects; |
| |
| /** |
| * Helper class for compression algorithms that use the ideas of LZ77. |
| * |
| * <p>Most LZ77 derived algorithms split input data into blocks of |
| * uncompressed data (called literal blocks) and back-references |
| * (pairs of offsets and lengths) that state "add <code>length</code> |
| * bytes that are the same as those already written starting |
 * <code>offset</code> bytes before the current position". The details
| * of how those blocks and back-references are encoded are quite |
| * different between the algorithms and some algorithms perform |
| * additional steps (Huffman encoding in the case of DEFLATE for |
| * example).</p> |
| * |
| * <p>This class attempts to extract the core logic - finding |
| * back-references - so it can be re-used. It follows the algorithm |
| * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't |
| * implement the "lazy match" optimization. The three-byte hash |
| * function used in this class is the same as the one used by zlib and |
| * InfoZIP's ZIP implementation of DEFLATE. The whole class is |
| * strongly inspired by InfoZIP's implementation.</p> |
| * |
| * <p>LZ77 is used vaguely here (as well as many other places that |
| * talk about it :-), LZSS would likely be closer to the truth but |
| * LZ77 has become the synonym for a whole family of algorithms.</p> |
| * |
| * <p>The API consists of a compressor that is fed <code>byte</code>s |
| * and emits {@link Block}s to a registered callback where the blocks |
| * represent either {@link LiteralBlock literal blocks}, {@link |
| * BackReference back-references} or {@link EOD end of data |
| * markers}. In order to ensure the callback receives all information, |
 * the {@link #finish} method must be used once all data has been fed
| * into the compressor.</p> |
| * |
| * <p>Several parameters influence the outcome of the "compression":</p> |
| * <dl> |
| * |
| * <dt><code>windowSize</code></dt> <dd>the size of the sliding |
| * window, must be a power of two - this determines the maximum |
| * offset a back-reference can take. The compressor maintains a |
| * buffer of twice of <code>windowSize</code> - real world values are |
| * in the area of 32k.</dd> |
| * |
| * <dt><code>minBackReferenceLength</code></dt> |
| * <dd>Minimal length of a back-reference found. A true minimum of 3 is |
| * hard-coded inside of this implementation but bigger lengths can be |
| * configured.</dd> |
| * |
| * <dt><code>maxBackReferenceLength</code></dt> |
| * <dd>Maximal length of a back-reference found.</dd> |
| * |
| * <dt><code>maxOffset</code></dt> |
| * <dd>Maximal offset of a back-reference.</dd> |
| * |
| * <dt><code>maxLiteralLength</code></dt> |
| * <dd>Maximal length of a literal block.</dd> |
| * </dl> |
| * |
| * @see "https://tools.ietf.org/html/rfc1951#section-4" |
| * @since 1.14 |
| * @NotThreadSafe |
| */ |
| public class LZ77Compressor { |
| |
| /** |
| * Base class representing blocks the compressor may emit. |
| * |
| * <p>This class is not supposed to be subclassed by classes |
| * outside of Commons Compress so it is considered internal and |
| * changed that would break subclasses may get introduced with |
| * future releases.</p> |
| */ |
| public static abstract class Block { |
| /** Enumeration of the block types the compressor may emit. */ |
| public enum BlockType { |
| LITERAL, BACK_REFERENCE, EOD |
| } |
| public abstract BlockType getType(); |
| } |
| |
| /** |
| * Represents a literal block of data. |
| * |
| * <p>For performance reasons this encapsulates the real data, not |
| * a copy of it. Don't modify the data and process it inside of |
| * {@link Callback#accept} immediately as it will get overwritten |
| * sooner or later.</p> |
| */ |
| public static final class LiteralBlock extends Block { |
| private final byte[] data; |
| private final int offset, length; |
| public LiteralBlock(final byte[] data, final int offset, final int length) { |
| this.data = data; |
| this.offset = offset; |
| this.length = length; |
| } |
| /** |
| * The literal data. |
| * |
| * <p>This returns a life view of the actual data in order to |
| * avoid copying, modify the array at your own risk.</p> |
| * @return the data |
| */ |
| public byte[] getData() { |
| return data; |
| } |
| /** |
| * Offset into data where the literal block starts. |
| * @return the offset |
| */ |
| public int getOffset() { |
| return offset; |
| } |
| /** |
| * Length of literal block. |
| * @return the length |
| */ |
| public int getLength() { |
| return length; |
| } |
| @Override |
| public BlockType getType() { |
| return BlockType.LITERAL; |
| } |
| @Override |
| public String toString() { |
| return "LiteralBlock starting at " + offset + " with length " + length; |
| } |
| } |
| |
| /** |
| * Represents a back-reference. |
| */ |
| public static final class BackReference extends Block { |
| private final int offset, length; |
| public BackReference(final int offset, final int length) { |
| this.offset = offset; |
| this.length = length; |
| } |
| /** |
| * Provides the offset of the back-reference. |
| * @return the offset |
| */ |
| public int getOffset() { |
| return offset; |
| } |
| /** |
| * Provides the length of the back-reference. |
| * @return the length |
| */ |
| public int getLength() { |
| return length; |
| } |
| @Override |
| public BlockType getType() { |
| return BlockType.BACK_REFERENCE; |
| } |
| @Override |
| public String toString() { |
| return "BackReference with offset " + offset + " and length " + length; |
| } |
| } |
| |
| /** A simple "we are done" marker. */ |
| public static final class EOD extends Block { |
| @Override |
| public BlockType getType() { |
| return BlockType.EOD; |
| } |
| } |
| |
    // EOD carries no state, so a single shared instance can be emitted for every stream
    private static final Block THE_EOD = new EOD();
| |
| /** |
| * Callback invoked while the compressor processes data. |
| * |
| * <p>The callback is invoked on the same thread that receives the |
| * bytes to compress and may be invoked multiple times during the |
| * execution of {@link #compress} or {@link #finish}.</p> |
| */ |
| public interface Callback { |
| /** |
| * Consumes a block. |
| * @param b the block to consume |
| * @throws IOException in case of an error |
| */ |
| void accept(Block b) throws IOException; |
| } |
| |
    // number of consecutive bytes hashed together; matches the
    // hard-coded true minimum back-reference length of three
    static final int NUMBER_OF_BYTES_IN_HASH = 3;
    // sentinel used in head/prev/matchStart for "no position stored"
    private static final int NO_MATCH = -1;

    // configuration (window size, min/max lengths, lazy matching, ...)
    private final Parameters params;
    // receiver of the emitted blocks
    private final Callback callback;

    // the sliding window, twice as big as "windowSize" parameter
    private final byte[] window;
    // the head of hash-chain - indexed by hash-code, points to the
    // location inside of window of the latest sequence of bytes with
    // the given hash.
    private final int[] head;
    // for each window-location points to the latest earlier location
    // with the same hash. Only stores values for the latest
    // "windowSize" elements, the index is "window location modulo
    // windowSize".
    private final int[] prev;

    // bit mask used when indexing into prev
    private final int wMask;

    // true once enough bytes have been read to seed the rolling hash
    private boolean initialized = false;
    // the position inside of window that shall be encoded right now
    private int currentPosition;
    // the number of bytes available to compress including the one at
    // currentPosition
    private int lookahead = 0;
    // the hash of the three bytes starting at the current position
    private int insertHash = 0;
    // the position inside of the window where the current literal
    // block starts (in case we are inside of a literal block).
    private int blockStart = 0;
    // position of the current match
    private int matchStart = NO_MATCH;
    // number of missed insertString calls for the up to three last
    // bytes of the last match that can only be performed once more
    // data has been read
    private int missedInserts = 0;
| |
| /** |
| * Initializes a compressor with parameters and a callback. |
| * @param params the parameters |
| * @param callback the callback |
| * @throws NullPointerException if either parameter is <code>null</code> |
| */ |
| public LZ77Compressor(final Parameters params, final Callback callback) { |
| Objects.requireNonNull(params, "params"); |
| Objects.requireNonNull(callback, "callback"); |
| |
| this.params = params; |
| this.callback = callback; |
| |
| final int wSize = params.getWindowSize(); |
| window = new byte[wSize * 2]; |
| wMask = wSize - 1; |
| head = new int[HASH_SIZE]; |
| Arrays.fill(head, NO_MATCH); |
| prev = new int[wSize]; |
| } |
| |
| /** |
| * Feeds bytes into the compressor which in turn may emit zero or |
| * more blocks to the callback during the execution of this |
| * method. |
| * @param data the data to compress - must not be null |
| * @throws IOException if the callback throws an exception |
| */ |
| public void compress(final byte[] data) throws IOException { |
| compress(data, 0, data.length); |
| } |
| |
| /** |
| * Feeds bytes into the compressor which in turn may emit zero or |
| * more blocks to the callback during the execution of this |
| * method. |
| * @param data the data to compress - must not be null |
| * @param off the start offset of the data |
| * @param len the number of bytes to compress |
| * @throws IOException if the callback throws an exception |
| */ |
| public void compress(final byte[] data, int off, int len) throws IOException { |
| final int wSize = params.getWindowSize(); |
| while (len > wSize) { // chop into windowSize sized chunks |
| doCompress(data, off, wSize); |
| off += wSize; |
| len -= wSize; |
| } |
| if (len > 0) { |
| doCompress(data, off, len); |
| } |
| } |
| |
| /** |
| * Tells the compressor to process all remaining data and signal |
| * end of data to the callback. |
| * |
| * <p>The compressor will in turn emit at least one block ({@link |
| * EOD}) but potentially multiple blocks to the callback during |
| * the execution of this method.</p> |
| * @throws IOException if the callback throws an exception |
| */ |
| public void finish() throws IOException { |
| if (blockStart != currentPosition || lookahead > 0) { |
| currentPosition += lookahead; |
| flushLiteralBlock(); |
| } |
| callback.accept(THE_EOD); |
| } |
| |
| /** |
| * Adds some initial data to fill the window with. |
| * |
| * <p>This is used if the stream has been cut into blocks and |
| * back-references of one block may refer to data of the previous |
| * block(s). One such example is the LZ4 frame format using block |
| * dependency.</p> |
| * |
| * @param data the data to fill the window with. |
| * @throws IllegalStateException if the compressor has already started to accept data |
| */ |
| public void prefill(final byte[] data) { |
| if (currentPosition != 0 || lookahead != 0) { |
| throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore"); |
| } |
| |
| // don't need more than windowSize for back-references |
| final int len = Math.min(params.getWindowSize(), data.length); |
| System.arraycopy(data, data.length - len, window, 0, len); |
| |
| if (len >= NUMBER_OF_BYTES_IN_HASH) { |
| initialize(); |
| final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1; |
| for (int i = 0; i < stop; i++) { |
| insertString(i); |
| } |
| missedInserts = NUMBER_OF_BYTES_IN_HASH - 1; |
| } else { // not enough data to hash anything |
| missedInserts = len; |
| } |
| blockStart = currentPosition = len; |
| } |
| |
    // we use a 15 bit hashcode as calculated in nextHash
    private static final int HASH_SIZE = 1 << 15;
    private static final int HASH_MASK = HASH_SIZE - 1;
    // shift applied per byte; 3 * 5 = 15 bits, so a byte's influence is gone after three updates
    private static final int H_SHIFT = 5;
| |
| /** |
| * Assumes we are calculating the hash for three consecutive bytes |
| * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC |
| * the new hash for BCD is nextHash(H, D). |
| * |
| * <p>The hash is shifted by five bits on each update so all |
| * effects of A have been swapped after the third update.</p> |
| */ |
| private int nextHash(final int oldHash, final byte nextByte) { |
| final int nextVal = nextByte & 0xFF; |
| return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK; |
| } |
| |
| // performs the actual algorithm with the pre-condition len <= windowSize |
| private void doCompress(final byte[] data, final int off, final int len) throws IOException { |
| final int spaceLeft = window.length - currentPosition - lookahead; |
| if (len > spaceLeft) { |
| slide(); |
| } |
| System.arraycopy(data, off, window, currentPosition + lookahead, len); |
| lookahead += len; |
| if (!initialized && lookahead >= params.getMinBackReferenceLength()) { |
| initialize(); |
| } |
| if (initialized) { |
| compress(); |
| } |
| } |
| |
| private void slide() throws IOException { |
| final int wSize = params.getWindowSize(); |
| if (blockStart != currentPosition && blockStart < wSize) { |
| flushLiteralBlock(); |
| blockStart = currentPosition; |
| } |
| System.arraycopy(window, wSize, window, 0, wSize); |
| currentPosition -= wSize; |
| matchStart -= wSize; |
| blockStart -= wSize; |
| for (int i = 0; i < HASH_SIZE; i++) { |
| final int h = head[i]; |
| head[i] = h >= wSize ? h - wSize : NO_MATCH; |
| } |
| for (int i = 0; i < wSize; i++) { |
| final int p = prev[i]; |
| prev[i] = p >= wSize ? p - wSize : NO_MATCH; |
| } |
| } |
| |
| private void initialize() { |
| for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) { |
| insertHash = nextHash(insertHash, window[i]); |
| } |
| initialized = true; |
| } |
| |
    // core loop: at each position either emit/extend a literal run or
    // find a back-reference via the hash-chains and emit it
    private void compress() throws IOException {
        final int minMatch = params.getMinBackReferenceLength();
        final boolean lazy = params.getLazyMatching();
        final int lazyThreshold = params.getLazyMatchingThreshold();

        while (lookahead >= minMatch) {
            catchUpMissedInserts();
            int matchLength = 0;
            final int hashHead = insertString(currentPosition);
            // NOTE(review): hashHead always points at an earlier window
            // position than currentPosition, so hashHead - currentPosition
            // is negative and this comparison looks vacuous; longestMatch
            // enforces maxOffset via its minIndex bound anyway. Confirm
            // whether "currentPosition - hashHead" was intended.
            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
                // sets matchStart as a side effect
                matchLength = longestMatch(hashHead);

                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
                    // try to find a longer match using the next position
                    matchLength = longestMatchForNextPosition(matchLength);
                }
            }
            if (matchLength >= minMatch) {
                if (blockStart != currentPosition) {
                    // emit preceding literal block
                    flushLiteralBlock();
                    blockStart = NO_MATCH;
                }
                flushBackReference(matchLength);
                insertStringsInMatch(matchLength);
                lookahead -= matchLength;
                currentPosition += matchLength;
                blockStart = currentPosition;
            } else {
                // no match, append to current or start a new literal
                lookahead--;
                currentPosition++;
                // cap literal blocks at maxLiteralLength
                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
                    flushLiteralBlock();
                    blockStart = currentPosition;
                }
            }
        }
    }
| |
| /** |
| * Inserts the current three byte sequence into the dictionary and |
| * returns the previous head of the hash-chain. |
| * |
| * <p>Updates <code>insertHash</code> and <code>prev</code> as a |
| * side effect.</p> |
| */ |
| private int insertString(final int pos) { |
| insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]); |
| final int hashHead = head[insertHash]; |
| prev[pos & wMask] = hashHead; |
| head[insertHash] = pos; |
| return hashHead; |
| } |
| |
    /**
     * Performs the "lazy match" step: checks whether the position
     * right after the current one yields a longer match and, if not,
     * rolls the dictionary insertion back so the original match can
     * be used unchanged.
     */
    private int longestMatchForNextPosition(final int prevMatchLength) {
        // save a bunch of values to restore them if the next match isn't better than the current one
        final int prevMatchStart = matchStart;
        final int prevInsertHash = insertHash;

        lookahead--;
        currentPosition++;
        final int hashHead = insertString(currentPosition);
        // insertString just stored the old chain head in prev, so this
        // reads back the value needed to undo the head[] update below
        final int prevHashHead = prev[currentPosition & wMask];
        int matchLength = longestMatch(hashHead);

        if (matchLength <= prevMatchLength) {
            // use the first match, as the next one isn't any better
            matchLength = prevMatchLength;
            matchStart = prevMatchStart;

            // restore modified values
            head[insertHash] = prevHashHead;
            insertHash = prevInsertHash;
            currentPosition--;
            lookahead++;
        }
        return matchLength;
    }
| |
| private void insertStringsInMatch(final int matchLength) { |
| // inserts strings contained in current match |
| // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts |
| final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH); |
| // currentPosition has been inserted already |
| for (int i = 1; i <= stop; i++) { |
| insertString(currentPosition + i); |
| } |
| missedInserts = matchLength - stop - 1; |
| } |
| |
| private void catchUpMissedInserts() { |
| while (missedInserts > 0) { |
| insertString(currentPosition - missedInserts--); |
| } |
| } |
| |
| private void flushBackReference(final int matchLength) throws IOException { |
| callback.accept(new BackReference(currentPosition - matchStart, matchLength)); |
| } |
| |
| private void flushLiteralBlock() throws IOException { |
| callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart)); |
| } |
| |
| /** |
| * Searches the hash chain for real matches and returns the length |
| * of the longest match (0 if none were found) that isn't too far |
| * away (WRT maxOffset). |
| * |
| * <p>Sets matchStart to the index of the start position of the |
| * longest match as a side effect.</p> |
| */ |
| private int longestMatch(int matchHead) { |
| final int minLength = params.getMinBackReferenceLength(); |
| int longestMatchLength = minLength - 1; |
| final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead); |
| final int minIndex = Math.max(0, currentPosition - params.getMaxOffset()); |
| final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength()); |
| final int maxCandidates = params.getMaxCandidates(); |
| for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) { |
| int currentLength = 0; |
| for (int i = 0; i < maxPossibleLength; i++) { |
| if (window[matchHead + i] != window[currentPosition + i]) { |
| break; |
| } |
| currentLength++; |
| } |
| if (currentLength > longestMatchLength) { |
| longestMatchLength = currentLength; |
| matchStart = matchHead; |
| if (currentLength >= niceBackReferenceLength) { |
| // no need to search any further |
| break; |
| } |
| } |
| matchHead = prev[matchHead & wMask]; |
| } |
| return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress() |
| } |
| } |