| /* |
| * LZ4 Library |
| * Copyright (c) 2011-2016, Yann Collet |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without modification, |
| * are permitted provided that the following conditions are met: |
| * |
| * * Redistributions of source code must retain the above copyright notice, this |
| * list of conditions and the following disclaimer. |
| * |
| * * Redistributions in binary form must reproduce the above copyright notice, this |
| * list of conditions and the following disclaimer in the documentation and/or |
| * other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR |
| * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
| * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| package org.apache.lucene.util.compress; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.util.FutureArrays; |
| import org.apache.lucene.util.FutureObjects; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| /** |
| * LZ4 compression and decompression routines. |
| * |
| * https://github.com/lz4/lz4/tree/dev/lib |
| * http://fastcompression.blogspot.fr/p/lz4.html |
| * |
| * The high-compression option is a simpler version of the one found in the
| * original algorithm: it only retains a better hash table that remembers more
| * occurrences of previous 4-byte sequences, and drops all the logic for
| * handling overlapping matches.
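| *
| * A minimal round-trip sketch ({@code ByteBuffersDataOutput} and
| * {@code ByteArrayDataInput} are assumed to be the helpers from
| * {@code org.apache.lucene.store}; any {@link DataOutput}/{@link DataInput}
| * pair works the same way):
| * <pre>{@code
| * byte[] data = "some bytes to compress".getBytes(java.nio.charset.StandardCharsets.UTF_8);
| * ByteBuffersDataOutput out = new ByteBuffersDataOutput();
| * LZ4.compress(data, 0, data.length, out, new LZ4.FastCompressionHashTable());
| * byte[] compressed = out.toArrayCopy();
| *
| * byte[] restored = new byte[data.length];
| * LZ4.decompress(new ByteArrayDataInput(compressed), data.length, restored, 0);
| * }</pre>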
| */ |
| public final class LZ4 { |
| |
| private LZ4() {} |
| |
| static final int MEMORY_USAGE = 14; // the fast hash table uses (1 << 14) bytes, i.e. 16kB
| static final int MIN_MATCH = 4; // minimum length of a match |
| static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference |
| static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals |
| static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC |
| static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC; |
| |
| |
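| /** Multiplicative hash of the 4 bytes in {@code i}, keeping the top
| * {@code hashBits} bits. The multiplier is {@code 2654435761} (the
| * golden-ratio constant used by the reference LZ4 implementation), written
| * here as its signed 32-bit value. */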
| private static int hash(int i, int hashBits) { |
| return (i * -1640531535) >>> (32 - hashBits); |
| } |
| |
| private static int hashHC(int i) { |
| return hash(i, HASH_LOG_HC); |
| } |
| |
| private static int readInt(byte[] buf, int i) { |
| return ((buf[i] & 0xFF) << 24) | ((buf[i+1] & 0xFF) << 16) | ((buf[i+2] & 0xFF) << 8) | (buf[i+3] & 0xFF); |
| } |
| |
| private static int commonBytes(byte[] b, int o1, int o2, int limit) { |
| assert o1 < o2; |
| // never returns -1 because the two ranges have different lengths (o1 < o2, same limit)
| return FutureArrays.mismatch(b, o1, limit, b, o2, limit); |
| } |
| |
| /** |
| * Decompress at least {@code decompressedLen} bytes into |
| * {@code dest[dOff:]}. Note that {@code dest} must be large enough to hold
| * <b>all</b> decompressed data, meaning that the total decompressed length
| * must be known in advance.
| * If the given bytes were compressed using a preset dictionary then the same |
| * dictionary must be provided in {@code dest[dOff-dictLen:dOff]}. |
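| *
| * For example, a minimal sketch of decompressing with a preset dictionary
| * ({@code dict}, {@code dictLen}, {@code decompressedLen} and {@code in} are
| * assumed to be provided by the caller):
| * <pre>{@code
| * byte[] dest = new byte[dictLen + decompressedLen];
| * System.arraycopy(dict, 0, dest, 0, dictLen); // dictionary at dest[0:dictLen]
| * LZ4.decompress(in, decompressedLen, dest, dictLen);
| * }</pre>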
| */ |
| public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException { |
| final int destEnd = dOff + decompressedLen; |
| |
| do { |
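| // Each compressed sequence starts with a token: the high nibble holds the
| // literal length and the low nibble holds the match length minus MIN_MATCH;
| // a nibble value of 0x0F means that extension bytes follow.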
| // literals |
| final int token = compressed.readByte() & 0xFF; |
| int literalLen = token >>> 4; |
| |
| if (literalLen != 0) { |
| if (literalLen == 0x0F) { |
| byte len; |
| while ((len = compressed.readByte()) == (byte) 0xFF) { |
| literalLen += 0xFF; |
| } |
| literalLen += len & 0xFF; |
| } |
| compressed.readBytes(dest, dOff, literalLen); |
| dOff += literalLen; |
| } |
| |
| if (dOff >= destEnd) { |
| break; |
| } |
| |
| // match: a 2-byte little-endian backwards distance, then optional length extension bytes
| final int matchDec = (compressed.readByte() & 0xFF) | ((compressed.readByte() & 0xFF) << 8); |
| assert matchDec > 0; |
| |
| int matchLen = token & 0x0F; |
| if (matchLen == 0x0F) { |
| int len; |
| while ((len = compressed.readByte()) == (byte) 0xFF) { |
| matchLen += 0xFF; |
| } |
| matchLen += len & 0xFF; |
| } |
| matchLen += MIN_MATCH; |
| |
| // copying a multiple of 8 bytes can make decompression from 5% to 10% faster |
| final int fastLen = (matchLen + 7) & 0xFFFFFFF8; |
| if (matchDec < matchLen || dOff + fastLen > destEnd) { |
| // overlap -> naive incremental copy |
| for (int ref = dOff - matchDec, end = dOff + matchLen; dOff < end; ++ref, ++dOff) { |
| dest[dOff] = dest[ref]; |
| } |
| } else { |
| // no overlap -> arraycopy |
| System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen); |
| dOff += matchLen; |
| } |
| } while (dOff < destEnd); |
| |
| return dOff; |
| } |
| |
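| /** Encode {@code l} as a run of {@code 0xFF} bytes followed by a final byte that is less than {@code 0xFF}. */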
| private static void encodeLen(int l, DataOutput out) throws IOException { |
| while (l >= 0xFF) { |
| out.writeByte((byte) 0xFF); |
| l -= 0xFF; |
| } |
| out.writeByte((byte) l); |
| } |
| |
| private static void encodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput out) throws IOException { |
| out.writeByte((byte) token); |
| |
| // encode literal length |
| if (literalLen >= 0x0F) { |
| encodeLen(literalLen - 0x0F, out); |
| } |
| |
| // encode literals |
| out.writeBytes(bytes, anchor, literalLen); |
| } |
| |
| private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput out) throws IOException { |
| final int token = Math.min(literalLen, 0x0F) << 4; |
| encodeLiterals(bytes, token, anchor, literalLen, out); |
| } |
| |
| private static void encodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) throws IOException { |
| final int literalLen = matchOff - anchor; |
| assert matchLen >= 4; |
| // encode token |
| final int token = (Math.min(literalLen, 0x0F) << 4) | Math.min(matchLen - 4, 0x0F); |
| encodeLiterals(bytes, token, anchor, literalLen, out); |
| |
| // encode match dec |
| final int matchDec = matchOff - matchRef; |
| assert matchDec > 0 && matchDec < 1 << 16; |
| out.writeByte((byte) matchDec); |
| out.writeByte((byte) (matchDec >>> 8)); |
| |
| // encode match len |
| if (matchLen >= MIN_MATCH + 0x0F) { |
| encodeLen(matchLen - 0x0F - MIN_MATCH, out); |
| } |
| } |
| |
| /** |
| * A record of previous occurrences of sequences of 4 bytes. |
| */ |
| static abstract class HashTable { |
| |
| /** Reset this hash table in order to compress the given content. */ |
| abstract void reset(byte[] b, int off, int len); |
| |
| /** Init {@code dictLen} bytes to be used as a dictionary. */ |
| abstract void initDictionary(int dictLen); |
| |
| /**
| * Advance the cursor to {@code off} and return an index that stored the same
| * 4 bytes as {@code b[off:off+4]}. This may only be called on strictly
| * increasing sequences of offsets. A return value of {@code -1} indicates
| * that no other index could be found. */
| abstract int get(int off); |
| |
| /**
| * Return an index that is less than {@code off} and stores the same 4
| * bytes. Unlike {@link #get}, it doesn't need to be called on increasing
| * offsets. A return value of {@code -1} indicates that no other index could
| * be found. */
| abstract int previous(int off); |
| |
| // For testing |
| abstract boolean assertReset(); |
| } |
| |
| /**
| * Simple lossy {@link HashTable} that only stores the last occurrence for
| * each hash using {@code 2^14} bytes of memory.
| */
| public static final class FastCompressionHashTable extends HashTable { |
| |
| private byte[] bytes; |
| private int base; |
| private int lastOff; |
| private int end; |
| private int hashLog; |
| private PackedInts.Mutable hashTable; |
| |
| /** Sole constructor */ |
| public FastCompressionHashTable() {} |
| |
| @Override |
| void reset(byte[] bytes, int off, int len) { |
| FutureObjects.checkFromIndexSize(off, len, bytes.length); |
| this.bytes = bytes; |
| this.base = off; |
| this.end = off + len; |
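| // Use just enough bits per entry to address any offset in the input, then
| // size the table so that it fits in (1 << MEMORY_USAGE) bytes overall:
| // (1 << hashLog) entries of at most (1 << bitsPerOffsetLog) bits each.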
| final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS); |
| final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1); |
| hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog; |
| if (hashTable == null || hashTable.size() < 1 << hashLog || hashTable.getBitsPerValue() < bitsPerOffset) { |
| hashTable = PackedInts.getMutable(1 << hashLog, bitsPerOffset, PackedInts.DEFAULT); |
| } else { |
| // Avoid calling hashTable.clear(), this makes it costly to compress many short sequences otherwise. |
| // Instead, get() checks that references are less than the current offset. |
| } |
| this.lastOff = off - 1; |
| } |
| |
| @Override |
| void initDictionary(int dictLen) { |
| for (int i = 0; i < dictLen; ++i) { |
| final int v = readInt(bytes, base + i); |
| final int h = hash(v, hashLog); |
| hashTable.set(h, i); |
| } |
| lastOff += dictLen; |
| } |
| |
| @Override |
| int get(int off) { |
| assert off > lastOff; |
| assert off < end; |
| |
| final int v = readInt(bytes, off); |
| final int h = hash(v, hashLog); |
| |
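| // Read the previous occurrence from the table, then overwrite it with the
| // current offset so that later calls see the most recent occurrence.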
| final int ref = base + (int) hashTable.get(h); |
| hashTable.set(h, off - base); |
| lastOff = off; |
| |
| if (ref < off && off - ref < MAX_DISTANCE && readInt(bytes, ref) == v) { |
| return ref; |
| } else { |
| return -1; |
| } |
| } |
| |
| @Override |
| int previous(int off) {
| return -1; |
| } |
| |
| @Override |
| boolean assertReset() { |
| return true; |
| } |
| |
| } |
| |
| /** |
| * A higher-precision {@link HashTable}. It stores up to 256 occurrences of |
| * 4-bytes sequences in the last {@code 2^16} bytes, which makes it much more |
| * likely to find matches than {@link FastCompressionHashTable}. |
| */ |
| public static final class HighCompressionHashTable extends HashTable { |
| private static final int MAX_ATTEMPTS = 256; |
| private static final int MASK = MAX_DISTANCE - 1; |
| |
| private byte[] bytes; |
| private int base; |
| private int next; |
| private int end; |
| private final int[] hashTable; |
| private final short[] chainTable; |
| private int attempts = 0; |
| |
| /** Sole constructor */ |
| public HighCompressionHashTable() { |
| hashTable = new int[HASH_TABLE_SIZE_HC]; |
| Arrays.fill(hashTable, -1); |
| chainTable = new short[MAX_DISTANCE]; |
| Arrays.fill(chainTable, (short) 0xFFFF); |
| } |
| |
| @Override |
| void reset(byte[] bytes, int off, int len) { |
| FutureObjects.checkFromIndexSize(off, len, bytes.length); |
| if (end - base < chainTable.length) { |
| // The last call to compress was done on less than 64kB, let's not reset |
| // the hashTable and only reset the relevant parts of the chainTable. |
| // This helps avoid slowing down calling compress() many times on short |
| // inputs. |
| int startOffset = base & MASK; |
| int endOffset = end == 0 ? 0 : ((end - 1) & MASK) + 1; |
| if (startOffset < endOffset) { |
| Arrays.fill(chainTable, startOffset, endOffset, (short) 0xFFFF); |
| } else { |
| Arrays.fill(chainTable, 0, endOffset, (short) 0xFFFF); |
| Arrays.fill(chainTable, startOffset, chainTable.length, (short) 0xFFFF); |
| } |
| } else { |
| // The last call to compress was done on a large enough amount of data |
| // that it's fine to reset both tables |
| Arrays.fill(hashTable, -1); |
| Arrays.fill(chainTable, (short) 0xFFFF); |
| } |
| this.bytes = bytes; |
| this.base = off; |
| this.next = off; |
| this.end = off + len; |
| } |
| |
| @Override |
| void initDictionary(int dictLen) { |
| assert next == base; |
| for (int i = 0; i < dictLen; ++i) { |
| addHash(base + i); |
| } |
| next += dictLen; |
| } |
| |
| @Override |
| int get(int off) { |
| assert off >= next; |
| assert off < end; |
| |
| for (; next < off; next++) { |
| addHash(next); |
| } |
| |
| final int v = readInt(bytes, off); |
| final int h = hashHC(v); |
| |
| attempts = 0; |
| int ref = hashTable[h]; |
| if (ref >= off) { |
| // remainder from a previous call to compress() |
| return -1; |
| } |
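| // Walk the chain of previous occurrences, most recent first, giving up
| // after MAX_ATTEMPTS steps or once candidates fall outside the window.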
| for (int min = Math.max(base, off - MAX_DISTANCE + 1); |
| ref >= min && attempts < MAX_ATTEMPTS; |
| ref -= chainTable[ref & MASK] & 0xFFFF, attempts++) { |
| if (readInt(bytes, ref) == v) { |
| return ref; |
| } |
| } |
| return -1; |
| } |
| |
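| /** Register the 4 bytes at {@code off}: {@code hashTable} maps each hash to
| * its most recent offset while {@code chainTable} keeps the delta to the
| * previous offset whose 4 bytes hashed to the same value. */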
| private void addHash(int off) { |
| final int v = readInt(bytes, off); |
| final int h = hashHC(v); |
| int delta = off - hashTable[h]; |
| if (delta <= 0 || delta >= MAX_DISTANCE) { |
| delta = MAX_DISTANCE - 1; |
| } |
| chainTable[off & MASK] = (short) delta; |
| hashTable[h] = off; |
| } |
| |
| @Override |
| int previous(int off) { |
| final int v = readInt(bytes, off); |
| for (int ref = off - (chainTable[off & MASK] & 0xFFFF); |
| ref >= base && attempts < MAX_ATTEMPTS; |
| ref -= chainTable[ref & MASK] & 0xFFFF, attempts++ ) { |
| if (readInt(bytes, ref) == v) { |
| return ref; |
| } |
| } |
| return -1; |
| } |
| |
| @Override |
| boolean assertReset() { |
| for (int i = 0; i < chainTable.length; ++i) { |
| assert chainTable[i] == (short) 0xFFFF : i; |
| } |
| return true; |
| } |
| } |
| |
| /** |
| * Compress {@code bytes[off:off+len]} into {@code out} using at most 16kB of |
| * memory. {@code ht} shouldn't be shared across threads but can safely be |
| * reused. |
| */ |
| public static void compress(byte[] bytes, int off, int len, DataOutput out, HashTable ht) throws IOException { |
| compressWithDictionary(bytes, off, 0, len, out, ht); |
| } |
| |
| /** |
| * Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into |
| * {@code out} using at most 16kB of memory. |
| * {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary. |
| * {@code dictLen} must not be greater than 64kB, the maximum window size. |
| * |
| * {@code ht} shouldn't be shared across threads but can safely be reused. |
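| *
| * A minimal sketch (assuming {@code buffer} already holds the dictionary
| * immediately followed by the content to compress):
| * <pre>{@code
| * // buffer = [ dictionary (dictLen bytes) followed by content (len bytes) ]
| * LZ4.compressWithDictionary(buffer, 0, dictLen, len, out, ht);
| * }</pre>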
| */ |
| public static void compressWithDictionary(byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht) throws IOException { |
| FutureObjects.checkFromIndexSize(dictOff, dictLen, bytes.length); |
| FutureObjects.checkFromIndexSize(dictOff + dictLen, len, bytes.length); |
| if (dictLen > MAX_DISTANCE) { |
| throw new IllegalArgumentException("dictLen must not be greater than 64kB, but got " + dictLen); |
| } |
| |
| final int end = dictOff + dictLen + len; |
| |
| int off = dictOff + dictLen; |
| int anchor = off; |
| |
| if (len > LAST_LITERALS + MIN_MATCH) { |
| |
| final int limit = end - LAST_LITERALS; |
| final int matchLimit = limit - MIN_MATCH; |
| ht.reset(bytes, dictOff, dictLen + len); |
| ht.initDictionary(dictLen); |
| |
| main: |
| while (off <= limit) { |
| // find a match |
| int ref; |
| while (true) { |
| if (off >= matchLimit) { |
| break main; |
| } |
| ref = ht.get(off); |
| if (ref != -1) { |
| assert ref >= dictOff && ref < off; |
| assert readInt(bytes, ref) == readInt(bytes, off); |
| break; |
| } |
| ++off; |
| } |
| |
| // compute match length |
| int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit); |
| |
| // try to find a better match |
| for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, dictOff); r >= min; r = ht.previous(r)) { |
| assert readInt(bytes, r) == readInt(bytes, off); |
| int rMatchLen = MIN_MATCH + commonBytes(bytes, r + MIN_MATCH, off + MIN_MATCH, limit); |
| if (rMatchLen > matchLen) { |
| ref = r; |
| matchLen = rMatchLen; |
| } |
| } |
| |
| encodeSequence(bytes, anchor, ref, off, matchLen, out); |
| off += matchLen; |
| anchor = off; |
| } |
| } |
| |
| // last literals |
| final int literalLen = end - anchor; |
| assert literalLen >= LAST_LITERALS || literalLen == len; |
| encodeLastLiterals(bytes, anchor, literalLen, out);
| } |
| |
| } |