/*
* LZ4 Library
* Copyright (c) 2011-2016, Yann Collet
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.apache.lucene.util.compress;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.FutureObjects;
import org.apache.lucene.util.packed.PackedInts;
/**
* LZ4 compression and decompression routines.
*
* https://github.com/lz4/lz4/tree/dev/lib
* http://fastcompression.blogspot.fr/p/lz4.html
*
* The high-compression option is a simpler version of the original
* algorithm: it only retains a better hash table, which remembers more
* occurrences of previous 4-byte sequences, and it drops the logic for
* handling overlapping matches.
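*
* A minimal round-trip sketch, assuming {@code out} is a {@link DataOutput}
* whose bytes {@code in} later replays as a {@link DataInput} (names are
* illustrative):
* <pre>{@code
* byte[] data = new byte[] {1, 2, 3, 1, 2, 3, 1, 2, 3};
* LZ4.compress(data, 0, data.length, out, new LZ4.FastCompressionHashTable());
* byte[] restored = new byte[data.length]; // the decompressed length must be known
* LZ4.decompress(in, data.length, restored, 0);
* }</pre>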
*/
public final class LZ4 {
private LZ4() {}
static final int MEMORY_USAGE = 14;
static final int MIN_MATCH = 4; // minimum length of a match
static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
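// Multiplicative hashing: -1640531535 is the signed form of the prime
// 2654435761 (0x9E3779B1, close to 2^32 / golden ratio) used by the
// reference LZ4 implementation; the top hashBits bits are kept.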
private static int hash(int i, int hashBits) {
return (i * -1640531535) >>> (32 - hashBits);
}
private static int hashHC(int i) {
return hash(i, HASH_LOG_HC);
}
private static int readInt(byte[] buf, int i) {
return ((buf[i] & 0xFF) << 24) | ((buf[i+1] & 0xFF) << 16) | ((buf[i+2] & 0xFF) << 8) | (buf[i+3] & 0xFF);
}
private static int commonBytes(byte[] b, int o1, int o2, int limit) {
assert o1 < o2;
// never -1 because the two ranges have different lengths (o1 < o2)
return FutureArrays.mismatch(b, o1, limit, b, o2, limit);
}
/**
* Decompress at least {@code decompressedLen} bytes into
* {@code dest[dOff:]}. Please note that {@code dest} must be large
* enough to be able to hold <b>all</b> decompressed data (meaning that you
* need to know the total decompressed length).
* If the given bytes were compressed using a preset dictionary then the same
* dictionary must be provided in {@code dest[dOff-dictLen:dOff]}.
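*
* A minimal sketch, assuming {@code in} replays bytes previously produced by
* {@link #compress} and {@code totalLen} is the known total decompressed length:
* <pre>{@code
* byte[] dest = new byte[totalLen];
* int dEnd = LZ4.decompress(in, totalLen, dest, 0);
* // dEnd >= totalLen: slightly more bytes may have been decompressed
* }</pre>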
*/
public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException {
final int destEnd = dOff + decompressedLen;
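// Each LZ4 sequence is laid out as: a token byte whose high nibble is the
// literal length and whose low nibble is the match length minus MIN_MATCH
// (a nibble of 0x0F means extra length bytes follow), the literals
// themselves, a 2-byte little-endian match offset, and then the optional
// extra match-length bytes.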
do {
// literals
final int token = compressed.readByte() & 0xFF;
int literalLen = token >>> 4;
if (literalLen != 0) {
if (literalLen == 0x0F) {
byte len;
while ((len = compressed.readByte()) == (byte) 0xFF) {
literalLen += 0xFF;
}
literalLen += len & 0xFF;
}
compressed.readBytes(dest, dOff, literalLen);
dOff += literalLen;
}
if (dOff >= destEnd) {
break;
}
// match
final int matchDec = (compressed.readByte() & 0xFF) | ((compressed.readByte() & 0xFF) << 8);
assert matchDec > 0;
int matchLen = token & 0x0F;
if (matchLen == 0x0F) {
int len;
while ((len = compressed.readByte()) == (byte) 0xFF) {
matchLen += 0xFF;
}
matchLen += len & 0xFF;
}
matchLen += MIN_MATCH;
// copying a multiple of 8 bytes can make decompression from 5% to 10% faster
final int fastLen = (matchLen + 7) & 0xFFFFFFF8;
if (matchDec < matchLen || dOff + fastLen > destEnd) {
// overlap -> naive incremental copy
for (int ref = dOff - matchDec, end = dOff + matchLen; dOff < end; ++ref, ++dOff) {
dest[dOff] = dest[ref];
}
} else {
// no overlap -> arraycopy
System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen);
dOff += matchLen;
}
} while (dOff < destEnd);
return dOff;
}
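// Extra length bytes are a run of 0xFF values followed by a final byte
// below 0xFF; e.g. a remaining length of 510 (0x1FE) is written as
// 0xFF 0xFF 0x00.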
private static void encodeLen(int l, DataOutput out) throws IOException {
while (l >= 0xFF) {
out.writeByte((byte) 0xFF);
l -= 0xFF;
}
out.writeByte((byte) l);
}
private static void encodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput out) throws IOException {
out.writeByte((byte) token);
// encode literal length
if (literalLen >= 0x0F) {
encodeLen(literalLen - 0x0F, out);
}
// encode literals
out.writeBytes(bytes, anchor, literalLen);
}
private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput out) throws IOException {
final int token = Math.min(literalLen, 0x0F) << 4;
encodeLiterals(bytes, token, anchor, literalLen, out);
}
private static void encodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) throws IOException {
final int literalLen = matchOff - anchor;
assert matchLen >= 4;
// encode token
final int token = (Math.min(literalLen, 0x0F) << 4) | Math.min(matchLen - 4, 0x0F);
encodeLiterals(bytes, token, anchor, literalLen, out);
// encode match dec
final int matchDec = matchOff - matchRef;
assert matchDec > 0 && matchDec < 1 << 16;
out.writeByte((byte) matchDec);
out.writeByte((byte) (matchDec >>> 8));
// encode match len
if (matchLen >= MIN_MATCH + 0x0F) {
encodeLen(matchLen - 0x0F - MIN_MATCH, out);
}
}
/**
* A record of previous occurrences of sequences of 4 bytes.
*/
static abstract class HashTable {
/** Reset this hash table in order to compress the given content. */
abstract void reset(byte[] b, int off, int len);
/** Init {@code dictLen} bytes to be used as a dictionary. */
abstract void initDictionary(int dictLen);
/**
* Advance the cursor to {@code off} and return an index that stored the
* same 4 bytes as {@code b[off:off+4)}. This may only be called on strictly
* increasing sequences of offsets. A return value of {@code -1} indicates
* that no other index could be found. */
abstract int get(int off);
/**
* Return an index that is less than {@code off} and stores the same 4
* bytes. Unlike {@link #get}, it doesn't need to be called on increasing
* offsets. A return value of {@code -1} indicates that no other index could
* be found. */
abstract int previous(int off);
// For testing
abstract boolean assertReset();
}
/**
* Simple lossy {@link HashTable} that only stores the last occurrence for
* each hash, using {@code 2^14} bytes of memory.
*/
public static final class FastCompressionHashTable extends HashTable {
private byte[] bytes;
private int base;
private int lastOff;
private int end;
private int hashLog;
private PackedInts.Mutable hashTable;
/** Sole constructor */
public FastCompressionHashTable() {}
@Override
void reset(byte[] bytes, int off, int len) {
FutureObjects.checkFromIndexSize(off, len, bytes.length);
this.bytes = bytes;
this.base = off;
this.end = off + len;
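// Size the packed table so that (2^hashLog entries) * (bitsPerOffset bits
// each) stays close to 2^(MEMORY_USAGE + 3) bits, i.e. 2^MEMORY_USAGE
// bytes = 16kB.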
final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS);
final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1);
hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog;
if (hashTable == null || hashTable.size() < 1 << hashLog || hashTable.getBitsPerValue() < bitsPerOffset) {
hashTable = PackedInts.getMutable(1 << hashLog, bitsPerOffset, PackedInts.DEFAULT);
} else {
// Avoid calling hashTable.clear(), this makes it costly to compress many short sequences otherwise.
// Instead, get() checks that references are less than the current offset.
}
this.lastOff = off - 1;
}
@Override
void initDictionary(int dictLen) {
for (int i = 0; i < dictLen; ++i) {
final int v = readInt(bytes, base + i);
final int h = hash(v, hashLog);
hashTable.set(h, i);
}
lastOff += dictLen;
}
@Override
int get(int off) {
assert off > lastOff;
assert off < end;
final int v = readInt(bytes, off);
final int h = hash(v, hashLog);
final int ref = base + (int) hashTable.get(h);
hashTable.set(h, off - base);
lastOff = off;
if (ref < off && off - ref < MAX_DISTANCE && readInt(bytes, ref) == v) {
return ref;
} else {
return -1;
}
}
@Override
int previous(int off) {
return -1;
}
@Override
boolean assertReset() {
return true;
}
}
/**
* A higher-precision {@link HashTable}. It stores up to 256 occurrences of
* 4-bytes sequences in the last {@code 2^16} bytes, which makes it much more
* likely to find matches than {@link FastCompressionHashTable}.
*/
public static final class HighCompressionHashTable extends HashTable {
private static final int MAX_ATTEMPTS = 256;
private static final int MASK = MAX_DISTANCE - 1;
private byte[] bytes;
private int base;
private int next;
private int end;
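// hashTable maps the hash of 4 bytes to the most recent offset where they
// occurred, and chainTable[off & MASK] stores the delta to the previous
// offset with the same hash, so earlier occurrences within the 64kB
// window can be walked back from hashTable.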
private final int[] hashTable;
private final short[] chainTable;
private int attempts = 0;
/** Sole constructor */
public HighCompressionHashTable() {
hashTable = new int[HASH_TABLE_SIZE_HC];
Arrays.fill(hashTable, -1);
chainTable = new short[MAX_DISTANCE];
Arrays.fill(chainTable, (short) 0xFFFF);
}
@Override
void reset(byte[] bytes, int off, int len) {
FutureObjects.checkFromIndexSize(off, len, bytes.length);
if (end - base < chainTable.length) {
// The last call to compress was done on less than 64kB, let's not reset
// the hashTable and only reset the relevant parts of the chainTable.
// This helps avoid slowing down calling compress() many times on short
// inputs.
int startOffset = base & MASK;
int endOffset = end == 0 ? 0 : ((end - 1) & MASK) + 1;
if (startOffset < endOffset) {
Arrays.fill(chainTable, startOffset, endOffset, (short) 0xFFFF);
} else {
Arrays.fill(chainTable, 0, endOffset, (short) 0xFFFF);
Arrays.fill(chainTable, startOffset, chainTable.length, (short) 0xFFFF);
}
} else {
// The last call to compress was done on a large enough amount of data
// that it's fine to reset both tables
Arrays.fill(hashTable, -1);
Arrays.fill(chainTable, (short) 0xFFFF);
}
this.bytes = bytes;
this.base = off;
this.next = off;
this.end = off + len;
}
@Override
void initDictionary(int dictLen) {
assert next == base;
for (int i = 0; i < dictLen; ++i) {
addHash(base + i);
}
next += dictLen;
}
@Override
int get(int off) {
assert off >= next;
assert off < end;
for (; next < off; next++) {
addHash(next);
}
final int v = readInt(bytes, off);
final int h = hashHC(v);
attempts = 0;
int ref = hashTable[h];
if (ref >= off) {
// remainder from a previous call to compress()
return -1;
}
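// Walk the chain of previous occurrences, bounded by the 64kB window and
// MAX_ATTEMPTS; hash collisions make the readInt comparison necessary.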
for (int min = Math.max(base, off - MAX_DISTANCE + 1);
ref >= min && attempts < MAX_ATTEMPTS;
ref -= chainTable[ref & MASK] & 0xFFFF, attempts++) {
if (readInt(bytes, ref) == v) {
return ref;
}
}
return -1;
}
private void addHash(int off) {
final int v = readInt(bytes, off);
final int h = hashHC(v);
int delta = off - hashTable[h];
if (delta <= 0 || delta >= MAX_DISTANCE) {
delta = MAX_DISTANCE - 1;
}
chainTable[off & MASK] = (short) delta;
hashTable[h] = off;
}
@Override
int previous(int off) {
final int v = readInt(bytes, off);
for (int ref = off - (chainTable[off & MASK] & 0xFFFF);
ref >= base && attempts < MAX_ATTEMPTS;
ref -= chainTable[ref & MASK] & 0xFFFF, attempts++ ) {
if (readInt(bytes, ref) == v) {
return ref;
}
}
return -1;
}
@Override
boolean assertReset() {
for (int i = 0; i < chainTable.length; ++i) {
assert chainTable[i] == (short) 0xFFFF : i;
}
return true;
}
}
/**
* Compress {@code bytes[off:off+len]} into {@code out} using at most 16kB of
* memory. {@code ht} shouldn't be shared across threads but can safely be
* reused.
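*
* A minimal sketch of reusing the same table across calls ({@code blocks}
* and {@code out} are illustrative):
* <pre>{@code
* LZ4.FastCompressionHashTable ht = new LZ4.FastCompressionHashTable();
* for (byte[] block : blocks) {
*   LZ4.compress(block, 0, block.length, out, ht);
* }
* }</pre>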
*/
public static void compress(byte[] bytes, int off, int len, DataOutput out, HashTable ht) throws IOException {
compressWithDictionary(bytes, off, 0, len, out, ht);
}
/**
* Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into
* {@code out} using at most 16kB of memory.
* {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary.
* {@code dictLen} must not be greater than 64kB, the maximum window size.
*
* {@code ht} shouldn't be shared across threads but can safely be reused.
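*
* The dictionary and the bytes to compress must be contiguous in the same
* array; a minimal sketch ({@code dict} and {@code data} are illustrative):
* <pre>{@code
* byte[] buf = new byte[dict.length + data.length];
* System.arraycopy(dict, 0, buf, 0, dict.length);
* System.arraycopy(data, 0, buf, dict.length, data.length);
* LZ4.compressWithDictionary(buf, 0, dict.length, data.length, out, ht);
* }</pre>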
*/
public static void compressWithDictionary(byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht) throws IOException {
FutureObjects.checkFromIndexSize(dictOff, dictLen, bytes.length);
FutureObjects.checkFromIndexSize(dictOff + dictLen, len, bytes.length);
if (dictLen > MAX_DISTANCE) {
throw new IllegalArgumentException("dictLen must not be greater than 64kB, but got " + dictLen);
}
final int end = dictOff + dictLen + len;
int off = dictOff + dictLen;
int anchor = off;
if (len > LAST_LITERALS + MIN_MATCH) {
final int limit = end - LAST_LITERALS;
final int matchLimit = limit - MIN_MATCH;
ht.reset(bytes, dictOff, dictLen + len);
ht.initDictionary(dictLen);
main:
while (off <= limit) {
// find a match
int ref;
while (true) {
if (off >= matchLimit) {
break main;
}
ref = ht.get(off);
if (ref != -1) {
assert ref >= dictOff && ref < off;
assert readInt(bytes, ref) == readInt(bytes, off);
break;
}
++off;
}
// compute match length
int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit);
// try to find a better match
for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, dictOff); r >= min; r = ht.previous(r)) {
assert readInt(bytes, r) == readInt(bytes, off);
int rMatchLen = MIN_MATCH + commonBytes(bytes, r + MIN_MATCH, off + MIN_MATCH, limit);
if (rMatchLen > matchLen) {
ref = r;
matchLen = rMatchLen;
}
}
encodeSequence(bytes, anchor, ref, off, matchLen, out);
off += matchLen;
anchor = off;
}
}
// last literals
final int literalLen = end - anchor;
assert literalLen >= LAST_LITERALS || literalLen == len;
encodeLastLiterals(bytes, anchor, literalLen, out);
}
}