using J2N.Numerics;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using System;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Codecs.Compressing
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using DataInput = Lucene.Net.Store.DataInput;
using DataOutput = Lucene.Net.Store.DataOutput;
using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s;
/// <summary>
/// LZ4 compression and decompression routines.
/// <para/>
/// http://code.google.com/p/lz4/
/// http://fastcompression.blogspot.fr/p/lz4.html
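/// <para/>
/// Two compressors are exposed: <see cref="LZ4.Compress(byte[], int, int, DataOutput, HashTable)"/>,
/// which is fast and needs at most 16 KB of memory, and
/// <see cref="LZ4.CompressHC(byte[], int, int, DataOutput, HCHashTable)"/>, which is slower and
/// uses more memory (~256 KB per thread) but usually compresses better.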
/// </summary>
public static class LZ4 // LUCENENET specific - made static
{
internal const int MEMORY_USAGE = 14; // log2 of the memory (in bytes) used by Compress's hash table: 1 << 14 = 16 KB
internal const int MIN_MATCH = 4; // minimum length of a match
internal const int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
internal const int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
internal const int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
internal const int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
internal const int OPTIMAL_ML = 0x0F + 4 - 1; // match length that doesn't require an additional byte
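// Layout of one compressed sequence, as written by EncodeSequence and read back by
// Decompress:
// - token (1 byte): high nibble = literal count, low nibble = matchLen - MIN_MATCH;
//   a nibble of 0x0F means the length continues in extra bytes, each adding 0xFF,
//   until a terminating byte < 0xFF is added in
// - extra literal-length bytes (only if the high nibble is 0x0F)
// - the literal bytes themselves
// - match offset (2 bytes, little-endian, 1..65535): how far back to copy from
// - extra match-length bytes (only if the low nibble is 0x0F)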
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Hash(int i, int hashBits)
{
return (i * -1640531535).TripleShift(32 - hashBits); // multiplicative hashing; -1640531535 == (int)2654435761, Knuth's golden-ratio constant
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int HashHC(int i)
{
return Hash(i, HASH_LOG_HC);
}
/// <summary>
/// NOTE: This was readInt() in Lucene.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int ReadInt32(byte[] buf, int i)
{
// reads 4 bytes as a big-endian int
return ((buf[i] & 0xFF) << 24) | ((buf[i + 1] & 0xFF) << 16) | ((buf[i + 2] & 0xFF) << 8) |
(buf[i + 3] & 0xFF);
}
/// <summary>
/// NOTE: This was readIntEquals() in Lucene.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ReadInt32Equals(byte[] buf, int i, int j)
{
return ReadInt32(buf, i) == ReadInt32(buf, j);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CommonBytes(byte[] b, int o1, int o2, int limit)
{
if (Debugging.AssertsEnabled) Debugging.Assert(o1 < o2);
int count = 0;
while (o2 < limit && b[o1++] == b[o2++])
{
++count;
}
return count;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CommonBytesBackward(byte[] b, int o1, int o2, int l1, int l2)
{
int count = 0;
while (o1 > l1 && o2 > l2 && b[--o1] == b[--o2])
{
++count;
}
return count;
}
/// <summary>
/// Decompress at least <paramref name="decompressedLen"/> bytes into
/// <c>dest[dOff]</c>. Please note that <paramref name="dest"/> must be large
/// enough to be able to hold <b>all</b> decompressed data (meaning that you
/// need to know the total decompressed length).
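/// <para/>
/// A hypothetical usage sketch, assuming <c>totalLen</c> holds the total
/// decompressed length and <c>input</c> is positioned at the start of the
/// compressed data:
/// <code>
/// var dest = new byte[totalLen];
/// LZ4.Decompress(input, totalLen, dest, 0);
/// // dest[0..totalLen) now holds the decompressed bytes
/// </code>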
/// </summary>
public static int Decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff)
{
int destEnd = dest.Length;
do
{
// literals
int token = compressed.ReadByte() & 0xFF;
int literalLen = (int)(((uint)token) >> 4);
if (literalLen != 0)
{
if (literalLen == 0x0F)
{
byte len;
while ((len = compressed.ReadByte()) == 0xFF)
{
literalLen += 0xFF;
}
literalLen += len & 0xFF;
}
compressed.ReadBytes(dest, dOff, literalLen);
dOff += literalLen;
}
if (dOff >= decompressedLen)
{
break;
}
// match
var byte1 = compressed.ReadByte();
var byte2 = compressed.ReadByte();
int matchDec = (byte1 & 0xFF) | ((byte2 & 0xFF) << 8);
if (Debugging.AssertsEnabled) Debugging.Assert(matchDec > 0);
int matchLen = token & 0x0F;
if (matchLen == 0x0F)
{
int len;
while ((len = compressed.ReadByte()) == 0xFF)
{
matchLen += 0xFF;
}
matchLen += len & 0xFF;
}
matchLen += MIN_MATCH;
// copying a multiple of 8 bytes can make decompression from 5% to 10% faster
int fastLen = (int)((matchLen + 7) & 0xFFFFFFF8);
if (matchDec < matchLen || dOff + fastLen > destEnd)
{
// overlap -> naive incremental copy
for (int @ref = dOff - matchDec, end = dOff + matchLen; dOff < end; ++@ref, ++dOff)
{
dest[dOff] = dest[@ref];
}
}
else
{
// no overlap -> arraycopy
Array.Copy(dest, dOff - matchDec, dest, dOff, fastLen);
dOff += matchLen;
}
} while (dOff < decompressedLen);
return dOff;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void EncodeLen(int l, DataOutput @out)
{
while (l >= 0xFF)
{
@out.WriteByte(/*(byte)*/0xFF); // LUCENENET: Removed unnecessary cast
l -= 0xFF;
}
@out.WriteByte((byte)l);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void EncodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput @out)
{
@out.WriteByte((byte)token);
// encode literal length
if (literalLen >= 0x0F)
{
EncodeLen(literalLen - 0x0F, @out);
}
// encode literals
@out.WriteBytes(bytes, anchor, literalLen);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void EncodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput @out)
{
int token = Math.Min(literalLen, 0x0F) << 4;
EncodeLiterals(bytes, token, anchor, literalLen, @out);
}
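// Worked example for EncodeSequence below: 3 literals followed by a 20-byte match
// at distance 5 encode as token 0x3F (0x30 | 0x0F), the 3 literal bytes, offset
// bytes 0x05 0x00 (little-endian), and one extra length byte 0x01 (20 - MIN_MATCH - 0x0F).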
private static void EncodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput @out)
{
int literalLen = matchOff - anchor;
if (Debugging.AssertsEnabled) Debugging.Assert(matchLen >= 4);
// encode token
int token = (Math.Min(literalLen, 0x0F) << 4) | Math.Min(matchLen - 4, 0x0F);
EncodeLiterals(bytes, token, anchor, literalLen, @out);
// encode match dec (2 bytes, little-endian)
int matchDec = matchOff - matchRef;
if (Debugging.AssertsEnabled) Debugging.Assert(matchDec > 0 && matchDec < 1 << 16);
@out.WriteByte((byte)matchDec);
@out.WriteByte((byte)(int)((uint)matchDec >> 8));
// encode match len
if (matchLen >= MIN_MATCH + 0x0F)
{
EncodeLen(matchLen - 0x0F - MIN_MATCH, @out);
}
}
public sealed class HashTable
{
internal int hashLog;
internal PackedInt32s.Mutable hashTable;
internal void Reset(int len)
{
int bitsPerOffset = PackedInt32s.BitsRequired(len - LAST_LITERALS);
int bitsPerOffsetLog = 32 - (bitsPerOffset - 1).LeadingZeroCount(); // ceil(log2(bitsPerOffset))
hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog; // keeps the table within 1 << MEMORY_USAGE bytes
if (hashTable == null || hashTable.Count < 1 << hashLog || hashTable.BitsPerValue < bitsPerOffset)
{
hashTable = PackedInt32s.GetMutable(1 << hashLog, bitsPerOffset, PackedInt32s.DEFAULT);
}
else
{
hashTable.Clear();
}
}
}
/// <summary>
/// Compress <c>bytes[off:off+len]</c> into <paramref name="out"/> using
/// at most 16KB of memory. <paramref name="ht"/> shouldn't be shared across threads
/// but can safely be reused.
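/// <para/>
/// A hypothetical usage sketch:
/// <code>
/// var ht = new LZ4.HashTable(); // reusable, but must not be shared across threads
/// LZ4.Compress(data, 0, data.Length, output, ht); // output: any DataOutput
/// </code>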
/// </summary>
public static void Compress(byte[] bytes, int off, int len, DataOutput @out, HashTable ht)
{
int @base = off;
int end = off + len;
int anchor = off++;
if (len > LAST_LITERALS + MIN_MATCH)
{
int limit = end - LAST_LITERALS;
int matchLimit = limit - MIN_MATCH;
ht.Reset(len);
int hashLog = ht.hashLog;
PackedInt32s.Mutable hashTable = ht.hashTable;
while (off <= limit)
{
// find a match
int @ref;
while (true)
{
if (off >= matchLimit)
{
goto mainBreak;
}
int v = ReadInt32(bytes, off);
int h = Hash(v, hashLog);
@ref = @base + (int)hashTable.Get(h);
if (Debugging.AssertsEnabled) Debugging.Assert(PackedInt32s.BitsRequired(off - @base) <= hashTable.BitsPerValue);
hashTable.Set(h, off - @base);
if (off - @ref < MAX_DISTANCE && ReadInt32(bytes, @ref) == v)
{
break;
}
++off;
}
// compute match length
int matchLen = MIN_MATCH + CommonBytes(bytes, @ref + MIN_MATCH, off + MIN_MATCH, limit);
EncodeSequence(bytes, anchor, @ref, off, matchLen, @out);
off += matchLen;
anchor = off;
//mainContinue: ; // LUCENENET NOTE: Not Referenced
}
mainBreak: ;
}
// last literals
int literalLen = end - anchor;
if (Debugging.AssertsEnabled) Debugging.Assert(literalLen >= LAST_LITERALS || literalLen == len);
EncodeLastLiterals(bytes, anchor, literalLen, @out);
}
public class Match
{
internal int start, @ref, len;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual void Fix(int correction)
{
start += correction;
@ref += correction;
len -= correction;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual int End()
{
return start + len;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void CopyTo(Match m1, Match m2)
{
m2.len = m1.len;
m2.start = m1.start;
m2.@ref = m1.@ref;
}
public sealed class HCHashTable
{
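// Chained match finder: hashTable maps the hash of 4 bytes to the most recent
// offset where they occurred, and chainTable links each offset (masked by
// MAX_DISTANCE - 1) back to the previous offset with the same hash, so up to
// MAX_ATTEMPTS candidate matches can be examined per position.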
internal const int MAX_ATTEMPTS = 256;
internal const int MASK = MAX_DISTANCE - 1;
internal int nextToUpdate;
private int @base;
private readonly int[] hashTable;
private readonly short[] chainTable;
internal HCHashTable()
{
hashTable = new int[HASH_TABLE_SIZE_HC];
chainTable = new short[MAX_DISTANCE];
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void Reset(int @base)
{
this.@base = @base;
nextToUpdate = @base;
Arrays.Fill(hashTable, -1);
Arrays.Fill(chainTable, (short)0);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int HashPointer(byte[] bytes, int off)
{
int v = ReadInt32(bytes, off);
int h = HashHC(v);
return hashTable[h];
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int Next(int off)
{
return off - (chainTable[off & MASK] & 0xFFFF);
}
private void AddHash(byte[] bytes, int off)
{
int v = ReadInt32(bytes, off);
int h = HashHC(v);
int delta = off - hashTable[h];
if (Debugging.AssertsEnabled) Debugging.Assert(delta > 0, delta.ToString());
if (delta >= MAX_DISTANCE)
{
delta = MAX_DISTANCE - 1;
}
chainTable[off & MASK] = (short)delta;
hashTable[h] = off;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void Insert(int off, byte[] bytes)
{
for (; nextToUpdate < off; ++nextToUpdate)
{
AddHash(bytes, nextToUpdate);
}
}
internal bool InsertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match)
{
match.start = off;
match.len = 0;
int delta = 0;
int repl = 0;
Insert(off, buf);
int @ref = HashPointer(buf, off);
if (@ref >= off - 4 && @ref <= off && @ref >= @base) // potential repetition
{
if (ReadInt32Equals(buf, @ref, off)) // confirmed
{
delta = off - @ref;
repl = match.len = MIN_MATCH + CommonBytes(buf, @ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
match.@ref = @ref;
}
@ref = Next(@ref);
}
for (int i = 0; i < MAX_ATTEMPTS; ++i)
{
if (@ref < Math.Max(@base, off - MAX_DISTANCE + 1) || @ref > off)
{
break;
}
if (buf[@ref + match.len] == buf[off + match.len] && ReadInt32Equals(buf, @ref, off))
{
int matchLen = MIN_MATCH + CommonBytes(buf, @ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
if (matchLen > match.len)
{
match.@ref = @ref;
match.len = matchLen;
}
}
@ref = Next(@ref);
}
if (repl != 0)
{
int ptr = off;
int end = off + repl - (MIN_MATCH - 1);
while (ptr < end - delta)
{
chainTable[ptr & MASK] = (short)delta; // pre load
++ptr;
}
do
{
chainTable[ptr & MASK] = (short)delta;
hashTable[HashHC(ReadInt32(buf, ptr))] = ptr;
++ptr;
} while (ptr < end);
nextToUpdate = end;
}
return match.len != 0;
}
internal bool InsertAndFindWiderMatch(byte[] buf, int off, int startLimit, int matchLimit, int minLen, Match match)
{
match.len = minLen;
Insert(off, buf);
int delta = off - startLimit;
int @ref = HashPointer(buf, off);
for (int i = 0; i < MAX_ATTEMPTS; ++i)
{
if (@ref < Math.Max(@base, off - MAX_DISTANCE + 1) || @ref > off)
{
break;
}
if (buf[@ref - delta + match.len] == buf[startLimit + match.len] && ReadInt32Equals(buf, @ref, off))
{
int matchLenForward = MIN_MATCH + CommonBytes(buf, @ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
int matchLenBackward = CommonBytesBackward(buf, @ref, off, @base, startLimit);
int matchLen = matchLenBackward + matchLenForward;
if (matchLen > match.len)
{
match.len = matchLen;
match.@ref = @ref - matchLenBackward;
match.start = off - matchLenBackward;
}
}
@ref = Next(@ref);
}
return match.len > minLen;
}
}
/// <summary>
/// Compress <c>bytes[off:off+len]</c> into <paramref name="out"/>. Compared to
/// <see cref="LZ4.Compress(byte[], int, int, DataOutput, HashTable)"/>, this method
/// is slower and uses more memory (~ 256KB per thread) but should provide
/// better compression ratios (especially on large inputs) because it chooses
/// the best match among up to 256 candidates and then performs trade-offs to
/// fix overlapping matches. <paramref name="ht"/> shouldn't be shared across threads
/// but can safely be reused.
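/// <para/>
/// A hypothetical usage sketch (note that the <see cref="HCHashTable"/> constructor
/// is internal, so this only applies within the codecs assembly):
/// <code>
/// var ht = new LZ4.HCHashTable(); // ~256 KB, reuse within a single thread
/// LZ4.CompressHC(data, 0, data.Length, output, ht);
/// </code>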
/// </summary>
public static void CompressHC(byte[] src, int srcOff, int srcLen, DataOutput @out, HCHashTable ht)
{
int srcEnd = srcOff + srcLen;
int matchLimit = srcEnd - LAST_LITERALS;
int mfLimit = matchLimit - MIN_MATCH;
int sOff = srcOff;
int anchor = sOff++;
ht.Reset(srcOff);
Match match0 = new Match();
Match match1 = new Match();
Match match2 = new Match();
Match match3 = new Match();
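// Lazy match selection, following the heuristics of the reference LZ4 HC
// implementation: match1 is the best match at the current position, match2 and
// match3 are overlapping candidates found further ahead, and the nested loops
// below truncate, shift, or drop matches so the emitted sequences never overlap.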
while (sOff <= mfLimit)
{
if (!ht.InsertAndFindBestMatch(src, sOff, matchLimit, match1))
{
++sOff;
continue;
}
// saved, in case we would skip too much
CopyTo(match1, match0);
while (true)
{
if (Debugging.AssertsEnabled) Debugging.Assert(match1.start >= anchor);
if (match1.End() >= mfLimit || !ht.InsertAndFindWiderMatch(src, match1.End() - 2, match1.start + 1, matchLimit, match1.len, match2))
{
// no better match
EncodeSequence(src, anchor, match1.@ref, match1.start, match1.len, @out);
anchor = sOff = match1.End();
goto mainContinue;
}
if (match0.start < match1.start)
{
if (match2.start < match1.start + match0.len) // empirical
{
CopyTo(match0, match1);
}
}
if (Debugging.AssertsEnabled) Debugging.Assert(match2.start > match1.start);
if (match2.start - match1.start < 3) // First Match too small : removed
{
CopyTo(match2, match1);
goto search2Continue;
}
while (true)
{
if (match2.start - match1.start < OPTIMAL_ML)
{
int newMatchLen = match1.len;
if (newMatchLen > OPTIMAL_ML)
{
newMatchLen = OPTIMAL_ML;
}
if (match1.start + newMatchLen > match2.End() - MIN_MATCH)
{
newMatchLen = match2.start - match1.start + match2.len - MIN_MATCH;
}
int correction = newMatchLen - (match2.start - match1.start);
if (correction > 0)
{
match2.Fix(correction);
}
}
if (match2.start + match2.len >= mfLimit || !ht.InsertAndFindWiderMatch(src, match2.End() - 3, match2.start, matchLimit, match2.len, match3))
{
// no better match -> 2 sequences to encode
if (match2.start < match1.End())
{
match1.len = match2.start - match1.start;
}
// encode seq 1
EncodeSequence(src, anchor, match1.@ref, match1.start, match1.len, @out);
anchor = /*sOff =*/ match1.End(); // LUCENENET: IDE0059: Remove unnecessary value assignment
// encode seq 2
EncodeSequence(src, anchor, match2.@ref, match2.start, match2.len, @out);
anchor = sOff = match2.End();
goto mainContinue;
}
if (match3.start < match1.End() + 3) // Not enough space for match 2 : remove it
{
if (match3.start >= match1.End()) // can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1
{
if (match2.start < match1.End())
{
int correction = match1.End() - match2.start;
match2.Fix(correction);
if (match2.len < MIN_MATCH)
{
CopyTo(match3, match2);
}
}
EncodeSequence(src, anchor, match1.@ref, match1.start, match1.len, @out);
anchor = /*sOff =*/ match1.End(); // LUCENENET: IDE0059: Remove unnecessary value assignment
CopyTo(match3, match1);
CopyTo(match2, match0);
goto search2Continue;
}
CopyTo(match3, match2);
goto search3Continue;
}
// OK, now we have 3 ascending matches; let's write at least the first one
if (match2.start < match1.End())
{
if (match2.start - match1.start < 0x0F)
{
if (match1.len > OPTIMAL_ML)
{
match1.len = OPTIMAL_ML;
}
if (match1.End() > match2.End() - MIN_MATCH)
{
match1.len = match2.End() - match1.start - MIN_MATCH;
}
int correction = match1.End() - match2.start;
match2.Fix(correction);
}
else
{
match1.len = match2.start - match1.start;
}
}
EncodeSequence(src, anchor, match1.@ref, match1.start, match1.len, @out);
anchor = /*sOff =*/ match1.End(); // LUCENENET: IDE0059: Remove unnecessary value assignment
CopyTo(match2, match1);
CopyTo(match3, match2);
goto search3Continue;
search3Continue: ;
}
//search3Break: ; // LUCENENET NOTE: Unreachable
search2Continue: ;
}
//search2Break: ; // LUCENENET NOTE: Not referenced
mainContinue: ;
}
//mainBreak: // LUCENENET NOTE: Not referenced
EncodeLastLiterals(src, anchor, srcEnd - anchor, @out);
}
}
}