blob: d26013f22c3b5398beba545e3282809028ccab20 [file] [log] [blame]
using Lucene.Net.Diagnostics;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Codecs.Lucene45
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using BlockPackedWriter = Lucene.Net.Util.Packed.BlockPackedWriter;
using BytesRef = Lucene.Net.Util.BytesRef;
using FieldInfo = Lucene.Net.Index.FieldInfo;
using IndexFileNames = Lucene.Net.Index.IndexFileNames;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using IOUtils = Lucene.Net.Util.IOUtils;
using MathUtil = Lucene.Net.Util.MathUtil;
using MonotonicBlockPackedWriter = Lucene.Net.Util.Packed.MonotonicBlockPackedWriter;
using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
using SegmentWriteState = Lucene.Net.Index.SegmentWriteState;
using StringHelper = Lucene.Net.Util.StringHelper;
/// <summary>
/// Writer for <see cref="Lucene45DocValuesFormat"/> </summary>
public class Lucene45DocValuesConsumer : DocValuesConsumer // LUCENENET specific - removed IDisposable, it is already implemented in base class
{
// Number of values per packed block in the data file (also recorded in the metadata).
internal const int BLOCK_SIZE = 16384;

// One term address is recorded every N terms in the prefix-compressed
// terms dictionary (see AddTermsDict).
internal const int ADDRESS_INTERVAL = 16;

// Sentinel ordinal encoding "document has no value" for single-valued
// sorted sets (see GetSortedSetEnumerable).
internal const long MISSING_ORD = -1L;

/// <summary>
/// Compressed using packed blocks of <see cref="int"/>s. </summary>
public const int DELTA_COMPRESSED = 0;

/// <summary>
/// Compressed by computing the GCD. </summary>
public const int GCD_COMPRESSED = 1;

/// <summary>
/// Compressed by giving IDs to unique values. </summary>
public const int TABLE_COMPRESSED = 2;

/// <summary>
/// Uncompressed binary, written directly (fixed length). </summary>
public const int BINARY_FIXED_UNCOMPRESSED = 0;

/// <summary>
/// Uncompressed binary, written directly (variable length). </summary>
public const int BINARY_VARIABLE_UNCOMPRESSED = 1;

/// <summary>
/// Compressed binary with shared prefixes </summary>
public const int BINARY_PREFIX_COMPRESSED = 2;

/// <summary>
/// Standard storage for sorted set values with 1 level of indirection:
/// docId -> address -> ord.
/// </summary>
public static readonly int SORTED_SET_WITH_ADDRESSES = 0;

/// <summary>
/// Single-valued sorted set values, encoded as sorted values, so no level
/// of indirection: docId -> ord.
/// </summary>
public static readonly int SORTED_SET_SINGLE_VALUED_SORTED = 1;

// "data" holds the values themselves; "meta" holds per-field metadata that points
// into the data file. Both are disposed in Dispose(bool), so the CA2213
// suppression is intentional, not an oversight.
#pragma warning disable CA2213 // Disposable fields should be disposed
internal IndexOutput data, meta;
#pragma warning restore CA2213 // Disposable fields should be disposed

// Document count of the segment being written (copied from SegmentWriteState).
internal readonly int maxDoc;
/// <summary>
/// Expert: Creates a new writer. </summary>
/// <remarks>
/// Opens the data and metadata outputs and writes their codec headers. If construction
/// fails partway, the already-opened outputs are disposed via this instance's dispose
/// path without masking the original exception.
/// </remarks>
public Lucene45DocValuesConsumer(SegmentWriteState state, string dataCodec, string dataExtension, string metaCodec, string metaExtension)
{
    bool success = false;
    try
    {
        // data file: the actual doc values
        string dataName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension);
        data = state.Directory.CreateOutput(dataName, state.Context);
        CodecUtil.WriteHeader(data, dataCodec, Lucene45DocValuesFormat.VERSION_CURRENT);
        // metadata file: per-field entries pointing into the data file
        string metaName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension);
        meta = state.Directory.CreateOutput(metaName, state.Context);
        CodecUtil.WriteHeader(meta, metaCodec, Lucene45DocValuesFormat.VERSION_CURRENT);
        maxDoc = state.SegmentInfo.DocCount;
        success = true;
    }
    finally
    {
        if (!success)
        {
            // disposes this consumer (and with it whatever outputs were opened so far)
            // while suppressing secondary exceptions
            IOUtils.DisposeWhileHandlingException(this);
        }
    }
}
/// <summary>
/// Writes the numeric doc values for <paramref name="field"/>, letting the writer
/// choose the most compact storage format (delta/GCD/table compression).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override void AddNumericField(FieldInfo field, IEnumerable<long?> values)
    => AddNumericField(field, values, optimizeStorage: true);
/// <summary>
/// Writes a numeric field. When <paramref name="optimizeStorage"/> is <c>true</c>, a first
/// pass over <paramref name="values"/> gathers statistics (min/max, GCD of deltas, up to
/// 256 unique values, presence of nulls) to choose among <see cref="TABLE_COMPRESSED"/>,
/// <see cref="GCD_COMPRESSED"/> and <see cref="DELTA_COMPRESSED"/>; otherwise
/// <see cref="DELTA_COMPRESSED"/> is used. <paramref name="values"/> is enumerated more
/// than once, so it must be repeatable.
/// </summary>
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    long count = 0;
    long minValue = long.MaxValue;
    long maxValue = long.MinValue;
    long gcd = 0;
    bool missing = false;
    // TODO: more efficient?
    JCG.HashSet<long> uniqueValues = null;
    if (optimizeStorage)
    {
        // statistics pass: min/max, gcd of deltas, distinct values, missing flag
        uniqueValues = new JCG.HashSet<long>();
        foreach (long? nv in values)
        {
            long v;
            if (nv == null)
            {
                // missing values are stored as 0 and flagged in the missing bitset
                v = 0;
                missing = true;
            }
            else
            {
                v = nv.Value;
            }
            if (gcd != 1)
            {
                if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                {
                    // in that case v - minValue might overflow and make the GCD computation return
                    // wrong results. Since these extreme values are unlikely, we just discard
                    // GCD computation for them
                    gcd = 1;
                } // minValue needs to be set first
                else if (count != 0)
                {
                    gcd = MathUtil.Gcd(gcd, v - minValue);
                }
            }
            minValue = Math.Min(minValue, v);
            maxValue = Math.Max(maxValue, v);
            if (uniqueValues != null)
            {
                if (uniqueValues.Add(v))
                {
                    if (uniqueValues.Count > 256)
                    {
                        // too many distinct values for table compression; stop tracking
                        uniqueValues = null;
                    }
                }
            }
            ++count;
        }
    }
    else
    {
        // no statistics needed; just count the values
        foreach (var nv in values)
        {
            ++count;
        }
    }
    long delta = maxValue - minValue;
    int format;
    // table compression wins when the id width beats the delta width (or delta overflowed)
    if (uniqueValues != null && (delta < 0L || PackedInt32s.BitsRequired(uniqueValues.Count - 1) < PackedInt32s.BitsRequired(delta)) && count <= int.MaxValue)
    {
        format = TABLE_COMPRESSED;
    }
    else if (gcd != 0 && gcd != 1)
    {
        format = GCD_COMPRESSED;
    }
    else
    {
        format = DELTA_COMPRESSED;
    }
    // per-field metadata header: field number, type, chosen format
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
    meta.WriteVInt32(format);
    if (missing)
    {
        // record where the missing bitset starts in the data file, then write it
        meta.WriteInt64(data.GetFilePointer());
        WriteMissingBitset(values);
    }
    else
    {
        meta.WriteInt64(-1L); // -1 = no missing-value bitset
    }
    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
    meta.WriteInt64(data.GetFilePointer());
    meta.WriteVInt64(count);
    meta.WriteVInt32(BLOCK_SIZE);
    switch (format)
    {
        case GCD_COMPRESSED:
            // store minValue and gcd; each value is encoded as (v - minValue) / gcd
            meta.WriteInt64(minValue);
            meta.WriteInt64(gcd);
            BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE);
            foreach (long? nv in values)
            {
                quotientWriter.Add((nv.GetValueOrDefault() - minValue) / gcd);
            }
            quotientWriter.Finish();
            break;
        case DELTA_COMPRESSED:
            // raw values; BlockPackedWriter compresses per block
            BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
            foreach (long? nv in values)
            {
                writer.Add(nv.GetValueOrDefault());
            }
            writer.Finish();
            break;
        case TABLE_COMPRESSED:
            // LUCENENET NOTE: diming an array and then using .CopyTo() for better efficiency than LINQ .ToArray()
            long[] decode = new long[uniqueValues.Count];
            uniqueValues.CopyTo(decode, 0);
            Dictionary<long, int> encode = new Dictionary<long, int>();
            // write the decode table to meta and build the value -> id map
            meta.WriteVInt32(decode.Length);
            for (int i = 0; i < decode.Length; i++)
            {
                meta.WriteInt64(decode[i]);
                encode[decode[i]] = i;
            }
            // then write each value's table id with the minimal bit width
            int bitsRequired = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
            PackedInt32s.Writer ordsWriter = PackedInt32s.GetWriterNoHeader(data, PackedInt32s.Format.PACKED, (int)count, bitsRequired, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (long? nv in values)
            {
                ordsWriter.Add(encode[nv.GetValueOrDefault()]);
            }
            ordsWriter.Finish();
            break;
        default:
            throw new InvalidOperationException();
    }
}
// TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
// but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode)
/// <summary>
/// Writes a bitset to the data file with one bit per element of <paramref name="values"/>:
/// the bit is set when the corresponding element is non-null. Bits are packed LSB-first,
/// eight per byte, and a trailing partial byte is flushed at the end.
/// </summary>
internal virtual void WriteMissingBitset(IEnumerable values)
{
    int pendingBits = 0; // bits accumulated for the byte currently being built
    int bitUpto = 0;     // how many bits of pendingBits are filled (0..8)
    foreach (object v in values)
    {
        if (bitUpto == 8)
        {
            // byte is full: flush it and start the next one
            data.WriteByte((byte)pendingBits);
            bitUpto = 0;
            pendingBits = 0;
        }
        if (v != null)
        {
            pendingBits |= 1 << bitUpto;
        }
        bitUpto++;
    }
    if (bitUpto > 0)
    {
        // flush the final, possibly partial, byte
        data.WriteByte((byte)pendingBits);
    }
}
/// <summary>
/// Writes a binary field as raw concatenated bytes. If all values share one length the
/// per-document addresses are implicit (<see cref="BINARY_FIXED_UNCOMPRESSED"/>); otherwise
/// cumulative end addresses are appended with a monotonic packed writer
/// (<see cref="BINARY_VARIABLE_UNCOMPRESSED"/>). <paramref name="values"/> may be enumerated
/// up to three times, so it must be repeatable.
/// </summary>
public override void AddBinaryField(FieldInfo field, IEnumerable<BytesRef> values)
{
    // write the byte[] data
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene45DocValuesFormat.BINARY);
    int minLength = int.MaxValue;
    int maxLength = int.MinValue;
    long startFP = data.GetFilePointer();
    long count = 0;
    bool missing = false;
    foreach (BytesRef v in values)
    {
        int length;
        if (v == null)
        {
            // missing value: contributes length 0 and is flagged in the missing bitset
            length = 0;
            missing = true;
        }
        else
        {
            length = v.Length;
        }
        minLength = Math.Min(minLength, length);
        maxLength = Math.Max(maxLength, length);
        if (v != null)
        {
            data.WriteBytes(v.Bytes, v.Offset, v.Length);
        }
        count++;
    }
    meta.WriteVInt32(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
    if (missing)
    {
        // record where the missing bitset starts, then write it
        meta.WriteInt64(data.GetFilePointer());
        WriteMissingBitset(values);
    }
    else
    {
        meta.WriteInt64(-1L); // -1 = no missing-value bitset
    }
    meta.WriteVInt32(minLength);
    meta.WriteVInt32(maxLength);
    meta.WriteVInt64(count);
    meta.WriteInt64(startFP);
    // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit)
    // otherwise, we need to record the length fields...
    if (minLength != maxLength)
    {
        meta.WriteInt64(data.GetFilePointer());
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        meta.WriteVInt32(BLOCK_SIZE);
        // write each document's end address (running sum of lengths, monotonically increasing)
        MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
        long addr = 0;
        foreach (BytesRef v in values)
        {
            if (v != null)
            {
                addr += v.Length;
            }
            writer.Add(addr);
        }
        writer.Finish();
    }
}
/// <summary>
/// Expert: writes a value dictionary for a sorted/sortedset field. Fixed-length terms are
/// delegated to <see cref="AddBinaryField"/> (direct addressing by multiplication);
/// variable-length terms are prefix-compressed in runs of <see cref="ADDRESS_INTERVAL"/>
/// terms, with the address of each run's first term recorded in a monotonic packed index.
/// <paramref name="values"/> is enumerated twice, so it must be repeatable.
/// </summary>
protected virtual void AddTermsDict(FieldInfo field, IEnumerable<BytesRef> values)
{
    // first check if its a "fixed-length" terms dict
    int minLength = int.MaxValue;
    int maxLength = int.MinValue;
    foreach (BytesRef v in values)
    {
        minLength = Math.Min(minLength, v.Length);
        maxLength = Math.Max(maxLength, v.Length);
    }
    if (minLength == maxLength)
    {
        // no index needed: direct addressing by mult
        AddBinaryField(field, values);
    }
    else
    {
        // header
        meta.WriteVInt32(field.Number);
        meta.WriteByte((byte)Lucene45DocValuesFormat.BINARY);
        meta.WriteVInt32(BINARY_PREFIX_COMPRESSED);
        meta.WriteInt64(-1L); // -1 = no missing-value bitset (cf. AddBinaryField)
        // now write the bytes: sharing prefixes within a block
        long startFP = data.GetFilePointer();
        // currently, we have to store the delta from expected for every 1/nth term
        // we could avoid this, but its not much and less overall RAM than the previous approach!
        // term addresses are buffered in RAM so they can be appended after the term bytes
        RAMOutputStream addressBuffer = new RAMOutputStream();
        MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
        BytesRef lastTerm = new BytesRef();
        long count = 0;
        foreach (BytesRef v in values)
        {
            if (count % ADDRESS_INTERVAL == 0)
            {
                // record the offset of this run's first term, relative to startFP
                termAddresses.Add(data.GetFilePointer() - startFP);
                // force the first term in a block to be abs-encoded
                lastTerm.Length = 0;
            }
            // prefix-code: shared-prefix length, suffix length, then the suffix bytes
            int sharedPrefix = StringHelper.BytesDifference(lastTerm, v);
            data.WriteVInt32(sharedPrefix);
            data.WriteVInt32(v.Length - sharedPrefix);
            data.WriteBytes(v.Bytes, v.Offset + sharedPrefix, v.Length - sharedPrefix);
            lastTerm.CopyBytes(v);
            count++;
        }
        long indexStartFP = data.GetFilePointer();
        // write addresses of indexed terms
        termAddresses.Finish();
        addressBuffer.WriteTo(data);
        //addressBuffer = null; // LUCENENET: IDE0059: Remove unnecessary value assignment
        //termAddresses = null; // LUCENENET: IDE0059: Remove unnecessary value assignment
        meta.WriteVInt32(minLength);
        meta.WriteVInt32(maxLength);
        meta.WriteVInt64(count);
        meta.WriteInt64(startFP);
        meta.WriteVInt32(ADDRESS_INTERVAL);
        meta.WriteInt64(indexStartFP);
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        meta.WriteVInt32(BLOCK_SIZE);
    }
}
/// <summary>
/// Writes a sorted field: the terms dictionary (ord -> <see cref="BytesRef"/>) followed by
/// the per-document ordinals as a numeric field (written with
/// <c>optimizeStorage: false</c>).
/// </summary>
public override void AddSortedField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long?> docToOrd)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene45DocValuesFormat.SORTED);
    AddTermsDict(field, values);
    AddNumericField(field, docToOrd, false);
}
/// <summary>
/// Returns <c>true</c> when every per-document ord count is at most 1
/// (a <c>null</c> count is treated as 0), i.e. the sorted set is single-valued.
/// </summary>
private static bool IsSingleValued(IEnumerable<long?> docToOrdCount)
{
    bool singleValued = true;
    using (IEnumerator<long?> it = docToOrdCount.GetEnumerator())
    {
        // stop at the first document carrying more than one ord
        while (singleValued && it.MoveNext())
        {
            singleValued = it.Current.GetValueOrDefault() <= 1;
        }
    }
    return singleValued;
}
/// <summary>
/// Writes a sorted set field. If every document has at most one ord, the field is encoded
/// like a SORTED field (<see cref="SORTED_SET_SINGLE_VALUED_SORTED"/>, missing documents
/// getting <see cref="MISSING_ORD"/>). Otherwise (<see cref="SORTED_SET_WITH_ADDRESSES"/>)
/// three parts are written: the terms dictionary, the flattened ord stream as a numeric
/// field, and the per-document end offsets into that stream.
/// </summary>
public override void AddSortedSetField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long?> docToOrdCount, IEnumerable<long?> ords)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene45DocValuesFormat.SORTED_SET);
    if (IsSingleValued(docToOrdCount))
    {
        meta.WriteVInt32(SORTED_SET_SINGLE_VALUED_SORTED);
        // The field is single-valued, we can encode it as SORTED
        AddSortedField(field, values, GetSortedSetEnumerable(docToOrdCount, ords));
        return;
    }
    meta.WriteVInt32(SORTED_SET_WITH_ADDRESSES);
    // write the ord -> byte[] as a binary field
    AddTermsDict(field, values);
    // write the stream of ords as a numeric field
    // NOTE: we could return an iterator that delta-encodes these within a doc
    AddNumericField(field, ords, false);
    // write the doc -> ord count as a absolute index to the stream
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
    meta.WriteVInt32(DELTA_COMPRESSED);
    meta.WriteInt64(-1L); // -1 = no missing-value bitset
    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
    meta.WriteInt64(data.GetFilePointer());
    meta.WriteVInt64(maxDoc);
    meta.WriteVInt32(BLOCK_SIZE);
    // running sum of ord counts = each document's end offset in the ord stream
    var writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
    long addr = 0;
    foreach (long? v in docToOrdCount)
    {
        addr += v.Value;
        writer.Add(addr);
    }
    writer.Finish();
}
/// <summary>
/// Maps a (docToOrdCount, ords) pair of a single-valued sorted set onto the doc -> ord
/// stream that <see cref="AddSortedField"/> expects: a document whose ord count is 0
/// yields <see cref="MISSING_ORD"/>; a document whose count is 1 yields its single ord.
/// Callers must have verified single-valuedness via <c>IsSingleValued</c>.
/// </summary>
private IEnumerable<long?> GetSortedSetEnumerable(IEnumerable<long?> docToOrdCount, IEnumerable<long?> ords)
{
    // LUCENENET: IEnumerator<T> is IDisposable — dispose both enumerators when the
    // iteration finishes or is abandoned (the original leaked them).
    using (IEnumerator<long?> docToOrdCountIter = docToOrdCount.GetEnumerator())
    using (IEnumerator<long?> ordsIter = ords.GetEnumerator())
    {
        while (docToOrdCountIter.MoveNext())
        {
            long current = docToOrdCountIter.Current.Value;
            if (current == 0)
            {
                // document has no value: emit the sentinel ord.
                // Uses the class-level MISSING_ORD constant (-1L); the original declared a
                // shadowing local const with the same value, which has been removed.
                yield return MISSING_ORD;
            }
            else
            {
                // single-valued by contract: exactly one ord to consume for this doc
                if (Debugging.AssertsEnabled) Debugging.Assert(current == 1);
                ordsIter.MoveNext();
                yield return ordsIter.Current;
            }
        }
        // every ord must have been consumed by the loop above
        if (Debugging.AssertsEnabled) Debugging.Assert(!ordsIter.MoveNext());
    }
}
/// <summary>
/// Finishes writing: appends the EOF marker and checksum footer to the metadata file and
/// the checksum footer to the data file, then disposes both outputs. If writing the
/// footers fails, the outputs are still disposed without masking the original exception.
/// </summary>
protected override void Dispose(bool disposing)
{
    if (disposing)
    {
        bool success = false;
        try
        {
            if (meta != null)
            {
                meta.WriteVInt32(-1); // write EOF marker
                CodecUtil.WriteFooter(meta); // write checksum
            }
            if (data != null)
            {
                CodecUtil.WriteFooter(data); // write checksum
            }
            success = true;
        }
        finally
        {
            if (success)
            {
                IOUtils.Dispose(data, meta);
            }
            else
            {
                // suppress secondary exceptions so the one already in flight propagates
                IOUtils.DisposeWhileHandlingException(data, meta);
            }
            meta = data = null; // make double-dispose a no-op
        }
    }
}
}
}