using Lucene.Net.Diagnostics;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;

namespace Lucene.Net.Index
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    using AppendingDeltaPackedInt64Buffer = Lucene.Net.Util.Packed.AppendingDeltaPackedInt64Buffer;
    using AppendingPackedInt64Buffer = Lucene.Net.Util.Packed.AppendingPackedInt64Buffer;
    using ArrayUtil = Lucene.Net.Util.ArrayUtil;
    using ByteBlockPool = Lucene.Net.Util.ByteBlockPool;
    using BytesRef = Lucene.Net.Util.BytesRef;
    using BytesRefHash = Lucene.Net.Util.BytesRefHash;
    using Counter = Lucene.Net.Util.Counter;
    using DirectBytesStartArray = Lucene.Net.Util.BytesRefHash.DirectBytesStartArray;
    using DocValuesConsumer = Lucene.Net.Codecs.DocValuesConsumer;
    using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s;
    using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;

    /// <summary>
    /// Buffers up pending <see cref="T:byte[]"/>s per doc, dereferencing and sorting
    /// them via int ords, then flushes when the segment flushes.
    /// </summary>
    internal class SortedSetDocValuesWriter : DocValuesWriter
    {
        internal readonly BytesRefHash hash;
        private AppendingPackedInt64Buffer pending; // stream of all termIDs
        private AppendingDeltaPackedInt64Buffer pendingCounts; // termIDs per doc
        private readonly Counter iwBytesUsed;
        private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
        private readonly FieldInfo fieldInfo;
        private int currentDoc;
        private int[] currentValues = new int[8];
        private int currentUpto = 0;
        private int maxCount = 0;
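
        /// <summary>
        /// Sole constructor. All memory allocated here and while indexing is
        /// charged against <paramref name="iwBytesUsed"/>.
        /// </summary>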
        public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed)
        {
            this.fieldInfo = fieldInfo;
            this.iwBytesUsed = iwBytesUsed;
            hash = new BytesRefHash(
                new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
                BytesRefHash.DEFAULT_CAPACITY,
                new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
            pending = new AppendingPackedInt64Buffer(PackedInt32s.COMPACT);
            pendingCounts = new AppendingDeltaPackedInt64Buffer(PackedInt32s.COMPACT);
            bytesUsed = pending.RamBytesUsed() + pendingCounts.RamBytesUsed();
            iwBytesUsed.AddAndGet(bytesUsed);
        }
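
        /// <summary>
        /// Buffers one value for <paramref name="docID"/>. Finishes the previous
        /// doc when a new doc starts, and records empty counts for any skipped docs.
        /// </summary>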
        public virtual void AddValue(int docID, BytesRef value)
        {
            if (value == null)
            {
                throw new ArgumentException("field \"" + fieldInfo.Name + "\": null value not allowed");
            }
            if (value.Length > (ByteBlockPool.BYTE_BLOCK_SIZE - 2))
            {
                throw new ArgumentException("DocValuesField \"" + fieldInfo.Name + "\" is too large, must be <= " + (ByteBlockPool.BYTE_BLOCK_SIZE - 2));
            }

            if (docID != currentDoc)
            {
                FinishCurrentDoc();
            }

            // Fill in any holes:
            while (currentDoc < docID)
            {
                pendingCounts.Add(0); // no values
                currentDoc++;
            }

            AddOneValue(value);
            UpdateBytesUsed();
        }

        // finalize currentDoc: this sorts and deduplicates the current doc's term ids
        private void FinishCurrentDoc()
        {
            Array.Sort(currentValues, 0, currentUpto);
            int lastValue = -1;
            int count = 0;
            for (int i = 0; i < currentUpto; i++)
            {
                int termID = currentValues[i];
                // if it's not a duplicate
                if (termID != lastValue)
                {
                    pending.Add(termID); // record the term id
                    count++;
                }
                lastValue = termID;
            }
            // record the number of unique term ids for this doc
            pendingCounts.Add(count);
            maxCount = Math.Max(maxCount, count);
            currentUpto = 0;
            currentDoc++;
        }
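
        /// <summary>
        /// Called before flush: finishes the last buffered doc and records zero
        /// counts for any trailing docs that had no values.
        /// </summary>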
        public override void Finish(int maxDoc)
        {
            FinishCurrentDoc();

            // fill in any holes
            for (int i = currentDoc; i < maxDoc; i++)
            {
                pendingCounts.Add(0); // no values
            }
        }
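
        // Adds the value to the segment-wide hash and appends its term id to the
        // current doc's buffer, growing the buffer (and the RAM accounting) as needed.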
        private void AddOneValue(BytesRef value)
        {
            int termID = hash.Add(value);
            if (termID < 0)
            {
                termID = -termID - 1;
            }
            else
            {
                // reserve additional space for each unique value:
                // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
                //    TODO: can this same OOM happen in THPF?
                // 2. when flushing, we need 1 int per value (slot in the ordMap).
                iwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT32);
            }

            if (currentUpto == currentValues.Length)
            {
                currentValues = ArrayUtil.Grow(currentValues, currentValues.Length + 1);
                // reserve additional space for max # values per-doc
                // when flushing, we need an int[] to sort the mapped-ords within the doc
                iwBytesUsed.AddAndGet((currentValues.Length - currentUpto) * 2 * RamUsageEstimator.NUM_BYTES_INT32);
            }

            currentValues[currentUpto] = termID;
            currentUpto++;
        }
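
        // Recomputes the RAM used by 'pending' and 'pendingCounts' and pushes the
        // delta to the IndexWriter-level counter.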
        private void UpdateBytesUsed()
        {
            long newBytesUsed = pending.RamBytesUsed() + pendingCounts.RamBytesUsed();
            iwBytesUsed.AddAndGet(newBytesUsed - bytesUsed);
            bytesUsed = newBytesUsed;
        }
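
        /// <summary>
        /// Sorts the unique values, builds the old-ord to sorted-ord map, and hands
        /// the field to the codec as three lazy streams: the values in sorted order,
        /// the per-doc ord counts, and the per-doc sorted ords.
        /// </summary>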
        [MethodImpl(MethodImplOptions.NoInlining)]
        public override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
        {
            int maxDoc = state.SegmentInfo.DocCount;
            int maxCountPerDoc = maxCount;
            if (Debugging.AssertsEnabled) Debugging.Assert(pendingCounts.Count == maxDoc);
            int valueCount = hash.Count;

            int[] sortedValues = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
            int[] ordMap = new int[valueCount];
            for (int ord = 0; ord < valueCount; ord++)
            {
                ordMap[sortedValues[ord]] = ord;
            }

            dvConsumer.AddSortedSetField(fieldInfo,
                // ord -> value
                GetBytesRefEnumerable(valueCount, sortedValues),
                // doc -> ordCount
                GetOrdCountEnumerable(maxDoc),
                // ords
                GetOrdsEnumerable(maxCountPerDoc, ordMap));
        }
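
        // Nothing to roll back on abort; the buffered state is discarded with this writer.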
        [MethodImpl(MethodImplOptions.NoInlining)]
        public override void Abort()
        {
        }
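
        // Lazily yields the unique values in sorted order, reusing a single scratch BytesRef.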
        private IEnumerable<BytesRef> GetBytesRefEnumerable(int valueCount, int[] sortedValues)
        {
            var scratch = new BytesRef();
            for (int i = 0; i < valueCount; ++i)
            {
                yield return hash.Get(sortedValues[i], scratch);
            }
        }
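
        // Lazily yields, for each doc in [0, maxDoc), the number of unique ords recorded for it.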
        private IEnumerable<long?> GetOrdCountEnumerable(int maxDoc)
        {
            AppendingDeltaPackedInt64Buffer.Iterator iter = pendingCounts.GetIterator();
            if (Debugging.AssertsEnabled) Debugging.Assert(pendingCounts.Count == maxDoc, () => "maxDoc: " + maxDoc + ", pendingCounts.Count: " + pendingCounts.Count);
            for (int i = 0; i < maxDoc; ++i)
            {
                yield return iter.Next();
            }
        }
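
        // Lazily yields every doc's ords, remapped into sorted-ord space and sorted within each doc.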
        private IEnumerable<long?> GetOrdsEnumerable(int maxCountPerDoc, int[] ordMap)
        {
            int currentUpTo = 0, currentLength = 0;
            AppendingPackedInt64Buffer.Iterator iter = pending.GetIterator();
            AppendingDeltaPackedInt64Buffer.Iterator counts = pendingCounts.GetIterator();
            int[] docOrds = new int[maxCountPerDoc]; // remapped ords for the current doc
            for (long i = 0; i < pending.Count; ++i)
            {
                while (currentUpTo == currentLength)
                {
                    // refill next doc, and sort remapped ords within the doc.
                    currentUpTo = 0;
                    currentLength = (int)counts.Next();
                    for (int j = 0; j < currentLength; j++)
                    {
                        docOrds[j] = ordMap[(int)iter.Next()];
                    }
                    Array.Sort(docOrds, 0, currentLength);
                }
                int ord = docOrds[currentUpTo];
                currentUpTo++;
                yield return ord;
            }
        }
    }
}