/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Diagnostics;
namespace Lucene.Net.Index
{
using AppendingDeltaPackedLongBuffer = Lucene.Net.Util.Packed.AppendingDeltaPackedLongBuffer;
using AppendingPackedLongBuffer = Lucene.Net.Util.Packed.AppendingPackedLongBuffer;
using ArrayUtil = Lucene.Net.Util.ArrayUtil;
using ByteBlockPool = Lucene.Net.Util.ByteBlockPool;
using BytesRef = Lucene.Net.Util.BytesRef;
using BytesRefHash = Lucene.Net.Util.BytesRefHash;
using Counter = Lucene.Net.Util.Counter;
using DirectBytesStartArray = Lucene.Net.Util.BytesRefHash.DirectBytesStartArray;
using DocValuesConsumer = Lucene.Net.Codecs.DocValuesConsumer;
using PackedInts = Lucene.Net.Util.Packed.PackedInts;
using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;
/// <summary>
/// Buffers up pending byte[] values per doc, storing each unique value once
/// and referencing it by int ord, then flushes when the segment flushes.
/// </summary>
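/// <remarks>
/// Rough usage sketch (names like <c>fieldInfo</c>, <c>iwBytesUsed</c>, <c>state</c>
/// and <c>dvConsumer</c> stand in for objects supplied by the indexing chain;
/// this is illustrative, not a verbatim call sequence from IndexWriter):
/// <code>
/// var writer = new SortedSetDocValuesWriter(fieldInfo, iwBytesUsed);
/// writer.AddValue(0, new BytesRef("apple"));
/// writer.AddValue(0, new BytesRef("banana")); // docs may hold several values
/// writer.AddValue(2, new BytesRef("apple"));  // doc 1 becomes a hole (no values)
/// writer.Finish(maxDoc);                      // fill trailing holes
/// writer.Flush(state, dvConsumer);            // write through the codec
/// </code>
/// Values must arrive in non-decreasing docID order.
/// </remarks>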
internal class SortedSetDocValuesWriter : DocValuesWriter
{
internal readonly BytesRefHash Hash;
private AppendingPackedLongBuffer Pending; // stream of all termIDs
private AppendingDeltaPackedLongBuffer PendingCounts; // number of unique termIDs per doc
private readonly Counter IwBytesUsed;
private long BytesUsed; // this only tracks differences in 'Pending' and 'PendingCounts'
private readonly FieldInfo FieldInfo;
private int CurrentDoc;
private int[] CurrentValues = new int[8];
private int CurrentUpto = 0;
private int MaxCount = 0;
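// Illustrative example (not from the original source): for doc0 = {"a", "b", "a"},
// doc1 = {} and doc2 = {"c"}, after all docs are finished the buffers hold
//   Pending       = [termID("a"), termID("b"), termID("c")] (per-doc deduped)
//   PendingCounts = [2, 0, 1]                               (unique values per doc)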
public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed)
{
this.FieldInfo = fieldInfo;
this.IwBytesUsed = iwBytesUsed;
Hash = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)), BytesRefHash.DEFAULT_CAPACITY, new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
Pending = new AppendingPackedLongBuffer(PackedInts.COMPACT);
PendingCounts = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT);
BytesUsed = Pending.RamBytesUsed() + PendingCounts.RamBytesUsed();
iwBytesUsed.AddAndGet(BytesUsed);
}
public virtual void AddValue(int docID, BytesRef value)
{
if (value == null)
{
throw new System.ArgumentException("field \"" + FieldInfo.Name + "\": null value not allowed");
}
if (value.Length > (ByteBlockPool.BYTE_BLOCK_SIZE - 2))
{
throw new System.ArgumentException("DocValuesField \"" + FieldInfo.Name + "\" is too large, must be <= " + (ByteBlockPool.BYTE_BLOCK_SIZE - 2));
}
if (docID != CurrentDoc)
{
FinishCurrentDoc();
}
// Fill in any holes:
while (CurrentDoc < docID)
{
PendingCounts.Add(0); // no values
CurrentDoc++;
}
AddOneValue(value);
UpdateBytesUsed();
}
// finalize currentDoc: this deduplicates the current term ids
private void FinishCurrentDoc()
{
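// sort this doc's termIDs so duplicates become adjacent and can be skipped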
Array.Sort(CurrentValues, 0, CurrentUpto);
int lastValue = -1;
int count = 0;
for (int i = 0; i < CurrentUpto; i++)
{
int termID = CurrentValues[i];
// if it's not a duplicate
if (termID != lastValue)
{
Pending.Add(termID); // record the term id
count++;
}
lastValue = termID;
}
// record the number of unique term ids for this doc
PendingCounts.Add(count);
MaxCount = Math.Max(MaxCount, count);
CurrentUpto = 0;
CurrentDoc++;
}
internal override void Finish(int maxDoc)
{
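// finish the doc currently being buffered before filling trailing holes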
FinishCurrentDoc();
// fill in any holes
for (int i = CurrentDoc; i < maxDoc; i++)
{
PendingCounts.Add(0); // no values
}
}
private void AddOneValue(BytesRef value)
{
int termID = Hash.Add(value);
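// Add returns a negative value when the bytes were already present;
// decode it back to the termID assigned on first insertion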
if (termID < 0)
{
termID = -termID - 1;
}
else
{
// reserve additional space for each unique value:
// 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
// TODO: can this same OOM happen in THPF?
// 2. when flushing, we need 1 int per value (slot in the ordMap).
IwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT);
}
if (CurrentUpto == CurrentValues.Length)
{
CurrentValues = ArrayUtil.Grow(CurrentValues, CurrentValues.Length + 1);
// reserve additional space for max # values per-doc
// when flushing, we need an int[] to sort the mapped-ords within the doc
IwBytesUsed.AddAndGet((CurrentValues.Length - CurrentUpto) * 2 * RamUsageEstimator.NUM_BYTES_INT);
}
CurrentValues[CurrentUpto] = termID;
CurrentUpto++;
}
private void UpdateBytesUsed()
{
long newBytesUsed = Pending.RamBytesUsed() + PendingCounts.RamBytesUsed();
IwBytesUsed.AddAndGet(newBytesUsed - BytesUsed);
BytesUsed = newBytesUsed;
}
internal override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
{
int maxDoc = state.SegmentInfo.DocCount;
int maxCountPerDoc = MaxCount;
Debug.Assert(PendingCounts.Size() == maxDoc);
int valueCount = Hash.Size();
int[] sortedValues = Hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
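// sortedValues[ord] gives the termID at sorted position ord; invert the
// mapping so ordMap[termID] yields the term's position in unicode order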
int[] ordMap = new int[valueCount];
for (int ord = 0; ord < valueCount; ord++)
{
ordMap[sortedValues[ord]] = ord;
}
dvConsumer.AddSortedSetField(FieldInfo,
// ord -> value
GetBytesRefEnumerable(valueCount, sortedValues),
// doc -> ordCount
GetOrdCountEnumerable(maxDoc),
// ords
GetOrdsEnumerable(maxCountPerDoc, ordMap));
}
internal override void Abort()
{
}
private IEnumerable<BytesRef> GetBytesRefEnumerable(int valueCount, int[] sortedValues)
{
for (int i = 0; i < valueCount; ++i)
{
var scratch = new BytesRef();
yield return Hash.Get(sortedValues[i], scratch);
}
}
private IEnumerable<long?> GetOrdCountEnumerable(int maxDoc)
{
AppendingDeltaPackedLongBuffer.Iterator iter = PendingCounts.GetIterator();
Debug.Assert(maxDoc == PendingCounts.Size(), "maxDoc: " + maxDoc + ", PendingCounts.Size(): " + PendingCounts.Size());
for (int i = 0; i < maxDoc; ++i)
{
yield return (int)iter.Next();
}
}
private IEnumerable<long?> GetOrdsEnumerable(int maxCountPerDoc, int[] ordMap)
{
int currentUpTo = 0, currentLength = 0;
AppendingPackedLongBuffer.Iterator iter = Pending.GetIterator();
AppendingDeltaPackedLongBuffer.Iterator counts = PendingCounts.GetIterator();
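// scratch buffer sized for the doc with the most unique values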
int[] currentDoc = new int[maxCountPerDoc];
for (long i = 0; i < Pending.Size(); ++i)
{
while (currentUpTo == currentLength)
{
// refill next doc, and sort remapped ords within the doc.
currentUpTo = 0;
currentLength = (int)counts.Next();
for (int j = 0; j < currentLength; j++)
{
currentDoc[j] = ordMap[(int)iter.Next()];
}
Array.Sort(currentDoc, 0, currentLength);
}
int ord = currentDoc[currentUpTo];
currentUpTo++;
yield return ord;
}
}
}
}