using Lucene.Net.Codecs;
using Lucene.Net.Diagnostics;
using Lucene.Net.Util;
using Lucene.Net.Util.Packed;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Index
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Buffers up pending <see cref="T:byte[]"/> per doc, dereferencing and sorting
    /// via int ord, then flushes when the segment flushes.
    /// </summary>
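    /// <remarks>
    /// Rough lifecycle, as driven by the indexing chain (a sketch of the flow, not the
    /// exact call sites): <see cref="AddValue"/> once per document that has the field,
    /// <see cref="Finish"/> to pad documents without a value, then <see cref="Flush"/>
    /// at segment flush time.
    /// </remarks>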
    internal class SortedDocValuesWriter : DocValuesWriter
    {
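        // Deduplicates incoming values and assigns each unique value an insertion-order termID (ord).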
        internal readonly BytesRefHash hash;
        private readonly AppendingDeltaPackedInt64Buffer pending; // LUCENENET: marked readonly
        private readonly Counter iwBytesUsed;
        private long bytesUsed; // this currently only tracks differences in 'pending'
        private readonly FieldInfo fieldInfo;
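        // Sentinel ord recorded for documents that have no value for this field.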
        private const int EMPTY_ORD = -1;

        public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed)
        {
            this.fieldInfo = fieldInfo;
            this.iwBytesUsed = iwBytesUsed;
            hash = new BytesRefHash(
                new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
                BytesRefHash.DEFAULT_CAPACITY,
                new BytesRefHash.DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
            pending = new AppendingDeltaPackedInt64Buffer(PackedInt32s.COMPACT);
            bytesUsed = pending.RamBytesUsed();
            iwBytesUsed.AddAndGet(bytesUsed);
        }

        public virtual void AddValue(int docID, BytesRef value)
        {
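            // pending.Count is the next docID expected; a smaller docID means this
            // document already contributed a value to the field.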
            if (docID < pending.Count)
            {
                throw new ArgumentException("DocValuesField \"" + fieldInfo.Name + "\" appears more than once in this document (only one value is allowed per field)");
            }
            if (value == null)
            {
                throw new ArgumentException("field \"" + fieldInfo.Name + "\": null value not allowed");
            }
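            // Each value must fit within a single ByteBlockPool block; 2 bytes are
            // reserved for the per-value length prefix written by BytesRefHash.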
            if (value.Length > (ByteBlockPool.BYTE_BLOCK_SIZE - 2))
            {
                throw new ArgumentException("DocValuesField \"" + fieldInfo.Name + "\" is too large, must be <= " + (ByteBlockPool.BYTE_BLOCK_SIZE - 2));
            }

            // Fill in any holes:
            while (pending.Count < docID)
            {
                pending.Add(EMPTY_ORD);
            }

            AddOneValue(value);
        }

        public override void Finish(int maxDoc)
        {
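            // Pad out any trailing documents that had no value with the missing-value sentinel.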
            while (pending.Count < maxDoc)
            {
                pending.Add(EMPTY_ORD);
            }
            UpdateBytesUsed();
        }

        private void AddOneValue(BytesRef value)
        {
            int termID = hash.Add(value);
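            // BytesRefHash.Add returns -(termID + 1) when the value already exists;
            // the branch below recovers the original termID.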
            if (termID < 0)
            {
                termID = -termID - 1;
            }
            else
            {
                // reserve additional space for each unique value:
                // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
                //    TODO: can this same OOM happen in THPF?
                // 2. when flushing, we need 1 int per value (slot in the ordMap).
                iwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT32);
            }
            pending.Add(termID);
            UpdateBytesUsed();
        }

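        // Publishes any change in the RAM used by 'pending' to the IndexWriter's byte accounting.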
        private void UpdateBytesUsed()
        {
            long newBytesUsed = pending.RamBytesUsed();
            iwBytesUsed.AddAndGet(newBytesUsed - bytesUsed);
            bytesUsed = newBytesUsed;
        }

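        // Sorts the unique values, builds the old-ord -> sorted-ord mapping, and hands
        // both views to the codec's DocValuesConsumer.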
        [MethodImpl(MethodImplOptions.NoInlining)]
        public override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
        {
            int maxDoc = state.SegmentInfo.DocCount;
            if (Debugging.AssertsEnabled) Debugging.Assert(pending.Count == maxDoc);
            int valueCount = hash.Count;

            int[] sortedValues = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
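            // Invert sortedValues: ordMap[termID] is the position of that value in sorted order.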
            int[] ordMap = new int[valueCount];
            for (int ord = 0; ord < valueCount; ord++)
            {
                ordMap[sortedValues[ord]] = ord;
            }

            dvConsumer.AddSortedField(fieldInfo, GetBytesRefEnumerable(valueCount, sortedValues),
                // doc -> ord
                GetOrdsEnumerable(maxDoc, ordMap));
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public override void Abort()
        {
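            // Intentionally a no-op: buffered state is simply discarded with this writer.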
        }

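        // Lazily yields the unique values in sorted order. The single scratch BytesRef is
        // reused across iterations, so each value must be consumed before advancing.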
        private IEnumerable<BytesRef> GetBytesRefEnumerable(int valueCount, int[] sortedValues)
        {
            var scratch = new BytesRef();
            for (int i = 0; i < valueCount; ++i)
            {
                yield return hash.Get(sortedValues[i], scratch);
            }
        }

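        // Lazily yields one ord per document, remapped to sorted order; EMPTY_ORD (-1)
        // is passed through for documents that have no value.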
        private IEnumerable<long?> GetOrdsEnumerable(int maxDoc, int[] ordMap)
        {
            AppendingDeltaPackedInt64Buffer.Iterator iter = pending.GetIterator();
            if (Debugging.AssertsEnabled) Debugging.Assert(pending.Count == maxDoc);
            for (int i = 0; i < maxDoc; ++i)
            {
                int ord = (int)iter.Next();
                yield return ord == EMPTY_ORD ? ord : ordMap[ord];
            }
        }
    }
}