using Lucene.Net.Codecs;
using Lucene.Net.Diagnostics;
using Lucene.Net.Util;
using Lucene.Net.Util.Packed;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Index
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Buffers up pending <see cref="T:byte[]"/> per doc, dereferencing and sorting
    /// via int ord, then flushes when the segment flushes.
    /// </summary>
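    /// <remarks>
    /// Rough lifecycle, as driven by the indexing chain (a sketch of the flow, not the
    /// exact call sites): <see cref="AddValue"/> once per document that has the field,
    /// <see cref="Finish"/> to pad documents without a value, then <see cref="Flush"/>
    /// at segment flush time.
    /// </remarks>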
    internal class SortedDocValuesWriter : DocValuesWriter
    {
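        // Deduplicates incoming values and assigns each unique value an insertion-order termID (ord).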
        internal readonly BytesRefHash hash;
        private readonly AppendingDeltaPackedInt64Buffer pending; // LUCENENET: marked readonly
        private readonly Counter iwBytesUsed;
        private long bytesUsed; // this currently only tracks differences in 'pending'
        private readonly FieldInfo fieldInfo;
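        // Sentinel ord recorded for documents that have no value for this field.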
        private const int EMPTY_ORD = -1;

        public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed)
        {
            this.fieldInfo = fieldInfo;
            this.iwBytesUsed = iwBytesUsed;
            hash = new BytesRefHash(
                new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
                BytesRefHash.DEFAULT_CAPACITY,
                new BytesRefHash.DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
            pending = new AppendingDeltaPackedInt64Buffer(PackedInt32s.COMPACT);
            bytesUsed = pending.RamBytesUsed();
            iwBytesUsed.AddAndGet(bytesUsed);
        }

        public virtual void AddValue(int docID, BytesRef value)
        {
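            // pending.Count is the next docID expected; a smaller docID means this
            // document already contributed a value to the field.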
            if (docID < pending.Count)
            {
                throw new ArgumentException("DocValuesField \"" + fieldInfo.Name + "\" appears more than once in this document (only one value is allowed per field)");
            }
            if (value == null)
            {
                throw new ArgumentException("field \"" + fieldInfo.Name + "\": null value not allowed");
            }
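            // Each value must fit within a single ByteBlockPool block; 2 bytes are
            // reserved for the per-value length prefix written by BytesRefHash.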
            if (value.Length > (ByteBlockPool.BYTE_BLOCK_SIZE - 2))
            {
                throw new ArgumentException("DocValuesField \"" + fieldInfo.Name + "\" is too large, must be <= " + (ByteBlockPool.BYTE_BLOCK_SIZE - 2));
            }

            // Fill in any holes:
            while (pending.Count < docID)
            {
                pending.Add(EMPTY_ORD);
            }

            AddOneValue(value);
        }

        public override void Finish(int maxDoc)
        {
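            // Pad out any trailing documents that had no value with the missing-value sentinel.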
            while (pending.Count < maxDoc)
            {
                pending.Add(EMPTY_ORD);
            }
            UpdateBytesUsed();
        }

        private void AddOneValue(BytesRef value)
        {
            int termID = hash.Add(value);
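            // BytesRefHash.Add returns -(termID + 1) when the value already exists;
            // the branch below recovers the original termID.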
            if (termID < 0)
            {
                termID = -termID - 1;
            }
            else
            {
                // reserve additional space for each unique value:
                // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
                //    TODO: can this same OOM happen in THPF?
                // 2. when flushing, we need 1 int per value (slot in the ordMap).
                iwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT32);
            }
            pending.Add(termID);
            UpdateBytesUsed();
        }

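        // Publishes any change in the RAM used by 'pending' to the IndexWriter's byte accounting.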
        private void UpdateBytesUsed()
        {
            long newBytesUsed = pending.RamBytesUsed();
            iwBytesUsed.AddAndGet(newBytesUsed - bytesUsed);
            bytesUsed = newBytesUsed;
        }

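        // Sorts the unique values, builds the old-ord -> sorted-ord mapping, and hands
        // both views to the codec's DocValuesConsumer.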
        [MethodImpl(MethodImplOptions.NoInlining)]
        public override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
        {
            int maxDoc = state.SegmentInfo.DocCount;
            if (Debugging.AssertsEnabled) Debugging.Assert(pending.Count == maxDoc);
            int valueCount = hash.Count;

            int[] sortedValues = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
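            // Invert sortedValues: ordMap[termID] is the position of that value in sorted order.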
            int[] ordMap = new int[valueCount];
            for (int ord = 0; ord < valueCount; ord++)
            {
                ordMap[sortedValues[ord]] = ord;
            }

            dvConsumer.AddSortedField(fieldInfo, GetBytesRefEnumerable(valueCount, sortedValues),
                // doc -> ord
                GetOrdsEnumerable(maxDoc, ordMap));
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public override void Abort()
        {
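            // Intentionally a no-op: buffered state is simply discarded with this writer.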
        }

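        // Lazily yields the unique values in sorted order. The single scratch BytesRef is
        // reused across iterations, so each value must be consumed before advancing.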
        private IEnumerable<BytesRef> GetBytesRefEnumerable(int valueCount, int[] sortedValues)
        {
            var scratch = new BytesRef();
            for (int i = 0; i < valueCount; ++i)
            {
                yield return hash.Get(sortedValues[i], scratch);
            }
        }

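        // Lazily yields one ord per document, remapped to sorted order; EMPTY_ORD (-1)
        // is passed through for documents that have no value.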
        private IEnumerable<long?> GetOrdsEnumerable(int maxDoc, int[] ordMap)
        {
            AppendingDeltaPackedInt64Buffer.Iterator iter = pending.GetIterator();
            if (Debugging.AssertsEnabled) Debugging.Assert(pending.Count == maxDoc);
            for (int i = 0; i < maxDoc; ++i)
            {
                int ord = (int)iter.Next();
                yield return ord == EMPTY_ORD ? ord : ordMap[ord];
            }
        }
    }
}