using J2N.Collections.Generic.Extensions;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using Codec = Lucene.Net.Codecs.Codec;
using Directory = Lucene.Net.Store.Directory;
using DocValuesConsumer = Lucene.Net.Codecs.DocValuesConsumer;
using FieldInfosWriter = Lucene.Net.Codecs.FieldInfosWriter;
using FieldsConsumer = Lucene.Net.Codecs.FieldsConsumer;
using IBits = Lucene.Net.Util.IBits;
using InfoStream = Lucene.Net.Util.InfoStream;
using IOContext = Lucene.Net.Store.IOContext;
using IOUtils = Lucene.Net.Util.IOUtils;
using StoredFieldsWriter = Lucene.Net.Codecs.StoredFieldsWriter;
using TermVectorsWriter = Lucene.Net.Codecs.TermVectorsWriter;
/// <summary>
/// The <see cref="SegmentMerger"/> class combines two or more Segments, represented by an
/// <see cref="IndexReader"/>, into a single Segment. Call the <see cref="Merge()"/> method
/// to combine the segments.
/// </summary>
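/// <remarks>
/// A minimal sketch of the call sequence (illustrative only, with placeholder arguments;
/// in practice the merge machinery in <see cref="IndexWriter"/> drives this type and also
/// handles aborts, diagnostics, and file tracking):
/// <code>
/// var merger = new SegmentMerger(readers, segmentInfo, infoStream, dir,
///     termIndexInterval, checkAbort, fieldNumbers, context, validate: true);
/// if (merger.ShouldMerge)
/// {
///     MergeState mergeState = merger.Merge();
/// }
/// </code>
/// </remarks>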
/// <seealso cref="Merge()"/>
internal sealed class SegmentMerger
{
private readonly Directory directory;
private readonly int termIndexInterval;
private readonly Codec codec;
private readonly IOContext context;
private readonly MergeState mergeState;
private readonly FieldInfos.Builder fieldInfosBuilder;
// NOTE: just like in the codec APIs, Directory 'dir' is NOT the same as segmentInfo.dir!!
internal SegmentMerger(IList<AtomicReader> readers, SegmentInfo segmentInfo, InfoStream infoStream, Directory dir, int termIndexInterval, CheckAbort checkAbort, FieldInfos.FieldNumbers fieldNumbers, IOContext context, bool validate)
{
// validate incoming readers
if (validate)
{
foreach (AtomicReader reader in readers)
{
reader.CheckIntegrity();
}
}
mergeState = new MergeState(readers, segmentInfo, infoStream, checkAbort);
directory = dir;
this.termIndexInterval = termIndexInterval;
this.codec = segmentInfo.Codec;
this.context = context;
this.fieldInfosBuilder = new FieldInfos.Builder(fieldNumbers);
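// Build the per-reader doc maps (accounting for deletions) and record the
// total number of live documents as the merged segment's doc count.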
mergeState.SegmentInfo.DocCount = SetDocMaps();
}
/// <summary>
/// <c>True</c> if any merging should happen </summary>
internal bool ShouldMerge => mergeState.SegmentInfo.DocCount > 0;
/// <summary>
/// Merges the readers into the directory passed to the constructor </summary>
/// <returns> The number of documents that were merged </returns>
/// <exception cref="CorruptIndexException"> if the index is corrupt </exception>
/// <exception cref="IOException"> if there is a low-level IO error </exception>
[MethodImpl(MethodImplOptions.NoInlining)]
internal MergeState Merge()
{
if (!ShouldMerge)
{
throw new InvalidOperationException("Merge would result in 0 document segment");
}
// NOTE: it's important to add calls to
// checkAbort.work(...) if you make any changes to this
// method that will spend a lot of time. The frequency
// of this check impacts how long
// IndexWriter.close(false) takes to actually stop the
// threads.
MergeFieldInfos();
SetMatchingSegmentReaders();
long t0 = 0;
if (mergeState.InfoStream.IsEnabled("SM"))
{
t0 = Time.NanoTime();
}
int numMerged = MergeFields();
if (mergeState.InfoStream.IsEnabled("SM"))
{
long t1 = Time.NanoTime();
mergeState.InfoStream.Message("SM", ((t1 - t0) / 1000000) + " msec to merge stored fields [" + numMerged + " docs]");
}
if (Debugging.AssertsEnabled) Debugging.Assert(numMerged == mergeState.SegmentInfo.DocCount);
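// This SegmentWriteState is shared by the postings, doc values, and norms
// consumers that write the remaining parts of the merged segment below.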
SegmentWriteState segmentWriteState = new SegmentWriteState(mergeState.InfoStream, directory, mergeState.SegmentInfo, mergeState.FieldInfos, termIndexInterval, null, context);
if (mergeState.InfoStream.IsEnabled("SM"))
{
t0 = Time.NanoTime();
}
MergeTerms(segmentWriteState);
if (mergeState.InfoStream.IsEnabled("SM"))
{
long t1 = Time.NanoTime();
mergeState.InfoStream.Message("SM", ((t1 - t0) / 1000000) + " msec to merge postings [" + numMerged + " docs]");
}
if (mergeState.InfoStream.IsEnabled("SM"))
{
t0 = Time.NanoTime();
}
if (mergeState.FieldInfos.HasDocValues)
{
MergeDocValues(segmentWriteState);
}
if (mergeState.InfoStream.IsEnabled("SM"))
{
long t1 = Time.NanoTime();
mergeState.InfoStream.Message("SM", ((t1 - t0) / 1000000) + " msec to merge doc values [" + numMerged + " docs]");
}
if (mergeState.FieldInfos.HasNorms)
{
if (mergeState.InfoStream.IsEnabled("SM"))
{
t0 = Time.NanoTime();
}
MergeNorms(segmentWriteState);
if (mergeState.InfoStream.IsEnabled("SM"))
{
long t1 = Time.NanoTime();
mergeState.InfoStream.Message("SM", ((t1 - t0) / 1000000) + " msec to merge norms [" + numMerged + " docs]");
}
}
if (mergeState.FieldInfos.HasVectors)
{
if (mergeState.InfoStream.IsEnabled("SM"))
{
t0 = Time.NanoTime();
}
numMerged = MergeVectors();
if (mergeState.InfoStream.IsEnabled("SM"))
{
long t1 = Time.NanoTime();
mergeState.InfoStream.Message("SM", ((t1 - t0) / 1000000) + " msec to merge vectors [" + numMerged + " docs]");
}
if (Debugging.AssertsEnabled) Debugging.Assert(numMerged == mergeState.SegmentInfo.DocCount);
}
// write the merged infos
FieldInfosWriter fieldInfosWriter = codec.FieldInfosFormat.FieldInfosWriter;
fieldInfosWriter.Write(directory, mergeState.SegmentInfo.Name, "", mergeState.FieldInfos, context);
return mergeState;
}
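/// <summary>
/// Merges the doc values of every field that has them, dispatching to the
/// appropriate consumer method per <see cref="DocValuesType"/>. </summary>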
private void MergeDocValues(SegmentWriteState segmentWriteState)
{
DocValuesConsumer consumer = codec.DocValuesFormat.FieldsConsumer(segmentWriteState);
bool success = false;
try
{
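// For each doc-values field, collect exactly one producer per reader; a reader
// that lacks the field contributes an empty instance (plus MatchNoBits for the
// numeric/binary docs-with-field lists) so the consumer sees one source per reader.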
foreach (FieldInfo field in mergeState.FieldInfos)
{
DocValuesType type = field.DocValuesType;
if (type != DocValuesType.NONE)
{
if (type == DocValuesType.NUMERIC)
{
IList<NumericDocValues> toMerge = new List<NumericDocValues>();
IList<IBits> docsWithField = new List<IBits>();
foreach (AtomicReader reader in mergeState.Readers)
{
NumericDocValues values = reader.GetNumericDocValues(field.Name);
IBits bits = reader.GetDocsWithField(field.Name);
if (values == null)
{
values = DocValues.EMPTY_NUMERIC;
bits = new Lucene.Net.Util.Bits.MatchNoBits(reader.MaxDoc);
}
toMerge.Add(values);
docsWithField.Add(bits);
}
consumer.MergeNumericField(field, mergeState, toMerge, docsWithField);
}
else if (type == DocValuesType.BINARY)
{
IList<BinaryDocValues> toMerge = new List<BinaryDocValues>();
IList<IBits> docsWithField = new List<IBits>();
foreach (AtomicReader reader in mergeState.Readers)
{
BinaryDocValues values = reader.GetBinaryDocValues(field.Name);
IBits bits = reader.GetDocsWithField(field.Name);
if (values == null)
{
values = DocValues.EMPTY_BINARY;
bits = new Lucene.Net.Util.Bits.MatchNoBits(reader.MaxDoc);
}
toMerge.Add(values);
docsWithField.Add(bits);
}
consumer.MergeBinaryField(field, mergeState, toMerge, docsWithField);
}
else if (type == DocValuesType.SORTED)
{
IList<SortedDocValues> toMerge = new List<SortedDocValues>();
foreach (AtomicReader reader in mergeState.Readers)
{
SortedDocValues values = reader.GetSortedDocValues(field.Name);
if (values == null)
{
values = DocValues.EMPTY_SORTED;
}
toMerge.Add(values);
}
consumer.MergeSortedField(field, mergeState, toMerge);
}
else if (type == DocValuesType.SORTED_SET)
{
IList<SortedSetDocValues> toMerge = new List<SortedSetDocValues>();
foreach (AtomicReader reader in mergeState.Readers)
{
SortedSetDocValues values = reader.GetSortedSetDocValues(field.Name);
if (values == null)
{
values = DocValues.EMPTY_SORTED_SET;
}
toMerge.Add(values);
}
consumer.MergeSortedSetField(field, mergeState, toMerge);
}
else
{
throw new InvalidOperationException("type=" + type);
}
}
}
success = true;
}
finally
{
if (success)
{
IOUtils.Dispose(consumer);
}
else
{
IOUtils.DisposeWhileHandlingException(consumer);
}
}
}
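/// <summary>
/// Merges the norms of every field that has them into the merged segment. </summary>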
private void MergeNorms(SegmentWriteState segmentWriteState)
{
DocValuesConsumer consumer = codec.NormsFormat.NormsConsumer(segmentWriteState);
bool success = false;
try
{
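// Norms are dense (every document has a value), so docsWithField is always
// MatchAllBits; a reader that has no norms for the field contributes EMPTY_NUMERIC.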
foreach (FieldInfo field in mergeState.FieldInfos)
{
if (field.HasNorms)
{
IList<NumericDocValues> toMerge = new List<NumericDocValues>();
IList<IBits> docsWithField = new List<IBits>();
foreach (AtomicReader reader in mergeState.Readers)
{
NumericDocValues norms = reader.GetNormValues(field.Name);
if (norms == null)
{
norms = DocValues.EMPTY_NUMERIC;
}
toMerge.Add(norms);
docsWithField.Add(new Lucene.Net.Util.Bits.MatchAllBits(reader.MaxDoc));
}
consumer.MergeNumericField(field, mergeState, toMerge, docsWithField);
}
}
success = true;
}
finally
{
if (success)
{
IOUtils.Dispose(consumer);
}
else
{
IOUtils.DisposeWhileHandlingException(consumer);
}
}
}
private void SetMatchingSegmentReaders()
{
// If the i'th reader is a SegmentReader and has
// identical fieldName -> number mapping, then this
// array will be non-null at position i:
int numReaders = mergeState.Readers.Count;
mergeState.MatchingSegmentReaders = new SegmentReader[numReaders];
// If this reader is a SegmentReader, and all of its
// field name -> number mappings match the "merged"
// FieldInfos, then we can do a bulk copy of the
// stored fields:
for (int i = 0; i < numReaders; i++)
{
AtomicReader reader = mergeState.Readers[i];
// TODO: we may be able to broaden this to
// non-SegmentReaders, since FieldInfos is now
// required? But... this'd also require exposing
// bulk-copy (TVs and stored fields) API in foreign
// readers..
if (reader is SegmentReader segmentReader)
{
bool same = true;
FieldInfos segmentFieldInfos = segmentReader.FieldInfos;
foreach (FieldInfo fi in segmentFieldInfos)
{
FieldInfo other = mergeState.FieldInfos.FieldInfo(fi.Number);
if (other == null || !other.Name.Equals(fi.Name, StringComparison.Ordinal))
{
same = false;
break;
}
}
if (same)
{
mergeState.MatchingSegmentReaders[i] = segmentReader;
mergeState.MatchedCount++;
}
}
}
if (mergeState.InfoStream.IsEnabled("SM"))
{
mergeState.InfoStream.Message("SM", "merge store matchedCount=" + mergeState.MatchedCount + " vs " + mergeState.Readers.Count);
if (mergeState.MatchedCount != mergeState.Readers.Count)
{
mergeState.InfoStream.Message("SM", "" + (mergeState.Readers.Count - mergeState.MatchedCount) + " non-bulk merges");
}
}
}
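/// <summary>
/// Adds the <see cref="FieldInfo"/>s of every reader to the builder and stores the
/// combined result on <see cref="MergeState.FieldInfos"/>. </summary>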
public void MergeFieldInfos()
{
foreach (AtomicReader reader in mergeState.Readers)
{
FieldInfos readerFieldInfos = reader.FieldInfos;
foreach (FieldInfo fi in readerFieldInfos)
{
fieldInfosBuilder.Add(fi);
}
}
mergeState.FieldInfos = fieldInfosBuilder.Finish();
}
/// <summary>
/// Merges the stored fields from each of the segments into the new one. </summary>
/// <returns> The number of documents in all of the readers </returns>
/// <exception cref="CorruptIndexException"> if the index is corrupt </exception>
/// <exception cref="IOException"> if there is a low-level IO error </exception>
private int MergeFields()
{
StoredFieldsWriter fieldsWriter = codec.StoredFieldsFormat.FieldsWriter(directory, mergeState.SegmentInfo, context);
try
{
return fieldsWriter.Merge(mergeState);
}
finally
{
fieldsWriter.Dispose();
}
}
/// <summary>
/// Merge the TermVectors from each of the segments into the new one. </summary>
/// <exception cref="IOException"> if there is a low-level IO error </exception>
private int MergeVectors()
{
TermVectorsWriter termVectorsWriter = codec.TermVectorsFormat.VectorsWriter(directory, mergeState.SegmentInfo, context);
try
{
return termVectorsWriter.Merge(mergeState);
}
finally
{
termVectorsWriter.Dispose();
}
}
// Builds the per-reader doc maps and doc bases used to remap docIDs into the merged segment.
private int SetDocMaps()
{
int numReaders = mergeState.Readers.Count;
// Remap docIDs
mergeState.DocMaps = new MergeState.DocMap[numReaders];
mergeState.DocBase = new int[numReaders];
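// docBase accumulates each reader's live (non-deleted) document count; the
// final total is returned and becomes the merged segment's doc count.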
int docBase = 0;
int i = 0;
while (i < mergeState.Readers.Count)
{
AtomicReader reader = mergeState.Readers[i];
mergeState.DocBase[i] = docBase;
MergeState.DocMap docMap = MergeState.DocMap.Build(reader);
mergeState.DocMaps[i] = docMap;
docBase += docMap.NumDocs;
i++;
}
return docBase;
}
[MethodImpl(MethodImplOptions.NoInlining)]
private void MergeTerms(SegmentWriteState segmentWriteState)
{
IList<Fields> fields = new List<Fields>();
IList<ReaderSlice> slices = new List<ReaderSlice>();
int docBase = 0;
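// Collect the Fields of each reader that has postings, along with a ReaderSlice
// recording its doc base and maxDoc, so all readers can be presented to the
// consumer as a single MultiFields view.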
for (int readerIndex = 0; readerIndex < mergeState.Readers.Count; readerIndex++)
{
AtomicReader reader = mergeState.Readers[readerIndex];
Fields f = reader.Fields;
int maxDoc = reader.MaxDoc;
if (f != null)
{
slices.Add(new ReaderSlice(docBase, maxDoc, readerIndex));
fields.Add(f);
}
docBase += maxDoc;
}
FieldsConsumer consumer = codec.PostingsFormat.FieldsConsumer(segmentWriteState);
bool success = false;
try
{
consumer.Merge(mergeState, new MultiFields(fields.ToArray(/*Fields.EMPTY_ARRAY*/), slices.ToArray(/*ReaderSlice.EMPTY_ARRAY*/)));
success = true;
}
finally
{
if (success)
{
IOUtils.Dispose(consumer);
}
else
{
IOUtils.DisposeWhileHandlingException(consumer);
}
}
}
}
}