using System.Collections.Generic;
using System.Diagnostics;
namespace Lucene.Net.Index
{
using AppendingPackedLongBuffer = Lucene.Net.Util.Packed.AppendingPackedLongBuffer;
using Bits = Lucene.Net.Util.Bits;
using BytesRef = Lucene.Net.Util.BytesRef;
using MonotonicAppendingLongBuffer = Lucene.Net.Util.Packed.MonotonicAppendingLongBuffer;
using PackedInts = Lucene.Net.Util.Packed.PackedInts;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using TermsEnumIndex = Lucene.Net.Index.MultiTermsEnum.TermsEnumIndex;
using TermsEnumWithSlice = Lucene.Net.Index.MultiTermsEnum.TermsEnumWithSlice;
/// <summary>
/// A wrapper for CompositeIndexReader providing access to DocValues.
///
/// <p><b>NOTE</b>: for multi readers, you'll get better
/// performance by gathering the sub readers using
/// <seealso cref="IndexReader#getContext()"/> to get the
/// atomic leaves and then operating per-AtomicReader,
/// instead of using this class.
///
/// <p><b>NOTE</b>: this is very costly.
///
/// @lucene.experimental
/// @lucene.internal
/// </summary>
public class MultiDocValues
{
/// <summary>
/// No instantiation </summary>
private MultiDocValues()
{
}
/// <summary>
/// Returns a NumericDocValues for a reader's norms (potentially merging on-the-fly).
/// <p>
/// This is a slow way to access normalization values. Instead, access them per-segment
/// with <seealso cref="AtomicReader#getNormValues(String)"/>
/// </p>
/// </summary>
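/// <example>
/// A minimal usage sketch (not part of the original sources); <c>reader</c> is assumed to be an
/// already-open composite <c>IndexReader</c>, and <c>"body"</c> and <c>docID</c> are hypothetical placeholders.
/// <code>
/// NumericDocValues norms = MultiDocValues.GetNormValues(reader, "body");
/// if (norms != null)
/// {
///     // docID is a top-level (composite) document id; the wrapper maps it to the right leaf
///     long norm = norms.Get(docID);
/// }
/// </code>
/// </example>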
public static NumericDocValues GetNormValues(IndexReader r, string field)
{
IList<AtomicReaderContext> leaves = r.Leaves;
int size = leaves.Count;
if (size == 0)
{
return null;
}
else if (size == 1)
{
return leaves[0].AtomicReader.GetNormValues(field);
}
FieldInfo fi = MultiFields.GetMergedFieldInfos(r).FieldInfo(field);
if (fi == null || !fi.HasNorms())
{
return null;
}
bool anyReal = false;
NumericDocValues[] values = new NumericDocValues[size];
int[] starts = new int[size + 1];
for (int i = 0; i < size; i++)
{
AtomicReaderContext context = leaves[i];
NumericDocValues v = context.AtomicReader.GetNormValues(field);
if (v == null)
{
v = DocValues.EMPTY_NUMERIC;
}
else
{
anyReal = true;
}
values[i] = v;
starts[i] = context.DocBase;
}
starts[size] = r.MaxDoc;
Debug.Assert(anyReal);
return new NumericDocValuesAnonymousInnerClassHelper(values, starts);
}
private class NumericDocValuesAnonymousInnerClassHelper : NumericDocValues
{
private Lucene.Net.Index.NumericDocValues[] Values;
private int[] Starts;
public NumericDocValuesAnonymousInnerClassHelper(Lucene.Net.Index.NumericDocValues[] values, int[] starts)
{
this.Values = values;
this.Starts = starts;
}
public override long Get(int docID)
{
int subIndex = ReaderUtil.SubIndex(docID, Starts);
return Values[subIndex].Get(docID - Starts[subIndex]);
}
}
/// <summary>
/// Returns a NumericDocValues for a reader's docvalues (potentially merging on-the-fly)
/// <p>
/// This is a slow way to access numeric values. Instead, access them per-segment
/// with <seealso cref="AtomicReader#getNumericDocValues(String)"/>
/// </p>
///
/// </summary>
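/// <example>
/// A hedged sketch of typical usage; <c>reader</c>, <c>"price"</c> and <c>docID</c> are
/// hypothetical placeholders, not part of the original sources.
/// <code>
/// NumericDocValues dv = MultiDocValues.GetNumericValues(reader, "price");
/// if (dv != null) // null means no leaf had real values for this field
/// {
///     long value = dv.Get(docID); // docID is in the composite doc-id space
/// }
/// </code>
/// </example>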
public static NumericDocValues GetNumericValues(IndexReader r, string field)
{
IList<AtomicReaderContext> leaves = r.Leaves;
int size = leaves.Count;
if (size == 0)
{
return null;
}
else if (size == 1)
{
return leaves[0].AtomicReader.GetNumericDocValues(field);
}
bool anyReal = false;
NumericDocValues[] values = new NumericDocValues[size];
int[] starts = new int[size + 1];
for (int i = 0; i < size; i++)
{
AtomicReaderContext context = leaves[i];
NumericDocValues v = context.AtomicReader.GetNumericDocValues(field);
if (v == null)
{
v = DocValues.EMPTY_NUMERIC;
}
else
{
anyReal = true;
}
values[i] = v;
starts[i] = context.DocBase;
}
starts[size] = r.MaxDoc;
if (!anyReal)
{
return null;
}
else
{
return new NumericDocValuesAnonymousInnerClassHelper2(values, starts);
}
}
private class NumericDocValuesAnonymousInnerClassHelper2 : NumericDocValues
{
private Lucene.Net.Index.NumericDocValues[] Values;
private int[] Starts;
public NumericDocValuesAnonymousInnerClassHelper2(Lucene.Net.Index.NumericDocValues[] values, int[] starts)
{
this.Values = values;
this.Starts = starts;
}
public override long Get(int docID)
{
int subIndex = ReaderUtil.SubIndex(docID, Starts);
return Values[subIndex].Get(docID - Starts[subIndex]);
}
}
/// <summary>
/// Returns a Bits for a reader's docsWithField (potentially merging on-the-fly)
/// <p>
/// This is a slow way to access this bitset. Instead, access it per-segment
/// with <seealso cref="AtomicReader#getDocsWithField(String)"/>
/// </p>
///
/// </summary>
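/// <example>
/// A minimal sketch combining the merged bitset with the merged values; <c>reader</c>,
/// <c>"price"</c> and <c>docID</c> are hypothetical placeholders, and <c>dv</c> is assumed
/// non-null for brevity.
/// <code>
/// Bits docsWithField = MultiDocValues.GetDocsWithField(reader, "price");
/// NumericDocValues dv = MultiDocValues.GetNumericValues(reader, "price");
/// if (docsWithField != null)
/// {
///     // Get() distinguishes a stored 0 from a document that has no value at all
///     bool hasValue = docsWithField.Get(docID);
///     long value = hasValue ? dv.Get(docID) : 0;
/// }
/// </code>
/// </example>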
public static Bits GetDocsWithField(IndexReader r, string field)
{
IList<AtomicReaderContext> leaves = r.Leaves;
int size = leaves.Count;
if (size == 0)
{
return null;
}
else if (size == 1)
{
return leaves[0].AtomicReader.GetDocsWithField(field);
}
bool anyReal = false;
bool anyMissing = false;
Bits[] values = new Bits[size];
int[] starts = new int[size + 1];
for (int i = 0; i < size; i++)
{
AtomicReaderContext context = leaves[i];
Bits v = context.AtomicReader.GetDocsWithField(field);
if (v == null)
{
v = new Lucene.Net.Util.Bits_MatchNoBits(context.Reader.MaxDoc);
anyMissing = true;
}
else
{
anyReal = true;
if (!(v is Lucene.Net.Util.Bits_MatchAllBits))
{
anyMissing = true;
}
}
values[i] = v;
starts[i] = context.DocBase;
}
starts[size] = r.MaxDoc;
if (!anyReal)
{
return null;
}
else if (!anyMissing)
{
return new Lucene.Net.Util.Bits_MatchAllBits(r.MaxDoc);
}
else
{
return new MultiBits(values, starts, false);
}
}
/// <summary>
/// Returns a BinaryDocValues for a reader's docvalues (potentially merging on-the-fly)
/// <p>
/// This is a slow way to access binary values. Instead, access them per-segment
/// with <seealso cref="AtomicReader#getBinaryDocValues(String)"/>
/// </p>
/// </summary>
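/// <example>
/// A minimal sketch; <c>reader</c>, <c>"payload"</c> and <c>docID</c> are hypothetical placeholders.
/// <code>
/// BinaryDocValues dv = MultiDocValues.GetBinaryValues(reader, "payload");
/// if (dv != null)
/// {
///     BytesRef scratch = new BytesRef();
///     dv.Get(docID, scratch); // scratch now holds the bytes stored for this document
/// }
/// </code>
/// </example>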
public static BinaryDocValues GetBinaryValues(IndexReader r, string field)
{
IList<AtomicReaderContext> leaves = r.Leaves;
int size = leaves.Count;
if (size == 0)
{
return null;
}
else if (size == 1)
{
return leaves[0].AtomicReader.GetBinaryDocValues(field);
}
bool anyReal = false;
BinaryDocValues[] values = new BinaryDocValues[size];
int[] starts = new int[size + 1];
for (int i = 0; i < size; i++)
{
AtomicReaderContext context = leaves[i];
BinaryDocValues v = context.AtomicReader.GetBinaryDocValues(field);
if (v == null)
{
v = DocValues.EMPTY_BINARY;
}
else
{
anyReal = true;
}
values[i] = v;
starts[i] = context.DocBase;
}
starts[size] = r.MaxDoc;
if (!anyReal)
{
return null;
}
else
{
return new BinaryDocValuesAnonymousInnerClassHelper(values, starts);
}
}
private class BinaryDocValuesAnonymousInnerClassHelper : BinaryDocValues
{
private Lucene.Net.Index.BinaryDocValues[] Values;
private int[] Starts;
public BinaryDocValuesAnonymousInnerClassHelper(Lucene.Net.Index.BinaryDocValues[] values, int[] starts)
{
this.Values = values;
this.Starts = starts;
}
public override void Get(int docID, BytesRef result)
{
int subIndex = ReaderUtil.SubIndex(docID, Starts);
Values[subIndex].Get(docID - Starts[subIndex], result);
}
}
/// <summary>
/// Returns a SortedDocValues for a reader's docvalues (potentially doing extremely slow things).
/// <p>
/// This is an extremely slow way to access sorted values. Instead, access them per-segment
/// with <seealso cref="AtomicReader#getSortedDocValues(String)"/>
/// </p>
/// </summary>
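/// <example>
/// A hedged sketch showing ord-based access through the merged view; <c>reader</c>,
/// <c>"category"</c> and <c>docID</c> are hypothetical placeholders.
/// <code>
/// SortedDocValues dv = MultiDocValues.GetSortedValues(reader, "category");
/// if (dv != null)
/// {
///     int ord = dv.GetOrd(docID); // global ord, or -1 if the document has no value
///     if (ord != -1)
///     {
///         BytesRef term = new BytesRef();
///         dv.LookupOrd(ord, term); // resolves the global ord back to the term bytes
///     }
/// }
/// </code>
/// </example>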
public static SortedDocValues GetSortedValues(IndexReader r, string field)
{
IList<AtomicReaderContext> leaves = r.Leaves;
int size = leaves.Count;
if (size == 0)
{
return null;
}
else if (size == 1)
{
return leaves[0].AtomicReader.GetSortedDocValues(field);
}
bool anyReal = false;
var values = new SortedDocValues[size];
int[] starts = new int[size + 1];
for (int i = 0; i < size; i++)
{
AtomicReaderContext context = leaves[i];
SortedDocValues v = context.AtomicReader.GetSortedDocValues(field);
if (v == null)
{
v = DocValues.EMPTY_SORTED;
}
else
{
anyReal = true;
}
values[i] = v;
starts[i] = context.DocBase;
}
starts[size] = r.MaxDoc;
if (!anyReal)
{
return null;
}
else
{
TermsEnum[] enums = new TermsEnum[values.Length];
for (int i = 0; i < values.Length; i++)
{
enums[i] = values[i].TermsEnum();
}
OrdinalMap mapping = new OrdinalMap(r.CoreCacheKey, enums);
return new MultiSortedDocValues(values, starts, mapping);
}
}
/// <summary>
/// Returns a SortedSetDocValues for a reader's docvalues (potentially doing extremely slow things).
/// <p>
/// This is an extremely slow way to access sorted values. Instead, access them per-segment
/// with <seealso cref="AtomicReader#getSortedSetDocValues(String)"/>
/// </p>
/// </summary>
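/// <example>
/// A hedged sketch iterating all ords for one document; <c>reader</c>, <c>"tags"</c> and
/// <c>docID</c> are hypothetical placeholders.
/// <code>
/// SortedSetDocValues dv = MultiDocValues.GetSortedSetValues(reader, "tags");
/// if (dv != null)
/// {
///     dv.Document = docID; // position the iterator on this (composite) document
///     long ord;
///     while ((ord = dv.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
///     {
///         BytesRef term = new BytesRef();
///         dv.LookupOrd(ord, term); // term bytes, resolved in global ord space
///     }
/// }
/// </code>
/// </example>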
public static SortedSetDocValues GetSortedSetValues(IndexReader r, string field)
{
IList<AtomicReaderContext> leaves = r.Leaves;
int size = leaves.Count;
if (size == 0)
{
return null;
}
else if (size == 1)
{
return leaves[0].AtomicReader.GetSortedSetDocValues(field);
}
bool anyReal = false;
SortedSetDocValues[] values = new SortedSetDocValues[size];
int[] starts = new int[size + 1];
for (int i = 0; i < size; i++)
{
AtomicReaderContext context = leaves[i];
SortedSetDocValues v = context.AtomicReader.GetSortedSetDocValues(field);
if (v == null)
{
v = DocValues.EMPTY_SORTED_SET;
}
else
{
anyReal = true;
}
values[i] = v;
starts[i] = context.DocBase;
}
starts[size] = r.MaxDoc;
if (!anyReal)
{
return null;
}
else
{
TermsEnum[] enums = new TermsEnum[values.Length];
for (int i = 0; i < values.Length; i++)
{
enums[i] = values[i].TermsEnum();
}
OrdinalMap mapping = new OrdinalMap(r.CoreCacheKey, enums);
return new MultiSortedSetDocValues(values, starts, mapping);
}
}
/// <summary>
/// maps per-segment ordinals to/from global ordinal space </summary>
// TODO: use more efficient packed ints structures?
// TODO: pull this out? it's pretty generic (maps between N ord()-enabled TermsEnums)
public class OrdinalMap
{
// cache key of whoever asked for this awful thing
internal readonly object Owner;
// globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the ordinal in the first segment that contains this term
internal readonly MonotonicAppendingLongBuffer GlobalOrdDeltas;
// globalOrd -> first segment container
internal readonly AppendingPackedLongBuffer FirstSegments;
// for every segment, segmentOrd -> (globalOrd - segmentOrd)
internal readonly MonotonicAppendingLongBuffer[] OrdDeltas;
/// <summary>
/// Creates an ordinal map that allows mapping ords to/from a merged
/// space from <code>subs</code>. </summary>
/// <param name="owner"> a cache key </param>
/// <param name="subs"> TermsEnums that support <seealso cref="TermsEnum#ord()"/>. They need
/// not be dense (e.g. they can be FilteredTermsEnums). </param>
/// <exception cref="IOException"> if an I/O error occurred. </exception>
public OrdinalMap(object owner, TermsEnum[] subs)
{
// create the ordinal mappings by pulling a termsenum over each sub's
// unique terms, and walking a multitermsenum over those
this.Owner = owner;
GlobalOrdDeltas = new MonotonicAppendingLongBuffer(PackedInts.COMPACT);
FirstSegments = new AppendingPackedLongBuffer(PackedInts.COMPACT);
OrdDeltas = new MonotonicAppendingLongBuffer[subs.Length];
for (int i = 0; i < OrdDeltas.Length; i++)
{
OrdDeltas[i] = new MonotonicAppendingLongBuffer();
}
long[] segmentOrds = new long[subs.Length];
ReaderSlice[] slices = new ReaderSlice[subs.Length];
TermsEnumIndex[] indexes = new TermsEnumIndex[slices.Length];
for (int i = 0; i < slices.Length; i++)
{
slices[i] = new ReaderSlice(0, 0, i);
indexes[i] = new TermsEnumIndex(subs[i], i);
}
MultiTermsEnum mte = new MultiTermsEnum(slices);
mte.Reset(indexes);
long globalOrd = 0;
while (mte.Next() != null)
{
TermsEnumWithSlice[] matches = mte.MatchArray;
for (int i = 0; i < mte.MatchCount; i++)
{
int segmentIndex = matches[i].Index;
long segmentOrd = matches[i].Terms.Ord();
long delta = globalOrd - segmentOrd;
// for each unique term, just mark the first segment index/delta where it occurs
if (i == 0)
{
FirstSegments.Add(segmentIndex);
GlobalOrdDeltas.Add(delta);
}
// for each per-segment ord, map it back to the global term.
while (segmentOrds[segmentIndex] <= segmentOrd)
{
OrdDeltas[segmentIndex].Add(delta);
segmentOrds[segmentIndex]++;
}
}
globalOrd++;
}
FirstSegments.Freeze();
GlobalOrdDeltas.Freeze();
for (int i = 0; i < OrdDeltas.Length; ++i)
{
OrdDeltas[i].Freeze();
}
}
/// <summary>
/// Given a segment number and segment ordinal, returns
/// the corresponding global ordinal.
/// </summary>
public virtual long GetGlobalOrd(int segmentIndex, long segmentOrd)
{
return segmentOrd + OrdDeltas[segmentIndex].Get(segmentOrd);
}
/// <summary>
/// Given a global ordinal, returns the ordinal of the first segment which contains
/// this ordinal (corresponding to the segment returned by <seealso cref="#getFirstSegmentNumber"/>).
/// </summary>
public virtual long GetFirstSegmentOrd(long globalOrd)
{
return globalOrd - GlobalOrdDeltas.Get(globalOrd);
}
/// <summary>
/// Given a global ordinal, returns the index of the first
/// segment that contains this term.
/// </summary>
public virtual int GetFirstSegmentNumber(long globalOrd)
{
return (int)FirstSegments.Get(globalOrd);
}
/// <summary>
/// Returns the total number of unique terms in global ord space.
/// </summary>
public virtual long ValueCount
{
get
{
return GlobalOrdDeltas.Size();
}
}
/// <summary>
/// Returns total byte size used by this ordinal map.
/// </summary>
public virtual long RamBytesUsed()
{
long size = GlobalOrdDeltas.RamBytesUsed() + FirstSegments.RamBytesUsed();
for (int i = 0; i < OrdDeltas.Length; i++)
{
size += OrdDeltas[i].RamBytesUsed();
}
return size;
}
}
/// <summary>
/// Implements SortedDocValues over n subs, using an OrdinalMap
/// @lucene.internal
/// </summary>
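/// <example>
/// A hedged sketch: when <c>MultiDocValues.GetSortedValues</c> has to merge more than one leaf it
/// returns an instance of this class, so the ordinal map can be recovered by a cast.
/// <c>reader</c> and <c>"category"</c> are hypothetical placeholders.
/// <code>
/// SortedDocValues dv = MultiDocValues.GetSortedValues(reader, "category");
/// MultiDocValues.MultiSortedDocValues multi = dv as MultiDocValues.MultiSortedDocValues;
/// if (multi != null)
/// {
///     OrdinalMap map = multi.Mapping; // maps per-segment ords to the merged (global) ord space
/// }
/// </code>
/// </example>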
public class MultiSortedDocValues : SortedDocValues
{
/// <summary>
/// docbase for each leaf: parallel with <seealso cref="#values"/> </summary>
public readonly int[] DocStarts;
/// <summary>
/// leaf values </summary>
public readonly SortedDocValues[] Values;
/// <summary>
/// ordinal map mapping ords from <code>values</code> to global ord space </summary>
public readonly OrdinalMap Mapping;
/// <summary>
/// Creates a new MultiSortedDocValues over <code>values</code> </summary>
internal MultiSortedDocValues(SortedDocValues[] values, int[] docStarts, OrdinalMap mapping)
{
Debug.Assert(values.Length == mapping.OrdDeltas.Length);
Debug.Assert(docStarts.Length == values.Length + 1);
this.Values = values;
this.DocStarts = docStarts;
this.Mapping = mapping;
}
public override int GetOrd(int docID)
{
int subIndex = ReaderUtil.SubIndex(docID, DocStarts);
int segmentOrd = Values[subIndex].GetOrd(docID - DocStarts[subIndex]);
return segmentOrd == -1 ? segmentOrd : (int)Mapping.GetGlobalOrd(subIndex, segmentOrd);
}
public override void LookupOrd(int ord, BytesRef result)
{
int subIndex = Mapping.GetFirstSegmentNumber(ord);
int segmentOrd = (int)Mapping.GetFirstSegmentOrd(ord);
Values[subIndex].LookupOrd(segmentOrd, result);
}
public override int ValueCount
{
get
{
return (int)Mapping.ValueCount;
}
}
}
/// <summary>
/// Implements SortedSetDocValues over n subs, using an OrdinalMap
/// @lucene.internal
/// </summary>
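/// <example>
/// A hedged sketch: when <c>MultiDocValues.GetSortedSetValues</c> merges more than one leaf it
/// returns an instance of this class, exposing the per-leaf values and the ordinal map.
/// <c>reader</c> and <c>"tags"</c> are hypothetical placeholders.
/// <code>
/// SortedSetDocValues dv = MultiDocValues.GetSortedSetValues(reader, "tags");
/// MultiDocValues.MultiSortedSetDocValues multi = dv as MultiDocValues.MultiSortedSetDocValues;
/// if (multi != null)
/// {
///     long totalUniqueTerms = multi.Mapping.ValueCount; // size of the global ord space
/// }
/// </code>
/// </example>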
public class MultiSortedSetDocValues : SortedSetDocValues
{
/// <summary>
/// docbase for each leaf: parallel with <seealso cref="#values"/> </summary>
public readonly int[] DocStarts;
/// <summary>
/// leaf values </summary>
public readonly SortedSetDocValues[] Values;
/// <summary>
/// ordinal map mapping ords from <code>values</code> to global ord space </summary>
public readonly OrdinalMap Mapping;
internal int CurrentSubIndex;
/// <summary>
/// Creates a new MultiSortedSetDocValues over <code>values</code> </summary>
internal MultiSortedSetDocValues(SortedSetDocValues[] values, int[] docStarts, OrdinalMap mapping)
{
Debug.Assert(values.Length == mapping.OrdDeltas.Length);
Debug.Assert(docStarts.Length == values.Length + 1);
this.Values = values;
this.DocStarts = docStarts;
this.Mapping = mapping;
}
public override long NextOrd()
{
long segmentOrd = Values[CurrentSubIndex].NextOrd();
if (segmentOrd == NO_MORE_ORDS)
{
return segmentOrd;
}
else
{
return Mapping.GetGlobalOrd(CurrentSubIndex, segmentOrd);
}
}
public override int Document
{
set
{
CurrentSubIndex = ReaderUtil.SubIndex(value, DocStarts);
Values[CurrentSubIndex].Document = value - DocStarts[CurrentSubIndex];
}
}
public override void LookupOrd(long ord, BytesRef result)
{
int subIndex = Mapping.GetFirstSegmentNumber(ord);
long segmentOrd = Mapping.GetFirstSegmentOrd(ord);
Values[subIndex].LookupOrd(segmentOrd, result);
}
public override long ValueCount
{
get
{
return Mapping.ValueCount;
}
}
}
}
}