| package org.apache.lucene.index; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.List; |
| |
| import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex; |
| import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.packed.AppendingPackedLongBuffer; |
| import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| /** |
| * A wrapper for CompositeIndexReader providing access to DocValues. |
| * |
| * <p><b>NOTE</b>: for multi readers, you'll get better |
| * performance by gathering the sub readers using |
| * {@link IndexReader#getContext()} to get the |
| * atomic leaves and then operate per-AtomicReader, |
| * instead of using this class. |
| * |
| * <p><b>NOTE</b>: This is very costly. |
| * |
| * @lucene.experimental |
| * @lucene.internal |
| */ |
| public class MultiDocValues { |
| |
/** Not instantiable: this class only hosts static helpers and nested types. */
private MultiDocValues() {
}
| |
| /** Returns a NumericDocValues for a reader's norms (potentially merging on-the-fly). |
| * <p> |
| * This is a slow way to access normalization values. Instead, access them per-segment |
| * with {@link AtomicReader#getNormValues(String)} |
| * </p> |
| */ |
| public static NumericDocValues getNormValues(final IndexReader r, final String field) throws IOException { |
| final List<AtomicReaderContext> leaves = r.leaves(); |
| final int size = leaves.size(); |
| if (size == 0) { |
| return null; |
| } else if (size == 1) { |
| return leaves.get(0).reader().getNormValues(field); |
| } |
| FieldInfo fi = MultiFields.getMergedFieldInfos(r).fieldInfo(field); |
| if (fi == null || fi.hasNorms() == false) { |
| return null; |
| } |
| |
| boolean anyReal = false; |
| final NumericDocValues[] values = new NumericDocValues[size]; |
| final int[] starts = new int[size+1]; |
| for (int i = 0; i < size; i++) { |
| AtomicReaderContext context = leaves.get(i); |
| NumericDocValues v = context.reader().getNormValues(field); |
| if (v == null) { |
| v = NumericDocValues.EMPTY; |
| } else { |
| anyReal = true; |
| } |
| values[i] = v; |
| starts[i] = context.docBase; |
| } |
| starts[size] = r.maxDoc(); |
| |
| assert anyReal; |
| |
| return new NumericDocValues() { |
| @Override |
| public long get(int docID) { |
| int subIndex = ReaderUtil.subIndex(docID, starts); |
| return values[subIndex].get(docID - starts[subIndex]); |
| } |
| }; |
| } |
| |
| /** Returns a NumericDocValues for a reader's docvalues (potentially merging on-the-fly) |
| * <p> |
| * This is a slow way to access numeric values. Instead, access them per-segment |
| * with {@link AtomicReader#getNumericDocValues(String)} |
| * </p> |
| * */ |
| public static NumericDocValues getNumericValues(final IndexReader r, final String field) throws IOException { |
| final List<AtomicReaderContext> leaves = r.leaves(); |
| final int size = leaves.size(); |
| if (size == 0) { |
| return null; |
| } else if (size == 1) { |
| return leaves.get(0).reader().getNumericDocValues(field); |
| } |
| |
| boolean anyReal = false; |
| final NumericDocValues[] values = new NumericDocValues[size]; |
| final int[] starts = new int[size+1]; |
| for (int i = 0; i < size; i++) { |
| AtomicReaderContext context = leaves.get(i); |
| NumericDocValues v = context.reader().getNumericDocValues(field); |
| if (v == null) { |
| v = NumericDocValues.EMPTY; |
| } else { |
| anyReal = true; |
| } |
| values[i] = v; |
| starts[i] = context.docBase; |
| } |
| starts[size] = r.maxDoc(); |
| |
| if (!anyReal) { |
| return null; |
| } else { |
| return new NumericDocValues() { |
| @Override |
| public long get(int docID) { |
| int subIndex = ReaderUtil.subIndex(docID, starts); |
| return values[subIndex].get(docID - starts[subIndex]); |
| } |
| }; |
| } |
| } |
| |
| /** Returns a Bits for a reader's docsWithField (potentially merging on-the-fly) |
| * <p> |
| * This is a slow way to access this bitset. Instead, access them per-segment |
| * with {@link AtomicReader#getDocsWithField(String)} |
| * </p> |
| * */ |
| public static Bits getDocsWithField(final IndexReader r, final String field) throws IOException { |
| final List<AtomicReaderContext> leaves = r.leaves(); |
| final int size = leaves.size(); |
| if (size == 0) { |
| return null; |
| } else if (size == 1) { |
| return leaves.get(0).reader().getDocsWithField(field); |
| } |
| |
| boolean anyReal = false; |
| boolean anyMissing = false; |
| final Bits[] values = new Bits[size]; |
| final int[] starts = new int[size+1]; |
| for (int i = 0; i < size; i++) { |
| AtomicReaderContext context = leaves.get(i); |
| Bits v = context.reader().getDocsWithField(field); |
| if (v == null) { |
| v = new Bits.MatchNoBits(context.reader().maxDoc()); |
| anyMissing = true; |
| } else { |
| anyReal = true; |
| if (v instanceof Bits.MatchAllBits == false) { |
| anyMissing = true; |
| } |
| } |
| values[i] = v; |
| starts[i] = context.docBase; |
| } |
| starts[size] = r.maxDoc(); |
| |
| if (!anyReal) { |
| return null; |
| } else if (!anyMissing) { |
| return new Bits.MatchAllBits(r.maxDoc()); |
| } else { |
| return new MultiBits(values, starts, false); |
| } |
| } |
| |
| /** Returns a BinaryDocValues for a reader's docvalues (potentially merging on-the-fly) |
| * <p> |
| * This is a slow way to access binary values. Instead, access them per-segment |
| * with {@link AtomicReader#getBinaryDocValues(String)} |
| * </p> |
| */ |
| public static BinaryDocValues getBinaryValues(final IndexReader r, final String field) throws IOException { |
| final List<AtomicReaderContext> leaves = r.leaves(); |
| final int size = leaves.size(); |
| |
| if (size == 0) { |
| return null; |
| } else if (size == 1) { |
| return leaves.get(0).reader().getBinaryDocValues(field); |
| } |
| |
| boolean anyReal = false; |
| final BinaryDocValues[] values = new BinaryDocValues[size]; |
| final int[] starts = new int[size+1]; |
| for (int i = 0; i < size; i++) { |
| AtomicReaderContext context = leaves.get(i); |
| BinaryDocValues v = context.reader().getBinaryDocValues(field); |
| if (v == null) { |
| v = BinaryDocValues.EMPTY; |
| } else { |
| anyReal = true; |
| } |
| values[i] = v; |
| starts[i] = context.docBase; |
| } |
| starts[size] = r.maxDoc(); |
| |
| if (!anyReal) { |
| return null; |
| } else { |
| return new BinaryDocValues() { |
| @Override |
| public void get(int docID, BytesRef result) { |
| int subIndex = ReaderUtil.subIndex(docID, starts); |
| values[subIndex].get(docID - starts[subIndex], result); |
| } |
| }; |
| } |
| } |
| |
| /** Returns a SortedDocValues for a reader's docvalues (potentially doing extremely slow things). |
| * <p> |
| * This is an extremely slow way to access sorted values. Instead, access them per-segment |
| * with {@link AtomicReader#getSortedDocValues(String)} |
| * </p> |
| */ |
| public static SortedDocValues getSortedValues(final IndexReader r, final String field) throws IOException { |
| final List<AtomicReaderContext> leaves = r.leaves(); |
| final int size = leaves.size(); |
| |
| if (size == 0) { |
| return null; |
| } else if (size == 1) { |
| return leaves.get(0).reader().getSortedDocValues(field); |
| } |
| |
| boolean anyReal = false; |
| final SortedDocValues[] values = new SortedDocValues[size]; |
| final int[] starts = new int[size+1]; |
| for (int i = 0; i < size; i++) { |
| AtomicReaderContext context = leaves.get(i); |
| SortedDocValues v = context.reader().getSortedDocValues(field); |
| if (v == null) { |
| v = SortedDocValues.EMPTY; |
| } else { |
| anyReal = true; |
| } |
| values[i] = v; |
| starts[i] = context.docBase; |
| } |
| starts[size] = r.maxDoc(); |
| |
| if (!anyReal) { |
| return null; |
| } else { |
| TermsEnum enums[] = new TermsEnum[values.length]; |
| for (int i = 0; i < values.length; i++) { |
| enums[i] = values[i].termsEnum(); |
| } |
| OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums); |
| return new MultiSortedDocValues(values, starts, mapping); |
| } |
| } |
| |
| /** Returns a SortedSetDocValues for a reader's docvalues (potentially doing extremely slow things). |
| * <p> |
| * This is an extremely slow way to access sorted values. Instead, access them per-segment |
| * with {@link AtomicReader#getSortedSetDocValues(String)} |
| * </p> |
| */ |
| public static SortedSetDocValues getSortedSetValues(final IndexReader r, final String field) throws IOException { |
| final List<AtomicReaderContext> leaves = r.leaves(); |
| final int size = leaves.size(); |
| |
| if (size == 0) { |
| return null; |
| } else if (size == 1) { |
| return leaves.get(0).reader().getSortedSetDocValues(field); |
| } |
| |
| boolean anyReal = false; |
| final SortedSetDocValues[] values = new SortedSetDocValues[size]; |
| final int[] starts = new int[size+1]; |
| for (int i = 0; i < size; i++) { |
| AtomicReaderContext context = leaves.get(i); |
| SortedSetDocValues v = context.reader().getSortedSetDocValues(field); |
| if (v == null) { |
| v = SortedSetDocValues.EMPTY; |
| } else { |
| anyReal = true; |
| } |
| values[i] = v; |
| starts[i] = context.docBase; |
| } |
| starts[size] = r.maxDoc(); |
| |
| if (!anyReal) { |
| return null; |
| } else { |
| TermsEnum enums[] = new TermsEnum[values.length]; |
| for (int i = 0; i < values.length; i++) { |
| enums[i] = values[i].termsEnum(); |
| } |
| OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums); |
| return new MultiSortedSetDocValues(values, starts, mapping); |
| } |
| } |
| |
| /** maps per-segment ordinals to/from global ordinal space */ |
| // TODO: use more efficient packed ints structures? |
| // TODO: pull this out? its pretty generic (maps between N ord()-enabled TermsEnums) |
| public static class OrdinalMap { |
| // cache key of whoever asked for this awful thing |
| final Object owner; |
| // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the the ordinal in the first segment that contains this term |
| final MonotonicAppendingLongBuffer globalOrdDeltas; |
| // globalOrd -> first segment container |
| final AppendingPackedLongBuffer firstSegments; |
| // for every segment, segmentOrd -> (globalOrd - segmentOrd) |
| final MonotonicAppendingLongBuffer ordDeltas[]; |
| |
| /** |
| * Creates an ordinal map that allows mapping ords to/from a merged |
| * space from <code>subs</code>. |
| * @param owner a cache key |
| * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need |
| * not be dense (e.g. can be FilteredTermsEnums}. |
| * @throws IOException if an I/O error occurred. |
| */ |
| public OrdinalMap(Object owner, TermsEnum subs[]) throws IOException { |
| // create the ordinal mappings by pulling a termsenum over each sub's |
| // unique terms, and walking a multitermsenum over those |
| this.owner = owner; |
| globalOrdDeltas = new MonotonicAppendingLongBuffer(PackedInts.COMPACT); |
| firstSegments = new AppendingPackedLongBuffer(PackedInts.COMPACT); |
| ordDeltas = new MonotonicAppendingLongBuffer[subs.length]; |
| for (int i = 0; i < ordDeltas.length; i++) { |
| ordDeltas[i] = new MonotonicAppendingLongBuffer(); |
| } |
| long segmentOrds[] = new long[subs.length]; |
| ReaderSlice slices[] = new ReaderSlice[subs.length]; |
| TermsEnumIndex indexes[] = new TermsEnumIndex[slices.length]; |
| for (int i = 0; i < slices.length; i++) { |
| slices[i] = new ReaderSlice(0, 0, i); |
| indexes[i] = new TermsEnumIndex(subs[i], i); |
| } |
| MultiTermsEnum mte = new MultiTermsEnum(slices); |
| mte.reset(indexes); |
| long globalOrd = 0; |
| while (mte.next() != null) { |
| TermsEnumWithSlice matches[] = mte.getMatchArray(); |
| for (int i = 0; i < mte.getMatchCount(); i++) { |
| int segmentIndex = matches[i].index; |
| long segmentOrd = matches[i].terms.ord(); |
| long delta = globalOrd - segmentOrd; |
| // for each unique term, just mark the first segment index/delta where it occurs |
| if (i == 0) { |
| firstSegments.add(segmentIndex); |
| globalOrdDeltas.add(delta); |
| } |
| // for each per-segment ord, map it back to the global term. |
| while (segmentOrds[segmentIndex] <= segmentOrd) { |
| ordDeltas[segmentIndex].add(delta); |
| segmentOrds[segmentIndex]++; |
| } |
| } |
| globalOrd++; |
| } |
| firstSegments.freeze(); |
| globalOrdDeltas.freeze(); |
| for (int i = 0; i < ordDeltas.length; ++i) { |
| ordDeltas[i].freeze(); |
| } |
| } |
| |
| /** |
| * Given a segment number and segment ordinal, returns |
| * the corresponding global ordinal. |
| */ |
| public long getGlobalOrd(int segmentIndex, long segmentOrd) { |
| return segmentOrd + ordDeltas[segmentIndex].get(segmentOrd); |
| } |
| |
| /** |
| * Given global ordinal, returns the ordinal of the first segment which contains |
| * this ordinal (the corresponding to the segment return {@link #getFirstSegmentNumber}). |
| */ |
| public long getFirstSegmentOrd(long globalOrd) { |
| return globalOrd - globalOrdDeltas.get(globalOrd); |
| } |
| |
| /** |
| * Given a global ordinal, returns the index of the first |
| * segment that contains this term. |
| */ |
| public int getFirstSegmentNumber(long globalOrd) { |
| return (int) firstSegments.get(globalOrd); |
| } |
| |
| /** |
| * Returns the total number of unique terms in global ord space. |
| */ |
| public long getValueCount() { |
| return globalOrdDeltas.size(); |
| } |
| |
| /** |
| * Returns total byte size used by this ordinal map. |
| */ |
| public long ramBytesUsed() { |
| long size = globalOrdDeltas.ramBytesUsed() + firstSegments.ramBytesUsed(); |
| for (int i = 0; i < ordDeltas.length; i++) { |
| size += ordDeltas[i].ramBytesUsed(); |
| } |
| return size; |
| } |
| } |
| |
| /** |
| * Implements SortedDocValues over n subs, using an OrdinalMap |
| * @lucene.internal |
| */ |
| public static class MultiSortedDocValues extends SortedDocValues { |
| /** docbase for each leaf: parallel with {@link #values} */ |
| public final int docStarts[]; |
| /** leaf values */ |
| public final SortedDocValues values[]; |
| /** ordinal map mapping ords from <code>values</code> to global ord space */ |
| public final OrdinalMap mapping; |
| |
| /** Creates a new MultiSortedDocValues over <code>values</code> */ |
| MultiSortedDocValues(SortedDocValues values[], int docStarts[], OrdinalMap mapping) throws IOException { |
| assert values.length == mapping.ordDeltas.length; |
| assert docStarts.length == values.length + 1; |
| this.values = values; |
| this.docStarts = docStarts; |
| this.mapping = mapping; |
| } |
| |
| @Override |
| public int getOrd(int docID) { |
| int subIndex = ReaderUtil.subIndex(docID, docStarts); |
| int segmentOrd = values[subIndex].getOrd(docID - docStarts[subIndex]); |
| return segmentOrd == -1 ? segmentOrd : (int) mapping.getGlobalOrd(subIndex, segmentOrd); |
| } |
| |
| @Override |
| public void lookupOrd(int ord, BytesRef result) { |
| int subIndex = mapping.getFirstSegmentNumber(ord); |
| int segmentOrd = (int) mapping.getFirstSegmentOrd(ord); |
| values[subIndex].lookupOrd(segmentOrd, result); |
| } |
| |
| @Override |
| public int getValueCount() { |
| return (int) mapping.getValueCount(); |
| } |
| } |
| |
| /** |
| * Implements MultiSortedSetDocValues over n subs, using an OrdinalMap |
| * @lucene.internal |
| */ |
| public static class MultiSortedSetDocValues extends SortedSetDocValues { |
| /** docbase for each leaf: parallel with {@link #values} */ |
| public final int docStarts[]; |
| /** leaf values */ |
| public final SortedSetDocValues values[]; |
| /** ordinal map mapping ords from <code>values</code> to global ord space */ |
| public final OrdinalMap mapping; |
| int currentSubIndex; |
| |
| /** Creates a new MultiSortedSetDocValues over <code>values</code> */ |
| MultiSortedSetDocValues(SortedSetDocValues values[], int docStarts[], OrdinalMap mapping) throws IOException { |
| assert values.length == mapping.ordDeltas.length; |
| assert docStarts.length == values.length + 1; |
| this.values = values; |
| this.docStarts = docStarts; |
| this.mapping = mapping; |
| } |
| |
| @Override |
| public long nextOrd() { |
| long segmentOrd = values[currentSubIndex].nextOrd(); |
| if (segmentOrd == NO_MORE_ORDS) { |
| return segmentOrd; |
| } else { |
| return mapping.getGlobalOrd(currentSubIndex, segmentOrd); |
| } |
| } |
| |
| @Override |
| public void setDocument(int docID) { |
| currentSubIndex = ReaderUtil.subIndex(docID, docStarts); |
| values[currentSubIndex].setDocument(docID - docStarts[currentSubIndex]); |
| } |
| |
| @Override |
| public void lookupOrd(long ord, BytesRef result) { |
| int subIndex = mapping.getFirstSegmentNumber(ord); |
| long segmentOrd = mapping.getFirstSegmentOrd(ord); |
| values[subIndex].lookupOrd(segmentOrd, result); |
| } |
| |
| @Override |
| public long getValueCount() { |
| return mapping.getValueCount(); |
| } |
| } |
| } |