| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs; |
| |
| import java.io.Closeable; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import org.apache.lucene.document.ReferenceDocValuesField; |
| import org.apache.lucene.index.BinaryDocValues; |
| import org.apache.lucene.index.DocIDMerger; |
| import org.apache.lucene.index.DocValues; |
| import org.apache.lucene.index.DocValuesType; |
| import org.apache.lucene.index.EmptyDocValuesProducer; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.FilteredTermsEnum; |
| import org.apache.lucene.index.MergeState; |
| import org.apache.lucene.index.NumericDocValues; |
| import org.apache.lucene.index.OrdinalMap; |
| import org.apache.lucene.index.ReferenceDocValuesWriter; |
| import org.apache.lucene.index.SegmentWriteState; // javadocs |
| import org.apache.lucene.index.SortedDocValues; |
| import org.apache.lucene.index.SortedNumericDocValues; |
| import org.apache.lucene.index.SortedSetDocValues; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.index.VectorDocValues; |
| import org.apache.lucene.index.VectorDocValuesWriter; |
| import org.apache.lucene.search.GraphSearch; |
| import org.apache.lucene.search.ScoreDoc; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.Counter; |
| import org.apache.lucene.util.LongBitSet; |
| import org.apache.lucene.util.LongValues; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; |
| |
| /** |
| * Abstract API that consumes numeric, binary and |
| * sorted docvalues. Concrete implementations of this |
| * actually do "something" with the docvalues (write it into |
| * the index in a specific format). |
| * <p> |
| * The lifecycle is: |
| * <ol> |
 *   <li>DocValuesConsumer is created by 
 *       {@link DocValuesFormat#fieldsConsumer(SegmentWriteState)}.
| * <li>{@link #addNumericField}, {@link #addBinaryField}, |
| * {@link #addSortedField}, {@link #addSortedSetField}, |
| * or {@link #addSortedNumericField} are called for each Numeric, |
| * Binary, Sorted, SortedSet, or SortedNumeric docvalues field. |
| * The API is a "pull" rather than "push", and the implementation |
| * is free to iterate over the values multiple times |
| * ({@link Iterable#iterator()}). |
| * <li>After all fields are added, the consumer is {@link #close}d. |
| * </ol> |
| * |
| * @lucene.experimental |
| */ |
| public abstract class DocValuesConsumer implements Closeable { |
| |
  /** Sole constructor. (For invocation by subclass
   *  constructors, typically implicit.) */
  protected DocValuesConsumer() {}

  /**
   * Writes numeric docvalues for a field.
   * @param field field information
   * @param valuesProducer Numeric values to write.
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;

  /**
   * Writes binary docvalues for a field.
   * @param field field information
   * @param valuesProducer Binary values to write.
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;

  /**
   * Writes pre-sorted binary docvalues for a field.
   * @param field field information
   * @param valuesProducer produces the values and ordinals to write
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;

  /**
   * Writes pre-sorted numeric docvalues for a field.
   * <p>
   * Also receives knn-graph reference fields, which are encoded as
   * sorted-numeric values (see {@code mergeReferenceField}).
   * @param field field information
   * @param valuesProducer produces the values to write
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;

  /**
   * Writes pre-sorted set docvalues for a field.
   * @param field field information
   * @param valuesProducer produces the values to write
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;
| |
| /** Merges in the fields from the readers in |
| * <code>mergeState</code>. The default implementation |
| * calls {@link #mergeNumericField}, {@link #mergeBinaryField}, |
| * {@link #mergeSortedField}, {@link #mergeSortedSetField}, |
| * or {@link #mergeSortedNumericField} for each field, |
| * depending on its type. |
| * Implementations can override this method |
| * for more sophisticated merging (bulk-byte copying, etc). */ |
| public void merge(MergeState mergeState) throws IOException { |
| for(DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { |
| if (docValuesProducer != null) { |
| docValuesProducer.checkIntegrity(); |
| } |
| } |
| |
| for (FieldInfo mergeFieldInfo : mergeState.mergeFieldInfos) { |
| DocValuesType type = mergeFieldInfo.getDocValuesType(); |
| if (type != DocValuesType.NONE) { |
| if (type == DocValuesType.NUMERIC) { |
| mergeNumericField(mergeFieldInfo, mergeState); |
| } else if (type == DocValuesType.BINARY) { |
| mergeBinaryField(mergeFieldInfo, mergeState); |
| } else if (type == DocValuesType.SORTED) { |
| mergeSortedField(mergeFieldInfo, mergeState); |
| } else if (type == DocValuesType.SORTED_SET) { |
| mergeSortedSetField(mergeFieldInfo, mergeState); |
| } else if (type == DocValuesType.SORTED_NUMERIC) { |
| String refType = mergeFieldInfo.getAttribute(ReferenceDocValuesField.REFTYPE_ATTR); |
| if ("knn-graph".equals(refType)) { |
| mergeReferenceField(mergeFieldInfo, mergeState); |
| } else { |
| mergeSortedNumericField(mergeFieldInfo, mergeState); |
| } |
| } else { |
| throw new AssertionError("type=" + type); |
| } |
| } |
| } |
| } |
| |
| /** Tracks state of one numeric sub-reader that we are merging */ |
| private static class NumericDocValuesSub extends DocIDMerger.Sub { |
| |
| final NumericDocValues values; |
| |
| public NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { |
| super(docMap); |
| this.values = values; |
| assert values.docID() == -1; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return values.nextDoc(); |
| } |
| } |
| |
| /** |
| * Merges the numeric docvalues from <code>MergeState</code>. |
| * <p> |
| * The default implementation calls {@link #addNumericField}, passing |
| * a DocValuesProducer that merges and filters deleted documents on the fly. |
| */ |
| public void mergeNumericField(final FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { |
| addNumericField(mergeFieldInfo, |
| new EmptyDocValuesProducer() { |
| @Override |
| public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException { |
| if (fieldInfo != mergeFieldInfo) { |
| throw new IllegalArgumentException("wrong fieldInfo"); |
| } |
| |
| List<NumericDocValuesSub> subs = new ArrayList<>(); |
| assert mergeState.docMaps.length == mergeState.docValuesProducers.length; |
| long cost = 0; |
| for (int i=0;i<mergeState.docValuesProducers.length;i++) { |
| NumericDocValues values = null; |
| DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; |
| if (docValuesProducer != null) { |
| FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); |
| if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { |
| values = docValuesProducer.getNumeric(readerFieldInfo); |
| } |
| } |
| if (values != null) { |
| cost += values.cost(); |
| subs.add(new NumericDocValuesSub(mergeState.docMaps[i], values)); |
| } |
| } |
| |
| final DocIDMerger<NumericDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); |
| |
| final long finalCost = cost; |
| |
| return new NumericDocValues() { |
| private int docID = -1; |
| private NumericDocValuesSub current; |
| |
| @Override |
| public int docID() { |
| return docID; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| current = docIDMerger.next(); |
| if (current == null) { |
| docID = NO_MORE_DOCS; |
| } else { |
| docID = current.mappedDocID; |
| } |
| return docID; |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long cost() { |
| return finalCost; |
| } |
| |
| @Override |
| public long longValue() throws IOException { |
| return current.values.longValue(); |
| } |
| }; |
| } |
| }); |
| } |
| |
| /** Tracks state of one binary sub-reader that we are merging */ |
| private static class BinaryDocValuesSub extends DocIDMerger.Sub { |
| |
| final BinaryDocValues values; |
| |
| public BinaryDocValuesSub(MergeState.DocMap docMap, BinaryDocValues values) { |
| super(docMap); |
| this.values = values; |
| assert values.docID() == -1; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return values.nextDoc(); |
| } |
| } |
| |
| /** |
| * Merges the binary docvalues from <code>MergeState</code>. |
| * <p> |
| * The default implementation calls {@link #addBinaryField}, passing |
| * a DocValuesProducer that merges and filters deleted documents on the fly. |
| */ |
| public void mergeBinaryField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { |
| addBinaryField(mergeFieldInfo, |
| new EmptyDocValuesProducer() { |
| @Override |
| public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException { |
| if (fieldInfo != mergeFieldInfo) { |
| throw new IllegalArgumentException("wrong fieldInfo"); |
| } |
| |
| List<BinaryDocValuesSub> subs = new ArrayList<>(); |
| |
| long cost = 0; |
| for (int i=0;i<mergeState.docValuesProducers.length;i++) { |
| BinaryDocValues values = null; |
| DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; |
| if (docValuesProducer != null) { |
| FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); |
| if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.BINARY) { |
| values = docValuesProducer.getBinary(readerFieldInfo); |
| } |
| } |
| if (values != null) { |
| cost += values.cost(); |
| subs.add(new BinaryDocValuesSub(mergeState.docMaps[i], values)); |
| } |
| } |
| |
| final DocIDMerger<BinaryDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); |
| final long finalCost = cost; |
| |
| return new BinaryDocValues() { |
| private BinaryDocValuesSub current; |
| private int docID = -1; |
| |
| @Override |
| public int docID() { |
| return docID; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| current = docIDMerger.next(); |
| if (current == null) { |
| docID = NO_MORE_DOCS; |
| } else { |
| docID = current.mappedDocID; |
| } |
| return docID; |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long cost() { |
| return finalCost; |
| } |
| |
| @Override |
| public BytesRef binaryValue() throws IOException { |
| return current.values.binaryValue(); |
| } |
| }; |
| } |
| }); |
| } |
| |
| /** Tracks state of one sorted numeric sub-reader that we are merging */ |
| private static class SortedNumericDocValuesSub extends DocIDMerger.Sub { |
| |
| final SortedNumericDocValues values; |
| |
| public SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { |
| super(docMap); |
| this.values = values; |
| assert values.docID() == -1; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return values.nextDoc(); |
| } |
| } |
| |
  /**
   * Merges the sorted-numeric docvalues from <code>mergeState</code>.
   * <p>
   * The default implementation calls {@link #addSortedNumericField}, passing
   * a producer whose iterators merge the sub-readers and filter deleted documents.
   */
  public void mergeSortedNumericField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {

    addSortedNumericField(mergeFieldInfo,
                          new EmptyDocValuesProducer() {
                            @Override
                            public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException {
                              if (fieldInfo != mergeFieldInfo) {
                                throw new IllegalArgumentException("wrong FieldInfo");
                              }

                              // We must make new iterators + DocIDMerger for each iterator:
                              List<SortedNumericDocValuesSub> subs = new ArrayList<>();
                              long cost = 0;
                              for (int i=0;i<mergeState.docValuesProducers.length;i++) {
                                DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
                                SortedNumericDocValues values = null;
                                if (docValuesProducer != null) {
                                  FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
                                  if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) {
                                    values = docValuesProducer.getSortedNumeric(readerFieldInfo);
                                  }
                                }
                                // Unlike the numeric/binary cases, segments without values still
                                // contribute an empty sub so that docid bases stay aligned.
                                if (values == null) {
                                  values = DocValues.emptySortedNumeric(mergeState.maxDocs[i]);
                                }
                                cost += values.cost();
                                subs.add(new SortedNumericDocValuesSub(mergeState.docMaps[i], values));
                              }

                              final long finalCost = cost;

                              final DocIDMerger<SortedNumericDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

                              // Forward-only view over the merged docid space; advance/advanceExact
                              // are unsupported because consumers pull values with nextDoc() only.
                              return new SortedNumericDocValues() {

                                private int docID = -1;
                                private SortedNumericDocValuesSub currentSub;

                                @Override
                                public int docID() {
                                  return docID;
                                }
                                
                                @Override
                                public int nextDoc() throws IOException {
                                  currentSub = docIDMerger.next();
                                  if (currentSub == null) {
                                    docID = NO_MORE_DOCS;
                                  } else {
                                    docID = currentSub.mappedDocID;
                                  }

                                  return docID;
                                }

                                @Override
                                public int advance(int target) throws IOException {
                                  throw new UnsupportedOperationException();
                                }

                                @Override
                                public boolean advanceExact(int target) throws IOException {
                                  throw new UnsupportedOperationException();
                                }

                                @Override
                                public int docValueCount() {
                                  // count of values for the doc the merged iterator is positioned on
                                  return currentSub.values.docValueCount();
                                }

                                @Override
                                public long cost() {
                                  return finalCost;
                                }

                                @Override
                                public long nextValue() throws IOException {
                                  return currentSub.values.nextValue();
                                }
                              };
                            }
                          });
  }
| |
  /**
   * Merges a knn-graph reference field (a sorted-numeric field named
   * <code>&lt;vectorField&gt;$nbr</code> holding each document's neighbor docids).
   * <p>
   * Because a document's neighbors depend on all other documents, the merged
   * graph is rebuilt in memory: each source vector is re-searched against the
   * merged vector space and the resulting neighbor docids are buffered in a
   * {@link ReferenceDocValuesWriter}, then written via {@link #addSortedNumericField}.
   */
  public void mergeReferenceField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {

    // reference fields are named "<vectorField>$nbr"; recover the vector field's name
    assert mergeFieldInfo.name.substring(mergeFieldInfo.name.length() - 4).equals("$nbr");
    String vectorFieldName = mergeFieldInfo.name.substring(0, mergeFieldInfo.name.length() - 4);
    // We must compute the entire merged field in memory since each document's values depend on its neighbors
    //mergeState.infoStream.message("ReferenceDocValues", "merging " + mergeState.segmentInfo);
    List<VectorDocValuesSub> subs = new ArrayList<>();
    List<VectorDocValuesSupplier> suppliers = new ArrayList<>();
    int dimension = -1;
    for (int i = 0 ; i < mergeState.docValuesProducers.length; i++) {
      DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
      if (docValuesProducer != null) {
        FieldInfo vectorFieldInfo = mergeState.fieldInfos[i].fieldInfo(vectorFieldName);
        if (vectorFieldInfo != null && vectorFieldInfo.getDocValuesType() == DocValuesType.BINARY) {
          // all segments must agree on the vector dimension
          int segmentDimension = VectorDocValuesWriter.getDimensionFromAttribute(vectorFieldInfo);
          if (dimension == -1) {
            dimension = segmentDimension;
          } else if (dimension != segmentDimension) {
            throw new IllegalStateException("Varying dimensions for vector-valued field " + mergeFieldInfo.name
                + ": " + dimension + "!=" + segmentDimension);
          }
          VectorDocValues values = VectorDocValues.get(docValuesProducer.getBinary(vectorFieldInfo), dimension);
          // supplier re-opens a fresh iterator; MultiVectorDV uses it to rewind (see ResettingVectorDV)
          suppliers.add(() -> VectorDocValues.get(docValuesProducer.getBinary(vectorFieldInfo), segmentDimension));
          subs.add(new VectorDocValuesSub(i, mergeState.docMaps[i], values));
        }
      }
    }
    // Create a new SortedNumericDocValues by iterating over the vectors, searching for
    // its nearest neighbor vectors in the newly merged segments' vectors, mapping the resulting
    // docids using docMaps in the mergeState.
    MultiVectorDV multiVectors = new MultiVectorDV(suppliers, subs, mergeState.maxDocs);
    ReferenceDocValuesWriter refWriter = new ReferenceDocValuesWriter(mergeFieldInfo, Counter.newCounter(false));
    SortedNumericDocValues refs = refWriter.getBufferedValues();
    float[] vector = new float[dimension];
    GraphSearch graphSearch = GraphSearch.fromDimension(dimension);
    int i;
    for (i = 0; i < subs.size(); i++) {
      // advance past the first document; there are no neighbors for it
      // NOTE(review): subs with no documents at all are skipped entirely; the first sub
      // that yields a document has that document consumed here before the main loop below.
      if (subs.get(i).nextDoc() != NO_MORE_DOCS) {
        break;
      }
    }
    for (; i < subs.size(); i++) {
      VectorDocValuesSub sub = subs.get(i);
      MergeState.DocMap docMap = mergeState.docMaps[sub.segmentIndex];
      // nocommit: test sorted index and test index with deletions
      int docid;
      while ((docid = sub.nextDoc()) != NO_MORE_DOCS) {
        // map the segment-local docid into the merged docid space
        int mappedDocId = docMap.get(docid);
        assert sub.values.docID() == docid;
        assert docid == multiVectors.unmap(mappedDocId) : "unmap mismatch " + docid + " != " + multiVectors.unmap(mappedDocId);
        sub.values.vector(vector);
        //System.out.println("merge doc " + mappedDocId + " mapped from [" + i + "," + docid + "] in thread " + Thread.currentThread().getName());
        // search the merged vector space for this vector's nearest neighbors
        for (ScoreDoc ref : graphSearch.search(() -> multiVectors, () -> refs, vector, mappedDocId)) {
          if (ref.doc >= 0) {
            // ignore sentinels
            //System.out.println("  ref " + ref.doc);
            refWriter.addValue(mappedDocId, ref.doc);
          }
        }
      }
    }

    addSortedNumericField(mergeFieldInfo,
                          new EmptyDocValuesProducer() {
                            @Override
                            public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) {
                              if (fieldInfo != mergeFieldInfo) {
                                throw new IllegalArgumentException("wrong FieldInfo");
                              }
                              //mergeState.infoStream.message("ReferenceDocValues", "new iterator " + mergeState.segmentInfo);
                              return refWriter.getIterableValues();
                            }
                          });

    //mergeState.infoStream.message("ReferenceDocValues", "  mergeReferenceField done: " + mergeState.segmentInfo);
  }
| |
| /** Tracks state of one binary sub-reader that we are merging */ |
| private static class VectorDocValuesSub extends DocIDMerger.Sub { |
| |
| final VectorDocValues values; |
| final int segmentIndex; |
| |
| public VectorDocValuesSub(int segmentIndex, MergeState.DocMap docMap, VectorDocValues values) { |
| super(docMap); |
| this.values = values; |
| this.segmentIndex = segmentIndex; |
| assert values.docID() == -1; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return values.nextDoc(); |
| } |
| } |
| |
| // provides a view over multiple VectorDocValues by concatenating their docid spaces |
| private static class MultiVectorDV extends VectorDocValues { |
| private final VectorDocValues[] subValues; |
| private final int[] docBase; |
| private final int[] segmentMaxDocs; |
| private final int cost; |
| |
| private int whichSub; |
| |
| MultiVectorDV(List<VectorDocValuesSupplier> suppliers, List<VectorDocValuesSub> subs, int[] maxDocs) throws IOException { |
| this.subValues = new VectorDocValues[suppliers.size()]; |
| // TODO: this complicated logic needs its own test |
| // maxDocs actually says *how many* docs there are, not what the number of the max doc is |
| int maxDoc = -1; |
| int lastMaxDoc = -1; |
| segmentMaxDocs = new int[subs.size() - 1]; |
| docBase = new int[subs.size()]; |
| for (int i = 0, j = 0; j < subs.size(); i++) { |
| lastMaxDoc = maxDoc; |
| maxDoc += maxDocs[i]; |
| if (i == subs.get(j).segmentIndex) { |
| // we may skip some segments if they have no docs with values for this field |
| if (j > 0) { |
| segmentMaxDocs[j - 1] = lastMaxDoc; |
| } |
| docBase[j] = lastMaxDoc + 1; |
| ++j; |
| } |
| } |
| |
| int i = 0; |
| int totalCost = 0; |
| for (VectorDocValuesSupplier supplier : suppliers) { |
| ResettingVectorDV sub = new ResettingVectorDV(supplier); |
| totalCost += sub.cost(); |
| this.subValues[i++] = sub; |
| } |
| cost = totalCost; |
| whichSub = 0; |
| } |
| |
| private int findSegment(int docid) { |
| int segment = Arrays.binarySearch(segmentMaxDocs, docid); |
| if (segment < 0) { |
| return -1 - segment; |
| } else { |
| return segment; |
| } |
| } |
| |
| @Override |
| public int docID() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public int dimension() { |
| return subValues[0].dimension(); |
| } |
| |
| @Override |
| public long cost() { |
| return cost; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| int rebased = unmapSettingWhich(target); |
| if (rebased < 0) { |
| rebased = 0; |
| } |
| int segmentDocId = subValues[whichSub].advance(rebased); |
| if (segmentDocId == NO_MORE_DOCS) { |
| if (++whichSub < subValues.length) { |
| // Get the first document in the next segment; Note that all segments have values. |
| segmentDocId = subValues[whichSub].advance(0); |
| } else { |
| return NO_MORE_DOCS; |
| } |
| } |
| return docBase[whichSub] + segmentDocId; |
| } |
| |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| int rebased = unmapSettingWhich(target); |
| if (rebased < 0) { |
| return false; |
| } |
| return subValues[whichSub].advanceExact(rebased); |
| } |
| |
| int unmap(int docid) { |
| // map from global (merged) to segment-local (unmerged) |
| // like mapDocid but no side effect - used for assertion |
| return docid - docBase[findSegment(docid)]; |
| } |
| |
| private int unmapSettingWhich(int target) { |
| whichSub = findSegment(target); |
| return target - docBase[whichSub]; |
| } |
| |
| @Override |
| public void vector(float[] vector) throws IOException { |
| subValues[whichSub].vector(vector); |
| } |
| } |
| |
  // provides pseudo-random access to the values as float[] by recreating an underlying
  // iterator whenever the iteration goes backwards
  private static class ResettingVectorDV extends VectorDocValues {

    private final VectorDocValuesSupplier supplier;
    private VectorDocValues delegate;
    // Current position. A negative value encodes "advanceExact(target) failed at
    // delegate.docID()": the magnitude is the delegate's position, and docID()
    // reports it as positive.
    private int docId = -1;

    ResettingVectorDV(VectorDocValuesSupplier supplier) throws IOException {
      this.supplier = supplier;
      delegate = supplier.get();
    }

    @Override
    public int docID() {
      if (docId < 0) {
        // report the delegate's actual position even after a failed advanceExact
        return -docId;
      } else {
        return docId;
      }
    }

    @Override
    public int dimension() {
      return delegate.dimension();
    }

    @Override
    public long cost() {
      return delegate.cost();
    }

    @Override
    public int nextDoc() throws IOException {
      docId = delegate.nextDoc();
      return docId;
    }

    @Override
    public int advance(int target) throws IOException {
      if (target == docId) {
        // already positioned exactly here; no work to do
        return target;
      }
      // re-open the delegate if target is behind its current position
      maybeReset(target);
      docId = delegate.advance(target);
      return docId;
    }

    @Override
    public boolean advanceExact(int target) throws IOException {
      if (target == docId) {
        return true;
      }
      maybeReset(target);
      boolean advanced = delegate.advanceExact(target);
      if (advanced) {
        docId = delegate.docID();
      } else {
        // remember the failed position as a negative value (see docId comment)
        // NOTE(review): if the delegate reports docID() == 0 here the negation is
        // still 0 and the failure is indistinguishable from success at doc 0 — verify
        docId = -delegate.docID();
      }
      return advanced;
    }

    @Override
    public void vector(float[] vector) throws IOException {
      delegate.vector(vector);
    }

    /** Re-creates the delegate iterator when {@code target} is behind its position,
     *  since VectorDocValues iterators only move forward. */
    private void maybeReset(int target) throws IOException {
      if (target < delegate.docID()) {
        delegate = supplier.get();
      }
    }
  }
| |
  /** Factory for fresh {@link VectorDocValues} iterators; used by
   *  {@link ResettingVectorDV} to rewind by re-opening its delegate. */
  private interface VectorDocValuesSupplier {
    VectorDocValues get() throws IOException;
  }
| |
| /** Tracks state of one sorted sub-reader that we are merging */ |
| private static class SortedDocValuesSub extends DocIDMerger.Sub { |
| |
| final SortedDocValues values; |
| final LongValues map; |
| |
| public SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values, LongValues map) { |
| super(docMap); |
| this.values = values; |
| this.map = map; |
| assert values.docID() == -1; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return values.nextDoc(); |
| } |
| } |
| |
| /** |
| * Merges the sorted docvalues from <code>toMerge</code>. |
| * <p> |
| * The default implementation calls {@link #addSortedField}, passing |
| * an Iterable that merges ordinals and values and filters deleted documents . |
| */ |
| public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) throws IOException { |
| List<SortedDocValues> toMerge = new ArrayList<>(); |
| for (int i=0;i<mergeState.docValuesProducers.length;i++) { |
| SortedDocValues values = null; |
| DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; |
| if (docValuesProducer != null) { |
| FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); |
| if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { |
| values = docValuesProducer.getSorted(fieldInfo); |
| } |
| } |
| if (values == null) { |
| values = DocValues.emptySorted(); |
| } |
| toMerge.add(values); |
| } |
| |
| final int numReaders = toMerge.size(); |
| final SortedDocValues dvs[] = toMerge.toArray(new SortedDocValues[numReaders]); |
| |
| // step 1: iterate thru each sub and mark terms still in use |
| TermsEnum liveTerms[] = new TermsEnum[dvs.length]; |
| long[] weights = new long[liveTerms.length]; |
| for (int sub=0;sub<numReaders;sub++) { |
| SortedDocValues dv = dvs[sub]; |
| Bits liveDocs = mergeState.liveDocs[sub]; |
| if (liveDocs == null) { |
| liveTerms[sub] = dv.termsEnum(); |
| weights[sub] = dv.getValueCount(); |
| } else { |
| LongBitSet bitset = new LongBitSet(dv.getValueCount()); |
| int docID; |
| while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { |
| if (liveDocs.get(docID)) { |
| int ord = dv.ordValue(); |
| if (ord >= 0) { |
| bitset.set(ord); |
| } |
| } |
| } |
| liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); |
| weights[sub] = bitset.cardinality(); |
| } |
| } |
| |
| // step 2: create ordinal map (this conceptually does the "merging") |
| final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); |
| |
| // step 3: add field |
| addSortedField(fieldInfo, |
| new EmptyDocValuesProducer() { |
| @Override |
| public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException { |
| if (fieldInfoIn != fieldInfo) { |
| throw new IllegalArgumentException("wrong FieldInfo"); |
| } |
| |
| // We must make new iterators + DocIDMerger for each iterator: |
| |
| List<SortedDocValuesSub> subs = new ArrayList<>(); |
| long cost = 0; |
| for (int i=0;i<mergeState.docValuesProducers.length;i++) { |
| SortedDocValues values = null; |
| DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; |
| if (docValuesProducer != null) { |
| FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); |
| if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { |
| values = docValuesProducer.getSorted(readerFieldInfo); |
| } |
| } |
| if (values == null) { |
| values = DocValues.emptySorted(); |
| } |
| cost += values.cost(); |
| |
| subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); |
| } |
| |
| final long finalCost = cost; |
| |
| final DocIDMerger<SortedDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); |
| |
| return new SortedDocValues() { |
| private int docID = -1; |
| private int ord; |
| |
| @Override |
| public int docID() { |
| return docID; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| SortedDocValuesSub sub = docIDMerger.next(); |
| if (sub == null) { |
| return docID = NO_MORE_DOCS; |
| } |
| int subOrd = sub.values.ordValue(); |
| assert subOrd != -1; |
| ord = (int) sub.map.get(subOrd); |
| docID = sub.mappedDocID; |
| return docID; |
| } |
| |
| @Override |
| public int ordValue() { |
| return ord; |
| } |
| |
| @Override |
| public int advance(int target) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long cost() { |
| return finalCost; |
| } |
| |
| @Override |
| public int getValueCount() { |
| return (int) map.getValueCount(); |
| } |
| |
| @Override |
| public BytesRef lookupOrd(int ord) throws IOException { |
| int segmentNumber = map.getFirstSegmentNumber(ord); |
| int segmentOrd = (int) map.getFirstSegmentOrd(ord); |
| return dvs[segmentNumber].lookupOrd(segmentOrd); |
| } |
| }; |
| } |
| }); |
| } |
| |
| /** Tracks state of one sorted set sub-reader that we are merging */ |
| private static class SortedSetDocValuesSub extends DocIDMerger.Sub { |
| |
| final SortedSetDocValues values; |
| final LongValues map; |
| |
| public SortedSetDocValuesSub(MergeState.DocMap docMap, SortedSetDocValues values, LongValues map) { |
| super(docMap); |
| this.values = values; |
| this.map = map; |
| assert values.docID() == -1; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return values.nextDoc(); |
| } |
| |
| @Override |
| public String toString() { |
| return "SortedSetDocValuesSub(mappedDocID=" + mappedDocID + " values=" + values + ")"; |
| } |
| } |
| |
| /** |
| * Merges the sortedset docvalues from <code>toMerge</code>. |
| * <p> |
| * The default implementation calls {@link #addSortedSetField}, passing |
| * an Iterable that merges ordinals and values and filters deleted documents . |
| */ |
| public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { |
| |
| List<SortedSetDocValues> toMerge = new ArrayList<>(); |
| for (int i=0;i<mergeState.docValuesProducers.length;i++) { |
| SortedSetDocValues values = null; |
| DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; |
| if (docValuesProducer != null) { |
| FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); |
| if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { |
| values = docValuesProducer.getSortedSet(fieldInfo); |
| } |
| } |
| if (values == null) { |
| values = DocValues.emptySortedSet(); |
| } |
| toMerge.add(values); |
| } |
| |
| // step 1: iterate thru each sub and mark terms still in use |
| TermsEnum liveTerms[] = new TermsEnum[toMerge.size()]; |
| long[] weights = new long[liveTerms.length]; |
| for (int sub = 0; sub < liveTerms.length; sub++) { |
| SortedSetDocValues dv = toMerge.get(sub); |
| Bits liveDocs = mergeState.liveDocs[sub]; |
| if (liveDocs == null) { |
| liveTerms[sub] = dv.termsEnum(); |
| weights[sub] = dv.getValueCount(); |
| } else { |
| LongBitSet bitset = new LongBitSet(dv.getValueCount()); |
| int docID; |
| while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { |
| if (liveDocs.get(docID)) { |
| long ord; |
| while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { |
| bitset.set(ord); |
| } |
| } |
| } |
| liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); |
| weights[sub] = bitset.cardinality(); |
| } |
| } |
| |
| // step 2: create ordinal map (this conceptually does the "merging") |
| final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); |
| |
| // step 3: add field |
| addSortedSetField(mergeFieldInfo, |
| new EmptyDocValuesProducer() { |
| @Override |
| public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { |
| if (fieldInfo != mergeFieldInfo) { |
| throw new IllegalArgumentException("wrong FieldInfo"); |
| } |
| |
| // We must make new iterators + DocIDMerger for each iterator: |
| List<SortedSetDocValuesSub> subs = new ArrayList<>(); |
| |
| long cost = 0; |
| |
| for (int i=0;i<mergeState.docValuesProducers.length;i++) { |
| SortedSetDocValues values = null; |
| DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; |
| if (docValuesProducer != null) { |
| FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); |
| if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { |
| values = docValuesProducer.getSortedSet(readerFieldInfo); |
| } |
| } |
| if (values == null) { |
| values = DocValues.emptySortedSet(); |
| } |
| cost += values.cost(); |
| subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); |
| } |
| |
| final DocIDMerger<SortedSetDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); |
| |
| final long finalCost = cost; |
| |
| return new SortedSetDocValues() { |
| private int docID = -1; |
| private SortedSetDocValuesSub currentSub; |
| |
| @Override |
| public int docID() { |
| return docID; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| currentSub = docIDMerger.next(); |
| if (currentSub == null) { |
| docID = NO_MORE_DOCS; |
| } else { |
| docID = currentSub.mappedDocID; |
| } |
| |
| return docID; |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long nextOrd() throws IOException { |
| long subOrd = currentSub.values.nextOrd(); |
| if (subOrd == NO_MORE_ORDS) { |
| return NO_MORE_ORDS; |
| } |
| return currentSub.map.get(subOrd); |
| } |
| |
| @Override |
| public long cost() { |
| return finalCost; |
| } |
| |
| @Override |
| public BytesRef lookupOrd(long ord) throws IOException { |
| int segmentNumber = map.getFirstSegmentNumber(ord); |
| long segmentOrd = map.getFirstSegmentOrd(ord); |
| return toMerge.get(segmentNumber).lookupOrd(segmentOrd); |
| } |
| |
| @Override |
| public long getValueCount() { |
| return map.getValueCount(); |
| } |
| }; |
| } |
| }); |
| } |
| |
| // TODO: seek-by-ord to nextSetBit |
| static class BitsFilteredTermsEnum extends FilteredTermsEnum { |
| final LongBitSet liveTerms; |
| |
| BitsFilteredTermsEnum(TermsEnum in, LongBitSet liveTerms) { |
| super(in, false); // <-- not passing false here wasted about 3 hours of my time!!!!!!!!!!!!! |
| assert liveTerms != null; |
| this.liveTerms = liveTerms; |
| } |
| |
| @Override |
| protected AcceptStatus accept(BytesRef term) throws IOException { |
| if (liveTerms.get(ord())) { |
| return AcceptStatus.YES; |
| } else { |
| return AcceptStatus.NO; |
| } |
| } |
| } |
| |
| /** Helper: returns true if the given docToValue count contains only at most one value */ |
| public static boolean isSingleValued(Iterable<Number> docToValueCount) { |
| for (Number count : docToValueCount) { |
| if (count.longValue() > 1) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** Helper: returns single-valued view, using {@code missingValue} when count is zero */ |
| public static Iterable<Number> singletonView(final Iterable<Number> docToValueCount, final Iterable<Number> values, final Number missingValue) { |
| assert isSingleValued(docToValueCount); |
| return new Iterable<Number>() { |
| |
| @Override |
| public Iterator<Number> iterator() { |
| final Iterator<Number> countIterator = docToValueCount.iterator(); |
| final Iterator<Number> valuesIterator = values.iterator(); |
| return new Iterator<Number>() { |
| |
| @Override |
| public boolean hasNext() { |
| return countIterator.hasNext(); |
| } |
| |
| @Override |
| public Number next() { |
| int count = countIterator.next().intValue(); |
| if (count == 0) { |
| return missingValue; |
| } else { |
| return valuesIterator.next(); |
| } |
| } |
| |
| @Override |
| public void remove() { |
| throw new UnsupportedOperationException(); |
| } |
| }; |
| } |
| }; |
| } |
| } |