| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.index; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.Comparator; |
| |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefArray; |
| import org.apache.lucene.util.BytesRefIterator; |
| import org.apache.lucene.util.Counter; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| /** |
| * This class efficiently buffers numeric and binary field updates and stores |
| * terms, values and metadata in a memory efficient way without creating large amounts |
| * of objects. Update terms are stored without de-duplicating the update term. |
| * In general we try to optimize for several use-cases. For instance we try to use constant |
| * space for update terms field since the common case always updates on the same field. Also for docUpTo |
| * we try to optimize for the case when updates should be applied to all docs ie. docUpTo=Integer.MAX_VALUE. |
| * In other cases each update will likely have a different docUpTo. |
| * Along the same lines this impl optimizes the case when all updates have a value. Lastly, if all updates share the |
| * same value for a numeric field we only store the value once. |
| */ |
| final class FieldUpdatesBuffer { |
| private static final long SELF_SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldUpdatesBuffer.class); |
| private static final long STRING_SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(String.class); |
| private final Counter bytesUsed; |
| private int numUpdates = 1; |
| // we use a very simple approach and store the update term values without de-duplication |
| // which is also not a common case to keep updating the same value more than once... |
| // we might pay a higher price in terms of memory in certain cases but will gain |
| // on CPU for those. We also save on not needing to sort in order to apply the terms in order |
| // since by definition we store them in order. |
| private final BytesRefArray termValues; |
| private BytesRefArray.SortState termSortState; |
| private final BytesRefArray byteValues; // this will be null if we are buffering numerics |
| private int[] docsUpTo; |
| private long[] numericValues; // this will be null if we are buffering binaries |
| private FixedBitSet hasValues; |
| private long maxNumeric = Long.MIN_VALUE; |
| private long minNumeric = Long.MAX_VALUE; |
| private String[] fields; |
| private final boolean isNumeric; |
| private boolean finished = false; |
| |
| private FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate initialValue, int docUpTo, boolean isNumeric) { |
| this.bytesUsed = bytesUsed; |
| this.bytesUsed.addAndGet(SELF_SHALLOW_SIZE); |
| termValues = new BytesRefArray(bytesUsed); |
| termValues.append(initialValue.term.bytes); |
| fields = new String[] {initialValue.term.field}; |
| bytesUsed.addAndGet(sizeOfString(initialValue.term.field)); |
| docsUpTo = new int[] {docUpTo}; |
| if (initialValue.hasValue == false) { |
| hasValues = new FixedBitSet(1); |
| bytesUsed.addAndGet(hasValues.ramBytesUsed()); |
| } |
| this.isNumeric = isNumeric; |
| byteValues = isNumeric ? null : new BytesRefArray(bytesUsed); |
| } |
| |
| private static long sizeOfString(String string) { |
| return STRING_SHALLOW_SIZE + (string.length() * Character.BYTES); |
| } |
| |
| FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate.NumericDocValuesUpdate initialValue, int docUpTo) { |
| this(bytesUsed, initialValue, docUpTo, true); |
| if (initialValue.hasValue()) { |
| numericValues = new long[] {initialValue.getValue()}; |
| maxNumeric = minNumeric = initialValue.getValue(); |
| } else { |
| numericValues = new long[] {0}; |
| } |
| bytesUsed.addAndGet(Long.BYTES); |
| } |
| |
| FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate.BinaryDocValuesUpdate initialValue, int docUpTo) { |
| this(bytesUsed, initialValue, docUpTo, false); |
| if (initialValue.hasValue()) { |
| byteValues.append(initialValue.getValue()); |
| } |
| } |
| |
| long getMaxNumeric() { |
| assert isNumeric; |
| if (minNumeric == Long.MAX_VALUE && maxNumeric == Long.MIN_VALUE) { |
| return 0; // we don't have any value; |
| } |
| return maxNumeric; |
| } |
| |
| long getMinNumeric() { |
| assert isNumeric; |
| if (minNumeric == Long.MAX_VALUE && maxNumeric == Long.MIN_VALUE) { |
| return 0; // we don't have any value |
| } |
| return minNumeric; |
| } |
| |
| void add(String field, int docUpTo, int ord, boolean hasValue) { |
| assert finished == false : "buffer was finished already"; |
| if (fields[0].equals(field) == false || fields.length != 1 ) { |
| if (fields.length <= ord) { |
| String[] array = ArrayUtil.grow(fields, ord+1); |
| if (fields.length == 1) { |
| Arrays.fill(array, 1, ord, fields[0]); |
| } |
| bytesUsed.addAndGet((array.length - fields.length) * RamUsageEstimator.NUM_BYTES_OBJECT_REF); |
| fields = array; |
| } |
| if (field != fields[0]) { // that's an easy win of not accounting if there is an outlier |
| bytesUsed.addAndGet(sizeOfString(field)); |
| } |
| fields[ord] = field; |
| } |
| |
| if (docsUpTo[0] != docUpTo || docsUpTo.length != 1) { |
| if (docsUpTo.length <= ord) { |
| int[] array = ArrayUtil.grow(docsUpTo, ord+1); |
| if (docsUpTo.length == 1) { |
| Arrays.fill(array, 1, ord, docsUpTo[0]); |
| } |
| bytesUsed.addAndGet((array.length-docsUpTo.length) * Integer.BYTES); |
| docsUpTo = array; |
| } |
| docsUpTo[ord] = docUpTo; |
| } |
| |
| if (hasValue == false || hasValues != null) { |
| if (hasValues == null) { |
| hasValues = new FixedBitSet(ord+1); |
| hasValues.set(0, ord); |
| bytesUsed.addAndGet(hasValues.ramBytesUsed()); |
| } else if (hasValues.length() <= ord) { |
| FixedBitSet fixedBitSet = FixedBitSet.ensureCapacity(hasValues, ArrayUtil.oversize(ord + 1, 1)); |
| bytesUsed.addAndGet(fixedBitSet.ramBytesUsed()-hasValues.ramBytesUsed()); |
| hasValues = fixedBitSet; |
| } |
| if (hasValue) { |
| hasValues.set(ord); |
| } |
| } |
| } |
| |
| void addUpdate(Term term, long value, int docUpTo) { |
| assert isNumeric; |
| final int ord = append(term); |
| String field = term.field; |
| add(field, docUpTo, ord, true); |
| minNumeric = Math.min(minNumeric, value); |
| maxNumeric = Math.max(maxNumeric, value); |
| if (numericValues[0] != value || numericValues.length != 1) { |
| if (numericValues.length <= ord) { |
| long[] array = ArrayUtil.grow(numericValues, ord+1); |
| if (numericValues.length == 1) { |
| Arrays.fill(array, 1, ord, numericValues[0]); |
| } |
| bytesUsed.addAndGet((array.length-numericValues.length) * Long.BYTES); |
| numericValues = array; |
| } |
| numericValues[ord] = value; |
| } |
| } |
| |
| void addNoValue(Term term, int docUpTo) { |
| final int ord = append(term); |
| add(term.field, docUpTo, ord, false); |
| } |
| |
| void addUpdate(Term term, BytesRef value, int docUpTo) { |
| assert isNumeric == false; |
| final int ord = append(term); |
| byteValues.append(value); |
| add(term.field, docUpTo, ord, true); |
| } |
| |
| private int append(Term term) { |
| termValues.append(term.bytes); |
| return numUpdates++; |
| } |
| |
| void finish() { |
| if (finished) { |
| throw new IllegalStateException("buffer was finished already"); |
| } |
| finished = true; |
| final boolean sortedTerms = hasSingleValue() && hasValues == null && fields.length == 1; |
| if (sortedTerms) { |
| // sort by ascending by term, then sort descending by docsUpTo so that we can skip updates with lower docUpTo. |
| termSortState = termValues.sort(Comparator.naturalOrder(), |
| (i1, i2) -> Integer.compare( |
| docsUpTo[getArrayIndex(docsUpTo.length, i2)], |
| docsUpTo[getArrayIndex(docsUpTo.length, i1)])); |
| bytesUsed.addAndGet(termSortState.ramBytesUsed()); |
| } |
| } |
| |
| BufferedUpdateIterator iterator() { |
| if (finished == false) { |
| throw new IllegalStateException("buffer is not finished yet"); |
| } |
| return new BufferedUpdateIterator(); |
| } |
| |
| boolean isNumeric() { |
| assert isNumeric || byteValues != null; |
| return isNumeric; |
| } |
| |
| boolean hasSingleValue() { |
| // we only do this optimization for numerics so far. |
| return isNumeric && numericValues.length == 1; |
| } |
| |
| long getNumericValue(int idx) { |
| if (hasValues != null && hasValues.get(idx) == false) { |
| return 0; |
| } |
| return numericValues[getArrayIndex(numericValues.length, idx)]; |
| } |
| |
| /** |
| * Struct like class that is used to iterate over all updates in this buffer |
| */ |
| static class BufferedUpdate { |
| |
| private BufferedUpdate() {}; |
| /** |
| * the max document ID this update should be applied to |
| */ |
| int docUpTo; |
| /** |
| * a numeric value or 0 if this buffer holds binary updates |
| */ |
| long numericValue; |
| /** |
| * a binary value or null if this buffer holds numeric updates |
| */ |
| BytesRef binaryValue; |
| /** |
| * <code>true</code> if this update has a value |
| */ |
| boolean hasValue; |
| /** |
| * The update terms field. This will never be null. |
| */ |
| String termField; |
| /** |
| * The update terms value. This will never be null. |
| */ |
| BytesRef termValue; |
| |
| @Override |
| public int hashCode() { |
| throw new UnsupportedOperationException( |
| "this struct should not be use in map or other data-structures that use hashCode / equals"); |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| throw new UnsupportedOperationException( |
| "this struct should not be use in map or other data-structures that use hashCode / equals"); |
| } |
| } |
| |
| /** |
| * An iterator that iterates over all updates in insertion order |
| */ |
| class BufferedUpdateIterator { |
| private final BytesRefArray.IndexedBytesRefIterator termValuesIterator; |
| private final BytesRefArray.IndexedBytesRefIterator lookAheadTermIterator; |
| private final BytesRefIterator byteValuesIterator; |
| private final BufferedUpdate bufferedUpdate = new BufferedUpdate(); |
| private final Bits updatesWithValue; |
| |
| BufferedUpdateIterator() { |
| this.termValuesIterator = termValues.iterator(termSortState); |
| this.lookAheadTermIterator = termSortState != null ? termValues.iterator(termSortState) : null; |
| this.byteValuesIterator = isNumeric ? null : byteValues.iterator(); |
| updatesWithValue = hasValues == null ? new Bits.MatchAllBits(numUpdates) : hasValues; |
| } |
| |
| /** |
| * If all updates update a single field to the same value, then we can apply these |
| * updates in the term order instead of the request order as both will yield the same result. |
| * This optimization allows us to iterate the term dictionary faster and de-duplicate updates. |
| */ |
| boolean isSortedTerms() { |
| return termSortState != null; |
| } |
| |
| /** |
| * Moves to the next BufferedUpdate or return null if all updates are consumed. |
| * The returned instance is a shared instance and must be fully consumed before the next call to this method. |
| */ |
| BufferedUpdate next() throws IOException { |
| BytesRef next = nextTerm(); |
| if (next != null) { |
| final int idx = termValuesIterator.ord(); |
| bufferedUpdate.termValue = next; |
| bufferedUpdate.hasValue = updatesWithValue.get(idx); |
| bufferedUpdate.termField = fields[getArrayIndex(fields.length, idx)]; |
| bufferedUpdate.docUpTo = docsUpTo[getArrayIndex(docsUpTo.length, idx)]; |
| if (bufferedUpdate.hasValue) { |
| if (isNumeric) { |
| bufferedUpdate.numericValue = numericValues[getArrayIndex(numericValues.length, idx)]; |
| bufferedUpdate.binaryValue = null; |
| } else { |
| bufferedUpdate.binaryValue = byteValuesIterator.next(); |
| } |
| } else { |
| bufferedUpdate.binaryValue = null; |
| bufferedUpdate.numericValue = 0; |
| } |
| return bufferedUpdate; |
| } else { |
| return null; |
| } |
| } |
| |
| BytesRef nextTerm() throws IOException { |
| if (lookAheadTermIterator != null) { |
| final BytesRef lastTerm = bufferedUpdate.termValue; |
| BytesRef lookAheadTerm; |
| while ((lookAheadTerm = lookAheadTermIterator.next()) != null && lookAheadTerm.equals(lastTerm)) { |
| BytesRef discardedTerm = termValuesIterator.next(); // discard as the docUpTo of the previous update is higher |
| assert discardedTerm.equals(lookAheadTerm) : "[" + discardedTerm + "] != [" + lookAheadTerm + "]"; |
| assert docsUpTo[getArrayIndex(docsUpTo.length, termValuesIterator.ord())] <= bufferedUpdate.docUpTo : |
| docsUpTo[getArrayIndex(docsUpTo.length, termValuesIterator.ord())] + ">" + bufferedUpdate.docUpTo; |
| } |
| } |
| return termValuesIterator.next(); |
| } |
| } |
| |
| private static int getArrayIndex(int arrayLength, int index) { |
| assert arrayLength == 1 || arrayLength > index : "illegal array index length: " + arrayLength + " index: " + index; |
| return Math.min(arrayLength-1, index); |
| } |
| } |