| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.index; |
| |
| |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.nio.FloatBuffer; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| |
| import org.apache.lucene.codecs.DocValuesConsumer; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.search.SortField; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.Counter; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.packed.PackedInts; |
| import org.apache.lucene.util.packed.PackedLongValues; |
| |
| import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; |
| |
/** Writes float arrays using the same methods as {@link BinaryDocValuesWriter} while exposing its
 * internal buffer for random-access reads, so that vectors can be looked up by docid while the
 * segment is still being built (e.g. for KNN graph searches during indexing).
 *
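 * <p>A minimal usage sketch (hypothetical caller; {@code fieldInfo} must carry the
 * {@code VectorDocValues.DIMENSION_ATTR} attribute, and docids must arrive in increasing order):
 * <pre class="prettyprint">
 *   VectorDocValuesWriter writer = new VectorDocValuesWriter(fieldInfo, Counter.newCounter());
 *   writer.addValue(0, new float[] {0.1f, 0.2f, 0.3f}); // array length must equal the dimension
 *   writer.addValue(3, new float[] {0.4f, 0.5f, 0.6f});
 *   VectorDocValues vectors = writer.getBufferedValues(); // random-access view over the buffers
 * </pre>
 */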
| public class VectorDocValuesWriter extends DocValuesWriter { |
| |
  /** Maximum dimension for a vector field. */
| private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH; |
| |
  // Size in bytes of each buffer block; each block holds as many whole vectors as fit (see bufferCapacity):
  private static final int BLOCK_SIZE = 8192;
| |
| private final List<ByteBuffer> buffers; |
| private final Counter iwBytesUsed; |
| private final PackedLongValues.Builder lengths; |
| private final FieldInfo fieldInfo; |
| private final int dimension; |
  private final int bufferCapacity; // how many vectors (one per document) fit in each buffer
| private final DocsWithFieldSet docsWithField; |
| |
  // We record all docids having vectors so we can get their index with a binary search.
  // nocommit Can we find a more efficient data structure for this map? We need a random access data
  // structure in order to find the vector for a given docid, so we can perform KNN graph searches
  // while adding docs. We *also* need to be able to iterate over docids with values.
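  // For example (illustrative values): if docs 2, 5 and 9 have vectors, then docBufferMap = [2, 5, 9]
  // and doc 5's vector is at index 1, i.e. at float offset (1 % bufferCapacity) * dimension
  // within buffers.get(1 / bufferCapacity).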
| private int[] docBufferMap; |
| private FloatBuffer currentBuffer; |
| |
| private long bytesUsed; |
| private int lastDocID = -1; |
| |
| public VectorDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { |
| this.fieldInfo = fieldInfo; |
| this.iwBytesUsed = iwBytesUsed; |
| dimension = getDimensionFromAttribute(fieldInfo); |
    bufferCapacity = Math.max(1, BLOCK_SIZE / (dimension * 4)); // ensure room for at least one vector, even for very large dimensions
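    // Worked example (illustrative): with dimension = 128, bufferCapacity = 8192 / (128 * 4) = 16,
    // so each ~8KB block holds 16 vectors.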
| buffers = new ArrayList<>(); |
| docBufferMap = new int[bufferCapacity]; |
| allocateNewBuffer(); |
| lengths = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT); |
| docsWithField = new DocsWithFieldSet(); |
| updateBytesUsed(); |
| } |
| |
| public static int getDimensionFromAttribute(FieldInfo fieldInfo) { |
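    // The dimension is recorded as a FieldInfo attribute at indexing time, e.g. (hypothetical call
    // site) fieldInfo.putAttribute(VectorDocValues.DIMENSION_ATTR, "128").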
| String dimStr = fieldInfo.getAttribute(VectorDocValues.DIMENSION_ATTR); |
| if (dimStr == null) { |
| // TODO: make this impossible? |
      throw new IllegalStateException("vector field '" + fieldInfo.name + "' was indexed without a dimension attribute.");
| } |
    int dim = Integer.parseInt(dimStr);
    if (dim <= 0) {
      throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" has dimension " + dim + ", but the dimension must be positive");
    }
    if (dim > MAX_LENGTH) {
      throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" has dimension " + dim + ", which exceeds the maximum: " + MAX_LENGTH);
    }
| return dim; |
| } |
| |
| public void addValue(int docID, float[] value) { |
| if (docID <= lastDocID) { |
| throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)"); |
| } |
| if (value == null) { |
| throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\": null value not allowed"); |
| } |
| if (value.length != dimension) { |
| throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" has the wrong dimension: " + value.length + ". It must match its type whose dimension is " + dimension); |
| } |
| ensureCapacity(); |
| // TODO: all the lengths are the same: can we just write out some degenerate PackedLongValues? |
| lengths.add(value.length); |
| currentBuffer.put(value); |
| updateBytesUsed(); |
| docBufferMap[docsWithField.cost()] = docID; |
| docsWithField.add(docID); |
| lastDocID = docID; |
| } |
| |
| private void ensureCapacity() { |
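    // Buffers hold an exact multiple of the vector size, so a vector either fits entirely in the
    // current buffer or the buffer is exactly full; checking hasRemaining() is sufficient.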
| if (currentBuffer.hasRemaining() == false) { |
| allocateNewBuffer(); |
| } |
| } |
| |
| private void allocateNewBuffer() { |
    buffers.add(ByteBuffer.allocate(4 * dimension * bufferCapacity)); // each buffer holds exactly bufferCapacity vectors
| currentBuffer = buffers.get(buffers.size() - 1).asFloatBuffer(); |
| docBufferMap = ArrayUtil.grow(docBufferMap, buffers.size() * bufferCapacity); |
| } |
| |
| private void updateBytesUsed() { |
| final long newBytesUsed = |
| lengths.ramBytesUsed() |
        + ((long) buffers.size() * bufferCapacity * dimension * 4) // actual bytes allocated for the vector buffers
| + docsWithField.ramBytesUsed() |
| + RamUsageEstimator.sizeOf(docBufferMap); |
| iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); |
| bytesUsed = newBytesUsed; |
| } |
| |
| @Override |
| public void finish(int maxDoc) { |
| } |
| |
| @Override |
| Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException { |
| throw new IllegalArgumentException("It is forbidden to sort on a binary field"); |
| } |
| |
| private SortingLeafReader.CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException { |
| FixedBitSet docsWithField = new FixedBitSet(maxDoc); |
| BytesRef[] values = new BytesRef[maxDoc]; |
| while (true) { |
| int docID = oldValues.nextDoc(); |
| if (docID == NO_MORE_DOCS) { |
| break; |
| } |
| int newDocID = sortMap.oldToNew(docID); |
| docsWithField.set(newDocID); |
| values[newDocID] = BytesRef.deepCopyOf(oldValues.binaryValue()); |
| } |
| return new SortingLeafReader.CachedBinaryDVs(values, docsWithField); |
| } |
| |
| @Override |
| public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException { |
    final PackedLongValues lengths = this.lengths.build(); // freeze the builder; the per-doc lengths are all equal to dimension and are not needed here
| final SortingLeafReader.CachedBinaryDVs sorted; |
| if (sortMap != null) { |
| sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, new BufferedBinaryDocValues()); |
| } else { |
| sorted = null; |
| } |
| dvConsumer.addBinaryField(fieldInfo, |
| new EmptyDocValuesProducer() { |
| @Override |
| public BinaryDocValues getBinary(FieldInfo fieldInfoIn) { |
| if (fieldInfoIn != fieldInfo) { |
| throw new IllegalArgumentException("wrong fieldInfo"); |
| } |
| if (sorted == null) { |
| return new BufferedBinaryDocValues(); |
| } else { |
| return new SortingLeafReader.SortingBinaryDocValues(sorted); |
| } |
| } |
| }); |
| } |
| |
| public VectorDocValues getBufferedValues() { |
| // nocommit avoid the extra bytes copy by writing a BufferedVectorDocValues with a FloatBuffer wrapping the byte buffers |
| return VectorDocValues.get(new BufferedBinaryDocValues(), dimension); |
| } |
| |
  // Iterates over the values as bytes in forward order. Also supports random-access positioning via
  // advanceExact, which (unlike the usual DocValuesIterator contract) may target any docid,
  // including ones before the current position.
| private class BufferedBinaryDocValues extends BinaryDocValues { |
| final private int stride; |
| final private BytesRefBuilder value; |
| |
| private int currentDocIndex; |
| private int docId; |
| |
| BufferedBinaryDocValues() { |
| value = new BytesRefBuilder(); |
| stride = dimension * 4; |
| value.grow(stride); |
| value.setLength(stride); |
| currentDocIndex = -1; |
| docId = -1; |
| } |
| |
| @Override |
| public int docID() { |
| return docId; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| if (++currentDocIndex < docsWithField.cost()) { |
| copyVector(); |
| docId = docBufferMap[currentDocIndex]; |
| } else { |
| docId = NO_MORE_DOCS; |
| } |
| return docId; |
| } |
| |
| private void copyVector() { |
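      // Repositioning the shared ByteBuffer is safe here: the FloatBuffer views used for writing
      // keep independent positions. Concurrent readers, however, would each need a duplicate().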
| ByteBuffer buffer = buffers.get(currentDocIndex / bufferCapacity); |
| buffer.position((currentDocIndex % bufferCapacity) * stride); |
| buffer.get(value.bytes(), 0, stride); |
| } |
| |
| @Override |
| public int advance(int target) { |
| int idx = Arrays.binarySearch(docBufferMap, 0, docsWithField.cost(), target); |
| if (idx < 0) { |
| idx = -1 - idx; |
| if (idx >= docBufferMap.length) { |
| currentDocIndex = NO_MORE_DOCS; |
| docId = NO_MORE_DOCS; |
| return NO_MORE_DOCS; |
| } |
| } |
| currentDocIndex = idx; |
| copyVector(); |
| docId = docBufferMap[currentDocIndex]; |
| return docId; |
| } |
| |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| int idx = Arrays.binarySearch(docBufferMap, 0, docsWithField.cost(), target); |
| if (idx < 0) { |
| return false; |
| } else { |
| currentDocIndex = idx; |
| copyVector(); |
| docId = target; |
| return true; |
| } |
| } |
| |
| @Override |
| public long cost() { |
| return docsWithField.cost(); |
| } |
| |
| @Override |
| public BytesRef binaryValue() { |
| return value.get(); |
| } |
| } |
| |
| @Override |
| DocIdSetIterator getDocIdSet() { |
| return docsWithField.iterator(); |
| } |
| } |