package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;

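/**
 * Per-field term hash used while a segment is being built in RAM.  Unique term
 * bytes are de-duplicated in a {@link BytesRefHash} backed by {@code termBytePool};
 * for each term, {@code streamCount} independent byte streams are written as linked
 * slices into {@code bytePool}, with the current write address of every stream kept
 * in {@code intPool}.  A second TermsHashPerField can be chained via
 * {@code nextPerField} (term vectors use this) and reuses the term bytes this field
 * has already interned.
 */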
final class TermsHashPerField extends InvertedDocConsumerPerField {
  private static final int HASH_INIT_SIZE = 4;

  final TermsHashConsumerPerField consumer;

  final TermsHash termsHash;

  final TermsHashPerField nextPerField;
  final DocumentsWriterPerThread.DocState docState;
  final FieldInvertState fieldState;
  TermToBytesRefAttribute termAtt;
  BytesRef termBytesRef;

  // Shared block pools, copied from our TermsHash
  final IntBlockPool intPool;
  final ByteBlockPool bytePool;
  final ByteBlockPool termBytePool;

  final int streamCount;
  final int numPostingInt;

  final FieldInfo fieldInfo;

  // Unique term bytes seen for this field; the keys live in termBytePool
  final BytesRefHash bytesHash;

  // Per-term parallel arrays (textStarts, intStarts, byteStarts), grown on demand
  ParallelPostingsArray postingsArray;
  private final Counter bytesUsed;

  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHash termsHash, final TermsHash nextTermsHash, final FieldInfo fieldInfo) {
    intPool = termsHash.intPool;
    bytePool = termsHash.bytePool;
    termBytePool = termsHash.termBytePool;
    docState = termsHash.docState;
    this.termsHash = termsHash;
    bytesUsed = termsHash.bytesUsed;
    fieldState = docInverterPerField.fieldState;
    this.consumer = termsHash.consumer.addField(this, fieldInfo);
    PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed);
    bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
    streamCount = consumer.getStreamCount();
    numPostingInt = 2*streamCount;
    this.fieldInfo = fieldInfo;
    if (nextTermsHash != null)
      nextPerField = (TermsHashPerField) nextTermsHash.addField(docInverterPerField, fieldInfo);
    else
      nextPerField = null;
  }

  void reset() {
    bytesHash.clear(false);
    if (nextPerField != null)
      nextPerField.reset();
  }

  @Override
  public void abort() {
    reset();
    if (nextPerField != null)
      nextPerField.abort();
  }

  public void initReader(ByteSliceReader reader, int termID, int stream) {
    assert stream < streamCount;
    int intStart = postingsArray.intStarts[termID];
    final int[] ints = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
    final int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
    reader.init(bytePool,
                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
                ints[upto+stream]);
  }
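
  // Note on pool addressing (used just above and in add() below): a single int packs
  // (buffer index, offset within buffer), so buffer = addr >> BLOCK_SHIFT and
  // position = addr & BLOCK_MASK.  For example, assuming IntBlockPool.INT_BLOCK_SIZE
  // is 8192 (INT_BLOCK_SHIFT == 13), a global intStart of 20000 decodes to buffer
  // 20000 >> 13 == 2 at offset 20000 & 8191 == 3616.  The per-stream start
  // byteStarts[termID] + stream*FIRST_LEVEL_SIZE works because the streamCount
  // first-level slices of a term are allocated back to back in add().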

  /** Collapse the hash table & sort in-place, returning the sorted term IDs. */
  public int[] sortPostings() {
    return bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
  }

  // Set by start(IndexableField[], int): whether the consumer (and the next field in
  // the chain) want to see the tokens of the current fields.
  private boolean doCall;
  private boolean doNextCall;

  @Override
  void start(IndexableField f) {
    termAtt = fieldState.attributeSource.getAttribute(TermToBytesRefAttribute.class);
    termBytesRef = termAtt.getBytesRef();
    consumer.start(f);
    if (nextPerField != null) {
      nextPerField.start(f);
    }
  }

  @Override
  boolean start(IndexableField[] fields, int count) throws IOException {
    doCall = consumer.start(fields, count);
    bytesHash.reinit();
    if (nextPerField != null) {
      doNextCall = nextPerField.start(fields, count);
    }
    return doCall || doNextCall;
  }

  // Secondary entry point (for 2nd & subsequent TermsHash),
  // because token text has already been "interned" into
  // textStart, so we hash by textStart.  Term vectors use
  // this API.
  public void add(int textStart) throws IOException {
    int termID = bytesHash.addByPoolOffset(textStart);
    if (termID >= 0) {      // New posting
      // First time we are seeing this token since we last
      // flushed the hash.
      // Init stream slices: make sure the int and byte pools have room for
      // this term's stream write pointers and first-level slices.
      if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
        bytePool.nextBuffer();
      }

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      // One first-level slice per stream; the int for each stream holds the
      // global byte-pool address where that stream writes next.
      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      // Already-seen term: decode the termID and restore its write pointers.
      termID = (-termID)-1;
      int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
      intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }
  }
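
  // Sketch of how the two entry points cooperate: when the secondary chain (term
  // vectors) wants to see tokens, the primary add() below copies the term bytes into
  // termBytePool once, records their pool offset in postingsArray.textStarts[termID],
  // and then calls nextPerField.add(textStart); the secondary hash therefore keys on
  // the pool offset via addByPoolOffset above and never copies the term bytes again.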

  // Primary entry point (for first TermsHash); postings use
  // this API.
  @Override
  void add() throws IOException {

    // We are first in the chain so we must "intern" the
    // term text into the byte pool; textStart is its address there.
    // Get the text & hash of this term.
    int termID;
    try {
      termID = bytesHash.add(termBytesRef, termAtt.fillBytesRef());
    } catch (MaxBytesLengthExceededException e) {
      // The term is too long to ever fit in a byte block.
      // Just skip this term, to remain as robust as
      // possible during indexing.  A TokenFilter
      // can be inserted into the analyzer chain if
      // other behavior is wanted (pruning the term
      // to a prefix, throwing an exception, etc).
      if (docState.maxTermPrefix == null) {
        // Remember a short prefix of the offending term so a meaningful
        // message can be reported later.
        final int saved = termBytesRef.length;
        try {
          termBytesRef.length = Math.min(30, DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8);
          docState.maxTermPrefix = termBytesRef.toString();
        } finally {
          termBytesRef.length = saved;
        }
      }
      consumer.skippingLongTerm();
      return;
    }
    if (termID >= 0) {      // New posting
      bytesHash.byteStart(termID);
      // Init stream slices: same layout as in add(int textStart) above.
      if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
        intPool.nextBuffer();
      }

      if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
        bytePool.nextBuffer();
      }

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      termID = (-termID)-1;
      final int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
      intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }

    if (doNextCall)
      nextPerField.add(postingsArray.textStarts[termID]);
  }
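
  // Per-term layout after add(), in terms of the global pool addresses described
  // earlier:
  //   postingsArray.intStarts[termID]  -> streamCount consecutive ints in intPool;
  //                                       the int for stream s is that stream's
  //                                       current byte-pool write address
  //   postingsArray.byteStarts[termID] -> byte-pool address of stream 0's first slice
  //   postingsArray.textStarts[termID] -> byte-pool address of the term's bytes
  //                                       (maintained by bytesHash)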

  // Write pointers for the term most recently passed to add()/add(int): intUptos is
  // the int-pool buffer holding that term's streamCount stream addresses, starting
  // at index intUptoStart.
  int[] intUptos;
  int intUptoStart;

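  // Why "bytes[offset] != 0" below means the end of a slice: newSlice/allocSlice mark
  // the last byte of every slice with a non-zero sentinel encoding the slice level,
  // while pool bytes are otherwise zero-filled.  When a write lands on the sentinel,
  // allocSlice allocates the next, larger slice (the first level is
  // ByteBlockPool.FIRST_LEVEL_SIZE bytes), moves the last few data bytes of the old
  // slice into it, leaves a forwarding address behind, and returns the offset at
  // which writing continues.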
  void writeByte(int stream, byte b) {
    int upto = intUptos[intUptoStart+stream];
    byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
    assert bytes != null;
    int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
    if (bytes[offset] != 0) {
      // End of slice; allocate a new one
      offset = bytePool.allocSlice(bytes, offset);
      bytes = bytePool.buffer;
      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
    }
    bytes[offset] = b;
    (intUptos[intUptoStart+stream])++;
  }

  public void writeBytes(int stream, byte[] b, int offset, int len) {
    // TODO: optimize
    final int end = offset + len;
    for(int i=offset;i<end;i++)
      writeByte(stream, b[i]);
  }

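  // writeVInt uses the usual Lucene variable-length int encoding: 7 data bits per
  // byte, least-significant group first, high bit set on every byte except the last.
  // Worked example: i = 200 (binary 1100_1000) first writes (200 & 0x7f) | 0x80 ==
  // 0xC8, then i >>>= 7 leaves 1, which is written as 0x01, so 200 -> [0xC8, 0x01].
  // Values below 128 take a single byte; callers are expected to pass non-negative
  // values (a negative int would take five bytes).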
  void writeVInt(int stream, int i) {
    assert stream < streamCount;
    while ((i & ~0x7F) != 0) {
      writeByte(stream, (byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeByte(stream, (byte) i);
  }

  @Override
  void finish() throws IOException {
    consumer.finish();
    if (nextPerField != null)
      nextPerField.finish();
  }

  // BytesStartArray used by bytesHash: term byte-start offsets are stored in the
  // consumer's ParallelPostingsArray, and its RAM usage is tracked via bytesUsed.
  private static final class PostingsBytesStartArray extends BytesStartArray {

    private final TermsHashPerField perField;
    private final Counter bytesUsed;

    private PostingsBytesStartArray(
        TermsHashPerField perField, Counter bytesUsed) {
      this.perField = perField;
      this.bytesUsed = bytesUsed;
    }

    @Override
    public int[] init() {
      if (perField.postingsArray == null) {
        // Lazily allocate the parallel arrays and account for their RAM usage.
        perField.postingsArray = perField.consumer.createPostingsArray(2);
        bytesUsed.addAndGet(perField.postingsArray.size * perField.postingsArray.bytesPerPosting());
      }
      return perField.postingsArray.textStarts;
    }

    @Override
    public int[] grow() {
      ParallelPostingsArray postingsArray = perField.postingsArray;
      final int oldSize = perField.postingsArray.size;
      postingsArray = perField.postingsArray = postingsArray.grow();
      // Only charge bytesUsed for the newly added entries.
      bytesUsed.addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)));
      return postingsArray.textStarts;
    }

    @Override
    public int[] clear() {
      if (perField.postingsArray != null) {
        bytesUsed.addAndGet(-(perField.postingsArray.size * perField.postingsArray.bytesPerPosting()));
        perField.postingsArray = null;
      }
      return null;
    }

    @Override
    public Counter bytesUsed() {
      return bytesUsed;
    }

  }

}