solr/core/src/java/org/apache/solr/handler/tagger/TermPrefixCursor.java - lucene-solr - Git at Google

 /*
  * This software was produced for the U. S. Government
  * under Contract No. W15P7T-11-C-F600, and is
  * subject to the Rights in Noncommercial Computer Software
  * and Noncommercial Computer Software Documentation
  * Clause 252.227-7014 (JUN 1995)
  *
  * Copyright 2013 The MITRE Corporation. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.solr.handler.tagger;

 import java.io.IOException;
 import java.util.Map;

 import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IntsRef;

 /**
  * Cursor into the terms that advances by prefix.
  *
  * <p>Each successful {@link #advance(BytesRef)} extends an internal prefix buffer by one word
  * (words are joined with a separator byte) and positions the wrapped {@link TermsEnum} on either
  * that exact term or on the next term that starts with the prefix followed by the separator.
  * A failed advance resets the cursor so that it can be re-used from scratch.
  */
 class TermPrefixCursor {

   //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup.
   // Maybe that could be added to Lucene.

   // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict?

   private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable?
   private static final IntsRef EMPTY_INTSREF = new IntsRef();

   private final TermsEnum termsEnum;
   private final Bits liveDocs;
   private final Map<BytesRef, IntsRef> docIdsCache;
   /** Owns the bytes behind {@link #prefixBuf} whenever the buffer is not on loan. */
   private final BytesRefBuilder prefixBufBuilder = new BytesRefBuilder();

   private BytesRef prefixBuf;//we append to this
   private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied
   private PostingsEnum postingsEnum;
   private IntsRef docIds;

   /**
    * @param termsEnum the terms to seek within
    * @param liveDocs live-docs filter applied while reading postings; may be null (no filtering)
    * @param docIdsCache optional cache from prefix to doc ids; may be null (no caching)
    */
   TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map<BytesRef, IntsRef> docIdsCache) {
     this.termsEnum = termsEnum;
     this.liveDocs = liveDocs;
     this.docIdsCache = docIdsCache;
   }

   /** Appends the separator char (if not the first) plus the given word to the prefix buffer,
    * then seeks to it. If the seek fails, false is returned and this cursor
    * can be re-used as if in a new state.  The {@code word} BytesRef is considered temporary,
    * and is not saved within this class. */
   boolean advance(BytesRef word) throws IOException {
     if (prefixBuf == null) { // first advance
       //Borrow 'word' for the seek. When advance() completes, we either null out or copy.
       prefixBuf = word;
       prefixBufOnLoan = true;
       if (!seekPrefix()) {
         prefixBuf = null;//just to be darned sure 'word' isn't referenced here
         return false;
       }
       //The seek succeeded, so the prefix must outlive this call: take a private copy.
       ensureBufIsACopy();
       return true;
     }

     // subsequent advance: append to the existing (already copied) prefix
     assert !prefixBufOnLoan;

     prefixBufBuilder.append(SEPARATOR_CHAR);
     prefixBufBuilder.append(word);
     prefixBuf = prefixBufBuilder.get();
     if (!seekPrefix()) {
       prefixBuf = null;//reset; the next advance() starts over as a "first advance"
       return false;
     }
     return true;
   }

   /** If prefixBuf is still the caller's loaned BytesRef, copies it into prefixBufBuilder
    * (replacing whatever the builder held) and points prefixBuf at the builder's bytes. */
   private void ensureBufIsACopy() {
     if (!prefixBufOnLoan) {
       return;
     }

     prefixBufBuilder.clear();
     prefixBufBuilder.copyBytes(prefixBuf);
     prefixBuf = prefixBufBuilder.get();
     prefixBufOnLoan = false;
   }

   /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char.
    * Sets docIds. **/
   @SuppressWarnings({"fallthrough"})
   private boolean seekPrefix() throws IOException {
     TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);

     docIds = null;//invalidate
     switch (seekStatus) {
       case END:
         return false;

       case FOUND:
         postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
         docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
         if (docIds.length > 0) {
           return true;
         }

         //Pretend we didn't find it; go to next term
         docIds = null;
         if (termsEnum.next() == null) { // case END
           return false;
         }
         //fall through to NOT_FOUND

       case NOT_FOUND:
         //termsEnum must start with prefixBuf to continue
         return startsWithPrefixAndSeparator(termsEnum.term());
     }
     throw new IllegalStateException(seekStatus.toString());
   }

   /** True if {@code term} is strictly longer than prefixBuf, begins with the bytes of prefixBuf,
    * and the byte immediately following that prefix is the separator char. */
   private boolean startsWithPrefixAndSeparator(BytesRef term) {
     if (term.length <= prefixBuf.length) {
       return false;
     }
     for (int i = 0; i < prefixBuf.length; i++) {
       if (prefixBuf.bytes[prefixBuf.offset + i] != term.bytes[term.offset + i]) {
         return false;
       }
     }
     return term.bytes[term.offset + prefixBuf.length] == SEPARATOR_CHAR;
   }

   /** Returns an IntsRef either cached or reading postingsEnum. Not null. */
   private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
     // (The cache can have empty IntsRefs)

     //lookup prefixBuf in a cache
     if (docIdsCache != null) {
       IntsRef cached = docIdsCache.get(prefixBuf);
       if (cached != null) {
         return cached;
       }
     }

     //read postingsEnum, keeping only live docs
     IntsRef ids = new IntsRef(termsEnum.docFreq());
     int docId;
     while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
       if (liveDocs == null || liveDocs.get(docId)) {
         ids.ints[ids.length++] = docId;
       }
     }
     if (ids.length == 0) {
       ids = EMPTY_INTSREF;
     }

     //cache
     if (docIdsCache != null) {
       //The key must own its bytes. A shallow clone would share the array inside prefixBufBuilder,
       // and that array is overwritten (clear() + copyBytes()) when this cursor is re-used after a
       // failed advance, which would silently corrupt a key already stored in the map.
       docIdsCache.put(BytesRef.deepCopyOf(prefixBuf), ids);
     }
     return ids;
   }

   /** The docIds of the last call to advance, if it returned true. It might be null (the seek
    * landed on a longer term that merely starts with the prefix plus separator), but
    * its length won't be 0. Treat as immutable. */
   IntsRef getDocIds() {
     assert docIds == null || docIds.length != 0;
     return docIds;
   }
 }
	/*
	* This software was produced for the U. S. Government
	* under Contract No. W15P7T-11-C-F600, and is
	* subject to the Rights in Noncommercial Computer Software
	* and Noncommercial Computer Software Documentation
	* Clause 252.227-7014 (JUN 1995)
	*
	* Copyright 2013 The MITRE Corporation. All Rights Reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.solr.handler.tagger;

	import java.io.IOException;
	import java.util.Map;

	import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
	import org.apache.lucene.index.PostingsEnum;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.util.Bits;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.BytesRefBuilder;
	import org.apache.lucene.util.IntsRef;

	/**
	* Cursor into the terms that advances by prefix.
	*/
	class TermPrefixCursor {

	//Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup.
	// Maybe that could be added to Lucene.

	// TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict?

	private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable?
	private static final IntsRef EMPTY_INTSREF = new IntsRef();

	private final TermsEnum termsEnum;
	private final Bits liveDocs;
	private final Map<BytesRef, IntsRef> docIdsCache;

	private BytesRef prefixBuf;//we append to this
	private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder();
	private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied
	private PostingsEnum postingsEnum;
	private IntsRef docIds;

	TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map<BytesRef, IntsRef> docIdsCache) {
	this.termsEnum = termsEnum;
	this.liveDocs = liveDocs;
	this.docIdsCache = docIdsCache;
	}

	/** Appends the separator char (if not the first) plus the given word to the prefix buffer,
	* then seeks to it. If the seek fails, false is returned and this cursor
	* can be re-used as if in a new state. The {@code word} BytesRef is considered temporary,
	* and is not saved within this class. */
	boolean advance(BytesRef word) throws IOException {
	if (prefixBuf == null) { // first advance
	//set prefixBuf to word temporary. When advance() completes, we either null out or copy.
	prefixBuf = word;
	prefixBufOnLoan = true;
	if (seekPrefix()) {//... and we have to
	ensureBufIsACopy();
	return true;
	} else {
	prefixBuf = null;//just to be darned sure 'word' isn't referenced here
	return false;
	}

	} else { // subsequent advance
	//append to existing
	assert !prefixBufOnLoan;

	prefixBufBuilder.append(SEPARATOR_CHAR);
	prefixBufBuilder.append(word);
	prefixBuf = prefixBufBuilder.get();
	if (seekPrefix()) {
	return true;
	} else {
	prefixBuf = null;
	return false;
	}
	}
	}

	private void ensureBufIsACopy() {
	if (!prefixBufOnLoan)
	return;

	prefixBufBuilder.clear();
	prefixBufBuilder.copyBytes(prefixBuf);
	prefixBuf = prefixBufBuilder.get();
	prefixBufOnLoan = false;
	}

	/** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char.
	* Sets docIds. **/
	@SuppressWarnings({"fallthrough"})
	private boolean seekPrefix() throws IOException {
	TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);

	docIds = null;//invalidate
	switch (seekStatus) {
	case END:
	return false;

	case FOUND:
	postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
	docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
	if (docIds.length > 0) {
	return true;
	}

	//Pretend we didn't find it; go to next term
	docIds = null;
	if (termsEnum.next() == null) { // case END
	return false;
	}
	//fall through to NOT_FOUND

	case NOT_FOUND:
	//termsEnum must start with prefixBuf to continue
	BytesRef teTerm = termsEnum.term();

	if (teTerm.length > prefixBuf.length) {
	for (int i = 0; i < prefixBuf.length; i++) {
	if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i])
	return false;
	}
	if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR)
	return false;
	return true;
	}
	return false;
	}
	throw new IllegalStateException(seekStatus.toString());
	}

	/** Returns an IntsRef either cached or reading postingsEnum. Not null. */
	private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
	// (The cache can have empty IntsRefs)

	//lookup prefixBuf in a cache
	if (docIdsCache != null) {
	docIds = docIdsCache.get(prefixBuf);
	if (docIds != null) {
	return docIds;
	}
	}

	//read postingsEnum
	docIds = new IntsRef(termsEnum.docFreq());
	int docId;
	while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
	if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
	continue;
	}
	docIds.ints[docIds.length++] = docId;
	}
	if (docIds.length == 0)
	docIds = EMPTY_INTSREF;

	//cache
	if (docIdsCache != null) {
	ensureBufIsACopy();
	//clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
	docIdsCache.put(prefixBuf.clone(), docIds);
	}
	return docIds;
	}

	/** The docIds of the last call to advance, if it returned true. It might be null, but
	* its length won't be 0. Treat as immutable. */
	IntsRef getDocIds() {
	assert docIds == null \|\| docIds.length != 0;
	return docIds;
	}
	}