| /* |
| * This software was produced for the U. S. Government |
| * under Contract No. W15P7T-11-C-F600, and is |
| * subject to the Rights in Noncommercial Computer Software |
| * and Noncommercial Computer Software Documentation |
| * Clause 252.227-7014 (JUN 1995) |
| * |
| * Copyright 2013 The MITRE Corporation. All Rights Reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.solr.handler.tagger; |
| |
| import java.io.IOException; |
| import java.util.Map; |
| |
| import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IntsRef; |
| |
| /** |
| * Cursor into the terms that advances by prefix. |
| */ |
| class TermPrefixCursor { |
| |
| //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup. |
| // Maybe that could be added to Lucene. |
| |
| // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict? |
| |
| private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable? |
| private static final IntsRef EMPTY_INTSREF = new IntsRef(); |
| |
| private final TermsEnum termsEnum; |
| private final Bits liveDocs; |
| private final Map<BytesRef, IntsRef> docIdsCache; |
| |
| private BytesRef prefixBuf;//we append to this |
| private BytesRefBuilder prefixBufBuilder = new BytesRefBuilder(); |
| private boolean prefixBufOnLoan;//if true, PB is loaned; needs to be copied |
| private PostingsEnum postingsEnum; |
| private IntsRef docIds; |
| |
| TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map<BytesRef, IntsRef> docIdsCache) { |
| this.termsEnum = termsEnum; |
| this.liveDocs = liveDocs; |
| this.docIdsCache = docIdsCache; |
| } |
| |
| /** Appends the separator char (if not the first) plus the given word to the prefix buffer, |
| * then seeks to it. If the seek fails, false is returned and this cursor |
| * can be re-used as if in a new state. The {@code word} BytesRef is considered temporary, |
| * and is not saved within this class. */ |
| boolean advance(BytesRef word) throws IOException { |
| if (prefixBuf == null) { // first advance |
| //set prefixBuf to word temporary. When advance() completes, we either null out or copy. |
| prefixBuf = word; |
| prefixBufOnLoan = true; |
| if (seekPrefix()) {//... and we have to |
| ensureBufIsACopy(); |
| return true; |
| } else { |
| prefixBuf = null;//just to be darned sure 'word' isn't referenced here |
| return false; |
| } |
| |
| } else { // subsequent advance |
| //append to existing |
| assert !prefixBufOnLoan; |
| |
| prefixBufBuilder.append(SEPARATOR_CHAR); |
| prefixBufBuilder.append(word); |
| prefixBuf = prefixBufBuilder.get(); |
| if (seekPrefix()) { |
| return true; |
| } else { |
| prefixBuf = null; |
| return false; |
| } |
| } |
| } |
| |
| private void ensureBufIsACopy() { |
| if (!prefixBufOnLoan) |
| return; |
| |
| prefixBufBuilder.clear(); |
| prefixBufBuilder.copyBytes(prefixBuf); |
| prefixBuf = prefixBufBuilder.get(); |
| prefixBufOnLoan = false; |
| } |
| |
| /** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char. |
| * Sets docIds. **/ |
| @SuppressWarnings({"fallthrough"}) |
| private boolean seekPrefix() throws IOException { |
| TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf); |
| |
| docIds = null;//invalidate |
| switch (seekStatus) { |
| case END: |
| return false; |
| |
| case FOUND: |
| postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); |
| docIds = postingsEnumToIntsRef(postingsEnum, liveDocs); |
| if (docIds.length > 0) { |
| return true; |
| } |
| |
| //Pretend we didn't find it; go to next term |
| docIds = null; |
| if (termsEnum.next() == null) { // case END |
| return false; |
| } |
| //fall through to NOT_FOUND |
| |
| case NOT_FOUND: |
| //termsEnum must start with prefixBuf to continue |
| BytesRef teTerm = termsEnum.term(); |
| |
| if (teTerm.length > prefixBuf.length) { |
| for (int i = 0; i < prefixBuf.length; i++) { |
| if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i]) |
| return false; |
| } |
| if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR) |
| return false; |
| return true; |
| } |
| return false; |
| } |
| throw new IllegalStateException(seekStatus.toString()); |
| } |
| |
| /** Returns an IntsRef either cached or reading postingsEnum. Not null. */ |
| private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException { |
| // (The cache can have empty IntsRefs) |
| |
| //lookup prefixBuf in a cache |
| if (docIdsCache != null) { |
| docIds = docIdsCache.get(prefixBuf); |
| if (docIds != null) { |
| return docIds; |
| } |
| } |
| |
| //read postingsEnum |
| docIds = new IntsRef(termsEnum.docFreq()); |
| int docId; |
| while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { |
| if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) { |
| continue; |
| } |
| docIds.ints[docIds.length++] = docId; |
| } |
| if (docIds.length == 0) |
| docIds = EMPTY_INTSREF; |
| |
| //cache |
| if (docIdsCache != null) { |
| ensureBufIsACopy(); |
| //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to |
| docIdsCache.put(prefixBuf.clone(), docIds); |
| } |
| return docIds; |
| } |
| |
| /** The docIds of the last call to advance, if it returned true. It might be null, but |
| * its length won't be 0. Treat as immutable. */ |
| IntsRef getDocIds() { |
| assert docIds == null || docIds.length != 0; |
| return docIds; |
| } |
| } |