| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.highlight; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefArray; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.CharsRefBuilder; |
| import org.apache.lucene.util.Counter; |
| import org.apache.lucene.util.UnicodeUtil; |
| |
| /** |
| * TokenStream created from a term vector field. The term vector must have positions and/or offsets. If you |
| * want payloads, add a PayloadAttribute to this TokenStream via addAttribute (as a consumer normally would), |
| * but don't assume the attribute is present merely because the term vector has payloads: the first call to |
| * incrementToken() observes which attributes the consumer asked for, and payloads won't be produced if they |
| * weren't requested. This TokenStream supports an efficient {@link #reset()}, so there's no need to wrap it |
| * in a caching implementation. |
| * <p> |
| * The implementation creates an array of tokens indexed by token position, which is fine as long as there |
| * aren't massive jumps in position. It also assumes there aren't large numbers of tokens at the same |
| * position, since each one is inserted into a sorted per-position linked-list, which is O(N^2) in the number |
| * of same-position tokens. When the term vector lacks positions, the startOffset divided by 8 serves as a |
| * substitute position. In that case, tokens sharing a startOffset occupy the same final position, while |
| * tokens with different startOffsets become adjacent. |
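| * <p> |
| * A minimal consumption sketch; {@code reader}, {@code docId}, and the {@code "body"} field name are |
| * hypothetical, and the field is assumed to have been indexed with term vectors that include positions |
| * and/or offsets: |
| * <pre>{@code |
| * Terms vector = reader.getTermVector(docId, "body"); |
| * TokenStream stream = new TokenStreamFromTermVector(vector, -1); // -1 means no start offset limit |
| * CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); |
| * PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class); // only if payloads are wanted |
| * stream.reset(); |
| * while (stream.incrementToken()) { |
| *   // consume termAtt (and payloadAtt, OffsetAttribute, ...) here |
| * } |
| * stream.end(); |
| * stream.close(); |
| * }</pre> |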
| * |
| * @lucene.internal |
| */ |
| public final class TokenStreamFromTermVector extends TokenStream { |
| |
| private final Terms vector; |
| |
| private final CharTermAttribute termAttribute; |
| |
| private final PositionIncrementAttribute positionIncrementAttribute; |
| |
| private final int maxStartOffset; |
| |
| private OffsetAttribute offsetAttribute;// may be null |
| |
| private PayloadAttribute payloadAttribute;// may be null |
| |
| private CharsRefBuilder termCharsBuilder;// term data for all tokens is accumulated here |
| |
| private BytesRefArray payloadsBytesRefArray;// only used when payloadAttribute is non-null |
| private BytesRefBuilder spareBytesRefBuilder;// only used when payloadAttribute is non-null |
| |
| private TokenLL firstToken = null; // the head of a linked-list |
| |
| private TokenLL incrementToken = null; |
| |
| private boolean initialized = false;//lazy |
| |
| /** |
| * Constructor. The uninversion doesn't happen here; it's delayed until the first call to |
| * {@link #incrementToken}. |
| * |
| * @param vector the {@link Terms} from a term vector, containing the data for |
| * creating the TokenStream. Must have positions and/or offsets. |
| * @param maxStartOffset if a token's start offset exceeds this, the token is not added; -1 disables the limit. |
| */ |
| public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException { |
| this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset; |
| assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*"; |
| if (!vector.hasPositions() && !vector.hasOffsets()) { |
| throw new IllegalArgumentException("The term vector needs positions and/or offsets."); |
| } |
| assert vector.hasFreqs(); |
| this.vector = vector; |
| termAttribute = addAttribute(CharTermAttribute.class); |
| positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); |
| } |
| |
| /** Returns the {@link Terms} term vector this TokenStream was created from. */ |
| public Terms getTermVectorTerms() { return vector; } |
| |
| @Override |
| public void reset() throws IOException { |
| incrementToken = null; |
| super.reset(); |
| } |
| |
| // We delay initialization so that we can observe which attributes the consumer wants, particularly payloads |
| private void init() throws IOException { |
| assert !initialized; |
| short dpEnumFlags = PostingsEnum.POSITIONS; |
| if (vector.hasOffsets()) { |
| dpEnumFlags |= PostingsEnum.OFFSETS; |
| offsetAttribute = addAttribute(OffsetAttribute.class); |
| } |
| if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) { |
| dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too |
| payloadAttribute = getAttribute(PayloadAttribute.class); |
| payloadsBytesRefArray = new BytesRefArray(Counter.newCounter()); |
| spareBytesRefBuilder = new BytesRefBuilder(); |
| } |
| |
| // We put term data here |
| termCharsBuilder = new CharsRefBuilder(); |
| termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len |
| |
| // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position |
| |
| TokenLL[] positionedTokens = initTokensArray(); |
| |
| int lastPosition = -1; |
| |
| final TermsEnum termsEnum = vector.iterator(); |
| BytesRef termBytesRef; |
| PostingsEnum dpEnum = null; |
| CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call |
| while ((termBytesRef = termsEnum.next()) != null) { |
| // Grab the term chars (the same way BytesRef.utf8ToString() would, but without allocating a String) |
| //  note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand |
| tempCharsRefBuilder.grow(termBytesRef.length); |
| final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars()); |
| final int termCharsOff = termCharsBuilder.length(); |
| termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen); |
| |
| dpEnum = termsEnum.postings(dpEnum, dpEnumFlags); |
| assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier |
| dpEnum.nextDoc(); |
| final int freq = dpEnum.freq(); |
| for (int j = 0; j < freq; j++) { |
| int pos = dpEnum.nextPosition(); |
| TokenLL token = new TokenLL(); |
| token.termCharsOff = termCharsOff; |
| token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE); |
| if (offsetAttribute != null) { |
| token.startOffset = dpEnum.startOffset(); |
| if (token.startOffset > maxStartOffset) { |
| continue;//filter this token out; exceeds threshold |
| } |
| token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE); |
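| // When the term vector lacks positions, nextPosition() returned -1 above; the start offset |
| //  divided by 8 serves as a substitute below. For example, startOffsets 0-7 map to position 0 |
| //  and 8-15 to position 1, so tokens sharing a startOffset stack at a single position. |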
| if (pos == -1) { |
| pos = token.startOffset >> 3;//divide by 8 |
| } |
| } |
| |
| if (payloadAttribute != null) { |
| final BytesRef payload = dpEnum.getPayload(); |
| token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload); |
| } |
| |
| //Add token to an array indexed by position |
| if (positionedTokens.length <= pos) { |
| //grow, but not 2x since we think our original length estimate is close |
| TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)]; |
| System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1); |
| positionedTokens = newPositionedTokens; |
| } |
| positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]); |
| |
| lastPosition = Math.max(lastPosition, pos); |
| } |
| } |
| |
| // Step 2: Link all Tokens into a linked-list and set position increments as we go |
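| //  For example, with positions, tokens at positions {0, 2} receive increments {1, 2}; |
| //  extra tokens stacked at the same position receive increment 0. |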
| |
| int prevTokenPos = -1; |
| TokenLL prevToken = null; |
| for (int pos = 0; pos <= lastPosition; pos++) { |
| TokenLL token = positionedTokens[pos]; |
| if (token == null) { |
| continue; |
| } |
| //link |
| if (prevToken != null) { |
| assert prevToken.next == null; |
| prevToken.next = token; //concatenate linked-list |
| } else { |
| assert firstToken == null; |
| firstToken = token; |
| } |
| //set increments |
| if (vector.hasPositions()) { |
| token.positionIncrement = pos - prevTokenPos; |
| while (token.next != null) { |
| token = token.next; |
| token.positionIncrement = 0; |
| } |
| } else { |
| token.positionIncrement = 1; |
| while (token.next != null) { |
| prevToken = token; |
| token = token.next; |
| if (prevToken.startOffset == token.startOffset) { |
| token.positionIncrement = 0; |
| } else { |
| token.positionIncrement = 1; |
| } |
| } |
| } |
| prevTokenPos = pos; |
| prevToken = token; |
| } |
| |
| initialized = true; |
| } |
| |
| private TokenLL[] initTokensArray() throws IOException { |
| // Estimate the number of position slots we need from term stats. The estimation factors were |
| //  derived from Wikipedia data and reduce the likelihood of needing to expand the array. |
| int sumTotalTermFreq = (int) vector.getSumTotalTermFreq(); |
| assert sumTotalTermFreq != -1; |
| |
| final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this |
| |
| // This estimate is based on maxStartOffset. Err on the side of this being larger than needed. |
| final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0); |
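| // For example, sumTotalTermFreq = 100 gives originalPositionEstimate = 150 and maxStartOffset = 1000 |
| //  gives offsetLimitPositionEstimate = 200; the array below is then sized to min(150, 200) = 150. |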
| |
| // Take the smaller of the two estimates, but no smaller than 64 |
| return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))]; |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (incrementToken == null) { |
| if (!initialized) { |
| init(); |
| assert initialized; |
| } |
| incrementToken = firstToken; |
| if (incrementToken == null) { |
| return false; |
| } |
| } else if (incrementToken.next != null) { |
| incrementToken = incrementToken.next; |
| } else { |
| return false; |
| } |
| clearAttributes(); |
| termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen); |
| positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement); |
| if (offsetAttribute != null) { |
| offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc); |
| } |
| if (payloadAttribute != null) { |
| if (incrementToken.payloadIndex == -1) { |
| payloadAttribute.setPayload(null); |
| } else { |
| payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex)); |
| } |
| } |
| return true; |
| } |
| |
| private static class TokenLL { |
| // This class should weigh about 32 bytes, including the object header |
| |
| int termCharsOff; // see termCharsBuilder |
| short termCharsLen; |
| |
| int positionIncrement; |
| int startOffset; |
| short endOffsetInc; // add to startOffset to get endOffset |
| int payloadIndex; |
| |
| TokenLL next; |
| |
| /** Given the head of a linked-list (possibly null), this inserts the token at the correct |
| * spot to maintain startOffset then endOffset order, and returns the head (which could be this |
| * token if it sorts first). Insertion is O(N) per token, thus O(N^2) to build a list of N tokens, |
| * but N should be a handful at most. |
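| * <p>Typical usage, as in the uninversion loop above: {@code head = token.insertIntoSortedLinkedList(head);} |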
| */ |
| TokenLL insertIntoSortedLinkedList(final TokenLL head) { |
| assert next == null; |
| if (head == null) { |
| return this; |
| } else if (this.compareOffsets(head) <= 0) { |
| this.next = head; |
| return this; |
| } |
| TokenLL prev = head; |
| while (prev.next != null && this.compareOffsets(prev.next) > 0) { |
| prev = prev.next; |
| } |
| this.next = prev.next; |
| prev.next = this; |
| return head; |
| } |
| |
| /** by startOffset then endOffset */ |
| int compareOffsets(TokenLL tokenB) { |
| int cmp = Integer.compare(this.startOffset, tokenB.startOffset); |
| if (cmp == 0) { |
| cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc); |
| } |
| return cmp; |
| } |
| } |
| } |