| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.uhighlight; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| |
| /** |
|  * Base class for {@link FieldOffsetStrategy} implementations that derive offsets by analyzing the |
|  * field text with an {@link Analyzer}. Subclasses may override {@link #tokenStream(String)} to |
|  * change how the {@link TokenStream} is built. |
|  * |
|  * @lucene.internal |
|  */ |
| public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy { |
| |
|   /** Analyzer used to tokenize the content; its offset gap for the field must be 1. */ |
|   protected final Analyzer analyzer; |
| |
|   /** |
|    * @param components highlighter components, passed on to the superclass |
|    * @param analyzer analyzer used to tokenize the content to highlight |
|    * @throws IllegalArgumentException if the analyzer's offset gap for the field is not 1 |
|    */ |
|   public AnalysisOffsetStrategy(UHComponents components, Analyzer analyzer) { |
|     super(components); |
|     this.analyzer = analyzer; |
|     // 1 is the default offset gap and is rarely changed; the multi-value handling below relies on it. |
|     final int offsetGap = analyzer.getOffsetGap(getField()); |
|     if (offsetGap != 1) { |
|       throw new IllegalArgumentException( |
|           "offset gap of the provided analyzer should be 1 (field " + getField() + ")"); |
|     } |
|   } |
| |
|   /** Always reports {@link UnifiedHighlighter.OffsetSource#ANALYSIS}. */ |
|   @Override |
|   public final UnifiedHighlighter.OffsetSource getOffsetSource() { |
|     return UnifiedHighlighter.OffsetSource.ANALYSIS; |
|   } |
| |
|   /** |
|    * Builds the {@link TokenStream} for the given content. Content that contains |
|    * {@link UnifiedHighlighter#MULTIVAL_SEP_CHAR} is treated as several values: the first value is |
|    * analyzed here and wrapped so that the remaining values are analyzed as the stream is consumed. |
|    * Content without the separator is analyzed directly. |
|    * |
|    * @param content the field text, possibly holding several separator-delimited values |
|    * @return a token stream over the whole content |
|    * @throws IOException if thrown while obtaining the analyzer's token stream |
|    */ |
|   protected TokenStream tokenStream(String content) throws IOException { |
|     final char separator = UnifiedHighlighter.MULTIVAL_SEP_CHAR; |
|     final int firstSeparatorIdx = content.indexOf(separator); |
|     if (firstSeparatorIdx != -1) { |
|       // Multiple values: start on the first one and let the wrapper advance through the rest. |
|       final TokenStream firstValueStream = |
|           analyzer.tokenStream(getField(), content.substring(0, firstSeparatorIdx)); |
|       return new MultiValueTokenStream( |
|           firstValueStream, getField(), analyzer, content, separator, firstSeparatorIdx); |
|     } |
|     // Single value: no wrapping needed. |
|     return analyzer.tokenStream(getField(), content); |
|   } |
| |
|   /** |
|    * Presents text holding several values, delimited by a given character, as one TokenStream that |
|    * matches what would be indexed when {@link Analyzer#getPositionIncrementGap(String)} is applied |
|    * between values. {@link Analyzer#getOffsetGap(String)} is assumed to be 1; the enclosing class |
|    * rejects analyzers for which it is not. |
|    * |
|    * <p>Wrapping the Analyzer in another Analyzer would be more orthogonal, but seems like more |
|    * work: the underlying components see a Reader rather than a String, whereas a String is easy to |
|    * split without redundant buffering. |
|    * |
|    * @lucene.internal |
|    */ |
|   // TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze each value into the |
|   // MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum could incorporate this logic, |
|   // albeit with less code, less hack. |
|   private static final class MultiValueTokenStream extends TokenFilter { |
| |
|     private final String field; |
|     private final Analyzer indexAnalyzer; |
|     private final String content; |
|     private final char separator; |
| |
|     private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); |
|     private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
|     /** Index in {@code content} at which the value currently being analyzed starts. */ |
|     private int valueStart = 0; |
|     /** Index in {@code content} just past the value currently being analyzed. */ |
|     private int valueEnd; |
|     /** Position increment carried over from finished values, to be added to the next token. */ |
|     private int pendingPosInc = 0; |
| |
|     /** |
|      * @param firstValueStream stream already set up by the caller over the first value |
|      * @param field field name handed to the analyzer |
|      * @param indexAnalyzer analyzer asked for the streams of the subsequent values |
|      * @param content the complete multi-value text |
|      * @param separator character that delimits the values |
|      * @param firstSeparatorIdx index of the first separator, i.e. the end of the first value |
|      */ |
|     private MultiValueTokenStream(TokenStream firstValueStream, String field, Analyzer indexAnalyzer, |
|                                   String content, char separator, int firstSeparatorIdx) { |
|       super(firstValueStream); |
|       this.field = field; |
|       this.indexAnalyzer = indexAnalyzer; |
|       this.content = content; |
|       this.separator = separator; |
|       this.valueEnd = firstSeparatorIdx; |
|     } |
| |
|     /** |
|      * Resets the wrapped stream. Only allowed while still on the first value. |
|      * |
|      * @throws IllegalStateException if this stream has already advanced past the first value |
|      */ |
|     @Override |
|     public void reset() throws IOException { |
|       if (valueStart == 0) { |
|         super.reset(); |
|         return; |
|       } |
|       // Re-use could be supported if a need for it arises. |
|       throw new IllegalStateException("This TokenStream wasn't developed to be re-used."); |
|     } |
| |
|     /** |
|      * Produces the next token, moving on to the following value whenever the current one is |
|      * exhausted. Offsets are shifted to be relative to the whole content, and the first token after |
|      * a value boundary receives the position increment accumulated at that boundary. |
|      * |
|      * @return {@code false} once the last value has no more tokens |
|      */ |
|     @Override |
|     public boolean incrementToken() throws IOException { |
|       while (!input.incrementToken()) { |
|         if (valueEnd == content.length()) { |
|           return false; // the last value is exhausted |
|         } |
|         advanceToNextValue(); |
|       } |
| |
|       // Usually non-zero only for the first token of a value other than the first. |
|       if (pendingPosInc > 0) { |
|         posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement()); |
|         pendingPosInc = 0; |
|       } |
|       shiftOffsetsByValueStart(); |
|       return true; |
|     } |
| |
|     /** |
|      * Finishes the stream of the current value, records the position increment to carry over, and |
|      * re-targets the analyzer's stream at the next separator-delimited value. |
|      */ |
|     private void advanceToNextValue() throws IOException { |
|       input.end(); // might adjust position increment |
|       pendingPosInc += posIncAtt.getPositionIncrement(); |
|       input.close(); |
|       pendingPosInc += indexAnalyzer.getPositionIncrementGap(field); |
| |
|       // Locate the next value: it runs from just after the separator to the next separator or EOF. |
|       valueStart = valueEnd + 1; |
|       final int nextSeparatorIdx = content.indexOf(separator, valueStart); |
|       valueEnd = (nextSeparatorIdx == -1) ? content.length() : nextSeparatorIdx; |
| |
|       final TokenStream nextValueStream = |
|           indexAnalyzer.tokenStream(field, content.substring(valueStart, valueEnd)); |
|       if (nextValueStream != input) { // (input is the TokenFilter field set in the constructor) |
|         // The trick here relies on the analyzer's re-use strategy handing back the very same |
|         // TokenStream instance, which therefore shares its AttributeSource with this wrapper since |
|         // it was our input at construction. Otherwise every attribute of interest would have to be |
|         // copied, because the AttributeSource of this wrapper can't be changed post-construction |
|         // (it's all private/final). If that becomes a problem we could copy instead; maybe with a |
|         // custom CharTermAttribute that lets us set the char[] reference without copying chars. |
|         throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " + |
|             indexAnalyzer.getReuseStrategy()); |
|       } |
|       nextValueStream.reset(); |
|     } |
| |
|     /** Ends the wrapped stream, then shifts the final offsets to be relative to the whole content. */ |
|     @Override |
|     public void end() throws IOException { |
|       super.end(); |
|       shiftOffsetsByValueStart(); |
|     } |
| |
|     /** Translates the current value-relative offsets into offsets within the whole content. */ |
|     private void shiftOffsetsByValueStart() { |
|       offsetAtt.setOffset( |
|           valueStart + offsetAtt.startOffset(), |
|           valueStart + offsetAtt.endOffset()); |
|     } |
| |
|   } |
| } |