blob: b0ab6edb3163b48f1965269dae40d2f6d38a7629 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;
// TODO: cut SynFilter over to this
// TODO: somehow add "nuke this input token" capability...
/**
 * An abstract TokenFilter to make it easier to build graph
 * token filters requiring some lookahead.  This class handles
 * the details of buffering up tokens, recording them by
 * position, restoring them, providing access to them, etc.
 *
 * <p>Subclasses look ahead with {@link #peekToken}, hand the next
 * output token to the consumer with {@link #nextToken}, and may
 * inject extra tokens by overriding {@link #afterPosition} and
 * calling {@link #insertToken} from it.
 *
 * @param <T> the per-position state type, created by {@link #newPosition}
 */
public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Position> extends TokenFilter {

  private static final boolean DEBUG = false;

  protected final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  protected final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /** Position of last read input token (-1 until the first token is read). */
  protected int inputPos;

  /** Position of next possible output token to return. */
  protected int outputPos;

  /** True if we hit end from our input. */
  protected boolean end;

  // True while the most recently read input token still lives only in the
  // shared attributes, i.e. it has not yet been captured into its Position:
  private boolean tokenPending;

  // True between a subclass calling insertToken() and nextToken() returning
  // that inserted token:
  private boolean insertPending;

  /**
   * Holds all state for a single position; subclass this
   * to record other state at each position.
   */
  protected static class Position implements RollingBuffer.Resettable {
    /** Buffered input tokens at this position. */
    public final List<AttributeSource.State> inputTokens = new ArrayList<>();

    /** Index into {@link #inputTokens} of the next buffered token to be returned to consumer. */
    public int nextRead;

    /** Any token leaving from this position should have this startOffset (-1 if not yet known). */
    public int startOffset = -1;

    /** Any token arriving to this position should have this endOffset (-1 if not yet known). */
    public int endOffset = -1;

    /** Restores this position to its initial, empty state. */
    @Override
    public void reset() {
      inputTokens.clear();
      nextRead = 0;
      startOffset = -1;
      endOffset = -1;
    }

    /** Appends a captured token state to the tokens buffered at this position. */
    public void add(AttributeSource.State state) {
      inputTokens.add(state);
    }

    /** Returns the next unread buffered token state and advances {@link #nextRead}. */
    public AttributeSource.State nextState() {
      assert nextRead < inputTokens.size();
      return inputTokens.get(nextRead++);
    }
  }

  /**
   * Creates a lookahead filter reading from the given stream.
   *
   * @param input the token stream to wrap
   */
  protected LookaheadTokenFilter(TokenStream input) {
    super(input);
  }

  /**
   * Call this only from within afterPosition, to insert a new
   * token.  After calling this you should set any
   * necessary token attributes you need.
   *
   * <p>Any input token that is still pending is captured into its
   * position first, before the inserted token is flagged.
   */
  protected void insertToken() throws IOException {
    bufferPendingToken();
    assert !insertPending;
    insertPending = true;
  }

  /**
   * This is called when all input tokens leaving a given
   * position have been returned.  Override this and
   * call insertToken and then set whichever token's
   * attributes you want, if you want to inject
   * a token starting from this position.
   */
  protected void afterPosition() throws IOException {
  }

  /** Creates a fresh per-position state object for the {@link #positions} buffer. */
  protected abstract T newPosition();

  /** Per-position state, indexed by token position; instances come from {@link #newPosition}. */
  protected final RollingBuffer<T> positions = new RollingBuffer<T>() {
    @Override
    protected T newInstance() {
      return newPosition();
    }
  };

  /**
   * Reads one more token from the input, advancing {@link #inputPos} by its
   * position increment and recording its start/end offsets on the positions
   * it leaves from and arrives at.  Sets {@link #end} once the input is
   * exhausted; must not be called again after that.
   *
   * @return true if there is a new token
   */
  protected boolean peekToken() throws IOException {
    if (DEBUG) {
      System.out.println("LTF.peekToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending);
    }
    assert !end;
    assert inputPos == -1 || outputPos <= inputPos;

    // The previous token must be saved before the input overwrites the attributes:
    bufferPendingToken();

    final boolean gotToken = input.incrementToken();
    if (DEBUG) {
      System.out.println("  input.incrToken() returned " + gotToken);
    }

    if (gotToken) {
      inputPos += posIncAtt.getPositionIncrement();
      assert inputPos >= 0;
      if (DEBUG) {
        System.out.println("  now inputPos=" + inputPos);
      }

      final Position startPosData = positions.get(inputPos);
      final Position endPosData = positions.get(inputPos + posLenAtt.getPositionLength());

      final int startOffset = offsetAtt.startOffset();
      if (startPosData.startOffset == -1) {
        startPosData.startOffset = startOffset;
      } else {
        // Make sure our input isn't messing up offsets:
        assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
      }

      final int endOffset = offsetAtt.endOffset();
      if (endPosData.endOffset == -1) {
        endPosData.endOffset = endOffset;
      } else {
        // Make sure our input isn't messing up offsets:
        assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
      }

      tokenPending = true;
    } else {
      end = true;
    }

    return gotToken;
  }

  /**
   * Call this when you are done looking ahead; it will set
   * the next token to return.  Return the boolean back to
   * the caller.
   *
   * @return true if the attributes now hold a token to return, false if
   *     the input is exhausted and nothing was inserted
   */
  protected boolean nextToken() throws IOException {
    if (DEBUG) {
      System.out.println("LTF.nextToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending);
    }

    Position posData = positions.get(outputPos);

    // While loop here in case we have to
    // skip over a hole from the input:
    while (true) {

      // See if we have a previously buffered token to
      // return at the current position:
      if (posData.nextRead < posData.inputTokens.size()) {
        if (DEBUG) {
          System.out.println("  return previously buffered token");
        }
        // This position has buffered tokens to serve up; save the pending
        // token first since restoreState overwrites the attributes:
        bufferPendingToken();
        restoreState(positions.get(outputPos).nextState());
        return true;
      }

      if (inputPos == -1 || outputPos == inputPos) {
        // No more buffered tokens:
        // We may still get input tokens at this position
        if (tokenPending) {
          // Fast path: just return token we had just incr'd,
          // without having captured/restored its state:
          if (DEBUG) {
            System.out.println("  pass-through: return pending token");
          }
          tokenPending = false;
          return true;
        } else if (end || !peekToken()) {
          if (DEBUG) {
            System.out.println("  END");
          }
          return finishPosition();
        }
      } else {
        if (posData.startOffset != -1) {
          // This position had at least one token leaving
          if (DEBUG) {
            System.out.println("  call afterPosition");
          }
          if (finishPosition()) {
            return true;
          }
        }

        // Done with this position; move on:
        outputPos++;
        if (DEBUG) {
          System.out.println("  next position: outputPos=" + outputPos);
        }
        positions.freeBefore(outputPos);
        posData = positions.get(outputPos);
      }
    }
  }

  /**
   * Captures the pending input token (if any) into the position it
   * was read at, so that the shared attributes may be overwritten.
   */
  private void bufferPendingToken() {
    if (tokenPending) {
      positions.get(inputPos).add(captureState());
      tokenPending = false;
    }
  }

  /**
   * Invokes {@link #afterPosition} and reports whether the subclass
   * inserted a token at this same position.
   *
   * @return true if a token was inserted and should now be returned
   */
  private boolean finishPosition() throws IOException {
    afterPosition();
    if (insertPending) {
      // Subclass inserted a token at this same
      // position:
      if (DEBUG) {
        System.out.println("  return inserted token");
      }
      assert insertedTokenConsistent();
      insertPending = false;
      return true;
    }
    return false;
  }

  // If subclass inserted a token, make sure it had in fact
  // looked ahead enough:
  private boolean insertedTokenConsistent() {
    final int posLen = posLenAtt.getPositionLength();
    final Position endPosData = positions.get(outputPos + posLen);
    assert endPosData.endOffset != -1;
    assert offsetAtt.endOffset() == endPosData.endOffset: "offsetAtt.endOffset=" + offsetAtt.endOffset() + " vs expected=" + endPosData.endOffset;
    return true;
  }

  // TODO: end()?
  // TODO: close()?

  /**
   * Resets the wrapped input and all lookahead state so the filter
   * can be used again from the beginning.
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    positions.reset();
    inputPos = -1;
    outputPos = 0;
    tokenPending = false;
    // Also drop any insert request left over from an aborted previous use
    // (e.g. afterPosition threw after calling insertToken); otherwise the
    // stale flag would trip the assert in insertToken or make nextToken
    // report a token that was never inserted:
    insertPending = false;
    end = false;
  }
}