blob: 9c1e02e8c4efd543e155f9f2668e96cbe1ff3336 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* An abstract TokenFilter that exposes its input stream as a graph
*
* Call {@link #incrementBaseToken()} to move the root of the graph to the next
* position in the TokenStream, {@link #incrementGraphToken()} to move along
* the current graph, and {@link #incrementGraph()} to reset to the next graph
* based at the current root.
*
* For example, given the stream 'a b/c:2 d e`, then with the base token at
* 'a', incrementGraphToken() will produce the stream 'a b d e', and then
* after calling incrementGraph() will produce the stream 'a c e'.
*/
public abstract class GraphTokenFilter extends TokenFilter {
private final Deque<Token> tokenPool = new ArrayDeque<>();
private final List<Token> currentGraph = new ArrayList<>();
/**
* The maximum permitted number of routes through a graph
*/
public static final int MAX_GRAPH_STACK_SIZE = 1000;
/**
* The maximum permitted read-ahead in the token stream
*/
public static final int MAX_TOKEN_CACHE_SIZE = 100;
private Token baseToken;
private int graphDepth;
private int graphPos;
private int trailingPositions = -1;
private int finalOffsets = -1;
private int stackSize;
private int cacheSize;
private final PositionIncrementAttribute posIncAtt;
private final OffsetAttribute offsetAtt;
/**
* Create a new GraphTokenFilter
*/
public GraphTokenFilter(TokenStream input) {
super(input);
this.posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
this.offsetAtt = input.addAttribute(OffsetAttribute.class);
}
/**
* Move the root of the graph to the next token in the wrapped TokenStream
*
* @return {@code false} if the underlying stream is exhausted
*/
protected final boolean incrementBaseToken() throws IOException {
stackSize = 0;
graphDepth = 0;
graphPos = 0;
Token oldBase = baseToken;
baseToken = nextTokenInStream(baseToken);
if (baseToken == null) {
return false;
}
currentGraph.clear();
currentGraph.add(baseToken);
baseToken.attSource.copyTo(this);
recycleToken(oldBase);
return true;
}
/**
* Move to the next token in the current route through the graph
*
* @return {@code false} if there are not more tokens in the current graph
*/
protected final boolean incrementGraphToken() throws IOException {
if (graphPos < graphDepth) {
graphPos++;
currentGraph.get(graphPos).attSource.copyTo(this);
return true;
}
Token token = nextTokenInGraph(currentGraph.get(graphDepth));
if (token == null) {
return false;
}
graphDepth++;
graphPos++;
currentGraph.add(graphDepth, token);
token.attSource.copyTo(this);
return true;
}
/**
* Reset to the root token again, and move down the next route through the graph
*
* @return false if there are no more routes through the graph
*/
protected final boolean incrementGraph() throws IOException {
if (baseToken == null) {
return false;
}
graphPos = 0;
for (int i = graphDepth; i >= 1; i--) {
if (lastInStack(currentGraph.get(i)) == false) {
currentGraph.set(i, nextTokenInStream(currentGraph.get(i)));
for (int j = i + 1; j < graphDepth; j++) {
currentGraph.set(j, nextTokenInGraph(currentGraph.get(j)));
}
if (stackSize++ > MAX_GRAPH_STACK_SIZE) {
throw new IllegalStateException("Too many graph paths (> " + MAX_GRAPH_STACK_SIZE + ")");
}
currentGraph.get(0).attSource.copyTo(this);
graphDepth = i;
return true;
}
}
return false;
}
/**
* Return the number of trailing positions at the end of the graph
*
* NB this should only be called after {@link #incrementGraphToken()} has returned {@code false}
*/
public int getTrailingPositions() {
return trailingPositions;
}
@Override
public void end() throws IOException {
if (trailingPositions == -1) {
input.end();
trailingPositions = posIncAtt.getPositionIncrement();
finalOffsets = offsetAtt.endOffset();
}
else {
endAttributes();
this.posIncAtt.setPositionIncrement(trailingPositions);
this.offsetAtt.setOffset(finalOffsets, finalOffsets);
}
}
@Override
public void reset() throws IOException {
input.reset();
// new attributes can be added between reset() calls, so we can't reuse
// token objects from a previous run
tokenPool.clear();
cacheSize = 0;
graphDepth = 0;
trailingPositions = -1;
finalOffsets = -1;
baseToken = null;
}
int cachedTokenCount() {
return cacheSize;
}
private Token newToken() {
if (tokenPool.size() == 0) {
cacheSize++;
if (cacheSize > MAX_TOKEN_CACHE_SIZE) {
throw new IllegalStateException("Too many cached tokens (> " + MAX_TOKEN_CACHE_SIZE + ")");
}
return new Token(this.cloneAttributes());
}
Token token = tokenPool.removeFirst();
token.reset(input);
return token;
}
private void recycleToken(Token token) {
if (token == null)
return;
token.nextToken = null;
tokenPool.add(token);
}
private Token nextTokenInGraph(Token token) throws IOException {
int remaining = token.length();
do {
token = nextTokenInStream(token);
if (token == null) {
return null;
}
remaining -= token.posInc();
} while (remaining > 0);
return token;
}
// check if the next token in the tokenstream is at the same position as this one
private boolean lastInStack(Token token) throws IOException {
Token next = nextTokenInStream(token);
return next == null || next.posInc() != 0;
}
private Token nextTokenInStream(Token token) throws IOException {
if (token != null && token.nextToken != null) {
return token.nextToken;
}
if (this.trailingPositions != -1) {
// already hit the end
return null;
}
if (input.incrementToken() == false) {
input.end();
trailingPositions = posIncAtt.getPositionIncrement();
finalOffsets = offsetAtt.endOffset();
return null;
}
if (token == null) {
return newToken();
}
token.nextToken = newToken();
return token.nextToken;
}
private static class Token {
final AttributeSource attSource;
final PositionIncrementAttribute posIncAtt;
final PositionLengthAttribute lengthAtt;
Token nextToken;
Token(AttributeSource attSource) {
this.attSource = attSource;
this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class);
this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null;
}
int posInc() {
return this.posIncAtt.getPositionIncrement();
}
int length() {
if (this.lengthAtt == null) {
return 1;
}
return this.lengthAtt.getPositionLength();
}
void reset(AttributeSource attSource) {
attSource.copyTo(this.attSource);
this.nextToken = null;
}
@Override
public String toString() {
return attSource.toString();
}
}
}