/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;

/**
* Converts an incoming graph token stream, such as one from {@link SynonymGraphFilter}, into a flat
* form so that all nodes form a single linear chain with no side paths. Every path through the
* graph touches every node. This is necessary when indexing a graph token stream, because the index
* does not save {@link PositionLengthAttribute} and so it cannot preserve the graph structure.
* However, at search time, query parsers can correctly handle the graph and this token filter
* should <b>not</b> be used.
*
* <p>If the graph was not already flat to start, this is likely a lossy process, i.e. it will often
* cause the graph to accept token sequences it should not, and to reject token sequences it should
* not.
*
* <p>However, when applying synonyms during indexing, this is necessary because Lucene already does
* not index a graph and so the indexing process is already lossy (it ignores the {@link
* PositionLengthAttribute}).
*
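 * <p>A minimal index-time usage sketch: a {@code SynonymMap} named {@code synonyms} is assumed to
 * have been built elsewhere (e.g. with {@code SynonymMap.Builder}), and the tokenizer choice is
 * illustrative only:
 *
 * <pre class="prettyprint">
 * Analyzer indexAnalyzer =
 *     new Analyzer() {
 *       &#64;Override
 *       protected TokenStreamComponents createComponents(String fieldName) {
 *         Tokenizer tokenizer = new WhitespaceTokenizer();
 *         TokenStream stream = new SynonymGraphFilter(tokenizer, synonyms, true);
 *         stream = new FlattenGraphFilter(stream);
 *         return new TokenStreamComponents(tokenizer, stream);
 *       }
 *     };
 * </pre>
 *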
* @lucene.experimental
*/
public final class FlattenGraphFilter extends TokenFilter {
/** Holds all tokens leaving a given input position. */
private static final class InputNode implements RollingBuffer.Resettable {
private final List<AttributeSource.State> tokens = new ArrayList<>();
    /** Our input node, or -1 if we haven't been assigned yet. */
int node = -1;
    /**
     * Maximum destination ("to") input node over all tokens leaving here; we use this to know
     * when this position can be frozen.
     */
int maxToNode = -1;
    /**
     * Which output node we currently map to; this can only increase as we see more input tokens,
     * until we are finished with this position.
     */
int outputNode = -1;
/** Which token (index into {@link #tokens}) we will next output. */
int nextOut;
@Override
public void reset() {
tokens.clear();
node = -1;
outputNode = -1;
maxToNode = -1;
nextOut = 0;
}
  }

  /**
* Gathers up merged input positions into a single output position, only for the current
* "frontier" of nodes we've seen but can't yet output because they are not frozen.
*/
private static final class OutputNode implements RollingBuffer.Resettable {
private final List<Integer> inputNodes = new ArrayList<>();
/** Node ID for this output, or -1 if we haven't been assigned yet. */
int node = -1;
/** Which input node (index into {@link #inputNodes}) we will next output. */
int nextOut;
/** Start offset of tokens leaving this node. */
int startOffset = -1;
/** End offset of tokens arriving to this node. */
int endOffset = -1;
@Override
public void reset() {
inputNodes.clear();
node = -1;
nextOut = 0;
startOffset = -1;
endOffset = -1;
}
}
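  /** Rolling buffer of pending input positions, indexed by input node ID. */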
private final RollingBuffer<InputNode> inputNodes =
new RollingBuffer<InputNode>() {
@Override
protected InputNode newInstance() {
return new InputNode();
}
};
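  /** Rolling buffer of not-yet-frozen output positions, indexed by output node ID. */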
private final RollingBuffer<OutputNode> outputNodes =
new RollingBuffer<OutputNode>() {
@Override
protected OutputNode newInstance() {
return new OutputNode();
}
};
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  /** Which input node the last seen token leaves from. */
  private int inputFrom;

  /** We are currently releasing tokens leaving from this output node. */
  private int outputFrom;

  // for debugging:
  // private int retOutputFrom;

  /** True once the input stream is exhausted; after that we only drain buffered tokens. */
  private boolean done;

  /** Output node of the last token we released; used to compute corrected position increments. */
  private int lastOutputFrom;

  // End state (offset, position increment) captured from the input once it is exhausted,
  // replayed from end():
  private int finalOffset;
  private int finalPosInc;

  /** High-water mark of the input-node lookahead buffer; see {@link #getMaxLookaheadUsed()}. */
  private int maxLookaheadUsed;

  /** Start offset of the last token we released; offsets must never go backwards. */
  private int lastStartOffset;
public FlattenGraphFilter(TokenStream in) {
super(in);
}
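  /**
   * Tries to emit the next buffered token, but only once its output node is frozen, i.e. no
   * future input token can still arrive at it; returns false if nothing can be released yet.
   */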
private boolean releaseBufferedToken() {
// We only need the while loop (retry) if we have a hole (an output node that has no tokens
// leaving):
while (outputFrom < outputNodes.getMaxPos()) {
OutputNode output = outputNodes.get(outputFrom);
if (output.inputNodes.isEmpty()) {
        // No tokens arrived at this node, which happens for the first node
// after a hole:
// System.out.println(" skip empty outputFrom=" + outputFrom);
outputFrom++;
continue;
}
int maxToNode = -1;
for (int inputNodeID : output.inputNodes) {
InputNode inputNode = inputNodes.get(inputNodeID);
assert inputNode.outputNode == outputFrom;
maxToNode = Math.max(maxToNode, inputNode.maxToNode);
}
// System.out.println(" release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);
// TODO: we could shrink the frontier here somewhat if we
// always output posLen=1 as part of our "sausagizing":
if (maxToNode <= inputFrom || done) {
// System.out.println(" output node merged these inputs: " + output.inputNodes);
// These tokens are now frozen
assert output.nextOut < output.inputNodes.size()
: "output.nextOut="
+ output.nextOut
+ " vs output.inputNodes.size()="
+ output.inputNodes.size();
InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
return false;
}
if (inputNode.tokens.size() == 0) {
assert inputNode.nextOut == 0;
assert output.nextOut == 0;
          // Hole dest nodes should never be merged since 1) we always
          // assign them to a new output position, and 2) they never have
          // arriving tokens, so they cannot be pushed:
assert output.inputNodes.size() == 1 : output.inputNodes.size();
outputFrom++;
inputNodes.freeBefore(output.inputNodes.get(0));
outputNodes.freeBefore(outputFrom);
continue;
}
assert inputNode.nextOut < inputNode.tokens.size();
restoreState(inputNode.tokens.get(inputNode.nextOut));
// Correct posInc
assert outputFrom >= lastOutputFrom;
posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
InputNode toInputNode = inputNodes.get(toInputNodeID);
// Correct posLen
assert toInputNode.outputNode > outputFrom;
posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
lastOutputFrom = outputFrom;
inputNode.nextOut++;
// System.out.println(" ret " + this);
OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);
// Correct offsets
        // This is a bit messy; we must do this so offsets don't go backwards,
// which would otherwise happen if the replacement has more tokens
// than the input:
int startOffset = Math.max(lastStartOffset, output.startOffset);
// We must do this in case the incoming tokens have broken offsets:
int endOffset = Math.max(startOffset, outputEndNode.endOffset);
offsetAtt.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
if (inputNode.nextOut == inputNode.tokens.size()) {
output.nextOut++;
if (output.nextOut == output.inputNodes.size()) {
outputFrom++;
inputNodes.freeBefore(output.inputNodes.get(0));
outputNodes.freeBefore(outputFrom);
}
}
return true;
} else {
return false;
}
}
// System.out.println(" break false");
return false;
}
@Override
public boolean incrementToken() throws IOException {
// System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);
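    // Alternate between draining buffered tokens whose output position is frozen, and pulling
    // (and buffering) new tokens from the input: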
while (true) {
if (releaseBufferedToken()) {
// retOutputFrom += posIncAtt.getPositionIncrement();
// System.out.println(" return buffered: " + termAtt + " " + retOutputFrom + "-" +
// (retOutputFrom + posLenAtt.getPositionLength()));
// printStates();
return true;
} else if (done) {
// System.out.println(" done, return false");
return false;
}
if (input.incrementToken()) {
// Input node this token leaves from:
inputFrom += posIncAtt.getPositionIncrement();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
// Input node this token goes to:
int inputTo = inputFrom + posLenAtt.getPositionLength();
// System.out.println(" input.inc " + termAtt + ": " + inputFrom + "-" + inputTo);
InputNode src = inputNodes.get(inputFrom);
if (src.node == -1) {
// This means the "from" node of this token was never seen as a "to" node,
// which should only happen if we just crossed a hole. This is a challenging
// case for us because we normally rely on the full dependencies expressed
// by the arcs to assign outgoing node IDs. It would be better if tokens
// were never dropped but instead just marked deleted with a new
// TermDeletedAttribute (boolean valued) ... but until that future, we have
// a hack here to forcefully jump the output node ID:
assert src.outputNode == -1;
src.node = inputFrom;
src.outputNode = outputNodes.getMaxPos() + 1;
// System.out.println(" hole: force to outputNode=" + src.outputNode);
OutputNode outSrc = outputNodes.get(src.outputNode);
// Not assigned yet:
assert outSrc.node == -1;
outSrc.node = src.outputNode;
outSrc.inputNodes.add(inputFrom);
outSrc.startOffset = startOffset;
} else {
OutputNode outSrc = outputNodes.get(src.outputNode);
if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) {
// "shrink wrap" the offsets so the original tokens (with most
// restrictive offsets) win:
outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
}
}
// Buffer this token:
src.tokens.add(captureState());
src.maxToNode = Math.max(src.maxToNode, inputTo);
maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize());
InputNode dest = inputNodes.get(inputTo);
if (dest.node == -1) {
// Common case: first time a token is arriving to this input position:
dest.node = inputTo;
}
// Always number output nodes sequentially:
int outputEndNode = src.outputNode + 1;
if (outputEndNode > dest.outputNode) {
if (dest.outputNode != -1) {
boolean removed =
outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo));
assert removed;
}
// System.out.println(" increase output node: " + dest.outputNode + " vs " +
// outputEndNode);
outputNodes.get(outputEndNode).inputNodes.add(inputTo);
dest.outputNode = outputEndNode;
// Since all we ever do is merge incoming nodes together, and then renumber
// the merged nodes sequentially, we should only ever assign smaller node
// numbers:
assert outputEndNode <= inputTo
: "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo;
}
OutputNode outDest = outputNodes.get(dest.outputNode);
// "shrink wrap" the offsets so the original tokens (with most
// restrictive offsets) win:
if (outDest.endOffset == -1 || endOffset < outDest.endOffset) {
outDest.endOffset = endOffset;
}
} else {
// System.out.println(" got false from input");
input.end();
finalPosInc = posIncAtt.getPositionIncrement();
finalOffset = offsetAtt.endOffset();
done = true;
// Don't return false here: we need to force release any buffered tokens now
}
}
}
// Only for debugging:
/*
private void printStates() {
System.out.println("states:");
for(int i=outputFrom;i<outputNodes.getMaxPos();i++) {
OutputNode outputNode = outputNodes.get(i);
System.out.println(" output " + i + ": inputs " + outputNode.inputNodes);
for(int inputNodeID : outputNode.inputNodes) {
InputNode inputNode = inputNodes.get(inputNodeID);
assert inputNode.outputNode == i;
}
}
}
*/
  @Override
  public void end() throws IOException {
    clearAttributes();
    if (done) {
      // NOTE, shady: don't call super.end, because we already called input.end() from
      // incrementToken. On exc, done is false, and we will not have set these:
      posIncAtt.setPositionIncrement(finalPosInc);
      offsetAtt.setOffset(finalOffset, finalOffset);
    } else {
      super.end();
    }
  }
@Override
public void reset() throws IOException {
// System.out.println("F: reset");
super.reset();
inputFrom = -1;
inputNodes.reset();
InputNode in = inputNodes.get(0);
in.node = 0;
in.outputNode = 0;
outputNodes.reset();
OutputNode out = outputNodes.get(0);
out.node = 0;
out.inputNodes.add(0);
out.startOffset = 0;
outputFrom = 0;
// retOutputFrom = -1;
lastOutputFrom = -1;
done = false;
finalPosInc = -1;
finalOffset = -1;
lastStartOffset = 0;
maxLookaheadUsed = 0;
}
/** For testing */
public int getMaxLookaheadUsed() {
return maxLookaheadUsed;
}
}