/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;

/**
 * Converts an incoming graph token stream, such as one from {@link SynonymGraphFilter}, into a flat
 * form so that all nodes form a single linear chain with no side paths. Every path through the
 * graph touches every node. This is necessary when indexing a graph token stream, because the index
 * does not save {@link PositionLengthAttribute} and so it cannot preserve the graph structure.
 * However, at search time, query parsers can correctly handle the graph and this token filter
 * should <b>not</b> be used.
 * <p>If the graph was not already flat to start, this is likely a lossy process, i.e. it will often
 * cause the graph to accept token sequences it should not, and to reject token sequences it should
 * accept.
 *
 * <p>However, when applying synonyms during indexing, this is necessary because Lucene already does
 * not index a graph and so the indexing process is already lossy (it ignores the {@link
 * PositionLengthAttribute}).
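 *
 * <p>For example (a sketch only: it assumes a previously built {@code SynonymMap} named {@code
 * synonyms}, and uses {@code WhitespaceTokenizer} purely for illustration), an index-time analysis
 * chain might apply this filter right after the graph producer:
 *
 * <pre class="prettyprint">
 * Tokenizer tokenizer = new WhitespaceTokenizer();
 * TokenStream stream = new SynonymGraphFilter(tokenizer, synonyms, true);
 * stream = new FlattenGraphFilter(stream); // flatten the graph before indexing
 * </pre>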
 *
 * @lucene.experimental
 */
public final class FlattenGraphFilter extends TokenFilter {

  /** Holds all tokens leaving a given input position. */
  private static final class InputNode implements RollingBuffer.Resettable {
    private final List<AttributeSource.State> tokens = new ArrayList<>();

    /** Our input node, or -1 if we haven't been assigned yet */
    int node = -1;

    /**
     * Maximum "to" input node of all tokens leaving here; we use this to know when we can freeze.
     */
    int maxToNode = -1;

    /**
     * Which output node we currently map to; this can only increase as we see more input tokens,
     * until we are finished with this position.
     */
    int outputNode = -1;

    /** Which token (index into {@link #tokens}) we will next output. */
    int nextOut;

    @Override
    public void reset() {
      tokens.clear();
      node = -1;
      outputNode = -1;
      maxToNode = -1;
      nextOut = 0;
    }
  }

  /**
   * Gathers up merged input positions into a single output position, only for the current
   * "frontier" of nodes we've seen but can't yet output because they are not frozen.
   */
  private static final class OutputNode implements RollingBuffer.Resettable {
    private final List<Integer> inputNodes = new ArrayList<>();

    /** Node ID for this output, or -1 if we haven't been assigned yet. */
    int node = -1;

    /** Which input node (index into {@link #inputNodes}) we will next output. */
    int nextOut;

    /** Start offset of tokens leaving this node. */
    int startOffset = -1;

    /** End offset of tokens arriving at this node. */
    int endOffset = -1;

    @Override
    public void reset() {
      inputNodes.clear();
      node = -1;
      nextOut = 0;
      startOffset = -1;
      endOffset = -1;
    }
  }

  private final RollingBuffer<InputNode> inputNodes =
      new RollingBuffer<InputNode>() {
        @Override
        protected InputNode newInstance() {
          return new InputNode();
        }
      };

  private final RollingBuffer<OutputNode> outputNodes =
      new RollingBuffer<OutputNode>() {
        @Override
        protected OutputNode newInstance() {
          return new OutputNode();
        }
      };

  private final PositionIncrementAttribute posIncAtt =
      addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /** Which input node the last seen token leaves from */
  private int inputFrom;

  /** We are currently releasing tokens leaving from this output node */
  private int outputFrom;

  // for debugging:
  // private int retOutputFrom;

  /** True once the input is exhausted (input.incrementToken() returned false). */
  private boolean done;

  /** Output node the last released token left from; used to compute position increments. */
  private int lastOutputFrom;

  /** Final end offset captured from input.end(), replayed by our {@link #end()}. */
  private int finalOffset;

  /** Final position increment captured from input.end(), replayed by our {@link #end()}. */
  private int finalPosInc;

  /** Maximum number of input positions buffered at once; see {@link #getMaxLookaheadUsed()}. */
  private int maxLookaheadUsed;

  /** Start offset of the last released token; used to keep offsets from going backwards. */
  private int lastStartOffset;

  public FlattenGraphFilter(TokenStream in) {
    super(in);
  }

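  /**
   * Tries to restore and emit the next buffered token whose output position is frozen, i.e. one
   * that no future input token can still arrive at, correcting its position increment, position
   * length and offsets. Returns false if no buffered token can be released yet.
   */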
  private boolean releaseBufferedToken() {

    // We only need the while loop (retry) if we have a hole (an output node that has no tokens
    // leaving):
    while (outputFrom < outputNodes.getMaxPos()) {
      OutputNode output = outputNodes.get(outputFrom);
      if (output.inputNodes.isEmpty()) {
        // No tokens arrived at this node, which happens for the first node
        // after a hole:
        // System.out.println(" skip empty outputFrom=" + outputFrom);
        outputFrom++;
        continue;
      }

      int maxToNode = -1;
      for (int inputNodeID : output.inputNodes) {
        InputNode inputNode = inputNodes.get(inputNodeID);
        assert inputNode.outputNode == outputFrom;
        maxToNode = Math.max(maxToNode, inputNode.maxToNode);
      }
      // System.out.println(" release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);

      // TODO: we could shrink the frontier here somewhat if we
      // always output posLen=1 as part of our "sausagizing":
      if (maxToNode <= inputFrom || done) {
        // System.out.println(" output node merged these inputs: " + output.inputNodes);
        // These tokens are now frozen
        assert output.nextOut < output.inputNodes.size()
            : "output.nextOut="
                + output.nextOut
                + " vs output.inputNodes.size()="
                + output.inputNodes.size();
        InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
        if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
          return false;
        }
        if (inputNode.tokens.size() == 0) {
          assert inputNode.nextOut == 0;
          assert output.nextOut == 0;
          // Hole dest nodes should never be merged, since 1) we always
          // assign them to a new output position, and 2) they never have
          // arriving tokens, so they cannot be pushed:
          assert output.inputNodes.size() == 1 : output.inputNodes.size();
          outputFrom++;
          inputNodes.freeBefore(output.inputNodes.get(0));
          outputNodes.freeBefore(outputFrom);
          continue;
        }

        assert inputNode.nextOut < inputNode.tokens.size();

        restoreState(inputNode.tokens.get(inputNode.nextOut));

        // Correct posInc
        assert outputFrom >= lastOutputFrom;
        posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
        int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
        InputNode toInputNode = inputNodes.get(toInputNodeID);

        // Correct posLen
        assert toInputNode.outputNode > outputFrom;
        posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
        lastOutputFrom = outputFrom;
        inputNode.nextOut++;
        // System.out.println(" ret " + this);

        OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);

        // Correct offsets

        // This is a bit messy; we must do this so offsets don't go backwards,
        // which would otherwise happen if the replacement has more tokens
        // than the input:
        int startOffset = Math.max(lastStartOffset, output.startOffset);

        // We must do this in case the incoming tokens have broken offsets:
        int endOffset = Math.max(startOffset, outputEndNode.endOffset);

        offsetAtt.setOffset(startOffset, endOffset);
        lastStartOffset = startOffset;

        if (inputNode.nextOut == inputNode.tokens.size()) {
          output.nextOut++;
          if (output.nextOut == output.inputNodes.size()) {
            outputFrom++;
            inputNodes.freeBefore(output.inputNodes.get(0));
            outputNodes.freeBefore(outputFrom);
          }
        }

        return true;
      } else {
        return false;
      }
    }

    // System.out.println(" break false");
    return false;
  }

  @Override
  public boolean incrementToken() throws IOException {
    // System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);

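    // First release any already-buffered (frozen) tokens; otherwise pull the next token from the
    // input and buffer it until its output position freezes: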
    while (true) {
      if (releaseBufferedToken()) {
        // retOutputFrom += posIncAtt.getPositionIncrement();
        // System.out.println(" return buffered: " + termAtt + " " + retOutputFrom + "-" +
        // (retOutputFrom + posLenAtt.getPositionLength()));
        // printStates();
        return true;
      } else if (done) {
        // System.out.println(" done, return false");
        return false;
      }

      if (input.incrementToken()) {
        // Input node this token leaves from:
        inputFrom += posIncAtt.getPositionIncrement();

        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();

        // Input node this token goes to:
        int inputTo = inputFrom + posLenAtt.getPositionLength();
        // System.out.println(" input.inc " + termAtt + ": " + inputFrom + "-" + inputTo);

        InputNode src = inputNodes.get(inputFrom);
        if (src.node == -1) {
          // This means the "from" node of this token was never seen as a "to" node,
          // which should only happen if we just crossed a hole. This is a challenging
          // case for us because we normally rely on the full dependencies expressed
          // by the arcs to assign outgoing node IDs. It would be better if tokens
          // were never dropped but instead just marked deleted with a new
          // TermDeletedAttribute (boolean valued) ... but until that future, we have
          // a hack here to forcefully jump the output node ID:
          assert src.outputNode == -1;
          src.node = inputFrom;

          src.outputNode = outputNodes.getMaxPos() + 1;
          // System.out.println(" hole: force to outputNode=" + src.outputNode);
          OutputNode outSrc = outputNodes.get(src.outputNode);

          // Not assigned yet:
          assert outSrc.node == -1;
          outSrc.node = src.outputNode;
          outSrc.inputNodes.add(inputFrom);
          outSrc.startOffset = startOffset;
        } else {
          OutputNode outSrc = outputNodes.get(src.outputNode);
          if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) {
            // "shrink wrap" the offsets so the original tokens (with most
            // restrictive offsets) win:
            outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
          }
        }

        // Buffer this token:
        src.tokens.add(captureState());
        src.maxToNode = Math.max(src.maxToNode, inputTo);
        maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize());

        InputNode dest = inputNodes.get(inputTo);
        if (dest.node == -1) {
          // Common case: first time a token is arriving at this input position:
          dest.node = inputTo;
        }

        // Always number output nodes sequentially:
        int outputEndNode = src.outputNode + 1;

        if (outputEndNode > dest.outputNode) {
          if (dest.outputNode != -1) {
            boolean removed =
                outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo));
            assert removed;
          }
          // System.out.println(" increase output node: " + dest.outputNode + " vs " +
          // outputEndNode);
          outputNodes.get(outputEndNode).inputNodes.add(inputTo);
          dest.outputNode = outputEndNode;

          // Since all we ever do is merge incoming nodes together, and then renumber
          // the merged nodes sequentially, we should only ever assign smaller node
          // numbers:
          assert outputEndNode <= inputTo
              : "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo;
        }

        OutputNode outDest = outputNodes.get(dest.outputNode);
        // "shrink wrap" the offsets so the original tokens (with most
        // restrictive offsets) win:
        if (outDest.endOffset == -1 || endOffset < outDest.endOffset) {
          outDest.endOffset = endOffset;
        }

      } else {
        // System.out.println(" got false from input");
        input.end();
        finalPosInc = posIncAtt.getPositionIncrement();
        finalOffset = offsetAtt.endOffset();
        done = true;
        // Don't return false here: we need to force release any buffered tokens now
      }
    }
  }

  // Only for debugging:
  /*
  private void printStates() {
    System.out.println("states:");
    for(int i=outputFrom;i<outputNodes.getMaxPos();i++) {
      OutputNode outputNode = outputNodes.get(i);
      System.out.println(" output " + i + ": inputs " + outputNode.inputNodes);
      for(int inputNodeID : outputNode.inputNodes) {
        InputNode inputNode = inputNodes.get(inputNodeID);
        assert inputNode.outputNode == i;
      }
    }
  }
  */

  @Override
  public void end() throws IOException {
    if (done == false) {
      super.end();
    } else {
      // NOTE, shady: don't call super.end, because we did already from incrementToken
    }

    clearAttributes();
    if (done) {
      // On exc, done is false, and we will not have set these:
      posIncAtt.setPositionIncrement(finalPosInc);
      offsetAtt.setOffset(finalOffset, finalOffset);
    } else {
      super.end();
    }
  }

  @Override
  public void reset() throws IOException {
    // System.out.println("F: reset");
    super.reset();
    inputFrom = -1;
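    // Reset both rolling buffers and pre-assign node 0 of each graph as the shared start node: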
    inputNodes.reset();
    InputNode in = inputNodes.get(0);
    in.node = 0;
    in.outputNode = 0;

    outputNodes.reset();
    OutputNode out = outputNodes.get(0);
    out.node = 0;
    out.inputNodes.add(0);
    out.startOffset = 0;
    outputFrom = 0;
    // retOutputFrom = -1;
    lastOutputFrom = -1;
    done = false;
    finalPosInc = -1;
    finalOffset = -1;
    lastStartOffset = 0;
    maxLookaheadUsed = 0;
  }

  /** For testing */
  public int getMaxLookaheadUsed() {
    return maxLookaheadUsed;
  }
}