| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.analysis.core; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.List; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.synonym.SynonymGraphFilter; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| import org.apache.lucene.util.AttributeSource; |
| import org.apache.lucene.util.RollingBuffer; |
| |
| /** |
| * Converts an incoming graph token stream, such as one from |
| * {@link SynonymGraphFilter}, into a flat form so that |
| * all nodes form a single linear chain with no side paths. Every |
| * path through the graph touches every node. This is necessary |
| * when indexing a graph token stream, because the index does not |
| * save {@link PositionLengthAttribute} and so it cannot |
| * preserve the graph structure. However, at search time, |
| * query parsers can correctly handle the graph and this token |
| * filter should <b>not</b> be used. |
| * |
| * <p>If the graph was not already flat to start, this |
| * is likely a lossy process, i.e. it will often cause the |
| * graph to accept token sequences it should not, and to |
| * reject token sequences it should not. |
| * |
| * <p>However, when applying synonyms during indexing, this |
| * is necessary because Lucene already does not index a graph |
| * and so the indexing process is already lossy |
| * (it ignores the {@link PositionLengthAttribute}). |
| * |
| * @lucene.experimental |
| */ |
| public final class FlattenGraphFilter extends TokenFilter { |
| |
| /** Holds all tokens leaving a given input position. */ |
| private final static class InputNode implements RollingBuffer.Resettable { |
| private final List<AttributeSource.State> tokens = new ArrayList<>(); |
| |
| /** Our input node, or -1 if we haven't been assigned yet */ |
| int node = -1; |
| |
| /** Maximum to input node for all tokens leaving here; we use this |
| * to know when we can freeze. */ |
| int maxToNode = -1; |
| |
| /** Minimum to input node for all tokens leaving here; we use this to check if holes exist. */ |
| int minToNode = Integer.MAX_VALUE; |
| |
| /** |
| * Where we currently map to; this changes (can only increase as we see more input tokens), |
| * until we are finished with this position. |
| */ |
| int outputNode = -1; |
| |
| /** Which token (index into {@link #tokens}) we will next output. */ |
| int nextOut; |
| |
| @Override |
| public void reset() { |
| tokens.clear(); |
| node = -1; |
| outputNode = -1; |
| maxToNode = -1; |
| minToNode = Integer.MAX_VALUE; |
| nextOut = 0; |
| } |
| } |
| |
| /** Gathers up merged input positions into a single output position, |
| * only for the current "frontier" of nodes we've seen but can't yet |
| * output because they are not frozen. */ |
  /** Gathers up merged input positions into a single output position,
   * only for the current "frontier" of nodes we've seen but can't yet
   * output because they are not frozen. */
  private final static class OutputNode implements RollingBuffer.Resettable {
    // IDs of the input positions that were merged into this single output position.
    private final List<Integer> inputNodes = new ArrayList<>();

    /** Node ID for this output, or -1 if we haven't been assigned yet. */
    int node = -1;

    /** Which input node (index into {@link #inputNodes}) we will next output. */
    int nextOut;

    /** Start offset of tokens leaving this node. */
    int startOffset = -1;

    /** End offset of tokens arriving to this node. */
    int endOffset = -1;

    @Override
    public void reset() {
      // Restore all state to the just-constructed defaults so the
      // RollingBuffer can recycle this instance for a new position:
      inputNodes.clear();
      node = -1;
      nextOut = 0;
      startOffset = -1;
      endOffset = -1;
    }
  }
| |
  // Sliding window of input positions (the "from" nodes of the incoming graph),
  // indexed by absolute input position:
  private final RollingBuffer<InputNode> inputNodes = new RollingBuffer<InputNode>() {
    @Override
    protected InputNode newInstance() {
      return new InputNode();
    }
  };

  // Sliding window of flattened output positions, indexed by absolute output position:
  private final RollingBuffer<OutputNode> outputNodes = new RollingBuffer<OutputNode>() {
    @Override
    protected OutputNode newInstance() {
      return new OutputNode();
    }
  };

  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /** Which input node the last seen token leaves from */
  private int inputFrom;

  /** We are currently releasing tokens leaving from this output node */
  private int outputFrom;

  // for debugging:
  //private int retOutputFrom;

  // Set once input.incrementToken() returns false; from then on we only drain buffered tokens.
  private boolean done;

  // Output node of the most recently released token; used to compute corrected position increments.
  private int lastOutputFrom;

  // End offset captured from input.end(), replayed by our end():
  private int finalOffset;

  // Final position increment captured from input.end(), replayed by our end():
  private int finalPosInc;

  // High-water mark of the input-node buffer size (exposed for testing):
  private int maxLookaheadUsed;

  // Start offset of the last released token; used to clamp offsets so they never go backwards:
  private int lastStartOffset;
| |
  /**
   * Creates a new FlattenGraphFilter wrapping the given graph token stream.
   *
   * @param in incoming (possibly graph) token stream to flatten
   */
  public FlattenGraphFilter(TokenStream in) {
    super(in);
  }
| |
  /**
   * Attempts to release one buffered token whose output position is frozen,
   * i.e. no future input token can still change its output node assignment.
   * On success the token's state is restored into this stream's attributes
   * (with corrected posInc, posLen and offsets) and true is returned;
   * otherwise false is returned and the caller must pull more input.
   */
  private boolean releaseBufferedToken() {

    // We only need the while loop (retry) if we have a hole (an output node that has no tokens leaving):
    while (outputFrom < outputNodes.getMaxPos()) {
      OutputNode output = outputNodes.get(outputFrom);
      if (output.inputNodes.isEmpty()) {
        // No tokens arrived to this node, which happens for the first node
        // after a hole:
        //System.out.println("  skip empty outputFrom=" + outputFrom);
        outputFrom++;
        continue;
      }

      // Find how far forward (in input positions) the tokens leaving this
      // output position reach; the position is only frozen once no pending
      // token can still merge into it:
      int maxToNode = -1;
      for(int inputNodeID : output.inputNodes) {
        InputNode inputNode = inputNodes.get(inputNodeID);
        assert inputNode.outputNode == outputFrom;
        maxToNode = Math.max(maxToNode, inputNode.maxToNode);
      }
      //System.out.println("  release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);

      // TODO: we could shrink the frontier here somewhat if we
      // always output posLen=1 as part of our "sausagizing":
      if (maxToNode <= inputFrom || done) {
        //System.out.println("  output node merged these inputs: " + output.inputNodes);
        // These tokens are now frozen
        assert output.nextOut < output.inputNodes.size(): "output.nextOut=" + output.nextOut + " vs output.inputNodes.size()=" + output.inputNodes.size();
        InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
        if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
          // Input is exhausted and nothing remains to emit:
          return false;
        }
        if (inputNode.tokens.size() == 0) {
          assert inputNode.nextOut == 0;
          // Hole dest nodes should never be merged since 1) we always
          // assign them to a new output position, and 2) since they never
          // have arriving tokens they cannot be pushed. Skip them but don't free
          // input until all are checked.
          // Related tests testAltPathLastStepLongHole, testAltPathLastStepHoleFollowedByHole,
          // testAltPathLastStepHoleWithoutEndToken
          if (output.inputNodes.size() > 1) {
            output.nextOut++;
            if (output.nextOut < output.inputNodes.size()) {
              continue;
            }
          }
          // Don't free from a hole src. Since no edge leaves here book keeping may be incorrect.
          // Later output nodes may point to earlier input nodes. So we don't want to free them yet.
          freeBefore(output);
          continue;
        }

        assert inputNode.nextOut < inputNode.tokens.size();

        // Restore the buffered token's attributes, then rewrite its
        // position increment, position length and offsets below:
        restoreState(inputNode.tokens.get(inputNode.nextOut));

        // Correct posInc
        assert outputFrom >= lastOutputFrom;
        posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
        int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
        InputNode toInputNode = inputNodes.get(toInputNodeID);

        // Correct posLen
        assert toInputNode.outputNode > outputFrom;
        posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
        lastOutputFrom = outputFrom;
        inputNode.nextOut++;
        //System.out.println("    ret " + this);

        OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);

        // Correct offsets

        // This is a bit messy; we must do this so offset don't go backwards,
        // which would otherwise happen if the replacement has more tokens
        // than the input:
        int startOffset = Math.max(lastStartOffset, output.startOffset);

        // We must do this in case the incoming tokens have broken offsets:
        int endOffset = Math.max(startOffset, outputEndNode.endOffset);

        offsetAtt.setOffset(startOffset, endOffset);
        lastStartOffset = startOffset;

        // Once this input node (and then this whole output node) is fully
        // drained, free everything behind it:
        if (inputNode.nextOut == inputNode.tokens.size()) {
          output.nextOut++;
          if (output.nextOut == output.inputNodes.size()) {
            freeBefore(output);
          }
        }

        return true;
      } else {
        // Not frozen yet; caller must pull more input first:
        return false;
      }
    }

    //System.out.println("    break false");
    return false;
  }
| |
| /** |
| * Free inputs nodes before the minimum input node for the given output. |
| * |
| * @param output target output node |
| */ |
  /**
   * Free inputs nodes before the minimum input node for the given output,
   * and advance {@link #outputFrom} past this (fully released) output node.
   *
   * @param output target output node
   */
  private void freeBefore(OutputNode output) {
    /* We've released all of the tokens that end at the current output, so free all output nodes before this.
    Input nodes are more complex. The second shingled tokens with alternate paths can appear later in the output graph
    than some of their alternate path tokens. Because of this case we can only free from the minimum because
    the minimum node will have come from before the second shingled token.
    This means we have to hold onto input nodes whose tokens get stacked on previous nodes until
    we've completely passed those inputs.
    Related tests testShingledGap, testShingledGapWithHoles
    */
    outputFrom++;
    int freeBefore = Collections.min(output.inputNodes);
    // This will catch a node being freed early if it is input to the next output.
    // Could a freed early node be input to a later output?
    assert outputNodes.get(outputFrom).inputNodes.stream().filter(n -> freeBefore > n).count() == 0
        : "FreeBefore " + freeBefore + " will free in use nodes";
    inputNodes.freeBefore(freeBefore);
    outputNodes.freeBefore(outputFrom);
  }
| |
  /**
   * Emits the next flattened token: first tries to release a frozen buffered
   * token; otherwise pulls and buffers input tokens (repairing holes and
   * renumbering output nodes as it goes) until one can be released or the
   * input is exhausted.
   */
  @Override
  public boolean incrementToken() throws IOException {
    //System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);

    while (true) {
      if (releaseBufferedToken()) {
        //retOutputFrom += posIncAtt.getPositionIncrement();
        //System.out.println("    return buffered: " + termAtt + " " + retOutputFrom + "-" + (retOutputFrom + posLenAtt.getPositionLength()));
        //printStates();
        return true;
      } else if (done) {
        //System.out.println("    done, return false");
        return false;
      }

      if (input.incrementToken()) {
        // Input node this token leaves from:
        int positionIncrement = posIncAtt.getPositionIncrement();
        inputFrom += positionIncrement;

        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();

        // Input node this token goes to:
        int inputTo = inputFrom + posLenAtt.getPositionLength();
        //System.out.println("  input.inc " + termAtt + ": " + inputFrom + "-" + inputTo);

        InputNode src = inputNodes.get(inputFrom);
        if (src.node == -1) {
          // First token leaving this input position; the position was never a
          // "to" node of an earlier token, so we just crossed a hole:
          recoverFromHole(src, startOffset, positionIncrement);

        } else {
          OutputNode outSrc = outputNodes.get(src.outputNode);
          /* If positionIncrement > 1 and the position we're incrementing from doesn't come to the current node we've crossed a hole.
           * The long edge will point too far back and not account for the holes unless it gets fixed.
           * example:
           *      _____abc______
           *     |              |
           *     |              V
           * O-a->O-   ->O-   ->O-d->O
           *
           * A long edge may have already made this fix though, if src is more than 1 position ahead in the output there's no additional work to do
           * example:
           *      _____abc______
           *     | ....bc....   |
           *     | .        V   V
           * O-a->O-   ->O-   ->O-d->O
           */
          if (positionIncrement > 1
              && src.outputNode - inputNodes.get(inputFrom - positionIncrement).outputNode <= 1
              && inputNodes.get(inputFrom - positionIncrement).minToNode != inputFrom) {
            /* If there was a hole at the end of an alternate path then the input and output nodes
             * have been created,
             * but the offsets and increments have not been maintained correctly. Here we go back
             * and fix them.
             * Related test testAltPathLastStepHole
             * The last node in the alt path didn't arrive to remove this reference.
             */
            assert inputNodes.get(inputFrom).tokens.isEmpty() : "about to remove non empty edge";
            outSrc.inputNodes.remove(Integer.valueOf(inputFrom));
            src.outputNode = -1;
            int prevEndOffset = outSrc.endOffset;

            outSrc = recoverFromHole(src, startOffset, positionIncrement);
            outSrc.endOffset = prevEndOffset;
          }

          if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) {
            // "shrink wrap" the offsets so the original tokens (with most
            // restrictive offsets) win:
            outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
          }
        }

        // Buffer this token:
        src.tokens.add(captureState());
        src.maxToNode = Math.max(src.maxToNode, inputTo);
        src.minToNode = Math.min(src.minToNode, inputTo);
        maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize());

        InputNode dest = inputNodes.get(inputTo);
        if (dest.node == -1) {
          // Common case: first time a token is arriving to this input position:
          dest.node = inputTo;
        }

        // Always number output nodes sequentially:
        int outputEndNode = src.outputNode + 1;

        if (outputEndNode > dest.outputNode) {
          if (dest.outputNode != -1) {
            // The dest was previously mapped to an earlier output node; move it
            // forward, unlinking it from the old output node first:
            boolean removed = outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo));
            assert removed;
          }
          //System.out.println("    increase output node: " + dest.outputNode + " vs " + outputEndNode);
          outputNodes.get(outputEndNode).inputNodes.add(inputTo);
          dest.outputNode = outputEndNode;

          // Since all we ever do is merge incoming nodes together, and then renumber
          // the merged nodes sequentially, we should only ever assign smaller node
          // numbers:
          assert outputEndNode <= inputTo: "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo;
        }

        OutputNode outDest = outputNodes.get(dest.outputNode);
        // "shrink wrap" the offsets so the original tokens (with most
        // restrictive offsets) win:
        if (outDest.endOffset == -1 || endOffset < outDest.endOffset) {
          outDest.endOffset = endOffset;
        }

      } else {
        //System.out.println("  got false from input");
        input.end();
        finalPosInc = posIncAtt.getPositionIncrement();
        finalOffset = offsetAtt.endOffset();
        done = true;
        // Don't return false here: we need to force release any buffered tokens now
      }
    }
  }
| |
  /**
   * Assigns input and output nodes for a token whose "from" position was never
   * seen as a "to" position, i.e. the token follows a hole (deleted tokens).
   *
   * @param src         input node the token leaves from (not yet assigned)
   * @param startOffset start offset of the token that crossed the hole
   * @param posinc      the token's position increment (used to find the node before the hole)
   * @return the output node the token was attached to
   */
  private OutputNode recoverFromHole(InputNode src, int startOffset, int posinc) {
    // This means the "from" node of this token was never seen as a "to" node,
    // which should only happen if we just crossed a hole. This is a challenging
    // case for us because we normally rely on the full dependencies expressed
    // by the arcs to assign outgoing node IDs. It would be better if tokens
    // were never dropped but instead just marked deleted with a new
    // TermDeletedAttribute (boolean valued) ... but until that future, we have
    // a hack here to forcefully jump the output node ID:
    assert src.outputNode == -1;
    src.node = inputFrom;

    int outIndex;
    int previousInputFrom = inputFrom - posinc;
    if (previousInputFrom >= 0) {
      InputNode offsetSrc = inputNodes.get(previousInputFrom);
      /* Select output src node. Need to make sure the new output node isn't placed too far ahead.
       * If a disconnected node is placed at the end of the output graph that may place it after output nodes that map to input nodes that are after src in the input.
       * Since it is disconnected there is no path to it, and there could be holes after meaning no paths to following nodes. This "floating" edge will cause problems in FreeBefore.
       * In the following section make sure the edge connects to something.
       * Related test testLongHole testAltPathLastStepHoleFollowedByHole, testAltPathFirstStepHole, testShingledGapWithHoles
       */
      if (offsetSrc.minToNode < inputFrom) {
        // There is a possible path to this node.
        // place this node one position off from the possible path keeping a 1 inc gap.
        // Can't be larger than 1 inc or risk getting disconnected.
        outIndex = inputNodes.get(offsetSrc.minToNode).outputNode + 1;
      } else {
        // no information about how the current node was previously connected.
        // Connect it to the end.
        outIndex = outputNodes.getMaxPos();
      }
    } else {
      // in case the first token in the stream is a hole we have no input node to increment from.
      outIndex = outputNodes.getMaxPos() + 1;
    }
    OutputNode outSrc = outputNodes.get(outIndex);
    src.outputNode = outIndex;

    // OutSrc may have other inputs
    if (outSrc.node == -1) {
      // Fresh output node: adopt this token's start offset:
      outSrc.node = src.outputNode;
      outSrc.startOffset = startOffset;
    } else {
      // Existing output node: keep the most restrictive (largest) start offset:
      outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
    }
    outSrc.inputNodes.add(inputFrom);
    return outSrc;
  }
| |
| // Only for debugging: |
| /* |
| private void printStates() { |
| System.out.println("states:"); |
| for(int i=outputFrom;i<outputNodes.getMaxPos();i++) { |
| OutputNode outputNode = outputNodes.get(i); |
| System.out.println(" output " + i + ": inputs " + outputNode.inputNodes); |
| for(int inputNodeID : outputNode.inputNodes) { |
| InputNode inputNode = inputNodes.get(inputNodeID); |
| assert inputNode.outputNode == i; |
| } |
| } |
| } |
| */ |
| |
| @Override |
| public void end() throws IOException { |
| if (done == false) { |
| super.end(); |
| } else { |
| // NOTE, shady: don't call super.end, because we did already from incrementToken |
| } |
| |
| clearAttributes(); |
| if (done) { |
| // On exc, done is false, and we will not have set these: |
| posIncAtt.setPositionIncrement(finalPosInc); |
| offsetAtt.setOffset(finalOffset, finalOffset); |
| } else { |
| super.end(); |
| } |
| } |
| |
  /**
   * Resets all bookkeeping so the filter can consume a new token stream;
   * input/output node 0 are pre-created and linked so the first token has
   * a node to leave from.
   */
  @Override
  public void reset() throws IOException {
    //System.out.println("F: reset");
    super.reset();
    // -1 so the first token's posInc lands it on input node >= 0:
    inputFrom = -1;
    inputNodes.reset();
    InputNode in = inputNodes.get(0);
    in.node = 0;
    in.outputNode = 0;

    outputNodes.reset();
    OutputNode out = outputNodes.get(0);
    out.node = 0;
    out.inputNodes.add(0);
    out.startOffset = 0;
    outputFrom = 0;
    //retOutputFrom = -1;
    lastOutputFrom = -1;
    done = false;
    finalPosInc = -1;
    finalOffset = -1;
    lastStartOffset = 0;
    maxLookaheadUsed = 0;
  }
| |
  /** For testing: returns the maximum number of input positions ever buffered at once. */
  public int getMaxLookaheadUsed() {
    return maxLookaheadUsed;
  }
| } |