blob: ef1bbd20bea5e3cb56913e8605fb19c0196e4f7b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
/** Converts an Automaton into a TokenStream. */
public class AutomatonToTokenStream {
private AutomatonToTokenStream() {}
/**
* converts an automaton into a TokenStream. This is done by first Topo sorting the nodes in the
* Automaton. Nodes that have the same distance from the start are grouped together to form the
* position nodes for the TokenStream. The resulting TokenStream releases edges from the automaton
* as tokens in order from the position nodes. This requires the automaton be a finite DAG.
*
* @param automaton automaton to convert. Must be a finite DAG.
* @return TokenStream representation of automaton.
*/
public static TokenStream toTokenStream(Automaton automaton) {
if (Operations.isFinite(automaton) == false) {
throw new IllegalArgumentException("Automaton must be finite");
}
List<List<Integer>> positionNodes = new ArrayList<>();
Transition[][] transitions = automaton.getSortedTransitions();
int[] indegree = new int[transitions.length];
for (int i = 0; i < transitions.length; i++) {
for (int edge = 0; edge < transitions[i].length; edge++) {
indegree[transitions[i][edge].dest] += 1;
}
}
if (indegree[0] != 0) {
throw new IllegalArgumentException("Start node has incoming edges, creating cycle");
}
LinkedList<RemapNode> noIncomingEdges = new LinkedList<>();
Map<Integer, Integer> idToPos = new HashMap<>();
noIncomingEdges.addLast(new RemapNode(0, 0));
while (noIncomingEdges.isEmpty() == false) {
RemapNode currState = noIncomingEdges.removeFirst();
for (int i = 0; i < transitions[currState.id].length; i++) {
indegree[transitions[currState.id][i].dest] -= 1;
if (indegree[transitions[currState.id][i].dest] == 0) {
noIncomingEdges.addLast(
new RemapNode(transitions[currState.id][i].dest, currState.pos + 1));
}
}
if (positionNodes.size() == currState.pos) {
List<Integer> posIncs = new ArrayList<>();
posIncs.add(currState.id);
positionNodes.add(posIncs);
} else {
positionNodes.get(currState.pos).add(currState.id);
}
idToPos.put(currState.id, currState.pos);
}
for (int i = 0; i < indegree.length; i++) {
if (indegree[i] != 0) {
throw new IllegalArgumentException("Cycle found in automaton");
}
}
List<List<EdgeToken>> edgesByLayer = new ArrayList<>();
for (List<Integer> layer : positionNodes) {
List<EdgeToken> edges = new ArrayList<>();
for (int state : layer) {
for (Transition t : transitions[state]) {
// each edge in the token stream can only be on value, though a transition takes a range.
for (int val = t.min; val <= t.max; val++) {
int destLayer = idToPos.get(t.dest);
edges.add(new EdgeToken(destLayer, val));
// If there's an intermediate accept state, add an edge to the terminal state.
if (automaton.isAccept(t.dest) && destLayer != positionNodes.size() - 1) {
edges.add(new EdgeToken(positionNodes.size() - 1, val));
}
}
}
}
edgesByLayer.add(edges);
}
return new TopoTokenStream(edgesByLayer);
}
/** Token Stream that outputs tokens from a topo sorted graph. */
private static class TopoTokenStream extends TokenStream {
private final List<List<EdgeToken>> edgesByPos;
private int currentPos;
private int currentEdgeIndex;
private CharTermAttribute charAttr = addAttribute(CharTermAttribute.class);
private PositionIncrementAttribute incAttr = addAttribute(PositionIncrementAttribute.class);
private PositionLengthAttribute lenAttr = addAttribute(PositionLengthAttribute.class);
private OffsetAttribute offAttr = addAttribute(OffsetAttribute.class);
public TopoTokenStream(List<List<EdgeToken>> edgesByPos) {
this.edgesByPos = edgesByPos;
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
while (currentPos < edgesByPos.size()
&& currentEdgeIndex == edgesByPos.get(currentPos).size()) {
currentEdgeIndex = 0;
currentPos += 1;
}
if (currentPos == edgesByPos.size()) {
return false;
}
EdgeToken currentEdge = edgesByPos.get(currentPos).get(currentEdgeIndex);
charAttr.append((char) currentEdge.value);
incAttr.setPositionIncrement(currentEdgeIndex == 0 ? 1 : 0);
lenAttr.setPositionLength(currentEdge.destination - currentPos);
offAttr.setOffset(currentPos, currentEdge.destination);
currentEdgeIndex++;
return true;
}
@Override
public void reset() throws IOException {
super.reset();
clearAttributes();
currentPos = 0;
currentEdgeIndex = 0;
}
@Override
public void end() throws IOException {
clearAttributes();
incAttr.setPositionIncrement(0);
// -1 because we don't count the terminal state as a position in the TokenStream
offAttr.setOffset(edgesByPos.size() - 1, edgesByPos.size() - 1);
}
}
/** Edge between position nodes. These edges will be output as tokens in the TokenStream */
private static class EdgeToken {
public final int destination;
public final int value;
public EdgeToken(int destination, int value) {
this.destination = destination;
this.value = value;
}
}
/** Node that contains original node id and position in TokenStream */
private static class RemapNode {
public final int id;
public final int pos;
public RemapNode(int id, int pos) {
this.id = id;
this.pos = pos;
}
}
}