/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.graph;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
/**
* Consumes a TokenStream and creates an {@link Automaton} where the transition labels are terms from
* the {@link TermToBytesRefAttribute}.
* This class also provides helpers to explore the different paths of the {@link Automaton}.
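* <p>A minimal usage sketch ({@code analyzer} and the field/text literals below are only
* illustrative assumptions, not part of this class):
* <pre>{@code
* try (TokenStream in = analyzer.tokenStream("field", "some text")) {
*   GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(in);
*   Iterator<TokenStream> paths = graph.getFiniteStrings();
*   while (paths.hasNext()) {
*     TokenStream path = paths.next();
*     // each path is a linear TokenStream over one route through the token graph
*   }
* }
* }</pre>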
*/
public final class GraphTokenStreamFiniteStrings {
// clones of each token's attribute state; the array index is the token id used as a transition label
private AttributeSource[] tokens = new AttributeSource[4];
// determinized token-graph automaton with dead states removed
private final Automaton det;
// scratch transition reused when iterating the automaton's transitions
private final Transition transition = new Transition();
private class FiniteStringsTokenStream extends TokenStream {
private final IntsRef ids;
private final int end;
private int offset;
FiniteStringsTokenStream(final IntsRef ids) {
super(tokens[0].cloneAttributes());
assert ids != null;
this.ids = ids;
this.offset = ids.offset;
this.end = ids.offset + ids.length;
}
@Override
public boolean incrementToken() throws IOException {
if (offset < end) {
clearAttributes();
int id = ids.ints[offset];
tokens[id].copyTo(this);
offset++;
return true;
}
return false;
}
}
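/**
* Creates the graph from the provided {@link TokenStream}: the stream is consumed once and the
* resulting automaton is determinized, with dead states removed.
*/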
public GraphTokenStreamFiniteStrings(TokenStream in) throws IOException {
Automaton aut = build(in);
this.det = Operations.removeDeadStates(Operations.determinize(aut, DEFAULT_DETERMINIZE_WORK_LIMIT));
}
/**
* Returns whether the provided state is the start of multiple side paths of different lengths (e.g. new york, ny)
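* <p>A small sketch (assuming {@code graph} is an already built instance and {@code state} is a
* path start state, typically state 0 or an articulation point):
* <pre>{@code
* if (graph.hasSidePath(state)) {
*   // stacked tokens of different lengths start here, e.g. "ny" stacked over "new york"
* }
* }</pre>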
*/
public boolean hasSidePath(int state) {
int numT = det.initTransition(state, transition);
if (numT <= 1) {
return false;
}
det.getNextTransition(transition);
int dest = transition.dest;
for (int i = 1; i < numT; i++) {
det.getNextTransition(transition);
if (dest != transition.dest) {
return true;
}
}
return false;
}
/**
* Returns the list of tokens that start at the provided state
*/
public List<AttributeSource> getTerms(int state) {
int numT = det.initTransition(state, transition);
List<AttributeSource> tokens = new ArrayList<>();
for (int i = 0; i < numT; i++) {
det.getNextTransition(transition);
tokens.addAll(Arrays.asList(this.tokens).subList(transition.min, transition.max + 1));
}
return tokens;
}
/**
* Returns the list of terms that start at the provided state
*/
public Term[] getTerms(String field, int state) {
return getTerms(state).stream()
.map(s -> new Term(field, s.addAttribute(TermToBytesRefAttribute.class).getBytesRef()))
.toArray(Term[]::new);
}
/**
* Get all finite strings from the automaton.
*/
public Iterator<TokenStream> getFiniteStrings() throws IOException {
return getFiniteStrings(0, -1);
}
/**
* Get all finite strings that start at {@code startState} and end at {@code endState}.
*/
public Iterator<TokenStream> getFiniteStrings(int startState, int endState) {
final FiniteStringsIterator it = new FiniteStringsIterator(det, startState, endState);
return new Iterator<TokenStream>() {
IntsRef current;
boolean finished = false;
@Override
public boolean hasNext() {
if (finished == false && current == null) {
current = it.next();
if (current == null) {
finished = true;
}
}
return current != null;
}
@Override
public TokenStream next() {
if (current == null) {
hasNext();
}
TokenStream next = new FiniteStringsTokenStream(current);
current = null;
return next;
}
};
}
/**
* Returns the articulation points (or cut vertices) of the graph:
* https://en.wikipedia.org/wiki/Biconnected_component
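* <p>One typical way to use these points (a sketch, not the only pattern; {@code graph} is an
* already built instance and {@code "field"} is an assumed field name) is to walk the graph
* segment by segment:
* <pre>{@code
* int[] points = graph.articulationPoints();
* int start = 0;
* for (int i = 0; i <= points.length; i++) {
*   int end = i < points.length ? points[i] : -1; // -1 means "up to any accept state"
*   if (graph.hasSidePath(start)) {
*     Iterator<TokenStream> paths = graph.getFiniteStrings(start, end);
*     // enumerate every side path of this segment
*   } else {
*     Term[] terms = graph.getTerms("field", start);
*     // no side path: the segment is a single position and the terms are plain alternatives
*   }
*   start = end;
* }
* }</pre>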
*/
public int[] articulationPoints() {
if (det.getNumStates() == 0) {
return new int[0];
}
// build an undirected view of the automaton by adding the reverse of every transition
Automaton.Builder undirect = new Automaton.Builder();
undirect.copy(det);
for (int i = 0; i < det.getNumStates(); i++) {
int numT = det.initTransition(i, transition);
for (int j = 0; j < numT; j++) {
det.getNextTransition(transition);
undirect.addTransition(transition.dest, i, transition.min);
}
}
int numStates = det.getNumStates();
BitSet visited = new BitSet(numStates);
int[] depth = new int[det.getNumStates()];
int[] low = new int[det.getNumStates()];
int[] parent = new int[det.getNumStates()];
Arrays.fill(parent, -1);
List<Integer> points = new ArrayList<>();
articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points);
Collections.reverse(points);
return points.stream().mapToInt(p -> p).toArray();
}
/**
* Build an automaton from the provided {@link TokenStream}.
*/
private Automaton build(final TokenStream in) throws IOException {
Automaton.Builder builder = new Automaton.Builder();
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
in.reset();
int pos = -1;
int prevIncr = 1;
int state = -1;
int id = -1;
int gap = 0;
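// pos is the current graph position (positions map one-to-one to automaton states), gap counts
// positions skipped by increments greater than one, and id is the next slot in tokens[]
// (the slot index doubles as the transition label for that token)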
while (in.incrementToken()) {
int currentIncr = posIncAtt.getPositionIncrement();
if (pos == -1 && currentIncr < 1) {
throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
}
if (currentIncr == 0) {
if (gap > 0) {
pos -= gap;
}
} else {
pos++;
gap = currentIncr - 1;
}
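// this token's transition ends at the state for (start position + position length), plus any
// gap left over from a position increment greater than one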
int endPos = pos + posLengthAtt.getPositionLength() + gap;
while (state < endPos) {
state = builder.createState();
}
id++;
if (tokens.length < id + 1) {
tokens = ArrayUtil.grow(tokens, id + 1);
}
tokens[id] = in.cloneAttributes();
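// each token becomes a single transition labeled with its id, from its start state to its end state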
builder.addTransition(pos, endPos, id);
pos += gap;
// we always produce linear token graphs from getFiniteStrings(), so we need to adjust
// posLength and posIncrement accordingly
tokens[id].addAttribute(PositionLengthAttribute.class).setPositionLength(1);
if (currentIncr == 0) {
// stacked token should have the same increment as original token at this position
tokens[id].addAttribute(PositionIncrementAttribute.class).setPositionIncrement(prevIncr);
}
// only save last increment on non-zero increment in case we have multiple stacked tokens
if (currentIncr > 0) {
prevIncr = currentIncr;
}
}
in.end();
if (state != -1) {
builder.setAccept(state, true);
}
return builder.finish();
}
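/**
* Classic depth-first search for articulation points on the undirected view of the automaton.
* {@code low[state]} is the smallest depth reachable from the DFS subtree of {@code state} via
* at most one back edge; a non-root state is a cut vertex if some child's subtree cannot reach
* strictly above it, and the root is one if it has more than one DFS child.
*/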
private static void articulationPointsRecurse(Automaton a, int state, int d, int[] depth, int[] low, int[] parent,
BitSet visited, List<Integer> points) {
visited.set(state);
depth[state] = d;
low[state] = d;
int childCount = 0;
boolean isArticulation = false;
Transition t = new Transition();
int numT = a.initTransition(state, t);
for (int i = 0; i < numT; i++) {
a.getNextTransition(t);
if (visited.get(t.dest) == false) {
parent[t.dest] = state;
articulationPointsRecurse(a, t.dest, d + 1, depth, low, parent, visited, points);
childCount++;
if (low[t.dest] >= depth[state]) {
isArticulation = true;
}
low[state] = Math.min(low[state], low[t.dest]);
} else if (t.dest != parent[state]) {
low[state] = Math.min(low[state], depth[t.dest]);
}
}
if ((parent[state] != -1 && isArticulation) || (parent[state] == -1 && childCount > 1)) {
points.add(state);
}
}
}