/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.graph;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
/**
* Consumes a TokenStream and creates an {@link Automaton} where the transition labels are terms from
* the {@link TermToBytesRefAttribute}.
* This class also provides helpers to explore the different paths of the {@link Automaton}.
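* <p>A minimal usage sketch ({@code analyzer} and the field/text literals below are only
* illustrative assumptions, not part of this class):
* <pre>{@code
* try (TokenStream in = analyzer.tokenStream("field", "some text")) {
*   GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(in);
*   Iterator<TokenStream> paths = graph.getFiniteStrings();
*   while (paths.hasNext()) {
*     TokenStream path = paths.next();
*     // each path is a linear TokenStream over one route through the token graph
*   }
* }
* }</pre>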
*/
public final class GraphTokenStreamFiniteStrings {
// clones of each token's attribute state; the array index is the token id used as a transition label
private AttributeSource[] tokens = new AttributeSource[4];
// determinized token-graph automaton with dead states removed
private final Automaton det;
// scratch transition reused when iterating the automaton's transitions
private final Transition transition = new Transition();
private class FiniteStringsTokenStream extends TokenStream {
private final IntsRef ids;
private final int end;
private int offset;
FiniteStringsTokenStream(final IntsRef ids) {
super(tokens[0].cloneAttributes());
assert ids != null;
this.ids = ids;
this.offset = ids.offset;
this.end = ids.offset + ids.length;
}
@Override
public boolean incrementToken() throws IOException {
if (offset < end) {
clearAttributes();
int id = ids.ints[offset];
tokens[id].copyTo(this);
offset++;
return true;
}
return false;
}
}
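/**
* Creates the graph from the provided {@link TokenStream}: the stream is consumed once and the
* resulting automaton is determinized, with dead states removed.
*/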
public GraphTokenStreamFiniteStrings(TokenStream in) throws IOException {
Automaton aut = build(in);
this.det = Operations.removeDeadStates(Operations.determinize(aut, DEFAULT_DETERMINIZE_WORK_LIMIT));
}
/**
* Returns whether the provided state is the start of multiple side paths of different lengths (e.g. new york, ny)
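* <p>A small sketch (assuming {@code graph} is an already built instance and {@code state} is a
* path start state, typically state 0 or an articulation point):
* <pre>{@code
* if (graph.hasSidePath(state)) {
*   // stacked tokens of different lengths start here, e.g. "ny" stacked over "new york"
* }
* }</pre>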
*/
public boolean hasSidePath(int state) {
int numT = det.initTransition(state, transition);
if (numT <= 1) {
return false;
}
det.getNextTransition(transition);
int dest = transition.dest;
for (int i = 1; i < numT; i++) {
det.getNextTransition(transition);
if (dest != transition.dest) {
return true;
}
}
return false;
}
/**
* Returns the list of tokens that start at the provided state
*/
public List<AttributeSource> getTerms(int state) {
int numT = det.initTransition(state, transition);
List<AttributeSource> tokens = new ArrayList<>();
for (int i = 0; i < numT; i++) {
det.getNextTransition(transition);
tokens.addAll(Arrays.asList(this.tokens).subList(transition.min, transition.max + 1));
}
return tokens;
}
/**
* Returns the list of terms that start at the provided state
*/
public Term[] getTerms(String field, int state) {
return getTerms(state).stream()
.map(s -> new Term(field, s.addAttribute(TermToBytesRefAttribute.class).getBytesRef()))
.toArray(Term[]::new);
}
/**
* Get all finite strings from the automaton.
*/
public Iterator<TokenStream> getFiniteStrings() throws IOException {
return getFiniteStrings(0, -1);
}
/**
* Get all finite strings that start at {@code startState} and end at {@code endState}.
*/
public Iterator<TokenStream> getFiniteStrings(int startState, int endState) {
final FiniteStringsIterator it = new FiniteStringsIterator(det, startState, endState);
return new Iterator<TokenStream>() {
IntsRef current;
boolean finished = false;
@Override
public boolean hasNext() {
if (finished == false && current == null) {
current = it.next();
if (current == null) {
finished = true;
}
}
return current != null;
}
@Override
public TokenStream next() {
if (current == null) {
hasNext();
}
TokenStream next = new FiniteStringsTokenStream(current);
current = null;
return next;
}
};
}
/**
* Returns the articulation points (or cut vertices) of the graph:
* https://en.wikipedia.org/wiki/Biconnected_component
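* <p>One typical way to use these points (a sketch, not the only pattern; {@code graph} is an
* already built instance and {@code "field"} is an assumed field name) is to walk the graph
* segment by segment:
* <pre>{@code
* int[] points = graph.articulationPoints();
* int start = 0;
* for (int i = 0; i <= points.length; i++) {
*   int end = i < points.length ? points[i] : -1; // -1 means "up to any accept state"
*   if (graph.hasSidePath(start)) {
*     Iterator<TokenStream> paths = graph.getFiniteStrings(start, end);
*     // enumerate every side path of this segment
*   } else {
*     Term[] terms = graph.getTerms("field", start);
*     // no side path: the segment is a single position and the terms are plain alternatives
*   }
*   start = end;
* }
* }</pre>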
*/
public int[] articulationPoints() {
if (det.getNumStates() == 0) {
return new int[0];
}
// build an undirected view of the automaton by adding the reverse of every transition
Automaton.Builder undirect = new Automaton.Builder();
undirect.copy(det);
for (int i = 0; i < det.getNumStates(); i++) {
int numT = det.initTransition(i, transition);
for (int j = 0; j < numT; j++) {
det.getNextTransition(transition);
undirect.addTransition(transition.dest, i, transition.min);
}
}
int numStates = det.getNumStates();
BitSet visited = new BitSet(numStates);
int[] depth = new int[det.getNumStates()];
int[] low = new int[det.getNumStates()];
int[] parent = new int[det.getNumStates()];
Arrays.fill(parent, -1);
List<Integer> points = new ArrayList<>();
articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points);
Collections.reverse(points);
return points.stream().mapToInt(p -> p).toArray();
}
/**
* Build an automaton from the provided {@link TokenStream}.
*/
private Automaton build(final TokenStream in) throws IOException {
Automaton.Builder builder = new Automaton.Builder();
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
in.reset();
int pos = -1;
int prevIncr = 1;
int state = -1;
int id = -1;
int gap = 0;
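// pos is the current graph position (positions map one-to-one to automaton states), gap counts
// positions skipped by increments greater than one, and id is the next slot in tokens[]
// (the slot index doubles as the transition label for that token)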
while (in.incrementToken()) {
int currentIncr = posIncAtt.getPositionIncrement();
if (pos == -1 && currentIncr < 1) {
throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
}
if (currentIncr == 0) {
if (gap > 0) {
pos -= gap;
}
} else {
pos++;
gap = currentIncr - 1;
}
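// this token's transition ends at the state for (start position + position length), plus any
// gap left over from a position increment greater than one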
int endPos = pos + posLengthAtt.getPositionLength() + gap;
while (state < endPos) {
state = builder.createState();
}
id++;
if (tokens.length < id + 1) {
tokens = ArrayUtil.grow(tokens, id + 1);
}
tokens[id] = in.cloneAttributes();
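// each token becomes a single transition labeled with its id, from its start state to its end state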
builder.addTransition(pos, endPos, id);
pos += gap;
// we always produce linear token graphs from getFiniteStrings(), so we need to adjust
// posLength and posIncrement accordingly
tokens[id].addAttribute(PositionLengthAttribute.class).setPositionLength(1);
if (currentIncr == 0) {
// stacked token should have the same increment as original token at this position
tokens[id].addAttribute(PositionIncrementAttribute.class).setPositionIncrement(prevIncr);
}
// only save last increment on non-zero increment in case we have multiple stacked tokens
if (currentIncr > 0) {
prevIncr = currentIncr;
}
}
in.end();
if (state != -1) {
builder.setAccept(state, true);
}
return builder.finish();
}
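/**
* Classic depth-first search for articulation points on the undirected view of the automaton.
* {@code low[state]} is the smallest depth reachable from the DFS subtree of {@code state} via
* at most one back edge; a non-root state is a cut vertex if some child's subtree cannot reach
* strictly above it, and the root is one if it has more than one DFS child.
*/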
private static void articulationPointsRecurse(Automaton a, int state, int d, int[] depth, int[] low, int[] parent,
BitSet visited, List<Integer> points) {
visited.set(state);
depth[state] = d;
low[state] = d;
int childCount = 0;
boolean isArticulation = false;
Transition t = new Transition();
int numT = a.initTransition(state, t);
for (int i = 0; i < numT; i++) {
a.getNextTransition(t);
if (visited.get(t.dest) == false) {
parent[t.dest] = state;
articulationPointsRecurse(a, t.dest, d + 1, depth, low, parent, visited, points);
childCount++;
if (low[t.dest] >= depth[state]) {
isArticulation = true;
}
low[state] = Math.min(low[state], low[t.dest]);
} else if (t.dest != parent[state]) {
low[state] = Math.min(low[state], depth[t.dest]);
}
}
if ((parent[state] != -1 && isArticulation) || (parent[state] == -1 && childCount > 1)) {
points.add(state);
}
}
}