// lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util.automaton;


 import java.util.*;

 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;

 /**
  * Builds a minimal, deterministic {@link Automaton} that accepts a set of
  * strings. The algorithm requires sorted input data, but is very fast
  * (nearly linear with the input size).
  *
  * @see #build(Collection)
  * @see Automata#makeStringUnion(Collection)
  */
 public final class DaciukMihovAutomatonBuilder {

   /**
    * Maximum accepted term length, in chars. {@link #add(CharsRef)} rejects
    * longer terms because the builder recurses to a depth that grows with the
    * length of the string, which might cause stack overflows.
    */
   static final int MAX_TERM_LENGTH = 1_000;

   /**
    * The default constructor is private.  Use static methods directly.
    */
   private DaciukMihovAutomatonBuilder() {
     super();
   }

   /**
    * DFSA state with <code>int</code> labels on transitions. The enclosing
    * builder uses Unicode code points as labels.
    */
   private final static class State {

     /** An empty set of labels. */
     private final static int[] NO_LABELS = new int[0];

     /** An empty set of states. */
     private final static State[] NO_STATES = new State[0];

     /**
      * Labels of outgoing transitions. Indexed identically to {@link #states}.
      * Labels must be sorted in ascending order, because
      * {@link #getState(int)} looks them up with a binary search.
      */
     int[] labels = NO_LABELS;

     /**
      * States reachable from outgoing transitions. Indexed identically to
      * {@link #labels}.
      */
     State[] states = NO_STATES;

     /**
      * <code>true</code> if this state corresponds to the end of at least one
      * input sequence.
      */
     boolean is_final;

     /**
      * Returns the target state of a transition leaving this state and labeled
      * with <code>label</code>. If no such transition exists, returns
      * <code>null</code>.
      *
      * @param label the transition label to look up
      * @return the target state, or <code>null</code> if there is none
      */
     State getState(int label) {
       final int index = Arrays.binarySearch(labels, label);
       return index >= 0 ? states[index] : null;
     }

     /**
      * Two states are equal if:
      * <ul>
      * <li>they agree on {@link #is_final},</li>
      * <li>they have an identical number of outgoing transitions, labeled with
      * the same labels,</li>
      * <li>corresponding outgoing transitions lead to the same states (to states
      * with an identical right-language); targets are compared by reference.</li>
      * </ul>
      *
      * @param obj the object to compare with; may be <code>null</code> or of
      *        another type, in which case the result is <code>false</code>
      */
     @Override
     public boolean equals(Object obj) {
       if (this == obj) {
         return true;
       }
       // Guard the cast: equals() must return false, not throw, for null or
       // foreign objects.
       if (!(obj instanceof State)) {
         return false;
       }
       final State other = (State) obj;
       return is_final == other.is_final
           && Arrays.equals(this.labels, other.labels)
           && referenceEquals(this.states, other.states);
     }

     /**
      * Compute the hash code of the <i>current</i> status of this state. The
      * value depends on {@link #is_final}, the labels and the identity of the
      * target states, so it changes whenever this state is mutated.
      */
     @Override
     public int hashCode() {
       int hash = is_final ? 1 : 0;

       hash ^= hash * 31 + this.labels.length;
       for (int c : this.labels) {
         hash ^= hash * 31 + c;
       }

       /*
        * Hash the right-language of this state using reference-identity of
        * outgoing states. This is possible because states are interned (stored
        * in registry) and traversed in post-order, so any outgoing transitions
        * are already interned.
        */
       for (State s : this.states) {
         hash ^= System.identityHashCode(s);
       }

       return hash;
     }

     /**
      * Return <code>true</code> if this state has any children (outgoing
      * transitions).
      */
     boolean hasChildren() {
       return labels.length > 0;
     }

     /**
      * Create a new outgoing transition labeled <code>label</code> and return
      * the newly created target state for this transition. The transition is
      * appended after all existing ones; the assertion only checks that the
      * label is not present yet, so keeping {@link #labels} sorted is up to
      * the caller.
      *
      * @param label label of the new transition
      * @return the freshly created target state
      */
     State newState(int label) {
       assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
           + label;

       // Both arrays grow by exactly one slot so they stay index-aligned.
       labels = ArrayUtil.growExact(labels, labels.length + 1);
       states = ArrayUtil.growExact(states, states.length + 1);

       labels[labels.length - 1] = label;
       return states[states.length - 1] = new State();
     }

     /**
      * Return the most recent transition's target state.
      */
     State lastChild() {
       assert hasChildren() : "No outgoing transitions.";
       return states[states.length - 1];
     }

     /**
      * Return the associated state if the most recent transition is labeled with
      * <code>label</code>, otherwise <code>null</code>.
      *
      * @param label the label expected on the most recent transition
      */
     State lastChild(int label) {
       final int index = labels.length - 1;
       State s = null;
       if (index >= 0 && labels[index] == label) {
         s = states[index];
       }
       // Sanity check: a full lookup must agree with looking at the last slot only.
       assert s == getState(label);
       return s;
     }

     /**
      * Replace the last added outgoing transition's target state with the given
      * state.
      *
      * @param state the new target of the most recent transition
      */
     void replaceLastChild(State state) {
       assert hasChildren() : "No outgoing transitions.";
       states[states.length - 1] = state;
     }

     /**
      * Compare two arrays of objects for reference-equality: same length and
      * the very same object at every index.
      */
     private static boolean referenceEquals(Object[] a1, Object[] a2) {
       if (a1.length != a2.length) {
         return false;
       }

       for (int i = 0; i < a1.length; i++) {
         if (a1[i] != a2[i]) {
           return false;
         }
       }

       return true;
     }
   }

   /**
    * A "registry" for state interning. Set to <code>null</code> by
    * {@link #complete()}, which marks the builder as finished.
    */
   private HashMap<State,State> stateRegistry = new HashMap<>();

   /**
    * Root automaton state. Never reassigned, hence final.
    */
   private final State root = new State();

   /**
    * Previous sequence added to the automaton in {@link #add(CharsRef)}. It is
    * updated from within an <code>assert</code> statement, so it is only
    * maintained when assertions are enabled.
    */
   private CharsRef previous;

   /**
    * A comparator used for enforcing sorted UTF8 order, used in assertions only.
    */
   @SuppressWarnings("deprecation")
   private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();

   /**
    * Add another character sequence to this automaton. The sequence must be
    * lexicographically larger or equal compared to any previous sequences added
    * to this automaton (the input must be sorted). The ordering is only
    * verified when assertions are enabled.
    *
    * @param current the term to add
    * @throws IllegalArgumentException if <code>current</code> is longer than
    *         {@link #MAX_TERM_LENGTH} chars
    */
   public void add(CharsRef current) {
     if (current.length > MAX_TERM_LENGTH) {
       throw new IllegalArgumentException("This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
     }
     assert stateRegistry != null : "Automaton already built.";
     assert previous == null
         || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
         + previous + " >= " + current;
     assert setPrevious(current);

     // Descend in the automaton (find matching prefix). Only the most recently
     // added transition of each state is followed.
     final int max = current.length();
     int pos = 0;
     State state = root;
     while (pos < max) {
       // Decode the code point once and reuse it for lookup and advancing.
       final int codePoint = Character.codePointAt(current, pos);
       final State next = state.lastChild(codePoint);
       if (next == null) {
         break;
       }
       state = next;
       pos += Character.charCount(codePoint);
     }

     // The branch hanging below the shared prefix belongs to earlier input and
     // will not change anymore, so it can be interned now.
     if (state.hasChildren()) {
       replaceOrRegister(state);
     }

     addSuffix(state, current, pos);
   }

   /**
    * Finish construction: intern whatever is still pending below the root,
    * drop the registry and hand out the root state. The builder cannot accept
    * further strings afterwards.
    *
    * @return Root automaton state.
    * @throws IllegalStateException if this method has already been called
    */
   public State complete() {
     if (stateRegistry == null) {
       throw new IllegalStateException();
     }

     if (root.hasChildren()) {
       replaceOrRegister(root);
     }

     // A null registry is the marker for "already built".
     stateRegistry = null;
     return root;
   }

   /**
    * Recursively copies <code>s</code> and everything reachable from it into
    * <code>a</code>.
    *
    * @param a the automaton builder receiving states and transitions
    * @param s the state to convert
    * @param visited identity map from already converted states to their ids
    *        in <code>a</code>, so shared states are created only once
    * @return the id of the state in <code>a</code> that corresponds to <code>s</code>
    */
   private static int convert(Automaton.Builder a, State s,
       IdentityHashMap<State,Integer> visited) {

     final Integer known = visited.get(s);
     if (known != null) {
       return known;
     }

     final int id = a.createState();
     a.setAccept(id, s.is_final);
     // Record the id before descending into the children.
     visited.put(s, id);

     final int[] labels = s.labels;
     final State[] targets = s.states;
     for (int i = 0; i < targets.length; i++) {
       final int targetId = convert(a, targets[i], visited);
       a.addTransition(id, targetId, labels[i]);
     }

     return id;
   }

   /**
    * Build a minimal, deterministic automaton from a sorted collection of
    * {@link BytesRef} representing strings in UTF-8. These strings must be
    * binary-sorted.
    *
    * @param input the terms, in binary-sorted order
    * @return the automaton produced from the given terms
    */
   public static Automaton build(Collection<BytesRef> input) {
     final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

     // A single scratch buffer and a single CharsRef are reused for all terms.
     char[] scratch = new char[0];
     final CharsRef term = new CharsRef();
     for (BytesRef utf8 : input) {
       // presumably grows the buffer to hold at least utf8.length chars
       scratch = ArrayUtil.grow(scratch, utf8.length);
       final int charCount = UnicodeUtil.UTF8toUTF16(utf8, scratch);
       term.chars = scratch;
       term.length = charCount;
       builder.add(term);
     }

     final Automaton.Builder a = new Automaton.Builder();
     final State rootState = builder.complete();
     convert(a, rootState, new IdentityHashMap<State,Integer>());

     return a.finish();
   }

   /**
    * Remember a private copy of <code>current</code> as the previously added
    * sequence.
    *
    * @return always <code>true</code>, so the call can be placed inside an
    *         <code>assert</code> statement
    */
   private boolean setPrevious(CharsRef current) {
     // The copy becomes unnecessary once
     // https://issues.apache.org/jira/browse/LUCENE-3277 is fixed.
     // Either way, this is invoked only from an assert.
     this.previous = CharsRef.deepCopyOf(current);
     return true;
   }

   /**
    * Replace the last child of <code>state</code> with an equal, already
    * registered state, or register the last child if no equal state is known
    * yet. Recurses into the last child first, so its own last child is
    * handled before it.
    */
   private void replaceOrRegister(State state) {
     final State last = state.lastChild();

     if (last.hasChildren()) {
       replaceOrRegister(last);
     }

     final State canonical = stateRegistry.get(last);
     if (canonical == null) {
       // First state of this shape: it becomes the canonical instance.
       stateRegistry.put(last, last);
       return;
     }
     state.replaceLastChild(canonical);
   }

   /**
    * Append the code points of <code>current</code>, beginning at char index
    * <code>fromIndex</code> (inclusive), as a chain of new states below
    * <code>state</code>, and mark the end of the chain as final.
    */
   private void addSuffix(State state, CharSequence current, int fromIndex) {
     State tail = state;
     final int end = current.length();
     for (int i = fromIndex; i < end; ) {
       final int codePoint = Character.codePointAt(current, i);
       tail = tail.newState(codePoint);
       // Advance by one or two chars, depending on the code point.
       i += Character.charCount(codePoint);
     }
     tail.is_final = true;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util.automaton;


	import java.util.*;

	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.CharsRef;
	import org.apache.lucene.util.UnicodeUtil;

	/**
	* Builds a minimal, deterministic {@link Automaton} that accepts a set of
	* strings. The algorithm requires sorted input data, but is very fast
	* (nearly linear with the input size).
	*
	* @see #build(Collection)
	* @see Automata#makeStringUnion(Collection)
	*/
	public final class DaciukMihovAutomatonBuilder {

	/**
	* Maximum accepted term length, in chars. {@link #add(CharsRef)} rejects
	* longer terms because the builder recurses to a depth that grows with the
	* length of the string, which might cause stack overflows.
	*/
	static final int MAX_TERM_LENGTH = 1_000;

	/**
	* The default constructor is private. Use static methods directly.
	*/
	private DaciukMihovAutomatonBuilder() {
	super();
	}

	/**
	 * DFSA state with <code>int</code> labels on transitions. The enclosing
	 * builder uses Unicode code points as labels.
	 */
	private final static class State {

	  /** An empty set of labels. */
	  private final static int[] NO_LABELS = new int[0];

	  /** An empty set of states. */
	  private final static State[] NO_STATES = new State[0];

	  /**
	   * Labels of outgoing transitions. Indexed identically to {@link #states}.
	   * Labels must be sorted in ascending order, because
	   * {@link #getState(int)} looks them up with a binary search.
	   */
	  int[] labels = NO_LABELS;

	  /**
	   * States reachable from outgoing transitions. Indexed identically to
	   * {@link #labels}.
	   */
	  State[] states = NO_STATES;

	  /**
	   * <code>true</code> if this state corresponds to the end of at least one
	   * input sequence.
	   */
	  boolean is_final;

	  /**
	   * Returns the target state of a transition leaving this state and labeled
	   * with <code>label</code>. If no such transition exists, returns
	   * <code>null</code>.
	   *
	   * @param label the transition label to look up
	   * @return the target state, or <code>null</code> if there is none
	   */
	  State getState(int label) {
	    final int index = Arrays.binarySearch(labels, label);
	    return index >= 0 ? states[index] : null;
	  }

	  /**
	   * Two states are equal if:
	   * <ul>
	   * <li>they agree on {@link #is_final},</li>
	   * <li>they have an identical number of outgoing transitions, labeled with
	   * the same labels,</li>
	   * <li>corresponding outgoing transitions lead to the same states (to states
	   * with an identical right-language); targets are compared by reference.</li>
	   * </ul>
	   *
	   * @param obj the object to compare with; may be <code>null</code> or of
	   *        another type, in which case the result is <code>false</code>
	   */
	  @Override
	  public boolean equals(Object obj) {
	    if (this == obj) {
	      return true;
	    }
	    // Guard the cast: equals() must return false, not throw, for null or
	    // foreign objects.
	    if (!(obj instanceof State)) {
	      return false;
	    }
	    final State other = (State) obj;
	    return is_final == other.is_final
	        && Arrays.equals(this.labels, other.labels)
	        && referenceEquals(this.states, other.states);
	  }

	  /**
	   * Compute the hash code of the <i>current</i> status of this state. The
	   * value depends on {@link #is_final}, the labels and the identity of the
	   * target states, so it changes whenever this state is mutated.
	   */
	  @Override
	  public int hashCode() {
	    int hash = is_final ? 1 : 0;

	    hash ^= hash * 31 + this.labels.length;
	    for (int c : this.labels) {
	      hash ^= hash * 31 + c;
	    }

	    /*
	     * Hash the right-language of this state using reference-identity of
	     * outgoing states. This is possible because states are interned (stored
	     * in registry) and traversed in post-order, so any outgoing transitions
	     * are already interned.
	     */
	    for (State s : this.states) {
	      hash ^= System.identityHashCode(s);
	    }

	    return hash;
	  }

	  /**
	   * Return <code>true</code> if this state has any children (outgoing
	   * transitions).
	   */
	  boolean hasChildren() {
	    return labels.length > 0;
	  }

	  /**
	   * Create a new outgoing transition labeled <code>label</code> and return
	   * the newly created target state for this transition. The transition is
	   * appended after all existing ones; the assertion only checks that the
	   * label is not present yet, so keeping {@link #labels} sorted is up to
	   * the caller.
	   *
	   * @param label label of the new transition
	   * @return the freshly created target state
	   */
	  State newState(int label) {
	    assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
	        + label;

	    // Both arrays grow by exactly one slot so they stay index-aligned.
	    labels = ArrayUtil.growExact(labels, labels.length + 1);
	    states = ArrayUtil.growExact(states, states.length + 1);

	    labels[labels.length - 1] = label;
	    return states[states.length - 1] = new State();
	  }

	  /**
	   * Return the most recent transition's target state.
	   */
	  State lastChild() {
	    assert hasChildren() : "No outgoing transitions.";
	    return states[states.length - 1];
	  }

	  /**
	   * Return the associated state if the most recent transition is labeled with
	   * <code>label</code>, otherwise <code>null</code>.
	   *
	   * @param label the label expected on the most recent transition
	   */
	  State lastChild(int label) {
	    final int index = labels.length - 1;
	    State s = null;
	    if (index >= 0 && labels[index] == label) {
	      s = states[index];
	    }
	    // Sanity check: a full lookup must agree with looking at the last slot only.
	    assert s == getState(label);
	    return s;
	  }

	  /**
	   * Replace the last added outgoing transition's target state with the given
	   * state.
	   *
	   * @param state the new target of the most recent transition
	   */
	  void replaceLastChild(State state) {
	    assert hasChildren() : "No outgoing transitions.";
	    states[states.length - 1] = state;
	  }

	  /**
	   * Compare two arrays of objects for reference-equality: same length and
	   * the very same object at every index.
	   */
	  private static boolean referenceEquals(Object[] a1, Object[] a2) {
	    if (a1.length != a2.length) {
	      return false;
	    }

	    for (int i = 0; i < a1.length; i++) {
	      if (a1[i] != a2[i]) {
	        return false;
	      }
	    }

	    return true;
	  }
	}

	/**
	 * A "registry" for state interning. Set to <code>null</code> by
	 * {@link #complete()}, which marks the builder as finished.
	 */
	private HashMap<State,State> stateRegistry = new HashMap<>();

	/**
	 * Root automaton state. Never reassigned, hence final.
	 */
	private final State root = new State();

	/**
	 * Previous sequence added to the automaton in {@link #add(CharsRef)}. It is
	 * updated from within an <code>assert</code> statement, so it is only
	 * maintained when assertions are enabled.
	 */
	private CharsRef previous;

	/**
	 * A comparator used for enforcing sorted UTF8 order, used in assertions only.
	 */
	@SuppressWarnings("deprecation")
	private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();

	/**
	* Add another character sequence to this automaton. The sequence must be
	* lexicographically larger or equal compared to any previous sequences added
	* to this automaton (the input must be sorted).
	*/
	public void add(CharsRef current) {
	if (current.length > MAX_TERM_LENGTH) {
	throw new IllegalArgumentException("This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
	}
	assert stateRegistry != null : "Automaton already built.";
	assert previous == null
	\|\| comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
	+ previous + " >= " + current;
	assert setPrevious(current);

	// Descend in the automaton (find matching prefix).
	int pos = 0, max = current.length();
	State next, state = root;
	while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) {
	state = next;
	// todo, optimize me
	pos += Character.charCount(Character.codePointAt(current, pos));
	}

	if (state.hasChildren()) replaceOrRegister(state);

	addSuffix(state, current, pos);
	}

	/**
	 * Finish construction: intern whatever is still pending below the root,
	 * drop the registry and hand out the root state. The builder cannot accept
	 * further strings afterwards.
	 *
	 * @return Root automaton state.
	 * @throws IllegalStateException if this method has already been called
	 */
	public State complete() {
	  if (stateRegistry == null) {
	    throw new IllegalStateException();
	  }

	  if (root.hasChildren()) {
	    replaceOrRegister(root);
	  }

	  // A null registry is the marker for "already built".
	  stateRegistry = null;
	  return root;
	}

	/**
	 * Recursively copies <code>s</code> and everything reachable from it into
	 * <code>a</code>.
	 *
	 * @param a the automaton builder receiving states and transitions
	 * @param s the state to convert
	 * @param visited identity map from already converted states to their ids
	 *        in <code>a</code>, so shared states are created only once
	 * @return the id of the state in <code>a</code> that corresponds to <code>s</code>
	 */
	private static int convert(Automaton.Builder a, State s,
	    IdentityHashMap<State,Integer> visited) {

	  final Integer known = visited.get(s);
	  if (known != null) {
	    return known;
	  }

	  final int id = a.createState();
	  a.setAccept(id, s.is_final);
	  // Record the id before descending into the children.
	  visited.put(s, id);

	  final int[] labels = s.labels;
	  final State[] targets = s.states;
	  for (int i = 0; i < targets.length; i++) {
	    final int targetId = convert(a, targets[i], visited);
	    a.addTransition(id, targetId, labels[i]);
	  }

	  return id;
	}

	/**
	 * Build a minimal, deterministic automaton from a sorted collection of
	 * {@link BytesRef} representing strings in UTF-8. These strings must be
	 * binary-sorted.
	 *
	 * @param input the terms, in binary-sorted order
	 * @return the automaton produced from the given terms
	 */
	public static Automaton build(Collection<BytesRef> input) {
	  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

	  // A single scratch buffer and a single CharsRef are reused for all terms.
	  char[] scratch = new char[0];
	  final CharsRef term = new CharsRef();
	  for (BytesRef utf8 : input) {
	    // presumably grows the buffer to hold at least utf8.length chars
	    scratch = ArrayUtil.grow(scratch, utf8.length);
	    final int charCount = UnicodeUtil.UTF8toUTF16(utf8, scratch);
	    term.chars = scratch;
	    term.length = charCount;
	    builder.add(term);
	  }

	  final Automaton.Builder a = new Automaton.Builder();
	  final State rootState = builder.complete();
	  convert(a, rootState, new IdentityHashMap<State,Integer>());

	  return a.finish();
	}

	/**
	 * Remember a private copy of <code>current</code> as the previously added
	 * sequence.
	 *
	 * @return always <code>true</code>, so the call can be placed inside an
	 *         <code>assert</code> statement
	 */
	private boolean setPrevious(CharsRef current) {
	  // The copy becomes unnecessary once
	  // https://issues.apache.org/jira/browse/LUCENE-3277 is fixed.
	  // Either way, this is invoked only from an assert.
	  this.previous = CharsRef.deepCopyOf(current);
	  return true;
	}

	/**
	 * Replace the last child of <code>state</code> with an equal, already
	 * registered state, or register the last child if no equal state is known
	 * yet. Recurses into the last child first, so its own last child is
	 * handled before it.
	 */
	private void replaceOrRegister(State state) {
	  final State last = state.lastChild();

	  if (last.hasChildren()) {
	    replaceOrRegister(last);
	  }

	  final State canonical = stateRegistry.get(last);
	  if (canonical == null) {
	    // First state of this shape: it becomes the canonical instance.
	    stateRegistry.put(last, last);
	    return;
	  }
	  state.replaceLastChild(canonical);
	}

	/**
	 * Append the code points of <code>current</code>, beginning at char index
	 * <code>fromIndex</code> (inclusive), as a chain of new states below
	 * <code>state</code>, and mark the end of the chain as final.
	 */
	private void addSuffix(State state, CharSequence current, int fromIndex) {
	  State tail = state;
	  final int end = current.length();
	  for (int i = fromIndex; i < end; ) {
	    final int codePoint = Character.codePointAt(current, i);
	    tail = tail.newState(codePoint);
	    // Advance by one or two chars, depending on the code point.
	    i += Character.charCount(codePoint);
	  }
	  tail.is_final = true;
	}
	}