lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java - lucene-solr - Git at Google

 /*
  * dk.brics.automaton
  *
  * Copyright (c) 2001-2009 Anders Moeller
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 package org.apache.lucene.util.automaton;

 import java.util.BitSet;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;

 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.Util;

 /**
  * Special automata operations.
  *
  * @lucene.experimental
  */
 final public class SpecialOperations {

   private SpecialOperations() {}

   /**
    * Finds the largest entry whose value is less than or equal to c, or 0 if
    * there is no such entry.
    */
   static int findIndex(int c, int[] points) {
     int a = 0;
     int b = points.length;
     while (b - a > 1) {
       int d = (a + b) >>> 1;
       if (points[d] > c) b = d;
       else if (points[d] < c) a = d;
       else return d;
     }
     return a;
   }

   /**
    * Returns true if the language of this automaton is finite.
    */
   public static boolean isFinite(Automaton a) {
     if (a.isSingleton()) return true;
     return isFinite(a.initial, new BitSet(a.getNumberOfStates()), new BitSet(a.getNumberOfStates()));
   }

   /**
    * Checks whether there is a loop containing s. (This is sufficient since
    * there are never transitions to dead states.)
    */
   // TODO: not great that this is recursive... in theory a
   // large automata could exceed java's stack
   private static boolean isFinite(State s, BitSet path, BitSet visited) {
     path.set(s.number);
     for (Transition t : s.getTransitions())
       if (path.get(t.to.number) || (!visited.get(t.to.number) && !isFinite(t.to, path, visited))) return false;
     path.clear(s.number);
     visited.set(s.number);
     return true;
   }

   /**
    * Returns the longest string that is a prefix of all accepted strings and
    * visits each state at most once.
    *
    * @return common prefix
    */
   public static String getCommonPrefix(Automaton a) {
     if (a.isSingleton()) return a.singleton;
     StringBuilder b = new StringBuilder();
     HashSet<State> visited = new HashSet<State>();
     State s = a.initial;
     boolean done;
     do {
       done = true;
       visited.add(s);
       if (!s.accept && s.numTransitions() == 1) {
         Transition t = s.getTransitions().iterator().next();
         if (t.min == t.max && !visited.contains(t.to)) {
           b.appendCodePoint(t.min);
           s = t.to;
           done = false;
         }
       }
     } while (!done);
     return b.toString();
   }

   // TODO: this currently requites a determinized machine,
   // but it need not -- we can speed it up by walking the
   // NFA instead.  it'd still be fail fast.
   public static BytesRef getCommonPrefixBytesRef(Automaton a) {
     if (a.isSingleton()) return new BytesRef(a.singleton);
     BytesRef ref = new BytesRef(10);
     HashSet<State> visited = new HashSet<State>();
     State s = a.initial;
     boolean done;
     do {
       done = true;
       visited.add(s);
       if (!s.accept && s.numTransitions() == 1) {
         Transition t = s.getTransitions().iterator().next();
         if (t.min == t.max && !visited.contains(t.to)) {
           ref.grow(++ref.length);
           ref.bytes[ref.length - 1] = (byte)t.min;
           s = t.to;
           done = false;
         }
       }
     } while (!done);
     return ref;
   }

   /**
    * Returns the longest string that is a suffix of all accepted strings and
    * visits each state at most once.
    *
    * @return common suffix
    */
   public static String getCommonSuffix(Automaton a) {
     if (a.isSingleton()) // if singleton, the suffix is the string itself.
       return a.singleton;

     // reverse the language of the automaton, then reverse its common prefix.
     Automaton r = a.clone();
     reverse(r);
     r.determinize();
     return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString();
   }

   public static BytesRef getCommonSuffixBytesRef(Automaton a) {
     if (a.isSingleton()) // if singleton, the suffix is the string itself.
       return new BytesRef(a.singleton);

     // reverse the language of the automaton, then reverse its common prefix.
     Automaton r = a.clone();
     reverse(r);
     r.determinize();
     BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r);
     reverseBytes(ref);
     return ref;
   }

   private static void reverseBytes(BytesRef ref) {
     if (ref.length <= 1) return;
     int num = ref.length >> 1;
     for (int i = ref.offset; i < ( ref.offset + num ); i++) {
       byte b = ref.bytes[i];
       ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
       ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
     }
   }

   /**
    * Reverses the language of the given (non-singleton) automaton while returning
    * the set of new initial states.
    */
   public static Set<State> reverse(Automaton a) {
     a.expandSingleton();
     // reverse all edges
     HashMap<State, HashSet<Transition>> m = new HashMap<State, HashSet<Transition>>();
     State[] states = a.getNumberedStates();
     Set<State> accept = new HashSet<State>();
     for (State s : states)
       if (s.isAccept())
         accept.add(s);
     for (State r : states) {
       m.put(r, new HashSet<Transition>());
       r.accept = false;
     }
     for (State r : states)
       for (Transition t : r.getTransitions())
         m.get(t.to).add(new Transition(t.min, t.max, r));
     for (State r : states) {
       Set<Transition> tr = m.get(r);
       r.setTransitions(tr.toArray(new Transition[tr.size()]));
     }
     // make new initial+final states
     a.initial.accept = true;
     a.initial = new State();
     for (State r : accept)
       a.initial.addEpsilon(r); // ensures that all initial states are reachable
     a.deterministic = false;
     a.clearNumberedStates();
     return accept;
   }

   // TODO: this is a dangerous method ... Automaton could be
   // huge ... and it's better in general for caller to
   // enumerate & process in a single walk:

   /**
    * Returns the set of accepted strings, assuming that at most
    * <code>limit</code> strings are accepted. If more than <code>limit</code>
    * strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then
    * the limit is infinite.
    */
   public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
     HashSet<IntsRef> strings = new HashSet<IntsRef>();
     if (a.isSingleton()) {
       if (limit > 0) {
         strings.add(Util.toUTF32(a.singleton, new IntsRef()));
       }
     } else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
       return strings;
     }
     return strings;
   }

   /**
    * Returns the strings that can be produced from the given state, or
    * false if more than <code>limit</code> strings are found.
    * <code>limit</code>&lt;0 means "infinite".
    */
   private static boolean getFiniteStrings(State s, HashSet<State> pathstates,
       HashSet<IntsRef> strings, IntsRef path, int limit) {
     pathstates.add(s);
     for (Transition t : s.getTransitions()) {
       if (pathstates.contains(t.to)) {
         return false;
       }
       for (int n = t.min; n <= t.max; n++) {
         path.grow(path.length+1);
         path.ints[path.length] = n;
         path.length++;
         if (t.to.accept) {
           strings.add(IntsRef.deepCopyOf(path));
           if (limit >= 0 && strings.size() > limit) {
             return false;
           }
         }
         if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
           return false;
         }
         path.length--;
       }
     }
     pathstates.remove(s);
     return true;
   }
 }
	/*
	* dk.brics.automaton
	*
	* Copyright (c) 2001-2009 Anders Moeller
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 3. The name of the author may not be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	package org.apache.lucene.util.automaton;

	import java.util.BitSet;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Set;

	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.IntsRef;
	import org.apache.lucene.util.fst.Util;

	/**
	* Special automata operations.
	*
	* @lucene.experimental
	*/
	final public class SpecialOperations {

	private SpecialOperations() {}

	/**
	* Finds the largest entry whose value is less than or equal to c, or 0 if
	* there is no such entry.
	*/
	static int findIndex(int c, int[] points) {
	int a = 0;
	int b = points.length;
	while (b - a > 1) {
	int d = (a + b) >>> 1;
	if (points[d] > c) b = d;
	else if (points[d] < c) a = d;
	else return d;
	}
	return a;
	}

	/**
	* Returns true if the language of this automaton is finite.
	*/
	public static boolean isFinite(Automaton a) {
	if (a.isSingleton()) return true;
	return isFinite(a.initial, new BitSet(a.getNumberOfStates()), new BitSet(a.getNumberOfStates()));
	}

	/**
	* Checks whether there is a loop containing s. (This is sufficient since
	* there are never transitions to dead states.)
	*/
	// TODO: not great that this is recursive... in theory a
	// large automata could exceed java's stack
	private static boolean isFinite(State s, BitSet path, BitSet visited) {
	path.set(s.number);
	for (Transition t : s.getTransitions())
	if (path.get(t.to.number) \|\| (!visited.get(t.to.number) && !isFinite(t.to, path, visited))) return false;
	path.clear(s.number);
	visited.set(s.number);
	return true;
	}

	/**
	* Returns the longest string that is a prefix of all accepted strings and
	* visits each state at most once.
	*
	* @return common prefix
	*/
	public static String getCommonPrefix(Automaton a) {
	if (a.isSingleton()) return a.singleton;
	StringBuilder b = new StringBuilder();
	HashSet<State> visited = new HashSet<State>();
	State s = a.initial;
	boolean done;
	do {
	done = true;
	visited.add(s);
	if (!s.accept && s.numTransitions() == 1) {
	Transition t = s.getTransitions().iterator().next();
	if (t.min == t.max && !visited.contains(t.to)) {
	b.appendCodePoint(t.min);
	s = t.to;
	done = false;
	}
	}
	} while (!done);
	return b.toString();
	}

	// TODO: this currently requites a determinized machine,
	// but it need not -- we can speed it up by walking the
	// NFA instead. it'd still be fail fast.
	public static BytesRef getCommonPrefixBytesRef(Automaton a) {
	if (a.isSingleton()) return new BytesRef(a.singleton);
	BytesRef ref = new BytesRef(10);
	HashSet<State> visited = new HashSet<State>();
	State s = a.initial;
	boolean done;
	do {
	done = true;
	visited.add(s);
	if (!s.accept && s.numTransitions() == 1) {
	Transition t = s.getTransitions().iterator().next();
	if (t.min == t.max && !visited.contains(t.to)) {
	ref.grow(++ref.length);
	ref.bytes[ref.length - 1] = (byte)t.min;
	s = t.to;
	done = false;
	}
	}
	} while (!done);
	return ref;
	}

	/**
	* Returns the longest string that is a suffix of all accepted strings and
	* visits each state at most once.
	*
	* @return common suffix
	*/
	public static String getCommonSuffix(Automaton a) {
	if (a.isSingleton()) // if singleton, the suffix is the string itself.
	return a.singleton;

	// reverse the language of the automaton, then reverse its common prefix.
	Automaton r = a.clone();
	reverse(r);
	r.determinize();
	return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString();
	}

	public static BytesRef getCommonSuffixBytesRef(Automaton a) {
	if (a.isSingleton()) // if singleton, the suffix is the string itself.
	return new BytesRef(a.singleton);

	// reverse the language of the automaton, then reverse its common prefix.
	Automaton r = a.clone();
	reverse(r);
	r.determinize();
	BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r);
	reverseBytes(ref);
	return ref;
	}

	private static void reverseBytes(BytesRef ref) {
	if (ref.length <= 1) return;
	int num = ref.length >> 1;
	for (int i = ref.offset; i < ( ref.offset + num ); i++) {
	byte b = ref.bytes[i];
	ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
	ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
	}
	}

	/**
	* Reverses the language of the given (non-singleton) automaton while returning
	* the set of new initial states.
	*/
	public static Set<State> reverse(Automaton a) {
	a.expandSingleton();
	// reverse all edges
	HashMap<State, HashSet<Transition>> m = new HashMap<State, HashSet<Transition>>();
	State[] states = a.getNumberedStates();
	Set<State> accept = new HashSet<State>();
	for (State s : states)
	if (s.isAccept())
	accept.add(s);
	for (State r : states) {
	m.put(r, new HashSet<Transition>());
	r.accept = false;
	}
	for (State r : states)
	for (Transition t : r.getTransitions())
	m.get(t.to).add(new Transition(t.min, t.max, r));
	for (State r : states) {
	Set<Transition> tr = m.get(r);
	r.setTransitions(tr.toArray(new Transition[tr.size()]));
	}
	// make new initial+final states
	a.initial.accept = true;
	a.initial = new State();
	for (State r : accept)
	a.initial.addEpsilon(r); // ensures that all initial states are reachable
	a.deterministic = false;
	a.clearNumberedStates();
	return accept;
	}

	// TODO: this is a dangerous method ... Automaton could be
	// huge ... and it's better in general for caller to
	// enumerate & process in a single walk:

	/**
	* Returns the set of accepted strings, assuming that at most
	* <code>limit</code> strings are accepted. If more than <code>limit</code>
	* strings are accepted, the first limit strings found are returned. If <code>limit</code><0, then
	* the limit is infinite.
	*/
	public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
	HashSet<IntsRef> strings = new HashSet<IntsRef>();
	if (a.isSingleton()) {
	if (limit > 0) {
	strings.add(Util.toUTF32(a.singleton, new IntsRef()));
	}
	} else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
	return strings;
	}
	return strings;
	}

	/**
	* Returns the strings that can be produced from the given state, or
	* false if more than <code>limit</code> strings are found.
	* <code>limit</code><0 means "infinite".
	*/
	private static boolean getFiniteStrings(State s, HashSet<State> pathstates,
	HashSet<IntsRef> strings, IntsRef path, int limit) {
	pathstates.add(s);
	for (Transition t : s.getTransitions()) {
	if (pathstates.contains(t.to)) {
	return false;
	}
	for (int n = t.min; n <= t.max; n++) {
	path.grow(path.length+1);
	path.ints[path.length] = n;
	path.length++;
	if (t.to.accept) {
	strings.add(IntsRef.deepCopyOf(path));
	if (limit >= 0 && strings.size() > limit) {
	return false;
	}
	}
	if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
	return false;
	}
	path.length--;
	}
	}
	pathstates.remove(s);
	return true;
	}
	}