lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java - lucene-solr - Git at Google

 package org.apache.lucene.util.automaton;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.PrefixTermsEnum;
 import org.apache.lucene.index.SingleTermsEnum;
 import org.apache.lucene.util.BytesRef;

 /**
  * Immutable class holding compiled details for a given
  * Automaton.  The Automaton is deterministic, must not have
  * dead states but is not necessarily minimal.
  *
  * @lucene.experimental
  */
 public class CompiledAutomaton {
   /**
    * Automata are compiled into different internal forms for the
    * most efficient execution depending upon the language they accept.
    */
   public enum AUTOMATON_TYPE {
     /** Automaton that accepts no strings. */
     NONE,
     /** Automaton that accepts all possible strings. */
     ALL,
     /** Automaton that accepts only a single fixed string. */
     SINGLE,
     /** Automaton that matches all Strings with a constant prefix. */
     PREFIX,
     /** Catch-all for any other automata. */
     NORMAL
   };
   public final AUTOMATON_TYPE type;

   /**
    * For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term;
    * for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
    */
   public final BytesRef term;

   /**
    * Matcher for quickly determining if a byte[] is accepted.
    * only valid for {@link AUTOMATON_TYPE#NORMAL}.
    */
   public final ByteRunAutomaton runAutomaton;
   // TODO: would be nice if these sortedTransitions had "int
   // to;" instead of "State to;" somehow:
   /**
    * Two dimensional array of transitions, indexed by state
    * number for traversal. The state numbering is consistent with
    * {@link #runAutomaton}.
    * Only valid for {@link AUTOMATON_TYPE#NORMAL}.
    */
   public final Transition[][] sortedTransitions;
   /**
    * Shared common suffix accepted by the automaton. Only valid
    * for {@link AUTOMATON_TYPE#NORMAL}, and only when the
    * automaton accepts an infinite language.
    */
   public final BytesRef commonSuffixRef;
   /**
    * Indicates if the automaton accepts a finite set of strings.
    * Null if this was not computed.
    * Only valid for {@link AUTOMATON_TYPE#NORMAL}.
    */
   public final Boolean finite;

   public CompiledAutomaton(Automaton automaton) {
     this(automaton, null, true);
   }

   public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {

     if (simplify) {
       // Test whether the automaton is a "simple" form and
       // if so, don't create a runAutomaton.  Note that on a
       // large automaton these tests could be costly:
       if (BasicOperations.isEmpty(automaton)) {
         // matches nothing
         type = AUTOMATON_TYPE.NONE;
         term = null;
         commonSuffixRef = null;
         runAutomaton = null;
         sortedTransitions = null;
         this.finite = null;
         return;
       } else if (BasicOperations.isTotal(automaton)) {
         // matches all possible strings
         type = AUTOMATON_TYPE.ALL;
         term = null;
         commonSuffixRef = null;
         runAutomaton = null;
         sortedTransitions = null;
         this.finite = null;
         return;
       } else {
         final String commonPrefix;
         final String singleton;
         if (automaton.getSingleton() == null) {
           commonPrefix = SpecialOperations.getCommonPrefix(automaton);
           if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
             singleton = commonPrefix;
           } else {
             singleton = null;
           }
         } else {
           commonPrefix = null;
           singleton = automaton.getSingleton();
         }

         if (singleton != null) {
           // matches a fixed string in singleton or expanded
           // representation
           type = AUTOMATON_TYPE.SINGLE;
           term = new BytesRef(singleton);
           commonSuffixRef = null;
           runAutomaton = null;
           sortedTransitions = null;
           this.finite = null;
           return;
         } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
                                                                                        BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
           // matches a constant prefix
           type = AUTOMATON_TYPE.PREFIX;
           term = new BytesRef(commonPrefix);
           commonSuffixRef = null;
           runAutomaton = null;
           sortedTransitions = null;
           this.finite = null;
           return;
         }
       }
     }

     type = AUTOMATON_TYPE.NORMAL;
     term = null;
     if (finite == null) {
       this.finite = SpecialOperations.isFinite(automaton);
     } else {
       this.finite = finite;
     }
     Automaton utf8 = new UTF32ToUTF8().convert(automaton);
     if (this.finite) {
       commonSuffixRef = null;
     } else {
       commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
     }
     runAutomaton = new ByteRunAutomaton(utf8, true);
     sortedTransitions = utf8.getSortedTransitions();
   }

   //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;

   private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {

     // Find biggest transition that's < label
     // TODO: use binary search here
     Transition maxTransition = null;
     for (Transition transition : sortedTransitions[state]) {
       if (transition.min < leadLabel) {
         maxTransition = transition;
       }
     }

     assert maxTransition != null;

     // Append floorLabel
     final int floorLabel;
     if (maxTransition.max > leadLabel-1) {
       floorLabel = leadLabel-1;
     } else {
       floorLabel = maxTransition.max;
     }
     if (idx >= term.bytes.length) {
       term.grow(1+idx);
     }
     //if (DEBUG) System.out.println("  add floorLabel=" + (char) floorLabel + " idx=" + idx);
     term.bytes[idx] = (byte) floorLabel;

     state = maxTransition.to.getNumber();
     idx++;

     // Push down to last accept state
     while (true) {
       Transition[] transitions = sortedTransitions[state];
       if (transitions.length == 0) {
         assert runAutomaton.isAccept(state);
         term.length = idx;
         //if (DEBUG) System.out.println("  return " + term.utf8ToString());
         return term;
       } else {
         // We are pushing "top" -- so get last label of
         // last transition:
         assert transitions.length != 0;
         Transition lastTransition = transitions[transitions.length-1];
         if (idx >= term.bytes.length) {
           term.grow(1+idx);
         }
         //if (DEBUG) System.out.println("  push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
         term.bytes[idx] = (byte) lastTransition.max;
         state = lastTransition.to.getNumber();
         idx++;
       }
     }
   }

   // TODO: should this take startTerm too?  This way
   // Terms.intersect could forward to this method if type !=
   // NORMAL:
   public TermsEnum getTermsEnum(Terms terms) throws IOException {
     switch(type) {
     case NONE:
       return TermsEnum.EMPTY;
     case ALL:
       return terms.iterator(null);
     case SINGLE:
       return new SingleTermsEnum(terms.iterator(null), term);
     case PREFIX:
       // TODO: this is very likely faster than .intersect,
       // but we should test and maybe cutover
       return new PrefixTermsEnum(terms.iterator(null), term);
     case NORMAL:
       return terms.intersect(this, null);
     default:
       // unreachable
       throw new RuntimeException("unhandled case");
     }
   }

   /** Finds largest term accepted by this Automaton, that's
    *  <= the provided input term.  The result is placed in
    *  output; it's fine for output and input to point to
    *  the same BytesRef.  The returned result is either the
    *  provided output, or null if there is no floor term
    *  (ie, the provided input term is before the first term
    *  accepted by this Automaton). */
   public BytesRef floor(BytesRef input, BytesRef output) {

     output.offset = 0;
     //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString());

     int state = runAutomaton.getInitialState();

     // Special case empty string:
     if (input.length == 0) {
       if (runAutomaton.isAccept(state)) {
         output.length = 0;
         return output;
       } else {
         return null;
       }
     }

     final List<Integer> stack = new ArrayList<Integer>();

     int idx = 0;
     while (true) {
       int label = input.bytes[input.offset + idx] & 0xff;
       int nextState = runAutomaton.step(state, label);
       //if (DEBUG) System.out.println("  cycle label=" + (char) label + " nextState=" + nextState);

       if (idx == input.length-1) {
         if (nextState != -1 && runAutomaton.isAccept(nextState)) {
           // Input string is accepted
           if (idx >= output.bytes.length) {
             output.grow(1+idx);
           }
           output.bytes[idx] = (byte) label;
           output.length = input.length;
           //if (DEBUG) System.out.println("  input is accepted; return term=" + output.utf8ToString());
           return output;
         } else {
           nextState = -1;
         }
       }

       if (nextState == -1) {

         // Pop back to a state that has a transition
         // <= our label:
         while (true) {
           Transition[] transitions = sortedTransitions[state];
           if (transitions.length == 0) {
             assert runAutomaton.isAccept(state);
             output.length = idx;
             //if (DEBUG) System.out.println("  return " + output.utf8ToString());
             return output;
           } else if (label-1 < transitions[0].min) {

             if (runAutomaton.isAccept(state)) {
               output.length = idx;
               //if (DEBUG) System.out.println("  return " + output.utf8ToString());
               return output;
             }
             // pop
             if (stack.size() == 0) {
               //if (DEBUG) System.out.println("  pop ord=" + idx + " return null");
               return null;
             } else {
               state = stack.remove(stack.size()-1);
               idx--;
               //if (DEBUG) System.out.println("  pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
               label = input.bytes[input.offset + idx] & 0xff;
             }

           } else {
             //if (DEBUG) System.out.println("  stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
             break;
           }
         }

         //if (DEBUG) System.out.println("  label=" + (char) label + " idx=" + idx);

         return addTail(state, output, idx, label);

       } else {
         if (idx >= output.bytes.length) {
           output.grow(1+idx);
         }
         output.bytes[idx] = (byte) label;
         stack.add(state);
         state = nextState;
         idx++;
       }
     }
   }
 }
	package org.apache.lucene.util.automaton;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.index.Terms;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.search.PrefixTermsEnum;
	import org.apache.lucene.index.SingleTermsEnum;
	import org.apache.lucene.util.BytesRef;

	/**
	* Immutable class holding compiled details for a given
	* Automaton. The Automaton is deterministic, must not have
	* dead states but is not necessarily minimal.
	*
	* @lucene.experimental
	*/
	public class CompiledAutomaton {
	/**
	* Automata are compiled into different internal forms for the
	* most efficient execution depending upon the language they accept.
	*/
	public enum AUTOMATON_TYPE {
	/** Automaton that accepts no strings. */
	NONE,
	/** Automaton that accepts all possible strings. */
	ALL,
	/** Automaton that accepts only a single fixed string. */
	SINGLE,
	/** Automaton that matches all Strings with a constant prefix. */
	PREFIX,
	/** Catch-all for any other automata. */
	NORMAL
	};
	public final AUTOMATON_TYPE type;

	/**
	* For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term;
	* for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
	*/
	public final BytesRef term;

	/**
	* Matcher for quickly determining if a byte[] is accepted.
	* only valid for {@link AUTOMATON_TYPE#NORMAL}.
	*/
	public final ByteRunAutomaton runAutomaton;
	// TODO: would be nice if these sortedTransitions had "int
	// to;" instead of "State to;" somehow:
	/**
	* Two dimensional array of transitions, indexed by state
	* number for traversal. The state numbering is consistent with
	* {@link #runAutomaton}.
	* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
	*/
	public final Transition[][] sortedTransitions;
	/**
	* Shared common suffix accepted by the automaton. Only valid
	* for {@link AUTOMATON_TYPE#NORMAL}, and only when the
	* automaton accepts an infinite language.
	*/
	public final BytesRef commonSuffixRef;
	/**
	* Indicates if the automaton accepts a finite set of strings.
	* Null if this was not computed.
	* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
	*/
	public final Boolean finite;

	public CompiledAutomaton(Automaton automaton) {
	this(automaton, null, true);
	}

	public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {

	if (simplify) {
	// Test whether the automaton is a "simple" form and
	// if so, don't create a runAutomaton. Note that on a
	// large automaton these tests could be costly:
	if (BasicOperations.isEmpty(automaton)) {
	// matches nothing
	type = AUTOMATON_TYPE.NONE;
	term = null;
	commonSuffixRef = null;
	runAutomaton = null;
	sortedTransitions = null;
	this.finite = null;
	return;
	} else if (BasicOperations.isTotal(automaton)) {
	// matches all possible strings
	type = AUTOMATON_TYPE.ALL;
	term = null;
	commonSuffixRef = null;
	runAutomaton = null;
	sortedTransitions = null;
	this.finite = null;
	return;
	} else {
	final String commonPrefix;
	final String singleton;
	if (automaton.getSingleton() == null) {
	commonPrefix = SpecialOperations.getCommonPrefix(automaton);
	if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
	singleton = commonPrefix;
	} else {
	singleton = null;
	}
	} else {
	commonPrefix = null;
	singleton = automaton.getSingleton();
	}

	if (singleton != null) {
	// matches a fixed string in singleton or expanded
	// representation
	type = AUTOMATON_TYPE.SINGLE;
	term = new BytesRef(singleton);
	commonSuffixRef = null;
	runAutomaton = null;
	sortedTransitions = null;
	this.finite = null;
	return;
	} else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
	BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
	// matches a constant prefix
	type = AUTOMATON_TYPE.PREFIX;
	term = new BytesRef(commonPrefix);
	commonSuffixRef = null;
	runAutomaton = null;
	sortedTransitions = null;
	this.finite = null;
	return;
	}
	}
	}

	type = AUTOMATON_TYPE.NORMAL;
	term = null;
	if (finite == null) {
	this.finite = SpecialOperations.isFinite(automaton);
	} else {
	this.finite = finite;
	}
	Automaton utf8 = new UTF32ToUTF8().convert(automaton);
	if (this.finite) {
	commonSuffixRef = null;
	} else {
	commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
	}
	runAutomaton = new ByteRunAutomaton(utf8, true);
	sortedTransitions = utf8.getSortedTransitions();
	}

	//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;

	private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {

	// Find biggest transition that's < label
	// TODO: use binary search here
	Transition maxTransition = null;
	for (Transition transition : sortedTransitions[state]) {
	if (transition.min < leadLabel) {
	maxTransition = transition;
	}
	}

	assert maxTransition != null;

	// Append floorLabel
	final int floorLabel;
	if (maxTransition.max > leadLabel-1) {
	floorLabel = leadLabel-1;
	} else {
	floorLabel = maxTransition.max;
	}
	if (idx >= term.bytes.length) {
	term.grow(1+idx);
	}
	//if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx);
	term.bytes[idx] = (byte) floorLabel;

	state = maxTransition.to.getNumber();
	idx++;

	// Push down to last accept state
	while (true) {
	Transition[] transitions = sortedTransitions[state];
	if (transitions.length == 0) {
	assert runAutomaton.isAccept(state);
	term.length = idx;
	//if (DEBUG) System.out.println(" return " + term.utf8ToString());
	return term;
	} else {
	// We are pushing "top" -- so get last label of
	// last transition:
	assert transitions.length != 0;
	Transition lastTransition = transitions[transitions.length-1];
	if (idx >= term.bytes.length) {
	term.grow(1+idx);
	}
	//if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
	term.bytes[idx] = (byte) lastTransition.max;
	state = lastTransition.to.getNumber();
	idx++;
	}
	}
	}

	// TODO: should this take startTerm too? This way
	// Terms.intersect could forward to this method if type !=
	// NORMAL:
	public TermsEnum getTermsEnum(Terms terms) throws IOException {
	switch(type) {
	case NONE:
	return TermsEnum.EMPTY;
	case ALL:
	return terms.iterator(null);
	case SINGLE:
	return new SingleTermsEnum(terms.iterator(null), term);
	case PREFIX:
	// TODO: this is very likely faster than .intersect,
	// but we should test and maybe cutover
	return new PrefixTermsEnum(terms.iterator(null), term);
	case NORMAL:
	return terms.intersect(this, null);
	default:
	// unreachable
	throw new RuntimeException("unhandled case");
	}
	}

	/** Finds largest term accepted by this Automaton, that's
	* <= the provided input term. The result is placed in
	* output; it's fine for output and input to point to
	* the same BytesRef. The returned result is either the
	* provided output, or null if there is no floor term
	* (ie, the provided input term is before the first term
	* accepted by this Automaton). */
	public BytesRef floor(BytesRef input, BytesRef output) {

	output.offset = 0;
	//if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString());

	int state = runAutomaton.getInitialState();

	// Special case empty string:
	if (input.length == 0) {
	if (runAutomaton.isAccept(state)) {
	output.length = 0;
	return output;
	} else {
	return null;
	}
	}

	final List<Integer> stack = new ArrayList<Integer>();

	int idx = 0;
	while (true) {
	int label = input.bytes[input.offset + idx] & 0xff;
	int nextState = runAutomaton.step(state, label);
	//if (DEBUG) System.out.println(" cycle label=" + (char) label + " nextState=" + nextState);

	if (idx == input.length-1) {
	if (nextState != -1 && runAutomaton.isAccept(nextState)) {
	// Input string is accepted
	if (idx >= output.bytes.length) {
	output.grow(1+idx);
	}
	output.bytes[idx] = (byte) label;
	output.length = input.length;
	//if (DEBUG) System.out.println(" input is accepted; return term=" + output.utf8ToString());
	return output;
	} else {
	nextState = -1;
	}
	}

	if (nextState == -1) {

	// Pop back to a state that has a transition
	// <= our label:
	while (true) {
	Transition[] transitions = sortedTransitions[state];
	if (transitions.length == 0) {
	assert runAutomaton.isAccept(state);
	output.length = idx;
	//if (DEBUG) System.out.println(" return " + output.utf8ToString());
	return output;
	} else if (label-1 < transitions[0].min) {

	if (runAutomaton.isAccept(state)) {
	output.length = idx;
	//if (DEBUG) System.out.println(" return " + output.utf8ToString());
	return output;
	}
	// pop
	if (stack.size() == 0) {
	//if (DEBUG) System.out.println(" pop ord=" + idx + " return null");
	return null;
	} else {
	state = stack.remove(stack.size()-1);
	idx--;
	//if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
	label = input.bytes[input.offset + idx] & 0xff;
	}

	} else {
	//if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
	break;
	}
	}

	//if (DEBUG) System.out.println(" label=" + (char) label + " idx=" + idx);

	return addTail(state, output, idx, label);

	} else {
	if (idx >= output.bytes.length) {
	output.grow(1+idx);
	}
	output.bytes[idx] = (byte) label;
	stack.add(state);
	state = nextState;
	idx++;
	}
	}
	}
	}