LUCENE-3030: fix last nocommits
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/blocktree_3030@1158409 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/TODO b/TODO
index d4435a5..c78aaa5 100644
--- a/TODO
+++ b/TODO
@@ -1,8 +1,4 @@
-FAIL: ant test-core -Dtestcase=TestBlockTree -Dtestmethod=testEasy -Dtests.seed=-7181425274859016595:531125480476701899
-
-TestFieldCache can spin w/ -codec Pulsing!?
-
perf tests:
- GRRR -- indexing MUCH slower now?
trunk:
@@ -16,13 +12,8 @@
- test perf on single seg index too!
- try *larger* maxItemsInBlock: could give better net perf? ie less seeking and more scanning
- - run fuzzy q under profiler
-test if cutting over prefix query to .intersect is faster
-
-"upgrade" postings reader/writer API to match block tree; then non-tree codec uses it too and we only need 1 impl of each postings
-
-what to do about short terms that "force" a block to mark itself has hasTerms!!??
+what to do about short terms that "force" a block to mark itself as hasTerms!!??
- maybe instead of "isLeafBlock" bit we encode "countUntilNextTerm"? this way, for a block that only has empty string term we can stop scanning quickly?
- maybe make a "terms block cache" that holds low-prefix LRU term blocks in ram...?
- maybe a cache holding all short-length terms will be big perf boost? it saves having to scan the low-depth blocks... or... maybe a bit noting whether this block contains any terms != empty string suffix; or, we separately hold all 'short'/'straggler' terms in a map, enabling the low-depth blocks to then 'lie' and say they have no terms?
@@ -30,11 +21,8 @@
try forcing no hasTerms if depth < 2?
-automaton q should apply maxlength test on each scan'd term
-
-intersect should use suffix ref
-
LATER:
+ - test if cutting over prefix query to .intersect is faster
- maybe blocks should NOT store sub-block pointers? it's redundant w/ the index...
- hmm: maybe switch PKLookupTask to intersect!? do we have fast string builder?
- hmm -- fix DOT when there are multiple outputs!? oh, maybe not -- it just works?
diff --git a/lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java b/lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
index 9b425fb..1a0ca33 100644
--- a/lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
+++ b/lucene/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
@@ -83,6 +83,7 @@
super(tenum);
this.finite = compiled.finite;
this.runAutomaton = compiled.runAutomaton;
+ assert this.runAutomaton != null;
this.commonSuffixRef = compiled.commonSuffixRef;
this.allTransitions = compiled.sortedTransitions;
diff --git a/lucene/src/java/org/apache/lucene/index/Terms.java b/lucene/src/java/org/apache/lucene/index/Terms.java
index 7486b6e..55517e5 100644
--- a/lucene/src/java/org/apache/lucene/index/Terms.java
+++ b/lucene/src/java/org/apache/lucene/index/Terms.java
@@ -55,6 +55,9 @@
// TODO: eventually we could support seekCeil/Exact on
// the returned enum, instead of only being able to seek
// at the start
+ if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+ throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
+ }
if (startTerm == null) {
return new AutomatonTermsEnum(iterator(), compiled);
} else {
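
For reference, a rough caller-side sketch of the flow this guard expects (the IndexReader "reader" and the field name are illustrative, not part of this change): compile the automaton first and let CompiledAutomaton pick the cheapest enum, so that only NORMAL automata ever reach intersect:

    // Hedged sketch; "reader" and "field" are assumed to exist in the caller.
    Automaton a = new RegExp("foo.*bar", RegExp.NONE).toAutomaton();
    CompiledAutomaton compiled = new CompiledAutomaton(a);   // simplify=true
    Terms terms = MultiFields.getTerms(reader, "field");
    TermsEnum te = compiled.getTermsEnum(terms);             // EMPTY, all terms, single,
                                                             // prefix, or terms.intersect(this, null)
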
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
index 6ed3149..2955f11 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
@@ -270,6 +270,9 @@
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's e.g. a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fall back to hex:
return b.toString();
}
}
@@ -482,9 +485,9 @@
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
- // nocommit -- must check suffix ref
- // nocommit -- if A is infinte maybe fallback to ATE?
- // need to test...
+ if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+ throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
+ }
return new IntersectEnum(compiled, startTerm);
}
@@ -1045,17 +1048,40 @@
continue nextTerm;
}
- // nocommit -- in the case where common suffix
- // len is > currentFrame.suffix we should check
- // the suffix of the currentFrame.prefix too!
- final int lenToCheck = Math.min(currentFrame.suffix, compiledAutomaton.commonSuffixRef.length);
- final byte[] b1 = currentFrame.suffixBytes;
- int pos1 = currentFrame.startBytePos + currentFrame.suffix - lenToCheck;
- final byte[] b2 = compiledAutomaton.commonSuffixRef.bytes;
- int pos2 = compiledAutomaton.commonSuffixRef.offset + compiledAutomaton.commonSuffixRef.length - lenToCheck;
- final int pos2End = pos2 + lenToCheck;
- while(pos2 < pos2End) {
- if (b1[pos1++] != b2[pos2++]) {
+ final byte[] suffixBytes = currentFrame.suffixBytes;
+ final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes;
+
+ final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix;
+ final int suffixLenToCheck;
+ assert compiledAutomaton.commonSuffixRef.offset == 0;
+ int suffixBytesPos;
+ int commonSuffixBytesPos = 0;
+
+ if (lenInPrefix > 0) {
+ // A prefix of the common suffix overlaps with
+ // the suffix of the block prefix so we first
+ // test whether the prefix part matches:
+ final byte[] termBytes = term.bytes;
+ int termBytesPos = currentFrame.prefix - lenInPrefix;
+ assert termBytesPos >= 0;
+ final int termBytesPosEnd = currentFrame.prefix;
+ while (termBytesPos < termBytesPosEnd) {
+ if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
+ if (DEBUG) {
+ System.out.println(" skip: common suffix mismatch (in prefix)");
+ }
+ continue nextTerm;
+ }
+ }
+ suffixBytesPos = currentFrame.startBytePos;
+ } else {
+ suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length;
+ }
+
+ // Test overlapping suffix part:
+ final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length;
+ while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
+ if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
if (DEBUG) {
System.out.println(" skip: common suffix mismatch");
}
@@ -1064,6 +1090,12 @@
}
}
+ // TODO: maybe we should do the same linear test
+ // that AutomatonTermsEnum does, so that if we
+ // reach a part of the automaton where .* is
+ // "temporarily" accepted, we just blindly .next()
+ // until the limit
+
// See if the term prefix matches the automaton:
int state = currentFrame.state;
for (int idx=0;idx<currentFrame.suffix;idx++) {
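
The check above splits the common-suffix comparison across two byte sources: when the automaton's common suffix is longer than this entry's suffix bytes, its leading bytes are matched against the tail of the shared block prefix (held in term.bytes) and the remainder against the entry's own suffix bytes. A standalone restatement of that index arithmetic, with hypothetical array names (prefixBytes/suffixBytes/commonSuffix are not names from this patch):

    // Does prefix+suffix end with commonSuffix?  Hedged sketch only.
    static boolean endsWithCommonSuffix(byte[] prefixBytes, int prefixLen,
                                        byte[] suffixBytes, int suffixStart, int suffixLen,
                                        byte[] commonSuffix) {
      final int lenInPrefix = commonSuffix.length - suffixLen;
      int cs = 0;
      if (lenInPrefix > 0) {
        // Leading part of the common suffix falls inside the shared prefix:
        for (int p = prefixLen - lenInPrefix; p < prefixLen; p++) {
          if (prefixBytes[p] != commonSuffix[cs++]) {
            return false;
          }
        }
        // The rest must match the entire entry suffix:
        for (int s = suffixStart; cs < commonSuffix.length; s++) {
          if (suffixBytes[s] != commonSuffix[cs++]) {
            return false;
          }
        }
      } else {
        // Common suffix fits entirely inside the entry's suffix bytes:
        for (int s = suffixStart + suffixLen - commonSuffix.length; cs < commonSuffix.length; s++) {
          if (suffixBytes[s] != commonSuffix[cs++]) {
            return false;
          }
        }
      }
      return true;
    }
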
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
index e685135..fddb0b4 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
@@ -423,8 +423,9 @@
// First pass: count up how many items fall under
// each unique label after the prefix.
- // nocommit: this is wasteful since the builder had
- // already done this but we discarded it...
+ // TODO: this is wasteful since the builder had
+ // already done this (partitioned these sub-terms
+ // according to their leading prefix byte)
final List<Object> slice = pending.subList(pending.size()-count, pending.size());
int lastLabel = -1;
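
The "first pass" above just tallies, for each distinct byte immediately following the shared prefix, how many pending entries carry it. A hedged restatement over plain byte arrays (pendingSuffixes and prefixLength are hypothetical stand-ins; the real code walks a mix of pending terms and sub-blocks):

    // Count how many pending entries fall under each label byte after the prefix.
    int[] countPerLabel = new int[256];
    for (byte[] termBytes : pendingSuffixes) {
      int label = termBytes[prefixLength] & 0xFF;   // first byte after the common prefix
      countPerLabel[label]++;
    }
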
diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java
index 82c450a..bfca59c 100644
--- a/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/AutomatonQuery.java
@@ -51,20 +51,10 @@
public class AutomatonQuery extends MultiTermQuery {
/** the automaton to match index terms against */
protected final Automaton automaton;
+ protected final CompiledAutomaton compiled;
/** term containing the field, and possibly some pattern structure */
protected final Term term;
- /**
- * abstraction for returning a termsenum:
- * in the ctor the query computes one of these, the actual
- * implementation depends upon the automaton's structure.
- */
- private abstract class TermsEnumFactory {
- protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
- }
-
- private final TermsEnumFactory factory;
-
/**
* Create a new AutomatonQuery from an {@link Automaton}.
*
@@ -77,72 +67,12 @@
super(term.field());
this.term = term;
this.automaton = automaton;
-
- if (BasicOperations.isEmpty(automaton)) {
- // matches nothing
- factory = new TermsEnumFactory() {
- @Override
- protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- return TermsEnum.EMPTY;
- }
- };
- } else if (BasicOperations.isTotal(automaton)) {
- // matches all possible strings
- factory = new TermsEnumFactory() {
- @Override
- protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- return terms.iterator();
- }
- };
- } else {
- final String singleton;
- final String commonPrefix;
-
- if (automaton.getSingleton() == null) {
- commonPrefix = SpecialOperations.getCommonPrefix(automaton);
- if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
- singleton = commonPrefix;
- } else {
- singleton = null;
- }
- } else {
- commonPrefix = null;
- singleton = automaton.getSingleton();
- }
-
- if (singleton != null) {
- // matches a fixed string in singleton or expanded representation
- factory = new TermsEnumFactory() {
- @Override
- protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- return new SingleTermsEnum(terms.iterator(), new Term(field, singleton));
- }
- };
- } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
- BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
- // matches a constant prefix
- factory = new TermsEnumFactory() {
- @Override
- protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- return new PrefixTermsEnum(terms.iterator(), new Term(field, commonPrefix));
- }
- };
- } else {
- final CompiledAutomaton compiled =
- new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton));
- factory = new TermsEnumFactory() {
- @Override
- protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- return terms.intersect(compiled, null);
- }
- };
- }
- }
+ this.compiled = new CompiledAutomaton(automaton);
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- return factory.getTermsEnum(terms, atts);
+ return compiled.getTermsEnum(terms);
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
index a7ea7d8..fa9f077 100644
--- a/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
@@ -138,7 +138,7 @@
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
if (!termLongEnough) { // can only match if it's exact
- return new SingleTermsEnum(terms.iterator(), term);
+ return new SingleTermsEnum(terms.iterator(), term.bytes());
}
return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength);
}
diff --git a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
index 03a9ffc..64c057c 100644
--- a/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@@ -172,7 +172,7 @@
UnicodeUtil.newString(termText, 0, realPrefixLength));
a = BasicOperations.concatenate(prefix, a);
}
- runAutomata.add(new CompiledAutomaton(a, true));
+ runAutomata.add(new CompiledAutomaton(a, true, false));
}
}
return runAutomata;
diff --git a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java
index 27a89bb..42cdebf 100644
--- a/lucene/src/java/org/apache/lucene/search/PrefixQuery.java
+++ b/lucene/src/java/org/apache/lucene/search/PrefixQuery.java
@@ -51,7 +51,7 @@
// no prefix -- match all terms for this field:
return tenum;
}
- return new PrefixTermsEnum(tenum, prefix);
+ return new PrefixTermsEnum(tenum, prefix.bytes());
}
/** Prints a user-readable version of this query. */
diff --git a/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java
index a5f8fcf..c5bdd08 100644
--- a/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java
+++ b/lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java
@@ -34,9 +34,9 @@
private final BytesRef prefixRef;
- public PrefixTermsEnum(TermsEnum tenum, Term prefix) throws IOException {
+ public PrefixTermsEnum(TermsEnum tenum, BytesRef prefixText) throws IOException {
super(tenum);
- setInitialSeekTerm(prefixRef = prefix.bytes());
+ setInitialSeekTerm(this.prefixRef = prefixText);
}
@Override
diff --git a/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java
index 55fadef..c2726a2 100644
--- a/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java
+++ b/lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java
@@ -39,10 +39,10 @@
* After calling the constructor the enumeration is already pointing to the term,
* if it exists.
*/
- public SingleTermsEnum(TermsEnum tenum, Term singleTerm) throws IOException {
+ public SingleTermsEnum(TermsEnum tenum, BytesRef termText) throws IOException {
super(tenum);
- singleRef = singleTerm.bytes();
- setInitialSeekTerm(singleRef);
+ singleRef = termText;
+ setInitialSeekTerm(termText);
}
@Override
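
With both constructors taking raw bytes, callers that already hold a BytesRef (for example CompiledAutomaton.getTermsEnum, further down) no longer need to wrap it in a Term. A hedged usage sketch (the Terms instance "terms" is assumed to exist in the caller):

    TermsEnum single = new SingleTermsEnum(terms.iterator(), new BytesRef("exact-term"));
    TermsEnum prefixed = new PrefixTermsEnum(terms.iterator(), new BytesRef("pre"));
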
diff --git a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java
index 4e633a7..0a75f37 100644
--- a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java
+++ b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java
@@ -89,13 +89,13 @@
// for assert:
private BytesRef lastTerm;
- private boolean compareToLastTerm(BytesRef t) {
+ private boolean compareToLastTerm(BytesRef t) throws IOException {
if (lastTerm == null && t != null) {
lastTerm = new BytesRef(t);
} else if (t == null) {
lastTerm = null;
} else {
- assert lastTerm.compareTo(t) < 0: "lastTerm=" + lastTerm + " t=" + t;
+ assert termsEnum.getComparator().compare(lastTerm, t) < 0: "lastTerm=" + lastTerm + " t=" + t;
lastTerm.copy(t);
}
return true;
@@ -106,10 +106,7 @@
final float boost = boostAtt.getBoost();
// make sure within a single seg we always collect
- // terms in order -- nocommit dangerous, if codec
- // uses its own termComp? should we nuke that
- // ability? terms must always be "unsigned byte[]
- // order"?
+ // terms in order
assert compareToLastTerm(bytes);
//System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
@@ -160,11 +157,7 @@
final Q q = getTopLevelQuery();
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp);
- // nocommit -- how come I see 2 seekExacts for the same
- // term in a row in the terms dict, around here...?
- // could this be if we are using a non-intersect() capable terms impl (e.g. preflex)
- // that its the off-by-one regarding 'start term' ? are we seeking backwards!!!!!!
for (final ScoreTerm st : scoreTerms) {
final Term term = new Term(query.field, st.bytes);
assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term;
diff --git a/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
index 7ef39ba..6dc9105 100644
--- a/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
+++ b/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@@ -17,10 +17,17 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.BlockTreeTermsWriter;
+import org.apache.lucene.search.PrefixTermsEnum;
+import org.apache.lucene.search.SingleTermsEnum;
+import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
/**
@@ -31,29 +38,106 @@
* @lucene.experimental
*/
public class CompiledAutomaton {
+ public enum AUTOMATON_TYPE {NONE, ALL, SINGLE, PREFIX, NORMAL};
+ public final AUTOMATON_TYPE type;
+
+ // For PREFIX, this is the prefix term; for SINGLE this is
+ // the singleton term:
+ public final BytesRef term;
+
+ // NOTE: the next 4 members are only non-null if type ==
+ // NORMAL:
public final ByteRunAutomaton runAutomaton;
// TODO: would be nice if these sortedTransitions had "int
// to;" instead of "State to;" somehow:
public final Transition[][] sortedTransitions;
public final BytesRef commonSuffixRef;
- public final boolean finite;
+ public final Boolean finite;
- // nocommit -- move pulling of a TermsEnum into here, so
- // that we can optimize for cases where a simpler enume
- // (prefix enum, all terms, no terms, etc.) can be used
-
- public CompiledAutomaton(Automaton automaton, boolean finite) {
+ public CompiledAutomaton(Automaton automaton) {
+ this(automaton, null, true);
+ }
+
+ public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
+
+ if (simplify) {
+ // Test whether the automaton is a "simple" form and
+ // if so, don't create a runAutomaton. Note that on a
+ // large automaton these tests could be costly:
+ if (BasicOperations.isEmpty(automaton)) {
+ // matches nothing
+ type = AUTOMATON_TYPE.NONE;
+ term = null;
+ commonSuffixRef = null;
+ runAutomaton = null;
+ sortedTransitions = null;
+ this.finite = null;
+ return;
+ } else if (BasicOperations.isTotal(automaton)) {
+ // matches all possible strings
+ type = AUTOMATON_TYPE.ALL;
+ term = null;
+ commonSuffixRef = null;
+ runAutomaton = null;
+ sortedTransitions = null;
+ this.finite = null;
+ return;
+ } else {
+ final String commonPrefix;
+ final String singleton;
+ if (automaton.getSingleton() == null) {
+ commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+ if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
+ singleton = commonPrefix;
+ } else {
+ singleton = null;
+ }
+ } else {
+ commonPrefix = null;
+ singleton = automaton.getSingleton();
+ }
+
+ if (singleton != null) {
+ // matches a fixed string in singleton or expanded
+ // representation
+ type = AUTOMATON_TYPE.SINGLE;
+ term = new BytesRef(singleton);
+ commonSuffixRef = null;
+ runAutomaton = null;
+ sortedTransitions = null;
+ this.finite = null;
+ return;
+ } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
+ BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
+ // matches a constant prefix
+ type = AUTOMATON_TYPE.PREFIX;
+ term = new BytesRef(commonPrefix);
+ commonSuffixRef = null;
+ runAutomaton = null;
+ sortedTransitions = null;
+ this.finite = null;
+ return;
+ }
+ }
+ }
+
+ type = AUTOMATON_TYPE.NORMAL;
+ term = null;
+ if (finite == null) {
+ this.finite = SpecialOperations.isFinite(automaton);
+ } else {
+ this.finite = finite;
+ }
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
- runAutomaton = new ByteRunAutomaton(utf8, true);
- sortedTransitions = utf8.getSortedTransitions();
- this.finite = finite;
- if (finite) {
+ if (this.finite) {
commonSuffixRef = null;
} else {
commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
}
+ runAutomaton = new ByteRunAutomaton(utf8, true);
+ sortedTransitions = utf8.getSortedTransitions();
}
-
+
private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {
@@ -109,6 +193,29 @@
}
}
+ // TODO: should this take startTerm too? This way
+ // Terms.intersect could forward to this method if type !=
+ // NORMAL:
+ public TermsEnum getTermsEnum(Terms terms) throws IOException {
+ switch(type) {
+ case NONE:
+ return TermsEnum.EMPTY;
+ case ALL:
+ return terms.iterator();
+ case SINGLE:
+ return new SingleTermsEnum(terms.iterator(), term);
+ case PREFIX:
+ // TODO: this is very likely faster than .intersect,
+ // but we should test and maybe cutover
+ return new PrefixTermsEnum(terms.iterator(), term);
+ case NORMAL:
+ return terms.intersect(this, null);
+ default:
+ // unreachable
+ throw new RuntimeException("unhandled case");
+ }
+ }
+
/** Finds largest term accepted by this Automaton, that's
* <= the provided input term. The result is placed in
* output; it's fine for output and input to point to
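
Taken together, the constructor classifies the automaton once and getTermsEnum dispatches on that type. A hedged sketch of the expected classification for a few representative automata (the asserts are illustrative and not part of this patch):

    assert new CompiledAutomaton(BasicAutomata.makeEmpty()).type == CompiledAutomaton.AUTOMATON_TYPE.NONE;
    assert new CompiledAutomaton(BasicAutomata.makeAnyString()).type == CompiledAutomaton.AUTOMATON_TYPE.ALL;
    assert new CompiledAutomaton(BasicAutomata.makeString("foo")).type == CompiledAutomaton.AUTOMATON_TYPE.SINGLE;
    assert new CompiledAutomaton(BasicOperations.concatenate(
               BasicAutomata.makeString("foo"), BasicAutomata.makeAnyString())).type == CompiledAutomaton.AUTOMATON_TYPE.PREFIX;
    // Anything that survives simplification is NORMAL and goes through Terms.intersect:
    assert new CompiledAutomaton(new RegExp("fo[ox]*", RegExp.NONE).toAutomaton()).type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
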
diff --git a/lucene/src/java/org/apache/lucene/util/fst/Util.java b/lucene/src/java/org/apache/lucene/util/fst/Util.java
index 5aa638f..4b4e7ad 100644
--- a/lucene/src/java/org/apache/lucene/util/fst/Util.java
+++ b/lucene/src/java/org/apache/lucene/util/fst/Util.java
@@ -318,7 +318,6 @@
}
if (!fst.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) {
- // nocommit -- this is broken again?
// Tricky special case: sometimes, due to
// pruning, the builder can [sillily] produce
// an FST with an arc into the final end state
diff --git a/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
index 8dba83d..1861e7d 100644
--- a/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
+++ b/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
@@ -270,7 +270,7 @@
}
a = DaciukMihovAutomatonBuilder.build(sortedAcceptTerms);
}
- final CompiledAutomaton c = new CompiledAutomaton(a, true);
+ final CompiledAutomaton c = new CompiledAutomaton(a, true, false);
final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
final Set<BytesRef> acceptTermsSet = new HashSet<BytesRef>();
diff --git a/lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java b/lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java
index 6612360..cb60e0c 100644
--- a/lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java
+++ b/lucene/src/test/org/apache/lucene/index/TestTermsEnum2.java
@@ -165,7 +165,7 @@
for (int i = 0; i < numIterations; i++) {
String reg = AutomatonTestUtil.randomRegexp(random);
Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
- CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton));
+ CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);
TreeSet<BytesRef> found = new TreeSet<BytesRef>();
diff --git a/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java b/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
index 5fe994c..f346d4c 100644
--- a/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
+++ b/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
@@ -36,7 +36,7 @@
}
Automaton a = BasicOperations.union(as);
a.determinize();
- return new CompiledAutomaton(a, true);
+ return new CompiledAutomaton(a, true, false);
}
private void testFloor(CompiledAutomaton c, String input, String expected) {
diff --git a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
index c6cb6ce..1ba73c3 100644
--- a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
@@ -1707,6 +1707,7 @@
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
w.close();
+ //System.out.println(w.toString());
assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1);
}
@@ -1721,6 +1722,7 @@
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
Util.toDot(fst, w, false, false);
w.close();
+ //System.out.println(w.toString());
assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1);
}