blob: bb020b9d24e01f80a64bedc3137ca3f84bbfc255 [file] [log] [blame]
Index: lucene/common-build.xml
===================================================================
--- lucene/common-build.xml (revision 1392569)
+++ lucene/common-build.xml (working copy)
@@ -819,10 +819,10 @@
<classpath refid="clover.classpath" />
<!-- Assertions. -->
- <assertions>
+ <!-- <assertions>
<enable package="org.apache.lucene"/>
<enable package="org.apache.solr"/>
- </assertions>
+ </assertions> -->
<!-- JVM arguments and system properties. -->
<jvmarg line="${args}"/>
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (revision 1392569)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (working copy)
@@ -36,6 +36,7 @@
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.search.suggest.Lookup; // javadocs
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
+import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
@@ -47,7 +48,7 @@
/**
* Benchmarks tests for implementations of {@link Lookup} interface.
*/
-@Ignore("COMMENT ME TO RUN BENCHMARKS!")
+//@Ignore("COMMENT ME TO RUN BENCHMARKS!")
public class LookupBenchmarkTest extends LuceneTestCase {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
@@ -55,7 +56,8 @@
TSTLookup.class,
FSTCompletionLookup.class,
WFSTCompletionLookup.class,
- AnalyzingSuggester.class);
+ AnalyzingSuggester.class,
+ FuzzySuggester.class);
private final static int rounds = 15;
private final static int warmup = 5;
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision 0)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (working copy)
@@ -0,0 +1,141 @@
+package org.apache.lucene.search.suggest.analyzing;
+
+import java.util.Arrays;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.SpecialOperations;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Variant of {@link AnalyzingSuggester} that also matches analyzed query
+ * paths inexactly: the first <code>minPrefix</code> labels of each path must
+ * match exactly, and the rest may differ by up to <code>maxEdits</code> edits
+ * (optionally counting a transposition as a single edit).
+ */
+public class FuzzySuggester extends AnalyzingSuggester {
+  private final int maxEdits;
+  private final boolean transpositions;
+  private final int minPrefix;
+
+  /**
+   * Creates a suggester that uses <code>analyzer</code> for both indexing and
+   * querying, with the same defaults as
+   * {@link #FuzzySuggester(Analyzer, Analyzer)}.
+   */
+  public FuzzySuggester(Analyzer analyzer) {
+    this(analyzer, analyzer);
+  }
+
+  /**
+   * Creates a suggester with default settings: options
+   * <code>EXACT_FIRST | PRESERVE_SEP</code>, 256 surface forms per analyzed
+   * form, <code>maxGraphExpansions</code> of -1, one edit with transpositions
+   * enabled, and an exact prefix of one label.
+   */
+  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
+    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, 1, true, 1);
+  }
+
+  // nocommit: probably want an option to like, require the first character or something :)
+  /**
+   * Creates a suggester with explicit settings. The first five arguments are
+   * passed through to {@link AnalyzingSuggester} unchanged.
+   *
+   * @param maxEdits maximum number of edits allowed after the exact prefix;
+   *        must be between 0 and
+   *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
+   * @param transpositions passed to {@link LevenshteinAutomata}: whether a
+   *        transposition counts as a primitive edit
+   * @param minPrefix number of leading labels of each analyzed path that
+   *        must match exactly; must not be negative
+   * @throws IllegalArgumentException if <code>maxEdits</code> or
+   *         <code>minPrefix</code> is out of range
+   */
+  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
+      int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, int maxEdits, boolean transpositions, int minPrefix) {
+    super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
+    // LevenshteinAutomata.toAutomaton returns null for distances it cannot
+    // build, which would otherwise surface later as a NullPointerException
+    // at lookup time; reject such values up front.
+    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+      throw new IllegalArgumentException("maxEdits must be between 0 and "
+          + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " (got: " + maxEdits + ")");
+    }
+    // a negative prefix would make fudge() read outside the analyzed path
+    if (minPrefix < 0) {
+      throw new IllegalArgumentException("minPrefix must not be negative (got: " + minPrefix + ")");
+    }
+    this.maxEdits = maxEdits;
+    this.transpositions = transpositions;
+    this.minPrefix = minPrefix;
+  }
+
+  /**
+   * Rewrites the analyzed query automaton into one that also accepts fuzzy
+   * variants. Every finite path is enumerated; paths no longer than
+   * <code>minPrefix</code> are kept as exact strings, longer ones become an
+   * exact prefix followed by a Levenshtein automaton over the remainder.
+   *
+   * @return the union of the per-path automata (determinized when there is
+   *         more than one), or an empty automaton if there are no paths
+   */
+  @Override
+  protected Automaton fudge(Automaton automaton) {
+    // nocommit: how slow can this be :)
+    Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
+    Automaton subs[] = new Automaton[ref.size()];
+    int upto = 0;
+    for (IntsRef path : ref) {
+      if (path.length <= minPrefix) {
+        // nothing left after the exact prefix: match the path literally
+        subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
+        upto++;
+      } else {
+        Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, minPrefix);
+        int ints[] = Arrays.copyOfRange(path.ints, path.offset + minPrefix, path.offset + path.length);
+        // alphaMax is an inclusive upper bound on labels, so the byte
+        // alphabet ends at 255, not 256
+        // NOTE(review): assumes analyzed paths carry byte labels -- confirm
+        LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
+        Automaton suffix = lev.toAutomaton(maxEdits);
+        Automaton combined = BasicOperations.concatenate(prefix, suffix);
+        combined.setDeterministic(true); // it's like the special case in concatenate itself, except we cloneExpanded already
+        subs[upto] = combined;
+        upto++;
+      }
+    }
+    if (subs.length == 0) {
+      return BasicAutomata.makeEmpty(); // matches nothing
+    } else if (subs.length == 1) {
+      return subs[0];
+    } else {
+      Automaton a = BasicOperations.union(Arrays.asList(subs));
+      // nocommit: we could call fudge() before det?
+      // this only happens if you have multiple paths anyway (e.g. synonyms)
+      BasicOperations.determinize(a);
+      return a;
+    }
+  }
+}
Property changes on: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1392679)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy)
@@ -505,6 +505,8 @@
// while we convert
BasicOperations.determinize(automaton);
+ automaton = fudge(automaton);
+
final CharsRef spare = new CharsRef();
//System.out.println(" now intersect exactFirst=" + exactFirst);
@@ -655,4 +657,9 @@
return left.output1.compareTo(right.output1);
}
};
+
+ /** nocommit: terrible name and api! Subclass hook invoked on the determinized lookup automaton; this default returns it unchanged. */
+ protected Automaton fudge(Automaton automaton) {
+ return automaton;
+ }
}
Index: lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (revision 1392569)
+++ lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (working copy)
@@ -35,7 +35,7 @@
String msg = "Test class requires enabled assertions, enable globally (-ea)" +
" or for Solr/Lucene subpackages only: " + description.getClassName();
System.err.println(msg);
- throw new Exception(msg);
+ // nocommit: throw new Exception(msg);
} catch (AssertionError e) {
// Ok, enabled.
}
Index: lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 1392569)
+++ lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (working copy)
@@ -240,6 +240,20 @@
a.deterministic = true;
return a;
}
+
+ /**
+  * Returns a new (deterministic) automaton that accepts exactly one label
+  * sequence: <code>word[offset]</code> through
+  * <code>word[offset+length-1]</code>.
+  */
+ public static Automaton makeString(int[] word, int offset, int length) {
+   final Automaton automaton = new Automaton();
+   automaton.setDeterministic(true);
+   // build a single chain of states, one transition per label
+   State tail = new State();
+   automaton.initial = tail;
+   final int end = offset + length;
+   for (int pos = offset; pos < end; pos++) {
+     final State next = new State();
+     tail.addTransition(new Transition(word[pos], next));
+     tail = next;
+   }
+   // only the last state of the chain accepts
+   tail.accept = true;
+   return automaton;
+ }
/**
* Returns a new (deterministic and minimal) automaton that accepts the union
Index: lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (revision 1392569)
+++ lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (working copy)
@@ -33,12 +33,13 @@
/** @lucene.internal */
public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
/* input word */
- final String input;
final int word[];
/* the automata alphabet. */
final int alphabet[];
+ /* the maximum symbol in the alphabet, inclusive (e.g. 255 for UTF-8 bytes or 0x10FFFF for Unicode code points) */
+ final int alphaMax;
- /* the unicode ranges outside of alphabet */
+ /* the ranges outside of alphabet */
final int rangeLower[];
final int rangeUpper[];
int numRanges = 0;
@@ -50,12 +51,15 @@
* Optionally count transpositions as a primitive edit.
*/
public LevenshteinAutomata(String input, boolean withTranspositions) {
- this.input = input;
- int length = Character.codePointCount(input, 0, input.length());
- word = new int[length];
- for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
- word[j++] = cp = input.codePointAt(i);
- }
+ this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
+ }
+
+ /**
+ * Expert: Don't use this!
+ */
+ public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
+ this.word = word;
+ this.alphaMax = alphaMax;
// calculate the alphabet
SortedSet<Integer> set = new TreeSet<Integer>();
@@ -81,9 +85,9 @@
lower = higher + 1;
}
/* add the final endpoint */
- if (lower <= Character.MAX_CODE_POINT) {
+ if (lower <= alphaMax) {
rangeLower[numRanges] = lower;
- rangeUpper[numRanges] = Character.MAX_CODE_POINT;
+ rangeUpper[numRanges] = alphaMax;
numRanges++;
}
@@ -94,6 +98,15 @@
};
}
+ /** Decodes <code>input</code> into an array holding its Unicode code points, in order. */
+ private static int[] codePoints(String input) {
+   final int numChars = input.length();
+   final int[] result = new int[Character.codePointCount(input, 0, numChars)];
+   int charPos = 0;
+   int upto = 0;
+   while (charPos < numChars) {
+     final int codePoint = input.codePointAt(charPos);
+     result[upto++] = codePoint;
+     // advance by one or two chars depending on whether this was a surrogate pair
+     charPos += Character.charCount(codePoint);
+   }
+   return result;
+ }
+
/**
* Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
* <p>
@@ -106,8 +119,9 @@
* </p>
*/
public Automaton toAutomaton(int n) {
- if (n == 0)
- return BasicAutomata.makeString(input);
+ if (n == 0) {
+ return BasicAutomata.makeString(word, 0, word.length);
+ }
if (n >= descriptions.length)
return null;