| Index: lucene/common-build.xml |
| =================================================================== |
| --- lucene/common-build.xml (revision 1392569) |
| +++ lucene/common-build.xml (working copy) |
| @@ -819,10 +819,10 @@ |
| <classpath refid="clover.classpath" /> |
| |
| <!-- Assertions. --> |
| - <assertions> |
| + <!-- <assertions> |
| <enable package="org.apache.lucene"/> |
| <enable package="org.apache.solr"/> |
| - </assertions> |
| + </assertions> --> |
| |
| <!-- JVM arguments and system properties. --> |
| <jvmarg line="${args}"/> |
| Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java |
| =================================================================== |
| --- lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (revision 1392569) |
| +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (working copy) |
| @@ -36,6 +36,7 @@ |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.search.suggest.Lookup; // javadocs |
| import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester; |
| +import org.apache.lucene.search.suggest.analyzing.FuzzySuggester; |
| import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; |
| import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup; |
| import org.apache.lucene.search.suggest.jaspell.JaspellLookup; |
| @@ -47,7 +48,7 @@ |
| /** |
| * Benchmarks tests for implementations of {@link Lookup} interface. |
| */ |
| -@Ignore("COMMENT ME TO RUN BENCHMARKS!") |
| +//@Ignore("COMMENT ME TO RUN BENCHMARKS!") |
| public class LookupBenchmarkTest extends LuceneTestCase { |
| @SuppressWarnings("unchecked") |
| private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList( |
| @@ -55,7 +56,8 @@ |
| TSTLookup.class, |
| FSTCompletionLookup.class, |
| WFSTCompletionLookup.class, |
| - AnalyzingSuggester.class); |
| + AnalyzingSuggester.class, |
| + FuzzySuggester.class); |
| |
| private final static int rounds = 15; |
| private final static int warmup = 5; |
| Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java |
| =================================================================== |
| --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision 0) |
| +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (working copy) |
| @@ -0,0 +1,125 @@ |
| +package org.apache.lucene.search.suggest.analyzing; |
| + |
| +import java.util.Arrays; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.util.IntsRef; |
| +import org.apache.lucene.util.automaton.Automaton; |
| +import org.apache.lucene.util.automaton.BasicAutomata; |
| +import org.apache.lucene.util.automaton.BasicOperations; |
| +import org.apache.lucene.util.automaton.LevenshteinAutomata; |
| +import org.apache.lucene.util.automaton.SpecialOperations; |
| + |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +/** |
| + * {@link AnalyzingSuggester} variant whose lookup automaton also accepts inputs within a |
| + * bounded edit distance of the analyzed query. |
| + * <p> |
| + * Every finite path of the query automaton keeps its first {@code minPrefix} symbols exact; |
| + * the remaining symbols are replaced by a Levenshtein automaton of up to {@code maxEdits} edits. |
| + */ |
| +public class FuzzySuggester extends AnalyzingSuggester { |
| + /** |
| + * Largest symbol handed to {@link LevenshteinAutomata}; a byte-valued alphabet ends at 255, not 256. |
| + * NOTE(review): assumes the automaton labels are bytes, as the previous literal 256 implied - confirm. |
| + */ |
| + private static final int MAX_LABEL = 255; |
| + |
| + private final int maxEdits; |
| + private final boolean transpositions; |
| + private final int minPrefix; |
| + |
| + /** Uses {@code analyzer} at both index and query time, with the default fuzzy settings. */ |
| + public FuzzySuggester(Analyzer analyzer) { |
| + this(analyzer, analyzer); |
| + } |
| + |
| + /** |
| + * Uses the defaults: options {@code EXACT_FIRST | PRESERVE_SEP}, 256 surface forms per analyzed |
| + * form, -1 graph expansions, 1 edit with transpositions enabled, and an exact prefix of 1 symbol. |
| + */ |
| + public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { |
| + this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, 1, true, 1); |
| + } |
| + |
| + // nocommit: probably want an option to like, require the first character or something :) |
| + /** |
| + * Creates a fuzzy suggester; the first five arguments are passed through to {@link AnalyzingSuggester}. |
| + * |
| + * @param maxEdits maximum number of edits, from 0 to {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} |
| + * @param transpositions whether a transposition counts as a single edit |
| + * @param minPrefix number of leading symbols that must match exactly; must be &gt;= 0 |
| + * @throws IllegalArgumentException if {@code maxEdits} or {@code minPrefix} is out of range |
| + */ |
| + public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, |
| + int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, int maxEdits, boolean transpositions, int minPrefix) { |
| + super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions); |
| + // Fail fast: toAutomaton() returns null for a distance it cannot build, which would only surface later inside fudge(). |
| + if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { |
| + throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " (got " + maxEdits + ")"); |
| + } |
| + if (minPrefix < 0) { |
| + throw new IllegalArgumentException("minPrefix must be >= 0 (got " + minPrefix + ")"); |
| + } |
| + this.maxEdits = maxEdits; |
| + this.transpositions = transpositions; |
| + this.minPrefix = minPrefix; |
| + } |
| + |
| + /** |
| + * Rewrites {@code automaton} into the union, over each of its finite strings, of the exact |
| + * {@code minPrefix}-symbol prefix followed by a Levenshtein automaton for the remaining symbols. |
| + * Strings of at most {@code minPrefix} symbols are kept exact. |
| + */ |
| + @Override |
| + protected Automaton fudge(Automaton automaton) { |
| + // nocommit: how slow can this be :) |
| + Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1); |
| + Automaton subs[] = new Automaton[ref.size()]; |
| + int upto = 0; |
| + for (IntsRef path : ref) { |
| + if (path.length <= minPrefix) { |
| + subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length); |
| + upto++; |
| + } else { |
| + Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, minPrefix); |
| + int ints[] = new int[path.length-minPrefix]; |
| + System.arraycopy(path.ints, path.offset+minPrefix, ints, 0, ints.length); |
| + LevenshteinAutomata lev = new LevenshteinAutomata(ints, MAX_LABEL, transpositions); |
| + Automaton suffix = lev.toAutomaton(maxEdits); |
| + Automaton combined = BasicOperations.concatenate(prefix, suffix); |
| + combined.setDeterministic(true); // it's like the special case in concatenate itself, except we cloneExpanded already |
| + subs[upto] = combined; |
| + upto++; |
| + } |
| + } |
| + if (subs.length == 0) { |
| + return BasicAutomata.makeEmpty(); // matches nothing |
| + } else if (subs.length == 1) { |
| + return subs[0]; |
| + } else { |
| + Automaton a = BasicOperations.union(Arrays.asList(subs)); |
| + // nocommit: we could call fudge() before det? |
| + // this only happens if you have multiple paths anyway (e.g. synonyms) |
| + BasicOperations.determinize(a); |
| + return a; |
| + } |
| + } |
| +} |
| |
| Property changes on: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| ## -0,0 +1 ## |
| +native |
| Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java |
| =================================================================== |
| --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1392679) |
| +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) |
| @@ -505,6 +505,8 @@ |
| // while we convert |
| BasicOperations.determinize(automaton); |
| |
| + automaton = fudge(automaton); |
| + |
| final CharsRef spare = new CharsRef(); |
| |
| //System.out.println(" now intersect exactFirst=" + exactFirst); |
| @@ -655,4 +657,9 @@ |
| return left.output1.compareTo(right.output1); |
| } |
| }; |
| + |
| + /** Subclass hook applied to the lookup automaton right after it is determinized; this default returns it unchanged. nocommit: terrible name and api! */ |
| + protected Automaton fudge(Automaton automaton) { |
| + return automaton; |
| + } |
| } |
| Index: lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (revision 1392569) |
| +++ lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (working copy) |
| @@ -35,7 +35,7 @@ |
| String msg = "Test class requires enabled assertions, enable globally (-ea)" + |
| " or for Solr/Lucene subpackages only: " + description.getClassName(); |
| System.err.println(msg); |
| - throw new Exception(msg); |
| + // nocommit: throw new Exception(msg); |
| } catch (AssertionError e) { |
| // Ok, enabled. |
| } |
| Index: lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 1392569) |
| +++ lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (working copy) |
| @@ -240,6 +240,20 @@ |
| a.deterministic = true; |
| return a; |
| } |
| + |
| + /** Builds an automaton, flagged deterministic, that accepts exactly the symbols {@code word[offset]} .. {@code word[offset+length-1]}. */ |
| + public static Automaton makeString(int[] word, int offset, int length) { |
| + final Automaton result = new Automaton(); |
| + State tail = result.initial = new State(); |
| + for (int pos = offset, end = offset + length; pos < end; pos++) { |
| + final State next = new State(); |
| + tail.addTransition(new Transition(word[pos], next)); |
| + tail = next; |
| + } |
| + tail.accept = true; |
| + result.setDeterministic(true); |
| + return result; |
| + } |
| |
| /** |
| * Returns a new (deterministic and minimal) automaton that accepts the union |
| Index: lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (revision 1392569) |
| +++ lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (working copy) |
| @@ -33,12 +33,13 @@ |
| /** @lucene.internal */ |
| public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; |
| /* input word */ |
| - final String input; |
| final int word[]; |
| /* the automata alphabet. */ |
| final int alphabet[]; |
| + /* the maximum symbol in the alphabet, inclusive (e.g. 255 for UTF-8 bytes or 0x10FFFF for UTF-32) */ |
| + final int alphaMax; |
| |
| - /* the unicode ranges outside of alphabet */ |
| + /* the ranges outside of alphabet */ |
| final int rangeLower[]; |
| final int rangeUpper[]; |
| int numRanges = 0; |
| @@ -50,12 +51,15 @@ |
| * Optionally count transpositions as a primitive edit. |
| */ |
| public LevenshteinAutomata(String input, boolean withTranspositions) { |
| - this.input = input; |
| - int length = Character.codePointCount(input, 0, input.length()); |
| - word = new int[length]; |
| - for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { |
| - word[j++] = cp = input.codePointAt(i); |
| - } |
| + this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions); |
| + } |
| + |
| + /** |
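| + * Expert: specify a custom maximum possible symbol ({@code alphaMax}) for {@code word}; the String constructor passes Character.MAX_CODE_POINT. Don't use this! |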
| + */ |
| + public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) { |
| + this.word = word; |
| + this.alphaMax = alphaMax; |
| |
| // calculate the alphabet |
| SortedSet<Integer> set = new TreeSet<Integer>(); |
| @@ -81,9 +85,9 @@ |
| lower = higher + 1; |
| } |
| /* add the final endpoint */ |
| - if (lower <= Character.MAX_CODE_POINT) { |
| + if (lower <= alphaMax) { |
| rangeLower[numRanges] = lower; |
| - rangeUpper[numRanges] = Character.MAX_CODE_POINT; |
| + rangeUpper[numRanges] = alphaMax; |
| numRanges++; |
| } |
| |
| @@ -94,6 +98,15 @@ |
| }; |
| } |
| |
| + /** Decodes {@code input} into an array holding one entry per Unicode code point. */ |
| + private static int[] codePoints(String input) { |
| + final int[] decoded = new int[Character.codePointCount(input, 0, input.length())]; |
| + for (int charPos = 0, out = 0; charPos < input.length(); charPos += Character.charCount(decoded[out++])) { |
| + decoded[out] = input.codePointAt(charPos); |
| + } |
| + return decoded; |
| + } |
| + |
| /** |
| * Compute a DFA that accepts all strings within an edit distance of <code>n</code>. |
| * <p> |
| @@ -106,8 +119,9 @@ |
| * </p> |
| */ |
| public Automaton toAutomaton(int n) { |
| - if (n == 0) |
| - return BasicAutomata.makeString(input); |
| + if (n == 0) { |
| + return BasicAutomata.makeString(word, 0, word.length); |
| + } |
| |
| if (n >= descriptions.length) |
| return null; |