blob: 42e4b0710a284fc4ba8c661693cf68f317abf63f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
/**
* Builds a set of CompiledAutomaton for fuzzy matching on a given term,
* with specified maximum edit distance, fixed prefix and whether or not
* to allow transpositions.
*/
class FuzzyAutomatonBuilder {
private final String term;
private final int maxEdits;
private final LevenshteinAutomata levBuilder;
private final String prefix;
private final int termLength;
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
}
if (prefixLength < 0) {
throw new IllegalArgumentException("prefixLength cannot be less than 0");
}
this.term = term;
this.maxEdits = maxEdits;
int[] codePoints = stringToUTF32(term);
this.termLength = codePoints.length;
prefixLength = Math.min(prefixLength, codePoints.length);
int[] suffix = new int[codePoints.length - prefixLength];
System.arraycopy(codePoints, prefixLength, suffix, 0, suffix.length);
this.levBuilder = new LevenshteinAutomata(suffix, Character.MAX_CODE_POINT, transpositions);
this.prefix = UnicodeUtil.newString(codePoints, 0, prefixLength);
}
CompiledAutomaton[] buildAutomatonSet() {
CompiledAutomaton[] compiled = new CompiledAutomaton[maxEdits + 1];
for (int i = 0; i <= maxEdits; i++) {
try {
compiled[i] = new CompiledAutomaton(levBuilder.toAutomaton(i, prefix), true, false);
}
catch (TooComplexToDeterminizeException e) {
throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
}
}
return compiled;
}
CompiledAutomaton buildMaxEditAutomaton() {
try {
return new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false);
} catch (TooComplexToDeterminizeException e) {
throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
}
}
int getTermLength() {
return this.termLength;
}
private static int[] stringToUTF32(String text) {
int[] termText = new int[text.codePointCount(0, text.length())];
for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) {
termText[j++] = cp = text.codePointAt(i);
}
return termText;
}
}