/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package org.apache.lucene.search; |
| |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.LevenshteinAutomata; |
| import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; |
| |
| /** |
| * Builds a set of CompiledAutomaton for fuzzy matching on a given term, |
| * with specified maximum edit distance, fixed prefix and whether or not |
| * to allow transpositions. |
| */ |
| class FuzzyAutomatonBuilder { |
| |
| private final String term; |
| private final int maxEdits; |
| private final LevenshteinAutomata levBuilder; |
| private final String prefix; |
| private final int termLength; |
| |
| FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) { |
| if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { |
| throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits); |
| } |
| if (prefixLength < 0) { |
| throw new IllegalArgumentException("prefixLength cannot be less than 0"); |
| } |
| this.term = term; |
| this.maxEdits = maxEdits; |
| int[] codePoints = stringToUTF32(term); |
| this.termLength = codePoints.length; |
| prefixLength = Math.min(prefixLength, codePoints.length); |
| int[] suffix = new int[codePoints.length - prefixLength]; |
| System.arraycopy(codePoints, prefixLength, suffix, 0, suffix.length); |
| this.levBuilder = new LevenshteinAutomata(suffix, Character.MAX_CODE_POINT, transpositions); |
| this.prefix = UnicodeUtil.newString(codePoints, 0, prefixLength); |
| } |
| |
| CompiledAutomaton[] buildAutomatonSet() { |
| CompiledAutomaton[] compiled = new CompiledAutomaton[maxEdits + 1]; |
| for (int i = 0; i <= maxEdits; i++) { |
| try { |
| compiled[i] = new CompiledAutomaton(levBuilder.toAutomaton(i, prefix), true, false); |
| } |
| catch (TooComplexToDeterminizeException e) { |
| throw new FuzzyTermsEnum.FuzzyTermsException(term, e); |
| } |
| } |
| return compiled; |
| } |
| |
| CompiledAutomaton buildMaxEditAutomaton() { |
| try { |
| return new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false); |
| } catch (TooComplexToDeterminizeException e) { |
| throw new FuzzyTermsEnum.FuzzyTermsException(term, e); |
| } |
| } |
| |
| int getTermLength() { |
| return this.termLength; |
| } |
| |
| private static int[] stringToUTF32(String text) { |
| int[] termText = new int[text.codePointCount(0, text.length())]; |
| for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) { |
| termText[j++] = cp = text.codePointAt(i); |
| } |
| return termText; |
| } |
| } |