/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.spell;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import junit.framework.Assert;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

| public class TestWordBreakSpellChecker extends LuceneTestCase { |
| private Directory dir; |
| private Analyzer analyzer; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| dir = newDirectory(); |
| analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer); |
| |
| for (int i = 900; i < 1112; i++) { |
| Document doc = new Document(); |
| String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", ""); |
| doc.add(newTextField("numbers", num, Field.Store.NO)); |
| writer.addDocument(doc); |
| } |
| |
| { |
| Document doc = new Document(); |
| doc.add(newTextField("numbers", "thou hast sand betwixt thy toes", Field.Store.NO)); |
| writer.addDocument(doc); |
| } |
| { |
| Document doc = new Document(); |
| doc.add(newTextField("numbers", "hundredeight eightyeight yeight", Field.Store.NO)); |
| writer.addDocument(doc); |
| } |
| { |
| Document doc = new Document(); |
| doc.add(newTextField("numbers", "tres y cinco", Field.Store.NO)); |
| writer.addDocument(doc); |
| } |
| |
| writer.commit(); |
| writer.close(); |
| } |
| |
| @Override |
| public void tearDown() throws Exception { |
| IOUtils.close(dir, analyzer); |
| super.tearDown(); |
| } |
| |
| public void testCombiningWords() throws Exception { |
| IndexReader ir = DirectoryReader.open(dir); |
| WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); |
| |
| { |
| Term[] terms = { |
| new Term("numbers", "one"), |
| new Term("numbers", "hun"), |
| new Term("numbers", "dred"), |
| new Term("numbers", "eight"), |
| new Term("numbers", "y"), |
| new Term("numbers", "eight"), |
| }; |
| wbsp.setMaxChanges(3); |
| wbsp.setMaxCombineWordLength(20); |
| wbsp.setMinSuggestionFrequency(1); |
| CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS); |
| Assert.assertTrue(cs.length==5); |
| |
| Assert.assertTrue(cs[0].originalTermIndexes.length==2); |
| Assert.assertTrue(cs[0].originalTermIndexes[0]==1); |
| Assert.assertTrue(cs[0].originalTermIndexes[1]==2); |
| Assert.assertTrue(cs[0].suggestion.string.equals("hundred")); |
| Assert.assertTrue(cs[0].suggestion.score==1); |
| |
| Assert.assertTrue(cs[1].originalTermIndexes.length==2); |
| Assert.assertTrue(cs[1].originalTermIndexes[0]==3); |
| Assert.assertTrue(cs[1].originalTermIndexes[1]==4); |
| Assert.assertTrue(cs[1].suggestion.string.equals("eighty")); |
| Assert.assertTrue(cs[1].suggestion.score==1); |
| |
| Assert.assertTrue(cs[2].originalTermIndexes.length==2); |
| Assert.assertTrue(cs[2].originalTermIndexes[0]==4); |
| Assert.assertTrue(cs[2].originalTermIndexes[1]==5); |
| Assert.assertTrue(cs[2].suggestion.string.equals("yeight")); |
| Assert.assertTrue(cs[2].suggestion.score==1); |
| |
| for(int i=3 ; i<5 ; i++) { |
| Assert.assertTrue(cs[i].originalTermIndexes.length==3); |
| Assert.assertTrue(cs[i].suggestion.score==2); |
| Assert.assertTrue( |
| (cs[i].originalTermIndexes[0]==1 && |
| cs[i].originalTermIndexes[1]==2 && |
| cs[i].originalTermIndexes[2]==3 && |
| cs[i].suggestion.string.equals("hundredeight")) || |
| (cs[i].originalTermIndexes[0]==3 && |
| cs[i].originalTermIndexes[1]==4 && |
| cs[i].originalTermIndexes[2]==5 && |
| cs[i].suggestion.string.equals("eightyeight")) |
| ); |
| } |
| |
| cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); |
| Assert.assertTrue(cs.length==2); |
| Assert.assertTrue(cs[0].originalTermIndexes.length==2); |
| Assert.assertTrue(cs[0].suggestion.score==1); |
| Assert.assertTrue(cs[0].originalTermIndexes[0]==1); |
| Assert.assertTrue(cs[0].originalTermIndexes[1]==2); |
| Assert.assertTrue(cs[0].suggestion.string.equals("hundred")); |
| Assert.assertTrue(cs[0].suggestion.score==1); |
| |
| Assert.assertTrue(cs[1].originalTermIndexes.length==3); |
| Assert.assertTrue(cs[1].suggestion.score==2); |
| Assert.assertTrue(cs[1].originalTermIndexes[0] == 1); |
| Assert.assertTrue(cs[1].originalTermIndexes[1] == 2); |
| Assert.assertTrue(cs[1].originalTermIndexes[2] == 3); |
| Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight")); |
| } |
| ir.close(); |
| } |
| |
| public void testBreakingWords() throws Exception { |
| IndexReader ir = DirectoryReader.open(dir); |
| WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); |
| |
| { |
| Term term = new Term("numbers", "ninetynine"); |
| wbsp.setMaxChanges(1); |
| wbsp.setMinBreakWordLength(1); |
| wbsp.setMinSuggestionFrequency(1); |
| SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==1); |
| Assert.assertTrue(sw[0].length==2); |
| Assert.assertTrue(sw[0][0].string.equals("ninety")); |
| Assert.assertTrue(sw[0][1].string.equals("nine")); |
| Assert.assertTrue(sw[0][0].score == 1); |
| Assert.assertTrue(sw[0][1].score == 1); |
| } |
| { |
| Term term = new Term("numbers", "onethousand"); |
| wbsp.setMaxChanges(1); |
| wbsp.setMinBreakWordLength(1); |
| wbsp.setMinSuggestionFrequency(1); |
| SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==1); |
| Assert.assertTrue(sw[0].length==2); |
| Assert.assertTrue(sw[0][0].string.equals("one")); |
| Assert.assertTrue(sw[0][1].string.equals("thousand")); |
| Assert.assertTrue(sw[0][0].score == 1); |
| Assert.assertTrue(sw[0][1].score == 1); |
| |
| wbsp.setMaxChanges(2); |
| wbsp.setMinSuggestionFrequency(1); |
| sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==1); |
| Assert.assertTrue(sw[0].length==2); |
| |
| wbsp.setMaxChanges(2); |
| wbsp.setMinSuggestionFrequency(2); |
| sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==1); |
| Assert.assertTrue(sw[0].length==2); |
| |
| wbsp.setMaxChanges(2); |
| wbsp.setMinSuggestionFrequency(1); |
| sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==2); |
| Assert.assertTrue(sw[0].length==2); |
| Assert.assertTrue(sw[0][0].string.equals("one")); |
| Assert.assertTrue(sw[0][1].string.equals("thousand")); |
| Assert.assertTrue(sw[0][0].score == 1); |
| Assert.assertTrue(sw[0][1].score == 1); |
| Assert.assertTrue(sw[0][1].freq>1); |
| Assert.assertTrue(sw[0][0].freq>sw[0][1].freq); |
| Assert.assertTrue(sw[1].length==3); |
| Assert.assertTrue(sw[1][0].string.equals("one")); |
| Assert.assertTrue(sw[1][1].string.equals("thou")); |
| Assert.assertTrue(sw[1][2].string.equals("sand")); |
| Assert.assertTrue(sw[1][0].score == 2); |
| Assert.assertTrue(sw[1][1].score == 2); |
| Assert.assertTrue(sw[1][2].score == 2); |
| Assert.assertTrue(sw[1][0].freq>1); |
| Assert.assertTrue(sw[1][1].freq==1); |
| Assert.assertTrue(sw[1][2].freq==1); |
| } |
| { |
| Term term = new Term("numbers", "onethousandonehundredeleven"); |
| wbsp.setMaxChanges(3); |
| wbsp.setMinBreakWordLength(1); |
| wbsp.setMinSuggestionFrequency(1); |
| SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==0); |
| |
| wbsp.setMaxChanges(4); |
| sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==1); |
| Assert.assertTrue(sw[0].length==5); |
| |
| wbsp.setMaxChanges(5); |
| sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==2); |
| Assert.assertTrue(sw[0].length==5); |
| Assert.assertTrue(sw[0][1].string.equals("thousand")); |
| Assert.assertTrue(sw[1].length==6); |
| Assert.assertTrue(sw[1][1].string.equals("thou")); |
| Assert.assertTrue(sw[1][2].string.equals("sand")); |
| } |
| { |
| //make sure we can handle 2-char codepoints |
| Term term = new Term("numbers", "\uD864\uDC79"); |
| wbsp.setMaxChanges(1); |
| wbsp.setMinBreakWordLength(1); |
| wbsp.setMinSuggestionFrequency(1); |
| SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| Assert.assertTrue(sw.length==0); |
| } |
| |
| ir.close(); |
| } |
| |
| public void testRandom() throws Exception { |
| int numDocs = TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER), |
| (100 * RANDOM_MULTIPLIER)); |
| IndexReader ir = null; |
| |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer); |
| int maxLength = TestUtil.nextInt(random(), 5, 50); |
| List<String> originals = new ArrayList<>(numDocs); |
| List<String[]> breaks = new ArrayList<>(numDocs); |
| for (int i = 0; i < numDocs; i++) { |
| String orig = ""; |
| if (random().nextBoolean()) { |
| while (!goodTestString(orig)) { |
| orig = TestUtil.randomSimpleString(random(), maxLength); |
| } |
| } else { |
| while (!goodTestString(orig)) { |
| orig = TestUtil.randomUnicodeString(random(), maxLength); |
| } |
| } |
| originals.add(orig); |
| int totalLength = orig.codePointCount(0, orig.length()); |
| int breakAt = orig.offsetByCodePoints(0, |
| TestUtil.nextInt(random(), 1, totalLength - 1)); |
| String[] broken = new String[2]; |
| broken[0] = orig.substring(0, breakAt); |
| broken[1] = orig.substring(breakAt); |
| breaks.add(broken); |
| Document doc = new Document(); |
| doc.add(newTextField("random_break", broken[0] + " " + broken[1], |
| Field.Store.NO)); |
| doc.add(newTextField("random_combine", orig, Field.Store.NO)); |
| writer.addDocument(doc); |
| } |
| writer.commit(); |
| writer.close(); |
| |
| ir = DirectoryReader.open(dir); |
| WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); |
| wbsp.setMaxChanges(1); |
| wbsp.setMinBreakWordLength(1); |
| wbsp.setMinSuggestionFrequency(1); |
| wbsp.setMaxCombineWordLength(maxLength); |
| for (int i = 0; i < originals.size(); i++) { |
| String orig = originals.get(i); |
| String left = breaks.get(i)[0]; |
| String right = breaks.get(i)[1]; |
| { |
| Term term = new Term("random_break", orig); |
| |
| SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(), |
| ir, SuggestMode.SUGGEST_ALWAYS, |
| BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); |
| boolean failed = true; |
| for (SuggestWord[] sw1 : sw) { |
| Assert.assertTrue(sw1.length == 2); |
| if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) { |
| failed = false; |
| } |
| } |
| Assert.assertFalse("Failed getting break suggestions\n >Original: " |
| + orig + "\n >Left: " + left + "\n >Right: " + right, failed); |
| } |
| { |
| Term[] terms = {new Term("random_combine", left), |
| new Term("random_combine", right)}; |
| CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, |
| originals.size(), ir, SuggestMode.SUGGEST_ALWAYS); |
| boolean failed = true; |
| for (CombineSuggestion cs1 : cs) { |
| Assert.assertTrue(cs1.originalTermIndexes.length == 2); |
| if (cs1.suggestion.string.equals(left + right)) { |
| failed = false; |
| } |
| } |
| Assert.assertFalse("Failed getting combine suggestions\n >Original: " |
| + orig + "\n >Left: " + left + "\n >Right: " + right, failed); |
| } |
| } |
| IOUtils.close(ir, dir, analyzer); |
| } |
| |
| private static final Pattern mockTokenizerWhitespacePattern = Pattern |
| .compile("[ \\t\\r\\n]"); |
| |
| private boolean goodTestString(String s) { |
| if (s.codePointCount(0, s.length()) < 2 |
| || mockTokenizerWhitespacePattern.matcher(s).find()) { |
| return false; |
| } |
| return true; |
| } |
| } |