// blob: 35e8245ac8b6475672eb4b6b3c8554117d15a261 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.spell;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import junit.framework.Assert;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Tests {@link WordBreakSpellChecker}: combining adjacent terms into one indexed word, and
 * breaking a single term into several indexed words.
 *
 * <p>The fixed-index tests run against an index built in {@link #setUp()}; {@link #testRandom()}
 * builds its own throw-away index.
 */
public class TestWordBreakSpellChecker extends LuceneTestCase {

  /** Name of the field populated by {@link #setUp()} and queried by the fixed-index tests. */
  private static final String NUMBERS_FIELD = "numbers";

  /**
   * Characters that {@link #goodTestString(String)} rejects so that a random test string is not
   * split into several tokens by the whitespace mock tokenizer.
   */
  private static final Pattern MOCK_TOKENIZER_WHITESPACE = Pattern.compile("[ \\t\\r\\n]");

  private Directory dir;
  private Analyzer analyzer;

  /**
   * Builds the shared index: the English spelling of every integer in [900, 1112) with hyphens
   * turned into spaces and commas removed, plus three hand-written documents that supply the
   * extra terms ("thou", "sand", "hundredeight", "eightyeight", "yeight", "y", ...) the tests
   * rely on.
   */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
    // try-with-resources: the writer is closed even if indexing throws part-way through.
    try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer)) {
      for (int i = 900; i < 1112; i++) {
        String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
        addNumbersDoc(writer, num);
      }
      addNumbersDoc(writer, "thou hast sand betwixt thy toes");
      addNumbersDoc(writer, "hundredeight eightyeight yeight");
      addNumbersDoc(writer, "tres y cinco");
      writer.commit();
    }
  }

  /** Closes the shared directory and analyzer, then always runs the superclass teardown. */
  @Override
  public void tearDown() throws Exception {
    try {
      IOUtils.close(dir, analyzer);
    } finally {
      // Must run even when closing fails, otherwise the base class never cleans up.
      super.tearDown();
    }
  }

  /**
   * Verifies that adjacent terms are combined into words present in the index, both when
   * suggestions are always wanted and when they are wanted only for terms missing from the index.
   */
  public void testCombiningWords() throws Exception {
    try (IndexReader ir = DirectoryReader.open(dir)) {
      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
      Term[] terms = {
        new Term(NUMBERS_FIELD, "one"),
        new Term(NUMBERS_FIELD, "hun"),
        new Term(NUMBERS_FIELD, "dred"),
        new Term(NUMBERS_FIELD, "eight"),
        new Term(NUMBERS_FIELD, "y"),
        new Term(NUMBERS_FIELD, "eight"),
      };
      wbsp.setMaxChanges(3);
      wbsp.setMaxCombineWordLength(20);
      wbsp.setMinSuggestionFrequency(1);

      CombineSuggestion[] cs =
          wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
      Assert.assertEquals(5, cs.length);
      assertCombination(cs[0], "hundred", 1f, 1, 2);
      assertCombination(cs[1], "eighty", 1f, 3, 4);
      assertCombination(cs[2], "yeight", 1f, 4, 5);
      // The two three-term combinations may come back in either order.
      for (int i = 3; i < 5; i++) {
        CombineSuggestion candidate = cs[i];
        int[] idx = candidate.originalTermIndexes;
        String word = candidate.suggestion.string;
        Assert.assertEquals(3, idx.length);
        Assert.assertEquals(2f, candidate.suggestion.score, 0f);
        boolean isHundredEight =
            idx[0] == 1 && idx[1] == 2 && idx[2] == 3 && word.equals("hundredeight");
        boolean isEightyEight =
            idx[0] == 3 && idx[1] == 4 && idx[2] == 5 && word.equals("eightyeight");
        Assert.assertTrue(
            "unexpected three-term combination at position " + i + ": " + word,
            isHundredEight || isEightyEight);
      }

      cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
      Assert.assertEquals(2, cs.length);
      assertCombination(cs[0], "hundred", 1f, 1, 2);
      assertCombination(cs[1], "hundredeight", 2f, 1, 2, 3);
    }
  }

  /**
   * Verifies that single terms are broken into indexed words under varying change limits,
   * suggestion limits and minimum frequencies.
   */
  public void testBreakingWords() throws Exception {
    try (IndexReader ir = DirectoryReader.open(dir)) {
      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
      {
        wbsp.setMaxChanges(1);
        wbsp.setMinBreakWordLength(1);
        wbsp.setMinSuggestionFrequency(1);
        SuggestWord[][] sw = suggestBreaks(wbsp, ir, "ninetynine", 5);
        Assert.assertEquals(1, sw.length);
        assertBreak(sw[0], 1f, "ninety", "nine");
      }
      {
        String word = "onethousand";
        wbsp.setMaxChanges(1);
        wbsp.setMinBreakWordLength(1);
        wbsp.setMinSuggestionFrequency(1);
        SuggestWord[][] sw = suggestBreaks(wbsp, ir, word, 2);
        Assert.assertEquals(1, sw.length);
        assertBreak(sw[0], 1f, "one", "thousand");

        // Two changes allowed, but only one suggestion requested.
        wbsp.setMaxChanges(2);
        wbsp.setMinSuggestionFrequency(1);
        sw = suggestBreaks(wbsp, ir, word, 1);
        Assert.assertEquals(1, sw.length);
        Assert.assertEquals(2, sw[0].length);

        // Two changes allowed, but every suggested word must occur at least twice.
        wbsp.setMaxChanges(2);
        wbsp.setMinSuggestionFrequency(2);
        sw = suggestBreaks(wbsp, ir, word, 2);
        Assert.assertEquals(1, sw.length);
        Assert.assertEquals(2, sw[0].length);

        wbsp.setMaxChanges(2);
        wbsp.setMinSuggestionFrequency(1);
        sw = suggestBreaks(wbsp, ir, word, 2);
        Assert.assertEquals(2, sw.length);
        assertBreak(sw[0], 1f, "one", "thousand");
        Assert.assertTrue("'thousand' should occur more than once", sw[0][1].freq > 1);
        Assert.assertTrue(
            "'one' should be more frequent than 'thousand'", sw[0][0].freq > sw[0][1].freq);
        assertBreak(sw[1], 2f, "one", "thou", "sand");
        Assert.assertTrue("'one' should occur more than once", sw[1][0].freq > 1);
        Assert.assertEquals(1, sw[1][1].freq);
        Assert.assertEquals(1, sw[1][2].freq);
      }
      {
        String word = "onethousandonehundredeleven";
        wbsp.setMaxChanges(3);
        wbsp.setMinBreakWordLength(1);
        wbsp.setMinSuggestionFrequency(1);
        SuggestWord[][] sw = suggestBreaks(wbsp, ir, word, 5);
        Assert.assertEquals(0, sw.length);

        wbsp.setMaxChanges(4);
        sw = suggestBreaks(wbsp, ir, word, 5);
        Assert.assertEquals(1, sw.length);
        Assert.assertEquals(5, sw[0].length);

        wbsp.setMaxChanges(5);
        sw = suggestBreaks(wbsp, ir, word, 5);
        Assert.assertEquals(2, sw.length);
        Assert.assertEquals(5, sw[0].length);
        Assert.assertEquals("thousand", sw[0][1].string);
        Assert.assertEquals(6, sw[1].length);
        Assert.assertEquals("thou", sw[1][1].string);
        Assert.assertEquals("sand", sw[1][2].string);
      }
      {
        // Make sure we can handle 2-char codepoints: the term is one surrogate pair.
        wbsp.setMaxChanges(1);
        wbsp.setMinBreakWordLength(1);
        wbsp.setMinSuggestionFrequency(1);
        SuggestWord[][] sw = suggestBreaks(wbsp, ir, "\uD864\uDC79", 5);
        Assert.assertEquals(0, sw.length);
      }
    }
  }

  /**
   * Indexes random strings both whole and split at a random code point boundary, then checks
   * that breaking the whole string can yield the two halves and that combining the two halves
   * can yield the whole string.
   */
  public void testRandom() throws Exception {
    int numDocs =
        TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER), (100 * RANDOM_MULTIPLIER));
    // Locals are named so they do not shadow the dir/analyzer fields owned by setUp/tearDown.
    try (Directory randomDir = newDirectory();
        Analyzer randomAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)) {
      int maxLength;
      List<String> originals = new ArrayList<>(numDocs);
      List<String[]> breaks = new ArrayList<>(numDocs);

      try (RandomIndexWriter writer =
          new RandomIndexWriter(random(), randomDir, randomAnalyzer)) {
        maxLength = TestUtil.nextInt(random(), 5, 50);
        for (int i = 0; i < numDocs; i++) {
          boolean simple = random().nextBoolean();
          String orig = "";
          while (!goodTestString(orig)) {
            orig =
                simple
                    ? TestUtil.randomSimpleString(random(), maxLength)
                    : TestUtil.randomUnicodeString(random(), maxLength);
          }
          originals.add(orig);

          // Split on a code point boundary so a surrogate pair is never cut in half.
          int totalLength = orig.codePointCount(0, orig.length());
          int breakAt =
              orig.offsetByCodePoints(0, TestUtil.nextInt(random(), 1, totalLength - 1));
          String[] broken = {orig.substring(0, breakAt), orig.substring(breakAt)};
          breaks.add(broken);

          Document doc = new Document();
          doc.add(newTextField("random_break", broken[0] + " " + broken[1], Field.Store.NO));
          doc.add(newTextField("random_combine", orig, Field.Store.NO));
          writer.addDocument(doc);
        }
        writer.commit();
      }

      try (IndexReader ir = DirectoryReader.open(randomDir)) {
        WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
        wbsp.setMaxChanges(1);
        wbsp.setMinBreakWordLength(1);
        wbsp.setMinSuggestionFrequency(1);
        wbsp.setMaxCombineWordLength(maxLength);

        for (int i = 0; i < originals.size(); i++) {
          String orig = originals.get(i);
          String left = breaks.get(i)[0];
          String right = breaks.get(i)[1];

          SuggestWord[][] sw =
              wbsp.suggestWordBreaks(
                  new Term("random_break", orig),
                  originals.size(),
                  ir,
                  SuggestMode.SUGGEST_ALWAYS,
                  BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
          boolean foundBreak = false;
          for (SuggestWord[] sw1 : sw) {
            Assert.assertEquals(2, sw1.length);
            if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
              foundBreak = true;
            }
          }
          Assert.assertTrue(
              "Failed getting break suggestions\n >Original: "
                  + orig + "\n >Left: " + left + "\n >Right: " + right,
              foundBreak);

          Term[] terms = {new Term("random_combine", left), new Term("random_combine", right)};
          CombineSuggestion[] cs =
              wbsp.suggestWordCombinations(
                  terms, originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
          boolean foundCombination = false;
          for (CombineSuggestion cs1 : cs) {
            Assert.assertEquals(2, cs1.originalTermIndexes.length);
            if (cs1.suggestion.string.equals(left + right)) {
              foundCombination = true;
            }
          }
          Assert.assertTrue(
              "Failed getting combine suggestions\n >Original: "
                  + orig + "\n >Left: " + left + "\n >Right: " + right,
              foundCombination);
        }
      }
    }
  }

  /** Adds one document whose {@link #NUMBERS_FIELD} holds {@code text} (not stored). */
  private void addNumbersDoc(RandomIndexWriter writer, String text) throws Exception {
    Document doc = new Document();
    doc.add(newTextField(NUMBERS_FIELD, text, Field.Store.NO));
    writer.addDocument(doc);
  }

  /**
   * Asks for breaks of {@code word} in {@link #NUMBERS_FIELD}, only when the word is not in the
   * index, sorted by number of changes and then by maximum frequency.
   */
  private static SuggestWord[][] suggestBreaks(
      WordBreakSpellChecker wbsp, IndexReader ir, String word, int maxSuggestions)
      throws Exception {
    return wbsp.suggestWordBreaks(
        new Term(NUMBERS_FIELD, word),
        maxSuggestions,
        ir,
        SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
        BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
  }

  /**
   * Asserts that a combine suggestion covers exactly {@code expectedIndexes} (positions in the
   * input term array, in order) and carries the given word and score.
   */
  private static void assertCombination(
      CombineSuggestion actual, String expectedWord, float expectedScore, int... expectedIndexes) {
    Assert.assertEquals(expectedIndexes.length, actual.originalTermIndexes.length);
    for (int i = 0; i < expectedIndexes.length; i++) {
      Assert.assertEquals(
          "original term index at position " + i,
          expectedIndexes[i],
          actual.originalTermIndexes[i]);
    }
    Assert.assertEquals(expectedWord, actual.suggestion.string);
    Assert.assertEquals(expectedScore, actual.suggestion.score, 0f);
  }

  /**
   * Asserts that a break suggestion consists of exactly {@code expectedWords}, in order, each
   * carrying {@code expectedScore}.
   */
  private static void assertBreak(
      SuggestWord[] actual, float expectedScore, String... expectedWords) {
    Assert.assertEquals(expectedWords.length, actual.length);
    for (int i = 0; i < expectedWords.length; i++) {
      Assert.assertEquals(expectedWords[i], actual[i].string);
      Assert.assertEquals(expectedScore, actual[i].score, 0f);
    }
  }

  /**
   * Returns whether {@code s} can be used by {@link #testRandom()}: it must contain at least two
   * code points (so it can be split) and none of the characters matched by
   * {@link #MOCK_TOKENIZER_WHITESPACE}.
   */
  private static boolean goodTestString(String s) {
    return s.codePointCount(0, s.length()) >= 2
        && !MOCK_TOKENIZER_WHITESPACE.matcher(s).find();
  }
}