blob: 87f8306c8183d579b8e8a8c450e7f2f9b764557a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.spell;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
public class TestDirectSpellChecker extends LuceneTestCase {
public void testInternalLevenshteinDistance() throws Exception {
DirectSpellChecker spellchecker = new DirectSpellChecker();
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
String[] termsToAdd = { "metanoia", "metanoian", "metanoiai", "metanoias", "metanoi𐑍" };
for (int i = 0; i < termsToAdd.length; i++) {
Document doc = new Document();
doc.add(newTextField("repentance", termsToAdd[i], Field.Store.NO));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
String misspelled = "metanoix";
SuggestWord[] similar = spellchecker.suggestSimilar(new Term("repentance", misspelled), 4, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length == 4);
StringDistance sd = spellchecker.getDistance();
assertTrue(sd instanceof LuceneLevenshteinDistance);
for(SuggestWord word : similar) {
assertTrue(word.score==sd.getDistance(word.string, misspelled));
assertTrue(word.score==sd.getDistance(misspelled, word.string));
}
IOUtils.close(ir, writer, dir, analyzer);
}
public void testSimpleExamples() throws Exception {
DirectSpellChecker spellChecker = new DirectSpellChecker();
spellChecker.setMinQueryLength(0);
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
for (int i = 0; i < 20; i++) {
Document doc = new Document();
doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers",
"fvie"), 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
if (similar.length > 0) {
assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself
}
similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
assertTrue(similar.length > 0);
similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals("five", similar[0].string);
// add some more documents
for (int i = 1000; i < 1100; i++) {
Document doc = new Document();
doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
writer.addDocument(doc);
}
ir.close();
ir = writer.getReader();
// look ma, no spellcheck index rebuild
similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10,
ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("thousand", similar[0].string);
IOUtils.close(ir, writer, dir, analyzer);
}
public void testOptions() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
Document doc = new Document();
doc.add(newTextField("text", "foobar", Field.Store.NO));
writer.addDocument(doc);
doc.add(newTextField("text", "foobar", Field.Store.NO));
writer.addDocument(doc);
doc.add(newTextField("text", "foobaz", Field.Store.NO));
writer.addDocument(doc);
doc.add(newTextField("text", "fobar", Field.Store.NO));
writer.addDocument(doc);
IndexReader ir = writer.getReader();
DirectSpellChecker spellChecker = new DirectSpellChecker();
spellChecker.setMaxQueryFrequency(0F);
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text",
"fobar"), 1, ir, SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
// confirm that a term shorter than minQueryLength is not spellchecked
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinQueryLength(5);
similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
// confirm that a term longer than maxQueryLength is not spellchecked
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMaxQueryLength(5);
similar = spellChecker.suggestSimilar(new Term("text", "foobrr"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMaxEdits(1);
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setAccuracy(0.9F);
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinPrefix(0);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(1, similar.length);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinPrefix(1);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMaxEdits(2);
similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 2, ir,
SuggestMode.SUGGEST_ALWAYS);
assertEquals(2, similar.length);
IOUtils.close(ir, writer, dir, analyzer);;
}
public void testBogusField() throws Exception {
DirectSpellChecker spellChecker = new DirectSpellChecker();
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
for (int i = 0; i < 20; i++) {
Document doc = new Document();
doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
"bogusFieldBogusField", "fvie"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(0, similar.length);
IOUtils.close(ir, writer, dir, analyzer);
}
// simple test that transpositions work, we suggest five for fvie with ed=1
public void testTransposition() throws Exception {
DirectSpellChecker spellChecker = new DirectSpellChecker();
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
for (int i = 0; i < 20; i++) {
Document doc = new Document();
doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
"numbers", "fvie"), 1, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length);
assertEquals("five", similar[0].string);
IOUtils.close(ir, writer, dir, analyzer);
}
// simple test that transpositions work, we suggest seventeen for seevntene with ed=2
public void testTransposition2() throws Exception {
DirectSpellChecker spellChecker = new DirectSpellChecker();
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
for (int i = 0; i < 20; i++) {
Document doc = new Document();
doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
writer.addDocument(doc);
}
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
"numbers", "seevntene"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length);
assertEquals("seventeen", similar[0].string);
IOUtils.close(ir, writer, dir, analyzer);
}
}