blob: 1e4fce09fec7add04f720c7302938c104644918d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.util.TestUtil;
/**
 * Tests for {@link StemmerOverrideFilter}.
 *
 * <p>Covers a plain override, a case-insensitive override, an empty override dictionary, and two
 * randomized scenarios where the override dictionary is built from random Unicode strings and the
 * input is tokenized with either a {@link WhitespaceTokenizer} or a {@link KeywordTokenizer}. In
 * every test the override filter is wrapped in a {@link PorterStemFilter}.
 */
public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {

  /** Creates a {@link KeywordTokenizer} whose reader is set to {@code data}. */
  private KeywordTokenizer keywordTokenizer(String data) throws IOException {
    KeywordTokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(data));
    return tokenizer;
  }

  /** Returns {@code value} with every whitespace code point removed. */
  private static String removeWhitespace(String value) {
    StringBuilder stripped = new StringBuilder();
    // Walk by code point, not by char, so surrogate pairs are kept intact.
    for (int i = 0; i < value.length(); ) {
      int cp = value.codePointAt(i);
      if (!Character.isWhitespace(cp)) {
        stripped.appendCodePoint(cp);
      }
      i += Character.charCount(cp);
    }
    return stripped.toString();
  }

  /** Returns {@code value} lower-cased with {@link CharacterUtils#toLowerCase(char[], int, int)}. */
  private static String lowerCase(String value) {
    // TODO: can we simply use value.toLowerCase(Locale.ROOT)???
    char[] buffer = value.toCharArray();
    CharacterUtils.toLowerCase(buffer, 0, buffer.length);
    // new String(buffer), NOT buffer.toString(): the latter is Object#toString() of the array
    // (an identity string such as "[C@1b2c3d"), which would make every key look unique.
    return new String(buffer);
  }

  /** An overridden term is replaced and is not stemmed again by the following Porter stemmer. */
  public void testOverride() throws IOException {
    // lets make booked stem to books
    // the override filter will convert "booked" to "books",
    // but also mark it with KeywordAttribute so Porter will not change it.
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
    builder.add("booked", "books");
    Tokenizer tokenizer = keywordTokenizer("booked");
    TokenStream stream =
        new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, new String[] {"books"});
  }

  /** With {@code ignoreCase == true} the override matches regardless of the input's casing. */
  public void testIgnoreCase() throws IOException {
    // The override is registered as "boOkEd" while the input is "BooKeD": the two differ only
    // by case, so the builder created with ignoreCase=true must still map the input to "books".
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
    builder.add("boOkEd", "books");
    Tokenizer tokenizer = keywordTokenizer("BooKeD");
    TokenStream stream =
        new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, new String[] {"books"});
  }

  /** A filter built from an empty override dictionary leaves the token unchanged. */
  public void testNoOverrides() throws IOException {
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
    Tokenizer tokenizer = keywordTokenizer("book");
    TokenStream stream =
        new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, new String[] {"book"});
  }

  /**
   * Builds a random override dictionary of whitespace-free keys, feeds a random subset of the keys
   * (space separated) through a {@link WhitespaceTokenizer} and expects the mapped values back in
   * the same order. Case sensitivity of the dictionary is chosen at random.
   */
  public void testRandomRealisticWhiteSpace() throws IOException {
    Map<String, String> map = new HashMap<>();
    Set<String> seen = new HashSet<>();
    int numTerms = atLeast(50);
    boolean ignoreCase = random().nextBoolean();
    for (int i = 0; i < numTerms; i++) {
      // Keys must not contain whitespace, otherwise the tokenizer would split them.
      String inputValue = removeWhitespace(TestUtil.randomRealisticUnicodeString(random()));
      if (inputValue.isEmpty()) {
        continue;
      }
      // Make sure we don't try to add two inputs that vary only by case:
      String seenInputValue = ignoreCase ? lowerCase(inputValue) : inputValue;
      if (seen.add(seenInputValue)) {
        String value = TestUtil.randomSimpleString(random());
        map.put(inputValue, value.isEmpty() ? "a" : value);
      }
    }
    if (map.isEmpty()) {
      // Guarantee at least one override so the assertion below is meaningful.
      map.put("booked", "books");
    }

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
    StringBuilder input = new StringBuilder();
    List<String> output = new ArrayList<>();
    for (Entry<String, String> entry : map.entrySet()) {
      builder.add(entry.getKey(), entry.getValue());
      // Randomly pick which keys go into the input, but always include at least one.
      if (random().nextBoolean() || output.isEmpty()) {
        input.append(entry.getKey()).append(" ");
        output.add(entry.getValue());
      }
    }

    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(input.toString()));
    TokenStream stream =
        new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, output.toArray(new String[0]));
  }

  /**
   * Builds a random, case-sensitive override dictionary and, for a random subset of its keys,
   * checks that a {@link KeywordTokenizer} stream over the key yields exactly the mapped value.
   */
  public void testRandomRealisticKeyword() throws IOException {
    Map<String, String> map = new HashMap<>();
    int numTerms = atLeast(50);
    for (int i = 0; i < numTerms; i++) {
      String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
      if (randomRealisticUnicodeString.length() > 0) {
        String value = TestUtil.randomSimpleString(random());
        map.put(randomRealisticUnicodeString, value.isEmpty() ? "a" : value);
      }
    }
    if (map.isEmpty()) {
      map.put("booked", "books");
    }

    // This test might fail if ignoreCase is true since the map might have twice the same key, once
    // lowercased and once uppercased
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    Set<Entry<String, String>> entrySet = map.entrySet();
    for (Entry<String, String> entry : entrySet) {
      builder.add(entry.getKey(), entry.getValue());
    }

    // The built map is shared by every filter instance created below.
    StemmerOverrideMap build = builder.build();
    for (Entry<String, String> entry : entrySet) {
      if (random().nextBoolean()) {
        Tokenizer tokenizer = keywordTokenizer(entry.getKey());
        TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, build));
        assertTokenStreamContents(stream, new String[] {entry.getValue()});
      }
    }
  }
}