/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.URI;
import java.net.URL;
import java.nio.CharBuffer;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.CrankyTokenFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
import org.apache.lucene.analysis.minhash.MinHashFilter;
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.pattern.PatternTypingFilter;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.TestSnowball;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.tartarus.snowball.SnowballProgram;
import org.xml.sax.InputSource;
/** Tests random analysis chains: builds analyzers from randomly chosen
* tokenizers, token filters and char filters with randomly generated
* arguments, and checks the resulting token streams for consistency. */
public class TestRandomChains extends BaseTokenStreamTestCase {
static List<Constructor<? extends Tokenizer>> tokenizers;
static List<Constructor<? extends TokenFilter>> tokenfilters;
static List<Constructor<? extends CharFilter>> charfilters;
private static final Predicate<Object[]> ALWAYS = (objects -> true);
private static final Set<Class<?>> avoidConditionals = new HashSet<>();
static {
// These filters need to consume the whole token stream, so conditionals don't make sense here
avoidConditionals.add(FingerprintFilter.class);
avoidConditionals.add(MinHashFilter.class);
avoidConditionals.add(ConcatenateGraphFilter.class);
// ShingleFilter doesn't handle input graphs correctly, so wrapping it in a condition can
// expose inconsistent offsets
// https://issues.apache.org/jira/browse/LUCENE-4170
avoidConditionals.add(ShingleFilter.class);
avoidConditionals.add(FixedShingleFilter.class);
// FlattenGraphFilter changes the output graph entirely, so wrapping it in a condition
// can break position lengths
avoidConditionals.add(FlattenGraphFilter.class);
// LimitToken*Filters don't set end offsets correctly
avoidConditionals.add(LimitTokenOffsetFilter.class);
avoidConditionals.add(LimitTokenCountFilter.class);
avoidConditionals.add(LimitTokenPositionFilter.class);
}
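// Maps a constructor to a predicate over its argument array: the predicate
// returns true when that argument combination is known to produce a broken
// component and must be skipped. ALWAYS marks every invocation as broken.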
private static final Map<Constructor<?>,Predicate<Object[]>> brokenConstructors = new HashMap<>();
static {
try {
brokenConstructors.put(
LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class),
ALWAYS);
brokenConstructors.put(
LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
args -> {
assert args.length == 3;
return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
});
brokenConstructors.put(
LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class),
ALWAYS);
brokenConstructors.put(
LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
args -> {
assert args.length == 3;
return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
});
brokenConstructors.put(
LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class),
ALWAYS);
brokenConstructors.put(
LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
args -> {
assert args.length == 3;
return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
});
for (Class<?> c : Arrays.<Class<?>>asList(
// doesn't actually reset itself! TODO: this statement is probably obsolete as of LUCENE-6121?
CachingTokenFilter.class,
// LUCENE-8092: doesn't handle graph inputs
CJKBigramFilter.class,
// TODO: LUCENE-4983
CommonGramsFilter.class,
// TODO: doesn't handle graph inputs
CommonGramsQueryFilter.class,
// Not broken, simulates brokenness:
CrankyTokenFilter.class,
// TODO: doesn't handle graph inputs (or even look at positionIncrement)
HyphenatedWordsFilter.class,
// broken offsets
PathHierarchyTokenizer.class,
// broken offsets
ReversePathHierarchyTokenizer.class,
// Not broken: we forcefully add this, so we shouldn't
// also randomly pick it:
ValidatingTokenFilter.class,
// TODO: it seems to mess up offsets!?
WikipediaTokenizer.class,
// TODO: needs to be a tokenizer; doesn't handle graph inputs properly (a following shingle or similar filter will then cause pain)
WordDelimiterFilter.class,
// Cannot correct offsets when a char filter had changed them:
WordDelimiterGraphFilter.class,
// requires a special encoded token value, so it may fail with random data:
DelimitedTermFrequencyTokenFilter.class,
// requires a special encoded token value, so it may fail with random data:
DelimitedBoostTokenFilter.class,
// clones of core's filters:
org.apache.lucene.analysis.core.StopFilter.class,
org.apache.lucene.analysis.core.LowerCaseFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenConstructors.put(ctor, ALWAYS);
}
}
} catch (Exception e) {
throw new Error(e);
}
}
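// Scan the classpath for all concrete, public, non-deprecated Tokenizer,
// TokenFilter and CharFilter classes and collect their usable public constructors.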
@BeforeClass
public static void beforeClass() throws Exception {
List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
tokenizers = new ArrayList<>();
tokenfilters = new ArrayList<>();
charfilters = new ArrayList<>();
for (final Class<?> c : analysisClasses) {
final int modifiers = c.getModifiers();
if (
// don't waste time with abstract classes or deprecated known-buggy ones
Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
|| c.isAnnotationPresent(Deprecated.class)
|| !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
) {
continue;
}
for (final Constructor<?> ctor : c.getConstructors()) {
// don't test synthetic or deprecated ctors, they likely have known bugs:
if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
continue;
}
// conditional filters are tested elsewhere
if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
continue;
}
if (Tokenizer.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
tokenizers.add(castConstructor(Tokenizer.class, ctor));
} else if (TokenFilter.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
tokenfilters.add(castConstructor(TokenFilter.class, ctor));
} else if (CharFilter.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
charfilters.add(castConstructor(CharFilter.class, ctor));
} else {
fail("Cannot get here");
}
}
}
final Comparator<Constructor<?>> ctorComp = Comparator.comparing(Constructor::toGenericString);
Collections.sort(tokenizers, ctorComp);
Collections.sort(tokenfilters, ctorComp);
Collections.sort(charfilters, ctorComp);
if (VERBOSE) {
System.out.println("tokenizers = " + tokenizers);
System.out.println("tokenfilters = " + tokenfilters);
System.out.println("charfilters = " + charfilters);
}
}
@AfterClass
public static void afterClass() {
tokenizers = null;
tokenfilters = null;
charfilters = null;
}
/** Hack to work around the stupidity of Oracle's strict Java backwards compatibility.
* {@code Class<T>#getConstructors()} should return an unmodifiable {@code List<Constructor<T>>}, not an array! */
@SuppressWarnings("unchecked")
private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
return (Constructor<T>) ctor;
}
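/** Returns all classes below the given package that are reachable as plain class files on the classpath (JAR-packaged classes are skipped). */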
public static List<Class<?>> getClassesForPackage(String pckgname) throws Exception {
final List<Class<?>> classes = new ArrayList<>();
collectClassesForPackage(pckgname, classes);
assertFalse("No classes found in package '"+pckgname+"'; maybe your test classes are packaged as a JAR file?", classes.isEmpty());
return classes;
}
private static void collectClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
final ClassLoader cld = TestRandomChains.class.getClassLoader();
final String path = pckgname.replace('.', '/');
final Enumeration<URL> resources = cld.getResources(path);
while (resources.hasMoreElements()) {
final URI uri = resources.nextElement().toURI();
if (!"file".equalsIgnoreCase(uri.getScheme())) {
continue;
}
final Path directory = Paths.get(uri);
if (Files.exists(directory)) {
try (DirectoryStream<Path> stream = Files.newDirectoryStream(directory)) {
for (Path file : stream) {
if (Files.isDirectory(file)) {
// recurse
String subPackage = pckgname + "." + file.getFileName().toString();
collectClassesForPackage(subPackage, classes);
}
String fname = file.getFileName().toString();
if (fname.endsWith(".class")) {
String clazzName = fname.substring(0, fname.length() - 6);
// exclude Test classes that happen to be in these packages.
// Class.forName'ing some of them can cause trouble.
if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) {
// Don't run static initializers, as we won't use most of them.
// Java will do that automatically once accessed/instantiated.
classes.add(Class.forName(pckgname + '.' + clazzName, false, cld));
}
}
}
}
}
}
}
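// Produces a random value for each supported constructor parameter type.
// Keyed by the exact Class instance, hence the IdentityHashMap.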
private static final Map<Class<?>,Function<Random,Object>> argProducers = new IdentityHashMap<Class<?>,Function<Random,Object>>() {{
put(int.class, random -> {
// TODO: could cause huge ram usage to use full int range for some filters
// (e.g. allocate enormous arrays)
// return Integer.valueOf(random.nextInt());
return Integer.valueOf(TestUtil.nextInt(random, -50, 50));
});
put(char.class, random -> {
// TODO: fix any filters that care to throw IAE instead.
// also add a unicode validating filter to validate termAtt?
// return Character.valueOf((char)random.nextInt(65536));
while (true) {
char c = (char)random.nextInt(65536);
if (c < '\uD800' || c > '\uDFFF') {
return Character.valueOf(c);
}
}
});
put(float.class, Random::nextFloat);
put(boolean.class, Random::nextBoolean);
put(byte.class, random -> (byte) random.nextInt(256));
put(byte[].class, random -> {
byte[] bytes = new byte[random.nextInt(256)];
random.nextBytes(bytes);
return bytes;
});
put(Random.class, random -> new Random(random.nextLong()));
put(Version.class, random -> Version.LATEST);
put(AttributeFactory.class, BaseTokenStreamTestCase::newAttributeFactory);
put(Set.class, random -> {
// TypeTokenFilter
Set<String> set = new HashSet<>();
int num = random.nextInt(5);
for (int i = 0; i < num; i++) {
set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
}
return set;
});
put(Collection.class, random -> {
// CapitalizationFilter
Collection<char[]> col = new ArrayList<>();
int num = random.nextInt(5);
for (int i = 0; i < num; i++) {
col.add(TestUtil.randomSimpleString(random).toCharArray());
}
return col;
});
put(CharArraySet.class, random -> {
int num = random.nextInt(10);
CharArraySet set = new CharArraySet(num, random.nextBoolean());
for (int i = 0; i < num; i++) {
// TODO: make nastier
set.add(TestUtil.randomSimpleString(random));
}
return set;
});
// TODO: we don't want the exponentially slow patterns Dawid documents
// in TestPatternReplaceFilter, so don't use truly random patterns (for now)
put(Pattern.class, random -> Pattern.compile("a"));
put(Pattern[].class, random -> new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")});
put(PayloadEncoder.class, random -> new IdentityEncoder()); // the other encoders will throw exceptions if tokens aren't numbers?
put(Dictionary.class, random -> {
// TODO: make nastier
InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
try {
return new Dictionary(new RAMDirectory(), "dictionary", affixStream, dictStream);
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
});
put(HyphenationTree.class, random -> {
// TODO: make nastier
try {
InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
return hyphenator;
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
});
put(SnowballProgram.class, random -> {
try {
String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
return clazz.getConstructor().newInstance();
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
});
put(String.class, random -> {
// TODO: make nastier
if (random.nextBoolean()) {
// a token type
return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
} else {
return TestUtil.randomSimpleString(random);
}
});
put(NormalizeCharMap.class, random -> {
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
// we can't add duplicate keys, or NormalizeCharMap gets angry
Set<String> keys = new HashSet<>();
int num = random.nextInt(5);
//System.out.println("NormalizeCharMap=");
for (int i = 0; i < num; i++) {
String key = TestUtil.randomSimpleString(random);
if (!keys.contains(key) && key.length() > 0) {
String value = TestUtil.randomSimpleString(random);
builder.add(key, value);
keys.add(key);
//System.out.println("mapping: '" + key + "' => '" + value + "'");
}
}
return builder.build();
});
put(CharacterRunAutomaton.class, random -> {
// TODO: could probably use a purely random automaton
switch (random.nextInt(5)) {
case 0: return MockTokenizer.KEYWORD;
case 1: return MockTokenizer.SIMPLE;
case 2: return MockTokenizer.WHITESPACE;
case 3: return MockTokenFilter.EMPTY_STOPSET;
default: return MockTokenFilter.ENGLISH_STOPSET;
}
});
put(CharArrayMap.class, random -> {
int num = random.nextInt(10);
CharArrayMap<String> map = new CharArrayMap<>(num, random.nextBoolean());
for (int i = 0; i < num; i++) {
// TODO: make nastier
map.put(TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random));
}
return map;
});
put(StemmerOverrideMap.class, random -> {
int num = random.nextInt(10);
StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(random.nextBoolean());
for (int i = 0; i < num; i++) {
String input = "";
do {
input = TestUtil.randomRealisticUnicodeString(random);
} while (input.isEmpty());
String out = "";
do {
out = TestUtil.randomRealisticUnicodeString(random);
} while (out.isEmpty());
builder.add(input, out);
}
try {
return builder.build();
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
});
put(SynonymMap.class, new Function<Random, Object>() {
@Override public Object apply(Random random) {
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
}
try {
return b.build();
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
}
private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
new CharsRef(output.replaceAll(" +", "\u0000")),
keepOrig);
}
private String randomNonEmptyString(Random random) {
while (true) {
final String s = TestUtil.randomUnicodeString(random).trim();
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
return s;
}
}
}
});
put(DateFormat.class, random -> {
if (random.nextBoolean()) return null;
return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
});
put(Automaton.class, random -> {
return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
});
put(
PatternTypingFilter.PatternTypingRule[].class,
random -> {
int numRules = TestUtil.nextInt(random, 1, 3);
PatternTypingFilter.PatternTypingRule[] patternTypingRules =
new PatternTypingFilter.PatternTypingRule[numRules];
for (int i = 0; i < patternTypingRules.length; i++) {
String s = TestUtil.randomSimpleString(random, 1, 2);
// random regex with one group
String regex = s + "(.*)";
// pattern rule with a template that accepts one group.
patternTypingRules[i] =
new PatternTypingFilter.PatternTypingRule(
Pattern.compile(regex), TestUtil.nextInt(random, 1, 8), s + "_$1");
}
return patternTypingRules;
});
}};
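// The closed set of constructor parameter types this test knows how to fill in:
// everything argProducers can generate, plus a few component-specific "input"
// types (Reader, TokenStream, ...) that are wired up explicitly.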
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
static {
allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenizerArgs.addAll(argProducers.keySet());
allowedTokenizerArgs.add(Reader.class);
allowedTokenizerArgs.add(AttributeFactory.class);
allowedTokenizerArgs.add(AttributeSource.class);
allowedTokenizerArgs.add(Automaton.class);
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenFilterArgs.addAll(argProducers.keySet());
allowedTokenFilterArgs.add(TokenStream.class);
// TODO: fix this one, that's broken:
allowedTokenFilterArgs.add(CommonGramsFilter.class);
allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedCharFilterArgs.addAll(argProducers.keySet());
allowedCharFilterArgs.add(Reader.class);
}
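/** Creates a random argument of the given type, failing the test if no producer is registered for that type. */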
@SuppressWarnings("unchecked")
static <T> T newRandomArg(Random random, Class<T> paramType) {
final Function<Random,Object> producer = argProducers.get(paramType);
assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
return (T) producer.apply(random);
}
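// The next three methods build full constructor argument arrays; the
// component's "input" parameter (Reader/TokenStream) is passed through,
// everything else is generated randomly.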
static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == AttributeSource.class) {
// TODO: args[i] = new AttributeSource();
// this is currently too scary to deal with!
args[i] = null; // force IAE
} else {
args[i] = newRandomArg(random, paramType);
}
}
return args;
}
static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == Reader.class) {
args[i] = reader;
} else {
args[i] = newRandomArg(random, paramType);
}
}
return args;
}
static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == TokenStream.class) {
args[i] = stream;
} else if (paramType == CommonGramsFilter.class) {
// TODO: fix this one, that's broken: CommonGramsQueryFilter takes this one explicitly
args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class));
} else {
args[i] = newRandomArg(random, paramType);
}
}
return args;
}
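/**
* Analyzer that derives its entire chain (char filters, tokenizer, token
* filters) deterministically from a single seed, so a failing chain can be
* rebuilt and printed from the seed alone.
*/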
static class MockRandomAnalyzer extends Analyzer {
final long seed;
MockRandomAnalyzer(long seed) {
this.seed = seed;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Random random = new Random(seed);
TokenizerSpec tokenizerSpec = newTokenizer(random);
//System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
//System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
Random random = new Random(seed);
CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
return charfilterspec.reader;
}
@Override
public String toString() {
Random random = new Random(seed);
StringBuilder sb = new StringBuilder();
CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
sb.append("\ncharfilters=");
sb.append(charFilterSpec.toString);
// intentional: initReader gets its own separate random
random = new Random(seed);
TokenizerSpec tokenizerSpec = newTokenizer(random);
sb.append("\n");
sb.append("tokenizer=");
sb.append(tokenizerSpec.toString);
TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
sb.append("\n");
sb.append("filters=");
sb.append(tokenFilterSpec.toString);
return sb.toString();
}
private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr, boolean isConditional) {
try {
final T instance = ctor.newInstance(args);
/*
if (descr.length() > 0) {
descr.append(",");
}
*/
descr.append("\n ");
if (isConditional) {
descr.append("Conditional:");
}
descr.append(ctor.getDeclaringClass().getName());
String params = Arrays.deepToString(args);
params = params.substring(1, params.length()-1);
descr.append("(").append(params).append(")");
return instance;
} catch (InvocationTargetException ite) {
final Throwable cause = ite.getCause();
if (cause instanceof IllegalArgumentException ||
cause instanceof UnsupportedOperationException) {
// that's ok, ignore
if (VERBOSE) {
System.err.println("Ignoring IAE/UOE from ctor:");
cause.printStackTrace(System.err);
}
} else {
Rethrow.rethrow(cause);
}
} catch (IllegalAccessException | InstantiationException iae) {
Rethrow.rethrow(iae);
}
return null; // no success
}
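// True if this constructor/argument combination is registered as broken above.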
private boolean broken(Constructor<?> ctor, Object[] args) {
final Predicate<Object[]> pred = brokenConstructors.get(ctor);
return pred != null && pred.test(args);
}
// create a new random tokenizer from classpath
private TokenizerSpec newTokenizer(Random random) {
TokenizerSpec spec = new TokenizerSpec();
while (spec.tokenizer == null) {
final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
final StringBuilder descr = new StringBuilder();
final Object[] args = newTokenizerArgs(random, ctor.getParameterTypes());
if (broken(ctor, args)) {
continue;
}
spec.tokenizer = createComponent(ctor, args, descr, false);
if (spec.tokenizer != null) {
spec.toString = descr.toString();
}
}
return spec;
}
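// Wraps the reader in 0..2 random char filters, retrying until each picked
// constructor produces a working instance.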
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
CharFilterSpec spec = new CharFilterSpec();
spec.reader = reader;
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(3);
for (int i = 0; i < numFilters; i++) {
while (true) {
final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
if (broken(ctor, args)) {
continue;
}
reader = createComponent(ctor, args, descr, false);
if (reader != null) {
spec.reader = reader;
break;
}
}
}
spec.toString = descr.toString();
return spec;
}
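// Builds a chain of 0..4 random token filters, inserting a ValidatingTokenFilter
// between stages to pinpoint the filter that corrupted the stream. Each stage
// may additionally be wrapped in a randomly-firing ConditionalTokenFilter,
// unless the filter class is listed in avoidConditionals.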
private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
TokenFilterSpec spec = new TokenFilterSpec();
spec.stream = tokenizer;
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(5);
for (int i = 0; i < numFilters; i++) {
// Insert ValidatingTF after each stage so we can
// catch problems right after the TF that "caused"
// them:
spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
while (true) {
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
if (random.nextBoolean() && avoidConditionals.contains(ctor.getDeclaringClass()) == false) {
long seed = random.nextLong();
spec.stream = new ConditionalTokenFilter(spec.stream, in -> {
final Object[] args = newFilterArgs(random, in, ctor.getParameterTypes());
if (broken(ctor, args)) {
return in;
}
TokenStream ts = createComponent(ctor, args, descr, true);
if (ts == null) {
return in;
}
return ts;
}) {
Random random = new Random(seed);
@Override
public void reset() throws IOException {
super.reset();
random = new Random(seed);
}
@Override
protected boolean shouldFilter() throws IOException {
return random.nextBoolean();
}
};
break;
}
else {
final Object[] args = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
if (broken(ctor, args)) {
continue;
}
final TokenFilter flt = createComponent(ctor, args, descr, false);
if (flt != null) {
spec.stream = flt;
break;
}
}
}
}
// Insert ValidatingTF after each stage so we can
// catch problems right after the TF that "caused"
// them:
spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
spec.toString = descr.toString();
return spec;
}
}
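/** CharFilter that leaves content and offsets untouched but records whether
* any read or skip call was made, so callers can assert that nothing was consumed. */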
static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
boolean readSomething;
CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
super(in);
}
@Override
public int correct(int currentOff) {
return currentOff; // we don't change any offsets
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
readSomething = true;
return input.read(cbuf, off, len);
}
@Override
public int read() throws IOException {
readSomething = true;
return input.read();
}
@Override
public int read(CharBuffer target) throws IOException {
readSomething = true;
return input.read(target);
}
@Override
public int read(char[] cbuf) throws IOException {
readSomething = true;
return input.read(cbuf);
}
@Override
public long skip(long n) throws IOException {
readSomething = true;
return input.skip(n);
}
@Override
public void mark(int readAheadLimit) throws IOException {
input.mark(readAheadLimit);
}
@Override
public boolean markSupported() {
return input.markSupported();
}
@Override
public boolean ready() throws IOException {
return input.ready();
}
@Override
public void reset() throws IOException {
input.reset();
}
}
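// Simple holders pairing a built component chain with the human-readable
// description assembled during construction.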
static class TokenizerSpec {
Tokenizer tokenizer;
String toString;
}
static class TokenFilterSpec {
TokenStream stream;
String toString;
}
static class CharFilterSpec {
Reader reader;
String toString;
}
public void testRandomChains() throws Throwable {
int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
Random random = random();
for (int i = 0; i < numIterations; i++) {
try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
if (VERBOSE) {
System.out.println("Creating random analyzer:" + a);
}
try {
checkNormalize(a);
checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
System.err.println("Exception from random analyzer: " + a);
throw e;
}
}
}
}
public void checkNormalize(Analyzer a) {
// normalization should not modify characters that may be used for wildcards
// or regular expressions
String s = "([0-9]+)?*";
assertEquals(s, a.normalize("dummy", s).utf8ToString());
}
// we might regret this decision...
public void testRandomChainsWithLargeStrings() throws Throwable {
int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
Random random = random();
for (int i = 0; i < numIterations; i++) {
try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
if (VERBOSE) {
System.out.println("Creating random analyzer:" + a);
}
try {
checkRandomData(random, a, 50*RANDOM_MULTIPLIER, 80, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
System.err.println("Exception from random analyzer: " + a);
throw e;
}
}
}
}
}