| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.core; |
| |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.lang.reflect.Constructor; |
| import java.lang.reflect.InvocationTargetException; |
| import java.lang.reflect.Modifier; |
| import java.net.URI; |
| import java.net.URL; |
| import java.nio.CharBuffer; |
| import java.nio.file.DirectoryStream; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.text.DateFormat; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.Enumeration; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.IdentityHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Random; |
| import java.util.Set; |
| import java.util.function.Function; |
| import java.util.function.Predicate; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CachingTokenFilter; |
| import org.apache.lucene.analysis.CharArrayMap; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.CharFilter; |
| import org.apache.lucene.analysis.CrankyTokenFilter; |
| import org.apache.lucene.analysis.MockTokenFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.ValidatingTokenFilter; |
| import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilter; |
| import org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
| import org.apache.lucene.analysis.cjk.CJKBigramFilter; |
| import org.apache.lucene.analysis.commongrams.CommonGramsFilter; |
| import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; |
| import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; |
| import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter; |
| import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; |
| import org.apache.lucene.analysis.hunspell.Dictionary; |
| import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter; |
| import org.apache.lucene.analysis.minhash.MinHashFilter; |
| import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; |
| import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; |
| import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; |
| import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; |
| import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; |
| import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; |
| import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter; |
| import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; |
| import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; |
| import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; |
| import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; |
| import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; |
| import org.apache.lucene.analysis.path.PathHierarchyTokenizer; |
| import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; |
| import org.apache.lucene.analysis.pattern.PatternTypingFilter; |
| import org.apache.lucene.analysis.payloads.IdentityEncoder; |
| import org.apache.lucene.analysis.payloads.PayloadEncoder; |
| import org.apache.lucene.analysis.shingle.FixedShingleFilter; |
| import org.apache.lucene.analysis.shingle.ShingleFilter; |
| import org.apache.lucene.analysis.snowball.TestSnowball; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.synonym.SynonymMap; |
| import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; |
| import org.apache.lucene.store.RAMDirectory; |
| import org.apache.lucene.util.AttributeFactory; |
| import org.apache.lucene.util.AttributeSource; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.Rethrow; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.Version; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.AutomatonTestUtil; |
| import org.apache.lucene.util.automaton.CharacterRunAutomaton; |
| import org.apache.lucene.util.automaton.Operations; |
| import org.apache.lucene.util.automaton.RegExp; |
| import org.junit.AfterClass; |
| import org.junit.BeforeClass; |
| import org.tartarus.snowball.SnowballProgram; |
| import org.xml.sax.InputSource; |
| |
| /** tests random analysis chains */ |
| public class TestRandomChains extends BaseTokenStreamTestCase { |
| |
  // Candidate constructors discovered from the classpath in beforeClass();
  // cleared again in afterClass() so they don't outlive the suite.
  static List<Constructor<? extends Tokenizer>> tokenizers;
  static List<Constructor<? extends TokenFilter>> tokenfilters;
  static List<Constructor<? extends CharFilter>> charfilters;

  // Predicate marking a constructor as broken for EVERY argument combination.
  private static final Predicate<Object[]> ALWAYS = (objects -> true);
| |
  // Filters that must never be wrapped in a random ConditionalTokenFilter by
  // newFilterChain(); reasons are given per entry below.
  private static final Set<Class<?>> avoidConditionals = new HashSet<>();
  static {
    // These filters needs to consume the whole tokenstream, so conditionals don't make sense here
    avoidConditionals.add(FingerprintFilter.class);
    avoidConditionals.add(MinHashFilter.class);
    avoidConditionals.add(ConcatenateGraphFilter.class);
    // ShingleFilter doesn't handle input graphs correctly, so wrapping it in a condition can
    // expose inconsistent offsets
    // https://issues.apache.org/jira/browse/LUCENE-4170
    avoidConditionals.add(ShingleFilter.class);
    avoidConditionals.add(FixedShingleFilter.class);
    // FlattenGraphFilter changes the output graph entirely, so wrapping it in a condition
    // can break position lengths
    // (resolved without import: it lives in this test's own package, o.a.l.analysis.core)
    avoidConditionals.add(FlattenGraphFilter.class);
    // LimitToken*Filters don't set end offsets correctly
    avoidConditionals.add(LimitTokenOffsetFilter.class);
    avoidConditionals.add(LimitTokenCountFilter.class);
    avoidConditionals.add(LimitTokenPositionFilter.class);
  }
| |
  // Constructors known to produce broken analysis chains. The predicate decides,
  // per concrete argument array, whether that invocation is broken; ALWAYS means
  // the constructor is unconditionally avoided (beforeClass also filters those out).
  private static final Map<Constructor<?>,Predicate<Object[]>> brokenConstructors = new HashMap<>();
  static {
    try {
      brokenConstructors.put(
          LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class),
          ALWAYS);
      brokenConstructors.put(
          LimitTokenCountFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
          args -> {
            assert args.length == 3;
            return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
          });
      brokenConstructors.put(
          LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class),
          ALWAYS);
      brokenConstructors.put(
          LimitTokenOffsetFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
          args -> {
            assert args.length == 3;
            return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
          });
      brokenConstructors.put(
          LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class),
          ALWAYS);
      brokenConstructors.put(
          LimitTokenPositionFilter.class.getConstructor(TokenStream.class, int.class, boolean.class),
          args -> {
            assert args.length == 3;
            return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
          });
      // Every constructor of the classes below is unconditionally broken:
      for (Class<?> c : Arrays.<Class<?>>asList(
          // doesn't actual reset itself! TODO this statement is probably obsolete as of LUCENE-6121 ?
          CachingTokenFilter.class,
          // LUCENE-8092: doesn't handle graph inputs
          CJKBigramFilter.class,
          // TODO: LUCENE-4983
          CommonGramsFilter.class,
          // TODO: doesn't handle graph inputs
          CommonGramsQueryFilter.class,
          // Not broken, simulates brokenness:
          CrankyTokenFilter.class,
          // TODO: doesn't handle graph inputs (or even look at positionIncrement)
          HyphenatedWordsFilter.class,
          // broken offsets
          PathHierarchyTokenizer.class,
          // broken offsets
          ReversePathHierarchyTokenizer.class,
          // Not broken: we forcefully add this, so we shouldn't
          // also randomly pick it:
          ValidatingTokenFilter.class,
          // TODO: it seems to mess up offsets!?
          WikipediaTokenizer.class,
          // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain)
          WordDelimiterFilter.class,
          // Cannot correct offsets when a char filter had changed them:
          WordDelimiterGraphFilter.class,
          // requires a special encoded token value, so it may fail with random data:
          DelimitedTermFrequencyTokenFilter.class,
          // requires a special encoded token value, so it may fail with random data:
          DelimitedBoostTokenFilter.class,
          // clones of core's filters:
          org.apache.lucene.analysis.core.StopFilter.class,
          org.apache.lucene.analysis.core.LowerCaseFilter.class)) {
        for (Constructor<?> ctor : c.getConstructors()) {
          brokenConstructors.put(ctor, ALWAYS);
        }
      }
    } catch (Exception e) {
      // NoSuchMethodException here means the filter API changed; fail class loading loudly.
      throw new Error(e);
    }
  }
| |
  /**
   * Scans the org.apache.lucene.analysis package tree and collects every usable
   * public Tokenizer/TokenFilter/CharFilter constructor into the static lists,
   * skipping abstract/deprecated/synthetic classes and known-broken constructors.
   * Fails fast if any constructor takes a parameter type we cannot produce.
   */
  @BeforeClass
  public static void beforeClass() throws Exception {
    List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
    tokenizers = new ArrayList<>();
    tokenfilters = new ArrayList<>();
    charfilters = new ArrayList<>();
    for (final Class<?> c : analysisClasses) {
      final int modifiers = c.getModifiers();
      if (
        // don't waste time with abstract classes or deprecated known-buggy ones
        Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
        || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
        || c.isAnnotationPresent(Deprecated.class)
        || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
      ) {
        continue;
      }

      for (final Constructor<?> ctor : c.getConstructors()) {
        // don't test synthetic or deprecated ctors, they likely have known bugs:
        if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
          continue;
        }
        // conditional filters are tested elsewhere
        if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
          continue;
        }
        if (Tokenizer.class.isAssignableFrom(c)) {
          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
              allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
          tokenizers.add(castConstructor(Tokenizer.class, ctor));
        } else if (TokenFilter.class.isAssignableFrom(c)) {
          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
              allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
          tokenfilters.add(castConstructor(TokenFilter.class, ctor));
        } else if (CharFilter.class.isAssignableFrom(c)) {
          assertTrue(ctor.toGenericString() + " has unsupported parameter types",
              allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
          charfilters.add(castConstructor(CharFilter.class, ctor));
        } else {
          fail("Cannot get here");
        }
      }
    }

    // Sort so random.nextInt(list.size()) picks the same constructor for a given
    // seed regardless of classpath scan order.
    final Comparator<Constructor<?>> ctorComp = (arg0, arg1) -> arg0.toGenericString().compareTo(arg1.toGenericString());
    Collections.sort(tokenizers, ctorComp);
    Collections.sort(tokenfilters, ctorComp);
    Collections.sort(charfilters, ctorComp);
    if (VERBOSE) {
      System.out.println("tokenizers = " + tokenizers);
      System.out.println("tokenfilters = " + tokenfilters);
      System.out.println("charfilters = " + charfilters);
    }
  }
| |
| @AfterClass |
| public static void afterClass() { |
| tokenizers = null; |
| tokenfilters = null; |
| charfilters = null; |
| } |
| |
  /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility.
   * {@code Class<T>#getConstructors()} should return unmodifiable {@code List<Constructor<T>>} not array!
   * The cast is safe because callers only pass constructors declared by a subclass of T. */
  @SuppressWarnings("unchecked")
  private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
    return (Constructor<T>) ctor;
  }
| |
| public static List<Class<?>> getClassesForPackage(String pckgname) throws Exception { |
| final List<Class<?>> classes = new ArrayList<>(); |
| collectClassesForPackage(pckgname, classes); |
| assertFalse("No classes found in package '"+pckgname+"'; maybe your test classes are packaged as JAR file?", classes.isEmpty()); |
| return classes; |
| } |
| |
  /**
   * Recursively walks the file-system directories backing {@code pckgname} and adds a
   * Class object for every {@code .class} file found. Non-file resources (e.g. classes
   * inside JARs) are skipped, and classes named {@code Test*} / {@code *Test} are excluded.
   */
  private static void collectClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
    final ClassLoader cld = TestRandomChains.class.getClassLoader();
    final String path = pckgname.replace('.', '/');
    final Enumeration<URL> resources = cld.getResources(path);
    while (resources.hasMoreElements()) {
      final URI uri = resources.nextElement().toURI();
      // only file: URLs can be walked as directories; jar: resources are ignored
      if (!"file".equalsIgnoreCase(uri.getScheme()))
        continue;
      final Path directory = Paths.get(uri);
      if (Files.exists(directory)) {
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(directory)) {
          for (Path file : stream) {
            if (Files.isDirectory(file)) {
              // recurse
              String subPackage = pckgname + "." + file.getFileName().toString();
              collectClassesForPackage(subPackage, classes);
            }
            String fname = file.getFileName().toString();
            if (fname.endsWith(".class")) {
              String clazzName = fname.substring(0, fname.length() - 6);
              // exclude Test classes that happen to be in these packages.
              // class.ForName'ing some of them can cause trouble.
              if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) {
                // Don't run static initializers, as we won't use most of them.
                // Java will do that automatically once accessed/instantiated.
                classes.add(Class.forName(pckgname + '.' + clazzName, false, cld));
              }
            }
          }
        }
      }
    }
  }
| |
| private static final Map<Class<?>,Function<Random,Object>> argProducers = new IdentityHashMap<Class<?>,Function<Random,Object>>() {{ |
| put(int.class, random -> { |
| // TODO: could cause huge ram usage to use full int range for some filters |
| // (e.g. allocate enormous arrays) |
| // return Integer.valueOf(random.nextInt()); |
| return Integer.valueOf(TestUtil.nextInt(random, -50, 50)); |
| }); |
| put(char.class, random -> { |
| // TODO: fix any filters that care to throw IAE instead. |
| // also add a unicode validating filter to validate termAtt? |
| // return Character.valueOf((char)random.nextInt(65536)); |
| while(true) { |
| char c = (char)random.nextInt(65536); |
| if (c < '\uD800' || c > '\uDFFF') { |
| return Character.valueOf(c); |
| } |
| } |
| }); |
| put(float.class, Random::nextFloat); |
| put(boolean.class, Random::nextBoolean); |
| put(byte.class, random -> (byte) random.nextInt(256)); |
| put(byte[].class, random -> { |
| byte bytes[] = new byte[random.nextInt(256)]; |
| random.nextBytes(bytes); |
| return bytes; |
| }); |
| put(Random.class, random -> new Random(random.nextLong())); |
| put(Version.class, random -> Version.LATEST); |
| put(AttributeFactory.class, BaseTokenStreamTestCase::newAttributeFactory); |
| put(Set.class,random -> { |
| // TypeTokenFilter |
| Set<String> set = new HashSet<>(); |
| int num = random.nextInt(5); |
| for (int i = 0; i < num; i++) { |
| set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]); |
| } |
| return set; |
| }); |
| put(Collection.class, random -> { |
| // CapitalizationFilter |
| Collection<char[]> col = new ArrayList<>(); |
| int num = random.nextInt(5); |
| for (int i = 0; i < num; i++) { |
| col.add(TestUtil.randomSimpleString(random).toCharArray()); |
| } |
| return col; |
| }); |
| put(CharArraySet.class, random -> { |
| int num = random.nextInt(10); |
| CharArraySet set = new CharArraySet(num, random.nextBoolean()); |
| for (int i = 0; i < num; i++) { |
| // TODO: make nastier |
| set.add(TestUtil.randomSimpleString(random)); |
| } |
| return set; |
| }); |
| // TODO: don't want to make the exponentially slow ones Dawid documents |
| // in TestPatternReplaceFilter, so dont use truly random patterns (for now) |
| put(Pattern.class, random -> Pattern.compile("a")); |
| put(Pattern[].class, random -> new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")}); |
| put(PayloadEncoder.class, random -> new IdentityEncoder()); // the other encoders will throw exceptions if tokens arent numbers? |
| put(Dictionary.class, random -> { |
| // TODO: make nastier |
| InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff"); |
| InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic"); |
| try { |
| return new Dictionary(new RAMDirectory(), "dictionary", affixStream, dictStream); |
| } catch (Exception ex) { |
| Rethrow.rethrow(ex); |
| return null; // unreachable code |
| } |
| }); |
| put(HyphenationTree.class, random -> { |
| // TODO: make nastier |
| try { |
| InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| return hyphenator; |
| } catch (Exception ex) { |
| Rethrow.rethrow(ex); |
| return null; // unreachable code |
| } |
| }); |
| put(SnowballProgram.class, random -> { |
| try { |
| String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)]; |
| Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class); |
| return clazz.newInstance(); |
| } catch (Exception ex) { |
| Rethrow.rethrow(ex); |
| return null; // unreachable code |
| } |
| }); |
| put(String.class, random -> { |
| // TODO: make nastier |
| if (random.nextBoolean()) { |
| // a token type |
| return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]; |
| } else { |
| return TestUtil.randomSimpleString(random); |
| } |
| }); |
| put(NormalizeCharMap.class, random -> { |
| NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| // we can't add duplicate keys, or NormalizeCharMap gets angry |
| Set<String> keys = new HashSet<>(); |
| int num = random.nextInt(5); |
| //System.out.println("NormalizeCharMap="); |
| for (int i = 0; i < num; i++) { |
| String key = TestUtil.randomSimpleString(random); |
| if (!keys.contains(key) && key.length() > 0) { |
| String value = TestUtil.randomSimpleString(random); |
| builder.add(key, value); |
| keys.add(key); |
| //System.out.println("mapping: '" + key + "' => '" + value + "'"); |
| } |
| } |
| return builder.build(); |
| }); |
| put(CharacterRunAutomaton.class, random -> { |
| // TODO: could probably use a purely random automaton |
| switch(random.nextInt(5)) { |
| case 0: return MockTokenizer.KEYWORD; |
| case 1: return MockTokenizer.SIMPLE; |
| case 2: return MockTokenizer.WHITESPACE; |
| case 3: return MockTokenFilter.EMPTY_STOPSET; |
| default: return MockTokenFilter.ENGLISH_STOPSET; |
| } |
| }); |
| put(CharArrayMap.class, random -> { |
| int num = random.nextInt(10); |
| CharArrayMap<String> map = new CharArrayMap<>(num, random.nextBoolean()); |
| for (int i = 0; i < num; i++) { |
| // TODO: make nastier |
| map.put(TestUtil.randomSimpleString(random), TestUtil.randomSimpleString(random)); |
| } |
| return map; |
| }); |
| put(StemmerOverrideMap.class, random -> { |
| int num = random.nextInt(10); |
| StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(random.nextBoolean()); |
| for (int i = 0; i < num; i++) { |
| String input = ""; |
| do { |
| input = TestUtil.randomRealisticUnicodeString(random); |
| } while(input.isEmpty()); |
| String out = ""; TestUtil.randomSimpleString(random); |
| do { |
| out = TestUtil.randomRealisticUnicodeString(random); |
| } while(out.isEmpty()); |
| builder.add(input, out); |
| } |
| try { |
| return builder.build(); |
| } catch (Exception ex) { |
| Rethrow.rethrow(ex); |
| return null; // unreachable code |
| } |
| }); |
| put(SynonymMap.class, new Function<Random, Object>() { |
| @Override public Object apply(Random random) { |
| SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean()); |
| final int numEntries = atLeast(10); |
| for (int j = 0; j < numEntries; j++) { |
| addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean()); |
| } |
| try { |
| return b.build(); |
| } catch (Exception ex) { |
| Rethrow.rethrow(ex); |
| return null; // unreachable code |
| } |
| } |
| |
| private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) { |
| b.add(new CharsRef(input.replaceAll(" +", "\u0000")), |
| new CharsRef(output.replaceAll(" +", "\u0000")), |
| keepOrig); |
| } |
| |
| private String randomNonEmptyString(Random random) { |
| while(true) { |
| final String s = TestUtil.randomUnicodeString(random).trim(); |
| if (s.length() != 0 && s.indexOf('\u0000') == -1) { |
| return s; |
| } |
| } |
| } |
| }); |
| put(DateFormat.class, random -> { |
| if (random.nextBoolean()) return null; |
| return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random)); |
| }); |
| put(Automaton.class, random -> { |
| return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); |
| }); |
| put( |
| PatternTypingFilter.PatternTypingRule[].class, |
| random -> { |
| int numRules = TestUtil.nextInt(random, 1, 3); |
| PatternTypingFilter.PatternTypingRule[] patternTypingRules = |
| new PatternTypingFilter.PatternTypingRule[numRules]; |
| for (int i = 0; i < patternTypingRules.length; i++) { |
| String s = TestUtil.randomSimpleString(random, 1, 2); |
| // random regex with one group |
| String regex = s + "(.*)"; |
| // pattern rule with a template that accepts one group. |
| patternTypingRules[i] = |
| new PatternTypingFilter.PatternTypingRule( |
| Pattern.compile(regex), TestUtil.nextInt(random, 1, 8), s + "_$1"); |
| } |
| return patternTypingRules; |
| }); }}; |
| |
  // Parameter types we know how to supply for each component kind; beforeClass()
  // rejects any constructor whose parameter types are not all in the relevant set.
  static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
  static {
    allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
    allowedTokenizerArgs.addAll(argProducers.keySet());
    allowedTokenizerArgs.add(Reader.class);
    allowedTokenizerArgs.add(AttributeFactory.class);
    allowedTokenizerArgs.add(AttributeSource.class);
    allowedTokenizerArgs.add(Automaton.class);

    allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
    allowedTokenFilterArgs.addAll(argProducers.keySet());
    allowedTokenFilterArgs.add(TokenStream.class);
    // TODO: fix this one, thats broken:
    allowedTokenFilterArgs.add(CommonGramsFilter.class);

    allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
    allowedCharFilterArgs.addAll(argProducers.keySet());
    allowedCharFilterArgs.add(Reader.class);
  }
| |
| @SuppressWarnings("unchecked") |
| static <T> T newRandomArg(Random random, Class<T> paramType) { |
| final Function<Random,Object> producer = argProducers.get(paramType); |
| assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer); |
| return (T) producer.apply(random); |
| } |
| |
| static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) { |
| Object[] args = new Object[paramTypes.length]; |
| for (int i = 0; i < args.length; i++) { |
| Class<?> paramType = paramTypes[i]; |
| if (paramType == AttributeSource.class) { |
| // TODO: args[i] = new AttributeSource(); |
| // this is currently too scary to deal with! |
| args[i] = null; // force IAE |
| } else { |
| args[i] = newRandomArg(random, paramType); |
| } |
| } |
| return args; |
| } |
| |
| static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) { |
| Object[] args = new Object[paramTypes.length]; |
| for (int i = 0; i < args.length; i++) { |
| Class<?> paramType = paramTypes[i]; |
| if (paramType == Reader.class) { |
| args[i] = reader; |
| } else { |
| args[i] = newRandomArg(random, paramType); |
| } |
| } |
| return args; |
| } |
| |
| static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) { |
| Object[] args = new Object[paramTypes.length]; |
| for (int i = 0; i < args.length; i++) { |
| Class<?> paramType = paramTypes[i]; |
| if (paramType == TokenStream.class) { |
| args[i] = stream; |
| } else if (paramType == CommonGramsFilter.class) { |
| // TODO: fix this one, thats broken: CommonGramsQueryFilter takes this one explicitly |
| args[i] = new CommonGramsFilter(stream, newRandomArg(random, CharArraySet.class)); |
| } else { |
| args[i] = newRandomArg(random, paramType); |
| } |
| } |
| return args; |
| } |
| |
  /**
   * Analyzer whose charfilter/tokenizer/filter chain is rebuilt deterministically
   * from a single seed: every call that needs the chain re-seeds a Random, so
   * createComponents, initReader and toString all replay the same construction.
   */
  static class MockRandomAnalyzer extends Analyzer {
    final long seed;

    MockRandomAnalyzer(long seed) {
      this.seed = seed;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Random random = new Random(seed);
      TokenizerSpec tokenizerSpec = newTokenizer(random);
      //System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
      //System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
      return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      Random random = new Random(seed);
      CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
      return charfilterspec.reader;
    }

    @Override
    public String toString() {
      Random random = new Random(seed);
      StringBuilder sb = new StringBuilder();
      CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
      sb.append("\ncharfilters=");
      sb.append(charFilterSpec.toString);
      // intentional: initReader gets its own separate random
      random = new Random(seed);
      TokenizerSpec tokenizerSpec = newTokenizer(random);
      sb.append("\n");
      sb.append("tokenizer=");
      sb.append(tokenizerSpec.toString);
      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
      sb.append("\n");
      sb.append("filters=");
      sb.append(tokenFilterSpec.toString);
      return sb.toString();
    }

    /**
     * Invokes the constructor and appends a description of the created component to
     * {@code descr}. Returns null (and logs in VERBOSE mode) when the constructor
     * rejects the arguments with IAE/UOE; rethrows anything else.
     */
    private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr, boolean isConditional) {
      try {
        final T instance = ctor.newInstance(args);
        /*
        if (descr.length() > 0) {
          descr.append(",");
        }
        */
        descr.append("\n  ");
        if (isConditional) {
          descr.append("Conditional:");
        }
        descr.append(ctor.getDeclaringClass().getName());
        String params = Arrays.deepToString(args);
        params = params.substring(1, params.length()-1);
        descr.append("(").append(params).append(")");
        return instance;
      } catch (InvocationTargetException ite) {
        final Throwable cause = ite.getCause();
        if (cause instanceof IllegalArgumentException ||
            cause instanceof UnsupportedOperationException) {
          // thats ok, ignore
          if (VERBOSE) {
            System.err.println("Ignoring IAE/UOE from ctor:");
            cause.printStackTrace(System.err);
          }
        } else {
          Rethrow.rethrow(cause);
        }
      } catch (IllegalAccessException | InstantiationException iae) {
        Rethrow.rethrow(iae);
      }
      return null; // no success
    }

    // true when this (constructor, args) combination is known to misbehave
    private boolean broken(Constructor<?> ctor, Object[] args) {
      final Predicate<Object[]> pred = brokenConstructors.get(ctor);
      return pred != null && pred.test(args);
    }

    // create a new random tokenizer from classpath
    private TokenizerSpec newTokenizer(Random random) {
      TokenizerSpec spec = new TokenizerSpec();
      // retry until a constructor accepts its random arguments
      while (spec.tokenizer == null) {
        final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
        final StringBuilder descr = new StringBuilder();
        final Object args[] = newTokenizerArgs(random, ctor.getParameterTypes());
        if (broken(ctor, args)) {
          continue;
        }
        spec.tokenizer = createComponent(ctor, args, descr, false);
        if (spec.tokenizer != null) {
          spec.toString = descr.toString();
        }
      }
      return spec;
    }

    /** Wraps {@code reader} in 0-2 randomly constructed char filters. */
    private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
      CharFilterSpec spec = new CharFilterSpec();
      spec.reader = reader;
      StringBuilder descr = new StringBuilder();
      int numFilters = random.nextInt(3);
      for (int i = 0; i < numFilters; i++) {
        while (true) {
          final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
          final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
          if (broken(ctor, args)) {
            continue;
          }
          reader = createComponent(ctor, args, descr, false);
          if (reader != null) {
            spec.reader = reader;
            break;
          }
        }
      }
      spec.toString = descr.toString();
      return spec;
    }

    /**
     * Builds a chain of 0-4 random token filters on top of {@code tokenizer},
     * inserting a ValidatingTokenFilter after each stage, and randomly wrapping
     * eligible filters in a seeded ConditionalTokenFilter.
     */
    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
      TokenFilterSpec spec = new TokenFilterSpec();
      spec.stream = tokenizer;
      StringBuilder descr = new StringBuilder();
      int numFilters = random.nextInt(5);
      for (int i = 0; i < numFilters; i++) {

        // Insert ValidatingTF after each stage so we can
        // catch problems right after the TF that "caused"
        // them:
        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);

        while (true) {
          final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
          if (random.nextBoolean() && avoidConditionals.contains(ctor.getDeclaringClass()) == false) {
            long seed = random.nextLong();
            spec.stream = new ConditionalTokenFilter(spec.stream, in -> {
              final Object args[] = newFilterArgs(random, in, ctor.getParameterTypes());
              if (broken(ctor, args)) {
                return in;
              }
              TokenStream ts = createComponent(ctor, args, descr, true);
              if (ts == null) {
                return in;
              }
              return ts;
            }) {
              // own Random, re-seeded on reset() so shouldFilter() replays identically per pass
              Random random = new Random(seed);

              @Override
              public void reset() throws IOException {
                super.reset();
                random = new Random(seed);
              }

              @Override
              protected boolean shouldFilter() throws IOException {
                return random.nextBoolean();
              }
            };
            break;
          }
          else {
            final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
            if (broken(ctor, args)) {
              continue;
            }
            final TokenFilter flt = createComponent(ctor, args, descr, false);
            if (flt != null) {
              spec.stream = flt;
              break;
            }
          }
        }
      }

      // Insert ValidatingTF after each stage so we can
      // catch problems right after the TF that "caused"
      // them:
      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");

      spec.toString = descr.toString();
      return spec;
    }
  }
| |
  /**
   * CharFilter wrapper that records whether any characters were consumed from
   * the underlying reader ({@link #readSomething}); offsets are passed through
   * unchanged. NOTE(review): mark/reset/ready intentionally do not set the flag —
   * only actual reads and skips count as "reading".
   */
  static class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
    boolean readSomething;

    CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
      super(in);
    }

    @Override
    public int correct(int currentOff) {
      return currentOff; // we don't change any offsets
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      readSomething = true;
      return input.read(cbuf, off, len);
    }

    @Override
    public int read() throws IOException {
      readSomething = true;
      return input.read();
    }

    @Override
    public int read(CharBuffer target) throws IOException {
      readSomething = true;
      return input.read(target);
    }

    @Override
    public int read(char[] cbuf) throws IOException {
      readSomething = true;
      return input.read(cbuf);
    }

    @Override
    public long skip(long n) throws IOException {
      readSomething = true;
      return input.skip(n);
    }

    @Override
    public void mark(int readAheadLimit) throws IOException {
      input.mark(readAheadLimit);
    }

    @Override
    public boolean markSupported() {
      return input.markSupported();
    }

    @Override
    public boolean ready() throws IOException {
      return input.ready();
    }

    @Override
    public void reset() throws IOException {
      input.reset();
    }
  }
| |
  /** A constructed tokenizer plus a human-readable description of how it was built. */
  static class TokenizerSpec {
    Tokenizer tokenizer;
    String toString;
  }
| |
  /** The top of a constructed filter chain plus a human-readable description of it. */
  static class TokenFilterSpec {
    TokenStream stream;
    String toString;
  }
| |
  /** The outermost reader of a constructed char filter chain plus its description. */
  static class CharFilterSpec {
    Reader reader;
    String toString;
  }
| |
  /**
   * Builds random analyzer chains and runs them over random (short) strings;
   * on failure, prints the analyzer's seed-replayed description for reproduction.
   */
  public void testRandomChains() throws Throwable {
    int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
    Random random = random();
    for (int i = 0; i < numIterations; i++) {
      try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
        if (VERBOSE) {
          System.out.println("Creating random analyzer:" + a);
        }
        try {
          checkNormalize(a);
          checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
              false /* We already validate our own offsets... */);
        } catch (Throwable e) {
          System.err.println("Exception from random analyzer: " + a);
          throw e;
        }
      }
    }
  }
| |
| public void checkNormalize(Analyzer a) { |
| // normalization should not modify characters that may be used for wildcards |
| // or regular expressions |
| String s = "([0-9]+)?*"; |
| assertEquals(s, a.normalize("dummy", s).utf8ToString()); |
| } |
| |
  // we might regret this decision...
  /** Same as {@link #testRandomChains()} but with fewer, much longer random strings. */
  public void testRandomChainsWithLargeStrings() throws Throwable {
    int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
    Random random = random();
    for (int i = 0; i < numIterations; i++) {
      try (MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong())) {
        if (VERBOSE) {
          System.out.println("Creating random analyzer:" + a);
        }
        try {
          checkRandomData(random, a, 50*RANDOM_MULTIPLIER, 80, false,
              false /* We already validate our own offsets... */);
        } catch (Throwable e) {
          System.err.println("Exception from random analyzer: " + a);
          throw e;
        }
      }
    }
  }
| } |