LUCENE-3969: Merged /lucene/dev/trunk:r1311219-1324765
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324945 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index d8fbd15..6978b77 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -100,7 +100,14 @@
}
}
- public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
+ // offsetsAreCorrect also validates:
+ // - graph offsets are correct (all tokens leaving from
+ // pos X have the same startOffset; all tokens
+ // arriving to pos Y have the same endOffset)
+ // - offsets only move forwards (startOffset >=
+ // lastStartOffset)
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
+ boolean offsetsAreCorrect) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -137,6 +144,7 @@
ts.reset();
int pos = -1;
+ int lastStartOffset = 0;
for (int i = 0; i < output.length; i++) {
// extra safety to enforce that the state is not preserved, and also to assign bogus values
ts.clearAttributes();
@@ -176,7 +184,12 @@
endOffset <= finalOffset.intValue());
}
- if (posLengthAtt != null && posIncrAtt != null) {
+ if (offsetsAreCorrect) {
+ assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
+ lastStartOffset = offsetAtt.startOffset();
+ }
+
+ if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
// Validate offset consistency in the graph, ie
// all tokens leaving from a certain pos have the
// same startOffset, and all tokens arriving to a
@@ -194,7 +207,7 @@
// We've seen a token leaving from this position
// before; verify the startOffset is the same:
//System.out.println(" + vs " + pos + " -> " + startOffset);
- assertEquals(posToStartOffset.get(pos).intValue(), startOffset);
+ assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
}
final int endPos = pos + posLength;
@@ -207,7 +220,7 @@
// We've seen a token arriving to this position
// before; verify the endOffset is the same:
//System.out.println(" + ve " + endPos + " -> " + endOffset);
- assertEquals(posToEndOffset.get(endPos).intValue(), endOffset);
+ assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
}
}
}
@@ -222,7 +235,7 @@
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
}
}
- assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
+ assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
ts.end();
if (finalOffset != null) {
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -233,6 +246,10 @@
ts.close();
}
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
+ }
+
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
}
@@ -280,6 +297,10 @@
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
+
+ public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
+ assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
+ }
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, null, null);
@@ -342,12 +363,12 @@
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
- checkRandomData(random, a, iterations, 20, false);
+ checkRandomData(random, a, iterations, 20, false, true);
}
-
+
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
- checkRandomData(random, a, iterations, maxWordLength, false);
+ checkRandomData(random, a, iterations, maxWordLength, false, true);
}
/**
@@ -355,43 +376,63 @@
* @param simple true if only ascii strings will be used (try to avoid)
*/
public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
- checkRandomData(random, a, iterations, 20, simple);
+ checkRandomData(random, a, iterations, 20, simple, true);
}
static class AnalysisThread extends Thread {
final int iterations;
final int maxWordLength;
- final Random random;
+ final long seed;
final Analyzer a;
+ final boolean useCharFilter;
final boolean simple;
+ final boolean offsetsAreCorrect;
+
+ // NOTE: not volatile because we don't want the tests to
+ // add memory barriers (ie alter how threads
+ // interact)... so this is just "best effort":
+ public boolean failed;
- AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) {
- this.random = random;
+ AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) {
+ this.seed = seed;
this.a = a;
this.iterations = iterations;
this.maxWordLength = maxWordLength;
+ this.useCharFilter = useCharFilter;
this.simple = simple;
+ this.offsetsAreCorrect = offsetsAreCorrect;
}
@Override
public void run() {
+ boolean success = false;
try {
// see the part in checkRandomData where it replays the same text again
// to verify reproducibility/reuse: hopefully this would catch thread hazards.
- checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
+ checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
+ success = true;
} catch (IOException e) {
Rethrow.rethrow(e);
+ } finally {
+ failed = !success;
}
}
};
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
- checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
- // now test with multiple threads
+ checkRandomData(random, a, iterations, maxWordLength, simple, true);
+ }
+
+ public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
+ long seed = random.nextLong();
+ boolean useCharFilter = random.nextBoolean();
+ checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
+ // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
+ // so this should only really fail from another thread if it's an actual thread problem
int numThreads = _TestUtil.nextInt(random, 4, 8);
- Thread threads[] = new Thread[numThreads];
+ AnalysisThread threads[] = new AnalysisThread[numThreads];
for (int i = 0; i < threads.length; i++) {
- threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple);
+ threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
}
for (int i = 0; i < threads.length; i++) {
threads[i].start();
@@ -403,9 +444,14 @@
throw new RuntimeException(e);
}
}
+ for (int i = 0; i < threads.length; i++) {
+ if (threads[i].failed) {
+ throw new RuntimeException("some thread(s) failed");
+ }
+ }
}
- private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+ private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException {
final LineFileDocs docs = new LineFileDocs(random);
@@ -437,7 +483,7 @@
}
try {
- checkAnalysisConsistency(random, a, useCharFilter, text);
+ checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect);
} catch (Throwable t) {
// TODO: really we should pass a random seed to
// checkAnalysisConsistency then print it here too:
@@ -477,6 +523,10 @@
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
+ checkAnalysisConsistency(random, a, useCharFilter, text, true);
+ }
+
+ public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@@ -616,7 +666,8 @@
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
- text.length());
+ text.length(),
+ offsetsAreCorrect);
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
@@ -626,7 +677,8 @@
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
- text.length());
+ text.length(),
+ offsetsAreCorrect);
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
@@ -636,7 +688,8 @@
null,
toIntArray(positions),
toIntArray(positionLengths),
- text.length());
+ text.length(),
+ offsetsAreCorrect);
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
@@ -646,7 +699,8 @@
null,
toIntArray(positions),
null,
- text.length());
+ text.length(),
+ offsetsAreCorrect);
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
@@ -656,7 +710,8 @@
null,
null,
null,
- text.length());
+ text.length(),
+ offsetsAreCorrect);
} else {
// terms only
assertTokenStreamContents(ts,
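
The new offsetsAreCorrect flag threads through every assertTokenStreamContents / assertAnalyzesTo / checkRandomData overload above, so a test for a component with known-broken offsets can keep all the other checks while opting out of the two new offset invariants. A minimal sketch, assuming a hypothetical BrokenOffsetsFilter and the usual MockTokenizer setup:

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        // BrokenOffsetsFilter is hypothetical: any filter whose offsets go backwards
        return new TokenStreamComponents(tokenizer, new BrokenOffsetsFilter(tokenizer));
      }
    };
    // last arg is offsetsAreCorrect: false disables the offset-monotonicity and
    // graph-offset-consistency checks; everything else still runs
    checkRandomData(random, a, 1000, 20, false, false);
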
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
index 298ab96..9515ae9 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
@@ -151,7 +151,7 @@
startPosData.startOffset = startOffset;
} else {
// Make sure our input isn't messing up offsets:
- assert startPosData.startOffset == startOffset;
+ assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
}
final int endOffset = offsetAtt.endOffset();
@@ -159,7 +159,7 @@
endPosData.endOffset = endOffset;
} else {
// Make sure our input isn't messing up offsets:
- assert endPosData.endOffset == endOffset;
+ assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
}
tokenPending = true;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
index 642b28f..b1ab259 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
@@ -76,7 +76,7 @@
* MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}).
*/
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
- this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
+ this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
}
/**
@@ -93,7 +93,8 @@
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
tokenizer.setEnableChecks(enableChecks);
- TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
+ MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
+ filt.setEnablePositionIncrements(enablePositionIncrements);
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
}
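
MockAnalyzer now defaults position increments to enabled, matching the new MockTokenFilter default; the flag moved from the MockTokenFilter constructor to a setter. A sketch of the new wiring inside createComponents, using the ENGLISH_STOPSET that ships with MockTokenFilter:

    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
    filt.setEnablePositionIncrements(false); // now defaults to true; opt out explicitly
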
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
index a488c4b..5a11b97 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
@@ -34,7 +34,9 @@
// TODO: instead of fixed remainder... maybe a fixed
// random seed?
this.remainder = remainder;
- assert remainder >= 0 && remainder < 10 : "invalid parameter";
+ if (remainder < 0 || remainder >= 10) {
+ throw new IllegalArgumentException("invalid remainder parameter (must be 0..9): " + remainder);
+ }
}
// for testing only, uses a remainder of 0
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
index 74e2339..bbe2f37 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
@@ -34,6 +34,9 @@
public MockFixedLengthPayloadFilter(Random random, TokenStream in, int length) {
super(in);
+ if (length < 0) {
+ throw new IllegalArgumentException("length must be >= 0");
+ }
this.random = random;
this.bytes = new byte[length];
this.payload = new Payload(bytes);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java
index e47551b..44215e7 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java
@@ -31,10 +31,12 @@
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final Random random;
+ private final long seed;
public MockRandomLookaheadTokenFilter(Random random, TokenStream in) {
super(in);
- this.random = random;
+ this.seed = random.nextLong();
+ this.random = new Random(seed);
}
@Override
@@ -57,9 +59,6 @@
if (!end) {
while (true) {
- // We can use un-re-seeded random, because how far
- // ahead we peek should never alter the resulting
- // tokens as seen by the consumer:
if (random.nextInt(3) == 1) {
if (!peekToken()) {
if (DEBUG) {
@@ -91,4 +90,10 @@
}
return result;
}
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ random.setSeed(seed);
+ }
}
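
Re-seeding in reset() is what makes this filter reproducible: the lookahead decisions do alter which tokens are buffered, so replaying the same text through a reused stream must replay the same random sequence. The general pattern, as a sketch (class name hypothetical, imports elided):

    public final class ReplayableRandomFilter extends TokenFilter {
      private final long seed;     // captured once at construction
      private final Random random; // rewound on every reset()

      public ReplayableRandomFilter(Random random, TokenStream in) {
        super(in);
        this.seed = random.nextLong();
        this.random = new Random(seed);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        // ... decisions driven by random.nextInt(...) go here ...
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        random.setSeed(seed); // identical decisions on each reuse
      }
    }
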
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
index 97863a4..efc7633 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
@@ -55,7 +55,7 @@
makeString("with"))));
private final CharacterRunAutomaton filter;
- private boolean enablePositionIncrements = false;
+ private boolean enablePositionIncrements = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -67,14 +67,16 @@
* @param filter DFA representing the terms that should be removed.
* @param enablePositionIncrements true if the removal should accumulate position increments.
*/
- public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
+ public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) {
super(input);
this.filter = filter;
- this.enablePositionIncrements = enablePositionIncrements;
}
@Override
public boolean incrementToken() throws IOException {
+ // TODO: fix me when posInc=false, to work like FilteringTokenFilter in that case and not return
+ // initial token with posInc=0 ever
+
// return the first non-stop word found
int skippedPositions = 0;
while (input.incrementToken()) {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
new file mode 100644
index 0000000..f213545
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
@@ -0,0 +1,170 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.Attribute;
+
+// TODO: rename to OffsetsXXXTF? ie we only validate
+// offsets (now anyway...)
+
+// TODO: also make a DebuggingTokenFilter, that just prints
+// all att values that come through it...
+
+// TODO: BTSTC should just append this to the chain
+// instead of checking itself:
+
+/** A TokenFilter that checks consistency of the tokens (eg
+ * offsets are consistent with one another). */
+public final class ValidatingTokenFilter extends TokenFilter {
+
+ private int pos;
+ private int lastStartOffset;
+
+ // Maps position to the start/end offset:
+ private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+ private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
+
+ private final PositionIncrementAttribute posIncAtt = getAttrIfExists(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = getAttrIfExists(PositionLengthAttribute.class);
+ private final OffsetAttribute offsetAtt = getAttrIfExists(OffsetAttribute.class);
+ private final CharTermAttribute termAtt = getAttrIfExists(CharTermAttribute.class);
+ private final boolean offsetsAreCorrect;
+
+ private final String name;
+
+ // Returns null if the attr wasn't already added
+ private <A extends Attribute> A getAttrIfExists(Class<A> att) {
+ if (hasAttribute(att)) {
+ return getAttribute(att);
+ } else {
+ return null;
+ }
+ }
+
+ /** The name arg is used to identify this stage when
+ * throwing exceptions (useful if you have more than one
+ * instance in your chain). */
+ public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
+ super(in);
+ this.name = name;
+ this.offsetsAreCorrect = offsetsAreCorrect;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {
+ return false;
+ }
+
+ int startOffset = 0;
+ int endOffset = 0;
+ int posLen = 0;
+
+ if (posIncAtt != null) {
+ pos += posIncAtt.getPositionIncrement();
+ if (pos == -1) {
+ throw new IllegalStateException("first posInc must be > 0");
+ }
+ }
+
+ // System.out.println(" got token=" + termAtt + " pos=" + pos);
+
+ if (offsetAtt != null) {
+ startOffset = offsetAtt.startOffset();
+ endOffset = offsetAtt.endOffset();
+
+ if (startOffset < 0) {
+ throw new IllegalStateException(name + ": startOffset=" + startOffset + " is < 0");
+ }
+ if (endOffset < 0) {
+ throw new IllegalStateException(name + ": endOffset=" + endOffset + " is < 0");
+ }
+ if (endOffset < startOffset) {
+ throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt);
+ }
+ if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
+ throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
+ }
+ lastStartOffset = offsetAtt.startOffset();
+ }
+
+ posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
+
+ if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
+
+ if (!posToStartOffset.containsKey(pos)) {
+ // First time we've seen a token leaving from this position:
+ posToStartOffset.put(pos, startOffset);
+ //System.out.println(" + s " + pos + " -> " + startOffset);
+ } else {
+ // We've seen a token leaving from this position
+ // before; verify the startOffset is the same:
+ //System.out.println(" + vs " + pos + " -> " + startOffset);
+ final int oldStartOffset = posToStartOffset.get(pos);
+ if (oldStartOffset != startOffset) {
+ throw new IllegalStateException(name + ": inconsistent startOffset at pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
+ }
+ }
+
+ final int endPos = pos + posLen;
+
+ if (!posToEndOffset.containsKey(endPos)) {
+ // First time we've seen a token arriving to this position:
+ posToEndOffset.put(endPos, endOffset);
+ //System.out.println(" + e " + endPos + " -> " + endOffset);
+ } else {
+ // We've seen a token arriving to this position
+ // before; verify the endOffset is the same:
+ //System.out.println(" + ve " + endPos + " -> " + endOffset);
+ final int oldEndOffset = posToEndOffset.get(endPos);
+ if (oldEndOffset != endOffset) {
+ throw new IllegalStateException(name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
+ }
+ }
+ }
+
+ return true;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+
+ // TODO: what else to validate
+
+ // TODO: check that endOffset is >= max(endOffset)
+ // we've seen
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ pos = -1;
+ posToStartOffset.clear();
+ posToEndOffset.clear();
+ lastStartOffset = 0;
+ }
+}
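
Because each ValidatingTokenFilter reports its name in the exceptions it throws, sandwiching a suspect stage between two instances localizes exactly where offsets first go bad. An illustrative wiring (the filter under test is hypothetical):

    Tokenizer tok = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenStream ts = new ValidatingTokenFilter(tok, "after tokenizer", true);
    ts = new FilterUnderTest(ts); // hypothetical stage being debugged
    ts = new ValidatingTokenFilter(ts, "after FilterUnderTest", true);
    // if the first instance throws, the tokenizer is at fault; if only the
    // second throws, FilterUnderTest corrupted the offsets
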
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
index 8232b88..9798464 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
@@ -16,6 +16,7 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
@@ -54,6 +55,7 @@
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
private int lastStartOffset;
private boolean lastWasCommon;
@@ -166,6 +168,7 @@
buffer.getChars(0, length, termText, 0);
termAttribute.setLength(length);
posIncAttribute.setPositionIncrement(0);
+ posLenAttribute.setPositionLength(2); // bigram
offsetAttribute.setOffset(lastStartOffset, endOffset);
typeAttribute.setType(GRAM_TYPE);
buffer.setLength(0);
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
index 3b3fae9..909ef5e 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
@@ -82,8 +82,17 @@
super(input);
this.tokens=new LinkedList<CompoundToken>();
+ if (minWordSize < 0) {
+ throw new IllegalArgumentException("minWordSize cannot be negative");
+ }
this.minWordSize=minWordSize;
+ if (minSubwordSize < 0) {
+ throw new IllegalArgumentException("minSubwordSize cannot be negative");
+ }
this.minSubwordSize=minSubwordSize;
+ if (maxSubwordSize < 0) {
+ throw new IllegalArgumentException("maxSubwordSize cannot be negative");
+ }
this.maxSubwordSize=maxSubwordSize;
this.onlyLongestMatch=onlyLongestMatch;
this.dictionary = dictionary;
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
index 935c607..71d317b 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
@@ -191,6 +191,8 @@
// we only put subwords to the token stream
// that are longer than minPartSize
if (partLength < this.minSubwordSize) {
+ // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
+ // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
continue;
}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
index c9d73ef..44ee084 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
@@ -43,16 +43,25 @@
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
+ if (bufferSize <= 0) {
+ throw new IllegalArgumentException("bufferSize must be > 0");
+ }
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
super(source, input);
+ if (bufferSize <= 0) {
+ throw new IllegalArgumentException("bufferSize must be > 0");
+ }
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
super(factory, input);
+ if (bufferSize <= 0) {
+ throw new IllegalArgumentException("bufferSize must be > 0");
+ }
termAtt.resizeBuffer(bufferSize);
}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
index 26b5b1d..3755775 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
@@ -65,6 +65,12 @@
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
super(input);
+ if (bufferSize < 0) {
+ throw new IllegalArgumentException("bufferSize cannot be negative");
+ }
+ if (skip < 0) {
+ throw new IllegalArgumentException("skip cannot be negative");
+ }
termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter;
@@ -85,10 +91,11 @@
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private int startPosition = 0;
- private int finalOffset = 0;
private int skipped = 0;
private boolean endDelimiter = false;
private StringBuilder resultToken;
+
+ private int charsRead = 0;
@Override
@@ -112,12 +119,13 @@
while (true) {
int c = input.read();
- if( c < 0 ){
+ if (c >= 0) {
+ charsRead++;
+ } else {
if( skipped > skip ) {
length += resultToken.length();
termAtt.setLength(length);
- finalOffset = correctOffset(startPosition + length);
- offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
+ offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
if( added ){
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
@@ -125,7 +133,6 @@
return added;
}
else{
- finalOffset = correctOffset(startPosition + length);
return false;
}
}
@@ -168,8 +175,7 @@
}
length += resultToken.length();
termAtt.setLength(length);
- finalOffset = correctOffset(startPosition + length);
- offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
+ offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition+length));
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
return true;
@@ -178,15 +184,17 @@
@Override
public final void end() {
// set final offset
+ int finalOffset = correctOffset(charsRead);
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
+ public void reset() throws IOException {
+ super.reset();
resultToken.setLength(0);
- finalOffset = 0;
+ charsRead = 0;
endDelimiter = false;
skipped = 0;
+ startPosition = 0;
}
}
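
With charsRead tracked explicitly, end() now reports the total number of characters consumed (corrected through correctOffset) rather than the end of the last emitted token; the two differ whenever trailing input is skipped. A hand-derived expectation for a simple path, assuming the default '/' delimiter and skip=0:

    // "/a/b" emits "/a" then "/a/b"; finalOffset is the 4 chars read
    PathHierarchyTokenizer t = new PathHierarchyTokenizer(new StringReader("/a/b"));
    assertTokenStreamContents(t,
        new String[] { "/a", "/a/b" },
        new int[]    { 0, 0 },        // startOffsets
        new int[]    { 2, 4 },        // endOffsets
        null,                         // types
        new int[]    { 1, 0 },        // posIncrements
        null,                         // posLengths
        4);                           // finalOffset
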
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
index fc8a683..97593c6 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
@@ -77,6 +77,12 @@
public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
super(input);
+ if (bufferSize < 0) {
+ throw new IllegalArgumentException("bufferSize cannot be negative");
+ }
+ if (skip < 0) {
+ throw new IllegalArgumentException("skip cannot be negative");
+ }
termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter;
this.replacement = replacement;
@@ -137,7 +143,11 @@
}
resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
resultToken.setLength(0);
- endPosition = delimiterPositions.get(delimitersCount-1 - skip);
+ int idx = delimitersCount-1 - skip;
+ if (idx >= 0) {
+ // otherwise it's OK, because we will skip and return false
+ endPosition = delimiterPositions.get(idx);
+ }
finalOffset = correctOffset(length);
posAtt.setPositionIncrement(1);
}
@@ -163,10 +173,11 @@
}
@Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
+ public void reset() throws IOException {
+ super.reset();
resultToken.setLength(0);
finalOffset = 0;
+ endPosition = 0;
skipped = 0;
delimitersCount = -1;
delimiterPositions.clear();
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
index 3d43d17..bc80391 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
@@ -69,8 +69,17 @@
super(input);
this.pattern = pattern;
this.group = group;
+
+ // Use "" instead of str so don't consume chars
+ // (fillBuffer) from the input on throwing IAE below:
+ matcher = pattern.matcher("");
+
+ // confusingly, the group count depends ENTIRELY on the pattern but is only accessible via a matcher
+ if (group >= 0 && group > matcher.groupCount()) {
+ throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
+ }
fillBuffer(str, input);
- matcher = pattern.matcher(str);
+ matcher.reset(str);
index = 0;
}
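
Creating the matcher on an empty string lets the group check run before fillBuffer touches the reader, so an invalid group fails fast with nothing consumed. Illustrative use of the new validation:

    Pattern p = Pattern.compile("(ab)(cd)"); // 2 capturing groups
    try {
      new PatternTokenizer(new StringReader("abcd"), p, 3); // group 3 doesn't exist
      fail("expected IllegalArgumentException");
    } catch (IllegalArgumentException expected) {
      // "invalid group specified: pattern only has: 2 capturing groups"
    }
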
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
index 97f5fef..04737ed 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
@@ -57,6 +57,9 @@
*/
public PositionFilter(final TokenStream input, final int positionIncrement) {
super(input);
+ if (positionIncrement < 0) {
+ throw new IllegalArgumentException("positionIncrement may not be negative");
+ }
this.positionIncrement = positionIncrement;
}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
index 464bde0..d0b8e05 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@@ -23,9 +23,10 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
@@ -150,6 +151,7 @@
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
@@ -319,6 +321,7 @@
noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+ posLenAtt.setPositionLength(builtGramSize);
isOutputHere = true;
gramSize.advance();
tokenAvailable = true;
@@ -436,6 +439,8 @@
super.reset();
gramSize.reset();
inputWindow.clear();
+ nextInputStreamToken = null;
+ isNextInputStreamToken = false;
numFillerTokensToInsert = 0;
isOutputHere = false;
noShingleOutput = true;
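
Setting positionLength to builtGramSize means each shingle now spans the same number of positions as the tokens it glued together, keeping the token graph consistent. A hand-derived expectation for the default bigram configuration with unigram output (values are illustrative, not taken from this patch's tests):

    assertAnalyzesTo(analyzer, "a b",
        new String[] { "a", "a b", "b" },
        new int[]    { 0,   0,     2 },  // startOffsets
        new int[]    { 1,   3,     3 },  // endOffsets
        null,                            // types
        new int[]    { 1,   0,     1 },  // posIncrements
        new int[]    { 1,   2,     1 }); // posLengths: the bigram spans 2 positions
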
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
index c69d470..7a2639e 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
@@ -67,7 +67,7 @@
Class.forName("org.tartarus.snowball.ext." + name + "Stemmer").asSubclass(SnowballProgram.class);
stemmer = stemClass.newInstance();
} catch (Exception e) {
- throw new RuntimeException(e.toString());
+ throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
}
}
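
Wrapping the reflective failure in IllegalArgumentException, with the original exception as the cause, replaces the old RuntimeException that threw the stack trace away. Illustrative:

    try {
      new SnowballFilter(tokenStream, "NoSuchLanguage");
      fail("expected IllegalArgumentException");
    } catch (IllegalArgumentException expected) {
      // expected.getCause() is the underlying ClassNotFoundException
    }
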
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
index c495bdd..c5ba3a0 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
@@ -177,6 +177,12 @@
}
private void init(int tokenOutput, Set<String> untokenizedTypes) {
+ // TODO: cutover to enum
+ if (tokenOutput != TOKENS_ONLY &&
+ tokenOutput != UNTOKENIZED_ONLY &&
+ tokenOutput != BOTH) {
+ throw new IllegalArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
+ }
this.tokenOutput = tokenOutput;
this.untokenizedTypes = untokenizedTypes;
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
index 9740baf..fa77b40 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
@@ -19,6 +19,8 @@
import java.io.Reader;
import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -27,6 +29,8 @@
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util._TestUtil;
+import org.junit.Ignore;
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@@ -190,4 +194,67 @@
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
+
+ @Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
+ public void testFinalOffsetSpecialCase() throws Exception {
+ final NormalizeCharMap map = new NormalizeCharMap();
+ map.add("t", "");
+ // even though the rule below has no effect, the test passes if you remove it!
+ map.add("tmakdbl", "c");
+
+ Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new MappingCharFilter(map, CharReader.get(reader));
+ }
+ };
+
+ String text = "gzw f quaxot";
+ checkAnalysisConsistency(random, analyzer, false, text);
+ }
+
+ @Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
+ public void testRandomMaps() throws Exception {
+ for (int i = 0; i < 100; i++) {
+ final NormalizeCharMap map = randomMap();
+ Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new MappingCharFilter(map, CharReader.get(reader));
+ }
+ };
+ int numRounds = RANDOM_MULTIPLIER * 100;
+ checkRandomData(random, analyzer, numRounds);
+ }
+ }
+
+ private NormalizeCharMap randomMap() {
+ NormalizeCharMap map = new NormalizeCharMap();
+ // we can't add duplicate keys, or NormalizeCharMap gets angry
+ Set<String> keys = new HashSet<String>();
+ int num = random.nextInt(5);
+ //System.out.println("NormalizeCharMap=");
+ for (int i = 0; i < num; i++) {
+ String key = _TestUtil.randomSimpleString(random);
+ if (!keys.contains(key)) {
+ String value = _TestUtil.randomSimpleString(random);
+ map.add(key, value);
+ keys.add(key);
+ //System.out.println("mapping: '" + key + "' => '" + value + "'");
+ }
+ }
+ return map;
+ }
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index b40022a..016b107 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -18,17 +18,28 @@
*/
import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.URL;
+import java.nio.CharBuffer;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.IdentityHashMap;
import java.util.List;
+import java.util.Map;
import java.util.Random;
+import java.util.Set;
+import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -36,67 +47,174 @@
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.EmptyTokenizer;
+import org.apache.lucene.analysis.MockGraphTokenFilter;
+import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
+import org.apache.lucene.analysis.ValidatingTokenFilter;
+import org.apache.lucene.analysis.charfilter.CharFilter;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.cjk.CJKBigramFilter;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.TrimFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
+import org.apache.lucene.analysis.payloads.IdentityEncoder;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.position.PositionFilter;
+import org.apache.lucene.analysis.snowball.TestSnowball;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.th.ThaiWordFilter;
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.AfterClass;
import org.junit.BeforeClass;
+import org.tartarus.snowball.SnowballProgram;
+import org.xml.sax.InputSource;
/** tests random analysis chains */
public class TestRandomChains extends BaseTokenStreamTestCase {
- static List<Class<? extends Tokenizer>> tokenizers;
- static List<Class<? extends TokenFilter>> tokenfilters;
- static List<Class<? extends CharStream>> charfilters;
+
+ static List<Constructor<? extends Tokenizer>> tokenizers;
+ static List<Constructor<? extends TokenFilter>> tokenfilters;
+ static List<Constructor<? extends CharStream>> charfilters;
+
+ // TODO: fix those and remove
+ private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+ static {
+ // TODO: can we promote some of these to be only
+ // offsets offenders?
+ Collections.<Class<?>>addAll(brokenComponents,
+ // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
+ EmptyTokenizer.class,
+ // doesn't actually reset itself!
+ CachingTokenFilter.class,
+ // doesn't consume the whole stream!
+ LimitTokenCountFilter.class,
+ // Not broken: we forcefully add this, so we shouldn't
+ // also randomly pick it:
+ ValidatingTokenFilter.class,
+ // NOTE: these by themselves won't cause any 'basic assertions' to fail.
+ // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any
+ // tokenfilter that combines words (e.g. shingles) comes after them,
+ // this will create bogus offsets because their 'offsets go backwards',
+ // causing shingle or whatever to make a single token with a
+ // startOffset that's > its endOffset
+ // (see LUCENE-3738 for a list of other offenders here)
+ // broken!
+ NGramTokenizer.class,
+ // broken!
+ NGramTokenFilter.class,
+ // broken!
+ EdgeNGramTokenizer.class,
+ // broken!
+ EdgeNGramTokenFilter.class,
+ // broken!
+ WordDelimiterFilter.class,
+ // broken!
+ TrimFilter.class,
+ // TODO: remove this class after we fix its finalOffset bug
+ MappingCharFilter.class
+ );
+ }
+
+ // TODO: also fix these and remove (maybe):
+ // Classes that don't produce consistent graph offsets:
+ private static final Set<Class<?>> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+ static {
+ Collections.<Class<?>>addAll(brokenOffsetsComponents,
+ ReversePathHierarchyTokenizer.class,
+ PathHierarchyTokenizer.class,
+ HyphenationCompoundWordTokenFilter.class,
+ DictionaryCompoundWordTokenFilter.class,
+ // TODO: corrupts graphs (offset consistency check):
+ PositionFilter.class,
+ // TODO: it seems to mess up offsets!?
+ WikipediaTokenizer.class,
+ // TODO: doesn't handle graph inputs
+ ThaiWordFilter.class,
+ // TODO: doesn't handle graph inputs
+ CJKBigramFilter.class
+ );
+ }
@BeforeClass
public static void beforeClass() throws Exception {
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
- tokenizers = new ArrayList<Class<? extends Tokenizer>>();
- tokenfilters = new ArrayList<Class<? extends TokenFilter>>();
- charfilters = new ArrayList<Class<? extends CharStream>>();
- for (Class<?> c : analysisClasses) {
- // don't waste time with abstract classes or deprecated known-buggy ones
+ tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
+ tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
+ charfilters = new ArrayList<Constructor<? extends CharStream>>();
+ for (final Class<?> c : analysisClasses) {
final int modifiers = c.getModifiers();
- if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
- || c.getAnnotation(Deprecated.class) != null
- || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
- // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
- || c.equals(EmptyTokenizer.class)
- // doesn't actual reset itself!
- || c.equals(CachingTokenFilter.class)
- // broken!
- || c.equals(NGramTokenizer.class)
- // broken!
- || c.equals(NGramTokenFilter.class)
- // broken!
- || c.equals(EdgeNGramTokenizer.class)
- // broken!
- || c.equals(EdgeNGramTokenFilter.class)) {
+ if (
+ // don't waste time with abstract classes or deprecated known-buggy ones
+ Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
+ || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
+ || brokenComponents.contains(c)
+ || c.isAnnotationPresent(Deprecated.class)
+ || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
+ ) {
continue;
}
- if (Tokenizer.class.isAssignableFrom(c)) {
- tokenizers.add(c.asSubclass(Tokenizer.class));
- } else if (TokenFilter.class.isAssignableFrom(c)) {
- tokenfilters.add(c.asSubclass(TokenFilter.class));
- } else if (CharStream.class.isAssignableFrom(c)) {
- charfilters.add(c.asSubclass(CharStream.class));
+
+ for (final Constructor<?> ctor : c.getConstructors()) {
+ // don't test synthetic or deprecated ctors, they likely have known bugs:
+ if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) {
+ continue;
+ }
+ if (Tokenizer.class.isAssignableFrom(c)) {
+ assertTrue(ctor.toGenericString() + " has unsupported parameter types",
+ allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+ tokenizers.add(castConstructor(Tokenizer.class, ctor));
+ } else if (TokenFilter.class.isAssignableFrom(c)) {
+ assertTrue(ctor.toGenericString() + " has unsupported parameter types",
+ allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+ tokenfilters.add(castConstructor(TokenFilter.class, ctor));
+ } else if (CharStream.class.isAssignableFrom(c)) {
+ assertTrue(ctor.toGenericString() + " has unsupported parameter types",
+ allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+ charfilters.add(castConstructor(CharStream.class, ctor));
+ } else {
+ fail("Cannot get here");
+ }
}
}
- final Comparator<Class<?>> classComp = new Comparator<Class<?>>() {
+
+ final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
@Override
- public int compare(Class<?> arg0, Class<?> arg1) {
- return arg0.getName().compareTo(arg1.getName());
+ public int compare(Constructor<?> arg0, Constructor<?> arg1) {
+ return arg0.toGenericString().compareTo(arg1.toGenericString());
}
};
- Collections.sort(tokenizers, classComp);
- Collections.sort(tokenfilters, classComp);
- Collections.sort(charfilters, classComp);
+ Collections.sort(tokenizers, ctorComp);
+ Collections.sort(tokenfilters, ctorComp);
+ Collections.sort(charfilters, ctorComp);
if (VERBOSE) {
System.out.println("tokenizers = " + tokenizers);
System.out.println("tokenfilters = " + tokenfilters);
@@ -111,170 +229,12 @@
charfilters = null;
}
- static class MockRandomAnalyzer extends Analyzer {
- final long seed;
-
- MockRandomAnalyzer(long seed) {
- this.seed = seed;
- }
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Random random = new Random(seed);
- TokenizerSpec tokenizerspec = newTokenizer(random, reader);
- TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
- return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
- }
-
- @Override
- protected Reader initReader(Reader reader) {
- Random random = new Random(seed);
- CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
- return charfilterspec.reader;
- }
-
- @Override
- public String toString() {
- Random random = new Random(seed);
- StringBuilder sb = new StringBuilder();
- CharFilterSpec charfilterSpec = newCharFilterChain(random, new StringReader(""));
- sb.append("\ncharfilters=");
- sb.append(charfilterSpec.toString);
- // intentional: initReader gets its own separate random
- random = new Random(seed);
- TokenizerSpec tokenizerSpec = newTokenizer(random, charfilterSpec.reader);
- sb.append("\n");
- sb.append("tokenizer=");
- sb.append(tokenizerSpec.toString);
- TokenFilterSpec tokenfilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
- sb.append("\n");
- sb.append("filters=");
- sb.append(tokenfilterSpec.toString);
- return sb.toString();
- }
-
- // create a new random tokenizer from classpath
- private TokenizerSpec newTokenizer(Random random, Reader reader) {
- TokenizerSpec spec = new TokenizerSpec();
- boolean success = false;
- while (!success) {
- try {
- // TODO: check Reader+Version,Version+Reader too
- // also look for other variants and handle them special
- int idx = random.nextInt(tokenizers.size());
- try {
- Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Version.class, Reader.class);
- spec.tokenizer = c.newInstance(TEST_VERSION_CURRENT, reader);
- } catch (NoSuchMethodException e) {
- Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Reader.class);
- spec.tokenizer = c.newInstance(reader);
- }
- spec.toString = tokenizers.get(idx).toString();
- success = true;
- } catch (Exception e) {
- // ignore
- }
- }
- return spec;
- }
-
- private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
- CharFilterSpec spec = new CharFilterSpec();
- spec.reader = reader;
- StringBuilder descr = new StringBuilder();
- int numFilters = random.nextInt(3);
- for (int i = 0; i < numFilters; i++) {
- boolean success = false;
- while (!success) {
- try {
- // TODO: also look for other variants and handle them special
- int idx = random.nextInt(charfilters.size());
- try {
- Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(Reader.class);
- spec.reader = c.newInstance(spec.reader);
- } catch (NoSuchMethodException e) {
- Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(CharStream.class);
- spec.reader = c.newInstance(CharReader.get(spec.reader));
- }
-
- if (descr.length() > 0) {
- descr.append(",");
- }
- descr.append(charfilters.get(idx).toString());
- success = true;
- } catch (Exception e) {
- // ignore
- }
- }
- }
- spec.toString = descr.toString();
- return spec;
- }
-
- private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
- TokenFilterSpec spec = new TokenFilterSpec();
- spec.stream = tokenizer;
- StringBuilder descr = new StringBuilder();
- int numFilters = random.nextInt(5);
- for (int i = 0; i < numFilters; i++) {
- boolean success = false;
- while (!success) {
- try {
- // TODO: also look for other variants and handle them special
- int idx = random.nextInt(tokenfilters.size());
- try {
- Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(Version.class, TokenStream.class);
- spec.stream = c.newInstance(TEST_VERSION_CURRENT, spec.stream);
- } catch (NoSuchMethodException e) {
- Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(TokenStream.class);
- spec.stream = c.newInstance(spec.stream);
- }
- if (descr.length() > 0) {
- descr.append(",");
- }
- descr.append(tokenfilters.get(idx).toString());
- success = true;
- } catch (Exception e) {
- // ignore
- }
- }
- }
- spec.toString = descr.toString();
- return spec;
- }
+ /** Hack to work around the stupidity of Oracle's strict Java backwards compatibility:
+ * {@code Class<T>#getConstructors()} should return an unmodifiable {@code List<Constructor<T>>}, not an array! */
+ @SuppressWarnings("unchecked")
+ private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
+ return (Constructor<T>) ctor;
}
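+
+  // Hypothetical call-site sketch (not part of this patch): the helper keeps
+  // the unavoidable unchecked cast from the raw Constructor<?> elements of
+  // Class#getConstructors() in exactly one place:
+  //   for (Constructor<?> ctor : clazz.getConstructors()) {
+  //     Constructor<? extends Tokenizer> typed = castConstructor(Tokenizer.class, ctor);
+  //   }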
-
- static class TokenizerSpec {
- Tokenizer tokenizer;
- String toString;
- }
-
- static class TokenFilterSpec {
- TokenStream stream;
- String toString;
- }
-
- static class CharFilterSpec {
- Reader reader;
- String toString;
- }
-
- public void testRandomChains() throws Throwable {
- int numIterations = atLeast(20);
- for (int i = 0; i < numIterations; i++) {
- MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
- if (VERBOSE) {
- System.out.println("Creating random analyzer:" + a);
- }
- try {
- checkRandomData(random, a, 1000);
- } catch (Throwable e) {
- System.err.println("Exception from random analyzer: " + a);
- throw e;
- }
- }
- }
-
private static void getClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
final ClassLoader cld = TestRandomChains.class.getClassLoader();
final String path = pckgname.replace('.', '/');
@@ -303,4 +263,568 @@
}
}
}
+
+ private static interface ArgProducer {
+ Object create(Random random);
+ }
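+
+  // The map below is a per-type registry: for each ctor parameter type the
+  // test may encounter, an ArgProducer conjures a suitable random value.
+  // Lookup sketch (see newRandomArg further down):
+  //   Object arg = argProducers.get(int.class).create(random);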
+
+ private static final Map<Class<?>,ArgProducer> argProducers = new IdentityHashMap<Class<?>,ArgProducer>() {{
+ put(int.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+        // TODO: could cause huge RAM usage to use full int range for some filters
+ // (e.g. allocate enormous arrays)
+ // return Integer.valueOf(random.nextInt());
+ return Integer.valueOf(_TestUtil.nextInt(random, -100, 100));
+ }
+ });
+ put(char.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // TODO: fix any filters that care to throw IAE instead.
+ // also add a unicode validating filter to validate termAtt?
+ // return Character.valueOf((char)random.nextInt(65536));
+ while(true) {
+ char c = (char)random.nextInt(65536);
+ if (c < '\uD800' || c > '\uDFFF') {
+ return Character.valueOf(c);
+ }
+ }
+ }
+ });
+ put(float.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ return Float.valueOf(random.nextFloat());
+ }
+ });
+ put(boolean.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ return Boolean.valueOf(random.nextBoolean());
+ }
+ });
+ put(byte.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // this wraps to negative when casting to byte
+ return Byte.valueOf((byte) random.nextInt(256));
+ }
+ });
+ put(byte[].class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ byte bytes[] = new byte[random.nextInt(256)];
+ random.nextBytes(bytes);
+ return bytes;
+ }
+ });
+ put(Random.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ return new Random(random.nextLong());
+ }
+ });
+ put(Version.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // we expect bugs in emulating old versions
+ return TEST_VERSION_CURRENT;
+ }
+ });
+ put(Set.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // TypeTokenFilter
+ Set<String> set = new HashSet<String>();
+ int num = random.nextInt(5);
+ for (int i = 0; i < num; i++) {
+ set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
+ }
+ return set;
+ }
+ });
+ put(Collection.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // CapitalizationFilter
+ Collection<char[]> col = new ArrayList<char[]>();
+ int num = random.nextInt(5);
+ for (int i = 0; i < num; i++) {
+ col.add(_TestUtil.randomSimpleString(random).toCharArray());
+ }
+ return col;
+ }
+ });
+ put(CharArraySet.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ int num = random.nextInt(10);
+ CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, num, random.nextBoolean());
+ for (int i = 0; i < num; i++) {
+ // TODO: make nastier
+ set.add(_TestUtil.randomSimpleString(random));
+ }
+ return set;
+ }
+ });
+ put(Pattern.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // TODO: don't want to make the exponentially slow ones Dawid documents
+        // in TestPatternReplaceFilter, so don't use truly random patterns (for now)
+ return Pattern.compile("a");
+ }
+ });
+ put(PayloadEncoder.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+        return new IdentityEncoder(); // the other encoders will throw exceptions if tokens aren't numbers?
+ }
+ });
+ put(HunspellDictionary.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // TODO: make nastier
+ InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
+ InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
+ try {
+ return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ }
+ });
+ put(EdgeNGramTokenizer.Side.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ return random.nextBoolean()
+ ? EdgeNGramTokenizer.Side.FRONT
+ : EdgeNGramTokenizer.Side.BACK;
+ }
+ });
+ put(EdgeNGramTokenFilter.Side.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ return random.nextBoolean()
+ ? EdgeNGramTokenFilter.Side.FRONT
+ : EdgeNGramTokenFilter.Side.BACK;
+ }
+ });
+ put(HyphenationTree.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // TODO: make nastier
+ try {
+ InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+ return hyphenator;
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ }
+ });
+ put(SnowballProgram.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ try {
+ String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
+ Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
+ return clazz.newInstance();
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ }
+ });
+ put(String.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // TODO: make nastier
+ if (random.nextBoolean()) {
+ // a token type
+ return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
+ } else {
+ return _TestUtil.randomSimpleString(random);
+ }
+ }
+ });
+ put(NormalizeCharMap.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ NormalizeCharMap map = new NormalizeCharMap();
+ // we can't add duplicate keys, or NormalizeCharMap gets angry
+ Set<String> keys = new HashSet<String>();
+ int num = random.nextInt(5);
+ //System.out.println("NormalizeCharMap=");
+ for (int i = 0; i < num; i++) {
+ String key = _TestUtil.randomSimpleString(random);
+ if (!keys.contains(key)) {
+ String value = _TestUtil.randomSimpleString(random);
+ map.add(key, value);
+ keys.add(key);
+ //System.out.println("mapping: '" + key + "' => '" + value + "'");
+ }
+ }
+ return map;
+ }
+ });
+ put(CharacterRunAutomaton.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ // TODO: could probably use a purely random automaton
+ switch(random.nextInt(5)) {
+ case 0: return MockTokenizer.KEYWORD;
+ case 1: return MockTokenizer.SIMPLE;
+ case 2: return MockTokenizer.WHITESPACE;
+ case 3: return MockTokenFilter.EMPTY_STOPSET;
+ default: return MockTokenFilter.ENGLISH_STOPSET;
+ }
+ }
+ });
+ put(CharArrayMap.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ int num = random.nextInt(10);
+ CharArrayMap<String> map = new CharArrayMap<String>(TEST_VERSION_CURRENT, num, random.nextBoolean());
+ for (int i = 0; i < num; i++) {
+ // TODO: make nastier
+ map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random));
+ }
+ return map;
+ }
+ });
+ put(SynonymMap.class, new ArgProducer() {
+ @Override public Object create(Random random) {
+ SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
+ final int numEntries = atLeast(10);
+ for (int j = 0; j < numEntries; j++) {
+ addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
+ }
+ try {
+ return b.build();
+ } catch (Exception ex) {
+ Rethrow.rethrow(ex);
+ return null; // unreachable code
+ }
+ }
+
+ private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
+ b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
+ new CharsRef(output.replaceAll(" +", "\u0000")),
+ keepOrig);
+ }
+
+ private String randomNonEmptyString(Random random) {
+ while(true) {
+ final String s = _TestUtil.randomUnicodeString(random).trim();
+ if (s.length() != 0 && s.indexOf('\u0000') == -1) {
+ return s;
+ }
+ }
+ }
+ });
+ }};
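+
+  // Covering a new argument type is one more entry in the map above; a
+  // hypothetical producer for long.class (not needed by current components)
+  // would look like:
+  //   put(long.class, new ArgProducer() {
+  //     @Override public Object create(Random random) {
+  //       return Long.valueOf(random.nextLong());
+  //     }
+  //   });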
+
+ static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
+ static {
+ allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+ allowedTokenizerArgs.addAll(argProducers.keySet());
+ allowedTokenizerArgs.add(Reader.class);
+ allowedTokenizerArgs.add(AttributeFactory.class);
+ allowedTokenizerArgs.add(AttributeSource.class);
+
+ allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+ allowedTokenFilterArgs.addAll(argProducers.keySet());
+ allowedTokenFilterArgs.add(TokenStream.class);
+    // TODO: fix this one, that's broken:
+ allowedTokenFilterArgs.add(CommonGramsFilter.class);
+
+ allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+ allowedCharFilterArgs.addAll(argProducers.keySet());
+ allowedCharFilterArgs.add(Reader.class);
+ allowedCharFilterArgs.add(CharStream.class);
+ }
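+
+  // These sets presumably gate ctor selection elsewhere in this test: a
+  // candidate qualifies only if every parameter type is either produced by the
+  // registry or special-cased in the new*Args helpers below, roughly:
+  //   allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))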
+
+ @SuppressWarnings("unchecked")
+ static <T> T newRandomArg(Random random, Class<T> paramType) {
+ final ArgProducer producer = argProducers.get(paramType);
+ assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
+ return (T) producer.create(random);
+ }
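+
+  // Usage sketch (hypothetical values): the new*Args helpers below special-case
+  // positional types like Reader and delegate everything else here:
+  //   Integer minGram = newRandomArg(random, int.class);        // in [-100, 100]
+  //   CharArraySet stopWords = newRandomArg(random, CharArraySet.class);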
+
+ static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
+ Object[] args = new Object[paramTypes.length];
+ for (int i = 0; i < args.length; i++) {
+ Class<?> paramType = paramTypes[i];
+ if (paramType == Reader.class) {
+ args[i] = reader;
+ } else if (paramType == AttributeFactory.class) {
+ // TODO: maybe the collator one...???
+ args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
+ } else if (paramType == AttributeSource.class) {
+ // TODO: args[i] = new AttributeSource();
+ // this is currently too scary to deal with!
+ args[i] = null; // force IAE
+ } else {
+ args[i] = newRandomArg(random, paramType);
+ }
+ }
+ return args;
+ }
+
+ static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
+ Object[] args = new Object[paramTypes.length];
+ for (int i = 0; i < args.length; i++) {
+ Class<?> paramType = paramTypes[i];
+ if (paramType == Reader.class) {
+ args[i] = reader;
+ } else if (paramType == CharStream.class) {
+ args[i] = CharReader.get(reader);
+ } else {
+ args[i] = newRandomArg(random, paramType);
+ }
+ }
+ return args;
+ }
+
+ static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
+ Object[] args = new Object[paramTypes.length];
+ for (int i = 0; i < args.length; i++) {
+ Class<?> paramType = paramTypes[i];
+ if (paramType == TokenStream.class) {
+ args[i] = stream;
+ } else if (paramType == CommonGramsFilter.class) {
+        // TODO: fix this one, that's broken: CommonGramsQueryFilter takes this one explicitly
+ args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, newRandomArg(random, CharArraySet.class));
+ } else {
+ args[i] = newRandomArg(random, paramType);
+ }
+ }
+ return args;
+ }
+
+ static class MockRandomAnalyzer extends Analyzer {
+ final long seed;
+
+ MockRandomAnalyzer(long seed) {
+ this.seed = seed;
+ }
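+
+    // The analyzer is a pure function of the seed: every method below starts
+    // from new Random(seed), so toString(), initReader() and createComponents()
+    // all reconstruct the very same random chain:
+    //   MockRandomAnalyzer a = new MockRandomAnalyzer(42L);
+    //   a.toString(); // describes exactly the chain createComponents builds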
+
+ public boolean offsetsAreCorrect() {
+ // TODO: can we not do the full chain here!?
+ Random random = new Random(seed);
+ TokenizerSpec tokenizerSpec = newTokenizer(random, new StringReader(""));
+ TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+ return filterSpec.offsetsAreCorrect;
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Random random = new Random(seed);
+ TokenizerSpec tokenizerSpec = newTokenizer(random, reader);
+ TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+ return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
+ }
+
+ @Override
+ protected Reader initReader(Reader reader) {
+ Random random = new Random(seed);
+ CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
+ return charfilterspec.reader;
+ }
+
+ @Override
+ public String toString() {
+ Random random = new Random(seed);
+ StringBuilder sb = new StringBuilder();
+ CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
+ sb.append("\ncharfilters=");
+ sb.append(charFilterSpec.toString);
+ // intentional: initReader gets its own separate random
+ random = new Random(seed);
+ TokenizerSpec tokenizerSpec = newTokenizer(random, charFilterSpec.reader);
+ sb.append("\n");
+ sb.append("tokenizer=");
+ sb.append(tokenizerSpec.toString);
+ TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+ sb.append("\n");
+ sb.append("filters=");
+ sb.append(tokenFilterSpec.toString);
+ sb.append("\n");
+ sb.append("offsetsAreCorrect=" + tokenFilterSpec.offsetsAreCorrect);
+ return sb.toString();
+ }
+
+ private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr) {
+ try {
+ final T instance = ctor.newInstance(args);
+ descr.append("\n ");
+ descr.append(ctor.getDeclaringClass().getName());
+ String params = Arrays.toString(args);
+ params = params.substring(1, params.length()-1);
+ descr.append("(").append(params).append(")");
+ return instance;
+ } catch (InvocationTargetException ite) {
+ final Throwable cause = ite.getCause();
+ if (cause instanceof IllegalArgumentException ||
+ cause instanceof UnsupportedOperationException) {
+        // that's ok, ignore
+ if (VERBOSE) {
+ System.err.println("Ignoring IAE/UOE from ctor:");
+ cause.printStackTrace(System.err);
+ }
+ } else {
+ Rethrow.rethrow(cause);
+ }
+ } catch (IllegalAccessException iae) {
+ Rethrow.rethrow(iae);
+ } catch (InstantiationException ie) {
+ Rethrow.rethrow(ie);
+ }
+ return null; // no success
+ }
+
+ // create a new random tokenizer from classpath
+ private TokenizerSpec newTokenizer(Random random, Reader reader) {
+ TokenizerSpec spec = new TokenizerSpec();
+ while (spec.tokenizer == null) {
+ final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
+ final StringBuilder descr = new StringBuilder();
+ final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
+ final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes());
+      spec.tokenizer = createComponent(ctor, args, descr);
+      if (spec.tokenizer != null) {
+        // only mark offsets broken for the component actually kept, not for
+        // candidates whose ctor was rejected with IAE/UOE
+        if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
+          spec.offsetsAreCorrect = false;
+        }
+      } else {
+        assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething);
+      }
+ spec.toString = descr.toString();
+ }
+ return spec;
+ }
+
+ private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
+ CharFilterSpec spec = new CharFilterSpec();
+ spec.reader = reader;
+ StringBuilder descr = new StringBuilder();
+ int numFilters = random.nextInt(3);
+ for (int i = 0; i < numFilters; i++) {
+ while (true) {
+ final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
+ final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
+ reader = createComponent(ctor, args, descr);
+ if (reader != null) {
+ spec.reader = reader;
+ break;
+ }
+ }
+ }
+ spec.toString = descr.toString();
+ return spec;
+ }
+
+ private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
+ TokenFilterSpec spec = new TokenFilterSpec();
+ spec.offsetsAreCorrect = offsetsAreCorrect;
+ spec.stream = tokenizer;
+ StringBuilder descr = new StringBuilder();
+ int numFilters = random.nextInt(5);
+ for (int i = 0; i < numFilters; i++) {
+
+ // Insert ValidatingTF after each stage so we can
+ // catch problems right after the TF that "caused"
+ // them:
+ spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
+
+ while (true) {
+ final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
+
+        // hack: MockGraph/MockLookahead have assertions that will trip if they follow
+        // an offsets violator, so we can't use them after e.g. WikipediaTokenizer
+ if (!spec.offsetsAreCorrect &&
+ (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
+ || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
+ continue;
+ }
+
+ final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
+ final TokenFilter flt = createComponent(ctor, args, descr);
+ if (flt != null) {
+ if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
+ spec.offsetsAreCorrect = false;
+ }
+ spec.stream = flt;
+ break;
+ }
+ }
+ }
+
+ // Insert ValidatingTF after each stage so we can
+ // catch problems right after the TF that "caused"
+ // them:
+ spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
+
+ spec.toString = descr.toString();
+ return spec;
+ }
+ }
+
+ static final class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
+ boolean readSomething = false;
+
+ CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
+ super(CharReader.get(in));
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ readSomething = true;
+ return super.read(cbuf, off, len);
+ }
+
+ @Override
+ public int read() throws IOException {
+ readSomething = true;
+ return super.read();
+ }
+
+ @Override
+ public int read(CharBuffer target) throws IOException {
+ readSomething = true;
+ return super.read(target);
+ }
+
+ @Override
+ public int read(char[] cbuf) throws IOException {
+ readSomething = true;
+ return super.read(cbuf);
+ }
+
+ @Override
+ public long skip(long n) throws IOException {
+ readSomething = true;
+ return super.skip(n);
+ }
+ }
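+
+  // Behavior sketch for the wrapper above: any consuming call flips the flag,
+  // so a well-behaved ctor that merely stores its Reader leaves it untouched:
+  //   CheckThatYouDidntReadAnythingReaderWrapper w =
+  //       new CheckThatYouDidntReadAnythingReaderWrapper(new StringReader("x"));
+  //   // w.readSomething stays false until someone calls read()/skip()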
+
+ static class TokenizerSpec {
+ Tokenizer tokenizer;
+ String toString;
+ boolean offsetsAreCorrect = true;
+ }
+
+ static class TokenFilterSpec {
+ TokenStream stream;
+ String toString;
+ boolean offsetsAreCorrect = true;
+ }
+
+ static class CharFilterSpec {
+ Reader reader;
+ String toString;
+ }
+
+ public void testRandomChains() throws Throwable {
+ int numIterations = atLeast(20);
+ for (int i = 0; i < numIterations; i++) {
+ MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
+ if (VERBOSE) {
+ System.out.println("Creating random analyzer:" + a);
+ }
+ try {
+ checkRandomData(random, a, 1000, 20, false,
+ false /* We already validate our own offsets... */);
+ } catch (Throwable e) {
+ System.err.println("Exception from random analyzer: " + a);
+ throw e;
+ }
+ }
+ }
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
index 0179b94..e3e8813 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
@@ -65,7 +65,11 @@
new String[] { "a", "b", "c", "" },
new int[] { 1, 0, 1, 3 },
new int[] { 2, 1, 2, 3 },
- new int[] { 1, 1, 1, 1 });
+ null,
+ new int[] { 1, 1, 1, 1 },
+ null,
+ null,
+ false);
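+    // the added nulls skip the types/posLengths/finalOffset checks, and the
+    // trailing false disables the new offset-correctness validation, which the
+    // backwards-moving startOffsets here (1 then 0) would otherwise trip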
}
/**
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index 754116c..54e68ab 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -72,14 +72,16 @@
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 9, 5 },
- new int[] { 8, 12, 12 });
+ new int[] { 8, 12, 12 },
+ null, null, null, null, false);
wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 5, 5 },
- new int[] { 6, 6, 6 });
+ new int[] { 6, 6, 6 },
+ null, null, null, null, false);
}
@Test
@@ -123,7 +125,8 @@
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar"},
new int[] { 8, 12, 8 },
- new int[] { 11, 15, 15 });
+ new int[] { 11, 15, 15 },
+ null, null, null, null, false);
}
public void doSplit(final String input, String... output) throws Exception {
@@ -230,18 +233,27 @@
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- new int[] { 1, 1 });
+ null,
+ new int[] { 1, 1 },
+ null,
+ false);
/* only in this case, posInc of 2 ?! */
assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
new int[] { 0, 9, 12, 9 },
new int[] { 6, 12, 13, 13 },
- new int[] { 1, 1, 1, 0 });
+ null,
+ new int[] { 1, 1, 1, 0 },
+ null,
+ false);
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- new int[] { 1, 1, 1 });
+ null,
+ new int[] { 1, 1, 1 },
+ null,
+ false);
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@@ -258,24 +270,36 @@
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
new int[] { 0, 7, 16 },
new int[] { 6, 15, 20 },
- new int[] { 1, 10, 1 });
+ null,
+ new int[] { 1, 10, 1 },
+ null,
+ false);
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- new int[] { 1, 11 });
+ null,
+ new int[] { 1, 11 },
+ null,
+ false);
/* in this case, the increment of 10 from the "/" is carried over */
assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
new int[] { 0, 9, 12, 9 },
new int[] { 6, 12, 13, 13 },
- new int[] { 1, 11, 1, 0 });
+ null,
+ new int[] { 1, 11, 1, 0 },
+ null,
+ false);
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- new int[] { 1, 11, 1 });
+ null,
+ new int[] { 1, 11, 1 },
+ null,
+ false);
Analyzer a3 = new Analyzer() {
@Override
@@ -292,14 +316,20 @@
new String[] { "lucene", "solr", "lucenesolr" },
new int[] { 0, 7, 0 },
new int[] { 6, 11, 11 },
- new int[] { 1, 1, 0 });
+ null,
+ new int[] { 1, 1, 0 },
+ null,
+ false);
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr",
new String[] { "lucene", "solr", "lucenesolr" },
new int[] { 4, 11, 4 },
new int[] { 10, 15, 15 },
- new int[] { 2, 1, 0 });
+ null,
+ new int[] { 2, 1, 0 },
+ null,
+ false);
}
/** blast some random strings through the analyzer */
@@ -322,7 +352,7 @@
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
- checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
}
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
index e8e7f6c..adb8870 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
@@ -94,7 +94,15 @@
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
+ assertTokenStreamContents(tokenizer,
+ new String[]{"e","de","cde"},
+ new int[]{4,3,2},
+ new int[]{5,5,5},
+ null,
+ null,
+ null,
+ null,
+ false);
}
public void testSmallTokenInStream() throws Exception {
@@ -151,7 +159,7 @@
new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
}
};
- checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
}
public void testEmptyTerm() throws Exception {
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
index 90611a1..158c603 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
@@ -90,7 +90,7 @@
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */);
+ assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, 5 /* abcde */, false);
}
public void testReset() throws Exception {
@@ -109,8 +109,8 @@
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
- checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
- checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
+ checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
Analyzer b = new Analyzer() {
@Override
@@ -119,7 +119,7 @@
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
- checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
- checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
+ checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
+ checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192, false, false);
}
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
index 3375c02..f5f3071 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
@@ -77,7 +77,8 @@
assertTokenStreamContents(filter,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
- new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
+ new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+ null, null, null, null, false
);
}
@@ -130,7 +131,7 @@
new NGramTokenFilter(tokenizer, 2, 15));
}
};
- checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
}
public void testEmptyTerm() throws Exception {
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
index 9dd3c65..86a9782 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
@@ -73,7 +73,11 @@
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
- 5 /* abcde */
+ null,
+ null,
+ null,
+ 5 /* abcde */,
+ false
);
}
@@ -98,7 +102,7 @@
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
- checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
- checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
+ checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
}
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
index 36bc262..7791fb4 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
@@ -142,14 +142,16 @@
}
}
+ /** for testing purposes ONLY */
+  public static final String SNOWBALL_LANGS[] = {
+ "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
+ "Finnish", "French", "German2", "German", "Hungarian", "Irish",
+ "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
+ "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
+ };
+
public void testEmptyTerm() throws IOException {
- String langs[] = {
- "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
- "Finnish", "French", "German2", "German", "Hungarian", "Irish",
- "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
- "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
- };
- for (final String lang : langs) {
+ for (final String lang : SNOWBALL_LANGS) {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {