| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis; |
| |
| import java.io.IOException; |
| import java.io.PrintWriter; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.io.StringWriter; |
| import java.io.Writer; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.Paths; |
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
| import java.util.concurrent.CountDownLatch; |
| |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.IndexOptions; |
| import org.apache.lucene.index.IndexableFieldType; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.Attribute; |
| import org.apache.lucene.util.AttributeFactory; |
| import org.apache.lucene.util.AttributeImpl; |
| import org.apache.lucene.util.AttributeReflector; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.Rethrow; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.AutomatonTestUtil; |
| import org.apache.lucene.util.fst.Util; |
| |
| /** |
| * Base class for all Lucene unit tests that use TokenStreams. |
| * <p> |
| * When writing unit tests for analysis components, it's highly recommended |
| * to use the helper methods here (especially in conjunction with {@link MockAnalyzer} or |
| * {@link MockTokenizer}), as they contain many assertions and checks to |
| * catch bugs. |
| * |
| * @see MockAnalyzer |
| * @see MockTokenizer |
| */ |
| public abstract class BaseTokenStreamTestCase extends LuceneTestCase { |
| // some helpers to test Analyzers and TokenStreams: |
| |
| /** |
| * Attribute that records if it was cleared or not. This is used |
| * for testing that clearAttributes() was called correctly. |
| */ |
| public static interface CheckClearAttributesAttribute extends Attribute { |
| boolean getAndResetClearCalled(); |
| } |
| |
| /** |
| * Attribute that records if it was cleared or not. This is used |
| * for testing that clearAttributes() was called correctly. |
| */ |
| public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute { |
| private boolean clearCalled = false; |
| |
| @Override |
| public boolean getAndResetClearCalled() { |
| try { |
| return clearCalled; |
| } finally { |
| clearCalled = false; |
| } |
| } |
| |
| @Override |
| public void clear() { |
| clearCalled = true; |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| return ( |
| other instanceof CheckClearAttributesAttributeImpl && |
| ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled |
| ); |
| } |
| |
| @Override |
| public int hashCode() { |
| return 76137213 ^ Boolean.valueOf(clearCalled).hashCode(); |
| } |
| |
| @Override |
| public void copyTo(AttributeImpl target) { |
| ((CheckClearAttributesAttributeImpl) target).clear(); |
| } |
| |
| @Override |
| public void reflectWith(AttributeReflector reflector) { |
| reflector.reflect(CheckClearAttributesAttribute.class, "clearCalled", clearCalled); |
| } |
| } |
| |
  // graphOffsetsAreCorrect validates:
  // - graph offsets are correct (all tokens leaving from
  //   pos X have the same startOffset; all tokens
  //   arriving at pos Y have the same endOffset)
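  // For example, if "wi-fi" produces the tokens wifi (posLength=2), wi and fi,
  // then wifi and wi both leave pos 0 and must share a startOffset, while
  // wifi and fi both arrive at pos 2 and must share an endOffset.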
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], |
| int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts, |
| boolean graphOffsetsAreCorrect, byte[][] payloads, int[] flags) throws IOException { |
| assertNotNull(output); |
| CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class); |
| |
| CharTermAttribute termAtt = null; |
| if (output.length > 0) { |
| assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class)); |
| termAtt = ts.getAttribute(CharTermAttribute.class); |
| } |
| |
| OffsetAttribute offsetAtt = null; |
| if (startOffsets != null || endOffsets != null || finalOffset != null) { |
| assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class)); |
| offsetAtt = ts.getAttribute(OffsetAttribute.class); |
| } |
| |
| TypeAttribute typeAtt = null; |
| if (types != null) { |
| assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class)); |
| typeAtt = ts.getAttribute(TypeAttribute.class); |
| } |
| |
| PositionIncrementAttribute posIncrAtt = null; |
| if (posIncrements != null || finalPosInc != null) { |
| assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class)); |
| posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class); |
| } |
| |
| PositionLengthAttribute posLengthAtt = null; |
| if (posLengths != null) { |
| assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class)); |
| posLengthAtt = ts.getAttribute(PositionLengthAttribute.class); |
| } |
| |
| KeywordAttribute keywordAtt = null; |
| if (keywordAtts != null) { |
| assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class)); |
| keywordAtt = ts.getAttribute(KeywordAttribute.class); |
| } |
| |
| PayloadAttribute payloadAtt = null; |
| if (payloads != null) { |
| assertTrue("has no PayloadAttribute", ts.hasAttribute(PayloadAttribute.class)); |
| payloadAtt = ts.getAttribute(PayloadAttribute.class); |
| } |
| |
| FlagsAttribute flagsAtt = null; |
| if (flags != null) { |
| assertTrue("has no FlagsAttribute", ts.hasAttribute(FlagsAttribute.class)); |
| flagsAtt = ts.getAttribute(FlagsAttribute.class); |
| } |
| |
| // Maps position to the start/end offset: |
| final Map<Integer,Integer> posToStartOffset = new HashMap<>(); |
| final Map<Integer,Integer> posToEndOffset = new HashMap<>(); |
| |
| // TODO: would be nice to be able to assert silly duplicated tokens are not created, but a number of cases do this "legitimately": LUCENE-7622 |
| |
| ts.reset(); |
| int pos = -1; |
| int lastStartOffset = 0; |
| for (int i = 0; i < output.length; i++) { |
      // extra safety: enforce that state is not preserved, by assigning bogus values
| ts.clearAttributes(); |
| termAtt.setEmpty().append("bogusTerm"); |
| if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243); |
| if (typeAtt != null) typeAtt.setType("bogusType"); |
| if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657); |
| if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653); |
| if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0); |
| if (payloadAtt != null) payloadAtt.setPayload(new BytesRef(new byte[] { 0x00, -0x21, 0x12, -0x43, 0x24 })); |
| if (flagsAtt != null) flagsAtt.setFlags(~0); // all 1's |
| |
      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
| assertTrue("token "+i+" does not exist", ts.incrementToken()); |
| assertTrue("clearAttributes() was not called correctly in TokenStream chain at token " + i, checkClearAtt.getAndResetClearCalled()); |
| |
| assertEquals("term "+i, output[i], termAtt.toString()); |
| if (startOffsets != null) { |
| assertEquals("startOffset " + i + " term=" + termAtt, startOffsets[i], offsetAtt.startOffset()); |
| } |
| if (endOffsets != null) { |
| assertEquals("endOffset " + i + " term=" + termAtt, endOffsets[i], offsetAtt.endOffset()); |
| } |
| if (types != null) { |
| assertEquals("type " + i + " term=" + termAtt, types[i], typeAtt.type()); |
| } |
| if (posIncrements != null) { |
| assertEquals("posIncrement " + i + " term=" + termAtt, posIncrements[i], posIncrAtt.getPositionIncrement()); |
| } |
| if (posLengths != null) { |
| assertEquals("posLength " + i + " term=" + termAtt, posLengths[i], posLengthAtt.getPositionLength()); |
| } |
| if (keywordAtts != null) { |
| assertEquals("keywordAtt " + i + " term=" + termAtt, keywordAtts[i], keywordAtt.isKeyword()); |
| } |
| if (flagsAtt != null) { |
| assertEquals("flagsAtt " + i + " term=" + termAtt, flags[i], flagsAtt.getFlags()); |
| } |
| if (payloads != null) { |
| if (payloads[i] != null) { |
| assertEquals("payloads " + i, new BytesRef(payloads[i]), payloadAtt.getPayload()); |
| } else { |
| assertNull("payloads " + i, payloads[i]); |
| } |
| } |
| if (posIncrAtt != null) { |
| if (i == 0) { |
| assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1); |
| } else { |
| assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0); |
| } |
| } |
| if (posLengthAtt != null) { |
| assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1); |
| } |
| // we can enforce some basic things about a few attributes even if the caller doesn't check: |
| if (offsetAtt != null) { |
| final int startOffset = offsetAtt.startOffset(); |
| final int endOffset = offsetAtt.endOffset(); |
| if (finalOffset != null) { |
| assertTrue("startOffset (= " + startOffset + ") must be <= finalOffset (= " + finalOffset + ") term=" + termAtt, startOffset <= finalOffset.intValue()); |
| assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue() + " term=" + termAtt, |
| endOffset <= finalOffset.intValue()); |
| } |
| |
| assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset); |
| lastStartOffset = offsetAtt.startOffset(); |
| |
| if (graphOffsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) { |
        // Validate offset consistency in the graph, i.e.
        // all tokens leaving from a certain pos have the
        // same startOffset, and all tokens arriving at a
        // certain pos have the same endOffset:
| final int posInc = posIncrAtt.getPositionIncrement(); |
| pos += posInc; |
| |
| final int posLength = posLengthAtt.getPositionLength(); |
| |
| if (!posToStartOffset.containsKey(pos)) { |
| // First time we've seen a token leaving from this position: |
| posToStartOffset.put(pos, startOffset); |
| //System.out.println(" + s " + pos + " -> " + startOffset); |
| } else { |
| // We've seen a token leaving from this position |
| // before; verify the startOffset is the same: |
| //System.out.println(" + vs " + pos + " -> " + startOffset); |
| assertEquals(i + " inconsistent startOffset: pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset); |
| } |
| |
| final int endPos = pos + posLength; |
| |
| if (!posToEndOffset.containsKey(endPos)) { |
          // First time we've seen a token arriving at this position:
| posToEndOffset.put(endPos, endOffset); |
| //System.out.println(" + e " + endPos + " -> " + endOffset); |
| } else { |
          // We've seen a token arriving at this position
| // before; verify the endOffset is the same: |
| //System.out.println(" + ve " + endPos + " -> " + endOffset); |
| assertEquals("inconsistent endOffset " + i + " pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset); |
| } |
| } |
| } |
| } |
| |
| if (ts.incrementToken()) { |
| fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + ts.getAttribute(CharTermAttribute.class)); |
| } |
| |
| // repeat our extra safety checks for end() |
| ts.clearAttributes(); |
| if (termAtt != null) termAtt.setEmpty().append("bogusTerm"); |
| if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243); |
| if (typeAtt != null) typeAtt.setType("bogusType"); |
| if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657); |
| if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653); |
| if (keywordAtt != null) keywordAtt.setKeyword(true); |
| if (payloadAtt != null) payloadAtt.setPayload(new BytesRef(new byte[] { 0x00, -0x21, 0x12, -0x43, 0x24 })); |
| if (flagsAtt != null) flagsAtt.setFlags(~0); // all 1's |
| |
    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
| |
| ts.end(); |
| assertTrue("super.end()/clearAttributes() was not called correctly in end()", checkClearAtt.getAndResetClearCalled()); |
| |
| if (finalOffset != null) { |
| assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset()); |
| } |
| if (offsetAtt != null) { |
| assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0); |
| } |
| if (finalPosInc != null) { |
| assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement()); |
| } |
| |
| ts.close(); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], |
| int posLengths[], Integer finalOffset, boolean[] keywordAtts, |
| boolean graphOffsetsAreCorrect) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, graphOffsetsAreCorrect, null, null); |
| } |

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
| int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts, |
| boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, finalPosInc, keywordAtts, graphOffsetsAreCorrect, payloads, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean graphOffsetsAreCorrect) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, graphOffsetsAreCorrect); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int[] posLengths) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException { |
| assertTokenStreamContents(ts, output, null, null, null, null, null, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException { |
| assertTokenStreamContents(ts, output, null, null, types, null, null, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException { |
| assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset); |
| } |
| |
| public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException { |
| assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { |
| assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length()); |
| checkResetException(a, input); |
| checkAnalysisConsistency(random(), a, true, input); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException { |
| assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length()); |
| checkResetException(a, input); |
| checkAnalysisConsistency(random(), a, true, input); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException { |
| assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect); |
| checkResetException(a, input); |
| checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException { |
| assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads); |
| checkResetException(a, input); |
| checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException { |
| assertAnalyzesTo(a, input, output, null, null, null, null, null); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException { |
| assertAnalyzesTo(a, input, output, null, null, types, null, null); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException { |
| assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null); |
| } |
| |
| public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException { |
| assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths); |
| } |
| |
| public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, String[] types, int[] posIncrements, int[] posLengths) throws IOException { |
| assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException { |
| assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null); |
| } |
| |
| public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException { |
| assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null); |
| } |
| |
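  /**
   * Asserts that the analyzer's TokenStream enforces its workflow contract:
   * calling incrementToken() before reset() must throw IllegalStateException,
   * and obtaining a new TokenStream while a previous one was never close()d
   * must fail as well.
   */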
| public static void checkResetException(Analyzer a, String input) throws IOException { |
| TokenStream ts = a.tokenStream("bogus", input); |
| try { |
| if (ts.incrementToken()) { |
| //System.out.println(ts.reflectAsString(false)); |
| fail("didn't get expected exception when reset() not called"); |
| } |
| } catch (IllegalStateException expected) { |
| // ok |
| } catch (Exception unexpected) { |
| unexpected.printStackTrace(System.err); |
| fail("got wrong exception when reset() not called: " + unexpected); |
| } finally { |
| // consume correctly |
| ts.reset(); |
| while (ts.incrementToken()) { } |
| ts.end(); |
| ts.close(); |
| } |
| |
| // check for a missing close() |
| ts = a.tokenStream("bogus", input); |
| ts.reset(); |
| while (ts.incrementToken()) {} |
| ts.end(); |
| try { |
| ts = a.tokenStream("bogus", input); |
| fail("didn't get expected exception when close() not called"); |
| } catch (IllegalStateException expected) { |
| // ok |
| } finally { |
| ts.close(); |
| } |
| } |
| |
  /** Simple utility method for testing stemmers: asserts that analyzing the input produces exactly the single expected term. */
| public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException { |
| assertAnalyzesTo(a, input, new String[]{expected}); |
| } |
| |
| /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */ |
| public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException { |
| checkRandomData(random, a, iterations, 20, false, true); |
| } |
| |
| /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */ |
| public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException { |
| checkRandomData(random, a, iterations, maxWordLength, false, true); |
| } |
| |
| /** |
| * utility method for blasting tokenstreams with data to make sure they don't do anything crazy |
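   * <p>
   * A minimal sketch (MockAnalyzer here is just an example; any analyzer works):
   * <pre>
   * checkRandomData(random(), new MockAnalyzer(random()), 1000, false);
   * </pre>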
   * @param simple true if only ASCII strings will be used (avoid if possible)
| */ |
| public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException { |
| checkRandomData(random, a, iterations, 20, simple, true); |
| } |
| |
| /** Asserts that the given stream has expected number of tokens. */ |
| public static void assertStreamHasNumberOfTokens(TokenStream ts, int expectedCount) throws IOException { |
| ts.reset(); |
| int count = 0; |
| while (ts.incrementToken()) { |
| count++; |
| } |
| ts.end(); |
| assertEquals("wrong number of tokens", expectedCount, count); |
| } |
| |
| static class AnalysisThread extends Thread { |
| final int iterations; |
| final int maxWordLength; |
| final long seed; |
| final Analyzer a; |
| final boolean useCharFilter; |
| final boolean simple; |
| final boolean graphOffsetsAreCorrect; |
| final RandomIndexWriter iw; |
| final CountDownLatch latch; |
| |
| // NOTE: not volatile because we don't want the tests to |
| // add memory barriers (ie alter how threads |
| // interact)... so this is just "best effort": |
| public boolean failed; |
| |
| AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) { |
| this.seed = seed; |
| this.a = a; |
| this.iterations = iterations; |
| this.maxWordLength = maxWordLength; |
| this.useCharFilter = useCharFilter; |
| this.simple = simple; |
| this.graphOffsetsAreCorrect = graphOffsetsAreCorrect; |
| this.iw = iw; |
| this.latch = latch; |
| } |
| |
| @Override |
| public void run() { |
| boolean success = false; |
| try { |
| latch.await(); |
        // see the part in checkRandomData where it replays the same text again
        // to verify reproducibility/reuse: hopefully this will catch thread hazards.
| checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw); |
| success = true; |
| } catch (Exception e) { |
| Rethrow.rethrow(e); |
| } finally { |
| failed = !success; |
| } |
| } |
| }; |
| |
| public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException { |
| checkRandomData(random, a, iterations, maxWordLength, simple, true); |
| } |
| |
| public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean graphOffsetsAreCorrect) throws IOException { |
| checkResetException(a, "best effort"); |
| long seed = random.nextLong(); |
| boolean useCharFilter = random.nextBoolean(); |
| Directory dir = null; |
| RandomIndexWriter iw = null; |
| final String postingsFormat = TestUtil.getPostingsFormat("dummy"); |
| boolean codecOk = iterations * maxWordLength < 100000 && !(postingsFormat.equals("SimpleText")); |
| if (rarely(random) && codecOk) { |
| dir = newFSDirectory(createTempDir("bttc")); |
| iw = new RandomIndexWriter(new Random(seed), dir, a); |
| } |
| boolean success = false; |
| try { |
| checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw); |
| // now test with multiple threads: note we do the EXACT same thing we did before in each thread, |
| // so this should only really fail from another thread if it's an actual thread problem |
| int numThreads = TestUtil.nextInt(random, 2, 4); |
| final CountDownLatch startingGun = new CountDownLatch(1); |
| AnalysisThread threads[] = new AnalysisThread[numThreads]; |
| for (int i = 0; i < threads.length; i++) { |
| threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw); |
| } |
| for (int i = 0; i < threads.length; i++) { |
| threads[i].start(); |
| } |
| startingGun.countDown(); |
| for (int i = 0; i < threads.length; i++) { |
| try { |
| threads[i].join(); |
| } catch (InterruptedException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| for (int i = 0; i < threads.length; i++) { |
| if (threads[i].failed) { |
| throw new RuntimeException("some thread(s) failed"); |
| } |
| } |
| if (iw != null) { |
| iw.close(); |
| } |
| success = true; |
| } finally { |
| if (success) { |
| IOUtils.close(dir); |
| } else { |
| IOUtils.closeWhileHandlingException(dir); // checkindex |
| } |
| } |
| } |
| |
| private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) throws IOException { |
| |
| Document doc = null; |
| Field field = null, currentField = null; |
| StringReader bogus = new StringReader(""); |
| if (iw != null) { |
| doc = new Document(); |
| FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); |
| if (random.nextBoolean()) { |
| ft.setStoreTermVectors(true); |
| ft.setStoreTermVectorOffsets(random.nextBoolean()); |
| ft.setStoreTermVectorPositions(random.nextBoolean()); |
| if (ft.storeTermVectorPositions()) { |
| ft.setStoreTermVectorPayloads(random.nextBoolean()); |
| } |
| } |
| if (random.nextBoolean()) { |
| ft.setOmitNorms(true); |
| } |
| switch(random.nextInt(4)) { |
| case 0: ft.setIndexOptions(IndexOptions.DOCS); break; |
| case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break; |
| case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break; |
| default: |
| ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| } |
| currentField = field = new Field("dummy", bogus, ft); |
| doc.add(currentField); |
| } |
| |
| for (int i = 0; i < iterations; i++) { |
| String text = TestUtil.randomAnalysisString(random, maxWordLength, simple); |
| |
| try { |
| checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, currentField); |
| if (iw != null) { |
| if (random.nextInt(7) == 0) { |
| // pile up a multivalued field |
| IndexableFieldType ft = field.fieldType(); |
| currentField = new Field("dummy", bogus, ft); |
| doc.add(currentField); |
| } else { |
| iw.addDocument(doc); |
| if (doc.getFields().size() > 1) { |
| // back to 1 field |
| currentField = field; |
| doc.removeFields("dummy"); |
| doc.add(currentField); |
| } |
| } |
| } |
| } catch (Throwable t) { |
| // TODO: really we should pass a random seed to |
| // checkAnalysisConsistency then print it here too: |
| System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + escape(text) + "'"); |
| Rethrow.rethrow(t); |
| } |
| } |
| } |
| |
| public static String escape(String s) { |
| int charUpto = 0; |
| final StringBuilder sb = new StringBuilder(); |
| while (charUpto < s.length()) { |
| final int c = s.charAt(charUpto); |
| if (c == 0xa) { |
      // Strangely, you cannot put \ u000A into Java
      // sources, not even in a comment or a string
      // constant (unicode escapes are decoded before parsing)...:
| sb.append("\\n"); |
| } else if (c == 0xd) { |
| // ... nor \ u000D: |
| sb.append("\\r"); |
| } else if (c == '"') { |
| sb.append("\\\""); |
| } else if (c == '\\') { |
| sb.append("\\\\"); |
| } else if (c >= 0x20 && c < 0x80) { |
| sb.append((char) c); |
| } else { |
| // TODO: we can make ascii easier to read if we |
| // don't escape... |
| sb.append(String.format(Locale.ROOT, "\\u%04x", c)); |
| } |
| charUpto++; |
| } |
| return sb.toString(); |
| } |
| |
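  /**
   * Analyzes the given text several times (including with an evil Reader that
   * throws mid-stream, and with only a subset of the tokens consumed) and then
   * verifies that a clean re-analysis reproduces exactly the tokens, types,
   * positions and offsets captured on the first pass.
   */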
| public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException { |
| checkAnalysisConsistency(random, a, useCharFilter, text, true); |
| } |
| |
| public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect) throws IOException { |
| checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, null); |
| } |
| |
| private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect, Field field) throws IOException { |
| |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text); |
| } |
| |
| int remainder = random.nextInt(10); |
| Reader reader = new StringReader(text); |
| TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader); |
| CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); |
| OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class); |
| PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class); |
| PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class); |
| TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class); |
| List<String> tokens = new ArrayList<>(); |
| List<String> types = new ArrayList<>(); |
| List<Integer> positions = new ArrayList<>(); |
| List<Integer> positionLengths = new ArrayList<>(); |
| List<Integer> startOffsets = new ArrayList<>(); |
| List<Integer> endOffsets = new ArrayList<>(); |
| ts.reset(); |
| |
| // First pass: save away "correct" tokens |
| while (ts.incrementToken()) { |
| assertNotNull("has no CharTermAttribute", termAtt); |
| tokens.add(termAtt.toString()); |
| if (typeAtt != null) types.add(typeAtt.type()); |
| if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement()); |
| if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength()); |
| if (offsetAtt != null) { |
| startOffsets.add(offsetAtt.startOffset()); |
| endOffsets.add(offsetAtt.endOffset()); |
| } |
| } |
| ts.end(); |
| ts.close(); |
| |
    // verify reusing is "reproducible" and also get the normal tokenstream sanity checks
| if (!tokens.isEmpty()) { |
| |
    // KeywordTokenizer (for example) can produce a token
    // even when input is length 0:
| if (text.length() != 0) { |
| |
| // (Optional) second pass: do something evil: |
| final int evilness = random.nextInt(50); |
| if (evilness == 17) { |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception"); |
| } |
| // Throw an errant exception from the Reader: |
| |
| MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text)); |
| evilReader.throwExcAfterChar(random.nextInt(text.length()+1)); |
| reader = evilReader; |
| |
| try { |
| // NOTE: some Tokenizers go and read characters |
| // when you call .setReader(Reader), eg |
| // PatternTokenizer. This is a bit |
| // iffy... (really, they should only |
| // pull from the Reader when you call |
          // .incrementToken(), I think?), but we
| // currently allow it, so, we must call |
| // a.tokenStream inside the try since we may |
| // hit the exc on init: |
| ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader); |
| ts.reset(); |
| while (ts.incrementToken()); |
| fail("did not hit exception"); |
| } catch (RuntimeException re) { |
| assertTrue(MockReaderWrapper.isMyEvilException(re)); |
| } |
| try { |
| ts.end(); |
| } catch (IllegalStateException ise) { |
| // Catch & ignore MockTokenizer's |
| // anger... |
| if (ise.getMessage().contains("end() called in wrong state=")) { |
| // OK |
| } else { |
| throw ise; |
| } |
| } |
| ts.close(); |
| } else if (evilness == 7) { |
| // Only consume a subset of the tokens: |
| final int numTokensToRead = random.nextInt(tokens.size()); |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens"); |
| } |
| |
| reader = new StringReader(text); |
| ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader); |
| ts.reset(); |
| for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) { |
| assertTrue(ts.incrementToken()); |
| } |
| try { |
| ts.end(); |
| } catch (IllegalStateException ise) { |
| // Catch & ignore MockTokenizer's |
| // anger... |
| if (ise.getMessage().contains("end() called in wrong state=")) { |
| // OK |
| } else { |
| throw ise; |
| } |
| } |
| ts.close(); |
| } |
| } |
| } |
| |
| // Final pass: verify clean tokenization matches |
| // results from first pass: |
| |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens"); |
| } |
| reader = new StringReader(text); |
| |
| long seed = random.nextLong(); |
| random = new Random(seed); |
| if (random.nextInt(30) == 7) { |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader"); |
| } |
| |
| reader = new MockReaderWrapper(random, reader); |
| } |
| |
| ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader); |
| if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) { |
| // offset + pos + posLength + type |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| types.toArray(new String[types.size()]), |
| toIntArray(positions), |
| toIntArray(positionLengths), |
| text.length(), |
| graphOffsetsAreCorrect); |
| } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) { |
| // offset + pos + type |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| types.toArray(new String[types.size()]), |
| toIntArray(positions), |
| null, |
| text.length(), |
| graphOffsetsAreCorrect); |
| } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) { |
| // offset + pos + posLength |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| null, |
| toIntArray(positions), |
| toIntArray(positionLengths), |
| text.length(), |
| graphOffsetsAreCorrect); |
| } else if (posIncAtt != null && offsetAtt != null) { |
| // offset + pos |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| null, |
| toIntArray(positions), |
| null, |
| text.length(), |
| graphOffsetsAreCorrect); |
| } else if (offsetAtt != null) { |
| // offset |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| null, |
| null, |
| null, |
| text.length(), |
| graphOffsetsAreCorrect); |
| } else { |
| // terms only |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()])); |
| } |
| |
| a.normalize("dummy", text); |
| // TODO: what can we do besides testing that the above method does not throw? |
| |
| if (field != null) { |
| reader = new StringReader(text); |
| random = new Random(seed); |
| if (random.nextInt(30) == 7) { |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: indexing using spoon-feed reader"); |
| } |
| |
| reader = new MockReaderWrapper(random, reader); |
| } |
| |
| field.setReaderValue(useCharFilter ? new MockCharFilter(reader, remainder) : reader); |
| } |
| } |
| |
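  /** Returns a Graphviz dot representation of the token graph this analyzer produces for the given text. */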
| protected String toDot(Analyzer a, String inputText) throws IOException { |
| final StringWriter sw = new StringWriter(); |
| final TokenStream ts = a.tokenStream("field", inputText); |
| ts.reset(); |
| new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot(); |
| return sw.toString(); |
| } |
| |
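  /** Writes a Graphviz dot representation of this analyzer's token graph for the given text to the named local file. */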
| protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException { |
| Writer w = Files.newBufferedWriter(Paths.get(localFileName), StandardCharsets.UTF_8); |
| final TokenStream ts = a.tokenStream("field", inputText); |
| ts.reset(); |
| new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot(); |
| w.close(); |
| } |
| |
| private static int[] toIntArray(List<Integer> list) { |
| return list.stream().mapToInt(Integer::intValue).toArray(); |
| } |
| |
| protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException { |
| MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| mockTokenizer.setReader(input); |
| return mockTokenizer; |
| } |
| |
| protected static MockTokenizer whitespaceMockTokenizer(String input) throws IOException { |
| MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| mockTokenizer.setReader(new StringReader(input)); |
| return mockTokenizer; |
| } |
| |
| protected static MockTokenizer keywordMockTokenizer(Reader input) throws IOException { |
| MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false); |
| mockTokenizer.setReader(input); |
| return mockTokenizer; |
| } |
| |
| protected static MockTokenizer keywordMockTokenizer(String input) throws IOException { |
| MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false); |
| mockTokenizer.setReader(new StringReader(input)); |
| return mockTokenizer; |
| } |
| |
| /** Returns a random AttributeFactory impl */ |
| public static AttributeFactory newAttributeFactory(Random random) { |
| switch (random.nextInt(3)) { |
| case 0: |
| return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY; |
| case 1: |
| return Token.TOKEN_ATTRIBUTE_FACTORY; |
| case 2: |
| return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; |
| default: |
| throw new AssertionError("Please fix the Random.nextInt() call above"); |
| } |
| } |
| |
| /** Returns a random AttributeFactory impl */ |
| public static AttributeFactory newAttributeFactory() { |
| return newAttributeFactory(random()); |
| } |
| |
| private static String toString(Set<String> strings) { |
| List<String> stringsList = new ArrayList<>(strings); |
| Collections.sort(stringsList); |
| StringBuilder b = new StringBuilder(); |
| for(String s : stringsList) { |
| b.append(" "); |
| b.append(s); |
| b.append('\n'); |
| } |
| return b.toString(); |
| } |
| |
| /** |
| * Enumerates all accepted strings in the token graph created by the analyzer on the provided text, and then |
| * asserts that it's equal to the expected strings. |
| * Uses {@link TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are all |
| * and only the given valid strings. |
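   * <p>
   * For example, with a hypothetical analyzer whose synonym filter maps "usa" to
   * "united states", one could assert:
   * <pre>
   * assertGraphStrings(analyzer, "usa is big", "usa is big", "united states is big");
   * </pre>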
   * @param analyzer analyzer producing the token graph under test (for example, one containing a SynonymFilter).
| * @param text text to be analyzed. |
| * @param expectedStrings all expected finite strings. |
| */ |
| public static void assertGraphStrings(Analyzer analyzer, String text, String... expectedStrings) throws IOException { |
| checkAnalysisConsistency(random(), analyzer, true, text, true); |
| try (TokenStream tokenStream = analyzer.tokenStream("dummy", text)) { |
| assertGraphStrings(tokenStream, expectedStrings); |
| } |
| } |
| |
| /** |
| * Enumerates all accepted strings in the token graph created by the already initialized {@link TokenStream}. |
| */ |
| public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings) throws IOException { |
| Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream); |
| Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1); |
| |
| Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings)); |
| |
| BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder(); |
| Set<String> actualStrings = new HashSet<>(); |
| for (IntsRef ir: actualStringPaths) { |
| actualStrings.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' ')); |
| } |
| for (String s : actualStrings) { |
| assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), expectedStringsSet.contains(s)); |
| } |
| for (String s : expectedStrings) { |
| assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), actualStrings.contains(s)); |
| } |
| } |
| |
  /** Returns all paths accepted by the token stream graph produced by analyzing text with the provided analyzer. The tokens'
   * {@link CharTermAttribute} values along each path are concatenated, separated by a single space. */
| public static Set<String> getGraphStrings(Analyzer analyzer, String text) throws IOException { |
| try(TokenStream tokenStream = analyzer.tokenStream("dummy", text)) { |
| return getGraphStrings(tokenStream); |
| } |
| } |
| |
| /** Returns all paths accepted by the token stream graph produced by the already initialized {@link TokenStream}. */ |
| public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException { |
| Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream); |
| Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1); |
| BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder(); |
| Set<String> paths = new HashSet<>(); |
| for (IntsRef ir: actualStringPaths) { |
| paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' ')); |
| } |
| return paths; |
| } |
| |
  /** Returns a {@code String} summary of the tokens this analyzer produces on this text, one token per line with its position and offsets */
| public static String toString(Analyzer analyzer, String text) throws IOException { |
| try(TokenStream ts = analyzer.tokenStream("field", text)) { |
| StringBuilder b = new StringBuilder(); |
| CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); |
| PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class); |
| PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class); |
| OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class); |
| assertNotNull(offsetAtt); |
| ts.reset(); |
| int pos = -1; |
| while (ts.incrementToken()) { |
| pos += posIncAtt.getPositionIncrement(); |
| b.append(termAtt); |
| b.append(" at pos="); |
| b.append(pos); |
| if (posLengthAtt != null) { |
| b.append(" to pos="); |
| b.append(pos + posLengthAtt.getPositionLength()); |
| } |
| b.append(" offsets="); |
| b.append(offsetAtt.startOffset()); |
| b.append('-'); |
| b.append(offsetAtt.endOffset()); |
| b.append('\n'); |
| } |
| ts.end(); |
| return b.toString(); |
| } |
| } |
| } |