| package org.apache.lucene.analysis; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.io.OutputStreamWriter; |
| import java.io.PrintWriter; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.io.StringWriter; |
| import java.io.Writer; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Random; |
| |
| import org.apache.lucene.analysis.tokenattributes.*; |
| import org.apache.lucene.util.Attribute; |
| import org.apache.lucene.util.AttributeImpl; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util._TestUtil; |
| |
| /** |
| * Base class for all Lucene unit tests that use TokenStreams. |
| * <p> |
 * When writing unit tests for analysis components, it's highly recommended
| * to use the helper methods here (especially in conjunction with {@link MockAnalyzer} or |
| * {@link MockTokenizer}), as they contain many assertions and checks to |
| * catch bugs. |
| * |
| * @see MockAnalyzer |
| * @see MockTokenizer |
| */ |
| public abstract class BaseTokenStreamTestCase extends LuceneTestCase { |
| // some helpers to test Analyzers and TokenStreams: |
| |
  /**
   * Attribute that records whether {@code clearAttributes()} has been called on the
   * stream's attribute source, so tests can verify that every TokenStream in a chain
   * clears its state before producing a new token.
   */
  public static interface CheckClearAttributesAttribute extends Attribute {
    // returns true if clear() was called since the last query, and resets the flag
    boolean getAndResetClearCalled();
  }
| |
| public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute { |
| private boolean clearCalled = false; |
| |
| public boolean getAndResetClearCalled() { |
| try { |
| return clearCalled; |
| } finally { |
| clearCalled = false; |
| } |
| } |
| |
| @Override |
| public void clear() { |
| clearCalled = true; |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| return ( |
| other instanceof CheckClearAttributesAttributeImpl && |
| ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled |
| ); |
| } |
| |
| @Override |
| public int hashCode() { |
| return 76137213 ^ Boolean.valueOf(clearCalled).hashCode(); |
| } |
| |
| @Override |
| public void copyTo(AttributeImpl target) { |
| ((CheckClearAttributesAttributeImpl) target).clear(); |
| } |
| } |
| |
  /**
   * Consumes {@code ts} and asserts, token by token, that the produced terms,
   * offsets, types, position increments and position lengths match the expected
   * arrays.  A {@code null} expectation array skips that check entirely; a
   * non-null array additionally asserts that the corresponding attribute exists
   * on the stream.  Also verifies the TokenStream workflow: clearAttributes()
   * must be called for every token, incrementToken() must return false after
   * the last expected token, and end() must leave endOffset == finalOffset
   * (when finalOffset is non-null).  The stream is closed before returning.
   *
   * @param finalOffset expected offset after end(), typically the input length; may be null
   * @throws IOException if the stream throws
   */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
    assertNotNull(output);
    // spy attribute: lets us detect whether clearAttributes() was invoked by the chain
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

    assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);

    // only look up optional attributes when the caller supplied expectations for them
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
      assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
      offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
      assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
      typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null) {
      assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
      posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    ts.reset();
    for (int i = 0; i < output.length; i++) {
      // extra safety to enforce, that the state is not preserved and also assign bogus values
      // (if incrementToken() fails to overwrite them, the assertions below will catch it)
      ts.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
      if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
      if (typeAtt != null) typeAtt.setType("bogusType");
      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);

      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() ourselves above
      assertTrue("token "+i+" does not exist", ts.incrementToken());
      assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());

      assertEquals("term "+i, output[i], termAtt.toString());
      if (startOffsets != null)
        assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
      if (endOffsets != null)
        assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
      if (types != null)
        assertEquals("type "+i, types[i], typeAtt.type());
      if (posIncrements != null)
        assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
      if (posLengths != null)
        assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());

      // we can enforce some basic things about a few attributes even if the caller doesn't check:
      if (offsetAtt != null) {
        assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
        assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
        assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
        if (finalOffset != null) {
          assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
          assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
                     offsetAtt.endOffset() <= finalOffset.intValue());
        }
      }
      if (posIncrAtt != null) {
        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
      }
      if (posLengthAtt != null) {
        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
      }
    }
    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
    ts.end();
    // after end(), endOffset must reflect the full consumed input
    if (finalOffset != null)
      assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
    if (offsetAtt != null) {
      assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    ts.close();
  }
| |
  // Convenience overloads: each delegates to the full assertTokenStreamContents
  // above, passing null for every expectation the caller does not supply
  // (a null array skips that check).

  /** Checks terms, offsets, types and position increments, plus final offset. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
  }

  /** Checks terms, offsets, types and position increments. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
  }

  /** Checks terms only. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
  }

  /** Checks terms and types. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
  }

  /** Checks terms and position increments. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
  }

  /** Checks terms and offsets. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
  }

  /** Checks terms and offsets, plus final offset. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
  }

  /** Checks terms, offsets and position increments. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
  }

  /** Checks terms, offsets and position increments, plus final offset. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
  }

  /** Checks terms, offsets, position increments and position lengths, plus final offset. */
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
  }
| |
  /**
   * Analyzes {@code input} with field name "dummy" and asserts the resulting
   * token stream via {@link #assertTokenStreamContents}; the final offset is
   * always enforced to equal {@code input.length()}.
   */
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
  }

  /** Same as above, additionally checking position lengths. */
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
  }

  // Convenience overloads passing null for unchecked expectations:

  /** Checks terms only (plus final offset). */
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, null, null);
  }

  /** Checks terms and types. */
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, null, null);
  }

  /** Checks terms and position increments. */
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
  }

  /** Checks terms, position increments and position lengths. */
  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
  }

  /** Checks terms and offsets. */
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
  }

  /** Checks terms, offsets and position increments. */
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
  }
| |
| |
  /**
   * "Reuse" variant of assertAnalyzesTo.
   * NOTE(review): this is currently byte-identical to assertAnalyzesTo — both call
   * a.tokenStream() — presumably because token stream reuse happens inside the
   * Analyzer itself in this codebase; confirm before removing the duplicate.
   */
  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
  }

  /** Checks terms only. */
  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
    assertAnalyzesToReuse(a, input, output, null, null, null, null);
  }

  /** Checks terms and types. */
  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, String[] types) throws IOException {
    assertAnalyzesToReuse(a, input, output, null, null, types, null);
  }

  /** Checks terms and position increments. */
  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesToReuse(a, input, output, null, null, null, posIncrements);
  }

  /** Checks terms and offsets. */
  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, null);
  }

  /** Checks terms, offsets and position increments. */
  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, posIncrements);
  }
| |
  // simple utility method for testing stemmers

  /** Asserts that analyzing {@code input} produces exactly one token, {@code expected}. */
  public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException {
    assertAnalyzesTo(a, input, new String[]{expected});
  }

  /** Reuse variant of {@link #checkOneTerm}. */
  public static void checkOneTermReuse(Analyzer a, final String input, final String expected) throws IOException {
    assertAnalyzesToReuse(a, input, new String[]{expected});
  }
| |
| // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy |
| // TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers. |
| public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException { |
| checkRandomData(random, a, iterations, 20); |
| // now test with multiple threads |
| int numThreads = _TestUtil.nextInt(random, 4, 8); |
| Thread threads[] = new Thread[numThreads]; |
| for (int i = 0; i < threads.length; i++) { |
| threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations); |
| } |
| for (int i = 0; i < threads.length; i++) { |
| threads[i].start(); |
| } |
| for (int i = 0; i < threads.length; i++) { |
| try { |
| threads[i].join(); |
| } catch (InterruptedException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| } |
| |
| static class AnalysisThread extends Thread { |
| final int iterations; |
| final Random random; |
| final Analyzer a; |
| |
| AnalysisThread(Random random, Analyzer a, int iterations) { |
| this.random = random; |
| this.a = a; |
| this.iterations = iterations; |
| } |
| |
| @Override |
| public void run() { |
| try { |
| // see the part in checkRandomData where it replays the same text again |
| // to verify reproducability/reuse: hopefully this would catch thread hazards. |
| checkRandomData(random, a, iterations, 20); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| }; |
| |
  /** As the 5-arg variant, randomly deciding whether to wrap the input in a MockCharFilter. */
  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
  }
| |
| public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException { |
| for (int i = 0; i < iterations; i++) { |
| String text; |
| switch(_TestUtil.nextInt(random, 0, 4)) { |
| case 0: |
| text = _TestUtil.randomSimpleString(random); |
| break; |
| case 1: |
| text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength); |
| break; |
| case 2: |
| text = _TestUtil.randomHtmlishString(random, maxWordLength); |
| break; |
| default: |
| text = _TestUtil.randomUnicodeString(random, maxWordLength); |
| } |
| |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text); |
| } |
| |
| int remainder = random.nextInt(10); |
| Reader reader = new StringReader(text); |
| TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader); |
| assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class)); |
| CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); |
| OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null; |
| PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null; |
| PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null; |
| TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null; |
| List<String> tokens = new ArrayList<String>(); |
| List<String> types = new ArrayList<String>(); |
| List<Integer> positions = new ArrayList<Integer>(); |
| List<Integer> positionLengths = new ArrayList<Integer>(); |
| List<Integer> startOffsets = new ArrayList<Integer>(); |
| List<Integer> endOffsets = new ArrayList<Integer>(); |
| ts.reset(); |
| while (ts.incrementToken()) { |
| tokens.add(termAtt.toString()); |
| if (typeAtt != null) types.add(typeAtt.type()); |
| if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement()); |
| if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength()); |
| if (offsetAtt != null) { |
| startOffsets.add(offsetAtt.startOffset()); |
| endOffsets.add(offsetAtt.endOffset()); |
| } |
| } |
| ts.end(); |
| ts.close(); |
| // verify reusing is "reproducable" and also get the normal tokenstream sanity checks |
| if (!tokens.isEmpty()) { |
| if (VERBOSE) { |
| System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens"); |
| } |
| reader = new StringReader(text); |
| ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader); |
| if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) { |
| // offset + pos + posLength + type |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| types.toArray(new String[types.size()]), |
| toIntArray(positions), |
| toIntArray(positionLengths), |
| text.length()); |
| } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) { |
| // offset + pos + type |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| types.toArray(new String[types.size()]), |
| toIntArray(positions), |
| null, |
| text.length()); |
| } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) { |
| // offset + pos + posLength |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| null, |
| toIntArray(positions), |
| toIntArray(positionLengths), |
| text.length()); |
| } else if (posIncAtt != null && offsetAtt != null) { |
| // offset + pos |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| null, |
| toIntArray(positions), |
| null, |
| text.length()); |
| } else if (offsetAtt != null) { |
| // offset |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()]), |
| toIntArray(startOffsets), |
| toIntArray(endOffsets), |
| null, |
| null, |
| null, |
| text.length()); |
| } else { |
| // terms only |
| assertTokenStreamContents(ts, |
| tokens.toArray(new String[tokens.size()])); |
| } |
| } |
| } |
| } |
| |
| protected String toDot(Analyzer a, String inputText) throws IOException { |
| final StringWriter sw = new StringWriter(); |
| final TokenStream ts = a.tokenStream("field", new StringReader(inputText)); |
| ts.reset(); |
| new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot(); |
| return sw.toString(); |
| } |
| |
| protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException { |
| Writer w = new OutputStreamWriter(new FileOutputStream(localFileName), "UTF-8"); |
| final TokenStream ts = a.tokenStream("field", new StringReader(inputText)); |
| ts.reset(); |
| new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot(); |
| w.close(); |
| } |
| |
| static int[] toIntArray(List<Integer> list) { |
| int ret[] = new int[list.size()]; |
| int offset = 0; |
| for (Integer i : list) { |
| ret[offset++] = i; |
| } |
| return ret; |
| } |
| } |