blob: 263fc8b7c1e2e8621e67701b530ee7fcf1cafc1b [file] [log] [blame]
package org.apache.lucene.analysis.sinks;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;
/**
* tests for the TestTeeSinkTokenFilter
*/
public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
protected StringBuilder buffer1;
protected StringBuilder buffer2;
protected String[] tokens1;
protected String[] tokens2;
@Override
public void setUp() throws Exception {
super.setUp();
tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
buffer1 = new StringBuilder();
for (int i = 0; i < tokens1.length; i++) {
buffer1.append(tokens1[i]).append(' ');
}
buffer2 = new StringBuilder();
for (int i = 0; i < tokens2.length; i++) {
buffer2.append(tokens2[i]).append(' ');
}
}
static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() {
@Override
public boolean accept(AttributeSource a) {
CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
return termAtt.toString().equalsIgnoreCase("The");
}
};
static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() {
@Override
public boolean accept(AttributeSource a) {
CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
return termAtt.toString().equalsIgnoreCase("Dogs");
}
};
// LUCENE-1448
// TODO: instead of testing it this way, we can test
// with BaseTokenStreamTestCase now...
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
Document doc = new Document();
TokenStream tokenStream = analyzer.tokenStream("field", new StringReader("abcd "));
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
TokenStream sink = tee.newSinkTokenStream();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
Field f1 = new Field("field", tee, ft);
Field f2 = new Field("field", sink, ft);
doc.add(f1);
doc.add(f2);
w.addDocument(doc);
w.close();
IndexReader r = IndexReader.open(dir);
Terms vector = r.getTermVectors(0).terms("field");
assertEquals(1, vector.getUniqueTermCount());
TermsEnum termsEnum = vector.iterator(null);
termsEnum.next();
assertEquals(2, termsEnum.totalTermFreq());
DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null, true);
assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(2, positions.freq());
positions.nextPosition();
assertEquals(0, positions.startOffset());
assertEquals(4, positions.endOffset());
positions.nextPosition();
assertEquals(8, positions.startOffset());
assertEquals(12, positions.endOffset());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
r.close();
dir.close();
}
public void testGeneral() throws IOException {
final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TokenStream sink1 = source.newSinkTokenStream();
final TokenStream sink2 = source.newSinkTokenStream(theFilter);
source.addAttribute(CheckClearAttributesAttribute.class);
sink1.addAttribute(CheckClearAttributesAttribute.class);
sink2.addAttribute(CheckClearAttributesAttribute.class);
assertTokenStreamContents(source, tokens1);
assertTokenStreamContents(sink1, tokens1);
assertTokenStreamContents(sink2, new String[]{"The", "the"});
}
public void testMultipleSources() throws Exception {
final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
tee1.reset();
final TokenStream source1 = new CachingTokenFilter(tee1);
tee1.addAttribute(CheckClearAttributesAttribute.class);
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);
final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
tee2.addSinkTokenStream(dogDetector);
tee2.addSinkTokenStream(theDetector);
final TokenStream source2 = tee2;
assertTokenStreamContents(source1, tokens1);
assertTokenStreamContents(source2, tokens2);
assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
source1.reset();
TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
String[] lowerCaseTokens = new String[tokens1.length];
for (int i = 0; i < tokens1.length; i++)
lowerCaseTokens[i] = tokens1[i].toLowerCase();
assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
/**
* Not an explicit test, just useful to print out some info on performance
*
* @throws Exception
*/
public void performance() throws Exception {
int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
for (int k = 0; k < tokCount.length; k++) {
StringBuilder buffer = new StringBuilder();
System.out.println("-----Tokens: " + tokCount[k] + "-----");
for (int i = 0; i < tokCount[k]; i++) {
buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
}
//make sure we produce the same tokens
TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
teeStream.consumeAllTokens();
TokenStream stream = new ModuloTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), 100);
CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
for (int i=0; stream.incrementToken(); i++) {
assertTrue(sink.incrementToken());
assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
}
//simulate two fields, each being analyzed once, for 20 documents
for (int j = 0; j < modCounts.length; j++) {
int tfPos = 0;
long start = System.currentTimeMillis();
for (int i = 0; i < 20; i++) {
stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString())));
PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
tfPos += posIncrAtt.getPositionIncrement();
}
stream = new ModuloTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), modCounts[j]);
posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
tfPos += posIncrAtt.getPositionIncrement();
}
}
long finish = System.currentTimeMillis();
System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
int sinkPos = 0;
//simulate one field with one sink
start = System.currentTimeMillis();
for (int i = 0; i < 20; i++) {
teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
while (teeStream.incrementToken()) {
sinkPos += posIncrAtt.getPositionIncrement();
}
//System.out.println("Modulo--------");
posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
while (sink.incrementToken()) {
sinkPos += posIncrAtt.getPositionIncrement();
}
}
finish = System.currentTimeMillis();
System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
}
System.out.println("- End Tokens: " + tokCount[k] + "-----");
}
}
class ModuloTokenFilter extends TokenFilter {
int modCount;
ModuloTokenFilter(TokenStream input, int mc) {
super(input);
modCount = mc;
}
int count = 0;
//return every 100 tokens
@Override
public boolean incrementToken() throws IOException {
boolean hasNext;
for (hasNext = input.incrementToken();
hasNext && count % modCount != 0;
hasNext = input.incrementToken()) {
count++;
}
count++;
return hasNext;
}
}
class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter {
int count = 0;
int modCount;
ModuloSinkFilter(int mc) {
modCount = mc;
}
@Override
public boolean accept(AttributeSource a) {
boolean b = (a != null && count % modCount == 0);
count++;
return b;
}
}
}