using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Globalization;
using System.IO;
using System.Text;
using Console = Lucene.Net.Util.SystemConsole;
namespace Lucene.Net.Analysis.Sinks
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests for <see cref="TeeSinkTokenFilter"/>.
    /// </summary>
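    /// <remarks>
    /// A <see cref="TeeSinkTokenFilter"/> lets a single token stream feed one or more
    /// "sink" streams, so a field can be analyzed once and its tokens reused. A minimal
    /// sketch of the pattern exercised below (identifiers as used in these tests):
    /// <code>
    /// var tee = new TeeSinkTokenFilter(tokenizer);          // wraps the source stream
    /// var everyToken = tee.NewSinkTokenStream();            // sink receiving all tokens
    /// var theTokens = tee.NewSinkTokenStream(theFilter);    // sink receiving only accepted tokens
    /// </code>
    /// </remarks>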
    public class TestTeeSinkTokenFilter : BaseTokenStreamTestCase
    {
        protected internal StringBuilder buffer1;
        protected internal StringBuilder buffer2;
        protected internal string[] tokens1;
        protected internal string[] tokens2;

        public override void SetUp()
        {
            base.SetUp();
            tokens1 = new string[] { "The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs" };
            tokens2 = new string[] { "The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch" };
            buffer1 = new StringBuilder();
            for (int i = 0; i < tokens1.Length; i++)
            {
                buffer1.Append(tokens1[i]).Append(' ');
            }
            buffer2 = new StringBuilder();
            for (int i = 0; i < tokens2.Length; i++)
            {
                buffer2.Append(tokens2[i]).Append(' ');
            }
        }
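
        // Sink filter that accepts only tokens equal to "The" (case-insensitive).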
        internal static readonly TeeSinkTokenFilter.SinkFilter theFilter = new SinkFilterAnonymousInnerClassHelper();

        private sealed class SinkFilterAnonymousInnerClassHelper : TeeSinkTokenFilter.SinkFilter
        {
            public override bool Accept(AttributeSource a)
            {
                ICharTermAttribute termAtt = a.GetAttribute<ICharTermAttribute>();
                return termAtt.ToString().Equals("The", StringComparison.OrdinalIgnoreCase);
            }
        }
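
        // Sink filter that accepts only tokens equal to "Dogs" (case-insensitive).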
        internal static readonly TeeSinkTokenFilter.SinkFilter dogFilter = new SinkFilterAnonymousInnerClassHelper2();

        private sealed class SinkFilterAnonymousInnerClassHelper2 : TeeSinkTokenFilter.SinkFilter
        {
            public override bool Accept(AttributeSource a)
            {
                ICharTermAttribute termAtt = a.GetAttribute<ICharTermAttribute>();
                return termAtt.ToString().Equals("Dogs", StringComparison.OrdinalIgnoreCase);
            }
        }

        // LUCENE-1448
        // TODO: instead of testing it this way, we can test
        // with BaseTokenStreamTestCase now...
        [Test]
        public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter()
        {
            Store.Directory dir = NewDirectory();
            Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false);
            IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            Document doc = new Document();
            // NOTE: the trailing spaces matter - the value is 7 chars long, so the second
            // field instance's offsets start at 8 (7 + the default offset gap of 1),
            // matching the assertions below.
            TokenStream tokenStream = analyzer.GetTokenStream("field", "abcd   ");
            TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
            TokenStream sink = tee.NewSinkTokenStream();
            FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
            ft.StoreTermVectors = true;
            ft.StoreTermVectorOffsets = true;
            ft.StoreTermVectorPositions = true;
            Field f1 = new Field("field", tee, ft);
            Field f2 = new Field("field", sink, ft);
            doc.Add(f1);
            doc.Add(f2);
            w.AddDocument(doc);
            w.Dispose();

            IndexReader r = DirectoryReader.Open(dir);
            Terms vector = r.GetTermVectors(0).GetTerms("field");
            assertEquals(1, vector.Count);
            TermsEnum termsEnum = vector.GetEnumerator();
            termsEnum.MoveNext();
            assertEquals(2, termsEnum.TotalTermFreq);
            DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);
            assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(2, positions.Freq);
            positions.NextPosition();
            assertEquals(0, positions.StartOffset);
            assertEquals(4, positions.EndOffset);
            positions.NextPosition();
            assertEquals(8, positions.StartOffset);
            assertEquals(12, positions.EndOffset);
            assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
            r.Dispose();
            dir.Dispose();
        }
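
        // A sink with no filter replays every token from the tee; a filtered sink
        // replays only the tokens its SinkFilter accepted.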
        [Test]
        public virtual void TestGeneral()
        {
            TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
            TokenStream sink1 = source.NewSinkTokenStream();
            TokenStream sink2 = source.NewSinkTokenStream(theFilter);

            source.AddAttribute<ICheckClearAttributesAttribute>();
            sink1.AddAttribute<ICheckClearAttributesAttribute>();
            sink2.AddAttribute<ICheckClearAttributesAttribute>();

            AssertTokenStreamContents(source, tokens1);
            AssertTokenStreamContents(sink1, tokens1);
            AssertTokenStreamContents(sink2, new string[] { "The", "the" });
        }
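
        // A sink can be attached to more than one tee (via AddSinkTokenStream), so it
        // accumulates the accepted tokens from every source that feeds it.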
        [Test]
        public virtual void TestMultipleSources()
        {
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
            tee1.Reset();
            TokenStream source1 = new CachingTokenFilter(tee1);

            tee1.AddAttribute<ICheckClearAttributesAttribute>();
            dogDetector.AddAttribute<ICheckClearAttributesAttribute>();
            theDetector.AddAttribute<ICheckClearAttributesAttribute>();

            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
            tee2.AddSinkTokenStream(dogDetector);
            tee2.AddSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            AssertTokenStreamContents(source1, tokens1);
            AssertTokenStreamContents(source2, tokens2);

            AssertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
            AssertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
            string[] lowerCaseTokens = new string[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
            {
                lowerCaseTokens[i] = CultureInfo.InvariantCulture.TextInfo.ToLower(tokens1[i]);
            }
            AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
        }

        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance.
        /// </summary>
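        /// <remarks>
        /// Compares two strategies for producing a full token stream plus a sparser
        /// "every n-th token" stream: (a) analyzing the text twice, once plain and once
        /// through <see cref="ModuloTokenFilter"/>, versus (b) analyzing it once through a
        /// <see cref="TeeSinkTokenFilter"/> with a <see cref="ModuloSinkFilter"/> sink.
        /// Position-increment sums are accumulated for both and asserted equal.
        /// </remarks>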
        public virtual void Performance()
        {
            int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                StringBuilder buffer = new StringBuilder();
                Console.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    //buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' ');
                    buffer.Append(i.ToString(CultureInfo.InvariantCulture)).Append(' ');
                }

                // make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(100));
                teeStream.ConsumeAllTokens();
                TokenStream stream = new ModuloTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100);
                ICharTermAttribute tfTok = stream.AddAttribute<ICharTermAttribute>();
                ICharTermAttribute sinkTok = sink.AddAttribute<ICharTermAttribute>();
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    assertTrue(sink.IncrementToken());
                    assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok) == true);
                }

                // simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    long start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())));
                        IPositionIncrementAttribute posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream = new ModuloTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    long finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

                    int sinkPos = 0;
                    // simulate one field with one sink
                    start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                        sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
                        IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute<IPositionIncrementAttribute>();
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        posIncrAtt = sink.GetAttribute<IPositionIncrementAttribute>();
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
                }
                Console.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }
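
        // Passes through only every modCount-th token of the wrapped stream;
        // used by Performance() as the "analyze twice" baseline.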
        internal sealed class ModuloTokenFilter : TokenFilter
        {
            internal int modCount;

            internal ModuloTokenFilter(TokenStream input, int mc) : base(input)
            {
                modCount = mc;
            }

            internal int count = 0;

            // return every modCount-th token
            public override sealed bool IncrementToken()
            {
                bool hasNext;
                for (hasNext = m_input.IncrementToken(); hasNext && count % modCount != 0; hasNext = m_input.IncrementToken())
                {
                    count++;
                }
                count++;
                return hasNext;
            }
        }
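
        // Sink-side counterpart of ModuloTokenFilter: accepts every modCount-th
        // token offered to the sink.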
        internal sealed class ModuloSinkFilter : TeeSinkTokenFilter.SinkFilter
        {
            internal int count = 0;
            internal int modCount;

            internal ModuloSinkFilter(int mc)
            {
                modCount = mc;
            }

            public override bool Accept(AttributeSource a)
            {
                bool b = (a != null && count % modCount == 0);
                count++;
                return b;
            }
        }
    }
}