using ICU4N.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Benchmarks.ByTask.Feeds;
using Lucene.Net.Benchmarks.ByTask.Stats;
using Lucene.Net.Benchmarks.ByTask.Tasks;
using Lucene.Net.Collation;
using Lucene.Net.Facet.Taxonomy;
using Lucene.Net.Index;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Search;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
namespace Lucene.Net.Benchmarks.ByTask
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Tests, very simply, that perf tasks (simple algorithms) do what they should.
/// </summary>
[SuppressCodecs("Lucene3x")]
public class TestPerfTasksLogic : BenchmarkTestCase
{
//public override void SetUp()
//{
// base.SetUp();
// copyToWorkDir("reuters.first20.lines.txt");
// copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
//}
public override void BeforeClass()
{
base.BeforeClass();
copyToWorkDir("reuters.first20.lines.txt");
copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
}
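// A rough primer on the .alg mini-language used throughout these tests
// (the Benchmarks.ByTask package documentation is authoritative):
//
//   { TaskA TaskB } : N   - run the sequence sequentially, N times
//   [ TaskA TaskB ] : N   - run N copies of the sequence in parallel threads
//   { Task } : *          - repeat until the content source is exhausted
//   { Task } : .5s        - repeat for the given amount of time
//   { "Name" Task ... }   - name a sequence so it shows up in reports
//
// Closing a sequence with '>' instead of '}' or ']' keeps its sub-tasks
// out of the reports.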
/// <summary>
/// Test index creation logic.
/// </summary>
[Test]
public void TestIndexAndSearchTasks()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : 200",
"CloseReader",
"[ CountingSearchTest > : 70",
"[ CountingSearchTest > : 9",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
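// 200 sequential searches plus 70 + 9 parallel searches = 279 in total.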
assertEquals("TestSearchTask was supposed to be called!", 279, CountingSearchTestTask.numSearches);
assertTrue("Index does not exist?...!", DirectoryReader.IndexExists(benchmark.RunData.Directory));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.RunData.Directory,
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
.SetOpenMode(OpenMode.APPEND));
iw.Dispose();
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
assertEquals("1000 docs were added to the index, this is what we expect to find!", 1000, ir.NumDocs);
ir.Dispose();
}
/// <summary>
/// Test timed sequence task.
/// </summary>
[Test]
public void TestTimedSearchTask()
{
String[] algLines = {
"log.step=100000",
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 100",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : .5s",
"CloseReader",
};
CountingSearchTestTask.numSearches = 0;
execBenchmark(algLines);
assertTrue(CountingSearchTestTask.numSearches > 0);
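// the alg ran CountingSearchTest for 0.5s; allow generous headroom for slow machines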
long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
}
// disabled until we fix BG thread prio -- this test
// causes build to hang
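// (in the alg below, '&' appears to run the parallel sequence in the background,
// with '-1' as a thread-priority delta)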
[Test]
public void TestBGSearchTaskThreads()
{
String[] algLines = {
"log.time.step.msec = 100",
"log.step=100000",
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{",
" [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1",
" Wait(0.5)",
"}",
"CloseReader",
"RepSumByPref X"
};
CountingSearchTestTask.numSearches = 0;
execBenchmark(algLines);
// NOTE: cannot assert this, because on a super-slow
// system, it could be after waiting 0.5 seconds that
// the search threads hadn't yet succeeded in starting
// up and then they start up and do no searching:
//assertTrue(CountingSearchTestTask.numSearches > 0);
}
[Test]
public void TestHighlighting()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"doc.stored=true",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"query.maker=" + typeof(ReutersQueryMaker).AssemblyQualifiedName,
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 100",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};
// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!", 92, CountingHighlighterTestTask.numDocsRetrieved);
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
//we probably should use a different doc/query maker, but...
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
assertTrue("Index does not exist?...!", DirectoryReader.IndexExists(benchmark.RunData.Directory));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.RunData.Directory, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetOpenMode(OpenMode.APPEND));
iw.Dispose();
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
assertEquals("100 docs were added to the index, this is what we expect to find!", 100, ir.NumDocs);
ir.Dispose();
}
[Test]
public void TestHighlightingTV()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"doc.stored=true",//doc storage is required in order to have text to highlight
"doc.term.vector=true",
"doc.term.vector.offsets=true",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"query.maker=" + typeof(ReutersQueryMaker).AssemblyQualifiedName,
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};
// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!", 92, CountingHighlighterTestTask.numDocsRetrieved);
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
//we probably should use a different doc/query maker, but...
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
assertTrue("Index does not exist?...!", DirectoryReader.IndexExists(benchmark.RunData.Directory));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.RunData.Directory, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetOpenMode(OpenMode.APPEND));
iw.Dispose();
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
assertEquals("1000 docs were added to the index, this is what we expect to find!", 1000, ir.NumDocs);
ir.Dispose();
}
[Test]
public void TestHighlightingNoTvNoStore()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"doc.stored=false",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"query.maker=" + typeof(ReutersQueryMaker).AssemblyQualifiedName,
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};
// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
try
{
Benchmark benchmark = execBenchmark(algLines);
assertNotNull(benchmark); // (avoid compile warning on unused variable)
}
catch (Exception)
{
return; // expected: we cannot highlight without stored fields (or term vectors)
}
// Note: fail() must sit outside the try block; inside it, its own
// AssertionException would be swallowed by the catch above.
fail("CountingHighlighterTest should have thrown an exception");
}
/// <summary>
/// Test exhausting content source logic.
/// </summary>
[Test]
public void TestExhaustContentSource()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.SingleDocSource, Lucene.Net.Benchmark",
"content.source.log.step=1",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
"# ----- alg ",
"CreateIndex",
"{ AddDoc } : * ",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : 100",
"CloseReader",
"[ CountingSearchTest > : 30",
"[ CountingSearchTest > : 9",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
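// 100 sequential searches plus 30 + 9 parallel searches = 139 in total.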
assertEquals("TestSearchTask was supposed to be called!", 139, CountingSearchTestTask.numSearches);
assertTrue("Index does not exist?...!", DirectoryReader.IndexExists(benchmark.RunData.Directory));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.RunData.Directory, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetOpenMode(OpenMode.APPEND));
iw.Dispose();
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
assertEquals("1 docs were added to the index, this is what we expect to find!", 1, ir.NumDocs);
ir.Dispose();
}
// LUCENE-1994: test thread safety of SortableSingleDocMaker
[Test]
public void TestDocMakerThreadSafety()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.SortableSingleDocSource, Lucene.Net.Benchmark",
"doc.term.vector=false",
"log.step.AddDoc=10000",
"content.source.forever=true",
"directory=RAMDirectory",
"doc.reuse.fields=false",
"doc.stored=false",
"doc.tokenized=false",
"doc.index.props=true",
"# ----- alg ",
"CreateIndex",
"[ { AddDoc > : 250 ] : 4",
"CloseIndex",
};
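// 4 parallel threads, each adding 250 docs: the index should end up with 1000 docs.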
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
DirectoryReader r = DirectoryReader.Open(benchmark.RunData.Directory);
SortedDocValues idx = FieldCache.DEFAULT.GetTermsIndex(SlowCompositeReaderWrapper.Wrap(r), "country");
int maxDoc = r.MaxDoc;
assertEquals(1000, maxDoc);
for (int i = 0; i < 1000; i++)
{
assertTrue("doc " + i + " has null country", idx.GetOrd(i) != -1);
}
r.Dispose();
}
/// <summary>
/// Test parallel doc maker logic (for LUCENE-940).
/// </summary>
[Test]
public void TestParallelDocMaker()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=FSDirectory",
"doc.stored=false",
"doc.tokenized=false",
"# ----- alg ",
"CreateIndex",
"[ { AddDoc } : * ] : 4 ",
"CloseIndex",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.NumDocs);
ir.Dispose();
}
/// <summary>
/// Test WriteLineDoc and LineDocSource.
/// </summary>
[Test]
public void TestLineDocFile()
{
FileInfo lineFile = CreateTempFile("test.reuters.lines", ".txt");
// We will call WriteLineDocs this many times
int NUM_TRY_DOCS = 50;
// Creates a line file with first 50 docs from SingleDocSource
String[] algLines1 = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.SingleDocSource, Lucene.Net.Benchmark",
"content.source.forever=true",
"line.file.out=" + lineFile.FullName.Replace('\\', '/'),
"# ----- alg ",
"{WriteLineDoc()}:" + NUM_TRY_DOCS,
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
TextReader r =
new StreamReader(
new FileStream(lineFile.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8);
int numLines = 0;
String line;
while ((line = r.ReadLine()) != null)
{
if (numLines == 0 && line.StartsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR, StringComparison.Ordinal))
{
continue; // do not count the header line as a doc
}
numLines++;
}
r.Dispose();
assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
// Index the line docs
String[] algLines2 = {
"# ----- properties ",
"analyzer=Lucene.Net.Analysis.Core.WhitespaceAnalyzer, Lucene.Net.Analysis.Common",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + lineFile.FullName.Replace('\\', '/'),
"content.source.forever=false",
"doc.reuse.fields=false",
"ram.flush.mb=4",
"# ----- alg ",
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: *",
"CloseIndex",
};
// Run algo
benchmark = execBenchmark(algLines2);
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.RunData.Directory,
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
.SetOpenMode(OpenMode.APPEND));
iw.Dispose();
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
assertEquals(numLines + " lines were created but " + ir.NumDocs + " docs are in the index", numLines, ir.NumDocs);
ir.Dispose();
lineFile.Delete();
}
/// <summary>
/// Test ReadTokensTask.
/// </summary>
[Test]
public void TestReadTokens()
{
// We will call ReadTokens on this many docs
int NUM_DOCS = 20;
// Read tokens from first NUM_DOCS docs from Reuters and
// then build index from the same docs
String[] algLines1 = {
"# ----- properties ",
"analyzer=Lucene.Net.Analysis.Core.WhitespaceAnalyzer, Lucene.Net.Analysis.Common",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"# ----- alg ",
"{ReadTokens}: " + NUM_DOCS,
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: " + NUM_DOCS,
"CloseIndex",
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
IList<TaskStats> stats = benchmark.RunData.Points.TaskStats;
// Count how many tokens all ReadTokens saw
int totalTokenCount1 = 0;
foreach (TaskStats stat in stats)
{
if (stat.Task.GetName().Equals("ReadTokens", StringComparison.Ordinal))
{
totalTokenCount1 += stat.Count;
}
}
// Separately count how many tokens are actually in the index:
IndexReader reader = DirectoryReader.Open(benchmark.RunData.Directory);
assertEquals(NUM_DOCS, reader.NumDocs);
int totalTokenCount2 = 0;
Fields fields = MultiFields.GetFields(reader);
foreach (String fieldName in fields)
{
if (fieldName.Equals(DocMaker.ID_FIELD, StringComparison.Ordinal) || fieldName.Equals(DocMaker.DATE_MSEC_FIELD, StringComparison.Ordinal) || fieldName.Equals(DocMaker.TIME_SEC_FIELD, StringComparison.Ordinal))
{
continue;
}
Terms terms = fields.GetTerms(fieldName);
if (terms == null)
{
continue;
}
TermsEnum termsEnum = terms.GetEnumerator();
DocsEnum docs = null;
while (termsEnum.MoveNext())
{
docs = TestUtil.Docs(Random, termsEnum, MultiFields.GetLiveDocs(reader), docs, DocsFlags.FREQS);
while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
{
totalTokenCount2 += docs.Freq;
}
}
}
reader.Dispose();
// Make sure they are the same
assertEquals(totalTokenCount1, totalTokenCount2);
}
/// <summary>
/// Test that "{ [ AddDoc ] : 4 } : *" works correctly (for LUCENE-941).
/// </summary>
[Test]
public void TestParallelExhausted()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"CreateIndex",
"{ [ AddDoc]: 4} : * ",
"ResetInputs ",
"{ [ AddDoc]: 4} : * ",
"WaitForMerges",
"CloseIndex",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
int ndocsExpected = 2 * 20; // first 20 reuters docs, indexed twice (ResetInputs rewinds the content source).
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.NumDocs);
ir.Dispose();
}
/// <summary>
/// Test that exhaust in loop works as expected (LUCENE-1115).
/// </summary>
[Test]
public void TestExhaustedLooped()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" WaitForMerges",
" CloseIndex",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.NumDocs);
ir.Dispose();
}
/// <summary>
/// Test that we can close the IndexWriter with argument "false".
/// </summary>
[Test]
public void TestCloseIndexFalse()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"ram.flush.mb=-1",
"max.buffered=2",
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" CloseIndex(false)",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.NumDocs);
ir.Dispose();
}
public class MyMergeScheduler : SerialMergeScheduler
{
internal bool called;
public MyMergeScheduler()
: base()
{
called = true;
}
}
/// <summary>
/// Test that we can set the merge scheduler.
/// </summary>
[Test]
public void TestMergeScheduler()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"merge.scheduler=" + typeof(MyMergeScheduler).AssemblyQualifiedName,
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue("did not use the specified MergeScheduler",
((MyMergeScheduler)benchmark.RunData.IndexWriter.Config
.MergeScheduler).called);
benchmark.RunData.IndexWriter.Dispose();
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.NumDocs);
ir.Dispose();
}
public class MyMergePolicy : LogDocMergePolicy
{
internal bool called;
public MyMergePolicy()
{
called = true;
}
}
/// <summary>
/// Test that we can set the merge policy.
/// </summary>
[Test]
public void TestMergePolicy()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"merge.policy=" + typeof(MyMergePolicy).AssemblyQualifiedName,
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue("did not use the specified MergePolicy", ((MyMergePolicy)benchmark.RunData.IndexWriter.Config.MergePolicy).called);
benchmark.RunData.IndexWriter.Dispose();
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.NumDocs);
ir.Dispose();
}
/// <summary>
/// Test that IndexWriter settings stick.
/// </summary>
[Test]
public void TestIndexWriterSettings()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"compound=cmpnd:true:false",
"doc.term.vector=vector:false:true",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"merge.factor=3",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
IndexWriter writer = benchmark.RunData.IndexWriter;
assertEquals(2, writer.Config.MaxBufferedDocs);
assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int)writer.Config.RAMBufferSizeMB);
assertEquals(3, ((LogMergePolicy)writer.Config.MergePolicy).MergeFactor);
assertEquals(0.0d, writer.Config.MergePolicy.NoCFSRatio, 0.0);
writer.Dispose();
Store.Directory dir = benchmark.RunData.Directory;
IndexReader reader = DirectoryReader.Open(dir);
Fields tfv = reader.GetTermVectors(0);
assertNotNull(tfv);
assertTrue(tfv.Count > 0);
reader.Dispose();
}
/// <summary>
/// Test indexing with facets tasks.
/// </summary>
[Test]
public void TestIndexingWithFacets()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=100",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"merge.factor=3",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"ResetSystemErase",
"CreateIndex",
"CreateTaxonomyIndex",
"{ \"AddDocs\" AddFacetedDoc > : * ",
"CloseIndex",
"CloseTaxonomyIndex",
"OpenTaxonomyReader",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
PerfRunData runData = benchmark.RunData;
assertNull("taxo writer was not properly closed", runData.TaxonomyWriter);
TaxonomyReader taxoReader = runData.GetTaxonomyReader();
assertNotNull("taxo reader was not opened", taxoReader);
assertTrue("nothing was added to the taxnomy (expecting root and at least one addtional category)", taxoReader.Count > 1);
taxoReader.Dispose();
}
/// <summary>
/// Test that we can call ForceMerge(maxNumSegments).
/// </summary>
[Test]
public void TestForceMerge()
{
// 1. alg definition (required in every "logic" test)
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"merge.policy=Lucene.Net.Index.LogDocMergePolicy, Lucene.Net",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" ForceMerge(3)",
" CloseIndex()",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.Open(benchmark.RunData.Directory);
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.NumDocs);
ir.Dispose();
// Make sure we have 3 segments:
SegmentInfos infos = new SegmentInfos();
infos.Read(benchmark.RunData.Directory);
assertEquals(3, infos.Count);
}
/// <summary>
/// Test disabling task count (LUCENE-1136).
/// </summary>
[Test]
public void TestDisableCounting()
{
doTestDisableCounting(true);
doTestDisableCounting(false);
}
private void doTestDisableCounting(bool disable)
{
// 1. alg definition (required in every "logic" test)
String[] algLines = disableCountingLines(disable);
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test counters
int n = disable ? 0 : 1;
int nChecked = 0;
foreach (TaskStats stats in benchmark.RunData.Points.TaskStats)
{
String taskName = stats.Task.GetName();
if (taskName.Equals("Rounds", StringComparison.Ordinal))
{
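// expected: the 20 AddDocs always count, plus CreateIndex and CloseIndex only when counting is enabled (n == 1)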
assertEquals("Wrong total count!", 20 + 2 * n, stats.Count);
nChecked++;
}
else if (taskName.Equals("CreateIndex", StringComparison.Ordinal))
{
assertEquals("Wrong count for CreateIndex!", n, stats.Count);
nChecked++;
}
else if (taskName.Equals("CloseIndex", StringComparison.Ordinal))
{
assertEquals("Wrong count for CloseIndex!", n, stats.Count);
nChecked++;
}
}
assertEquals("Missing some tasks to check!", 3, nChecked);
}
private String[] disableCountingLines(bool disable)
{
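// In the alg syntax, prefixing a task with '-' disables stat counting for that task.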
String dis = disable ? "-" : "";
return new String[] {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=30",
"doc.term.vector=false",
"content.source.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" "+dis+"CreateIndex", // optionally disable counting here
" { \"AddDocs\" AddDoc > : * ",
" "+dis+" CloseIndex", // optionally disable counting here (with extra blanks)
"}",
"RepSumByName",
};
}
/// <summary>
/// Test that we can change the Locale in the runData,
/// and that it is parsed as we expect.
/// </summary>
[Test]
public void TestLocale()
{
// empty Locale: clear it (null)
Benchmark benchmark = execBenchmark(getLocaleConfig(""));
assertNull(benchmark.RunData.Locale);
// ROOT locale
benchmark = execBenchmark(getLocaleConfig("ROOT"));
assertEquals(CultureInfo.InvariantCulture, benchmark.RunData.Locale);
// specify just a language
benchmark = execBenchmark(getLocaleConfig("de"));
assertEquals(new CultureInfo("de"), benchmark.RunData.Locale);
// specify language + country
benchmark = execBenchmark(getLocaleConfig("en,US"));
assertEquals(new CultureInfo("en-US"), benchmark.RunData.Locale);
// specify language + country + variant
//benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
//assertEquals(new CultureInfo("no-NO"/*, "NY"*/), benchmark.RunData.Locale);
// LUCENENET specific - in .NET Norwegian is specified as either nb-NO (Bokmål) or
// nn-NO (Nynorsk) plus a few other dialects. no-NO works sometimes, but is not
// supported across all OSes, so it doesn't make a reliable test case.
benchmark = execBenchmark(getLocaleConfig("nb,NO,NY"));
assertEquals(new CultureInfo("nb-NO"/*, "NY"*/), benchmark.RunData.Locale);
}
private String[] getLocaleConfig(String localeParam)
{
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=RAMDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
/// <summary>
/// Test that we can create CollationAnalyzers.
/// </summary>
[Test]
public void TestCollator()
{
// LUCENENET specific - we don't have a JDK version of collator
// so we are using ICU
var collatorParam = "impl:icu";
// ROOT locale
Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", collatorParam));
ICUCollationKeyAnalyzer expected = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator
.GetInstance(CultureInfo.InvariantCulture));
assertEqualCollation(expected, benchmark.RunData.Analyzer, "foobar");
// specify just a language
benchmark = execBenchmark(getCollatorConfig("de", collatorParam));
expected = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.GetInstance(new CultureInfo("de")));
assertEqualCollation(expected, benchmark.RunData.Analyzer, "foobar");
// specify language + country
benchmark = execBenchmark(getCollatorConfig("en,US", collatorParam));
expected = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.GetInstance(new CultureInfo("en-US")));
assertEqualCollation(expected, benchmark.RunData.Analyzer, "foobar");
//// specify language + country + variant
//benchmark = execBenchmark(getCollatorConfig("no,NO,NY", collatorParam));
//expected = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("no-NO"/*, "NY"*/), Collator.Fallback.FallbackAllowed));
//assertEqualCollation(expected, benchmark.RunData.Analyzer, "foobar");
// LUCENENET specific - in .NET Norwegian is specified as either nb-NO (Bokmål) or
// nn-NO (Nynorsk) plus a few other dialects. no-NO works sometimes, but is not
// supported across all OSes, so it doesn't make a reliable test case.
// specify language + country + variant
benchmark = execBenchmark(getCollatorConfig("nb,NO,NY", collatorParam));
expected = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.GetInstance(new CultureInfo("nb-NO"/*, "NY"*/)));
assertEqualCollation(expected, benchmark.RunData.Analyzer, "foobar");
}
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
{
TokenStream ts1 = a1.GetTokenStream("bogus", text);
TokenStream ts2 = a2.GetTokenStream("bogus", text);
ts1.Reset();
ts2.Reset();
ITermToBytesRefAttribute termAtt1 = ts1.AddAttribute<ITermToBytesRefAttribute>();
ITermToBytesRefAttribute termAtt2 = ts2.AddAttribute<ITermToBytesRefAttribute>();
assertTrue(ts1.IncrementToken());
assertTrue(ts2.IncrementToken());
BytesRef bytes1 = termAtt1.BytesRef;
BytesRef bytes2 = termAtt2.BytesRef;
termAtt1.FillBytesRef();
termAtt2.FillBytesRef();
assertEquals(bytes1, bytes2);
assertFalse(ts1.IncrementToken());
assertFalse(ts2.IncrementToken());
ts1.Dispose();
ts2.Dispose();
}
private String[] getCollatorConfig(String localeParam,
String collationParam)
{
String[] algLines = {
"# ----- properties ",
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=RAMDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" NewCollationAnalyzer(" + collationParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
/// <summary>
/// Test that we can create shingle analyzers using AnalyzerFactory.
/// </summary>
[Test]
public void TestShingleAnalyzer()
{
String text = "one,two,three, four five six";
// StandardTokenizer, maxShingleSize, and outputUnigrams
Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
benchmark.RunData.Analyzer.GetTokenStream
("bogus", text).Dispose();
BaseTokenStreamTestCase.AssertAnalyzesTo(benchmark.RunData.Analyzer, text,
new String[] { "one", "one two", "two", "two three",
"three", "three four", "four", "four five",
"five", "five six", "six" });
// StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
benchmark = execBenchmark
(getAnalyzerFactoryConfig
("shingle-analyzer",
"StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
BaseTokenStreamTestCase.AssertAnalyzesTo(benchmark.RunData.Analyzer, text,
new String[] { "one two", "one two three", "two three",
"two three four", "three four",
"three four five", "four five",
"four five six", "five six" });
// WhitespaceTokenizer, default maxShingleSize and outputUnigrams
benchmark = execBenchmark
(getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
BaseTokenStreamTestCase.AssertAnalyzesTo(benchmark.RunData.Analyzer, text,
new String[] { "one,two,three,", "one,two,three, four",
"four", "four five", "five", "five six",
"six" });
// WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
benchmark = execBenchmark
(getAnalyzerFactoryConfig
("shingle-factory",
"WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
BaseTokenStreamTestCase.AssertAnalyzesTo(benchmark.RunData.Analyzer, text,
new String[] { "one,two,three, four",
"one,two,three, four five",
"four five", "four five six",
"five six" });
}
private String[] getAnalyzerFactoryConfig(String name, String @params)
{
String singleQuoteEscapedName = name.Replace("'", @"\'");
String[] algLines = {
"content.source=Lucene.Net.Benchmarks.ByTask.Feeds.LineDocSource, Lucene.Net.Benchmark",
"docs.file=" + getReuters20LinesFile(),
"work.dir=" + getWorkDir().FullName.Replace(@"\", "/"), // Fix Windows path
"content.source.forever=false",
"directory=RAMDirectory",
"AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + @params + ")",
"NewAnalyzer('" + singleQuoteEscapedName + "')",
"CreateIndex",
"{ \"AddDocs\" AddDoc > : * "
};
return algLines;
}
[Test]
public void TestAnalyzerFactory()
{
String text = "Fortieth, Quarantième, Cuadragésimo";
Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
"positionIncrementGap:100,offsetGap:1111,"
+ "MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
+ "PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
+ "StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
BaseTokenStreamTestCase.AssertAnalyzesTo(benchmark.RunData.Analyzer, text,
new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
"qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
"cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"});
}
private String getReuters20LinesFile()
{
return getWorkDirResourcePath("reuters.first20.lines.txt");
}
}
}