/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask;
import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.Collator;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/** Test, very simply, that perf tasks (simple algorithms) do what they should. */
@LuceneTestCase.SuppressCodecs({"SimpleText", "Direct"})
public class TestPerfTasksLogic extends BenchmarkTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
copyToWorkDir("reuters.first20.lines.txt");
copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
}
/** Test index creation logic */
public void testIndexAndSearchTasks() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : 200",
"CloseReader",
"[ CountingSearchTest > : 70",
"[ CountingSearchTest > : 9",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
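// 279 searches expected: 200 from the sequential sequence plus 70 + 9 from the two parallel sequences above.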
assertEquals(
"TestSearchTask was supposed to be called!", 279, CountingSearchTestTask.numSearches);
assertTrue(
"Index does not exist?...!",
DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw =
new IndexWriter(
benchmark.getRunData().getDirectory(),
new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(
"1000 docs were added to the index, this is what we expect to find!", 1000, ir.numDocs());
ir.close();
}
/** Test timed sequence task. */
public void testTimedSearchTask() throws Exception {
String algLines[] = {
"log.step=100000",
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 100",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : .5s",
"CloseReader",
};
CountingSearchTestTask.numSearches = 0;
execBenchmark(algLines);
assertTrue(CountingSearchTestTask.numSearches > 0);
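// The search sequence above runs for roughly 0.5 sec (".5s" is a time-based repetition), so even with slack it should finish well under 1.5 sec: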
long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
}
// disabled until we fix BG thread prio -- this test
// causes build to hang
public void testBGSearchTaskThreads() throws Exception {
String algLines[] = {
"log.time.step.msec = 100",
"log.step=100000",
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{",
" [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1",
" Wait(0.5)",
"}",
"CloseReader",
"RepSumByPref X"
};
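// "[ ... ] : 2 &-1" presumably starts the quoted search sequence in 2 parallel background threads ("&" marks background work, -1 a priority adjustment).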
CountingSearchTestTask.numSearches = 0;
execBenchmark(algLines);
// NOTE: cannot assert this, because on a super-slow
// system, it could be after waiting 0.5 seconds that
// the search threads hadn't yet succeeded in starting
// up and then they start up and do no searching:
// assertTrue(CountingSearchTestTask.numSearches > 0);
}
/** Test exhausting Doc Maker logic. */
public void testExhaustContentSource() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
"content.source.log.step=1",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"# ----- alg ",
"CreateIndex",
"{ AddDoc } : * ",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : 100",
"CloseReader",
"[ CountingSearchTest > : 30",
"[ CountingSearchTest > : 9",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
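// 139 searches expected: 100 from the sequential sequence plus 30 + 9 from the two parallel sequences above.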
assertEquals(
"TestSearchTask was supposed to be called!", 139, CountingSearchTestTask.numSearches);
assertTrue(
"Index does not exist?...!",
DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw =
new IndexWriter(
benchmark.getRunData().getDirectory(),
new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(
"1 docs were added to the index, this is what we expect to find!", 1, ir.numDocs());
ir.close();
}
// LUCENE-1994: test thread safety of SortableSingleDocMaker
public void testDocMakerThreadSafety() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource",
"doc.term.vector=false",
"log.step.AddDoc=10000",
"content.source.forever=true",
"directory=ByteBuffersDirectory",
"doc.reuse.fields=false",
"doc.stored=true",
"doc.tokenized=false",
"doc.index.props=true",
"# ----- alg ",
"CreateIndex",
"[ { AddDoc > : 250 ] : 4",
"CloseIndex",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
DirectoryReader r = DirectoryReader.open(benchmark.getRunData().getDirectory());
final int maxDoc = r.maxDoc();
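// 4 parallel threads, each adding 250 docs, should yield exactly 1000 docs.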
assertEquals(1000, maxDoc);
for (int i = 0; i < 1000; i++) {
assertNotNull("doc " + i + " has null country", r.document(i).getField("country"));
}
r.close();
}
/** Test Parallel Doc Maker logic (for LUCENE-940) */
public void testParallelDocMaker() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=FSDirectory",
"doc.stored=false",
"doc.tokenized=false",
"# ----- alg ",
"CreateIndex",
"[ { AddDoc } : * ] : 4 ",
"CloseIndex",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test WriteLineDoc and LineDocSource. */
public void testLineDocFile() throws Exception {
Path lineFile = createTempFile("test.reuters.lines", ".txt");
// We will call WriteLineDoc this many times
final int NUM_TRY_DOCS = 50;
// Creates a line file with first 50 docs from SingleDocSource
String algLines1[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
"content.source.forever=true",
"line.file.out=" + lineFile.toAbsolutePath().toString().replace('\\', '/'),
"# ----- alg ",
"{WriteLineDoc()}:" + NUM_TRY_DOCS,
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
BufferedReader r = Files.newBufferedReader(lineFile, StandardCharsets.UTF_8);
int numLines = 0;
String line;
while ((line = r.readLine()) != null) {
if (numLines == 0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
continue; // do not count the header line as a doc
}
numLines++;
}
r.close();
assertEquals(
"did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines,
NUM_TRY_DOCS,
numLines);
// Index the line docs
String algLines2[] = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + lineFile.toAbsolutePath().toString().replace('\\', '/'),
"content.source.forever=false",
"doc.reuse.fields=false",
"ram.flush.mb=4",
"# ----- alg ",
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: *",
"CloseIndex",
};
// Run algo
benchmark = execBenchmark(algLines2);
// now we should be able to open the index for write.
IndexWriter iw =
new IndexWriter(
benchmark.getRunData().getDirectory(),
new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(
numLines + " lines were created but " + ir.numDocs() + " docs are in the index",
numLines,
ir.numDocs());
ir.close();
}
/** Test ReadTokensTask */
public void testReadTokens() throws Exception {
// We will call ReadTokens on this many docs
final int NUM_DOCS = 20;
// Read tokens from first NUM_DOCS docs from Reuters and
// then build index from the same docs
String algLines1[] = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"# ----- alg ",
"{ReadTokens}: " + NUM_DOCS,
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: " + NUM_DOCS,
"CloseIndex",
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
// Count how many tokens all ReadTokens saw
int totalTokenCount1 = 0;
for (final TaskStats stat : stats) {
if (stat.getTask().getName().equals("ReadTokens")) {
totalTokenCount1 += stat.getCount();
}
}
// Separately count how many tokens are actually in the index:
IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(NUM_DOCS, reader.numDocs());
int totalTokenCount2 = 0;
Collection<String> fields = FieldInfos.getIndexedFields(reader);
for (String fieldName : fields) {
if (fieldName.equals(DocMaker.ID_FIELD)
|| fieldName.equals(DocMaker.DATE_MSEC_FIELD)
|| fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
continue;
}
Terms terms = MultiTerms.getTerms(reader, fieldName);
if (terms == null) {
continue;
}
TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
while (termsEnum.next() != null) {
docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
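// Summing freq() over every term and every doc yields the total number of tokens indexed for this field.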
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
totalTokenCount2 += docs.freq();
}
}
}
reader.close();
// Make sure they are the same
assertEquals(totalTokenCount1, totalTokenCount2);
}
/** Test that " {[AddDoc(4000)]: 4} : * " works corrcetly (for LUCENE-941) */
public void testParallelExhausted() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"CreateIndex",
"{ [ AddDoc]: 4} : * ",
"ResetInputs ",
"{ [ AddDoc]: 4} : * ",
"CloseIndex",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 2 * 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test that exhaust in loop works as expected (LUCENE-1115). */
public void testExhaustedLooped() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" CloseIndex",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test that we can close IndexWriter with argument "false". */
public void testCloseIndexFalse() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"ram.flush.mb=-1",
"max.buffered=2",
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" CloseIndex(false)",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
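/** Merge scheduler that records that it was instantiated, so the test can verify it was actually used. */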
public static class MyMergeScheduler extends SerialMergeScheduler {
boolean called;
public MyMergeScheduler() {
super();
called = true;
}
}
/** Test that we can set the merge scheduler. */
public void testMergeScheduler() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"merge.scheduler=" + MyMergeScheduler.class.getName(),
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue(
"did not use the specified MergeScheduler",
((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getConfig().getMergeScheduler())
.called);
benchmark.getRunData().getIndexWriter().close();
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
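/** Merge policy that records that it was instantiated, so the test can verify it was actually used. */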
public static class MyMergePolicy extends LogDocMergePolicy {
boolean called;
public MyMergePolicy() {
called = true;
}
}
/** Test that we can set the merge policy. */
public void testMergePolicy() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"merge.policy=" + MyMergePolicy.class.getName(),
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue(
"did not use the specified MergePolicy",
((MyMergePolicy) benchmark.getRunData().getIndexWriter().getConfig().getMergePolicy())
.called);
benchmark.getRunData().getIndexWriter().close();
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test that IndexWriter settings stick. */
public void testIndexWriterSettings() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"compound=cmpnd:true:false",
"doc.term.vector=vector:false:true",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"merge.factor=3",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
final IndexWriter writer = benchmark.getRunData().getIndexWriter();
assertEquals(2, writer.getConfig().getMaxBufferedDocs());
assertEquals(
IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
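// "compound=cmpnd:true:false" makes the second round run without compound files, so the final noCFSRatio should be 0: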
assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
writer.close();
Directory dir = benchmark.getRunData().getDirectory();
IndexReader reader = DirectoryReader.open(dir);
Fields tfv = reader.getTermVectors(0);
assertNotNull(tfv);
assertTrue(tfv.size() > 0);
reader.close();
}
/** Test indexing with facets tasks. */
public void testIndexingWithFacets() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=100",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"merge.factor=3",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"ResetSystemErase",
"CreateIndex",
"CreateTaxonomyIndex",
"{ \"AddDocs\" AddFacetedDoc > : * ",
"CloseIndex",
"CloseTaxonomyIndex",
"OpenTaxonomyReader",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
PerfRunData runData = benchmark.getRunData();
assertNull("taxo writer was not properly closed", runData.getTaxonomyWriter());
TaxonomyReader taxoReader = runData.getTaxonomyReader();
assertNotNull("taxo reader was not opened", taxoReader);
assertTrue(
"nothing was added to the taxnomy (expecting root and at least one addtional category)",
taxoReader.getSize() > 1);
taxoReader.close();
}
/** Test that we can call forceMerge(maxNumSegments). */
public void testForceMerge() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"merge.policy=org.apache.lucene.index.LogDocMergePolicy",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" ForceMerge(3)",
" CloseIndex()",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
// Make sure we have 3 segments:
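// (max.buffered=3 flushes a new segment every few docs; ForceMerge(3) should then merge down to exactly 3)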
SegmentInfos infos = SegmentInfos.readLatestCommit(benchmark.getRunData().getDirectory());
assertEquals(3, infos.size());
}
/** Test disabling task count (LUCENE-1136). */
public void testDisableCounting() throws Exception {
doTestDisableCounting(true);
doTestDisableCounting(false);
}
private void doTestDisableCounting(boolean disable) throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = disableCountingLines(disable);
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test counters
int n = disable ? 0 : 1;
int nChecked = 0;
for (final TaskStats stats : benchmark.getRunData().getPoints().taskStats()) {
String taskName = stats.getTask().getName();
if (taskName.equals("Rounds")) {
assertEquals("Wrong total count!", 20 + 2 * n, stats.getCount());
nChecked++;
} else if (taskName.equals("CreateIndex")) {
assertEquals("Wrong count for CreateIndex!", n, stats.getCount());
nChecked++;
} else if (taskName.equals("CloseIndex")) {
assertEquals("Wrong count for CloseIndex!", n, stats.getCount());
nChecked++;
}
}
assertEquals("Missing some tasks to check!", 3, nChecked);
}
private String[] disableCountingLines(boolean disable) {
String dis = disable ? "-" : "";
return new String[] {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=30",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" " + dis + "CreateIndex", // optionally disable counting here
" { \"AddDocs\" AddDoc > : * ",
" " + dis + " CloseIndex", // optionally disable counting here (with extra blanks)
"}",
"RepSumByName",
};
}
/** Test that we can change the Locale in the runData, and that it is parsed as we expect. */
public void testLocale() throws Exception {
// empty Locale: clear it (null)
Benchmark benchmark = execBenchmark(getLocaleConfig(""));
assertNull(benchmark.getRunData().getLocale());
// ROOT locale
benchmark = execBenchmark(getLocaleConfig("ROOT"));
assertEquals(new Locale(""), benchmark.getRunData().getLocale());
// specify just a language
benchmark = execBenchmark(getLocaleConfig("de"));
assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
// specify language + country
benchmark = execBenchmark(getLocaleConfig("en,US"));
assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
// specify language + country + variant
benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
}
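/** Builds an alg that sets the run locale via NewLocale(...) and indexes the 20-line Reuters docs. */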
private String[] getLocaleConfig(String localeParam) {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
/** Test that we can create CollationAnalyzers. */
public void testCollator() throws Exception {
// ROOT locale
Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify just a language
benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify language + country
benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en", "US")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify language + country + variant
benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no", "NO", "NY")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
}
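/** Asserts that both analyzers produce the same single collation-key token for the given text. */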
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text) throws Exception {
TokenStream ts1 = a1.tokenStream("bogus", text);
TokenStream ts2 = a2.tokenStream("bogus", text);
ts1.reset();
ts2.reset();
TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt2 = ts2.addAttribute(TermToBytesRefAttribute.class);
assertTrue(ts1.incrementToken());
assertTrue(ts2.incrementToken());
BytesRef bytes1 = termAtt1.getBytesRef();
BytesRef bytes2 = termAtt2.getBytesRef();
assertEquals(bytes1, bytes2);
assertFalse(ts1.incrementToken());
assertFalse(ts2.incrementToken());
ts1.close();
ts2.close();
}
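/** Like getLocaleConfig, but also installs a collation analyzer via NewCollationAnalyzer(...). */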
private String[] getCollatorConfig(String localeParam, String collationParam) {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" NewCollationAnalyzer(" + collationParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
/** Test that we can create shingle analyzers using AnalyzerFactory. */
public void testShingleAnalyzer() throws Exception {
String text = "one,two,three, four five six";
// StandardTokenizer, maxShingleSize, and outputUnigrams
Benchmark benchmark =
execBenchmark(
getAnalyzerFactoryConfig("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
benchmark.getRunData().getAnalyzer().tokenStream("bogus", text).close();
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one", "one two", "two", "two three",
"three", "three four", "four", "four five",
"five", "five six", "six"
});
// StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
benchmark =
execBenchmark(
getAnalyzerFactoryConfig(
"shingle-analyzer",
"StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one two",
"one two three",
"two three",
"two three four",
"three four",
"three four five",
"four five",
"four five six",
"five six"
});
// WhitespaceTokenizer, default maxShingleSize and outputUnigrams
benchmark =
execBenchmark(
getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one,two,three,", "one,two,three, four", "four", "four five", "five", "five six", "six"
});
// WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
benchmark =
execBenchmark(
getAnalyzerFactoryConfig(
"shingle-factory",
"WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one,two,three, four",
"one,two,three, four five",
"four five",
"four five six",
"five six"
});
}
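/** Builds an alg that defines an AnalyzerFactory with the given name and params, selects it with NewAnalyzer, and indexes the 20-line Reuters docs. */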
private String[] getAnalyzerFactoryConfig(String name, String params) {
final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'");
String algLines[] = {
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"work.dir="
+ getWorkDir().toAbsolutePath().toString().replaceAll("\\\\", "/"), // Fix Windows path
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")",
"NewAnalyzer('" + singleQuoteEscapedName + "')",
"CreateIndex",
"{ \"AddDocs\" AddDoc > : * "
};
return algLines;
}
public void testAnalyzerFactory() throws Exception {
String text = "Fortieth, Quarantième, Cuadragésimo";
Benchmark benchmark =
execBenchmark(
getAnalyzerFactoryConfig(
"ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
"positionIncrementGap:100,offsetGap:1111,"
+ "MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
+ "PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
+ "StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"fo", "or", "rt", "ti", "ie", "et", "th", "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix",
"xx", "xx", "xe", "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs",
"si", "io"
});
}
private String getReuters20LinesFile() {
return getWorkDirResourcePath("reuters.first20.lines.txt");
}
}