/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask;
import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.Collator;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/** Test, very simply, that perf tasks (simple algorithms) do what they should. */
@LuceneTestCase.SuppressCodecs({"SimpleText", "Direct"})
public class TestPerfTasksLogic extends BenchmarkTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
copyToWorkDir("reuters.first20.lines.txt");
copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
}
/** Test index creation logic */
public void testIndexAndSearchTasks() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : 200",
"CloseReader",
"[ CountingSearchTest > : 70",
"[ CountingSearchTest > : 9",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
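// 279 searches expected: 200 from the sequential sequence plus 70 + 9 from the two parallel sequences above.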
assertEquals(
"TestSearchTask was supposed to be called!", 279, CountingSearchTestTask.numSearches);
assertTrue(
"Index does not exist?...!",
DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw =
new IndexWriter(
benchmark.getRunData().getDirectory(),
new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(
"1000 docs were added to the index, this is what we expect to find!", 1000, ir.numDocs());
ir.close();
}
/** Test timed sequence task. */
public void testTimedSearchTask() throws Exception {
String algLines[] = {
"log.step=100000",
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 100",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : .5s",
"CloseReader",
};
CountingSearchTestTask.numSearches = 0;
execBenchmark(algLines);
assertTrue(CountingSearchTestTask.numSearches > 0);
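// The search sequence above runs for roughly 0.5 sec (".5s" is a time-based repetition), so even with slack it should finish well under 1.5 sec: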
long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
}
// disabled until we fix BG thread prio -- this test
// causes build to hang
public void testBGSearchTaskThreads() throws Exception {
String algLines[] = {
"log.time.step.msec = 100",
"log.step=100000",
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{",
" [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1",
" Wait(0.5)",
"}",
"CloseReader",
"RepSumByPref X"
};
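// "[ ... ] : 2 &-1" presumably starts the quoted search sequence in 2 parallel background threads ("&" marks background work, -1 a priority adjustment).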
CountingSearchTestTask.numSearches = 0;
execBenchmark(algLines);
// NOTE: cannot assert this, because on a super-slow
// system, it could be after waiting 0.5 seconds that
// the search threads hadn't yet succeeded in starting
// up and then they start up and do no searching:
// assertTrue(CountingSearchTestTask.numSearches > 0);
}
/** Test exhausting Doc Maker logic. */
public void testExhaustContentSource() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
"content.source.log.step=1",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"# ----- alg ",
"CreateIndex",
"{ AddDoc } : * ",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : 100",
"CloseReader",
"[ CountingSearchTest > : 30",
"[ CountingSearchTest > : 9",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
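// 139 searches expected: 100 from the sequential sequence plus 30 + 9 from the two parallel sequences above.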
assertEquals(
"TestSearchTask was supposed to be called!", 139, CountingSearchTestTask.numSearches);
assertTrue(
"Index does not exist?...!",
DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw =
new IndexWriter(
benchmark.getRunData().getDirectory(),
new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(
"1 docs were added to the index, this is what we expect to find!", 1, ir.numDocs());
ir.close();
}
// LUCENE-1994: test thread safety of SortableSingleDocMaker
public void testDocMakerThreadSafety() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource",
"doc.term.vector=false",
"log.step.AddDoc=10000",
"content.source.forever=true",
"directory=ByteBuffersDirectory",
"doc.reuse.fields=false",
"doc.stored=true",
"doc.tokenized=false",
"doc.index.props=true",
"# ----- alg ",
"CreateIndex",
"[ { AddDoc > : 250 ] : 4",
"CloseIndex",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
DirectoryReader r = DirectoryReader.open(benchmark.getRunData().getDirectory());
final int maxDoc = r.maxDoc();
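// 4 parallel threads, each adding 250 docs, should yield exactly 1000 docs.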
assertEquals(1000, maxDoc);
for (int i = 0; i < 1000; i++) {
assertNotNull("doc " + i + " has null country", r.document(i).getField("country"));
}
r.close();
}
/** Test Parallel Doc Maker logic (for LUCENE-940) */
public void testParallelDocMaker() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=FSDirectory",
"doc.stored=false",
"doc.tokenized=false",
"# ----- alg ",
"CreateIndex",
"[ { AddDoc } : * ] : 4 ",
"CloseIndex",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test WriteLineDoc and LineDocSource. */
public void testLineDocFile() throws Exception {
Path lineFile = createTempFile("test.reuters.lines", ".txt");
// We will call WriteLineDoc this many times
final int NUM_TRY_DOCS = 50;
// Creates a line file with first 50 docs from SingleDocSource
String algLines1[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
"content.source.forever=true",
"line.file.out=" + lineFile.toAbsolutePath().toString().replace('\\', '/'),
"# ----- alg ",
"{WriteLineDoc()}:" + NUM_TRY_DOCS,
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
BufferedReader r = Files.newBufferedReader(lineFile, StandardCharsets.UTF_8);
int numLines = 0;
String line;
while ((line = r.readLine()) != null) {
if (numLines == 0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
continue; // do not count the header line as a doc
}
numLines++;
}
r.close();
assertEquals(
"did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines,
NUM_TRY_DOCS,
numLines);
// Index the line docs
String algLines2[] = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + lineFile.toAbsolutePath().toString().replace('\\', '/'),
"content.source.forever=false",
"doc.reuse.fields=false",
"ram.flush.mb=4",
"# ----- alg ",
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: *",
"CloseIndex",
};
// Run algo
benchmark = execBenchmark(algLines2);
// now we should be able to open the index for write.
IndexWriter iw =
new IndexWriter(
benchmark.getRunData().getDirectory(),
new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(
numLines + " lines were created but " + ir.numDocs() + " docs are in the index",
numLines,
ir.numDocs());
ir.close();
}
/** Test ReadTokensTask */
public void testReadTokens() throws Exception {
// We will call ReadTokens on this many docs
final int NUM_DOCS = 20;
// Read tokens from first NUM_DOCS docs from Reuters and
// then build index from the same docs
String algLines1[] = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"# ----- alg ",
"{ReadTokens}: " + NUM_DOCS,
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: " + NUM_DOCS,
"CloseIndex",
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
// Count how many tokens all ReadTokens saw
int totalTokenCount1 = 0;
for (final TaskStats stat : stats) {
if (stat.getTask().getName().equals("ReadTokens")) {
totalTokenCount1 += stat.getCount();
}
}
// Separately count how many tokens are actually in the index:
IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(NUM_DOCS, reader.numDocs());
int totalTokenCount2 = 0;
Collection<String> fields = FieldInfos.getIndexedFields(reader);
for (String fieldName : fields) {
if (fieldName.equals(DocMaker.ID_FIELD)
|| fieldName.equals(DocMaker.DATE_MSEC_FIELD)
|| fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
continue;
}
Terms terms = MultiTerms.getTerms(reader, fieldName);
if (terms == null) {
continue;
}
TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
while (termsEnum.next() != null) {
docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
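// Summing freq() over every term and every doc yields the total number of tokens indexed for this field.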
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
totalTokenCount2 += docs.freq();
}
}
}
reader.close();
// Make sure they are the same
assertEquals(totalTokenCount1, totalTokenCount2);
}
/** Test that " {[AddDoc(4000)]: 4} : * " works corrcetly (for LUCENE-941) */
public void testParallelExhausted() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"CreateIndex",
"{ [ AddDoc]: 4} : * ",
"ResetInputs ",
"{ [ AddDoc]: 4} : * ",
"CloseIndex",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 2 * 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test that exhaust in loop works as expected (LUCENE-1115). */
public void testExhaustedLooped() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" CloseIndex",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test that we can close IndexWriter with argument "false". */
public void testCloseIndexFalse() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"ram.flush.mb=-1",
"max.buffered=2",
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" CloseIndex(false)",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
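/** Merge scheduler that records that it was instantiated, so the test can verify it was actually used. */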
public static class MyMergeScheduler extends SerialMergeScheduler {
boolean called;
public MyMergeScheduler() {
super();
called = true;
}
}
/** Test that we can set the merge scheduler. */
public void testMergeScheduler() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"merge.scheduler=" + MyMergeScheduler.class.getName(),
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue(
"did not use the specified MergeScheduler",
((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getConfig().getMergeScheduler())
.called);
benchmark.getRunData().getIndexWriter().close();
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
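/** Merge policy that records that it was instantiated, so the test can verify it was actually used. */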
public static class MyMergePolicy extends LogDocMergePolicy {
boolean called;
public MyMergePolicy() {
called = true;
}
}
/** Test that we can set the merge policy. */
public void testMergePolicy() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"merge.policy=" + MyMergePolicy.class.getName(),
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
assertTrue(
"did not use the specified MergePolicy",
((MyMergePolicy) benchmark.getRunData().getIndexWriter().getConfig().getMergePolicy())
.called);
benchmark.getRunData().getIndexWriter().close();
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
}
/** Test that IndexWriter settings stick. */
public void testIndexWriterSettings() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=2",
"compound=cmpnd:true:false",
"doc.term.vector=vector:false:true",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"merge.factor=3",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
final IndexWriter writer = benchmark.getRunData().getIndexWriter();
assertEquals(2, writer.getConfig().getMaxBufferedDocs());
assertEquals(
IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
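// "compound=cmpnd:true:false" makes the second round run without compound files, so the final noCFSRatio should be 0: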
assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
writer.close();
Directory dir = benchmark.getRunData().getDirectory();
IndexReader reader = DirectoryReader.open(dir);
Fields tfv = reader.getTermVectors(0);
assertNotNull(tfv);
assertTrue(tfv.size() > 0);
reader.close();
}
/** Test indexing with facets tasks. */
public void testIndexingWithFacets() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=100",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"merge.factor=3",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"ResetSystemErase",
"CreateIndex",
"CreateTaxonomyIndex",
"{ \"AddDocs\" AddFacetedDoc > : * ",
"CloseIndex",
"CloseTaxonomyIndex",
"OpenTaxonomyReader",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
PerfRunData runData = benchmark.getRunData();
assertNull("taxo writer was not properly closed", runData.getTaxonomyWriter());
TaxonomyReader taxoReader = runData.getTaxonomyReader();
assertNotNull("taxo reader was not opened", taxoReader);
assertTrue(
"nothing was added to the taxnomy (expecting root and at least one addtional category)",
taxoReader.getSize() > 1);
taxoReader.close();
}
/** Test that we can call forceMerge(maxNumSegments). */
public void testForceMerge() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"ram.flush.mb=-1",
"max.buffered=3",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"merge.policy=org.apache.lucene.index.LogDocMergePolicy",
"doc.stored=false",
"doc.tokenized=false",
"debug.level=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" ForceMerge(3)",
" CloseIndex()",
"} : 2",
};
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test number of docs in the index
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 20; // first 20 reuters docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
ir.close();
// Make sure we have 3 segments:
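// (max.buffered=3 flushes a new segment every few docs; ForceMerge(3) should then merge down to exactly 3)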
SegmentInfos infos = SegmentInfos.readLatestCommit(benchmark.getRunData().getDirectory());
assertEquals(3, infos.size());
}
/** Test disabling task count (LUCENE-1136). */
public void testDisableCounting() throws Exception {
doTestDisableCounting(true);
doTestDisableCounting(false);
}
private void doTestDisableCounting(boolean disable) throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = disableCountingLines(disable);
// 2. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 3. test counters
int n = disable ? 0 : 1;
int nChecked = 0;
for (final TaskStats stats : benchmark.getRunData().getPoints().taskStats()) {
String taskName = stats.getTask().getName();
if (taskName.equals("Rounds")) {
assertEquals("Wrong total count!", 20 + 2 * n, stats.getCount());
nChecked++;
} else if (taskName.equals("CreateIndex")) {
assertEquals("Wrong count for CreateIndex!", n, stats.getCount());
nChecked++;
} else if (taskName.equals("CloseIndex")) {
assertEquals("Wrong count for CloseIndex!", n, stats.getCount());
nChecked++;
}
}
assertEquals("Missing some tasks to check!", 3, nChecked);
}
private String[] disableCountingLines(boolean disable) {
String dis = disable ? "-" : "";
return new String[] {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=30",
"doc.term.vector=false",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"doc.stored=false",
"doc.tokenized=false",
"task.max.depth.log=1",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" " + dis + "CreateIndex", // optionally disable counting here
" { \"AddDocs\" AddDoc > : * ",
" " + dis + " CloseIndex", // optionally disable counting here (with extra blanks)
"}",
"RepSumByName",
};
}
/** Test that we can change the Locale in the runData, and that it is parsed as we expect. */
public void testLocale() throws Exception {
// empty Locale: clear it (null)
Benchmark benchmark = execBenchmark(getLocaleConfig(""));
assertNull(benchmark.getRunData().getLocale());
// ROOT locale
benchmark = execBenchmark(getLocaleConfig("ROOT"));
assertEquals(new Locale(""), benchmark.getRunData().getLocale());
// specify just a language
benchmark = execBenchmark(getLocaleConfig("de"));
assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
// specify language + country
benchmark = execBenchmark(getLocaleConfig("en,US"));
assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
// specify language + country + variant
benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
}
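/** Builds an alg that sets the run locale via NewLocale(...) and indexes the 20-line Reuters docs. */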
private String[] getLocaleConfig(String localeParam) {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
/** Test that we can create CollationAnalyzers. */
public void testCollator() throws Exception {
// ROOT locale
Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify just a language
benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify language + country
benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en", "US")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
// specify language + country + variant
benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no", "NO", "NY")));
assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
}
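/** Asserts that both analyzers produce the same single collation-key token for the given text. */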
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text) throws Exception {
TokenStream ts1 = a1.tokenStream("bogus", text);
TokenStream ts2 = a2.tokenStream("bogus", text);
ts1.reset();
ts2.reset();
TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt2 = ts2.addAttribute(TermToBytesRefAttribute.class);
assertTrue(ts1.incrementToken());
assertTrue(ts2.incrementToken());
BytesRef bytes1 = termAtt1.getBytesRef();
BytesRef bytes2 = termAtt2.getBytesRef();
assertEquals(bytes1, bytes2);
assertFalse(ts1.incrementToken());
assertFalse(ts2.incrementToken());
ts1.close();
ts2.close();
}
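/** Like getLocaleConfig, but also installs a collation analyzer via NewCollationAnalyzer(...). */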
private String[] getCollatorConfig(String localeParam, String collationParam) {
String algLines[] = {
"# ----- properties ",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"content.source.log.step=3",
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"# ----- alg ",
"{ \"Rounds\"",
" ResetSystemErase",
" NewLocale(" + localeParam + ")",
" NewCollationAnalyzer(" + collationParam + ")",
" CreateIndex",
" { \"AddDocs\" AddDoc > : * ",
" NewRound",
"} : 1",
};
return algLines;
}
/** Test that we can create shingle analyzers using AnalyzerFactory. */
public void testShingleAnalyzer() throws Exception {
String text = "one,two,three, four five six";
// StandardTokenizer, maxShingleSize, and outputUnigrams
Benchmark benchmark =
execBenchmark(
getAnalyzerFactoryConfig("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
benchmark.getRunData().getAnalyzer().tokenStream("bogus", text).close();
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one", "one two", "two", "two three",
"three", "three four", "four", "four five",
"five", "five six", "six"
});
// StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
benchmark =
execBenchmark(
getAnalyzerFactoryConfig(
"shingle-analyzer",
"StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one two",
"one two three",
"two three",
"two three four",
"three four",
"three four five",
"four five",
"four five six",
"five six"
});
// WhitespaceTokenizer, default maxShingleSize and outputUnigrams
benchmark =
execBenchmark(
getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one,two,three,", "one,two,three, four", "four", "four five", "five", "five six", "six"
});
// WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
benchmark =
execBenchmark(
getAnalyzerFactoryConfig(
"shingle-factory",
"WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"one,two,three, four",
"one,two,three, four five",
"four five",
"four five six",
"five six"
});
}
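/** Builds an alg that defines an AnalyzerFactory with the given name and params, selects it with NewAnalyzer, and indexes the 20-line Reuters docs. */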
private String[] getAnalyzerFactoryConfig(String name, String params) {
final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'");
String algLines[] = {
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"work.dir="
+ getWorkDir().toAbsolutePath().toString().replaceAll("\\\\", "/"), // Fix Windows path
"content.source.forever=false",
"directory=ByteBuffersDirectory",
"AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")",
"NewAnalyzer('" + singleQuoteEscapedName + "')",
"CreateIndex",
"{ \"AddDocs\" AddDoc > : * "
};
return algLines;
}
public void testAnalyzerFactory() throws Exception {
String text = "Fortieth, Quarantième, Cuadragésimo";
Benchmark benchmark =
execBenchmark(
getAnalyzerFactoryConfig(
"ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
"positionIncrementGap:100,offsetGap:1111,"
+ "MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
+ "PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
+ "StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
BaseTokenStreamTestCase.assertAnalyzesTo(
benchmark.getRunData().getAnalyzer(),
text,
new String[] {
"fo", "or", "rt", "ti", "ie", "et", "th", "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix",
"xx", "xx", "xe", "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs",
"si", "io"
});
}
private String getReuters20LinesFile() {
return getWorkDirResourcePath("reuters.first20.lines.txt");
}
}