blob: e826e43c05adadb12e1f795524e8c30da5342121 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.feeds;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Properties;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser;
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser;
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.IOUtils;
/** Tests the functionality of {@link LineDocSource}. */
public class LineDocSourceTest extends BenchmarkTestCase {
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
private void createBZ2LineFile(Path file, boolean addHeader) throws Exception {
OutputStream out = Files.newOutputStream(file);
out = csFactory.createCompressorOutputStream("bzip2", out);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
writeDocsToFile(writer, addHeader, null);
writer.close();
}
private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields) throws IOException {
if (addHeader) {
writer.write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
writer.write(WriteLineDocTask.SEP);
writer.write(DocMaker.TITLE_FIELD);
writer.write(WriteLineDocTask.SEP);
writer.write(DocMaker.DATE_FIELD);
writer.write(WriteLineDocTask.SEP);
writer.write(DocMaker.BODY_FIELD);
if (otherFields!=null) {
// additional field names in the header
for (Object fn : otherFields.keySet()) {
writer.write(WriteLineDocTask.SEP);
writer.write(fn.toString());
}
}
writer.newLine();
}
StringBuilder doc = new StringBuilder();
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
if (otherFields!=null) {
// additional field values in the doc line
for (Object fv : otherFields.values()) {
doc.append(WriteLineDocTask.SEP).append(fv.toString());
}
}
writer.write(doc.toString());
writer.newLine();
}
private void createRegularLineFile(Path file, boolean addHeader) throws Exception {
OutputStream out = Files.newOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
writeDocsToFile(writer, addHeader, null);
writer.close();
}
private void createRegularLineFileWithMoreFields(Path file, String...extraFields) throws Exception {
OutputStream out = Files.newOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8));
Properties p = new Properties();
for (String f : extraFields) {
p.setProperty(f, f);
}
writeDocsToFile(writer, true, p);
writer.close();
}
private void doIndexAndSearchTest(Path file, Class<? extends LineParser> lineParserClass, String storedField) throws Exception {
doIndexAndSearchTestWithRepeats(file, lineParserClass, 1, storedField); // no extra repetitions
doIndexAndSearchTestWithRepeats(file, lineParserClass, 2, storedField); // 1 extra repetition
doIndexAndSearchTestWithRepeats(file, lineParserClass, 4, storedField); // 3 extra repetitions
}
private void doIndexAndSearchTestWithRepeats(Path file,
Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception {
IndexReader reader = null;
IndexSearcher searcher = null;
PerfRunData runData = null;
try {
Properties props = new Properties();
// LineDocSource specific settings.
props.setProperty("docs.file", file.toAbsolutePath().toString());
if (lineParserClass != null) {
props.setProperty("line.parser", lineParserClass.getName());
}
// Indexing configuration.
props.setProperty("analyzer", WhitespaceAnalyzer.class.getName());
props.setProperty("content.source", LineDocSource.class.getName());
props.setProperty("directory", "RAMDirectory");
props.setProperty("doc.stored", "true");
props.setProperty("doc.index.props", "true");
// Create PerfRunData
Config config = new Config(props);
runData = new PerfRunData(config);
TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
tasks.addTask(new CreateIndexTask(runData));
for (int i=0; i<numAdds; i++) {
tasks.addTask(new AddDocTask(runData));
}
tasks.addTask(new CloseIndexTask(runData));
try {
tasks.doLogic();
} finally {
tasks.close();
}
reader = DirectoryReader.open(runData.getDirectory());
searcher = newSearcher(reader);
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
assertEquals(numAdds, td.totalHits.value);
assertNotNull(td.scoreDocs[0]);
if (storedField==null) {
storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value
}
assertEquals("Wrong field value", storedField, searcher.doc(0).get(storedField));
} finally {
IOUtils.close(reader, runData);
}
}
/* Tests LineDocSource with a bzip2 input stream. */
public void testBZip2() throws Exception {
Path file = getWorkDir().resolve("one-line.bz2");
createBZ2LineFile(file,true);
doIndexAndSearchTest(file, null, null);
}
public void testBZip2NoHeaderLine() throws Exception {
Path file = getWorkDir().resolve("one-line.bz2");
createBZ2LineFile(file,false);
doIndexAndSearchTest(file, null, null);
}
public void testRegularFile() throws Exception {
Path file = getWorkDir().resolve("one-line");
createRegularLineFile(file,true);
doIndexAndSearchTest(file, null, null);
}
public void testRegularFileSpecialHeader() throws Exception {
Path file = getWorkDir().resolve("one-line");
createRegularLineFile(file,true);
doIndexAndSearchTest(file, HeaderLineParser.class, null);
}
public void testRegularFileNoHeaderLine() throws Exception {
Path file = getWorkDir().resolve("one-line");
createRegularLineFile(file,false);
doIndexAndSearchTest(file, null, null);
}
public void testInvalidFormat() throws Exception {
String[] testCases = new String[] {
"", // empty line
"title", // just title
"title" + WriteLineDocTask.SEP, // title + SEP
"title" + WriteLineDocTask.SEP + "body", // title + SEP + body
// note that title + SEP + body + SEP is a valid line, which results in an
// empty body
};
for (int i = 0; i < testCases.length; i++) {
Path file = getWorkDir().resolve("one-line");
BufferedWriter writer = Files.newBufferedWriter(file, StandardCharsets.UTF_8);
writer.write(testCases[i]);
writer.newLine();
writer.close();
expectThrows(Exception.class, () -> {
doIndexAndSearchTest(file, null, null);
});
}
}
/** Doc Name is not part of the default header */
public void testWithDocsName() throws Exception {
Path file = getWorkDir().resolve("one-line");
createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
doIndexAndSearchTest(file, null, DocMaker.NAME_FIELD);
}
/** Use fields names that are not defined in Docmaker and so will go to Properties */
public void testWithProperties() throws Exception {
Path file = getWorkDir().resolve("one-line");
String specialField = "mySpecialField";
createRegularLineFileWithMoreFields(file, specialField);
doIndexAndSearchTest(file, null, specialField);
}
}