/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.benchmark.byTask.feeds; |
| |
| |
| import java.io.BufferedWriter; |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.io.OutputStreamWriter; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.util.Properties; |
| |
| import org.apache.commons.compress.compressors.CompressorStreamFactory; |
| import org.apache.lucene.analysis.core.WhitespaceAnalyzer; |
| import org.apache.lucene.benchmark.BenchmarkTestCase; |
| import org.apache.lucene.benchmark.byTask.PerfRunData; |
| import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser; |
| import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser; |
| import org.apache.lucene.benchmark.byTask.tasks.AddDocTask; |
| import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask; |
| import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask; |
| import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; |
| import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; |
| import org.apache.lucene.benchmark.byTask.utils.Config; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.TopDocs; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** Tests the functionality of {@link LineDocSource}. */ |
| public class LineDocSourceTest extends BenchmarkTestCase { |
| |
| private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); |
| |
| private void createBZ2LineFile(Path file, boolean addHeader) throws Exception { |
| OutputStream out = Files.newOutputStream(file); |
| out = csFactory.createCompressorOutputStream("bzip2", out); |
| BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)); |
| writeDocsToFile(writer, addHeader, null); |
| writer.close(); |
| } |
| |
| private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields) throws IOException { |
| if (addHeader) { |
| writer.write(WriteLineDocTask.FIELDS_HEADER_INDICATOR); |
| writer.write(WriteLineDocTask.SEP); |
| writer.write(DocMaker.TITLE_FIELD); |
| writer.write(WriteLineDocTask.SEP); |
| writer.write(DocMaker.DATE_FIELD); |
| writer.write(WriteLineDocTask.SEP); |
| writer.write(DocMaker.BODY_FIELD); |
| if (otherFields!=null) { |
| // additional field names in the header |
| for (Object fn : otherFields.keySet()) { |
| writer.write(WriteLineDocTask.SEP); |
| writer.write(fn.toString()); |
| } |
| } |
| writer.newLine(); |
| } |
| StringBuilder doc = new StringBuilder(); |
| doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD); |
| if (otherFields!=null) { |
| // additional field values in the doc line |
| for (Object fv : otherFields.values()) { |
| doc.append(WriteLineDocTask.SEP).append(fv.toString()); |
| } |
| } |
| writer.write(doc.toString()); |
| writer.newLine(); |
| } |
| |
| private void createRegularLineFile(Path file, boolean addHeader) throws Exception { |
| OutputStream out = Files.newOutputStream(file); |
| BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)); |
| writeDocsToFile(writer, addHeader, null); |
| writer.close(); |
| } |
| |
| private void createRegularLineFileWithMoreFields(Path file, String...extraFields) throws Exception { |
| OutputStream out = Files.newOutputStream(file); |
| BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)); |
| Properties p = new Properties(); |
| for (String f : extraFields) { |
| p.setProperty(f, f); |
| } |
| writeDocsToFile(writer, true, p); |
| writer.close(); |
| } |
| |
| private void doIndexAndSearchTest(Path file, Class<? extends LineParser> lineParserClass, String storedField) throws Exception { |
| doIndexAndSearchTestWithRepeats(file, lineParserClass, 1, storedField); // no extra repetitions |
| doIndexAndSearchTestWithRepeats(file, lineParserClass, 2, storedField); // 1 extra repetition |
| doIndexAndSearchTestWithRepeats(file, lineParserClass, 4, storedField); // 3 extra repetitions |
| } |
| |
| private void doIndexAndSearchTestWithRepeats(Path file, |
| Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception { |
| |
| IndexReader reader = null; |
| IndexSearcher searcher = null; |
| PerfRunData runData = null; |
| try { |
| Properties props = new Properties(); |
| |
| // LineDocSource specific settings. |
| props.setProperty("docs.file", file.toAbsolutePath().toString()); |
| if (lineParserClass != null) { |
| props.setProperty("line.parser", lineParserClass.getName()); |
| } |
| |
| // Indexing configuration. |
| props.setProperty("analyzer", WhitespaceAnalyzer.class.getName()); |
| props.setProperty("content.source", LineDocSource.class.getName()); |
| props.setProperty("directory", "RAMDirectory"); |
| props.setProperty("doc.stored", "true"); |
| props.setProperty("doc.index.props", "true"); |
| |
| // Create PerfRunData |
| Config config = new Config(props); |
| runData = new PerfRunData(config); |
| |
| TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false); |
| tasks.addTask(new CreateIndexTask(runData)); |
| for (int i=0; i<numAdds; i++) { |
| tasks.addTask(new AddDocTask(runData)); |
| } |
| tasks.addTask(new CloseIndexTask(runData)); |
| try { |
| tasks.doLogic(); |
| } finally { |
| tasks.close(); |
| } |
| |
| reader = DirectoryReader.open(runData.getDirectory()); |
| searcher = newSearcher(reader); |
| TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10); |
| assertEquals(numAdds, td.totalHits.value); |
| assertNotNull(td.scoreDocs[0]); |
| |
| if (storedField==null) { |
| storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value |
| } |
| assertEquals("Wrong field value", storedField, searcher.doc(0).get(storedField)); |
| } finally { |
| IOUtils.close(reader, runData); |
| } |
| |
| } |
| |
| /* Tests LineDocSource with a bzip2 input stream. */ |
| public void testBZip2() throws Exception { |
| Path file = getWorkDir().resolve("one-line.bz2"); |
| createBZ2LineFile(file,true); |
| doIndexAndSearchTest(file, null, null); |
| } |
| |
| public void testBZip2NoHeaderLine() throws Exception { |
| Path file = getWorkDir().resolve("one-line.bz2"); |
| createBZ2LineFile(file,false); |
| doIndexAndSearchTest(file, null, null); |
| } |
| |
| public void testRegularFile() throws Exception { |
| Path file = getWorkDir().resolve("one-line"); |
| createRegularLineFile(file,true); |
| doIndexAndSearchTest(file, null, null); |
| } |
| |
| public void testRegularFileSpecialHeader() throws Exception { |
| Path file = getWorkDir().resolve("one-line"); |
| createRegularLineFile(file,true); |
| doIndexAndSearchTest(file, HeaderLineParser.class, null); |
| } |
| |
| public void testRegularFileNoHeaderLine() throws Exception { |
| Path file = getWorkDir().resolve("one-line"); |
| createRegularLineFile(file,false); |
| doIndexAndSearchTest(file, null, null); |
| } |
| |
| public void testInvalidFormat() throws Exception { |
| String[] testCases = new String[] { |
| "", // empty line |
| "title", // just title |
| "title" + WriteLineDocTask.SEP, // title + SEP |
| "title" + WriteLineDocTask.SEP + "body", // title + SEP + body |
| // note that title + SEP + body + SEP is a valid line, which results in an |
| // empty body |
| }; |
| |
| for (int i = 0; i < testCases.length; i++) { |
| Path file = getWorkDir().resolve("one-line"); |
| BufferedWriter writer = Files.newBufferedWriter(file, StandardCharsets.UTF_8); |
| writer.write(testCases[i]); |
| writer.newLine(); |
| writer.close(); |
| expectThrows(Exception.class, () -> { |
| doIndexAndSearchTest(file, null, null); |
| }); |
| } |
| } |
| |
| /** Doc Name is not part of the default header */ |
| public void testWithDocsName() throws Exception { |
| Path file = getWorkDir().resolve("one-line"); |
| createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD); |
| doIndexAndSearchTest(file, null, DocMaker.NAME_FIELD); |
| } |
| |
| /** Use fields names that are not defined in Docmaker and so will go to Properties */ |
| public void testWithProperties() throws Exception { |
| Path file = getWorkDir().resolve("one-line"); |
| String specialField = "mySpecialField"; |
| createRegularLineFileWithMoreFields(file, specialField); |
| doIndexAndSearchTest(file, null, specialField); |
| } |
| |
| } |