/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.tasks;

import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.WeightedFragListBuilder;
import org.apache.lucene.util.ArrayUtil;

/**
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
*
* <p>Note: This task reuses the reader if it is already open. Otherwise a reader is opened at start
* and closed at the end.
*
 * <p>Takes an optional multivalued, comma-separated param string of the form:
 * type[&lt;enum&gt;],maxFrags[&lt;int&gt;],fields[name1;name2;...]
*
* <ul>
 *   <li>type - the highlighter implementation; one of NONE, SH_A, SH_V, FVH_V, UH, UH_A, UH_V,
 *       UH_P, UH_PV (e.g. "UH")
 *   <li>maxFrags - the maximum number of fragments (passages) for the highlighter to return
 *   <li>fields - the fields to highlight; if not specified, the "body" field is highlighted
* </ul>
*
* Example:
*
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) &gt; : 1000
* </pre>
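 *
 * <p>An illustrative variant using the FastVectorHighlighter (FVH_V needs term vectors with
 * positions and offsets indexed; the task name below is just an example):
 *
 * <pre>"SearchHlgtFVH" SearchTravRetHighlight(type[FVH_V],maxFrags[3],fields[body]) &gt; : 1000
 * </pre>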
*
 * <p>For this task to work, documents must be stored. Term vectors (with positions and offsets)
 * can be used as well, and offsets in postings are another option.
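 *
 * <p>Configuration properties read by this task from the .alg file (values shown are
 * illustrative):
 *
 * <pre>
 * doc.stored=true
 * highlighter=UH
 * highlighter.maxDocCharsToAnalyze=51200
 * </pre>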
*
 * <p>Other side effects: counts an additional 1 (record) for each traversed hit, 1 more for each
 * retrieved (non-null) document, and 1 for each fragment returned.
*/
public class SearchTravRetHighlightTask extends SearchTravTask {
private int maxDocCharsToAnalyze; // max leading content chars to highlight
private int maxFrags = 1; // aka passages
private Set<String> hlFields = Collections.singleton("body");
private String type;
private HLImpl hlImpl;
  private Analyzer analyzer;

public SearchTravRetHighlightTask(PerfRunData runData) {
super(runData);
  }

@Override
public void setParams(String params) {
// can't call super because super doesn't understand our params syntax
this.params = params;
// TODO consider instead using data.getConfig().get("highlighter.*")?
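    // expected param syntax (see class javadoc): type[UH],maxFrags[3],fields[name1;name2;...]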
String[] splits = params.split(",");
for (String split : splits) {
      if (split.startsWith("type[")) {
        type = split.substring("type[".length(), split.length() - 1);
      } else if (split.startsWith("maxFrags[")) {
        maxFrags =
            (int) Float.parseFloat(split.substring("maxFrags[".length(), split.length() - 1));
      } else if (split.startsWith("fields[")) {
String fieldNames = split.substring("fields[".length(), split.length() - 1);
String[] fieldSplits = fieldNames.split(";");
hlFields = new HashSet<>(Arrays.asList(fieldSplits));
}
}
  }

@Override
public void setup() throws Exception {
super.setup();
    // check to make sure the doc is being stored
PerfRunData data = getRunData();
if (data.getConfig().get("doc.stored", false) == false) {
throw new Exception("doc.stored must be set to true");
}
maxDocCharsToAnalyze =
data.getConfig()
.get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
analyzer = data.getAnalyzer();
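
    // the type[...] task param takes precedence; otherwise fall back to the "highlighter" config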
    String type = this.type;
    if (type == null) {
      type = data.getConfig().get("highlighter", null);
    }
    if (type == null) {
      throw new Exception("no highlighter type specified; set the type[...] param (try 'UH')");
    }
    switch (type) {
case "NONE":
hlImpl = new NoHLImpl();
break;
case "SH_A":
hlImpl = new StandardHLImpl(false);
break;
case "SH_V":
hlImpl = new StandardHLImpl(true);
break;
case "FVH_V":
hlImpl = new FastVectorHLImpl();
break;
case "UH":
hlImpl = new UnifiedHLImpl(null);
break;
case "UH_A":
hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS);
break;
case "UH_V":
hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS);
break;
case "UH_P":
hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS);
break;
case "UH_PV":
hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS);
break;
default:
throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
}
  }

// here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no
// retrieval of all field vals)
@Override
protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
hlImpl.withTopDocs(searcher, q, hits);
    // note: it'd be nice if we knew the total kilobytes of text across these hits so we could
    // return that; it'd be a more useful number to gauge the amount of work. But given "average"
    // document sizes and lots of queries, returning the number of docs is reasonable.
return hits.scoreDocs.length; // always return # scored docs.
  }

private interface HLImpl {
void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception;
}
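
  // written by every HLImpl so the JIT cannot eliminate the highlighting work as dead code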
private volatile int preventOptimizeAway = 0;
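
  // classic Highlighter: token streams come from stored term vectors (SH_V) or from re-analyzing
  // the stored field value (SH_A)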
private class StandardHLImpl implements HLImpl {
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
DefaultEncoder encoder = new DefaultEncoder();
Highlighter highlighter = new Highlighter(formatter, encoder, null);
boolean termVecs;
StandardHLImpl(boolean termVecs) {
highlighter.setEncoder(new DefaultEncoder());
highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
this.termVecs = termVecs;
}
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
IndexReader reader = searcher.getIndexReader();
highlighter.setFragmentScorer(new QueryScorer(q));
// highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex.
// Default here is trivial
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
Document document = reader.document(scoreDoc.doc, hlFields);
Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
for (IndexableField indexableField : document) {
TokenStream tokenStream;
if (termVecs) {
tokenStream =
TokenSources.getTokenStream(
indexableField.name(),
tvFields,
indexableField.stringValue(),
analyzer,
maxDocCharsToAnalyze);
} else {
tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
}
// will close TokenStream:
String[] fragments =
highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
preventOptimizeAway = fragments.length;
}
}
}
}
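
  // FastVectorHighlighter: requires term vectors with positions and offsets to be indexed for
  // the highlighted fields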
private class FastVectorHLImpl implements HLImpl {
int fragSize = 100;
WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder();
BoundaryScanner bs =
new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH));
ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs);
String[] preTags = {"<em>"};
String[] postTags = {"</em>"};
Encoder encoder = new DefaultEncoder(); // new SimpleHTMLEncoder();
FastVectorHighlighter highlighter =
new FastVectorHighlighter(
true, // phraseHighlight
false); // requireFieldMatch -- not pertinent to our benchmark
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
IndexReader reader = searcher.getIndexReader();
final FieldQuery fq = highlighter.getFieldQuery(q, reader);
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
for (String hlField : hlFields) {
String[] fragments =
highlighter.getBestFragments(
fq,
reader,
scoreDoc.doc,
hlField,
fragSize,
maxFrags,
fragListBuilder,
fragmentsBuilder,
preTags,
postTags,
encoder);
preventOptimizeAway = fragments.length;
}
}
}
}
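
  // sort hits by increasing docid so stored fields and term vectors are fetched in index order,
  // which tends to be cheaper than random-order access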
private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) {
ScoreDoc[] clone = new ScoreDoc[scoreDocs.length];
System.arraycopy(scoreDocs, 0, clone, 0, scoreDocs.length);
ArrayUtil.introSort(clone, (a, b) -> Integer.compare(a.doc, b.doc));
return clone;
}
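
  // UnifiedHighlighter: a non-null offsetSource forces that source; null lets the highlighter
  // choose per field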
private class UnifiedHLImpl implements HLImpl {
UnifiedHighlighter highlighter;
IndexSearcher lastSearcher;
UnifiedHighlighter.OffsetSource offsetSource; // null means auto select
String[] fields = hlFields.toArray(new String[hlFields.size()]);
int[] maxPassages;
UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) {
this.offsetSource = offsetSource;
maxPassages = new int[hlFields.size()];
Arrays.fill(maxPassages, maxFrags);
}
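
    // rebuild the highlighter only when the searcher changes; the reader is typically reused
    // across queries (see class javadoc)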
private void reset(IndexSearcher searcher) {
if (lastSearcher == searcher) {
return;
}
lastSearcher = searcher;
highlighter =
new UnifiedHighlighter(searcher, analyzer) {
@Override
protected OffsetSource getOffsetSource(String field) {
return offsetSource != null ? offsetSource : super.getOffsetSource(field);
}
};
highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH));
highlighter.setMaxLength(maxDocCharsToAnalyze);
highlighter.setHighlightPhrasesStrictly(true);
highlighter.setHandleMultiTermQuery(true);
}
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
reset(searcher);
Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
preventOptimizeAway = result.size();
}
}
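
  // no highlighting at all; just retrieves the stored highlight fields, serving as a baseline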
private class NoHLImpl implements HLImpl {
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
// just retrieve the HL fields
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
preventOptimizeAway += searcher.doc(scoreDoc.doc, hlFields).iterator().hasNext() ? 2 : 1;
}
}
}
}