| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.benchmark.byTask.tasks; |
| |
| import java.text.BreakIterator; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Set; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.benchmark.byTask.PerfRunData; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexableField; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.ScoreDoc; |
| import org.apache.lucene.search.TopDocs; |
| import org.apache.lucene.search.highlight.DefaultEncoder; |
| import org.apache.lucene.search.highlight.Encoder; |
| import org.apache.lucene.search.highlight.Highlighter; |
| import org.apache.lucene.search.highlight.QueryScorer; |
| import org.apache.lucene.search.highlight.SimpleHTMLFormatter; |
| import org.apache.lucene.search.highlight.TokenSources; |
| import org.apache.lucene.search.uhighlight.UnifiedHighlighter; |
| import org.apache.lucene.search.vectorhighlight.BoundaryScanner; |
| import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner; |
| import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; |
| import org.apache.lucene.search.vectorhighlight.FieldQuery; |
| import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder; |
| import org.apache.lucene.search.vectorhighlight.WeightedFragListBuilder; |
| import org.apache.lucene.util.ArrayUtil; |
| |
| /** |
| * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents. |
| * |
| * <p>Note: This task reuses the reader if it is already open. Otherwise a reader is opened at start |
| * and closed at the end. |
| * |
| * <p>Takes optional multivalued, comma separated param string as: |
| * type[<enum>],maxFrags[<int>],fields[name1;name2;...] |
| * |
| * <ul> |
| * <li>type - the highlighter implementation, e.g. "UH" |
| * <li>maxFrags - The maximum number of fragments to score by the highlighter |
| * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at |
| * least attempted) |
| * </ul> |
| * |
| * Example: |
| * |
| * <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) > : 1000 |
| * </pre> |
| * |
| * Documents must be stored in order for this task to work. Additionally, term vector positions can |
| * be used as well, and offsets in postings is another option. |
| * |
| * <p>Other side effects: counts additional 1 (record) for each traversed hit, and 1 more for each |
| * retrieved (non null) document and 1 for each fragment returned. |
| */ |
| public class SearchTravRetHighlightTask extends SearchTravTask { |
| private int maxDocCharsToAnalyze; // max leading content chars to highlight |
| private int maxFrags = 1; // aka passages |
| private Set<String> hlFields = Collections.singleton("body"); |
| private String type; |
| private HLImpl hlImpl; |
| private Analyzer analyzer; |
| |
| public SearchTravRetHighlightTask(PerfRunData runData) { |
| super(runData); |
| } |
| |
| @Override |
| public void setParams(String params) { |
| // can't call super because super doesn't understand our params syntax |
| this.params = params; |
| // TODO consider instead using data.getConfig().get("highlighter.*")? |
| String[] splits = params.split(","); |
| for (String split : splits) { |
| if (split.startsWith("type[") == true) { |
| type = split.substring("type[".length(), split.length() - 1); |
| } else if (split.startsWith("maxFrags[") == true) { |
| maxFrags = |
| (int) Float.parseFloat(split.substring("maxFrags[".length(), split.length() - 1)); |
| } else if (split.startsWith("fields[") == true) { |
| String fieldNames = split.substring("fields[".length(), split.length() - 1); |
| String[] fieldSplits = fieldNames.split(";"); |
| hlFields = new HashSet<>(Arrays.asList(fieldSplits)); |
| } |
| } |
| } |
| |
| @Override |
| public void setup() throws Exception { |
| super.setup(); |
| // check to make sure either the doc is being stored |
| PerfRunData data = getRunData(); |
| if (data.getConfig().get("doc.stored", false) == false) { |
| throw new Exception("doc.stored must be set to true"); |
| } |
| maxDocCharsToAnalyze = |
| data.getConfig() |
| .get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); |
| analyzer = data.getAnalyzer(); |
| String type = this.type; |
| if (type == null) { |
| type = data.getConfig().get("highlighter", null); |
| } |
| switch (type) { |
| case "NONE": |
| hlImpl = new NoHLImpl(); |
| break; |
| case "SH_A": |
| hlImpl = new StandardHLImpl(false); |
| break; |
| case "SH_V": |
| hlImpl = new StandardHLImpl(true); |
| break; |
| |
| case "FVH_V": |
| hlImpl = new FastVectorHLImpl(); |
| break; |
| |
| case "UH": |
| hlImpl = new UnifiedHLImpl(null); |
| break; |
| case "UH_A": |
| hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS); |
| break; |
| case "UH_V": |
| hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS); |
| break; |
| case "UH_P": |
| hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); |
| break; |
| case "UH_PV": |
| hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); |
| break; |
| |
| default: |
| throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')"); |
| } |
| } |
| |
| // here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no |
| // retrieval of all field vals) |
| @Override |
| protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { |
| hlImpl.withTopDocs(searcher, q, hits); |
| // note: it'd be nice if we knew the sum kilobytes of text across these hits so we could return |
| // that. It'd be a more |
| // useful number to gauge the amount of work. But given "average" document sizes and lots of |
| // queries, returning the |
| // number of docs is reasonable. |
| return hits.scoreDocs.length; // always return # scored docs. |
| } |
| |
| private interface HLImpl { |
| void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception; |
| } |
| |
| private volatile int preventOptimizeAway = 0; |
| |
| private class StandardHLImpl implements HLImpl { |
| SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>"); |
| DefaultEncoder encoder = new DefaultEncoder(); |
| Highlighter highlighter = new Highlighter(formatter, encoder, null); |
| boolean termVecs; |
| |
| StandardHLImpl(boolean termVecs) { |
| highlighter.setEncoder(new DefaultEncoder()); |
| highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); |
| this.termVecs = termVecs; |
| } |
| |
| @Override |
| public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { |
| IndexReader reader = searcher.getIndexReader(); |
| highlighter.setFragmentScorer(new QueryScorer(q)); |
| // highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. |
| // Default here is trivial |
| for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) { |
| Document document = reader.document(scoreDoc.doc, hlFields); |
| Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null; |
| for (IndexableField indexableField : document) { |
| TokenStream tokenStream; |
| if (termVecs) { |
| tokenStream = |
| TokenSources.getTokenStream( |
| indexableField.name(), |
| tvFields, |
| indexableField.stringValue(), |
| analyzer, |
| maxDocCharsToAnalyze); |
| } else { |
| tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue()); |
| } |
| // will close TokenStream: |
| String[] fragments = |
| highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags); |
| preventOptimizeAway = fragments.length; |
| } |
| } |
| } |
| } |
| |
| private class FastVectorHLImpl implements HLImpl { |
| int fragSize = 100; |
| WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder(); |
| BoundaryScanner bs = |
| new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH)); |
| ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs); |
| String[] preTags = {"<em>"}; |
| String[] postTags = {"</em>"}; |
| Encoder encoder = new DefaultEncoder(); // new SimpleHTMLEncoder(); |
| FastVectorHighlighter highlighter = |
| new FastVectorHighlighter( |
| true, // phraseHighlight |
| false); // requireFieldMatch -- not pertinent to our benchmark |
| |
| @Override |
| public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { |
| IndexReader reader = searcher.getIndexReader(); |
| final FieldQuery fq = highlighter.getFieldQuery(q, reader); |
| for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) { |
| for (String hlField : hlFields) { |
| String[] fragments = |
| highlighter.getBestFragments( |
| fq, |
| reader, |
| scoreDoc.doc, |
| hlField, |
| fragSize, |
| maxFrags, |
| fragListBuilder, |
| fragmentsBuilder, |
| preTags, |
| postTags, |
| encoder); |
| preventOptimizeAway = fragments.length; |
| } |
| } |
| } |
| } |
| |
| private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) { |
| ScoreDoc[] clone = new ScoreDoc[scoreDocs.length]; |
| System.arraycopy(scoreDocs, 0, clone, 0, scoreDocs.length); |
| ArrayUtil.introSort(clone, (a, b) -> Integer.compare(a.doc, b.doc)); |
| return clone; |
| } |
| |
| private class UnifiedHLImpl implements HLImpl { |
| UnifiedHighlighter highlighter; |
| IndexSearcher lastSearcher; |
| UnifiedHighlighter.OffsetSource offsetSource; // null means auto select |
| String[] fields = hlFields.toArray(new String[hlFields.size()]); |
| int[] maxPassages; |
| |
| UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) { |
| this.offsetSource = offsetSource; |
| maxPassages = new int[hlFields.size()]; |
| Arrays.fill(maxPassages, maxFrags); |
| } |
| |
| private void reset(IndexSearcher searcher) { |
| if (lastSearcher == searcher) { |
| return; |
| } |
| lastSearcher = searcher; |
| highlighter = |
| new UnifiedHighlighter(searcher, analyzer) { |
| @Override |
| protected OffsetSource getOffsetSource(String field) { |
| return offsetSource != null ? offsetSource : super.getOffsetSource(field); |
| } |
| }; |
| highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH)); |
| highlighter.setMaxLength(maxDocCharsToAnalyze); |
| highlighter.setHighlightPhrasesStrictly(true); |
| highlighter.setHandleMultiTermQuery(true); |
| } |
| |
| @Override |
| public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { |
| reset(searcher); |
| Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages); |
| preventOptimizeAway = result.size(); |
| } |
| } |
| |
| private class NoHLImpl implements HLImpl { |
| |
| @Override |
| public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { |
| // just retrieve the HL fields |
| for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) { |
| preventOptimizeAway += searcher.doc(scoreDoc.doc, hlFields).iterator().hasNext() ? 2 : 1; |
| } |
| } |
| } |
| } |