| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.highlight; |
| |
| import java.io.IOException; |
| import java.lang.invoke.MethodHandles; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.ListIterator; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.CachingTokenFilter; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.FilterLeafReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexableField; |
| import org.apache.lucene.index.LeafReader; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.highlight.Encoder; |
| import org.apache.lucene.search.highlight.Formatter; |
| import org.apache.lucene.search.highlight.Fragmenter; |
| import org.apache.lucene.search.highlight.Highlighter; |
| import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; |
| import org.apache.lucene.search.highlight.OffsetLimitTokenFilter; |
| import org.apache.lucene.search.highlight.QueryScorer; |
| import org.apache.lucene.search.highlight.QueryTermScorer; |
| import org.apache.lucene.search.highlight.Scorer; |
| import org.apache.lucene.search.highlight.TextFragment; |
| import org.apache.lucene.search.highlight.TokenSources; |
| import org.apache.lucene.search.highlight.WeightedSpanTerm; |
| import org.apache.lucene.search.highlight.WeightedSpanTermExtractor; |
| import org.apache.lucene.search.join.ToChildBlockJoinQuery; |
| import org.apache.lucene.search.join.ToParentBlockJoinQuery; |
| import org.apache.lucene.search.vectorhighlight.BoundaryScanner; |
| import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; |
| import org.apache.lucene.search.vectorhighlight.FieldQuery; |
| import org.apache.lucene.search.vectorhighlight.FragListBuilder; |
| import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; |
| import org.apache.lucene.util.AttributeSource.State; |
| import org.apache.solr.common.SolrDocument; |
| import org.apache.solr.common.SolrException; |
| import org.apache.solr.common.params.HighlightParams; |
| import org.apache.solr.common.params.MapSolrParams; |
| import org.apache.solr.common.params.SolrParams; |
| import org.apache.solr.common.util.NamedList; |
| import org.apache.solr.common.util.SimpleOrderedMap; |
| import org.apache.solr.core.PluginInfo; |
| import org.apache.solr.core.SolrCore; |
| import org.apache.solr.handler.component.HighlightComponent; |
| import org.apache.solr.request.SolrQueryRequest; |
| import org.apache.solr.schema.FieldType; |
| import org.apache.solr.schema.IndexSchema; |
| import org.apache.solr.schema.SchemaField; |
| import org.apache.solr.search.DocIterator; |
| import org.apache.solr.search.DocList; |
| import org.apache.solr.search.SolrIndexSearcher; |
| import org.apache.solr.search.SolrReturnFields; |
| import org.apache.solr.util.plugin.PluginInfoInitialized; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * |
| * @since solr 1.3 |
| */ |
| public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized { |
| |
| /** |
| * This constant was formerly part of HighlightParams. After deprecation it was removed so clients |
| * would no longer use it, but we still support it server side. |
| */ |
| private static final String USE_FVH = HighlightParams.HIGHLIGHT + ".useFastVectorHighlighter"; |
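
  // Hypothetical legacy request exercising this parameter (the query and field names are made up
  // for illustration):
  //   /select?q=text:solr&hl=true&hl.fl=text&hl.useFastVectorHighlighter=true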
| |
| private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| protected final SolrCore solrCore; |
| |
| //Will be invoked via reflection |
| public DefaultSolrHighlighter(SolrCore solrCore) { |
| this.solrCore = solrCore; |
| } |
| |
  // The registries below are populated once in init() (called during core initialization,
  // before requests are served) and are effectively read-only afterwards, which is what
  // makes sharing them across threads safe.

  protected final Map<String, SolrFormatter> formatters = new HashMap<>();

  protected final Map<String, SolrEncoder> encoders = new HashMap<>();

  protected final Map<String, SolrFragmenter> fragmenters = new HashMap<>();

  protected final Map<String, SolrFragListBuilder> fragListBuilders = new HashMap<>();

  protected final Map<String, SolrFragmentsBuilder> fragmentsBuilders = new HashMap<>();

  protected final Map<String, SolrBoundaryScanner> boundaryScanners = new HashMap<>();
| |
| @Override |
| public void init(PluginInfo info) { |
| formatters.clear(); |
| encoders.clear(); |
| fragmenters.clear(); |
| fragListBuilders.clear(); |
| fragmentsBuilders.clear(); |
| boundaryScanners.clear(); |
| |
| // Load the fragmenters |
| SolrFragmenter frag = solrCore.initPlugins(info.getChildren("fragmenter"), fragmenters, SolrFragmenter.class, null); |
| if (frag == null) { |
| frag = new GapFragmenter(); |
| solrCore.initDefaultPlugin(frag, SolrFragmenter.class); |
| } |
| fragmenters.put("", frag); |
| fragmenters.put(null, frag); |
| |
| // Load the formatters |
| SolrFormatter fmt = solrCore.initPlugins(info.getChildren("formatter"), formatters, SolrFormatter.class, null); |
| if (fmt == null) { |
| fmt = new HtmlFormatter(); |
| solrCore.initDefaultPlugin(fmt, SolrFormatter.class); |
| } |
| formatters.put("", fmt); |
| formatters.put(null, fmt); |
| |
| // Load the encoders |
| SolrEncoder enc = solrCore.initPlugins(info.getChildren("encoder"), encoders, SolrEncoder.class, null); |
| if (enc == null) { |
| enc = new DefaultEncoder(); |
| solrCore.initDefaultPlugin(enc, SolrEncoder.class); |
| } |
| encoders.put("", enc); |
| encoders.put(null, enc); |
| |
| // Load the FragListBuilders |
| SolrFragListBuilder fragListBuilder = solrCore.initPlugins(info.getChildren("fragListBuilder"), |
| fragListBuilders, SolrFragListBuilder.class, null); |
| if (fragListBuilder == null) { |
| fragListBuilder = new SimpleFragListBuilder(); |
| solrCore.initDefaultPlugin(fragListBuilder, SolrFragListBuilder.class); |
| } |
| fragListBuilders.put("", fragListBuilder); |
| fragListBuilders.put(null, fragListBuilder); |
| |
| // Load the FragmentsBuilders |
| SolrFragmentsBuilder fragsBuilder = solrCore.initPlugins(info.getChildren("fragmentsBuilder"), |
| fragmentsBuilders, SolrFragmentsBuilder.class, null); |
| if (fragsBuilder == null) { |
| fragsBuilder = new ScoreOrderFragmentsBuilder(); |
| solrCore.initDefaultPlugin(fragsBuilder, SolrFragmentsBuilder.class); |
| } |
| fragmentsBuilders.put("", fragsBuilder); |
| fragmentsBuilders.put(null, fragsBuilder); |
| |
| // Load the BoundaryScanners |
| SolrBoundaryScanner boundaryScanner = solrCore.initPlugins(info.getChildren("boundaryScanner"), |
| boundaryScanners, SolrBoundaryScanner.class, null); |
| if (boundaryScanner == null) { |
| boundaryScanner = new SimpleBoundaryScanner(); |
| solrCore.initDefaultPlugin(boundaryScanner, SolrBoundaryScanner.class); |
| } |
| boundaryScanners.put("", boundaryScanner); |
| boundaryScanners.put(null, boundaryScanner); |
| |
| } |
| |
| /** |
| * Return a phrase {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this field. |
| * |
| * @param query The current Query |
| * @param fieldName The name of the field |
| * @param request The current SolrQueryRequest |
| * @param tokenStream document text tokenStream that implements reset() efficiently (e.g. CachingTokenFilter). |
| * If it's used, call reset() first. |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request, TokenStream tokenStream) throws IOException { |
| SolrParams params = request.getParams(); |
| Highlighter highlighter = new Highlighter( |
| getFormatter(fieldName, params), |
| getEncoder(fieldName, params), |
| getSpanQueryScorer(query, fieldName, tokenStream, request)); |
| |
| highlighter.setTextFragmenter(getFragmenter(fieldName, params)); |
| |
| return highlighter; |
| } |
| |
| /** |
| * Return a {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this field. |
| * |
| * @param query The current Query |
| * @param fieldName The name of the field |
| * @param request The current SolrQueryRequest |
| */ |
| protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) { |
| SolrParams params = request.getParams(); |
| Highlighter highlighter = new Highlighter( |
| getFormatter(fieldName, params), |
| getEncoder(fieldName, params), |
| getQueryScorer(query, fieldName, request)); |
| highlighter.setTextFragmenter(getFragmenter(fieldName, params)); |
| return highlighter; |
| } |
| |
| /** |
| * Return a {@link org.apache.lucene.search.highlight.QueryScorer} suitable for this Query and field. |
| * |
| * @param query The current query |
| * @param tokenStream document text tokenStream that implements reset() efficiently (e.g. CachingTokenFilter). |
| * If it's used, call reset() first. |
| * @param fieldName The name of the field |
| * @param request The SolrQueryRequest |
| */ |
| protected QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) { |
| QueryScorer scorer = new QueryScorer(query, |
| request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null) { |
| @Override |
| protected WeightedSpanTermExtractor newTermExtractor(String defaultField) { |
| return new CustomSpanTermExtractor(defaultField); |
| } |
| }; |
| scorer.setExpandMultiTermQuery(request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true)); |
| |
    boolean defaultPayloads = true; // may be overwritten below
| try { |
| // It'd be nice to know if payloads are on the tokenStream but the presence of the attribute isn't a good |
| // indicator. |
| final Terms terms = request.getSearcher().getSlowAtomicReader().terms(fieldName); |
| if (terms != null) { |
| defaultPayloads = terms.hasPayloads(); |
| } |
| } catch (IOException e) { |
| log.error("Couldn't check for existence of payloads", e); |
| } |
| scorer.setUsePayloads(request.getParams().getFieldBool(fieldName, HighlightParams.PAYLOADS, defaultPayloads)); |
| return scorer; |
| } |
| |
| private static class CustomSpanTermExtractor extends WeightedSpanTermExtractor { |
| public CustomSpanTermExtractor(String defaultField) { |
| super(defaultField); |
| } |
| |
| @Override |
| protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms) throws IOException { |
| // these queries are not supported in lucene highlighting out of the box since 8.0 |
| if (query instanceof ToParentBlockJoinQuery) { |
| extract(((ToParentBlockJoinQuery) query).getChildQuery(), boost, terms); |
| } else if (query instanceof ToChildBlockJoinQuery) { |
| extract(((ToChildBlockJoinQuery) query).getParentQuery(), boost, terms); |
| } else { |
| super.extract(query, boost, terms); |
| } |
| } |
| } |
| |
| /** |
| * Return a {@link org.apache.lucene.search.highlight.Scorer} suitable for this Query and field. |
| * |
| * @param query The current query |
| * @param fieldName The name of the field |
| * @param request The SolrQueryRequest |
| */ |
| protected Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) { |
| boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false); |
| if (reqFieldMatch) { |
| return new QueryTermScorer(query, request.getSearcher().getIndexReader(), fieldName); |
| } else { |
| return new QueryTermScorer(query); |
| } |
| } |
| |
| /** |
| * Return the max number of snippets for this field. If this has not |
| * been configured for this field, fall back to the configured default |
| * or the solr default. |
| * |
| * @param fieldName The name of the field |
| * @param params The params controlling Highlighting |
| */ |
| protected int getMaxSnippets(String fieldName, SolrParams params) { |
| return params.getFieldInt(fieldName, HighlightParams.SNIPPETS, 1); |
| } |
| |
| /** |
| * Return whether adjacent fragments should be merged. |
| * |
| * @param fieldName The name of the field |
| * @param params The params controlling Highlighting |
| */ |
| protected boolean isMergeContiguousFragments(String fieldName, SolrParams params) { |
| return params.getFieldBool(fieldName, HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, false); |
| } |
| |
| /** |
| * Return a {@link org.apache.lucene.search.highlight.Formatter} appropriate for this field. If a formatter |
| * has not been configured for this field, fall back to the configured |
| * default or the solr default ({@link org.apache.lucene.search.highlight.SimpleHTMLFormatter}). |
| * |
| * @param fieldName The name of the field |
| * @param params The params controlling Highlighting |
| * @return An appropriate {@link org.apache.lucene.search.highlight.Formatter}. |
| */ |
| protected Formatter getFormatter(String fieldName, SolrParams params) { |
| String str = params.getFieldParam(fieldName, HighlightParams.FORMATTER); |
| SolrFormatter formatter = formatters.get(str); |
| if (formatter == null) { |
| throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: " + str); |
| } |
| return formatter.getFormatter(fieldName, params); |
| } |
| |
| /** |
| * Return an {@link org.apache.lucene.search.highlight.Encoder} appropriate for this field. If an encoder |
| * has not been configured for this field, fall back to the configured |
| * default or the solr default ({@link org.apache.lucene.search.highlight.DefaultEncoder}). |
| * |
| * @param fieldName The name of the field |
| * @param params The params controlling Highlighting |
| * @return An appropriate {@link org.apache.lucene.search.highlight.Encoder}. |
| */ |
| protected Encoder getEncoder(String fieldName, SolrParams params) { |
| String str = params.getFieldParam(fieldName, HighlightParams.ENCODER); |
| SolrEncoder encoder = encoders.get(str); |
| if (encoder == null) { |
| throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown encoder: " + str); |
| } |
| return encoder.getEncoder(fieldName, params); |
| } |
| |
| /** |
| * Return a {@link org.apache.lucene.search.highlight.Fragmenter} appropriate for this field. If a fragmenter |
| * has not been configured for this field, fall back to the configured |
| * default or the solr default ({@link GapFragmenter}). |
| * |
| * @param fieldName The name of the field |
| * @param params The params controlling Highlighting |
| * @return An appropriate {@link org.apache.lucene.search.highlight.Fragmenter}. |
| */ |
| protected Fragmenter getFragmenter(String fieldName, SolrParams params) { |
| String fmt = params.getFieldParam(fieldName, HighlightParams.FRAGMENTER); |
| SolrFragmenter frag = fragmenters.get(fmt); |
| if (frag == null) { |
| throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: " + fmt); |
| } |
| return frag.getFragmenter(fieldName, params); |
| } |
| |
| protected FragListBuilder getFragListBuilder(String fieldName, SolrParams params) { |
| String flb = params.getFieldParam(fieldName, HighlightParams.FRAG_LIST_BUILDER); |
| SolrFragListBuilder solrFlb = fragListBuilders.get(flb); |
| if (solrFlb == null) { |
| throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown fragListBuilder: " + flb); |
| } |
| return solrFlb.getFragListBuilder(params); |
| } |
| |
| protected FragmentsBuilder getFragmentsBuilder(String fieldName, SolrParams params) { |
| BoundaryScanner bs = getBoundaryScanner(fieldName, params); |
| return getSolrFragmentsBuilder(fieldName, params).getFragmentsBuilder(params, bs); |
| } |
| |
| protected SolrFragmentsBuilder getSolrFragmentsBuilder(String fieldName, SolrParams params) { |
| String fb = params.getFieldParam(fieldName, HighlightParams.FRAGMENTS_BUILDER); |
| SolrFragmentsBuilder solrFb = fragmentsBuilders.get(fb); |
| if (solrFb == null) { |
| throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmentsBuilder: " + fb); |
| } |
| return solrFb; |
| } |
| |
| protected BoundaryScanner getBoundaryScanner(String fieldName, SolrParams params) { |
| String bs = params.getFieldParam(fieldName, HighlightParams.BOUNDARY_SCANNER); |
| SolrBoundaryScanner solrBs = boundaryScanners.get(bs); |
| if (solrBs == null) { |
| throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown boundaryScanner: " + bs); |
| } |
| return solrBs.getBoundaryScanner(fieldName, params); |
| } |
| |
| /** |
| * Generates a list of Highlighted query fragments for each item in a list |
| * of documents, or returns null if highlighting is disabled. |
| * |
| * @param docs query results |
| * @param query the query |
| * @param req the current request |
| * @param defaultFields default list of fields to summarize |
| * @return NamedList containing a NamedList for each document, which in |
| * turns contains sets (field, summary) pairs. |
| */ |
| @Override |
| @SuppressWarnings("unchecked") |
| public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException { |
| SolrParams params = req.getParams(); |
    if (!isHighlightingEnabled(params)) // note: we also return early below if there is no unique key field
| return null; |
| |
| boolean rewrite = query != null && !(Boolean.valueOf(params.get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true")) && |
| Boolean.valueOf(params.get(HighlightParams.HIGHLIGHT_MULTI_TERM, "true"))); |
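    // i.e. rewrite the query up front (expanding multi-term queries into concrete terms) only when
    // we won't use the phrase highlighter with multi-term expansion, since that path performs its
    // own expansion via QueryScorer.setExpandMultiTermQuery (see getSpanQueryScorer).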
| |
| if (rewrite) { |
| query = query.rewrite(req.getSearcher().getIndexReader()); |
| } |
| |
| SolrIndexSearcher searcher = req.getSearcher(); |
| IndexSchema schema = searcher.getSchema(); |
| |
| // fetch unique key if one exists. |
| SchemaField keyField = schema.getUniqueKeyField(); |
| if (keyField == null) { |
| return null;//exit early; we need a unique key field to populate the response |
| } |
| |
| String[] fieldNames = getHighlightFields(query, req, defaultFields); |
| |
| Set<String> preFetchFieldNames = getDocPrefetchFieldNames(fieldNames, req); |
| SolrReturnFields returnFields; |
| if (preFetchFieldNames != null) { |
| preFetchFieldNames.add(keyField.getName()); |
| returnFields = new SolrReturnFields(preFetchFieldNames.toArray(new String[0]), req); |
| } else { |
| returnFields = new SolrReturnFields(new String[0], req); |
| } |
| FvhContainer fvhContainer = new FvhContainer(null, null); // Lazy container for fvh and fieldQuery |
| |
| IndexReader reader = new TermVectorReusingLeafReader(req.getSearcher().getSlowAtomicReader()); // SOLR-5855 |
| |
| // Highlight each document |
| @SuppressWarnings({"rawtypes"}) |
| NamedList fragments = new SimpleOrderedMap(); |
| DocIterator iterator = docs.iterator(); |
| for (int i = 0; i < docs.size(); i++) { |
| int docId = iterator.nextDoc(); |
| SolrDocument doc = searcher.getDocFetcher().solrDoc(docId, returnFields); |
| |
| @SuppressWarnings("rawtypes") |
| NamedList docHighlights = new SimpleOrderedMap(); |
| // Highlight per-field |
| for (String fieldName : fieldNames) { |
| SchemaField schemaField = schema.getFieldOrNull(fieldName); |
| |
| Object fieldHighlights; // object type allows flexibility for subclassers |
| fieldHighlights = doHighlightingOfField(doc, docId, schemaField, fvhContainer, query, reader, req, params); |
| |
| if (fieldHighlights == null) { |
| fieldHighlights = alternateField(doc, docId, fieldName, fvhContainer, query, reader, req); |
| } |
| |
| if (fieldHighlights != null) { |
| docHighlights.add(fieldName, fieldHighlights); |
| } |
| } // for each field |
| fragments.add(schema.printableUniqueKey(doc), docHighlights); |
| } // for each doc |
| return fragments; |
| } |
| |
| protected Object doHighlightingOfField(SolrDocument doc, int docId, SchemaField schemaField, |
| FvhContainer fvhContainer, Query query, IndexReader reader, SolrQueryRequest req, |
| SolrParams params) throws IOException { |
| Object fieldHighlights; |
| if (schemaField == null) { |
| fieldHighlights = null; |
| } else if (schemaField.getType() instanceof org.apache.solr.schema.TrieField) { |
| // TODO: highlighting numeric fields is broken (Lucene) - so we disable them until fixed (see LUCENE-3080)! |
| fieldHighlights = null; |
| } else if (useFastVectorHighlighter(params, schemaField)) { |
| if (fvhContainer.fieldQuery == null) { |
| FastVectorHighlighter fvh = new FastVectorHighlighter( |
            // FVH cannot process the hl.usePhraseHighlighter parameter on a per-field basis
| params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true), |
            // FVH cannot process the hl.requireFieldMatch parameter on a per-field basis
| params.getBool(HighlightParams.FIELD_MATCH, false)) { |
| @Override |
| public FieldQuery getFieldQuery(Query query, IndexReader reader) throws IOException { |
| return new FieldQuery(query, reader, phraseHighlight, fieldMatch) { |
| @Override |
| protected void flatten(Query sourceQuery, IndexReader reader, Collection<Query> flatQueries, float boost) throws IOException { |
| if (sourceQuery instanceof ToParentBlockJoinQuery) { |
| Query childQuery = ((ToParentBlockJoinQuery) sourceQuery).getChildQuery(); |
| if (childQuery != null) { |
| flatten(childQuery, reader, flatQueries, boost); |
| } |
| } else { |
| super.flatten(sourceQuery, reader, flatQueries, boost); |
| } |
| } |
| }; |
| } |
| }; |
| fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, SolrHighlighter.DEFAULT_PHRASE_LIMIT)); |
| fvhContainer.fvh = fvh; |
| fvhContainer.fieldQuery = fvh.getFieldQuery(query, reader); |
| } |
| fieldHighlights = |
| doHighlightingByFastVectorHighlighter(doc, docId, schemaField, fvhContainer, reader, req); |
| } else { // standard/default highlighter |
| fieldHighlights = doHighlightingByHighlighter(doc, docId, schemaField, query, reader, req); |
| } |
| return fieldHighlights; |
| } |
| |
| /** |
| * Returns the field names to be passed to {@link org.apache.solr.search.SolrDocumentFetcher#solrDoc(int, SolrReturnFields)}. |
| * Subclasses might over-ride to include fields in search-results and other stored field values needed so as to avoid |
| * the possibility of extra trips to disk. The uniqueKey will be added after if the result isn't null. |
| */ |
| protected Set<String> getDocPrefetchFieldNames(String[] hlFieldNames, SolrQueryRequest req) { |
    Set<String> preFetchFieldNames = new HashSet<>(hlFieldNames.length + 1); // +1 for the uniqueKey added afterwards
| Collections.addAll(preFetchFieldNames, hlFieldNames); |
| for (String hlFieldName : hlFieldNames) { |
| String alternateField = req.getParams().getFieldParam(hlFieldName, HighlightParams.ALTERNATE_FIELD); |
| if (alternateField != null) { |
| preFetchFieldNames.add(alternateField); |
| } |
| } |
| return preFetchFieldNames; |
| } |
| |
| /** |
| * Determines if we should use the FastVectorHighlighter for this field. |
| */ |
| protected boolean useFastVectorHighlighter(SolrParams params, SchemaField schemaField) { |
| boolean methodFvh = |
| HighlightComponent.HighlightMethod.FAST_VECTOR.getMethodName().equals( |
| params.getFieldParam(schemaField.getName(), HighlightParams.METHOD)) |
| || params.getFieldBool(schemaField.getName(), USE_FVH, false); |
| if (!methodFvh) return false; |
| boolean termPosOff = schemaField.storeTermPositions() && schemaField.storeTermOffsets(); |
| if (!termPosOff) { |
| log.warn("Solr will use the standard Highlighter instead of FastVectorHighlighter because the {} field {}" |
| , "does not store TermVectors with TermPositions and TermOffsets.", schemaField.getName()); |
| } |
| return termPosOff; |
| } |
| |
| /** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */ |
| @SuppressWarnings("unchecked") |
| protected Object doHighlightingByFastVectorHighlighter(SolrDocument doc, int docId, |
| SchemaField schemaField, FvhContainer fvhContainer, |
| IndexReader reader, SolrQueryRequest req) throws IOException { |
| SolrParams params = req.getParams(); |
| String fieldName = schemaField.getName(); |
| SolrFragmentsBuilder solrFb = getSolrFragmentsBuilder(fieldName, params); |
| |
| String[] snippets = fvhContainer.fvh.getBestFragments(fvhContainer.fieldQuery, reader, docId, fieldName, |
| params.getFieldInt(fieldName, HighlightParams.FRAGSIZE, 100), |
| params.getFieldInt(fieldName, HighlightParams.SNIPPETS, 1), |
| getFragListBuilder(fieldName, params), |
| getFragmentsBuilder(fieldName, params), |
| solrFb.getPreTags(params, fieldName), |
| solrFb.getPostTags(params, fieldName), |
| getEncoder(fieldName, params)); |
| if (snippets != null && snippets.length > 0) |
| return snippets; |
| return null; |
| } |
| |
| /** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */ |
| @SuppressWarnings("unchecked") |
| protected Object doHighlightingByHighlighter(SolrDocument doc, int docId, SchemaField schemaField, Query query, |
| IndexReader reader, SolrQueryRequest req) throws IOException { |
| final SolrParams params = req.getParams(); |
| final String fieldName = schemaField.getName(); |
| |
| final int mvToExamine = |
| params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, |
| (schemaField.multiValued()) ? Integer.MAX_VALUE : 1); |
| |
| // Technically this is the max *fragments* (snippets), not max values: |
| int mvToMatch = |
| params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE); |
| if (mvToExamine <= 0 || mvToMatch <= 0) { |
| return null; |
| } |
| |
| int maxCharsToAnalyze = params.getFieldInt(fieldName, |
| HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS); |
| if (maxCharsToAnalyze < 0) {//e.g. -1 |
| maxCharsToAnalyze = Integer.MAX_VALUE; |
| } |
| |
| List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req); |
| if (fieldValues.isEmpty()) { |
| return null; |
| } |
| |
| // preserve order of values in a multiValued list |
| boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false); |
| |
| int numFragments = getMaxSnippets(fieldName, params); |
| boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); |
| |
| List<TextFragment> frags = new ArrayList<>(); |
| |
    // Try term vectors first, which are faster.
| // note: offsets are minimally sufficient for this HL. |
| final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null; |
| final TokenStream tvStream = |
| TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1); |
| // We need to wrap in OffsetWindowTokenFilter if multi-valued |
| try (OffsetWindowTokenFilter tvWindowStream = (tvStream != null && fieldValues.size() > 1) ? new OffsetWindowTokenFilter(tvStream) : null) { |
| |
| for (String thisText : fieldValues) { |
| if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) { |
| break; |
| } |
| |
| TokenStream tstream; |
| if (tvWindowStream != null) { |
| // if we have a multi-valued field with term vectors, then get the next offset window |
| tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length()); |
| } else if (tvStream != null) { |
| tstream = tvStream; // single-valued with term vectors |
| } else { |
| // fall back to analyzer |
| tstream = createAnalyzerTStream(schemaField, thisText); |
| } |
| |
| Highlighter highlighter; |
| if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) { |
| // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream |
| // needs to implement reset() efficiently. |
| |
          // If the tokenStream comes straight from the term vectors, then CachingTokenFilter is
          // unnecessary. It should be okay that the OffsetLimit won't be applied in that case.
| final TokenStream tempTokenStream; |
| if (tstream != tvStream) { |
| if (maxCharsToAnalyze >= thisText.length()) { |
| tempTokenStream = new CachingTokenFilter(tstream); |
| } else { |
| tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); |
| } |
| } else { |
| tempTokenStream = tstream; |
| } |
| |
| // get highlighter |
| highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream); |
| |
| // if the CachingTokenFilter was consumed then use it going forward. |
| if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) { |
| tstream = tempTokenStream; |
| } |
| //tstream.reset(); not needed; getBestTextFragments will reset it. |
| } else { |
| // use "the old way" |
| highlighter = getHighlighter(query, fieldName, req); |
| } |
| |
| highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); |
| maxCharsToAnalyze -= thisText.length(); |
| |
| // Highlight! |
| try { |
| TextFragment[] bestTextFragments = |
| highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments); |
| for (TextFragment bestTextFragment : bestTextFragments) { |
| if (bestTextFragment == null)//can happen via mergeContiguousFragments |
| continue; |
| // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless. |
| if (bestTextFragment.getScore() > 0 || preserveMulti) { |
| frags.add(bestTextFragment); |
| if (bestTextFragment.getScore() > 0) |
| --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values |
| } |
| } |
| } catch (InvalidTokenOffsetsException e) { |
| throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); |
| } |
| }//end field value loop |
| } |
| |
| // Put the fragments onto the Solr response (docSummaries) |
| if (frags.size() > 0) { |
| // sort such that the fragments with the highest score come first |
| if (!preserveMulti) { |
| Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore())); |
| } |
| |
| // Truncate list to hl.snippets, but not when hl.preserveMulti |
| if (frags.size() > numFragments && !preserveMulti) { |
| frags = frags.subList(0, numFragments); |
| } |
| return getResponseForFragments(frags, req); |
| } |
| return null;//no highlights for this field |
| } |
| |
| /** |
| * Fetches field values to highlight. If the field value should come from an atypical place (or another aliased |
| * field name, then a subclass could override to implement that. |
| */ |
| protected List<String> getFieldValues(SolrDocument doc, String fieldName, int maxValues, int maxCharsToAnalyze, |
| SolrQueryRequest req) { |
| // Collect the Fields we will examine (could be more than one if multi-valued) |
| Collection<Object> fieldValues = doc.getFieldValues(fieldName); |
| if (fieldValues == null) { |
| return Collections.emptyList(); |
| } |
| FieldType fieldType = req.getSchema().getFieldType(fieldName); |
| List<String> result = new ArrayList<>(); |
| for (Object value : fieldValues) { |
| String strValue; |
| if (value instanceof IndexableField) { |
| strValue = fieldType.toExternal((IndexableField) value); |
| } else { |
| strValue = value.toString(); // TODO FieldType needs an API for this, e.g. toExternalFromDv() |
| } |
| result.add(strValue); |
| |
| maxCharsToAnalyze -= strValue.length();//we exit early if we'll never get to analyze the value |
| maxValues--; |
| if (maxValues <= 0 || maxCharsToAnalyze <= 0) { |
| break; |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Given the fragments, return the result to be put in the field {@link NamedList}. This is an extension |
| * point to allow adding other metadata like the absolute offsets or scores. |
| */ |
| protected Object getResponseForFragments(List<TextFragment> frags, SolrQueryRequest req) { |
| // TODO: we can include score and position information in output as snippet attributes |
| ArrayList<String> fragTexts = new ArrayList<>(); |
| for (TextFragment fragment : frags) { |
| fragTexts.add(fragment.toString()); |
| } |
| return fragTexts.toArray(new String[fragTexts.size()]); |
| } |
| |
| /** Returns the alternate highlight object for this field -- a String[] by default. Null if none. */ |
| @SuppressWarnings("unchecked") |
| protected Object alternateField(SolrDocument doc, int docId, String fieldName, FvhContainer fvhContainer, Query query, |
| IndexReader reader, SolrQueryRequest req) throws IOException { |
| IndexSchema schema = req.getSearcher().getSchema(); |
| SolrParams params = req.getParams(); |
| String alternateField = params.getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD); |
| int alternateFieldLen = params.getFieldInt(fieldName, HighlightParams.ALTERNATE_FIELD_LENGTH, 0); |
| if (alternateField == null || alternateField.length() == 0) { |
| return null; |
| } |
| |
| if (params.getFieldBool(fieldName, HighlightParams.HIGHLIGHT_ALTERNATE, true) && !alternateField.equals(fieldName)) { |
| // Try to highlight alternate field |
| Object fieldHighlights = null; |
| SchemaField schemaField = schema.getFieldOrNull(alternateField); |
| if (schemaField != null) { |
| HashMap<String, String> invariants = new HashMap<>(); |
| invariants.put("f." + alternateField + "." + HighlightParams.SNIPPETS, "1"); |
        // Enforce the alternateFieldLength limit via FRAGSIZE; a minimum of 18 is required due to FVH limitations
| invariants.put("f." + alternateField + "." + HighlightParams.FRAGSIZE, |
| alternateFieldLen > 0 ? String.valueOf(Math.max(18, alternateFieldLen)) : String.valueOf(Integer.MAX_VALUE)); |
| SolrParams origParams = req.getParams(); |
| req.setParams(SolrParams.wrapDefaults(new MapSolrParams(invariants), origParams)); |
| fieldHighlights = doHighlightingOfField(doc, docId, schemaField, fvhContainer, query, reader, req, params); |
| req.setParams(origParams); |
| if (fieldHighlights != null) { |
| return fieldHighlights; |
| } |
| } |
| } |
| |
| |
| // Fallback to static non-highlighted |
| List<String> listFields = getFieldValues(doc, alternateField, Integer.MAX_VALUE, Integer.MAX_VALUE, req); |
| if (listFields.isEmpty()) { |
| // The alternate field did not exist, treat the original field as fallback instead |
| listFields = getFieldValues(doc, fieldName, Integer.MAX_VALUE, Integer.MAX_VALUE, req); |
| if (listFields.isEmpty()) { |
| return null; |
| } |
| } |
| |
| String[] altTexts = listFields.toArray(new String[listFields.size()]); |
| |
| Encoder encoder = getEncoder(fieldName, params); |
| List<String> altList = new ArrayList<>(); |
| int len = 0; |
| for (String altText : altTexts) { |
| if (alternateFieldLen <= 0) { |
| altList.add(encoder.encodeText(altText)); |
| } else { |
| altList.add(len + altText.length() > alternateFieldLen ? |
| encoder.encodeText(altText.substring(0, alternateFieldLen - len)) : |
| encoder.encodeText(altText)); |
| len += altText.length(); |
| if (len >= alternateFieldLen) break; |
| } |
| } |
| return altList; |
| } |
| |
| protected TokenStream createAnalyzerTStream(SchemaField schemaField, String docText) throws IOException { |
| final TokenStream tStream = schemaField.getType().getIndexAnalyzer().tokenStream(schemaField.getName(), docText); |
| return new TokenOrderingFilter(tStream, 10); |
| } |
| |
| // Wraps FVH to allow pass-by-reference. Public access to allow use in 3rd party subclasses |
| public static class FvhContainer { |
| FastVectorHighlighter fvh; |
| FieldQuery fieldQuery; |
| |
| public FvhContainer(FastVectorHighlighter fvh, FieldQuery fieldQuery) { |
| this.fvh = fvh; |
| this.fieldQuery = fieldQuery; |
| } |
| } |
| |
| |
| /** |
| * Orders Tokens in a window first by their startOffset ascending. |
| * endOffset is currently ignored. |
| * This is meant to work around fickleness in the highlighter only. It |
| * can mess up token positions and should not be used for indexing or querying. |
| */ |
| static final class TokenOrderingFilter extends TokenFilter { |
| private final int windowSize; |
| private final LinkedList<OrderedToken> queue = new LinkedList<>(); //TODO replace with Deque, Array impl |
| private boolean done = false; |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
| protected TokenOrderingFilter(TokenStream input, int windowSize) { |
| super(input); |
| this.windowSize = windowSize; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| queue.clear(); |
| done = false; |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| while (!done && queue.size() < windowSize) { |
| if (!input.incrementToken()) { |
| done = true; |
| break; |
| } |
| |
        // Iterate in reverse for efficiency: the queue is already sorted, and most incoming
        // token start offsets will arrive in ascending order too.
| ListIterator<OrderedToken> iter = queue.listIterator(queue.size()); |
| while (iter.hasPrevious()) { |
| if (offsetAtt.startOffset() >= iter.previous().startOffset) { |
| // insertion will be before what next() would return (what |
| // we just compared against), so move back one so the insertion |
| // will be after. |
| iter.next(); |
| break; |
| } |
| } |
| OrderedToken ot = new OrderedToken(); |
| ot.state = captureState(); |
| ot.startOffset = offsetAtt.startOffset(); |
| iter.add(ot); |
| } |
| |
| if (queue.isEmpty()) { |
| return false; |
| } else { |
| restoreState(queue.removeFirst().state); |
| return true; |
| } |
| } |
| |
| } |
| |
| // for TokenOrderingFilter, so it can easily sort by startOffset |
| static class OrderedToken { |
| State state; |
| int startOffset; |
| } |
| |
  /** For use with term vectors of multi-valued fields: provides an offset-based window into the field's TokenStream. */
| static final class OffsetWindowTokenFilter extends TokenFilter { |
| |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); |
| private int windowStartOffset; |
| private int windowEndOffset = -1;//exclusive |
| private boolean windowTokenIncremented = false; |
| private boolean inputWasReset = false; |
| private State capturedState;//only used for first token of each subsequent window |
| |
| OffsetWindowTokenFilter(TokenStream input) {//input should not have been reset already |
| super(input); |
| } |
| |
| //Called at the start of each value/window |
| OffsetWindowTokenFilter advanceToNextWindowOfLength(int length) { |
      windowStartOffset = windowEndOffset + 1; // +1 skips the offset gap between values (the analyzer's offset gap defaults to 1)
| windowEndOffset = windowStartOffset + length; |
      windowTokenIncremented = false; // thereby permitting reset()
| return this; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| //we do some state checking to ensure this is being used correctly |
| if (windowTokenIncremented) { |
| throw new IllegalStateException("This TokenStream does not support being subsequently reset()"); |
| } |
| if (!inputWasReset) { |
| super.reset(); |
| inputWasReset = true; |
| } |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| assert inputWasReset; |
| windowTokenIncremented = true; |
| while (true) { |
| //increment Token |
| if (capturedState == null) { |
| if (!input.incrementToken()) { |
| return false; |
| } |
| } else { |
| restoreState(capturedState); |
| capturedState = null; |
| //Set posInc to 1 on first token of subsequent windows. To be thorough, we could subtract posIncGap? |
| posIncAtt.setPositionIncrement(1); |
| } |
| |
| final int startOffset = offsetAtt.startOffset(); |
| final int endOffset = offsetAtt.endOffset(); |
| if (startOffset >= windowEndOffset) {//end of window |
| capturedState = captureState(); |
| return false; |
| } |
| if (startOffset >= windowStartOffset) {//in this window |
| offsetAtt.setOffset(startOffset - windowStartOffset, endOffset - windowStartOffset); |
| return true; |
| } |
| //otherwise this token is before the window; continue to advance |
| } |
| } |
| } |
| |
| /** |
| * Wraps a DirectoryReader that caches the {@link LeafReader#getTermVectors(int)} so that |
| * if the next call has the same ID, then it is reused. |
| */ |
| static class TermVectorReusingLeafReader extends FilterLeafReader { |
| |
| private int lastDocId = -1; |
| private Fields tvFields; |
| |
| public TermVectorReusingLeafReader(LeafReader in) { |
| super(in); |
| } |
| |
| @Override |
| public Fields getTermVectors(int docID) throws IOException { |
| if (docID != lastDocId) { |
| lastDocId = docID; |
| tvFields = in.getTermVectors(docID); |
| } |
| return tvFields; |
| } |
| |
| @Override |
| public CacheHelper getCoreCacheHelper() { |
| return null; |
| } |
| |
| @Override |
| public CacheHelper getReaderCacheHelper() { |
| return null; |
| } |
| |
| } |
| } |