blob: 3f05eceb83a3f0f6cde314082b0e1f9438b43c6f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.highlight;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.util.plugin.PluginInfoInitialized;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
* @since solr 1.3
*/
public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {
/**
 * This constant was formerly part of HighlightParams. After deprecation it was removed so clients
 * would no longer use it, but we still support it server side.
 */
private static final String USE_FVH = HighlightParams.HIGHLIGHT + ".useFastVectorHighlighter";
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
// Core this highlighter is bound to; used to load/init plugins and to reach the searcher/schema.
protected final SolrCore solrCore;
//Will be invoked via reflection
public DefaultSolrHighlighter(SolrCore solrCore) {
this.solrCore = solrCore;
}
// Plugin registries: each maps a configured plugin name to its implementation; the default
// implementation is additionally registered under both "" and the null key (see init()).
// NOTE(review): HashMap itself is not thread-safe; these appear to be populated in init()
// and only read afterwards -- confirm init() is never re-run concurrently with requests.
protected final Map<String, SolrFormatter> formatters =
new HashMap<>();
// Encoder registry (default under "" and null)
protected final Map<String, SolrEncoder> encoders =
new HashMap<>();
// Fragmenter registry (default under "" and null)
protected final Map<String, SolrFragmenter> fragmenters =
new HashMap<>();
// FragListBuilder registry, used by the FastVectorHighlighter path
protected final Map<String, SolrFragListBuilder> fragListBuilders =
new HashMap<>();
// FragmentsBuilder registry, used by the FastVectorHighlighter path
protected final Map<String, SolrFragmentsBuilder> fragmentsBuilders =
new HashMap<>();
// BoundaryScanner registry, used by the FastVectorHighlighter path
protected final Map<String, SolrBoundaryScanner> boundaryScanners =
new HashMap<>();
/**
 * Initializes every highlighting plugin registry from the given config section,
 * installing a built-in default (registered under both "" and the null key) for any
 * plugin type that the config did not supply.
 *
 * @param info highlighting config node whose children declare the plugins to load
 */
@Override
public void init(PluginInfo info) {
  formatters.clear();
  encoders.clear();
  fragmenters.clear();
  fragListBuilders.clear();
  fragmentsBuilders.clear();
  boundaryScanners.clear();

  // Each registry follows the same pattern: load configured plugins, then make sure a
  // default exists (creating and core-registering a built-in one when necessary).
  registerWithDefault(fragmenters,
      solrCore.initPlugins(info.getChildren("fragmenter"), fragmenters, SolrFragmenter.class, null),
      SolrFragmenter.class, GapFragmenter::new);
  registerWithDefault(formatters,
      solrCore.initPlugins(info.getChildren("formatter"), formatters, SolrFormatter.class, null),
      SolrFormatter.class, HtmlFormatter::new);
  registerWithDefault(encoders,
      solrCore.initPlugins(info.getChildren("encoder"), encoders, SolrEncoder.class, null),
      SolrEncoder.class, DefaultEncoder::new);
  registerWithDefault(fragListBuilders,
      solrCore.initPlugins(info.getChildren("fragListBuilder"), fragListBuilders, SolrFragListBuilder.class, null),
      SolrFragListBuilder.class, SimpleFragListBuilder::new);
  registerWithDefault(fragmentsBuilders,
      solrCore.initPlugins(info.getChildren("fragmentsBuilder"), fragmentsBuilders, SolrFragmentsBuilder.class, null),
      SolrFragmentsBuilder.class, ScoreOrderFragmentsBuilder::new);
  registerWithDefault(boundaryScanners,
      solrCore.initPlugins(info.getChildren("boundaryScanner"), boundaryScanners, SolrBoundaryScanner.class, null),
      SolrBoundaryScanner.class, SimpleBoundaryScanner::new);
}

/**
 * Ensures {@code registry} has a default entry. If the config supplied no default plugin
 * ({@code loaded} is null), a built-in one is created via {@code defaultFactory} and
 * registered with the core; the resulting default is mapped under both "" and the null
 * key so lookups without an explicit name resolve to it.
 */
private <T> void registerWithDefault(Map<String, T> registry, T loaded, Class<T> type, Supplier<T> defaultFactory) {
  T plugin = loaded;
  if (plugin == null) {
    plugin = defaultFactory.get();
    solrCore.initDefaultPlugin(plugin, type);
  }
  registry.put("", plugin);
  registry.put(null, plugin);
}
/**
 * Builds a phrase-aware {@link org.apache.lucene.search.highlight.Highlighter} for this field,
 * backed by a span-query scorer.
 *
 * @param query The current Query
 * @param fieldName The name of the field
 * @param request The current SolrQueryRequest
 * @param tokenStream document text tokenStream that implements reset() efficiently (e.g. CachingTokenFilter).
 * If it's used, call reset() first.
 * @throws IOException If there is a low-level I/O error.
 */
protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request, TokenStream tokenStream) throws IOException {
  final SolrParams params = request.getParams();
  final Formatter formatter = getFormatter(fieldName, params);
  final Encoder encoder = getEncoder(fieldName, params);
  final QueryScorer spanScorer = getSpanQueryScorer(query, fieldName, tokenStream, request);
  final Highlighter phraseHighlighter = new Highlighter(formatter, encoder, spanScorer);
  phraseHighlighter.setTextFragmenter(getFragmenter(fieldName, params));
  return phraseHighlighter;
}
/**
 * Builds a {@link org.apache.lucene.search.highlight.Highlighter} for this field using the
 * plain (non-span) query-term scorer.
 *
 * @param query The current Query
 * @param fieldName The name of the field
 * @param request The current SolrQueryRequest
 */
protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) {
  final SolrParams params = request.getParams();
  final Formatter formatter = getFormatter(fieldName, params);
  final Encoder encoder = getEncoder(fieldName, params);
  final Scorer termScorer = getQueryScorer(query, fieldName, request);
  final Highlighter result = new Highlighter(formatter, encoder, termScorer);
  result.setTextFragmenter(getFragmenter(fieldName, params));
  return result;
}
/**
 * Builds a {@link org.apache.lucene.search.highlight.QueryScorer} for this query and field.
 * Honors hl.requireFieldMatch, hl.highlightMultiTerm and hl.payloads; when hl.payloads is
 * unset, payloads are used only if the indexed field actually has them.
 *
 * @param query The current query
 * @param fieldName The name of the field
 * @param tokenStream document text tokenStream that implements reset() efficiently (e.g. CachingTokenFilter).
 * If it's used, call reset() first.
 * @param request The SolrQueryRequest
 */
protected QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) {
  final SolrParams params = request.getParams();
  // restrict matching to this field only when hl.requireFieldMatch is set
  final String matchField = params.getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null;
  final QueryScorer scorer = new QueryScorer(query, matchField) {
    @Override
    protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
      return new CustomSpanTermExtractor(defaultField);
    }
  };
  scorer.setExpandMultiTermQuery(params.getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true));
  boolean payloadsByDefault = true; // overwritten below when the field's terms are inspectable
  try {
    // It'd be nice to know if payloads are on the tokenStream but the presence of the attribute isn't a good
    // indicator.
    final Terms fieldTerms = request.getSearcher().getSlowAtomicReader().terms(fieldName);
    if (fieldTerms != null) {
      payloadsByDefault = fieldTerms.hasPayloads();
    }
  } catch (IOException e) {
    log.error("Couldn't check for existence of payloads", e);
  }
  scorer.setUsePayloads(params.getFieldBool(fieldName, HighlightParams.PAYLOADS, payloadsByDefault));
  return scorer;
}
/**
 * Extractor that unwraps block-join queries before term extraction, since Lucene's
 * highlighting stopped supporting them out of the box as of 8.0.
 */
private static class CustomSpanTermExtractor extends WeightedSpanTermExtractor {
  public CustomSpanTermExtractor(String defaultField) {
    super(defaultField);
  }

  @Override
  protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms) throws IOException {
    if (query instanceof ToParentBlockJoinQuery) {
      // recurse into the wrapped child query
      Query child = ((ToParentBlockJoinQuery) query).getChildQuery();
      extract(child, boost, terms);
    } else if (query instanceof ToChildBlockJoinQuery) {
      // recurse into the wrapped parent query
      Query parent = ((ToChildBlockJoinQuery) query).getParentQuery();
      extract(parent, boost, terms);
    } else {
      super.extract(query, boost, terms);
    }
  }
}
/**
 * Builds a (non-span) {@link org.apache.lucene.search.highlight.Scorer} for this query.
 * When hl.requireFieldMatch is set, scoring is restricted to terms of the given field.
 *
 * @param query The current query
 * @param fieldName The name of the field
 * @param request The SolrQueryRequest
 */
protected Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
  if (request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false)) {
    return new QueryTermScorer(query, request.getSearcher().getIndexReader(), fieldName);
  }
  return new QueryTermScorer(query);
}
/**
 * Returns the maximum number of snippets to produce for this field, falling back to the
 * configured/global hl.snippets value, then to 1.
 *
 * @param fieldName The name of the field
 * @param params The params controlling Highlighting
 */
protected int getMaxSnippets(String fieldName, SolrParams params) {
  final int fallback = 1;
  return params.getFieldInt(fieldName, HighlightParams.SNIPPETS, fallback);
}
/**
 * Returns whether adjacent fragments should be merged into one (hl.mergeContiguous,
 * default false).
 *
 * @param fieldName The name of the field
 * @param params The params controlling Highlighting
 */
protected boolean isMergeContiguousFragments(String fieldName, SolrParams params) {
  final boolean fallback = false;
  return params.getFieldBool(fieldName, HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, fallback);
}
/**
 * Resolves the {@link org.apache.lucene.search.highlight.Formatter} configured for this
 * field (hl.formatter); an unset name resolves to the registered default.
 *
 * @param fieldName The name of the field
 * @param params The params controlling Highlighting
 * @return An appropriate {@link org.apache.lucene.search.highlight.Formatter}.
 * @throws SolrException (BAD_REQUEST) if the named formatter is not registered
 */
protected Formatter getFormatter(String fieldName, SolrParams params) {
  final String formatterName = params.getFieldParam(fieldName, HighlightParams.FORMATTER);
  final SolrFormatter solrFormatter = formatters.get(formatterName);
  if (solrFormatter != null) {
    return solrFormatter.getFormatter(fieldName, params);
  }
  throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: " + formatterName);
}
/**
 * Resolves the {@link org.apache.lucene.search.highlight.Encoder} configured for this
 * field (hl.encoder); an unset name resolves to the registered default.
 *
 * @param fieldName The name of the field
 * @param params The params controlling Highlighting
 * @return An appropriate {@link org.apache.lucene.search.highlight.Encoder}.
 * @throws SolrException (BAD_REQUEST) if the named encoder is not registered
 */
protected Encoder getEncoder(String fieldName, SolrParams params) {
  final String encoderName = params.getFieldParam(fieldName, HighlightParams.ENCODER);
  final SolrEncoder solrEncoder = encoders.get(encoderName);
  if (solrEncoder != null) {
    return solrEncoder.getEncoder(fieldName, params);
  }
  throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown encoder: " + encoderName);
}
/**
 * Resolves the {@link org.apache.lucene.search.highlight.Fragmenter} configured for this
 * field (hl.fragmenter); an unset name resolves to the registered default ({@link GapFragmenter}).
 *
 * @param fieldName The name of the field
 * @param params The params controlling Highlighting
 * @return An appropriate {@link org.apache.lucene.search.highlight.Fragmenter}.
 * @throws SolrException (BAD_REQUEST) if the named fragmenter is not registered
 */
protected Fragmenter getFragmenter(String fieldName, SolrParams params) {
  final String fragmenterName = params.getFieldParam(fieldName, HighlightParams.FRAGMENTER);
  final SolrFragmenter solrFragmenter = fragmenters.get(fragmenterName);
  if (solrFragmenter != null) {
    return solrFragmenter.getFragmenter(fieldName, params);
  }
  throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: " + fragmenterName);
}
/**
 * Resolves the FastVectorHighlighter {@link FragListBuilder} configured for this field
 * (hl.fragListBuilder); an unset name resolves to the registered default.
 *
 * @throws SolrException (BAD_REQUEST) if the named fragListBuilder is not registered
 */
protected FragListBuilder getFragListBuilder(String fieldName, SolrParams params) {
  final String builderName = params.getFieldParam(fieldName, HighlightParams.FRAG_LIST_BUILDER);
  final SolrFragListBuilder registered = fragListBuilders.get(builderName);
  if (registered != null) {
    return registered.getFragListBuilder(params);
  }
  throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown fragListBuilder: " + builderName);
}
/**
 * Builds the FastVectorHighlighter {@link FragmentsBuilder} for this field, wiring in the
 * field's configured {@link BoundaryScanner}.
 */
protected FragmentsBuilder getFragmentsBuilder(String fieldName, SolrParams params) {
  // resolve the boundary scanner first, preserving which lookup error surfaces first
  final BoundaryScanner boundaryScanner = getBoundaryScanner(fieldName, params);
  final SolrFragmentsBuilder solrFragmentsBuilder = getSolrFragmentsBuilder(fieldName, params);
  return solrFragmentsBuilder.getFragmentsBuilder(params, boundaryScanner);
}
/**
 * Resolves the {@link SolrFragmentsBuilder} configured for this field
 * (hl.fragmentsBuilder); an unset name resolves to the registered default.
 *
 * @throws SolrException (BAD_REQUEST) if the named fragmentsBuilder is not registered
 */
protected SolrFragmentsBuilder getSolrFragmentsBuilder(String fieldName, SolrParams params) {
  final String builderName = params.getFieldParam(fieldName, HighlightParams.FRAGMENTS_BUILDER);
  final SolrFragmentsBuilder registered = fragmentsBuilders.get(builderName);
  if (registered != null) {
    return registered;
  }
  throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmentsBuilder: " + builderName);
}
/**
 * Resolves the {@link BoundaryScanner} configured for this field (hl.boundaryScanner);
 * an unset name resolves to the registered default.
 *
 * @throws SolrException (BAD_REQUEST) if the named boundaryScanner is not registered
 */
protected BoundaryScanner getBoundaryScanner(String fieldName, SolrParams params) {
  final String scannerName = params.getFieldParam(fieldName, HighlightParams.BOUNDARY_SCANNER);
  final SolrBoundaryScanner registered = boundaryScanners.get(scannerName);
  if (registered != null) {
    return registered.getBoundaryScanner(fieldName, params);
  }
  throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown boundaryScanner: " + scannerName);
}
/**
 * Generates a list of Highlighted query fragments for each item in a list
 * of documents, or returns null if highlighting is disabled.
 *
 * @param docs query results
 * @param query the query
 * @param req the current request
 * @param defaultFields default list of fields to summarize
 * @return NamedList containing a NamedList for each document, which in
 * turns contains sets (field, summary) pairs.
 */
@Override
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
SolrParams params = req.getParams();
if (!isHighlightingEnabled(params)) // also returns early if no unique key field
return null;
// Rewrite the query only when BOTH the phrase highlighter and multi-term expansion are
// disabled; those paths operate on the original (un-rewritten) query structure.
boolean rewrite = query != null && !(Boolean.valueOf(params.get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true")) &&
Boolean.valueOf(params.get(HighlightParams.HIGHLIGHT_MULTI_TERM, "true")));
if (rewrite) {
query = query.rewrite(req.getSearcher().getIndexReader());
}
SolrIndexSearcher searcher = req.getSearcher();
IndexSchema schema = searcher.getSchema();
// fetch unique key if one exists.
SchemaField keyField = schema.getUniqueKeyField();
if (keyField == null) {
return null;//exit early; we need a unique key field to populate the response
}
String[] fieldNames = getHighlightFields(query, req, defaultFields);
// Pre-fetch the stored fields we will read (highlight fields plus their alternates),
// always adding the unique key so the response can be keyed per document.
Set<String> preFetchFieldNames = getDocPrefetchFieldNames(fieldNames, req);
SolrReturnFields returnFields;
if (preFetchFieldNames != null) {
preFetchFieldNames.add(keyField.getName());
returnFields = new SolrReturnFields(preFetchFieldNames.toArray(new String[0]), req);
} else {
returnFields = new SolrReturnFields(new String[0], req);
}
FvhContainer fvhContainer = new FvhContainer(null, null); // Lazy container for fvh and fieldQuery
IndexReader reader = new TermVectorReusingLeafReader(req.getSearcher().getSlowAtomicReader()); // SOLR-5855
// Highlight each document
@SuppressWarnings({"rawtypes"})
NamedList fragments = new SimpleOrderedMap();
DocIterator iterator = docs.iterator();
for (int i = 0; i < docs.size(); i++) {
int docId = iterator.nextDoc();
SolrDocument doc = searcher.getDocFetcher().solrDoc(docId, returnFields);
@SuppressWarnings("rawtypes")
NamedList docHighlights = new SimpleOrderedMap();
// Highlight per-field
for (String fieldName : fieldNames) {
SchemaField schemaField = schema.getFieldOrNull(fieldName);
Object fieldHighlights; // object type allows flexibility for subclassers
fieldHighlights = doHighlightingOfField(doc, docId, schemaField, fvhContainer, query, reader, req, params);
// No highlight produced? Try hl.alternateField (or static fallback text).
if (fieldHighlights == null) {
fieldHighlights = alternateField(doc, docId, fieldName, fvhContainer, query, reader, req);
}
if (fieldHighlights != null) {
docHighlights.add(fieldName, fieldHighlights);
}
} // for each field
fragments.add(schema.printableUniqueKey(doc), docHighlights);
} // for each doc
return fragments;
}
/**
 * Highlights a single field of a single document, dispatching to either the
 * FastVectorHighlighter or the standard Highlighter path. Returns null when the field
 * can't or shouldn't be highlighted (unknown field, numeric Trie field, or no fragments).
 *
 * @param schemaField may be null when the field is not in the schema
 * @param fvhContainer lazily holds the shared FastVectorHighlighter and its FieldQuery,
 *                     so they are built at most once and reused across fields/docs
 * @return the highlight object for the field (a String[] by default), or null
 */
protected Object doHighlightingOfField(SolrDocument doc, int docId, SchemaField schemaField,
FvhContainer fvhContainer, Query query, IndexReader reader, SolrQueryRequest req,
SolrParams params) throws IOException {
Object fieldHighlights;
if (schemaField == null) {
fieldHighlights = null;
} else if (schemaField.getType() instanceof org.apache.solr.schema.TrieField) {
// TODO: highlighting numeric fields is broken (Lucene) - so we disable them until fixed (see LUCENE-3080)!
fieldHighlights = null;
} else if (useFastVectorHighlighter(params, schemaField)) {
// Build the FVH and its FieldQuery on first use only; subsequent calls reuse them.
if (fvhContainer.fieldQuery == null) {
FastVectorHighlighter fvh = new FastVectorHighlighter(
// FVH cannot process hl.usePhraseHighlighter parameter per-field basis
params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true),
// FVH cannot process hl.requireFieldMatch parameter per-field basis
params.getBool(HighlightParams.FIELD_MATCH, false)) {
@Override
public FieldQuery getFieldQuery(Query query, IndexReader reader) throws IOException {
return new FieldQuery(query, reader, phraseHighlight, fieldMatch) {
@Override
protected void flatten(Query sourceQuery, IndexReader reader, Collection<Query> flatQueries, float boost) throws IOException {
// Unwrap ToParentBlockJoinQuery (unsupported by FVH) and flatten its child query.
if (sourceQuery instanceof ToParentBlockJoinQuery) {
Query childQuery = ((ToParentBlockJoinQuery) sourceQuery).getChildQuery();
if (childQuery != null) {
flatten(childQuery, reader, flatQueries, boost);
}
} else {
super.flatten(sourceQuery, reader, flatQueries, boost);
}
}
};
}
};
fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, SolrHighlighter.DEFAULT_PHRASE_LIMIT));
fvhContainer.fvh = fvh;
fvhContainer.fieldQuery = fvh.getFieldQuery(query, reader);
}
fieldHighlights =
doHighlightingByFastVectorHighlighter(doc, docId, schemaField, fvhContainer, reader, req);
} else { // standard/default highlighter
fieldHighlights = doHighlightingByHighlighter(doc, docId, schemaField, query, reader, req);
}
return fieldHighlights;
}
/**
 * Returns the field names to be passed to {@link org.apache.solr.search.SolrDocumentFetcher#solrDoc(int, SolrReturnFields)}:
 * every highlighted field plus any configured hl.alternateField for it. Subclasses might
 * override to include additional stored fields and avoid extra trips to disk. The caller
 * adds the uniqueKey afterwards if the result isn't null.
 */
protected Set<String> getDocPrefetchFieldNames(String[] hlFieldNames, SolrQueryRequest req) {
  final Set<String> fieldNames = new HashSet<>(hlFieldNames.length + 1); // +1: uniqueKey added by caller
  for (String hlFieldName : hlFieldNames) {
    fieldNames.add(hlFieldName);
    final String alternate = req.getParams().getFieldParam(hlFieldName, HighlightParams.ALTERNATE_FIELD);
    if (alternate != null) {
      fieldNames.add(alternate);
    }
  }
  return fieldNames;
}
/**
 * Determines if we should use the FastVectorHighlighter for this field: the FVH must be
 * requested (hl.method=fastVector, or the legacy hl.useFastVectorHighlighter param) AND
 * the field must store term vectors with both positions and offsets. When requested but
 * unsupported by the field, logs a warning and falls back to the standard Highlighter.
 */
protected boolean useFastVectorHighlighter(SolrParams params, SchemaField schemaField) {
  boolean methodFvh =
      HighlightComponent.HighlightMethod.FAST_VECTOR.getMethodName().equals(
          params.getFieldParam(schemaField.getName(), HighlightParams.METHOD))
      || params.getFieldBool(schemaField.getName(), USE_FVH, false);
  if (!methodFvh) return false;
  boolean termPosOff = schemaField.storeTermPositions() && schemaField.storeTermOffsets();
  if (!termPosOff) {
    // Fixed: the field name belongs in the single placeholder. Previously the template had
    // two placeholders with swapped arguments, producing a garbled message.
    log.warn("Solr will use the standard Highlighter instead of FastVectorHighlighter because the {} field "
        + "does not store TermVectors with TermPositions and TermOffsets.", schemaField.getName());
  }
  return termPosOff;
}
/**
 * Highlights one field of one document via the FastVectorHighlighter and returns the
 * highlight object -- a String[] by default. Null if no fragments were produced.
 */
@SuppressWarnings("unchecked")
protected Object doHighlightingByFastVectorHighlighter(SolrDocument doc, int docId,
    SchemaField schemaField, FvhContainer fvhContainer,
    IndexReader reader, SolrQueryRequest req) throws IOException {
  final SolrParams params = req.getParams();
  final String fieldName = schemaField.getName();
  final SolrFragmentsBuilder solrFb = getSolrFragmentsBuilder(fieldName, params);
  final int fragSize = params.getFieldInt(fieldName, HighlightParams.FRAGSIZE, 100);
  final int snippetCount = params.getFieldInt(fieldName, HighlightParams.SNIPPETS, 1);
  final String[] snippets = fvhContainer.fvh.getBestFragments(fvhContainer.fieldQuery, reader, docId, fieldName,
      fragSize,
      snippetCount,
      getFragListBuilder(fieldName, params),
      getFragmentsBuilder(fieldName, params),
      solrFb.getPreTags(params, fieldName),
      solrFb.getPostTags(params, fieldName),
      getEncoder(fieldName, params));
  return (snippets == null || snippets.length == 0) ? null : snippets;
}
/**
 * Highlights and returns the highlight object for this field -- a String[] by default. Null if none.
 * Standard (non-FVH) path: iterates the field's stored values, obtaining tokens from term
 * vectors when available (re-analyzing the stored text otherwise), highlights each value,
 * then sorts/truncates the collected fragments per hl.snippets and hl.preserveMulti.
 */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(SolrDocument doc, int docId, SchemaField schemaField, Query query,
IndexReader reader, SolrQueryRequest req) throws IOException {
final SolrParams params = req.getParams();
final String fieldName = schemaField.getName();
// hl.maxMultiValuedToExamine: cap on how many values of a multi-valued field we analyze.
final int mvToExamine =
params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
(schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
// Technically this is the max *fragments* (snippets), not max values:
int mvToMatch =
params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
if (mvToExamine <= 0 || mvToMatch <= 0) {
return null;
}
int maxCharsToAnalyze = params.getFieldInt(fieldName,
HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
if (maxCharsToAnalyze < 0) {//e.g. -1
maxCharsToAnalyze = Integer.MAX_VALUE;
}
List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
if (fieldValues.isEmpty()) {
return null;
}
// preserve order of values in a multiValued list
boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
List<TextFragment> frags = new ArrayList<>();
//Try term vectors, which is faster
// note: offsets are minimally sufficient for this HL.
final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
final TokenStream tvStream =
TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
// We need to wrap in OffsetWindowTokenFilter if multi-valued
// (null resource is permitted by try-with-resources and simply skipped on close)
try (OffsetWindowTokenFilter tvWindowStream = (tvStream != null && fieldValues.size() > 1) ? new OffsetWindowTokenFilter(tvStream) : null) {
for (String thisText : fieldValues) {
if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
break;
}
// Token source priority: per-value window over term vectors, raw term-vector stream
// (single-valued), else re-analyze the stored text.
TokenStream tstream;
if (tvWindowStream != null) {
// if we have a multi-valued field with term vectors, then get the next offset window
tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
} else if (tvStream != null) {
tstream = tvStream; // single-valued with term vectors
} else {
// fall back to analyzer
tstream = createAnalyzerTStream(schemaField, thisText);
}
Highlighter highlighter;
if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
// We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
// needs to implement reset() efficiently.
//If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
// It should be okay if OffsetLimit won't get applied in this case.
final TokenStream tempTokenStream;
if (tstream != tvStream) {
if (maxCharsToAnalyze >= thisText.length()) {
tempTokenStream = new CachingTokenFilter(tstream);
} else {
tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
}
} else {
tempTokenStream = tstream;
}
// get highlighter
highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
// if the CachingTokenFilter was consumed then use it going forward.
if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
tstream = tempTokenStream;
}
//tstream.reset(); not needed; getBestTextFragments will reset it.
} else {
// use "the old way"
highlighter = getHighlighter(query, fieldName, req);
}
highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
maxCharsToAnalyze -= thisText.length();
// Highlight!
try {
TextFragment[] bestTextFragments =
highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (bestTextFragment == null)//can happen via mergeContiguousFragments
continue;
// normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
if (bestTextFragment.getScore() > 0 || preserveMulti) {
frags.add(bestTextFragment);
if (bestTextFragment.getScore() > 0)
--mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values
}
}
} catch (InvalidTokenOffsetsException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}//end field value loop
}
// Put the fragments onto the Solr response (docSummaries)
if (frags.size() > 0) {
// sort such that the fragments with the highest score come first
if (!preserveMulti) {
Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
}
// Truncate list to hl.snippets, but not when hl.preserveMulti
if (frags.size() > numFragments && !preserveMulti) {
frags = frags.subList(0, numFragments);
}
return getResponseForFragments(frags, req);
}
return null;//no highlights for this field
}
/**
 * Fetches the stored values to highlight for a field, stopping once either budget
 * (value count or character count) is exhausted; the value that crosses the character
 * budget is still included. If the field value should come from an atypical place (or
 * another aliased field name), a subclass could override to implement that.
 */
protected List<String> getFieldValues(SolrDocument doc, String fieldName, int maxValues, int maxCharsToAnalyze,
    SolrQueryRequest req) {
  final Collection<Object> storedValues = doc.getFieldValues(fieldName);
  if (storedValues == null) {
    return Collections.emptyList();
  }
  final FieldType fieldType = req.getSchema().getFieldType(fieldName);
  final List<String> texts = new ArrayList<>();
  int charBudget = maxCharsToAnalyze;
  int valueBudget = maxValues;
  for (Object storedValue : storedValues) {
    final String text = (storedValue instanceof IndexableField)
        ? fieldType.toExternal((IndexableField) storedValue)
        : storedValue.toString(); // TODO FieldType needs an API for this, e.g. toExternalFromDv()
    texts.add(text);
    charBudget -= text.length(); // exit early if we'll never get to analyze the next value
    valueBudget--;
    if (valueBudget <= 0 || charBudget <= 0) {
      break;
    }
  }
  return texts;
}
/**
 * Given the fragments, return the result to be put in the field {@link NamedList}
 * (a String[] of fragment texts). This is an extension point to allow adding other
 * metadata like the absolute offsets or scores.
 */
protected Object getResponseForFragments(List<TextFragment> frags, SolrQueryRequest req) {
  // TODO: we can include score and position information in output as snippet attributes
  final String[] fragTexts = new String[frags.size()];
  for (int i = 0; i < fragTexts.length; i++) {
    fragTexts[i] = frags.get(i).toString();
  }
  return fragTexts;
}
/**
 * Returns the alternate highlight object for this field -- a String[] by default. Null if none.
 * First attempts to actually highlight the hl.alternateField (unless hl.highlightAlternate
 * is false); otherwise emits the alternate field's (or, failing that, the original field's)
 * encoded stored text, truncated to hl.maxAlternateFieldLength when set.
 */
@SuppressWarnings("unchecked")
protected Object alternateField(SolrDocument doc, int docId, String fieldName, FvhContainer fvhContainer, Query query,
IndexReader reader, SolrQueryRequest req) throws IOException {
IndexSchema schema = req.getSearcher().getSchema();
SolrParams params = req.getParams();
String alternateField = params.getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD);
int alternateFieldLen = params.getFieldInt(fieldName, HighlightParams.ALTERNATE_FIELD_LENGTH, 0);
if (alternateField == null || alternateField.length() == 0) {
return null;
}
if (params.getFieldBool(fieldName, HighlightParams.HIGHLIGHT_ALTERNATE, true) && !alternateField.equals(fieldName)) {
// Try to highlight alternate field
Object fieldHighlights = null;
SchemaField schemaField = schema.getFieldOrNull(alternateField);
if (schemaField != null) {
HashMap<String, String> invariants = new HashMap<>();
invariants.put("f." + alternateField + "." + HighlightParams.SNIPPETS, "1");
// Enforce maxAlternateFieldLength by FRAGSIZE. Minimum 18 due to FVH limitations
invariants.put("f." + alternateField + "." + HighlightParams.FRAGSIZE,
alternateFieldLen > 0 ? String.valueOf(Math.max(18, alternateFieldLen)) : String.valueOf(Integer.MAX_VALUE));
// Temporarily install the overriding params on the request, restore them afterwards.
SolrParams origParams = req.getParams();
req.setParams(SolrParams.wrapDefaults(new MapSolrParams(invariants), origParams));
fieldHighlights = doHighlightingOfField(doc, docId, schemaField, fvhContainer, query, reader, req, params);
req.setParams(origParams);
if (fieldHighlights != null) {
return fieldHighlights;
}
}
}
// Fallback to static non-highlighted
List<String> listFields = getFieldValues(doc, alternateField, Integer.MAX_VALUE, Integer.MAX_VALUE, req);
if (listFields.isEmpty()) {
// The alternate field did not exist, treat the original field as fallback instead
listFields = getFieldValues(doc, fieldName, Integer.MAX_VALUE, Integer.MAX_VALUE, req);
if (listFields.isEmpty()) {
return null;
}
}
String[] altTexts = listFields.toArray(new String[listFields.size()]);
Encoder encoder = getEncoder(fieldName, params);
// Emit encoded raw values; when a length cap is set, truncate the total emitted text.
List<String> altList = new ArrayList<>();
int len = 0;
for (String altText : altTexts) {
if (alternateFieldLen <= 0) {
altList.add(encoder.encodeText(altText));
} else {
altList.add(len + altText.length() > alternateFieldLen ?
encoder.encodeText(altText.substring(0, alternateFieldLen - len)) :
encoder.encodeText(altText));
len += altText.length();
if (len >= alternateFieldLen) break;
}
}
return altList;
}
/**
 * Creates a TokenStream over {@code docText} via the field type's index-time analyzer,
 * wrapped so tokens are re-sorted by start offset within a small window for the highlighter.
 */
protected TokenStream createAnalyzerTStream(SchemaField schemaField, String docText) throws IOException {
  String fieldName = schemaField.getName();
  TokenStream analyzerStream = schemaField.getType().getIndexAnalyzer().tokenStream(fieldName, docText);
  return new TokenOrderingFilter(analyzerStream, 10);
}
// Wraps FVH to allow pass-by-reference. Public access to allow use in 3rd party subclasses
public static class FvhContainer {
  // Package-private mutable fields by design: callers swap these in place to
  // lazily initialize/reuse the FVH and its FieldQuery across fields of a doc.
  FastVectorHighlighter fvh;
  FieldQuery fieldQuery;

  public FvhContainer(FastVectorHighlighter fvh, FieldQuery fieldQuery) {
    this.fvh = fvh;
    this.fieldQuery = fieldQuery;
  }
}
/**
 * Orders Tokens in a window first by their startOffset ascending.
 * endOffset is currently ignored.
 * This is meant to work around fickleness in the highlighter only. It
 * can mess up token positions and should not be used for indexing or querying.
 */
static final class TokenOrderingFilter extends TokenFilter {
  // Max number of buffered tokens; tokens whose offsets are out of order by more
  // than this window apart will not be fully re-sorted.
  private final int windowSize;
  private final LinkedList<OrderedToken> queue = new LinkedList<>(); //TODO replace with Deque, Array impl
  private boolean done = false; // true once the wrapped stream is exhausted
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  protected TokenOrderingFilter(TokenStream input, int windowSize) {
    super(input);
    this.windowSize = windowSize;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    queue.clear();
    done = false;
  }

  /**
   * Fills the buffer up to windowSize tokens, inserting each new token into the
   * queue sorted by startOffset, then emits the smallest-offset token.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (!done && queue.size() < windowSize) {
      if (!input.incrementToken()) {
        done = true;
        break;
      }

      // reverse iterating for better efficiency since we know the
      // list is already sorted, and most token start offsets will be too.
      ListIterator<OrderedToken> iter = queue.listIterator(queue.size());
      while (iter.hasPrevious()) {
        if (offsetAtt.startOffset() >= iter.previous().startOffset) {
          // insertion will be before what next() would return (what
          // we just compared against), so move back one so the insertion
          // will be after.
          iter.next();
          break;
        }
      }
      // Capture the full attribute state so it can be replayed when this
      // token reaches the head of the queue.
      OrderedToken ot = new OrderedToken();
      ot.state = captureState();
      ot.startOffset = offsetAtt.startOffset();
      iter.add(ot); // position determined by the scan above (front if none smaller)
    }

    if (queue.isEmpty()) {
      return false; // stream exhausted and buffer drained
    } else {
      // Emit the buffered token with the smallest startOffset.
      restoreState(queue.removeFirst().state);
      return true;
    }
  }
}
// for TokenOrderingFilter, so it can easily sort by startOffset
static class OrderedToken {
  State state;      // full captured attribute state of the token
  int startOffset;  // cached sort key, avoids restoring state just to compare
}
/** For use with term vectors of multi-valued fields. We want an offset based window into its TokenStream. */
static final class OffsetWindowTokenFilter extends TokenFilter {
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  // Current window bounds in the underlying stream's (cumulative) offset space.
  private int windowStartOffset;
  private int windowEndOffset = -1;//exclusive
  // Guards against reset() after tokens have been consumed from the current window.
  private boolean windowTokenIncremented = false;
  // The wrapped stream must be reset exactly once, on the first window's reset().
  private boolean inputWasReset = false;
  private State capturedState;//only used for first token of each subsequent window

  OffsetWindowTokenFilter(TokenStream input) {//input should not have been reset already
    super(input);
  }

  //Called at the start of each value/window
  OffsetWindowTokenFilter advanceToNextWindowOfLength(int length) {
    windowStartOffset = windowEndOffset + 1;//unclear why there's a single offset gap between values, but tests show it
    windowEndOffset = windowStartOffset + length;
    windowTokenIncremented = false;//thereby permit reset()
    return this;
  }

  @Override
  public void reset() throws IOException {
    //we do some state checking to ensure this is being used correctly
    if (windowTokenIncremented) {
      throw new IllegalStateException("This TokenStream does not support being subsequently reset()");
    }
    if (!inputWasReset) {
      super.reset();
      inputWasReset = true;
    }
  }

  /**
   * Returns the next token whose startOffset lies in the current window, rebasing
   * its offsets so they are relative to the window start. Returns false at the end
   * of the window, first stashing the overshooting token for the next window.
   */
  @Override
  public boolean incrementToken() throws IOException {
    assert inputWasReset;

    windowTokenIncremented = true;
    while (true) {
      //increment Token
      if (capturedState == null) {
        if (!input.incrementToken()) {
          return false; // underlying stream exhausted
        }
      } else {
        // Replay the token that overshot the previous window's end.
        restoreState(capturedState);
        capturedState = null;
        //Set posInc to 1 on first token of subsequent windows. To be thorough, we could subtract posIncGap?
        posIncAtt.setPositionIncrement(1);
      }

      final int startOffset = offsetAtt.startOffset();
      final int endOffset = offsetAtt.endOffset();
      if (startOffset >= windowEndOffset) {//end of window
        // Save this token; it belongs to a later window.
        capturedState = captureState();
        return false;
      }
      if (startOffset >= windowStartOffset) {//in this window
        // Rebase offsets to be relative to this value's start.
        offsetAtt.setOffset(startOffset - windowStartOffset, endOffset - windowStartOffset);
        return true;
      }
      //otherwise this token is before the window; continue to advance
    }
  }
}
/**
 * Wraps a LeafReader, caching the result of {@link LeafReader#getTermVectors(int)} so that
 * consecutive calls with the same docID reuse the previously fetched term vectors.
 * Not thread-safe; intended for single-threaded per-request use.
 */
static class TermVectorReusingLeafReader extends FilterLeafReader {

  private int lastDocId = -1; // docID whose term vectors are cached; -1 = nothing cached
  private Fields tvFields;    // cached result for lastDocId (may legitimately be null)

  public TermVectorReusingLeafReader(LeafReader in) {
    super(in);
  }

  @Override
  public Fields getTermVectors(int docID) throws IOException {
    if (docID != lastDocId) {
      // Fetch before recording the docID: if the fetch throws, the cache must not
      // associate this docID with the previous document's (stale) term vectors.
      tvFields = in.getTermVectors(docID);
      lastDocId = docID;
    }
    return tvFields;
  }

  @Override
  public CacheHelper getCoreCacheHelper() {
    return null; // this wrapper does not participate in reader caching
  }

  @Override
  public CacheHelper getReaderCacheHelper() {
    return null;
  }
}
}