// Source: lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchHighlighter.java (lucene-solr, Git at Google)

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.matchhighlight;

 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.function.Predicate;
 import java.util.stream.Stream;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DocumentStoredFieldVisitor;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;

 /**
  * An example highlighter that combines several lower-level highlighting utilities in this package
  * into a fully featured, ready-to-use component.
  *
  * <p>Note that if you need to customize or tweak the details of highlighting, it is better to
  * assemble your own highlighter using those low-level building blocks, rather than extend or modify
  * this one.
  */
 public class MatchHighlighter {
   // Searcher used to rewrite each input query and handed to every MatchRegionRetriever.
   private final IndexSearcher searcher;
   // Offset retrieval strategies passed unchanged to every MatchRegionRetriever.
   private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies;
   // Analyzer consulted for the per-field offset gap between consecutive values of a field.
   private final Analyzer analyzer;

   // Names of stored fields loaded for a document even when they have no match ranges.
   private final HashSet<String> fieldsAlwaysReturned = new HashSet<>();
   // Highlighter chain, probed in insertion order; the first applicable highlighter is used.
   private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>();

   /**
    * Actual per-field highlighter. Field highlighters are probed whether they are applicable to a
    * particular combination of (field, hasMatches) pair. If a highlighter declares it is applicable,
    * its {@link #format} method is invoked and the result is returned as the field's value.
    *
    * @see FieldValueHighlighters
    */
   public interface FieldValueHighlighter {
     /**
      * Check if this highlighter can be applied to a given field.
      *
      * @param field Field name
      * @param hasMatches {@code true} if the field has a non-empty set of match regions.
      * @return {@code true} if {@link #format} may be invoked for this field.
      */
     boolean isApplicable(String field, boolean hasMatches);

     /**
      * Formats the values of a single field into the strings returned to the caller.
      *
      * @param field Field name
      * @param values The individual stored values of the field.
      * @param contiguousValue All values of the field joined into a single string.
      * @param valueRanges Offset ranges of each individual value.
      * @param matchOffsets Match ranges collected for the field; may be {@code null} when the field
      *     had no matches.
      * @return The formatted values. A {@code null} result causes the field to be omitted from the
      *     document's highlights.
      */
     List<String> format(
         String field,
         String[] values,
         String contiguousValue,
         List<OffsetRange> valueRanges,
         List<QueryOffsetRange> matchOffsets);

     /**
      * @return Returns a set of fields that must be fetched for each document, regardless of whether
      *     they had matches or not. This is useful to load and return certain fields that should
      *     always be included (identifiers, document titles, etc.).
      */
     default Collection<String> alwaysFetchedFields() {
       return Collections.emptyList();
     }

     /**
      * Returns a new field value highlighter that is a combination of this one and another one.
      *
      * <p>The combined highlighter is applicable whenever either component is applicable; when
      * formatting, this highlighter takes precedence over {@code other}. The set of always-fetched
      * fields is the union of both components, captured at the time of this call.
      *
      * @param other The highlighter to fall back to when this one is not applicable.
      * @return The combined highlighter.
      * @throws NullPointerException if {@code other} is {@code null}.
      */
     default FieldValueHighlighter or(FieldValueHighlighter other) {
       // Fail early with a descriptive message instead of an anonymous NPE further down.
       Objects.requireNonNull(other, "other");

       FieldValueHighlighter first = this;
       FieldValueHighlighter second = other;

       HashSet<String> fieldUnion = new HashSet<>();
       fieldUnion.addAll(first.alwaysFetchedFields());
       fieldUnion.addAll(second.alwaysFetchedFields());
       // Hand out a read-only view so that callers cannot alter the combined highlighter's state.
       Collection<String> fetchedFields = Collections.unmodifiableSet(fieldUnion);

       return new FieldValueHighlighter() {
         @Override
         public boolean isApplicable(String field, boolean hasMatches) {
           return first.isApplicable(field, hasMatches) || second.isApplicable(field, hasMatches);
         }

         @Override
         public List<String> format(
             String field,
             String[] values,
             String contiguousValue,
             List<OffsetRange> valueRanges,
             List<QueryOffsetRange> matchOffsets) {
           // Re-derive the hasMatches flag from the offsets to pick the delegate.
           boolean hasMatches = matchOffsets != null && !matchOffsets.isEmpty();
           FieldValueHighlighter delegate = first.isApplicable(field, hasMatches) ? first : second;
           return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets);
         }

         @Override
         public Collection<String> alwaysFetchedFields() {
           return fetchedFields;
         }
       };
     }
   }

   /**
    * Append a new highlighter to field highlighters chain. The order of field highlighters is
    * important (first-matching wins).
    *
    * <p>The fields the highlighter declares via {@link FieldValueHighlighter#alwaysFetchedFields()}
    * are added to the set of fields fetched for every document.
    *
    * @param highlighter The highlighter to append.
    * @return This instance, to allow call chaining.
    * @throws NullPointerException if {@code highlighter} is {@code null}.
    */
   public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
     Objects.requireNonNull(highlighter, "highlighter");
     // Register the fetched fields first: if this step fails, the chain itself is left untouched
     // rather than containing a highlighter that was only half registered.
     fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields());
     fieldHighlighters.add(highlighter);
     return this;
   }

   /**
    * Registers field names that are fetched for every highlighted document.
    *
    * @param fields Field names to always fetch; no element may be {@code null}.
    */
   public void alwaysFetchFields(String... fields) {
     for (int i = 0; i < fields.length; i++) {
       String fieldName = Objects.requireNonNull(fields[i]);
       fieldsAlwaysReturned.add(fieldName);
     }
   }

   /** Highlights computed for one document: its id and the formatted values of each field. */
   public static class DocHighlights {
     /** Identifier of the document these highlights belong to. */
     public final int docId;

     /** Formatted values keyed by field name, kept in insertion order. */
     public final Map<String, List<String>> fields;

     /**
      * Creates an empty set of highlights for a document.
      *
      * @param docId Identifier of the document.
      */
     public DocHighlights(int docId) {
       this.docId = docId;
       this.fields = new LinkedHashMap<>();
     }
   }

   /** A match's {@link OffsetRange} that also remembers the query which produced the match. */
   public static class QueryOffsetRange extends OffsetRange {
     /** The query that caused this match range. */
     public final Query query;

     QueryOffsetRange(Query query, int from, int to) {
       super(from, to);
       this.query = query;
     }

     /** Creates a range with the given bounds that is attributed to the same query as this one. */
     @Override
     public QueryOffsetRange slice(int from, int to) {
       Query sourceQuery = this.query;
       return new QueryOffsetRange(sourceQuery, from, to);
     }
   }

   /**
    * Per-document accumulator: remembers where the document lives (leaf reader and leaf-relative
    * id) and collects the match ranges of every query, grouped by field.
    */
   private static class DocHit {
     final int docId;
     private final LeafReader leafReader;
     private final int leafDocId;
     private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges = new LinkedHashMap<>();

     DocHit(int docId, LeafReader leafReader, int leafDocId) {
       this.leafDocId = leafDocId;
       this.leafReader = leafReader;
       this.docId = docId;
     }

     /** Appends the given per-field offsets to this hit, tagging each range with {@code query}. */
     void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
       for (Map.Entry<String, List<OffsetRange>> entry : hits.entrySet()) {
         List<QueryOffsetRange> fieldRanges =
             matchRanges.computeIfAbsent(entry.getKey(), key -> new ArrayList<>());
         for (OffsetRange range : entry.getValue()) {
           fieldRanges.add(new QueryOffsetRange(query, range.from, range.to));
         }
       }
     }

     /**
      * Loads the stored fields of this document, restricted to fields that have match ranges or
      * that are accepted by the {@code needsField} predicate.
      */
     Document document(Predicate<String> needsField) throws IOException {
       DocumentStoredFieldVisitor visitor =
           new DocumentStoredFieldVisitor() {
             @Override
             public Status needsField(FieldInfo fieldInfo) {
               String name = fieldInfo.name;
               if (matchRanges.containsKey(name) || needsField.test(name)) {
                 return Status.YES;
               }
               return Status.NO;
             }
           };

       leafReader.document(leafDocId, visitor);
       return visitor.getDocument();
     }
   }

   /**
    * Creates a highlighter whose offset retrieval strategies are computed from the searcher's index
    * reader and the given analyzer.
    */
   public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
     this(
         searcher,
         analyzer,
         MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
   }

   /** Creates a highlighter that uses the explicitly provided offset retrieval strategies. */
   public MatchHighlighter(
       IndexSearcher searcher,
       Analyzer analyzer,
       OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
     this.analyzer = analyzer;
     this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
     this.searcher = searcher;
   }

   /**
    * Computes highlights for the documents in {@code topDocs} against each of the given queries.
    *
    * <p>The returned stream follows the order of {@code topDocs.scoreDocs}. Stored fields are
    * loaded and formatted as the stream is consumed; an {@link IOException} raised at that point is
    * rethrown as an {@link UncheckedIOException}.
    *
    * @param topDocs The documents to highlight.
    * @param queries The queries whose matches should be highlighted.
    * @return A stream of per-document highlights.
    * @throws IOException if rewriting a query or collecting match regions fails.
    */
   public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
     // Seed the map with every document id up front: this pins the topDocs ordering no matter
     // in which order the match region retriever later reports the documents.
     LinkedHashMap<Integer, DocHit> hitsByDoc = new LinkedHashMap<>();
     for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
       hitsByDoc.put(scoreDoc.doc, null);
     }

     // Run every query separately so that each collected range can be tagged with its origin.
     for (Query query : queries) {
       Query rewritten = searcher.rewrite(query);
       MatchRegionRetriever retriever =
           new MatchRegionRetriever(searcher, rewritten, offsetsRetrievalStrategies);
       retriever.highlightDocuments(
           topDocs,
           (int docId,
               LeafReader leafReader,
               int leafDocId,
               Map<String, List<OffsetRange>> hits) ->
               hitsByDoc
                   .computeIfAbsent(docId, id -> new DocHit(id, leafReader, leafDocId))
                   .addMatches(query, hits));
     }

     // Entries still null were never reported by any retriever; this should not normally happen.
     return hitsByDoc.values().stream()
         .filter(hit -> hit != null)
         .map(hit -> computeDocFieldValues(hit));
   }

   /**
    * Loads the stored fields of a single hit and runs each distinct field through the first
    * applicable field highlighter.
    *
    * @param docHit The document hit together with its collected match ranges.
    * @return The highlights of the document; fields whose highlighter returned {@code null} are
    *     left out.
    * @throws UncheckedIOException if loading the stored fields fails.
    */
   private DocHighlights computeDocFieldValues(DocHit docHit) {
     Document doc;
     try {
       doc = docHit.document(fieldsAlwaysReturned::contains);
     } catch (IOException e) {
       throw new UncheckedIOException(e);
     }

     DocHighlights docHighlights = new DocHighlights(docHit.docId);

     // A multi-valued field shows up once per value in the document; process each name only once.
     HashSet<String> unique = new HashSet<>();
     for (IndexableField indexableField : doc) {
       String field = indexableField.name();
       if (!unique.add(field)) {
         continue;
       }

       String[] values = doc.getValues(field);
       String contiguousValue = contiguousFieldValue(field, values);
       List<OffsetRange> valueRanges = computeValueRanges(field, values);
       List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);

       // "Has matches" means a non-empty set of match regions, as documented on
       // FieldValueHighlighter#isApplicable and as re-derived by FieldValueHighlighter#or.
       // Using the same definition here keeps the chain lookup and the combinator in agreement.
       boolean hasMatches = offsets != null && !offsets.isEmpty();

       List<String> formattedValues =
           fieldValueHighlighter(field, hasMatches)
               .format(field, values, contiguousValue, valueRanges, offsets);

       if (formattedValues != null) {
         docHighlights.fields.put(field, formattedValues);
       }
     }

     return docHighlights;
   }

   /**
    * Computes the offset range occupied by each value of a multi-valued field, assuming values are
    * laid out one after another and separated by the analyzer's offset gap for the field.
    *
    * @param field Field name, used to look up the offset gap.
    * @param values The field's values.
    * @return One range per value, in the order of {@code values}.
    */
   private List<OffsetRange> computeValueRanges(String field, String[] values) {
     // The gap depends only on the field, so look it up once instead of once per value.
     int offsetGap = analyzer.getOffsetGap(field);

     ArrayList<OffsetRange> valueRanges = new ArrayList<>(values.length);
     int offset = 0;
     for (String value : values) {
       int end = offset + value.length();
       valueRanges.add(new OffsetRange(offset, end));
       offset = end + offsetGap;
     }
     return valueRanges;
   }

   /**
    * Joins the values of a field into one string. A single value is returned as is; multiple
    * values are separated by as many spaces as the analyzer's offset gap for the field.
    */
   private String contiguousFieldValue(String field, String[] values) {
     if (values.length == 1) {
       return values[0];
     }

     // TODO: This can be inefficient if offset gap is large but the logic
     // of applying offsets would get much more complicated so leaving for now
     // (would have to recalculate all offsets to omit gaps).
     int gapLength = analyzer.getOffsetGap(field);
     return String.join(" ".repeat(gapLength), values);
   }

   /**
    * Finds the first highlighter in the chain that declares itself applicable to the given field.
    *
    * @throws RuntimeException if no highlighter in the chain is applicable.
    */
   private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
     return fieldHighlighters.stream()
         .filter(candidate -> candidate.isApplicable(field, hasMatches))
         .findFirst()
         .orElseThrow(
             () ->
                 new RuntimeException("No field highlighter could be matched to field: " + field));
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.matchhighlight;

	import java.io.IOException;
	import java.io.UncheckedIOException;
	import java.util.ArrayList;
	import java.util.Collection;
	import java.util.Collections;
	import java.util.HashSet;
	import java.util.LinkedHashMap;
	import java.util.List;
	import java.util.Map;
	import java.util.Objects;
	import java.util.function.Predicate;
	import java.util.stream.Stream;
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.DocumentStoredFieldVisitor;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.IndexableField;
	import org.apache.lucene.index.LeafReader;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TopDocs;

	/**
	* An example highlighter that combines several lower-level highlighting utilities in this package
	* into a fully featured, ready-to-use component.
	*
	* <p>Note that if you need to customize or tweak the details of highlighting, it is better to
	* assemble your own highlighter using those low-level building blocks, rather than extend or modify
	* this one.
	*/
	public class MatchHighlighter {
	private final IndexSearcher searcher;
	private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies;
	private final Analyzer analyzer;

	private final HashSet<String> fieldsAlwaysReturned = new HashSet<>();
	private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>();

	/**
	* Actual per-field highlighter. Field highlighters are probed whether they are applicable to a
	* particular combination of (field, hasMatches) pair. If a highlighter declares it is applicable,
	* its {@link #format} method is invoked and the result is returned as the field's value.
	*
	* @see FieldValueHighlighters
	*/
	public interface FieldValueHighlighter {
	/**
	* Check if this highlighter can be applied to a given field.
	*
	* @param field Field name
	* @param hasMatches {@code true} if the field has a non-empty set of match regions.
	*/
	boolean isApplicable(String field, boolean hasMatches);

	/** Do format field values appropriately. */
	List<String> format(
	String field,
	String[] values,
	String contiguousValue,
	List<OffsetRange> valueRanges,
	List<QueryOffsetRange> matchOffsets);

	/**
	* @return Returns a set of fields that must be fetched for each document, regardless of whether
	* they had matches or not. This is useful to load and return certain fields that should
	* always be included (identifiers, document titles, etc.).
	*/
	default Collection<String> alwaysFetchedFields() {
	return Collections.emptyList();
	}

	/** Returns a new field value highlighter that is a combination of this one and another one. */
	default FieldValueHighlighter or(FieldValueHighlighter other) {
	FieldValueHighlighter first = this;
	FieldValueHighlighter second = other;

	HashSet<String> fieldUnion = new HashSet<>();
	fieldUnion.addAll(first.alwaysFetchedFields());
	fieldUnion.addAll(second.alwaysFetchedFields());

	return new FieldValueHighlighter() {
	@Override
	public boolean isApplicable(String field, boolean hasMatches) {
	return first.isApplicable(field, hasMatches) \|\| second.isApplicable(field, hasMatches);
	}

	@Override
	public List<String> format(
	String field,
	String[] values,
	String contiguousValue,
	List<OffsetRange> valueRanges,
	List<QueryOffsetRange> matchOffsets) {
	FieldValueHighlighter delegate =
	first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty())
	? first
	: second;
	return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets);
	}

	@Override
	public Collection<String> alwaysFetchedFields() {
	return fieldUnion;
	}
	};
	}
	}

	/**
	* Append a new highlighter to field highlighters chain. The order of field highlighters is
	* important (first-matching wins).
	*/
	public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
	fieldHighlighters.add(highlighter);
	fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields());
	return this;
	}

	/** Always fetch the given set of fields for all input documents. */
	public void alwaysFetchFields(String... fields) {
	for (String fld : fields) {
	fieldsAlwaysReturned.add(Objects.requireNonNull(fld));
	}
	}

	/** Single document's highlights. */
	public static class DocHighlights {
	public final int docId;
	public final Map<String, List<String>> fields = new LinkedHashMap<>();

	public DocHighlights(int docId) {
	this.docId = docId;
	}
	}

	/** An {@link OffsetRange} of a match, together with the source query that caused it. */
	public static class QueryOffsetRange extends OffsetRange {
	public final Query query;

	QueryOffsetRange(Query query, int from, int to) {
	super(from, to);
	this.query = query;
	}

	@Override
	public QueryOffsetRange slice(int from, int to) {
	return new QueryOffsetRange(query, from, to);
	}
	}

	private static class DocHit {
	final int docId;
	private final LeafReader leafReader;
	private final int leafDocId;
	private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges = new LinkedHashMap<>();

	DocHit(int docId, LeafReader leafReader, int leafDocId) {
	this.docId = docId;
	this.leafReader = leafReader;
	this.leafDocId = leafDocId;
	}

	void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
	hits.forEach(
	(field, offsets) -> {
	List<QueryOffsetRange> target =
	matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>());
	offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
	});
	}

	Document document(Predicate<String> needsField) throws IOException {
	// Only load the fields that have a chance to be highlighted.
	DocumentStoredFieldVisitor visitor =
	new DocumentStoredFieldVisitor() {
	@Override
	public Status needsField(FieldInfo fieldInfo) {
	return (matchRanges.containsKey(fieldInfo.name) \|\| needsField.test(fieldInfo.name))
	? Status.YES
	: Status.NO;
	}
	};

	leafReader.document(leafDocId, visitor);
	return visitor.getDocument();
	}
	}

	public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
	this(
	searcher,
	analyzer,
	MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
	}

	public MatchHighlighter(
	IndexSearcher searcher,
	Analyzer analyzer,
	OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
	this.searcher = searcher;
	this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
	this.analyzer = analyzer;
	}

	public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
	// We want to preserve topDocs document ordering and MatchRegionRetriever is optimized
	// for streaming, so we'll just prepopulate the map in proper order.
	LinkedHashMap<Integer, DocHit> docHits = new LinkedHashMap<>();
	for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
	docHits.put(scoreDoc.doc, null);
	}

	// Collect match ranges for each query and associate each range to the origin query.
	for (Query q : queries) {
	MatchRegionRetriever highlighter =
	new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
	highlighter.highlightDocuments(
	topDocs,
	(int docId,
	LeafReader leafReader,
	int leafDocId,
	Map<String, List<OffsetRange>> hits) -> {
	DocHit docHit = docHits.get(docId);
	if (docHit == null) {
	docHit = new DocHit(docId, leafReader, leafDocId);
	docHits.put(docId, docHit);
	}
	docHit.addMatches(q, hits);
	});
	}

	return docHits.values().stream()
	.filter(Objects::nonNull) // This should always the case?
	.map(this::computeDocFieldValues);
	}

	private DocHighlights computeDocFieldValues(DocHit docHit) {
	Document doc;
	try {
	doc = docHit.document(fieldsAlwaysReturned::contains);
	} catch (IOException e) {
	throw new UncheckedIOException(e);
	}

	DocHighlights docHighlights = new DocHighlights(docHit.docId);

	HashSet<String> unique = new HashSet<>();
	for (IndexableField indexableField : doc) {
	String field = indexableField.name();
	if (!unique.add(field)) {
	continue;
	}

	String[] values = doc.getValues(field);
	String contiguousValue = contiguousFieldValue(field, values);
	List<OffsetRange> valueRanges = computeValueRanges(field, values);
	List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);

	List<String> formattedValues =
	fieldValueHighlighter(field, offsets != null)
	.format(field, values, contiguousValue, valueRanges, offsets);

	if (formattedValues != null) {
	docHighlights.fields.put(field, formattedValues);
	}
	}

	return docHighlights;
	}

	private List<OffsetRange> computeValueRanges(String field, String[] values) {
	ArrayList<OffsetRange> valueRanges = new ArrayList<>();
	int offset = 0;
	for (CharSequence v : values) {
	valueRanges.add(new OffsetRange(offset, offset + v.length()));
	offset += v.length();
	offset += analyzer.getOffsetGap(field);
	}
	return valueRanges;
	}

	private String contiguousFieldValue(String field, String[] values) {
	String value;
	if (values.length == 1) {
	value = values[0];
	} else {
	// TODO: This can be inefficient if offset gap is large but the logic
	// of applying offsets would get much more complicated so leaving for now
	// (would have to recalculate all offsets to omit gaps).
	String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field));
	value = String.join(fieldGapPadding, values);
	}
	return value;
	}

	private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
	for (FieldValueHighlighter highlighter : fieldHighlighters) {
	if (highlighter.isApplicable(field, hasMatches)) {
	return highlighter;
	}
	}
	throw new RuntimeException("No field highlighter could be matched to field: " + field);
	}
	}