lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.matchhighlight;

 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.PrimitiveIterator;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.function.Predicate;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Matches;
 import org.apache.lucene.search.MatchesIterator;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.Weight;

 /**
  * Utility class to compute a list of "match regions" for a given query, searcher and document(s)
  * using {@link Matches} API.
  */
 public class MatchRegionRetriever {
   private final List<LeafReaderContext> leaves;
   private final Weight weight;
   private final TreeSet<String> affectedFields;
   private final Map<String, OffsetsRetrievalStrategy> offsetStrategies;
   private final Set<String> preloadFields;

   /**
    * A callback for accepting a single document (and its associated leaf reader, leaf document ID)
    * and its match offset ranges, as indicated by the {@link Matches} interface retrieved for the
    * query.
    */
   @FunctionalInterface
   public interface MatchOffsetsConsumer {
     void accept(
         int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
         throws IOException;
   }

   /**
    * An abstraction that provides document values for a given field. Default implementation in
    * {@link DocumentFieldValueProvider} just reaches to a preloaded {@link Document}. It is possible
    * to write a more efficient implementation on top of a reusable character buffer (that reuses the
    * buffer while retrieving hit regions for documents).
    */
   @FunctionalInterface
   public interface FieldValueProvider {
     List<CharSequence> getValues(String field);
   }

   /**
    * A constructor with the default offset strategy supplier.
    *
    * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields in the
    *     absence of position offsets in the index. Note that the analyzer must return tokens
    *     (positions and offsets) identical to the ones stored in the index.
    */
   public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer)
       throws IOException {
     this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
   }

   /**
    * @param searcher Index searcher to be used for retrieving matches.
    * @param query The query for which matches should be retrieved. The query should be rewritten
    *     against the provided searcher.
    * @param fieldOffsetStrategySupplier A custom supplier of per-field {@link
    *     OffsetsRetrievalStrategy} instances.
    */
   public MatchRegionRetriever(
       IndexSearcher searcher,
       Query query,
       OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
       throws IOException {
     leaves = searcher.getIndexReader().leaves();
     assert checkOrderConsistency(leaves);

     // We need full scoring mode so that we can receive matches from all sub-clauses
     // (no optimizations in Boolean queries take place).
     weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);

     // Compute the subset of fields affected by this query so that we don't load or scan
     // fields that are irrelevant.
     affectedFields = new TreeSet<>();
     query.visit(
         new QueryVisitor() {
           @Override
           public boolean acceptField(String field) {
             affectedFields.add(field);
             return false;
           }
         });

     // Compute value offset retrieval strategy for all affected fields.
     offsetStrategies = new HashMap<>();
     for (String field : affectedFields) {
       offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
     }

     // Ask offset strategies if they'll need field values.
     preloadFields = new HashSet<>();
     offsetStrategies.forEach(
         (field, strategy) -> {
           if (strategy.requiresDocument()) {
             preloadFields.add(field);
           }
         });

     // Only preload those field values that can be affected by the query and are required
     // by strategies.
     preloadFields.retainAll(affectedFields);
   }

   public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer)
       throws IOException {
     highlightDocuments(
         Arrays.stream(topDocs.scoreDocs).mapToInt(scoreDoc -> scoreDoc.doc).sorted().iterator(),
         consumer);
   }

   /**
    * Low-level, high-efficiency method for highlighting large numbers of documents at once in a
    * streaming fashion.
    *
    * @param docIds A stream of <em>sorted</em> document identifiers for which hit ranges should be
    *     returned.
    * @param consumer A streaming consumer for document-hits pairs.
    */
   public void highlightDocuments(PrimitiveIterator.OfInt docIds, MatchOffsetsConsumer consumer)
       throws IOException {
     if (leaves.isEmpty()) {
       return;
     }

     Iterator<LeafReaderContext> ctx = leaves.iterator();
     LeafReaderContext currentContext = ctx.next();
     int previousDocId = -1;
     Map<String, List<OffsetRange>> highlights = new TreeMap<>();
     while (docIds.hasNext()) {
       int docId = docIds.nextInt();

       if (docId < previousDocId) {
         throw new RuntimeException("Input document IDs must be sorted (increasing).");
       }
       previousDocId = docId;

       while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
         currentContext = ctx.next();
       }

       int contextRelativeDocId = docId - currentContext.docBase;

       // Only preload fields we may potentially need.
       FieldValueProvider documentSupplier;
       if (preloadFields.isEmpty()) {
         documentSupplier = null;
       } else {
         Document doc = currentContext.reader().document(contextRelativeDocId, preloadFields);
         documentSupplier = new DocumentFieldValueProvider(doc);
       }

       highlights.clear();
       highlightDocument(
           currentContext, contextRelativeDocId, documentSupplier, (field) -> true, highlights);
       consumer.accept(docId, currentContext.reader(), contextRelativeDocId, highlights);
     }
   }

   /**
    * Low-level method for retrieving hit ranges for a single document. This method can be used with
    * custom document {@link FieldValueProvider}.
    */
   public void highlightDocument(
       LeafReaderContext leafReaderContext,
       int contextDocId,
       FieldValueProvider doc,
       Predicate<String> acceptField,
       Map<String, List<OffsetRange>> outputHighlights)
       throws IOException {
     Matches matches = weight.matches(leafReaderContext, contextDocId);
     if (matches == null) {
       return;
     }

     for (String field : affectedFields) {
       if (acceptField.test(field)) {
         MatchesIterator matchesIterator = matches.getMatches(field);
         if (matchesIterator == null) {
           // No matches on this field, even though the field was part of the query. This may be
           // possible
           // with complex queries that source non-text fields (have no "hit regions" in any textual
           // representation). Skip.
         } else {
           OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
           if (offsetStrategy == null) {
             throw new IOException(
                 "Non-empty matches but no offset retrieval strategy for field: " + field);
           }
           List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
           if (!ranges.isEmpty()) {
             outputHighlights.put(field, ranges);
           }
         }
       }
     }
   }

   private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
     for (int i = 1; i < leaves.size(); i++) {
       LeafReaderContext prev = leaves.get(i - 1);
       LeafReaderContext next = leaves.get(i);
       assert prev.docBase <= next.docBase;
       assert prev.docBase + prev.reader().maxDoc() == next.docBase;
     }
     return true;
   }

   /**
    * Compute default strategies for retrieving offsets from {@link MatchesIterator} instances for a
    * set of given fields.
    */
   public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
       IndexReader reader, Analyzer analyzer) {
     FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
     return (field) -> {
       FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
       if (fieldInfo == null) {
         return (mi, doc) -> {
           throw new IOException("FieldInfo is null for field: " + field);
         };
       }

       switch (fieldInfo.getIndexOptions()) {
         case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
           return new OffsetsFromMatchIterator(field);

         case DOCS_AND_FREQS_AND_POSITIONS:
           return new OffsetsFromPositions(field, analyzer);

         case DOCS_AND_FREQS:
         case DOCS:
           // By default retrieve offsets from individual tokens
           // retrieved by the analyzer (possibly narrowed down to
           // only those terms that the query hinted at when passed
           // a QueryVisitor.
           //
           // Alternative strategies are also possible and may make sense
           // depending on the use case (OffsetsFromValues, for example).
           return new OffsetsFromTokens(field, analyzer);

         default:
           return (matchesIterator, doc) -> {
             throw new IOException(
                 "Field is indexed without positions and/or offsets: "
                     + field
                     + ", "
                     + fieldInfo.getIndexOptions());
           };
       }
     };
   }

   /** Implements {@link FieldValueProvider} wrapping a preloaded {@link Document}. */
   private static final class DocumentFieldValueProvider implements FieldValueProvider {
     private final Document doc;

     public DocumentFieldValueProvider(Document doc) {
       this.doc = doc;
     }

     @Override
     public List<CharSequence> getValues(String field) {
       return Arrays.asList(doc.getValues(field));
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.matchhighlight;

	import java.io.IOException;
	import java.util.Arrays;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Iterator;
	import java.util.List;
	import java.util.Map;
	import java.util.PrimitiveIterator;
	import java.util.Set;
	import java.util.TreeMap;
	import java.util.TreeSet;
	import java.util.function.Predicate;
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.FieldInfos;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.LeafReader;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Matches;
	import org.apache.lucene.search.MatchesIterator;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.QueryVisitor;
	import org.apache.lucene.search.ScoreMode;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.search.Weight;

	/**
	* Utility class to compute a list of "match regions" for a given query, searcher and document(s)
	* using {@link Matches} API.
	*/
	public class MatchRegionRetriever {
	private final List<LeafReaderContext> leaves;
	private final Weight weight;
	private final TreeSet<String> affectedFields;
	private final Map<String, OffsetsRetrievalStrategy> offsetStrategies;
	private final Set<String> preloadFields;

	/**
	* A callback for accepting a single document (and its associated leaf reader, leaf document ID)
	* and its match offset ranges, as indicated by the {@link Matches} interface retrieved for the
	* query.
	*/
	@FunctionalInterface
	public interface MatchOffsetsConsumer {
	void accept(
	int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
	throws IOException;
	}

	/**
	* An abstraction that provides document values for a given field. Default implementation in
	* {@link DocumentFieldValueProvider} just reaches to a preloaded {@link Document}. It is possible
	* to write a more efficient implementation on top of a reusable character buffer (that reuses the
	* buffer while retrieving hit regions for documents).
	*/
	@FunctionalInterface
	public interface FieldValueProvider {
	List<CharSequence> getValues(String field);
	}

	/**
	* A constructor with the default offset strategy supplier.
	*
	* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields in the
	* absence of position offsets in the index. Note that the analyzer must return tokens
	* (positions and offsets) identical to the ones stored in the index.
	*/
	public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer)
	throws IOException {
	this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
	}

	/**
	* @param searcher Index searcher to be used for retrieving matches.
	* @param query The query for which matches should be retrieved. The query should be rewritten
	* against the provided searcher.
	* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link
	* OffsetsRetrievalStrategy} instances.
	*/
	public MatchRegionRetriever(
	IndexSearcher searcher,
	Query query,
	OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
	throws IOException {
	leaves = searcher.getIndexReader().leaves();
	assert checkOrderConsistency(leaves);

	// We need full scoring mode so that we can receive matches from all sub-clauses
	// (no optimizations in Boolean queries take place).
	weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);

	// Compute the subset of fields affected by this query so that we don't load or scan
	// fields that are irrelevant.
	affectedFields = new TreeSet<>();
	query.visit(
	new QueryVisitor() {
	@Override
	public boolean acceptField(String field) {
	affectedFields.add(field);
	return false;
	}
	});

	// Compute value offset retrieval strategy for all affected fields.
	offsetStrategies = new HashMap<>();
	for (String field : affectedFields) {
	offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
	}

	// Ask offset strategies if they'll need field values.
	preloadFields = new HashSet<>();
	offsetStrategies.forEach(
	(field, strategy) -> {
	if (strategy.requiresDocument()) {
	preloadFields.add(field);
	}
	});

	// Only preload those field values that can be affected by the query and are required
	// by strategies.
	preloadFields.retainAll(affectedFields);
	}

	public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer)
	throws IOException {
	highlightDocuments(
	Arrays.stream(topDocs.scoreDocs).mapToInt(scoreDoc -> scoreDoc.doc).sorted().iterator(),
	consumer);
	}

	/**
	* Low-level, high-efficiency method for highlighting large numbers of documents at once in a
	* streaming fashion.
	*
	* @param docIds A stream of <em>sorted</em> document identifiers for which hit ranges should be
	* returned.
	* @param consumer A streaming consumer for document-hits pairs.
	*/
	public void highlightDocuments(PrimitiveIterator.OfInt docIds, MatchOffsetsConsumer consumer)
	throws IOException {
	if (leaves.isEmpty()) {
	return;
	}

	Iterator<LeafReaderContext> ctx = leaves.iterator();
	LeafReaderContext currentContext = ctx.next();
	int previousDocId = -1;
	Map<String, List<OffsetRange>> highlights = new TreeMap<>();
	while (docIds.hasNext()) {
	int docId = docIds.nextInt();

	if (docId < previousDocId) {
	throw new RuntimeException("Input document IDs must be sorted (increasing).");
	}
	previousDocId = docId;

	while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
	currentContext = ctx.next();
	}

	int contextRelativeDocId = docId - currentContext.docBase;

	// Only preload fields we may potentially need.
	FieldValueProvider documentSupplier;
	if (preloadFields.isEmpty()) {
	documentSupplier = null;
	} else {
	Document doc = currentContext.reader().document(contextRelativeDocId, preloadFields);
	documentSupplier = new DocumentFieldValueProvider(doc);
	}

	highlights.clear();
	highlightDocument(
	currentContext, contextRelativeDocId, documentSupplier, (field) -> true, highlights);
	consumer.accept(docId, currentContext.reader(), contextRelativeDocId, highlights);
	}
	}

	/**
	* Low-level method for retrieving hit ranges for a single document. This method can be used with
	* custom document {@link FieldValueProvider}.
	*/
	public void highlightDocument(
	LeafReaderContext leafReaderContext,
	int contextDocId,
	FieldValueProvider doc,
	Predicate<String> acceptField,
	Map<String, List<OffsetRange>> outputHighlights)
	throws IOException {
	Matches matches = weight.matches(leafReaderContext, contextDocId);
	if (matches == null) {
	return;
	}

	for (String field : affectedFields) {
	if (acceptField.test(field)) {
	MatchesIterator matchesIterator = matches.getMatches(field);
	if (matchesIterator == null) {
	// No matches on this field, even though the field was part of the query. This may be
	// possible
	// with complex queries that source non-text fields (have no "hit regions" in any textual
	// representation). Skip.
	} else {
	OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
	if (offsetStrategy == null) {
	throw new IOException(
	"Non-empty matches but no offset retrieval strategy for field: " + field);
	}
	List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
	if (!ranges.isEmpty()) {
	outputHighlights.put(field, ranges);
	}
	}
	}
	}
	}

	private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
	for (int i = 1; i < leaves.size(); i++) {
	LeafReaderContext prev = leaves.get(i - 1);
	LeafReaderContext next = leaves.get(i);
	assert prev.docBase <= next.docBase;
	assert prev.docBase + prev.reader().maxDoc() == next.docBase;
	}
	return true;
	}

	/**
	* Compute default strategies for retrieving offsets from {@link MatchesIterator} instances for a
	* set of given fields.
	*/
	public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
	IndexReader reader, Analyzer analyzer) {
	FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
	return (field) -> {
	FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
	if (fieldInfo == null) {
	return (mi, doc) -> {
	throw new IOException("FieldInfo is null for field: " + field);
	};
	}

	switch (fieldInfo.getIndexOptions()) {
	case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
	return new OffsetsFromMatchIterator(field);

	case DOCS_AND_FREQS_AND_POSITIONS:
	return new OffsetsFromPositions(field, analyzer);

	case DOCS_AND_FREQS:
	case DOCS:
	// By default retrieve offsets from individual tokens
	// retrieved by the analyzer (possibly narrowed down to
	// only those terms that the query hinted at when passed
	// a QueryVisitor.
	//
	// Alternative strategies are also possible and may make sense
	// depending on the use case (OffsetsFromValues, for example).
	return new OffsetsFromTokens(field, analyzer);

	default:
	return (matchesIterator, doc) -> {
	throw new IOException(
	"Field is indexed without positions and/or offsets: "
	+ field
	+ ", "
	+ fieldInfo.getIndexOptions());
	};
	}
	};
	}

	/** Implements {@link FieldValueProvider} wrapping a preloaded {@link Document}. */
	private static final class DocumentFieldValueProvider implements FieldValueProvider {
	private final Document doc;

	public DocumentFieldValueProvider(Document doc) {
	this.doc = doc;
	}

	@Override
	public List<CharSequence> getValues(String field) {
	return Arrays.asList(doc.getValues(field));
	}
	}
	}