lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.uhighlight;

 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
 import java.util.function.Predicate;
 import java.util.function.Supplier;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.BaseCompositeReader;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.FilterLeafReader;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.InPlaceMergeSorter;

 /**
  * A Highlighter that can get offsets from either postings ({@link
  * IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}), term vectors ({@link
  * FieldType#setStoreTermVectorOffsets(boolean)}), or via re-analyzing text.
  *
  * <p>This highlighter treats the single original document as the whole corpus, and then scores
  * individual passages as if they were documents in this corpus. It uses a {@link BreakIterator} to
  * find passages in the text; by default it breaks using {@link
  * BreakIterator#getSentenceInstance(Locale) getSentenceInstance(Locale.ROOT)}. It then iterates in
  * parallel (merge sorting by offset) through the positions of all terms from the query, coalescing
  * those hits that occur in a single passage into a {@link Passage}, and then scores each Passage
  * using a separate {@link PassageScorer}. Passages are finally formatted into highlighted snippets
  * with a {@link PassageFormatter}.
  *
  * <p>You can customize the behavior by calling some of the setters, or by subclassing and
  * overriding some methods. Some important hooks:
  *
  * <ul>
  *   <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
  *   <li>{@link #getScorer(String)}: Customize how passages are ranked.
  *   <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
  * </ul>
  *
  * <p>This is thread-safe.
  *
  * @lucene.experimental
  */
 public class UnifiedHighlighter {

   /**
    * The NUL character, handed to {@code SplittingBreakIterator} when a field highlighter is built.
    * NOTE(review): presumably this separates the values of a multi-valued field -- confirm against
    * the code that loads field values.
    */
   protected static final char MULTIVAL_SEP_CHAR = (char) 0;

   /** Initial value of the max length setting; see {@link #getMaxLength()}. */
   public static final int DEFAULT_MAX_LENGTH = 10000;

   /** Initial value of the cache threshold; see {@link #getCacheFieldValCharsThreshold()}. */
   public static final int DEFAULT_CACHE_CHARS_THRESHOLD = 524288; // ~ 1 MB (2 byte chars)

   /** Searcher over an empty reader, used by {@link #extractTerms(Query)} to rewrite queries. */
   static final IndexSearcher EMPTY_INDEXSEARCHER;

   // Builds EMPTY_INDEXSEARCHER over a MultiReader with no sub-readers and disables its query cache.
   static {
     try {
       IndexReader emptyReader = new MultiReader();
       EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
       EMPTY_INDEXSEARCHER.setQueryCache(null);
     } catch (IOException bogus) {
       // Surfaced as unchecked because a static initializer cannot throw a checked exception.
       throw new RuntimeException(bogus);
     }
   }

   /** Shared zero-length matcher array. */
   protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY =
       new LabelledCharArrayMatcher[0];

   /** The searcher given to the constructor. */
   protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher

   /** The analyzer given to the constructor; the constructor rejects null. */
   protected final Analyzer indexAnalyzer;

   // Value returned by the default shouldHandleMultiTermQuery(String).
   private boolean defaultHandleMtq = true; // e.g. wildcards

   // Value returned by the default shouldHighlightPhrasesStrictly(String).
   private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging"

   // For analysis, prefer MemoryIndexOffsetStrategy
   // Value returned by the default shouldPreferPassageRelevancyOverSpeed(String).
   private boolean defaultPassageRelevancyOverSpeed = true;

   // Value returned by getMaxLength(); setMaxLength rejects negatives and Integer.MAX_VALUE.
   private int maxLength = DEFAULT_MAX_LENGTH;

   // BreakIterator is stateful so we use a Supplier factory method
   private Supplier<BreakIterator> defaultBreakIterator =
       () -> BreakIterator.getSentenceInstance(Locale.ROOT);

   // When null, getFieldMatcher(String) falls back to an exact match on the highlighted field.
   private Predicate<String> defaultFieldMatcher;

   // Instance returned by the default getScorer(String) for every field.
   private PassageScorer defaultScorer = new PassageScorer();

   // Instance returned by the default getFormatter(String) for every field.
   private PassageFormatter defaultFormatter = new DefaultPassageFormatter();

   // Value returned by the default getMaxNoHighlightPassages(String).
   private int defaultMaxNoHighlightPassages = -1;

   // lazy initialized with double-check locking; protected so subclass can init
   protected volatile FieldInfos fieldInfos;

   // Value returned by getCacheFieldValCharsThreshold().
   private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;

   /** Extracts matching terms after rewriting against an empty index */
   protected static Set<Term> extractTerms(Query query) throws IOException {
     // Rewrite against the empty index, then let a term-collecting visitor fill the set.
     final Query rewritten = EMPTY_INDEXSEARCHER.rewrite(query);
     final Set<Term> collected = new HashSet<>();
     rewritten.visit(QueryVisitor.termCollector(collected));
     return collected;
   }

   /**
    * Constructs the highlighter with the given index searcher and analyzer.
    *
    * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query,
    *     String, int)} is used, in which case this needs to be null.
    * @param indexAnalyzer Required, even if in some circumstances it isn't used.
    */
   public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
     this.searcher = indexSearcher; // TODO: make non nullable
     this.indexAnalyzer =
         Objects.requireNonNull(
             indexAnalyzer,
             "indexAnalyzer is required" + " (even if in some circumstances it isn't used)");
   }

   /**
    * Sets the highlighter-wide flag returned by the default {@link
    * #shouldHandleMultiTermQuery(String)}.
    *
    * @param handleMtq the new flag value
    */
   public void setHandleMultiTermQuery(boolean handleMtq) {
     this.defaultHandleMtq = handleMtq;
   }

   /**
    * Sets the highlighter-wide flag returned by the default {@link
    * #shouldHighlightPhrasesStrictly(String)}.
    *
    * @param highlightPhrasesStrictly the new flag value
    */
   public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) {
     this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly;
   }

   /**
    * Sets the maximum content size to process, as later returned by {@link #getMaxLength()}.
    *
    * @param maxLength the new limit; must be {@code >= 0} and {@code < Integer.MAX_VALUE}
    * @throws IllegalArgumentException if {@code maxLength} is negative or equals {@link
    *     Integer#MAX_VALUE}
    */
   public void setMaxLength(int maxLength) {
     if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
       // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
       // our sentinel in the offsets queue uses this value to terminate.
       // The message names both bounds so that a negative argument is not reported misleadingly.
       throw new IllegalArgumentException(
           "maxLength must be >= 0 and < Integer.MAX_VALUE; got " + maxLength);
     }
     this.maxLength = maxLength;
   }

   /**
    * Sets the factory that the default {@link #getBreakIterator(String)} invokes on each call. A
    * factory is taken, rather than an instance, because {@link BreakIterator} is stateful.
    *
    * @param breakIterator supplier of break iterators
    */
   public void setBreakIterator(Supplier<BreakIterator> breakIterator) {
     this.defaultBreakIterator = breakIterator;
   }

   /**
    * Sets the scorer instance returned by the default {@link #getScorer(String)} for every field.
    *
    * @param scorer the scorer to use
    */
   public void setScorer(PassageScorer scorer) {
     this.defaultScorer = scorer;
   }

   /**
    * Sets the formatter instance returned by the default {@link #getFormatter(String)} for every
    * field.
    *
    * @param formatter the formatter to use
    */
   public void setFormatter(PassageFormatter formatter) {
     this.defaultFormatter = formatter;
   }

   /**
    * Sets the value returned by the default {@link #getMaxNoHighlightPassages(String)}; see that
    * method for the meaning of negative and zero values.
    *
    * @param defaultMaxNoHighlightPassages the new value
    */
   public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) {
     this.defaultMaxNoHighlightPassages = defaultMaxNoHighlightPassages;
   }

   /**
    * Sets the value returned by {@link #getCacheFieldValCharsThreshold()}. No range validation is
    * performed here.
    *
    * @param cacheFieldValCharsThreshold the new threshold, in chars
    */
   public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) {
     this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
   }

   /**
    * Sets the predicate returned by the default {@link #getFieldMatcher(String)}. When it is null
    * (the initial state), that method instead returns a predicate requiring an exact match with the
    * highlighted field.
    *
    * @param predicate the field matcher, or null to use the exact-match fallback
    */
   public void setFieldMatcher(Predicate<String> predicate) {
     this.defaultFieldMatcher = predicate;
   }

   /**
    * Returns whether {@link MultiTermQuery} derivatives will be highlighted. By default it's
    * enabled. MTQ highlighting can be expensive, particularly when using offsets in postings.
    */
   protected boolean shouldHandleMultiTermQuery(String field) {
     // The field argument is not consulted here: one highlighter-wide flag applies to all fields.
     return defaultHandleMtq;
   }

   /**
    * Returns whether position sensitive queries (e.g. phrases and {@link SpanQuery}ies) should be
    * highlighted strictly based on query matches (slower) versus any/all occurrences of the
    * underlying terms. By default it's enabled, but there's no overhead if such queries aren't used.
    */
   protected boolean shouldHighlightPhrasesStrictly(String field) {
     // The field argument is not consulted here: one highlighter-wide flag applies to all fields.
     return defaultHighlightPhrasesStrictly;
   }

   /**
    * Returns the highlighter-wide "prefer passage relevancy over speed" flag, which is initialized
    * to {@code true}. The {@code field} argument is not consulted by this default implementation.
    * NOTE(review): how callers act on this flag is decided outside this method -- confirm before
    * relying on a particular effect.
    */
   protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
     return defaultPassageRelevancyOverSpeed;
   }

   /**
    * Returns the predicate to use for extracting the query part that must be highlighted. By default
    * only queries that target the current field are kept. (AKA requireFieldMatch)
    */
   protected Predicate<String> getFieldMatcher(String field) {
     final Predicate<String> configured = defaultFieldMatcher;
     if (configured == null) {
       // requireFieldMatch = true: only query parts that target exactly this field are kept
       return (queryField) -> field.equals(queryField);
     }
     return configured;
   }

   /**
    * The maximum content size to process. Content will be truncated to this size before
    * highlighting. Typically snippets closer to the beginning of the document better summarize its
    * content.
    */
   public int getMaxLength() {
     // DEFAULT_MAX_LENGTH unless changed through setMaxLength, which rejects negative values and
     // Integer.MAX_VALUE.
     return maxLength;
   }

   /**
    * Returns the {@link BreakIterator} to use for dividing text into passages. This returns {@link
    * BreakIterator#getSentenceInstance(Locale)} by default; subclasses can override to customize.
    *
    * <p>Note: this highlighter will call {@link BreakIterator#preceding(int)} and {@link
    * BreakIterator#next()} many times on it. The default generic JDK implementation of {@code
    * preceding} performs poorly.
    */
   protected BreakIterator getBreakIterator(String field) {
     // Asks the configured supplier on every call; the built-in supplier creates a new sentence
     // iterator (Locale.ROOT) each time. The field argument is not consulted here.
     return defaultBreakIterator.get();
   }

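   /**
    * Returns the {@link PassageScorer} to use for ranking passages. By default this returns the
    * highlighter-wide scorer (see {@link #setScorer(PassageScorer)}), which is the same instance
    * for every field and every call; subclasses can override to customize.
    */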
   protected PassageScorer getScorer(String field) {
     // Field-independent: hands back the one scorer instance held by this highlighter.
     return defaultScorer;
   }

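   /**
    * Returns the {@link PassageFormatter} to use for formatting passages into highlighted snippets.
    * By default this returns the highlighter-wide formatter (see {@link
    * #setFormatter(PassageFormatter)}), which is the same instance for every field and every call;
    * subclasses can override to customize.
    */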
   protected PassageFormatter getFormatter(String field) {
     // Field-independent: hands back the one formatter instance held by this highlighter.
     return defaultFormatter;
   }

   /**
    * Returns the number of leading passages (as delineated by the {@link BreakIterator}) when no
    * highlights could be found. If it's less than 0 (the default) then this defaults to the {@code
    * maxPassages} parameter given for each request. If this is 0 then the resulting highlight is
    * null (not formatted).
    */
   protected int getMaxNoHighlightPassages(String field) {
     // Field-independent; -1 unless changed through setMaxNoHighlightPassages.
     return defaultMaxNoHighlightPassages;
   }

   /**
    * Limits the amount of field value pre-fetching until this threshold is passed. The highlighter
    * internally highlights in batches of documents sized on the sum field value length (in chars) of
    * the fields to be highlighted (bounded by {@link #getMaxLength()} for each field). By setting
    * this to 0, you can force documents to be fetched and highlighted one at a time, which you
    * usually shouldn't do. The default is 524288 chars which translates to about a megabyte.
    * However, note that the highlighter sometimes ignores this and highlights one document at a time
    * (without caching a bunch of documents in advance) when it can detect there's no point in it --
    * such as when all fields will be highlighted via re-analysis as one example.
    */
   public int getCacheFieldValCharsThreshold() { // question: should we size by bytes instead?
     // DEFAULT_CACHE_CHARS_THRESHOLD unless changed through setCacheFieldValCharsThreshold.
     return cacheFieldValCharsThreshold;
   }

   /** Returns the {@link IndexSearcher} that was passed to the constructor; may be null. */
   public IndexSearcher getIndexSearcher() {
     // May be null: the constructor stores its searcher argument without a null check.
     return searcher;
   }

   /** Returns the index {@link Analyzer} that was passed to the constructor; never null. */
   public Analyzer getIndexAnalyzer() {
     // Never null: the constructor rejects a null analyzer.
     return indexAnalyzer;
   }

   /** Source of term offsets; essential for highlighting. */
   public enum OffsetSource {
     /** Chosen by default when the field's postings carry offsets and it has no term vectors. */
     POSTINGS,
     /** Chosen by default when the field has term vectors but no offsets in its postings. */
     TERM_VECTORS,
     /** Default fall-back: no field info, or neither offsets in postings nor term vectors. */
     ANALYSIS,
     /** Chosen by default when the field has both offsets in its postings and term vectors. */
     POSTINGS_WITH_TERM_VECTORS,
     /** NOTE(review): never selected by the default per-field detection; confirm where it is set. */
     NONE_NEEDED
   }

   /**
    * Determine the offset source for the specified field. The default algorithm is as follows:
    *
    * <ol>
    *   <li>This calls {@link #getFieldInfo(String)}. Note this returns null if there is no searcher
    *       or if the field isn't found there.
    *   <li>If there's a field info it has {@link
    *       IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} then {@link OffsetSource#POSTINGS}
    *       is returned.
    *   <li>If there's a field info and {@link FieldInfo#hasVectors()} then {@link
    *       OffsetSource#TERM_VECTORS} is returned (note we can't check here if the TV has offsets;
    *       if there isn't then an exception will get thrown down the line).
    *   <li>Fall-back: {@link OffsetSource#ANALYSIS} is returned.
    * </ol>
    *
    * <p>Note that the highlighter sometimes switches to something else based on the query, such as
    * if you have {@link OffsetSource#POSTINGS_WITH_TERM_VECTORS} but in fact don't need term
    * vectors.
    */
   protected OffsetSource getOffsetSource(String field) {
     final FieldInfo info = getFieldInfo(field);
     if (info == null) {
       // No searcher, or the field is unknown to the index: re-analyze the text.
       return OffsetSource.ANALYSIS;
     }
     final boolean offsetsInPostings =
         info.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
     // unfortunately we can't also check if the TV has offsets
     final boolean hasTermVectors = info.hasVectors();
     if (offsetsInPostings) {
       if (hasTermVectors) {
         return OffsetSource.POSTINGS_WITH_TERM_VECTORS;
       }
       return OffsetSource.POSTINGS;
     }
     return hasTermVectors ? OffsetSource.TERM_VECTORS : OffsetSource.ANALYSIS;
   }

   /**
    * Called by the default implementation of {@link #getOffsetSource(String)}. If there is no
    * searcher then we simply always return null.
    */
   protected FieldInfo getFieldInfo(String field) {
     if (searcher == null) {
       return null;
     }
     // Thread-safe lazy init: the field is volatile, so read it once and only take the lock when
     // the merged infos have not been computed yet.
     FieldInfos infos = this.fieldInfos;
     if (infos != null) {
       return infos.fieldInfo(field);
     }
     synchronized (this) {
       infos = this.fieldInfos; // re-check: another thread may have initialized it meanwhile
       if (infos == null) {
         infos = FieldInfos.getMergedFieldInfos(searcher.getIndexReader());
         this.fieldInfos = infos;
       }
     }
     return infos.fieldInfo(field);
   }

   /**
    * Highlights the top passages from a single field.
    *
    * @param field field name to highlight. Must have a stored string value. Where the offsets
    *     come from is decided by {@link #getOffsetSource(String)}.
    * @param query query to highlight.
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. If
    *     no highlights were found for a document, the first sentence for the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public String[] highlight(String field, Query query, TopDocs topDocs) throws IOException {
     // This convenience overload asks for a single passage per document.
     final int singlePassage = 1;
     return highlight(field, query, topDocs, singlePassage);
   }

   /**
    * Highlights the top-N passages from a single field.
    *
    * @param field field name to highlight. Must have a stored string value.
    * @param query query to highlight.
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
    *     snippets.
    * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>. If
    *     no highlights were found for a document, the first {@code maxPassages} sentences from the
    *     field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public String[] highlight(String field, Query query, TopDocs topDocs, int maxPassages)
       throws IOException {
     // Delegate to the multi-field variant with one-element arrays, then pick out our field.
     final String[] singleField = {field};
     final int[] singleMaxPassages = {maxPassages};
     return highlightFields(singleField, query, topDocs, singleMaxPassages).get(field);
   }

   /**
    * Highlights the top passages from multiple fields.
    *
    * <p>Conceptually, this behaves as a more efficient form of:
    *
    * <pre class="prettyprint">
    * Map m = new HashMap();
    * for (String field : fields) {
    * m.put(field, highlight(field, query, topDocs));
    * }
    * return m;
    * </pre>
    *
    * @param fields field names to highlight. Must have a stored string value.
    * @param query query to highlight.
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @return Map keyed on field name, containing the array of formatted snippets corresponding to
    *     the documents in <code>topDocs</code>. If no highlights were found for a document, the
    *     first sentence from the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public Map<String, String[]> highlightFields(String[] fields, Query query, TopDocs topDocs)
       throws IOException {
     // One passage for each requested field.
     final int[] onePassagePerField = new int[fields.length];
     for (int i = 0; i < onePassagePerField.length; i++) {
       onePassagePerField[i] = 1;
     }
     return highlightFields(fields, query, topDocs, onePassagePerField);
   }

   /**
    * Highlights the top-N passages from multiple fields.
    *
    * <p>Conceptually, this behaves as a more efficient form of:
    *
    * <pre class="prettyprint">
    * Map m = new HashMap();
    * for (String field : fields) {
    * m.put(field, highlight(field, query, topDocs, maxPassages));
    * }
    * return m;
    * </pre>
    *
    * @param fields field names to highlight. Must have a stored string value.
    * @param query query to highlight.
    * @param topDocs TopDocs containing the summary result documents to highlight.
    * @param maxPassages The maximum number of top-N ranked passages per-field used to form the
    *     highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets corresponding to
    *     the documents in <code>topDocs</code>. If no highlights were found for a document, the
    *     first {@code maxPassages} sentences from the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public Map<String, String[]> highlightFields(
       String[] fields, Query query, TopDocs topDocs, int[] maxPassages) throws IOException {
     // Pull the doc ID out of every hit, keeping the hits' order.
     final int[] docIds = Arrays.stream(topDocs.scoreDocs).mapToInt(hit -> hit.doc).toArray();
     return highlightFields(fields, query, docIds, maxPassages);
   }

   /**
    * Highlights the top-N passages from multiple fields, for the provided int[] docids.
    *
    * @param fieldsIn field names to highlight. Must have a stored string value.
    * @param query query to highlight.
    * @param docidsIn containing the document IDs to highlight.
    * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
    *     highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets corresponding to
    *     the documents in <code>docidsIn</code>. If no highlights were found for a document, the
    *     first {@code maxPassages} sentences from the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   public Map<String, String[]> highlightFields(
       String[] fieldsIn, Query query, int[] docidsIn, int[] maxPassagesIn) throws IOException {
     final Map<String, Object[]> objectsByField =
         highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn);
     final Map<String, String[]> stringsByField = new HashMap<>();
     for (Map.Entry<String, Object[]> entry : objectsByField.entrySet()) {
       final Object[] formatted = entry.getValue();
       final String[] asStrings = new String[formatted.length];
       for (int i = 0; i < formatted.length; i++) {
         // A null snippet stays null; anything else is rendered through toString().
         asStrings[i] = (formatted[i] == null) ? null : formatted[i].toString();
       }
       stringsByField.put(entry.getKey(), asStrings);
     }
     return stringsByField;
   }

   /**
    * Expert: highlights the top-N passages from multiple fields, for the provided int[] docids, to
    * custom Object as returned by the {@link PassageFormatter}. Use this API to render to something
    * other than String.
    *
    * @param fieldsIn field names to highlight. Must have a stored string value.
    * @param query query to highlight.
    * @param docIdsIn containing the document IDs to highlight.
    * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
    *     highlighted snippets.
    * @return Map keyed on field name, containing the array of formatted snippets corresponding to
    *     the documents in <code>docIdsIn</code>. If no highlights were found for a document, the
    *     first {@code maxPassages} sentences from the field will be returned.
    * @throws IOException if an I/O error occurred during processing
    * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
    *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
    */
   protected Map<String, Object[]> highlightFieldsAsObjects(
       String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
     if (fieldsIn.length < 1) {
       throw new IllegalArgumentException("fieldsIn must not be empty");
     }
     if (fieldsIn.length != maxPassagesIn.length) {
       throw new IllegalArgumentException("invalid number of maxPassagesIn");
     }
     if (searcher == null) {
       throw new IllegalStateException(
           "This method requires that an indexSearcher was passed in the "
               + "constructor.  Perhaps you mean to call highlightWithoutSearcher?");
     }

     // Sort docs & fields for sequential i/o

     // Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
     int[] docIds = new int[docIdsIn.length];
     int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[]
     copyAndSortDocIdsWithIndex(docIdsIn, docIds, docInIndexes); // latter 2 are "out" params

     // Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
     final String[] fields = new String[fieldsIn.length];
     final int[] maxPassages = new int[maxPassagesIn.length];
     copyAndSortFieldsWithMaxPassages(
         fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params

     // Init field highlighters (where most of the highlight logic lives, and on a per field basis)
     Set<Term> queryTerms = extractTerms(query);
     FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
     int numTermVectors = 0;
     int numPostings = 0;
     for (int f = 0; f < fields.length; f++) {
       FieldHighlighter fieldHighlighter =
           getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
       fieldHighlighters[f] = fieldHighlighter;

       // Count how many fields need term vectors and/or postings; this drives the batching below.
       switch (fieldHighlighter.getOffsetSource()) {
         case TERM_VECTORS:
           numTermVectors++;
           break;
         case POSTINGS:
           numPostings++;
           break;
         case POSTINGS_WITH_TERM_VECTORS:
           numTermVectors++;
           numPostings++;
           break;
         case ANALYSIS:
         case NONE_NEEDED:
         default:
           // do nothing
           break;
       }
     }

     int cacheCharsThreshold = calculateOptimalCacheCharsThreshold(numTermVectors, numPostings);

     // [fieldIdx][docIdInIndex] of highlightDoc result
     Object[][] highlightDocsInByField = new Object[fields.length][docIds.length];
     // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
     DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);

     // The term-vector-caching reader only exists when two or more fields need term vectors.
     // It is managed by try-with-resources so that it is closed even when highlighting throws;
     // a null resource is allowed and is simply not closed. FYI closing won't close the
     // underlying reader.
     try (IndexReader indexReaderWithTermVecCache =
         (numTermVectors >= 2)
             ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader())
             : null) {
       for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
         // Load the field values of the first batch of document(s) (note: commonly all docs are
         // in this batch)
         List<CharSequence[]> fieldValsByDoc =
             loadFieldValues(fields, docIdIter, cacheCharsThreshold);
         //    the size of the above list is the size of the batch (num of docs in the batch)

         // Highlight in per-field order first, then by doc (better I/O pattern)
         for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
           Object[] resultByDocIn = highlightDocsInByField[fieldIdx]; // parallel to docIdsIn
           FieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
           for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
             int docId = docIds[docIdx]; // sorted order
             CharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
             if (content == null) {
               // Nothing loaded for this field in this doc: leave the result slot null.
               continue;
             }
             IndexReader indexReader =
                 (fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS
                         && indexReaderWithTermVecCache != null)
                     ? indexReaderWithTermVecCache
                     : searcher.getIndexReader();
             final LeafReader leafReader;
             if (indexReader instanceof LeafReader) {
               leafReader = (LeafReader) indexReader;
             } else {
               List<LeafReaderContext> leaves = indexReader.leaves();
               LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
               leafReader = leafReaderContext.reader();
               docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
             }
             int docInIndex = docInIndexes[docIdx]; // original input order
             assert resultByDocIn[docInIndex] == null;
             resultByDocIn[docInIndex] =
                 fieldHighlighter.highlightFieldForDoc(leafReader, docId, content.toString());
           }
         }

         batchDocIdx += fieldValsByDoc.size();
       }
     }
     assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS
         || docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;

     // TODO reconsider the return type; since this is an "advanced" method, lets not return a Map?
     // Notice the only
     //    caller simply iterates it to build another structure.

     // field -> object highlights parallel to docIdsIn
     Map<String, Object[]> resultMap = new HashMap<>(fields.length);
     for (int f = 0; f < fields.length; f++) {
       resultMap.put(fields[f], highlightDocsInByField[f]);
     }
     return resultMap;
   }

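   /**
    * Picks the {@code cacheCharsThreshold} handed to {@code loadFieldValues()}. When it is 0,
    * loadFieldValues() only fetches one document at a time. We force it to 0 in two circumstances:
    * (1) no field uses postings or term vectors, and (2) two or more fields use term vectors.
    * Otherwise {@link #getCacheFieldValCharsThreshold()} is used.
    */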
   private int calculateOptimalCacheCharsThreshold(int numTermVectors, int numPostings) {
     // (1) When all fields are ANALYSIS there's no point in caching a batch of documents because
     // no other info on disk is needed to highlight it.
     final boolean nothingReadFromIndex = numPostings == 0 && numTermVectors == 0;

     // (2) When two or more fields have term vectors, given the field-then-doc algorithm, the
     // underlying term vectors will be fetched in a terrible access pattern unless we highlight a
     // doc at a time and use a special current-doc TV cache.  So we do that.  Hopefully one day
     // TVs will be improved to make this pointless.
     final boolean severalTermVectorFields = numTermVectors >= 2;

     if (nothingReadFromIndex || severalTermVectorFields) {
       return 0;
     }
     return getCacheFieldValCharsThreshold();
   }

   /**
    * Copies {@code fieldsIn} and {@code maxPassagesIn} into the two "out" arrays and sorts the
    * copies in place by field name, keeping each field paired with its maxPassages value.
    */
   private void copyAndSortFieldsWithMaxPassages(
       String[] fieldsIn, int[] maxPassagesIn, final String[] fields, final int[] maxPassages) {
     System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
     System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);

     final InPlaceMergeSorter pairSorter =
         new InPlaceMergeSorter() {
           @Override
           protected int compare(int i, int j) {
             // Ordering is by field name only; maxPassages merely travels with its field.
             return fields[i].compareTo(fields[j]);
           }

           @Override
           protected void swap(int i, int j) {
             final String fieldAtI = fields[i];
             final int maxPassagesAtI = maxPassages[i];
             fields[i] = fields[j];
             maxPassages[i] = maxPassages[j];
             fields[j] = fieldAtI;
             maxPassages[j] = maxPassagesAtI;
           }
         };
     pairSorter.sort(0, fields.length);
   }

   /**
    * Copies {@code docIdsIn} into {@code docIds} and sorts the copy ascending, while {@code
    * docInIndexes} is permuted alongside so that entry {@code i} holds the position in {@code
    * docIdsIn} that {@code docIds[i]} came from.
    */
   private void copyAndSortDocIdsWithIndex(
       int[] docIdsIn, final int[] docIds, final int[] docInIndexes) {
     System.arraycopy(docIdsIn, 0, docIds, 0, docIdsIn.length);
     // Start from the identity mapping; the sorter's swaps keep it in step with docIds.
     Arrays.setAll(docInIndexes, position -> position);

     final InPlaceMergeSorter pairSorter =
         new InPlaceMergeSorter() {
           @Override
           protected int compare(int i, int j) {
             return Integer.compare(docIds[i], docIds[j]);
           }

           @Override
           protected void swap(int i, int j) {
             final int docIdAtI = docIds[i];
             final int indexAtI = docInIndexes[i];
             docIds[i] = docIds[j];
             docInIndexes[i] = docInIndexes[j];
             docIds[j] = docIdAtI;
             docInIndexes[j] = indexAtI;
           }
         };
     pairSorter.sort(0, docIds.length);
   }

   /**
    * Highlights text handed in directly by the caller. This is only permitted when the {@link
    * IndexSearcher} given to this highlighter is null, which is the rarer use-case. Naturally, the
    * mode of operation will be {@link OffsetSource#ANALYSIS}. Whatever the {@link PassageFormatter}
    * produces is returned as-is. For the {@link DefaultPassageFormatter} and assuming {@code
    * content} has non-zero length, the result will be a non-null string -- so it's safe to call
    * {@link Object#toString()} on it in that case.
    *
    * @param field field name to highlight (as found in the query).
    * @param query query to highlight.
    * @param content text to highlight; must not be null.
    * @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
    *     snippets.
    * @return result of the {@link PassageFormatter} -- probably a String. Might be null.
    * @throws IllegalStateException if this highlighter was given a non-null IndexSearcher
    * @throws NullPointerException if {@code content} is null
    * @throws IOException if an I/O error occurred during processing
    */
   // TODO make content a List? and return a List? and ensure getEmptyHighlight is never invoked
   // multiple times?
   public Object highlightWithoutSearcher(String field, Query query, String content, int maxPassages)
       throws IOException {
     if (searcher != null) {
       final String highlighterName = getClass().getSimpleName();
       throw new IllegalStateException(
           "highlightWithoutSearcher should only be called on a "
               + highlighterName
               + " without an IndexSearcher.");
     }
     Objects.requireNonNull(content, "content is required");
     final FieldHighlighter fieldHighlighter =
         getFieldHighlighter(field, query, extractTerms(query), maxPassages);
     // there is no index behind this call, hence the null first argument and the -1 doc ID
     return fieldHighlighter.highlightFieldForDoc(null, -1, content);
   }

   /**
    * Builds the {@link FieldHighlighter} for one field from the per-field settings of this
    * highlighter.
    *
    * @param field field name to highlight.
    * @param query query to highlight.
    * @param allTerms terms extracted from the query, across all fields.
    * @param maxPassages maximum number of passages, passed through to the {@link FieldHighlighter}.
    * @return a new FieldHighlighter configured for {@code field}.
    */
   protected FieldHighlighter getFieldHighlighter(
       String field, Query query, Set<Term> allTerms, int maxPassages) {
     final UHComponents components = getHighlightComponents(field, query, allTerms);
     final FieldOffsetStrategy offsetStrategy =
         getOffsetStrategy(getOptimizedOffsetSource(components), components);
     // wrap the field's BreakIterator together with the multi-value separator char
     final SplittingBreakIterator breakIterator =
         new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR);
     return new FieldHighlighter(
         field,
         offsetStrategy,
         breakIterator,
         getScorer(field),
         maxPassages,
         getMaxNoHighlightPassages(field),
         getFormatter(field));
   }

   /**
    * Gathers everything derived from the query that highlighting {@code field} needs and bundles
    * it into a {@link UHComponents}.
    *
    * @param field field name to highlight.
    * @param query query to highlight.
    * @param allTerms terms extracted from the query, across all fields.
    * @return the components; its terms and automata are null when extraction was skipped.
    */
   protected UHComponents getHighlightComponents(String field, Query query, Set<Term> allTerms) {
     final Predicate<String> fieldMatcher = getFieldMatcher(field);
     final Set<HighlightFlag> highlightFlags = getFlags(field);
     final PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
     final boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);

     // Extraction is skipped only when WEIGHT_MATCHES is set and the query has a part we don't
     // recognize; in every other combination terms and automata are extracted.
     final boolean skipExtraction =
         highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) && queryHasUnrecognizedPart;
     final BytesRef[] terms;
     final LabelledCharArrayMatcher[] automata;
     if (skipExtraction) {
       terms = null;
       automata = null;
     } else {
       terms = filterExtractedTerms(fieldMatcher, allTerms);
       automata = getAutomata(field, query, highlightFlags);
     }

     return new UHComponents(
         field,
         fieldMatcher,
         query,
         terms,
         phraseHelper,
         automata,
         queryHasUnrecognizedPart,
         highlightFlags);
   }

   /**
    * Reports whether the query contains a leaf, on a field accepted by {@code fieldMatcher}, that
    * is not recognized: one from which automata cannot be extracted and which is neither a {@link
    * MatchAllDocsQuery} nor a {@link MatchNoDocsQuery}.
    *
    * @param fieldMatcher selects the fields whose query parts are inspected.
    * @param query query to inspect.
    * @return true if at least one such leaf was visited.
    */
   protected boolean hasUnrecognizedQuery(Predicate<String> fieldMatcher, Query query) {
     // one-element array so the anonymous visitor has something mutable to write into
     final boolean[] foundUnknownLeaf = {false};
     final QueryVisitor leafInspector =
         new QueryVisitor() {
           @Override
           public boolean acceptField(String field) {
             if (foundUnknownLeaf[0]) {
               return false; // answer already known; rejecting fields is a trick to exit early
             }
             return fieldMatcher.test(field);
           }

           @Override
           public void visitLeaf(Query leaf) {
             final boolean recognized =
                 MultiTermHighlighting.canExtractAutomataFromLeafQuery(leaf)
                     || leaf instanceof MatchAllDocsQuery
                     || leaf instanceof MatchNoDocsQuery;
             if (!recognized) {
               foundUnknownLeaf[0] = true;
             }
           }
         };
     query.visit(leafInspector);
     return foundUnknownLeaf[0];
   }

   /**
    * Reduces the extracted query terms to the bytes of those whose field is accepted by {@code
    * fieldMatcher}.
    *
    * @param fieldMatcher selects the fields whose terms are kept.
    * @param queryTerms terms extracted from the query.
    * @return the kept term bytes, in the order of a {@link TreeSet} (which also drops duplicates).
    */
   protected static BytesRef[] filterExtractedTerms(
       Predicate<String> fieldMatcher, Set<Term> queryTerms) {
     // the field name is redundant once filtered, so only the bytes are collected
     final SortedSet<BytesRef> matchingBytes = new TreeSet<>();
     for (final Term candidate : queryTerms) {
       if (!fieldMatcher.test(candidate.field())) {
         continue;
       }
       matchingBytes.add(candidate.bytes());
     }
     return matchingBytes.toArray(new BytesRef[0]);
   }

   /**
    * Translates the per-field boolean settings into a set of {@link HighlightFlag}s. Note that
    * {@link HighlightFlag#WEIGHT_MATCHES} is never added by this implementation.
    *
    * @param field field name to highlight.
    * @return a new, mutable set holding the flags that apply to {@code field}.
    */
   protected Set<HighlightFlag> getFlags(String field) {
     final boolean handleMultiTermQuery = shouldHandleMultiTermQuery(field);
     final boolean highlightPhrasesStrictly = shouldHighlightPhrasesStrictly(field);
     final boolean relevancyOverSpeed = shouldPreferPassageRelevancyOverSpeed(field);

     final Set<HighlightFlag> flags = EnumSet.noneOf(HighlightFlag.class);
     if (handleMultiTermQuery) {
       flags.add(HighlightFlag.MULTI_TERM_QUERY);
     }
     if (highlightPhrasesStrictly) {
       flags.add(HighlightFlag.PHRASES);
     }
     if (relevancyOverSpeed) {
       flags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
     }
     return flags;
   }

   /**
    * Chooses the {@link PhraseHelper} for a field. A real helper is only created when {@link
    * HighlightFlag#PHRASES} is set and {@link HighlightFlag#WEIGHT_MATCHES} is not; otherwise
    * {@link PhraseHelper#NONE} is returned.
    *
    * @param field field name to highlight.
    * @param query query to highlight.
    * @param highlightFlags flags in effect for {@code field}.
    * @return the phrase helper to use, never null.
    */
   protected PhraseHelper getPhraseHelper(
       String field, Query query, Set<HighlightFlag> highlightFlags) {
     if (highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES)) {
       // will be handled by Weight.matches which always considers phrases
       return PhraseHelper.NONE;
     }
     if (!highlightFlags.contains(HighlightFlag.PHRASES)) {
       return PhraseHelper.NONE; // phrases are not highlighted strictly
     }
     final boolean multiTermHandlingDisabled =
         !highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
     return new PhraseHelper(
         query,
         field,
         getFieldMatcher(field),
         this::requiresRewrite,
         this::preSpanQueryRewrite,
         multiTermHandlingDisabled);
   }

   /**
    * Extracts the automata of multi-term queries for a field, or returns an empty array when
    * {@link HighlightFlag#MULTI_TERM_QUERY} is not set.
    *
    * @param field field name to highlight.
    * @param query query to highlight.
    * @param highlightFlags flags in effect for {@code field}.
    * @return the extracted automata, or {@code ZERO_LEN_AUTOMATA_ARRAY}.
    */
   protected LabelledCharArrayMatcher[] getAutomata(
       String field, Query query, Set<HighlightFlag> highlightFlags) {
     if (!highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)) {
       return ZERO_LEN_AUTOMATA_ARRAY;
     }
     // Do we "eagerly" look in span queries for automata here, or do we leave those to
     // PhraseHelper? A PhraseHelper is only in play when PHRASES is set and WEIGHT_MATCHES is not
     // (Weight.Matches will find all); in every other case we look inside spans ourselves.
     final boolean leftToPhraseHelper =
         highlightFlags.contains(HighlightFlag.PHRASES)
             && !highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES);
     final boolean lookInSpan = !leftToPhraseHelper;
     return MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), lookInSpan);
   }

   /**
    * Starts from the offset source configured for the field and swaps it for a cheaper or more
    * capable one where the query allows or demands it.
    *
    * @param components the highlight components of the field being highlighted.
    * @return {@link OffsetSource#NONE_NEEDED} when there is provably nothing to highlight;
    *     otherwise the configured source, except that POSTINGS becomes ANALYSIS when a multi-term
    *     query or rewrite may be involved, and POSTINGS_WITH_TERM_VECTORS becomes POSTINGS when
    *     none is.
    */
   protected OffsetSource getOptimizedOffsetSource(UHComponents components) {
     final OffsetSource configuredSource = getOffsetSource(components.getField());

     // null automata means unknown, so assume a possibility
     final LabelledCharArrayMatcher[] automata = components.getAutomata();
     final boolean mtqOrRewrite =
         automata == null
             || automata.length > 0
             || components.getPhraseHelper().willRewrite()
             || components.hasUnrecognizedQueryPart();

     if (!mtqOrRewrite) {
       // null terms means unknown, so assume something to highlight
       final BytesRef[] terms = components.getTerms();
       if (terms != null && terms.length == 0) {
         return OffsetSource.NONE_NEEDED; // nothing to highlight
       }
     }

     switch (configuredSource) {
       case POSTINGS:
         // with a multi-term query or rewrite we may need to scan through all terms of the
         // highlighted document efficiently, which analysis allows
         return mtqOrRewrite ? OffsetSource.ANALYSIS : configuredSource;
       case POSTINGS_WITH_TERM_VECTORS:
         // without a multi-term query or rewrite we don't need term vectors
         return mtqOrRewrite ? configuredSource : OffsetSource.POSTINGS;
       case ANALYSIS:
       case TERM_VECTORS:
       case NONE_NEEDED:
       default:
         // stick with the original offset source
         return configuredSource;
     }
   }

   /**
    * Instantiates the {@link FieldOffsetStrategy} that corresponds to an offset source.
    *
    * @param offsetSource the (already optimized) offset source to use.
    * @param components the highlight components of the field being highlighted.
    * @return the strategy for {@code offsetSource}.
    * @throws IllegalArgumentException if {@code offsetSource} is not one of the known sources.
    */
   protected FieldOffsetStrategy getOffsetStrategy(
       OffsetSource offsetSource, UHComponents components) {
     switch (offsetSource) {
       case NONE_NEEDED:
         return NoOpOffsetStrategy.INSTANCE;
       case POSTINGS:
         return new PostingsOffsetStrategy(components);
       case POSTINGS_WITH_TERM_VECTORS:
         return new PostingsWithTermVectorsOffsetStrategy(components);
       case TERM_VECTORS:
         return new TermVectorOffsetStrategy(components);
       case ANALYSIS:
         final boolean needsMemoryIndex =
             components.getPhraseHelper().hasPositionSensitivity()
                 || components
                     .getHighlightFlags()
                     .contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)
                 || components.getHighlightFlags().contains(HighlightFlag.WEIGHT_MATCHES);
         if (needsMemoryIndex) {
           return new MemoryIndexOffsetStrategy(components, getIndexAnalyzer());
         }
         // skip using a memory index since it's pure term filtering
         return new TokenStreamOffsetStrategy(components, getIndexAnalyzer());
       default:
         throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
     }
   }

   /**
    * When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
    * {@link Query#rewrite(IndexReader)} called on them. It helps performance to avoid it if it's not
    * needed. This method will be invoked on all SpanQuery instances recursively. If you have custom
    * SpanQuery queries then override this to check instanceof and provide a definitive answer. If
    * the query isn't your custom one, simply return null to have the default rules apply, which
    * govern the ones included in Lucene.
    *
    * @param spanQuery the span query being examined.
    * @return a definitive answer for a recognized custom query, or null to have the default rules
    *     apply. This base implementation always returns null.
    */
   protected Boolean requiresRewrite(SpanQuery spanQuery) {
     return null;
   }

   /**
    * When highlighting phrases accurately, we may need to handle custom queries that aren't
    * supported in the {@link org.apache.lucene.search.highlight.WeightedSpanTermExtractor} as called
    * by the {@code PhraseHelper}. Should custom query types be needed, this method should be
    * overridden to return a collection of queries if appropriate, or null if nothing to do. If the
    * query is not custom, simply returning null will allow the default rules to apply. This base
    * implementation always returns null.
    *
    * @param query Query to be highlighted
    * @return A Collection of Query object(s) if needs to be rewritten, otherwise null.
    */
   protected Collection<Query> preSpanQueryRewrite(Query query) {
     return null;
   }

   /**
    * Exposes an array of doc IDs as a forward-only {@link DocIdSetIterator} that walks the array
    * from first to last element.
    *
    * @param sortedDocIds the doc IDs to iterate; expected to be in ascending order (not checked
    *     here).
    * @return an iterator whose {@code docID()} is -1 before the first {@code nextDoc()}, the
    *     current array element while positioned, and {@code NO_MORE_DOCS} once exhausted.
    */
   private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) {
     return new DocIdSetIterator() {
       // -1 means not yet positioned; sortedDocIds.length means exhausted
       int idx = -1;

       @Override
       public int docID() {
         if (idx < 0) {
           // The DocIdSetIterator contract asks for -1 (not NO_MORE_DOCS) before iteration starts.
           return -1;
         }
         if (idx >= sortedDocIds.length) {
           return NO_MORE_DOCS;
         }
         return sortedDocIds[idx];
       }

       @Override
       public int nextDoc() throws IOException {
         if (idx < sortedDocIds.length) {
           idx++; // stop at the exhausted position so repeated calls cannot grow idx unbounded
         }
         return docID();
       }

       @Override
       public int advance(int target) throws IOException {
         return super.slowAdvance(target); // not expected to be called; a linear scan suffices
       }

       @Override
       public long cost() {
         return Math.max(0, sortedDocIds.length - (idx + 1)); // remaining docs
       }
     };
   }

   /**
    * Loads, per document, the String values of the fields to be highlighted. The default
    * implementation reads stored fields of the same names, but a subclass can change the source.
    * The returned Strings must be identical to what was indexed (at least for postings or
    * term-vectors offset sources). At least one document from the given {@link DocIdSetIterator}
    * must be loaded, but not necessarily all of them: by default the character lengths are summed
    * and loading stops once {@code cacheCharsThreshold} is exceeded. Specifically if that number is
    * 0, then only one document is fetched no matter what. Entries in a {@link CharSequence} array
    * will be null if no value was found.
    *
    * @param fields the field names to load.
    * @param docIter supplies the doc IDs to load, advanced only as far as documents were loaded.
    * @param cacheCharsThreshold total number of chars after which no further document is loaded.
    * @return one array per loaded document, in iteration order, as produced by the visitor.
    * @throws IOException if reading the documents fails
    */
   protected List<CharSequence[]> loadFieldValues(
       String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
     final boolean singleDocOnly = cacheCharsThreshold == 0;
     final int initialCapacity = singleDocOnly ? 1 : (int) Math.min(64, docIter.cost());
     final List<CharSequence[]> loadedDocs = new ArrayList<>(initialCapacity);

     final LimitedStoredFieldVisitor visitor = newLimitedStoredFieldsVisitor(fields);
     int charsLoaded = 0;
     for (int docId = docIter.nextDoc();
         docId != DocIdSetIterator.NO_MORE_DOCS;
         docId = docIter.nextDoc()) {
       visitor.init();
       searcher.doc(docId, visitor);
       final CharSequence[] valuesByField = visitor.getValuesByField();
       loadedDocs.add(valuesByField);
       for (final CharSequence value : valuesByField) {
         if (value != null) {
           charsLoaded += value.length();
         }
       }
       // leave before the iterator is advanced again, so the next document stays unconsumed
       if (singleDocOnly || charsLoaded > cacheCharsThreshold) {
         break;
       }
     }
     return loadedDocs;
   }

   /**
    * Creates the stored-fields visitor for the given fields, using {@code MULTIVAL_SEP_CHAR} as
    * the value separator and {@code getMaxLength()} as the length cap.
    *
    * @lucene.internal
    */
   protected LimitedStoredFieldVisitor newLimitedStoredFieldsVisitor(String[] fields) {
     final int maxLength = getMaxLength();
     return new LimitedStoredFieldVisitor(fields, MULTIVAL_SEP_CHAR, maxLength);
   }

   /**
    * A {@link StoredFieldVisitor} that gathers the string values of a fixed set of fields for
    * highlighting. Several values of the same field are joined with a separator char, and the text
    * kept per field never exceeds a maximum length.
    *
    * @lucene.internal
    */
   protected static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
     /** Names of the fields to collect; searched with a binary search, so they must be sorted. */
     protected final String[] fields;

     /** Char placed between successive values of the same field. */
     protected final char valueSeparator;

     /** Maximum number of chars kept per field, separators included. */
     protected final int maxLength;

     /** Collected text, parallel to {@code fields}; starts off as String; may become StringBuilder. */
     protected CharSequence[] values;

     /** Result of the last field lookup in {@code needsField}; negative if the field is not wanted. */
     protected int currentField;

     public LimitedStoredFieldVisitor(String[] fields, char valueSeparator, int maxLength) {
       this.fields = fields;
       this.valueSeparator = valueSeparator;
       this.maxLength = maxLength;
     }

     /** Resets this visitor for the next document; must be called before each document. */
     void init() {
       currentField = -1;
       values = new CharSequence[fields.length];
     }

     @Override
     public void stringField(FieldInfo fieldInfo, String value) throws IOException {
       assert currentField >= 0;
       Objects.requireNonNull(value, "String value should not be null");
       final CharSequence existing = values[currentField];

       // First value of this field: keep it as a (possibly truncated) String.
       if (existing == null) {
         // question: if truncate due to maxLength, should we try and avoid keeping the other chars
         // in-memory on the backing char[]?
         final int keep = Math.min(maxLength, value.length());
         values[currentField] = value.substring(0, keep); // note: may return 'this'
         return;
       }

       // Further value: nothing to do if the field already used up its budget.
       final int remaining = maxLength - existing.length();
       if (remaining <= 0) {
         return;
       }

       final StringBuilder joined;
       if (existing instanceof StringBuilder) {
         joined = (StringBuilder) existing;
       } else {
         // upgrade String to StringBuilder. Choose a good initial size.
         final int capacity = existing.length() + Math.min(remaining, value.length() + 256);
         joined = new StringBuilder(capacity).append(existing);
       }
       // the separator takes one char of the budget, the value gets whatever is left
       joined.append(valueSeparator).append(value, 0, Math.min(remaining - 1, value.length()));
       values[currentField] = joined;
     }

     @Override
     public Status needsField(FieldInfo fieldInfo) throws IOException {
       final int fieldIdx = Arrays.binarySearch(fields, fieldInfo.name);
       currentField = fieldIdx;
       if (fieldIdx < 0) {
         return Status.NO;
       }
       final CharSequence existing = values[fieldIdx];
       final boolean budgetUsedUp = existing != null && existing.length() >= maxLength;
       if (!budgetUsedUp) {
         return Status.YES;
       }
       // with a single wanted field there is nothing left to gain from this document
       return fields.length == 1 ? Status.STOP : Status.NO;
     }

     /** Returns the collected values, parallel to {@code fields}; null where nothing was found. */
     CharSequence[] getValuesByField() {
       return this.values;
     }
   }

   /**
    * Wraps an IndexReader that remembers/caches the last call to {@link
    * LeafReader#getTermVectors(int)} so that if the next call has the same ID, then it is reused. If
    * TV's were column-stride (like doc-values), there would be no need for this.
    */
   private static class TermVectorReusingLeafReader extends FilterLeafReader {

     /**
      * Wraps every leaf of {@code reader} in a {@link TermVectorReusingLeafReader} and presents
      * them as one composite reader. Closing the returned reader does not close {@code reader}.
      *
      * @param reader the reader whose leaves are wrapped.
      * @return a composite reader over the wrapped leaves, without a reader cache helper.
      * @throws IOException if building the composite reader fails
      */
     static IndexReader wrap(IndexReader reader) throws IOException {
       LeafReader[] leafReaders =
           reader.leaves().stream()
               .map(LeafReaderContext::reader)
               .map(TermVectorReusingLeafReader::new)
               .toArray(LeafReader[]::new);
       return new BaseCompositeReader<IndexReader>(leafReaders) {
         @Override
         protected void doClose() { // don't close the underlying reader
         }

         @Override
         public CacheHelper getReaderCacheHelper() {
           return null;
         }
       };
     }

     // doc ID whose term vectors are held in tvFields; -1 until the first successful fetch
     private int lastDocId = -1;
     // term vectors of lastDocId, exactly as returned by the wrapped reader
     private Fields tvFields;

     TermVectorReusingLeafReader(LeafReader in) {
       super(in);
     }

     @Override
     public Fields getTermVectors(int docID) throws IOException {
       if (docID != lastDocId) {
         // Fetch first and record the ID afterwards: should the delegate throw, the cache still
         // describes the previous document instead of pairing the new ID with stale vectors.
         tvFields = in.getTermVectors(docID);
         lastDocId = docID;
       }
       return tvFields;
     }

     @Override
     public CacheHelper getCoreCacheHelper() {
       return null;
     }

     @Override
     public CacheHelper getReaderCacheHelper() {
       return null;
     }
   }

   /**
    * Flags for controlling highlighting behavior. The flags in effect for a field are produced by
    * {@link UnifiedHighlighter#getFlags(String)}.
    */
   public enum HighlightFlag {
     /**
      * Highlight phrases strictly.
      *
      * @see UnifiedHighlighter#setHighlightPhrasesStrictly(boolean)
      */
     PHRASES,

     /**
      * Highlight multi-term queries.
      *
      * @see UnifiedHighlighter#setHandleMultiTermQuery(boolean)
      */
     MULTI_TERM_QUERY,

     /** Passage relevancy is more important than speed. True by default. */
     PASSAGE_RELEVANCY_OVER_SPEED,

     /**
      * Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's
      * more accurate to the query, though might not calculate passage relevancy as well. Use of this
      * flag requires {@link #MULTI_TERM_QUERY} and {@link #PHRASES}. {@link
      * #PASSAGE_RELEVANCY_OVER_SPEED} will be ignored. False by default.
      */
     WEIGHT_MATCHES

     // TODO: useQueryBoosts
   }
 }