/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
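/**
 * End-to-end examples of {@link MatchHighlighter} usage: basic field highlighting, synonym
 * handling, custom per-field highlighters and highlighting of more than one query at once.
 */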
public class TestMatchHighlighter extends LuceneTestCase {
private static final String FLD_ID = "id";
private static final String FLD_TEXT1 = "text1";
private static final String FLD_TEXT2 = "text2";
private FieldType TYPE_TEXT_POSITIONS_OFFSETS;
private FieldType TYPE_TEXT_POSITIONS;
private PerFieldAnalyzerWrapper analyzer;
@Before
public void setup() throws IOException {
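// Two ways of indexing text used in these tests: positions only, and positions plus
// character offsets stored in the index.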
TYPE_TEXT_POSITIONS = TextField.TYPE_STORED;
TYPE_TEXT_POSITIONS_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_TEXT_POSITIONS_OFFSETS.setIndexOptions(
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_TEXT_POSITIONS_OFFSETS.freeze();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
// Create an analyzer with some synonyms, just to showcase them.
SynonymMap synonymMap =
buildSynonymMap(
new String[][] {
{"moon\u0000shine", "firewater"},
{"firewater", "moon\u0000shine"},
});
// Make a non-empty offset gap so that the break iterator doesn't go haywire on multiple
// values glued together.
final int offsetGap = RandomizedTest.randomIntBetween(1, 2);
final int positionGap = RandomizedTest.randomFrom(new int[] {0, 1, 100});
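// AnalyzerWithGaps (a test helper in this package) wraps the delegate analyzer so that the
// randomized gaps above are inserted between multiple values of the same field.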
Analyzer synonymsAnalyzer =
new AnalyzerWithGaps(
offsetGap,
positionGap,
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
});
fieldAnalyzers.put(FLD_TEXT1, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT2, synonymsAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
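/**
 * Builds a SynonymMap from (input, output) pairs. The embedded NUL character
 * (SynonymMap.WORD_SEPARATOR) separates tokens of a multi-term synonym.
 */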
static SynonymMap buildSynonymMap(String[][] synonyms) throws IOException {
SynonymMap.Builder builder = new SynonymMap.Builder();
for (String[] pair : synonyms) {
assertThat(pair.length, Matchers.equalTo(2));
builder.add(new CharsRef(pair[0]), new CharsRef(pair[1]), true);
}
return builder.build();
}
@Test
public void testBasicUsage() throws IOException {
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "bar foo baz")
.doc(
fields -> {
fields.add(FLD_TEXT1, "Very long content but not matching anything.");
fields.add(FLD_TEXT2, "no foo but bar");
})
.build(
analyzer,
reader -> {
Query query =
new BooleanQuery.Builder()
.add(new TermQuery(new Term(FLD_TEXT1, "foo")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(FLD_TEXT2, "bar")), BooleanClause.Occur.SHOULD)
.build();
// In the most basic scenario, we run a search against a query, retrieve
// top docs...
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// ...and would want a fixed set of fields from those documents, some of them
// possibly highlighted if they matched the query.
//
// This configures the highlighter so that the FLD_ID field is always returned
// verbatim, and FLD_TEXT1 is returned *only if it contained a query match*.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// Note document field highlights are a stream over documents in topDocs. In the
// remaining code we will just collect them on the fly into a preformatted string.
Stream<MatchHighlighter.DocHighlights> highlights =
highlighter.highlight(topDocs, query);
assertHighlights(
toDocList(highlights),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2");
// In a more realistic use case, you'd want to show the value of a given field
// *regardless* of whether it contained a highlight or not -- it is odd that document
// "id: 2" above doesn't have the 'text1' field shown because that field wasn't part of
// the query match.
//
// Let's say the field is also potentially long; if it contains a match, we would want
// to display the contextual snippet surrounding that match. If it does not contain any
// matches, we would want to display its content up to a given number of characters
// (lead lines).
//
// Let's do this by adding an appropriate field highlighter on FLD_TEXT1.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(
FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...");
// Field highlighters can apply to multiple fields and be chained for convenience.
// For example, this defines a combined highlighter over both FLD_TEXT1 and FLD_TEXT2.
Set<String> fields = Set.of(FLD_TEXT1, FLD_TEXT2);
MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated =
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains)
.or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields));
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(highlightedOrAbbreviated)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...",
" text2: no foo but >bar<");
});
}
@Test
public void testSynonymHighlight() throws IOException {
// There is nothing special needed to highlight or process complex queries, synonyms, etc.
// The synonyms are defined in the setup() method of this class.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
.build(
analyzer,
reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "firewater"));
assertHighlights(
toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
query = new PhraseQuery(FLD_TEXT1, "moon", "shine");
assertHighlights(
toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
});
}
@Test
public void testCustomFieldHighlightHandling() throws IOException {
// MatchHighlighter is a showcase of the individual components in this package, suitable
// for creating any kind of field-display design.
//
// In this example we will build a custom field highlighting handler that highlights
// matches over a multivalued field, shows that field's values if it received no
// matches, and limits the number of values displayed to at most 2 (with an appropriate
// message).
new IndexBuilder(this::toField)
// Just one document, one field, four values.
.doc(FLD_TEXT1, "foo bar", "bar foo baz", "bar baz foo", "baz baz baz")
.build(
analyzer,
reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER;
// Let's start with the simple predefined highlighter so that the field's value shows
// and is highlighted when it was part of the hit.
MatchHighlighter.FieldValueHighlighter highlighted =
FieldValueHighlighters.highlighted(
80 * 3, 2, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals);
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "foo"));
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// Note the highlighter is configured with at most 2 snippets, so the match on the
// third value ("bar baz foo") is omitted. No ellipsis is inserted either, because the
// values are displayed in full.
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
// So the above works fine if the field received a match but omits it otherwise. We can
// force the display of this field by chaining with the verbatim value highlighter:
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
highlighted.or(FieldValueHighlighters.verbatimValue(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, bar baz foo, baz baz baz");
// But this is not exactly what we'd like because we want to limit the display of values
// to the first two. Let's just write a custom field highlighter handler that does it.
class AtMostNValuesHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final int limit;
AtMostNValuesHighlighter(String field, int limit) {
this.field = field;
this.limit = limit;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field);
}
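// 'values' holds the field's raw values; 'contiguousValue' is all of them glued together,
// with 'valueRanges' marking each value's span and 'matchOffsets' the query matches within
// that contiguous string. This highlighter only needs the raw values.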
@Override
public List<String> format(
String field,
String[] values,
String contiguousValue,
List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
if (values.length <= limit) {
return Arrays.asList(values);
} else {
List<String> collected =
Stream.of(values).limit(limit).collect(Collectors.toList());
int remaining = values.length - collected.size();
collected.add(String.format(Locale.ROOT, "[%d omitted]", remaining));
return collected;
}
}
@Override
public Collection<String> alwaysFetchedFields() {
return Collections.singleton(field);
}
}
// We can now chain it as usual and contemplate the result.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
highlighted.or(new AtMostNValuesHighlighter(FLD_TEXT1, 2)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
assertHighlights(
toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, [2 omitted]");
});
}
@Test
public void testHighlightMoreQueriesAtOnceShowoff() throws IOException {
// The MatchHighlighter's underlying components are powerful enough to build interesting,
// if not always super-practical, things. In this case, we would like to highlight the
// matches of *more than one* query over the same set of input documents. This includes
// highest-scoring passage resolution (from multiple hits) and different highlight
// markers for each query.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "foo baz bar")
.build(
analyzer,
reader -> {
// Let's start with the two queries. The first one will be an unordered
// query for (foo, baz) with a max gap of 1; let's use intervals for this.
Query q1 =
new IntervalQuery(
FLD_TEXT1,
Intervals.maxgaps(
1, Intervals.unordered(Intervals.term("foo"), Intervals.term("baz"))));
// The second one will be a simpler term query for "bar".
Query q2 = new TermQuery(new Term(FLD_TEXT1, "bar"));
// Let's fetch matching documents by combining the two into a Boolean query.
Query query =
new BooleanQuery.Builder()
.add(q1, BooleanClause.Occur.SHOULD)
.add(q2, BooleanClause.Occur.SHOULD)
.build();
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// If we use the "regular" highlighter, the result will be slightly odd: a nested
// highlight over "bar" within the first match. Also, you can't distinguish which of
// the sub-queries
// caused which highlight marker... but if it were HTML then you could give the span
// some semi-translucent background and layered matches would be visible.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3,
1,
new PassageFormatter("...", "<span>", "</span>"),
FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
"0. text1: <span>foo <span>bar</span> baz</span>",
"1. text1: <span>foo baz</span> <span>bar</span>");
// To separate highlights for multiple queries we'll pass them separately to the
// highlighter and differentiate highlight markers upon their application. Let's start
// with the customized field highlighter. It utilizes the fact that match ranges passed
// from MatchHighlighter contain a reference to the original query which brought up the
// match.
class SeparateMarkerFieldHighlighter
implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final Map<Query, String> queryClassMap;
SeparateMarkerFieldHighlighter(String field, Map<Query, String> queryClassMap) {
this.field = field;
this.queryClassMap = queryClassMap;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field) && hasMatches;
}
@Override
public List<String> format(
String field,
String[] values,
String contiguousValue,
List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
PassageSelector passageSelector = new PassageSelector();
int maxPassageWindow = 80;
int maxPassages = 3;
List<Passage> bestPassages =
passageSelector.pickBest(
contiguousValue,
matchOffsets,
maxPassageWindow,
maxPassages,
valueRanges);
// We know the offset ranges passed to us by MatchHighlighter are instances of
// QueryOffsetRange, so we compute the class based on that.
Function<OffsetRange, String> queryToClass =
(range) ->
queryClassMap.get(((MatchHighlighter.QueryOffsetRange) range).query);
PassageFormatter passageFormatter =
new PassageFormatter(
"...",
(range) -> "<span class='" + queryToClass.apply(range) + "'>",
(range) -> "</span>");
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
}
// And this is pretty much it. We now set up query classes to display, set up the
// highlighter...
Map<Query, String> queryClassMap = Map.of(q1, "q1", q2, "q2");
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
new SeparateMarkerFieldHighlighter(FLD_TEXT1, queryClassMap))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// ...and run highlighting. Note the queries passed to the highlighter are the
// individual sub-clauses of the Boolean query used to fetch documents.
assertHighlights(
toDocList(highlighter.highlight(topDocs, q1, q2)),
"0. text1: <span class='q1'>foo <span class='q2'>bar</span> baz</span>",
"1. text1: <span class='q1'>foo baz</span> <span class='q2'>bar</span>");
});
}
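/**
 * Renders each document's field highlights as indented lines ("%2d. field: ...") and compares
 * them, ignoring leading and trailing whitespace, against the expected lines.
 */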
private void assertHighlights(List<List<String>> docList, String... expectedFormattedLines) {
ArrayList<String> actualLines = new ArrayList<>();
for (int doc = 0; doc < docList.size(); doc++) {
List<String> fields = docList.get(doc);
for (int i = 0; i < fields.size(); i++) {
actualLines.add(
(i == 0 ? String.format(Locale.ROOT, "%2d. ", doc) : " ") + fields.get(i));
}
}
if (!Arrays.equals(
Stream.of(expectedFormattedLines).map(String::trim).toArray(),
actualLines.stream().map(String::trim).toArray())) {
throw new AssertionError(
"Actual hits were:\n"
+ String.join("\n", actualLines)
+ "\n\n but expected them to be:\n"
+ String.join("\n", expectedFormattedLines));
}
}
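/** Collapses each document's highlights into "field: value, value, ..." strings. */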
private List<List<String>> toDocList(Stream<MatchHighlighter.DocHighlights> highlights) {
return highlights
.map(
docHighlights ->
docHighlights.fields.entrySet().stream()
.map(e -> e.getKey() + ": " + String.join(", ", e.getValue()))
.collect(Collectors.toList()))
.collect(Collectors.toList());
}
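/**
 * Creates an indexable field for the given name and value: FLD_TEXT1 is indexed with positions
 * and offsets, FLD_TEXT2 with positions only.
 */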
private IndexableField toField(String name, String value) {
switch (name) {
case FLD_TEXT1:
return new Field(name, value, TYPE_TEXT_POSITIONS_OFFSETS);
case FLD_TEXT2:
return new Field(name, value, TYPE_TEXT_POSITIONS);
default:
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
}