blob: d6cb282f138e369bb05d986eeb8ca4a0aa734b8c [file] [log] [blame]
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace Lucene.Net.Search.Highlight
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/// <summary>
/// Class used to mark up highlighted terms found in the best sections of a
/// text, using configurable <see cref="IFragmenter"/>, <see cref="IScorer"/>, <see cref="IFormatter"/>,
/// <see cref="IEncoder"/> and tokenizers.
/// </summary>
public class Highlighter
public static readonly int DEFAULT_MAX_CHARS_TO_ANALYZE = 50 * 1024;
private int _maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
private readonly IFormatter _formatter; // LUCENENET: marked readonly
private IEncoder _encoder;
private IFragmenter _textFragmenter = new SimpleFragmenter();
private IScorer _fragmentScorer = null;
/// <summary>
/// Creates a highlighter that formats matches with a new <see cref="SimpleHTMLFormatter"/>
/// and scores fragments with <paramref name="fragmentScorer"/>.
/// </summary>
/// <param name="fragmentScorer">scorer used to rank the candidate fragments</param>
public Highlighter(IScorer fragmentScorer)
    : this(new SimpleHTMLFormatter(), fragmentScorer)
{
}
/// <summary>
/// Creates a highlighter that formats matches with <paramref name="formatter"/>,
/// encodes text with a new <see cref="DefaultEncoder"/> and scores fragments
/// with <paramref name="fragmentScorer"/>.
/// </summary>
/// <param name="formatter">formatter used to mark up highlighted terms</param>
/// <param name="fragmentScorer">scorer used to rank the candidate fragments</param>
public Highlighter(IFormatter formatter, IScorer fragmentScorer)
    : this(formatter, new DefaultEncoder(), fragmentScorer)
{
}
/// <summary>
/// Creates a highlighter from explicitly supplied collaborators.
/// The arguments are stored as given; no validation is performed here.
/// </summary>
/// <param name="formatter">formatter used to mark up highlighted terms</param>
/// <param name="encoder">encoder applied to the text that is written to the output</param>
/// <param name="fragmentScorer">scorer used to rank the candidate fragments</param>
public Highlighter(IFormatter formatter, IEncoder encoder, IScorer fragmentScorer)
{
    _formatter = formatter;
    _encoder = encoder;
    _fragmentScorer = fragmentScorer;
}
/// <summary>
/// Highlights chosen terms in a text, extracting the most relevant section.
/// This is a convenience method that obtains a token stream from
/// <paramref name="analyzer"/> and calls <see cref="GetBestFragment(TokenStream, string)"/>.
/// </summary>
/// <param name="analyzer">the analyzer that will be used to split <paramref name="text"/> into chunks</param>
/// <param name="fieldName">Name of field used to influence analyzer's tokenization policy</param>
/// <param name="text">text to highlight terms in</param>
/// <returns>highlighted text fragment or null if no terms found</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string GetBestFragment(Analyzer analyzer, string fieldName, string text)
{
    TokenStream tokenStream = analyzer.GetTokenStream(fieldName, text);
    return GetBestFragment(tokenStream, text);
}
/// <summary>
/// Highlights chosen terms in a text, extracting the most relevant section.
/// The document text is analysed in chunks to record hit statistics
/// across the document. After accumulating stats, the fragment with the highest score
/// is returned.
/// </summary>
/// <param name="tokenStream">
/// A stream of tokens identified in the text parameter, including offset information.
/// This is typically produced by an analyzer re-parsing a document's
/// text. Some work may be done on retrieving TokenStreams more efficiently
/// by adding support for storing original text position data in the Lucene
/// index but this support is not currently available (as of Lucene 1.4 rc2).
/// </param>
/// <param name="text">text to highlight terms in</param>
/// <returns>highlighted text fragment or null if no terms found</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string GetBestFragment(TokenStream tokenStream, string text)
{
    // Ask for a single fragment; an empty result means nothing scored.
    string[] results = GetBestFragments(tokenStream, text, 1);
    return results.Length > 0 ? results[0] : null;
}
/// <summary>
/// Highlights chosen terms in a text, extracting the most relevant sections.
/// This is a convenience method that obtains a token stream from
/// <paramref name="analyzer"/> and calls <see cref="GetBestFragments(TokenStream, string, int)"/>.
/// </summary>
/// <param name="analyzer">the analyzer that will be used to split <paramref name="text"/> into chunks</param>
/// <param name="fieldName">the name of the field being highlighted (used by analyzer)</param>
/// <param name="text">text to highlight terms in</param>
/// <param name="maxNumFragments">the maximum number of fragments.</param>
/// <returns>highlighted text fragments (between 0 and <paramref name="maxNumFragments"/> number of fragments)</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string[] GetBestFragments(
    Analyzer analyzer,
    string fieldName,
    string text,
    int maxNumFragments)
{
    TokenStream tokenStream = analyzer.GetTokenStream(fieldName, text);
    return GetBestFragments(tokenStream, text, maxNumFragments);
}
/// <summary>
/// Highlights chosen terms in a text, extracting the most relevant sections.
/// The document text is analysed in chunks to record hit statistics
/// across the document. After accumulating stats, the fragments with the highest scores
/// are returned as an array of strings in order of score (contiguous fragments are merged into
/// one in their original order to improve readability)
/// </summary>
/// <param name="tokenStream">a stream of tokens identified in <paramref name="text"/>, including offset information</param>
/// <param name="text">text to highlight terms in</param>
/// <param name="maxNumFragments">the maximum number of fragments; values below 1 are treated as 1.</param>
/// <returns>highlighted text fragments (between 0 and <paramref name="maxNumFragments"/> number of fragments)</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string[] GetBestFragments(TokenStream tokenStream, string text, int maxNumFragments)
{
    maxNumFragments = Math.Max(1, maxNumFragments); //sanity check

    TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments);

    // Collect the text of every usable fragment: null slots and fragments with a
    // non-positive score are skipped.
    var fragTexts = new List<string>(frag.Length);
    for (int i = 0; i < frag.Length; i++)
    {
        if ((frag[i] != null) && (frag[i].Score > 0))
        {
            fragTexts.Add(frag[i].ToString());
        }
    }
    return fragTexts.ToArray();
}
/// <summary>
/// Low level api to get the most relevant (formatted) sections of the document.
/// This method has been made public to allow visibility of score information held in <see cref="TextFragment"/> objects.
/// Thanks to Jason Calabrese for help in redefining the interface.
/// </summary>
/// <param name="tokenStream">stream of tokens for <paramref name="text"/>; its term and offset attributes are read while iterating</param>
/// <param name="text">the original text; token offsets are used as indexes into it</param>
/// <param name="mergeContiguousFragments">when true, the result is rebuilt from the entries that are non-null and have a positive score</param>
/// <param name="maxNumFragments">size passed to the fragment queue that holds the candidate fragments</param>
/// <returns>the fragments popped from the fragment queue</returns>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public TextFragment[] GetBestTextFragments(
TokenStream tokenStream,
string text,
bool mergeContiguousFragments,
int maxNumFragments)
// NOTE(review): brace-only lines and several statements are absent from this copy of the
// method (block delimiters, try/finally, token accumulation, queue insertion). The notes
// below mark the visible gaps; confirm against the upstream file before relying on them.
var docFrags = new List<TextFragment>();
// Marked-up output is accumulated here; fragments are created over this same builder.
var newText = new StringBuilder();
var termAtt = tokenStream.AddAttribute<ICharTermAttribute>();
var offsetAtt = tokenStream.AddAttribute<IOffsetAttribute>();
var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
// NOTE(review): the body of this QueryScorer branch is not visible here.
if (_fragmentScorer is QueryScorer queryScorer) {
// The scorer may hand back a replacement stream; a non-null result is used from here on.
var newStream = _fragmentScorer.Init(tokenStream);
if (newStream != null)
tokenStream = newStream;
var fragQueue = new FragmentQueue(maxNumFragments);
string tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
_textFragmenter.Start(text, tokenStream);
var tokenGroup = new TokenGroup(tokenStream);
// Iterate while the stream yields tokens and the current token starts before the
// analysis limit (_maxDocCharsToAnalyze).
for (bool next = tokenStream.IncrementToken();
next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
next = tokenStream.IncrementToken())
// Reject tokens whose offsets point past the end of the text.
// NOTE(review): the operator joining the two conditions is missing in this copy
// (presumably a logical OR) - confirm.
if ((offsetAtt.EndOffset > text.Length)
(offsetAtt.StartOffset > text.Length)
throw new InvalidTokenOffsetsException("Token " + termAtt.ToString()
+ " exceeds length of provided text sized " + text.Length);
if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.MatchStartOffset;
endOffset = tokenGroup.MatchEndOffset;
tokenText = text.Substring(startOffset, endOffset - startOffset);
// NOTE(review): markedUpText is computed but no statement using it is visible here.
string markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
lastEndOffset = Math.Max(endOffset, lastEndOffset);
//check if current token marks the start of a new fragment
if (_textFragmenter.IsNewFragment())
// Close the current fragment (score and end position) before starting the next one.
currentFrag.Score = _fragmentScorer.FragmentScore;
//record stats for a new fragment
currentFrag.TextEndPos = newText.Length;
currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
currentFrag.Score = _fragmentScorer.FragmentScore;
if (tokenGroup.NumTokens > 0)
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.MatchStartOffset;
endOffset = tokenGroup.MatchEndOffset;
tokenText = text.Substring(startOffset, endOffset - startOffset);
var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
lastEndOffset = Math.Max(lastEndOffset, endOffset);
//Test what remains of the original text beyond the point where we stopped analyzing
// NOTE(review): the operator joining the two conditions below and the statement that
// appends the remaining text are missing in this copy.
if (
// if there is text beyond the last token considered..
(lastEndOffset < text.Length)
// and that text is not too large...
(text.Length <= _maxDocCharsToAnalyze)
//append it to the last fragment
currentFrag.TextEndPos = newText.Length;
//sort the most relevant sections of the text
foreach (var f in docFrags)
currentFrag = f;
//If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
// NOTE(review): the next five lines use getScore()/size()/pop()/minScore, which do not match
// the Score/Count/Pop members used elsewhere in this method; they look like legacy code that
// was commented out upstream and lost its comment delimiters - confirm.
if (currentFrag.getScore() >= minScore)
if (fragQueue.size() > maxNumFragments)
{ // if hit queue overfull
fragQueue.pop(); // remove lowest in hit queue
minScore = ((TextFragment); // reset minScore
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
//return the most relevant fragments
// The array is filled from the last index down to index 0 with successive Pop() results.
var frag = new TextFragment[fragQueue.Count];
for (int i = frag.Length - 1; i >= 0; i--)
frag[i] = fragQueue.Pop();
//merge any contiguous fragments to improve readability
// NOTE(review): neither the merge call nor the statements that fill fragTexts and the
// new array are visible in this copy.
if (mergeContiguousFragments)
List<TextFragment> fragTexts = new List<TextFragment>();
for (int i = 0; i < frag.Length; i++)
if ((frag[i] != null) && (frag[i].Score > 0))
frag = new TextFragment[fragTexts.Count];
return frag;
// NOTE(review): what follows looks like the remains of stream clean-up code
// (the guarded statements and the try/finally structure are missing).
if (tokenStream != null)
catch (Exception)
/// <summary>
/// Improves readability of a score-sorted list of TextFragments by merging any fragments
/// that were contiguous in the original text into one larger fragment with the correct order.
/// This will leave a "null" in the array entry for the lesser scored fragment.
/// </summary>
/// <param name="frag">An array of document fragments in descending score</param>
private static void MergeContiguousFragments(TextFragment[] frag) // LUCENENET: CA1822: Mark members as static
// NOTE(review): brace-only lines and some statements are absent from this copy of the method.
// The trailing "} while (mergingStillBeingDone);" implies an enclosing do-loop whose opening
// line is missing; the bare null checks below presumably skipped to the next iteration - confirm.
bool mergingStillBeingDone;
// Arrays with fewer than two entries have nothing to merge.
if (frag.Length > 1)
mergingStillBeingDone = false; //initialise loop control flag
//for each fragment, scan other frags looking for contiguous blocks
for (int i = 0; i < frag.Length; i++)
if (frag[i] == null)
//merge any contiguous blocks
for (int x = 0; x < frag.Length; x++)
if (frag[x] == null)
if (frag[i] == null)
// frag1 is the fragment that comes first in the text, frag2 the one that follows it;
// frag1Num/frag2Num remember their positions in the array.
TextFragment frag1 = null;
TextFragment frag2 = null;
int frag1Num = 0;
int frag2Num = 0;
int bestScoringFragNum;
int worstScoringFragNum;
//if blocks are contiguous....
if (frag[i].Follows(frag[x]))
frag1 = frag[x];
frag1Num = x;
frag2 = frag[i];
frag2Num = i;
else if (frag[x].Follows(frag[i]))
frag1 = frag[i];
frag1Num = i;
frag2 = frag[x];
frag2Num = x;
//merging required..
if (frag1 != null)
// Decide which array slot keeps the result: the slot of the higher scoring fragment.
// NOTE(review): the "else" between the two assignment pairs is missing in this copy.
if (frag1.Score > frag2.Score)
bestScoringFragNum = frag1Num;
worstScoringFragNum = frag2Num;
bestScoringFragNum = frag2Num;
worstScoringFragNum = frag1Num;
// NOTE(review): the statement that actually merges frag2 into frag1 is not visible here.
// The losing slot is cleared and frag1 is stored in the winning slot; setting the flag
// requests another pass of the outer loop.
frag[worstScoringFragNum] = null;
mergingStillBeingDone = true;
frag[bestScoringFragNum] = frag1;
} while (mergingStillBeingDone);
/// <summary>
/// Highlights terms in the <paramref name="text"/>, extracting the most relevant sections
/// and concatenating the chosen fragments with a separator (typically "...").
/// The document text is analysed in chunks to record hit statistics
/// across the document. After accumulating stats, the fragments with the highest scores
/// are returned in order as "separator" delimited strings.
/// </summary>
/// <param name="tokenStream">a stream of tokens identified in <paramref name="text"/>, including offset information</param>
/// <param name="text">text to highlight terms in</param>
/// <param name="maxNumFragments">the maximum number of fragments.</param>
/// <param name="separator">the separator used to intersperse the document fragments (typically "...")</param>
/// <returns>highlighted text; an empty string when no fragment was selected</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public virtual string GetBestFragments(
    TokenStream tokenStream,
    string text,
    int maxNumFragments,
    string separator)
{
    string[] sections = GetBestFragments(tokenStream, text, maxNumFragments);

    // string.Join places the separator only between sections (never before the first
    // or after the last) and treats a null separator as an empty string.
    return string.Join(separator, sections);
}
/// <summary>
/// Gets or sets the analysis limit in characters. Token processing stops at the first
/// token whose StartOffset is not below this value.
/// Defaults to <see cref="DEFAULT_MAX_CHARS_TO_ANALYZE"/>.
/// </summary>
public virtual int MaxDocCharsToAnalyze
{
    get => _maxDocCharsToAnalyze;
    set => _maxDocCharsToAnalyze = value;
}
/// <summary>
/// Gets or sets the <see cref="IFragmenter"/> that decides where a new fragment begins.
/// Defaults to a <see cref="SimpleFragmenter"/>.
/// </summary>
public virtual IFragmenter TextFragmenter
{
    get => _textFragmenter;
    set => _textFragmenter = value;
}
/// <summary>
/// Gets or sets the <see cref="IScorer"/> used to score fragments.
/// Initially the scorer passed to the constructor.
/// </summary>
public virtual IScorer FragmentScorer
{
    get => _fragmentScorer;
    set => _fragmentScorer = value;
}
/// <summary>
/// Gets or sets the <see cref="IEncoder"/> applied to text before it is added to the
/// highlighted output. Initially the encoder passed to the constructor.
/// </summary>
public virtual IEncoder Encoder
{
    get => _encoder;
    set => _encoder = value;
}
/// <summary>
/// Priority queue of <see cref="TextFragment"/>s ordered by score. Fragments with equal
/// scores are ordered by fragment number, the higher number counting as "less".
/// </summary>
internal class FragmentQueue : PriorityQueue<TextFragment>
{
    /// <summary>Creates a queue with the given size.</summary>
    /// <param name="size">size passed to the base priority queue</param>
    public FragmentQueue(int size) : base(size) { }

    /// <summary>
    /// Returns true when <paramref name="fragA"/> ranks below <paramref name="fragB"/>.
    /// </summary>
    protected internal override bool LessThan(TextFragment fragA, TextFragment fragB)
    {
        if (fragA.Score == fragB.Score)
        {
            // Tie-break: the fragment with the higher fragment number is the lesser one.
            return fragA.FragNum > fragB.FragNum;
        }
        return fragA.Score < fragB.Score;
    }
}