// blob: 4e414eb8fb8d60fb5a94df690a7a496a4b2c7b93 [file] [log] [blame]
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace Lucene.Net.Search.Highlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Class used to markup highlighted terms found in the best sections of a
/// text, using configurable <see cref="IFragmenter"/>, <see cref="IScorer"/>, <see cref="IFormatter"/>,
/// <see cref="IEncoder"/> and tokenizers.
/// </summary>
public class Highlighter
{
/// <summary>
/// Default value for the analysis limit: 50 * 1024 characters.
/// </summary>
public static readonly int DEFAULT_MAX_CHARS_TO_ANALYZE = 50 * 1024;
// Analysis limit in effect for this instance; tokenization stops at the first token
// whose start offset is not below this value.
private int _maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
// Wraps each highlighted token group in markup (see HighlightTerm calls).
private IFormatter _formatter;
// Encodes every piece of original text before it is appended to the output.
private IEncoder _encoder;
// Decides where one fragment ends and the next begins; defaults to SimpleFragmenter.
private IFragmenter _textFragmenter = new SimpleFragmenter();
// Scores tokens and fragments; assigned by the constructors or the FragmentScorer property.
private IScorer _fragmentScorer = null;
/// <summary>
/// Creates a highlighter that uses a new <see cref="SimpleHTMLFormatter"/> and a new
/// <see cref="DefaultEncoder"/> together with the supplied scorer.
/// </summary>
/// <param name="fragmentScorer">scorer used to rate tokens and fragments</param>
public Highlighter(IScorer fragmentScorer)
    : this(new SimpleHTMLFormatter(), new DefaultEncoder(), fragmentScorer)
{
}

/// <summary>
/// Creates a highlighter that uses the supplied formatter and scorer together with a new
/// <see cref="DefaultEncoder"/>.
/// </summary>
/// <param name="formatter">formatter used to mark up highlighted terms</param>
/// <param name="fragmentScorer">scorer used to rate tokens and fragments</param>
public Highlighter(IFormatter formatter, IScorer fragmentScorer)
    : this(formatter, new DefaultEncoder(), fragmentScorer)
{
}

/// <summary>
/// Creates a highlighter from explicitly supplied collaborators.
/// The arguments are stored as given; no validation is performed here.
/// </summary>
/// <param name="formatter">formatter used to mark up highlighted terms</param>
/// <param name="encoder">encoder applied to original text before it is emitted</param>
/// <param name="fragmentScorer">scorer used to rate tokens and fragments</param>
public Highlighter(IFormatter formatter, IEncoder encoder, IScorer fragmentScorer)
{
    this._fragmentScorer = fragmentScorer;
    this._encoder = encoder;
    this._formatter = formatter;
}
/// <summary>
/// Returns the single best highlighted fragment of <paramref name="text"/>.
/// Convenience overload: obtains a token stream from <paramref name="analyzer"/> and forwards to
/// <see cref="GetBestFragment(TokenStream, string)"/>.
/// </summary>
/// <param name="analyzer">the analyzer that will be used to split <paramref name="text"/> into chunks</param>
/// <param name="fieldName">name of the field, passed to the analyzer when requesting the token stream</param>
/// <param name="text">text to highlight terms in</param>
/// <returns>the highlighted fragment, or <c>null</c> when no fragment was produced</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string GetBestFragment(Analyzer analyzer, string fieldName, string text)
    => GetBestFragment(analyzer.GetTokenStream(fieldName, text), text);
/// <summary>
/// Returns the single best highlighted fragment of <paramref name="text"/>.
/// Requests exactly one fragment from <see cref="GetBestFragments(TokenStream, string, int)"/>
/// and returns it, if any.
/// </summary>
/// <param name="tokenStream">
/// A stream of tokens identified in the text parameter, including offset information.
/// This is typically produced by an analyzer re-parsing a document's text.
/// </param>
/// <param name="text">text to highlight terms in</param>
/// <returns>the highlighted fragment, or <c>null</c> when no fragment was produced</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string GetBestFragment(TokenStream tokenStream, string text)
{
    string[] best = GetBestFragments(tokenStream, text, 1);
    return best.Length > 0 ? best[0] : null;
}
/// <summary>
/// Returns the best highlighted fragments of <paramref name="text"/>.
/// Convenience overload: obtains a token stream from <paramref name="analyzer"/> and forwards to
/// <see cref="GetBestFragments(TokenStream, string, int)"/>.
/// </summary>
/// <param name="analyzer">the analyzer that will be used to split <paramref name="text"/> into chunks</param>
/// <param name="fieldName">the name of the field being highlighted (used by analyzer)</param>
/// <param name="text">text to highlight terms in</param>
/// <param name="maxNumFragments">the maximum number of fragments.</param>
/// <returns>highlighted text fragments (between 0 and <paramref name="maxNumFragments"/> number of fragments)</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string[] GetBestFragments(
    Analyzer analyzer,
    string fieldName,
    string text,
    int maxNumFragments)
    => GetBestFragments(analyzer.GetTokenStream(fieldName, text), text, maxNumFragments);
/// <summary>
/// Returns the best highlighted fragments of <paramref name="text"/> as strings.
/// Fragments are obtained from <see cref="GetBestTextFragments(TokenStream, string, bool, int)"/>
/// with contiguous-fragment merging enabled; only non-null fragments with a score greater
/// than zero are converted to text.
/// </summary>
/// <param name="tokenStream">a stream of tokens identified in <paramref name="text"/>, including offset information</param>
/// <param name="text">text to highlight terms in</param>
/// <param name="maxNumFragments">the maximum number of fragments; values below 1 are treated as 1</param>
/// <returns>highlighted text fragments (between 0 and <paramref name="maxNumFragments"/> number of fragments)</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public string[] GetBestFragments(TokenStream tokenStream, string text, int maxNumFragments)
{
    // Sanity check: always request at least one fragment.
    if (maxNumFragments < 1)
    {
        maxNumFragments = 1;
    }

    TextFragment[] candidates = GetBestTextFragments(tokenStream, text, true, maxNumFragments);

    var rendered = new List<string>();
    foreach (TextFragment candidate in candidates)
    {
        // Skip merged-away slots and fragments without a positive score.
        if (candidate == null || !(candidate.Score > 0))
        {
            continue;
        }
        rendered.Add(candidate.ToString());
    }
    return rendered.ToArray();
}
/// <summary>
/// Low level api to get the most relevant (formatted) sections of the document.
/// This method has been made public to allow visibility of score information held in <see cref="TextFragment"/> objects.
/// Thanks to Jason Calabrese for help in redefining the interface.
/// </summary>
/// <param name="tokenStream">
/// A stream of tokens identified in <paramref name="text"/>, including offset information.
/// When non-null, the stream that was consumed is ended and disposed before this method
/// returns or throws.
/// </param>
/// <param name="text">text to highlight terms in</param>
/// <param name="mergeContiguousFragments">
/// when <c>true</c>, fragments that were adjacent in the text are merged, and the result
/// contains only non-null fragments whose score is greater than zero
/// </param>
/// <param name="maxNumFragments">capacity of the queue used to select the best fragments</param>
/// <returns>the selected fragments, highest scoring first</returns>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's StartOffset or EndOffset exceeds the provided text's length</exception>
public TextFragment[] GetBestTextFragments(
    TokenStream tokenStream,
    string text,
    bool mergeContiguousFragments,
    int maxNumFragments)
{
    // The setup below runs inside the try so that the stream is released even when
    // attribute lookup, Reset() or the scorer's Init() fails.
    try
    {
        var docFrags = new List<TextFragment>();
        var newText = new StringBuilder();

        var termAtt = tokenStream.AddAttribute<ICharTermAttribute>();
        var offsetAtt = tokenStream.AddAttribute<IOffsetAttribute>();
        tokenStream.Reset();
        var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);

        if (_fragmentScorer is QueryScorer queryScorer)
        {
            queryScorer.SetMaxDocCharsToAnalyze(_maxDocCharsToAnalyze);
        }

        // The scorer may hand back a replacement stream to consume instead of the original.
        var newStream = _fragmentScorer.Init(tokenStream);
        if (newStream != null)
        {
            tokenStream = newStream;
        }
        _fragmentScorer.StartFragment(currentFrag);
        docFrags.Add(currentFrag);

        var fragQueue = new FragmentQueue(maxNumFragments);

        int lastEndOffset = 0;
        _textFragmenter.Start(text, tokenStream);

        var tokenGroup = new TokenGroup(tokenStream);

        for (bool next = tokenStream.IncrementToken();
             next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
             next = tokenStream.IncrementToken())
        {
            if ((offsetAtt.EndOffset > text.Length) || (offsetAtt.StartOffset > text.Length))
            {
                throw new InvalidTokenOffsetsException("Token " + termAtt.ToString()
                    + " exceeds length of provided text sized " + text.Length);
            }

            if ((tokenGroup.NumTokens > 0) && tokenGroup.IsDistinct())
            {
                // The current token is distinct from the previous tokens:
                // mark up the cached token group before starting a new one.
                lastEndOffset = AppendTokenGroup(text, tokenGroup, newText, lastEndOffset);
                tokenGroup.Clear();

                // Check if the current token marks the start of a new fragment.
                if (_textFragmenter.IsNewFragment())
                {
                    currentFrag.Score = _fragmentScorer.FragmentScore;
                    // Close the finished fragment and open the next one at the current output position.
                    currentFrag.TextEndPos = newText.Length;
                    currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
                    _fragmentScorer.StartFragment(currentFrag);
                    docFrags.Add(currentFrag);
                }
            }

            tokenGroup.AddToken(_fragmentScorer.GetTokenScore());
        }
        currentFrag.Score = _fragmentScorer.FragmentScore;

        // Flush whatever token group is still pending after the last token.
        if (tokenGroup.NumTokens > 0)
        {
            lastEndOffset = AppendTokenGroup(text, tokenGroup, newText, lastEndOffset);
        }

        // Append the remainder of the original text beyond the last token considered,
        // but only when the whole text fits within the analysis limit.
        if ((lastEndOffset < text.Length) && (text.Length <= _maxDocCharsToAnalyze))
        {
            newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
        }

        currentFrag.TextEndPos = newText.Length;

        // Keep only the best fragments; the queue discards the lowest ranked on overflow.
        foreach (var docFrag in docFrags)
        {
            fragQueue.InsertWithOverflow(docFrag);
        }

        // Pop order is lowest first, so fill the array back-to-front to get the best first.
        var frag = new TextFragment[fragQueue.Count];
        for (int i = frag.Length - 1; i >= 0; i--)
        {
            frag[i] = fragQueue.Pop();
        }

        // Merge any contiguous fragments to improve readability.
        if (mergeContiguousFragments)
        {
            MergeContiguousFragments(frag);
            var keptFrags = new List<TextFragment>();
            for (int i = 0; i < frag.Length; i++)
            {
                // Merging leaves null slots; fragments without a positive score are dropped too.
                if ((frag[i] != null) && (frag[i].Score > 0))
                {
                    keptFrags.Add(frag[i]);
                }
            }
            frag = keptFrags.ToArray();
        }

        return frag;
    }
    finally
    {
        if (tokenStream != null)
        {
            try
            {
                try
                {
                    tokenStream.End();
                }
                finally
                {
                    // Dispose must run even when End() throws, otherwise the stream is never released.
                    tokenStream.Dispose();
                }
            }
            catch (Exception)
            {
                // Cleanup is deliberately best-effort: a failure while ending or disposing
                // the stream must not mask the result or the original exception.
            }
        }
    }
}

/// <summary>
/// Appends the text covered by <paramref name="tokenGroup"/> to <paramref name="newText"/>:
/// the group's text is encoded and passed to the formatter, any original text lying between
/// <paramref name="lastEndOffset"/> and the group's start is encoded and appended first,
/// followed by the marked-up group text.
/// </summary>
/// <param name="text">the original text being highlighted</param>
/// <param name="tokenGroup">the token group to emit; it is not cleared by this method</param>
/// <param name="newText">the output buffer</param>
/// <param name="lastEndOffset">offset in <paramref name="text"/> up to which output has already been produced</param>
/// <returns>the larger of <paramref name="lastEndOffset"/> and the group's end offset</returns>
private int AppendTokenGroup(string text, TokenGroup tokenGroup, StringBuilder newText, int lastEndOffset)
{
    int startOffset = tokenGroup.MatchStartOffset;
    int endOffset = tokenGroup.MatchEndOffset;
    string tokenText = text.Substring(startOffset, endOffset - startOffset);
    string markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);

    // Store any whitespace etc. from between this and the last group.
    if (startOffset > lastEndOffset)
    {
        newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
    }
    newText.Append(markedUpText);
    return Math.Max(endOffset, lastEndOffset);
}
/// <summary>
/// Merges fragments that were adjacent in the original text into a single larger fragment.
/// The merged fragment is stored in the slot of the better scoring of the two and the other
/// slot is set to <c>null</c>. Passes over the array are repeated until a full pass performs no merge.
/// </summary>
/// <param name="frag">An array of document fragments in descending score; modified in place</param>
private void MergeContiguousFragments(TextFragment[] frag)
{
    // Nothing can be merged with fewer than two fragments.
    if (frag.Length <= 1)
    {
        return;
    }

    bool mergedThisPass;
    do
    {
        mergedThisPass = false;

        // For each fragment, scan the other fragments looking for contiguous blocks.
        for (int outer = 0; outer < frag.Length; outer++)
        {
            if (frag[outer] == null)
            {
                continue;
            }

            for (int inner = 0; inner < frag.Length; inner++)
            {
                if (frag[inner] == null)
                {
                    continue;
                }
                // The outer slot may have been emptied by a merge earlier in this inner scan.
                if (frag[outer] == null)
                {
                    break;
                }

                // Work out which of the two comes first in the text, if they touch at all.
                int leadingIndex;
                int trailingIndex;
                if (frag[outer].Follows(frag[inner]))
                {
                    leadingIndex = inner;
                    trailingIndex = outer;
                }
                else if (frag[inner].Follows(frag[outer]))
                {
                    leadingIndex = outer;
                    trailingIndex = inner;
                }
                else
                {
                    continue;
                }

                TextFragment leading = frag[leadingIndex];
                TextFragment trailing = frag[trailingIndex];

                // Scores are compared before merging; the merged fragment takes the winner's slot.
                bool leadingScoresHigher = leading.Score > trailing.Score;
                int keepIndex = leadingScoresHigher ? leadingIndex : trailingIndex;
                int clearIndex = leadingScoresHigher ? trailingIndex : leadingIndex;

                leading.Merge(trailing);
                frag[clearIndex] = null;
                mergedThisPass = true;
                frag[keepIndex] = leading;
            }
        }
    } while (mergedThisPass);
}
/// <summary>
/// Highlights terms in the <paramref name="text"/>, extracting the most relevant sections
/// and concatenating the chosen fragments with a separator (typically "...").
/// The fragments come from <see cref="GetBestFragments(TokenStream, string, int)"/> and are
/// joined in the order returned.
/// </summary>
/// <param name="tokenStream">a stream of tokens identified in <paramref name="text"/>, including offset information</param>
/// <param name="text">text to highlight terms in</param>
/// <param name="maxNumFragments">the maximum number of fragments.</param>
/// <param name="separator">the separator used to intersperse the document fragments (typically "...")</param>
/// <returns>highlighted text; an empty string when no fragment was produced</returns>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public virtual string GetBestFragments(
    TokenStream tokenStream,
    string text,
    int maxNumFragments,
    string separator)
{
    string[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
    // The separator goes between sections only, never before the first or after the last.
    return string.Join(separator, sections);
}
/// <summary>
/// Gets or sets the analysis limit: tokenization stops at the first token whose start
/// offset is not below this value.
/// </summary>
public virtual int MaxDocCharsToAnalyze
{
    get { return _maxDocCharsToAnalyze; }
    set { _maxDocCharsToAnalyze = value; }
}

/// <summary>
/// Gets or sets the fragmenter that decides where a new fragment starts.
/// </summary>
public virtual IFragmenter TextFragmenter
{
    get { return _textFragmenter; }
    set { _textFragmenter = value; }
}

/// <summary>
/// Gets or sets the scorer used to rate tokens and fragments.
/// </summary>
public virtual IScorer FragmentScorer
{
    get { return _fragmentScorer; }
    set { _fragmentScorer = value; }
}

/// <summary>
/// Gets or sets the encoder applied to original text before it is added to the output.
/// </summary>
public virtual IEncoder Encoder
{
    get { return _encoder; }
    set { _encoder = value; }
}
}
/// <summary>
/// Priority queue of <see cref="TextFragment"/>s ranked by score, with ties broken by
/// fragment number.
/// </summary>
internal class FragmentQueue : PriorityQueue<TextFragment>
{
    /// <summary>
    /// Creates a queue, passing <paramref name="size"/> to the base constructor.
    /// </summary>
    public FragmentQueue(int size)
        : base(size)
    {
    }

    /// <summary>
    /// Orders fragments by score. When the scores are equal, the fragment with the higher
    /// <c>FragNum</c> is considered the lesser one.
    /// </summary>
    protected internal override bool LessThan(TextFragment fragA, TextFragment fragB)
    {
        return fragA.Score == fragB.Score
            ? fragA.FragNum > fragB.FragNum
            : fragA.Score < fragB.Score;
    }
}
}