| #if FEATURE_BREAKITERATOR |
| using J2N.Text; |
| using ICU4N.Text; |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Index; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Automaton; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.Globalization; |
| using System.IO; |
| using System.Linq; |
| using System.Text; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Search.PostingsHighlight |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Simple highlighter that does not analyze fields nor use |
| /// term vectors. Instead it requires |
| /// <see cref="IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/>. |
| /// </summary> |
| /// <remarks> |
| /// PostingsHighlighter treats the single original document as the whole corpus, and then scores individual |
| /// passages as if they were documents in this corpus. It uses a <see cref="BreakIterator"/> to find |
| /// passages in the text; by default it breaks using <see cref="BreakIterator.GetSentenceInstance(CultureInfo)"/> (for sentence breaking). |
| /// It then iterates in parallel (merge sorting by offset) through |
| /// the positions of all terms from the query, coalescing those hits that occur in a single passage |
| /// into a <see cref="Passage"/>, and then scores each Passage using a separate <see cref="PassageScorer"/>. |
| /// Passages are finally formatted into highlighted snippets with a <see cref="PassageFormatter"/>. |
| /// <para/> |
| /// You can customize the behavior by subclassing this highlighter, some important hooks: |
| /// <list type="bullet"> |
| /// <item><description><see cref="GetBreakIterator(string)"/>: Customize how the text is divided into passages.</description></item> |
| /// <item><description><see cref="GetScorer(string)"/>: Customize how passages are ranked.</description></item> |
| /// <item><description><see cref="GetFormatter(string)"/>: Customize how snippets are formatted.</description></item> |
| /// <item><description><see cref="GetIndexAnalyzer(string)"/>: Enable highlighting of MultiTermQuerys such as <see cref="WildcardQuery"/>.</description></item> |
| /// </list> |
| /// <para/> |
| /// <b>WARNING</b>: The code is very new and probably still has some exciting bugs! |
| /// <para/> |
| /// Example usage: |
| /// <code> |
| /// // configure field with offsets at index time |
| /// IndexableFieldType offsetsType = new IndexableFieldType(TextField.TYPE_STORED); |
| /// offsetsType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; |
| /// Field body = new Field("body", "foobar", offsetsType); |
| /// |
| /// // retrieve highlights at query time |
| /// ICUPostingsHighlighter highlighter = new ICUPostingsHighlighter(); |
| /// Query query = new TermQuery(new Term("body", "highlighting")); |
| /// TopDocs topDocs = searcher.Search(query, n); |
| /// string highlights[] = highlighter.Highlight("body", query, searcher, topDocs); |
| /// </code> |
| /// <para/> |
| /// This is thread-safe, and can be used across different readers. |
| /// <para/> |
| /// Note that the .NET implementation differs from the <c>PostingsHighlighter</c> in Lucene in |
| /// that it is backed by an ICU <see cref="RuleBasedBreakIterator"/>, which differs slightly in default behavior |
| /// than the one in the JDK. However, the ICU <see cref="RuleBasedBreakIterator"/> behavior can be customized |
| /// to meet a lot of scenarios that the one in the JDK cannot. See the ICU documentation at |
| /// <a href="http://userguide.icu-project.org/boundaryanalysis/break-rules">http://userguide.icu-project.org/boundaryanalysis/break-rules</a> |
| /// for more information how to pass custom rules to an ICU <see cref="RuleBasedBreakIterator"/>. |
| /// <para/> |
| /// @lucene.experimental |
| /// </remarks> |
| [ExceptionToClassNameConvention] |
| public class ICUPostingsHighlighter // LUCENENET specific - renamed ICUPostingsHighlighter to reflect the change in default behavior |
| { |
| // TODO: maybe allow re-analysis for tiny fields? currently we require offsets, |
| // but if the analyzer is really fast and the field is tiny, this might really be |
| // unnecessary. |
| |
        /// <summary>
        /// For query rewriting: we rewrite against an empty reader because we don't
        /// want slow processing (term expansion) from <see cref="MultiTermQuery"/>s.
        /// </summary>
        private static readonly IndexReader EMPTY_INDEXREADER = new MultiReader();

        /// <summary>
        /// Default maximum content size to process. Typically snippets
        /// closer to the beginning of the document better summarize its content
        /// </summary>
        public static readonly int DEFAULT_MAX_LENGTH = 10000;

        // Maximum number of characters of each stored field value that is examined.
        // Set once in the constructor; guaranteed >= 0 and < int.MaxValue there.
        private readonly int maxLength;

        /// <summary>
        /// Set the first time <see cref="GetFormatter(string)"/> is called,
        /// and then reused.
        /// NOTE(review): assigned lazily without synchronization; if two threads race,
        /// each gets an equivalent instance, so this appears benign — confirm against
        /// the class-level thread-safety claim.
        /// </summary>
        private PassageFormatter defaultFormatter;

        /// <summary>
        /// Set the first time <see cref="GetScorer(string)"/> is called, and then reused.
        /// NOTE(review): same benign-looking lazy-init race as <see cref="defaultFormatter"/>.
        /// </summary>
        private PassageScorer defaultScorer;
| |
        /// <summary>
        /// Creates a new highlighter with <see cref="DEFAULT_MAX_LENGTH"/>.
        /// </summary>
        // Delegates to the int overload, which validates the length.
        public ICUPostingsHighlighter()
            : this(DEFAULT_MAX_LENGTH)
        {
        }
| |
| /// <summary> |
| /// Creates a new highlighter, specifying maximum content length. |
| /// </summary> |
| /// <param name="maxLength">maximum content size to process.</param> |
| /// <exception cref="ArgumentException">if <paramref name="maxLength"/> is negative or <c>int.MaxValue</c></exception> |
| public ICUPostingsHighlighter(int maxLength) |
| { |
| if (maxLength < 0 || maxLength == int.MaxValue) |
| { |
| // two reasons: no overflow problems in BreakIterator.preceding(offset+1), |
| // our sentinel in the offsets queue uses this value to terminate. |
| throw new ArgumentException("maxLength must be < System.Int32.MaxValue"); |
| } |
| this.maxLength = maxLength; |
| } |
| |
| /// <summary> |
| /// Returns the <see cref="BreakIterator"/> to use for |
| /// dividing text into passages. This instantiates an |
| /// <see cref="BreakIterator.GetSentenceInstance(CultureInfo)"/> by default; |
| /// subclasses can override to customize. |
| /// </summary> |
| protected virtual BreakIterator GetBreakIterator(string field) |
| { |
| return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture); |
| } |
| |
| /// <summary> |
| /// Returns the <see cref="PassageFormatter"/> to use for |
| /// formatting passages into highlighted snippets. This |
| /// returns a new <see cref="PassageFormatter"/> by default; |
| /// subclasses can override to customize. |
| /// </summary> |
| protected virtual PassageFormatter GetFormatter(string field) |
| { |
| if (defaultFormatter == null) |
| { |
| defaultFormatter = new DefaultPassageFormatter(); |
| } |
| return defaultFormatter; |
| } |
| |
| /// <summary> |
| /// Returns the <see cref="PassageScorer"/> to use for |
| /// ranking passages. This |
| /// returns a new <see cref="PassageScorer"/> by default; |
| /// subclasses can override to customize. |
| /// </summary> |
| protected virtual PassageScorer GetScorer(string field) |
| { |
| if (defaultScorer == null) |
| { |
| defaultScorer = new PassageScorer(); |
| } |
| return defaultScorer; |
| } |
| |
| /// <summary> |
| /// Highlights the top passages from a single field. |
| /// </summary> |
| /// <param name="field">field name to highlight. Must have a stored string value and also be indexed with offsets.</param> |
| /// <param name="query">query to highlight.</param> |
| /// <param name="searcher">searcher that was previously used to execute the query.</param> |
| /// <param name="topDocs">TopDocs containing the summary result documents to highlight.</param> |
| /// <returns> |
| /// Array of formatted snippets corresponding to the documents in <paramref name="topDocs"/>. |
| /// If no highlights were found for a document, the |
| /// first sentence for the field will be returned. |
| /// </returns> |
| /// <exception cref="IOException">if an I/O error occurred during processing</exception> |
| /// <exception cref="ArgumentException">if <paramref name="field"/> was indexed without <see cref="IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/></exception> |
| public virtual string[] Highlight(string field, Query query, IndexSearcher searcher, TopDocs topDocs) |
| { |
| return Highlight(field, query, searcher, topDocs, 1); |
| } |
| |
| /// <summary> |
| /// Highlights the top-N passages from a single field. |
| /// </summary> |
| /// <param name="field"> |
| /// field name to highlight. |
| /// Must have a stored string value and also be indexed with offsets. |
| /// </param> |
| /// <param name="query">query to highlight.</param> |
| /// <param name="searcher">searcher that was previously used to execute the query.</param> |
| /// <param name="topDocs">TopDocs containing the summary result documents to highlight.</param> |
| /// <param name="maxPassages">The maximum number of top-N ranked passages used to form the highlighted snippets.</param> |
| /// <returns> |
| /// Array of formatted snippets corresponding to the documents in <paramref name="topDocs"/>. |
| /// If no highlights were found for a document, the |
| /// first <paramref name="maxPassages"/> sentences from the |
| /// field will be returned. |
| /// </returns> |
| /// <exception cref="IOException">if an I/O error occurred during processing</exception> |
| /// <exception cref="ArgumentException">Illegal if <paramref name="field"/> was indexed without <see cref="IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/></exception> |
| public virtual string[] Highlight(string field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) |
| { |
| IDictionary<string, string[]> res = HighlightFields(new string[] { field }, query, searcher, topDocs, new int[] { maxPassages }); |
| string[] result; |
| res.TryGetValue(field, out result); |
| return result; |
| } |
| |
| /// <summary> |
| /// Highlights the top passages from multiple fields. |
| /// <para/> |
| /// Conceptually, this behaves as a more efficient form of: |
| /// <code> |
| /// IDictionary<string, string[]> m = new Dictionary<string, string[]>(); |
| /// foreach (string field in fields) |
| /// { |
| /// m[field] = Highlight(field, query, searcher, topDocs); |
| /// } |
| /// return m; |
| /// </code> |
| /// </summary> |
| /// <param name="fields">field names to highlight. Must have a stored string value and also be indexed with offsets.</param> |
| /// <param name="query">query to highlight.</param> |
| /// <param name="searcher">searcher that was previously used to execute the query.</param> |
| /// <param name="topDocs">TopDocs containing the summary result documents to highlight.</param> |
| /// <returns> |
| /// <see cref="T:IDictionary{string, string[]}"/> keyed on field name, containing the array of formatted snippets |
| /// corresponding to the documents in <paramref name="topDocs"/>. |
| /// If no highlights were found for a document, the |
| /// first sentence from the field will be returned. |
| /// </returns> |
| /// <exception cref="IOException">if an I/O error occurred during processing</exception> |
| /// <exception cref="ArgumentException">if <c>field</c> was indexed without <see cref="IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/></exception> |
| public virtual IDictionary<string, string[]> HighlightFields(string[] fields, Query query, IndexSearcher searcher, TopDocs topDocs) |
| { |
| int[] maxPassages = new int[fields.Length]; |
| Arrays.Fill(maxPassages, 1); |
| return HighlightFields(fields, query, searcher, topDocs, maxPassages); |
| } |
| |
| /// <summary> |
| /// Highlights the top-N passages from multiple fields. |
| /// <para/> |
| /// Conceptually, this behaves as a more efficient form of: |
| /// <code> |
| /// IDictionary<string, string[]> m = new Dictionary<string, string[]>(); |
| /// foreach (string field in fields) |
| /// { |
| /// m[field] = Highlight(field, query, searcher, topDocs, maxPassages); |
| /// } |
| /// return m; |
| /// </code> |
| /// </summary> |
| /// <param name="fields">field names to highlight. Must have a stored string value and also be indexed with offsets.</param> |
| /// <param name="query">query to highlight.</param> |
| /// <param name="searcher">searcher that was previously used to execute the query.</param> |
| /// <param name="topDocs">TopDocs containing the summary result documents to highlight.</param> |
| /// <param name="maxPassages">The maximum number of top-N ranked passages per-field used to form the highlighted snippets.</param> |
| /// <returns> |
| /// <see cref="T:IDictionary{string, string[]}"/> keyed on field name, containing the array of formatted snippets |
| /// corresponding to the documents in <paramref name="topDocs"/>. |
| /// If no highlights were found for a document, the |
| /// first <paramref name="maxPassages"/> sentences from the |
| /// field will be returned. |
| /// </returns> |
| /// <exception cref="IOException">if an I/O error occurred during processing</exception> |
| /// <exception cref="ArgumentException">if <c>field</c> was indexed without <see cref="IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/></exception> |
| public virtual IDictionary<string, string[]> HighlightFields(string[] fields, Query query, IndexSearcher searcher, TopDocs topDocs, int[] maxPassages) |
| { |
| ScoreDoc[] scoreDocs = topDocs.ScoreDocs; |
| int[] docids = new int[scoreDocs.Length]; |
| for (int i = 0; i < docids.Length; i++) |
| { |
| docids[i] = scoreDocs[i].Doc; |
| } |
| |
| return HighlightFields(fields, query, searcher, docids, maxPassages); |
| } |
| |
| /// <summary> |
| /// Highlights the top-N passages from multiple fields, |
| /// for the provided int[] docids. |
| /// </summary> |
| /// <param name="fieldsIn">field names to highlight. Must have a stored string value and also be indexed with offsets.</param> |
| /// <param name="query">query to highlight.</param> |
| /// <param name="searcher">searcher that was previously used to execute the query.</param> |
| /// <param name="docidsIn">containing the document IDs to highlight.</param> |
| /// <param name="maxPassagesIn">The maximum number of top-N ranked passages per-field used to form the highlighted snippets.</param> |
| /// <returns> |
| /// <see cref="F:IDictionary{string, string[]}"/> keyed on field name, containing the array of formatted snippets |
| /// corresponding to the documents in <paramref name="docidsIn"/>. |
| /// If no highlights were found for a document, the |
| /// first <c>maxPassages</c> from the field will |
| /// be returned. |
| /// </returns> |
| /// <exception cref="IOException">if an I/O error occurred during processing</exception> |
| /// <exception cref="ArgumentException">if <c>field</c> was indexed without <see cref="IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/></exception> |
| public virtual IDictionary<string, string[]> HighlightFields(string[] fieldsIn, Query query, IndexSearcher searcher, int[] docidsIn, int[] maxPassagesIn) |
| { |
| IDictionary<string, string[]> snippets = new Dictionary<string, string[]>(); |
| foreach (var ent in HighlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn)) |
| { |
| object[] snippetObjects = ent.Value; |
| string[] snippetStrings = new string[snippetObjects.Length]; |
| snippets[ent.Key] = snippetStrings; |
| for (int i = 0; i < snippetObjects.Length; i++) |
| { |
| object snippet = snippetObjects[i]; |
| if (snippet != null) |
| { |
| snippetStrings[i] = snippet.ToString(); |
| } |
| } |
| } |
| |
| return snippets; |
| } |
| |
| internal class InPlaceMergeSorterAnonymousHelper : InPlaceMergeSorter |
| { |
| private readonly string[] fields; |
| private readonly int[] maxPassages; |
| public InPlaceMergeSorterAnonymousHelper(string[] fields, int[] maxPassages) |
| { |
| this.fields = fields; |
| this.maxPassages = maxPassages; |
| } |
| |
| protected override void Swap(int i, int j) |
| { |
| string tmp = fields[i]; |
| fields[i] = fields[j]; |
| fields[j] = tmp; |
| int tmp2 = maxPassages[i]; |
| maxPassages[i] = maxPassages[j]; |
| maxPassages[j] = tmp2; |
| } |
| |
| protected override int Compare(int i, int j) |
| { |
| return fields[i].CompareToOrdinal(fields[j]); |
| } |
| } |
| |
        /// <summary>
        /// Expert: highlights the top-N passages from multiple fields,
        /// for the provided int[] docids, to custom object as
        /// returned by the <see cref="PassageFormatter"/>. Use
        /// this API to render to something other than <see cref="string"/>.
        /// </summary>
        /// <param name="fieldsIn">field names to highlight. Must have a stored string value and also be indexed with offsets.</param>
        /// <param name="query">query to highlight.</param>
        /// <param name="searcher">searcher that was previously used to execute the query.</param>
        /// <param name="docidsIn">containing the document IDs to highlight.</param>
        /// <param name="maxPassagesIn">The maximum number of top-N ranked passages per-field used to form the highlighted snippets.</param>
        /// <returns>
        /// <see cref="T:IDictionary{string, object[]}"/> keyed on field name, containing the array of formatted snippets
        /// corresponding to the documents in <paramref name="docidsIn"/>.
        /// If no highlights were found for a document, the
        /// first <paramref name="maxPassagesIn"/> from the field will
        /// be returned.
        /// </returns>
        /// <exception cref="IOException">if an I/O error occurred during processing</exception>
        /// <exception cref="ArgumentException">if <c>field</c> was indexed without <see cref="IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/></exception>
        protected internal virtual IDictionary<string, object[]> HighlightFieldsAsObjects(string[] fieldsIn, Query query, IndexSearcher searcher, int[] docidsIn, int[] maxPassagesIn)
        {
            if (fieldsIn.Length < 1)
            {
                throw new ArgumentException("fieldsIn must not be empty");
            }
            if (fieldsIn.Length != maxPassagesIn.Length)
            {
                throw new ArgumentException("invalid number of maxPassagesIn");
            }
            IndexReader reader = searcher.IndexReader;
            // Rewrite against an empty reader so MultiTermQuerys etc. collapse to
            // primitive queries whose terms can be extracted below.
            Query rewritten = Rewrite(query);
            JCG.SortedSet<Term> queryTerms = new JCG.SortedSet<Term>();
            rewritten.ExtractTerms(queryTerms);

            IndexReaderContext readerContext = reader.Context;
            IList<AtomicReaderContext> leaves = readerContext.Leaves;

            // Make our own copies because we sort in-place:
            int[] docids = new int[docidsIn.Length];
            System.Array.Copy(docidsIn, 0, docids, 0, docidsIn.Length);
            string[] fields = new string[fieldsIn.Length];
            System.Array.Copy(fieldsIn, 0, fields, 0, fieldsIn.Length);
            int[] maxPassages = new int[maxPassagesIn.Length];
            System.Array.Copy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.Length);

            // sort for sequential io
            ArrayUtil.TimSort(docids);
            new InPlaceMergeSorterAnonymousHelper(fields, maxPassages).Sort(0, fields.Length);

            // pull stored data:
            IList<string[]> contents = LoadFieldValues(searcher, fields, docids, maxLength);

            IDictionary<string, object[]> highlights = new Dictionary<string, object[]>();
            for (int i = 0; i < fields.Length; i++)
            {
                string field = fields[i];
                int numPassages = maxPassages[i];
                // [floor, ceiling) selects exactly the query terms belonging to this field.
                Term floor = new Term(field, "");
                Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);

                // LUCENENET: Call custom GetViewBetween overload to mimic Java's exclusive upper bound behavior.
                var fieldTerms = queryTerms.GetViewBetween(floor, lowerValueInclusive: true, ceiling, upperValueInclusive: false);

                // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)

                // Strip off the redundant field:
                BytesRef[] terms = new BytesRef[fieldTerms.Count];
                int termUpto = 0;
                foreach (Term term in fieldTerms)
                {
                    terms[termUpto++] = term.Bytes;
                }
                IDictionary<int, object> fieldHighlights = HighlightField(field, contents[i], GetBreakIterator(field), terms, docids, leaves, numPassages, query);

                // Results come back keyed on docid; re-order them to match the
                // caller's original (unsorted) docidsIn order.
                object[] result = new object[docids.Length];
                for (int j = 0; j < docidsIn.Length; j++)
                {
                    fieldHighlights.TryGetValue(docidsIn[j], out result[j]);
                }
                highlights[field] = result;
            }
            return highlights;
        }
| |
| /// <summary> |
| /// Loads the string values for each field X docID to be |
| /// highlighted. By default this loads from stored |
| /// fields, but a subclass can change the source. This |
| /// method should allocate the string[fields.length][docids.length] |
| /// and fill all values. The returned strings must be |
| /// identical to what was indexed. |
| /// </summary> |
| protected virtual IList<string[]> LoadFieldValues(IndexSearcher searcher, string[] fields, int[] docids, int maxLength) |
| { |
| string[][] contents = RectangularArrays.ReturnRectangularArray<string>(fields.Length, docids.Length); |
| char[] valueSeparators = new char[fields.Length]; |
| for (int i = 0; i < fields.Length; i++) |
| { |
| valueSeparators[i] = GetMultiValuedSeparator(fields[i]); |
| } |
| LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength); |
| for (int i = 0; i < docids.Length; i++) |
| { |
| searcher.Doc(docids[i], visitor); |
| for (int j = 0; j < fields.Length; j++) |
| { |
| contents[j][i] = visitor.GetValue(j); // LUCENENET: No point in doing ToString() on a string |
| } |
| visitor.Reset(); |
| } |
| return contents; |
| } |
| |
| /// <summary> |
| /// Returns the logical separator between values for multi-valued fields. |
| /// The default value is a space character, which means passages can span across values, |
| /// but a subclass can override, for example with <c>U+2029 PARAGRAPH SEPARATOR (PS)</c> |
| /// if each value holds a discrete passage for highlighting. |
| /// </summary> |
| protected virtual char GetMultiValuedSeparator(string field) |
| { |
| return ' '; |
| } |
| |
| /// <summary> |
| /// Returns the analyzer originally used to index the content for <paramref name="field"/>. |
| /// <para/> |
| /// This is used to highlight some <see cref="MultiTermQuery"/>s. |
| /// </summary> |
| /// <param name="field"></param> |
| /// <returns><see cref="Analyzer"/> or null (the default, meaning no special multi-term processing)</returns> |
| protected virtual Analyzer GetIndexAnalyzer(string field) |
| { |
| return null; |
| } |
| |
        /// <summary>
        /// Highlights one field across the given (ascending) docids, returning a map
        /// from docid to the object produced by the field's <see cref="PassageFormatter"/>.
        /// Documents with empty content, or whose segment has no terms for the field,
        /// get no entry.
        /// </summary>
        private IDictionary<int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList<AtomicReaderContext> leaves, int maxPassages, Query query)
        {
            IDictionary<int, object> highlights = new Dictionary<int, object>();

            PassageFormatter fieldFormatter = GetFormatter(field);
            if (fieldFormatter == null)
            {
                // NOTE(review): explicitly throwing NullReferenceException is unidiomatic
                // in .NET (InvalidOperationException is conventional), but changing the
                // type could affect existing callers — confirm before changing.
                throw new NullReferenceException("PassageFormatter cannot be null");
            }

            // check if we should do any multiterm processing
            Analyzer analyzer = GetIndexAnalyzer(field);
            CharacterRunAutomaton[] automata = new CharacterRunAutomaton[0];
            if (analyzer != null)
            {
                automata = MultiTermHighlighting.ExtractAutomata(query, field);
            }

            // resize 'terms', where the last term is the multiterm matcher
            // (the extra slot stays null; HighlightDoc treats a null term as
            // "pull the matched text from the payload")
            if (automata.Length > 0)
            {
                BytesRef[] newTerms = new BytesRef[terms.Length + 1];
                System.Array.Copy(terms, 0, newTerms, 0, terms.Length);
                terms = newTerms;
            }

            // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
            // otherwise, we will just advance() existing enums to the new document in the same segment.
            DocsAndPositionsEnum[] postings = null;
            TermsEnum termsEnum = null;
            int lastLeaf = -1;

            for (int i = 0; i < docids.Length; i++)
            {
                string content = contents[i];
                if (content.Length == 0)
                {
                    continue; // nothing to do
                }
                bi.SetText(content);
                int doc = docids[i];
                int leaf = ReaderUtil.SubIndex(doc, leaves);
                AtomicReaderContext subContext = leaves[leaf];
                AtomicReader r = subContext.AtomicReader;

                Debug.Assert(leaf >= lastLeaf); // increasing order

                // if the segment has changed, we must initialize new enums.
                if (leaf != lastLeaf)
                {
                    Terms t = r.GetTerms(field);
                    if (t != null)
                    {
                        termsEnum = t.GetIterator(null);
                        postings = new DocsAndPositionsEnum[terms.Length];
                    }
                }
                if (termsEnum == null)
                {
                    continue; // no terms for this field, nothing to do
                }

                // if there are multi-term matches, we have to initialize the "fake" enum for each document
                // (analyzer is non-null here: automata is only populated when analyzer != null above)
                if (automata.Length > 0)
                {
                    DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata);
                    dp.Advance(doc - subContext.DocBase);
                    postings[terms.Length - 1] = dp; // last term is the multiterm matcher
                }

                Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages);

                if (passages.Length == 0)
                {
                    // no passages were returned, so ask for a default summary
                    passages = GetEmptyHighlight(field, bi, maxPassages);
                }

                if (passages.Length > 0)
                {
                    highlights[doc] = fieldFormatter.Format(passages, content);
                }

                lastLeaf = leaf;
            }

            return highlights;
        }
| |
| internal class HighlightDocComparerAnonymousHelper1 : IComparer<Passage> |
| { |
| public int Compare(Passage left, Passage right) |
| { |
| if (left.score < right.score) |
| { |
| return -1; |
| } |
| else if (left.score > right.score) |
| { |
| return 1; |
| } |
| else |
| { |
| return left.startOffset - right.startOffset; |
| } |
| } |
| } |
| |
| internal class HighlightDocComparerAnonymousHelper2 : IComparer<Passage> |
| { |
| public int Compare(Passage left, Passage right) |
| { |
| return left.startOffset - right.startOffset; |
| } |
| } |
| |
        // algorithm: treat sentence snippets as miniature documents
        // we can intersect these with the postings lists via BreakIterator.Preceding(offset),
        // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
        //
        // Merge-sorts the positions of all query terms (via the OffsetsEnum priority
        // queue) and coalesces hits that fall inside one break-iterator passage,
        // keeping the top-n passages by score. Returns passages sorted by start offset.
        private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc,
            TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
        {
            PassageScorer scorer = GetScorer(field);
            if (scorer == null)
            {
                // NOTE(review): explicit NullReferenceException is unidiomatic in .NET;
                // kept as-is to avoid changing the exception type callers may observe.
                throw new NullReferenceException("PassageScorer cannot be null");
            }
            JCG.PriorityQueue<OffsetsEnum> pq = new JCG.PriorityQueue<OffsetsEnum>();
            float[] weights = new float[terms.Length];
            // initialize postings: seek each term in this segment, advance to 'doc',
            // and enqueue an OffsetsEnum for every term that actually matches it.
            for (int i = 0; i < terms.Length; i++)
            {
                DocsAndPositionsEnum de = postings[i];
                int pDoc;
                if (de == EMPTY)
                {
                    continue; // term known to be absent in this segment
                }
                else if (de == null)
                {
                    postings[i] = EMPTY; // initially
                    if (!termsEnum.SeekExact(terms[i]))
                    {
                        continue; // term not found
                    }
                    de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
                    if (de == null)
                    {
                        // no positions available
                        throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                    }
                    pDoc = de.Advance(doc);
                }
                else
                {
                    // enum already positioned from a previous doc in this segment;
                    // only advance if it is still behind the target doc.
                    pDoc = de.DocID;
                    if (pDoc < doc)
                    {
                        pDoc = de.Advance(doc);
                    }
                }

                if (doc == pDoc)
                {
                    weights[i] = scorer.Weight(contentLength, de.Freq);
                    de.NextPosition();
                    pq.Add(new OffsetsEnum(de, i));
                }
            }

            // EMPTY reports StartOffset == int.MaxValue, so this sentinel sorts last
            // and guarantees the loop below reaches the start >= contentLength exit.
            pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

            JCG.PriorityQueue<Passage> passageQueue = new JCG.PriorityQueue<Passage>(n, new HighlightDocComparerAnonymousHelper1());
            Passage current = new Passage();

            while (pq.TryDequeue(out OffsetsEnum off))
            {
                DocsAndPositionsEnum dp = off.dp;
                int start = dp.StartOffset;
                if (start == -1)
                {
                    throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                }
                int end = dp.EndOffset;
                // LUCENE-5166: this hit would span the content limit... however more valid
                // hits may exist (they are sorted by start). so we pretend like we never
                // saw this term, it won't cause a passage to be added to passageQueue or anything.
                Debug.Assert(EMPTY.StartOffset == int.MaxValue);
                if (start < contentLength && end > contentLength)
                {
                    continue;
                }
                if (start >= current.endOffset)
                {
                    // The hit falls past the current passage: finalize the current
                    // passage (if any) and start a new one at the next boundary.
                    if (current.startOffset >= 0)
                    {
                        // finalize current
                        current.score *= scorer.Norm(current.startOffset);
                        // new sentence: first add 'current' to queue
                        if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                        {
                            current.Reset(); // can't compete, just reset it
                        }
                        else
                        {
                            passageQueue.Enqueue(current);
                            if (passageQueue.Count > n)
                            {
                                // evict the weakest passage and recycle its object
                                current = passageQueue.Dequeue();
                                current.Reset();
                            }
                            else
                            {
                                current = new Passage();
                            }
                        }
                    }
                    // if we exceed limit, we are done
                    if (start >= contentLength)
                    {
                        Passage[] passages = passageQueue.ToArray();
                        foreach (Passage p in passages)
                        {
                            p.Sort();
                        }
                        // sort in ascending order
                        ArrayUtil.TimSort(passages, new HighlightDocComparerAnonymousHelper2());
                        return passages;
                    }
                    // advance breakiterator
                    Debug.Assert(BreakIterator.Done < 0);
                    current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
                    current.endOffset = Math.Min(bi.Next(), contentLength);
                }
                // Consume all of this term's positions that fall inside the current
                // passage, counting term frequency within the passage.
                int tf = 0;
                while (true)
                {
                    tf++;
                    BytesRef term = terms[off.id];
                    if (term == null)
                    {
                        // multitermquery match, pull from payload
                        term = off.dp.GetPayload();
                        Debug.Assert(term != null);
                    }
                    current.AddMatch(start, end, term);
                    if (off.pos == dp.Freq)
                    {
                        break; // removed from pq
                    }
                    else
                    {
                        off.pos++;
                        dp.NextPosition();
                        start = dp.StartOffset;
                        end = dp.EndOffset;
                    }
                    if (start >= current.endOffset || end > contentLength)
                    {
                        // next position lies beyond this passage: requeue for later
                        pq.Enqueue(off);
                        break;
                    }
                }
                current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
            }

            // Dead code but compiler disagrees:
            Debug.Assert(false);
            return null;
        }
| |
| /// <summary> |
| /// Called to summarize a document when no hits were |
| /// found. By default this just returns the first |
| /// <paramref name="maxPassages"/> sentences; subclasses can override |
| /// to customize. |
| /// </summary> |
| protected virtual Passage[] GetEmptyHighlight(string fieldName, BreakIterator bi, int maxPassages) |
| { |
| // BreakIterator should be un-next'd: |
| List<Passage> passages = new List<Passage>(); |
| int pos = bi.Current; |
| Debug.Assert(pos == 0); |
| while (passages.Count < maxPassages) |
| { |
| int next = bi.Next(); |
| if (next == BreakIterator.Done) |
| { |
| break; |
| } |
| Passage passage = new Passage(); |
| passage.score = float.NaN; |
| passage.startOffset = pos; |
| passage.endOffset = next; |
| passages.Add(passage); |
| pos = next; |
| } |
| |
| return passages.ToArray(/*new Passage[passages.size()]*/); |
| } |
| |
| internal class OffsetsEnum : IComparable<OffsetsEnum> |
| { |
| internal DocsAndPositionsEnum dp; |
| internal int pos; |
| internal int id; |
| |
| internal OffsetsEnum(DocsAndPositionsEnum dp, int id) |
| { |
| this.dp = dp; |
| this.id = id; |
| this.pos = 1; |
| } |
| |
| public virtual int CompareTo(OffsetsEnum other) |
| { |
| try |
| { |
| int off = dp.StartOffset; |
| int otherOff = other.dp.StartOffset; |
| if (off == otherOff) |
| { |
| return id - other.id; |
| } |
| else |
| { |
| return off.CompareTo(otherOff); |
| } |
| } |
| catch (IOException e) |
| { |
| throw new Exception(e.ToString(), e); |
| } |
| } |
| } |
| |
        // Shared sentinel instance; see DocsAndPositionsEnumAnonymousHelper below.
        private static readonly DocsAndPositionsEnum EMPTY = new DocsAndPositionsEnumAnonymousHelper();

        /// <summary>
        /// Sentinel <see cref="DocsAndPositionsEnum"/> that matches no documents:
        /// it reports <see cref="int.MaxValue"/> offsets, zero freq, and is always
        /// exhausted. Used to mark terms with no postings in the current segment and
        /// as the termination sentinel in <c>HighlightDoc</c>'s offsets queue
        /// (its <see cref="int.MaxValue"/> start offset sorts it last).
        /// (NOTE(review): the previous comment here described
        /// <see cref="EMPTY_INDEXREADER"/>, not this enum — it was misplaced.)
        /// </summary>
        private class DocsAndPositionsEnumAnonymousHelper : DocsAndPositionsEnum
        {
            public override int NextPosition()
            {
                return 0;
            }

            // int.MaxValue offsets guarantee this sorts after every real hit.
            public override int StartOffset
            {
                get { return int.MaxValue; }
            }

            public override int EndOffset
            {
                get { return int.MaxValue; }
            }

            public override BytesRef GetPayload()
            {
                return null;
            }

            public override int Freq
            {
                get { return 0; }
            }

            // Always exhausted: never matches any document.
            public override int DocID
            {
                get { return NO_MORE_DOCS; }
            }

            public override int NextDoc()
            {
                return NO_MORE_DOCS;
            }

            public override int Advance(int target)
            {
                return NO_MORE_DOCS;
            }

            public override long GetCost()
            {
                return 0;
            }
        }
| |
| private static Query Rewrite(Query original) |
| { |
| Query query = original; |
| for (Query rewrittenQuery = query.Rewrite(EMPTY_INDEXREADER); rewrittenQuery != query; |
| rewrittenQuery = query.Rewrite(EMPTY_INDEXREADER)) |
| { |
| query = rewrittenQuery; |
| } |
| return query; |
| } |
| |
| private class LimitedStoredFieldVisitor : StoredFieldVisitor |
| { |
| private readonly string[] fields; |
| private readonly char[] valueSeparators; |
| private readonly int maxLength; |
| private readonly StringBuilder[] builders; |
| private int currentField = -1; |
| |
| public LimitedStoredFieldVisitor(string[] fields, char[] valueSeparators, int maxLength) |
| { |
| Debug.Assert(fields.Length == valueSeparators.Length); |
| this.fields = fields; |
| this.valueSeparators = valueSeparators; |
| this.maxLength = maxLength; |
| builders = new StringBuilder[fields.Length]; |
| for (int i = 0; i < builders.Length; i++) |
| { |
| builders[i] = new StringBuilder(); |
| } |
| } |
| |
| public override void StringField(Index.FieldInfo fieldInfo, string value) |
| { |
| Debug.Assert(currentField >= 0); |
| StringBuilder builder = builders[currentField]; |
| if (builder.Length > 0 && builder.Length < maxLength) |
| { |
| builder.Append(valueSeparators[currentField]); |
| } |
| if (builder.Length + value.Length > maxLength) |
| { |
| builder.Append(value, 0, maxLength - builder.Length); |
| } |
| else |
| { |
| builder.Append(value); |
| } |
| } |
| |
| public override Status NeedsField(Index.FieldInfo fieldInfo) |
| { |
| currentField = Array.BinarySearch(fields, fieldInfo.Name); |
| if (currentField < 0) |
| { |
| return Status.NO; |
| } |
| else if (builders[currentField].Length > maxLength) |
| { |
| return fields.Length == 1 ? Status.STOP : Status.NO; |
| } |
| return Status.YES; |
| } |
| |
| internal string GetValue(int i) |
| { |
| return builders[i].ToString(); |
| } |
| |
| internal void Reset() |
| { |
| currentField = -1; |
| for (int i = 0; i < fields.Length; i++) |
| { |
| builders[i].Length = 0; |
| } |
| } |
| } |
| } |
| } |
| #endif |