| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.NGram; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Codecs.Lucene46; |
| using Lucene.Net.Documents; |
| using Lucene.Net.Index; |
| using Lucene.Net.Index.Sorter; |
| using Lucene.Net.Store; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.Text; |
| using JCG = J2N.Collections.Generic; |
| using Directory = Lucene.Net.Store.Directory; |
| |
| namespace Lucene.Net.Search.Suggest.Analyzing |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // TODO: |
| // - a PostingsFormat that stores super-high-freq terms as |
| // a bitset should be a win for the prefix terms? |
| // (LUCENE-5052) |
| // - we could offer a better integration with |
| // DocumentDictionary and NRT? so that your suggester |
| // "automatically" keeps in sync w/ your index |
| |
| /// <summary> |
| /// Analyzes the input text and then suggests matches based |
| /// on prefix matches to any tokens in the indexed text. |
| /// This also highlights the tokens that match. |
| /// |
| /// <para>This suggester supports payloads. Matches are sorted only |
| /// by the suggest weight; it would be nice to support |
| /// blended score + weight sort in the future. This means |
| /// this suggester best applies when there is a strong |
| /// a-priori ranking of all the suggestions. |
| /// |
| /// </para> |
| /// <para>This suggester supports contexts, however the |
| /// contexts must be valid utf8 (arbitrary binary terms will |
| /// not work). |
| /// |
| /// @lucene.experimental |
| /// </para> |
| /// </summary> |
| |
| public class AnalyzingInfixSuggester : Lookup, IDisposable |
| { |
        // Guards lazy (re)creation of the writer/searcher manager in EnsureOpen();
        // used as a substitute for Java's synchronized (method) keyword.
        private readonly object syncLock = new object(); //uses syncLock as substitute for Java's synchronized (method) keyword

        /// <summary>
        /// Field name used for the indexed text. </summary>
        protected const string TEXT_FIELD_NAME = "text";

        /// <summary>
        /// Field name used for the indexed text, as a
        /// <see cref="StringField"/>, for exact lookup.
        /// </summary>
        protected const string EXACT_TEXT_FIELD_NAME = "exacttext";

        /// <summary>
        /// Field name used for the indexed context, as a
        /// <see cref="StringField"/> and a <see cref="SortedSetDocValuesField"/>, for filtering.
        /// </summary>
        protected const string CONTEXTS_FIELD_NAME = "contexts";

        /// <summary>
        /// Analyzer used at search time </summary>
        protected readonly Analyzer m_queryAnalyzer;
        /// <summary>
        /// Analyzer used at index time </summary>
        protected readonly Analyzer m_indexAnalyzer;
        // Lucene match version passed through to analyzers/IndexWriterConfig.
        internal readonly LuceneVersion matchVersion;
        // Directory holding the suggester's private index; disposed with this instance.
        private readonly Directory dir;
        // Tokens shorter than this are indexed as edge ngrams; see GetLastTokenQuery.
        internal readonly int minPrefixChars;
        // When true, Build() commits the index so it can be reopened later (LUCENE-5889).
        private readonly bool commitOnBuild;

        /// <summary>
        /// Used for ongoing NRT additions/updates. </summary>
        private IndexWriter writer;

        /// <summary>
        /// <see cref="IndexSearcher"/> used for lookups. </summary>
        protected SearcherManager m_searcherMgr;

        /// <summary>
        /// Default minimum number of leading characters before
        /// PrefixQuery is used (4).
        /// </summary>
        public const int DEFAULT_MIN_PREFIX_CHARS = 4;

        /// <summary>
        /// How we sort the postings and search results: by the "weight"
        /// NumericDocValues field, descending. </summary>
        private static readonly Sort SORT = new Sort(new SortField("weight", SortFieldType.INT64, true));
| |
        /// <summary>
        /// Create a new instance, loading from a previously built
        /// <see cref="AnalyzingInfixSuggester"/> directory, if it exists.
        /// This directory must be
        /// private to the infix suggester (i.e., not an external
        /// Lucene index). Note that <see cref="Dispose()"/>
        /// will also dispose the provided directory.
        /// Uses <paramref name="analyzer"/> for both index and query time,
        /// and <see cref="DEFAULT_MIN_PREFIX_CHARS"/> as the prefix cutoff.
        /// </summary>
        public AnalyzingInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer analyzer)
            : this(matchVersion, dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS)
        {
        }
| |
        /// <summary>
        /// Create a new instance, loading from a previously built
        /// <see cref="AnalyzingInfixSuggester"/> directory, if it exists. This directory must be
        /// private to the infix suggester (i.e., not an external
        /// Lucene index). Note that <see cref="Dispose()"/>
        /// will also dispose the provided directory.
        /// Delegates to the 6-parameter constructor with <c>commitOnBuild: false</c>.
        /// </summary>
        /// <param name="minPrefixChars"> Minimum number of leading characters
        ///        before <see cref="PrefixQuery"/> is used (default 4).
        ///        Prefixes shorter than this are indexed as character
        ///        ngrams (increasing index size but making lookups
        ///        faster). </param>
        // LUCENENET specific - LUCENE-5889, a 4.11.0 feature. Calls the new constructor with an extra param.
        // LUCENENET TODO: Remove method at version 4.11.0. Was retained for perfect 4.8 compatibility.
        public AnalyzingInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer indexAnalyzer,
            Analyzer queryAnalyzer, int minPrefixChars)
            : this(matchVersion, dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild: false)
        {
        }
| |
| |
| /// <summary> |
| /// Create a new instance, loading from a previously built |
| /// <see cref="AnalyzingInfixSuggester"/> directory, if it exists. This directory must be |
| /// private to the infix suggester (i.e., not an external |
| /// Lucene index). Note that <see cref="Dispose()"/> |
| /// will also dispose the provided directory. |
| /// </summary> |
| /// <param name="minPrefixChars"> Minimum number of leading characters |
| /// before <see cref="PrefixQuery"/> is used (default 4). |
| /// Prefixes shorter than this are indexed as character |
| /// ngrams (increasing index size but making lookups |
| /// faster). </param> |
| /// <param name="commitOnBuild"> Call commit after the index has finished building. This |
| /// would persist the suggester index to disk and future instances of this suggester can |
| /// use this pre-built dictionary. </param> |
| // LUCENENET specific - LUCENE-5889, a 4.11.0 feature. (Code moved from other constructor to here.) |
| public AnalyzingInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer indexAnalyzer, |
| Analyzer queryAnalyzer, int minPrefixChars, bool commitOnBuild) |
| { |
| |
| if (minPrefixChars < 0) |
| { |
| throw new ArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars); |
| } |
| |
| this.m_queryAnalyzer = queryAnalyzer; |
| this.m_indexAnalyzer = indexAnalyzer; |
| this.matchVersion = matchVersion; |
| this.dir = dir; |
| this.minPrefixChars = minPrefixChars; |
| this.commitOnBuild = commitOnBuild; |
| |
| if (DirectoryReader.IndexExists(dir)) |
| { |
| // Already built; open it: |
| writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GetGramAnalyzer(), OpenMode.APPEND)); |
| m_searcherMgr = new SearcherManager(writer, true, null); |
| } |
| } |
| |
| /// <summary> |
| /// Override this to customize index settings, e.g. which |
| /// codec to use. |
| /// </summary> |
| protected internal virtual IndexWriterConfig GetIndexWriterConfig(LuceneVersion matchVersion, |
| Analyzer indexAnalyzer, OpenMode openMode) |
| { |
| IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer) |
| { |
| Codec = new Lucene46Codec(), |
| OpenMode = openMode |
| }; |
| |
| // This way all merged segments will be sorted at |
| // merge time, allow for per-segment early termination |
| // when those segments are searched: |
| iwc.MergePolicy = new SortingMergePolicy(iwc.MergePolicy, SORT); |
| |
| return iwc; |
| } |
| |
| /// <summary> |
| /// Subclass can override to choose a specific |
| /// <see cref="Directory"/> implementation. |
| /// </summary> |
| protected internal virtual Directory GetDirectory(DirectoryInfo path) |
| { |
| return FSDirectory.Open(path); |
| } |
| |
| public override void Build(IInputEnumerator enumerator) |
| { |
| if (m_searcherMgr != null) |
| { |
| m_searcherMgr.Dispose(); |
| m_searcherMgr = null; |
| } |
| |
| if (writer != null) |
| { |
| writer.Dispose(); |
| writer = null; |
| } |
| |
| AtomicReader r = null; |
| bool success = false; |
| try |
| { |
| // First pass: build a temporary normal Lucene index, |
| // just indexing the suggestions as they iterate: |
| writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GetGramAnalyzer(), OpenMode.CREATE)); |
| //long t0 = System.nanoTime(); |
| |
| // TODO: use threads? |
| BytesRef text; |
| while (enumerator.MoveNext()) |
| { |
| text = enumerator.Current; |
| BytesRef payload; |
| if (enumerator.HasPayloads) |
| { |
| payload = enumerator.Payload; |
| } |
| else |
| { |
| payload = null; |
| } |
| |
| Add(text, enumerator.Contexts, enumerator.Weight, payload); |
| } |
| |
| //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec"); |
| if (commitOnBuild) //LUCENENET specific -Support for LUCENE - 5889. |
| { |
| Commit(); |
| } |
| m_searcherMgr = new SearcherManager(writer, true, null); |
| success = true; |
| } |
| finally |
| { |
| if (success) |
| { |
| IOUtils.Dispose(r); |
| } |
| else |
| { |
| IOUtils.DisposeWhileHandlingException(writer, r); |
| writer = null; |
| } |
| } |
| } |
| |
| //LUCENENET specific -Support for LUCENE - 5889. |
| public void Commit() |
| { |
| if (writer == null) |
| { |
| throw new InvalidOperationException("Cannot commit on an closed writer. Add documents first"); |
| } |
| writer.Commit(); |
| } |
| |
| private Analyzer GetGramAnalyzer() |
| => new AnalyzerWrapperAnonymousClass(this, Analyzer.PER_FIELD_REUSE_STRATEGY); |
| |
| private class AnalyzerWrapperAnonymousClass : AnalyzerWrapper |
| { |
| private readonly AnalyzingInfixSuggester outerInstance; |
| |
| public AnalyzerWrapperAnonymousClass(AnalyzingInfixSuggester outerInstance, ReuseStrategy reuseStrategy) |
| : base(reuseStrategy) |
| { |
| this.outerInstance = outerInstance; |
| } |
| |
| protected override Analyzer GetWrappedAnalyzer(string fieldName) |
| { |
| return outerInstance.m_indexAnalyzer; |
| } |
| |
| protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components) |
| { |
| if (fieldName.Equals("textgrams", StringComparison.Ordinal) && outerInstance.minPrefixChars > 0) |
| { |
| return new TokenStreamComponents(components.Tokenizer, |
| new EdgeNGramTokenFilter( |
| outerInstance.matchVersion, |
| components.TokenStream, |
| 1, |
| outerInstance.minPrefixChars)); |
| } |
| else |
| { |
| return components; |
| } |
| } |
| } |
| |
        // LUCENENET specific - support for LUCENE-5889.
        /// <summary>
        /// Lazily creates the <see cref="IndexWriter"/> (and a fresh
        /// <see cref="SearcherManager"/> over it) the first time suggestions are
        /// added without <see cref="Build(IInputEnumerator)"/> having been called.
        /// Uses double-checked locking on <c>syncLock</c> so the common
        /// already-open case takes no lock.
        /// NOTE(review): <c>writer</c> is not volatile, so the unsynchronized
        /// fast-path read relies on reference reads being atomic; confirm this is
        /// the intended memory-model tradeoff.
        /// </summary>
        private void EnsureOpen()
        {
            if (writer != null)
                return;

            lock (syncLock)
            {
                // Re-check under the lock: another thread may have opened it already.
                if (writer == null)
                {
                    // Any searcher over the previous (closed) writer is stale; drop it:
                    if (m_searcherMgr != null)
                    {
                        m_searcherMgr.Dispose();
                        m_searcherMgr = null;
                    }
                    writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GetGramAnalyzer(), OpenMode.CREATE));
                    m_searcherMgr = new SearcherManager(writer, true, null);
                }
            }
        }
| |
| /// <summary> |
| /// Adds a new suggestion. Be sure to use <see cref="Update"/> |
| /// instead if you want to replace a previous suggestion. |
| /// After adding or updating a batch of new suggestions, |
| /// you must call <see cref="Refresh()"/> in the end in order to |
| /// see the suggestions in <see cref="DoLookup(string, IEnumerable{BytesRef}, int, bool, bool)"/> |
| /// </summary> |
| public virtual void Add(BytesRef text, IEnumerable<BytesRef> contexts, long weight, BytesRef payload) |
| { |
| EnsureOpen(); //LUCENENET specific -Support for LUCENE - 5889. |
| writer.AddDocument(BuildDocument(text, contexts, weight, payload)); |
| } |
| |
| /// <summary> |
| /// Updates a previous suggestion, matching the exact same |
| /// text as before. Use this to change the weight or |
| /// payload of an already added suggstion. If you know |
| /// this text is not already present you can use <see cref="Add"/> |
| /// instead. After adding or updating a batch of |
| /// new suggestions, you must call <see cref="Refresh()"/> in the |
| /// end in order to see the suggestions in <see cref="DoLookup(string, IEnumerable{BytesRef}, int, bool, bool)"/> |
| /// </summary> |
| public virtual void Update(BytesRef text, IEnumerable<BytesRef> contexts, long weight, BytesRef payload) |
| { |
| writer.UpdateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.Utf8ToString()), BuildDocument(text, contexts, weight, payload)); |
| } |
| |
| private Document BuildDocument(BytesRef text, IEnumerable<BytesRef> contexts, long weight, BytesRef payload) |
| { |
| string textString = text.Utf8ToString(); |
| var ft = GetTextFieldType(); |
| var doc = new Document |
| { |
| new Field(TEXT_FIELD_NAME, textString, ft), |
| new Field("textgrams", textString, ft), |
| new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO), |
| new BinaryDocValuesField(TEXT_FIELD_NAME, text), |
| new NumericDocValuesField("weight", weight) |
| }; |
| if (payload != null) |
| { |
| doc.Add(new BinaryDocValuesField("payloads", payload)); |
| } |
| if (contexts != null) |
| { |
| foreach (BytesRef context in contexts) |
| { |
| // TODO: if we had a BinaryTermField we could fix |
| // this "must be valid ut8f" limitation: |
| doc.Add(new StringField(CONTEXTS_FIELD_NAME, context.Utf8ToString(), Field.Store.NO)); |
| doc.Add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context)); |
| } |
| } |
| return doc; |
| } |
| |
| /// <summary> |
| /// Reopens the underlying searcher; it's best to "batch |
| /// up" many additions/updates, and then call refresh |
| /// once in the end. |
| /// </summary> |
| public virtual void Refresh() |
| { |
| if (m_searcherMgr == null) |
| { |
| throw new InvalidOperationException("suggester was not built"); |
| } |
| m_searcherMgr.MaybeRefreshBlocking(); |
| } |
| |
| /// <summary> |
| /// Subclass can override this method to change the field type of the text field |
| /// e.g. to change the index options |
| /// </summary> |
| protected virtual FieldType GetTextFieldType() |
| { |
| var ft = new FieldType(TextField.TYPE_NOT_STORED); |
| ft.IndexOptions = IndexOptions.DOCS_ONLY; |
| ft.OmitNorms = true; |
| |
| return ft; |
| } |
| |
| public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num) |
| { |
| return DoLookup(key, contexts, num, true, true); |
| } |
| |
| /// <summary> |
| /// Lookup, without any context. |
| /// </summary> |
| public virtual IList<LookupResult> DoLookup(string key, int num, bool allTermsRequired, bool doHighlight) |
| { |
| return DoLookup(key, null, num, allTermsRequired, doHighlight); |
| } |
| |
| /// <summary> |
| /// This is called if the last token isn't ended |
| /// (e.g. user did not type a space after it). Return an |
| /// appropriate <see cref="Query"/> clause to add to the <see cref="BooleanQuery"/>. |
| /// </summary> |
| protected internal virtual Query GetLastTokenQuery(string token) |
| { |
| if (token.Length < minPrefixChars) |
| { |
| // The leading ngram was directly indexed: |
| return new TermQuery(new Term("textgrams", token)); |
| } |
| |
| return new PrefixQuery(new Term(TEXT_FIELD_NAME, token)); |
| } |
| |
        /// <summary>
        /// Retrieve suggestions, specifying whether all terms
        /// must match (<paramref name="allTermsRequired"/>) and whether the hits
        /// should be highlighted (<paramref name="doHighlight"/>).
        /// </summary>
        /// <exception cref="InvalidOperationException"> If the suggester was never built. </exception>
        public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
        {

            if (m_searcherMgr == null)
            {
                throw new InvalidOperationException("suggester was not built");
            }

            // AND vs OR semantics across the analyzed query tokens:
            Occur occur;
            if (allTermsRequired)
            {
                occur = Occur.MUST;
            }
            else
            {
                occur = Occur.SHOULD;
            }

            TokenStream ts = null;
            BooleanQuery query;
            var matchedTokens = new JCG.HashSet<string>();
            string prefixToken = null;

            try
            {
                ts = m_queryAnalyzer.GetTokenStream("", new StringReader(key));

                //long t0 = System.currentTimeMillis();
                ts.Reset();
                var termAtt = ts.AddAttribute<ICharTermAttribute>();
                var offsetAtt = ts.AddAttribute<IOffsetAttribute>();
                // Tokens are added to the query one step behind the iteration so
                // the FINAL token can receive special prefix handling below:
                string lastToken = null;
                query = new BooleanQuery();
                int maxEndOffset = -1;
                matchedTokens = new JCG.HashSet<string>();
                while (ts.IncrementToken())
                {
                    if (lastToken != null)
                    {
                        matchedTokens.Add(lastToken);
                        query.Add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
                    }
                    lastToken = termAtt.ToString();
                    if (lastToken != null)
                    {
                        // Track the furthest end offset seen so we can tell below
                        // whether trailing characters were discarded by analysis:
                        maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    }
                }
                ts.End();

                if (lastToken != null)
                {
                    Query lastQuery;
                    // After End(), EndOffset is the final offset of the input text.
                    if (maxEndOffset == offsetAtt.EndOffset)
                    {
                        // Use PrefixQuery (or the ngram equivalent) when
                        // there was no trailing discarded chars in the
                        // string (e.g. whitespace), so that if query does
                        // not end with a space we show prefix matches for
                        // that token:
                        lastQuery = GetLastTokenQuery(lastToken);
                        prefixToken = lastToken;
                    }
                    else
                    {
                        // Use TermQuery for an exact match if there were
                        // trailing discarded chars (e.g. whitespace), so
                        // that if query ends with a space we only show
                        // exact matches for that term:
                        matchedTokens.Add(lastToken);
                        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
                    }
                    if (lastQuery != null)
                    {
                        query.Add(lastQuery, occur);
                    }
                }

                // Contexts are OR'd among themselves but MUST'd against the text clauses:
                if (contexts != null)
                {
                    BooleanQuery sub = new BooleanQuery();
                    query.Add(sub, Occur.MUST);
                    foreach (BytesRef context in contexts)
                    {
                        // NOTE: we "should" wrap this in
                        // ConstantScoreQuery, or maybe send this as a
                        // Filter instead to search, but since all of
                        // these are MUST'd, the change to the score won't
                        // affect the overall ranking.  Since we indexed
                        // as DOCS_ONLY, the perf should be the same
                        // either way (no freq int[] blocks to decode):

                        // TODO: if we had a BinaryTermField we could fix
                        // this "must be valid ut8f" limitation:
                        sub.Add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context.Utf8ToString())), Occur.SHOULD);
                    }
                }
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }

            // TODO: we could allow blended sort here, combining
            // weight w/ score.  Now we ignore score and sort only
            // by weight:

            Query finalQuery = FinishQuery(query, allTermsRequired);

            //System.out.println("finalQuery=" + query);

            // Sort by weight, descending:
            TopFieldCollector c = TopFieldCollector.Create(SORT, num, true, false, false, false);

            // We sorted postings by weight during indexing, so we
            // only retrieve the first num hits now:
            ICollector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
            IndexSearcher searcher = m_searcherMgr.Acquire();
            IList<LookupResult> results = null;
            try
            {
                //System.out.println("got searcher=" + searcher);
                searcher.Search(finalQuery, c2);

                TopFieldDocs hits = (TopFieldDocs)c.GetTopDocs();

                // Slower way if postings are not pre-sorted by weight:
                // hits = searcher.search(query, null, num, SORT);
                results = CreateResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
            }
            finally
            {
                // Always release the searcher back to the manager:
                m_searcherMgr.Release(searcher);
            }

            //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
            //System.out.println(results);

            return results;
        }
| |
        /// <summary>
        /// Create the results based on the search hits.
        /// Can be overridden by subclass to add particular behavior (e.g. weight transformation). </summary>
        /// <exception cref="IOException"> If there are problems reading fields from the underlying Lucene index. </exception>
        protected internal virtual IList<LookupResult> CreateResults(IndexSearcher searcher, TopFieldDocs hits, int num, string charSequence, bool doHighlight, ICollection<string> matchedTokens, string prefixToken)
        {

            // Composite-level doc values views over all segments:
            BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);

            // This will just be null if app didn't pass payloads to build():
            // TODO: maybe just stored fields?  they compress...
            BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");
            IList<AtomicReaderContext> leaves = searcher.IndexReader.Leaves;
            List<LookupResult> results = new List<LookupResult>();
            BytesRef scratch = new BytesRef();
            for (int i = 0; i < hits.ScoreDocs.Length; i++)
            {
                FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];
                // Suggestion text, stored as binary doc values at index time:
                textDV.Get(fd.Doc, scratch);
                string text = scratch.Utf8ToString();
                // Fields[0] is the "weight" sort value (see SORT):
                long score = (long)fd.Fields[0];

                BytesRef payload;
                if (payloadsDV != null)
                {
                    payload = new BytesRef();
                    payloadsDV.Get(fd.Doc, payload);
                }
                else
                {
                    payload = null;
                }

                // Must look up sorted-set by segment:
                int segment = ReaderUtil.SubIndex(fd.Doc, leaves);
                SortedSetDocValues contextsDV = leaves[segment].AtomicReader.GetSortedSetDocValues(CONTEXTS_FIELD_NAME);
                ISet<BytesRef> contexts;
                if (contextsDV != null)
                {
                    contexts = new JCG.HashSet<BytesRef>();
                    // Convert the composite doc id to a segment-local one:
                    contextsDV.SetDocument(fd.Doc - leaves[segment].DocBase);
                    long ord;
                    while ((ord = contextsDV.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
                    {
                        BytesRef context = new BytesRef();
                        contextsDV.LookupOrd(ord, context);
                        contexts.Add(context);
                    }
                }
                else
                {
                    contexts = null;
                }

                LookupResult result;

                if (doHighlight)
                {
                    // HighlightKey carries the marked-up form; key is its string form:
                    object highlightKey = Highlight(text, matchedTokens, prefixToken);
                    result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload, contexts);
                }
                else
                {
                    result = new LookupResult(text, score, payload, contexts);
                }

                results.Add(result);
            }

            return results;
        }
| |
| /// <summary> |
| /// Subclass can override this to tweak the Query before |
| /// searching. |
| /// </summary> |
| protected internal virtual Query FinishQuery(BooleanQuery bq, bool allTermsRequired) |
| { |
| return bq; |
| } |
| |
        /// <summary>
        /// Override this method to customize the Object
        /// representing a single highlighted suggestions; the
        /// result is set on each <see cref="Lookup.LookupResult.HighlightKey"/>
        /// member. Re-analyzes <paramref name="text"/> with the query analyzer and
        /// wraps matched tokens (and the matched part of the final prefix token)
        /// via <see cref="AddWholeMatch"/> / <see cref="AddPrefixMatch"/>.
        /// </summary>
        protected internal virtual object Highlight(string text, ICollection<string> matchedTokens, string prefixToken)
        {
            TokenStream ts = m_queryAnalyzer.GetTokenStream("text", new StringReader(text));
            try
            {
                var termAtt = ts.AddAttribute<ICharTermAttribute>();
                var offsetAtt = ts.AddAttribute<IOffsetAttribute>();
                ts.Reset();
                var sb = new StringBuilder();
                // upto = how far into `text` we have already emitted output:
                int upto = 0;
                while (ts.IncrementToken())
                {
                    string token = termAtt.ToString();
                    int startOffset = offsetAtt.StartOffset;
                    int endOffset = offsetAtt.EndOffset;
                    if (upto < startOffset)
                    {
                        // Emit the gap (e.g. whitespace/punctuation) before this token as-is:
                        AddNonMatch(sb, text.Substring(upto, startOffset - upto));
                        upto = startOffset;
                    }
                    else if (upto > startOffset)
                    {
                        // Token starts inside text we already emitted (overlapping
                        // tokens, e.g. from ngram/synonym-style analysis); skip it:
                        continue;
                    }

                    if (matchedTokens.Contains(token))
                    {
                        // Token matches.
                        AddWholeMatch(sb, text.Substring(startOffset, endOffset - startOffset), token);
                        upto = endOffset;
                    }
                    else if (prefixToken != null && token.StartsWith(prefixToken, StringComparison.Ordinal))
                    {
                        // Only the final (unfinished) query token matches by prefix:
                        AddPrefixMatch(sb, text.Substring(startOffset, endOffset - startOffset), token, prefixToken);
                        upto = endOffset;
                    }
                }
                ts.End();
                // After End(), EndOffset is the total consumed length; emit any tail:
                int endOffset2 = offsetAtt.EndOffset;
                if (upto < endOffset2)
                {
                    AddNonMatch(sb, text.Substring(upto));
                }
                return sb.ToString();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
| |
| /// <summary> |
| /// Called while highlighting a single result, to append a |
| /// non-matching chunk of text from the suggestion to the |
| /// provided fragments list. </summary> |
| /// <param name="sb"> The <see cref="StringBuilder"/> to append to </param> |
| /// <param name="text"> The text chunk to add </param> |
| protected internal virtual void AddNonMatch(StringBuilder sb, string text) |
| { |
| sb.Append(text); |
| } |
| |
| /// <summary> |
| /// Called while highlighting a single result, to append |
| /// the whole matched token to the provided fragments list. </summary> |
| /// <param name="sb"> The <see cref="StringBuilder"/> to append to </param> |
| /// <param name="surface"> The surface form (original) text </param> |
| /// <param name="analyzed"> The analyzed token corresponding to the surface form text </param> |
| protected internal virtual void AddWholeMatch(StringBuilder sb, string surface, string analyzed) |
| { |
| sb.Append("<b>"); |
| sb.Append(surface); |
| sb.Append("</b>"); |
| } |
| |
| /// <summary> |
| /// Called while highlighting a single result, to append a |
| /// matched prefix token, to the provided fragments list. </summary> |
| /// <param name="sb"> The <see cref="StringBuilder"/> to append to </param> |
| /// <param name="surface"> The fragment of the surface form |
| /// (indexed during <see cref="Build(IInputEnumerator)"/>, corresponding to |
| /// this match </param> |
| /// <param name="analyzed"> The analyzed token that matched </param> |
| /// <param name="prefixToken"> The prefix of the token that matched </param> |
| protected internal virtual void AddPrefixMatch(StringBuilder sb, string surface, string analyzed, string prefixToken) |
| { |
| // TODO: apps can try to invert their analysis logic |
| // here, e.g. downcase the two before checking prefix: |
| sb.Append("<b>"); |
| sb.Append(surface.Substring(0, prefixToken.Length - 0)); |
| sb.Append("</b>"); |
| if (prefixToken.Length < surface.Length) |
| { |
| sb.Append(surface.Substring(prefixToken.Length)); |
| } |
| } |
| |
| public override bool Store(DataOutput @in) |
| { |
| return false; |
| } |
| |
| public override bool Load(DataInput @out) |
| { |
| return false; |
| } |
| |
        /// <summary>
        /// Disposes the suggester: closes the internal writer, searcher manager,
        /// and the <see cref="Directory"/> passed to the constructor.
        /// Follows the standard dispose pattern; safe to call more than once.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }
| |
| protected virtual void Dispose(bool disposing) // LUCENENET specific - implemented proper dispose pattern |
| { |
| if (disposing) |
| { |
| if (m_searcherMgr != null) |
| { |
| m_searcherMgr.Dispose(); |
| m_searcherMgr = null; |
| } |
| if (writer != null) |
| { |
| writer.Dispose(); |
| dir.Dispose(); |
| writer = null; |
| } |
| } |
| } |
| |
| public override long GetSizeInBytes() |
| { |
| long mem = RamUsageEstimator.ShallowSizeOf(this); |
| try |
| { |
| if (m_searcherMgr != null) |
| { |
| IndexSearcher searcher = m_searcherMgr.Acquire(); |
| try |
| { |
| foreach (AtomicReaderContext context in searcher.IndexReader.Leaves) |
| { |
| AtomicReader reader = FilterAtomicReader.Unwrap(context.AtomicReader); |
| if (reader is SegmentReader) |
| { |
| mem += ((SegmentReader)context.Reader).RamBytesUsed(); |
| } |
| } |
| } |
| finally |
| { |
| m_searcherMgr.Release(searcher); |
| } |
| } |
| return mem; |
| } |
| catch (IOException ioe) |
| { |
| throw new Exception(ioe.ToString(), ioe); |
| } |
| } |
| |
| public override long Count |
| { |
| get |
| { |
| if (m_searcherMgr == null) |
| { |
| return 0; |
| } |
| IndexSearcher searcher = m_searcherMgr.Acquire(); |
| try |
| { |
| return searcher.IndexReader.NumDocs; |
| } |
| finally |
| { |
| m_searcherMgr.Release(searcher); |
| } |
| } |
| } |
| } |
| } |