| // Copyright (c) 2005 Bruno Martins |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions |
| // are met: |
| // 1. Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // 2. Redistributions in binary form must reproduce the above copyright |
| // notice, this list of conditions and the following disclaimer in the |
| // documentation and/or other materials provided with the distribution. |
| // 3. Neither the name of the organization nor the names of its contributors |
| // may be used to endorse or promote products derived from this software |
| // without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
| // THE POSSIBILITY OF SUCH DAMAGE. |
| |
| using J2N.Text; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| using System.IO.Compression; |
| using System.Text; |
| |
| namespace Lucene.Net.Search.Suggest.Jaspell |
| { |
| /// <summary> |
| /// Implementation of a Ternary Search Trie, a data structure for storing |
| /// <see cref="string"/>s that combines the compact size of a binary search |
| /// tree with the speed of a digital search trie, and is therefore ideal for |
| /// practical use in sorting and searching data. |
| /// <para> |
| /// |
| /// This data structure is faster than hashing for many typical search problems, |
| /// and supports a broader range of useful problems and operations. Ternary |
| /// searches are faster than hashing and more powerful, too. |
| /// </para> |
| /// <para> |
| /// |
| /// The theory of ternary search trees was described at a symposium in 1997 (see |
| /// "Fast Algorithms for Sorting and Searching Strings," by J.L. Bentley and R. |
| /// Sedgewick, Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete |
| /// Algorithms, January 1997). Algorithms in C, Third Edition, by Robert |
| /// Sedgewick (Addison-Wesley, 1998) provides yet another view of ternary search |
| /// trees. |
| /// </para> |
| /// </summary> |
| public class JaspellTernarySearchTrie |
| { |
| |
| /// <summary> |
| /// An inner class of Ternary Search Trie that represents a node in the trie. |
| /// </summary> |
| public sealed class TSTNode |
| { |
| private readonly JaspellTernarySearchTrie outerInstance; |
| |
| |
| /// <summary> |
| /// Index values for accessing relatives array. </summary> |
| internal const int PARENT = 0, LOKID = 1, EQKID = 2, HIKID = 3; |
| |
| /// <summary> |
| /// The key to the node. </summary> |
| internal object data; |
| |
| /// <summary> |
| /// The relative nodes. </summary> |
| internal readonly TSTNode[] relatives = new TSTNode[4]; |
| |
| /// <summary> |
| /// The char used in the split. </summary> |
| internal char splitchar; |
| |
| /// <summary> |
| /// Constructor method. |
| /// </summary> |
| /// <param name="outerInstance">The containing <see cref="JaspellTernarySearchTrie"/></param> |
| /// <param name="splitchar"> |
| /// The char used in the split. </param> |
| /// <param name="parent"> |
| /// The parent node. </param> |
| internal TSTNode(JaspellTernarySearchTrie outerInstance, char splitchar, TSTNode parent) |
| { |
| this.outerInstance = outerInstance; |
| this.splitchar = splitchar; |
| relatives[PARENT] = parent; |
| } |
| |
| /// <summary> |
| /// Return an approximate memory usage for this node and its sub-nodes. </summary> |
| public long GetSizeInBytes() |
| { |
| long mem = RamUsageEstimator.ShallowSizeOf(this) + RamUsageEstimator.ShallowSizeOf(relatives); |
| foreach (TSTNode node in relatives) |
| { |
| // LUCENENET NOTE: Going with the summary of this method, which says it should not |
| // include the parent node. When we include the parent node, it results in overflowing |
| // the thread stack because we have infinite recursion. |
| // |
| // However, in version 6.2 of Lucene (latest) it mentions we should also estimate the parent node. |
| // https://github.com/apache/lucene-solr/blob/764d0f19151dbff6f5fcd9fc4b2682cf934590c5/lucene/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java#L104 |
| // Not sure what the reason for that is, but it seems like a recipe for innaccuracy. |
| if (node != null && node != relatives[PARENT]) |
| { |
| mem += node.GetSizeInBytes(); |
| } |
| } |
| return mem; |
| } |
| } |
| |
| /// <summary> |
| /// Compares characters by alfabetical order. |
| /// </summary> |
| /// <param name="cCompare2"> |
| /// The first char in the comparison. </param> |
| /// <param name="cRef"> |
| /// The second char in the comparison. </param> |
| /// <param name="culture">The culture used for lowercasing.</param> |
| /// <returns> A negative number, 0 or a positive number if the second char is |
| /// less, equal or greater. </returns> |
| /// |
| private static int CompareCharsAlphabetically(char cCompare2, char cRef, CultureInfo culture) |
| { |
| var textInfo = culture.TextInfo; |
| return textInfo.ToLower(cCompare2) - textInfo.ToLower(cRef); |
| } |
| |
| /* what follows is the original Jaspell code. |
| private static int compareCharsAlphabetically(int cCompare2, int cRef) { |
| int cCompare = 0; |
| if (cCompare2 >= 65) { |
| if (cCompare2 < 89) { |
| cCompare = (2 * cCompare2) - 65; |
| } else if (cCompare2 < 97) { |
| cCompare = cCompare2 + 24; |
| } else if (cCompare2 < 121) { |
| cCompare = (2 * cCompare2) - 128; |
| } else cCompare = cCompare2; |
| } else cCompare = cCompare2; |
| if (cRef < 65) { |
| return cCompare - cRef; |
| } |
| if (cRef < 89) { |
| return cCompare - ((2 * cRef) - 65); |
| } |
| if (cRef < 97) { |
| return cCompare - (cRef + 24); |
| } |
| if (cRef < 121) { |
| return cCompare - ((2 * cRef) - 128); |
| } |
| return cCompare - cRef; |
| } |
| */ |
| |
| /// <summary> |
| /// The default number of values returned by the <see cref="MatchAlmost(string, int)"/> |
| /// method. |
| /// </summary> |
| private int defaultNumReturnValues = -1; |
| |
| /// <summary> |
| /// the number of differences allowed in a call to the <see cref="MatchAlmost(string, int)"/> |
| /// <c>key</c>. |
| /// </summary> |
| private int matchAlmostDiff; |
| |
| /// <summary> |
| /// The base node in the trie. </summary> |
| private TSTNode rootNode; |
| |
| // LUCENENET NOTE: Renamed from locale to culture. |
| private readonly CultureInfo culture; |
| |
| /// <summary> |
| /// Constructs an empty Ternary Search Trie. |
| /// </summary> |
| public JaspellTernarySearchTrie() |
| : this(CultureInfo.CurrentCulture) |
| { |
| } |
| |
| /// <summary> |
| /// Constructs an empty Ternary Search Trie, |
| /// specifying the <see cref="CultureInfo"/> used for lowercasing. |
| /// </summary> |
| public JaspellTernarySearchTrie(CultureInfo culture) |
| { |
| this.culture = culture ?? throw new ArgumentNullException(nameof(culture)); |
| } |
| |
| // for loading |
| internal virtual TSTNode Root |
| { |
| set |
| { |
| rootNode = value; |
| } |
| get |
| { |
| return rootNode; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Constructs a Ternary Search Trie and loads data from a <see cref="FileInfo"/> |
| /// into the Trie. The file is a normal text document, where each line is of |
| /// the form word TAB float. |
| /// |
| /// <para>Uses the culture of the current thread to lowercase words before comparing.</para> |
| /// </summary> |
| /// <param name="file"> |
| /// The <see cref="FileInfo"/> with the data to load into the Trie. </param> |
| /// <exception cref="System.IO.IOException"> |
| /// A problem occured while reading the data. </exception> |
| public JaspellTernarySearchTrie(FileInfo file) |
| : this(file, false, CultureInfo.CurrentCulture) |
| { |
| } |
| |
| /// <summary> |
| /// Constructs a Ternary Search Trie and loads data from a <see cref="FileInfo"/> |
| /// into the Trie. The file is a normal text document, where each line is of |
| /// the form word TAB float. |
| /// |
| /// <para>Uses the supplied culture to lowercase words before comparing.</para> |
| /// </summary> |
| /// <param name="file"> |
| /// The <see cref="FileInfo"/> with the data to load into the Trie. </param> |
| /// <param name="culture">The culture used for lowercasing.</param> |
| /// <exception cref="System.IO.IOException"> |
| /// A problem occured while reading the data. </exception> |
| public JaspellTernarySearchTrie(FileInfo file, CultureInfo culture) |
| : this(file, false, culture) |
| { |
| } |
| |
| /// <summary> |
| /// Constructs a Ternary Search Trie and loads data from a <see cref="FileInfo"/> |
| /// into the Trie. The file is a normal text document, where each line is of |
| /// the form "word TAB float". |
| /// |
| /// <para>Uses the culture of the current thread to lowercase words before comparing.</para> |
| /// </summary> |
| /// <param name="file"> |
| /// The <see cref="FileInfo"/> with the data to load into the Trie. </param> |
| /// <param name="compression"> |
| /// If true, the file is compressed with the GZIP algorithm, and if |
| /// false, the file is a normal text document. </param> |
| /// <exception cref="System.IO.IOException"> |
| /// A problem occured while reading the data. </exception> |
| public JaspellTernarySearchTrie(FileInfo file, bool compression) |
| : this(file, compression, CultureInfo.CurrentCulture) |
| { } |
| |
| /// <summary> |
| /// Constructs a Ternary Search Trie and loads data from a <see cref="FileInfo"/> |
| /// into the Trie. The file is a normal text document, where each line is of |
| /// the form "word TAB float". |
| /// |
| /// <para>Uses the supplied culture to lowercase words before comparing.</para> |
| /// </summary> |
| /// <param name="file"> |
| /// The <see cref="FileInfo"/> with the data to load into the Trie. </param> |
| /// <param name="compression"> |
| /// If true, the file is compressed with the GZIP algorithm, and if |
| /// false, the file is a normal text document. </param> |
| /// <param name="culture">The culture used for lowercasing.</param> |
| /// <exception cref="System.IO.IOException"> |
| /// A problem occured while reading the data. </exception> |
| public JaspellTernarySearchTrie(FileInfo file, bool compression, CultureInfo culture) |
| : this(culture) |
| { |
| using (TextReader @in = (compression) ? |
| IOUtils.GetDecodingReader(new GZipStream(new FileStream(file.FullName, FileMode.Open), CompressionMode.Decompress), Encoding.UTF8) : |
| IOUtils.GetDecodingReader(new FileStream(file.FullName, FileMode.Open), Encoding.UTF8)) |
| { |
| string word; |
| int pos; |
| float? occur, one = new float?(1); |
| while ((word = @in.ReadLine()) != null) |
| { |
| pos = word.IndexOf('\t'); |
| occur = one; |
| if (pos != -1) |
| { |
| occur = Convert.ToSingle(word.Substring(pos + 1).Trim(), CultureInfo.InvariantCulture); |
| word = word.Substring(0, pos); |
| } |
| string key = culture.TextInfo.ToLower(word); |
| if (rootNode == null) |
| { |
| rootNode = new TSTNode(this, key[0], null); |
| } |
| TSTNode node = null; |
| if (key.Length > 0 && rootNode != null) |
| { |
| TSTNode currentNode = rootNode; |
| int charIndex = 0; |
| while (true) |
| { |
| if (currentNode == null) |
| { |
| break; |
| } |
| int charComp = CompareCharsAlphabetically(key[charIndex], currentNode.splitchar, culture); |
| if (charComp == 0) |
| { |
| charIndex++; |
| if (charIndex == key.Length) |
| { |
| node = currentNode; |
| break; |
| } |
| currentNode = currentNode.relatives[TSTNode.EQKID]; |
| } |
| else if (charComp < 0) |
| { |
| currentNode = currentNode.relatives[TSTNode.LOKID]; |
| } |
| else |
| { |
| currentNode = currentNode.relatives[TSTNode.HIKID]; |
| } |
| } |
| float? occur2 = null; |
| if (node != null) |
| { |
| occur2 = ((float?)(node.data)); |
| } |
| if (occur2 != null) |
| { |
| occur += (float)occur2; |
| } |
| currentNode = GetOrCreateNode(culture.TextInfo.ToLower(word.Trim())); |
| currentNode.data = occur; |
| } |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Deletes the node passed in as an argument. If this node has non-null data, |
| /// then both the node and the data will be deleted. It also deletes any other |
| /// nodes in the trie that are no longer needed after the deletion of the node. |
| /// </summary> |
| /// <param name="nodeToDelete"> The node to delete. </param> |
| private void DeleteNode(TSTNode nodeToDelete) |
| { |
| if (nodeToDelete == null) |
| { |
| return; |
| } |
| nodeToDelete.data = null; |
| while (nodeToDelete != null) |
| { |
| nodeToDelete = DeleteNodeRecursion(nodeToDelete); |
| // deleteNodeRecursion(nodeToDelete); |
| } |
| } |
| |
| /// <summary> |
| /// Recursively visits each node to be deleted. |
| /// |
| /// To delete a node, first set its data to null, then pass it into this |
| /// method, then pass the node returned by this method into this method (make |
| /// sure you don't delete the data of any of the nodes returned from this |
| /// method!) and continue in this fashion until the node returned by this |
| /// method is <c>null</c>. |
| /// |
| /// The TSTNode instance returned by this method will be next node to be |
| /// operated on by <see cref="DeleteNodeRecursion(TSTNode)"/> (This emulates recursive |
| /// method call while avoiding the overhead normally associated with a |
| /// recursive method.) |
| /// </summary> |
| /// <param name="currentNode"> The node to delete. </param> |
| /// <returns> The next node to be called in deleteNodeRecursion. </returns> |
| private TSTNode DeleteNodeRecursion(TSTNode currentNode) |
| { |
| if (currentNode == null) |
| { |
| return null; |
| } |
| if (currentNode.relatives[TSTNode.EQKID] != null || currentNode.data != null) |
| { |
| return null; |
| } |
| // can't delete this node if it has a non-null eq kid or data |
| TSTNode currentParent = currentNode.relatives[TSTNode.PARENT]; |
| bool lokidNull = currentNode.relatives[TSTNode.LOKID] == null; |
| bool hikidNull = currentNode.relatives[TSTNode.HIKID] == null; |
| int childType; |
| if (currentParent.relatives[TSTNode.LOKID] == currentNode) |
| { |
| childType = TSTNode.LOKID; |
| } |
| else if (currentParent.relatives[TSTNode.EQKID] == currentNode) |
| { |
| childType = TSTNode.EQKID; |
| } |
| else if (currentParent.relatives[TSTNode.HIKID] == currentNode) |
| { |
| childType = TSTNode.HIKID; |
| } |
| else |
| { |
| rootNode = null; |
| return null; |
| } |
| if (lokidNull && hikidNull) |
| { |
| currentParent.relatives[childType] = null; |
| return currentParent; |
| } |
| if (lokidNull) |
| { |
| currentParent.relatives[childType] = currentNode.relatives[TSTNode.HIKID]; |
| currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT] = currentParent; |
| return currentParent; |
| } |
| if (hikidNull) |
| { |
| currentParent.relatives[childType] = currentNode.relatives[TSTNode.LOKID]; |
| currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT] = currentParent; |
| return currentParent; |
| } |
| int deltaHi = currentNode.relatives[TSTNode.HIKID].splitchar - currentNode.splitchar; |
| int deltaLo = currentNode.splitchar - currentNode.relatives[TSTNode.LOKID].splitchar; |
| int movingKid; |
| TSTNode targetNode; |
| if (deltaHi == deltaLo) |
| { |
| if (new Random(1).NextDouble() < 0.5) |
| { |
| deltaHi++; |
| } |
| else |
| { |
| deltaLo++; |
| } |
| } |
| if (deltaHi > deltaLo) |
| { |
| movingKid = TSTNode.HIKID; |
| targetNode = currentNode.relatives[TSTNode.LOKID]; |
| } |
| else |
| { |
| movingKid = TSTNode.LOKID; |
| targetNode = currentNode.relatives[TSTNode.HIKID]; |
| } |
| while (targetNode.relatives[movingKid] != null) |
| { |
| targetNode = targetNode.relatives[movingKid]; |
| } |
| targetNode.relatives[movingKid] = currentNode.relatives[movingKid]; |
| currentParent.relatives[childType] = targetNode; |
| targetNode.relatives[TSTNode.PARENT] = currentParent; |
| if (!lokidNull) |
| { |
| currentNode.relatives[TSTNode.LOKID] = null; |
| } |
| if (!hikidNull) |
| { |
| currentNode.relatives[TSTNode.HIKID] = null; |
| } |
| return currentParent; |
| } |
| |
| /// <summary> |
| /// Retrieve the object indexed by a key. |
| /// </summary> |
| /// <param name="key"> A <see cref="string"/> index. </param> |
| /// <returns> The object retrieved from the Ternary Search Trie. </returns> |
| public virtual object Get(string key) |
| { |
| TSTNode node = GetNode(key); |
| if (node == null) |
| { |
| return null; |
| } |
| return node.data; |
| } |
| |
| /// <summary> |
| /// Retrieve the <see cref="T:float?"/> indexed by key, increment it by one unit |
| /// and store the new <see cref="T:float?"/>. |
| /// </summary> |
| /// <param name="key"> A <see cref="string"/> index. </param> |
| /// <returns> The <see cref="T:float?"/> retrieved from the Ternary Search Trie. </returns> |
| public virtual float? GetAndIncrement(string key) |
| { |
| string key2 = culture.TextInfo.ToLower(key.Trim()); |
| TSTNode node = GetNode(key2); |
| if (node == null) |
| { |
| return null; |
| } |
| float? aux = (float?)(node.data); |
| if (aux == null) |
| { |
| aux = new float?(1); |
| } |
| else |
| { |
| aux = new float?((int)aux + 1); |
| } |
| Put(key2, aux); |
| return aux; |
| } |
| |
| /// <summary> |
| /// Returns the key that indexes the node argument. |
| /// </summary> |
| /// <param name="node"> |
| /// The node whose index is to be calculated. </param> |
| /// <returns> The <see cref="string"/> that indexes the node argument. </returns> |
| protected internal virtual string GetKey(TSTNode node) |
| { |
| StringBuilder getKeyBuffer = new StringBuilder(); |
| getKeyBuffer.Length = 0; |
| getKeyBuffer.Append("" + node.splitchar); |
| TSTNode currentNode; |
| TSTNode lastNode; |
| currentNode = node.relatives[TSTNode.PARENT]; |
| lastNode = node; |
| while (currentNode != null) |
| { |
| if (currentNode.relatives[TSTNode.EQKID] == lastNode) |
| { |
| getKeyBuffer.Append("" + currentNode.splitchar); |
| } |
| lastNode = currentNode; |
| currentNode = currentNode.relatives[TSTNode.PARENT]; |
| } |
| |
| getKeyBuffer.Reverse(); |
| return getKeyBuffer.ToString(); |
| } |
| |
| /// <summary> |
| /// Returns the node indexed by key, or <c>null</c> if that node doesn't |
| /// exist. Search begins at root node. |
| /// </summary> |
| /// <param name="key"> |
| /// A <see cref="string"/> that indexes the node that is returned. </param> |
| /// <returns> The node object indexed by key. This object is an instance of an |
| /// inner class named <see cref="TSTNode"/>. </returns> |
| public virtual TSTNode GetNode(string key) |
| { |
| return GetNode(key, rootNode); |
| } |
| |
| /// <summary> |
| /// Returns the node indexed by key, or <c>null</c> if that node doesn't |
| /// exist. The search begins at root node. |
| /// </summary> |
| /// <param name="key"> |
| /// A <see cref="string"/> that indexes the node that is returned. </param> |
| /// <param name="startNode"> |
| /// The top node defining the subtrie to be searched. </param> |
| /// <returns> The node object indexed by key. This object is an instance of an |
| /// inner class named <see cref="TSTNode"/>. </returns> |
| protected internal virtual TSTNode GetNode(string key, TSTNode startNode) |
| { |
| if (key == null || startNode == null || key.Length == 0) |
| { |
| return null; |
| } |
| TSTNode currentNode = startNode; |
| int charIndex = 0; |
| while (true) |
| { |
| if (currentNode == null) |
| { |
| return null; |
| } |
| int charComp = CompareCharsAlphabetically(key[charIndex], currentNode.splitchar, this.culture); |
| if (charComp == 0) |
| { |
| charIndex++; |
| if (charIndex == key.Length) |
| { |
| return currentNode; |
| } |
| currentNode = currentNode.relatives[TSTNode.EQKID]; |
| } |
| else if (charComp < 0) |
| { |
| currentNode = currentNode.relatives[TSTNode.LOKID]; |
| } |
| else |
| { |
| currentNode = currentNode.relatives[TSTNode.HIKID]; |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Returns the node indexed by key, creating that node if it doesn't exist, |
| /// and creating any required intermediate nodes if they don't exist. |
| /// </summary> |
| /// <param name="key"> |
| /// A <see cref="string"/> that indexes the node that is returned. </param> |
| /// <returns> The node object indexed by key. This object is an instance of an |
| /// inner class named <see cref="TSTNode"/>. </returns> |
| /// <exception cref="NullReferenceException"> |
| /// If the key is <c>null</c>. </exception> |
| /// <exception cref="ArgumentException"> |
| /// If the key is an empty <see cref="string"/>. </exception> |
| protected internal virtual TSTNode GetOrCreateNode(string key) |
| { |
| if (key == null) |
| { |
| throw new NullReferenceException("attempt to get or create node with null key"); |
| } |
| if (key.Length == 0) |
| { |
| throw new System.ArgumentException("attempt to get or create node with key of zero length"); |
| } |
| if (rootNode == null) |
| { |
| rootNode = new TSTNode(this, key[0], null); |
| } |
| TSTNode currentNode = rootNode; |
| int charIndex = 0; |
| while (true) |
| { |
| int charComp = CompareCharsAlphabetically(key[charIndex], currentNode.splitchar, this.culture); |
| if (charComp == 0) |
| { |
| charIndex++; |
| if (charIndex == key.Length) |
| { |
| return currentNode; |
| } |
| if (currentNode.relatives[TSTNode.EQKID] == null) |
| { |
| currentNode.relatives[TSTNode.EQKID] = new TSTNode(this, key[charIndex], currentNode); |
| } |
| currentNode = currentNode.relatives[TSTNode.EQKID]; |
| } |
| else if (charComp < 0) |
| { |
| if (currentNode.relatives[TSTNode.LOKID] == null) |
| { |
| currentNode.relatives[TSTNode.LOKID] = new TSTNode(this, key[charIndex], currentNode); |
| } |
| currentNode = currentNode.relatives[TSTNode.LOKID]; |
| } |
| else |
| { |
| if (currentNode.relatives[TSTNode.HIKID] == null) |
| { |
| currentNode.relatives[TSTNode.HIKID] = new TSTNode(this, key[charIndex], currentNode); |
| } |
| currentNode = currentNode.relatives[TSTNode.HIKID]; |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Returns a <see cref="IList{String}"/> of keys that almost match the argument key. |
| /// Keys returned will have exactly diff characters that do not match the |
| /// target key, where diff is equal to the last value set |
| /// to the <see cref="MatchAlmostDiff"/> property. |
| /// <para> |
| /// If the <see cref="MatchAlmost(string, int)"/> method is called before the |
| /// <see cref="MatchAlmostDiff"/> property has been called for the first time, |
| /// then diff = 0. |
| /// |
| /// </para> |
| /// </summary> |
| /// <param name="key"> |
| /// The target key. </param> |
| /// <returns> A <see cref="IList{String}"/> with the results. </returns> |
| public virtual IList<string> MatchAlmost(string key) |
| { |
| return MatchAlmost(key, defaultNumReturnValues); |
| } |
| |
| /// <summary> |
| /// Returns a <see cref="IList{String}"/> of keys that almost match the argument key. |
| /// Keys returned will have exactly diff characters that do not match the |
| /// target key, where diff is equal to the last value set |
| /// to the <see cref="MatchAlmostDiff"/> property. |
| /// <para> |
| /// If the <see cref="MatchAlmost(string, int)"/> method is called before the |
| /// <see cref="MatchAlmostDiff"/> property has been called for the first time, |
| /// then diff = 0. |
| /// |
| /// </para> |
| /// </summary> |
| /// <param name="key"> The target key. </param> |
| /// <param name="numReturnValues"> The maximum number of values returned by this method. </param> |
| /// <returns> A <see cref="IList{String}"/> with the results </returns> |
| public virtual IList<string> MatchAlmost(string key, int numReturnValues) |
| { |
| return MatchAlmostRecursion(rootNode, 0, matchAlmostDiff, key, ((numReturnValues < 0) ? -1 : numReturnValues), new List<string>(), false); |
| } |
| |
| /// <summary> |
| /// Recursivelly vists the nodes in order to find the ones that almost match a |
| /// given key. |
| /// </summary> |
| /// <param name="currentNode"> |
| /// The current node. </param> |
| /// <param name="charIndex"> |
| /// The current char. </param> |
| /// <param name="d"> |
| /// The number of differences so far. </param> |
| /// <param name="matchAlmostNumReturnValues"> |
| /// The maximum number of values in the result <see cref="List{String}"/>. </param> |
| /// <param name="matchAlmostResult2"> |
| /// The results so far. </param> |
| /// <param name="upTo"> |
| /// If true all keys having up to and including <see cref="MatchAlmostDiff"/> |
| /// mismatched letters will be included in the result (including a key |
| /// that is exactly the same as the target string) otherwise keys will |
| /// be included in the result only if they have exactly |
| /// <see cref="MatchAlmostDiff"/> number of mismatched letters. </param> |
| /// <param name="matchAlmostKey"> |
| /// The key being searched. </param> |
| /// <returns> A <see cref="IList{String}"/> with the results. </returns> |
| private IList<string> MatchAlmostRecursion(TSTNode currentNode, int charIndex, int d, string matchAlmostKey, int matchAlmostNumReturnValues, IList<string> matchAlmostResult2, bool upTo) |
| { |
| if ((currentNode == null) || (matchAlmostNumReturnValues != -1 && matchAlmostResult2.Count >= matchAlmostNumReturnValues) || (d < 0) || (charIndex >= matchAlmostKey.Length)) |
| { |
| return matchAlmostResult2; |
| } |
| int charComp = CompareCharsAlphabetically(matchAlmostKey[charIndex], currentNode.splitchar, this.culture); |
| IList<string> matchAlmostResult = matchAlmostResult2; |
| if ((d > 0) || (charComp < 0)) |
| { |
| matchAlmostResult = MatchAlmostRecursion(currentNode.relatives[TSTNode.LOKID], charIndex, d, matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo); |
| } |
| int nextD = (charComp == 0) ? d : d - 1; |
| bool cond = (upTo) ? (nextD >= 0) : (nextD == 0); |
| if ((matchAlmostKey.Length == charIndex + 1) && cond && (currentNode.data != null)) |
| { |
| matchAlmostResult.Add(GetKey(currentNode)); |
| } |
| matchAlmostResult = MatchAlmostRecursion(currentNode.relatives[TSTNode.EQKID], charIndex + 1, nextD, matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo); |
| if ((d > 0) || (charComp > 0)) |
| { |
| matchAlmostResult = MatchAlmostRecursion(currentNode.relatives[TSTNode.HIKID], charIndex, d, matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo); |
| } |
| return matchAlmostResult; |
| } |
| |
| /// <summary> |
| /// Returns an alphabetical <see cref="IList{String}"/> of all keys in the trie that |
| /// begin with a given prefix. Only keys for nodes having non-null data are |
| /// included in the <see cref="IList{String}"/>. |
| /// </summary> |
| /// <param name="prefix"> Each key returned from this method will begin with the characters |
| /// in prefix. </param> |
| /// <returns> A <see cref="IList{String}"/> with the results. </returns> |
| public virtual IList<string> MatchPrefix(string prefix) |
| { |
| return MatchPrefix(prefix, defaultNumReturnValues); |
| } |
| |
| /// <summary> |
| /// Returns an alphabetical <see cref="IList{String}"/> of all keys in the trie that |
| /// begin with a given prefix. Only keys for nodes having non-null data are |
| /// included in the <see cref="IList{String}"/>. |
| /// </summary> |
| /// <param name="prefix"> Each key returned from this method will begin with the characters |
| /// in prefix. </param> |
| /// <param name="numReturnValues"> The maximum number of values returned from this method. </param> |
| /// <returns> A <see cref="IList{String}"/> with the results </returns> |
| public virtual IList<string> MatchPrefix(string prefix, int numReturnValues) |
| { |
| List<string> sortKeysResult = new List<string>(); |
| TSTNode startNode = GetNode(prefix); |
| if (startNode == null) |
| { |
| return sortKeysResult; |
| } |
| if (startNode.data != null) |
| { |
| sortKeysResult.Add(GetKey(startNode)); |
| } |
| return SortKeysRecursion(startNode.relatives[TSTNode.EQKID], ((numReturnValues < 0) ? -1 : numReturnValues), sortKeysResult); |
| } |
| |
| /// <summary> |
| /// Returns the number of nodes in the trie that have non-null data. |
| /// </summary> |
| /// <returns> The number of nodes in the trie that have non-null data. </returns> |
| public virtual int NumDataNodes() |
| { |
| return NumDataNodes(rootNode); |
| } |
| |
| /// <summary> |
| /// Returns the number of nodes in the subtrie below and including the starting |
| /// node. The method counts only nodes that have non-null data. |
| /// </summary> |
| /// <param name="startingNode"> |
| /// The top node of the subtrie. the node that defines the subtrie. </param> |
| /// <returns> The total number of nodes in the subtrie. </returns> |
| protected internal virtual int NumDataNodes(TSTNode startingNode) |
| { |
| return RecursiveNodeCalculator(startingNode, true, 0); |
| } |
| |
| /// <summary> |
| /// Returns the total number of nodes in the trie. The method counts nodes |
| /// whether or not they have data. |
| /// </summary> |
| /// <returns> The total number of nodes in the trie. </returns> |
| public virtual int NumNodes() |
| { |
| return NumNodes(rootNode); |
| } |
| |
| /// <summary> |
| /// Returns the total number of nodes in the subtrie below and including the |
| /// starting Node. The method counts nodes whether or not they have data. |
| /// </summary> |
| /// <param name="startingNode"> |
| /// The top node of the subtrie. The node that defines the subtrie. </param> |
| /// <returns> The total number of nodes in the subtrie. </returns> |
| protected internal virtual int NumNodes(TSTNode startingNode) |
| { |
| return RecursiveNodeCalculator(startingNode, false, 0); |
| } |
| |
| /// <summary> |
| /// Stores a value in the trie. The value may be retrieved using the key. |
| /// </summary> |
| /// <param name="key"> A <see cref="string"/> that indexes the object to be stored. </param> |
| /// <param name="value"> The object to be stored in the Trie. </param> |
| public virtual void Put(string key, object value) |
| { |
| GetOrCreateNode(key).data = value; |
| } |
| |
| /// <summary> |
| /// Recursivelly visists each node to calculate the number of nodes. |
| /// </summary> |
| /// <param name="currentNode"> |
| /// The current node. </param> |
| /// <param name="checkData"> |
| /// If true we check the data to be different of <c>null</c>. </param> |
| /// <param name="numNodes2"> |
| /// The number of nodes so far. </param> |
| /// <returns> The number of nodes accounted. </returns> |
| private int RecursiveNodeCalculator(TSTNode currentNode, bool checkData, int numNodes2) |
| { |
| if (currentNode == null) |
| { |
| return numNodes2; |
| } |
| int numNodes = RecursiveNodeCalculator(currentNode.relatives[TSTNode.LOKID], checkData, numNodes2); |
| numNodes = RecursiveNodeCalculator(currentNode.relatives[TSTNode.EQKID], checkData, numNodes); |
| numNodes = RecursiveNodeCalculator(currentNode.relatives[TSTNode.HIKID], checkData, numNodes); |
| if (checkData) |
| { |
| if (currentNode.data != null) |
| { |
| numNodes++; |
| } |
| } |
| else |
| { |
| numNodes++; |
| } |
| return numNodes; |
| } |
| |
| /// <summary> |
| /// Removes the value indexed by key. Also removes all nodes that are rendered |
| /// unnecessary by the removal of this data. |
| /// </summary> |
| /// <param name="key"> A <see cref="string"/> that indexes the object to be removed from |
| /// the Trie. </param> |
| public virtual void Remove(string key) |
| { |
| DeleteNode(GetNode(this.culture.TextInfo.ToLower(key.Trim()))); |
| } |
| |
| /// <summary> |
| /// Sets the number of characters by which words can differ from target word |
| /// when calling the <see cref="MatchAlmost(string, int)"/> method. |
| /// <para> |
| /// Arguments less than 0 will set the char difference to 0, and arguments |
| /// greater than 3 will set the char difference to 3. |
| /// |
| /// </para> |
| /// </summary> |
| public virtual int MatchAlmostDiff |
| { |
| get // LUCENENET NOTE: Added property get per MSDN guidelines |
| { |
| return matchAlmostDiff; |
| } |
| set |
| { |
| if (value < 0) |
| { |
| matchAlmostDiff = 0; |
| } |
| else if (value > 3) |
| { |
| matchAlmostDiff = 3; |
| } |
| else |
| { |
| matchAlmostDiff = value; |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Sets the default maximum number of values returned from the |
| /// <see cref="MatchPrefix(string, int)"/> and <see cref="MatchAlmost(string, int)"/> methods. |
| /// <para> |
| /// The value should be set this to -1 to get an unlimited number of return |
| /// values. note that the methods mentioned above provide overloaded versions |
| /// that allow you to specify the maximum number of return values, in which |
| /// case this value is temporarily overridden. |
| /// </para> |
| /// </summary> |
| public virtual int NumReturnValues |
| { |
| get // LUCENENET NOTE: Added property get per MSDN guidelines |
| { |
| return defaultNumReturnValues; |
| } |
| set |
| { |
| defaultNumReturnValues = (value < 0) ? -1 : value; |
| } |
| } |
| |
| /// <summary> |
| /// Returns keys sorted in alphabetical order. This includes the start Node and |
| /// all nodes connected to the start Node. |
| /// <para> |
| /// The number of keys returned is limited to numReturnValues. To get a list |
| /// that isn't limited in size, set numReturnValues to -1. |
| /// |
| /// </para> |
| /// </summary> |
| /// <param name="startNode"> |
| /// The top node defining the subtrie to be searched. </param> |
| /// <param name="numReturnValues"> |
| /// The maximum number of values returned from this method. </param> |
| /// <returns> A <see cref="IList{String}"/> with the results. </returns> |
| protected virtual IList<string> SortKeys(TSTNode startNode, int numReturnValues) |
| { |
| return SortKeysRecursion(startNode, ((numReturnValues < 0) ? -1 : numReturnValues), new List<string>()); |
| } |
| |
| /// <summary> |
| /// Returns keys sorted in alphabetical order. This includes the current Node |
| /// and all nodes connected to the current Node. |
| /// <para> |
| /// Sorted keys will be appended to the end of the resulting <see cref="List{String}"/>. |
| /// The result may be empty when this method is invoked, but may not be |
| /// <c>null</c>. |
| /// |
| /// </para> |
| /// </summary> |
| /// <param name="currentNode"> |
| /// The current node. </param> |
| /// <param name="sortKeysNumReturnValues"> |
| /// The maximum number of values in the result. </param> |
| /// <param name="sortKeysResult2"> |
| /// The results so far. </param> |
| /// <returns> A <see cref="IList{String}"/> with the results. </returns> |
| private IList<string> SortKeysRecursion(TSTNode currentNode, int sortKeysNumReturnValues, IList<string> sortKeysResult2) |
| { |
| if (currentNode == null) |
| { |
| return sortKeysResult2; |
| } |
| IList<string> sortKeysResult = SortKeysRecursion(currentNode.relatives[TSTNode.LOKID], sortKeysNumReturnValues, sortKeysResult2); |
| if (sortKeysNumReturnValues != -1 && sortKeysResult.Count >= sortKeysNumReturnValues) |
| { |
| return sortKeysResult; |
| } |
| if (currentNode.data != null) |
| { |
| sortKeysResult.Add(GetKey(currentNode)); |
| } |
| sortKeysResult = SortKeysRecursion(currentNode.relatives[TSTNode.EQKID], sortKeysNumReturnValues, sortKeysResult); |
| return SortKeysRecursion(currentNode.relatives[TSTNode.HIKID], sortKeysNumReturnValues, sortKeysResult); |
| } |
| |
| /// <summary> |
| /// Return an approximate memory usage for this trie. |
| /// </summary> |
| public virtual long GetSizeInBytes() |
| { |
| long mem = RamUsageEstimator.ShallowSizeOf(this); |
| TSTNode root = Root; |
| if (root != null) |
| { |
| mem += root.GetSizeInBytes(); |
| } |
| return mem; |
| } |
| } |
| } |