src/Lucene.Net.Analysis.Common/Analysis/Compound/Hyphenation/HyphenationTree.cs - lucenenet - Git at Google

 using J2N.Numerics;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using System.Xml;
 using JCG = J2N.Collections.Generic;

 namespace Lucene.Net.Analysis.Compound.Hyphenation
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *      http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// This tree structure stores the hyphenation patterns in an efficient way for
     /// fast lookup. It provides the provides the method to hyphenate a word.
     /// <para/>
     /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
     /// </summary>
     public class HyphenationTree : TernaryTree, IPatternConsumer
     {
         /// <summary>
         /// value space: stores the interletter values
         /// </summary>
         protected ByteVector m_vspace;

         /// <summary>
         /// This map stores hyphenation exceptions
         /// </summary>
         protected IDictionary<string, IList<object>> m_stoplist;

         /// <summary>
         /// This map stores the character classes
         /// </summary>
         protected TernaryTree m_classmap;

         /// <summary>
         /// Temporary map to store interletter values on pattern loading.
         /// </summary>
 #if FEATURE_SERIALIZABLE
         [NonSerialized]
 #endif
         private TernaryTree ivalues;

         public HyphenationTree()
         {
             m_stoplist = new JCG.Dictionary<string, IList<object>>(23); // usually a small table
             m_classmap = new TernaryTree();
             m_vspace = new ByteVector();
             m_vspace.Alloc(1); // this reserves index 0, which we don't use
         }

         /// <summary>
         /// Packs the values by storing them in 4 bits, two values into a byte Values
         /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
         /// value.
         /// </summary>
         /// <param name="values"> a string of digits from '0' to '9' representing the
         ///        interletter values. </param>
         /// <returns> the index into the vspace array where the packed values are stored. </returns>
         protected virtual int PackValues(string values)
         {
             int i, n = values.Length;
             int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
             int offset = m_vspace.Alloc(m);
             byte[] va = m_vspace.Array;
             for (i = 0; i < n; i++)
             {
                 int j = i >> 1;
                 byte v = (byte)((values[i] - '0' + 1) & 0x0f);
                 if ((i & 1) == 1)
                 {
                     va[j + offset] = (byte)(va[j + offset] | v);
                 }
                 else
                 {
                     va[j + offset] = (byte)(v << 4); // big endian
                 }
             }
             va[m - 1 + offset] = 0; // terminator
             return offset;
         }

         protected virtual string UnpackValues(int k)
         {
             StringBuilder buf = new StringBuilder();
             byte v = m_vspace[k++];
             while (v != 0)
             {
                 char c = (char)(v.TripleShift(4) - 1 + '0');;
                 buf.Append(c);
                 c = (char)(v & 0x0f);
                 if (c == 0)
                 {
                     break;
                 }
                 c = (char)(c - 1 + '0');
                 buf.Append(c);
                 v = m_vspace[k++];
             }
             return buf.ToString();
         }

         /// <summary>
         /// Read hyphenation patterns from an XML file.
         /// </summary>
         /// <param name="filename"> the filename </param>
         /// <exception cref="IOException"> In case the parsing fails </exception>
         public virtual void LoadPatterns(string filename)
         {
             LoadPatterns(filename, Encoding.UTF8);
         }

         /// <summary>
         /// Read hyphenation patterns from an XML file.
         /// </summary>
         /// <param name="filename"> the filename </param>
         /// <param name="encoding">The character encoding to use</param>
         /// <exception cref="IOException"> In case the parsing fails </exception>
         public virtual void LoadPatterns(string filename, Encoding encoding)
         {
             var src = new FileStream(filename, FileMode.Open, FileAccess.Read);
             LoadPatterns(src, encoding);
         }

         /// <summary>
         /// Read hyphenation patterns from an XML file.
         /// </summary>
         /// <param name="f"> a <see cref="FileInfo"/> object representing the file </param>
         /// <exception cref="IOException"> In case the parsing fails </exception>
         public virtual void LoadPatterns(FileInfo f)
         {
             LoadPatterns(f, Encoding.UTF8);
         }

         /// <summary>
         /// Read hyphenation patterns from an XML file.
         /// </summary>
         /// <param name="f"> a <see cref="FileInfo"/> object representing the file </param>
         /// <param name="encoding">The character encoding to use</param>
         /// <exception cref="IOException"> In case the parsing fails </exception>
         public virtual void LoadPatterns(FileInfo f, Encoding encoding)
         {
             var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read);
             LoadPatterns(src, encoding);
         }

         /// <summary>
         /// Read hyphenation patterns from an XML file.
         /// </summary>
         /// <param name="source"> <see cref="Stream"/> input source for the file </param>
         /// <exception cref="IOException"> In case the parsing fails </exception>
         public virtual void LoadPatterns(Stream source)
         {
             LoadPatterns(source, Encoding.UTF8);
         }

         /// <summary>
         /// Read hyphenation patterns from an XML file.
         /// </summary>
         /// <param name="source"> <see cref="Stream"/> input source for the file </param>
         /// <param name="encoding">The character encoding to use</param>
         /// <exception cref="IOException"> In case the parsing fails </exception>
         public virtual void LoadPatterns(Stream source, Encoding encoding)
         {
             var xmlReaderSettings =
                 new XmlReaderSettings
                 {
                     // DTD Processing currently is
                     // not supported in .NET Standard but will come back in .NET Standard 2.0.
                     // https://github.com/dotnet/corefx/issues/4376.
 #if FEATURE_DTD_PROCESSING
                     DtdProcessing = DtdProcessing.Parse,
                     XmlResolver = new PatternParser.DtdResolver()
 #else
                     DtdProcessing = DtdProcessing.Ignore
 #endif
                 };

             using var reader = XmlReader.Create(new StreamReader(source, encoding), xmlReaderSettings);
             LoadPatterns(reader);
         }

         /// <summary>
         /// Read hyphenation patterns from an <see cref="XmlReader"/>.
         /// </summary>
         /// <param name="source"> <see cref="XmlReader"/> input source for the file </param>
         /// <exception cref="IOException"> In case the parsing fails </exception>
         public virtual void LoadPatterns(XmlReader source)
         {
             PatternParser pp = new PatternParser(this);
             ivalues = new TernaryTree();

             pp.Parse(source);

             // patterns/values should be now in the tree
             // let's optimize a bit
             TrimToSize();
             m_vspace.TrimToSize();
             m_classmap.TrimToSize();

             // get rid of the auxiliary map
             ivalues = null;
         }

         public virtual string FindPattern(string pat)
         {
             int k = base.Find(pat);
             if (k >= 0)
             {
                 return UnpackValues(k);
             }
             return "";
         }

         /// <summary>
         /// String compare, returns 0 if equal or t is a substring of s
         /// </summary>
         protected virtual int HStrCmp(char[] s, int si, char[] t, int ti)
         {
             for (; s[si] == t[ti]; si++, ti++)
             {
                 if (s[si] == 0)
                 {
                     return 0;
                 }
             }
             if (t[ti] == 0)
             {
                 return 0;
             }
             return s[si] - t[ti];
         }

         protected virtual byte[] GetValues(int k)
         {
             StringBuilder buf = new StringBuilder();
             byte v = m_vspace[k++];
             while (v != 0)
             {
                 char c = (char)(v.TripleShift(4) - 1);
                 buf.Append(c);
                 c = (char)(v & 0x0f);
                 if (c == 0)
                 {
                     break;
                 }
                 c = (char)(c - 1);
                 buf.Append(c);
                 v = m_vspace[k++];
             }
             byte[] res = new byte[buf.Length];
             for (int i = 0; i < res.Length; i++)
             {
                 res[i] = (byte)buf[i];
             }
             return res;
         }

         /// <summary>
         /// <para>
         /// Search for all possible partial matches of word starting at index an update
         /// interletter values. In other words, it does something like:
         /// </para>
         /// <code>
         /// for (i=0; i&lt;patterns.Length; i++)
         /// {
         ///     if (word.Substring(index).StartsWith(patterns[i], StringComparison.Ordinal))
         ///         update_interletter_values(patterns[i]);
         /// }
         /// </code>
         /// <para>
         /// But it is done in an efficient way since the patterns are stored in a
         /// ternary tree. In fact, this is the whole purpose of having the tree: doing
         /// this search without having to test every single pattern. The number of
         /// patterns for languages such as English range from 4000 to 10000. Thus,
         /// doing thousands of string comparisons for each word to hyphenate would be
         /// really slow without the tree. The tradeoff is memory, but using a ternary
         /// tree instead of a trie, almost halves the the memory used by Lout or TeX.
         /// It's also faster than using a hash table
         /// </para>
         /// </summary>
         /// <param name="word"> null terminated word to match </param>
         /// <param name="index"> start index from word </param>
         /// <param name="il"> interletter values array to update </param>
         protected virtual void SearchPatterns(char[] word, int index, byte[] il)
         {
             byte[] values;
             int i = index;
             char p, q;
             char sp = word[i];
             p = m_root;

             while (p > 0 && p < m_sc.Length)
             {
                 if (m_sc[p] == 0xFFFF)
                 {
                     if (HStrCmp(word, i, m_kv.Array, m_lo[p]) == 0)
                     {
                         values = GetValues(m_eq[p]); // data pointer is in eq[]
                         int j = index;
                         for (int k = 0; k < values.Length; k++)
                         {
                             if (j < il.Length && values[k] > il[j])
                             {
                                 il[j] = values[k];
                             }
                             j++;
                         }
                     }
                     return;
                 }
                 int d = sp - m_sc[p];
                 if (d == 0)
                 {
                     if (sp == 0)
                     {
                         break;
                     }
                     sp = word[++i];
                     p = m_eq[p];
                     q = p;

                     // look for a pattern ending at this position by searching for
                     // the null char ( splitchar == 0 )
                     while (q > 0 && q < m_sc.Length)
                     {
                         if (m_sc[q] == 0xFFFF) // stop at compressed branch
                         {
                             break;
                         }
                         if (m_sc[q] == 0)
                         {
                             values = GetValues(m_eq[q]);
                             int j = index;
                             for (int k = 0; k < values.Length; k++)
                             {
                                 if (j < il.Length && values[k] > il[j])
                                 {
                                     il[j] = values[k];
                                 }
                                 j++;
                             }
                             break;
                         }
                         else
                         {
                             q = m_lo[q];

                             // actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
                             // java chars are unsigned
                         }
                     }
                 }
                 else
                 {
                     p = d < 0 ? m_lo[p] : m_hi[p];
                 }
             }
         }

         /// <summary>
         /// Hyphenate word and return a <see cref="Hyphenation"/> object.
         /// </summary>
         /// <param name="word"> the word to be hyphenated </param>
         /// <param name="remainCharCount"> Minimum number of characters allowed before the
         ///        hyphenation point. </param>
         /// <param name="pushCharCount"> Minimum number of characters allowed after the
         ///        hyphenation point. </param>
         /// <returns> a <see cref="Hyphenation"/> object representing the
         ///         hyphenated word or null if word is not hyphenated. </returns>
         public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount)
         {
             char[] w = word.ToCharArray();
             return Hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
         }


         /// <summary>
         /// Hyphenate word and return an array of hyphenation points.
         /// </summary>
         /// <remarks>
         /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
         /// may be absent, the first n is at offset, the first l is at offset +
         /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
         /// into word. In the first part of the routine len = w.length, in the second
         /// part of the routine len = word.length. Three indices are used: index(w),
         /// the index in w, index(word), the index in word, letterindex(word), the
         /// index in the letter part of word. The following relations exist: index(w) =
         /// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
         /// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
         /// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
         /// iIgnoreAtBeginning
         /// </remarks>
         /// <param name="w"> char array that contains the word </param>
         /// <param name="offset"> Offset to first character in word </param>
         /// <param name="len"> Length of word </param>
         /// <param name="remainCharCount"> Minimum number of characters allowed before the
         ///        hyphenation point. </param>
         /// <param name="pushCharCount"> Minimum number of characters allowed after the
         ///        hyphenation point. </param>
         /// <returns> a <see cref="Hyphenation"/> object representing the
         ///         hyphenated word or null if word is not hyphenated. </returns>
         public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
         {
             int i;
             char[] word = new char[len + 3];

             // normalize word
             char[] c = new char[2];
             int iIgnoreAtBeginning = 0;
             int iLength = len;
             bool bEndOfLetters = false;
             for (i = 1; i <= len; i++)
             {
                 c[0] = w[offset + i - 1];
                 int nc = m_classmap.Find(c, 0);
                 if (nc < 0) // found a non-letter character ...
                 {
                     if (i == (1 + iIgnoreAtBeginning))
                     {
                         // ... before any letter character
                         iIgnoreAtBeginning++;
                     }
                     else
                     {
                         // ... after a letter character
                         bEndOfLetters = true;
                     }
                     iLength--;
                 }
                 else
                 {
                     if (!bEndOfLetters)
                     {
                         word[i - iIgnoreAtBeginning] = (char)nc;
                     }
                     else
                     {
                         return null;
                     }
                 }
             }
             len = iLength;
             if (len < (remainCharCount + pushCharCount))
             {
                 // word is too short to be hyphenated
                 return null;
             }
             int[] result = new int[len + 1];
             int k = 0;

             // check exception list first
             string sw = new string(word, 1, len);
             // LUCENENET: Eliminated extra lookup by using TryGetValue instead of ContainsKey
             if (m_stoplist.TryGetValue(sw, out IList<object> hw))
             {
                 // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
                 // null)
                 int j = 0;
                 for (i = 0; i < hw.Count; i++)
                 {
                     object o = hw[i];
                     // j = index(sw) = letterindex(word)?
                     // result[k] = corresponding index(w)
                     if (o is string)
                     {
                         j += ((string)o).Length;
                         if (j >= remainCharCount && j < (len - pushCharCount))
                         {
                             result[k++] = j + iIgnoreAtBeginning;
                         }
                     }
                 }
             }
             else
             {
                 // use algorithm to get hyphenation points
                 word[0] = '.'; // word start marker
                 word[len + 1] = '.'; // word end marker
                 word[len + 2] = (char)0; // null terminated
                 byte[] il = new byte[len + 3]; // initialized to zero
                 for (i = 0; i < len + 1; i++)
                 {
                     SearchPatterns(word, i, il);
                 }

                 // hyphenation points are located where interletter value is odd
                 // i is letterindex(word),
                 // i + 1 is index(word),
                 // result[k] = corresponding index(w)
                 for (i = 0; i < len; i++)
                 {
                     if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
                     {
                         result[k++] = i + iIgnoreAtBeginning;
                     }
                 }
             }

             if (k > 0)
             {
                 // trim result array
                 int[] res = new int[k + 2];
                 Array.Copy(result, 0, res, 1, k);
                 // We add the synthetical hyphenation points
                 // at the beginning and end of the word
                 res[0] = 0;
                 res[k + 1] = len;
                 return new Hyphenation(res);
             }
             else
             {
                 return null;
             }
         }

         /// <summary>
         /// Add a character class to the tree. It is used by
         /// <see cref="PatternParser"/> as callback to add character classes.
         /// Character classes define the valid word characters for hyphenation. If a
         /// word contains a character not defined in any of the classes, it is not
         /// hyphenated. It also defines a way to normalize the characters in order to
         /// compare them with the stored patterns. Usually pattern files use only lower
         /// case characters, in this case a class for letter 'a', for example, should
         /// be defined as "aA", the first character being the normalization char.
         /// </summary>
         public virtual void AddClass(string chargroup)
         {
             if (chargroup.Length > 0)
             {
                 char equivChar = chargroup[0];
                 char[] key = new char[2];
                 key[1] = (char)0;
                 for (int i = 0; i < chargroup.Length; i++)
                 {
                     key[0] = chargroup[i];
                     m_classmap.Insert(key, 0, equivChar);
                 }
             }
         }

         /// <summary>
         /// Add an exception to the tree. It is used by
         /// <see cref="PatternParser"/> class as callback to store the
         /// hyphenation exceptions.
         /// </summary>
         /// <param name="word"> normalized word </param>
         /// <param name="hyphenatedword"> a vector of alternating strings and
         ///        <see cref="Hyphen"/> objects. </param>
         public virtual void AddException(string word, IList<object> hyphenatedword)
         {
             m_stoplist[word] = hyphenatedword;
         }

         /// <summary>
         /// Add a pattern to the tree. Mainly, to be used by
         /// <see cref="PatternParser"/> class as callback to add a pattern to
         /// the tree.
         /// </summary>
         /// <param name="pattern"> the hyphenation pattern </param>
         /// <param name="ivalue"> interletter weight values indicating the desirability and
         ///        priority of hyphenating at a given point within the pattern. It
         ///        should contain only digit characters. (i.e. '0' to '9'). </param>
         public virtual void AddPattern(string pattern, string ivalue)
         {
             int k = ivalues.Find(ivalue);
             if (k <= 0)
             {
                 k = PackValues(ivalue);
                 ivalues.Insert(ivalue, (char)k);
             }
             Insert(pattern, (char)k);
         }

         // public override void printStats(PrintStream @out)
         // {
         //@out.println("Value space size = " + Convert.ToString(vspace.length(), CultureInfo.InvariantCulture));
         //base.printStats(@out);

         // }
     }
 }
	using J2N.Numerics;
	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Text;
	using System.Xml;
	using JCG = J2N.Collections.Generic;

	namespace Lucene.Net.Analysis.Compound.Hyphenation
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// This tree structure stores the hyphenation patterns in an efficient way for
	/// fast lookup. It provides the provides the method to hyphenate a word.
	/// <para/>
	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
	/// </summary>
	public class HyphenationTree : TernaryTree, IPatternConsumer
	{
	/// <summary>
	/// value space: stores the interletter values
	/// </summary>
	protected ByteVector m_vspace;

	/// <summary>
	/// This map stores hyphenation exceptions
	/// </summary>
	protected IDictionary<string, IList<object>> m_stoplist;

	/// <summary>
	/// This map stores the character classes
	/// </summary>
	protected TernaryTree m_classmap;

	/// <summary>
	/// Temporary map to store interletter values on pattern loading.
	/// </summary>
	#if FEATURE_SERIALIZABLE
	[NonSerialized]
	#endif
	private TernaryTree ivalues;

	public HyphenationTree()
	{
	m_stoplist = new JCG.Dictionary<string, IList<object>>(23); // usually a small table
	m_classmap = new TernaryTree();
	m_vspace = new ByteVector();
	m_vspace.Alloc(1); // this reserves index 0, which we don't use
	}

	/// <summary>
	/// Packs the values by storing them in 4 bits, two values into a byte Values
	/// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
	/// value.
	/// </summary>
	/// <param name="values"> a string of digits from '0' to '9' representing the
	/// interletter values. </param>
	/// <returns> the index into the vspace array where the packed values are stored. </returns>
	protected virtual int PackValues(string values)
	{
	int i, n = values.Length;
	int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
	int offset = m_vspace.Alloc(m);
	byte[] va = m_vspace.Array;
	for (i = 0; i < n; i++)
	{
	int j = i >> 1;
	byte v = (byte)((values[i] - '0' + 1) & 0x0f);
	if ((i & 1) == 1)
	{
	va[j + offset] = (byte)(va[j + offset] \| v);
	}
	else
	{
	va[j + offset] = (byte)(v << 4); // big endian
	}
	}
	va[m - 1 + offset] = 0; // terminator
	return offset;
	}

	protected virtual string UnpackValues(int k)
	{
	StringBuilder buf = new StringBuilder();
	byte v = m_vspace[k++];
	while (v != 0)
	{
	char c = (char)(v.TripleShift(4) - 1 + '0');;
	buf.Append(c);
	c = (char)(v & 0x0f);
	if (c == 0)
	{
	break;
	}
	c = (char)(c - 1 + '0');
	buf.Append(c);
	v = m_vspace[k++];
	}
	return buf.ToString();
	}

	/// <summary>
	/// Read hyphenation patterns from an XML file.
	/// </summary>
	/// <param name="filename"> the filename </param>
	/// <exception cref="IOException"> In case the parsing fails </exception>
	public virtual void LoadPatterns(string filename)
	{
	LoadPatterns(filename, Encoding.UTF8);
	}

	/// <summary>
	/// Read hyphenation patterns from an XML file.
	/// </summary>
	/// <param name="filename"> the filename </param>
	/// <param name="encoding">The character encoding to use</param>
	/// <exception cref="IOException"> In case the parsing fails </exception>
	public virtual void LoadPatterns(string filename, Encoding encoding)
	{
	var src = new FileStream(filename, FileMode.Open, FileAccess.Read);
	LoadPatterns(src, encoding);
	}

	/// <summary>
	/// Read hyphenation patterns from an XML file.
	/// </summary>
	/// <param name="f"> a <see cref="FileInfo"/> object representing the file </param>
	/// <exception cref="IOException"> In case the parsing fails </exception>
	public virtual void LoadPatterns(FileInfo f)
	{
	LoadPatterns(f, Encoding.UTF8);
	}

	/// <summary>
	/// Read hyphenation patterns from an XML file.
	/// </summary>
	/// <param name="f"> a <see cref="FileInfo"/> object representing the file </param>
	/// <param name="encoding">The character encoding to use</param>
	/// <exception cref="IOException"> In case the parsing fails </exception>
	public virtual void LoadPatterns(FileInfo f, Encoding encoding)
	{
	var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read);
	LoadPatterns(src, encoding);
	}

	/// <summary>
	/// Read hyphenation patterns from an XML file.
	/// </summary>
	/// <param name="source"> <see cref="Stream"/> input source for the file </param>
	/// <exception cref="IOException"> In case the parsing fails </exception>
	public virtual void LoadPatterns(Stream source)
	{
	LoadPatterns(source, Encoding.UTF8);
	}

	/// <summary>
	/// Read hyphenation patterns from an XML file.
	/// </summary>
	/// <param name="source"> <see cref="Stream"/> input source for the file </param>
	/// <param name="encoding">The character encoding to use</param>
	/// <exception cref="IOException"> In case the parsing fails </exception>
	public virtual void LoadPatterns(Stream source, Encoding encoding)
	{
	var xmlReaderSettings =
	new XmlReaderSettings
	{
	// DTD Processing currently is
	// not supported in .NET Standard but will come back in .NET Standard 2.0.
	// https://github.com/dotnet/corefx/issues/4376.
	#if FEATURE_DTD_PROCESSING
	DtdProcessing = DtdProcessing.Parse,
	XmlResolver = new PatternParser.DtdResolver()
	#else
	DtdProcessing = DtdProcessing.Ignore
	#endif
	};

	using var reader = XmlReader.Create(new StreamReader(source, encoding), xmlReaderSettings);
	LoadPatterns(reader);
	}

	/// <summary>
	/// Read hyphenation patterns from an <see cref="XmlReader"/>.
	/// </summary>
	/// <param name="source"> <see cref="XmlReader"/> input source for the file </param>
	/// <exception cref="IOException"> In case the parsing fails </exception>
	public virtual void LoadPatterns(XmlReader source)
	{
	PatternParser pp = new PatternParser(this);
	ivalues = new TernaryTree();

	pp.Parse(source);

	// patterns/values should be now in the tree
	// let's optimize a bit
	TrimToSize();
	m_vspace.TrimToSize();
	m_classmap.TrimToSize();

	// get rid of the auxiliary map
	ivalues = null;
	}

	public virtual string FindPattern(string pat)
	{
	int k = base.Find(pat);
	if (k >= 0)
	{
	return UnpackValues(k);
	}
	return "";
	}

	/// <summary>
	/// String compare, returns 0 if equal or t is a substring of s
	/// </summary>
	protected virtual int HStrCmp(char[] s, int si, char[] t, int ti)
	{
	for (; s[si] == t[ti]; si++, ti++)
	{
	if (s[si] == 0)
	{
	return 0;
	}
	}
	if (t[ti] == 0)
	{
	return 0;
	}
	return s[si] - t[ti];
	}

	protected virtual byte[] GetValues(int k)
	{
	StringBuilder buf = new StringBuilder();
	byte v = m_vspace[k++];
	while (v != 0)
	{
	char c = (char)(v.TripleShift(4) - 1);
	buf.Append(c);
	c = (char)(v & 0x0f);
	if (c == 0)
	{
	break;
	}
	c = (char)(c - 1);
	buf.Append(c);
	v = m_vspace[k++];
	}
	byte[] res = new byte[buf.Length];
	for (int i = 0; i < res.Length; i++)
	{
	res[i] = (byte)buf[i];
	}
	return res;
	}

	/// <summary>
	/// <para>
	/// Search for all possible partial matches of word starting at index an update
	/// interletter values. In other words, it does something like:
	/// </para>
	/// <code>
	/// for (i=0; i<patterns.Length; i++)
	/// {
	/// if (word.Substring(index).StartsWith(patterns[i], StringComparison.Ordinal))
	/// update_interletter_values(patterns[i]);
	/// }
	/// </code>
	/// <para>
	/// But it is done in an efficient way since the patterns are stored in a
	/// ternary tree. In fact, this is the whole purpose of having the tree: doing
	/// this search without having to test every single pattern. The number of
	/// patterns for languages such as English range from 4000 to 10000. Thus,
	/// doing thousands of string comparisons for each word to hyphenate would be
	/// really slow without the tree. The tradeoff is memory, but using a ternary
	/// tree instead of a trie, almost halves the the memory used by Lout or TeX.
	/// It's also faster than using a hash table
	/// </para>
	/// </summary>
	/// <param name="word"> null terminated word to match </param>
	/// <param name="index"> start index from word </param>
	/// <param name="il"> interletter values array to update </param>
	protected virtual void SearchPatterns(char[] word, int index, byte[] il)
	{
	byte[] values;
	int i = index;
	char p, q;
	char sp = word[i];
	p = m_root;

	while (p > 0 && p < m_sc.Length)
	{
	if (m_sc[p] == 0xFFFF)
	{
	if (HStrCmp(word, i, m_kv.Array, m_lo[p]) == 0)
	{
	values = GetValues(m_eq[p]); // data pointer is in eq[]
	int j = index;
	for (int k = 0; k < values.Length; k++)
	{
	if (j < il.Length && values[k] > il[j])
	{
	il[j] = values[k];
	}
	j++;
	}
	}
	return;
	}
	int d = sp - m_sc[p];
	if (d == 0)
	{
	if (sp == 0)
	{
	break;
	}
	sp = word[++i];
	p = m_eq[p];
	q = p;

	// look for a pattern ending at this position by searching for
	// the null char ( splitchar == 0 )
	while (q > 0 && q < m_sc.Length)
	{
	if (m_sc[q] == 0xFFFF) // stop at compressed branch
	{
	break;
	}
	if (m_sc[q] == 0)
	{
	values = GetValues(m_eq[q]);
	int j = index;
	for (int k = 0; k < values.Length; k++)
	{
	if (j < il.Length && values[k] > il[j])
	{
	il[j] = values[k];
	}
	j++;
	}
	break;
	}
	else
	{
	q = m_lo[q];

	// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
	// java chars are unsigned
	}
	}
	}
	else
	{
	p = d < 0 ? m_lo[p] : m_hi[p];
	}
	}
	}

	/// <summary>
	/// Hyphenate word and return a <see cref="Hyphenation"/> object.
	/// </summary>
	/// <param name="word"> the word to be hyphenated </param>
	/// <param name="remainCharCount"> Minimum number of characters allowed before the
	/// hyphenation point. </param>
	/// <param name="pushCharCount"> Minimum number of characters allowed after the
	/// hyphenation point. </param>
	/// <returns> a <see cref="Hyphenation"/> object representing the
	/// hyphenated word or null if word is not hyphenated. </returns>
	public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount)
	{
	char[] w = word.ToCharArray();
	return Hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
	}



	/// <summary>
	/// Hyphenate word and return an array of hyphenation points.
	/// </summary>
	/// <remarks>
	/// w = "**nnllllllnnn***", where n is a non-letter, l is a letter, all n
	/// may be absent, the first n is at offset, the first l is at offset +
	/// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
	/// into word. In the first part of the routine len = w.length, in the second
	/// part of the routine len = word.length. Three indices are used: index(w),
	/// the index in w, index(word), the index in word, letterindex(word), the
	/// index in the letter part of word. The following relations exist: index(w) =
	/// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
	/// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
	/// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
	/// iIgnoreAtBeginning
	/// </remarks>
	/// <param name="w"> char array that contains the word </param>
	/// <param name="offset"> Offset to first character in word </param>
	/// <param name="len"> Length of word </param>
	/// <param name="remainCharCount"> Minimum number of characters allowed before the
	/// hyphenation point. </param>
	/// <param name="pushCharCount"> Minimum number of characters allowed after the
	/// hyphenation point. </param>
	/// <returns> a <see cref="Hyphenation"/> object representing the
	/// hyphenated word or null if word is not hyphenated. </returns>
	public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
	{
	int i;
	char[] word = new char[len + 3];

	// normalize word
	char[] c = new char[2];
	int iIgnoreAtBeginning = 0;
	int iLength = len;
	bool bEndOfLetters = false;
	for (i = 1; i <= len; i++)
	{
	c[0] = w[offset + i - 1];
	int nc = m_classmap.Find(c, 0);
	if (nc < 0) // found a non-letter character ...
	{
	if (i == (1 + iIgnoreAtBeginning))
	{
	// ... before any letter character
	iIgnoreAtBeginning++;
	}
	else
	{
	// ... after a letter character
	bEndOfLetters = true;
	}
	iLength--;
	}
	else
	{
	if (!bEndOfLetters)
	{
	word[i - iIgnoreAtBeginning] = (char)nc;
	}
	else
	{
	return null;
	}
	}
	}
	len = iLength;
	if (len < (remainCharCount + pushCharCount))
	{
	// word is too short to be hyphenated
	return null;
	}
	int[] result = new int[len + 1];
	int k = 0;

	// check exception list first
	string sw = new string(word, 1, len);
	// LUCENENET: Eliminated extra lookup by using TryGetValue instead of ContainsKey
	if (m_stoplist.TryGetValue(sw, out IList<object> hw))
	{
	// assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
	// null)
	int j = 0;
	for (i = 0; i < hw.Count; i++)
	{
	object o = hw[i];
	// j = index(sw) = letterindex(word)?
	// result[k] = corresponding index(w)
	if (o is string)
	{
	j += ((string)o).Length;
	if (j >= remainCharCount && j < (len - pushCharCount))
	{
	result[k++] = j + iIgnoreAtBeginning;
	}
	}
	}
	}
	else
	{
	// use algorithm to get hyphenation points
	word[0] = '.'; // word start marker
	word[len + 1] = '.'; // word end marker
	word[len + 2] = (char)0; // null terminated
	byte[] il = new byte[len + 3]; // initialized to zero
	for (i = 0; i < len + 1; i++)
	{
	SearchPatterns(word, i, il);
	}

	// hyphenation points are located where interletter value is odd
	// i is letterindex(word),
	// i + 1 is index(word),
	// result[k] = corresponding index(w)
	for (i = 0; i < len; i++)
	{
	if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
	{
	result[k++] = i + iIgnoreAtBeginning;
	}
	}
	}

	if (k > 0)
	{
	// trim result array
	int[] res = new int[k + 2];
	Array.Copy(result, 0, res, 1, k);
	// We add the synthetical hyphenation points
	// at the beginning and end of the word
	res[0] = 0;
	res[k + 1] = len;
	return new Hyphenation(res);
	}
	else
	{
	return null;
	}
	}

	/// <summary>
	/// Add a character class to the tree. It is used by
	/// <see cref="PatternParser"/> as callback to add character classes.
	/// Character classes define the valid word characters for hyphenation. If a
	/// word contains a character not defined in any of the classes, it is not
	/// hyphenated. It also defines a way to normalize the characters in order to
	/// compare them with the stored patterns. Usually pattern files use only lower
	/// case characters, in this case a class for letter 'a', for example, should
	/// be defined as "aA", the first character being the normalization char.
	/// </summary>
	public virtual void AddClass(string chargroup)
	{
	if (chargroup.Length > 0)
	{
	char equivChar = chargroup[0];
	char[] key = new char[2];
	key[1] = (char)0;
	for (int i = 0; i < chargroup.Length; i++)
	{
	key[0] = chargroup[i];
	m_classmap.Insert(key, 0, equivChar);
	}
	}
	}

	/// <summary>
	/// Add an exception to the tree. It is used by
	/// <see cref="PatternParser"/> class as callback to store the
	/// hyphenation exceptions.
	/// </summary>
	/// <param name="word"> normalized word </param>
	/// <param name="hyphenatedword"> a vector of alternating strings and
	/// <see cref="Hyphen"/> objects. </param>
	public virtual void AddException(string word, IList<object> hyphenatedword)
	{
	m_stoplist[word] = hyphenatedword;
	}

	/// <summary>
	/// Add a pattern to the tree. Mainly, to be used by
	/// <see cref="PatternParser"/> class as callback to add a pattern to
	/// the tree.
	/// </summary>
	/// <param name="pattern"> the hyphenation pattern </param>
	/// <param name="ivalue"> interletter weight values indicating the desirability and
	/// priority of hyphenating at a given point within the pattern. It
	/// should contain only digit characters. (i.e. '0' to '9'). </param>
	public virtual void AddPattern(string pattern, string ivalue)
	{
	int k = ivalues.Find(ivalue);
	if (k <= 0)
	{
	k = PackValues(ivalue);
	ivalues.Insert(ivalue, (char)k);
	}
	Insert(pattern, (char)k);
	}

	// public override void printStats(PrintStream @out)
	// {
	//@out.println("Value space size = " + Convert.ToString(vspace.length(), CultureInfo.InvariantCulture));
	//base.printStats(@out);

	// }
	}
	}