blob: f9373a7ccc21ac5684f691f062be9b1ab5493440 [file] [log] [blame]
using J2N.Numerics;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Xml;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.Compound.Hyphenation
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// This tree structure stores the hyphenation patterns in an efficient way for
/// fast lookup. It provides the provides the method to hyphenate a word.
/// <para/>
/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
/// </summary>
public class HyphenationTree : TernaryTree, IPatternConsumer
{
/// <summary>
/// value space: stores the interletter values
/// </summary>
protected ByteVector m_vspace;
/// <summary>
/// This map stores hyphenation exceptions
/// </summary>
protected IDictionary<string, IList<object>> m_stoplist;
/// <summary>
/// This map stores the character classes
/// </summary>
protected TernaryTree m_classmap;
/// <summary>
/// Temporary map to store interletter values on pattern loading.
/// </summary>
#if FEATURE_SERIALIZABLE
[NonSerialized]
#endif
private TernaryTree ivalues;
public HyphenationTree()
{
m_stoplist = new JCG.Dictionary<string, IList<object>>(23); // usually a small table
m_classmap = new TernaryTree();
m_vspace = new ByteVector();
m_vspace.Alloc(1); // this reserves index 0, which we don't use
}
/// <summary>
/// Packs the values by storing them in 4 bits, two values into a byte Values
/// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
/// value.
/// </summary>
/// <param name="values"> a string of digits from '0' to '9' representing the
/// interletter values. </param>
/// <returns> the index into the vspace array where the packed values are stored. </returns>
protected virtual int PackValues(string values)
{
int i, n = values.Length;
int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
int offset = m_vspace.Alloc(m);
byte[] va = m_vspace.Array;
for (i = 0; i < n; i++)
{
int j = i >> 1;
byte v = (byte)((values[i] - '0' + 1) & 0x0f);
if ((i & 1) == 1)
{
va[j + offset] = (byte)(va[j + offset] | v);
}
else
{
va[j + offset] = (byte)(v << 4); // big endian
}
}
va[m - 1 + offset] = 0; // terminator
return offset;
}
protected virtual string UnpackValues(int k)
{
StringBuilder buf = new StringBuilder();
byte v = m_vspace[k++];
while (v != 0)
{
char c = (char)(v.TripleShift(4) - 1 + '0');;
buf.Append(c);
c = (char)(v & 0x0f);
if (c == 0)
{
break;
}
c = (char)(c - 1 + '0');
buf.Append(c);
v = m_vspace[k++];
}
return buf.ToString();
}
/// <summary>
/// Read hyphenation patterns from an XML file.
/// </summary>
/// <param name="filename"> the filename </param>
/// <exception cref="IOException"> In case the parsing fails </exception>
public virtual void LoadPatterns(string filename)
{
LoadPatterns(filename, Encoding.UTF8);
}
/// <summary>
/// Read hyphenation patterns from an XML file.
/// </summary>
/// <param name="filename"> the filename </param>
/// <param name="encoding">The character encoding to use</param>
/// <exception cref="IOException"> In case the parsing fails </exception>
public virtual void LoadPatterns(string filename, Encoding encoding)
{
var src = new FileStream(filename, FileMode.Open, FileAccess.Read);
LoadPatterns(src, encoding);
}
/// <summary>
/// Read hyphenation patterns from an XML file.
/// </summary>
/// <param name="f"> a <see cref="FileInfo"/> object representing the file </param>
/// <exception cref="IOException"> In case the parsing fails </exception>
public virtual void LoadPatterns(FileInfo f)
{
LoadPatterns(f, Encoding.UTF8);
}
/// <summary>
/// Read hyphenation patterns from an XML file.
/// </summary>
/// <param name="f"> a <see cref="FileInfo"/> object representing the file </param>
/// <param name="encoding">The character encoding to use</param>
/// <exception cref="IOException"> In case the parsing fails </exception>
public virtual void LoadPatterns(FileInfo f, Encoding encoding)
{
var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read);
LoadPatterns(src, encoding);
}
/// <summary>
/// Read hyphenation patterns from an XML file.
/// </summary>
/// <param name="source"> <see cref="Stream"/> input source for the file </param>
/// <exception cref="IOException"> In case the parsing fails </exception>
public virtual void LoadPatterns(Stream source)
{
LoadPatterns(source, Encoding.UTF8);
}
/// <summary>
/// Read hyphenation patterns from an XML file.
/// </summary>
/// <param name="source"> <see cref="Stream"/> input source for the file </param>
/// <param name="encoding">The character encoding to use</param>
/// <exception cref="IOException"> In case the parsing fails </exception>
public virtual void LoadPatterns(Stream source, Encoding encoding)
{
var xmlReaderSettings =
new XmlReaderSettings
{
// DTD Processing currently is
// not supported in .NET Standard but will come back in .NET Standard 2.0.
// https://github.com/dotnet/corefx/issues/4376.
#if FEATURE_DTD_PROCESSING
DtdProcessing = DtdProcessing.Parse,
XmlResolver = new PatternParser.DtdResolver()
#else
DtdProcessing = DtdProcessing.Ignore
#endif
};
using var reader = XmlReader.Create(new StreamReader(source, encoding), xmlReaderSettings);
LoadPatterns(reader);
}
/// <summary>
/// Read hyphenation patterns from an <see cref="XmlReader"/>.
/// </summary>
/// <param name="source"> <see cref="XmlReader"/> input source for the file </param>
/// <exception cref="IOException"> In case the parsing fails </exception>
public virtual void LoadPatterns(XmlReader source)
{
PatternParser pp = new PatternParser(this);
ivalues = new TernaryTree();
pp.Parse(source);
// patterns/values should be now in the tree
// let's optimize a bit
TrimToSize();
m_vspace.TrimToSize();
m_classmap.TrimToSize();
// get rid of the auxiliary map
ivalues = null;
}
public virtual string FindPattern(string pat)
{
int k = base.Find(pat);
if (k >= 0)
{
return UnpackValues(k);
}
return "";
}
/// <summary>
/// String compare, returns 0 if equal or t is a substring of s
/// </summary>
protected virtual int HStrCmp(char[] s, int si, char[] t, int ti)
{
for (; s[si] == t[ti]; si++, ti++)
{
if (s[si] == 0)
{
return 0;
}
}
if (t[ti] == 0)
{
return 0;
}
return s[si] - t[ti];
}
protected virtual byte[] GetValues(int k)
{
StringBuilder buf = new StringBuilder();
byte v = m_vspace[k++];
while (v != 0)
{
char c = (char)(v.TripleShift(4) - 1);
buf.Append(c);
c = (char)(v & 0x0f);
if (c == 0)
{
break;
}
c = (char)(c - 1);
buf.Append(c);
v = m_vspace[k++];
}
byte[] res = new byte[buf.Length];
for (int i = 0; i < res.Length; i++)
{
res[i] = (byte)buf[i];
}
return res;
}
/// <summary>
/// <para>
/// Search for all possible partial matches of word starting at index an update
/// interletter values. In other words, it does something like:
/// </para>
/// <code>
/// for (i=0; i&lt;patterns.Length; i++)
/// {
/// if (word.Substring(index).StartsWith(patterns[i], StringComparison.Ordinal))
/// update_interletter_values(patterns[i]);
/// }
/// </code>
/// <para>
/// But it is done in an efficient way since the patterns are stored in a
/// ternary tree. In fact, this is the whole purpose of having the tree: doing
/// this search without having to test every single pattern. The number of
/// patterns for languages such as English range from 4000 to 10000. Thus,
/// doing thousands of string comparisons for each word to hyphenate would be
/// really slow without the tree. The tradeoff is memory, but using a ternary
/// tree instead of a trie, almost halves the the memory used by Lout or TeX.
/// It's also faster than using a hash table
/// </para>
/// </summary>
/// <param name="word"> null terminated word to match </param>
/// <param name="index"> start index from word </param>
/// <param name="il"> interletter values array to update </param>
protected virtual void SearchPatterns(char[] word, int index, byte[] il)
{
byte[] values;
int i = index;
char p, q;
char sp = word[i];
p = m_root;
while (p > 0 && p < m_sc.Length)
{
if (m_sc[p] == 0xFFFF)
{
if (HStrCmp(word, i, m_kv.Array, m_lo[p]) == 0)
{
values = GetValues(m_eq[p]); // data pointer is in eq[]
int j = index;
for (int k = 0; k < values.Length; k++)
{
if (j < il.Length && values[k] > il[j])
{
il[j] = values[k];
}
j++;
}
}
return;
}
int d = sp - m_sc[p];
if (d == 0)
{
if (sp == 0)
{
break;
}
sp = word[++i];
p = m_eq[p];
q = p;
// look for a pattern ending at this position by searching for
// the null char ( splitchar == 0 )
while (q > 0 && q < m_sc.Length)
{
if (m_sc[q] == 0xFFFF) // stop at compressed branch
{
break;
}
if (m_sc[q] == 0)
{
values = GetValues(m_eq[q]);
int j = index;
for (int k = 0; k < values.Length; k++)
{
if (j < il.Length && values[k] > il[j])
{
il[j] = values[k];
}
j++;
}
break;
}
else
{
q = m_lo[q];
// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
// java chars are unsigned
}
}
}
else
{
p = d < 0 ? m_lo[p] : m_hi[p];
}
}
}
/// <summary>
/// Hyphenate word and return a <see cref="Hyphenation"/> object.
/// </summary>
/// <param name="word"> the word to be hyphenated </param>
/// <param name="remainCharCount"> Minimum number of characters allowed before the
/// hyphenation point. </param>
/// <param name="pushCharCount"> Minimum number of characters allowed after the
/// hyphenation point. </param>
/// <returns> a <see cref="Hyphenation"/> object representing the
/// hyphenated word or null if word is not hyphenated. </returns>
public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount)
{
char[] w = word.ToCharArray();
return Hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
}
/// <summary>
/// Hyphenate word and return an array of hyphenation points.
/// </summary>
/// <remarks>
/// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
/// may be absent, the first n is at offset, the first l is at offset +
/// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
/// into word. In the first part of the routine len = w.length, in the second
/// part of the routine len = word.length. Three indices are used: index(w),
/// the index in w, index(word), the index in word, letterindex(word), the
/// index in the letter part of word. The following relations exist: index(w) =
/// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
/// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
/// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
/// iIgnoreAtBeginning
/// </remarks>
/// <param name="w"> char array that contains the word </param>
/// <param name="offset"> Offset to first character in word </param>
/// <param name="len"> Length of word </param>
/// <param name="remainCharCount"> Minimum number of characters allowed before the
/// hyphenation point. </param>
/// <param name="pushCharCount"> Minimum number of characters allowed after the
/// hyphenation point. </param>
/// <returns> a <see cref="Hyphenation"/> object representing the
/// hyphenated word or null if word is not hyphenated. </returns>
public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
{
int i;
char[] word = new char[len + 3];
// normalize word
char[] c = new char[2];
int iIgnoreAtBeginning = 0;
int iLength = len;
bool bEndOfLetters = false;
for (i = 1; i <= len; i++)
{
c[0] = w[offset + i - 1];
int nc = m_classmap.Find(c, 0);
if (nc < 0) // found a non-letter character ...
{
if (i == (1 + iIgnoreAtBeginning))
{
// ... before any letter character
iIgnoreAtBeginning++;
}
else
{
// ... after a letter character
bEndOfLetters = true;
}
iLength--;
}
else
{
if (!bEndOfLetters)
{
word[i - iIgnoreAtBeginning] = (char)nc;
}
else
{
return null;
}
}
}
len = iLength;
if (len < (remainCharCount + pushCharCount))
{
// word is too short to be hyphenated
return null;
}
int[] result = new int[len + 1];
int k = 0;
// check exception list first
string sw = new string(word, 1, len);
// LUCENENET: Eliminated extra lookup by using TryGetValue instead of ContainsKey
if (m_stoplist.TryGetValue(sw, out IList<object> hw))
{
// assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
// null)
int j = 0;
for (i = 0; i < hw.Count; i++)
{
object o = hw[i];
// j = index(sw) = letterindex(word)?
// result[k] = corresponding index(w)
if (o is string)
{
j += ((string)o).Length;
if (j >= remainCharCount && j < (len - pushCharCount))
{
result[k++] = j + iIgnoreAtBeginning;
}
}
}
}
else
{
// use algorithm to get hyphenation points
word[0] = '.'; // word start marker
word[len + 1] = '.'; // word end marker
word[len + 2] = (char)0; // null terminated
byte[] il = new byte[len + 3]; // initialized to zero
for (i = 0; i < len + 1; i++)
{
SearchPatterns(word, i, il);
}
// hyphenation points are located where interletter value is odd
// i is letterindex(word),
// i + 1 is index(word),
// result[k] = corresponding index(w)
for (i = 0; i < len; i++)
{
if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
{
result[k++] = i + iIgnoreAtBeginning;
}
}
}
if (k > 0)
{
// trim result array
int[] res = new int[k + 2];
Array.Copy(result, 0, res, 1, k);
// We add the synthetical hyphenation points
// at the beginning and end of the word
res[0] = 0;
res[k + 1] = len;
return new Hyphenation(res);
}
else
{
return null;
}
}
/// <summary>
/// Add a character class to the tree. It is used by
/// <see cref="PatternParser"/> as callback to add character classes.
/// Character classes define the valid word characters for hyphenation. If a
/// word contains a character not defined in any of the classes, it is not
/// hyphenated. It also defines a way to normalize the characters in order to
/// compare them with the stored patterns. Usually pattern files use only lower
/// case characters, in this case a class for letter 'a', for example, should
/// be defined as "aA", the first character being the normalization char.
/// </summary>
public virtual void AddClass(string chargroup)
{
if (chargroup.Length > 0)
{
char equivChar = chargroup[0];
char[] key = new char[2];
key[1] = (char)0;
for (int i = 0; i < chargroup.Length; i++)
{
key[0] = chargroup[i];
m_classmap.Insert(key, 0, equivChar);
}
}
}
/// <summary>
/// Add an exception to the tree. It is used by
/// <see cref="PatternParser"/> class as callback to store the
/// hyphenation exceptions.
/// </summary>
/// <param name="word"> normalized word </param>
/// <param name="hyphenatedword"> a vector of alternating strings and
/// <see cref="Hyphen"/> objects. </param>
public virtual void AddException(string word, IList<object> hyphenatedword)
{
m_stoplist[word] = hyphenatedword;
}
/// <summary>
/// Add a pattern to the tree. Mainly, to be used by
/// <see cref="PatternParser"/> class as callback to add a pattern to
/// the tree.
/// </summary>
/// <param name="pattern"> the hyphenation pattern </param>
/// <param name="ivalue"> interletter weight values indicating the desirability and
/// priority of hyphenating at a given point within the pattern. It
/// should contain only digit characters. (i.e. '0' to '9'). </param>
public virtual void AddPattern(string pattern, string ivalue)
{
int k = ivalues.Find(ivalue);
if (k <= 0)
{
k = PackValues(ivalue);
ivalues.Insert(ivalue, (char)k);
}
Insert(pattern, (char)k);
}
// public override void printStats(PrintStream @out)
// {
//@out.println("Value space size = " + Convert.ToString(vspace.length(), CultureInfo.InvariantCulture));
//base.printStats(@out);
// }
}
}