blob: 04a60b2cc7fb9014af1aa40716818f72f4efeeae [file] [log] [blame]
/*
Copyright (c) 2001, Dr Martin Porter
Copyright (c) 2002, Richard Boulton
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holders nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Diagnostics.CodeAnalysis;
using System.Reflection;
using System.Text;
namespace Lucene.Net.Tartarus.Snowball
{
/// <summary>
/// This is the rev 502 of the Snowball SVN trunk,
/// but modified:
/// made abstract and introduced abstract method stem to avoid expensive reflection in filter class.
/// refactored StringBuffers to StringBuilder
/// uses char[] as buffer instead of StringBuffer/StringBuilder
/// eq_s,eq_s_b,insert,replace_s take CharSequence like eq_v and eq_v_b
/// reflection calls (Lovins, etc) use EMPTY_ARGS/EMPTY_PARAMS
/// </summary>
public abstract class SnowballProgram
{
// Argument array handed to w.Method.Invoke(...) in FindAmong and FindAmongB.
// It is empty, i.e. the reflected Among callbacks are invoked without arguments.
private static readonly object[] EMPTY_ARGS = Arrays.Empty<object>();
/// <summary>
/// Creates a stemmer whose current string is empty.
/// </summary>
protected SnowballProgram()
{
    // Small placeholder buffer; the SetCurrent(string) defined in this class
    // replaces it with the characters of the supplied (empty) string.
    this.m_current = new char[8];
    this.SetCurrent(string.Empty);
}
/// <summary>
/// Stemming entry point that every concrete stemmer must implement.
/// It is declared abstract so that callers can invoke it directly
/// rather than through reflection.
/// </summary>
/// <returns>A value defined by the implementing class; its meaning is not specified here.</returns>
public abstract bool Stem();
/// <summary>
/// Set the current string.
/// <para/>
/// The characters of <paramref name="value"/> are copied into a new buffer, the
/// cursor and the backward limit are reset to 0, the limit is set to the length of
/// <paramref name="value"/>, and the slice markers are set to span the whole string.
/// </summary>
/// <param name="value">the text to work on; must not be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="value"/> is <c>null</c>.</exception>
public virtual void SetCurrent(string value)
{
    // Fail with a descriptive exception instead of a bare NullReferenceException.
    if (value == null)
    {
        throw new ArgumentNullException(nameof(value));
    }

    m_current = value.ToCharArray();
    m_cursor = 0;
    m_limit = value.Length;
    m_limit_backward = 0;
    m_bra = m_cursor;
    m_ket = m_limit;
}
/// <summary>
/// Get the current string: a new <see cref="string"/> built from the valid
/// prefix of the working buffer (from index 0 up to the current limit).
/// </summary>
public virtual string Current
{
    get { return new string(m_current, 0, m_limit); }
}
/// <summary>
/// Set the current string.
/// <para/>
/// The supplied array is used directly as the working buffer (it is not copied).
/// The cursor and the backward limit are reset to 0, the limit is set to
/// <paramref name="length"/>, and the slice markers span the whole valid region.
/// </summary>
/// <param name="text">character array containing input</param>
/// <param name="length">valid length of text.</param>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="length"/> is negative or greater than the length of <paramref name="text"/>.
/// </exception>
public virtual void SetCurrent(char[] text, int length)
{
    // Reject inconsistent input here; otherwise it would only surface later as an
    // index error somewhere inside the matching routines.
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }
    if (length < 0 || length > text.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(length), length,
            "length must be between 0 and text.Length");
    }

    m_current = text;
    m_cursor = 0;
    m_limit = length;
    m_limit_backward = 0;
    m_bra = m_cursor;
    m_ket = m_limit;
}
/// <summary>
/// Get the current buffer containing the stem.
/// <para/>
/// NOTE: the returned array may be a different instance than the one
/// originally passed to <c>SetCurrent</c>, in the exceptional case that
/// stemming produced a longer intermediate or result string and the
/// buffer had to be reallocated.
/// <para/>
/// Only the first <see cref="CurrentBufferLength"/> characters of the returned
/// array are valid. For example, many words are stemmed simply by reducing
/// that length to drop a suffix, leaving the array itself untouched.
/// </summary>
/// <seealso cref="CurrentBufferLength"/>
[WritableArray]
[SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
public virtual char[] CurrentBuffer
{
    get { return m_current; }
}
/// <summary>
/// Get the number of valid characters in the array returned by <see cref="CurrentBuffer"/>.
/// </summary>
public virtual int CurrentBufferLength
{
    get { return m_limit; }
}
// current string
// Working character buffer; only the first m_limit characters are valid
// (see Current and CurrentBufferLength).
protected char[] m_current;
// Current position in m_current. Forward matchers read m_current[m_cursor] and
// advance it; backward matchers read m_current[m_cursor - 1] and move it back.
protected int m_cursor;
// Exclusive upper bound for forward matching and the valid length of the buffer.
protected int m_limit;
// Lower bound for backward matching; backward matchers fail once m_cursor reaches it.
protected int m_limit_backward;
// Start (inclusive) of the current slice; validated by SliceCheck.
protected int m_bra;
// End (exclusive) of the current slice; validated by SliceCheck.
protected int m_ket;
/// <summary>
/// Copies the complete working state of <paramref name="other"/> into this instance.
/// The character buffer is shared by reference, not cloned.
/// </summary>
/// <param name="other">the program whose state is adopted.</param>
protected virtual void CopyFrom(SnowballProgram other)
{
    // Buffer (same array instance as the source).
    this.m_current = other.m_current;

    // Bounds and position.
    this.m_limit_backward = other.m_limit_backward;
    this.m_limit = other.m_limit;
    this.m_cursor = other.m_cursor;

    // Slice markers.
    this.m_bra = other.m_bra;
    this.m_ket = other.m_ket;
}
/// <summary>
/// Tests whether the character at the cursor belongs to a grouping and, if so,
/// advances the cursor by one.
/// </summary>
/// <param name="s">bit table: bit <c>(ch - min)</c> is set when <c>ch</c> is a member
/// (eight bits are used per array element).</param>
/// <param name="min">lowest character code covered by the table.</param>
/// <param name="max">highest character code covered by the table.</param>
/// <returns><c>true</c> if the character is a member and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool InGrouping(char[] s, int min, int max)
{
    if (m_cursor < m_limit)
    {
        char current = m_current[m_cursor];
        if (current >= min && current <= max)
        {
            // Position of the character inside the bit table.
            char offset = (char)(current - (char)min);
            int mask = 0x1 << (offset & 0x7);
            if ((s[offset >> 3] & mask) != 0)
            {
                m_cursor++;
                return true;
            }
        }
    }
    return false;
}
/// <summary>
/// Backward variant of the grouping test: checks the character just before the
/// cursor and, if it belongs to the grouping, moves the cursor back by one.
/// </summary>
/// <param name="s">bit table: bit <c>(ch - min)</c> is set when <c>ch</c> is a member
/// (eight bits are used per array element).</param>
/// <param name="min">lowest character code covered by the table.</param>
/// <param name="max">highest character code covered by the table.</param>
/// <returns><c>true</c> if the character is a member and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool InGroupingB(char[] s, int min, int max)
{
    if (m_cursor <= m_limit_backward)
    {
        return false;
    }

    char previous = m_current[m_cursor - 1];
    bool coveredByTable = previous >= min && previous <= max;
    if (!coveredByTable)
    {
        return false;
    }

    // Position of the character inside the bit table.
    char offset = (char)(previous - (char)min);
    bool member = (s[offset >> 3] & (0x1 << (offset & 0x7))) != 0;
    if (member)
    {
        m_cursor--;
    }
    return member;
}
/// <summary>
/// Tests whether the character at the cursor is NOT in a grouping and, if so,
/// advances the cursor by one. Characters outside the <paramref name="min"/> to
/// <paramref name="max"/> range count as not being in the grouping.
/// </summary>
/// <param name="s">bit table: bit <c>(ch - min)</c> is set when <c>ch</c> is a member
/// (eight bits are used per array element).</param>
/// <param name="min">lowest character code covered by the table.</param>
/// <param name="max">highest character code covered by the table.</param>
/// <returns><c>true</c> if the character is not a member and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool OutGrouping(char[] s, int min, int max)
{
    if (m_cursor >= m_limit)
    {
        return false;
    }

    char current = m_current[m_cursor];
    bool outside = current < min || current > max;
    if (!outside)
    {
        // Inside the table's range: consult the membership bit.
        char offset = (char)(current - (char)min);
        outside = (s[offset >> 3] & (0x1 << (offset & 0x7))) == 0;
    }

    if (outside)
    {
        m_cursor++;
    }
    return outside;
}
/// <summary>
/// Backward variant of the negative grouping test: checks the character just
/// before the cursor and, if it is NOT in the grouping, moves the cursor back by
/// one. Characters outside the <paramref name="min"/> to <paramref name="max"/>
/// range count as not being in the grouping.
/// </summary>
/// <param name="s">bit table: bit <c>(ch - min)</c> is set when <c>ch</c> is a member
/// (eight bits are used per array element).</param>
/// <param name="min">lowest character code covered by the table.</param>
/// <param name="max">highest character code covered by the table.</param>
/// <returns><c>true</c> if the character is not a member and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool OutGroupingB(char[] s, int min, int max)
{
    if (m_cursor > m_limit_backward)
    {
        char previous = m_current[m_cursor - 1];
        if (previous < min || previous > max)
        {
            m_cursor--;
            return true;
        }

        // Inside the table's range: consult the membership bit.
        char offset = (char)(previous - (char)min);
        int mask = 0x1 << (offset & 0x7);
        if ((s[offset >> 3] & mask) == 0)
        {
            m_cursor--;
            return true;
        }
    }
    return false;
}
/// <summary>
/// Advances the cursor by one if the character at the cursor lies within the
/// inclusive range <paramref name="min"/> to <paramref name="max"/>.
/// </summary>
/// <param name="min">lowest accepted character code.</param>
/// <param name="max">highest accepted character code.</param>
/// <returns><c>true</c> if the character was in range and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool InRange(int min, int max)
{
    if (m_cursor < m_limit)
    {
        int code = m_current[m_cursor];
        if (min <= code && code <= max)
        {
            m_cursor++;
            return true;
        }
    }
    return false;
}
/// <summary>
/// Moves the cursor back by one if the character just before the cursor lies
/// within the inclusive range <paramref name="min"/> to <paramref name="max"/>.
/// </summary>
/// <param name="min">lowest accepted character code.</param>
/// <param name="max">highest accepted character code.</param>
/// <returns><c>true</c> if the character was in range and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool InRangeB(int min, int max)
{
    if (m_cursor <= m_limit_backward)
    {
        return false;
    }

    int code = m_current[m_cursor - 1];
    bool inside = code >= min && code <= max;
    if (inside)
    {
        m_cursor--;
    }
    return inside;
}
/// <summary>
/// Advances the cursor by one if the character at the cursor lies outside the
/// inclusive range <paramref name="min"/> to <paramref name="max"/>.
/// </summary>
/// <param name="min">lowest character code of the excluded range.</param>
/// <param name="max">highest character code of the excluded range.</param>
/// <returns><c>true</c> if the character was out of range and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool OutRange(int min, int max)
{
    if (m_cursor >= m_limit)
    {
        return false;
    }

    int code = m_current[m_cursor];
    bool outside = code < min || code > max;
    if (outside)
    {
        m_cursor++;
    }
    return outside;
}
/// <summary>
/// Moves the cursor back by one if the character just before the cursor lies
/// outside the inclusive range <paramref name="min"/> to <paramref name="max"/>.
/// </summary>
/// <param name="min">lowest character code of the excluded range.</param>
/// <param name="max">highest character code of the excluded range.</param>
/// <returns><c>true</c> if the character was out of range and the cursor moved; otherwise <c>false</c>.</returns>
protected virtual bool OutRangeB(int min, int max)
{
    if (m_cursor > m_limit_backward)
    {
        int code = m_current[m_cursor - 1];
        if (code < min || code > max)
        {
            m_cursor--;
            return true;
        }
    }
    return false;
}
/// <summary>
/// Checks whether the next <paramref name="s_size"/> characters from the cursor
/// equal the first <paramref name="s_size"/> characters of <paramref name="s"/>;
/// on a match the cursor is advanced past them.
/// </summary>
/// <param name="s_size">number of characters to compare.</param>
/// <param name="s">the text to compare against.</param>
/// <returns><c>true</c> on a match (cursor advanced); otherwise <c>false</c> (cursor unchanged).</returns>
protected virtual bool Eq_S(int s_size, string s)
{
    int available = m_limit - m_cursor;
    if (available < s_size)
    {
        return false;
    }

    int start = m_cursor;
    int offset = 0;
    while (offset != s_size)
    {
        if (m_current[start + offset] != s[offset])
        {
            return false;
        }
        offset++;
    }

    m_cursor = start + s_size;
    return true;
}
/// <summary>
/// Backward variant of the string comparison: checks whether the
/// <paramref name="s_size"/> characters ending at the cursor equal the first
/// <paramref name="s_size"/> characters of <paramref name="s"/>; on a match the
/// cursor is moved back to the start of the matched text.
/// </summary>
/// <param name="s_size">number of characters to compare.</param>
/// <param name="s">the text to compare against.</param>
/// <returns><c>true</c> on a match (cursor moved back); otherwise <c>false</c> (cursor unchanged).</returns>
protected virtual bool Eq_S_B(int s_size, string s)
{
    int available = m_cursor - m_limit_backward;
    if (available < s_size)
    {
        return false;
    }

    // Index in the buffer where the candidate match begins.
    int start = m_cursor - s_size;
    int offset = 0;
    while (offset != s_size)
    {
        if (m_current[start + offset] != s[offset])
        {
            return false;
        }
        offset++;
    }

    m_cursor = start;
    return true;
}
/// <summary>
/// Forward comparison against the whole of <paramref name="s"/>;
/// delegates to <c>Eq_S</c> with the full length of the string.
/// </summary>
/// <param name="s">the text to compare against.</param>
/// <returns>the result of <c>Eq_S</c>.</returns>
protected virtual bool Eq_V(string s) => Eq_S(s.Length, s);
/// <summary>
/// Backward comparison against the whole of <paramref name="s"/>;
/// delegates to <c>Eq_S_B</c> with the full length of the string.
/// </summary>
/// <param name="s">the text to compare against.</param>
/// <returns>the result of <c>Eq_S_B</c>.</returns>
protected virtual bool Eq_V_B(string s) => Eq_S_B(s.Length, s);
/// <summary>
/// Searches the first <paramref name="v_size"/> entries of <paramref name="v"/> for a
/// key matching the text that starts at the cursor.
/// <para/>
/// The first phase narrows the candidates with a halving search that compares buffer
/// characters against each entry's key (NOTE(review): this presumes the table is
/// ordered by key - not verifiable from this file). The second phase walks from
/// the selected entry along the <c>SubstringIndex</c> chain until an entry is accepted.
/// An entry with a callback is accepted only when invoking the callback yields a value
/// whose string form equals "true" (ignoring case); any exception raised by the
/// invocation counts as a rejection.
/// </summary>
/// <param name="v">the table of candidate entries.</param>
/// <param name="v_size">number of entries of <paramref name="v"/> to consider.</param>
/// <returns>
/// The <c>Result</c> of the accepted entry, with the cursor placed just after the
/// matched key; or 0 when the chain ends without an accepted entry.
/// </returns>
protected virtual int FindAmong(Among[] v, int v_size)
{
    int start = m_cursor;
    int end = m_limit;

    int lo = 0;
    int hi = v_size;
    int commonLo = 0;   // characters known to match at the low end of the window
    int commonHi = 0;   // characters known to match at the high end of the window
    bool firstKeyInspected = false;

    for (;;)
    {
        int mid = lo + ((hi - lo) >> 1);
        Among candidate = v[mid];
        int diff = 0;
        // Both window ends agree on at least this many leading characters,
        // so the comparison can resume from there.
        int matched = commonHi <= commonLo ? commonHi : commonLo;

        while (matched < candidate.Length)
        {
            if (start + matched == end)
            {
                // Ran out of input: the input sorts before this key.
                diff = -1;
                break;
            }
            diff = m_current[start + matched] - candidate.S[matched];
            if (diff != 0)
            {
                break;
            }
            matched++;
        }

        if (diff < 0)
        {
            hi = mid;
            commonHi = matched;
        }
        else
        {
            lo = mid;
            commonLo = matched;
        }

        if (hi - lo <= 1)
        {
            // Stop when entry 0 is not the candidate, when the table has a single
            // entry, or when entry 0 has already had its extra inspection round.
            if (lo > 0 || hi == lo || firstKeyInspected)
            {
                break;
            }
            firstKeyInspected = true;
        }
    }

    int index = lo;
    for (;;)
    {
        Among entry = v[index];
        if (commonLo >= entry.Length)
        {
            m_cursor = start + entry.Length;
            if (entry.Method == null)
            {
                return entry.Result;
            }

            bool accepted;
            try
            {
                object outcome = entry.Method.Invoke(entry.MethodObject, EMPTY_ARGS);
                // LUCENENET-542: compare case-insensitively.
                accepted = outcome.ToString().Equals("true", StringComparison.OrdinalIgnoreCase);
            }
            catch (Exception)
            {
                // Any failure of the callback (including TargetInvocationException)
                // is treated as "not accepted".
                // FIXME - debug message
                accepted = false;
            }

            // Restore the cursor after the callback has run.
            m_cursor = start + entry.Length;
            if (accepted)
            {
                return entry.Result;
            }
        }

        index = entry.SubstringIndex;
        if (index < 0)
        {
            return 0;
        }
    }
}
/// <summary>
/// Backward counterpart of the forward table search: looks among the first
/// <paramref name="v_size"/> entries of <paramref name="v"/> for a key matching the
/// text that ends at the cursor, comparing keys from their last character towards
/// their first.
/// <para/>
/// The first phase narrows the candidates with a halving search (NOTE(review): this
/// presumes the table is ordered accordingly - not verifiable from this file). The
/// second phase walks from the selected entry along the <c>SubstringIndex</c> chain
/// until an entry is accepted. An entry with a callback is accepted only when invoking
/// the callback yields a value whose string form equals "true" (ignoring case); any
/// exception raised by the invocation counts as a rejection.
/// </summary>
/// <param name="v">the table of candidate entries.</param>
/// <param name="v_size">number of entries of <paramref name="v"/> to consider.</param>
/// <returns>
/// The <c>Result</c> of the accepted entry, with the cursor placed at the start of
/// the matched key; or 0 when the chain ends without an accepted entry.
/// </returns>
protected virtual int FindAmongB(Among[] v, int v_size)
{
    int start = m_cursor;
    int lowerBound = m_limit_backward;

    int lo = 0;
    int hi = v_size;
    int commonLo = 0;   // characters known to match at the low end of the window
    int commonHi = 0;   // characters known to match at the high end of the window
    bool firstKeyInspected = false;

    for (;;)
    {
        int mid = lo + ((hi - lo) >> 1);
        Among candidate = v[mid];
        int diff = 0;
        // Both window ends agree on at least this many trailing characters,
        // so the comparison can resume from there.
        int matched = commonHi <= commonLo ? commonHi : commonLo;

        for (int keyIndex = candidate.Length - 1 - matched; keyIndex >= 0; keyIndex--)
        {
            if (start - matched == lowerBound)
            {
                // Ran out of input: the input sorts before this key.
                diff = -1;
                break;
            }
            diff = m_current[start - 1 - matched] - candidate.S[keyIndex];
            if (diff != 0)
            {
                break;
            }
            matched++;
        }

        if (diff < 0)
        {
            hi = mid;
            commonHi = matched;
        }
        else
        {
            lo = mid;
            commonLo = matched;
        }

        if (hi - lo <= 1)
        {
            // Stop when entry 0 is not the candidate, when the table has a single
            // entry, or when entry 0 has already had its extra inspection round.
            if (lo > 0 || hi == lo || firstKeyInspected)
            {
                break;
            }
            firstKeyInspected = true;
        }
    }

    int index = lo;
    for (;;)
    {
        Among entry = v[index];
        if (commonLo >= entry.Length)
        {
            m_cursor = start - entry.Length;
            if (entry.Method == null)
            {
                return entry.Result;
            }

            bool accepted;
            try
            {
                object outcome = entry.Method.Invoke(entry.MethodObject, EMPTY_ARGS);
                // LUCENENET-542: compare case-insensitively.
                accepted = outcome.ToString().Equals("true", StringComparison.OrdinalIgnoreCase);
            }
            catch (Exception)
            {
                // Any failure of the callback (including TargetInvocationException)
                // is treated as "not accepted".
                // FIXME - debug message
                accepted = false;
            }

            // Restore the cursor after the callback has run.
            m_cursor = start - entry.Length;
            if (accepted)
            {
                return entry.Result;
            }
        }

        index = entry.SubstringIndex;
        if (index < 0)
        {
            return 0;
        }
    }
}
/// <summary>
/// to replace chars between <paramref name="c_bra"/> and <paramref name="c_ket"/> in current by the
/// chars in <paramref name="s"/>.
/// <para/>
/// The working buffer is reallocated when the result does not fit, the text after
/// <paramref name="c_ket"/> is shifted to make room (or close the gap), and the limit
/// is adjusted. A cursor at or after <paramref name="c_ket"/> is shifted by the same
/// amount; a cursor strictly inside the replaced region is moved to <paramref name="c_bra"/>.
/// </summary>
/// <param name="c_bra">start (inclusive) of the region to replace.</param>
/// <param name="c_ket">end (exclusive) of the region to replace.</param>
/// <param name="s">the replacement text.</param>
/// <returns>the change in length: length of <paramref name="s"/> minus the length of the replaced region.</returns>
protected virtual int ReplaceS(int c_bra, int c_ket, string s)
{
    int adjustment = s.Length - (c_ket - c_bra);
    int newLength = m_limit + adjustment;

    // resize if necessary
    if (newLength > m_current.Length)
    {
        char[] newBuffer = new char[ArrayUtil.Oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
        System.Array.Copy(m_current, 0, newBuffer, 0, m_limit);
        m_current = newBuffer;
    }

    // if the substring being replaced is longer or shorter than the
    // replacement, need to shift things around
    if (adjustment != 0 && c_ket < m_limit)
    {
        System.Array.Copy(m_current, c_ket, m_current, c_bra + s.Length,
            m_limit - c_ket);
    }

    // insert the replacement text with a single bulk copy instead of a
    // character-by-character loop; skipped for an empty replacement so that a
    // pure deletion performs no bounds-checked call at all.
    if (s.Length > 0)
    {
        s.CopyTo(0, m_current, c_bra, s.Length);
    }

    m_limit += adjustment;
    if (m_cursor >= c_ket)
    {
        m_cursor += adjustment;
    }
    else if (m_cursor > c_bra)
    {
        m_cursor = c_bra;
    }
    return adjustment;
}
/// <summary>
/// Verifies that the slice markers are consistent, i.e. that
/// 0 is not greater than bra, bra is not greater than ket,
/// and ket is not greater than the limit.
/// </summary>
/// <exception cref="ArgumentException">the slice markers are inconsistent.</exception>
protected virtual void SliceCheck()
{
    bool wellFormed = m_bra >= 0 && m_bra <= m_ket && m_ket <= m_limit;
    if (wellFormed)
    {
        return;
    }

    throw new ArgumentException("faulty slice operation: bra=" + m_bra + ",ket=" + m_ket + ",limit=" + m_limit);
}
/// <summary>
/// Replaces the current slice (the text between the bra and ket markers) with
/// <paramref name="s"/>, after validating the markers.
/// </summary>
/// <param name="s">the replacement text.</param>
protected virtual void SliceFrom(string s)
{
    this.SliceCheck();
    // The length adjustment returned by ReplaceS is not needed here.
    this.ReplaceS(this.m_bra, this.m_ket, s);
}
/// <summary>
/// Deletes the current slice by replacing it with the empty string.
/// </summary>
protected virtual void SliceDel() => SliceFrom("");
/// <summary>
/// Replaces the text between <paramref name="c_bra"/> and <paramref name="c_ket"/>
/// with <paramref name="s"/> and shifts each slice marker that lies at or after
/// <paramref name="c_bra"/> by the resulting change in length.
/// </summary>
/// <param name="c_bra">start (inclusive) of the region to replace.</param>
/// <param name="c_ket">end (exclusive) of the region to replace.</param>
/// <param name="s">the text to insert.</param>
protected virtual void Insert(int c_bra, int c_ket, string s)
{
    int delta = ReplaceS(c_bra, c_ket, s);

    if (m_bra >= c_bra)
    {
        m_bra += delta;
    }
    if (m_ket >= c_bra)
    {
        m_ket += delta;
    }
}
/// <summary>
/// Copy the slice into the supplied <see cref="StringBuilder"/>.
/// Any previous content of the builder is discarded first.
/// </summary>
/// <param name="s">the builder that receives the slice.</param>
/// <returns>the same builder instance, <paramref name="s"/>.</returns>
protected virtual StringBuilder SliceTo(StringBuilder s)
{
    SliceCheck();
    s.Length = 0;
    return s.Append(m_current, m_bra, m_ket - m_bra);
}
/// <summary>
/// Copies the whole current string (the first limit characters of the working
/// buffer) into the supplied <see cref="StringBuilder"/>, discarding its
/// previous content.
/// </summary>
/// <param name="s">the builder that receives the current string.</param>
/// <returns>the same builder instance, <paramref name="s"/>.</returns>
protected virtual StringBuilder AssignTo(StringBuilder s)
{
    s.Clear();
    s.Append(m_current, 0, m_limit);
    return s;
}
/*
extern void debug(struct SN_env * z, int number, int line_count)
{ int i;
int limit = SIZE(z->p);
//if (number >= 0) printf("%3d (line %4d): '", number, line_count);
if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
for (i = 0; i <= limit; i++)
{ if (z->lb == i) printf("{");
if (z->bra == i) printf("[");
if (z->c == i) printf("|");
if (z->ket == i) printf("]");
if (z->l == i) printf("}");
if (i < limit)
{ int ch = z->p[i];
if (ch == 0) ch = '#';
printf("%c", ch);
}
}
printf("'\n");
}
*/
}
}