blob: c3d7406aa1a0132e367dab5805b15cd93d7d2dd5 [file] [log] [blame]
using J2N;
using J2N.Text;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Runtime.CompilerServices;
using System.Text;
using JCG = J2N.Collections.Generic;
/*
* dk.brics.automaton
*
* Copyright (c) 2001-2009 Anders Moeller
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* this SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* this SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
namespace Lucene.Net.Util.Automaton
{
// LUCENENET specific - converted constants from RegExp
// into a flags enum.
[Flags]
public enum RegExpSyntax
{
/// <summary>
/// Syntax flag, enables intersection (<c>&amp;</c>).
/// </summary>
INTERSECTION = 0x0001,
/// <summary>
/// Syntax flag, enables complement (<c>~</c>).
/// </summary>
COMPLEMENT = 0x0002,
/// <summary>
/// Syntax flag, enables empty language (<c>#</c>).
/// </summary>
EMPTY = 0x0004,
/// <summary>
/// Syntax flag, enables anystring (<c>@</c>).
/// </summary>
ANYSTRING = 0x0008,
/// <summary>
/// Syntax flag, enables named automata (<c>&lt;</c>identifier<c>&gt;</c>).
/// </summary>
AUTOMATON = 0x0010,
/// <summary>
/// Syntax flag, enables numerical intervals (
/// <c>&lt;<i>n</i>-<i>m</i>&gt;</c>).
/// </summary>
INTERVAL = 0x0020,
/// <summary>
/// Syntax flag, enables all optional regexp syntax.
/// </summary>
ALL = 0xffff,
/// <summary>
/// Syntax flag, enables no optional regexp syntax.
/// </summary>
NONE = 0x0000
}
/// <summary>
/// Regular Expression extension to <see cref="Util.Automaton.Automaton"/>.
/// <para/>
/// Regular expressions are built from the following abstract syntax:
/// <para/>
/// <list type="table">
/// <item>
/// <term><i>regexp</i></term>
/// <term>::=</term>
/// <term><i>unionexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>unionexp</i></term>
/// <term>::=</term>
/// <term><i>interexp</i>&#160;<tt><b>|</b></tt>&#160;<i>unionexp</i></term>
/// <term>(union)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>interexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>interexp</i></term>
/// <term>::=</term>
/// <term><i>concatexp</i>&#160;<tt><b>&amp;</b></tt>&#160;<i>interexp</i></term>
/// <term>(intersection)</term>
/// <term><small>[OPTIONAL]</small></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>concatexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>concatexp</i></term>
/// <term>::=</term>
/// <term><i>repeatexp</i>&#160;<i>concatexp</i></term>
/// <term>(concatenation)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>repeatexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>repeatexp</i></term>
/// <term>::=</term>
/// <term><i>repeatexp</i>&#160;<tt><b>?</b></tt></term>
/// <term>(zero or one occurrence)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>repeatexp</i>&#160;<tt><b>*</b></tt></term>
/// <term>(zero or more occurrences)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>repeatexp</i>&#160;<tt><b>+</b></tt></term>
/// <term>(one or more occurrences)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>repeatexp</i>&#160;<tt><b>{</b><i>n</i><b>}</b></tt></term>
/// <term>(<tt><i>n</i></tt> occurrences)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>repeatexp</i>&#160;<tt><b>{</b><i>n</i><b>,}</b></tt></term>
/// <term>(<tt><i>n</i></tt> or more occurrences)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>repeatexp</i>&#160;<tt><b>{</b><i>n</i><b>,</b><i>m</i><b>}</b></tt></term>
/// <term>(<tt><i>n</i></tt> to <tt><i>m</i></tt> occurrences, including both)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>complexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>complexp</i></term>
/// <term>::=</term>
/// <term><tt><b>~</b></tt>&#160;<i>complexp</i></term>
/// <term>(complement)</term>
/// <term><small>[OPTIONAL]</small></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>charclassexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>charclassexp</i></term>
/// <term>::=</term>
/// <term><tt><b>[</b></tt>&#160;<i>charclasses</i>&#160;<tt><b>]</b></tt></term>
/// <term>(character class)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>[^</b></tt>&#160;<i>charclasses</i>&#160;<tt><b>]</b></tt></term>
/// <term>(negated character class)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>simpleexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>charclasses</i></term>
/// <term>::=</term>
/// <term><i>charclass</i>&#160;<i>charclasses</i></term>
/// <term></term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>charclass</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>charclass</i></term>
/// <term>::=</term>
/// <term><i>charexp</i>&#160;<tt><b>-</b></tt>&#160;<i>charexp</i></term>
/// <term>(character range, including end-points)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><i>charexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
///
/// <item>
/// <term><i>simpleexp</i></term>
/// <term>::=</term>
/// <term><i>charexp</i></term>
/// <term></term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>.</b></tt></term>
/// <term>(any single character)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>#</b></tt></term>
/// <term>(the empty language)</term>
/// <term><small>[OPTIONAL]</small></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>@</b></tt></term>
/// <term>(any string)</term>
/// <term><small>[OPTIONAL]</small></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>"</b></tt>&#160;&lt;Unicode string without double-quotes&gt;&#160; <tt><b>"</b></tt></term>
/// <term>(a string)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>(</b></tt>&#160;<tt><b>)</b></tt></term>
/// <term>(the empty string)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>(</b></tt>&#160;<i>unionexp</i>&#160;<tt><b>)</b></tt></term>
/// <term>(precedence override)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>&lt;</b></tt>&#160;&lt;identifier&gt;&#160;<tt><b>&gt;</b></tt></term>
/// <term>(named automaton)</term>
/// <term><small>[OPTIONAL]</small></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>&lt;</b><i>n</i>-<i>m</i><b>&gt;</b></tt></term>
/// <term>(numerical interval)</term>
/// <term><small>[OPTIONAL]</small></term>
/// </item>
///
/// <item>
/// <term><i>charexp</i></term>
/// <term>::=</term>
/// <term>&lt;Unicode character&gt;</term>
/// <term>(a single non-reserved character)</term>
/// <term></term>
/// </item>
/// <item>
/// <term></term>
/// <term>|</term>
/// <term><tt><b>\</b></tt>&#160;&lt;Unicode character&gt;&#160;</term>
/// <term>(a single character)</term>
/// <term></term>
/// </item>
///
/// </list>
///
/// <para/>
/// The productions marked <small>[OPTIONAL]</small> are only allowed if
/// specified by the syntax flags passed to the <see cref="RegExp"/> constructor.
/// The reserved characters used in the (enabled) syntax must be escaped with
/// backslash (<c>\</c>) or double-quotes (<c>"..."</c>). (In
/// contrast to other regexp syntaxes, this is required also in character
/// classes.) Be aware that dash (<c>-</c>) has a special meaning in
/// <i>charclass</i> expressions. An identifier is a string not containing right
/// angle bracket (<c>&gt;</c>) or dash (<c>-</c>). Numerical
/// intervals are specified by non-negative decimal integers and include both end
/// points, and if <c>n</c> and <c>m</c> have the same number
/// of digits, then the conforming strings must have that length (i.e. prefixed
/// by 0's).
/// <para/>
/// @lucene.experimental
/// </summary>
public class RegExp
{
internal enum Kind
{
REGEXP_UNION,
REGEXP_CONCATENATION,
REGEXP_INTERSECTION,
REGEXP_OPTIONAL,
REGEXP_REPEAT,
REGEXP_REPEAT_MIN,
REGEXP_REPEAT_MINMAX,
REGEXP_COMPLEMENT,
REGEXP_CHAR,
REGEXP_CHAR_RANGE,
REGEXP_ANYCHAR,
REGEXP_EMPTY,
REGEXP_STRING,
REGEXP_ANYSTRING,
REGEXP_AUTOMATON,
REGEXP_INTERVAL
}
// LUCENENET specific - made flags into their own [Flags] enum named RegExpSyntax and de-nested from this type
private static bool allow_mutation = false;
internal Kind kind;
internal RegExp exp1, exp2;
internal string s;
internal int c;
internal int min, max, digits;
internal int from, to;
internal string b;
internal RegExpSyntax flags;
internal int pos;
internal RegExp()
{
}
/// <summary>
/// Constructs new <see cref="RegExp"/> from a string. Same as
/// <c>RegExp(s, RegExpSyntax.ALL)</c>.
/// </summary>
/// <param name="s"> Regexp string. </param>
/// <exception cref="ArgumentException"> If an error occured while parsing the
/// regular expression. </exception>
public RegExp(string s)
: this(s, RegExpSyntax.ALL)
{
}
/// <summary>
/// Constructs new <see cref="RegExp"/> from a string.
/// </summary>
/// <param name="s"> Regexp string. </param>
/// <param name="syntax_flags"> Boolean 'or' of optional <see cref="RegExpSyntax"/> constructs to be
/// enabled. </param>
/// <exception cref="ArgumentException"> If an error occured while parsing the
/// regular expression </exception>
public RegExp(string s, RegExpSyntax syntax_flags)
{
b = s;
flags = syntax_flags;
RegExp e;
if (s.Length == 0)
{
e = MakeString("");
}
else
{
e = ParseUnionExp();
if (pos < b.Length)
{
throw new ArgumentException("end-of-string expected at position " + pos);
}
}
kind = e.kind;
exp1 = e.exp1;
exp2 = e.exp2;
this.s = e.s;
c = e.c;
min = e.min;
max = e.max;
digits = e.digits;
from = e.from;
to = e.to;
b = null;
}
/// <summary>
/// Constructs new <see cref="Automaton"/> from this <see cref="RegExp"/>. Same
/// as <c>ToAutomaton(null)</c> (empty automaton map).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public virtual Automaton ToAutomaton()
{
return ToAutomatonAllowMutate(null, null);
}
/// <summary>
/// Constructs new <see cref="Automaton"/> from this <see cref="RegExp"/>. The
/// constructed automaton is minimal and deterministic and has no transitions
/// to dead states.
/// </summary>
/// <param name="automaton_provider"> Provider of automata for named identifiers. </param>
/// <exception cref="ArgumentException"> If this regular expression uses a named
/// identifier that is not available from the automaton provider. </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public virtual Automaton ToAutomaton(IAutomatonProvider automaton_provider)
{
return ToAutomatonAllowMutate(null, automaton_provider);
}
/// <summary>
/// Constructs new <see cref="Automaton"/> from this <see cref="RegExp"/>. The
/// constructed automaton is minimal and deterministic and has no transitions
/// to dead states.
/// </summary>
/// <param name="automata"> A map from automaton identifiers to automata (of type
/// <see cref="Automaton"/>). </param>
/// <exception cref="ArgumentException"> If this regular expression uses a named
/// identifier that does not occur in the automaton map. </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public virtual Automaton ToAutomaton(IDictionary<string, Automaton> automata)
{
return ToAutomatonAllowMutate(automata, null);
}
/// <summary>
/// Sets or resets allow mutate flag. If this flag is set, then automata
/// construction uses mutable automata, which is slightly faster but not thread
/// safe. By default, the flag is not set.
/// </summary>
/// <param name="flag"> If <c>true</c>, the flag is set </param>
/// <returns> Previous value of the flag. </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public virtual bool SetAllowMutate(bool flag)
{
bool b = allow_mutation;
allow_mutation = flag;
return b;
}
private Automaton ToAutomatonAllowMutate(IDictionary<string, Automaton> automata, IAutomatonProvider automaton_provider)
{
bool b = false;
if (allow_mutation) // thread unsafe
{
b = Automaton.SetAllowMutate(true);
}
Automaton a = ToAutomaton(automata, automaton_provider);
if (allow_mutation)
{
Automaton.SetAllowMutate(b);
}
return a;
}
private Automaton ToAutomaton(IDictionary<string, Automaton> automata, IAutomatonProvider automaton_provider)
{
IList<Automaton> list;
Automaton a = null;
switch (kind)
{
case Kind.REGEXP_UNION:
list = new List<Automaton>();
FindLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
FindLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
a = BasicOperations.Union(list);
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_CONCATENATION:
list = new List<Automaton>();
FindLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata, automaton_provider);
FindLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata, automaton_provider);
a = BasicOperations.Concatenate(list);
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_INTERSECTION:
a = exp1.ToAutomaton(automata, automaton_provider).Intersection(exp2.ToAutomaton(automata, automaton_provider));
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_OPTIONAL:
a = exp1.ToAutomaton(automata, automaton_provider).Optional();
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_REPEAT:
a = exp1.ToAutomaton(automata, automaton_provider).Repeat();
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_REPEAT_MIN:
a = exp1.ToAutomaton(automata, automaton_provider).Repeat(min);
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_REPEAT_MINMAX:
a = exp1.ToAutomaton(automata, automaton_provider).Repeat(min, max);
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_COMPLEMENT:
a = exp1.ToAutomaton(automata, automaton_provider).Complement();
MinimizationOperations.Minimize(a);
break;
case Kind.REGEXP_CHAR:
a = BasicAutomata.MakeChar(c);
break;
case Kind.REGEXP_CHAR_RANGE:
a = BasicAutomata.MakeCharRange(from, to);
break;
case Kind.REGEXP_ANYCHAR:
a = BasicAutomata.MakeAnyChar();
break;
case Kind.REGEXP_EMPTY:
a = BasicAutomata.MakeEmpty();
break;
case Kind.REGEXP_STRING:
a = BasicAutomata.MakeString(s);
break;
case Kind.REGEXP_ANYSTRING:
a = BasicAutomata.MakeAnyString();
break;
case Kind.REGEXP_AUTOMATON:
Automaton aa = null;
if (automata != null)
{
aa = automata[s];
}
if (aa == null && automaton_provider != null)
{
try
{
aa = automaton_provider.GetAutomaton(s);
}
catch (IOException e)
{
throw new ArgumentException(e.ToString(), e);
}
}
if (aa == null)
{
throw new ArgumentException("'" + s + "' not found");
}
a = (Automaton)aa.Clone(); // always clone here (ignore allow_mutate)
break;
case Kind.REGEXP_INTERVAL:
a = BasicAutomata.MakeInterval(min, max, digits);
break;
}
return a;
}
private void FindLeaves(RegExp exp, Kind kind, IList<Automaton> list, IDictionary<string, Automaton> automata, IAutomatonProvider automaton_provider)
{
if (exp.kind == kind)
{
FindLeaves(exp.exp1, kind, list, automata, automaton_provider);
FindLeaves(exp.exp2, kind, list, automata, automaton_provider);
}
else
{
list.Add(exp.ToAutomaton(automata, automaton_provider));
}
}
/// <summary>
/// Constructs string from parsed regular expression.
/// </summary>
public override string ToString()
{
return ToStringBuilder(new StringBuilder()).ToString();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual StringBuilder ToStringBuilder(StringBuilder b)
{
switch (kind)
{
case Kind.REGEXP_UNION:
b.Append("(");
exp1.ToStringBuilder(b);
b.Append("|");
exp2.ToStringBuilder(b);
b.Append(")");
break;
case Kind.REGEXP_CONCATENATION:
exp1.ToStringBuilder(b);
exp2.ToStringBuilder(b);
break;
case Kind.REGEXP_INTERSECTION:
b.Append("(");
exp1.ToStringBuilder(b);
b.Append("&");
exp2.ToStringBuilder(b);
b.Append(")");
break;
case Kind.REGEXP_OPTIONAL:
b.Append("(");
exp1.ToStringBuilder(b);
b.Append(")?");
break;
case Kind.REGEXP_REPEAT:
b.Append("(");
exp1.ToStringBuilder(b);
b.Append(")*");
break;
case Kind.REGEXP_REPEAT_MIN:
b.Append("(");
exp1.ToStringBuilder(b);
b.Append("){").Append(min).Append(",}");
break;
case Kind.REGEXP_REPEAT_MINMAX:
b.Append("(");
exp1.ToStringBuilder(b);
b.Append("){").Append(min).Append(",").Append(max).Append("}");
break;
case Kind.REGEXP_COMPLEMENT:
b.Append("~(");
exp1.ToStringBuilder(b);
b.Append(")");
break;
case Kind.REGEXP_CHAR:
b.Append("\\").AppendCodePoint(c);
break;
case Kind.REGEXP_CHAR_RANGE:
b.Append("[\\").AppendCodePoint(from).Append("-\\").AppendCodePoint(to).Append("]");
break;
case Kind.REGEXP_ANYCHAR:
b.Append(".");
break;
case Kind.REGEXP_EMPTY:
b.Append("#");
break;
case Kind.REGEXP_STRING:
b.Append("\"").Append(s).Append("\"");
break;
case Kind.REGEXP_ANYSTRING:
b.Append("@");
break;
case Kind.REGEXP_AUTOMATON:
b.Append("<").Append(s).Append(">");
break;
case Kind.REGEXP_INTERVAL:
string s1 = Convert.ToString(min, CultureInfo.InvariantCulture);
string s2 = Convert.ToString(max, CultureInfo.InvariantCulture);
b.Append("<");
if (digits > 0)
{
for (int i = s1.Length; i < digits; i++)
{
b.Append('0');
}
}
b.Append(s1).Append("-");
if (digits > 0)
{
for (int i = s2.Length; i < digits; i++)
{
b.Append('0');
}
}
b.Append(s2).Append(">");
break;
}
return b;
}
/// <summary>
/// Returns set of automaton identifiers that occur in this regular expression.
/// </summary>
public virtual ISet<string> GetIdentifiers()
{
ISet<string> set = new JCG.HashSet<string>();
GetIdentifiers(set);
return set;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual void GetIdentifiers(ISet<string> set)
{
switch (kind)
{
case Kind.REGEXP_UNION:
case Kind.REGEXP_CONCATENATION:
case Kind.REGEXP_INTERSECTION:
exp1.GetIdentifiers(set);
exp2.GetIdentifiers(set);
break;
case Kind.REGEXP_OPTIONAL:
case Kind.REGEXP_REPEAT:
case Kind.REGEXP_REPEAT_MIN:
case Kind.REGEXP_REPEAT_MINMAX:
case Kind.REGEXP_COMPLEMENT:
exp1.GetIdentifiers(set);
break;
case Kind.REGEXP_AUTOMATON:
set.Add(s);
break;
default:
break;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeUnion(RegExp exp1, RegExp exp2)
{
return new RegExp
{
kind = Kind.REGEXP_UNION,
exp1 = exp1,
exp2 = exp2
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeConcatenation(RegExp exp1, RegExp exp2)
{
if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING))
{
return MakeString(exp1, exp2);
}
RegExp r = new RegExp
{
kind = Kind.REGEXP_CONCATENATION
};
if (exp1.kind == Kind.REGEXP_CONCATENATION && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING) && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING))
{
r.exp1 = exp1.exp1;
r.exp2 = MakeString(exp1.exp2, exp2);
}
else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) && exp2.kind == Kind.REGEXP_CONCATENATION && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING))
{
r.exp1 = MakeString(exp1, exp2.exp1);
r.exp2 = exp2.exp2;
}
else
{
r.exp1 = exp1;
r.exp2 = exp2;
}
return r;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static RegExp MakeString(RegExp exp1, RegExp exp2)
{
StringBuilder b = new StringBuilder();
if (exp1.kind == Kind.REGEXP_STRING)
{
b.Append(exp1.s);
}
else
{
b.AppendCodePoint(exp1.c);
}
if (exp2.kind == Kind.REGEXP_STRING)
{
b.Append(exp2.s);
}
else
{
b.AppendCodePoint(exp2.c);
}
return MakeString(b.ToString());
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeIntersection(RegExp exp1, RegExp exp2)
{
return new RegExp
{
kind = Kind.REGEXP_INTERSECTION,
exp1 = exp1,
exp2 = exp2
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeOptional(RegExp exp)
{
return new RegExp
{
kind = Kind.REGEXP_OPTIONAL,
exp1 = exp
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeRepeat(RegExp exp)
{
return new RegExp
{
kind = Kind.REGEXP_REPEAT,
exp1 = exp
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeRepeat(RegExp exp, int min)
{
return new RegExp
{
kind = Kind.REGEXP_REPEAT_MIN,
exp1 = exp,
min = min
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeRepeat(RegExp exp, int min, int max)
{
return new RegExp
{
kind = Kind.REGEXP_REPEAT_MINMAX,
exp1 = exp,
min = min,
max = max
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeComplement(RegExp exp)
{
return new RegExp
{
kind = Kind.REGEXP_COMPLEMENT,
exp1 = exp
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeChar(int c)
{
return new RegExp
{
kind = Kind.REGEXP_CHAR,
c = c
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeCharRange(int from, int to)
{
if (from > to)
{
throw new ArgumentException($"invalid range: from ({from}) cannot be > to ({to})");
}
return new RegExp
{
kind = Kind.REGEXP_CHAR_RANGE,
from = from,
to = to
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeAnyChar()
{
return new RegExp
{
kind = Kind.REGEXP_ANYCHAR
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeEmpty()
{
return new RegExp
{
kind = Kind.REGEXP_EMPTY
};
}
internal static RegExp MakeString(string s)
{
return new RegExp
{
kind = Kind.REGEXP_STRING,
s = s
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeAnyString()
{
return new RegExp
{
kind = Kind.REGEXP_ANYSTRING
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeAutomaton(string s)
{
return new RegExp
{
kind = Kind.REGEXP_AUTOMATON,
s = s
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static RegExp MakeInterval(int min, int max, int digits)
{
return new RegExp
{
kind = Kind.REGEXP_INTERVAL,
min = min,
max = max,
digits = digits
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool Peek(string s)
{
return More() && s.IndexOf(b.CodePointAt(pos)) != -1;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool Match(int c)
{
if (pos >= b.Length)
{
return false;
}
if (b.CodePointAt(pos) == c)
{
pos += Character.CharCount(c);
return true;
}
return false;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool More()
{
return pos < b.Length;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int Next()
{
if (!More())
{
throw new ArgumentException("unexpected end-of-string");
}
int ch = b.CodePointAt(pos);
pos += Character.CharCount(ch);
return ch;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool Check(RegExpSyntax flag)
{
return (flags & flag) != 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseUnionExp()
{
RegExp e = ParseInterExp();
if (Match('|'))
{
e = MakeUnion(e, ParseUnionExp());
}
return e;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseInterExp()
{
RegExp e = ParseConcatExp();
if (Check(RegExpSyntax.INTERSECTION) && Match('&'))
{
e = MakeIntersection(e, ParseInterExp());
}
return e;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseConcatExp()
{
RegExp e = ParseRepeatExp();
if (More() && !Peek(")|") && (!Check(RegExpSyntax.INTERSECTION) || !Peek("&")))
{
e = MakeConcatenation(e, ParseConcatExp());
}
return e;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseRepeatExp()
{
RegExp e = ParseComplExp();
while (Peek("?*+{"))
{
if (Match('?'))
{
e = MakeOptional(e);
}
else if (Match('*'))
{
e = MakeRepeat(e);
}
else if (Match('+'))
{
e = MakeRepeat(e, 1);
}
else if (Match('{'))
{
int start = pos;
while (Peek("0123456789"))
{
Next();
}
if (start == pos)
{
throw new ArgumentException("integer expected at position " + pos);
}
int n = Convert.ToInt32(b.Substring(start, pos - start), CultureInfo.InvariantCulture);
int m = -1;
if (Match(','))
{
start = pos;
while (Peek("0123456789"))
{
Next();
}
if (start != pos)
{
m = Convert.ToInt32(b.Substring(start, pos - start), CultureInfo.InvariantCulture);
}
}
else
{
m = n;
}
if (!Match('}'))
{
throw new ArgumentException("expected '}' at position " + pos);
}
if (m == -1)
{
e = MakeRepeat(e, n);
}
else
{
e = MakeRepeat(e, n, m);
}
}
}
return e;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseComplExp()
{
if (Check(RegExpSyntax.COMPLEMENT) && Match('~'))
{
return MakeComplement(ParseComplExp());
}
else
{
return ParseCharClassExp();
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseCharClassExp()
{
if (Match('['))
{
bool negate = false;
if (Match('^'))
{
negate = true;
}
RegExp e = ParseCharClasses();
if (negate)
{
e = MakeIntersection(MakeAnyChar(), MakeComplement(e));
}
if (!Match(']'))
{
throw new ArgumentException("expected ']' at position " + pos);
}
return e;
}
else
{
return ParseSimpleExp();
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseCharClasses()
{
RegExp e = ParseCharClass();
while (More() && !Peek("]"))
{
e = MakeUnion(e, ParseCharClass());
}
return e;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseCharClass()
{
int c = ParseCharExp();
if (Match('-'))
{
return MakeCharRange(c, ParseCharExp());
}
else
{
return MakeChar(c);
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal RegExp ParseSimpleExp()
{
if (Match('.'))
{
return MakeAnyChar();
}
else if (Check(RegExpSyntax.EMPTY) && Match('#'))
{
return MakeEmpty();
}
else if (Check(RegExpSyntax.ANYSTRING) && Match('@'))
{
return MakeAnyString();
}
else if (Match('"'))
{
int start = pos;
while (More() && !Peek("\""))
{
Next();
}
if (!Match('"'))
{
throw new ArgumentException("expected '\"' at position " + pos);
}
return MakeString(b.Substring(start, pos - 1 - start));
}
else if (Match('('))
{
if (Match(')'))
{
return MakeString("");
}
RegExp e = ParseUnionExp();
if (!Match(')'))
{
throw new ArgumentException("expected ')' at position " + pos);
}
return e;
}
else if ((Check(RegExpSyntax.AUTOMATON) || Check(RegExpSyntax.INTERVAL)) && Match('<'))
{
int start = pos;
while (More() && !Peek(">"))
{
Next();
}
if (!Match('>'))
{
throw new ArgumentException("expected '>' at position " + pos);
}
string s = b.Substring(start, pos - 1 - start);
int i = s.IndexOf('-');
if (i == -1)
{
if (!Check(RegExpSyntax.AUTOMATON))
{
throw new ArgumentException("interval syntax error at position " + (pos - 1));
}
return MakeAutomaton(s);
}
else
{
if (!Check(RegExpSyntax.INTERVAL))
{
throw new ArgumentException("illegal identifier at position " + (pos - 1));
}
try
{
if (i == 0 || i == s.Length - 1 || i != s.LastIndexOf('-'))
{
throw new FormatException();
}
string smin = s.Substring(0, i);
string smax = s.Substring(i + 1, s.Length - (i + 1));
int imin = Convert.ToInt32(smin, CultureInfo.InvariantCulture);
int imax = Convert.ToInt32(smax, CultureInfo.InvariantCulture);
int digits;
if (smin.Length == smax.Length)
{
digits = smin.Length;
}
else
{
digits = 0;
}
if (imin > imax)
{
int t = imin;
imin = imax;
imax = t;
}
return MakeInterval(imin, imax, digits);
}
#pragma warning disable 168
catch (FormatException e)
#pragma warning restore 168
{
throw new ArgumentException("interval syntax error at position " + (pos - 1), e);
}
}
}
else
{
return MakeChar(ParseCharExp());
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal int ParseCharExp()
{
Match('\\');
return Next();
}
}
}