src/Lucene.Net.QueryParser/Flexible/Standard/Parser/EscapeQuerySyntaxImpl.cs - lucenenet - Git at Google

 using J2N.Text;
 using Lucene.Net.QueryParsers.Flexible.Core.Messages;
 using Lucene.Net.QueryParsers.Flexible.Core.Parser;
 using Lucene.Net.QueryParsers.Flexible.Core.Util;
 using Lucene.Net.QueryParsers.Flexible.Messages;
 using System;
 using System.Globalization;
 using System.Text;

 namespace Lucene.Net.QueryParsers.Flexible.Standard.Parser
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Implementation of <see cref="IEscapeQuerySyntax"/> for the standard lucene
     /// syntax.
     /// <para/>
     /// <see cref="Escape(ICharSequence, CultureInfo, EscapeQuerySyntaxType)"/> prefixes
     /// syntax characters with a backslash; <see cref="DiscardEscapeChar(string)"/>
     /// strips escape characters from a string.
     /// </summary>
     public class EscapeQuerySyntax : IEscapeQuerySyntax
     {
         // Wildcard characters handed to UnescapedCharSequence.ToStringEscaped by Escape.
         private static readonly char[] wildcardChars = { '*', '?' };

         // Characters that are escaped only when they are the first character of a term.
         private static readonly string[] escapableTermExtraFirstChars = { "+", "-", "@" };

         // Characters that are escaped wherever they occur inside a term.
         private static readonly string[] escapableTermChars = { "\"", "<", ">", "=",
             "!", "(", ")", "^", "[", "{", ":", "]", "}", "~", "/" };

         // TODO: check what to do with these "*", "?", "\\"
         // Characters that are escaped inside a quoted string.
         private static readonly string[] escapableQuotedChars = { "\"" };

         // Whitespace characters that are escaped inside a term.
         private static readonly string[] escapableWhiteChars = { " ", "\t", "\n", "\r",
             "\f", "\b", "\u3000" };

         // Words that get a leading backslash when the whole term equals one of them
         // (compared ignoring case).
         private static readonly string[] escapableWordTokens = { "AND", "OR", "NOT",
             "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" };

         /// <summary>
         /// Prefixes every occurrence of a term-escapable character with a backslash and,
         /// if the result starts with <c>+</c>, <c>-</c> or <c>@</c>, prefixes that first
         /// character as well.
         /// </summary>
         /// <param name="str">text to escape; returned unchanged when null or empty</param>
         /// <param name="locale">locale used for lowercasing while searching</param>
         /// <returns>the escaped <see cref="ICharSequence"/></returns>
         private static ICharSequence EscapeChar(ICharSequence str, CultureInfo locale)
         {
             if (str == null || str.Length == 0)
                 return str;

             ICharSequence buffer = str;

             // regular escapable chars for terms
             foreach (string escapable in escapableTermChars)
             {
                 buffer = ReplaceIgnoreCase(buffer, locale.TextInfo.ToLower(escapable), "\\", locale);
             }

             // The first character of a term has additional escapable chars.
             // buffer cannot be empty here: str was non-empty and escaping only adds characters.
             char firstChar = buffer[0];
             foreach (string escapable in escapableTermExtraFirstChars)
             {
                 if (firstChar == escapable[0])
                 {
                     // backslash + first char + remainder is simply backslash + whole buffer
                     buffer = new StringCharSequence("\\" + buffer.ToString());
                     break;
                 }
             }

             return buffer;
         }

         /// <summary>
         /// Prefixes every character listed in <c>escapableQuotedChars</c> with a backslash.
         /// </summary>
         /// <param name="str">text to escape; returned unchanged when null or empty</param>
         /// <param name="locale">locale used for lowercasing while searching</param>
         /// <returns>the escaped <see cref="ICharSequence"/></returns>
         private static ICharSequence EscapeQuoted(ICharSequence str, CultureInfo locale)
         {
             if (str == null || str.Length == 0)
                 return str;

             ICharSequence buffer = str;

             // Iterate the quoted-char table itself. The previous code looped over its length
             // but read from escapableTermChars, which only worked because both start with "\"".
             foreach (string escapable in escapableQuotedChars)
             {
                 buffer = ReplaceIgnoreCase(buffer, locale.TextInfo.ToLower(escapable), "\\", locale);
             }
             return buffer;
         }

         /// <summary>
         /// Escapes a term: single syntax characters first, then whitespace characters, and
         /// finally the whole term if it equals one of the parser words (ignoring case).
         /// </summary>
         /// <param name="term">term to escape; returned unchanged when null</param>
         /// <param name="locale">locale used for lowercasing while searching</param>
         /// <returns>the escaped <see cref="ICharSequence"/></returns>
         private static ICharSequence EscapeTerm(ICharSequence term, CultureInfo locale)
         {
             if (term == null)
                 return term;

             // Escape single Chars
             term = EscapeChar(term, locale);
             term = EscapeWhiteChar(term, locale);

             // Escape Parser Words (the text is materialized once, not once per word)
             string termText = term.ToString();
             foreach (string word in escapableWordTokens)
             {
                 if (word.Equals(termText, StringComparison.OrdinalIgnoreCase))
                     return new StringCharSequence("\\" + termText);
             }
             return term;
         }

         /// <summary>
         /// Inserts <paramref name="escapeChar"/> in front of every occurrence of
         /// <paramref name="sequence1"/>, matching against the lowercased text.
         /// </summary>
         /// <param name="string">string to get replaced</param>
         /// <param name="sequence1">the old character sequence in lowercase</param>
         /// <param name="escapeChar">the new character to prefix sequence1 in return string.</param>
         /// <param name="locale">locale used to lowercase <paramref name="string"/> for matching</param>
         /// <returns>
         /// the new <see cref="ICharSequence"/>, or the <paramref name="string"/> instance
         /// itself when nothing was replaced
         /// </returns>
         /// <exception cref="ArgumentNullException">
         /// <paramref name="string"/>, <paramref name="sequence1"/> or
         /// <paramref name="escapeChar"/> is null
         /// </exception>
         private static ICharSequence ReplaceIgnoreCase(ICharSequence @string,
             string sequence1, string escapeChar, CultureInfo locale)
         {
             if (escapeChar == null)
                 throw new ArgumentNullException(nameof(escapeChar));
             if (sequence1 == null)
                 throw new ArgumentNullException(nameof(sequence1));
             if (@string == null)
                 throw new ArgumentNullException(nameof(@string));

             int count = @string.Length;
             int sequence1Length = sequence1.Length;

             // empty search sequence: emit escapeChar before, between and after all characters
             if (sequence1Length == 0)
             {
                 StringBuilder result2 = new StringBuilder(count + (count + 1) * escapeChar.Length);
                 result2.Append(escapeChar);
                 for (int i = 0; i < count; i++)
                 {
                     result2.Append(@string[i]);
                     result2.Append(escapeChar);
                 }
                 return result2.ToString().AsCharSequence();
             }

             // normal case
             // Materialize and lowercase the text once; both are loop-invariant.
             string source = @string.ToString();
             string lowered = locale.TextInfo.ToLower(source);

             StringBuilder result = new StringBuilder();
             char first = sequence1[0];
             int start = 0, copyStart = 0;
             while (start < count)
             {
                 int firstIndex = lowered.IndexOf(first, start);
                 if (firstIndex == -1)
                     break;

                 bool found = true;
                 if (sequence1Length > 1)
                 {
                     // not enough characters left for a full match
                     if (firstIndex + sequence1Length > count)
                         break;
                     for (int i = 1; i < sequence1Length; i++)
                     {
                         if (lowered[firstIndex + i] != sequence1[i])
                         {
                             found = false;
                             break;
                         }
                     }
                 }

                 if (found)
                 {
                     // copy the untouched part, then the escape char, then the match in its
                     // original casing
                     result.Append(source, copyStart, firstIndex - copyStart);
                     result.Append(escapeChar);
                     result.Append(source, firstIndex, sequence1Length);
                     copyStart = start = firstIndex + sequence1Length;
                 }
                 else
                 {
                     start = firstIndex + 1;
                 }
             }

             // nothing replaced: hand back the caller's instance
             if (result.Length == 0 && copyStart == 0)
                 return @string;
             result.Append(source, copyStart, source.Length - copyStart);
             return result.ToString().AsCharSequence();
         }

         /// <summary>
         /// Prefixes every whitespace character listed in <c>escapableWhiteChars</c> with a
         /// backslash.
         /// </summary>
         /// <param name="str">string to get replaced; returned unchanged when null or empty</param>
         /// <param name="locale">locale to be used when performing string compares</param>
         /// <returns>the new <see cref="ICharSequence"/></returns>
         private static ICharSequence EscapeWhiteChar(ICharSequence str,
             CultureInfo locale)
         {
             if (str == null || str.Length == 0)
                 return str;

             ICharSequence buffer = str;

             foreach (string escapable in escapableWhiteChars)
             {
                 buffer = ReplaceIgnoreCase(buffer, locale.TextInfo.ToLower(escapable), "\\", locale);
             }
             return buffer;
         }

         // LUCENENET specific overload for text as string
         /// <summary>
         /// Escapes <paramref name="text"/> by delegating to
         /// <see cref="Escape(ICharSequence, CultureInfo, EscapeQuerySyntaxType)"/>.
         /// </summary>
         /// <param name="text">text to escape; returned unchanged when null or empty</param>
         /// <param name="locale">locale used for lowercasing while searching</param>
         /// <param name="type">
         /// <see cref="EscapeQuerySyntaxType.STRING"/> for quoted-string escaping; any other
         /// value selects term escaping
         /// </param>
         /// <returns>the escaped text</returns>
         public virtual string Escape(string text, CultureInfo locale, EscapeQuerySyntaxType type)
         {
             if (text == null || text.Length == 0)
                 return text;

             return Escape(text.AsCharSequence(), locale, type).ToString();
         }

         /// <summary>
         /// Escapes <paramref name="text"/> for use in a query.
         /// </summary>
         /// <param name="text">text to escape; returned unchanged when null or empty</param>
         /// <param name="locale">locale used for lowercasing while searching</param>
         /// <param name="type">
         /// <see cref="EscapeQuerySyntaxType.STRING"/> for quoted-string escaping; any other
         /// value selects term escaping
         /// </param>
         /// <returns>the escaped <see cref="ICharSequence"/></returns>
         public virtual ICharSequence Escape(ICharSequence text, CultureInfo locale, EscapeQuerySyntaxType type)
         {
             if (text == null || text.Length == 0)
                 return text;

             // escape wildcards and the escape char (this has to be performed before
             // anything else)
             // since we need to preserve the UnescapedCharSequence and escape the
             // original escape chars
             UnescapedCharSequence unescaped = text as UnescapedCharSequence
                 ?? new UnescapedCharSequence(text);
             text = unescaped.ToStringEscaped(wildcardChars);

             if (type == EscapeQuerySyntaxType.STRING)
             {
                 return EscapeQuoted(text, locale);
             }

             return EscapeTerm(text, locale);
         }

         /// <summary>
         /// Returns a string where the escape char has been removed, or kept only once
         /// if there was a double escape.
         /// <para/>
         /// Supports escaped unicode characters, e. g. translates <c>\u0041</c> to
         /// <c>A</c>.
         /// </summary>
         /// <param name="input">the escaped text</param>
         /// <returns>
         /// the unescaped characters together with a flag per character telling whether it
         /// was preceded by an escape char (characters decoded from a unicode escape are
         /// not flagged)
         /// </returns>
         /// <exception cref="ParseException">
         /// the input ends inside a unicode escape, ends with a lone escape char, or a
         /// unicode escape contains a non-hexadecimal digit
         /// </exception>
         public static UnescapedCharSequence DiscardEscapeChar(string input)
         {
             // Create char array to hold unescaped char sequence
             char[] output = new char[input.Length];
             bool[] wasEscaped = new bool[input.Length];

             // The length of the output can be less than the input
             // due to discarded escape chars. This variable holds
             // the actual length of the output
             int length = 0;

             // We remember whether the last processed character was
             // an escape character
             bool lastCharWasEscapeChar = false;

             // The multiplier the current unicode digit must be multiplied with.
             // E. g. the first digit must be multiplied with 16^3, the second with
             // 16^2...
             int codePointMultiplier = 0;

             // Used to calculate the codepoint of the escaped unicode character
             int codePoint = 0;

             foreach (char curChar in input)
             {
                 if (codePointMultiplier > 0)
                 {
                     // inside a unicode escape: accumulate the next hex digit
                     codePoint += HexToInt32(curChar) * codePointMultiplier;
                     // the multiplier is always positive here, so a plain shift is enough
                     codePointMultiplier >>= 4;
                     if (codePointMultiplier == 0)
                     {
                         output[length++] = (char)codePoint;
                         codePoint = 0;
                     }
                 }
                 else if (lastCharWasEscapeChar)
                 {
                     if (curChar == 'u')
                     {
                         // found an escaped unicode character
                         codePointMultiplier = 16 * 16 * 16;
                     }
                     else
                     {
                         // this character was escaped
                         output[length] = curChar;
                         wasEscaped[length] = true;
                         length++;
                     }
                     lastCharWasEscapeChar = false;
                 }
                 else if (curChar == '\\')
                 {
                     lastCharWasEscapeChar = true;
                 }
                 else
                 {
                     output[length] = curChar;
                     length++;
                 }
             }

             if (codePointMultiplier > 0)
             {
                 throw new ParseException(new Message(
                     QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION));
             }

             if (lastCharWasEscapeChar)
             {
                 throw new ParseException(new Message(
                     QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER));
             }

             return new UnescapedCharSequence(output, wasEscaped, 0, length);
         }

         /// <summary>
         /// Returns the numeric value of the hexadecimal character
         /// <para/>
         /// NOTE: This was hexToInt() in Lucene
         /// </summary>
         /// <param name="c">a character in <c>0-9</c>, <c>a-f</c> or <c>A-F</c></param>
         /// <returns>the value of the digit, between 0 and 15</returns>
         /// <exception cref="ParseException"><paramref name="c"/> is not a hexadecimal digit</exception>
         private static int HexToInt32(char c)
         {
             if ('0' <= c && c <= '9')
             {
                 return c - '0';
             }
             if ('a' <= c && c <= 'f')
             {
                 return c - 'a' + 10;
             }
             if ('A' <= c && c <= 'F')
             {
                 return c - 'A' + 10;
             }

             throw new ParseException(new Message(
                 QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c));
         }
     }
 }
	using J2N.Text;
	using Lucene.Net.QueryParsers.Flexible.Core.Messages;
	using Lucene.Net.QueryParsers.Flexible.Core.Parser;
	using Lucene.Net.QueryParsers.Flexible.Core.Util;
	using Lucene.Net.QueryParsers.Flexible.Messages;
	using System;
	using System.Globalization;
	using System.Text;

	namespace Lucene.Net.QueryParsers.Flexible.Standard.Parser
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Implementation of <see cref="IEscapeQuerySyntax"/> for the standard lucene
	/// syntax.
	/// <para/>
	/// <see cref="Escape(ICharSequence, CultureInfo, EscapeQuerySyntaxType)"/> prefixes
	/// syntax characters with a backslash; <see cref="DiscardEscapeChar(string)"/>
	/// strips escape characters from a string.
	/// </summary>
	public class EscapeQuerySyntax : IEscapeQuerySyntax
	{
	    // Wildcard characters handed to UnescapedCharSequence.ToStringEscaped by Escape.
	    private static readonly char[] wildcardChars = { '*', '?' };

	    // Characters that are escaped only when they are the first character of a term.
	    private static readonly string[] escapableTermExtraFirstChars = { "+", "-", "@" };

	    // Characters that are escaped wherever they occur inside a term.
	    private static readonly string[] escapableTermChars = { "\"", "<", ">", "=",
	        "!", "(", ")", "^", "[", "{", ":", "]", "}", "~", "/" };

	    // TODO: check what to do with these "*", "?", "\\"
	    // Characters that are escaped inside a quoted string.
	    private static readonly string[] escapableQuotedChars = { "\"" };

	    // Whitespace characters that are escaped inside a term.
	    private static readonly string[] escapableWhiteChars = { " ", "\t", "\n", "\r",
	        "\f", "\b", "\u3000" };

	    // Words that get a leading backslash when the whole term equals one of them
	    // (compared ignoring case).
	    private static readonly string[] escapableWordTokens = { "AND", "OR", "NOT",
	        "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" };

	    /// <summary>
	    /// Prefixes every occurrence of a term-escapable character with a backslash and,
	    /// if the result starts with <c>+</c>, <c>-</c> or <c>@</c>, prefixes that first
	    /// character as well.
	    /// </summary>
	    /// <param name="str">text to escape; returned unchanged when null or empty</param>
	    /// <param name="locale">locale used for lowercasing while searching</param>
	    /// <returns>the escaped <see cref="ICharSequence"/></returns>
	    private static ICharSequence EscapeChar(ICharSequence str, CultureInfo locale)
	    {
	        if (str == null || str.Length == 0)
	            return str;

	        ICharSequence buffer = str;

	        // regular escapable chars for terms
	        foreach (string escapable in escapableTermChars)
	        {
	            buffer = ReplaceIgnoreCase(buffer, locale.TextInfo.ToLower(escapable), "\\", locale);
	        }

	        // The first character of a term has additional escapable chars.
	        // buffer cannot be empty here: str was non-empty and escaping only adds characters.
	        char firstChar = buffer[0];
	        foreach (string escapable in escapableTermExtraFirstChars)
	        {
	            if (firstChar == escapable[0])
	            {
	                // backslash + first char + remainder is simply backslash + whole buffer
	                buffer = new StringCharSequence("\\" + buffer.ToString());
	                break;
	            }
	        }

	        return buffer;
	    }

	    /// <summary>
	    /// Prefixes every character listed in <c>escapableQuotedChars</c> with a backslash.
	    /// </summary>
	    /// <param name="str">text to escape; returned unchanged when null or empty</param>
	    /// <param name="locale">locale used for lowercasing while searching</param>
	    /// <returns>the escaped <see cref="ICharSequence"/></returns>
	    private static ICharSequence EscapeQuoted(ICharSequence str, CultureInfo locale)
	    {
	        if (str == null || str.Length == 0)
	            return str;

	        ICharSequence buffer = str;

	        // Iterate the quoted-char table itself. The previous code looped over its length
	        // but read from escapableTermChars, which only worked because both start with "\"".
	        foreach (string escapable in escapableQuotedChars)
	        {
	            buffer = ReplaceIgnoreCase(buffer, locale.TextInfo.ToLower(escapable), "\\", locale);
	        }
	        return buffer;
	    }

	    /// <summary>
	    /// Escapes a term: single syntax characters first, then whitespace characters, and
	    /// finally the whole term if it equals one of the parser words (ignoring case).
	    /// </summary>
	    /// <param name="term">term to escape; returned unchanged when null</param>
	    /// <param name="locale">locale used for lowercasing while searching</param>
	    /// <returns>the escaped <see cref="ICharSequence"/></returns>
	    private static ICharSequence EscapeTerm(ICharSequence term, CultureInfo locale)
	    {
	        if (term == null)
	            return term;

	        // Escape single Chars
	        term = EscapeChar(term, locale);
	        term = EscapeWhiteChar(term, locale);

	        // Escape Parser Words (the text is materialized once, not once per word)
	        string termText = term.ToString();
	        foreach (string word in escapableWordTokens)
	        {
	            if (word.Equals(termText, StringComparison.OrdinalIgnoreCase))
	                return new StringCharSequence("\\" + termText);
	        }
	        return term;
	    }

	    /// <summary>
	    /// Inserts <paramref name="escapeChar"/> in front of every occurrence of
	    /// <paramref name="sequence1"/>, matching against the lowercased text.
	    /// </summary>
	    /// <param name="string">string to get replaced</param>
	    /// <param name="sequence1">the old character sequence in lowercase</param>
	    /// <param name="escapeChar">the new character to prefix sequence1 in return string.</param>
	    /// <param name="locale">locale used to lowercase <paramref name="string"/> for matching</param>
	    /// <returns>
	    /// the new <see cref="ICharSequence"/>, or the <paramref name="string"/> instance
	    /// itself when nothing was replaced
	    /// </returns>
	    /// <exception cref="ArgumentNullException">
	    /// <paramref name="string"/>, <paramref name="sequence1"/> or
	    /// <paramref name="escapeChar"/> is null
	    /// </exception>
	    private static ICharSequence ReplaceIgnoreCase(ICharSequence @string,
	        string sequence1, string escapeChar, CultureInfo locale)
	    {
	        if (escapeChar == null)
	            throw new ArgumentNullException(nameof(escapeChar));
	        if (sequence1 == null)
	            throw new ArgumentNullException(nameof(sequence1));
	        if (@string == null)
	            throw new ArgumentNullException(nameof(@string));

	        int count = @string.Length;
	        int sequence1Length = sequence1.Length;

	        // empty search sequence: emit escapeChar before, between and after all characters
	        if (sequence1Length == 0)
	        {
	            StringBuilder result2 = new StringBuilder(count + (count + 1) * escapeChar.Length);
	            result2.Append(escapeChar);
	            for (int i = 0; i < count; i++)
	            {
	                result2.Append(@string[i]);
	                result2.Append(escapeChar);
	            }
	            return result2.ToString().AsCharSequence();
	        }

	        // normal case
	        // Materialize and lowercase the text once; both are loop-invariant.
	        string source = @string.ToString();
	        string lowered = locale.TextInfo.ToLower(source);

	        StringBuilder result = new StringBuilder();
	        char first = sequence1[0];
	        int start = 0, copyStart = 0;
	        while (start < count)
	        {
	            int firstIndex = lowered.IndexOf(first, start);
	            if (firstIndex == -1)
	                break;

	            bool found = true;
	            if (sequence1Length > 1)
	            {
	                // not enough characters left for a full match
	                if (firstIndex + sequence1Length > count)
	                    break;
	                for (int i = 1; i < sequence1Length; i++)
	                {
	                    if (lowered[firstIndex + i] != sequence1[i])
	                    {
	                        found = false;
	                        break;
	                    }
	                }
	            }

	            if (found)
	            {
	                // copy the untouched part, then the escape char, then the match in its
	                // original casing
	                result.Append(source, copyStart, firstIndex - copyStart);
	                result.Append(escapeChar);
	                result.Append(source, firstIndex, sequence1Length);
	                copyStart = start = firstIndex + sequence1Length;
	            }
	            else
	            {
	                start = firstIndex + 1;
	            }
	        }

	        // nothing replaced: hand back the caller's instance
	        if (result.Length == 0 && copyStart == 0)
	            return @string;
	        result.Append(source, copyStart, source.Length - copyStart);
	        return result.ToString().AsCharSequence();
	    }

	    /// <summary>
	    /// Prefixes every whitespace character listed in <c>escapableWhiteChars</c> with a
	    /// backslash.
	    /// </summary>
	    /// <param name="str">string to get replaced; returned unchanged when null or empty</param>
	    /// <param name="locale">locale to be used when performing string compares</param>
	    /// <returns>the new <see cref="ICharSequence"/></returns>
	    private static ICharSequence EscapeWhiteChar(ICharSequence str,
	        CultureInfo locale)
	    {
	        if (str == null || str.Length == 0)
	            return str;

	        ICharSequence buffer = str;

	        foreach (string escapable in escapableWhiteChars)
	        {
	            buffer = ReplaceIgnoreCase(buffer, locale.TextInfo.ToLower(escapable), "\\", locale);
	        }
	        return buffer;
	    }

	    // LUCENENET specific overload for text as string
	    /// <summary>
	    /// Escapes <paramref name="text"/> by delegating to
	    /// <see cref="Escape(ICharSequence, CultureInfo, EscapeQuerySyntaxType)"/>.
	    /// </summary>
	    /// <param name="text">text to escape; returned unchanged when null or empty</param>
	    /// <param name="locale">locale used for lowercasing while searching</param>
	    /// <param name="type">
	    /// <see cref="EscapeQuerySyntaxType.STRING"/> for quoted-string escaping; any other
	    /// value selects term escaping
	    /// </param>
	    /// <returns>the escaped text</returns>
	    public virtual string Escape(string text, CultureInfo locale, EscapeQuerySyntaxType type)
	    {
	        if (text == null || text.Length == 0)
	            return text;

	        return Escape(text.AsCharSequence(), locale, type).ToString();
	    }

	    /// <summary>
	    /// Escapes <paramref name="text"/> for use in a query.
	    /// </summary>
	    /// <param name="text">text to escape; returned unchanged when null or empty</param>
	    /// <param name="locale">locale used for lowercasing while searching</param>
	    /// <param name="type">
	    /// <see cref="EscapeQuerySyntaxType.STRING"/> for quoted-string escaping; any other
	    /// value selects term escaping
	    /// </param>
	    /// <returns>the escaped <see cref="ICharSequence"/></returns>
	    public virtual ICharSequence Escape(ICharSequence text, CultureInfo locale, EscapeQuerySyntaxType type)
	    {
	        if (text == null || text.Length == 0)
	            return text;

	        // escape wildcards and the escape char (this has to be performed before
	        // anything else)
	        // since we need to preserve the UnescapedCharSequence and escape the
	        // original escape chars
	        UnescapedCharSequence unescaped = text as UnescapedCharSequence
	            ?? new UnescapedCharSequence(text);
	        text = unescaped.ToStringEscaped(wildcardChars);

	        if (type == EscapeQuerySyntaxType.STRING)
	        {
	            return EscapeQuoted(text, locale);
	        }

	        return EscapeTerm(text, locale);
	    }

	    /// <summary>
	    /// Returns a string where the escape char has been removed, or kept only once
	    /// if there was a double escape.
	    /// <para/>
	    /// Supports escaped unicode characters, e. g. translates <c>\u0041</c> to
	    /// <c>A</c>.
	    /// </summary>
	    /// <param name="input">the escaped text</param>
	    /// <returns>
	    /// the unescaped characters together with a flag per character telling whether it
	    /// was preceded by an escape char (characters decoded from a unicode escape are
	    /// not flagged)
	    /// </returns>
	    /// <exception cref="ParseException">
	    /// the input ends inside a unicode escape, ends with a lone escape char, or a
	    /// unicode escape contains a non-hexadecimal digit
	    /// </exception>
	    public static UnescapedCharSequence DiscardEscapeChar(string input)
	    {
	        // Create char array to hold unescaped char sequence
	        char[] output = new char[input.Length];
	        bool[] wasEscaped = new bool[input.Length];

	        // The length of the output can be less than the input
	        // due to discarded escape chars. This variable holds
	        // the actual length of the output
	        int length = 0;

	        // We remember whether the last processed character was
	        // an escape character
	        bool lastCharWasEscapeChar = false;

	        // The multiplier the current unicode digit must be multiplied with.
	        // E. g. the first digit must be multiplied with 16^3, the second with
	        // 16^2...
	        int codePointMultiplier = 0;

	        // Used to calculate the codepoint of the escaped unicode character
	        int codePoint = 0;

	        foreach (char curChar in input)
	        {
	            if (codePointMultiplier > 0)
	            {
	                // inside a unicode escape: accumulate the next hex digit
	                codePoint += HexToInt32(curChar) * codePointMultiplier;
	                // the multiplier is always positive here, so a plain shift is enough
	                codePointMultiplier >>= 4;
	                if (codePointMultiplier == 0)
	                {
	                    output[length++] = (char)codePoint;
	                    codePoint = 0;
	                }
	            }
	            else if (lastCharWasEscapeChar)
	            {
	                if (curChar == 'u')
	                {
	                    // found an escaped unicode character
	                    codePointMultiplier = 16 * 16 * 16;
	                }
	                else
	                {
	                    // this character was escaped
	                    output[length] = curChar;
	                    wasEscaped[length] = true;
	                    length++;
	                }
	                lastCharWasEscapeChar = false;
	            }
	            else if (curChar == '\\')
	            {
	                lastCharWasEscapeChar = true;
	            }
	            else
	            {
	                output[length] = curChar;
	                length++;
	            }
	        }

	        if (codePointMultiplier > 0)
	        {
	            throw new ParseException(new Message(
	                QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION));
	        }

	        if (lastCharWasEscapeChar)
	        {
	            throw new ParseException(new Message(
	                QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER));
	        }

	        return new UnescapedCharSequence(output, wasEscaped, 0, length);
	    }

	    /// <summary>
	    /// Returns the numeric value of the hexadecimal character
	    /// <para/>
	    /// NOTE: This was hexToInt() in Lucene
	    /// </summary>
	    /// <param name="c">a character in <c>0-9</c>, <c>a-f</c> or <c>A-F</c></param>
	    /// <returns>the value of the digit, between 0 and 15</returns>
	    /// <exception cref="ParseException"><paramref name="c"/> is not a hexadecimal digit</exception>
	    private static int HexToInt32(char c)
	    {
	        if ('0' <= c && c <= '9')
	        {
	            return c - '0';
	        }
	        if ('a' <= c && c <= 'f')
	        {
	            return c - 'a' + 10;
	        }
	        if ('A' <= c && c <= 'F')
	        {
	            return c - 'A' + 10;
	        }

	        throw new ParseException(new Message(
	            QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c));
	    }
	}
	}