| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.queryparser.flexible.standard.parser; |
| |
| import java.util.Locale; |
| |
| import org.apache.lucene.queryparser.flexible.messages.MessageImpl; |
| import org.apache.lucene.queryparser.flexible.core.messages.QueryParserMessages; |
| import org.apache.lucene.queryparser.flexible.core.parser.EscapeQuerySyntax; |
| import org.apache.lucene.queryparser.flexible.core.util.UnescapedCharSequence; |
| |
| /** |
| * Implementation of {@link EscapeQuerySyntax} for the standard lucene |
| * syntax. |
| */ |
| public class EscapeQuerySyntaxImpl implements EscapeQuerySyntax { |
| |
| private static final char[] wildcardChars = { '*', '?' }; |
| |
| private static final String[] escapableTermExtraFirstChars = { "+", "-", "@" }; |
| |
| private static final String[] escapableTermChars = { "\"", "<", ">", "=", |
| "!", "(", ")", "^", "[", "{", ":", "]", "}", "~", "/" }; |
| |
| // TODO: check what to do with these "*", "?", "\\" |
| private static final String[] escapableQuotedChars = { "\"" }; |
| private static final String[] escapableWhiteChars = { " ", "\t", "\n", "\r", |
| "\f", "\b", "\u3000" }; |
| private static final String[] escapableWordTokens = { "AND", "OR", "NOT", |
| "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" }; |
| |
| private static final CharSequence escapeChar(CharSequence str, Locale locale) { |
| if (str == null || str.length() == 0) |
| return str; |
| |
| CharSequence buffer = str; |
| |
| // regular escapable Char for terms |
| for (int i = 0; i < escapableTermChars.length; i++) { |
| buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(locale), |
| "\\", locale); |
| } |
| |
| // First Character of a term as more escaping chars |
| for (int i = 0; i < escapableTermExtraFirstChars.length; i++) { |
| if (buffer.charAt(0) == escapableTermExtraFirstChars[i].charAt(0)) { |
| buffer = "\\" + buffer.charAt(0) |
| + buffer.subSequence(1, buffer.length()); |
| break; |
| } |
| } |
| |
| return buffer; |
| } |
| |
| private final CharSequence escapeQuoted(CharSequence str, Locale locale) { |
| if (str == null || str.length() == 0) |
| return str; |
| |
| CharSequence buffer = str; |
| |
| for (int i = 0; i < escapableQuotedChars.length; i++) { |
| buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(locale), |
| "\\", locale); |
| } |
| return buffer; |
| } |
| |
| private static final CharSequence escapeTerm(CharSequence term, Locale locale) { |
| if (term == null) |
| return term; |
| |
| // Escape single Chars |
| term = escapeChar(term, locale); |
| term = escapeWhiteChar(term, locale); |
| |
| // Escape Parser Words |
| for (int i = 0; i < escapableWordTokens.length; i++) { |
| if (escapableWordTokens[i].equalsIgnoreCase(term.toString())) |
| return "\\" + term; |
| } |
| return term; |
| } |
| |
| /** |
| * replace with ignore case |
| * |
| * @param string |
| * string to get replaced |
| * @param sequence1 |
| * the old character sequence in lowercase |
| * @param escapeChar |
| * the new character to prefix sequence1 in return string. |
| * @return the new String |
| */ |
| private static CharSequence replaceIgnoreCase(CharSequence string, |
| CharSequence sequence1, CharSequence escapeChar, Locale locale) { |
| if (escapeChar == null || sequence1 == null || string == null) |
| throw new NullPointerException(); |
| |
| // empty string case |
| int count = string.length(); |
| int sequence1Length = sequence1.length(); |
| if (sequence1Length == 0) { |
| StringBuilder result = new StringBuilder((count + 1) |
| * escapeChar.length()); |
| result.append(escapeChar); |
| for (int i = 0; i < count; i++) { |
| result.append(string.charAt(i)); |
| result.append(escapeChar); |
| } |
| return result.toString(); |
| } |
| |
| // normal case |
| StringBuilder result = new StringBuilder(); |
| char first = sequence1.charAt(0); |
| int start = 0, copyStart = 0, firstIndex; |
| while (start < count) { |
| if ((firstIndex = string.toString().toLowerCase(locale).indexOf(first, |
| start)) == -1) |
| break; |
| boolean found = true; |
| if (sequence1.length() > 1) { |
| if (firstIndex + sequence1Length > count) |
| break; |
| for (int i = 1; i < sequence1Length; i++) { |
| if (string.toString().toLowerCase(locale).charAt(firstIndex + i) != sequence1 |
| .charAt(i)) { |
| found = false; |
| break; |
| } |
| } |
| } |
| if (found) { |
| result.append(string.toString().substring(copyStart, firstIndex)); |
| result.append(escapeChar); |
| result.append(string.toString().substring(firstIndex, |
| firstIndex + sequence1Length)); |
| copyStart = start = firstIndex + sequence1Length; |
| } else { |
| start = firstIndex + 1; |
| } |
| } |
| if (result.length() == 0 && copyStart == 0) |
| return string; |
| result.append(string.toString().substring(copyStart)); |
| return result.toString(); |
| } |
| |
| /** |
| * escape all tokens that are part of the parser syntax on a given string |
| * |
| * @param str |
| * string to get replaced |
| * @param locale |
| * locale to be used when performing string compares |
| * @return the new String |
| */ |
| private static final CharSequence escapeWhiteChar(CharSequence str, |
| Locale locale) { |
| if (str == null || str.length() == 0) |
| return str; |
| |
| CharSequence buffer = str; |
| |
| for (int i = 0; i < escapableWhiteChars.length; i++) { |
| buffer = replaceIgnoreCase(buffer, escapableWhiteChars[i].toLowerCase(locale), |
| "\\", locale); |
| } |
| return buffer; |
| } |
| |
| @Override |
| public CharSequence escape(CharSequence text, Locale locale, Type type) { |
| if (text == null || text.length() == 0) |
| return text; |
| |
| // escape wildcards and the escape char (this has to be perform before |
| // anything else) |
| // since we need to preserve the UnescapedCharSequence and escape the |
| // original escape chars |
| if (text instanceof UnescapedCharSequence) { |
| text = ((UnescapedCharSequence) text).toStringEscaped(wildcardChars); |
| } else { |
| text = new UnescapedCharSequence(text).toStringEscaped(wildcardChars); |
| } |
| |
| if (type == Type.STRING) { |
| return escapeQuoted(text, locale); |
| } else { |
| return escapeTerm(text, locale); |
| } |
| } |
| |
| /** |
| * Returns a String where the escape char has been removed, or kept only once |
| * if there was a double escape. |
| * |
| * Supports escaped unicode characters, e. g. translates <code>A</code> to |
| * <code>A</code>. |
| * |
| */ |
| public static UnescapedCharSequence discardEscapeChar(CharSequence input) |
| throws ParseException { |
| // Create char array to hold unescaped char sequence |
| char[] output = new char[input.length()]; |
| boolean[] wasEscaped = new boolean[input.length()]; |
| |
| // The length of the output can be less than the input |
| // due to discarded escape chars. This variable holds |
| // the actual length of the output |
| int length = 0; |
| |
| // We remember whether the last processed character was |
| // an escape character |
| boolean lastCharWasEscapeChar = false; |
| |
| // The multiplier the current unicode digit must be multiplied with. |
| // E. g. the first digit must be multiplied with 16^3, the second with |
| // 16^2... |
| int codePointMultiplier = 0; |
| |
| // Used to calculate the codepoint of the escaped unicode character |
| int codePoint = 0; |
| |
| for (int i = 0; i < input.length(); i++) { |
| char curChar = input.charAt(i); |
| if (codePointMultiplier > 0) { |
| codePoint += hexToInt(curChar) * codePointMultiplier; |
| codePointMultiplier >>>= 4; |
| if (codePointMultiplier == 0) { |
| output[length++] = (char) codePoint; |
| codePoint = 0; |
| } |
| } else if (lastCharWasEscapeChar) { |
| if (curChar == 'u') { |
| // found an escaped unicode character |
| codePointMultiplier = 16 * 16 * 16; |
| } else { |
| // this character was escaped |
| output[length] = curChar; |
| wasEscaped[length] = true; |
| length++; |
| } |
| lastCharWasEscapeChar = false; |
| } else { |
| if (curChar == '\\') { |
| lastCharWasEscapeChar = true; |
| } else { |
| output[length] = curChar; |
| length++; |
| } |
| } |
| } |
| |
| if (codePointMultiplier > 0) { |
| throw new ParseException(new MessageImpl( |
| QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION)); |
| } |
| |
| if (lastCharWasEscapeChar) { |
| throw new ParseException(new MessageImpl( |
| QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER)); |
| } |
| |
| return new UnescapedCharSequence(output, wasEscaped, 0, length); |
| } |
| |
| /** Returns the numeric value of the hexadecimal character */ |
| private static final int hexToInt(char c) throws ParseException { |
| if ('0' <= c && c <= '9') { |
| return c - '0'; |
| } else if ('a' <= c && c <= 'f') { |
| return c - 'a' + 10; |
| } else if ('A' <= c && c <= 'F') { |
| return c - 'A' + 10; |
| } else { |
| throw new ParseException(new MessageImpl( |
| QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c)); |
| } |
| } |
| |
| } |