// blob: a7f63cd5a71d131839020fd8df9c92a77b0bc042 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.expr.fn.impl;
/**
* Code copied from optiq (https://github.com/julianhyde/optiq).
* Thanks go to Julian Hyde and the other contributors to optiq.
*/
/**
 * Utilities for converting SQL {@code LIKE} and {@code SIMILAR} patterns
 * to Java regular expressions.
 *
 * <p>Throughout this class an escape character of {@code (char) 0} stands for
 * "no escape character".
 */
public class RegexpUtil {
  // Characters that are special in java.util.regex and need a leading backslash to be
  // matched literally. '.' is part of the set so that a literal dot coming from a SQL
  // pattern is not turned into the regex "any character" wildcard.
  private static final String JAVA_REGEX_SPECIALS = "[]()|^-+*?{}$\\.";

  // Characters that are special inside a SQL SIMILAR pattern.
  private static final String SQL_SIMILAR_SPECIALS = "[]()|^-+*_%?{}";

  // Flattened pairs: SQL character-class identifier at even index, Java regex
  // replacement at the following odd index.
  private static final String[] REG_CHAR_CLASSES = {
    "[:ALPHA:]", "\\p{Alpha}",
    "[:alpha:]", "\\p{Alpha}",
    "[:UPPER:]", "\\p{Upper}",
    "[:upper:]", "\\p{Upper}",
    "[:LOWER:]", "\\p{Lower}",
    "[:lower:]", "\\p{Lower}",
    "[:DIGIT:]", "\\d",
    "[:digit:]", "\\d",
    "[:SPACE:]", " ",
    "[:space:]", " ",
    "[:WHITESPACE:]", "\\s",
    "[:whitespace:]", "\\s",
    "[:ALNUM:]", "\\p{Alnum}",
    "[:alnum:]", "\\p{Alnum}"
  };

  /** Classification of a LIKE pattern, computed by {@code sqlToRegexLike}. */
  public enum SqlPatternType {
    STARTS_WITH, // Starts with a constant string followed by any string values (ABC%)
    ENDS_WITH,   // Ends with a constant string, starts with any string values (%ABC)
    CONTAINS,    // Contains a constant string, starts and ends with any string values (%ABC%)
    CONSTANT,    // Is a constant string (ABC)
    COMPLEX      // Not a simple pattern. Needs regex evaluation.
  }

  /**
   * Immutable result of translating a LIKE pattern: the pattern classification, the
   * Java regex, and the pattern text with LIKE wildcards and escape characters removed.
   */
  public static class SqlPatternInfo {
    private final SqlPatternType patternType; // type of pattern
    // Pattern with LIKE meta characters (% and _) and escape characters removed.
    // Used for simple (non-regex) pattern matching.
    private final String simplePatternString;
    // Java regex equivalent of the SQL pattern.
    private final String javaPatternString;

    public SqlPatternInfo(final SqlPatternType patternType, final String javaPatternString, final String simplePatternString) {
      this.patternType = patternType;
      this.simplePatternString = simplePatternString;
      this.javaPatternString = javaPatternString;
    }

    public SqlPatternType getPatternType() {
      return patternType;
    }

    public String getSimplePatternString() {
      return simplePatternString;
    }

    public String getJavaPatternString() {
      return javaPatternString;
    }
  }

  /**
   * Translates a SQL LIKE pattern to Java regex pattern. No escape char.
   */
  public static SqlPatternInfo sqlToRegexLike(String sqlPattern) {
    return sqlToRegexLike(sqlPattern, (char) 0);
  }

  /**
   * Translates a SQL LIKE pattern to Java regex pattern, with optional
   * escape string.
   *
   * @param sqlPattern the LIKE pattern
   * @param escapeStr  {@code null} for no escape character, otherwise a
   *                   one-character sequence holding the escape character
   * @throws RuntimeException if {@code escapeStr} is non-null and its length is not 1,
   *                          or if the pattern contains an invalid escape sequence
   */
  public static SqlPatternInfo sqlToRegexLike(
      String sqlPattern,
      CharSequence escapeStr) {
    return sqlToRegexLike(sqlPattern, toEscapeChar(escapeStr));
  }

  /**
   * Translates a SQL LIKE pattern to Java regex pattern.
   *
   * <p>While the regex is built, the pattern is also classified (see
   * {@link SqlPatternType}) and a "simple" pattern string is collected, so callers do
   * not need a second pass to find out whether plain string matching is enough.
   *
   * @param sqlPattern the LIKE pattern
   * @param escapeChar the escape character, or {@code (char) 0} for none
   * @throws RuntimeException if the escape character is the last character of the
   *                          pattern or is followed by anything other than {@code _},
   *                          {@code %} or the escape character itself
   */
  public static SqlPatternInfo sqlToRegexLike(
      String sqlPattern,
      char escapeChar) {
    final int len = sqlPattern.length();
    final StringBuilder javaPattern = new StringBuilder(len + len);
    final StringBuilder simplePattern = new StringBuilder(len);

    // Start optimistic: the pattern is a constant until a wildcard says otherwise.
    SqlPatternType patternType = SqlPatternType.CONSTANT;

    for (int i = 0; i < len; i++) {
      final char c = sqlPattern.charAt(i);
      // Emitted before the escape check on purpose: when the escape character is itself
      // a regex special, this backslash ends up in front of the escaped literal that
      // follows, which keeps the generated regex valid.
      if (JAVA_REGEX_SPECIALS.indexOf(c) >= 0) {
        javaPattern.append('\\');
      }
      if (c == escapeChar) {
        if (i == len - 1) {
          throw invalidEscapeSequence(sqlPattern, i);
        }
        final char nextChar = sqlPattern.charAt(i + 1);
        final boolean escapable = nextChar == '_' || nextChar == '%' || nextChar == escapeChar;
        if (!escapable) {
          throw invalidEscapeSequence(sqlPattern, i);
        }
        javaPattern.append(nextChar);
        simplePattern.append(nextChar);
        i++; // the escaped character has been consumed
      } else if (c == '_') {
        // A single-character wildcard can never be served by simple string matching.
        patternType = SqlPatternType.COMPLEX;
        javaPattern.append('.');
      } else if (c == '%') {
        if (i == 0) {
          // % at the start could potentially be one of the simple cases i.e. ENDS_WITH.
          patternType = SqlPatternType.ENDS_WITH;
        } else if (i == len - 1) {
          if (patternType == SqlPatternType.ENDS_WITH) {
            // Starts and ends with %. This is contains.
            patternType = SqlPatternType.CONTAINS;
          } else if (patternType == SqlPatternType.CONSTANT) {
            // % at the end with constant string in the beginning i.e. STARTS_WITH
            patternType = SqlPatternType.STARTS_WITH;
          }
        } else {
          // % anywhere other than start or end is not a simple case.
          patternType = SqlPatternType.COMPLEX;
        }
        javaPattern.append(".*");
      } else {
        javaPattern.append(c);
        simplePattern.append(c);
      }
    }
    return new SqlPatternInfo(patternType, javaPattern.toString(), simplePattern.toString());
  }

  // Converts the optional one-character escape string into the char form used internally.
  private static char toEscapeChar(CharSequence escapeStr) {
    if (escapeStr == null) {
      return 0;
    }
    if (escapeStr.length() != 1) {
      throw invalidEscapeCharacter(escapeStr.toString());
    }
    return escapeStr.charAt(0);
  }

  // Appends c so that the regex matches it literally (backslash-escaped when needed).
  private static void appendLiteral(StringBuilder javaPattern, char c) {
    if (JAVA_REGEX_SPECIALS.indexOf(c) >= 0) {
      javaPattern.append('\\');
    }
    javaPattern.append(c);
  }

  private static RuntimeException invalidEscapeCharacter(String s) {
    return new RuntimeException(
        "Invalid escape character '" + s + "'");
  }

  private static RuntimeException invalidEscapeSequence(String s, int i) {
    return new RuntimeException(
        "Invalid escape sequence '" + s + "', " + i);
  }

  // Note: the position argument is accepted for symmetry with the other factories but
  // is not part of the message.
  private static RuntimeException invalidRegularExpression(
      String pattern, int i) {
    return new RuntimeException(
        "Invalid regular expression '" + pattern + "'");
  }

  /**
   * Validates the use of the escape character in a SIMILAR pattern.
   *
   * @throws RuntimeException if one of the rules below is violated
   */
  private static void similarEscapeRuleChecking(
      String sqlPattern,
      char escapeChar) {
    if (escapeChar == 0) {
      return;
    }
    final int len = sqlPattern.length();
    if (SQL_SIMILAR_SPECIALS.indexOf(escapeChar) >= 0) {
      // The escape character is a special character
      // SQL 2003 Part 2 Section 8.6 General Rule 3.b
      for (int i = 0; i < len; i++) {
        if (sqlPattern.charAt(i) != escapeChar) {
          continue;
        }
        if (i == len - 1) {
          throw invalidEscapeSequence(sqlPattern, i);
        }
        final char c = sqlPattern.charAt(i + 1);
        if (SQL_SIMILAR_SPECIALS.indexOf(c) < 0 && c != escapeChar) {
          throw invalidEscapeSequence(sqlPattern, i);
        }
        // Skip the escaped character: it belongs to this escape sequence and must not be
        // read as the start of another one (e.g. "||a" with escape '|').
        i++;
      }
    }
    // SQL 2003 Part 2 Section 8.6 General Rule 3.c: a colon escape character conflicts
    // with a regular character set identifier such as [:alpha:].
    if (escapeChar == ':') {
      final int open = sqlPattern.indexOf("[:");
      if (open >= 0 && sqlPattern.indexOf(":]", open + 2) >= 0) {
        throw invalidEscapeSequence(sqlPattern, open);
      }
    }
  }

  /**
   * Translates the inside of a SIMILAR character enumeration ({@code [...]}).
   *
   * @param pos index of the opening {@code [}, which the caller has already emitted
   * @return index of the last character consumed; the closing {@code ]} (if any) is
   *         left for the caller
   */
  private static int sqlSimilarRewriteCharEnumeration(
      String sqlPattern,
      StringBuilder javaPattern,
      int pos,
      char escapeChar) {
    final int len = sqlPattern.length();
    int i;
    for (i = pos + 1; i < len; i++) {
      final char c = sqlPattern.charAt(i);
      if (c == ']') {
        return i - 1;
      } else if (c == escapeChar) {
        if (i == len - 1) {
          // Dangling escape character: report it instead of running off the string.
          throw invalidEscapeSequence(sqlPattern, i);
        }
        i++;
        final char nextChar = sqlPattern.charAt(i);
        if (SQL_SIMILAR_SPECIALS.indexOf(nextChar) < 0 && nextChar != escapeChar) {
          throw invalidRegularExpression(sqlPattern, i);
        }
        appendLiteral(javaPattern, nextChar);
      } else if (c == '-' || c == '^') {
        // Range and negation markers mean the same in a Java character class.
        javaPattern.append(c);
      } else if (sqlPattern.startsWith("[:", i)) {
        boolean found = false;
        for (int j = 0; j < REG_CHAR_CLASSES.length; j += 2) {
          if (sqlPattern.startsWith(REG_CHAR_CLASSES[j], i)) {
            javaPattern.append(REG_CHAR_CLASSES[j + 1]);
            // Leave i on the last character of the identifier; the loop's i++ moves on.
            i += REG_CHAR_CLASSES[j].length() - 1;
            found = true;
            break;
          }
        }
        if (!found) {
          throw invalidRegularExpression(sqlPattern, i);
        }
      } else if (SQL_SIMILAR_SPECIALS.indexOf(c) >= 0) {
        throw invalidRegularExpression(sqlPattern, i);
      } else if (c == '\\') {
        // A raw backslash would otherwise escape the next regex character.
        javaPattern.append("\\\\");
      } else {
        javaPattern.append(c);
      }
    }
    return i - 1;
  }

  /**
   * Translates a SQL SIMILAR pattern to Java regex pattern. No escape char.
   */
  public static String sqlToRegexSimilar(String sqlPattern) {
    return sqlToRegexSimilar(sqlPattern, (char) 0);
  }

  /**
   * Translates a SQL SIMILAR pattern to Java regex pattern, with optional
   * escape string.
   *
   * @param sqlPattern the SIMILAR pattern
   * @param escapeStr  {@code null} for no escape character, otherwise a
   *                   one-character sequence holding the escape character
   * @throws RuntimeException if {@code escapeStr} is non-null and its length is not 1,
   *                          or if the pattern is invalid
   */
  public static String sqlToRegexSimilar(
      String sqlPattern,
      CharSequence escapeStr) {
    return sqlToRegexSimilar(sqlPattern, toEscapeChar(escapeStr));
  }

  /**
   * Translates SQL SIMILAR pattern to Java regex pattern.
   *
   * @param sqlPattern the SIMILAR pattern
   * @param escapeChar the escape character, or {@code (char) 0} for none
   * @throws RuntimeException if the escape character is misused or the pattern has
   *                          unbalanced or malformed character enumerations
   */
  public static String sqlToRegexSimilar(
      String sqlPattern,
      char escapeChar) {
    similarEscapeRuleChecking(sqlPattern, escapeChar);

    final int len = sqlPattern.length();
    final StringBuilder javaPattern = new StringBuilder(len * 2);
    boolean insideCharacterEnumeration = false;

    for (int i = 0; i < len; i++) {
      final char c = sqlPattern.charAt(i);
      if (c == escapeChar) {
        if (i == len - 1) {
          throw invalidEscapeSequence(sqlPattern, i);
        }
        final char nextChar = sqlPattern.charAt(i + 1);
        if (SQL_SIMILAR_SPECIALS.indexOf(nextChar) < 0 && nextChar != escapeChar) {
          throw invalidEscapeSequence(sqlPattern, i);
        }
        // The escaped character is matched literally, whatever it means to Java regex.
        appendLiteral(javaPattern, nextChar);
        i++; // the escaped character has been consumed
        continue;
      }
      switch (c) {
      case '_':
        javaPattern.append('.');
        break;
      case '%':
        javaPattern.append(".*");
        break;
      case '[':
        javaPattern.append('[');
        insideCharacterEnumeration = true;
        i = sqlSimilarRewriteCharEnumeration(sqlPattern, javaPattern, i, escapeChar);
        break;
      case ']':
        if (!insideCharacterEnumeration) {
          throw invalidRegularExpression(sqlPattern, i);
        }
        insideCharacterEnumeration = false;
        javaPattern.append(']');
        break;
      case '\\':
      case '$':
      case '.':
        // Special in Java regex but ordinary characters in a SQL SIMILAR pattern.
        javaPattern.append('\\').append(c);
        break;
      default:
        javaPattern.append(c);
      }
    }
    if (insideCharacterEnumeration) {
      throw invalidRegularExpression(sqlPattern, len);
    }
    return javaPattern.toString();
  }
}