[CALCITE-6278] Add REGEXP, REGEXP_LIKE function (enabled in Spark library)
diff --git a/babel/src/test/java/org/apache/calcite/test/BabelQuidemTest.java b/babel/src/test/java/org/apache/calcite/test/BabelQuidemTest.java
index 69bdbaf..54058f6 100644
--- a/babel/src/test/java/org/apache/calcite/test/BabelQuidemTest.java
+++ b/babel/src/test/java/org/apache/calcite/test/BabelQuidemTest.java
@@ -131,6 +131,16 @@
SqlConformanceEnum.BABEL)
.with(CalciteConnectionProperty.LENIENT_OPERATOR_LOOKUP, true)
.connect();
+ case "scott-spark":
+ return CalciteAssert.that()
+ .with(CalciteAssert.SchemaSpec.SCOTT)
+ .with(CalciteConnectionProperty.FUN, "standard,spark")
+ .with(CalciteConnectionProperty.PARSER_FACTORY,
+ BabelDdlExecutor.class.getName() + "#PARSER_FACTORY")
+ .with(CalciteConnectionProperty.CONFORMANCE,
+ SqlConformanceEnum.BABEL)
+ .with(CalciteConnectionProperty.LENIENT_OPERATOR_LOOKUP, true)
+ .connect();
default:
return super.connect(name, reference);
}
diff --git a/babel/src/test/resources/sql/spark.iq b/babel/src/test/resources/sql/spark.iq
new file mode 100644
index 0000000..a2ac6a7
--- /dev/null
+++ b/babel/src/test/resources/sql/spark.iq
@@ -0,0 +1,243 @@
+# spark.iq - Babel test for Spark dialect of SQL
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+!use scott-spark
+!set outputformat csv
+
+#####################################################################
+# String functions ######################
+
+#####################################################################
+# RLIKE
+#
+# string1 RLIKE string2
+# Returns true if str matches regexp, or false otherwise.
+#
+# Returns BOOLEAN
+
+select NULL RLIKE 'abc*';
+EXPR$0
+null
+!ok
+
+select 'abc' RLIKE NULL;
+EXPR$0
+null
+!ok
+
+select 'abc' RLIKE '';
+EXPR$0
+true
+!ok
+
+SELECT 'abc def ghi' RLIKE 'abc';
+EXPR$0
+true
+!ok
+
+SELECT 'abc def ghi' RLIKE 'abcd';
+EXPR$0
+false
+!ok
+
+select 'abc' RLIKE '^\abc$';
+EXPR$0
+false
+!ok
+
+select '\abc' RLIKE '\abc$';
+EXPR$0
+false
+!ok
+
+select '\abc' RLIKE '^\abc$';
+EXPR$0
+false
+!ok
+
+select '\abc' RLIKE '^\\abc$';
+EXPR$0
+true
+!ok
+
+select 'abc' RLIKE '^abc$';
+EXPR$0
+true
+!ok
+
+select 'abc' RLIKE 'abz*';
+EXPR$0
+true
+!ok
+
+SELECT '%SystemDrive%\\Users\\John' RLIKE '%SystemDrive%\\\\Users.*';
+EXPR$0
+true
+!ok
+
+select '%SystemDrive%\Users\John' RLIKE '%SystemDrive%\\Users.*';
+EXPR$0
+true
+!ok
+
+#####################################################################
+# REGEXP
+#
+# REGEXP(str, regexp)
+# Returns true if str matches regexp, or false otherwise.
+#
+# Returns BOOLEAN
+
+select REGEXP(NULL, 'abc*');
+EXPR$0
+null
+!ok
+
+select REGEXP('abc', NULL);
+EXPR$0
+null
+!ok
+
+select REGEXP('abc', '');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP('abc def ghi', 'abc');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP('abc def ghi', 'abcd');
+EXPR$0
+false
+!ok
+
+select REGEXP('abc', '^\abc$');
+EXPR$0
+false
+!ok
+
+select REGEXP('\abc', '\abc$');
+EXPR$0
+false
+!ok
+
+select REGEXP('\abc', '^\abc$');
+EXPR$0
+false
+!ok
+
+select REGEXP('\abc', '^\\abc$');
+EXPR$0
+true
+!ok
+
+select REGEXP('abc', '^abc$');
+EXPR$0
+true
+!ok
+
+select REGEXP('abc', 'abz*');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP('%SystemDrive%\\Users\\John', '%SystemDrive%\\\\Users.*');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*');
+EXPR$0
+true
+!ok
+
+#####################################################################
+# REGEXP_LIKE
+#
+# REGEXP_LIKE(str, regexp)
+# Returns true if str matches regexp, or false otherwise.
+#
+# Returns BOOLEAN
+
+select REGEXP_LIKE(NULL, 'abc*');
+EXPR$0
+null
+!ok
+
+select REGEXP_LIKE('abc', NULL);
+EXPR$0
+null
+!ok
+
+select REGEXP_LIKE('abc', '');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP_LIKE('abc def ghi', 'abc');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP_LIKE('abc def ghi', 'abcd');
+EXPR$0
+false
+!ok
+
+select REGEXP_LIKE('abc', '^\abc$');
+EXPR$0
+false
+!ok
+
+select REGEXP_LIKE('\abc', '\abc$');
+EXPR$0
+false
+!ok
+
+select REGEXP_LIKE('\abc', '^\abc$');
+EXPR$0
+false
+!ok
+
+select REGEXP_LIKE('\abc', '^\\abc$');
+EXPR$0
+true
+!ok
+
+select REGEXP_LIKE('abc', '^abc$');
+EXPR$0
+true
+!ok
+
+select REGEXP_LIKE('abc', 'abz*');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP_LIKE('%SystemDrive%\\Users\\John', '%SystemDrive%\\\\Users.*');
+EXPR$0
+true
+!ok
+
+SELECT REGEXP_LIKE('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*');
+EXPR$0
+true
+!ok
+
+# End spark.iq
diff --git a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
index 0b9deb1..e37120e 100644
--- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
+++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
@@ -232,10 +232,12 @@
import static org.apache.calcite.sql.fun.SqlLibraryOperators.PARSE_TIMESTAMP;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.PARSE_URL;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.POW;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_INSTR;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_LIKE;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE;
@@ -597,6 +599,8 @@
defineMethod(SPLIT, BuiltInMethod.SPLIT.method, NullPolicy.STRICT);
defineReflective(PARSE_URL, BuiltInMethod.PARSE_URL2.method,
BuiltInMethod.PARSE_URL3.method);
+ defineReflective(REGEXP, BuiltInMethod.RLIKE.method);
+ defineReflective(REGEXP_LIKE, BuiltInMethod.RLIKE.method);
defineReflective(REGEXP_CONTAINS, BuiltInMethod.REGEXP_CONTAINS.method);
defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method,
BuiltInMethod.REGEXP_EXTRACT3.method, BuiltInMethod.REGEXP_EXTRACT4.method);
diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
index 003e66f..38b5b48 100644
--- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
+++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
@@ -549,6 +549,20 @@
@LibraryOperator(libraries = {BIG_QUERY})
public static final SqlFunction REGEXP_SUBSTR = REGEXP_EXTRACT.withName("REGEXP_SUBSTR");
+ /** The "REGEXP(value, regexp)" function, equivalent to {@link #RLIKE}. */
+ @LibraryOperator(libraries = {SPARK})
+ public static final SqlFunction REGEXP =
+ SqlBasicFunction.create("REGEXP", ReturnTypes.BOOLEAN_NULLABLE,
+ OperandTypes.STRING_STRING,
+ SqlFunctionCategory.STRING);
+
+ /** The "REGEXP_LIKE(value, regexp)" function, equivalent to {@link #RLIKE}. */
+ @LibraryOperator(libraries = {SPARK})
+ public static final SqlFunction REGEXP_LIKE =
+ SqlBasicFunction.create("REGEXP_LIKE", ReturnTypes.BOOLEAN_NULLABLE,
+ OperandTypes.STRING_STRING,
+ SqlFunctionCategory.STRING);
+
@LibraryOperator(libraries = {MYSQL})
public static final SqlFunction COMPRESS =
SqlBasicFunction.create("COMPRESS",
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 08d20ac..21edb54 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -2813,10 +2813,12 @@
| b | PARSE_TIMESTAMP(format, string[, timeZone]) | Uses format specified by *format* to convert *string* representation of timestamp to a TIMESTAMP WITH LOCAL TIME ZONE value in *timeZone*
| h s | PARSE_URL(urlString, partToExtract [, keyToExtract] ) | Returns the specified *partToExtract* from the *urlString*. Valid values for *partToExtract* include HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO. *keyToExtract* specifies which query to extract
| b s | POW(numeric1, numeric2) | Returns *numeric1* raised to the power *numeric2*
+| s | REGEXP(string, regexp) | Equivalent to `string1 RLIKE string2`
| b | REGEXP_CONTAINS(string, regexp) | Returns whether *string* is a partial match for the *regexp*
| b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Returns NULL if there is no match
| b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all substrings in *string* that matches the *regexp*. Returns an empty array if there is no match
| b | REGEXP_INSTR(string, regexp [, position [, occurrence [, occurrence_position]]]) | Returns the lowest 1-based position of the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Setting occurrence_position (default 0) to 1 returns the end position of substring + 1. Returns 0 if there is no match
+| s | REGEXP_LIKE(string, regexp) | Equivalent to `string1 RLIKE string2`
| b m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, matchType]]]) | Replaces all substrings of *string* that match *regexp* with *rep* at the starting *pos* in expr (if omitted, the default is 1), *occurrence* specifies which occurrence of a match to search for (if omitted, the default is 1), *matchType* specifies how to perform matching
| b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for REGEXP_EXTRACT
| b m p s | REPEAT(string, integer) | Returns a string consisting of *string* repeated of *integer* times; returns an empty string if *integer* is less than 1
diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
index 8aaef82..ebf23f2 100644
--- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
+++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
@@ -3570,24 +3570,68 @@
}
@Test void testRlikeOperator() {
- SqlOperatorFixture f = fixture()
- .setFor(SqlLibraryOperators.RLIKE, VM_EXPAND);
- checkRlike(f.withLibrary(SqlLibrary.SPARK));
- checkRlike(f.withLibrary(SqlLibrary.HIVE));
+ final SqlOperatorFixture f = fixture().setFor(SqlLibraryOperators.RLIKE, VM_EXPAND);
+ checkRlikeFunc(f, SqlLibrary.HIVE, SqlLibraryOperators.RLIKE);
+ checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.RLIKE);
+ checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.REGEXP);
+ checkRlikeFunc(f, SqlLibrary.SPARK, SqlLibraryOperators.REGEXP_LIKE);
+ checkNotRlikeFunc(f.withLibrary(SqlLibrary.HIVE));
+ checkNotRlikeFunc(f.withLibrary(SqlLibrary.SPARK));
checkRlikeFails(f.withLibrary(SqlLibrary.MYSQL));
checkRlikeFails(f.withLibrary(SqlLibrary.ORACLE));
}
- static void checkRlike(SqlOperatorFixture f) {
- f.checkBoolean("'Merrisa@gmail.com' rlike '.+@*\\.com'", true);
- f.checkBoolean("'Merrisa@gmail.com' rlike '.com$'", true);
- f.checkBoolean("'acbd' rlike '^ac+'", true);
- f.checkBoolean("'acb' rlike 'acb|efg'", true);
- f.checkBoolean("'acb|efg' rlike 'acb\\|efg'", true);
- f.checkBoolean("'Acbd' rlike '^ac+'", false);
- f.checkBoolean("'Merrisa@gmail.com' rlike 'Merrisa_'", false);
- f.checkBoolean("'abcdef' rlike '%cd%'", false);
+ void checkRlikeFunc(SqlOperatorFixture f0, SqlLibrary library, SqlOperator operator) {
+ final Consumer<SqlOperatorFixture> consumer = f -> {
+ f.checkBoolean(binaryExpression(operator, "'Merrisa@gmail.com'", "'.+@*\\.com'"), true);
+ f.checkBoolean(binaryExpression(operator, "'Merrisa@gmail.com'", "'.com$'"), true);
+ f.checkBoolean(binaryExpression(operator, "'acbd'", "'^ac+'"), true);
+ f.checkBoolean(binaryExpression(operator, "'acb'", "'acb|efg'"), true);
+ f.checkBoolean(binaryExpression(operator, "'acb|efg'", "'acb\\|efg'"), true);
+ f.checkBoolean(binaryExpression(operator, "'Acbd'", "'^ac+'"), false);
+ f.checkBoolean(binaryExpression(operator, "'Merrisa@gmail.com'", "'Merrisa_'"), false);
+ f.checkBoolean(binaryExpression(operator, "'abcdef'", "'%cd%'"), false);
+ f.checkBoolean(binaryExpression(operator, "'abc def ghi'", "'abc'"), true);
+ f.checkBoolean(binaryExpression(operator, "'abc def ghi'", "'[a-z]+'"), true);
+ f.checkBoolean(
+ binaryExpression(operator, "'foo@bar.com'",
+ "'@[a-zA-Z0-9-]+\\.[a-zA-Z0-9.]+'"), true);
+ f.checkBoolean(
+ binaryExpression(operator, "'foo@.com'",
+ "'@[a-zA-Z0-9-]+\\.[a-zA-Z0-9.]+'"), false);
+ f.checkBoolean(binaryExpression(operator, "'5556664422'", "'^\\d{10}$'"), true);
+ f.checkBoolean(binaryExpression(operator, "'11555666442233'", "'^\\d{10}$'"), false);
+ f.checkBoolean(binaryExpression(operator, "'55566644221133'", "'\\d{10}'"), true);
+ f.checkBoolean(binaryExpression(operator, "'55as56664as422'", "'\\d{10}'"), false);
+ f.checkBoolean(binaryExpression(operator, "'55as56664as422'", "''"), true);
+ // test for string escaped
+ f.checkBoolean(binaryExpression(operator, "'abc'", "'^\\abc$'"), false);
+ f.checkBoolean(
+ binaryExpression(operator, "'%SystemDrive%\\Users\\John'",
+ "'%SystemDrive%\\\\Users.*'"), true);
+ f.checkBoolean(
+ binaryExpression(operator, "'%SystemDrive%\\\\Users\\\\John'",
+ "'%SystemDrive%\\\\\\\\Users.*'"), true);
+ f.checkFails(
+ binaryExpression(operator, "'%SystemDrive%\\Users\\John'",
+ "'%SystemDrive%\\Users.*'"),
+ "(?s).*Illegal/unsupported escape sequence near index.*", true);
+
+ f.checkQuery("select " + binaryExpression(operator, "'abc def ghi'", "'abc'"));
+ f.checkQuery(
+ "select " + binaryExpression(operator, "'foo@bar.com'",
+ "'@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+'"));
+ f.checkQuery("select " + binaryExpression(operator, "'55as56664as422'", "'\\d{10}'"));
+
+ f.checkNull(binaryExpression(operator, "'abc def ghi'", "cast(null as varchar)"));
+ f.checkNull(binaryExpression(operator, "cast(null as varchar)", "'abc'"));
+ f.checkNull(binaryExpression(operator, "cast(null as varchar)", "cast(null as varchar)"));
+ };
+ f0.forEachLibrary(list(library), consumer);
+ }
+
+ void checkNotRlikeFunc(SqlOperatorFixture f) {
f.setFor(SqlLibraryOperators.NOT_RLIKE, VM_EXPAND);
f.checkBoolean("'Merrisagmail' not rlike '.+@*\\.com'", true);
f.checkBoolean("'acbd' not rlike '^ac+'", false);
@@ -3595,6 +3639,14 @@
f.checkBoolean("'Merrisa@gmail.com' not rlike 'Merrisa_'", true);
}
+ private String binaryExpression(SqlOperator operator, String left, String right) {
+ if (SqlLibraryOperators.RLIKE == operator || SqlLibraryOperators.NOT_RLIKE == operator) {
+ return left + " " + operator.getName() + " " + right;
+ } else {
+ return operator.getName() + "( " + left + ", " + right + ")";
+ }
+ }
+
static void checkRlikeFails(SqlOperatorFixture f) {
final String noRlike = "(?s).*No match found for function signature RLIKE";
f.checkFails("^'Merrisa@gmail.com' rlike '.+@*\\.com'^", noRlike, false);