[HIVEMALL-289] Add str_contains(string query, array<string> searchTerms [, boolean orQuery=false]) UDF
## What changes were proposed in this pull request?
Add str_contains(string query, array<string> searchTerms [, boolean orQuery=false]) UDF
## What type of PR is it?
Feature
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-289
## How was this patch tested?
manual tests on EMR
## How to use this feature?
```sql
select
str_contains('There are apple and orange', array('apple')),
str_contains('There are apple and orange', array('apple', 'banana'), true),
str_contains('There are apple and orange', array('apple', 'banana'), false);
> true, true, false
```
## Checklist
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <myui@apache.org>
Closes #225 from myui/HIVEMALL-289.
diff --git a/ChangeLog.md b/ChangeLog.md
index 244c9c5..2e2ef31 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,22 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
# v0.6.0 - Dec 19, 2019
Major new features in this release includes:
diff --git a/core/src/main/java/hivemall/tools/strings/StrContainsUDF.java b/core/src/main/java/hivemall/tools/strings/StrContainsUDF.java
new file mode 100644
index 0000000..d7468a4
--- /dev/null
+++ b/core/src/main/java/hivemall/tools/strings/StrContainsUDF.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.strings;
+
+import hivemall.utils.hadoop.HiveUtils;
+
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+
+/**
+ * Hive generic UDF <code>str_contains(query, searchTerms [, orQuery])</code>.
+ *
+ * <p>
+ * Tests whether the query string contains the given search terms as plain substrings
+ * (using {@link String#contains(CharSequence)}). When <code>orQuery</code> is true, one
+ * matching term is enough; otherwise (the default) every term must be found.
+ */
+//@formatter:off
+@Description(name = "str_contains",
+        value = "_FUNC_(string query, array<string> searchTerms [, boolean orQuery=false])"
+                + " - Returns true if the given query contains search terms",
+        extended = "select\n" +
+                " str_contains('There are apple and orange', array('apple')), -- or=false\n" +
+                " str_contains('There are apple and orange', array('apple', 'banana'), true), -- or=true\n" +
+                " str_contains('There are apple and orange', array('apple', 'banana'), false); -- or=false\n" +
+                "> true, true, false")
+//@formatter:on
+@UDFType(deterministic = true, stateful = false)
+public final class StrContainsUDF extends GenericUDF {
+
+    private StringObjectInspector queryOI;
+    private ListObjectInspector searchTermsOI;
+    /** Inspector for the optional third argument; stays null when only two arguments are given. */
+    private BooleanObjectInspector orQueryOI;
+
+    /**
+     * Validates the argument list and captures the object inspectors used by
+     * {@link #evaluate(DeferredObject[])}.
+     *
+     * @param argOIs inspectors for (query, searchTerms [, orQuery])
+     * @return the Java Boolean object inspector describing the result
+     * @throws UDFArgumentException if the argument count is not 2 or 3, or the second
+     *         argument is not a string list
+     */
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
+        if (argOIs.length != 2 && argOIs.length != 3) {
+            throw new UDFArgumentLengthException("str_contains expects two or three arguments");
+        }
+
+        this.queryOI = HiveUtils.asStringOI(argOIs, 0);
+        if (!HiveUtils.isStringListOI(argOIs[1])) {
+            throw new UDFArgumentTypeException(1,
+                "Expected array<string> for the second argument but got "
+                        + argOIs[1].getTypeName());
+        }
+        this.searchTermsOI = HiveUtils.asListOI(argOIs, 1);
+
+        if (argOIs.length == 3) {
+            this.orQueryOI = HiveUtils.asBooleanOI(argOIs, 2);
+        }
+
+        return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector;
+    }
+
+    /**
+     * Evaluates the containment test for one row.
+     *
+     * @return null when the query is NULL; FALSE when the search term list is NULL or
+     *         empty; otherwise the result of the OR/AND containment test
+     */
+    @Override
+    public Boolean evaluate(DeferredObject[] args) throws HiveException {
+        final String query = queryOI.getPrimitiveJavaObject(args[0].get());
+        if (query == null) {
+            return null;
+        }
+
+        final List<String> searchTerms = HiveUtils.asStringList(args[1], searchTermsOI);
+        if (searchTerms == null || searchTerms.isEmpty()) {
+            return Boolean.FALSE;
+        }
+
+        final boolean matched = isOrQuery(args) ? containsAny(query, searchTerms)
+                : containsAll(query, searchTerms);
+        return Boolean.valueOf(matched);
+    }
+
+    /** Resolves the optional orQuery flag; an absent or NULL flag means the default (false). */
+    private boolean isOrQuery(final DeferredObject[] args) throws HiveException {
+        if (args.length != 3) {
+            return false;
+        }
+        final Object flag = args[2].get();
+        // A NULL flag value must not reach orQueryOI.get(), which would fail on unboxing.
+        return flag != null && orQueryOI.get(flag);
+    }
+
+    /** Returns true if at least one non-null term occurs in the query. */
+    private static boolean containsAny(final String query, final List<String> terms) {
+        for (String term : terms) {
+            // NOTE(review): assumes the list may carry null entries for NULL array
+            // elements -- confirm against HiveUtils.asStringList. A null term never matches.
+            if (term != null && query.contains(term)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Returns true only if every term is non-null and occurs in the query. */
+    private static boolean containsAll(final String query, final List<String> terms) {
+        for (String term : terms) {
+            // A null term cannot be contained, so it fails the AND query.
+            if (term == null || !query.contains(term)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public String getDisplayString(String[] args) {
+        return "str_contains(" + StringUtils.join(args, ',') + ')';
+    }
+
+}
diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java
index 293d236..b82f6d4 100644
--- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java
+++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java
@@ -1066,6 +1066,17 @@
}
@Nonnull
+    public static BooleanObjectInspector asBooleanOI(@Nonnull final ObjectInspector[] argOIs,
+            final int argIndex) throws UDFArgumentException {
+        // Fetch the inspector at argIndex and accept it only when its type name equals
+        // BOOLEAN_TYPE_NAME; any other type is reported against argIndex.
+        final ObjectInspector oi = getObjectInspector(argOIs, argIndex);
+        if (BOOLEAN_TYPE_NAME.equals(oi.getTypeName())) {
+            return (BooleanObjectInspector) oi;
+        }
+        throw new UDFArgumentTypeException(argIndex,
+            "Argument type must be Boolean: " + oi.getTypeName());
+    }
+
+ @Nonnull
public static IntObjectInspector asIntOI(@Nonnull final ObjectInspector argOI)
throws UDFArgumentException {
if (!INT_TYPE_NAME.equals(argOI.getTypeName())) {
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 006bef9..f995d55 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -676,6 +676,9 @@
DROP FUNCTION IF EXISTS word_ngrams;
CREATE FUNCTION word_ngrams as 'hivemall.tools.text.WordNgramsUDF' USING JAR '${hivemall_jar}';
+DROP FUNCTION IF EXISTS str_contains;
+CREATE FUNCTION str_contains as 'hivemall.tools.strings.StrContainsUDF' USING JAR '${hivemall_jar}';
+
---------------------------------
-- Dataset generator functions --
---------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index c0e319c..bf9bc7c 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -668,6 +668,9 @@
drop temporary function if exists word_ngrams;
create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF';
+drop temporary function if exists str_contains;
+create temporary function str_contains as 'hivemall.tools.strings.StrContainsUDF';
+
---------------------------------
-- Dataset generator functions --
---------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 84c1b9c..8529134 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -652,6 +652,9 @@
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS word_ngrams")
sqlContext.sql("CREATE TEMPORARY FUNCTION word_ngrams AS 'hivemall.tools.text.WordNgramsUDF'")
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS str_contains")
+sqlContext.sql("CREATE TEMPORARY FUNCTION str_contains AS 'hivemall.tools.strings.StrContainsUDF'")
+
/**
* Dataset generator functions
*/
diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java
index 54d7d4e..c383bd9 100644
--- a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java
+++ b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGeneratorMojo.java
@@ -96,7 +96,7 @@
genericFuncsHeaders.put("# Sanity Checks",
Collections.singletonList("hivemall.tools.sanity"));
genericFuncsHeaders.put("# Text processing",
- Collections.singletonList("hivemall.tools.text"));
+ Arrays.asList("hivemall.tools.text", "hivemall.tools.strings"));
genericFuncsHeaders.put("# Timeseries",
Collections.singletonList("hivemall.tools.timeseries"));
genericFuncsHeaders.put("# Others", Collections.singletonList("hivemall.tools"));