PIG-5214: search any substring in the input string
git-svn-id: https://svn.apache.org/repos/asf/pig/trunk@1791153 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/CHANGES.txt b/CHANGES.txt
index 7ba7c09..42ff9c5 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -36,6 +36,8 @@
IMPROVEMENTS
+PIG-5214: search any substring in the input string (rainer-46 via daijy)
+
PIG-5210: Option to print MR/Tez plan before launching (ly16 via daijy)
PIG-5175: Upgrade jruby to 1.7.26 (daijy)
diff --git a/src/docs/src/documentation/content/xdocs/func.xml b/src/docs/src/documentation/content/xdocs/func.xml
index d792a2a..5662eb8 100644
--- a/src/docs/src/documentation/content/xdocs/func.xml
+++ b/src/docs/src/documentation/content/xdocs/func.xml
@@ -4352,7 +4352,74 @@
</section>
-<!-- ======================================================== -->
+<!-- ======================================================== -->
+ <section id="regex-search">
+ <title>REGEX_SEARCH</title>
+ <p>Performs regular expression matching and searches all matched characters in a string.</p>
+
+<section>
+ <title>Syntax</title>
+ <table>
+ <tr>
+ <td>
+ <p>REGEX_SEARCH(string, 'regExp');</p>
+ </td>
+ </tr>
+ </table>
+ </section>
+
+<section>
+ <title>Terms</title>
+ <table>
+ <tr>
+ <td>
+ <p>string</p>
+ </td>
+ <td>
+ <p>The string in which to perform the match.</p>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <p>'regExp'</p>
+ </td>
+ <td>
+ <p>The regular expression to which the string is to be matched, in quotes.</p>
+ </td>
+ </tr>
+ </table>
+</section>
+
+<section>
+ <title>Usage</title>
+ <p>
+Use the REGEX_SEARCH function to perform regular expression matching and to find all matched characters in a string.
+ </p>
+ <p>
+The function returns tuples which are placed in a bag. Each tuple only contains one field which represents a matched expression.
+ </p>
+</section>
+
+<section>
+ <title>Example</title>
+ <p>
+This is example will return the bag {(=04 ),(=06 ),(=96 )}.
+ </p>
+<source>
+REGEX_SEARCH('a=04 b=06 c=96 or more', '(=\\d+\\s)');
+</source>
+ <p>
+And this is example will return the bag {(04),(06),(96)}.
+ </p>
+<source>
+REGEX_SEARCH('a=04 b=06 c=96 or more', '=(\\d+)\\s');
+</source>
+
+ </section>
+</section>
+
+
+<!-- ======================================================== -->
<section id="replace">
<title>REPLACE</title>
<p>Replaces existing characters in a string with new characters.</p>
diff --git a/src/org/apache/pig/builtin/REGEX_SEARCH.java b/src/org/apache/pig/builtin/REGEX_SEARCH.java
new file mode 100644
index 0000000..2c6ee74
--- /dev/null
+++ b/src/org/apache/pig/builtin/REGEX_SEARCH.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.util.*;
+
+/**
+ * Search and find all matched characters in a string with a given
+ * regular expression.
+ *
+ * Example:
+ *
+ * a = LOAD 'mydata' AS (name:chararray);
+ * b = FOREACH A GENERATE REGEX_SEARCH(name, 'regEx');
+ *
+ * input tuple: the first field is a string on which performs regular expression matching;
+ * the second field is the regular expression;
+ */
+
+public class REGEX_SEARCH extends EvalFunc<DataBag> {
+ private static BagFactory bagFactory = BagFactory.getInstance();
+ private static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+ public REGEX_SEARCH() {}
+
+ @Override
+ public DataBag exec(Tuple input) throws IOException {
+
+ if (input == null || input.size() < 1) {
+ return null;
+ }
+ if (input.get(0)==null)
+ return null;
+
+ try {
+ if (!input.get(1).equals(mExpression)) {
+ try {
+ mExpression = (String)input.get(1);
+ mPattern = Pattern.compile(mExpression);
+ } catch (Exception e) {
+ String msg = "StringSearchAll : Mal-Formed Regular expression : "+input.get(1);
+ throw new IOException(msg);
+ }
+ }
+ } catch (NullPointerException e) {
+ String msg = "StringSearchAll : Regular expression is null";
+ throw new IOException(msg);
+ }
+ Matcher m = mPattern.matcher((String)input.get(0));
+ if (!m.find()) {
+ return null;
+ }
+
+ Tuple tuple0 = tupleFactory.newTuple(1);
+ tuple0.set(0, m.group(1));
+ DataBag dataBag = bagFactory.newDefaultBag();
+ dataBag.add(tuple0);
+ while (m.find()) {
+ Tuple tuple = tupleFactory.newTuple(1);
+ tuple.set(0, m.group(1));
+ dataBag.add(tuple);
+ }
+ return dataBag;
+ }
+
+ String mExpression = null;
+ Pattern mPattern = null;
+ @Override
+ public Schema outputSchema(Schema input) {
+ try {
+ return new Schema(Utils.getSchemaFromString("{(match:chararray)}"));
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public boolean allowCompileTimeCalculation() {
+ return true;
+ }
+
+}
diff --git a/test/org/apache/pig/test/TestBuiltin.java b/test/org/apache/pig/test/TestBuiltin.java
index fbc3f1e..05c7ca6 100644
--- a/test/org/apache/pig/test/TestBuiltin.java
+++ b/test/org/apache/pig/test/TestBuiltin.java
@@ -96,6 +96,7 @@
import org.apache.pig.builtin.RTRIM;
import org.apache.pig.builtin.SIZE;
import org.apache.pig.builtin.SPRINTF;
+import org.apache.pig.builtin.REGEX_SEARCH;
import org.apache.pig.builtin.STRSPLIT;
import org.apache.pig.builtin.SUBSTRING;
import org.apache.pig.builtin.SecondsBetween;
@@ -1993,6 +1994,32 @@
re = funce.exec(te3);
assertTrue(re==null);
+
+ // *** REGEX_SEARCH *** start
+ String matchSearch = "(=\\d+\\s)";
+ tupleFactory = TupleFactory.getInstance();
+ Tuple ts1 = tupleFactory.newTuple(2);
+ ts1.set(0, "a=04 b=06 c=96 or more");
+ ts1.set(1, matchSearch);
+
+ Tuple ts2 = tupleFactory.newTuple(2);
+ ts2.set(0, "a is 04 b is 06");
+ ts2.set(1, matchSearch);
+
+ Tuple ts3 = tupleFactory.newTuple(2);
+ ts3.set(0, null);
+ ts3.set(1, matchSearch);
+
+ REGEX_SEARCH funcs = new REGEX_SEARCH();
+ DataBag reb = funcs.exec(ts1);
+ DataBag b = Util.createBag(new Tuple[]{Util.buildTuple("=04 "), Util.buildTuple("=06 "), Util.buildTuple("=96 ")});
+ assertEquals(b, reb);
+
+ reb = funcs.exec(ts2);
+ assertTrue(reb==null);
+
+ reb = funcs.exec(ts3);
+ assertTrue(reb==null);
}
@Test