Add Prefix, Suffix and Ngram UDFs (#12392)

commit: 36c4b9a86fcab77e96cb1e90b1900efca0e1ce7c [log] [tgz]
author: deemoliu <qiaochu@uber.com> Tue Apr 23 15:20:35 2024 -0700
committer: GitHub <noreply@github.com> Tue Apr 23 15:20:35 2024 -0700
tree: 487e1edf5272d523490fe9a5e5d625083eace7ab
parent: 0caeccfc1399087885205e7e796d1ee8037f7867 [diff]
diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
index 374917e..31baeb5 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java

@@ -18,6 +18,8 @@
  */
 package org.apache.pinot.common.function.scalar;
 
+import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
+import it.unimi.dsi.fastutil.objects.ObjectSet;
 import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
@@ -28,6 +30,7 @@
 import java.util.UUID;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import javax.annotation.Nullable;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.pinot.common.utils.RegexpPatternConverterUtils;
 import org.apache.pinot.spi.annotations.ScalarFunction;
@@ -581,6 +584,111 @@
   }
 
   /**
+   * Generates the prefixes (leading substrings) of the input string, shortest first.
+   *
+   * @param input an input string for prefix strings generations.
+   * @param maxlength the maximum (inclusive) prefix length; zero or negative values produce an empty array.
+   * @return the prefixes of lengths 1 up to {@code min(maxlength, input.length())}, ordered by increasing length.
+   */
+  @ScalarFunction
+  public static String[] prefixes(String input, int maxlength) {
+    // Clamp to zero so that a negative maxlength yields an empty result instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] prefixArr = new String[arrLength];
+    for (int prefixIdx = 1; prefixIdx <= arrLength; prefixIdx++) {
+      // Slot (prefixIdx - 1) holds the prefix that is prefixIdx chars long.
+      prefixArr[prefixIdx - 1] = input.substring(0, prefixIdx);
+    }
+    return prefixArr;
+  }
+
+  /**
+   * Generates the prefixes (leading substrings) of the input string, shortest first, each one prepended with the
+   * given {@code prefix} string.
+   *
+   * @param input an input string for prefix strings generations.
+   * @param maxlength the maximum (inclusive) length of the generated prefixes, not counting {@code prefix}; zero or
+   *                  negative values produce an empty array.
+   * @param prefix the prefix to be prepended to prefix strings generated. e.g. '^' for regex matching. When null,
+   *               the plain prefixes are returned without any decoration.
+   * @return the (decorated) prefixes of lengths 1 up to {@code min(maxlength, input.length())}, ordered by
+   *         increasing length.
+   */
+  @ScalarFunction(nullableParameters = true, names = {"prefixesWithPrefix", "prefixes_with_prefix"})
+  public static String[] prefixesWithPrefix(String input, int maxlength, @Nullable String prefix) {
+    // Clamp to zero so that a negative maxlength yields an empty result instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] prefixArr = new String[arrLength];
+    for (int prefixIdx = 1; prefixIdx <= arrLength; prefixIdx++) {
+      String plainPrefix = input.substring(0, prefixIdx);
+      // A null prefix means "no decoration"; concatenating it directly would produce the text "null".
+      prefixArr[prefixIdx - 1] = prefix == null ? plainPrefix : prefix + plainPrefix;
+    }
+    return prefixArr;
+  }
+
+  /**
+   * Generates the suffixes (trailing substrings) of the input string, shortest first.
+   *
+   * @param input an input string for suffix strings generations.
+   * @param maxlength the maximum (inclusive) suffix length; zero or negative values produce an empty array.
+   * @return the suffixes of lengths 1 up to {@code min(maxlength, input.length())}, ordered by increasing length.
+   */
+  @ScalarFunction
+  public static String[] suffixes(String input, int maxlength) {
+    // Clamp to zero so that a negative maxlength yields an empty result instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] suffixArr = new String[arrLength];
+    for (int suffixIdx = 1; suffixIdx <= arrLength; suffixIdx++) {
+      // Slot (suffixIdx - 1) holds the last suffixIdx chars of the input.
+      suffixArr[suffixIdx - 1] = input.substring(input.length() - suffixIdx);
+    }
+    return suffixArr;
+  }
+
+  /**
+   * Generates the suffixes (trailing substrings) of the input string, shortest first, each one followed by the
+   * given {@code suffix} string.
+   *
+   * @param input an input string for suffix strings generations.
+   * @param maxlength the maximum (inclusive) length of the generated suffixes, not counting {@code suffix}; zero or
+   *                  negative values produce an empty array.
+   * @param suffix the suffix string to be appended for suffix strings generated. e.g. '$' for regex matching. When
+   *               null, the plain suffixes are returned without any decoration.
+   * @return the (decorated) suffixes of lengths 1 up to {@code min(maxlength, input.length())}, ordered by
+   *         increasing length.
+   */
+  @ScalarFunction(nullableParameters = true, names = {"suffixesWithSuffix", "suffixes_with_suffix"})
+  public static String[] suffixesWithSuffix(String input, int maxlength, @Nullable String suffix) {
+    // Clamp to zero so that a negative maxlength yields an empty result instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] suffixArr = new String[arrLength];
+    for (int suffixIdx = 1; suffixIdx <= arrLength; suffixIdx++) {
+      String plainSuffix = input.substring(input.length() - suffixIdx);
+      // A null suffix means "no decoration"; concatenating it directly would produce the text "null".
+      suffixArr[suffixIdx - 1] = suffix == null ? plainSuffix : plainSuffix + suffix;
+    }
+    return suffixArr;
+  }
+
+  /**
+   * Generates the distinct ngrams (contiguous substrings) of the input string that are exactly {@code length} chars
+   * long, in order of first occurrence.
+   *
+   * @param input an input string for ngram generations.
+   * @param length the exact length of every generated ngram.
+   * @return the unique ngrams of the given length; an empty array when {@code length} is not positive or exceeds
+   *         the input length.
+   */
+  @ScalarFunction
+  public static String[] uniqueNgrams(String input, int length) {
+    // A negative length would make substring(i, i + length) throw, so every non-positive length means "no ngrams".
+    if (length <= 0 || length > input.length()) {
+      return new String[0];
+    }
+    // Linked set: drops duplicates while keeping the ngrams in the order they first appear.
+    ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
+    for (int i = 0; i <= input.length() - length; i++) {
+      ngramSet.add(input.substring(i, i + length));
+    }
+    return ngramSet.toArray(new String[0]);
+  }
+
+  /**
+   * Generates the distinct ngrams (contiguous substrings) of the input string whose length lies in the range
+   * [minGram, maxGram]. Shorter ngrams come first; ngrams of the same length are in order of first occurrence.
+   *
+   * @param input an input string for ngram generations.
+   * @param minGram the min length of the ngram for the string; values below 1 are treated as 1.
+   * @param maxGram the max length of the ngram for the string; values above the input length are treated as the
+   *                input length.
+   * @return the unique ngrams with a length in the specified range [minGram, maxGram]; an empty array when the
+   *         (clamped) range is empty.
+   */
+  @ScalarFunction
+  public static String[] uniqueNgrams(String input, int minGram, int maxGram) {
+    // Linked set: drops duplicates while keeping the ngrams in the order they are generated.
+    ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
+    // An ngram is at least 1 char long and never longer than the input. Clamping the lower bound also keeps a
+    // negative minGram from reaching substring(), where it would throw.
+    int smallestGram = Math.max(minGram, 1);
+    int largestGram = Math.min(maxGram, input.length());
+    for (int n = smallestGram; n <= largestGram; n++) {
+      for (int i = 0; i <= input.length() - n; i++) {
+        ngramSet.add(input.substring(i, i + n));
+      }
+    }
+    return ngramSet.toArray(new String[0]);
+  }
+
+  /**
    * TODO: Revisit if index should be one-based (both Presto and Postgres use one-based index, which starts with 1)
    * @param input
    * @param delimiter

diff --git a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
index d75b8ad..6c9fa46 100644
--- a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
+++ b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java

@@ -77,6 +77,41 @@
     };
   }
 
+  /**
+   * Test cases for the prefix/suffix UDFs. Each row is:
+   * {input, max length, expected prefixes, expected suffixes, expected prefixes with "^" prepended,
+   * expected suffixes with "$" appended}.
+   */
+  @DataProvider(name = "prefixAndSuffixTestCases")
+  public static Object[][] prefixAndSuffixTestCases() {
+    String[] none = new String[0];
+    return new Object[][]{
+        // Multi-character input: max length inside, at zero, and beyond the input length.
+        {
+            "abcde", 3,
+            new String[]{"a", "ab", "abc"},
+            new String[]{"e", "de", "cde"},
+            new String[]{"^a", "^ab", "^abc"},
+            new String[]{"e$", "de$", "cde$"}
+        },
+        {"abcde", 0, none, none, none, none},
+        {
+            "abcde", 9,
+            new String[]{"a", "ab", "abc", "abcd", "abcde"},
+            new String[]{"e", "de", "cde", "bcde", "abcde"},
+            new String[]{"^a", "^ab", "^abc", "^abcd", "^abcde"},
+            new String[]{"e$", "de$", "cde$", "bcde$", "abcde$"}
+        },
+        // Single-character input.
+        {"a", 3, new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}},
+        {"a", 0, none, none, none, none},
+        {"a", 9, new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}},
+        // Empty input.
+        {"", 3, none, none, none, none},
+        {"", 0, none, none, none, none},
+        {"", 9, none, none, none, none}
+    };
+  }
+
+  /**
+   * Test cases for the ngram UDFs. Each row is:
+   * {input, minGram, maxGram, expected result of uniqueNgrams(input, maxGram),
+   * expected result of uniqueNgrams(input, minGram, maxGram)}.
+   */
+  @DataProvider(name = "ngramTestCases")
+  public static Object[][] ngramTestCases() {
+    String[] none = new String[0];
+    return new Object[][]{
+        // Four-character input.
+        {
+            "abcd", 0, 3,
+            new String[]{"abc", "bcd"},
+            new String[]{"a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"}
+        },
+        {"abcd", 2, 2, new String[]{"ab", "bc", "cd"}, new String[]{"ab", "bc", "cd"}},
+        {"abcd", 3, 0, none, none},
+        // Three-character input, where maxGram can equal the input length.
+        {"abc", 0, 3, new String[]{"abc"}, new String[]{"a", "b", "c", "ab", "bc", "abc"}},
+        {"abc", 3, 0, none, none},
+        {"abc", 3, 3, new String[]{"abc"}, new String[]{"abc"}},
+        // Single-character input, shorter than maxGram.
+        {"a", 0, 3, none, new String[]{"a"}},
+        {"a", 2, 3, none, none},
+        {"a", 3, 3, none, none},
+        // Empty input.
+        {"", 3, 0, none, none},
+        {"", 3, 3, none, none},
+        {"", 0, 3, none, none}
+    };
+  }
+
   @Test(dataProvider = "isJson")
   public void testIsJson(String input, boolean expectedValue) {
     assertEquals(StringFunctions.isJson(input), expectedValue);
@@ -88,4 +123,19 @@
     assertEquals(StringFunctions.splitPart(input, delimiter, index), expectedToken);
     assertEquals(StringFunctions.splitPart(input, delimiter, limit, index), expectedTokenWithLimitCounts);
   }
+
+  /**
+   * Checks the plain prefix/suffix UDFs and their decorated variants ("^" prepended, "$" appended), including the
+   * case where the decoration string is null.
+   */
+  @Test(dataProvider = "prefixAndSuffixTestCases")
+  public void testPrefixAndSuffix(String input, int length, String[] expectedPrefix, String[] expectedSuffix,
+      String[] expectedPrefixWithRegexChar, String[] expectedSuffixWithRegexChar) {
+    assertEquals(StringFunctions.prefixes(input, length), expectedPrefix);
+    assertEquals(StringFunctions.suffixes(input, length), expectedSuffix);
+    assertEquals(StringFunctions.prefixesWithPrefix(input, length, "^"), expectedPrefixWithRegexChar);
+    assertEquals(StringFunctions.suffixesWithSuffix(input, length, "$"), expectedSuffixWithRegexChar);
+    // A null decoration must give the same result as the plain variants.
+    assertEquals(StringFunctions.prefixesWithPrefix(input, length, null), expectedPrefix);
+    assertEquals(StringFunctions.suffixesWithSuffix(input, length, null), expectedSuffix);
+  }
+
+  /**
+   * Checks both ngram UDFs: the two-argument form is called with {@code maxGram} as the exact ngram length, the
+   * three-argument form with the [minGram, maxGram] range.
+   */
+  @Test(dataProvider = "ngramTestCases")
+  public void testNGram(String input, int minGram, int maxGram, String[] expectedExactNGram, String[] expectedNGram) {
+    String[] exactNGrams = StringFunctions.uniqueNgrams(input, maxGram);
+    assertEquals(exactNGrams, expectedExactNGram);
+
+    String[] rangedNGrams = StringFunctions.uniqueNgrams(input, minGram, maxGram);
+    assertEquals(rangedNGrams, expectedNGram);
+  }
 }
commit	36c4b9a86fcab77e96cb1e90b1900efca0e1ce7c	[log] [tgz]
author	deemoliu <qiaochu@uber.com>	Tue Apr 23 15:20:35 2024 -0700
committer	GitHub <noreply@github.com>	Tue Apr 23 15:20:35 2024 -0700
tree	487e1edf5272d523490fe9a5e5d625083eace7ab
parent	0caeccfc1399087885205e7e796d1ee8037f7867 [diff]