HAWQ-1771. add TRANSLATE function and set KMP_LIMIT = 30

commit: b7f085da63ed1f6a43b673f6987597d37d11bf7f [log] [tgz]
author: ZongtianHou <houzongtian@outlook.com> Thu Nov 12 10:02:54 2020 +0800
committer: ZongtianHou <houzongtian@outlook.com> Thu Nov 12 10:11:49 2020 +0800
tree: 3a3bc9b85a78ca63b4a0a4220772f58a875f1bb2
parent: 6b045e7eb4cab980a338d3c6f31cb1b04c2f589f [diff]
diff --git a/depends/dbcommon/src/dbcommon/function/func-kind.h b/depends/dbcommon/src/dbcommon/function/func-kind.h
index 4fbf961..293ffb0 100644
--- a/depends/dbcommon/src/dbcommon/function/func-kind.h
+++ b/depends/dbcommon/src/dbcommon/function/func-kind.h

@@ -246,6 +246,7 @@
   STRING_RPAD,
   STRING_LPAD_NOFILL,
   STRING_RPAD_NOFILL,
+  STRING_TRANSLATE,
 
   // binary related functions
   BINARY_OCTET_LENGTH,

diff --git a/depends/dbcommon/src/dbcommon/function/func.cc b/depends/dbcommon/src/dbcommon/function/func.cc
index f9222b3..36bddd6 100644
--- a/depends/dbcommon/src/dbcommon/function/func.cc
+++ b/depends/dbcommon/src/dbcommon/function/func.cc

@@ -377,7 +377,7 @@
   FuncEntryArray.push_back({STRING_RPAD, "string_rpad", STRINGID, {STRINGID, INTID, STRINGID}, string_rpad, false});
   FuncEntryArray.push_back({STRING_LPAD_NOFILL, "string_lpad_nofill", STRINGID, {STRINGID, INTID}, string_lpad_nofill, false});
   FuncEntryArray.push_back({STRING_RPAD_NOFILL, "string_rpad_nofill", STRINGID, {STRINGID, INTID}, string_rpad_nofill, false});
-
+  FuncEntryArray.push_back({STRING_TRANSLATE, "string_translate", STRINGID, {STRINGID, STRINGID, STRINGID}, string_translate, false});
 
   FuncEntryArray.push_back({BINARY_OCTET_LENGTH, "binary_octet_length", INTID, {BINARYID}, binary_octet_length, false});
 

diff --git a/depends/dbcommon/src/dbcommon/function/string-binary-function.h b/depends/dbcommon/src/dbcommon/function/string-binary-function.h
index ab4499c..8375f8a 100644
--- a/depends/dbcommon/src/dbcommon/function/string-binary-function.h
+++ b/depends/dbcommon/src/dbcommon/function/string-binary-function.h

@@ -46,6 +46,7 @@
 Datum string_ascii(Datum *params, uint64_t size);
 Datum string_repeat(Datum *params, uint64_t size);
 Datum string_chr(Datum *params, uint64_t size);
+Datum string_translate(Datum *params, uint64_t size);
 
 Datum string_bpchar(Datum *params, uint64_t size);
 Datum string_varchar(Datum *params, uint64_t size);

diff --git a/depends/dbcommon/src/dbcommon/function/string-function.cc b/depends/dbcommon/src/dbcommon/function/string-function.cc
index e70a402..dbf11c6 100644
--- a/depends/dbcommon/src/dbcommon/function/string-function.cc
+++ b/depends/dbcommon/src/dbcommon/function/string-function.cc

@@ -351,16 +351,30 @@
     return *this;
   }
 
-  utf8ptr &operator+=(const int &len) {
-    int times = len;
+  utf8ptr &operator+=(const int32_t &len) {  // advance len steps
+    int32_t times = len;  // mutable countdown; len is a const reference
     while (times--) p_ += utf8_mblen(p_);
     return *this;
   }
 
+  utf8ptr &operator=(const char *p) {
+    p_ = p;  // plain re-seat; storing an equal pointer changes nothing
+    return *this;
+  }
+
+  // Byte-wise comparison over the utf8_mblen(p_) bytes of this character.
+  bool operator==(const utf8ptr &tmp) {
+    const char *lhs = p_;
+    const char *rhs = tmp.p_;
+    int32_t left = utf8_mblen(p_);
+    for (; left && *lhs == *rhs; ++lhs, ++rhs) --left;
+    return !left;
+  }
+
   char *get() { return const_cast<char *>(p_); }
 
-  int characterLength(const char *p) {
-    int len = 0;
+  int32_t characterLength(const char *p) {
+    int32_t len = 0;
     const char *tmp = p_;
     while (tmp != p) {
       tmp += utf8_mblen(tmp);
@@ -369,11 +383,11 @@
     return len;
   }
 
-  int characterLength(const int &len) {
-    int ret = 0, lenth = len;
+  int32_t characterLength(const int32_t &len) {
+    int32_t ret = 0, lenth = len;
     const char *tmp = p_;
     while (lenth > 0) {
-      int tLen = utf8_mblen(tmp);
+      int32_t tLen = utf8_mblen(tmp);
       lenth -= tLen;
       tmp += tLen;
       ret++;
@@ -381,12 +395,12 @@
     return ret;
   }
 
-  int byteLength(const int &len) {
-    int ret = 0;
-    int times = len;
+  int32_t byteLength(const int32_t &len) {
+    int32_t ret = 0;
+    int32_t times = len;
     const char *tmp = p_;
     while (times--) {
-      int tLen = utf8_mblen(tmp);
+      int32_t tLen = utf8_mblen(tmp);
       tmp += tLen;
       ret += tLen;
     }
@@ -459,7 +473,7 @@
   int32_t *__restrict__ next = reinterpret_cast<int32_t *>(kmpPosBuf->data());
 
   next[0] = -1;
-  int i = 0, j = -1;
+  int32_t i = 0, j = -1;
   while (i < subLen - 1) {
     if (j == -1 || subStr[i] == subStr[j])
       next[++i] = ++j;
@@ -469,7 +483,7 @@
 
   i = 0;
   j = 0;
-  int lLen = len, sLen = subLen;
+  int32_t lLen = len, sLen = subLen;
   while (i < lLen && j < sLen) {
     if (j == -1 || subStr[j] == str[i]) {
       i++;
@@ -488,10 +502,10 @@
                  uint64_t subLen) {
   if (len < subLen) return 0;
 
-  int times = len - subLen;
-  for (int i = 0; i <= times; i++) {
+  int32_t times = len - subLen;
+  for (int32_t i = 0; i <= times; i++) {
     bool flag = true;
-    for (int j = 0; j < subLen; j++)
+    for (int32_t j = 0; j < subLen; j++)
       if (str[i + j] != subStr[j]) {
         flag = false;
         break;
@@ -502,9 +516,10 @@
 }
 
 Datum string_position(Datum *params, uint64_t size) {
+  const uint32_t KMP_LIMIT = 30;
   auto subpos = [](ByteBuffer &buf, text src, text sub) -> int32_t {
     int32_t byteLen = 0;
-    if (sub.length < 15) {
+    if (sub.length < KMP_LIMIT) {
       byteLen = naivePos(src.val, sub.val, src.length, sub.length);
     } else {
       dbcommon::ByteBuffer kmpPosBuf(true);
@@ -522,7 +537,7 @@
     char *ret = const_cast<char *>(buf.tail() - str.length);
 
     char last = ' ';
-    int times = str.length;
+    int32_t times = str.length;
     while (times--) {
       if (((unsigned int)((last | 0x20) - 'a') >= 26u &&
            (unsigned int)(last - '0') >= 10u) &&
@@ -591,7 +606,7 @@
     utf8ptr utfStrPtr(str.val);
     utfStrPtr += pos;
     char *strBegin = utfStrPtr.get();
-    int len = str.val + str.length - strBegin;
+    int32_t len = str.val + str.length - strBegin;
     if (len < 0) len = 0;
     buf.resize(buf.size() + len);
     char *ret = const_cast<char *>(buf.tail() - len);
@@ -604,7 +619,7 @@
 inline int32_t myAscii(const unsigned char *data) {
   int32_t retval = 0;
   if (*data > 0x7F) {
-    int tsize = 0;
+    int32_t tsize = 0;
     if (*data >= 0xF0) {
       retval = *data & 0x07;
       tsize = 3;
@@ -674,14 +689,14 @@
 template <direction dir>
 Datum string_trim_blank(Datum *params, uint64_t size) {
   auto trim = [](ByteBuffer &buf, text str) {
-    int l = 0, r = str.length - 1;
+    int32_t l = 0, r = str.length - 1;
     if (dir == direction::left || dir == direction::both) {
       while (l <= r && str.val[l] == ' ') l++;
     }
     if (dir == direction::right || dir == direction::both) {
       while (l <= r && str.val[r] == ' ') r--;
     }
-    int len = r - l + 1;
+    int32_t len = r - l + 1;
     if (len < 0) len = 0;
     buf.resize(buf.size() + len);
     char *ret = const_cast<char *>(buf.tail() - len);
@@ -695,7 +710,7 @@
 template <direction dir>
 Datum string_trim_chars(Datum *params, uint64_t size) {
   auto trim = [](ByteBuffer &buf, text str, text chr) {
-    int l = 0, r = str.length - 1;
+    int32_t l = 0, r = str.length - 1;
     if (dir == direction::left || dir == direction::both) {
       std::string s(const_cast<char *>(chr.val), chr.length);
       while (l <= r && s.find(str.val[l]) != std::string::npos) l++;
@@ -704,7 +719,7 @@
       std::string s(const_cast<char *>(chr.val), chr.length);
       while (l <= r && s.find(str.val[r]) != std::string::npos) r--;
     }
-    int len = r - l + 1;
+    int32_t len = r - l + 1;
     if (len < 0) len = 0;
     buf.resize(buf.size() + len);
     char *ret = const_cast<char *>(buf.tail() - len);
@@ -767,7 +782,7 @@
 
 Datum string_chr(Datum *params, uint64_t size) {
   auto chr = [](ByteBuffer &buf, int32_t val) {
-    int len = 0;
+    int32_t len = 0;
     char wch[4];
     if (val > 0x7F) {
       if (val > 0x001fffff) {
@@ -872,7 +887,7 @@
     }
 
     int32_t writeLen = str.length < retByteLen ? str.length : retByteLen;
-    for (int i = 0; i < writeLen; i++) *ret++ = str.val[i];
+    for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i];
 
     if (dir == direction::right) {
       int32_t remainder = retByteLen - str.length;
@@ -904,7 +919,7 @@
     if (strCharLen >= len) {
       retByteLen = utfStrPtr.byteLength(len);
     } else {
-      int rem = len - strCharLen;
+      int32_t rem = len - strCharLen;
       while (rem >= filCharLen) {
         retByteLen += fil.length;
         rem -= filCharLen;
@@ -922,10 +937,10 @@
       } else {
         while (remainder > 0) {
           if (remainder >= filCharLen) {
-            for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i];
+            for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i];
           } else {
             int32_t fillLen = utfFilPtr.byteLength(remainder);
-            for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i];
+            for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i];
           }
           remainder -= filCharLen;
         }
@@ -933,7 +948,7 @@
     }
 
     int32_t writeLen = str.length < retByteLen ? str.length : retByteLen;
-    for (int i = 0; i < writeLen; i++) *ret++ = str.val[i];
+    for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i];
 
     if (dir == direction::right) {
       int32_t remainder = len - strCharLen;
@@ -943,10 +958,10 @@
       } else {
         while (remainder > 0) {
           if (remainder >= filCharLen) {
-            for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i];
+            for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i];
           } else {
             int32_t fillLen = utfFilPtr.byteLength(remainder);
-            for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i];
+            for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i];
           }
           remainder -= filCharLen;
         }
@@ -966,4 +981,56 @@
   return string_pad_chars<direction::right>(params, size);
 }
 
+Datum string_translate(Datum *params, uint64_t size) {
+  // Replaces each character of str that occurs in from with the character at
+  // the same character index in to. A match with no counterpart in to is
+  // dropped; characters absent from from are copied unchanged.
+  auto translate = [](ByteBuffer &buf, text str, text from, text to) {
+    utf8ptr srcCursor(str.val);
+    utf8ptr fromCursor(from.val);
+    utf8ptr toCursor(to.val);
+    const int32_t srcChars = srcCursor.characterLength(str.val + str.length);
+    const int32_t fromChars =
+        fromCursor.characterLength(from.val + from.length);
+    const int32_t toChars = toCursor.characterLength(to.val + to.length);
+    // Four bytes are reserved per input character and the unused tail is
+    // trimmed at the end. NOTE: the multiplication is not overflow-checked.
+    const int32_t reserved = srcChars * 4;
+    int32_t written = 0;
+
+    buf.resize(buf.size() + reserved);
+    char *out = const_cast<char *>(buf.tail() - reserved);
+
+    // Appends the single character that src points at to the output.
+    auto emit = [&](utf8ptr src) {
+      const char *in = src.get();
+      const int32_t nBytes = utf8_mblen(in);
+      written += nBytes;
+      for (int32_t b = 0; b < nBytes; b++) *out++ = *in++;
+    };
+
+    for (int32_t n = 0; n < srcChars; ++n, ++srcCursor) {
+      // Linear scan of from for the current input character.
+      fromCursor = from.val;
+      int32_t hit = 0;
+      while (hit < fromChars && !(srcCursor == fromCursor)) {
+        ++fromCursor;
+        ++hit;
+      }
+      if (hit == fromChars) {
+        emit(srcCursor);  // not listed in from: copy through
+      } else if (hit < toChars) {
+        toCursor = to.val;
+        toCursor += hit;
+        emit(toCursor);
+      }
+      // Otherwise the match has no counterpart in to and is dropped.
+    }
+    // Give back the reserved bytes that were never written.
+    buf.resize(buf.size() - (reserved - written));
+    return text(nullptr, written);
+  };
+  return three_params_bind<text, text, text, text>(params, size, translate);
+}
+
 }  // namespace dbcommon

diff --git a/depends/dbcommon/test/unit/function/test-string-function.cc b/depends/dbcommon/test/unit/function/test-string-function.cc
index ecef323..f374a21 100644
--- a/depends/dbcommon/test/unit/function/test-string-function.cc
+++ b/depends/dbcommon/test/unit/function/test-string-function.cc

@@ -1575,5 +1575,69 @@
         TestFunctionEntry{FuncKind::STRING_RPAD_NOFILL,
                           "Vector: NULL NULL NULL",
                           {"Scalar: NULL", "Vector: 4 5 6"}}));
-
+INSTANTIATE_TEST_CASE_P(
+    string_translate, TestFunction,
+    ::testing::Values(
+        TestFunctionEntry{
+            FuncKind::STRING_TRANSLATE,
+            "Vector: 小1灵b2 真笨 1六六五 6六caa 6六cb诶诶",
+            {"Vector: 小a灵b通 真厉害 1二3四五 6六c西ff 6六c西ff",
+             "Vector: a通 厉害 二3四 ff西 f西f", "Vector: 12 笨 六六 a比 诶b"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: 1b2d,1cc1,    ,bcbc,NULL,",
+                          {"Vector{delimiter=,}: abcd,abccba,aaaa,bcbc,NULL,",
+                           "Vector{delimiter=,}: ac,ab,a,,x,x",
+                           "Vector{delimiter=,}: 123,1, ,b,y,y"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: 1b2d,12cc21,111,",
+                          {"Vector{delimiter=,}: abcd,abccba,aaa,",
+                           "Vector{delimiter=,}: ac,ab,a,x", "Scalar: 123"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: NULL,NULL,NULL,NULL",
+                          {"Vector{delimiter=,}: abcd,abccba,aaa,",
+                           "Vector{delimiter=,}: ac,ab,a,x", "Scalar: NULL"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: xb3d,b11b, ",
+                          {"Vector{delimiter=,}: abcd,abccba, aaa",
+                           "Scalar: ca", "Vector{delimiter=,}: 3x,1,"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: NULL,NULL,NULL",
+                          {"Vector{delimiter=,}: abc,cdda,NULL", "Scalar: NULL",
+                           "Vector{delimiter=,}: 3x,1,"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: 12cde,12cc21,1 2 c 21",
+                          {"Vector{delimiter=,}: abcde,abccba,a b c ba",
+                           "Scalar: ab", "Scalar: 12"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: NULL,NULL,NULL",
+                          {"Vector{delimiter=,}: abcde,abccba,a b c ba",
+                           "Scalar: NULL", "Scalar: 12"}},
+        TestFunctionEntry{
+            FuncKind::STRING_TRANSLATE,
+            "Vector{delimiter=,}: 1b22b1,1cc1,1bccb1,bccb,abccba,NULL",
+            {"Scalar: abccba", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
+             "Vector{delimiter=,}: 123,1,12,2,1,y"}},
+        TestFunctionEntry{
+            FuncKind::STRING_TRANSLATE,
+            "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
+            {"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
+             "Vector{delimiter=,}: 123,1,12,2,1,y"}},
+        TestFunctionEntry{
+            FuncKind::STRING_TRANSLATE,
+            "Vector{delimiter=,}: a2112a,aa,NULL",
+            {"Scalar: abccba", "Scalar: cb", "Vector{delimiter=,}: 123,,NULL"}},
+        TestFunctionEntry{FuncKind::STRING_TRANSLATE,
+                          "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
+                          {"Scalar: abc", "Scalar: NULL",
+                           "Vector{delimiter=,}: 123,1,12,2,1,y"}},
+        TestFunctionEntry{
+            FuncKind::STRING_TRANSLATE,
+            "Vector{delimiter=,}: a2112a,ab11ba,abccba,abccba,NULL",
+            {"Scalar: abccba", "Vector{delimiter=,}: cb,c, ,,NULL",
+             "Scalar: 123"}},
+        TestFunctionEntry{
+            FuncKind::STRING_TRANSLATE,
+            "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
+            {"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
+             "Scalar: 123"}}));
 }  // namespace dbcommon
commit	b7f085da63ed1f6a43b673f6987597d37d11bf7f	[log] [tgz]
author	ZongtianHou <houzongtian@outlook.com>	Thu Nov 12 10:02:54 2020 +0800
committer	ZongtianHou <houzongtian@outlook.com>	Thu Nov 12 10:11:49 2020 +0800
tree	3a3bc9b85a78ca63b4a0a4220772f58a875f1bb2
parent	6b045e7eb4cab980a338d3c6f31cb1b04c2f589f [diff]