HAWQ-1771. add TRANSLATE function and set KMP_LIMIT = 30
diff --git a/depends/dbcommon/src/dbcommon/function/func-kind.h b/depends/dbcommon/src/dbcommon/function/func-kind.h
index 4fbf961..293ffb0 100644
--- a/depends/dbcommon/src/dbcommon/function/func-kind.h
+++ b/depends/dbcommon/src/dbcommon/function/func-kind.h
@@ -246,6 +246,7 @@
STRING_RPAD,
STRING_LPAD_NOFILL,
STRING_RPAD_NOFILL,
+ STRING_TRANSLATE,
// binary related functions
BINARY_OCTET_LENGTH,
diff --git a/depends/dbcommon/src/dbcommon/function/func.cc b/depends/dbcommon/src/dbcommon/function/func.cc
index f9222b3..36bddd6 100644
--- a/depends/dbcommon/src/dbcommon/function/func.cc
+++ b/depends/dbcommon/src/dbcommon/function/func.cc
@@ -377,7 +377,7 @@
FuncEntryArray.push_back({STRING_RPAD, "string_rpad", STRINGID, {STRINGID, INTID, STRINGID}, string_rpad, false});
FuncEntryArray.push_back({STRING_LPAD_NOFILL, "string_lpad_nofill", STRINGID, {STRINGID, INTID}, string_lpad_nofill, false});
FuncEntryArray.push_back({STRING_RPAD_NOFILL, "string_rpad_nofill", STRINGID, {STRINGID, INTID}, string_rpad_nofill, false});
-
+ FuncEntryArray.push_back({STRING_TRANSLATE, "string_translate", STRINGID, {STRINGID, STRINGID, STRINGID}, string_translate, false});
FuncEntryArray.push_back({BINARY_OCTET_LENGTH, "binary_octet_length", INTID, {BINARYID}, binary_octet_length, false});
diff --git a/depends/dbcommon/src/dbcommon/function/string-binary-function.h b/depends/dbcommon/src/dbcommon/function/string-binary-function.h
index ab4499c..8375f8a 100644
--- a/depends/dbcommon/src/dbcommon/function/string-binary-function.h
+++ b/depends/dbcommon/src/dbcommon/function/string-binary-function.h
@@ -46,6 +46,7 @@
Datum string_ascii(Datum *params, uint64_t size);
Datum string_repeat(Datum *params, uint64_t size);
Datum string_chr(Datum *params, uint64_t size);
+Datum string_translate(Datum *params, uint64_t size);
Datum string_bpchar(Datum *params, uint64_t size);
Datum string_varchar(Datum *params, uint64_t size);
diff --git a/depends/dbcommon/src/dbcommon/function/string-function.cc b/depends/dbcommon/src/dbcommon/function/string-function.cc
index e70a402..dbf11c6 100644
--- a/depends/dbcommon/src/dbcommon/function/string-function.cc
+++ b/depends/dbcommon/src/dbcommon/function/string-function.cc
@@ -351,16 +351,30 @@
return *this;
}
- utf8ptr &operator+=(const int &len) {
- int times = len;
+ utf8ptr &operator+=(const int32_t &len) {
+ int32_t times = len;
while (times--) p_ += utf8_mblen(p_);
return *this;
}
+ utf8ptr &operator=(const char *p) {  // re-target the wrapped pointer to p
+ if (p_ != p) p_ = p;
+ return *this;
+ }
+
+ bool operator==(const utf8ptr &tmp) const {  // same character at both pointers?
+ int32_t len = utf8_mblen(p_);  // byte length of this side's character
+ const char *tmp_ = p_;
+ const char *cmp_ = tmp.p_;
+ while (len && *tmp_++ == *cmp_++) len--;  // stop at the first differing byte
+ if (len) return false;  // bytes left uncompared: mismatch
+ return true;
+ }
+
char *get() { return const_cast<char *>(p_); }
- int characterLength(const char *p) {
- int len = 0;
+ int32_t characterLength(const char *p) {
+ int32_t len = 0;
const char *tmp = p_;
while (tmp != p) {
tmp += utf8_mblen(tmp);
@@ -369,11 +383,11 @@
return len;
}
- int characterLength(const int &len) {
- int ret = 0, lenth = len;
+ int32_t characterLength(const int32_t &len) {
+ int32_t ret = 0, lenth = len;
const char *tmp = p_;
while (lenth > 0) {
- int tLen = utf8_mblen(tmp);
+ int32_t tLen = utf8_mblen(tmp);
lenth -= tLen;
tmp += tLen;
ret++;
@@ -381,12 +395,12 @@
return ret;
}
- int byteLength(const int &len) {
- int ret = 0;
- int times = len;
+ int32_t byteLength(const int32_t &len) {
+ int32_t ret = 0;
+ int32_t times = len;
const char *tmp = p_;
while (times--) {
- int tLen = utf8_mblen(tmp);
+ int32_t tLen = utf8_mblen(tmp);
tmp += tLen;
ret += tLen;
}
@@ -459,7 +473,7 @@
int32_t *__restrict__ next = reinterpret_cast<int32_t *>(kmpPosBuf->data());
next[0] = -1;
- int i = 0, j = -1;
+ int32_t i = 0, j = -1;
while (i < subLen - 1) {
if (j == -1 || subStr[i] == subStr[j])
next[++i] = ++j;
@@ -469,7 +483,7 @@
i = 0;
j = 0;
- int lLen = len, sLen = subLen;
+ int32_t lLen = len, sLen = subLen;
while (i < lLen && j < sLen) {
if (j == -1 || subStr[j] == str[i]) {
i++;
@@ -488,10 +502,10 @@
uint64_t subLen) {
if (len < subLen) return 0;
- int times = len - subLen;
- for (int i = 0; i <= times; i++) {
+ int32_t times = len - subLen;
+ for (int32_t i = 0; i <= times; i++) {
bool flag = true;
- for (int j = 0; j < subLen; j++)
+ for (int32_t j = 0; j < subLen; j++)
if (str[i + j] != subStr[j]) {
flag = false;
break;
@@ -502,9 +516,10 @@
}
Datum string_position(Datum *params, uint64_t size) {
+ const uint32_t KMP_LIMIT = 30;
auto subpos = [](ByteBuffer &buf, text src, text sub) -> int32_t {
int32_t byteLen = 0;
- if (sub.length < 15) {
+ if (sub.length < KMP_LIMIT) {
byteLen = naivePos(src.val, sub.val, src.length, sub.length);
} else {
dbcommon::ByteBuffer kmpPosBuf(true);
@@ -522,7 +537,7 @@
char *ret = const_cast<char *>(buf.tail() - str.length);
char last = ' ';
- int times = str.length;
+ int32_t times = str.length;
while (times--) {
if (((unsigned int)((last | 0x20) - 'a') >= 26u &&
(unsigned int)(last - '0') >= 10u) &&
@@ -591,7 +606,7 @@
utf8ptr utfStrPtr(str.val);
utfStrPtr += pos;
char *strBegin = utfStrPtr.get();
- int len = str.val + str.length - strBegin;
+ int32_t len = str.val + str.length - strBegin;
if (len < 0) len = 0;
buf.resize(buf.size() + len);
char *ret = const_cast<char *>(buf.tail() - len);
@@ -604,7 +619,7 @@
inline int32_t myAscii(const unsigned char *data) {
int32_t retval = 0;
if (*data > 0x7F) {
- int tsize = 0;
+ int32_t tsize = 0;
if (*data >= 0xF0) {
retval = *data & 0x07;
tsize = 3;
@@ -674,14 +689,14 @@
template <direction dir>
Datum string_trim_blank(Datum *params, uint64_t size) {
auto trim = [](ByteBuffer &buf, text str) {
- int l = 0, r = str.length - 1;
+ int32_t l = 0, r = str.length - 1;
if (dir == direction::left || dir == direction::both) {
while (l <= r && str.val[l] == ' ') l++;
}
if (dir == direction::right || dir == direction::both) {
while (l <= r && str.val[r] == ' ') r--;
}
- int len = r - l + 1;
+ int32_t len = r - l + 1;
if (len < 0) len = 0;
buf.resize(buf.size() + len);
char *ret = const_cast<char *>(buf.tail() - len);
@@ -695,7 +710,7 @@
template <direction dir>
Datum string_trim_chars(Datum *params, uint64_t size) {
auto trim = [](ByteBuffer &buf, text str, text chr) {
- int l = 0, r = str.length - 1;
+ int32_t l = 0, r = str.length - 1;
if (dir == direction::left || dir == direction::both) {
std::string s(const_cast<char *>(chr.val), chr.length);
while (l <= r && s.find(str.val[l]) != std::string::npos) l++;
@@ -704,7 +719,7 @@
std::string s(const_cast<char *>(chr.val), chr.length);
while (l <= r && s.find(str.val[r]) != std::string::npos) r--;
}
- int len = r - l + 1;
+ int32_t len = r - l + 1;
if (len < 0) len = 0;
buf.resize(buf.size() + len);
char *ret = const_cast<char *>(buf.tail() - len);
@@ -767,7 +782,7 @@
Datum string_chr(Datum *params, uint64_t size) {
auto chr = [](ByteBuffer &buf, int32_t val) {
- int len = 0;
+ int32_t len = 0;
char wch[4];
if (val > 0x7F) {
if (val > 0x001fffff) {
@@ -872,7 +887,7 @@
}
int32_t writeLen = str.length < retByteLen ? str.length : retByteLen;
- for (int i = 0; i < writeLen; i++) *ret++ = str.val[i];
+ for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i];
if (dir == direction::right) {
int32_t remainder = retByteLen - str.length;
@@ -904,7 +919,7 @@
if (strCharLen >= len) {
retByteLen = utfStrPtr.byteLength(len);
} else {
- int rem = len - strCharLen;
+ int32_t rem = len - strCharLen;
while (rem >= filCharLen) {
retByteLen += fil.length;
rem -= filCharLen;
@@ -922,10 +937,10 @@
} else {
while (remainder > 0) {
if (remainder >= filCharLen) {
- for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i];
+ for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i];
} else {
int32_t fillLen = utfFilPtr.byteLength(remainder);
- for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i];
+ for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i];  // partial fill: first fillLen bytes of fil only
}
remainder -= filCharLen;
}
@@ -933,7 +948,7 @@
}
int32_t writeLen = str.length < retByteLen ? str.length : retByteLen;
- for (int i = 0; i < writeLen; i++) *ret++ = str.val[i];
+ for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i];
if (dir == direction::right) {
int32_t remainder = len - strCharLen;
@@ -943,10 +958,10 @@
} else {
while (remainder > 0) {
if (remainder >= filCharLen) {
- for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i];
+ for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i];
} else {
int32_t fillLen = utfFilPtr.byteLength(remainder);
- for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i];
+ for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i];
}
remainder -= filCharLen;
}
@@ -966,4 +981,56 @@
return string_pad_chars<direction::right>(params, size);
}
+Datum string_translate(Datum *params, uint64_t size) {
+ auto translate = [](ByteBuffer &buf, text str, text from, text to) {
+ utf8ptr utfStrPtr(str.val);
+ utf8ptr utfFromPtr(from.val);
+ utf8ptr utfToPtr(to.val);
+ int32_t strCharLen = utfStrPtr.characterLength(str.val + str.length);
+ int32_t fromCharLen = utfFromPtr.characterLength(from.val + from.length);
+ int32_t toCharLen = utfToPtr.characterLength(to.val + to.length);
+ int32_t retByteLen = 0;  // bytes actually written to the output
+ int32_t worstLen = strCharLen * 4;
+
+ // Output is built in place: reserve strCharLen * 4 bytes now (presumably
+ // utf8_mblen() never exceeds 4 -- TODO confirm), then trim to retByteLen.
+ // Each str character found at index j of from is replaced by the j-th
+ // character of to, or dropped when to is shorter; others are copied as is.
+ // NOTE(review): strCharLen * 4 is not checked for int32_t overflow.
+
+ buf.resize(buf.size() + worstLen);
+ char *ret = const_cast<char *>(buf.tail() - worstLen);
+
+ auto writeByte = [&](utf8ptr src) {  // append the one character src points at
+ char *tmp = src.get();
+ int32_t len = utf8_mblen(tmp);
+ retByteLen += len;
+ for (int32_t k = 0; k < len; k++) *ret++ = *tmp++;
+ };
+
+ for (int32_t i = 0; i < strCharLen; i++) {
+ int32_t j = 0;
+ utfFromPtr = from.val;  // restart the scan of from for this character
+ utfToPtr = to.val;
+ for (; j < fromCharLen; j++) {
+ if (utfStrPtr == utfFromPtr) {
+ if (j < toCharLen) {
+ utfToPtr += j;  // advance to the j-th character of to
+ writeByte(utfToPtr);
+ }
+ break;  // first match in from wins
+ }
+ ++utfFromPtr;
+ }
+ if (j == fromCharLen) {  // no match in from: keep the character
+ writeByte(utfStrPtr);
+ }
+ ++utfStrPtr;
+ }
+ buf.resize(buf.size() - (worstLen - retByteLen));  // give back the unused tail
+ return text(nullptr, retByteLen);
+ };
+ return three_params_bind<text, text, text, text>(params, size, translate);
+}
+
} // namespace dbcommon
diff --git a/depends/dbcommon/test/unit/function/test-string-function.cc b/depends/dbcommon/test/unit/function/test-string-function.cc
index ecef323..f374a21 100644
--- a/depends/dbcommon/test/unit/function/test-string-function.cc
+++ b/depends/dbcommon/test/unit/function/test-string-function.cc
@@ -1575,5 +1575,69 @@
TestFunctionEntry{FuncKind::STRING_RPAD_NOFILL,
"Vector: NULL NULL NULL",
{"Scalar: NULL", "Vector: 4 5 6"}}));
-
+INSTANTIATE_TEST_CASE_P(
+ string_translate, TestFunction,
+ ::testing::Values(
+ TestFunctionEntry{
+ FuncKind::STRING_TRANSLATE,  // multi-byte characters; str, from, to all vectors
+ "Vector: 小1灵b2 真笨 1六六五 6六caa 6六cb诶诶",
+ {"Vector: 小a灵b通 真厉害 1二3四五 6六c西ff 6六c西ff",
+ "Vector: a通 厉害 二3四 ff西 f西f", "Vector: 12 笨 六六 a比 诶b"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // all vectors, with empty and NULL elements
+ "Vector{delimiter=,}: 1b2d,1cc1, ,bcbc,NULL,",
+ {"Vector{delimiter=,}: abcd,abccba,aaaa,bcbc,NULL,",
+ "Vector{delimiter=,}: ac,ab,a,,x,x",
+ "Vector{delimiter=,}: 123,1, ,b,y,y"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // scalar to
+ "Vector{delimiter=,}: 1b2d,12cc21,111,",
+ {"Vector{delimiter=,}: abcd,abccba,aaa,",
+ "Vector{delimiter=,}: ac,ab,a,x", "Scalar: 123"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // scalar NULL to
+ "Vector{delimiter=,}: NULL,NULL,NULL,NULL",
+ {"Vector{delimiter=,}: abcd,abccba,aaa,",
+ "Vector{delimiter=,}: ac,ab,a,x", "Scalar: NULL"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // scalar from
+ "Vector{delimiter=,}: xb3d,b11b, ",
+ {"Vector{delimiter=,}: abcd,abccba, aaa",
+ "Scalar: ca", "Vector{delimiter=,}: 3x,1,"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // scalar NULL from
+ "Vector{delimiter=,}: NULL,NULL,NULL",
+ {"Vector{delimiter=,}: abc,cdda,NULL", "Scalar: NULL",
+ "Vector{delimiter=,}: 3x,1,"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // scalar from and to
+ "Vector{delimiter=,}: 12cde,12cc21,1 2 c 21",
+ {"Vector{delimiter=,}: abcde,abccba,a b c ba",
+ "Scalar: ab", "Scalar: 12"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // scalar NULL from, scalar to
+ "Vector{delimiter=,}: NULL,NULL,NULL",
+ {"Vector{delimiter=,}: abcde,abccba,a b c ba",
+ "Scalar: NULL", "Scalar: 12"}},
+ TestFunctionEntry{
+ FuncKind::STRING_TRANSLATE,  // scalar str
+ "Vector{delimiter=,}: 1b22b1,1cc1,1bccb1,bccb,abccba,NULL",
+ {"Scalar: abccba", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
+ "Vector{delimiter=,}: 123,1,12,2,1,y"}},
+ TestFunctionEntry{
+ FuncKind::STRING_TRANSLATE,  // scalar NULL str
+ "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
+ {"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
+ "Vector{delimiter=,}: 123,1,12,2,1,y"}},
+ TestFunctionEntry{
+ FuncKind::STRING_TRANSLATE,  // scalar str and from
+ "Vector{delimiter=,}: a2112a,aa,NULL",
+ {"Scalar: abccba", "Scalar: cb", "Vector{delimiter=,}: 123,,NULL"}},
+ TestFunctionEntry{FuncKind::STRING_TRANSLATE,  // scalar str, scalar NULL from
+ "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
+ {"Scalar: abc", "Scalar: NULL",
+ "Vector{delimiter=,}: 123,1,12,2,1,y"}},
+ TestFunctionEntry{
+ FuncKind::STRING_TRANSLATE,  // scalar str and to
+ "Vector{delimiter=,}: a2112a,ab11ba,abccba,abccba,NULL",
+ {"Scalar: abccba", "Vector{delimiter=,}: cb,c, ,,NULL",
+ "Scalar: 123"}},
+ TestFunctionEntry{
+ FuncKind::STRING_TRANSLATE,  // scalar NULL str, scalar to
+ "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
+ {"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
+ "Scalar: 123"}}));
} // namespace dbcommon