/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "paimon/common/data/binary_string.h"
#include <cstdlib>
#include <cstring>
#include <string>
#include <utility>
#include <vector>
#include "gtest/gtest.h"
#include "paimon/memory/bytes.h"
#include "paimon/memory/memory_pool.h"
namespace paimon::test {
class BinaryStringTest : public testing::Test {
private:
BinaryString FromString(const std::string& str) {
auto pool = GetDefaultPool();
return BinaryString::FromString(str, pool.get());
}
template <typename T, typename U>
void InnerCheckEqual(T&& expected, U&& actual) {
ASSERT_EQ(std::forward<T>(expected), std::forward<U>(actual));
}
template <typename T>
void InnerCheck(T&& expected) {
ASSERT_TRUE(std::forward<T>(expected));
}
template <typename T>
void InnerCheckFalse(T&& expected) {
ASSERT_FALSE(std::forward<T>(expected));
}
void CheckBasic(const std::string& str, int32_t len) {
BinaryString s1 = FromString(str);
auto pool = GetDefaultPool();
std::shared_ptr<Bytes> bytes = Bytes::AllocateBytes(str, pool.get());
BinaryString s2 = BinaryString::FromBytes(bytes);
InnerCheckEqual(len, s1.NumChars());
InnerCheckEqual(len, s2.NumChars());
InnerCheckEqual(str, s1.ToString());
InnerCheckEqual(str, s2.ToString());
InnerCheck(s1 == s2);
InnerCheckEqual(s2.HashCode(), s1.HashCode());
InnerCheck(s1.Contains(s2));
InnerCheck(s2.Contains(s1));
InnerCheck(s1.StartsWith(s1));
InnerCheck(s1.EndsWith(s1));
InnerCheck(s2.StartsWith(s2));
InnerCheck(s2.EndsWith(s2));
}
};
// Runs CheckBasic over ASCII and multi-byte UTF-8 samples, each paired with the number of
// characters (not bytes) it is expected to contain.
TEST_F(BinaryStringTest, TestBasic) {
    const std::vector<std::pair<std::string, int32_t>> samples = {
        {"", 0},
        {",", 1},
        {"hello", 5},
        {"hello world", 11},
        {"Paimon中文社区", 10},
        {"中 文 社 区", 7},
        {"¡", 1},     // 2 bytes char
        {"ку", 2},    // 2 * 2 bytes chars
        {"︽﹋%", 3},  // 3 * 3 bytes chars
        // {"\uD83E\uDD19", 1},  // 4 bytes char
    };
    for (const auto& [text, expected_chars] : samples) {
        CheckBasic(text, expected_chars);
    }
}
// The canonical empty instance must equal an empty string built from a literal or from an
// empty byte buffer, and must report zero characters and zero bytes.
TEST_F(BinaryStringTest, EmptyStringTest) {
    // Built from an empty literal.
    InnerCheckEqual(FromString(""), BinaryString::EmptyUtf8());

    // Built from bytes allocated out of an empty std::string.
    auto memory_pool = GetDefaultPool();
    std::string no_content;
    std::shared_ptr<Bytes> empty_bytes = Bytes::AllocateBytes(no_content, memory_pool.get());
    InnerCheckEqual(BinaryString::FromBytes(empty_bytes), BinaryString::EmptyUtf8());

    // Size accessors of the canonical empty instance.
    InnerCheckEqual(BinaryString::EmptyUtf8().NumChars(), 0);
    InnerCheckEqual(BinaryString::EmptyUtf8().GetSizeInBytes(), 0);
}
// Checks the sign (or zero) returned by BinaryString::CompareTo for equal strings, prefixes,
// upper/lower-case differences and multi-byte UTF-8 content.
TEST_F(BinaryStringTest, TestCompareTo) {
    auto pool = GetDefaultPool();

    // The expected result 0 requires both sides to have the same length, so the blanks are
    // built with the fill constructor to keep the count explicit.
    // NOTE(review): assumes BlankString(n, pool) yields n space characters — confirm.
    const std::string three_blanks(3, ' ');
    InnerCheckEqual(
        FromString(three_blanks).CompareTo(BinaryString::BlankString(3, pool.get())), 0);

    // Empty string and prefix relationships.
    InnerCheck(FromString("").CompareTo(FromString("a")) < 0);
    InnerCheck(FromString("abc0").CompareTo(FromString("abc")) > 0);

    // Case differences at the start, in the middle and at the end of the string.
    InnerCheck(FromString("abc").CompareTo(FromString("ABC")) > 0);
    InnerCheckEqual(FromString("abcabcabc").CompareTo(FromString("abcabcabc")), 0);
    InnerCheck(FromString("aBcabcabc").CompareTo(FromString("Abcabcabc")) > 0);
    InnerCheck(FromString("Abcabcabc").CompareTo(FromString("abcabcabC")) < 0);
    InnerCheck(FromString("abcabcabc").CompareTo(FromString("abcabcabC")) > 0);

    // Multi-byte UTF-8 content, alone and mixed with ASCII.
    InnerCheck(FromString("abc").CompareTo(FromString("世界")) < 0);
    InnerCheck(FromString("你好").CompareTo(FromString("世界")) > 0);
    InnerCheck(FromString("你好123").CompareTo(FromString("你好122")) > 0);
}
// Exercises BinaryString views created with FromAddress over single memory segments:
// full-buffer views, equal-length slices, unequal slices and a slice at a non-zero offset.
TEST_F(BinaryStringTest, TestSingleSegment) {
    auto memory_pool = GetDefaultPool();

    // Backing buffers, each wrapped by its own segment.
    std::shared_ptr<Bytes> long_bytes = Bytes::AllocateBytes("aaaaaabcde", memory_pool.get());
    MemorySegment long_seg = MemorySegment::Wrap(long_bytes);
    std::shared_ptr<Bytes> short_bytes = Bytes::AllocateBytes("abcdeb", memory_pool.get());
    MemorySegment short_seg = MemorySegment::Wrap(short_bytes);

    // Views over the complete buffers; the contents first differ at the second byte.
    BinaryString lhs = BinaryString::FromAddress(long_seg, 0, 10);
    BinaryString rhs = BinaryString::FromAddress(short_seg, 0, 6);
    InnerCheckEqual(lhs.ToString(), "aaaaaabcde");
    InnerCheckEqual(rhs.ToString(), "abcdeb");
    InnerCheckEqual(lhs.CompareTo(rhs), -1);
    InnerCheckEqual(lhs, lhs);
    InnerCheck(lhs < rhs);

    // Slices of equal length and equal content taken at different offsets.
    lhs = BinaryString::FromAddress(long_seg, 5, 5);
    rhs = BinaryString::FromAddress(short_seg, 0, 5);
    InnerCheckEqual(lhs.ToString(), "abcde");
    InnerCheckEqual(rhs.ToString(), "abcde");
    InnerCheckEqual(lhs, rhs);

    // Slices of equal length but different content; the comparison is checked both ways.
    lhs = BinaryString::FromAddress(long_seg, 0, 5);
    rhs = BinaryString::FromAddress(short_seg, 0, 5);
    InnerCheckEqual(lhs.ToString(), "aaaaa");
    InnerCheckEqual(rhs.ToString(), "abcde");
    InnerCheckEqual(lhs.CompareTo(rhs), -1);
    InnerCheckEqual(rhs.CompareTo(lhs), 1);

    // The right-hand string now lives at offset 4 of a 10-byte segment.
    std::shared_ptr<Bytes> padded_bytes = Bytes::AllocateBytes(10, memory_pool.get());
    MemorySegment padded_seg = MemorySegment::Wrap(padded_bytes);
    padded_seg.Put(4, Bytes("abcdeb", memory_pool.get()), 0, 6);
    rhs = BinaryString::FromAddress(padded_seg, 4, 6);
    InnerCheckEqual(rhs.ToString(), "abcdeb");
    InnerCheckEqual(lhs.CompareTo(rhs), -1);
    InnerCheckEqual(rhs.CompareTo(lhs), 1);
}
// Contains(): positive and negative cases for ASCII and multi-byte content, including a
// needle longer than the haystack.
TEST_F(BinaryStringTest, TestContains) {
    // Shorthand: does `text` contain `part`?
    const auto contains = [this](const std::string& text, const std::string& part) {
        return FromString(text).Contains(FromString(part));
    };

    InnerCheck(BinaryString::EmptyUtf8().Contains(BinaryString::EmptyUtf8()));

    // ASCII
    InnerCheck(contains("hello", "ello"));
    InnerCheckFalse(contains("hello", "vello"));
    InnerCheckFalse(contains("hello", "hellooo"));

    // Multi-byte
    InnerCheck(contains("大千世界", "千世界"));
    InnerCheckFalse(contains("大千世界", "世千"));
    InnerCheckFalse(contains("大千世界", "大千世界好"));
}
// StartsWith(): a true prefix, a non-prefix substring and an over-long candidate, for ASCII
// and multi-byte content.
TEST_F(BinaryStringTest, TestStartsWith) {
    // Shorthand: does `text` start with `prefix`?
    const auto starts_with = [this](const std::string& text, const std::string& prefix) {
        return FromString(text).StartsWith(FromString(prefix));
    };

    InnerCheck(BinaryString::EmptyUtf8().StartsWith(BinaryString::EmptyUtf8()));

    // ASCII
    InnerCheck(starts_with("hello", "hell"));
    InnerCheckFalse(starts_with("hello", "ell"));
    InnerCheckFalse(starts_with("hello", "hellooo"));

    // Multi-byte
    InnerCheck(starts_with("数据砖头", "数据"));
    InnerCheckFalse(starts_with("大千世界", "千"));
    InnerCheckFalse(starts_with("大千世界", "大千世界好"));
}
// EndsWith(): a true suffix, a non-suffix candidate and an over-long candidate, for ASCII
// and multi-byte content.
TEST_F(BinaryStringTest, TestEndsWith) {
    // Shorthand: does `text` end with `suffix`?
    const auto ends_with = [this](const std::string& text, const std::string& suffix) {
        return FromString(text).EndsWith(FromString(suffix));
    };

    InnerCheck(BinaryString::EmptyUtf8().EndsWith(BinaryString::EmptyUtf8()));

    // ASCII
    InnerCheck(ends_with("hello", "ello"));
    InnerCheckFalse(ends_with("hello", "ellov"));
    InnerCheckFalse(ends_with("hello", "hhhello"));

    // Multi-byte
    InnerCheck(ends_with("大千世界", "世界"));
    InnerCheckFalse(ends_with("大千世界", "世"));
    InnerCheckFalse(ends_with("数据砖头", "我的数据砖头"));
}
// Substring(begin, end, pool): the expectations below treat both indices as character
// positions, with `end` excluded.
TEST_F(BinaryStringTest, TestSubstring) {
    auto memory_pool = GetDefaultPool();

    // ASCII source.
    BinaryString ascii = FromString("hello");
    InnerCheckEqual(ascii.Substring(0, 0, memory_pool.get()), BinaryString::EmptyUtf8());
    InnerCheckEqual(ascii.Substring(1, 3, memory_pool.get()), FromString("el"));

    // Multi-byte source; the last case passes an end index beyond the final character.
    BinaryString wide = FromString("数据砖头");
    InnerCheckEqual(wide.Substring(0, 1, memory_pool.get()), FromString("数"));
    InnerCheckEqual(wide.Substring(1, 3, memory_pool.get()), FromString("据砖"));
    InnerCheckEqual(wide.Substring(3, 5, memory_pool.get()), FromString("头"));

    // Source mixing characters of different encoded widths.
    BinaryString mixed = FromString("ߵ梷");
    InnerCheckEqual(mixed.Substring(0, 2, memory_pool.get()), FromString("ߵ梷"));
}
// Contrasts Substring, whose end index is exclusive, with CopyBinaryString, whose end index
// is inclusive, on the same source string.
TEST_F(BinaryStringTest, TestSubStringAndCopyBinaryString) {
    auto memory_pool = GetDefaultPool();
    std::string text = "hello world!nice to meet you!";
    std::shared_ptr<Bytes> text_bytes = Bytes::AllocateBytes(text, memory_pool.get());
    MemorySegment segment = MemorySegment::Wrap(text_bytes);
    BinaryString source = BinaryString(segment, 0, text.size());

    int32_t left = 6;
    int32_t right = 20;

    // Substring covers [left, right): `right` itself is excluded.
    BinaryString expected_exclusive = FromString(text.substr(left, right - left));
    InnerCheckEqual(source.Substring(left, right, memory_pool.get()), expected_exclusive);

    // CopyBinaryString covers [left, right]: `right` is included, hence one more character.
    BinaryString expected_inclusive = FromString(text.substr(left, right - left + 1));
    InnerCheckEqual(source.CopyBinaryString(left, right, memory_pool.get()),
                    expected_inclusive);

    InnerCheckEqual(source.CopyBinaryString(0, 11, memory_pool.get()),
                    FromString("hello world!"));
}
// IndexOf(pattern, from): the expected values are character positions, with -1 when the
// pattern is not found at or after `from`.
TEST_F(BinaryStringTest, TestIndexOf) {
    // Shorthand: position of `pattern` inside `text`, searching from `from`.
    const auto index_of = [this](const std::string& text, const std::string& pattern,
                                 int32_t from) {
        return FromString(text).IndexOf(FromString(pattern), from);
    };

    // Empty haystack and/or empty pattern.
    InnerCheckEqual(BinaryString::EmptyUtf8().IndexOf(BinaryString::EmptyUtf8(), 0), 0);
    InnerCheckEqual(BinaryString::EmptyUtf8().IndexOf(FromString("l"), 0), -1);
    InnerCheckEqual(FromString("hello").IndexOf(BinaryString::EmptyUtf8(), 0), 0);

    // ASCII haystack.
    InnerCheckEqual(index_of("hello", "l", 0), 2);
    InnerCheckEqual(index_of("hello", "l", 3), 3);
    InnerCheckEqual(index_of("hello", "a", 0), -1);
    InnerCheckEqual(index_of("hello", "ll", 0), 2);
    InnerCheckEqual(index_of("hello", "ll", 4), -1);

    // Multi-byte haystack.
    InnerCheckEqual(index_of("数据砖头", "据砖", 0), 1);
    InnerCheckEqual(index_of("数据砖头", "数", 3), -1);
    InnerCheckEqual(index_of("数据砖头", "数", 0), 0);
    InnerCheckEqual(index_of("数据砖头", "头", 0), 3);

    // Longer sentence addressed through a memory segment instead of FromString.
    {
        auto memory_pool = GetDefaultPool();
        std::string sentence = "Strive not to be a success, but rather to be of value.";
        auto sentence_bytes = std::make_shared<Bytes>(sentence, memory_pool.get());
        MemorySegment segment = MemorySegment::Wrap(sentence_bytes);
        auto haystack = BinaryString::FromAddress(segment, /*offset=*/0,
                                                  /*num_bytes=*/sentence.length());
        InnerCheckEqual(sentence, haystack.ToString());

        BinaryString value_word = FromString("value");
        InnerCheckEqual(haystack.IndexOf(value_word, 0), 48);
        InnerCheckEqual(haystack.IndexOf(value_word, 5), 48);
        InnerCheckEqual(haystack.IndexOf(FromString("vvalue"), 0), -1);
        InnerCheckEqual(haystack.IndexOf(FromString("!"), 0), -1);
    }
}
// ToLowerCase / ToUpperCase: content without letter case (CJK text, symbols, the empty
// string) must come back unchanged, while ASCII letters are converted.
TEST_F(BinaryStringTest, TestToUpperLowerCase) {
    auto pool = GetDefaultPool();

    // CJK text has no case.
    InnerCheckEqual(FromString("我是中国人").ToLowerCase(pool.get()), FromString("我是中国人"));
    InnerCheckEqual(FromString("我是中国人").ToUpperCase(pool.get()), FromString("我是中国人"));

    // Mixed-case ASCII letters.
    InnerCheckEqual(FromString("aBcDeFg").ToLowerCase(pool.get()), FromString("abcdefg"));
    InnerCheckEqual(FromString("aBcDeFg").ToUpperCase(pool.get()), FromString("ABCDEFG"));

    // ASCII symbols have no case. The upper-case direction is checked as well; previously
    // the lower-case assertion was simply repeated.
    InnerCheckEqual(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*"));
    InnerCheckEqual(FromString("!@#$%^*").ToUpperCase(pool.get()), FromString("!@#$%^*"));

    // The empty string in both directions.
    InnerCheckEqual(BinaryString::EmptyUtf8().ToUpperCase(pool.get()), BinaryString::EmptyUtf8());
    InnerCheckEqual(BinaryString::EmptyUtf8().ToLowerCase(pool.get()), BinaryString::EmptyUtf8());
}
// Compares the canonical empty string against a non-empty string and against a zero-length
// view placed in the middle of a segment.
TEST_F(BinaryStringTest, TestEmptyString) {
    BinaryString non_empty = FromString("hahahahah");
    BinaryString zero_length_view;
    auto memory_pool = GetDefaultPool();
    {
        // The buffer and segment handles are scoped to this block; the view is used after it.
        std::shared_ptr<Bytes> backing = Bytes::AllocateBytes(10, memory_pool.get());
        MemorySegment backing_seg = MemorySegment::Wrap(backing);
        zero_length_view = BinaryString::FromAddress(backing_seg, /*offset=*/5, /*num_bytes=*/0);
    }

    // Ordering relative to a non-empty string.
    InnerCheck(BinaryString::EmptyUtf8().CompareTo(non_empty) < 0);
    InnerCheck(non_empty.CompareTo(BinaryString::EmptyUtf8()) > 0);

    // A zero-length view compares as equal to the canonical empty string, in both directions.
    InnerCheckEqual(BinaryString::EmptyUtf8().CompareTo(zero_length_view), 0);
    InnerCheckEqual(zero_length_view.CompareTo(BinaryString::EmptyUtf8()), 0);

    // operator== agrees with CompareTo.
    InnerCheckFalse(non_empty == BinaryString::EmptyUtf8());
    InnerCheckFalse(BinaryString::EmptyUtf8() == non_empty);
    InnerCheckEqual(zero_length_view, BinaryString::EmptyUtf8());
    InnerCheckEqual(BinaryString::EmptyUtf8(), zero_length_view);
}
// A one-byte buffer holding a byte that cannot start a well-formed UTF-8 sequence on its own
// must still be counted as exactly one character.
TEST_F(BinaryStringTest, TestSkipWrongFirstByte) {
    auto memory_pool = GetDefaultPool();
    std::vector<int32_t> lead_bytes = {
        // Skip Continuation bytes
        0x80, 0x9F, 0xBF,
        // 0xC0..0xC1 - disallowed in UTF-8
        // NOTE(review): the second value is 0xC2, which lies outside the range named above —
        // confirm whether 0xC1 was intended.
        0xC0, 0xC2,
        // 0xF5..0xFF - disallowed in UTF-8
        0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF};

    // One single-byte buffer is reused; its only byte is overwritten for every candidate.
    std::shared_ptr<Bytes> single_byte = Bytes::AllocateBytes(1, memory_pool.get());
    for (int32_t lead_byte : lead_bytes) {
        (*single_byte)[0] = static_cast<char>(lead_byte);
        InnerCheckEqual(1, BinaryString::FromBytes(single_byte).NumChars());
    }
}
// FromBytes(bytes, offset, length) over the whole buffer must equal FromString on the same
// text.
TEST_F(BinaryStringTest, TestFromBytes) {
    auto memory_pool = GetDefaultPool();
    std::string text = "hahahe";
    std::shared_ptr<Bytes> raw = Bytes::AllocateBytes(text, memory_pool.get());
    InnerCheck(BinaryString::FromBytes(raw, 0, 6) ==
               BinaryString::FromString(text, memory_pool.get()));
}
// Copy(pool) must produce a string equal to its source and readable byte by byte.
TEST_F(BinaryStringTest, TestCopy) {
    auto memory_pool = GetDefaultPool();
    std::string text = "hahahe";
    std::shared_ptr<Bytes> raw = Bytes::AllocateBytes(text, memory_pool.get());

    BinaryString source = BinaryString::FromBytes(raw, 0, 6);
    BinaryString duplicate = source.Copy(memory_pool.get());

    InnerCheckEqual(source, duplicate);
    // Third byte of "hahahe".
    InnerCheckEqual(duplicate.ByteAt(2), 'h');
}
// ByteAt(i) is expected to index relative to the start of the view, not of the segment.
TEST_F(BinaryStringTest, TestByteAt) {
    auto memory_pool = GetDefaultPool();
    std::string text = "helloworld!";
    auto text_bytes = std::make_shared<Bytes>(text, memory_pool.get());
    MemorySegment segment = MemorySegment::Wrap(text_bytes);

    // Skip the first two bytes, so the view reads "lloworld!".
    auto view = BinaryString::FromAddress(segment, /*offset=*/2,
                                          /*num_bytes=*/text.length() - 2);
    InnerCheckEqual(view.ByteAt(0), 'l');
    InnerCheckEqual(view.ByteAt(5), 'r');
}
// NumChars() on pure-ASCII views created through FromAddress equals the number of bytes.
TEST_F(BinaryStringTest, TestNumChars) {
    auto memory_pool = GetDefaultPool();

    // Wraps `text` in a fresh segment and returns the character count of a view spanning the
    // first `num_bytes` bytes.
    const auto count_chars = [&memory_pool](const std::string& text, int32_t num_bytes) {
        auto text_bytes = std::make_shared<Bytes>(text, memory_pool.get());
        MemorySegment segment = MemorySegment::Wrap(text_bytes);
        auto view = BinaryString::FromAddress(segment, /*offset=*/0, num_bytes);
        return view.NumChars();
    };

    InnerCheckEqual(5, count_chars("hello", 5));
    InnerCheckEqual(10, count_chars("helloworld", 10));
}
// MatchAt(pattern, pos): "bc" matches at position 1 but not at position 0, whether the
// haystack ends right after the match ("abc") or continues ("abcdef").
TEST_F(BinaryStringTest, TestMatchAt) {
    auto memory_pool = GetDefaultPool();
    const std::vector<std::string> haystacks = {"abc", "abcdef"};

    for (const std::string& haystack_text : haystacks) {
        // Haystack view over its own segment.
        std::shared_ptr<Bytes> haystack_bytes =
            Bytes::AllocateBytes(haystack_text, memory_pool.get());
        MemorySegment haystack_seg = MemorySegment::Wrap(haystack_bytes);
        auto haystack = BinaryString::FromAddress(haystack_seg, /*offset=*/0,
                                                  /*num_bytes=*/haystack_text.size());

        // Pattern "bc", built freshly for every haystack.
        std::shared_ptr<Bytes> pattern_bytes = Bytes::AllocateBytes("bc", memory_pool.get());
        MemorySegment pattern_seg = MemorySegment::Wrap(pattern_bytes);
        auto pattern = BinaryString::FromAddress(pattern_seg, /*offset=*/0,
                                                 /*num_bytes=*/2);

        InnerCheck(haystack.MatchAt(pattern, /*pos=*/1));
        InnerCheckFalse(haystack.MatchAt(pattern, /*pos=*/0));
    }
}
} // namespace paimon::test