blob: 6e2c6b07b43cd01cf8020d5ff2ef9adde7bb182f [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "testutil/gtest-util.h"
#include "kudu/util/random.h"
#include "kudu/util/random_util.h"
#include "kudu/util/test_util.h"
#include "util/string-util.h"
#include "common/names.h"
namespace impala {
enum Truncation {
DOWN,
UP
};
void EvalTruncation(const string& original, const string& expected_result,
int32_t max_length, Truncation boundary) {
string result;
if (boundary == DOWN) {
ASSERT_OK(TruncateDown(original, max_length, &result));
} else {
ASSERT_OK(TruncateUp(original, max_length, &result));
}
EXPECT_EQ(expected_result, result);
}
TEST(TruncateDownTest, Basic) {
EvalTruncation("0123456789", "0123456789", 100, DOWN);
EvalTruncation("0123456789", "0123456789", 10, DOWN);
EvalTruncation("0123456789", "01234", 5, DOWN);
EvalTruncation("0123456789", "", 0, DOWN);
EvalTruncation("", "", 10, DOWN);
EvalTruncation(string("\0\0\0", 3), string("\0\0", 2), 2, DOWN);
EvalTruncation("asdfghjkl", "asdf", 4, DOWN);
char a[] = {'a', CHAR_MAX, CHAR_MIN, 'b', '\0'};
char b[] = {'a', CHAR_MAX, '\0'};
EvalTruncation(a, b, 2, DOWN);
}
TEST(TruncateUpTest, Basic) {
EvalTruncation("0123456789", "0123456789", 100, UP);
EvalTruncation("abcdefghij", "abcdefghij", 10, UP);
EvalTruncation("abcdefghij", "abcdefghj", 9, UP);
EvalTruncation("abcdefghij", "abcdf", 5, UP);
string max_string(100, 0xFF);
EvalTruncation(max_string, max_string, 100, UP);
string normal_plus_max = "abcdef" + max_string;
EvalTruncation(normal_plus_max, normal_plus_max, 200, UP);
EvalTruncation(normal_plus_max, "abcdeg", 10, UP);
string result;
Status s = TruncateUp(max_string, 10, &result);
EXPECT_EQ(s.GetDetail(), "TruncateUp() couldn't increase string.\n");
EvalTruncation("", "", 10, UP);
EvalTruncation(string("\0\0\0", 3), string("\0\001", 2), 2, UP);
EvalTruncation("asdfghjkl", "asdg", 4, UP);
char a[] = {0, (char)0x7F, (char)0xFF, 0};
char b[] = {0, (char)0x80, 0};
EvalTruncation(a, b, 2, UP);
}
TEST(CommaSeparatedContainsTest, Basic) {
// Basic tests with string present.
EXPECT_TRUE(CommaSeparatedContains("LZO", "LZO"));
EXPECT_TRUE(CommaSeparatedContains("foo,LZO", "LZO"));
EXPECT_TRUE(CommaSeparatedContains("LZO,bar", "LZO"));
EXPECT_TRUE(CommaSeparatedContains("foo,LZO,bar", "LZO"));
// Handles zero-length entries.
EXPECT_FALSE(CommaSeparatedContains("", "LZO"));
EXPECT_FALSE(CommaSeparatedContains(",", "LZO"));
EXPECT_FALSE(CommaSeparatedContains(",,", "LZO"));
EXPECT_TRUE(CommaSeparatedContains("foo,LZO,", "LZO"));
EXPECT_TRUE(CommaSeparatedContains(",foo,LZO,", "LZO"));
EXPECT_TRUE(CommaSeparatedContains(",foo,,LZO,", "LZO"));
// Basic tests with string absent.
EXPECT_FALSE(CommaSeparatedContains("foo,bar", "LZO"));
EXPECT_FALSE(CommaSeparatedContains("foo", "LZO"));
EXPECT_FALSE(CommaSeparatedContains("foo,", "LZO"));
EXPECT_FALSE(CommaSeparatedContains("foo,bar,baz", "LZO"));
EXPECT_FALSE(CommaSeparatedContains(",foo,LzO,", "LZO"));
// Pattern is longer than token.
EXPECT_FALSE(CommaSeparatedContains(",foo,LzO,", "ZZZZZ"));
// Pattern is longer than string.
EXPECT_FALSE(CommaSeparatedContains("foo", "ZZZZZ"));
// Whitespace is included in tokens alone.
EXPECT_FALSE(CommaSeparatedContains("foo , foo, foo,\nfoo,\tfoo", "foo"));
}
TEST(FindUtf8PosForwardTest, Basic) {
// Each Chinese character is encoded into 3 bytes in UTF-8.
EXPECT_EQ(0, FindUtf8PosForward("李小龙", 9, 0));
EXPECT_EQ(3, FindUtf8PosForward("李小龙", 9, 1));
EXPECT_EQ(6, FindUtf8PosForward("李小龙", 9, 2));
EXPECT_EQ(9, FindUtf8PosForward("李小龙", 9, 3));
EXPECT_EQ(9, FindUtf8PosForward("李小龙", 9, 4));
EXPECT_EQ(10, FindUtf8PosForward("李小龙Bruce Lee", 18, 4));
EXPECT_EQ(11, FindUtf8PosForward("李小龙Bruce Lee", 18, 5));
EXPECT_EQ(18, FindUtf8PosForward("李小龙Bruce Lee", 18, 50));
// Test with a combination of UTF8 characters in 1, 2, 3 and 4 bytes.
// 'Б', 'и' and 'ö' are encoded into 2 bytes. '和' is 3 bytes. '🙂' is 4 bytes.
int byte_lens[] = {
1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, // byte lengths for "hello Бopиc "
3, 1, 1, 2, 1, 1, 1, 1, // byte lengths for "和 Jörg! "
4, 1, 1, 1 // byte lengths for "🙂 Hi"
};
const char test_str[] = "Hello Бopиc 和 Jörg! 🙂 Hi";
// Ignore tailing '\0'
int total_byte_len = sizeof(test_str) / sizeof(char) - 1;
int total_chars = sizeof(byte_lens) / sizeof(int);
int pos = 0;
for (int i = 0; i < total_chars; ++i) {
EXPECT_EQ(pos, FindUtf8PosForward(test_str, total_byte_len, i));
pos += byte_lens[i];
}
EXPECT_EQ(total_byte_len, pos);
EXPECT_EQ(total_byte_len, FindUtf8PosForward(test_str, total_byte_len, total_chars));
// \x93 is a non-ascii byte and not the start byte of any UTF-8 characters. It will be
// treated as a malformed character and counted as one.
EXPECT_EQ(9, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 3));
EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 4));
EXPECT_EQ(11, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 5));
EXPECT_EQ(12, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 6));
EXPECT_EQ(13, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 7));
// Here we just need 4 characters, i.e. "李小龙 " in byte length 10. Set 'str_len'
// to 10 to make sure the remaining bytes won't be counted.
EXPECT_EQ(9, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 3));
EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 4));
EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 5));
EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 6));
// More cases on malformed UTF-8.
// \xc3 is the start byte of a 2-bytes UTF-8 character.
// Make sure we won't get overflow index in results
EXPECT_EQ(0, FindUtf8PosForward("李小龙\xc3", 10, 0));
EXPECT_EQ(3, FindUtf8PosForward("李小龙\xc3", 10, 1));
EXPECT_EQ(6, FindUtf8PosForward("李小龙\xc3", 10, 2));
EXPECT_EQ(9, FindUtf8PosForward("李小龙\xc3", 10, 3));
EXPECT_EQ(10, FindUtf8PosForward("李小龙\xc3", 10, 4));
// Test cases for \xc3 in the middle, i.e. "李\xc3小龙".
// In UTF-8, "小" encodes to [\xe5\xb0\x8f], and "龙" encodes to [\xe9\xbe\x99].
// \xc3 is the start byte of a 2-bytes UTF-8 character. So "\xc3\xe5" is counted as
// one character. "\xb0" and "\x8f" is treated as two malformed characters. "龙" is
// still treated as one character since it's not messed up.
// This may be inconsistent with Hive. We just make sure it won't crash to process,
// and will deal with this in IMPALA-10761.
EXPECT_EQ(0, FindUtf8PosForward("李\xc3小龙", 10, 0));
EXPECT_EQ(3, FindUtf8PosForward("李\xc3小龙", 10, 1));
EXPECT_EQ(5, FindUtf8PosForward("李\xc3小龙", 10, 2));
EXPECT_EQ(6, FindUtf8PosForward("李\xc3小龙", 10, 3));
EXPECT_EQ(7, FindUtf8PosForward("李\xc3小龙", 10, 4));
EXPECT_EQ(10, FindUtf8PosForward("李\xc3小龙", 10, 5));
}
TEST(FindUtf8PosBackwardTest, Basic) {
// Each Chinese character is encoded into 3 bytes in UTF-8.
EXPECT_EQ(6, FindUtf8PosBackward("李小龙", 9, 0));
EXPECT_EQ(3, FindUtf8PosBackward("李小龙", 9, 1));
EXPECT_EQ(0, FindUtf8PosBackward("李小龙", 9, 2));
EXPECT_EQ(-1, FindUtf8PosBackward("李小龙", 9, 17));
EXPECT_EQ(17, FindUtf8PosBackward("李小龙Bruce Lee", 18, 0));
EXPECT_EQ(10, FindUtf8PosBackward("李小龙Bruce Lee", 18, 7));
EXPECT_EQ(9, FindUtf8PosBackward("李小龙Bruce Lee", 18, 8));
EXPECT_EQ(6, FindUtf8PosBackward("李小龙Bruce Lee", 18, 9));
EXPECT_EQ(3, FindUtf8PosBackward("李小龙Bruce Lee", 18, 10));
EXPECT_EQ(0, FindUtf8PosBackward("李小龙Bruce Lee", 18, 11));
EXPECT_EQ(0, FindUtf8PosBackward("hello李小龙Bruce Lee", 23, 16));
EXPECT_EQ(-1, FindUtf8PosBackward("hello李小龙Bruce Lee", 23, 50));
// Test with a combination of UTF8 characters in 1, 2, 3 and 4 bytes.
// 'Б', 'и' and 'ö' are encoded into 2 bytes. '和' is 3 bytes. '🙂' is 4 bytes.
int byte_lens[] = {
1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, // byte lengths for "hello Бopиc "
3, 1, 1, 2, 1, 1, 1, 1, // byte lengths for "和 Jörg! "
4, 1, 1, 1 // byte lengths for "🙂 Hi"
};
const char test_str[] = "Hello Бopиc 和 Jörg! 🙂 Hi";
// Ignore tailing '\0'
int total_byte_len = sizeof(test_str) / sizeof(char) - 1;
int total_chars = sizeof(byte_lens) / sizeof(int);
int pos = total_byte_len;
for (int i = 0; i < total_chars; ++i) {
pos -= byte_lens[total_chars - i - 1];
EXPECT_EQ(pos, FindUtf8PosBackward(test_str, total_byte_len, i));
}
EXPECT_EQ(0, pos);
EXPECT_EQ(-1, FindUtf8PosBackward(test_str, total_byte_len, total_chars));
// \x93 is a non-ascii byte and not the start byte of any UTF-8 characters. It will be
// treated as a malformed character and counted as one.
EXPECT_EQ(12, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 0));
EXPECT_EQ(11, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 1));
EXPECT_EQ(10, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 2));
EXPECT_EQ(9, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 3));
EXPECT_EQ(6, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 4));
// Here we just need 4 characters, i.e. "李小龙 " in byte length 10. Set 'str_len'
// to 10 to make sure the remaining bytes won't be counted.
EXPECT_EQ(9, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 0));
EXPECT_EQ(6, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 1));
EXPECT_EQ(3, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 2));
EXPECT_EQ(0, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 3));
EXPECT_EQ(-1, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 4));
// Test malformed UTF-8 bytes at the beginning.
EXPECT_EQ(1, FindUtf8PosBackward("\x93\x93李小龙", 11, 3));
EXPECT_EQ(0, FindUtf8PosBackward("\x93\x93李小龙", 11, 4));
EXPECT_EQ(-1, FindUtf8PosBackward("\x93\x93李小龙", 11, 5));
// More cases on malformed UTF-8.
// \xc3 is the start byte of a 2-bytes UTF-8 character.
EXPECT_EQ(9, FindUtf8PosBackward("李小龙\xc3", 10, 0));
EXPECT_EQ(6, FindUtf8PosBackward("李小龙\xc3", 10, 1));
EXPECT_EQ(3, FindUtf8PosBackward("李小龙\xc3", 10, 2));
EXPECT_EQ(0, FindUtf8PosBackward("李小龙\xc3", 10, 3));
EXPECT_EQ(-1, FindUtf8PosBackward("李小龙\xc3", 10, 4));
// Test cases for \xc3 in the middle, i.e. "李\xc3小龙".
EXPECT_EQ(7, FindUtf8PosBackward("李\xc3小龙", 10, 0));
EXPECT_EQ(4, FindUtf8PosBackward("李\xc3小龙", 10, 1));
EXPECT_EQ(3, FindUtf8PosBackward("李\xc3小龙", 10, 2));
EXPECT_EQ(0, FindUtf8PosBackward("李\xc3小龙", 10, 3));
EXPECT_EQ(-1, FindUtf8PosBackward("李\xc3小龙", 10, 4));
}
static const int MAX_LEN = 100;
static const int NUM_TRIALS = 100;
TEST(RandomFindUtf8PosTest, Basic) {
// Fuzz test to make sure we won't crash or return overflow index when processing
// malformed UTF-8 characters. It takes ~5s.
char buffer[MAX_LEN];
kudu::Random rng(kudu::SeedRandom());
for (int i = 0; i < NUM_TRIALS; ++i) {
RandomString(buffer, MAX_LEN, &rng);
// Use the random string 100 times.
for (int j = 0; j < 100; ++j) {
int len = rng.Uniform(MAX_LEN + 1);
int index = rng.Uniform(len + 1);
int pos = FindUtf8PosForward(buffer, len, index);
EXPECT_GE(pos, -1);
EXPECT_LE(pos, len);
pos = FindUtf8PosBackward(buffer, len, index);
EXPECT_GE(pos, -1);
EXPECT_LE(pos, len);
}
}
}
// StringStreamPopTest: These tests assert the functionality of the StringStreamPop class.
// Assert the most common use case where the last character is popped and a new character
// is written to the stream.
TEST(StringStreamPopTest, NotEmptyPopOnce) {
StringStreamPop fixture;
fixture << "this is a tes,";
fixture.move_back();
fixture << "t";
EXPECT_EQ("this is a test", fixture.str());
}
// Asssert where the stream only contains a single character that is popped before another
// character is written to the stream.
TEST(StringStreamPopTest, OneCharPop) {
StringStreamPop fixture;
fixture << "t";
fixture.move_back();
fixture << "v";
EXPECT_EQ("v", fixture.str());
}
// Assert where the last two characters of a non-empty stream are popped.
TEST(StringStreamPopTest, NotEmptyPopTwice) {
StringStreamPop fixture;
fixture << "this is a second te,,";
fixture.move_back();
fixture.move_back();
fixture << "st";
EXPECT_EQ("this is a second test", fixture.str());
}
// Assert where an empty stream has it's last (nonexistant) character popped.
TEST(StringStreamPopTest, EmptyPopOnce) {
StringStreamPop fixture;
fixture.move_back();
EXPECT_TRUE(fixture.str().empty());
}
// Assert where an empty stream has it's last (nonexistant) character popped twice.
TEST(StringStreamPopTest, EmptyPopTwice) {
StringStreamPop fixture;
fixture.move_back();
fixture.move_back();
EXPECT_TRUE(fixture.str().empty());
}
// Assert the move_back functionality does not actually remove the character.
TEST(StringStreamPopTest, PopOnceBeforeAppend) {
StringStreamPop fixture;
fixture.move_back();
fixture << "a";
fixture.move_back();
// This assertion is correct because the move_back() function only moves the write
// pointer, it does not modify the internal buffer.
EXPECT_EQ("a", fixture.str());
}
// Assert the StringStreamPop class behavior matches the behavior of the stringstream
// class.
TEST(StringStreamPopTest, CompareWithStringstream) {
StringStreamPop fixture;
stringstream expected;
expected << "C++ is" << " an " << "invisible found" << "ation of " << "everything!";
fixture << "C++ is" << " an " << "invisible found" << "ation of " << "everything?";
fixture.move_back();
fixture << '!';
EXPECT_EQ(expected.str(), fixture.str());
}
}