be/src/util/string-util-test.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "testutil/gtest-util.h"

 #include "kudu/util/random.h"
 #include "kudu/util/random_util.h"
 #include "kudu/util/test_util.h"
 #include "util/string-util.h"

 #include "common/names.h"

 namespace impala {

 enum Truncation {
   DOWN,
   UP
 };

 void EvalTruncation(const string& original, const string& expected_result,
     int32_t max_length, Truncation boundary) {
   string result;
   if (boundary == DOWN) {
     ASSERT_OK(TruncateDown(original, max_length, &result));
   } else {
     ASSERT_OK(TruncateUp(original, max_length, &result));
   }
   EXPECT_EQ(expected_result, result);
 }

 TEST(TruncateDownTest, Basic) {
   EvalTruncation("0123456789", "0123456789", 100, DOWN);
   EvalTruncation("0123456789", "0123456789", 10, DOWN);
   EvalTruncation("0123456789", "01234", 5, DOWN);
   EvalTruncation("0123456789", "", 0, DOWN);
   EvalTruncation("", "", 10, DOWN);
   EvalTruncation(string("\0\0\0", 3), string("\0\0", 2), 2, DOWN);
   EvalTruncation("asdfghjkl", "asdf", 4, DOWN);
   char a[] = {'a', CHAR_MAX, CHAR_MIN, 'b', '\0'};
   char b[] = {'a', CHAR_MAX, '\0'};
   EvalTruncation(a, b, 2, DOWN);
 }

 TEST(TruncateUpTest, Basic) {
   EvalTruncation("0123456789", "0123456789", 100, UP);
   EvalTruncation("abcdefghij", "abcdefghij", 10, UP);
   EvalTruncation("abcdefghij", "abcdefghj", 9, UP);
   EvalTruncation("abcdefghij", "abcdf", 5, UP);

   string max_string(100, 0xFF);
   EvalTruncation(max_string, max_string, 100, UP);

   string normal_plus_max = "abcdef" + max_string;
   EvalTruncation(normal_plus_max, normal_plus_max, 200, UP);
   EvalTruncation(normal_plus_max, "abcdeg", 10, UP);

   string result;
   Status s = TruncateUp(max_string, 10, &result);
   EXPECT_EQ(s.GetDetail(), "TruncateUp() couldn't increase string.\n");

   EvalTruncation("", "", 10, UP);
   EvalTruncation(string("\0\0\0", 3), string("\0\001", 2), 2, UP);
   EvalTruncation("asdfghjkl", "asdg", 4, UP);
   char a[] = {0, (char)0x7F, (char)0xFF, 0};
   char b[] = {0, (char)0x80, 0};
   EvalTruncation(a, b, 2, UP);
 }

 TEST(CommaSeparatedContainsTest, Basic) {
   // Basic tests with string present.
   EXPECT_TRUE(CommaSeparatedContains("LZO", "LZO"));
   EXPECT_TRUE(CommaSeparatedContains("foo,LZO", "LZO"));
   EXPECT_TRUE(CommaSeparatedContains("LZO,bar", "LZO"));
   EXPECT_TRUE(CommaSeparatedContains("foo,LZO,bar", "LZO"));

   // Handles zero-length entries.
   EXPECT_FALSE(CommaSeparatedContains("", "LZO"));
   EXPECT_FALSE(CommaSeparatedContains(",", "LZO"));
   EXPECT_FALSE(CommaSeparatedContains(",,", "LZO"));
   EXPECT_TRUE(CommaSeparatedContains("foo,LZO,", "LZO"));
   EXPECT_TRUE(CommaSeparatedContains(",foo,LZO,", "LZO"));
   EXPECT_TRUE(CommaSeparatedContains(",foo,,LZO,", "LZO"));

   // Basic tests with string absent.
   EXPECT_FALSE(CommaSeparatedContains("foo,bar", "LZO"));
   EXPECT_FALSE(CommaSeparatedContains("foo", "LZO"));
   EXPECT_FALSE(CommaSeparatedContains("foo,", "LZO"));
   EXPECT_FALSE(CommaSeparatedContains("foo,bar,baz", "LZO"));
   EXPECT_FALSE(CommaSeparatedContains(",foo,LzO,", "LZO"));

   // Pattern is longer than token.
   EXPECT_FALSE(CommaSeparatedContains(",foo,LzO,", "ZZZZZ"));
   // Pattern is longer than string.
   EXPECT_FALSE(CommaSeparatedContains("foo", "ZZZZZ"));

   // Whitespace is included in tokens alone.
   EXPECT_FALSE(CommaSeparatedContains("foo , foo, foo,\nfoo,\tfoo", "foo"));
 }

 TEST(FindUtf8PosForwardTest, Basic) {
   // Each Chinese character is encoded into 3 bytes in UTF-8.
   EXPECT_EQ(0, FindUtf8PosForward("李小龙", 9, 0));
   EXPECT_EQ(3, FindUtf8PosForward("李小龙", 9, 1));
   EXPECT_EQ(6, FindUtf8PosForward("李小龙", 9, 2));
   EXPECT_EQ(9, FindUtf8PosForward("李小龙", 9, 3));
   EXPECT_EQ(9, FindUtf8PosForward("李小龙", 9, 4));
   EXPECT_EQ(10, FindUtf8PosForward("李小龙Bruce Lee", 18, 4));
   EXPECT_EQ(11, FindUtf8PosForward("李小龙Bruce Lee", 18, 5));
   EXPECT_EQ(18, FindUtf8PosForward("李小龙Bruce Lee", 18, 50));

   // Test with a combination of UTF8 characters in 1, 2, 3 and 4 bytes.
   // 'Б', 'и' and 'ö' are encoded into 2 bytes. '和' is 3 bytes. '🙂' is 4 bytes.
   int byte_lens[] = {
       1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, // byte lengths for "hello Бopиc "
       3, 1, 1, 2, 1, 1, 1, 1,             // byte lengths for "和 Jörg! "
       4, 1, 1, 1                          // byte lengths for "🙂 Hi"
   };
   const char test_str[] = "Hello Бopиc 和 Jörg! 🙂 Hi";
   // Ignore tailing '\0'
   int total_byte_len = sizeof(test_str) / sizeof(char) - 1;
   int total_chars = sizeof(byte_lens) / sizeof(int);
   int pos = 0;
   for (int i = 0; i < total_chars; ++i) {
     EXPECT_EQ(pos, FindUtf8PosForward(test_str, total_byte_len, i));
     pos += byte_lens[i];
   }
   EXPECT_EQ(total_byte_len, pos);
   EXPECT_EQ(total_byte_len, FindUtf8PosForward(test_str, total_byte_len, total_chars));

   // \x93 is a non-ascii byte and not the start byte of any UTF-8 characters. It will be
   // treated as a malformed character and counted as one.
   EXPECT_EQ(9, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 3));
   EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 4));
   EXPECT_EQ(11, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 5));
   EXPECT_EQ(12, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 6));
   EXPECT_EQ(13, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 7));
   // Here we just need 4 characters, i.e. "李小龙 " in byte length 10. Set 'str_len'
   // to 10 to make sure the remaining bytes won't be counted.
   EXPECT_EQ(9, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 3));
   EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 4));
   EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 5));
   EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 6));

   // More cases on malformed UTF-8.
   // \xc3 is the start byte of a 2-bytes UTF-8 character.
   // Make sure we won't get overflow index in results
   EXPECT_EQ(0, FindUtf8PosForward("李小龙\xc3", 10, 0));
   EXPECT_EQ(3, FindUtf8PosForward("李小龙\xc3", 10, 1));
   EXPECT_EQ(6, FindUtf8PosForward("李小龙\xc3", 10, 2));
   EXPECT_EQ(9, FindUtf8PosForward("李小龙\xc3", 10, 3));
   EXPECT_EQ(10, FindUtf8PosForward("李小龙\xc3", 10, 4));
   // Test cases for \xc3 in the middle, i.e. "李\xc3小龙".
   // In UTF-8, "小" encodes to [\xe5\xb0\x8f], and "龙" encodes to [\xe9\xbe\x99].
   // \xc3 is the start byte of a 2-bytes UTF-8 character. So "\xc3\xe5" is counted as
   // one character. "\xb0" and "\x8f" is treated as two malformed characters. "龙" is
   // still treated as one character since it's not messed up.
   // This may be inconsistent with Hive. We just make sure it won't crash to process,
   // and will deal with this in IMPALA-10761.
   EXPECT_EQ(0, FindUtf8PosForward("李\xc3小龙", 10, 0));
   EXPECT_EQ(3, FindUtf8PosForward("李\xc3小龙", 10, 1));
   EXPECT_EQ(5, FindUtf8PosForward("李\xc3小龙", 10, 2));
   EXPECT_EQ(6, FindUtf8PosForward("李\xc3小龙", 10, 3));
   EXPECT_EQ(7, FindUtf8PosForward("李\xc3小龙", 10, 4));
   EXPECT_EQ(10, FindUtf8PosForward("李\xc3小龙", 10, 5));
 }

 TEST(FindUtf8PosBackwardTest, Basic) {
   // Each Chinese character is encoded into 3 bytes in UTF-8.
   EXPECT_EQ(6, FindUtf8PosBackward("李小龙", 9, 0));
   EXPECT_EQ(3, FindUtf8PosBackward("李小龙", 9, 1));
   EXPECT_EQ(0, FindUtf8PosBackward("李小龙", 9, 2));
   EXPECT_EQ(-1, FindUtf8PosBackward("李小龙", 9, 17));
   EXPECT_EQ(17, FindUtf8PosBackward("李小龙Bruce Lee", 18, 0));
   EXPECT_EQ(10, FindUtf8PosBackward("李小龙Bruce Lee", 18, 7));
   EXPECT_EQ(9, FindUtf8PosBackward("李小龙Bruce Lee", 18, 8));
   EXPECT_EQ(6, FindUtf8PosBackward("李小龙Bruce Lee", 18, 9));
   EXPECT_EQ(3, FindUtf8PosBackward("李小龙Bruce Lee", 18, 10));
   EXPECT_EQ(0, FindUtf8PosBackward("李小龙Bruce Lee", 18, 11));
   EXPECT_EQ(0, FindUtf8PosBackward("hello李小龙Bruce Lee", 23, 16));
   EXPECT_EQ(-1, FindUtf8PosBackward("hello李小龙Bruce Lee", 23, 50));

   // Test with a combination of UTF8 characters in 1, 2, 3 and 4 bytes.
   // 'Б', 'и' and 'ö' are encoded into 2 bytes. '和' is 3 bytes. '🙂' is 4 bytes.
   int byte_lens[] = {
       1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, // byte lengths for "hello Бopиc "
       3, 1, 1, 2, 1, 1, 1, 1,             // byte lengths for "和 Jörg! "
       4, 1, 1, 1                          // byte lengths for "🙂 Hi"
   };
   const char test_str[] = "Hello Бopиc 和 Jörg! 🙂 Hi";
   // Ignore tailing '\0'
   int total_byte_len = sizeof(test_str) / sizeof(char) - 1;
   int total_chars = sizeof(byte_lens) / sizeof(int);
   int pos = total_byte_len;
   for (int i = 0; i < total_chars; ++i) {
     pos -= byte_lens[total_chars - i - 1];
     EXPECT_EQ(pos, FindUtf8PosBackward(test_str, total_byte_len, i));
   }
   EXPECT_EQ(0, pos);
   EXPECT_EQ(-1, FindUtf8PosBackward(test_str, total_byte_len, total_chars));

   // \x93 is a non-ascii byte and not the start byte of any UTF-8 characters. It will be
   // treated as a malformed character and counted as one.
   EXPECT_EQ(12, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 0));
   EXPECT_EQ(11, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 1));
   EXPECT_EQ(10, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 2));
   EXPECT_EQ(9, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 3));
   EXPECT_EQ(6, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 4));
   // Here we just need 4 characters, i.e. "李小龙 " in byte length 10. Set 'str_len'
   // to 10 to make sure the remaining bytes won't be counted.
   EXPECT_EQ(9, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 0));
   EXPECT_EQ(6, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 1));
   EXPECT_EQ(3, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 2));
   EXPECT_EQ(0, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 3));
   EXPECT_EQ(-1, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 4));
   // Test malformed UTF-8 bytes at the beginning.
   EXPECT_EQ(1, FindUtf8PosBackward("\x93\x93李小龙", 11, 3));
   EXPECT_EQ(0, FindUtf8PosBackward("\x93\x93李小龙", 11, 4));
   EXPECT_EQ(-1, FindUtf8PosBackward("\x93\x93李小龙", 11, 5));

   // More cases on malformed UTF-8.
   // \xc3 is the start byte of a 2-bytes UTF-8 character.
   EXPECT_EQ(9, FindUtf8PosBackward("李小龙\xc3", 10, 0));
   EXPECT_EQ(6, FindUtf8PosBackward("李小龙\xc3", 10, 1));
   EXPECT_EQ(3, FindUtf8PosBackward("李小龙\xc3", 10, 2));
   EXPECT_EQ(0, FindUtf8PosBackward("李小龙\xc3", 10, 3));
   EXPECT_EQ(-1, FindUtf8PosBackward("李小龙\xc3", 10, 4));
   // Test cases for \xc3 in the middle, i.e. "李\xc3小龙".
   EXPECT_EQ(7, FindUtf8PosBackward("李\xc3小龙", 10, 0));
   EXPECT_EQ(4, FindUtf8PosBackward("李\xc3小龙", 10, 1));
   EXPECT_EQ(3, FindUtf8PosBackward("李\xc3小龙", 10, 2));
   EXPECT_EQ(0, FindUtf8PosBackward("李\xc3小龙", 10, 3));
   EXPECT_EQ(-1, FindUtf8PosBackward("李\xc3小龙", 10, 4));
 }

 static const int MAX_LEN = 100;
 static const int NUM_TRIALS = 100;
 TEST(RandomFindUtf8PosTest, Basic) {
   // Fuzz test to make sure we won't crash or return overflow index when processing
   // malformed UTF-8 characters. It takes ~5s.
   char buffer[MAX_LEN];
   kudu::Random rng(kudu::SeedRandom());
   for (int i = 0; i < NUM_TRIALS; ++i) {
     RandomString(buffer, MAX_LEN, &rng);
     // Use the random string 100 times.
     for (int j = 0; j < 100; ++j) {
       int len = rng.Uniform(MAX_LEN + 1);
       int index = rng.Uniform(len + 1);
       int pos = FindUtf8PosForward(buffer, len, index);
       EXPECT_GE(pos, -1);
       EXPECT_LE(pos, len);
       pos = FindUtf8PosBackward(buffer, len, index);
       EXPECT_GE(pos, -1);
       EXPECT_LE(pos, len);
     }
   }
 }

 // StringStreamPopTest: These tests assert the functionality of the StringStreamPop class.

 // Assert the most common use case where the last character is popped and a new character
 // is written to the stream.
 TEST(StringStreamPopTest, NotEmptyPopOnce) {
   StringStreamPop fixture;
   fixture << "this is a tes,";
   fixture.move_back();
   fixture << "t";
   EXPECT_EQ("this is a test", fixture.str());
 }

 // Asssert where the stream only contains a single character that is popped before another
 // character is written to the stream.
 TEST(StringStreamPopTest, OneCharPop) {
   StringStreamPop fixture;
   fixture << "t";
   fixture.move_back();
   fixture << "v";
   EXPECT_EQ("v", fixture.str());
 }

 // Assert where the last two characters of a non-empty stream are popped.
 TEST(StringStreamPopTest, NotEmptyPopTwice) {
   StringStreamPop fixture;
   fixture << "this is a second te,,";
   fixture.move_back();
   fixture.move_back();
   fixture << "st";
   EXPECT_EQ("this is a second test", fixture.str());
 }

 // Assert where an empty stream has it's last (nonexistant) character popped.
 TEST(StringStreamPopTest, EmptyPopOnce) {
   StringStreamPop fixture;
   fixture.move_back();
   EXPECT_TRUE(fixture.str().empty());
 }

 // Assert where an empty stream has it's last (nonexistant) character popped twice.
 TEST(StringStreamPopTest, EmptyPopTwice) {
   StringStreamPop fixture;
   fixture.move_back();
   fixture.move_back();
   EXPECT_TRUE(fixture.str().empty());
 }

 // Assert the move_back functionality does not actually remove the character.
 TEST(StringStreamPopTest, PopOnceBeforeAppend) {
   StringStreamPop fixture;
   fixture.move_back();
   fixture << "a";
   fixture.move_back();

   // This assertion is correct because the move_back() function only moves the write
   // pointer, it does not modify the internal buffer.
   EXPECT_EQ("a", fixture.str());
 }

 // Assert the StringStreamPop class behavior matches the behavior of the stringstream
 // class.
 TEST(StringStreamPopTest, CompareWithStringstream) {
   StringStreamPop fixture;
   stringstream expected;

   expected << "C++ is" << " an " << "invisible found" << "ation of " << "everything!";
   fixture  << "C++ is" << " an " << "invisible found" << "ation of " << "everything?";
   fixture.move_back();
   fixture << '!';

   EXPECT_EQ(expected.str(), fixture.str());
 }

 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include "testutil/gtest-util.h"

	#include "kudu/util/random.h"
	#include "kudu/util/random_util.h"
	#include "kudu/util/test_util.h"
	#include "util/string-util.h"

	#include "common/names.h"

	namespace impala {

	enum Truncation {
	DOWN,
	UP
	};

	void EvalTruncation(const string& original, const string& expected_result,
	int32_t max_length, Truncation boundary) {
	string result;
	if (boundary == DOWN) {
	ASSERT_OK(TruncateDown(original, max_length, &result));
	} else {
	ASSERT_OK(TruncateUp(original, max_length, &result));
	}
	EXPECT_EQ(expected_result, result);
	}

	TEST(TruncateDownTest, Basic) {
	EvalTruncation("0123456789", "0123456789", 100, DOWN);
	EvalTruncation("0123456789", "0123456789", 10, DOWN);
	EvalTruncation("0123456789", "01234", 5, DOWN);
	EvalTruncation("0123456789", "", 0, DOWN);
	EvalTruncation("", "", 10, DOWN);
	EvalTruncation(string("\0\0\0", 3), string("\0\0", 2), 2, DOWN);
	EvalTruncation("asdfghjkl", "asdf", 4, DOWN);
	char a[] = {'a', CHAR_MAX, CHAR_MIN, 'b', '\0'};
	char b[] = {'a', CHAR_MAX, '\0'};
	EvalTruncation(a, b, 2, DOWN);
	}

	TEST(TruncateUpTest, Basic) {
	EvalTruncation("0123456789", "0123456789", 100, UP);
	EvalTruncation("abcdefghij", "abcdefghij", 10, UP);
	EvalTruncation("abcdefghij", "abcdefghj", 9, UP);
	EvalTruncation("abcdefghij", "abcdf", 5, UP);

	string max_string(100, 0xFF);
	EvalTruncation(max_string, max_string, 100, UP);

	string normal_plus_max = "abcdef" + max_string;
	EvalTruncation(normal_plus_max, normal_plus_max, 200, UP);
	EvalTruncation(normal_plus_max, "abcdeg", 10, UP);

	string result;
	Status s = TruncateUp(max_string, 10, &result);
	EXPECT_EQ(s.GetDetail(), "TruncateUp() couldn't increase string.\n");

	EvalTruncation("", "", 10, UP);
	EvalTruncation(string("\0\0\0", 3), string("\0\001", 2), 2, UP);
	EvalTruncation("asdfghjkl", "asdg", 4, UP);
	char a[] = {0, (char)0x7F, (char)0xFF, 0};
	char b[] = {0, (char)0x80, 0};
	EvalTruncation(a, b, 2, UP);
	}

	TEST(CommaSeparatedContainsTest, Basic) {
	// Basic tests with string present.
	EXPECT_TRUE(CommaSeparatedContains("LZO", "LZO"));
	EXPECT_TRUE(CommaSeparatedContains("foo,LZO", "LZO"));
	EXPECT_TRUE(CommaSeparatedContains("LZO,bar", "LZO"));
	EXPECT_TRUE(CommaSeparatedContains("foo,LZO,bar", "LZO"));

	// Handles zero-length entries.
	EXPECT_FALSE(CommaSeparatedContains("", "LZO"));
	EXPECT_FALSE(CommaSeparatedContains(",", "LZO"));
	EXPECT_FALSE(CommaSeparatedContains(",,", "LZO"));
	EXPECT_TRUE(CommaSeparatedContains("foo,LZO,", "LZO"));
	EXPECT_TRUE(CommaSeparatedContains(",foo,LZO,", "LZO"));
	EXPECT_TRUE(CommaSeparatedContains(",foo,,LZO,", "LZO"));

	// Basic tests with string absent.
	EXPECT_FALSE(CommaSeparatedContains("foo,bar", "LZO"));
	EXPECT_FALSE(CommaSeparatedContains("foo", "LZO"));
	EXPECT_FALSE(CommaSeparatedContains("foo,", "LZO"));
	EXPECT_FALSE(CommaSeparatedContains("foo,bar,baz", "LZO"));
	EXPECT_FALSE(CommaSeparatedContains(",foo,LzO,", "LZO"));

	// Pattern is longer than token.
	EXPECT_FALSE(CommaSeparatedContains(",foo,LzO,", "ZZZZZ"));
	// Pattern is longer than string.
	EXPECT_FALSE(CommaSeparatedContains("foo", "ZZZZZ"));

	// Whitespace is included in tokens alone.
	EXPECT_FALSE(CommaSeparatedContains("foo , foo, foo,\nfoo,\tfoo", "foo"));
	}

	TEST(FindUtf8PosForwardTest, Basic) {
	// Each Chinese character is encoded into 3 bytes in UTF-8.
	EXPECT_EQ(0, FindUtf8PosForward("李小龙", 9, 0));
	EXPECT_EQ(3, FindUtf8PosForward("李小龙", 9, 1));
	EXPECT_EQ(6, FindUtf8PosForward("李小龙", 9, 2));
	EXPECT_EQ(9, FindUtf8PosForward("李小龙", 9, 3));
	EXPECT_EQ(9, FindUtf8PosForward("李小龙", 9, 4));
	EXPECT_EQ(10, FindUtf8PosForward("李小龙Bruce Lee", 18, 4));
	EXPECT_EQ(11, FindUtf8PosForward("李小龙Bruce Lee", 18, 5));
	EXPECT_EQ(18, FindUtf8PosForward("李小龙Bruce Lee", 18, 50));

	// Test with a combination of UTF8 characters in 1, 2, 3 and 4 bytes.
	// 'Б', 'и' and 'ö' are encoded into 2 bytes. '和' is 3 bytes. '🙂' is 4 bytes.
	int byte_lens[] = {
	1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, // byte lengths for "hello Бopиc "
	3, 1, 1, 2, 1, 1, 1, 1, // byte lengths for "和 Jörg! "
	4, 1, 1, 1 // byte lengths for "🙂 Hi"
	};
	const char test_str[] = "Hello Бopиc 和 Jörg! 🙂 Hi";
	// Ignore tailing '\0'
	int total_byte_len = sizeof(test_str) / sizeof(char) - 1;
	int total_chars = sizeof(byte_lens) / sizeof(int);
	int pos = 0;
	for (int i = 0; i < total_chars; ++i) {
	EXPECT_EQ(pos, FindUtf8PosForward(test_str, total_byte_len, i));
	pos += byte_lens[i];
	}
	EXPECT_EQ(total_byte_len, pos);
	EXPECT_EQ(total_byte_len, FindUtf8PosForward(test_str, total_byte_len, total_chars));

	// \x93 is a non-ascii byte and not the start byte of any UTF-8 characters. It will be
	// treated as a malformed character and counted as one.
	EXPECT_EQ(9, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 3));
	EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 4));
	EXPECT_EQ(11, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 5));
	EXPECT_EQ(12, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 6));
	EXPECT_EQ(13, FindUtf8PosForward("李小龙 \x93\x93 ", 13, 7));
	// Here we just need 4 characters, i.e. "李小龙 " in byte length 10. Set 'str_len'
	// to 10 to make sure the remaining bytes won't be counted.
	EXPECT_EQ(9, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 3));
	EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 4));
	EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 5));
	EXPECT_EQ(10, FindUtf8PosForward("李小龙 \x93\x93 ", 10, 6));

	// More cases on malformed UTF-8.
	// \xc3 is the start byte of a 2-bytes UTF-8 character.
	// Make sure we won't get overflow index in results
	EXPECT_EQ(0, FindUtf8PosForward("李小龙\xc3", 10, 0));
	EXPECT_EQ(3, FindUtf8PosForward("李小龙\xc3", 10, 1));
	EXPECT_EQ(6, FindUtf8PosForward("李小龙\xc3", 10, 2));
	EXPECT_EQ(9, FindUtf8PosForward("李小龙\xc3", 10, 3));
	EXPECT_EQ(10, FindUtf8PosForward("李小龙\xc3", 10, 4));
	// Test cases for \xc3 in the middle, i.e. "李\xc3小龙".
	// In UTF-8, "小" encodes to [\xe5\xb0\x8f], and "龙" encodes to [\xe9\xbe\x99].
	// \xc3 is the start byte of a 2-bytes UTF-8 character. So "\xc3\xe5" is counted as
	// one character. "\xb0" and "\x8f" is treated as two malformed characters. "龙" is
	// still treated as one character since it's not messed up.
	// This may be inconsistent with Hive. We just make sure it won't crash to process,
	// and will deal with this in IMPALA-10761.
	EXPECT_EQ(0, FindUtf8PosForward("李\xc3小龙", 10, 0));
	EXPECT_EQ(3, FindUtf8PosForward("李\xc3小龙", 10, 1));
	EXPECT_EQ(5, FindUtf8PosForward("李\xc3小龙", 10, 2));
	EXPECT_EQ(6, FindUtf8PosForward("李\xc3小龙", 10, 3));
	EXPECT_EQ(7, FindUtf8PosForward("李\xc3小龙", 10, 4));
	EXPECT_EQ(10, FindUtf8PosForward("李\xc3小龙", 10, 5));
	}

	TEST(FindUtf8PosBackwardTest, Basic) {
	// Each Chinese character is encoded into 3 bytes in UTF-8.
	EXPECT_EQ(6, FindUtf8PosBackward("李小龙", 9, 0));
	EXPECT_EQ(3, FindUtf8PosBackward("李小龙", 9, 1));
	EXPECT_EQ(0, FindUtf8PosBackward("李小龙", 9, 2));
	EXPECT_EQ(-1, FindUtf8PosBackward("李小龙", 9, 17));
	EXPECT_EQ(17, FindUtf8PosBackward("李小龙Bruce Lee", 18, 0));
	EXPECT_EQ(10, FindUtf8PosBackward("李小龙Bruce Lee", 18, 7));
	EXPECT_EQ(9, FindUtf8PosBackward("李小龙Bruce Lee", 18, 8));
	EXPECT_EQ(6, FindUtf8PosBackward("李小龙Bruce Lee", 18, 9));
	EXPECT_EQ(3, FindUtf8PosBackward("李小龙Bruce Lee", 18, 10));
	EXPECT_EQ(0, FindUtf8PosBackward("李小龙Bruce Lee", 18, 11));
	EXPECT_EQ(0, FindUtf8PosBackward("hello李小龙Bruce Lee", 23, 16));
	EXPECT_EQ(-1, FindUtf8PosBackward("hello李小龙Bruce Lee", 23, 50));

	// Test with a combination of UTF8 characters in 1, 2, 3 and 4 bytes.
	// 'Б', 'и' and 'ö' are encoded into 2 bytes. '和' is 3 bytes. '🙂' is 4 bytes.
	int byte_lens[] = {
	1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, // byte lengths for "hello Бopиc "
	3, 1, 1, 2, 1, 1, 1, 1, // byte lengths for "和 Jörg! "
	4, 1, 1, 1 // byte lengths for "🙂 Hi"
	};
	const char test_str[] = "Hello Бopиc 和 Jörg! 🙂 Hi";
	// Ignore tailing '\0'
	int total_byte_len = sizeof(test_str) / sizeof(char) - 1;
	int total_chars = sizeof(byte_lens) / sizeof(int);
	int pos = total_byte_len;
	for (int i = 0; i < total_chars; ++i) {
	pos -= byte_lens[total_chars - i - 1];
	EXPECT_EQ(pos, FindUtf8PosBackward(test_str, total_byte_len, i));
	}
	EXPECT_EQ(0, pos);
	EXPECT_EQ(-1, FindUtf8PosBackward(test_str, total_byte_len, total_chars));

	// \x93 is a non-ascii byte and not the start byte of any UTF-8 characters. It will be
	// treated as a malformed character and counted as one.
	EXPECT_EQ(12, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 0));
	EXPECT_EQ(11, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 1));
	EXPECT_EQ(10, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 2));
	EXPECT_EQ(9, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 3));
	EXPECT_EQ(6, FindUtf8PosBackward("李小龙 \x93\x93 ", 13, 4));
	// Here we just need 4 characters, i.e. "李小龙 " in byte length 10. Set 'str_len'
	// to 10 to make sure the remaining bytes won't be counted.
	EXPECT_EQ(9, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 0));
	EXPECT_EQ(6, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 1));
	EXPECT_EQ(3, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 2));
	EXPECT_EQ(0, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 3));
	EXPECT_EQ(-1, FindUtf8PosBackward("李小龙 \x93\x93 ", 10, 4));
	// Test malformed UTF-8 bytes at the beginning.
	EXPECT_EQ(1, FindUtf8PosBackward("\x93\x93李小龙", 11, 3));
	EXPECT_EQ(0, FindUtf8PosBackward("\x93\x93李小龙", 11, 4));
	EXPECT_EQ(-1, FindUtf8PosBackward("\x93\x93李小龙", 11, 5));

	// More cases on malformed UTF-8.
	// \xc3 is the start byte of a 2-bytes UTF-8 character.
	EXPECT_EQ(9, FindUtf8PosBackward("李小龙\xc3", 10, 0));
	EXPECT_EQ(6, FindUtf8PosBackward("李小龙\xc3", 10, 1));
	EXPECT_EQ(3, FindUtf8PosBackward("李小龙\xc3", 10, 2));
	EXPECT_EQ(0, FindUtf8PosBackward("李小龙\xc3", 10, 3));
	EXPECT_EQ(-1, FindUtf8PosBackward("李小龙\xc3", 10, 4));
	// Test cases for \xc3 in the middle, i.e. "李\xc3小龙".
	EXPECT_EQ(7, FindUtf8PosBackward("李\xc3小龙", 10, 0));
	EXPECT_EQ(4, FindUtf8PosBackward("李\xc3小龙", 10, 1));
	EXPECT_EQ(3, FindUtf8PosBackward("李\xc3小龙", 10, 2));
	EXPECT_EQ(0, FindUtf8PosBackward("李\xc3小龙", 10, 3));
	EXPECT_EQ(-1, FindUtf8PosBackward("李\xc3小龙", 10, 4));
	}

	static const int MAX_LEN = 100;
	static const int NUM_TRIALS = 100;
	TEST(RandomFindUtf8PosTest, Basic) {
	// Fuzz test to make sure we won't crash or return overflow index when processing
	// malformed UTF-8 characters. It takes ~5s.
	char buffer[MAX_LEN];
	kudu::Random rng(kudu::SeedRandom());
	for (int i = 0; i < NUM_TRIALS; ++i) {
	RandomString(buffer, MAX_LEN, &rng);
	// Use the random string 100 times.
	for (int j = 0; j < 100; ++j) {
	int len = rng.Uniform(MAX_LEN + 1);
	int index = rng.Uniform(len + 1);
	int pos = FindUtf8PosForward(buffer, len, index);
	EXPECT_GE(pos, -1);
	EXPECT_LE(pos, len);
	pos = FindUtf8PosBackward(buffer, len, index);
	EXPECT_GE(pos, -1);
	EXPECT_LE(pos, len);
	}
	}
	}

	// StringStreamPopTest: These tests assert the functionality of the StringStreamPop class.

	// Assert the most common use case where the last character is popped and a new character
	// is written to the stream.
	TEST(StringStreamPopTest, NotEmptyPopOnce) {
	StringStreamPop fixture;
	fixture << "this is a tes,";
	fixture.move_back();
	fixture << "t";
	EXPECT_EQ("this is a test", fixture.str());
	}

	// Asssert where the stream only contains a single character that is popped before another
	// character is written to the stream.
	TEST(StringStreamPopTest, OneCharPop) {
	StringStreamPop fixture;
	fixture << "t";
	fixture.move_back();
	fixture << "v";
	EXPECT_EQ("v", fixture.str());
	}

	// Assert where the last two characters of a non-empty stream are popped.
	TEST(StringStreamPopTest, NotEmptyPopTwice) {
	StringStreamPop fixture;
	fixture << "this is a second te,,";
	fixture.move_back();
	fixture.move_back();
	fixture << "st";
	EXPECT_EQ("this is a second test", fixture.str());
	}

	// Assert where an empty stream has it's last (nonexistant) character popped.
	TEST(StringStreamPopTest, EmptyPopOnce) {
	StringStreamPop fixture;
	fixture.move_back();
	EXPECT_TRUE(fixture.str().empty());
	}

	// Assert where an empty stream has it's last (nonexistant) character popped twice.
	TEST(StringStreamPopTest, EmptyPopTwice) {
	StringStreamPop fixture;
	fixture.move_back();
	fixture.move_back();
	EXPECT_TRUE(fixture.str().empty());
	}

	// Assert the move_back functionality does not actually remove the character.
	TEST(StringStreamPopTest, PopOnceBeforeAppend) {
	StringStreamPop fixture;
	fixture.move_back();
	fixture << "a";
	fixture.move_back();

	// This assertion is correct because the move_back() function only moves the write
	// pointer, it does not modify the internal buffer.
	EXPECT_EQ("a", fixture.str());
	}

	// Assert the StringStreamPop class behavior matches the behavior of the stringstream
	// class.
	TEST(StringStreamPopTest, CompareWithStringstream) {
	StringStreamPop fixture;
	stringstream expected;

	expected << "C++ is" << " an " << "invisible found" << "ation of " << "everything!";
	fixture << "C++ is" << " an " << "invisible found" << "ation of " << "everything?";
	fixture.move_back();
	fixture << '!';

	EXPECT_EQ(expected.str(), fixture.str());
	}

	}