// cpp/src/arrow/util/utf8_util_test.cc - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <algorithm>
 #include <cstdint>
 #include <iterator>
 #include <random>
 #include <string>
 #include <vector>

 #include <gtest/gtest.h>

 #include "arrow/testing/gtest_util.h"
 #include "arrow/util/string.h"
 #include "arrow/util/utf8.h"

 namespace arrow {
 namespace util {

 // Shared fixture for the UTF8 and ASCII validation tests.
 //
 // Declares the numbered tables of valid and invalid byte sequences and
 // rebuilds the aggregated all_* tables once per test case.
 class UTF8Test : public ::testing::Test {
  protected:
   // One-time setup: initializes the UTF8 support code, then flattens the
   // numbered tables into all_valid_sequences and all_invalid_sequences.
   static void SetUpTestCase() {
     InitializeUTF8();

     // The tables are visited through pointers, in numeric order, so the
     // aggregated tables list the entries of table 1 first and table 4 last.
     const std::vector<std::string>* const valid_groups[] = {
         &valid_sequences_1, &valid_sequences_2, &valid_sequences_3, &valid_sequences_4};
     all_valid_sequences.clear();
     for (const auto* group : valid_groups) {
       for (const auto& sequence : *group) {
         all_valid_sequences.push_back(sequence);
       }
     }

     const std::vector<std::string>* const invalid_groups[] = {
         &invalid_sequences_1, &invalid_sequences_2, &invalid_sequences_3,
         &invalid_sequences_4};
     all_invalid_sequences.clear();
     for (const auto* group : invalid_groups) {
       for (const auto& sequence : *group) {
         all_invalid_sequences.push_back(sequence);
       }
     }
   }

   // Numbered tables of sequences used as valid UTF8 input by the tests.
   static std::vector<std::string> valid_sequences_1;
   static std::vector<std::string> valid_sequences_2;
   static std::vector<std::string> valid_sequences_3;
   static std::vector<std::string> valid_sequences_4;

   // Concatenation of the four valid tables, filled in SetUpTestCase().
   static std::vector<std::string> all_valid_sequences;

   // Numbered tables of sequences used as invalid UTF8 input by the tests.
   static std::vector<std::string> invalid_sequences_1;
   static std::vector<std::string> invalid_sequences_2;
   static std::vector<std::string> invalid_sequences_3;
   static std::vector<std::string> invalid_sequences_4;

   // Concatenation of the four invalid tables, filled in SetUpTestCase().
   static std::vector<std::string> all_invalid_sequences;

   // Inputs for the ASCII validation tests.
   static std::vector<std::string> valid_sequences_ascii;
   static std::vector<std::string> invalid_sequences_ascii;
 };

 // Valid one-byte sequences. The NUL byte is built with an explicit length so
 // that it is stored as a real one-character string.
 std::vector<std::string> UTF8Test::valid_sequences_1 = {"a", "\x7f",
                                                         std::string("\0", 1)};
 // Valid two-byte sequences.
 std::vector<std::string> UTF8Test::valid_sequences_2 = {"\xc2\x80", "\xc3\xbf",
                                                         "\xdf\xbf"};
 // Valid three-byte sequences.
 std::vector<std::string> UTF8Test::valid_sequences_3 = {"\xe0\xa0\x80", "\xe8\x9d\xa5",
                                                         "\xef\xbf\xbf"};
 // Valid four-byte sequences.
 std::vector<std::string> UTF8Test::valid_sequences_4 = {
     "\xf0\x90\x80\x80", "\xf0\x9f\xbf\xbf", "\xf4\x80\x80\x80", "\xf4\x8f\xbf\xbf"};

 // Filled by UTF8Test::SetUpTestCase() from the four tables above.
 std::vector<std::string> UTF8Test::all_valid_sequences;

 // Invalid inputs that are one byte long.
 std::vector<std::string> UTF8Test::invalid_sequences_1 = {"\x80", "\xa0", "\xbf", "\xc0",
                                                           "\xc1"};
 // Invalid inputs that are two bytes long.
 std::vector<std::string> UTF8Test::invalid_sequences_2 = {
     "\x80\x80", "\x80\xbf", "\xbf\x80", "\xbf\xbf",
     "\xc1\x80", "\xc2\x7f", "\xc3\xff", "\xdf\xc0"};
 // Invalid inputs that are three bytes long.
 std::vector<std::string> UTF8Test::invalid_sequences_3 = {
     "\xe0\x80\x80", "\xe0\x9f\x80", "\xef\xbf\xc0", "\xef\xc0\xbf", "\xef\xff\xff",
     // Surrogates
     "\xed\xa0\x80", "\xed\xbf\xbf"};
 // Invalid inputs that are four bytes long.
 std::vector<std::string> UTF8Test::invalid_sequences_4 = {
     "\xf0\x80\x80\x80", "\xf0\x8f\x80\x80", "\xf4\x8f\xbf\xc0", "\xf4\x8f\xc0\xbf",
     "\xf4\x90\x80\x80"};

 // Filled by UTF8Test::SetUpTestCase() from the four tables above.
 std::vector<std::string> UTF8Test::all_invalid_sequences;

 // Inputs for the ASCII validation tests. Every invalid entry contains at least
 // one byte with the high bit set.
 std::vector<std::string> UTF8Test::valid_sequences_ascii = {"a", "\x7f", "B", "&"};
 std::vector<std::string> UTF8Test::invalid_sequences_ascii = {
     "\x80", "\xa0\x1e", "\xbf\xef\x6a", "\xc1\x9f\xc3\xd9"};

 // Fixture name used by the UTF8 validation TEST_F cases; adds nothing to
 // UTF8Test.
 class UTF8ValidationTest : public UTF8Test {};

 // Fixture name used by the ASCII validation TEST_F cases; adds nothing to
 // UTF8Test.
 class ASCIIValidationTest : public UTF8Test {};

 // Succeeds when ValidateUTF8 accepts the bytes of `s`; otherwise returns a
 // failure whose message carries the hex-encoded contents of `s`.
 ::testing::AssertionResult IsValidUTF8(const std::string& s) {
   const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
   if (!ValidateUTF8(bytes, s.size())) {
     const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
     return ::testing::AssertionFailure()
            << "string '" << hex << "' didn't validate as UTF8";
   }
   return ::testing::AssertionSuccess();
 }

 // Succeeds when ValidateUTF8 rejects the bytes of `s`; otherwise returns a
 // failure whose message carries the hex-encoded contents of `s`.
 ::testing::AssertionResult IsInvalidUTF8(const std::string& s) {
   const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
   if (ValidateUTF8(bytes, s.size())) {
     const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
     return ::testing::AssertionFailure() << "string '" << hex << "' validated as UTF8";
   }
   return ::testing::AssertionSuccess();
 }

 // Succeeds when ValidateAscii accepts the bytes of `s`; otherwise returns a
 // failure whose message carries the hex-encoded contents of `s`.
 ::testing::AssertionResult IsValidASCII(const std::string& s) {
   const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
   if (!ValidateAscii(bytes, s.size())) {
     const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
     return ::testing::AssertionFailure()
            << "string '" << hex << "' didn't validate as ASCII";
   }
   return ::testing::AssertionSuccess();
 }

 // Succeeds when ValidateAscii rejects the bytes of `s`; otherwise returns a
 // failure whose message carries the hex-encoded contents of `s`.
 ::testing::AssertionResult IsInvalidASCII(const std::string& s) {
   const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
   if (ValidateAscii(bytes, s.size())) {
     const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
     return ::testing::AssertionFailure() << "string '" << hex << "' validated as ASCII";
   }
   return ::testing::AssertionSuccess();
 }

 // Re-runs `validate` on `s` embedded in padding of every length from 1 to 63:
 // first with a run of 'x' in front, then additionally with an equally long
 // run of 'y' behind. Returns at the first padded string `validate` rejects.
 template <typename ValidationFunc>
 void ValidateWithPrefixes(ValidationFunc&& validate, const std::string& s) {
   // Exercise SIMD optimizations
   int padding = 1;
   while (padding < 64) {
     std::string longer = std::string(padding, 'x') + s;
     ASSERT_TRUE(validate(longer));
     longer += std::string(padding, 'y');
     ASSERT_TRUE(validate(longer));
     ++padding;
   }
 }

 // Fatal-assertion wrapper: `s` must validate as UTF8.
 void AssertValidUTF8(const std::string& s) {
   ASSERT_TRUE(IsValidUTF8(s));
 }

 // Fatal-assertion wrapper: `s` must be rejected by UTF8 validation.
 void AssertInvalidUTF8(const std::string& s) {
   ASSERT_TRUE(IsInvalidUTF8(s));
 }

 void AssertValidASCII(const std::string& s) {
   ASSERT_TRUE(IsValidASCII(s));
   ValidateWithPrefixes(IsValidASCII, s);
 }

 void AssertInvalidASCII(const std::string& s) {
   ASSERT_TRUE(IsInvalidASCII(s));
   ValidateWithPrefixes(IsInvalidASCII, s);
 }

 // Every entry of valid_sequences_ascii must pass ASCII validation.
 TEST_F(ASCIIValidationTest, AsciiValid) {
   for (size_t idx = 0; idx < valid_sequences_ascii.size(); ++idx) {
     AssertValidASCII(valid_sequences_ascii[idx]);
   }
 }

 // Every entry of invalid_sequences_ascii must fail ASCII validation.
 TEST_F(ASCIIValidationTest, AsciiInvalid) {
   for (size_t idx = 0; idx < invalid_sequences_ascii.size(); ++idx) {
     AssertInvalidASCII(invalid_sequences_ascii[idx]);
   }
 }

 // The empty string must validate as UTF8.
 TEST_F(UTF8ValidationTest, EmptyString) {
   AssertValidUTF8(std::string());
 }

 // Each valid sequence must validate on its own.
 TEST_F(UTF8ValidationTest, OneCharacterValid) {
   for (size_t idx = 0; idx < all_valid_sequences.size(); ++idx) {
     AssertValidUTF8(all_valid_sequences[idx]);
   }
 }

 // Every ordered pair of valid sequences, concatenated, must validate.
 TEST_F(UTF8ValidationTest, TwoCharacterValid) {
   for (const auto& first : all_valid_sequences) {
     for (const auto& second : all_valid_sequences) {
       std::string joined = first;
       joined += second;
       AssertValidUTF8(joined);
     }
   }
 }

 // Strings built from 100 randomly chosen valid sequences must validate.
 TEST_F(UTF8ValidationTest, RandomValid) {
   // Fewer rounds are run when ARROW_VALGRIND is defined.
 #ifdef ARROW_VALGRIND
   constexpr int kRounds = 50;
 #else
   constexpr int kRounds = 1000;
 #endif
   constexpr int kSequencesPerString = 100;
   std::default_random_engine rng(42);
   std::uniform_int_distribution<size_t> pick_valid(0, all_valid_sequences.size() - 1);

   for (int round = 0; round != kRounds; ++round) {
     std::string s;
     s.reserve(kSequencesPerString * 4);
     for (int slot = 0; slot != kSequencesPerString; ++slot) {
       s.append(all_valid_sequences[pick_valid(rng)]);
     }
     AssertValidUTF8(s);
   }
 }

 // A multi-byte valid sequence with its last byte removed must be rejected.
 TEST_F(UTF8ValidationTest, OneCharacterTruncated) {
   for (const auto& sequence : all_valid_sequences) {
     if (sequence.size() <= 1) {
       continue;  // nothing would be left of a one-byte sequence
     }
     std::string truncated = sequence;
     truncated.pop_back();
     AssertInvalidUTF8(truncated);
   }
 }

 // A truncated multi-byte sequence placed after or before a valid sequence
 // must make the whole string invalid.
 TEST_F(UTF8ValidationTest, TwoCharacterTruncated) {
   for (const auto& whole : all_valid_sequences) {
     for (const auto& candidate : all_valid_sequences) {
       if (candidate.size() <= 1) {
         continue;  // only multi-byte sequences can be truncated
       }
       const std::string truncated = candidate.substr(0, candidate.size() - 1);
       AssertInvalidUTF8(whole + truncated);
       AssertInvalidUTF8(truncated + whole);
     }
   }
 }

 // Each invalid sequence must be rejected on its own.
 TEST_F(UTF8ValidationTest, OneCharacterInvalid) {
   for (size_t idx = 0; idx < all_invalid_sequences.size(); ++idx) {
     AssertInvalidUTF8(all_invalid_sequences[idx]);
   }
 }

 // Any concatenation that contains an invalid sequence must be rejected.
 TEST_F(UTF8ValidationTest, TwoCharacterInvalid) {
   // One valid and one invalid sequence, in both orders.
   for (const auto& good : all_valid_sequences) {
     for (const auto& bad : all_invalid_sequences) {
       const std::string good_then_bad = good + bad;
       AssertInvalidUTF8(good_then_bad);
       const std::string bad_then_good = bad + good;
       AssertInvalidUTF8(bad_then_good);
     }
   }
   // Two invalid sequences.
   for (const auto& first_bad : all_invalid_sequences) {
     for (const auto& second_bad : all_invalid_sequences) {
       const std::string both_bad = first_bad + second_bad;
       AssertInvalidUTF8(both_bad);
     }
   }
 }

 // A random stream of valid sequences with one invalid sequence in it must
 // be rejected.
 TEST_F(UTF8ValidationTest, RandomInvalid) {
   // Fewer rounds are run when ARROW_VALGRIND is defined.
 #ifdef ARROW_VALGRIND
   constexpr int kRounds = 50;
 #else
   constexpr int kRounds = 1000;
 #endif
   constexpr int kSequencesPerString = 100;
   std::default_random_engine rng(42);
   std::uniform_int_distribution<size_t> pick_valid(0, all_valid_sequences.size() - 1);
   std::uniform_int_distribution<int> pick_slot(0, kSequencesPerString - 1);
   std::uniform_int_distribution<size_t> pick_invalid(0,
                                                      all_invalid_sequences.size() - 1);

   for (int round = 0; round != kRounds; ++round) {
     std::string s;
     s.reserve(kSequencesPerString * 4);
     // Stuff a single invalid sequence somewhere in a valid UTF8 stream
     const int bad_slot = pick_slot(rng);
     for (int slot = 0; slot != kSequencesPerString; ++slot) {
       const std::string& piece = (slot == bad_slot)
                                      ? all_invalid_sequences[pick_invalid(rng)]
                                      : all_valid_sequences[pick_valid(rng)];
       s += piece;
     }
     AssertInvalidUTF8(s);
   }
 }

 // A random stream of valid sequences in which one multi-byte sequence has
 // lost its last byte must be rejected.
 TEST_F(UTF8ValidationTest, RandomTruncated) {
   // Fewer rounds are run when ARROW_VALGRIND is defined.
 #ifdef ARROW_VALGRIND
   constexpr int kRounds = 50;
 #else
   constexpr int kRounds = 1000;
 #endif
   constexpr int kSequencesPerString = 100;
   std::default_random_engine rng(42);
   std::uniform_int_distribution<size_t> pick_valid(0, all_valid_sequences.size() - 1);
   std::uniform_int_distribution<int> pick_slot(0, kSequencesPerString - 1);

   for (int round = 0; round != kRounds; ++round) {
     std::string s;
     s.reserve(kSequencesPerString * 4);
     // Truncate a single sequence somewhere in a valid UTF8 stream
     const int truncated_slot = pick_slot(rng);
     for (int slot = 0; slot != kSequencesPerString; ++slot) {
       if (slot != truncated_slot) {
         s += all_valid_sequences[pick_valid(rng)];
         continue;
       }
       // Ensure we truncate a 2-byte or more sequence
       const std::string* candidate = nullptr;
       do {
         candidate = &all_valid_sequences[pick_valid(rng)];
       } while (candidate->size() <= 1);
       s.append(*candidate, 0, candidate->size() - 1);
     }
     AssertInvalidUTF8(s);
   }
 }

 // Checks the pointer returned by SkipUTF8BOM for inputs with and without a
 // leading "\xef\xbb\xbf" prefix, and that a cut-off prefix is reported as
 // Invalid.
 TEST(SkipUTF8BOM, Basics) {
   // Expects SkipUTF8BOM to succeed on `s` and to return a pointer that is
   // `expected_offset` bytes past the start of the data.
   auto CheckOk = [](const std::string& s, size_t expected_offset) -> void {
     const uint8_t* data = reinterpret_cast<const uint8_t*>(s.data());
     ASSERT_OK_AND_ASSIGN(const uint8_t* res,
                          SkipUTF8BOM(data, static_cast<int64_t>(s.size())));
     ASSERT_NE(res, nullptr);
     // `res - data` is a signed ptrdiff_t while expected_offset is a size_t;
     // convert explicitly so the comparison does not mix signedness.
     ASSERT_EQ(static_cast<size_t>(res - data), expected_offset);
   };

   // Expects SkipUTF8BOM to report `s` as Invalid.
   auto CheckTruncated = [](const std::string& s) -> void {
     const uint8_t* data = reinterpret_cast<const uint8_t*>(s.data());
     ASSERT_RAISES(Invalid, SkipUTF8BOM(data, static_cast<int64_t>(s.size())));
   };

   CheckOk("", 0);
   CheckOk("a", 0);
   CheckOk("ab", 0);
   CheckOk("abc", 0);
   CheckOk("abcd", 0);
   CheckOk("\xc3\xa9", 0);
   CheckOk("\xee", 0);
   CheckOk("\xef\xbc", 0);
   CheckOk("\xef\xbb\xbe", 0);
   CheckOk("\xef\xbb\xbf", 3);
   CheckOk("\xef\xbb\xbfx", 3);

   CheckTruncated("\xef");
   CheckTruncated("\xef\xbb");
 }

 // Round-trip style checks for UTF8ToWideString on ASCII, multi-byte and
 // embedded-NUL inputs, plus two inputs that must be reported as Invalid.
 TEST(UTF8ToWideString, Basics) {
   // The conversion must be reported as Invalid.
   auto ExpectInvalid = [](const std::string& s) -> void {
     ASSERT_RAISES(Invalid, UTF8ToWideString(s));
   };

   // The conversion must succeed and produce exactly `expected`.
   auto ExpectConverts = [](const std::string& s,
                            const std::wstring& expected) -> void {
     ASSERT_OK_AND_ASSIGN(std::wstring ws, UTF8ToWideString(s));
     ASSERT_EQ(ws, expected);
   };

   ExpectConverts("", L"");
   ExpectConverts("foo", L"foo");
   ExpectConverts("h\xc3\xa9h\xc3\xa9", L"h\u00e9h\u00e9");
   ExpectConverts("\xf0\x9f\x98\x80", L"\U0001F600");
   ExpectConverts("\xf4\x8f\xbf\xbf", L"\U0010FFFF");
   ExpectConverts({0, 'x'}, {0, L'x'});

   ExpectInvalid("\xff");
   ExpectInvalid("h\xc3");
 }

 // Checks WideStringToUTF8 on ASCII, non-ASCII and embedded-NUL inputs, plus
 // wide strings that must be reported as Invalid.
 TEST(WideStringToUTF8, Basics) {
   // The conversion must be reported as Invalid.
   auto ExpectInvalid = [](const std::wstring& ws) -> void {
     ASSERT_RAISES(Invalid, WideStringToUTF8(ws));
   };

   // The conversion must succeed and produce exactly `expected`.
   auto ExpectConverts = [](const std::wstring& ws,
                            const std::string& expected) -> void {
     ASSERT_OK_AND_ASSIGN(std::string s, WideStringToUTF8(ws));
     ASSERT_EQ(s, expected);
   };

   ExpectConverts(L"", "");
   ExpectConverts(L"foo", "foo");
   ExpectConverts(L"h\u00e9h\u00e9", "h\xc3\xa9h\xc3\xa9");
   ExpectConverts(L"\U0001F600", "\xf0\x9f\x98\x80");
   ExpectConverts(L"\U0010FFFF", "\xf4\x8f\xbf\xbf");
   ExpectConverts({0, L'x'}, {0, 'x'});

   // Lone surrogate
   ExpectInvalid({0xD800});
   ExpectInvalid({0xDFFF});
   // Invalid code point
 #if WCHAR_MAX > 0xFFFF
   ExpectInvalid({0x110000});
 #endif
 }

 // Exercises UTF8DecodeReverse starting from the last byte of short strings.
 TEST(UTF8DecodeReverse, Basics) {
   // Decoding must succeed and leave the cursor one byte before the start
   // of `s`.
   auto ExpectDecodesWhole = [](const std::string& s) -> void {
     const auto* begin = reinterpret_cast<const uint8_t*>(s.c_str());
     const uint8_t* i = begin + (s.length() - 1);
     uint32_t codepoint;
     EXPECT_TRUE(UTF8DecodeReverse(&i, &codepoint));
     EXPECT_EQ(i, begin - 1);
   };

   // Decoding must be rejected.
   auto ExpectRejected = [](const std::string& s) -> void {
     const auto* begin = reinterpret_cast<const uint8_t*>(s.c_str());
     const uint8_t* i = begin + (s.length() - 1);
     uint32_t codepoint;
     EXPECT_FALSE(UTF8DecodeReverse(&i, &codepoint));
   };

   // 0x80 == 0b10000000
   // 0xC0 == 0b11000000
   // 0xE0 == 0b11100000
   // 0xF0 == 0b11110000
   ExpectDecodesWhole("a");
   ExpectDecodesWhole("\xC0\x80");
   ExpectDecodesWhole("\xE0\x80\x80");
   ExpectDecodesWhole("\xF0\x80\x80\x80");

   // too many continuation code units
   ExpectRejected("a\x80");
   ExpectRejected("\xC0\x80\x80");
   ExpectRejected("\xE0\x80\x80\x80");
   ExpectRejected("\xF0\x80\x80\x80\x80");
   // not enough continuation code units
   ExpectRejected("\xC0");
   ExpectRejected("\xE0\x80");
   ExpectRejected("\xF0\x80\x80");
 }

 // Checks the positions reported by UTF8FindIf and UTF8FindIfReverse, as byte
 // offsets from the start of the string.
 TEST(UTF8FindIf, Basics) {
   // Byte-oriented check: searches `s` for the single byte `test`, verifies
   // the reported offsets, and cross-checks them against std::find_if over
   // the raw bytes.
   auto CheckOk = [](const std::string& s, unsigned char test, int64_t offset_left,
                     int64_t offset_right) -> void {
     const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.c_str());
     const uint8_t* end = begin + s.length();
     std::reverse_iterator<const uint8_t*> rbegin(end);
     std::reverse_iterator<const uint8_t*> rend(begin);
     // Initialized so they are never read while indeterminate.
     const uint8_t* left = nullptr;
     const uint8_t* right = nullptr;
     auto predicate = [&](uint32_t c) { return c == test; };
     // Fatal assertions: the offsets below are meaningless if a call failed.
     ASSERT_TRUE(UTF8FindIf(begin, end, predicate, &left));
     ASSERT_TRUE(UTF8FindIfReverse(begin, end, predicate, &right));
     EXPECT_EQ(offset_left, left - begin);
     EXPECT_EQ(offset_right, right - begin);
     EXPECT_EQ(std::find_if(begin, end, predicate) - begin, left - begin);
     EXPECT_EQ(std::find_if(rbegin, rend, predicate).base() - begin, right - begin);
   };
   // Codepoint-oriented check: same as CheckOk but `test` is a full
   // codepoint. There is no std::find_if cross-check here because
   // std::find_if works on bytes, not codepoints.
   auto CheckOkUTF8 = [](const std::string& s, uint32_t test, int64_t offset_left,
                         int64_t offset_right) -> void {
     const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.c_str());
     const uint8_t* end = begin + s.length();
     const uint8_t* left = nullptr;
     const uint8_t* right = nullptr;
     auto predicate = [&](uint32_t c) { return c == test; };
     ASSERT_TRUE(UTF8FindIf(begin, end, predicate, &left));
     ASSERT_TRUE(UTF8FindIfReverse(begin, end, predicate, &right));
     EXPECT_EQ(offset_left, left - begin);
     EXPECT_EQ(offset_right, right - begin);
   };

   CheckOk("aaaba", 'a', 0, 5);
   CheckOkUTF8("aaaβa", 'a', 0, 6);

   CheckOk("aaaba", 'b', 3, 4);
   CheckOkUTF8("aaaβa", U'β', 3, 5);

   CheckOk("aaababa", 'b', 3, 6);
   CheckOkUTF8("aaaβaβa", U'β', 3, 8);

   CheckOk("aaababa", 'c', 7, 0);
   CheckOk("aaaβaβa", 'c', 9, 0);
   CheckOkUTF8("aaaβaβa", U'ɑ', 9, 0);

   CheckOk("a", 'a', 0, 1);
   CheckOkUTF8("ɑ", U'ɑ', 0, 2);

   CheckOk("a", 'b', 1, 0);
   CheckOkUTF8("ɑ", 'b', 2, 0);

   CheckOk("", 'b', 0, 0);
   CheckOkUTF8("", U'β', 0, 0);
 }

 }  // namespace util
 }  // namespace arrow
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <cstdint>
	#include <random>
	#include <string>
	#include <vector>

	#include <gtest/gtest.h>

	#include "arrow/testing/gtest_util.h"
	#include "arrow/util/string.h"
	#include "arrow/util/utf8.h"

	namespace arrow {
	namespace util {

	// Shared fixture for the UTF8 and ASCII validation tests.
	//
	// Declares the numbered tables of valid and invalid byte sequences and
	// rebuilds the aggregated all_* tables once per test case.
	class UTF8Test : public ::testing::Test {
	 protected:
	  // One-time setup: initializes the UTF8 support code, then flattens the
	  // numbered tables into all_valid_sequences and all_invalid_sequences.
	  static void SetUpTestCase() {
	    InitializeUTF8();

	    // The tables are visited through pointers, in numeric order, so the
	    // aggregated tables list the entries of table 1 first and table 4 last.
	    const std::vector<std::string>* const valid_groups[] = {
	        &valid_sequences_1, &valid_sequences_2, &valid_sequences_3, &valid_sequences_4};
	    all_valid_sequences.clear();
	    for (const auto* group : valid_groups) {
	      for (const auto& sequence : *group) {
	        all_valid_sequences.push_back(sequence);
	      }
	    }

	    const std::vector<std::string>* const invalid_groups[] = {
	        &invalid_sequences_1, &invalid_sequences_2, &invalid_sequences_3,
	        &invalid_sequences_4};
	    all_invalid_sequences.clear();
	    for (const auto* group : invalid_groups) {
	      for (const auto& sequence : *group) {
	        all_invalid_sequences.push_back(sequence);
	      }
	    }
	  }

	  // Numbered tables of sequences used as valid UTF8 input by the tests.
	  static std::vector<std::string> valid_sequences_1;
	  static std::vector<std::string> valid_sequences_2;
	  static std::vector<std::string> valid_sequences_3;
	  static std::vector<std::string> valid_sequences_4;

	  // Concatenation of the four valid tables, filled in SetUpTestCase().
	  static std::vector<std::string> all_valid_sequences;

	  // Numbered tables of sequences used as invalid UTF8 input by the tests.
	  static std::vector<std::string> invalid_sequences_1;
	  static std::vector<std::string> invalid_sequences_2;
	  static std::vector<std::string> invalid_sequences_3;
	  static std::vector<std::string> invalid_sequences_4;

	  // Concatenation of the four invalid tables, filled in SetUpTestCase().
	  static std::vector<std::string> all_invalid_sequences;

	  // Inputs for the ASCII validation tests.
	  static std::vector<std::string> valid_sequences_ascii;
	  static std::vector<std::string> invalid_sequences_ascii;
	};

	// Valid one-byte sequences. The NUL byte is built with an explicit length so
	// that it is stored as a real one-character string.
	std::vector<std::string> UTF8Test::valid_sequences_1 = {"a", "\x7f",
	std::string("\0", 1)};
	// Valid two-byte sequences.
	std::vector<std::string> UTF8Test::valid_sequences_2 = {"\xc2\x80", "\xc3\xbf",
	"\xdf\xbf"};
	// Valid three-byte sequences.
	std::vector<std::string> UTF8Test::valid_sequences_3 = {"\xe0\xa0\x80", "\xe8\x9d\xa5",
	"\xef\xbf\xbf"};
	// Valid four-byte sequences.
	std::vector<std::string> UTF8Test::valid_sequences_4 = {
	"\xf0\x90\x80\x80", "\xf0\x9f\xbf\xbf", "\xf4\x80\x80\x80", "\xf4\x8f\xbf\xbf"};

	// Filled by UTF8Test::SetUpTestCase() from the four tables above.
	std::vector<std::string> UTF8Test::all_valid_sequences;

	// Invalid inputs that are one byte long.
	std::vector<std::string> UTF8Test::invalid_sequences_1 = {"\x80", "\xa0", "\xbf", "\xc0",
	"\xc1"};
	// Invalid inputs that are two bytes long.
	std::vector<std::string> UTF8Test::invalid_sequences_2 = {
	"\x80\x80", "\x80\xbf", "\xbf\x80", "\xbf\xbf",
	"\xc1\x80", "\xc2\x7f", "\xc3\xff", "\xdf\xc0"};
	// Invalid inputs that are three bytes long.
	std::vector<std::string> UTF8Test::invalid_sequences_3 = {
	"\xe0\x80\x80", "\xe0\x9f\x80", "\xef\xbf\xc0", "\xef\xc0\xbf", "\xef\xff\xff",
	// Surrogates
	"\xed\xa0\x80", "\xed\xbf\xbf"};
	// Invalid inputs that are four bytes long.
	std::vector<std::string> UTF8Test::invalid_sequences_4 = {
	"\xf0\x80\x80\x80", "\xf0\x8f\x80\x80", "\xf4\x8f\xbf\xc0", "\xf4\x8f\xc0\xbf",
	"\xf4\x90\x80\x80"};

	// Filled by UTF8Test::SetUpTestCase() from the four tables above.
	std::vector<std::string> UTF8Test::all_invalid_sequences;

	// Inputs for the ASCII validation tests. Every invalid entry contains at least
	// one byte with the high bit set.
	std::vector<std::string> UTF8Test::valid_sequences_ascii = {"a", "\x7f", "B", "&"};
	std::vector<std::string> UTF8Test::invalid_sequences_ascii = {
	"\x80", "\xa0\x1e", "\xbf\xef\x6a", "\xc1\x9f\xc3\xd9"};

	// Fixture name used by the UTF8 validation TEST_F cases; adds nothing to
	// UTF8Test.
	class UTF8ValidationTest : public UTF8Test {};

	// Fixture name used by the ASCII validation TEST_F cases; adds nothing to
	// UTF8Test.
	class ASCIIValidationTest : public UTF8Test {};

	// Succeeds when ValidateUTF8 accepts the bytes of `s`; otherwise returns a
	// failure whose message carries the hex-encoded contents of `s`.
	::testing::AssertionResult IsValidUTF8(const std::string& s) {
	  const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
	  if (!ValidateUTF8(bytes, s.size())) {
	    const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
	    return ::testing::AssertionFailure()
	           << "string '" << hex << "' didn't validate as UTF8";
	  }
	  return ::testing::AssertionSuccess();
	}

	// Succeeds when ValidateUTF8 rejects the bytes of `s`; otherwise returns a
	// failure whose message carries the hex-encoded contents of `s`.
	::testing::AssertionResult IsInvalidUTF8(const std::string& s) {
	  const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
	  if (ValidateUTF8(bytes, s.size())) {
	    const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
	    return ::testing::AssertionFailure() << "string '" << hex << "' validated as UTF8";
	  }
	  return ::testing::AssertionSuccess();
	}

	// Succeeds when ValidateAscii accepts the bytes of `s`; otherwise returns a
	// failure whose message carries the hex-encoded contents of `s`.
	::testing::AssertionResult IsValidASCII(const std::string& s) {
	  const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
	  if (!ValidateAscii(bytes, s.size())) {
	    const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
	    return ::testing::AssertionFailure()
	           << "string '" << hex << "' didn't validate as ASCII";
	  }
	  return ::testing::AssertionSuccess();
	}

	// Succeeds when ValidateAscii rejects the bytes of `s`; otherwise returns a
	// failure whose message carries the hex-encoded contents of `s`.
	::testing::AssertionResult IsInvalidASCII(const std::string& s) {
	  const auto* bytes = reinterpret_cast<const uint8_t*>(s.data());
	  if (ValidateAscii(bytes, s.size())) {
	    const std::string hex = HexEncode(bytes, static_cast<int32_t>(s.size()));
	    return ::testing::AssertionFailure() << "string '" << hex << "' validated as ASCII";
	  }
	  return ::testing::AssertionSuccess();
	}

	// Re-runs `validate` on `s` embedded in padding of every length from 1 to 63:
	// first with a run of 'x' in front, then additionally with an equally long
	// run of 'y' behind. Returns at the first padded string `validate` rejects.
	template <typename ValidationFunc>
	void ValidateWithPrefixes(ValidationFunc&& validate, const std::string& s) {
	  // Exercise SIMD optimizations
	  int padding = 1;
	  while (padding < 64) {
	    std::string longer = std::string(padding, 'x') + s;
	    ASSERT_TRUE(validate(longer));
	    longer += std::string(padding, 'y');
	    ASSERT_TRUE(validate(longer));
	    ++padding;
	  }
	}

	// Fatal-assertion wrapper: `s` must validate as UTF8.
	void AssertValidUTF8(const std::string& s) {
	  ASSERT_TRUE(IsValidUTF8(s));
	}

	// Fatal-assertion wrapper: `s` must be rejected by UTF8 validation.
	void AssertInvalidUTF8(const std::string& s) {
	  ASSERT_TRUE(IsInvalidUTF8(s));
	}

	void AssertValidASCII(const std::string& s) {
	ASSERT_TRUE(IsValidASCII(s));
	ValidateWithPrefixes(IsValidASCII, s);
	}

	void AssertInvalidASCII(const std::string& s) {
	ASSERT_TRUE(IsInvalidASCII(s));
	ValidateWithPrefixes(IsInvalidASCII, s);
	}

	// Every entry of valid_sequences_ascii must pass ASCII validation.
	TEST_F(ASCIIValidationTest, AsciiValid) {
	  for (size_t idx = 0; idx < valid_sequences_ascii.size(); ++idx) {
	    AssertValidASCII(valid_sequences_ascii[idx]);
	  }
	}

	// Every entry of invalid_sequences_ascii must fail ASCII validation.
	TEST_F(ASCIIValidationTest, AsciiInvalid) {
	  for (size_t idx = 0; idx < invalid_sequences_ascii.size(); ++idx) {
	    AssertInvalidASCII(invalid_sequences_ascii[idx]);
	  }
	}

	// The empty string must validate as UTF8.
	TEST_F(UTF8ValidationTest, EmptyString) {
	  AssertValidUTF8(std::string());
	}

	// Each valid sequence must validate on its own.
	TEST_F(UTF8ValidationTest, OneCharacterValid) {
	  for (size_t idx = 0; idx < all_valid_sequences.size(); ++idx) {
	    AssertValidUTF8(all_valid_sequences[idx]);
	  }
	}

	// Every ordered pair of valid sequences, concatenated, must validate.
	TEST_F(UTF8ValidationTest, TwoCharacterValid) {
	  for (const auto& first : all_valid_sequences) {
	    for (const auto& second : all_valid_sequences) {
	      std::string joined = first;
	      joined += second;
	      AssertValidUTF8(joined);
	    }
	  }
	}

	// Strings built from 100 randomly chosen valid sequences must validate.
	TEST_F(UTF8ValidationTest, RandomValid) {
	  // Fewer rounds are run when ARROW_VALGRIND is defined.
	#ifdef ARROW_VALGRIND
	  constexpr int kRounds = 50;
	#else
	  constexpr int kRounds = 1000;
	#endif
	  constexpr int kSequencesPerString = 100;
	  std::default_random_engine rng(42);
	  std::uniform_int_distribution<size_t> pick_valid(0, all_valid_sequences.size() - 1);

	  for (int round = 0; round != kRounds; ++round) {
	    std::string s;
	    s.reserve(kSequencesPerString * 4);
	    for (int slot = 0; slot != kSequencesPerString; ++slot) {
	      s.append(all_valid_sequences[pick_valid(rng)]);
	    }
	    AssertValidUTF8(s);
	  }
	}

	// A multi-byte valid sequence with its last byte removed must be rejected.
	TEST_F(UTF8ValidationTest, OneCharacterTruncated) {
	  for (const auto& sequence : all_valid_sequences) {
	    if (sequence.size() <= 1) {
	      continue;  // nothing would be left of a one-byte sequence
	    }
	    std::string truncated = sequence;
	    truncated.pop_back();
	    AssertInvalidUTF8(truncated);
	  }
	}

	// A truncated multi-byte sequence placed after or before a valid sequence
	// must make the whole string invalid.
	TEST_F(UTF8ValidationTest, TwoCharacterTruncated) {
	  for (const auto& whole : all_valid_sequences) {
	    for (const auto& candidate : all_valid_sequences) {
	      if (candidate.size() <= 1) {
	        continue;  // only multi-byte sequences can be truncated
	      }
	      const std::string truncated = candidate.substr(0, candidate.size() - 1);
	      AssertInvalidUTF8(whole + truncated);
	      AssertInvalidUTF8(truncated + whole);
	    }
	  }
	}

	// Each invalid sequence must be rejected on its own.
	TEST_F(UTF8ValidationTest, OneCharacterInvalid) {
	  for (size_t idx = 0; idx < all_invalid_sequences.size(); ++idx) {
	    AssertInvalidUTF8(all_invalid_sequences[idx]);
	  }
	}

	// Any concatenation that contains an invalid sequence must be rejected.
	TEST_F(UTF8ValidationTest, TwoCharacterInvalid) {
	  // One valid and one invalid sequence, in both orders.
	  for (const auto& good : all_valid_sequences) {
	    for (const auto& bad : all_invalid_sequences) {
	      const std::string good_then_bad = good + bad;
	      AssertInvalidUTF8(good_then_bad);
	      const std::string bad_then_good = bad + good;
	      AssertInvalidUTF8(bad_then_good);
	    }
	  }
	  // Two invalid sequences.
	  for (const auto& first_bad : all_invalid_sequences) {
	    for (const auto& second_bad : all_invalid_sequences) {
	      const std::string both_bad = first_bad + second_bad;
	      AssertInvalidUTF8(both_bad);
	    }
	  }
	}

	// A random stream of valid sequences with one invalid sequence in it must
	// be rejected.
	TEST_F(UTF8ValidationTest, RandomInvalid) {
	  // Fewer rounds are run when ARROW_VALGRIND is defined.
	#ifdef ARROW_VALGRIND
	  constexpr int kRounds = 50;
	#else
	  constexpr int kRounds = 1000;
	#endif
	  constexpr int kSequencesPerString = 100;
	  std::default_random_engine rng(42);
	  std::uniform_int_distribution<size_t> pick_valid(0, all_valid_sequences.size() - 1);
	  std::uniform_int_distribution<int> pick_slot(0, kSequencesPerString - 1);
	  std::uniform_int_distribution<size_t> pick_invalid(0,
	                                                     all_invalid_sequences.size() - 1);

	  for (int round = 0; round != kRounds; ++round) {
	    std::string s;
	    s.reserve(kSequencesPerString * 4);
	    // Stuff a single invalid sequence somewhere in a valid UTF8 stream
	    const int bad_slot = pick_slot(rng);
	    for (int slot = 0; slot != kSequencesPerString; ++slot) {
	      const std::string& piece = (slot == bad_slot)
	                                     ? all_invalid_sequences[pick_invalid(rng)]
	                                     : all_valid_sequences[pick_valid(rng)];
	      s += piece;
	    }
	    AssertInvalidUTF8(s);
	  }
	}

	// A random stream of valid sequences in which one multi-byte sequence has
	// lost its last byte must be rejected.
	TEST_F(UTF8ValidationTest, RandomTruncated) {
	  // Fewer rounds are run when ARROW_VALGRIND is defined.
	#ifdef ARROW_VALGRIND
	  constexpr int kRounds = 50;
	#else
	  constexpr int kRounds = 1000;
	#endif
	  constexpr int kSequencesPerString = 100;
	  std::default_random_engine rng(42);
	  std::uniform_int_distribution<size_t> pick_valid(0, all_valid_sequences.size() - 1);
	  std::uniform_int_distribution<int> pick_slot(0, kSequencesPerString - 1);

	  for (int round = 0; round != kRounds; ++round) {
	    std::string s;
	    s.reserve(kSequencesPerString * 4);
	    // Truncate a single sequence somewhere in a valid UTF8 stream
	    const int truncated_slot = pick_slot(rng);
	    for (int slot = 0; slot != kSequencesPerString; ++slot) {
	      if (slot != truncated_slot) {
	        s += all_valid_sequences[pick_valid(rng)];
	        continue;
	      }
	      // Ensure we truncate a 2-byte or more sequence
	      const std::string* candidate = nullptr;
	      do {
	        candidate = &all_valid_sequences[pick_valid(rng)];
	      } while (candidate->size() <= 1);
	      s.append(*candidate, 0, candidate->size() - 1);
	    }
	    AssertInvalidUTF8(s);
	  }
	}

	// Checks the pointer returned by SkipUTF8BOM for inputs with and without a
	// leading "\xef\xbb\xbf" prefix, and that a cut-off prefix is reported as
	// Invalid.
	TEST(SkipUTF8BOM, Basics) {
	  // Expects SkipUTF8BOM to succeed on `s` and to return a pointer that is
	  // `expected_offset` bytes past the start of the data.
	  auto CheckOk = [](const std::string& s, size_t expected_offset) -> void {
	    const uint8_t* data = reinterpret_cast<const uint8_t*>(s.data());
	    ASSERT_OK_AND_ASSIGN(const uint8_t* res,
	                         SkipUTF8BOM(data, static_cast<int64_t>(s.size())));
	    ASSERT_NE(res, nullptr);
	    // `res - data` is a signed ptrdiff_t while expected_offset is a size_t;
	    // convert explicitly so the comparison does not mix signedness.
	    ASSERT_EQ(static_cast<size_t>(res - data), expected_offset);
	  };

	  // Expects SkipUTF8BOM to report `s` as Invalid.
	  auto CheckTruncated = [](const std::string& s) -> void {
	    const uint8_t* data = reinterpret_cast<const uint8_t*>(s.data());
	    ASSERT_RAISES(Invalid, SkipUTF8BOM(data, static_cast<int64_t>(s.size())));
	  };

	  CheckOk("", 0);
	  CheckOk("a", 0);
	  CheckOk("ab", 0);
	  CheckOk("abc", 0);
	  CheckOk("abcd", 0);
	  CheckOk("\xc3\xa9", 0);
	  CheckOk("\xee", 0);
	  CheckOk("\xef\xbc", 0);
	  CheckOk("\xef\xbb\xbe", 0);
	  CheckOk("\xef\xbb\xbf", 3);
	  CheckOk("\xef\xbb\xbfx", 3);

	  CheckTruncated("\xef");
	  CheckTruncated("\xef\xbb");
	}

	// Round-trip style checks for UTF8ToWideString on ASCII, multi-byte and
	// embedded-NUL inputs, plus two inputs that must be reported as Invalid.
	TEST(UTF8ToWideString, Basics) {
	  // The conversion must be reported as Invalid.
	  auto ExpectInvalid = [](const std::string& s) -> void {
	    ASSERT_RAISES(Invalid, UTF8ToWideString(s));
	  };

	  // The conversion must succeed and produce exactly `expected`.
	  auto ExpectConverts = [](const std::string& s,
	                           const std::wstring& expected) -> void {
	    ASSERT_OK_AND_ASSIGN(std::wstring ws, UTF8ToWideString(s));
	    ASSERT_EQ(ws, expected);
	  };

	  ExpectConverts("", L"");
	  ExpectConverts("foo", L"foo");
	  ExpectConverts("h\xc3\xa9h\xc3\xa9", L"h\u00e9h\u00e9");
	  ExpectConverts("\xf0\x9f\x98\x80", L"\U0001F600");
	  ExpectConverts("\xf4\x8f\xbf\xbf", L"\U0010FFFF");
	  ExpectConverts({0, 'x'}, {0, L'x'});

	  ExpectInvalid("\xff");
	  ExpectInvalid("h\xc3");
	}

	// Checks WideStringToUTF8 on ASCII, non-ASCII and embedded-NUL inputs, plus
	// wide strings that must be reported as Invalid.
	TEST(WideStringToUTF8, Basics) {
	  // The conversion must be reported as Invalid.
	  auto ExpectInvalid = [](const std::wstring& ws) -> void {
	    ASSERT_RAISES(Invalid, WideStringToUTF8(ws));
	  };

	  // The conversion must succeed and produce exactly `expected`.
	  auto ExpectConverts = [](const std::wstring& ws,
	                           const std::string& expected) -> void {
	    ASSERT_OK_AND_ASSIGN(std::string s, WideStringToUTF8(ws));
	    ASSERT_EQ(s, expected);
	  };

	  ExpectConverts(L"", "");
	  ExpectConverts(L"foo", "foo");
	  ExpectConverts(L"h\u00e9h\u00e9", "h\xc3\xa9h\xc3\xa9");
	  ExpectConverts(L"\U0001F600", "\xf0\x9f\x98\x80");
	  ExpectConverts(L"\U0010FFFF", "\xf4\x8f\xbf\xbf");
	  ExpectConverts({0, L'x'}, {0, 'x'});

	  // Lone surrogate
	  ExpectInvalid({0xD800});
	  ExpectInvalid({0xDFFF});
	  // Invalid code point
	#if WCHAR_MAX > 0xFFFF
	  ExpectInvalid({0x110000});
	#endif
	}

	// Exercises UTF8DecodeReverse starting from the last byte of short strings.
	TEST(UTF8DecodeReverse, Basics) {
	  // Decoding must succeed and leave the cursor one byte before the start
	  // of `s`.
	  auto ExpectDecodesWhole = [](const std::string& s) -> void {
	    const auto* begin = reinterpret_cast<const uint8_t*>(s.c_str());
	    const uint8_t* i = begin + (s.length() - 1);
	    uint32_t codepoint;
	    EXPECT_TRUE(UTF8DecodeReverse(&i, &codepoint));
	    EXPECT_EQ(i, begin - 1);
	  };

	  // Decoding must be rejected.
	  auto ExpectRejected = [](const std::string& s) -> void {
	    const auto* begin = reinterpret_cast<const uint8_t*>(s.c_str());
	    const uint8_t* i = begin + (s.length() - 1);
	    uint32_t codepoint;
	    EXPECT_FALSE(UTF8DecodeReverse(&i, &codepoint));
	  };

	  // 0x80 == 0b10000000
	  // 0xC0 == 0b11000000
	  // 0xE0 == 0b11100000
	  // 0xF0 == 0b11110000
	  ExpectDecodesWhole("a");
	  ExpectDecodesWhole("\xC0\x80");
	  ExpectDecodesWhole("\xE0\x80\x80");
	  ExpectDecodesWhole("\xF0\x80\x80\x80");

	  // too many continuation code units
	  ExpectRejected("a\x80");
	  ExpectRejected("\xC0\x80\x80");
	  ExpectRejected("\xE0\x80\x80\x80");
	  ExpectRejected("\xF0\x80\x80\x80\x80");
	  // not enough continuation code units
	  ExpectRejected("\xC0");
	  ExpectRejected("\xE0\x80");
	  ExpectRejected("\xF0\x80\x80");
	}

	// Checks the positions reported by UTF8FindIf and UTF8FindIfReverse, as byte
	// offsets from the start of the string.
	TEST(UTF8FindIf, Basics) {
	  // Byte-oriented check: searches `s` for the single byte `test`, verifies
	  // the reported offsets, and cross-checks them against std::find_if over
	  // the raw bytes.
	  auto CheckOk = [](const std::string& s, unsigned char test, int64_t offset_left,
	                    int64_t offset_right) -> void {
	    const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.c_str());
	    const uint8_t* end = begin + s.length();
	    std::reverse_iterator<const uint8_t*> rbegin(end);
	    std::reverse_iterator<const uint8_t*> rend(begin);
	    // Initialized so they are never read while indeterminate.
	    const uint8_t* left = nullptr;
	    const uint8_t* right = nullptr;
	    auto predicate = [&](uint32_t c) { return c == test; };
	    // Fatal assertions: the offsets below are meaningless if a call failed.
	    ASSERT_TRUE(UTF8FindIf(begin, end, predicate, &left));
	    ASSERT_TRUE(UTF8FindIfReverse(begin, end, predicate, &right));
	    EXPECT_EQ(offset_left, left - begin);
	    EXPECT_EQ(offset_right, right - begin);
	    EXPECT_EQ(std::find_if(begin, end, predicate) - begin, left - begin);
	    EXPECT_EQ(std::find_if(rbegin, rend, predicate).base() - begin, right - begin);
	  };
	  // Codepoint-oriented check: same as CheckOk but `test` is a full
	  // codepoint. There is no std::find_if cross-check here because
	  // std::find_if works on bytes, not codepoints.
	  auto CheckOkUTF8 = [](const std::string& s, uint32_t test, int64_t offset_left,
	                        int64_t offset_right) -> void {
	    const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.c_str());
	    const uint8_t* end = begin + s.length();
	    const uint8_t* left = nullptr;
	    const uint8_t* right = nullptr;
	    auto predicate = [&](uint32_t c) { return c == test; };
	    ASSERT_TRUE(UTF8FindIf(begin, end, predicate, &left));
	    ASSERT_TRUE(UTF8FindIfReverse(begin, end, predicate, &right));
	    EXPECT_EQ(offset_left, left - begin);
	    EXPECT_EQ(offset_right, right - begin);
	  };

	  CheckOk("aaaba", 'a', 0, 5);
	  CheckOkUTF8("aaaβa", 'a', 0, 6);

	  CheckOk("aaaba", 'b', 3, 4);
	  CheckOkUTF8("aaaβa", U'β', 3, 5);

	  CheckOk("aaababa", 'b', 3, 6);
	  CheckOkUTF8("aaaβaβa", U'β', 3, 8);

	  CheckOk("aaababa", 'c', 7, 0);
	  CheckOk("aaaβaβa", 'c', 9, 0);
	  CheckOkUTF8("aaaβaβa", U'ɑ', 9, 0);

	  CheckOk("a", 'a', 0, 1);
	  CheckOkUTF8("ɑ", U'ɑ', 0, 2);

	  CheckOk("a", 'b', 1, 0);
	  CheckOkUTF8("ɑ", 'b', 2, 0);

	  CheckOk("", 'b', 0, 0);
	  CheckOkUTF8("", U'β', 0, 0);
	}

	} // namespace util
	} // namespace arrow