blob: 478d8ade95f7903df29c0e4b910cd441be0db6ea [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cstdint>
#include <iterator>
#include <mutex>
#include <stdexcept>
#include <utility>
#include "arrow/result.h"
#include "arrow/util/logging.h"
#include "arrow/util/utf8.h"
#include "arrow/vendored/utfcpp/checked.h"
// Can be defined by utfcpp
namespace arrow {
namespace util {
namespace internal {
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// clang-format off
// Compact UTF-8 validation automaton: 256 byte-class entries followed by
// 9 * 12 state-transition entries (364 bytes in total).
const uint8_t utf8_small_table[] = { // NOLINT
  // The first part of the table maps each byte value (0..255) to a character
  // class, which reduces the size of the transition table.
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // NOLINT
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // NOLINT
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // NOLINT
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // NOLINT

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // Character classes are between 0 and 11, states are multiples of 12.
  // The entry for (state, class) lives at index 256 + state + class.
  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // NOLINT
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // NOLINT
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // NOLINT
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // NOLINT
  12,36,12,12,12,12,12,12,12,12,12,12, // NOLINT
};
// clang-format on
// Expanded transition table: 9 blocks of 256 entries, indexed by
// `state * 256 + byte` and populated by InitializeLargeTable().
// Only element 0 is explicitly initialized (to the sentinel 0xffff); the
// remaining elements are zero-initialized. Populating the table overwrites
// element 0 with 0, which is the condition CheckUTF8Initialized() tests.
uint16_t utf8_large_table[9 * 256] = {0xffff};
// Populates utf8_large_table from utf8_small_table.
//
// For each of the 9 automaton states and each of the 256 byte values, the
// byte is mapped to its character class, the small transition table is
// consulted, and the resulting state is stored pre-multiplied by 256 so that
// it can directly serve as the base offset of the next lookup in the large
// table.
static void InitializeLargeTable() {
  for (uint32_t state = 0; state < 9; ++state) {
    for (uint32_t byte = 0; byte < 256; ++byte) {
      uint32_t byte_class = utf8_small_table[byte];
      // Small-table states are stored as multiples of 12; divide to get the
      // state index in [0, 9).
      uint8_t next_state = utf8_small_table[256 + state * 12 + byte_class] / 12;
      DCHECK_LT(next_state, 9);
      utf8_large_table[state * 256 + byte] = static_cast<uint16_t>(next_state * 256);
    }
  }
}
// Checks (via DCHECK_EQ) that the large transition table has been populated.
// The first entry holds a non-zero sentinel until initialization overwrites
// it with 0.
ARROW_EXPORT void CheckUTF8Initialized() {
  DCHECK_EQ(utf8_large_table[0], 0)
      << "InitializeUTF8() must be called before calling UTF8 routines";
}
} // namespace internal
// Guards the one-time population of the large UTF-8 transition table.
static std::once_flag utf8_initialized;

// Populates the UTF-8 lookup tables. May be called any number of times, from
// any thread: std::call_once guarantees the initializer runs exactly once.
void InitializeUTF8() {
  std::call_once(utf8_initialized, internal::InitializeLargeTable);
}
// The UTF-8 encoding of the byte order mark (U+FEFF).
static const uint8_t kBOM[] = {0xEF, 0xBB, 0xBF};

// Skips a leading UTF-8 byte order mark, if any.
//
// \param data pointer to the first byte of the input
// \param size number of readable bytes at `data`
// \return `data` unchanged if the input is empty or does not start with a BOM;
//         `data + 3` if a complete BOM is present; Status::Invalid if the
//         input ends in the middle of what looks like a BOM.
Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size) {
  constexpr int64_t kBOMLength = static_cast<int64_t>(sizeof(kBOM));
  for (int64_t i = 0; i < kBOMLength; ++i) {
    // Never read past the end of the input: stop as soon as `i` reaches `size`.
    if (i == size) {
      if (i == 0) {
        // Empty string
        return data;
      }
      // Some, but not all, of the BOM bytes matched before the input ended.
      return Status::Invalid("UTF8 string too short (truncated byte order mark?)");
    }
    if (data[i] != kBOM[i]) {
      // BOM not found
      return data;
    }
  }
  // BOM found
  return data + kBOMLength;
}
namespace {
// Some platforms (such as old MinGWs) don't have the <codecvt> header,
// so call into a vendored utf8 implementation instead.
// Converts a UTF-8 string to a wide string using the vendored utfcpp routines.
//
// Exactly one conversion must run: wchar_t holds UTF-32 code points where it
// is wider than 16 bits, and UTF-16 code units otherwise. Running both would
// append the converted text to `ws` twice.
// Errors are reported by exceptions thrown from the utfcpp routines; callers
// are expected to catch them.
std::wstring UTF8ToWideStringInternal(const std::string& source) {
  std::wstring ws;
#if WCHAR_MAX > 0xFFFF
  ::utf8::utf8to32(source.begin(), source.end(), std::back_inserter(ws));
#else
  ::utf8::utf8to16(source.begin(), source.end(), std::back_inserter(ws));
#endif
  return ws;
}
// Converts a wide string to UTF-8 using the vendored utfcpp routines.
//
// Exactly one conversion must run: the input is treated as UTF-32 where
// wchar_t is wider than 16 bits, and as UTF-16 otherwise. Running both would
// append the converted text to `s` twice.
// Errors are reported by exceptions thrown from the utfcpp routines; callers
// are expected to catch them.
std::string WideStringToUTF8Internal(const std::wstring& source) {
  std::string s;
#if WCHAR_MAX > 0xFFFF
  ::utf8::utf32to8(source.begin(), source.end(), std::back_inserter(s));
#else
  ::utf8::utf16to8(source.begin(), source.end(), std::back_inserter(s));
#endif
  return s;
}
} // namespace
// Converts a UTF-8 string to a wide string.
//
// \param source the UTF-8 encoded input
// \return the converted wide string, or Status::Invalid carrying the
//         exception message if the conversion throws.
Result<std::wstring> UTF8ToWideString(const std::string& source) {
  try {
    return UTF8ToWideStringInternal(source);
  } catch (const std::exception& e) {
    // Translate the exception into a Status rather than letting it escape.
    return Status::Invalid(e.what());
  }
}
// Converts a wide string to a UTF-8 string.
//
// \param source the wide-character input
// \return the UTF-8 encoded string, or Status::Invalid carrying the
//         exception message if the conversion throws.
ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source) {
  try {
    return WideStringToUTF8Internal(source);
  } catch (const std::exception& e) {
    // Translate the exception into a Status rather than letting it escape.
    return Status::Invalid(e.what());
  }
}
} // namespace util
} // namespace arrow