| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include <cstdint> |
| #include <iterator> |
| #include <mutex> |
| #include <stdexcept> |
| #include <utility> |
| |
| #include "arrow/result.h" |
| #include "arrow/util/logging.h" |
| #include "arrow/util/utf8.h" |
| #include "arrow/vendored/utfcpp/checked.h" |
| |
| // Can be defined by utfcpp |
| #ifdef NOEXCEPT |
| #undef NOEXCEPT |
| #endif |
| |
| namespace arrow { |
| namespace util { |
| namespace internal { |
| |
| // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
| // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
| |
| // clang-format off |
| const uint8_t utf8_small_table[] = { // NOLINT |
| // The first part of the table (256 entries) maps each byte to a character |
| // class, in order to reduce the size of the transition table and create bitmasks. |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // NOLINT |
| 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // NOLINT |
| 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // NOLINT |
| 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // NOLINT |
| |
| // The second part (starting at offset 256) is a transition table that maps a |
| // combination of an automaton state and a character class to the next state. |
| // Character classes are between 0 and 11, states are multiples of 12 (0 to 96). |
| 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // NOLINT |
| 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // NOLINT |
| 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // NOLINT |
| 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // NOLINT |
| 12,36,12,12,12,12,12,12,12,12,12,12, // NOLINT |
| }; |
| // clang-format on |
| |
| uint16_t utf8_large_table[9 * 256] = {0xffff}; |
| |
| static void InitializeLargeTable() { |
| for (uint32_t state = 0; state < 9; ++state) { |
| for (uint32_t byte = 0; byte < 256; ++byte) { |
| uint32_t byte_class = utf8_small_table[byte]; |
| uint8_t next_state = utf8_small_table[256 + state * 12 + byte_class] / 12; |
| DCHECK_LT(next_state, 9); |
| utf8_large_table[state * 256 + byte] = static_cast<uint16_t>(next_state * 256); |
| } |
| } |
| } |
| |
| ARROW_EXPORT void CheckUTF8Initialized() { |
| DCHECK_EQ(utf8_large_table[0], 0) |
| << "InitializeUTF8() must be called before calling UTF8 routines"; |
| } |
| |
| } // namespace internal |
| |
| static std::once_flag utf8_initialized; |
| |
| void InitializeUTF8() { |
| std::call_once(utf8_initialized, internal::InitializeLargeTable); |
| } |
| |
| static const uint8_t kBOM[] = {0xEF, 0xBB, 0xBF}; |
| |
| Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size) { |
| int64_t i; |
| for (i = 0; i < static_cast<int64_t>(sizeof(kBOM)); ++i) { |
| if (size == 0) { |
| if (i == 0) { |
| // Empty string |
| return data; |
| } else { |
| return Status::Invalid("UTF8 string too short (truncated byte order mark?)"); |
| } |
| } |
| if (data[i] != kBOM[i]) { |
| // BOM not found |
| return data; |
| } |
| --size; |
| } |
| // BOM found |
| return data + i; |
| } |
| |
| namespace { |
| |
| // Some platforms (such as old MinGWs) don't have the <codecvt> header, |
| // so call into a vendored utf8 implementation instead. |
| |
| std::wstring UTF8ToWideStringInternal(const std::string& source) { |
| std::wstring ws; |
| #if WCHAR_MAX > 0xFFFF |
| ::utf8::utf8to32(source.begin(), source.end(), std::back_inserter(ws)); |
| #else |
| ::utf8::utf8to16(source.begin(), source.end(), std::back_inserter(ws)); |
| #endif |
| return ws; |
| } |
| |
| std::string WideStringToUTF8Internal(const std::wstring& source) { |
| std::string s; |
| #if WCHAR_MAX > 0xFFFF |
| ::utf8::utf32to8(source.begin(), source.end(), std::back_inserter(s)); |
| #else |
| ::utf8::utf16to8(source.begin(), source.end(), std::back_inserter(s)); |
| #endif |
| return s; |
| } |
| |
| } // namespace |
| |
| Result<std::wstring> UTF8ToWideString(const std::string& source) { |
| try { |
| return UTF8ToWideStringInternal(source); |
| } catch (std::exception& e) { |
| return Status::Invalid(e.what()); |
| } |
| } |
| |
| ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source) { |
| try { |
| return WideStringToUTF8Internal(source); |
| } catch (std::exception& e) { |
| return Status::Invalid(e.what()); |
| } |
| } |
| |
| } // namespace util |
| } // namespace arrow |