| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| #include <fmt/format.h> |
| |
| #include <cstring> |
| |
| #include "vec/columns/column_string.h" |
| #include "vec/common/string_ref.h" |
| |
| namespace doris::vectorized { |
// Upper bounds, in bytes, for serialized payloads.
// Declared `inline constexpr` rather than `static constexpr` so that every translation unit
// including this header refers to the same object; functions defined in the header can then
// odr-use the constants (e.g. pass them by reference) without violating the one-definition rule.
inline constexpr size_t DEFAULT_MAX_STRING_SIZE = 1073741824; // 1GB
inline constexpr size_t DEFAULT_MAX_JSON_SIZE = 1073741824;   // 1GB
| |
| // store and commit data. only after commit the data is effective on its' base(ColumnString) |
| // everytime commit, the _data add one row. |
| class BufferWritable final { |
| public: |
| explicit BufferWritable(ColumnString& vector) |
| : _data(vector.get_chars()), _offsets(vector.get_offsets()) {} |
| |
| void write(const char* data, size_t len) { |
| _data.insert(data, data + len); |
| _now_offset += len; |
| } |
| |
| void write(char c) { |
| const char* p = &c; |
| _data.insert(p, p + 1); |
| _now_offset += 1; |
| } |
| |
| // commit may not be called if exception is thrown in writes(e.g. alloc mem failed) |
| void commit() { |
| ColumnString::check_chars_length(_offsets.back() + _now_offset, 0); |
| _offsets.push_back(_offsets.back() + _now_offset); |
| _now_offset = 0; |
| } |
| |
| char* data() { return reinterpret_cast<char*>(_data.data() + _now_offset + _offsets.back()); } |
| |
| void add_offset(size_t len) { _now_offset += len; } |
| |
| void resize(size_t size) { _data.resize(size + _now_offset + _offsets.back()); } |
| |
| template <typename T> |
| void write_number(T data) { |
| fmt::memory_buffer buffer; |
| fmt::format_to(buffer, "{}", data); |
| write(buffer.data(), buffer.size()); |
| } |
| |
| // Write a variable-length unsigned integer to the buffer |
| // maybe it's better not to use this |
| void write_var_uint(UInt64 x) { |
| char bytes[9]; |
| uint8_t i = 0; |
| while (i < 9) { |
| uint8_t byte = x & 0x7F; |
| if (x > 0x7F) { |
| byte |= 0x80; |
| } |
| |
| bytes[i++] = byte; |
| |
| x >>= 7; |
| if (!x) { |
| break; |
| } |
| } |
| write((char*)&i, 1); |
| write(bytes, i); |
| } |
| |
| template <typename Type> |
| void write_binary(const Type& x) { |
| static_assert(std::is_standard_layout_v<Type>); |
| write(reinterpret_cast<const char*>(&x), sizeof(x)); |
| } |
| |
| template <typename Type> |
| requires(std::is_same_v<Type, String> || std::is_same_v<Type, PaddedPODArray<UInt8>>) |
| void write_binary(const Type& s) { |
| write_var_uint(s.size()); |
| write(reinterpret_cast<const char*>(s.data()), s.size()); |
| } |
| |
| void write_binary(const StringRef& s) { |
| write_var_uint(s.size); |
| write(s.data, s.size); |
| } |
| |
| void write_char(char x) { write(x); } |
| |
| // Writes a C-string without creating a temporary object. If the string is a literal, then `strlen` is executed at the compilation stage. |
| // Use when the string is a literal. |
| void write_c_string(const char* s) { write(s, strlen(s)); } |
| |
| /** |
| * @brief Write a string in JSON format, escaping special characters. |
| * |
| * This function takes a string (as a char pointer and size) and writes it to the buffer |
| * as a JSON string literal. This involves: |
| * 1. Enclosing the string in double quotes ("..."). |
| * 2. Escaping control characters (e.g., \n, \t, \b). |
| * 3. Escaping JSON-specific characters like backslash (\\) and double-quote ("). |
| * 4. Escaping ASCII control characters (0x00-0x1F) using `\uXXXX` notation. |
| * 5. Escaping Unicode line separators U+2028 and U+2029 for JavaScript compatibility. |
| * |
| * @param s A pointer to the character data of the string. |
| * @param size The number of bytes in the string. |
| * |
| * @example |
| * // String to be written: |
| * // Hello, "world"! |
| * // (with a newline at the end) |
| * const char* my_str = "Hello, \"world\"!\n"; |
| * size_t my_size = 16; |
| * |
| * // The function will write the following to the buffer: |
| * // "Hello, \"world\"!\\n" |
| */ |
| void write_json_string(const char* s, size_t size) { |
| write_char('"'); |
| const char* begin = s; |
| const char* end = s + size; |
| for (const char* it = begin; it != end; ++it) { |
| switch (*it) { |
| case '\b': |
| write_char('\\'); |
| write_char('b'); |
| break; |
| case '\f': |
| write_char('\\'); |
| write_char('f'); |
| break; |
| case '\n': |
| write_char('\\'); |
| write_char('n'); |
| break; |
| case '\r': |
| write_char('\\'); |
| write_char('r'); |
| break; |
| case '\t': |
| write_char('\\'); |
| write_char('t'); |
| break; |
| case '\\': |
| write_char('\\'); |
| write_char('\\'); |
| break; |
| case '/': |
| write_char('/'); |
| break; |
| case '"': |
| write_char('\\'); |
| write_char('"'); |
| break; |
| default: |
| UInt8 c = *it; |
| if (c <= 0x1F) { |
| /// Escaping of ASCII control characters. |
| |
| UInt8 higher_half = c >> 4; |
| UInt8 lower_half = c & 0xF; |
| |
| write_c_string("\\u00"); |
| write_char('0' + higher_half); |
| |
| if (lower_half <= 9) { |
| write_char('0' + lower_half); |
| } else { |
| write_char('A' + lower_half - 10); |
| } |
| } else if (end - it >= 3 && it[0] == '\xE2' && it[1] == '\x80' && |
| (it[2] == '\xA8' || it[2] == '\xA9')) { |
| /// This is for compatibility with JavaScript, because unescaped line separators are prohibited in string literals, |
| /// and these code points are alternative line separators. |
| |
| if (it[2] == '\xA8') { |
| write_c_string("\\u2028"); |
| } |
| if (it[2] == '\xA9') { |
| write_c_string("\\u2029"); |
| } |
| |
| /// Byte sequence is 3 bytes long. We have additional two bytes to skip. |
| it += 2; |
| } else { |
| write_char(*it); |
| } |
| } |
| } |
| write_char('"'); |
| } |
| |
| void write_json_string(const StringRef& s) { write_json_string(s.data, s.size); } |
| void write_json_string(const std::string& s) { write_json_string(s.data(), s.size()); } |
| void write_json_string(std::string_view s) { write_json_string(s.data(), s.size()); } |
| |
| private: |
| ColumnString::Chars& _data; |
| ColumnString::Offsets& _offsets; |
| size_t _now_offset = 0; |
| }; |
| |
| using VectorBufferWriter = BufferWritable; |
| using BufferWriter = BufferWritable; |
| |
| // There is consumption of the buffer in the read method. |
| class BufferReadable { |
| public: |
| explicit BufferReadable(StringRef& ref) : _data(ref.data) {} |
| explicit BufferReadable(StringRef&& ref) : _data(ref.data) {} |
| ~BufferReadable() = default; |
| |
| StringRef read(size_t len) { |
| StringRef ref(_data, len); |
| _data += len; |
| return ref; |
| } |
| |
| void read(char* data, size_t len) { |
| memcpy(data, _data, len); |
| _data += len; |
| } |
| |
| const char* data() { return _data; } |
| |
| void add_offset(size_t len) { _data += len; } |
| |
| void read_var_uint(UInt64& x) { |
| x = 0; |
| // get length from first byte firstly |
| uint8_t len = 0; |
| read((char*)&len, 1); |
| auto ref = read(len); |
| // read data and set it to x per byte. |
| const char* bytes = ref.data; |
| for (size_t i = 0; i < 9; ++i) { |
| UInt64 byte = bytes[i]; |
| x |= (byte & 0x7F) << (7 * i); |
| |
| if (!(byte & 0x80)) { |
| return; |
| } |
| } |
| } |
| |
| template <typename Type> |
| void read_binary(Type& x) { |
| static_assert(std::is_standard_layout_v<Type>); |
| memcpy_fixed<Type>(reinterpret_cast<char*>(&x), _data); |
| _data += sizeof(x); |
| } |
| |
| template <typename Type> |
| requires(std::is_same_v<Type, String> || std::is_same_v<Type, PaddedPODArray<UInt8>>) |
| void read_binary(Type& s) { |
| UInt64 size = 0; |
| read_var_uint(size); |
| |
| if (size > DEFAULT_MAX_STRING_SIZE) { |
| throw doris::Exception(ErrorCode::INTERNAL_ERROR, |
| "Too large string size." |
| " size: {}, max: {}", |
| size, DEFAULT_MAX_STRING_SIZE); |
| } |
| |
| s.resize(size); |
| read((char*)s.data(), size); |
| } |
| |
| // Note that the StringRef in this function is just a reference, it should be copied outside |
| void read_binary(StringRef& s) { |
| UInt64 size = 0; |
| read_var_uint(size); |
| |
| if (size > DEFAULT_MAX_STRING_SIZE) { |
| throw doris::Exception(ErrorCode::INTERNAL_ERROR, |
| "Too large string size. " |
| " size: {}, max: {}", |
| size, DEFAULT_MAX_STRING_SIZE); |
| } |
| |
| s = read(size); |
| } |
| |
| ///TODO: Currently this function is only called in one place, we might need to convert all read_binary(StringRef) to this style? Or directly use read_binary(String) |
| StringRef read_binary_into(Arena& arena) { |
| UInt64 size = 0; |
| read_var_uint(size); |
| |
| char* data = arena.alloc(size); |
| read(data, size); |
| |
| return {data, size}; |
| } |
| |
| private: |
| const char* _data; |
| }; |
| |
| using VectorBufferReader = BufferReadable; |
| using BufferReader = BufferReadable; |
| } // namespace doris::vectorized |