| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "text_converter.h" |
| |
| #include <glog/logging.h> |
| #include <sql.h> |
| #include <stdint.h> |
| |
| #include <algorithm> |
| #include <ostream> |
| |
| // IWYU pragma: no_include <opentelemetry/common/threadlocal.h> |
| #include "common/compiler_util.h" // IWYU pragma: keep |
| #include "olap/hll.h" |
| #include "runtime/decimalv2_value.h" |
| #include "runtime/define_primitive_type.h" |
| #include "runtime/descriptors.h" |
| #include "runtime/types.h" |
| #include "util/slice.h" |
| #include "util/string_parser.hpp" |
| #include "vec/columns/column_array.h" |
| #include "vec/columns/column_complex.h" |
| #include "vec/columns/column_map.h" |
| #include "vec/columns/column_nullable.h" |
| #include "vec/columns/column_string.h" |
| #include "vec/columns/column_struct.h" |
| #include "vec/columns/column_vector.h" |
| #include "vec/core/types.h" |
| #include "vec/runtime/vdatetime_value.h" |
| |
| namespace doris { |
| |
| TextConverter::TextConverter(char escape_char, char collection_delimiter, char map_kv_delimiter) |
| : _escape_char(escape_char), |
| _collection_delimiter(collection_delimiter), |
| _map_kv_delimiter(map_kv_delimiter) {} |
| |
| void TextConverter::write_string_column(const SlotDescriptor* slot_desc, |
| vectorized::MutableColumnPtr* column_ptr, const char* data, |
| size_t len, bool need_escape) { |
| DCHECK(column_ptr->get()->is_nullable()); |
| auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(column_ptr->get()); |
| if (need_escape) { |
| unescape_string_on_spot(data, &len); |
| } |
| if ((len == 2 && data[0] == '\\' && data[1] == 'N') || len == SQL_NULL_DATA) { |
| nullable_column->get_null_map_data().push_back(1); |
| reinterpret_cast<vectorized::ColumnString&>(nullable_column->get_nested_column()) |
| .insert_default(); |
| } else { |
| nullable_column->get_null_map_data().push_back(0); |
| reinterpret_cast<vectorized::ColumnString&>(nullable_column->get_nested_column()) |
| .insert_data(data, len); |
| } |
| } |
| |
| bool TextConverter::_write_data(const TypeDescriptor& type_desc, |
| vectorized::IColumn* nullable_col_ptr, const char* data, size_t len, |
| bool copy_string, bool need_escape, size_t rows, |
| char array_delimiter) { |
| vectorized::IColumn* col_ptr = nullable_col_ptr; |
| // \N means it's NULL |
| std::string col_type_name = col_ptr->get_name(); |
| bool is_null_able = typeid(*nullable_col_ptr) == typeid(vectorized::ColumnNullable); |
| if (is_null_able) { |
| auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(nullable_col_ptr); |
| if ((len == 2 && data[0] == '\\' && data[1] == 'N') || len == SQL_NULL_DATA) { |
| nullable_column->insert_many_defaults(rows); |
| return true; |
| } else { |
| auto& null_map = nullable_column->get_null_map_data(); |
| null_map.resize_fill(null_map.size() + rows, 0); |
| col_ptr = &nullable_column->get_nested_column(); |
| } |
| } |
| |
| StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; |
| size_t origin_size = col_ptr->size(); |
| // Parse the raw-text data. Translate the text string to internal format. |
| switch (type_desc.type) { |
| case TYPE_HLL: { |
| HyperLogLog hyper_log_log(Slice(data, len)); |
| auto& hyper_data = reinterpret_cast<vectorized::ColumnHLL*>(col_ptr)->get_data(); |
| for (size_t i = 0; i < rows; ++i) { |
| hyper_data.emplace_back(hyper_log_log); |
| } |
| break; |
| } |
| case TYPE_STRING: |
| case TYPE_VARCHAR: |
| case TYPE_CHAR: { |
| if (need_escape) { |
| unescape_string_on_spot(data, &len); |
| } |
| reinterpret_cast<vectorized::ColumnString*>(col_ptr)->insert_many_data(data, len, rows); |
| break; |
| } |
| |
| case TYPE_BOOLEAN: { |
| bool num = StringParser::string_to_bool(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::UInt8>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, (uint8_t)num); |
| break; |
| } |
| case TYPE_TINYINT: { |
| int8_t num = StringParser::string_to_int<int8_t>(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int8>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, num); |
| break; |
| } |
| case TYPE_SMALLINT: { |
| int16_t num = StringParser::string_to_int<int16_t>(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int16>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, num); |
| break; |
| } |
| case TYPE_INT: { |
| int32_t num = StringParser::string_to_int<int32_t>(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int32>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, num); |
| break; |
| } |
| case TYPE_BIGINT: { |
| int64_t num = StringParser::string_to_int<int64_t>(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int64>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, num); |
| break; |
| } |
| case TYPE_LARGEINT: { |
| __int128 num = StringParser::string_to_int<__int128>(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int128>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, num); |
| break; |
| } |
| |
| case TYPE_FLOAT: { |
| float num = StringParser::string_to_float<float>(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Float32>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, num); |
| break; |
| } |
| case TYPE_DOUBLE: { |
| double num = StringParser::string_to_float<double>(data, len, &parse_result); |
| if (parse_result != StringParser::PARSE_SUCCESS) { |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Float64>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, num); |
| break; |
| } |
| case TYPE_DATE: { |
| vectorized::VecDateTimeValue ts_slot; |
| if (!ts_slot.from_date_str(data, len)) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| ts_slot.cast_to_date(); |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int64>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, *reinterpret_cast<int64_t*>(&ts_slot)); |
| break; |
| } |
| case TYPE_DATEV2: { |
| vectorized::DateV2Value<vectorized::DateV2ValueType> ts_slot; |
| if (!ts_slot.from_date_str(data, len)) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| uint32_t int_val = ts_slot.to_date_int_val(); |
| reinterpret_cast<vectorized::ColumnVector<vectorized::UInt32>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, int_val); |
| break; |
| } |
| case TYPE_DATETIME: { |
| vectorized::VecDateTimeValue ts_slot; |
| if (!ts_slot.from_date_str(data, len)) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| ts_slot.to_datetime(); |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int64>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, *reinterpret_cast<int64_t*>(&ts_slot)); |
| break; |
| } |
| case TYPE_DATETIMEV2: { |
| vectorized::DateV2Value<vectorized::DateTimeV2ValueType> ts_slot; |
| if (!ts_slot.from_date_str(data, len)) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| uint64_t int_val = ts_slot.to_date_int_val(); |
| reinterpret_cast<vectorized::ColumnVector<vectorized::UInt64>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, int_val); |
| break; |
| } |
| case TYPE_DECIMALV2: { |
| DecimalV2Value decimal_slot; |
| if (decimal_slot.parse_from_str(data, len)) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int128>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, decimal_slot.value()); |
| break; |
| } |
| case TYPE_DECIMAL32: { |
| StringParser::ParseResult result = StringParser::PARSE_SUCCESS; |
| int32_t value = StringParser::string_to_decimal<TYPE_DECIMAL32, int32_t>( |
| data, len, type_desc.precision, type_desc.scale, &result); |
| if (result != StringParser::PARSE_SUCCESS) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int32>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, value); |
| break; |
| } |
| case TYPE_DECIMAL64: { |
| StringParser::ParseResult result = StringParser::PARSE_SUCCESS; |
| int64_t value = StringParser::string_to_decimal<TYPE_DECIMAL64, int64_t>( |
| data, len, type_desc.precision, type_desc.scale, &result); |
| if (result != StringParser::PARSE_SUCCESS) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int64>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, value); |
| break; |
| } |
| case TYPE_DECIMAL128I: { |
| StringParser::ParseResult result = StringParser::PARSE_SUCCESS; |
| vectorized::Int128 value = |
| StringParser::string_to_decimal<TYPE_DECIMAL128I, vectorized::Int128>( |
| data, len, type_desc.precision, type_desc.scale, &result); |
| if (result != StringParser::PARSE_SUCCESS) { |
| parse_result = StringParser::PARSE_FAILURE; |
| break; |
| } |
| reinterpret_cast<vectorized::ColumnVector<vectorized::Int128>*>(col_ptr) |
| ->get_data() |
| .resize_fill(origin_size + rows, value); |
| break; |
| } |
| case TYPE_ARRAY: { |
| auto col = reinterpret_cast<vectorized::ColumnArray*>(col_ptr); |
| |
| std::vector<std::pair<size_t, size_t>> ranges; |
| for (size_t i = 0, from = 0; i <= len; i++) { |
| if (i < len && data[i] != array_delimiter && data[i] != _collection_delimiter) { |
| continue; |
| } |
| ranges.push_back({from, i - from}); |
| from = i + 1; |
| } |
| |
| auto sub_type = type_desc.children[0]; |
| for (int i = 0; i < rows; i++) { |
| for (auto range : ranges) { |
| _write_data(sub_type, &col->get_data(), data + range.first, range.second, |
| copy_string, need_escape, 1, array_delimiter + 1); |
| } |
| col->get_offsets().push_back(col->get_offsets().back() + ranges.size()); |
| } |
| |
| break; |
| } |
| case TYPE_MAP: { |
| auto col = reinterpret_cast<vectorized::ColumnMap*>(col_ptr); |
| |
| std::vector<std::array<size_t, 3>> ranges; |
| for (size_t i = 0, from = 0, kv = 0; i <= len; i++) { |
| /* |
| * In hive , when you special map key and value delimiter as ':' |
| * for map<int,timestamp> column , the query result is correct , but |
| * for map<timestamp, int> column and map<timestamp,timestamp> column , the query result is incorrect, |
| * because this field have many '_map_kv_delimiter'. |
| * |
| * So i use 'kv <= from' in order to get _map_kv_delimiter that appears first. |
| * */ |
| if (i < len && data[i] == _map_kv_delimiter && kv <= from) { |
| kv = i; |
| continue; |
| } |
| if ((i == len || data[i] == _collection_delimiter) && i >= kv + 1) { |
| ranges.push_back({from, kv, i - 1}); |
| from = i + 1; |
| kv = from; |
| } |
| } |
| |
| auto key_type = type_desc.children[0]; |
| auto value_type = type_desc.children[1]; |
| |
| for (int i = 0; i < rows; i++) { |
| for (auto range : ranges) { |
| _write_data(key_type, &col->get_keys(), data + range[0], range[1] - range[0], |
| copy_string, need_escape, 1, array_delimiter + 1); |
| |
| _write_data(value_type, &col->get_values(), data + range[1] + 1, |
| range[2] - range[1], copy_string, need_escape, 1, array_delimiter + 1); |
| } |
| |
| col->get_offsets().push_back(col->get_offsets().back() + ranges.size()); |
| } |
| |
| break; |
| } |
| case TYPE_STRUCT: { |
| auto col = reinterpret_cast<vectorized::ColumnStruct*>(col_ptr); |
| |
| std::vector<std::pair<size_t, size_t>> ranges; |
| for (size_t i = 0, from = 0; i <= len; i++) { |
| if (i == len || data[i] == _collection_delimiter) { |
| ranges.push_back({from, i - from}); |
| from = i + 1; |
| } |
| } |
| for (int i = 0; i < rows; i++) { |
| for (size_t loc = 0; loc < col->get_columns().size(); loc++) { |
| _write_data(type_desc.children[loc], &col->get_column(loc), |
| data + ranges[loc].first, ranges[loc].second, copy_string, need_escape, |
| rows, array_delimiter + 1); |
| } |
| } |
| break; |
| } |
| default: |
| DCHECK(false) << "bad slot type: " << type_desc; |
| break; |
| } |
| |
| if (UNLIKELY(parse_result == StringParser::PARSE_FAILURE)) { |
| if (is_null_able) { |
| auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(nullable_col_ptr); |
| size_t size = nullable_column->get_null_map_data().size(); |
| doris::vectorized::NullMap& null_map_data = nullable_column->get_null_map_data(); |
| for (int i = 1; i <= rows; ++i) { |
| null_map_data[size - i] = 1; |
| } |
| nullable_column->get_nested_column().insert_many_defaults(rows); |
| } |
| return false; |
| } |
| return true; |
| } |
| |
| bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc, |
| vectorized::IColumn* nullable_col_ptr, const char* data, |
| size_t len, bool copy_string, bool need_escape, size_t rows) { |
| return _write_data(slot_desc->type(), nullable_col_ptr, data, len, copy_string, need_escape, |
| rows, '\2'); |
| } |
| |
| void TextConverter::unescape_string_on_spot(const char* src, size_t* len) { |
| const char* start = src; |
| char* dest_ptr = const_cast<char*>(src); |
| const char* end = src + *len; |
| bool escape_next_char = false; |
| |
| while (src < end) { |
| if (*src == _escape_char) { |
| escape_next_char = !escape_next_char; |
| } else { |
| escape_next_char = false; |
| } |
| |
| if (escape_next_char) { |
| ++src; |
| } else { |
| *dest_ptr++ = *src++; |
| } |
| } |
| |
| *len = dest_ptr - start; |
| } |
| |
| } // namespace doris |