| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <gen_cpp/Descriptors_types.h> |
| #include <parallel_hashmap/phmap.h> |
| |
#include <cstddef>
#include <cstdint>
#include <map>
#include <set>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <vector>
| |
| #include "common/status.h" |
| #include "olap/tablet_fwd.h" |
| #include "olap/tablet_schema.h" |
| #include "vec/aggregate_functions/aggregate_function.h" |
| #include "vec/columns/column.h" |
| #include "vec/columns/column_variant.h" |
| #include "vec/common/string_ref.h" |
| #include "vec/core/field.h" |
| #include "vec/core/types.h" |
| #include "vec/data_types/data_type.h" |
| #include "vec/json/json_parser.h" |
| |
| namespace doris { |
| class TabletSchema; |
| enum class FieldType; |
| namespace segment_v2 { |
| struct VariantStatisticsPB; |
| } // namespace segment_v2 |
| namespace vectorized { |
| class Block; |
| class IColumn; |
| struct ColumnWithTypeAndName; |
| class SimdJSONParser; |
| enum class ExtractType; |
| template <typename ParserImpl> |
| class JSONDataParser; |
| template <typename T> |
| class ColumnStr; |
| using ColumnString = ColumnStr<UInt32>; |
| using JsonParser = JSONDataParser<SimdJSONParser>; |
| } // namespace vectorized |
| } // namespace doris |
| |
// Reserved path components for variant columns. SPARSE_COLUMN_PATH is the component
// used when naming sparse (shard) columns; DOC_VALUE_COLUMN_PATH is presumably its
// counterpart for doc-value columns (confirm against create_doc_value_column).
//
// They are declared `inline` (C++17) so that all translation units including this
// header share one object each. A plain namespace-scope `const std::string` has
// internal linkage, which makes every translation unit construct its own copy and
// makes odr-use from inline functions ill-formed.
//
// NOTE: both names stay in the global namespace so that existing users keep compiling.
inline const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__";
inline const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__";
| namespace doris::vectorized::variant_util { |
| |
| // Convert a restricted glob pattern into a regex (for tests/internal use). |
| Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern); |
| |
| // Match a glob pattern against a path using RE2. |
| bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path); |
| |
| using PathToNoneNullValues = std::unordered_map<std::string, int64_t>; |
| using PathToDataTypes = std::unordered_map<PathInData, std::vector<DataTypePtr>, PathInData::Hash>; |
| |
| struct VariantExtendedInfo { |
| PathToNoneNullValues path_to_none_null_values; // key: path, value: number of none null values |
| std::unordered_set<std::string> sparse_paths; // sparse paths in this variant column |
| std::unordered_set<std::string> typed_paths; // typed paths in this variant column |
| std::unordered_set<vectorized::PathInData, vectorized::PathInData::Hash> |
| nested_paths; // nested paths in this variant column |
| PathToDataTypes path_to_data_types; // key: path, value: data types |
| }; |
| |
| /// Returns number of dimensions in Array type. 0 if type is not array. |
| size_t get_number_of_dimensions(const IDataType& type); |
| |
| /// Returns number of dimensions in Array column. 0 if column is not array. |
| size_t get_number_of_dimensions(const IColumn& column); |
| |
| /// Returns type of scalars of Array of arbitrary dimensions. |
| DataTypePtr get_base_type_of_array(const DataTypePtr& type); |
| |
| // Cast column to dst type |
| Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result); |
| |
| struct ExtraInfo { |
| // -1 indicates it's not a Frontend generated column |
| int32_t unique_id = -1; |
| int32_t parent_unique_id = -1; |
| vectorized::PathInData path_info; |
| }; |
| |
| TabletColumn get_column_by_type(const vectorized::DataTypePtr& data_type, const std::string& name, |
| const ExtraInfo& ext_info); |
| |
| // check if the tuple_paths has ambiguous paths |
| // situation: |
| // throw exception if there exists a prefix with matched names, but not matched structure (is Nested, number of dimensions). |
| Status check_variant_has_no_ambiguous_paths(const std::vector<PathInData>& paths); |
| |
| // Pick the tablet schema with the highest schema version as the reference. |
| // Then update all variant columns to there least common types. |
| // Return the final merged schema as common schema. |
| // If base_schema == nullptr then, max schema version tablet schema will be picked as base schema |
| Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas, |
| const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& result, |
| bool check_schema_size = false); |
| |
| // Get least common types for extracted columns which has Path info, |
| // with a speicified variant column's unique id |
| Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas, |
| TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, |
| std::set<PathInData>* path_set); |
| |
| // inherit attributes like index/agg info from it's parent column |
| void inherit_column_attributes(TabletSchemaSPtr& schema); |
| |
| // source: variant column |
| // target: extracted column from variant column |
| void inherit_column_attributes(const TabletColumn& source, TabletColumn& target, |
| TabletSchemaSPtr* target_schema = nullptr); |
| |
| // Align variant subcolumn BF inheritance with FE BF-supported types. |
| bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type); |
| |
| // get sorted subcolumns of variant |
| vectorized::ColumnVariant::Subcolumns get_sorted_subcolumns( |
| const vectorized::ColumnVariant::Subcolumns& subcolumns); |
| |
| bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema, |
| int32_t new_col_idx, int32_t old_col_idx); |
| |
| // create ColumnMap<String, String> |
| TabletColumn create_sparse_column(const TabletColumn& variant); |
| |
| // Create one bucket sparse column: name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + ".b{index}" |
| TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index); |
| |
| TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index); |
| |
| // Compute bucket id for given path string using SipHash64(path) % bucket_num. |
| uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num); |
| |
| void get_field_info(const Field& field, FieldInfo* info); |
| |
| // inherit index from parent column |
| bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes, |
| TabletIndexes& sub_column_indexes, FieldType column_type, |
| const std::string& suffix_path, bool is_array_nested_type = false); |
| |
| bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes, |
| TabletIndexes& sub_column_indexes, const TabletColumn& column); |
| |
| bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes, |
| TabletIndexes& sub_column_indexes, const segment_v2::ColumnMetaPB& column_pb); |
| |
| Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types, |
| TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, |
| const std::map<std::string, TabletColumnPtr>& typed_columns, |
| std::set<PathInData>* path_set = nullptr); |
| |
| bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, |
| const std::string& path, |
| TabletSchema::SubColumnInfo* sub_column_info); |
| |
| class VariantCompactionUtil { |
| public: |
| // get the subpaths and sparse paths for the variant column |
| static void get_subpaths(int32_t max_subcolumns_count, const PathToNoneNullValues& path_stats, |
| TabletSchema::PathsSetInfo& paths_set_info); |
| |
| // collect extended info from the variant column |
| static Status aggregate_variant_extended_info( |
| const RowsetSharedPtr& rs, |
| std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info); |
| |
| // collect path stats from the variant column |
| static Status aggregate_path_to_stats( |
| const RowsetSharedPtr& rs, |
| std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats); |
| |
| // Build the temporary schema for compaction, this will reduce the memory usage of compacting variant columns |
| static Status get_extended_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, |
| TabletSchemaSPtr& target); |
| |
| // Used to collect all the subcolumns types of variant column from rowsets |
| static TabletSchemaSPtr calculate_variant_extended_schema( |
| const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema); |
| |
| // Check if the path stats are consistent between inputs rowsets and output rowset. |
| // Used to check the correctness of compaction. |
| static Status check_path_stats(const std::vector<RowsetSharedPtr>& intputs, |
| RowsetSharedPtr output, BaseTabletSPtr tablet); |
| |
| // Calculate statistics about variant data paths from the encoded sparse column |
| static void calculate_variant_stats(const IColumn& encoded_sparse_column, |
| segment_v2::VariantStatisticsPB* stats, |
| size_t max_sparse_column_statistics_size, size_t row_pos, |
| size_t num_rows); |
| |
| static void get_compaction_subcolumns_from_subpaths( |
| TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column, |
| const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types, |
| const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema); |
| |
| static void get_compaction_subcolumns_from_data_types( |
| TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column, |
| const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types, |
| TabletSchemaSPtr& output_schema); |
| |
| static Status get_compaction_typed_columns(const TabletSchemaSPtr& target, |
| const std::unordered_set<std::string>& typed_paths, |
| const TabletColumnPtr parent_column, |
| TabletSchemaSPtr& output_schema, |
| TabletSchema::PathsSetInfo& paths_set_info); |
| |
| static Status get_compaction_nested_columns( |
| const std::unordered_set<vectorized::PathInData, vectorized::PathInData::Hash>& |
| nested_paths, |
| const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column, |
| TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info); |
| }; |
| |
| // parse a batch of json strings into column object, throws doris::Execption when failed |
| // only UT test |
| void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column, |
| const ParseConfig& config); |
| |
| // Parse variant columns by picking variant positions from `variant_pos` and using provided ParseConfigs. |
| // only UT test |
| Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos, |
| const std::vector<ParseConfig>& configs); |
| |
| // parse a single json, throws doris::Execption when failed |
| void parse_json_to_variant(IColumn& column, const StringRef& jsons, JsonParser* parser, |
| const ParseConfig& config); |
| |
| // Parse variant columns by picking variant positions from `column_pos` and generating ParseConfig |
| // based on tablet schema settings (flatten nested / doc snapshot mode). |
| Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema, |
| const std::vector<uint32_t>& column_pos); |
| |
| // Parse doc snapshot column (paths/values/offsets stored in ColumnVariant) into per-path subcolumns. |
| // NOTE: Returned map keys are `std::string_view` pointing into the underlying doc snapshot paths |
| // column, so the input `variant` must outlive the returned map. |
| phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map( |
| const ColumnVariant& variant); |
| |
| } // namespace doris::vectorized::variant_util |