// blob: b79dee1cb3999c24200d7455021c6103b8fcdd2a [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <parquet/statistics.h>

#include <array>
#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "cctz/time_zone.h"
#include "vec/core/types.h"
#include "vec/exec/format/parquet/schema_desc.h"
namespace doris::vectorized::parquet_utils {
// Mode identifier strings. Each constant is a compile-time C string.
// NOTE(review): presumably each mode selects one of the column layouts
// declared below (schema / metadata / file metadata / key-value / bloom
// probe) — confirm against the callers that compare against these values.
inline constexpr const char* MODE_SCHEMA{"parquet_schema"};
inline constexpr const char* MODE_METADATA{"parquet_metadata"};
inline constexpr const char* MODE_FILE_METADATA{"parquet_file_metadata"};
inline constexpr const char* MODE_KEY_VALUE_METADATA{"parquet_kv_metadata"};
inline constexpr const char* MODE_BLOOM_PROBE{"parquet_bloom_probe"};
// Ordinals of the schema result columns. The enumerators are used as indexes
// into kSchemaColumnNames, whose length is SCHEMA_COLUMN_COUNT; the trailing
// comment on each line is the name stored at that position.
enum SchemaColumnIndex : size_t {
    SCHEMA_FILE_NAME = 0,       // "file_name"
    SCHEMA_NAME = 1,            // "name"
    SCHEMA_TYPE = 2,            // "type"
    SCHEMA_TYPE_LENGTH = 3,     // "type_length"
    SCHEMA_REPETITION_TYPE = 4, // "repetition_type"
    SCHEMA_NUM_CHILDREN = 5,    // "num_children"
    SCHEMA_CONVERTED_TYPE = 6,  // "converted_type"
    SCHEMA_SCALE = 7,           // "scale"
    SCHEMA_PRECISION = 8,       // "precision"
    SCHEMA_FIELD_ID = 9,        // "field_id"
    SCHEMA_LOGICAL_TYPE = 10,   // "logical_type"
    // Sentinel: number of columns. Left implicit so it always follows the
    // last real column; keep it as the final enumerator.
    SCHEMA_COLUMN_COUNT
};
// Ordinals of the metadata result columns. The enumerators are used as indexes
// into kMetadataColumnNames, whose length is META_COLUMN_COUNT; the trailing
// comment on each line is the name stored at that position.
enum MetadataColumnIndex : size_t {
    META_FILE_NAME = 0,                   // "file_name"
    META_ROW_GROUP_ID = 1,                // "row_group_id"
    META_ROW_GROUP_NUM_ROWS = 2,          // "row_group_num_rows"
    META_ROW_GROUP_NUM_COLUMNS = 3,       // "row_group_num_columns"
    META_ROW_GROUP_BYTES = 4,             // "row_group_bytes"
    META_COLUMN_ID = 5,                   // "column_id"
    META_FILE_OFFSET = 6,                 // "file_offset"
    META_NUM_VALUES = 7,                  // "num_values"
    META_PATH_IN_SCHEMA = 8,              // "path_in_schema"
    META_TYPE = 9,                        // "type"
    META_STATS_MIN = 10,                  // "stats_min"
    META_STATS_MAX = 11,                  // "stats_max"
    META_STATS_NULL_COUNT = 12,           // "stats_null_count"
    META_STATS_DISTINCT_COUNT = 13,       // "stats_distinct_count"
    META_STATS_MIN_VALUE = 14,            // "stats_min_value"
    META_STATS_MAX_VALUE = 15,            // "stats_max_value"
    META_COMPRESSION = 16,                // "compression"
    META_ENCODINGS = 17,                  // "encodings"
    META_INDEX_PAGE_OFFSET = 18,          // "index_page_offset"
    META_DICTIONARY_PAGE_OFFSET = 19,     // "dictionary_page_offset"
    META_DATA_PAGE_OFFSET = 20,           // "data_page_offset"
    META_TOTAL_COMPRESSED_SIZE = 21,      // "total_compressed_size"
    META_TOTAL_UNCOMPRESSED_SIZE = 22,    // "total_uncompressed_size"
    META_KEY_VALUE_METADATA = 23,         // "key_value_metadata"
    META_BLOOM_FILTER_OFFSET = 24,        // "bloom_filter_offset"
    META_BLOOM_FILTER_LENGTH = 25,        // "bloom_filter_length"
    META_MIN_IS_EXACT = 26,               // "min_is_exact"
    META_MAX_IS_EXACT = 27,               // "max_is_exact"
    META_ROW_GROUP_COMPRESSED_BYTES = 28, // "row_group_compressed_bytes"
    // Sentinel: number of columns. Left implicit so it always follows the
    // last real column; keep it as the final enumerator.
    META_COLUMN_COUNT
};
// Ordinals of the file-metadata result columns. The enumerators are used as
// indexes into kFileMetadataColumnNames, whose length is
// FILE_META_COLUMN_COUNT; the trailing comment on each line is the name stored
// at that position.
enum FileMetadataColumnIndex : size_t {
    FILE_META_FILE_NAME = 0,                   // "file_name"
    FILE_META_CREATED_BY = 1,                  // "created_by"
    FILE_META_NUM_ROWS = 2,                    // "num_rows"
    FILE_META_NUM_ROW_GROUPS = 3,              // "num_row_groups"
    FILE_META_FORMAT_VERSION = 4,              // "format_version"
    FILE_META_ENCRYPTION_ALGORITHM = 5,        // "encryption_algorithm"
    FILE_META_FOOTER_SIGNING_KEY_METADATA = 6, // "footer_signing_key_metadata"
    // Sentinel: number of columns. Left implicit so it always follows the
    // last real column; keep it as the final enumerator.
    FILE_META_COLUMN_COUNT
};
// Ordinals of the key/value result columns. The enumerators are used as
// indexes into kKeyValueColumnNames, whose length is KV_COLUMN_COUNT.
enum KeyValueColumnIndex : size_t {
    KV_FILE_NAME = 0, // "file_name"
    KV_KEY = 1,       // "key"
    KV_VALUE = 2,     // "value"
    // Sentinel: number of columns; keep it as the final enumerator.
    KV_COLUMN_COUNT
};
// Ordinals of the bloom-probe result columns. The enumerators are used as
// indexes into kBloomProbeColumnNames, whose length is BLOOM_COLUMN_COUNT.
enum BloomProbeColumnIndex : size_t {
    BLOOM_FILE_NAME = 0,    // "file_name"
    BLOOM_ROW_GROUP_ID = 1, // "row_group_id"
    BLOOM_EXCLUDES = 2,     // "bloom_filter_excludes"
    // Sentinel: number of columns; keep it as the final enumerator.
    BLOOM_COLUMN_COUNT
};
// Column names of the schema result set, in SchemaColumnIndex order:
// kSchemaColumnNames[SCHEMA_NAME] is "name", and so on.
inline constexpr std::array<const char*, SCHEMA_COLUMN_COUNT> kSchemaColumnNames = {
        "file_name",      "name",  "type",      "type_length", "repetition_type", "num_children",
        "converted_type", "scale", "precision", "field_id",    "logical_type"};
// The array length comes from the enum, not from the initializer list, so a
// list that is too short still compiles and leaves the trailing entries as
// nullptr. Catch that at compile time instead of at the first dereference.
static_assert(kSchemaColumnNames.back() != nullptr,
              "kSchemaColumnNames needs one name per SchemaColumnIndex column");
// Column names of the metadata result set, in MetadataColumnIndex order:
// kMetadataColumnNames[META_TYPE] is "type", and so on.
inline constexpr std::array<const char*, META_COLUMN_COUNT> kMetadataColumnNames = {
        "file_name",
        "row_group_id",
        "row_group_num_rows",
        "row_group_num_columns",
        "row_group_bytes",
        "column_id",
        "file_offset",
        "num_values",
        "path_in_schema",
        "type",
        "stats_min",
        "stats_max",
        "stats_null_count",
        "stats_distinct_count",
        "stats_min_value",
        "stats_max_value",
        "compression",
        "encodings",
        "index_page_offset",
        "dictionary_page_offset",
        "data_page_offset",
        "total_compressed_size",
        "total_uncompressed_size",
        "key_value_metadata",
        "bloom_filter_offset",
        "bloom_filter_length",
        "min_is_exact",
        "max_is_exact",
        "row_group_compressed_bytes"};
// The array length comes from the enum, not from the initializer list, so a
// list that is too short still compiles and leaves the trailing entries as
// nullptr. Catch that at compile time instead of at the first dereference.
static_assert(kMetadataColumnNames.back() != nullptr,
              "kMetadataColumnNames needs one name per MetadataColumnIndex column");
// Column names of the file-metadata result set, in FileMetadataColumnIndex
// order: kFileMetadataColumnNames[FILE_META_NUM_ROWS] is "num_rows", and so on.
inline constexpr std::array<const char*, FILE_META_COLUMN_COUNT> kFileMetadataColumnNames = {
        "file_name",
        "created_by",
        "num_rows",
        "num_row_groups",
        "format_version",
        "encryption_algorithm",
        "footer_signing_key_metadata"};
// The array length comes from the enum, not from the initializer list, so a
// list that is too short still compiles and leaves the trailing entries as
// nullptr. Catch that at compile time instead of at the first dereference.
static_assert(kFileMetadataColumnNames.back() != nullptr,
              "kFileMetadataColumnNames needs one name per FileMetadataColumnIndex column");
// Column names of the key/value result set, in KeyValueColumnIndex order.
inline constexpr std::array<const char*, KV_COLUMN_COUNT> kKeyValueColumnNames = {
        "file_name", "key", "value"};
// The array length comes from the enum, not from the initializer list, so a
// list that is too short still compiles and leaves the trailing entries as
// nullptr. Catch that at compile time instead of at the first dereference.
static_assert(kKeyValueColumnNames.back() != nullptr,
              "kKeyValueColumnNames needs one name per KeyValueColumnIndex column");
// Column names of the bloom-probe result set, in BloomProbeColumnIndex order.
inline constexpr std::array<const char*, BLOOM_COLUMN_COUNT> kBloomProbeColumnNames = {
        "file_name", "row_group_id", "bloom_filter_excludes"};
// The array length comes from the enum, not from the initializer list, so a
// list that is too short still compiles and leaves the trailing entries as
// nullptr. Catch that at compile time instead of at the first dereference.
static_assert(kBloomProbeColumnNames.back() != nullptr,
              "kBloomProbeColumnNames needs one name per BloomProbeColumnIndex column");
// Builds a single string from the elements of `items`.
// Declaration only: the separator and the result for an empty vector are
// decided by the out-of-line definition.
// NOTE(review): presumably yields a dotted column path ("a.b.c") — confirm.
std::string join_path(const std::vector<std::string>& items);
// Column write helpers. Only the declarations are visible here: each takes the
// destination column by mutable reference plus, except for insert_null, the
// value to write.
// NOTE(review): judging by the names, each call appends exactly one cell;
// whether `column` must be nullable or of a specific concrete column type is
// determined by the definitions — confirm before using with a new column type.
void insert_int32(MutableColumnPtr& column, Int32 value);
void insert_int64(MutableColumnPtr& column, Int64 value);
void insert_bool(MutableColumnPtr& column, bool value);
void insert_string(MutableColumnPtr& column, const std::string& value);
// NOTE(review): presumably appends a NULL cell — confirm.
void insert_null(MutableColumnPtr& column);
// String renderers for `tparquet` metadata values. Each returns a std::string
// by value; the exact spellings and the result for an unrecognized enumerator
// are defined out of line.
// NOTE(review): presumably these feed the "type", "compression",
// "converted_type", "logical_type" and "encodings" output columns — confirm.
std::string physical_type_to_string(tparquet::Type::type type);
std::string compression_to_string(tparquet::CompressionCodec::type codec);
std::string converted_type_to_string(tparquet::ConvertedType::type type);
// Takes the whole schema element rather than a single enum value.
std::string logical_type_to_string(const tparquet::SchemaElement& element);
// Renders a list of encodings as one string; the delimiter is not visible here.
std::string encodings_to_string(const std::vector<tparquet::Encoding::type>& encodings);
// Fetches an encoded statistics value from `statistics` into `*encoded_value`.
// NOTE(review): presumably `is_min` selects the minimum (true) or maximum
// (false) and the return value is false when no such value is set — confirm
// against the definition, including whether `*encoded_value` is touched on
// failure and whether a null `encoded_value` is tolerated.
bool try_get_statistics_encoded_value(const tparquet::Statistics& statistics, bool is_min,
std::string* encoded_value);
// Renders `bytes` as a hexadecimal string.
// NOTE(review): letter case and any prefix are set by the definition — confirm.
std::string bytes_to_hex_string(const std::string& bytes);
// Converts an encoded statistics value into a display string.
// `schema_field` is passed as a pointer.
// NOTE(review): presumably nullptr is accepted when no schema information is
// available, `physical_type` drives the decoding, and `ctz` is used for
// timestamp-like values — confirm all three against the definition.
std::string decode_statistics_value(const FieldSchema* schema_field,
tparquet::Type::type physical_type,
const std::string& encoded_value, const cctz::time_zone& ctz);
// Populates `*map` with path-string -> FieldSchema pointer entries derived from
// `field`. The map stores raw, non-owning pointers, so the FieldSchema objects
// must outlive the map.
// NOTE(review): presumably recurses into child fields, using `prefix` as the
// path of the parent — confirm against the definition.
void build_path_map(const FieldSchema& field, const std::string& prefix,
std::unordered_map<std::string, const FieldSchema*>* map);
// Combines two ::parquet::Statistics objects. Both shared_ptr handles are taken
// by const reference and nothing is returned, so any result has to be written
// through one of the pointed-to (non-const) Statistics objects.
// NOTE(review): presumably `right` is merged into `left`; which side is
// modified and whether null pointers are tolerated is not visible here —
// confirm against the definition.
void merge_stats(const std::shared_ptr<::parquet::Statistics>& left,
const std::shared_ptr<::parquet::Statistics>& right);
} // namespace doris::vectorized::parquet_utils