// be/src/vec/exec/format/table/parquet_utils.h - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include <parquet/statistics.h>

 #include <array>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include "cctz/time_zone.h"
 #include "vec/core/types.h"
 #include "vec/exec/format/parquet/schema_desc.h"

 namespace doris::vectorized::parquet_utils {

 // String names of the five modes declared by this header.
 // NOTE(review): presumably compared against a caller-supplied mode argument --
 // confirm against the call sites.
 inline constexpr const char* MODE_SCHEMA{"parquet_schema"};
 inline constexpr const char* MODE_METADATA{"parquet_metadata"};
 inline constexpr const char* MODE_FILE_METADATA{"parquet_file_metadata"};
 inline constexpr const char* MODE_KEY_VALUE_METADATA{"parquet_kv_metadata"};
 inline constexpr const char* MODE_BLOOM_PROBE{"parquet_bloom_probe"};

 // Positional index of each column named in kSchemaColumnNames.
 // SCHEMA_COLUMN_COUNT is not a column: it is the number of entries and is
 // used as the size of kSchemaColumnNames.
 enum SchemaColumnIndex : size_t {
     SCHEMA_FILE_NAME = 0,       // "file_name"
     SCHEMA_NAME = 1,            // "name"
     SCHEMA_TYPE = 2,            // "type"
     SCHEMA_TYPE_LENGTH = 3,     // "type_length"
     SCHEMA_REPETITION_TYPE = 4, // "repetition_type"
     SCHEMA_NUM_CHILDREN = 5,    // "num_children"
     SCHEMA_CONVERTED_TYPE = 6,  // "converted_type"
     SCHEMA_SCALE = 7,           // "scale"
     SCHEMA_PRECISION = 8,       // "precision"
     SCHEMA_FIELD_ID = 9,        // "field_id"
     SCHEMA_LOGICAL_TYPE = 10,   // "logical_type"
     SCHEMA_COLUMN_COUNT = 11    // number of columns above
 };

 // Positional index of each column named in kMetadataColumnNames.
 // META_COLUMN_COUNT is not a column: it is the number of entries and is used
 // as the size of kMetadataColumnNames.
 enum MetadataColumnIndex : size_t {
     META_FILE_NAME = 0,                   // "file_name"
     META_ROW_GROUP_ID = 1,                // "row_group_id"
     META_ROW_GROUP_NUM_ROWS = 2,          // "row_group_num_rows"
     META_ROW_GROUP_NUM_COLUMNS = 3,       // "row_group_num_columns"
     META_ROW_GROUP_BYTES = 4,             // "row_group_bytes"
     META_COLUMN_ID = 5,                   // "column_id"
     META_FILE_OFFSET = 6,                 // "file_offset"
     META_NUM_VALUES = 7,                  // "num_values"
     META_PATH_IN_SCHEMA = 8,              // "path_in_schema"
     META_TYPE = 9,                        // "type"
     META_STATS_MIN = 10,                  // "stats_min"
     META_STATS_MAX = 11,                  // "stats_max"
     META_STATS_NULL_COUNT = 12,           // "stats_null_count"
     META_STATS_DISTINCT_COUNT = 13,       // "stats_distinct_count"
     META_STATS_MIN_VALUE = 14,            // "stats_min_value"
     META_STATS_MAX_VALUE = 15,            // "stats_max_value"
     META_COMPRESSION = 16,                // "compression"
     META_ENCODINGS = 17,                  // "encodings"
     META_INDEX_PAGE_OFFSET = 18,          // "index_page_offset"
     META_DICTIONARY_PAGE_OFFSET = 19,     // "dictionary_page_offset"
     META_DATA_PAGE_OFFSET = 20,           // "data_page_offset"
     META_TOTAL_COMPRESSED_SIZE = 21,      // "total_compressed_size"
     META_TOTAL_UNCOMPRESSED_SIZE = 22,    // "total_uncompressed_size"
     META_KEY_VALUE_METADATA = 23,         // "key_value_metadata"
     META_BLOOM_FILTER_OFFSET = 24,        // "bloom_filter_offset"
     META_BLOOM_FILTER_LENGTH = 25,        // "bloom_filter_length"
     META_MIN_IS_EXACT = 26,               // "min_is_exact"
     META_MAX_IS_EXACT = 27,               // "max_is_exact"
     META_ROW_GROUP_COMPRESSED_BYTES = 28, // "row_group_compressed_bytes"
     META_COLUMN_COUNT = 29                // number of columns above
 };

 // Positional index of each column named in kFileMetadataColumnNames.
 // FILE_META_COLUMN_COUNT is the number of entries and sizes that array.
 enum FileMetadataColumnIndex : size_t {
     FILE_META_FILE_NAME = 0,                   // "file_name"
     FILE_META_CREATED_BY = 1,                  // "created_by"
     FILE_META_NUM_ROWS = 2,                    // "num_rows"
     FILE_META_NUM_ROW_GROUPS = 3,              // "num_row_groups"
     FILE_META_FORMAT_VERSION = 4,              // "format_version"
     FILE_META_ENCRYPTION_ALGORITHM = 5,        // "encryption_algorithm"
     FILE_META_FOOTER_SIGNING_KEY_METADATA = 6, // "footer_signing_key_metadata"
     FILE_META_COLUMN_COUNT = 7                 // number of columns above
 };

 // Positional index of each column named in kKeyValueColumnNames.
 // KV_COLUMN_COUNT is the number of entries and sizes that array.
 enum KeyValueColumnIndex : size_t {
     KV_FILE_NAME = 0,   // "file_name"
     KV_KEY = 1,         // "key"
     KV_VALUE = 2,       // "value"
     KV_COLUMN_COUNT = 3 // number of columns above
 };

 // Positional index of each column named in kBloomProbeColumnNames.
 // BLOOM_COLUMN_COUNT is the number of entries and sizes that array.
 enum BloomProbeColumnIndex : size_t {
     BLOOM_FILE_NAME = 0,    // "file_name"
     BLOOM_ROW_GROUP_ID = 1, // "row_group_id"
     BLOOM_EXCLUDES = 2,     // "bloom_filter_excludes"
     BLOOM_COLUMN_COUNT = 3  // number of columns above
 };

 inline constexpr std::array<const char*, SCHEMA_COLUMN_COUNT> kSchemaColumnNames = {
         "file_name",      "name",  "type",      "type_length", "repetition_type", "num_children",
         "converted_type", "scale", "precision", "field_id",    "logical_type"};

 inline constexpr std::array<const char*, META_COLUMN_COUNT> kMetadataColumnNames = {
         "file_name",
         "row_group_id",
         "row_group_num_rows",
         "row_group_num_columns",
         "row_group_bytes",
         "column_id",
         "file_offset",
         "num_values",
         "path_in_schema",
         "type",
         "stats_min",
         "stats_max",
         "stats_null_count",
         "stats_distinct_count",
         "stats_min_value",
         "stats_max_value",
         "compression",
         "encodings",
         "index_page_offset",
         "dictionary_page_offset",
         "data_page_offset",
         "total_compressed_size",
         "total_uncompressed_size",
         "key_value_metadata",
         "bloom_filter_offset",
         "bloom_filter_length",
         "min_is_exact",
         "max_is_exact",
         "row_group_compressed_bytes"};

 inline constexpr std::array<const char*, FILE_META_COLUMN_COUNT> kFileMetadataColumnNames = {
         "file_name",
         "created_by",
         "num_rows",
         "num_row_groups",
         "format_version",
         "encryption_algorithm",
         "footer_signing_key_metadata"};

 inline constexpr std::array<const char*, KV_COLUMN_COUNT> kKeyValueColumnNames = {"file_name",
                                                                                   "key", "value"};

 inline constexpr std::array<const char*, BLOOM_COLUMN_COUNT> kBloomProbeColumnNames = {
         "file_name", "row_group_id", "bloom_filter_excludes"};

 // Takes a list of strings and returns a single std::string.
 // NOTE(review): judging by the name, the elements are concatenated with a
 // separator; the separator is defined in the .cpp, not here -- confirm.
 std::string join_path(const std::vector<std::string>& items);

 // Typed helpers that operate on a mutable column passed by reference.
 // NOTE(review): presumably each call appends exactly one cell (a NULL for
 // insert_null) to `column`; the definitions live in the .cpp -- confirm, in
 // particular which column types each helper accepts.
 void insert_int32(MutableColumnPtr& column, Int32 value);
 void insert_int64(MutableColumnPtr& column, Int64 value);
 void insert_bool(MutableColumnPtr& column, bool value);
 void insert_string(MutableColumnPtr& column, const std::string& value);
 void insert_null(MutableColumnPtr& column);

 // String renderings of tparquet enum values, schema elements and encoding
 // lists. Every function returns its result as a std::string by value.
 // NOTE(review): the exact spellings, the list separator used by
 // encodings_to_string, and what is returned for unknown values are defined in
 // the .cpp -- confirm before relying on them.
 std::string physical_type_to_string(tparquet::Type::type type);
 std::string compression_to_string(tparquet::CompressionCodec::type codec);
 std::string converted_type_to_string(tparquet::ConvertedType::type type);
 std::string logical_type_to_string(const tparquet::SchemaElement& element);
 std::string encodings_to_string(const std::vector<tparquet::Encoding::type>& encodings);

 // Reads a value out of `statistics` and hands it back through the
 // `encoded_value` out-parameter.
 // NOTE(review): presumably `is_min` selects the minimum (true) or maximum
 // (false), and the return value is false when no such value is set -- confirm
 // in the .cpp.
 bool try_get_statistics_encoded_value(const tparquet::Statistics& statistics, bool is_min,
                                       std::string* encoded_value);
 // Takes a byte string and returns a std::string.
 // NOTE(review): per the name this is presumably a hexadecimal rendering of
 // `bytes`; letter case and any prefix are defined in the .cpp -- confirm.
 std::string bytes_to_hex_string(const std::string& bytes);
 // Turns an encoded statistics value into a std::string, given the column's
 // schema field, its physical type and a time zone.
 // NOTE(review): `schema_field` is a pointer, so null may be a permitted input,
 // and `ctz` is presumably only relevant for time-like types -- confirm both.
 std::string decode_statistics_value(const FieldSchema* schema_field,
                                     tparquet::Type::type physical_type,
                                     const std::string& encoded_value, const cctz::time_zone& ctz);

 // Writes into `*map`, whose keys are strings and whose values are pointers to
 // const FieldSchema.
 // NOTE(review): presumably walks `field` and its children, using `prefix` as
 // the parent path when forming each key; the stored raw pointers would then
 // refer into `field`, which must outlive the map -- confirm against the .cpp.
 void build_path_map(const FieldSchema& field, const std::string& prefix,
                     std::unordered_map<std::string, const FieldSchema*>* map);

 // Takes two ::parquet::Statistics objects held by shared_ptr and returns
 // nothing.
 // NOTE(review): since the function returns void and both smart pointers are
 // const references, the result is presumably written into the object `left`
 // points to -- confirm the direction and how null pointers are treated.
 void merge_stats(const std::shared_ptr<::parquet::Statistics>& left,
                  const std::shared_ptr<::parquet::Statistics>& right);

 } // namespace doris::vectorized::parquet_utils
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include <parquet/statistics.h>

	#include <array>
	#include <string>
	#include <unordered_map>
	#include <vector>

	#include "cctz/time_zone.h"
	#include "vec/core/types.h"
	#include "vec/exec/format/parquet/schema_desc.h"

	namespace doris::vectorized::parquet_utils {

	// String names of the five modes declared by this header.
	// NOTE(review): presumably compared against a caller-supplied mode argument --
	// confirm against the call sites.
	inline constexpr const char* MODE_SCHEMA{"parquet_schema"};
	inline constexpr const char* MODE_METADATA{"parquet_metadata"};
	inline constexpr const char* MODE_FILE_METADATA{"parquet_file_metadata"};
	inline constexpr const char* MODE_KEY_VALUE_METADATA{"parquet_kv_metadata"};
	inline constexpr const char* MODE_BLOOM_PROBE{"parquet_bloom_probe"};

	// Positional index of each column named in kSchemaColumnNames.
	// SCHEMA_COLUMN_COUNT is not a column: it is the number of entries and is
	// used as the size of kSchemaColumnNames.
	enum SchemaColumnIndex : size_t {
	    SCHEMA_FILE_NAME = 0,       // "file_name"
	    SCHEMA_NAME = 1,            // "name"
	    SCHEMA_TYPE = 2,            // "type"
	    SCHEMA_TYPE_LENGTH = 3,     // "type_length"
	    SCHEMA_REPETITION_TYPE = 4, // "repetition_type"
	    SCHEMA_NUM_CHILDREN = 5,    // "num_children"
	    SCHEMA_CONVERTED_TYPE = 6,  // "converted_type"
	    SCHEMA_SCALE = 7,           // "scale"
	    SCHEMA_PRECISION = 8,       // "precision"
	    SCHEMA_FIELD_ID = 9,        // "field_id"
	    SCHEMA_LOGICAL_TYPE = 10,   // "logical_type"
	    SCHEMA_COLUMN_COUNT = 11    // number of columns above
	};

	// Positional index of each column named in kMetadataColumnNames.
	// META_COLUMN_COUNT is not a column: it is the number of entries and is used
	// as the size of kMetadataColumnNames.
	enum MetadataColumnIndex : size_t {
	    META_FILE_NAME = 0,                   // "file_name"
	    META_ROW_GROUP_ID = 1,                // "row_group_id"
	    META_ROW_GROUP_NUM_ROWS = 2,          // "row_group_num_rows"
	    META_ROW_GROUP_NUM_COLUMNS = 3,       // "row_group_num_columns"
	    META_ROW_GROUP_BYTES = 4,             // "row_group_bytes"
	    META_COLUMN_ID = 5,                   // "column_id"
	    META_FILE_OFFSET = 6,                 // "file_offset"
	    META_NUM_VALUES = 7,                  // "num_values"
	    META_PATH_IN_SCHEMA = 8,              // "path_in_schema"
	    META_TYPE = 9,                        // "type"
	    META_STATS_MIN = 10,                  // "stats_min"
	    META_STATS_MAX = 11,                  // "stats_max"
	    META_STATS_NULL_COUNT = 12,           // "stats_null_count"
	    META_STATS_DISTINCT_COUNT = 13,       // "stats_distinct_count"
	    META_STATS_MIN_VALUE = 14,            // "stats_min_value"
	    META_STATS_MAX_VALUE = 15,            // "stats_max_value"
	    META_COMPRESSION = 16,                // "compression"
	    META_ENCODINGS = 17,                  // "encodings"
	    META_INDEX_PAGE_OFFSET = 18,          // "index_page_offset"
	    META_DICTIONARY_PAGE_OFFSET = 19,     // "dictionary_page_offset"
	    META_DATA_PAGE_OFFSET = 20,           // "data_page_offset"
	    META_TOTAL_COMPRESSED_SIZE = 21,      // "total_compressed_size"
	    META_TOTAL_UNCOMPRESSED_SIZE = 22,    // "total_uncompressed_size"
	    META_KEY_VALUE_METADATA = 23,         // "key_value_metadata"
	    META_BLOOM_FILTER_OFFSET = 24,        // "bloom_filter_offset"
	    META_BLOOM_FILTER_LENGTH = 25,        // "bloom_filter_length"
	    META_MIN_IS_EXACT = 26,               // "min_is_exact"
	    META_MAX_IS_EXACT = 27,               // "max_is_exact"
	    META_ROW_GROUP_COMPRESSED_BYTES = 28, // "row_group_compressed_bytes"
	    META_COLUMN_COUNT = 29                // number of columns above
	};

	// Positional index of each column named in kFileMetadataColumnNames.
	// FILE_META_COLUMN_COUNT is the number of entries and sizes that array.
	enum FileMetadataColumnIndex : size_t {
	    FILE_META_FILE_NAME = 0,                   // "file_name"
	    FILE_META_CREATED_BY = 1,                  // "created_by"
	    FILE_META_NUM_ROWS = 2,                    // "num_rows"
	    FILE_META_NUM_ROW_GROUPS = 3,              // "num_row_groups"
	    FILE_META_FORMAT_VERSION = 4,              // "format_version"
	    FILE_META_ENCRYPTION_ALGORITHM = 5,        // "encryption_algorithm"
	    FILE_META_FOOTER_SIGNING_KEY_METADATA = 6, // "footer_signing_key_metadata"
	    FILE_META_COLUMN_COUNT = 7                 // number of columns above
	};

	// Positional index of each column named in kKeyValueColumnNames.
	// KV_COLUMN_COUNT is the number of entries and sizes that array.
	enum KeyValueColumnIndex : size_t {
	    KV_FILE_NAME = 0,   // "file_name"
	    KV_KEY = 1,         // "key"
	    KV_VALUE = 2,       // "value"
	    KV_COLUMN_COUNT = 3 // number of columns above
	};

	// Positional index of each column named in kBloomProbeColumnNames.
	// BLOOM_COLUMN_COUNT is the number of entries and sizes that array.
	enum BloomProbeColumnIndex : size_t {
	    BLOOM_FILE_NAME = 0,    // "file_name"
	    BLOOM_ROW_GROUP_ID = 1, // "row_group_id"
	    BLOOM_EXCLUDES = 2,     // "bloom_filter_excludes"
	    BLOOM_COLUMN_COUNT = 3  // number of columns above
	};

	inline constexpr std::array<const char*, SCHEMA_COLUMN_COUNT> kSchemaColumnNames = {
	"file_name", "name", "type", "type_length", "repetition_type", "num_children",
	"converted_type", "scale", "precision", "field_id", "logical_type"};

	inline constexpr std::array<const char*, META_COLUMN_COUNT> kMetadataColumnNames = {
	"file_name",
	"row_group_id",
	"row_group_num_rows",
	"row_group_num_columns",
	"row_group_bytes",
	"column_id",
	"file_offset",
	"num_values",
	"path_in_schema",
	"type",
	"stats_min",
	"stats_max",
	"stats_null_count",
	"stats_distinct_count",
	"stats_min_value",
	"stats_max_value",
	"compression",
	"encodings",
	"index_page_offset",
	"dictionary_page_offset",
	"data_page_offset",
	"total_compressed_size",
	"total_uncompressed_size",
	"key_value_metadata",
	"bloom_filter_offset",
	"bloom_filter_length",
	"min_is_exact",
	"max_is_exact",
	"row_group_compressed_bytes"};

	inline constexpr std::array<const char*, FILE_META_COLUMN_COUNT> kFileMetadataColumnNames = {
	"file_name",
	"created_by",
	"num_rows",
	"num_row_groups",
	"format_version",
	"encryption_algorithm",
	"footer_signing_key_metadata"};

	inline constexpr std::array<const char*, KV_COLUMN_COUNT> kKeyValueColumnNames = {"file_name",
	"key", "value"};

	inline constexpr std::array<const char*, BLOOM_COLUMN_COUNT> kBloomProbeColumnNames = {
	"file_name", "row_group_id", "bloom_filter_excludes"};

	// Takes a list of strings and returns a single std::string.
	// NOTE(review): judging by the name, the elements are concatenated with a
	// separator; the separator is defined in the .cpp, not here -- confirm.
	std::string join_path(const std::vector<std::string>& items);

	// Typed helpers that operate on a mutable column passed by reference.
	// NOTE(review): presumably each call appends exactly one cell (a NULL for
	// insert_null) to `column`; the definitions live in the .cpp -- confirm, in
	// particular which column types each helper accepts.
	void insert_int32(MutableColumnPtr& column, Int32 value);
	void insert_int64(MutableColumnPtr& column, Int64 value);
	void insert_bool(MutableColumnPtr& column, bool value);
	void insert_string(MutableColumnPtr& column, const std::string& value);
	void insert_null(MutableColumnPtr& column);

	// String renderings of tparquet enum values, schema elements and encoding
	// lists. Every function returns its result as a std::string by value.
	// NOTE(review): the exact spellings, the list separator used by
	// encodings_to_string, and what is returned for unknown values are defined in
	// the .cpp -- confirm before relying on them.
	std::string physical_type_to_string(tparquet::Type::type type);
	std::string compression_to_string(tparquet::CompressionCodec::type codec);
	std::string converted_type_to_string(tparquet::ConvertedType::type type);
	std::string logical_type_to_string(const tparquet::SchemaElement& element);
	std::string encodings_to_string(const std::vector<tparquet::Encoding::type>& encodings);

	// Reads a value out of `statistics` and hands it back through the
	// `encoded_value` out-parameter.
	// NOTE(review): presumably `is_min` selects the minimum (true) or maximum
	// (false), and the return value is false when no such value is set -- confirm
	// in the .cpp.
	bool try_get_statistics_encoded_value(const tparquet::Statistics& statistics, bool is_min,
	std::string* encoded_value);
	// Takes a byte string and returns a std::string.
	// NOTE(review): per the name this is presumably a hexadecimal rendering of
	// `bytes`; letter case and any prefix are defined in the .cpp -- confirm.
	std::string bytes_to_hex_string(const std::string& bytes);
	// Turns an encoded statistics value into a std::string, given the column's
	// schema field, its physical type and a time zone.
	// NOTE(review): `schema_field` is a pointer, so null may be a permitted input,
	// and `ctz` is presumably only relevant for time-like types -- confirm both.
	std::string decode_statistics_value(const FieldSchema* schema_field,
	tparquet::Type::type physical_type,
	const std::string& encoded_value, const cctz::time_zone& ctz);

	void build_path_map(const FieldSchema& field, const std::string& prefix,
	std::unordered_map<std::string, const FieldSchema> map);

	// Takes two ::parquet::Statistics objects held by shared_ptr and returns
	// nothing.
	// NOTE(review): since the function returns void and both smart pointers are
	// const references, the result is presumably written into the object `left`
	// points to -- confirm the direction and how null pointers are treated.
	void merge_stats(const std::shared_ptr<::parquet::Statistics>& left,
	const std::shared_ptr<::parquet::Statistics>& right);

	} // namespace doris::vectorized::parquet_utils