| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "parquet/printer.h" |
| |
| #include <cstdint> |
| #include <cstdio> |
| #include <memory> |
| #include <ostream> |
| #include <string> |
| #include <vector> |
| |
| #include "arrow/util/key_value_metadata.h" |
| #include "arrow/util/string.h" |
| |
| #include "parquet/column_scanner.h" |
| #include "parquet/exception.h" |
| #include "parquet/file_reader.h" |
| #include "parquet/metadata.h" |
| #include "parquet/schema.h" |
| #include "parquet/statistics.h" |
| #include "parquet/types.h" |
| |
| namespace parquet { |
| |
| class ColumnReader; |
| |
| namespace { |
| |
// Prints the encodings recorded in a column chunk's page-level encoding
// stats, space separated.
void PrintPageEncodingStats(std::ostream& stream,
                            const std::vector<PageEncodingStats>& encoding_stats) {
| for (size_t i = 0; i < encoding_stats.size(); ++i) { |
| const auto& encoding = encoding_stats.at(i); |
| stream << EncodingToString(encoding.encoding); |
    if (encoding.page_type == PageType::DICTIONARY_PAGE) {
      // Explicitly mark encodings that come from a dictionary page
      stream << "(DICT_PAGE)";
| } |
| if (i + 1 != encoding_stats.size()) { |
| stream << " "; |
| } |
| } |
| } |
| |
// Writes the character `c` to the stream `n` times.
void PutChars(std::ostream& stream, char c, int n) {
| for (int i = 0; i < n; ++i) { |
| stream.put(c); |
| } |
| } |
| |
// Prints each key/value pair on its own line, indented by
// `indent_level * indent_width` spaces.
void PrintKeyValueMetadata(std::ostream& stream,
                           const KeyValueMetadata& key_value_metadata,
                           int indent_level = 0, int indent_width = 1) {
| const int64_t size_of_key_value_metadata = key_value_metadata.size(); |
| PutChars(stream, ' ', indent_level * indent_width); |
| stream << "Key Value Metadata: " << size_of_key_value_metadata << " entries\n"; |
| for (int64_t i = 0; i < size_of_key_value_metadata; i++) { |
| PutChars(stream, ' ', (indent_level + 1) * indent_width); |
| stream << "Key nr " << i << " " << key_value_metadata.key(i) << ": " |
| << key_value_metadata.value(i) << "\n"; |
| } |
| } |
| |
// Fixed width of each column when values are printed in tabular form.
| constexpr int kColWidth = 30; |
| |
| } // namespace |
| |
| // ---------------------------------------------------------------------- |
| // ParquetFilePrinter::DebugPrint |
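//
// A typical invocation (illustrative sketch; the file name and option values
// below are placeholders, not part of this API):
//
//   std::unique_ptr<ParquetFileReader> reader =
//       ParquetFileReader::OpenFile("example.parquet");
//   ParquetFilePrinter printer(reader.get());
//   printer.DebugPrint(std::cout, /*selected_columns=*/{},
//                      /*print_values=*/false, /*format_dump=*/false,
//                      /*print_key_value_metadata=*/true, "example.parquet");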
| |
| void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns, |
| bool print_values, bool format_dump, |
| bool print_key_value_metadata, const char* filename) { |
| const FileMetaData* file_metadata = fileReader->metadata().get(); |
| |
| stream << "File Name: " << filename << "\n"; |
| stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n"; |
| stream << "Created By: " << file_metadata->created_by() << "\n"; |
| stream << "Total rows: " << file_metadata->num_rows() << "\n"; |
| |
| if (print_key_value_metadata && file_metadata->key_value_metadata()) { |
| auto key_value_metadata = file_metadata->key_value_metadata(); |
| PrintKeyValueMetadata(stream, *key_value_metadata); |
| } |
| |
| stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n"; |
| stream << "Number of Real Columns: " |
| << file_metadata->schema()->group_node()->field_count() << "\n"; |
| |
  if (selected_columns.empty()) {
    // An empty selection means all columns are printed.
| for (int i = 0; i < file_metadata->num_columns(); i++) { |
| selected_columns.push_back(i); |
| } |
| } else { |
| for (auto i : selected_columns) { |
| if (i < 0 || i >= file_metadata->num_columns()) { |
| throw ParquetException("Selected column is out of range"); |
| } |
| } |
| } |
| |
| stream << "Number of Columns: " << file_metadata->num_columns() << "\n"; |
| stream << "Number of Selected Columns: " << selected_columns.size() << "\n"; |
| for (auto i : selected_columns) { |
| const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" |
| << TypeToString(descr->physical_type(), descr->type_length()); |
| const auto& logical_type = descr->logical_type(); |
| if (!logical_type->is_none()) { |
| stream << " / " << logical_type->ToString(); |
| } |
| if (descr->converted_type() != ConvertedType::NONE) { |
| stream << " / " << ConvertedTypeToString(descr->converted_type()); |
| if (descr->converted_type() == ConvertedType::DECIMAL) { |
| stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")"; |
| } |
| } |
| stream << ")" << std::endl; |
| } |
| |
| for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
| stream << "--- Row Group: " << r << " ---\n"; |
| |
| auto group_reader = fileReader->RowGroup(r); |
| std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
| |
| stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n"; |
| stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size() |
| << " ---\n"; |
| auto sorting_columns = group_metadata->sorting_columns(); |
| if (!sorting_columns.empty()) { |
| stream << "--- Sort Columns:\n"; |
| for (auto column : sorting_columns) { |
| stream << "column_idx: " << column.column_idx |
| << ", descending: " << column.descending |
| << ", nulls_first: " << column.nulls_first << "\n"; |
| } |
| } |
| stream << "--- Rows: " << group_metadata->num_rows() << " ---\n"; |
| |
| // Print column metadata |
| for (auto i : selected_columns) { |
| auto column_chunk = group_metadata->ColumnChunk(i); |
| std::shared_ptr<EncodedStatistics> stats = column_chunk->encoded_statistics(); |
| |
| const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| stream << "Column " << i << std::endl; |
| if (print_key_value_metadata && column_chunk->key_value_metadata()) { |
| PrintKeyValueMetadata(stream, *column_chunk->key_value_metadata(), 1, 2); |
| } |
| stream << " Values: " << column_chunk->num_values(); |
| if (column_chunk->is_stats_set()) { |
| std::string min = stats->min(), max = stats->max(); |
| std::string max_exact = |
| stats->is_max_value_exact.has_value() |
| ? (stats->is_max_value_exact.value() ? "true" : "false") |
| : "unknown"; |
| std::string min_exact = |
| stats->is_min_value_exact.has_value() |
| ? (stats->is_min_value_exact.value() ? "true" : "false") |
| : "unknown"; |
| stream << ", Null Values: " << stats->null_count |
| << ", Distinct Values: " << stats->distinct_count << std::endl |
| << " Max (exact: " << max_exact << "): " |
| << FormatStatValue(descr->physical_type(), max, descr->logical_type()) |
| << ", Min (exact: " << min_exact << "): " |
| << FormatStatValue(descr->physical_type(), min, descr->logical_type()); |
| } else { |
| stream << " Statistics Not Set"; |
| } |
| stream << std::endl |
| << " Compression: " |
| << ::arrow::internal::AsciiToUpper( |
| Codec::GetCodecAsString(column_chunk->compression())) |
| << ", Encodings: "; |
| if (column_chunk->encoding_stats().empty()) { |
| for (auto encoding : column_chunk->encodings()) { |
| stream << EncodingToString(encoding) << " "; |
| } |
| } else { |
| PrintPageEncodingStats(stream, column_chunk->encoding_stats()); |
| } |
| stream << std::endl |
| << " Uncompressed Size: " << column_chunk->total_uncompressed_size() |
| << ", Compressed Size: " << column_chunk->total_compressed_size() |
| << std::endl; |
| } |
| |
| if (!print_values) { |
| continue; |
| } |
| stream << "--- Values ---\n"; |
| |
| static constexpr int bufsize = kColWidth + 1; |
| char buffer[bufsize]; |
| |
| // Create readers for selected columns and print contents |
| std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr); |
| int j = 0; |
| for (auto i : selected_columns) { |
| std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); |
      // Holding this reference is safe as long as the RowGroupReader is not
      // deleted while the scanner is in use.
      auto& scanner = scanners[j++] = Scanner::Make(col_reader);
| |
| if (format_dump) { |
| stream << "Column " << i << std::endl; |
| while (scanner->HasNext()) { |
| scanner->PrintNext(stream, 0, true); |
| stream << "\n"; |
| } |
| continue; |
| } |
| |
| snprintf(buffer, bufsize, "%-*s", kColWidth, |
| file_metadata->schema()->Column(i)->name().c_str()); |
| stream << buffer << '|'; |
| } |
| if (format_dump) { |
| continue; |
| } |
| stream << "\n"; |
| |
    // Print rows across all selected columns until every scanner is exhausted.
    bool hasRow;
| do { |
| hasRow = false; |
| for (const auto& scanner : scanners) { |
| if (scanner->HasNext()) { |
| hasRow = true; |
| scanner->PrintNext(stream, kColWidth); |
| stream << '|'; |
| } |
| } |
| stream << "\n"; |
| } while (hasRow); |
| } |
| } |
| |
| void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns, |
| const char* filename) { |
| const FileMetaData* file_metadata = fileReader->metadata().get(); |
| stream << "{\n"; |
| stream << " \"FileName\": \"" << filename << "\",\n"; |
| stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version()) |
| << "\",\n"; |
| stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n"; |
| stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n"; |
| stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n"; |
| stream << " \"NumberOfRealColumns\": \"" |
| << file_metadata->schema()->group_node()->field_count() << "\",\n"; |
| stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n"; |
| |
| if (selected_columns.empty()) { |
| for (int i = 0; i < file_metadata->num_columns(); i++) { |
| selected_columns.push_back(i); |
| } |
| } else { |
| for (auto i : selected_columns) { |
| if (i < 0 || i >= file_metadata->num_columns()) { |
| throw ParquetException("Selected column is out of range"); |
| } |
| } |
| } |
| |
| stream << " \"Columns\": [\n"; |
| int c = 0; |
| for (auto i : selected_columns) { |
| const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| stream << " { \"Id\": \"" << i << "\"," |
| << " \"Name\": \"" << descr->path()->ToDotString() << "\"," |
| << " \"PhysicalType\": \"" |
| << TypeToString(descr->physical_type(), descr->type_length()) << "\"," |
| << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type()) |
| << "\"," |
| << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }"; |
| c++; |
| if (c != static_cast<int>(selected_columns.size())) { |
| stream << ",\n"; |
| } |
| } |
| |
| stream << "\n ],\n \"RowGroups\": [\n"; |
| for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
| stream << " {\n \"Id\": \"" << r << "\", "; |
| |
| auto group_reader = fileReader->RowGroup(r); |
| std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
| |
| stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", "; |
| stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size() |
| << "\", "; |
| auto row_group_sorting_columns = group_metadata->sorting_columns(); |
| if (!row_group_sorting_columns.empty()) { |
| stream << " \"SortColumns\": [\n"; |
| for (size_t i = 0; i < row_group_sorting_columns.size(); i++) { |
| stream << " {\"column_idx\": " << row_group_sorting_columns[i].column_idx |
| << ", \"descending\": " << row_group_sorting_columns[i].descending |
| << ", \"nulls_first\": " << row_group_sorting_columns[i].nulls_first |
| << "}"; |
| if (i + 1 != row_group_sorting_columns.size()) { |
| stream << ","; |
| } |
| stream << '\n'; |
| } |
| stream << " ], "; |
| } |
| stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n"; |
| |
| // Print column metadata |
| stream << " \"ColumnChunks\": [\n"; |
| int c1 = 0; |
| for (auto i : selected_columns) { |
| auto column_chunk = group_metadata->ColumnChunk(i); |
| std::shared_ptr<Statistics> stats = column_chunk->statistics(); |
| |
| const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| stream << " {\"Id\": \"" << i << "\", \"Values\": \"" |
| << column_chunk->num_values() << "\", " |
| << "\"StatsSet\": "; |
| if (column_chunk->is_stats_set()) { |
| stream << R"("True", "Stats": {)"; |
        // Only emit separators between fields that are actually present;
        // otherwise a missing NumNulls would yield a leading comma and
        // invalid JSON.
        bool need_separator = false;
        if (stats->HasNullCount()) {
          stream << R"("NumNulls": ")" << stats->null_count() << "\"";
          need_separator = true;
        }
        if (stats->HasDistinctCount()) {
          if (need_separator) stream << ", ";
          stream << R"("DistinctValues": ")" << stats->distinct_count() << "\"";
          need_separator = true;
        }
        if (stats->HasMinMax()) {
          if (need_separator) stream << ", ";
          std::string min = stats->EncodeMin(), max = stats->EncodeMax();
          stream << R"("Max": ")"
                 << FormatStatValue(descr->physical_type(), max, descr->logical_type())
                 << "\", "
                 << R"("Min": ")"
                 << FormatStatValue(descr->physical_type(), min, descr->logical_type())
                 << "\"";
| if (stats->is_max_value_exact().has_value()) { |
| stream << ", " |
| << R"("IsMaxValueExact": ")" |
| << (stats->is_max_value_exact().value() ? "True" : "False") << "\""; |
| } else { |
| stream << ", " |
| << R"("IsMaxValueExact": "unknown")"; |
| } |
| if (stats->is_min_value_exact().has_value()) { |
| stream << ", " |
| << R"("IsMinValueExact": ")" |
| << (stats->is_min_value_exact().value() ? "True" : "False") << "\""; |
| } else { |
| stream << ", " |
| << R"("IsMinValueExact": "unknown")"; |
| } |
| } |
| stream << " },"; |
| } else { |
| stream << "\"False\","; |
| } |
| stream << "\n \"Compression\": \"" |
| << ::arrow::internal::AsciiToUpper( |
| Codec::GetCodecAsString(column_chunk->compression())) |
| << R"(", "Encodings": )"; |
| stream << "\""; |
| if (column_chunk->encoding_stats().empty()) { |
| for (auto encoding : column_chunk->encodings()) { |
| stream << EncodingToString(encoding) << " "; |
| } |
| } else { |
| PrintPageEncodingStats(stream, column_chunk->encoding_stats()); |
| } |
| stream << "\""; |
| stream << ", " |
| << R"("UncompressedSize": ")" << column_chunk->total_uncompressed_size() |
| << R"(", "CompressedSize": ")" << column_chunk->total_compressed_size() |
| << "\""; |
| |
| if (column_chunk->bloom_filter_offset()) { |
| // Output BloomFilter {offset, length} |
| stream << ", \"BloomFilter\": {" |
| << R"("offset": ")" << column_chunk->bloom_filter_offset().value() << "\""; |
| if (column_chunk->bloom_filter_length()) { |
| stream << R"(, "length": ")" << column_chunk->bloom_filter_length().value() |
| << "\""; |
| } |
| stream << "}"; |
| } |
| |
| if (column_chunk->GetColumnIndexLocation()) { |
| auto location = column_chunk->GetColumnIndexLocation().value(); |
| // Output ColumnIndex {offset, length} |
| stream << ", \"ColumnIndex\": {" |
| << R"("offset": ")" << location.offset; |
| stream << R"(", "length": ")" << location.length; |
| stream << "\"}"; |
| } |
| |
| if (column_chunk->GetOffsetIndexLocation()) { |
| auto location = column_chunk->GetOffsetIndexLocation().value(); |
| // Output OffsetIndex {offset, length} |
| stream << ", \"OffsetIndex\": {" |
| << R"("offset": ")" << location.offset << "\""; |
| stream << R"(, "length": ")" << location.length << "\""; |
| stream << "}"; |
| } |
| |
| // end of a ColumnChunk |
| stream << " }"; |
| c1++; |
| if (c1 != static_cast<int>(selected_columns.size())) { |
| stream << ",\n"; |
| } |
| } |
| |
| stream << "\n ]\n }"; |
    if (r + 1 != file_metadata->num_row_groups()) {
| stream << ",\n"; |
| } |
| } |
| stream << "\n ]\n}\n"; |
| } |
| |
| } // namespace parquet |