PARQUET-1256: Add --print-key-value-metadata option to parquet_reader tool
This is a minor change useful for debugging.
The parquet_reader tool now accepts a --print-key-value-metadata option; when it is present, the key-value entries of the file metadata are dumped.
Created https://issues.apache.org/jira/browse/PARQUET-1256
Author: Jacek Pliszka <Jacek.Pliszka@gmail.com>
Closes #450 from JacekPliszka/master and squashes the following commits:
0d9a108 [Jacek Pliszka] Added --print-key-value-metadata option to parquet_reader tool
diff --git a/src/parquet/printer.cc b/src/parquet/printer.cc
index 3f18a5c..9f26a41 100644
--- a/src/parquet/printer.cc
+++ b/src/parquet/printer.cc
@@ -33,13 +33,25 @@
#define COL_WIDTH "30"
void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values, const char* filename) {
+ bool print_values, bool print_key_value_metadata,
+ const char* filename) {
const FileMetaData* file_metadata = fileReader->metadata().get();
stream << "File Name: " << filename << "\n";
stream << "Version: " << file_metadata->version() << "\n";
stream << "Created By: " << file_metadata->created_by() << "\n";
stream << "Total rows: " << file_metadata->num_rows() << "\n";
+
+ if (print_key_value_metadata) {
+ auto key_value_metadata = file_metadata->key_value_metadata();
+ int64_t size_of_key_value_metadata = key_value_metadata->size();
+ stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
+ for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
+ stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
+ << key_value_metadata->value(i) << "\n";
+ }
+ }
+
stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
stream << "Number of Real Columns: "
<< file_metadata->schema()->group_node()->field_count() << "\n";
diff --git a/src/parquet/printer.h b/src/parquet/printer.h
index 3b82882..1113c3f 100644
--- a/src/parquet/printer.h
+++ b/src/parquet/printer.h
@@ -38,7 +38,8 @@
~ParquetFilePrinter() {}
void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values = true, const char* fileame = "No Name");
+ bool print_values = true, bool print_key_value_metadata = false,
+ const char* filename = "No Name");
void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
const char* filename = "No Name");
diff --git a/tools/parquet_reader.cc b/tools/parquet_reader.cc
index 7ef59dc..34bdfc1 100644
--- a/tools/parquet_reader.cc
+++ b/tools/parquet_reader.cc
@@ -24,13 +24,14 @@
int main(int argc, char** argv) {
if (argc > 5 || argc < 2) {
std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]"
- "[--columns=...] <file>"
+ "[--print-key-value-metadata] [--columns=...] <file>"
<< std::endl;
return -1;
}
std::string filename;
bool print_values = true;
+ bool print_key_value_metadata = false;
bool memory_map = true;
bool format_json = false;
@@ -42,6 +43,8 @@
for (int i = 1; i < argc; i++) {
if ((param = std::strstr(argv[i], "--only-metadata"))) {
print_values = false;
+ } else if ((param = std::strstr(argv[i], "--print-key-value-metadata"))) {
+ print_key_value_metadata = true;
} else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
memory_map = false;
} else if ((param = std::strstr(argv[i], "--json"))) {
@@ -64,7 +67,8 @@
if (format_json) {
printer.JSONPrint(std::cout, columns, filename.c_str());
} else {
- printer.DebugPrint(std::cout, columns, print_values, filename.c_str());
+ printer.DebugPrint(std::cout, columns, print_values,
+ print_key_value_metadata, filename.c_str());
}
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;