PARQUET-978: [C++] Minimizing footer reads for small(ish) metadata
I realized we don't have a full end to end `writer-reader-test` for parquet. Would have been easier to test this PR. Does it make sense adding one?
Author: Deepak Majeti <deepak.majeti@hpe.com>
Closes #341 from majetideepak/PARQUET-978 and squashes the following commits:
104df0c [Deepak Majeti] clang format
dbb28be [Deepak Majeti] check metadata size in reader-test
2b13715 [Deepak Majeti] Fix comment in writer-internal.cc
850b61c [Deepak Majeti] PARQUET-978
diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc
index c05bb12..3543759 100644
--- a/src/parquet/file/reader-internal.cc
+++ b/src/parquet/file/reader-internal.cc
@@ -202,6 +202,8 @@
// ----------------------------------------------------------------------
// SerializedFile: Parquet on-disk layout
+// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
+static constexpr int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024;
static constexpr uint32_t FOOTER_SIZE = 8;
static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'};
@@ -255,15 +257,19 @@
throw ParquetException("Corrupted file, smaller than file footer");
}
- uint8_t footer_buffer[FOOTER_SIZE];
+ uint8_t footer_buffer[DEFAULT_FOOTER_READ_SIZE];
+ int64_t footer_read_size = std::min(file_size, DEFAULT_FOOTER_READ_SIZE);
int64_t bytes_read =
- source_->ReadAt(file_size - FOOTER_SIZE, FOOTER_SIZE, footer_buffer);
+ source_->ReadAt(file_size - footer_read_size, footer_read_size, footer_buffer);
- if (bytes_read != FOOTER_SIZE || memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) {
+ // Check if all bytes are read. Check if last 4 bytes read have the magic bits
+ if (bytes_read != footer_read_size ||
+ memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) != 0) {
throw ParquetException("Invalid parquet file. Corrupt footer.");
}
- uint32_t metadata_len = *reinterpret_cast<uint32_t*>(footer_buffer);
+ uint32_t metadata_len =
+ *reinterpret_cast<uint32_t*>(footer_buffer + footer_read_size - FOOTER_SIZE);
int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len;
if (FOOTER_SIZE + metadata_len > file_size) {
throw ParquetException(
@@ -273,10 +279,17 @@
std::shared_ptr<PoolBuffer> metadata_buffer =
AllocateBuffer(properties_.memory_pool(), metadata_len);
- bytes_read =
- source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data());
- if (bytes_read != metadata_len) {
- throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+
+ // Check if the footer_buffer contains the entire metadata
+ if (footer_read_size >= (metadata_len + FOOTER_SIZE)) {
+ memcpy(metadata_buffer->mutable_data(),
+ footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), metadata_len);
+ } else {
+ bytes_read =
+ source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data());
+ if (bytes_read != metadata_len) {
+ throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+ }
}
file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len);
diff --git a/src/parquet/file/writer-internal.cc b/src/parquet/file/writer-internal.cc
index b69e87e..633498c 100644
--- a/src/parquet/file/writer-internal.cc
+++ b/src/parquet/file/writer-internal.cc
@@ -62,7 +62,6 @@
void SerializedPageWriter::Close(bool has_dictionary, bool fallback) {
// index_page_offset = 0 since they are not supported
- // TODO: Remove default fallback = 'false' when implemented
metadata_->Finish(num_values_, dictionary_page_offset_, 0, data_page_offset_,
total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback);
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index 71f982b..e8a6813 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -83,6 +83,8 @@
ASSERT_EQ(8, reader_->metadata()->num_rows());
// This file only has 1 row group
ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+ // Size of the metadata is 730 bytes
+ ASSERT_EQ(730, reader_->metadata()->size());
// This row group must have 8 rows
ASSERT_EQ(8, group->metadata()->num_rows());