PARQUET-1360: Use conforming API style, variable names in WriteFileMetaData functions
Per comments in PARQUET-1348. I also upgraded clang-format to 6.0 here so pardon the diff noise from that.
To summarize the style points:
* Don't pass `const std:shared_ptr<T>&` if `const T&` will do
* Don't pass `const std::shared_ptr<T>&` if `T*` will do
* I prefer to only use class static function members for alternate constructors
* Out arguments after in arguments
* `lower_with_underscores` for variable names
Author: Wes McKinney <wesm+git@apache.org>
Closes #482 from wesm/PARQUET-1360 and squashes the following commits:
375a442 [Wes McKinney] API conformity and changes per review comments in ARROW-1348
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b53f598..927f728 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,7 +84,7 @@
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules")
set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support")
-set(CLANG_FORMAT_VERSION "5.0")
+set(CLANG_FORMAT_VERSION "6.0")
find_package(ClangTools)
if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND)
# Generate a Clang compile_commands.json "compilation database" file for use
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 02b8d52..d4f5b00 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -52,16 +52,16 @@
using arrow::ChunkedArray;
using arrow::Column;
using arrow::DataType;
+using arrow::default_memory_pool;
using arrow::ListArray;
-using arrow::ResizableBuffer;
using arrow::PrimitiveArray;
+using arrow::ResizableBuffer;
using arrow::Status;
using arrow::Table;
using arrow::TimeUnit;
using arrow::compute::Datum;
using arrow::compute::DictionaryEncode;
using arrow::compute::FunctionContext;
-using arrow::default_memory_pool;
using arrow::io::BufferReader;
using arrow::test::randint;
@@ -861,26 +861,21 @@
std::unique_ptr<FileReader> reader;
ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
- const std::shared_ptr<FileMetaData> fileMetaData = reader->parquet_reader()->metadata();
- ASSERT_EQ(1, fileMetaData->num_columns());
- ASSERT_EQ(100, fileMetaData->num_rows());
+ auto metadata = reader->parquet_reader()->metadata();
+ ASSERT_EQ(1, metadata->num_columns());
+ ASSERT_EQ(100, metadata->num_rows());
this->sink_ = std::make_shared<InMemoryOutputStream>();
- std::unique_ptr<FileMetaData> uniqueFileMetaData(fileMetaData.get());
-
- ASSERT_OK_NO_THROW(FileWriter::WriteMetaData(uniqueFileMetaData, this->sink_));
+ ASSERT_OK_NO_THROW(::parquet::arrow::WriteFileMetaData(*metadata, this->sink_.get()));
ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
- const std::shared_ptr<FileMetaData> fileMetaDataWritten =
- reader->parquet_reader()->metadata();
- ASSERT_EQ(fileMetaData->size(), fileMetaDataWritten->size());
- ASSERT_EQ(fileMetaData->num_row_groups(), fileMetaDataWritten->num_row_groups());
- ASSERT_EQ(fileMetaData->num_rows(), fileMetaDataWritten->num_rows());
- ASSERT_EQ(fileMetaData->num_columns(), fileMetaDataWritten->num_columns());
- ASSERT_EQ(fileMetaData->RowGroup(0)->num_rows(),
- fileMetaDataWritten->RowGroup(0)->num_rows());
- uniqueFileMetaData.release();
+ auto metadata_written = reader->parquet_reader()->metadata();
+ ASSERT_EQ(metadata->size(), metadata_written->size());
+ ASSERT_EQ(metadata->num_row_groups(), metadata_written->num_row_groups());
+ ASSERT_EQ(metadata->num_rows(), metadata_written->num_rows());
+ ASSERT_EQ(metadata->num_columns(), metadata_written->num_columns());
+ ASSERT_EQ(metadata->RowGroup(0)->num_rows(), metadata_written->RowGroup(0)->num_rows());
}
using TestInt96ParquetIO = TestParquetIO<::arrow::TimestampType>;
@@ -1485,13 +1480,13 @@
// Regression for ARROW-2802
TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) {
using ::arrow::Column;
+ using ::arrow::default_memory_pool;
using ::arrow::Field;
using ::arrow::Schema;
using ::arrow::Table;
- using ::arrow::TimeUnit;
using ::arrow::TimestampBuilder;
using ::arrow::TimestampType;
- using ::arrow::default_memory_pool;
+ using ::arrow::TimeUnit;
auto timestamp_type = std::make_shared<TimestampType>(TimeUnit::NANO);
diff --git a/src/parquet/arrow/arrow-schema-test.cc b/src/parquet/arrow/arrow-schema-test.cc
index 5c16c04..cb2b850 100644
--- a/src/parquet/arrow/arrow-schema-test.cc
+++ b/src/parquet/arrow/arrow-schema-test.cc
@@ -62,8 +62,8 @@
for (int i = 0; i < expected_schema->num_fields(); ++i) {
auto lhs = result_schema_->field(i);
auto rhs = expected_schema->field(i);
- EXPECT_TRUE(lhs->Equals(rhs)) << i << " " << lhs->ToString()
- << " != " << rhs->ToString();
+ EXPECT_TRUE(lhs->Equals(rhs))
+ << i << " " << lhs->ToString() << " != " << rhs->ToString();
}
}
@@ -607,9 +607,10 @@
auto inner_group_type = std::make_shared<::arrow::StructType>(inner_group_fields);
auto outer_group_fields = {
std::make_shared<Field>("leaf2", INT32, true),
- std::make_shared<Field>("innerGroup", ::arrow::list(std::make_shared<Field>(
- "innerGroup", inner_group_type, false)),
- false)};
+ std::make_shared<Field>(
+ "innerGroup",
+ ::arrow::list(std::make_shared<Field>("innerGroup", inner_group_type, false)),
+ false)};
auto outer_group_type = std::make_shared<::arrow::StructType>(outer_group_fields);
arrow_fields.push_back(std::make_shared<Field>("leaf1", INT32, true));
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 0f671b7..c0974ca 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -301,9 +301,7 @@
public:
explicit StructImpl(const std::vector<std::shared_ptr<ColumnReaderImpl>>& children,
int16_t struct_def_level, MemoryPool* pool, const Node* node)
- : children_(children),
- struct_def_level_(struct_def_level),
- pool_(pool) {
+ : children_(children), struct_def_level_(struct_def_level), pool_(pool) {
InitField(node, children);
}
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 4db98b7..bfc78c8 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -402,17 +402,16 @@
&offsets_buffer));
memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
- auto value_field = ::arrow::field("item", ::arrow::float64(),
- false /* nullable_values */);
+ auto value_field =
+ ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
auto list_type = ::arrow::list(value_field);
std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
- nullptr /* values */ };
- auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0,
- std::move(child_buffers));
+ nullptr /* values */};
+ auto child_data =
+ ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));
- std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
- offsets_buffer };
+ std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */, offsets_buffer};
auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
array_data->child_data.push_back(child_data);
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index d1697c3..412d4e7 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -41,8 +41,8 @@
using arrow::ListArray;
using arrow::MemoryPool;
using arrow::NumericArray;
-using arrow::ResizableBuffer;
using arrow::PrimitiveArray;
+using arrow::ResizableBuffer;
using arrow::Status;
using arrow::Table;
using arrow::TimeUnit;
@@ -216,9 +216,11 @@
if (level_null_count && level_valid_bitmap == nullptr) {
// Special case: this is a null array (all elements are null)
RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 1)));
- } else if (nullable_level && ((level_null_count == 0) ||
- BitUtil::GetBit(level_valid_bitmap,
- inner_offset + i + array_offsets_[recursion_level]))) {
+ } else if (nullable_level &&
+ ((level_null_count == 0) ||
+ BitUtil::GetBit(
+ level_valid_bitmap,
+ inner_offset + i + array_offsets_[recursion_level]))) {
// Non-null element in a null level
RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 2)));
} else {
@@ -1092,19 +1094,17 @@
return Open(schema, pool, wrapper, properties, arrow_properties, writer);
}
-Status FileWriter::WriteMetaData(const std::unique_ptr<FileMetaData>& fileMetaData,
- const std::shared_ptr<OutputStream>& sink) {
- ParquetFileWriter::WriteMetaData(sink, fileMetaData);
+Status WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) {
+ PARQUET_CATCH_NOT_OK(::parquet::WriteFileMetaData(file_metadata, sink));
return Status::OK();
}
-Status FileWriter::WriteMetaData(const std::unique_ptr<FileMetaData>& fileMetaData,
- const std::shared_ptr<::arrow::io::OutputStream>& sink) {
- auto wrapper = std::make_shared<ArrowOutputStream>(sink);
- return WriteMetaData(fileMetaData, wrapper);
+Status WriteFileMetaData(const FileMetaData& file_metadata,
+ const std::shared_ptr<::arrow::io::OutputStream>& sink) {
+ ArrowOutputStream wrapper(sink);
+ return ::parquet::arrow::WriteFileMetaData(file_metadata, &wrapper);
}
-
namespace {} // namespace
Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) {
diff --git a/src/parquet/arrow/writer.h b/src/parquet/arrow/writer.h
index d62d3b0..ad6f1d5 100644
--- a/src/parquet/arrow/writer.h
+++ b/src/parquet/arrow/writer.h
@@ -132,14 +132,6 @@
const std::shared_ptr<ArrowWriterProperties>& arrow_properties,
std::unique_ptr<FileWriter>* writer);
- static ::arrow::Status WriteMetaData(
- const std::unique_ptr<FileMetaData>& fileMetaData,
- const std::shared_ptr<OutputStream>& sink);
-
- static ::arrow::Status WriteMetaData(
- const std::unique_ptr<FileMetaData>& fileMetaData,
- const std::shared_ptr<::arrow::io::OutputStream>& sink);
-
/// \brief Write a Table to Parquet.
::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size);
@@ -161,6 +153,15 @@
std::unique_ptr<Impl> impl_;
};
+/// \brief Write Parquet file metadata only to indicated OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink);
+
+/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
+ const std::shared_ptr<::arrow::io::OutputStream>& sink);
+
/**
* Write a Table to Parquet.
*
diff --git a/src/parquet/encoding-benchmark.cc b/src/parquet/encoding-benchmark.cc
index 5ea8f8f..364cdba 100644
--- a/src/parquet/encoding-benchmark.cc
+++ b/src/parquet/encoding-benchmark.cc
@@ -20,8 +20,8 @@
#include "parquet/encoding-internal.h"
#include "parquet/util/memory.h"
-using arrow::MemoryPool;
using arrow::default_memory_pool;
+using arrow::MemoryPool;
namespace parquet {
diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc
index 31bb79d..60285ab 100644
--- a/src/parquet/encoding-test.cc
+++ b/src/parquet/encoding-test.cc
@@ -30,8 +30,8 @@
#include "parquet/util/memory.h"
#include "parquet/util/test-common.h"
-using arrow::MemoryPool;
using arrow::default_memory_pool;
+using arrow::MemoryPool;
using std::string;
using std::vector;
diff --git a/src/parquet/encoding.h b/src/parquet/encoding.h
index 2742937..006f22f 100644
--- a/src/parquet/encoding.h
+++ b/src/parquet/encoding.h
@@ -51,12 +51,12 @@
virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
int64_t valid_bits_offset) {
std::shared_ptr<ResizableBuffer> buffer;
- auto status = ::arrow::AllocateResizableBuffer(pool_, num_values * sizeof(T),
- &buffer);
+ auto status =
+ ::arrow::AllocateResizableBuffer(pool_, num_values * sizeof(T), &buffer);
if (!status.ok()) {
std::ostringstream ss;
- ss << "AllocateResizableBuffer failed in Encoder.PutSpaced in "
- << __FILE__ << ", on line " << __LINE__;
+ ss << "AllocateResizableBuffer failed in Encoder.PutSpaced in " << __FILE__
+ << ", on line " << __LINE__;
throw ParquetException(ss.str());
}
int32_t num_valid_values = 0;
diff --git a/src/parquet/file_writer.cc b/src/parquet/file_writer.cc
index cc34fd0..9b2d9b0 100644
--- a/src/parquet/file_writer.cc
+++ b/src/parquet/file_writer.cc
@@ -160,20 +160,6 @@
return result;
}
- static void WriteMetaData(
- const std::shared_ptr<OutputStream>& sink,
- const std::unique_ptr<FileMetaData>& fileMetaData) {
- // Write MetaData
- uint32_t metadata_len = static_cast<uint32_t>(sink->Tell());
-
- fileMetaData->WriteTo(sink.get());
- metadata_len = static_cast<uint32_t>(sink->Tell()) - metadata_len;
-
- // Write Footer
- sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4);
- sink->Write(PARQUET_MAGIC, 4);
- }
-
void Close() override {
if (is_open_) {
if (row_group_writer_) {
@@ -183,7 +169,8 @@
row_group_writer_.reset();
// Write magic bytes and metadata
- WriteMetaData();
+ auto metadata = metadata_->Finish();
+ WriteFileMetaData(*metadata, sink_.get());
sink_->Close();
is_open_ = false;
@@ -246,11 +233,6 @@
// Parquet files always start with PAR1
sink_->Write(PARQUET_MAGIC, 4);
}
-
- void WriteMetaData() {
- auto metadata = metadata_->Finish();
- WriteMetaData(sink_, metadata);
- }
};
// ----------------------------------------------------------------------
@@ -285,16 +267,16 @@
return result;
}
-void ParquetFileWriter::WriteMetaData(
- const std::shared_ptr<::arrow::io::OutputStream> &sink,
- const std::unique_ptr<FileMetaData> &fileMetaData) {
- WriteMetaData(std::make_shared<ArrowOutputStream>(sink), fileMetaData);
-}
+void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) {
+ // Write MetaData
+ uint32_t metadata_len = static_cast<uint32_t>(sink->Tell());
-void ParquetFileWriter::WriteMetaData(
- const std::shared_ptr<OutputStream> &sink,
- const std::unique_ptr<FileMetaData> &fileMetaData) {
- FileSerializer::WriteMetaData(sink, fileMetaData);
+ file_metadata.WriteTo(sink);
+ metadata_len = static_cast<uint32_t>(sink->Tell()) - metadata_len;
+
+ // Write Footer
+ sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4);
+ sink->Write(PARQUET_MAGIC, 4);
}
const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
diff --git a/src/parquet/file_writer.h b/src/parquet/file_writer.h
index e0d1dae..de17982 100644
--- a/src/parquet/file_writer.h
+++ b/src/parquet/file_writer.h
@@ -79,6 +79,9 @@
std::unique_ptr<Contents> contents_;
};
+PARQUET_EXPORT
+void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink);
+
class PARQUET_EXPORT ParquetFileWriter {
public:
// Forward declare a virtual class 'Contents' to aid dependency injection and more
@@ -133,14 +136,6 @@
const std::shared_ptr<WriterProperties>& properties = default_writer_properties(),
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = nullptr);
- static void WriteMetaData(
- const std::shared_ptr<::arrow::io::OutputStream> &sink,
- const std::unique_ptr<FileMetaData> &fileMetaData);
-
- static void WriteMetaData(
- const std::shared_ptr<OutputStream> &sink,
- const std::unique_ptr<FileMetaData> &fileMetaData);
-
void Open(std::unique_ptr<Contents> contents);
void Close();
diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc
index d9c5d29..39dee63 100644
--- a/src/parquet/metadata.cc
+++ b/src/parquet/metadata.cc
@@ -222,9 +222,7 @@
return impl_->data_page_offset();
}
-bool ColumnChunkMetaData::has_index_page() const {
- return impl_->has_index_page();
-}
+bool ColumnChunkMetaData::has_index_page() const { return impl_->has_index_page(); }
int64_t ColumnChunkMetaData::index_page_offset() const {
return impl_->index_page_offset();
@@ -345,7 +343,9 @@
const ApplicationVersion& writer_version() const { return writer_version_; }
- void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); }
+ void WriteTo(OutputStream* dst) const {
+ SerializeThriftMsg(metadata_.get(), 1024, dst);
+ }
std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
if (!(i < num_row_groups())) {
@@ -462,7 +462,7 @@
return impl_->key_value_metadata();
}
-void FileMetaData::WriteTo(OutputStream* dst) { return impl_->WriteTo(dst); }
+void FileMetaData::WriteTo(OutputStream* dst) const { return impl_->WriteTo(dst); }
ApplicationVersion::ApplicationVersion(const std::string& application, int major,
int minor, int patch)
diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h
index a9739ce..5d51e3d 100644
--- a/src/parquet/metadata.h
+++ b/src/parquet/metadata.h
@@ -171,7 +171,7 @@
const ApplicationVersion& writer_version() const;
- void WriteTo(OutputStream* dst);
+ void WriteTo(OutputStream* dst) const;
// Return const-pointer to make it clear that this object is not to be copied
const SchemaDescriptor* schema() const;
diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc
index bf3d196..943d5cc 100644
--- a/src/parquet/statistics-test.cc
+++ b/src/parquet/statistics-test.cc
@@ -36,8 +36,8 @@
#include "parquet/types.h"
#include "parquet/util/memory.h"
-using arrow::MemoryPool;
using arrow::default_memory_pool;
+using arrow::MemoryPool;
namespace parquet {
@@ -194,8 +194,9 @@
}
template <typename TestType>
-typename std::vector<typename TestType::c_type> TestRowGroupStatistics<
- TestType>::GetDeepCopy(const std::vector<typename TestType::c_type>& values) {
+typename std::vector<typename TestType::c_type>
+TestRowGroupStatistics<TestType>::GetDeepCopy(
+ const std::vector<typename TestType::c_type>& values) {
return values;
}
diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc
index 5b014ed..ea7f783 100644
--- a/src/parquet/statistics.cc
+++ b/src/parquet/statistics.cc
@@ -24,8 +24,8 @@
#include "parquet/statistics.h"
#include "parquet/util/memory.h"
-using arrow::MemoryPool;
using arrow::default_memory_pool;
+using arrow::MemoryPool;
namespace parquet {
diff --git a/src/parquet/util/memory-test.cc b/src/parquet/util/memory-test.cc
index cb8c706..bfd685d 100644
--- a/src/parquet/util/memory-test.cc
+++ b/src/parquet/util/memory-test.cc
@@ -27,8 +27,8 @@
#include "parquet/util/memory.h"
#include "parquet/util/test-common.h"
-using arrow::MemoryPool;
using arrow::default_memory_pool;
+using arrow::MemoryPool;
namespace parquet {
@@ -255,8 +255,8 @@
int64_t stream_offset = 10;
int64_t stream_size = source_size - stream_offset;
int64_t chunk_size = 50;
- std::shared_ptr<ResizableBuffer> buf = AllocateBuffer(default_memory_pool(),
- source_size);
+ std::shared_ptr<ResizableBuffer> buf =
+ AllocateBuffer(default_memory_pool(), source_size);
ASSERT_EQ(source_size, buf->size());
for (int i = 0; i < source_size; i++) {
buf->mutable_data()[i] = static_cast<uint8_t>(i);
diff --git a/src/parquet/util/memory.cc b/src/parquet/util/memory.cc
index df7ccc7..d9caf6e 100644
--- a/src/parquet/util/memory.cc
+++ b/src/parquet/util/memory.cc
@@ -36,9 +36,7 @@
template <class T>
Vector<T>::Vector(int64_t size, MemoryPool* pool)
- : buffer_(AllocateBuffer(pool, size * sizeof(T))),
- size_(size),
- capacity_(size) {
+ : buffer_(AllocateBuffer(pool, size * sizeof(T))), size_(size), capacity_(size) {
if (size > 0) {
data_ = reinterpret_cast<T*>(buffer_->mutable_data());
} else {
@@ -497,8 +495,7 @@
std::shared_ptr<ResizableBuffer> AllocateBuffer(MemoryPool* pool, int64_t size) {
std::shared_ptr<ResizableBuffer> result;
- PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(pool, size,
- &result));
+ PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(pool, size, &result));
return result;
}
diff --git a/src/parquet/util/memory.h b/src/parquet/util/memory.h
index 69dcebf..088f86f 100644
--- a/src/parquet/util/memory.h
+++ b/src/parquet/util/memory.h
@@ -438,7 +438,7 @@
};
std::shared_ptr<ResizableBuffer> PARQUET_EXPORT AllocateBuffer(
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
} // namespace parquet