PARQUET-1346: [C++] Protect against empty Arrow arrays with null values
Author: Antoine Pitrou <antoine@python.org>
Closes #474 from pitrou/PARQUET-1346-null-values and squashes the following commits:
08bad23 [Antoine Pitrou] Do not ignore return value
a1c0378 [Antoine Pitrou] Fix uninitialized value
dfb42d2 [Antoine Pitrou] Try to fix lint
b514d0d [Antoine Pitrou] Try to fix compile failures on Travis-CI (due to old Arrow?)
3951f25 [Antoine Pitrou] PARQUET-1346: [C++] Protect against empty Arrow arrays with null values
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 6d7e1eb..e0ff7aa 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -536,6 +536,13 @@
*out = MakeSimpleTable(lists->Slice(3, size - 6), nullable_lists);
}
+ // Prepare table of empty lists, with null values array (ARROW-2744)
+ void PrepareEmptyListsTable(int64_t size, std::shared_ptr<Table>* out) {
+ std::shared_ptr<Array> lists;
+ ASSERT_OK(MakeEmptyListsArray(size, &lists));
+ *out = MakeSimpleTable(lists, true /* nullable_lists */);
+ }
+
void PrepareListOfListTable(int64_t size, bool nullable_parent_lists,
bool nullable_lists, bool nullable_elements,
int64_t null_count, std::shared_ptr<Table>* out) {
@@ -713,6 +720,12 @@
ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
}
+TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) {
+ std::shared_ptr<Table> table;
+ ASSERT_NO_FATAL_FAILURE(this->PrepareEmptyListsTable(SMALL_SIZE, &table));
+ ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+}
+
TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) {
std::shared_ptr<Table> table;
ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table));
@@ -1524,8 +1537,6 @@
void MakeListArray(int num_rows, std::shared_ptr<::DataType>* out_type,
std::shared_ptr<Array>* out_array) {
- ::arrow::Int32Builder offset_builder;
-
std::vector<int32_t> length_draws;
randint(num_rows, 0, 100, &length_draws);
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 7264324..c70e0ef 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -394,6 +394,33 @@
return Status::OK();
}
+// Make an array containing only empty lists, with a null values array
+Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
+ // Allocate an offsets buffer containing only zeroes
+ std::shared_ptr<Buffer> offsets_buffer;
+ const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
+ RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes,
+ &offsets_buffer));
+ memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
+
+ auto value_field = ::arrow::field("item", ::arrow::float64(),
+ false /* nullable_values */);
+ auto list_type = ::arrow::list(value_field);
+
+ std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
+ nullptr /* values */ };
+ auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0,
+ std::move(child_buffers));
+
+ std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
+ offsets_buffer };
+ auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
+ array_data->child_data.push_back(child_data);
+
+ *out_array = ::arrow::MakeArray(array_data);
+ return Status::OK();
+}
+
static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name,
const std::shared_ptr<Array>& array,
bool nullable) {
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 50b4649..f772738 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -411,8 +411,13 @@
using ArrowCType = typename ArrowType::c_type;
const auto& data = static_cast<const PrimitiveArray&>(array);
- auto values =
- reinterpret_cast<const ArrowCType*>(data.values()->data()) + data.offset();
+ const ArrowCType* values = nullptr;
+ // The values buffer may be null if the array is empty (ARROW-2744)
+ if (data.values() != nullptr) {
+ values = reinterpret_cast<const ArrowCType*>(data.values()->data()) + data.offset();
+ } else {
+ DCHECK_EQ(data.length(), 0);
+ }
if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 0)) {
// no nulls, just dump the data
@@ -706,7 +711,13 @@
RETURN_NOT_OK(ctx_->GetScratchData<bool>(array.length(), &buffer));
const auto& data = static_cast<const BooleanArray&>(array);
- auto values = reinterpret_cast<const uint8_t*>(data.values()->data());
+ const uint8_t* values = nullptr;
+ // The values buffer may be null if the array is empty (ARROW-2744)
+ if (data.values() != nullptr) {
+ values = reinterpret_cast<const uint8_t*>(data.values()->data());
+ } else {
+ DCHECK_EQ(data.length(), 0);
+ }
int buffer_idx = 0;
int64_t offset = array.offset();