PARQUET-1121: Handle Dictionary[Null] arrays on writing Arrow tables
I will fix the underlying issue in Arrow (ARROW-1648), but this change works around it on the Parquet side so we can get a 1.3.1 release out soon.
Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>
Closes #407 from xhochy/PARQUET-1121 and squashes the following commits:
85223b9 [Korn, Uwe] PARQUET-1121: Handle Dictionary[Null] arrays on writing Arrow tables
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 4fd57ea..fc6410d 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -926,6 +926,34 @@
internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
}
+// Round-trips a dictionary-encoded column whose dictionary has null type
+// (Dictionary[Null]): writing the table must succeed, and reading it back is
+// expected to yield a single plain NullArray chunk of the same length.
+TEST_F(TestNullParquetIO, NullDictionaryColumn) {
+  // Zero-length dictionary of null type.
+  std::shared_ptr<Array> values = std::make_shared<::arrow::NullArray>(0);
+  // SMALL_SIZE indices with no data buffer and no validity bitmap; the last
+  // argument is presumably the null count, i.e. every slot is null
+  // (confirm against the Arrow NumericArray constructor).
+  std::shared_ptr<Array> indices =
+      std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, nullptr, SMALL_SIZE);
+  std::shared_ptr<::arrow::DictionaryType> dict_type =
+      std::make_shared<::arrow::DictionaryType>(::arrow::int8(), values);
+  std::shared_ptr<Array> dict_values =
+      std::make_shared<::arrow::DictionaryArray>(dict_type, indices);
+  std::shared_ptr<Table> table = MakeSimpleTable(dict_values, true);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  // The array length is passed as the chunk size argument of WriteTable.
+  ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                dict_values->length(), default_writer_properties()));
+
+  std::shared_ptr<Table> out;
+  std::unique_ptr<FileReader> reader;
+  this->ReaderFromSink(&reader);
+  this->ReadTableFromFile(std::move(reader), &out);
+  ASSERT_EQ(1, out->num_columns());
+  // Compare against SMALL_SIZE rather than a literal so the expectation stays
+  // tied to the length used to build the input array above.
+  ASSERT_EQ(SMALL_SIZE, out->num_rows());
+
+  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+  ASSERT_EQ(1, chunked_array->num_chunks());
+
+  // The column read back should be a plain NullArray, not a dictionary array.
+  std::shared_ptr<Array> expected_values =
+      std::make_shared<::arrow::NullArray>(SMALL_SIZE);
+  AssertArraysEqual(*expected_values, *chunked_array->chunk(0));
+}
+
template <typename T>
using ParquetCDataType = typename ParquetDataType<T>::c_type;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index e834042..b53c1ca 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -819,6 +819,12 @@
const ::arrow::DictionaryType& dict_type =
static_cast<const ::arrow::DictionaryType&>(*data.type());
+ // TODO(ARROW-1648): Remove this special handling once we require an Arrow
+ // version that has this fixed.
+ if (dict_type.dictionary()->type()->id() == ::arrow::Type::NA) {
+ return WriteColumnChunk(::arrow::NullArray(data.length()));
+ }
+
FunctionContext ctx(pool_);
std::shared_ptr<Array> plain_array;
RETURN_NOT_OK(