PARQUET-1273: Properly write dictionary values when writing in chunks

The error was reported here: https://issues.apache.org/jira/browse/ARROW-1938.

Because writing dictionary types is not yet supported, the code first converts the dictionary column to its dense values before writing. However, the existing code was accidentally passing zero as the offset and the full length of the column as the size. As a result, every chunk wrote all of the column's values rather than just that chunk's slice.

The fix is to forward the original offset and size when recursively calling WriteColumnChunk with the "flattened" data, as in the sketch below.
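For illustration, here is a minimal, self-contained sketch of the chunked write loop. This is not the actual parquet-cpp source; WriteSlice is a hypothetical stand-in for the recursive WriteColumnChunk call, and the loop only mimics how WriteTable splits a table into row groups:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Hypothetical stand-in for WriteColumnChunk on the expanded values.
    void WriteSlice(int64_t offset, int64_t size) {
      std::cout << "writing rows [" << offset << ", " << offset + size << ")\n";
    }

    int main() {
      const int64_t num_rows = 10;       // total rows in the column
      const int64_t row_group_size = 2;  // chunk size requested by the caller

      // WriteTable hands each row group its own (offset, size) slice.
      for (int64_t offset = 0; offset < num_rows; offset += row_group_size) {
        int64_t size = std::min(row_group_size, num_rows - offset);
        // The buggy dictionary path recursed with (0, num_rows), so every
        // row group re-wrote the entire column:
        //   WriteSlice(0, num_rows);
        // The fix forwards the caller's slice instead:
        WriteSlice(offset, size);
      }
      return 0;
    }

With the buggy call, a 10-row column written with row_group_size = 2 would emit 50 values (10 per row group) instead of 10, which is exactly what the regression test below checks for.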

Author: Joshua Storck <joshua.storck@twosigma.com>

Closes #453 from joshuastorck/ARROW_1938 and squashes the following commits:

c2af50f [Joshua Storck] Remove extraneous semicolon in unit test
23f5722 [Joshua Storck] Ran clang-format on arrow-reader-writer-test.cc
314b159 [Joshua Storck] Removing print statements from AssertTableEqual
f0bc71a [Joshua Storck] Fixing bug reported in https://issues.apache.org/jira/browse/ARROW-1938, namely preventing all of the values in a dictionary column from being written to parquet for each chunk created as a result of specifying row_group_size
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index f2402df..93ecd3c 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1759,6 +1759,69 @@
   CheckSimpleRoundtrip(table, table->num_rows());
 }
 
+TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
+  // This is a regression test for this:
+  //
+  // https://issues.apache.org/jira/browse/ARROW-1938
+  //
+  // As of the writing of this test, columns of type
+  // dictionary are written as their raw/expanded values.
+  // The regression was that the whole column was being
+  // written for each chunk.
+  using ::arrow::ArrayFromVector;
+
+  std::vector<std::string> values = {"first", "second", "third"};
+  auto type = ::arrow::utf8();
+  std::shared_ptr<Array> dict_values;
+  ArrayFromVector<::arrow::StringType, std::string>(values, &dict_values);
+
+  auto dict_type = ::arrow::dictionary(::arrow::int32(), dict_values);
+  auto f0 = field("dictionary", dict_type);
+  std::vector<std::shared_ptr<::arrow::Field>> fields;
+  fields.emplace_back(f0);
+  auto schema = ::arrow::schema(fields);
+
+  std::shared_ptr<Array> f0_values, f1_values;
+  ArrayFromVector<::arrow::Int32Type, int32_t>({0, 1, 0, 2, 1}, &f0_values);
+  ArrayFromVector<::arrow::Int32Type, int32_t>({2, 0, 1, 0, 2}, &f1_values);
+  ::arrow::ArrayVector dict_arrays = {
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f0_values),
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)};
+
+  std::vector<std::shared_ptr<::arrow::Column>> columns;
+  auto column = MakeColumn("column", dict_arrays, false);
+  columns.emplace_back(column);
+
+  auto table = Table::Make(schema, columns);
+
+  std::shared_ptr<Table> result;
+  DoSimpleRoundtrip(table, 1,
+                    // Just need to make sure that we make
+                    // a chunk size that is smaller than the
+                    // total number of values
+                    2, {}, &result);
+
+  std::vector<std::string> expected_values = {"first",  "second", "first", "third",
+                                              "second", "third",  "first", "second",
+                                              "first",  "third"};
+  columns.clear();
+
+  std::shared_ptr<Array> expected_array;
+  ArrayFromVector<::arrow::StringType, std::string>(expected_values, &expected_array);
+
+  // The column name gets changed on output to the name of the
+  // field, and it also turns into a nullable column
+  columns.emplace_back(MakeColumn("dictionary", expected_array, true));
+
+  fields.clear();
+  fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8()));
+  schema = ::arrow::schema(fields);
+
+  auto expected_table = Table::Make(schema, columns);
+
+  AssertTablesEqual(*expected_table, *result, false);
+}
+
 TEST(TestArrowWrite, CheckChunkSize) {
   const int num_columns = 2;
   const int num_rows = 128;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index a5dae3c..14e9c6e 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -969,7 +969,7 @@
       ::arrow::compute::Datum cast_output;
       RETURN_NOT_OK(Cast(&ctx, cast_input, dict_type.dictionary()->type(), CastOptions(),
                          &cast_output));
-      return WriteColumnChunk(cast_output.chunked_array(), 0, data->length());
+      return WriteColumnChunk(cast_output.chunked_array(), offset, size);
     }
 
     ColumnWriter* column_writer;