PARQUET-1033: Improve documentation about WriteBatchSpaced
Author: Uwe L. Korn <uwe@apache.org>
Closes #354 from xhochy/PARQUET-1033 and squashes the following commits:
895676a [Uwe L. Korn] Remove trailing comment line
709ef32 [Uwe L. Korn] PARQUET-1033: Improve documentation about WriteBatchSpaced
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 33eefac..6f47f3b 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -347,6 +347,36 @@
ASSERT_EQ(this->values_, this->values_out_);
}
+TYPED_TEST(TestPrimitiveWriter, OptionalSpaced) {
+ // Optional, non-repeated column written through WriteBatchSpaced: definition
+ // levels and a validity bitmap are supplied, repetition levels are not (nullptr)
+ this->SetUpSchema(Repetition::OPTIONAL);
+
+ this->GenerateData(SMALL_SIZE);
+ std::vector<int16_t> definition_levels(SMALL_SIZE, 1);
+ std::vector<uint8_t> valid_bits(::arrow::BitUtil::BytesForBits(SMALL_SIZE), 255);
+ // Mark slot 1 and the last slot as null in both the levels and the bitmap
+ definition_levels[SMALL_SIZE - 1] = 0;
+ ::arrow::BitUtil::ClearBit(valid_bits.data(), SMALL_SIZE - 1);
+ definition_levels[1] = 0;
+ ::arrow::BitUtil::ClearBit(valid_bits.data(), 1);
+
+ auto writer = this->BuildWriter();
+ writer->WriteBatchSpaced(this->values_.size(), definition_levels.data(), nullptr,
+ valid_bits.data(), 0, this->values_ptr_);
+ writer->Close();
+
+ // PARQUET-703: expects 100 here (presumably SMALL_SIZE, nulls included -- confirm)
+ ASSERT_EQ(100, this->metadata_num_values());
+ // The read should yield 98 values; drop the two nulled slots from values_ to match
+ this->ReadColumn();
+ ASSERT_EQ(98, this->values_read_);
+ this->values_out_.resize(98);
+ this->values_.resize(99);
+ this->values_.erase(this->values_.cbegin() + 1);
+ ASSERT_EQ(this->values_, this->values_out_);
+}
+
TYPED_TEST(TestPrimitiveWriter, Repeated) {
// Optional and repeated, so definition and repetition levels
this->SetUpSchema(Repetition::REPEATED);
diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h
index 5ffcf73..407e808 100644
--- a/src/parquet/column/writer.h
+++ b/src/parquet/column/writer.h
@@ -166,8 +166,32 @@
void WriteBatch(int64_t num_values, const int16_t* def_levels,
const int16_t* rep_levels, const T* values);
- // Write a batch of repetition levels, definition levels, and values to the
- // column.
+ /// Write a batch of repetition levels, definition levels, and values to the
+ /// column.
+ ///
+ /// In comparison to WriteBatch, the length of repetition and definition levels
+ /// is the same as the number of values read for max_definition_level == 1.
+ /// In the case of max_definition_level > 1, the repetition and definition
+ /// levels are larger than the values but the values include the null entries
+ /// with definition_level == (max_definition_level - 1). Thus we have to differentiate
+ /// in the parameters of this function if the input has the length of num_values or the
+ /// _number of rows in the lowest nesting level_.
+ ///
+ /// If the innermost node in the Parquet schema is required, the _number of rows
+ /// in the lowest nesting level_ is equal to the number of non-null values. If the
+ /// innermost schema node is optional, the _number of rows in the lowest nesting level_
+ /// also includes all values with definition_level == (max_definition_level - 1).
+ ///
+ /// @param num_values number of levels to write.
+ /// @param def_levels The Parquet definition levels, length is num_values
+ /// @param rep_levels The Parquet repetition levels, length is num_values
+ /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
+ /// level. The length is the number of rows in the lowest nesting level.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; input has the length
+ /// of the number of rows on the lowest nesting level.
void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset,
const T* values);