// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <vector>
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/memory_pool.h"
#include "arrow/testing/builder.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/config.h"
#include "arrow/util/float16.h"
#include "arrow/util/logging_internal.h"
#include "arrow/util/ubsan.h"
#include "parquet/column_reader.h"
#include "parquet/column_writer.h"
#include "parquet/file_reader.h"
#include "parquet/file_writer.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/statistics.h"
#include "parquet/test_util.h"
#include "parquet/thrift_internal.h"
#include "parquet/types.h"
using arrow::default_memory_pool;
using arrow::MemoryPool;
using arrow::util::Float16;
using arrow::util::SafeCopy;
namespace bit_util = arrow::bit_util;
namespace parquet {
using schema::GroupNode;
using schema::NodePtr;
using schema::PrimitiveNode;
namespace test {
// ----------------------------------------------------------------------
// Test comparators
static ByteArray ByteArrayFromString(const std::string& s) {
auto ptr = reinterpret_cast<const uint8_t*>(s.data());
return ByteArray(static_cast<uint32_t>(s.size()), ptr);
}
static FLBA FLBAFromString(const std::string& s) {
auto ptr = reinterpret_cast<const uint8_t*>(s.data());
return FLBA(ptr);
}
TEST(Comparison, SignedByteArray) {
// Signed byte array comparison is only used for Decimal comparison. When
// decimals are encoded as byte arrays they use two's-complement, big-endian
// encoded values. Comparisons of byte arrays of unequal lengths need to handle
// sign extension.
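// For example, the two-byte value {0x80, 0x00} and the four-byte value
// {0xFF, 0xFF, 0x80, 0x00} encode the same negative number, so neither may
// compare less than the other (see the cases with matching `order` below).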
auto comparator = MakeComparator<ByteArrayType>(Type::BYTE_ARRAY, SortOrder::SIGNED);
struct Case {
std::vector<uint8_t> bytes;
int order;
ByteArray ToByteArray() const {
return ByteArray(static_cast<int>(bytes.size()), bytes.data());
}
};
// Test a mix of big-endian values, some of which become equal and some of
// which remain unequal after sign extension.
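// Cases that share the same `order` value encode the same number once sign
// extension is applied (the commented-out leading bytes show the implied
// sign-extension prefix); a case with a lower `order` must compare strictly
// less than any case with a higher one.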
std::vector<Case> cases = {
{{0x80, 0x80, 0, 0}, 0}, {{/*0xFF,*/ 0x80, 0, 0}, 1},
{{0xFF, 0x80, 0, 0}, 1}, {{/*0xFF,*/ 0xFF, 0x01, 0}, 2},
{{/*0xFF, 0xFF,*/ 0x80, 0}, 3}, {{/*0xFF,*/ 0xFF, 0x80, 0}, 3},
{{0xFF, 0xFF, 0x80, 0}, 3}, {{/*0xFF,0xFF,0xFF,*/ 0x80}, 4},
{{/*0xFF, 0xFF, 0xFF,*/ 0xFF}, 5}, {{/*0, 0,*/ 0x01, 0x01}, 6},
{{/*0,*/ 0, 0x01, 0x01}, 6}, {{0, 0, 0x01, 0x01}, 6},
{{/*0,*/ 0x01, 0x01, 0}, 7}, {{0x01, 0x01, 0, 0}, 8}};
for (size_t x = 0; x < cases.size(); x++) {
const auto& case1 = cases[x];
// The empty array always compares as the smallest value.
EXPECT_TRUE(comparator->Compare(ByteArray(), case1.ToByteArray())) << x;
EXPECT_FALSE(comparator->Compare(case1.ToByteArray(), ByteArray())) << x;
// A value never compares less than itself.
EXPECT_FALSE(comparator->Compare(case1.ToByteArray(), case1.ToByteArray())) << x;
for (size_t y = 0; y < cases.size(); y++) {
const auto& case2 = cases[y];
if (case1.order < case2.order) {
EXPECT_TRUE(comparator->Compare(case1.ToByteArray(), case2.ToByteArray()))
<< x << " (order: " << case1.order << ") " << y << " (order: " << case2.order
<< ")";
} else {
EXPECT_FALSE(comparator->Compare(case1.ToByteArray(), case2.ToByteArray()))
<< x << " (order: " << case1.order << ") " << y << " (order: " << case2.order
<< ")";
}
}
}
}
TEST(Comparison, UnsignedByteArray) {
// Check that UTF-8 strings are compared correctly as unsigned bytes
auto comparator = MakeComparator<ByteArrayType>(Type::BYTE_ARRAY, SortOrder::UNSIGNED);
std::string s1 = "arrange";
std::string s2 = "arrangement";
ByteArray s1ba = ByteArrayFromString(s1);
ByteArray s2ba = ByteArrayFromString(s2);
ASSERT_TRUE(comparator->Compare(s1ba, s2ba));
// Multi-byte UTF-8 characters
s1 = "braten";
s2 = "bügeln";
s1ba = ByteArrayFromString(s1);
s2ba = ByteArrayFromString(s2);
ASSERT_TRUE(comparator->Compare(s1ba, s2ba));
s1 = "ünk123456"; // ü = 252
s2 = "ănk123456"; // ă = 259
s1ba = ByteArrayFromString(s1);
s2ba = ByteArrayFromString(s2);
ASSERT_TRUE(comparator->Compare(s1ba, s2ba));
}
TEST(Comparison, SignedFLBA) {
int size = 4;
auto comparator =
MakeComparator<FLBAType>(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED, size);
std::vector<uint8_t> byte_values[] = {
{0x80, 0, 0, 0}, {0xFF, 0xFF, 0x01, 0}, {0xFF, 0xFF, 0x80, 0},
{0xFF, 0xFF, 0xFF, 0x80}, {0xFF, 0xFF, 0xFF, 0xFF}, {0, 0, 0x01, 0x01},
{0, 0x01, 0x01, 0}, {0x01, 0x01, 0, 0}};
std::vector<FLBA> values_to_compare;
for (auto& bytes : byte_values) {
values_to_compare.emplace_back(FLBA(bytes.data()));
}
for (size_t x = 0; x < values_to_compare.size(); x++) {
EXPECT_FALSE(comparator->Compare(values_to_compare[x], values_to_compare[x])) << x;
for (size_t y = x + 1; y < values_to_compare.size(); y++) {
EXPECT_TRUE(comparator->Compare(values_to_compare[x], values_to_compare[y]))
<< x << " " << y;
EXPECT_FALSE(comparator->Compare(values_to_compare[y], values_to_compare[x]))
<< y << " " << x;
}
}
}
TEST(Comparison, UnsignedFLBA) {
int size = 10;
auto comparator =
MakeComparator<FLBAType>(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::UNSIGNED, size);
std::string s1 = "Anti123456";
std::string s2 = "Bunkd123456";
FLBA s1flba = FLBAFromString(s1);
FLBA s2flba = FLBAFromString(s2);
ASSERT_TRUE(comparator->Compare(s1flba, s2flba));
s1 = "Bunk123456";
s2 = "Bünk123456";
s1flba = FLBAFromString(s1);
s2flba = FLBAFromString(s2);
ASSERT_TRUE(comparator->Compare(s1flba, s2flba));
}
TEST(Comparison, SignedInt96) {
parquet::Int96 a{{1, 41, 14}}, b{{1, 41, 42}};
parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, 14}};
parquet::Int96 aaa{{1, 41, static_cast<uint32_t>(-14)}}, bbb{{1, 41, 42}};
auto comparator = MakeComparator<Int96Type>(Type::INT96, SortOrder::SIGNED);
ASSERT_TRUE(comparator->Compare(a, b));
ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa));
ASSERT_TRUE(comparator->Compare(aaa, bbb));
}
TEST(Comparison, UnsignedInt96) {
parquet::Int96 a{{1, 41, 14}}, b{{1, static_cast<uint32_t>(-41), 42}};
parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, static_cast<uint32_t>(-14)}};
parquet::Int96 aaa, bbb;
auto comparator = MakeComparator<Int96Type>(Type::INT96, SortOrder::UNSIGNED);
ASSERT_TRUE(comparator->Compare(a, b));
ASSERT_TRUE(comparator->Compare(aa, bb));
// INT96 Timestamp
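// An INT96 timestamp stores the nanoseconds elapsed within the day in the two
// lower 32-bit words and the Julian day number in value[2]; the unsigned
// comparison below is therefore dominated by the day in the most significant
// word.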
aaa.value[2] = 2451545; // 2000-01-01
bbb.value[2] = 2451546; // 2000-01-02
// 12 hours + 34 minutes + 56 seconds.
Int96SetNanoSeconds(aaa, 45296000000000);
// 12 hours + 34 minutes + 50 seconds.
Int96SetNanoSeconds(bbb, 45290000000000);
ASSERT_TRUE(comparator->Compare(aaa, bbb));
aaa.value[2] = 2451545; // 2000-01-01
bbb.value[2] = 2451545; // 2000-01-01
// 11 hours + 34 minutes + 56 seconds.
Int96SetNanoSeconds(aaa, 41696000000000);
// 12 hours + 34 minutes + 50 seconds.
Int96SetNanoSeconds(bbb, 45290000000000);
ASSERT_TRUE(comparator->Compare(aaa, bbb));
aaa.value[2] = 2451545; // 2000-01-01
bbb.value[2] = 2451545; // 2000-01-01
// 12 hours + 34 minutes + 55 seconds.
Int96SetNanoSeconds(aaa, 45295000000000);
// 12 hours + 34 minutes + 56 seconds.
Int96SetNanoSeconds(bbb, 45296000000000);
ASSERT_TRUE(comparator->Compare(aaa, bbb));
}
TEST(Comparison, SignedInt64) {
int64_t a = 1, b = 4;
int64_t aa = 1, bb = 1;
int64_t aaa = -1, bbb = 1;
NodePtr node = PrimitiveNode::Make("SignedInt64", Repetition::REQUIRED, Type::INT64);
ColumnDescriptor descr(node, 0, 0);
auto comparator = MakeComparator<Int64Type>(&descr);
ASSERT_TRUE(comparator->Compare(a, b));
ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa));
ASSERT_TRUE(comparator->Compare(aaa, bbb));
}
TEST(Comparison, UnsignedInt64) {
uint64_t a = 1, b = 4;
uint64_t aa = 1, bb = 1;
uint64_t aaa = 1, bbb = -1;
NodePtr node = PrimitiveNode::Make("UnsignedInt64", Repetition::REQUIRED, Type::INT64,
ConvertedType::UINT_64);
ColumnDescriptor descr(node, 0, 0);
ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order());
auto comparator = MakeComparator<Int64Type>(&descr);
ASSERT_TRUE(comparator->Compare(a, b));
ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa));
ASSERT_TRUE(comparator->Compare(aaa, bbb));
}
TEST(Comparison, UnsignedInt32) {
uint32_t a = 1, b = 4;
uint32_t aa = 1, bb = 1;
uint32_t aaa = 1, bbb = -1;
NodePtr node = PrimitiveNode::Make("UnsignedInt32", Repetition::REQUIRED, Type::INT32,
ConvertedType::UINT_32);
ColumnDescriptor descr(node, 0, 0);
ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order());
auto comparator = MakeComparator<Int32Type>(&descr);
ASSERT_TRUE(comparator->Compare(a, b));
ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa));
ASSERT_TRUE(comparator->Compare(aaa, bbb));
}
TEST(Comparison, UnknownSortOrder) {
NodePtr node =
PrimitiveNode::Make("Unknown", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
ConvertedType::INTERVAL, 12);
ColumnDescriptor descr(node, 0, 0);
ASSERT_THROW(Comparator::Make(&descr), ParquetException);
}
// ----------------------------------------------------------------------
template <typename TestType>
class TestStatistics : public PrimitiveTypedTest<TestType> {
public:
using c_type = typename TestType::c_type;
std::vector<c_type> GetDeepCopy(
const std::vector<c_type>&); // allocates new memory for FLBA/ByteArray
c_type* GetValuesPointer(std::vector<c_type>&);
void DeepFree(std::vector<c_type>&);
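// GetDeepCopy/DeepFree give each write batch its own copy of the FLBA/ByteArray
// payload bytes (they are no-ops for plain value types), and GetValuesPointer
// exposes a raw pointer suitable for WriteBatch (booleans need a staging buffer
// because std::vector<bool> is bit-packed).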
void TestMinMaxEncode() {
this->GenerateData(1000);
auto statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
statistics1->Update(this->values_ptr_, this->values_.size(), 0);
std::string encoded_min = statistics1->EncodeMin();
std::string encoded_max = statistics1->EncodeMax();
auto statistics2 = MakeStatistics<TestType>(
this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(),
/*null_count=*/0, /*distinct_count=*/0,
/*has_min_max=*/true, /*has_null_count=*/true, /*has_distinct_count=*/true,
/*is_min_value_exact=*/true, /*is_max_value_exact=*/true);
auto statistics3 = MakeStatistics<TestType>(this->schema_.Column(0));
std::vector<uint8_t> valid_bits(
bit_util::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 1, 255);
statistics3->UpdateSpaced(this->values_ptr_, valid_bits.data(), 0,
this->values_.size(), this->values_.size(), 0);
std::string encoded_min_spaced = statistics3->EncodeMin();
std::string encoded_max_spaced = statistics3->EncodeMax();
// Use old API without is_{min/max}_value_exact
auto statistics4 = MakeStatistics<TestType>(
this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(),
/*null_count=*/0, /*distinct_count=*/0,
/*has_min_max=*/true, /*has_null_count=*/true, /*has_distinct_count=*/true);
ASSERT_EQ(encoded_min, statistics2->EncodeMin());
ASSERT_EQ(encoded_max, statistics2->EncodeMax());
ASSERT_EQ(statistics1->min(), statistics2->min());
ASSERT_EQ(statistics1->max(), statistics2->max());
ASSERT_EQ(statistics1->is_min_value_exact(), std::make_optional(true));
ASSERT_EQ(statistics1->is_max_value_exact(), std::make_optional(true));
ASSERT_EQ(statistics2->is_min_value_exact(), std::make_optional(true));
ASSERT_EQ(statistics2->is_max_value_exact(), std::make_optional(true));
ASSERT_EQ(encoded_min_spaced, statistics2->EncodeMin());
ASSERT_EQ(encoded_max_spaced, statistics2->EncodeMax());
ASSERT_EQ(statistics3->min(), statistics2->min());
ASSERT_EQ(statistics3->max(), statistics2->max());
ASSERT_EQ(statistics3->is_min_value_exact(), std::make_optional(true));
ASSERT_EQ(statistics3->is_max_value_exact(), std::make_optional(true));
ASSERT_EQ(statistics4->min(), statistics2->min());
ASSERT_EQ(statistics4->max(), statistics2->max());
ASSERT_EQ(statistics4->is_min_value_exact(), std::nullopt);
ASSERT_EQ(statistics4->is_max_value_exact(), std::nullopt);
}
void TestReset() {
this->GenerateData(1000);
auto statistics = MakeStatistics<TestType>(this->schema_.Column(0));
statistics->Update(this->values_ptr_, this->values_.size(), 0);
ASSERT_EQ(this->values_.size(), statistics->num_values());
statistics->Reset();
ASSERT_TRUE(statistics->HasNullCount());
ASSERT_FALSE(statistics->HasMinMax());
ASSERT_FALSE(statistics->HasDistinctCount());
ASSERT_EQ(0, statistics->null_count());
ASSERT_EQ(0, statistics->num_values());
ASSERT_EQ(0, statistics->distinct_count());
ASSERT_EQ("", statistics->EncodeMin());
ASSERT_EQ("", statistics->EncodeMax());
}
void TestMerge() {
int num_null[2];
random_numbers(2, 42, 0, 100, num_null);
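// Draw two pseudo-random null counts between 0 and 100 with a fixed seed so
// the test stays deterministic.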
auto statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
this->GenerateData(1000);
statistics1->Update(this->values_ptr_, this->values_.size() - num_null[0],
num_null[0]);
auto statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
this->GenerateData(1000);
statistics2->Update(this->values_ptr_, this->values_.size() - num_null[1],
num_null[1]);
auto total = MakeStatistics<TestType>(this->schema_.Column(0));
total->Merge(*statistics1);
total->Merge(*statistics2);
ASSERT_EQ(num_null[0] + num_null[1], total->null_count());
ASSERT_EQ(this->values_.size() * 2 - num_null[0] - num_null[1], total->num_values());
ASSERT_EQ(total->min(), std::min(statistics1->min(), statistics2->min()));
ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
}
void TestEquals() {
const auto n_values = 1;
auto statistics_have_minmax1 = MakeStatistics<TestType>(this->schema_.Column(0));
const auto seed1 = 1;
this->GenerateData(n_values, seed1);
statistics_have_minmax1->Update(this->values_ptr_, this->values_.size(), 0);
auto statistics_have_minmax2 = MakeStatistics<TestType>(this->schema_.Column(0));
const auto seed2 = 9999;
this->GenerateData(n_values, seed2);
statistics_have_minmax2->Update(this->values_ptr_, this->values_.size(), 0);
auto statistics_no_minmax = MakeStatistics<TestType>(this->schema_.Column(0));
ASSERT_EQ(true, statistics_have_minmax1->Equals(*statistics_have_minmax1));
ASSERT_EQ(true, statistics_no_minmax->Equals(*statistics_no_minmax));
ASSERT_EQ(false, statistics_have_minmax1->Equals(*statistics_have_minmax2));
ASSERT_EQ(false, statistics_have_minmax1->Equals(*statistics_no_minmax));
}
void TestFullRoundtrip(int64_t num_values, int64_t null_count) {
this->GenerateData(num_values);
// compute statistics for the whole batch
auto expected_stats = MakeStatistics<TestType>(this->schema_.Column(0));
expected_stats->Update(this->values_ptr_, num_values - null_count, null_count);
auto sink = CreateOutputStream();
auto gnode = std::static_pointer_cast<GroupNode>(this->node_);
std::shared_ptr<WriterProperties> writer_properties =
WriterProperties::Builder().enable_statistics("column")->build();
auto file_writer = ParquetFileWriter::Open(sink, gnode, writer_properties);
auto row_group_writer = file_writer->AppendRowGroup();
auto column_writer =
static_cast<TypedColumnWriter<TestType>*>(row_group_writer->NextColumn());
// simulate the case when data comes from multiple buffers,
// in which case special care is necessary for FLBA/ByteArray types
for (int i = 0; i < 2; i++) {
int64_t batch_num_values = i ? num_values - num_values / 2 : num_values / 2;
int64_t batch_null_count = i ? null_count : 0;
DCHECK(null_count <= num_values); // avoid too much headache
std::vector<int16_t> definition_levels(batch_null_count, 0);
definition_levels.insert(definition_levels.end(),
batch_num_values - batch_null_count, 1);
auto beg = this->values_.begin() + i * num_values / 2;
auto end = beg + batch_num_values;
std::vector<c_type> batch = GetDeepCopy(std::vector<c_type>(beg, end));
c_type* batch_values_ptr = GetValuesPointer(batch);
column_writer->WriteBatch(batch_num_values, definition_levels.data(), nullptr,
batch_values_ptr);
DeepFree(batch);
}
column_writer->Close();
row_group_writer->Close();
file_writer->Close();
ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
auto source = std::make_shared<::arrow::io::BufferReader>(buffer);
auto file_reader = ParquetFileReader::Open(source);
auto rg_reader = file_reader->RowGroup(0);
auto column_chunk = rg_reader->metadata()->ColumnChunk(0);
if (!column_chunk->is_stats_set()) return;
std::shared_ptr<Statistics> stats = column_chunk->statistics();
// check values after serialization + deserialization
EXPECT_EQ(null_count, stats->null_count());
EXPECT_EQ(num_values - null_count, stats->num_values());
EXPECT_TRUE(expected_stats->HasMinMax());
EXPECT_EQ(expected_stats->EncodeMin(), stats->EncodeMin());
EXPECT_EQ(expected_stats->EncodeMax(), stats->EncodeMax());
std::shared_ptr<EncodedStatistics> enc_stats = column_chunk->encoded_statistics();
EXPECT_EQ(null_count, enc_stats->null_count);
EXPECT_TRUE(enc_stats->has_min);
EXPECT_TRUE(enc_stats->has_max);
EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min());
EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max());
EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true));
EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true));
}
};
template <typename TestType>
typename TestType::c_type* TestStatistics<TestType>::GetValuesPointer(
std::vector<typename TestType::c_type>& values) {
return values.data();
}
template <>
bool* TestStatistics<BooleanType>::GetValuesPointer(std::vector<bool>& values) {
static std::vector<uint8_t> bool_buffer;
bool_buffer.clear();
bool_buffer.resize(values.size());
std::copy(values.begin(), values.end(), bool_buffer.begin());
return reinterpret_cast<bool*>(bool_buffer.data());
}
template <typename TestType>
typename std::vector<typename TestType::c_type> TestStatistics<TestType>::GetDeepCopy(
const std::vector<typename TestType::c_type>& values) {
return values;
}
template <>
std::vector<FLBA> TestStatistics<FLBAType>::GetDeepCopy(const std::vector<FLBA>& values) {
std::vector<FLBA> copy;
MemoryPool* pool = ::arrow::default_memory_pool();
for (const FLBA& flba : values) {
uint8_t* ptr;
PARQUET_THROW_NOT_OK(pool->Allocate(FLBA_LENGTH, &ptr));
memcpy(ptr, flba.ptr, FLBA_LENGTH);
copy.emplace_back(ptr);
}
return copy;
}
template <>
std::vector<ByteArray> TestStatistics<ByteArrayType>::GetDeepCopy(
const std::vector<ByteArray>& values) {
std::vector<ByteArray> copy;
MemoryPool* pool = default_memory_pool();
for (const ByteArray& ba : values) {
uint8_t* ptr;
PARQUET_THROW_NOT_OK(pool->Allocate(ba.len, &ptr));
memcpy(ptr, ba.ptr, ba.len);
copy.emplace_back(ba.len, ptr);
}
return copy;
}
template <typename TestType>
void TestStatistics<TestType>::DeepFree(std::vector<typename TestType::c_type>& values) {}
template <>
void TestStatistics<FLBAType>::DeepFree(std::vector<FLBA>& values) {
MemoryPool* pool = default_memory_pool();
for (FLBA& flba : values) {
auto ptr = const_cast<uint8_t*>(flba.ptr);
memset(ptr, 0, FLBA_LENGTH);
pool->Free(ptr, FLBA_LENGTH);
}
}
template <>
void TestStatistics<ByteArrayType>::DeepFree(std::vector<ByteArray>& values) {
MemoryPool* pool = default_memory_pool();
for (ByteArray& ba : values) {
auto ptr = const_cast<uint8_t*>(ba.ptr);
memset(ptr, 0, ba.len);
pool->Free(ptr, ba.len);
}
}
template <>
void TestStatistics<ByteArrayType>::TestMinMaxEncode() {
this->GenerateData(1000);
// Test that we encode min max strings correctly
auto statistics1 = MakeStatistics<ByteArrayType>(this->schema_.Column(0));
statistics1->Update(this->values_ptr_, this->values_.size(), 0);
std::string encoded_min = statistics1->EncodeMin();
std::string encoded_max = statistics1->EncodeMax();
// the encoded form is identical to the raw value bytes
ASSERT_EQ(encoded_min,
std::string(reinterpret_cast<const char*>(statistics1->min().ptr),
statistics1->min().len));
ASSERT_EQ(encoded_max,
std::string(reinterpret_cast<const char*>(statistics1->max().ptr),
statistics1->max().len));
auto statistics2 = MakeStatistics<ByteArrayType>(
this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(),
/*null_count=*/0,
/*distinct_count=*/0, /*has_min_max=*/true, /*has_null_count=*/true,
/*has_distinct_count=*/true, /*is_min_value_exact=*/true,
/*is_max_value_exact=*/true);
ASSERT_EQ(encoded_min, statistics2->EncodeMin());
ASSERT_EQ(encoded_max, statistics2->EncodeMax());
ASSERT_EQ(statistics1->min(), statistics2->min());
ASSERT_EQ(statistics1->max(), statistics2->max());
}
using Types = ::testing::Types<Int32Type, Int64Type, FloatType, DoubleType, ByteArrayType,
FLBAType, BooleanType>;
TYPED_TEST_SUITE(TestStatistics, Types);
TYPED_TEST(TestStatistics, MinMaxEncode) {
this->SetUpSchema(Repetition::REQUIRED);
ASSERT_NO_FATAL_FAILURE(this->TestMinMaxEncode());
}
TYPED_TEST(TestStatistics, Reset) {
this->SetUpSchema(Repetition::OPTIONAL);
ASSERT_NO_FATAL_FAILURE(this->TestReset());
}
TYPED_TEST(TestStatistics, Equals) {
this->SetUpSchema(Repetition::OPTIONAL);
ASSERT_NO_FATAL_FAILURE(this->TestEquals());
}
TYPED_TEST(TestStatistics, FullRoundtrip) {
this->SetUpSchema(Repetition::OPTIONAL);
ASSERT_NO_FATAL_FAILURE(this->TestFullRoundtrip(100, 31));
ASSERT_NO_FATAL_FAILURE(this->TestFullRoundtrip(1000, 415));
ASSERT_NO_FATAL_FAILURE(this->TestFullRoundtrip(10000, 926));
}
template <typename TestType>
class TestNumericStatistics : public TestStatistics<TestType> {};
using NumericTypes = ::testing::Types<Int32Type, Int64Type, FloatType, DoubleType>;
TYPED_TEST_SUITE(TestNumericStatistics, NumericTypes);
TYPED_TEST(TestNumericStatistics, Merge) {
this->SetUpSchema(Repetition::OPTIONAL);
ASSERT_NO_FATAL_FAILURE(this->TestMerge());
}
TYPED_TEST(TestNumericStatistics, Equals) {
this->SetUpSchema(Repetition::OPTIONAL);
ASSERT_NO_FATAL_FAILURE(this->TestEquals());
}
template <typename TestType>
class TestStatisticsHasFlag : public TestStatistics<TestType> {
public:
void SetUp() override {
TestStatistics<TestType>::SetUp();
this->SetUpSchema(Repetition::OPTIONAL);
}
std::optional<int64_t> MergeDistinctCount(
std::optional<int64_t> initial,
const std::vector<std::optional<int64_t>>& subsequent) {
EncodedStatistics encoded_statistics;
if (initial) {
encoded_statistics.has_distinct_count = true;
encoded_statistics.distinct_count = *initial;
}
std::shared_ptr<TypedStatistics<TestType>> statistics =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(
Statistics::Make(this->schema_.Column(0), &encoded_statistics,
/*num_values=*/1000));
for (const auto& distinct_count : subsequent) {
EncodedStatistics next_encoded_statistics;
if (distinct_count) {
next_encoded_statistics.has_distinct_count = true;
next_encoded_statistics.distinct_count = *distinct_count;
}
std::shared_ptr<TypedStatistics<TestType>> next_statistics =
std::dynamic_pointer_cast<TypedStatistics<TestType>>(
Statistics::Make(this->schema_.Column(0), &next_encoded_statistics,
/*num_values=*/1000));
statistics->Merge(*next_statistics);
}
EncodedStatistics final_statistics = statistics->Encode();
EXPECT_EQ(statistics->HasDistinctCount(), final_statistics.has_distinct_count);
if (statistics->HasDistinctCount()) {
EXPECT_EQ(statistics->distinct_count(), final_statistics.distinct_count);
return statistics->distinct_count();
}
return std::nullopt;
}
std::shared_ptr<TypedStatistics<TestType>> MergedStatistics(
const TypedStatistics<TestType>& stats1, const TypedStatistics<TestType>& stats2) {
auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
chunk_statistics->Merge(stats1);
chunk_statistics->Merge(stats2);
return chunk_statistics;
}
void VerifyMergedStatistics(
const TypedStatistics<TestType>& stats1, const TypedStatistics<TestType>& stats2,
const std::function<void(TypedStatistics<TestType>*)>& test_fn) {
ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats1, stats2).get()));
ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get()));
}
// The distinct count should be invalidated when Merge is called, unless every
// merged statistics object has a known distinct count and at most one of them
// is non-zero (zero-count statistics carry no distinct values to merge).
void TestMergeDistinctCount() {
// Sanity tests.
ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {}));
ASSERT_EQ(10, MergeDistinctCount(10, {}));
ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {0}));
ASSERT_EQ(std::nullopt, MergeDistinctCount(std::nullopt, {10, 0}));
ASSERT_EQ(10, MergeDistinctCount(10, {0, 0}));
ASSERT_EQ(10, MergeDistinctCount(0, {10, 0}));
ASSERT_EQ(10, MergeDistinctCount(0, {0, 10}));
ASSERT_EQ(10, MergeDistinctCount(0, {0, 10, 0}));
ASSERT_EQ(std::nullopt, MergeDistinctCount(10, {0, 10}));
ASSERT_EQ(std::nullopt, MergeDistinctCount(10, {0, std::nullopt}));
ASSERT_EQ(std::nullopt, MergeDistinctCount(0, {std::nullopt, 0}));
}
// If all values in a page are null or NaN, its statistics will not have a
// min-max set. Merging those statistics with another page that has valid
// min-max statistics must not drop the valid min-max of the latter page.
void TestMergeMinMax() {
this->GenerateData(1000);
// Create a statistics object without min-max.
std::shared_ptr<TypedStatistics<TestType>> statistics1;
{
statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
statistics1->Update(this->values_ptr_, /*num_values=*/0,
/*null_count=*/this->values_.size());
auto encoded_stats1 = statistics1->Encode();
EXPECT_FALSE(statistics1->HasMinMax());
EXPECT_FALSE(encoded_stats1.has_min);
EXPECT_FALSE(encoded_stats1.has_max);
EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt);
EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt);
}
// Create a statistics object with min-max.
std::shared_ptr<TypedStatistics<TestType>> statistics2;
{
statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
statistics2->Update(this->values_ptr_, this->values_.size(), 0);
auto encoded_stats2 = statistics2->Encode();
EXPECT_TRUE(statistics2->HasMinMax());
EXPECT_TRUE(encoded_stats2.has_min);
EXPECT_TRUE(encoded_stats2.has_max);
EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true));
EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true));
}
VerifyMergedStatistics(*statistics1, *statistics2,
[](TypedStatistics<TestType>* merged_statistics) {
EXPECT_TRUE(merged_statistics->HasMinMax());
EXPECT_TRUE(merged_statistics->Encode().has_min);
EXPECT_TRUE(merged_statistics->Encode().has_max);
EXPECT_EQ(merged_statistics->Encode().is_min_value_exact,
std::make_optional(true));
EXPECT_EQ(merged_statistics->Encode().is_max_value_exact,
std::make_optional(true));
});
}
// Default statistics should have a null_count even if no nulls are written.
// However, statistics created from a Thrift message might not have a
// null_count. Merging statistics from such a page results in an invalid
// null_count as well.
void TestMergeNullCount() {
this->GenerateData(/*num_values=*/1000);
// A page should have a null count even if it contains no nulls
std::shared_ptr<TypedStatistics<TestType>> statistics1;
{
statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
statistics1->Update(this->values_ptr_, /*num_values=*/this->values_.size(),
/*null_count=*/0);
auto encoded_stats1 = statistics1->Encode();
EXPECT_TRUE(statistics1->HasNullCount());
EXPECT_EQ(0, statistics1->null_count());
EXPECT_TRUE(statistics1->Encode().has_null_count);
}
// Merging statistics that both have a null count should keep the null count
VerifyMergedStatistics(*statistics1, *statistics1,
[](TypedStatistics<TestType>* merged_statistics) {
EXPECT_TRUE(merged_statistics->HasNullCount());
EXPECT_EQ(0, merged_statistics->null_count());
auto encoded = merged_statistics->Encode();
EXPECT_TRUE(encoded.has_null_count);
EXPECT_EQ(0, encoded.null_count);
});
// Statistics loaded from a Thrift message might not have a null count.
std::shared_ptr<TypedStatistics<TestType>> statistics2;
{
EncodedStatistics encoded_statistics2;
encoded_statistics2.has_null_count = false;
statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
/*num_values=*/1000));
EXPECT_FALSE(statistics2->Encode().has_null_count);
EXPECT_FALSE(statistics2->HasNullCount());
}
// Merging with statistics that lack a null count should drop the null count
VerifyMergedStatistics(*statistics1, *statistics2,
[](TypedStatistics<TestType>* merged_statistics) {
EXPECT_FALSE(merged_statistics->HasNullCount());
EXPECT_FALSE(merged_statistics->Encode().has_null_count);
});
}
// statistics.all_null_value is used to build the page index.
// If the statistics do not have a null count, all_null_value should be false.
void TestMissingNullCount() {
EncodedStatistics encoded_statistics;
encoded_statistics.has_null_count = false;
auto statistics = Statistics::Make(this->schema_.Column(0), &encoded_statistics,
/*num_values=*/1000);
auto typed_stats = std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics);
EXPECT_FALSE(typed_stats->HasNullCount());
auto encoded = typed_stats->Encode();
EXPECT_FALSE(encoded.all_null_value);
EXPECT_FALSE(encoded.has_null_count);
EXPECT_FALSE(encoded.has_distinct_count);
EXPECT_FALSE(encoded.has_min);
EXPECT_FALSE(encoded.has_max);
EXPECT_FALSE(encoded.is_min_value_exact.has_value());
EXPECT_FALSE(encoded.is_max_value_exact.has_value());
}
};
TYPED_TEST_SUITE(TestStatisticsHasFlag, Types);
TYPED_TEST(TestStatisticsHasFlag, MergeDistinctCount) {
ASSERT_NO_FATAL_FAILURE(this->TestMergeDistinctCount());
}
TYPED_TEST(TestStatisticsHasFlag, MergeNullCount) {
ASSERT_NO_FATAL_FAILURE(this->TestMergeNullCount());
}
TYPED_TEST(TestStatisticsHasFlag, MergeMinMax) {
ASSERT_NO_FATAL_FAILURE(this->TestMergeMinMax());
}
TYPED_TEST(TestStatisticsHasFlag, MissingNullCount) {
ASSERT_NO_FATAL_FAILURE(this->TestMissingNullCount());
}
// Helper for the basic statistics tests below: builds column chunk metadata with
// the given writer version and checks whether its statistics are treated as set.
void AssertStatsSet(const ApplicationVersion& version,
std::shared_ptr<parquet::WriterProperties> props,
const ColumnDescriptor* column, bool expected_is_set) {
auto metadata_builder = ColumnChunkMetaDataBuilder::Make(props, column);
auto column_chunk = ColumnChunkMetaData::Make(metadata_builder->contents(), column,
default_reader_properties(), &version);
EncodedStatistics stats;
stats.set_is_signed(false);
metadata_builder->SetStatistics(stats);
ASSERT_EQ(column_chunk->is_stats_set(), expected_is_set);
if (expected_is_set) {
ASSERT_TRUE(column_chunk->encoded_statistics() != nullptr);
} else {
ASSERT_TRUE(column_chunk->encoded_statistics() == nullptr);
}
}
// Statistics are restricted to a few types in older Parquet versions
TEST(CorruptStatistics, Basics) {
std::string created_by = "parquet-mr version 1.8.0";
ApplicationVersion version(created_by);
SchemaDescriptor schema;
schema::NodePtr node;
std::vector<schema::NodePtr> fields;
// Test Physical Types
fields.push_back(schema::PrimitiveNode::Make("col1", Repetition::OPTIONAL, Type::INT32,
ConvertedType::NONE));
fields.push_back(schema::PrimitiveNode::Make("col2", Repetition::OPTIONAL,
Type::BYTE_ARRAY, ConvertedType::NONE));
// Test Logical Types
fields.push_back(schema::PrimitiveNode::Make("col3", Repetition::OPTIONAL, Type::INT32,
ConvertedType::DATE));
fields.push_back(schema::PrimitiveNode::Make("col4", Repetition::OPTIONAL, Type::INT32,
ConvertedType::UINT_32));
fields.push_back(schema::PrimitiveNode::Make("col5", Repetition::OPTIONAL,
Type::FIXED_LEN_BYTE_ARRAY,
ConvertedType::INTERVAL, 12));
fields.push_back(schema::PrimitiveNode::Make("col6", Repetition::OPTIONAL,
Type::BYTE_ARRAY, ConvertedType::UTF8));
node = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
schema.Init(node);
parquet::WriterProperties::Builder builder;
builder.created_by(created_by);
std::shared_ptr<parquet::WriterProperties> props = builder.build();
AssertStatsSet(version, props, schema.Column(0), true);
AssertStatsSet(version, props, schema.Column(1), false);
AssertStatsSet(version, props, schema.Column(2), true);
AssertStatsSet(version, props, schema.Column(3), false);
AssertStatsSet(version, props, schema.Column(4), false);
AssertStatsSet(version, props, schema.Column(5), false);
}
// Statistics for all types have no restrictions in newer Parquet versions
TEST(CorrectStatistics, Basics) {
std::string created_by = "parquet-cpp version 1.3.0";
ApplicationVersion version(created_by);
SchemaDescriptor schema;
schema::NodePtr node;
std::vector<schema::NodePtr> fields;
// Test Physical Types
fields.push_back(schema::PrimitiveNode::Make("col1", Repetition::OPTIONAL, Type::INT32,
ConvertedType::NONE));
fields.push_back(schema::PrimitiveNode::Make("col2", Repetition::OPTIONAL,
Type::BYTE_ARRAY, ConvertedType::NONE));
// Test Logical Types
fields.push_back(schema::PrimitiveNode::Make("col3", Repetition::OPTIONAL, Type::INT32,
ConvertedType::DATE));
fields.push_back(schema::PrimitiveNode::Make("col4", Repetition::OPTIONAL, Type::INT32,
ConvertedType::UINT_32));
fields.push_back(schema::PrimitiveNode::Make("col5", Repetition::OPTIONAL,
Type::FIXED_LEN_BYTE_ARRAY,
ConvertedType::INTERVAL, 12));
fields.push_back(schema::PrimitiveNode::Make("col6", Repetition::OPTIONAL,
Type::BYTE_ARRAY, ConvertedType::UTF8));
node = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
schema.Init(node);
parquet::WriterProperties::Builder builder;
builder.created_by(created_by);
std::shared_ptr<parquet::WriterProperties> props = builder.build();
AssertStatsSet(version, props, schema.Column(0), true);
AssertStatsSet(version, props, schema.Column(1), true);
AssertStatsSet(version, props, schema.Column(2), true);
AssertStatsSet(version, props, schema.Column(3), true);
AssertStatsSet(version, props, schema.Column(4), true);
AssertStatsSet(version, props, schema.Column(5), true);
}
// Test SortOrder class
static const int NUM_VALUES = 10;
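// RebindLogical maps a logical-type tag to the physical Parquet type used to
// store it, so that Float16 (stored as a 2-byte FLBA) can reuse the same
// FLBA-based test machinery.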
template <typename T>
struct RebindLogical {
using ParquetType = T;
using CType = typename T::c_type;
};
template <>
struct RebindLogical<Float16LogicalType> {
using ParquetType = FLBAType;
using CType = ParquetType::c_type;
};
template <typename T>
class TestStatisticsSortOrder : public ::testing::Test {
public:
using TestType = typename RebindLogical<T>::ParquetType;
using c_type = typename TestType::c_type;
void SetUp() override {
#ifndef ARROW_WITH_SNAPPY
GTEST_SKIP() << "Test requires Snappy compression";
#endif
}
void AddNodes(std::string name) {
fields_.push_back(schema::PrimitiveNode::Make(
name, Repetition::REQUIRED, TestType::type_num, ConvertedType::NONE));
}
void SetUpSchema() {
stats_.resize(fields_.size());
values_.resize(NUM_VALUES);
schema_ = std::static_pointer_cast<GroupNode>(
GroupNode::Make("Schema", Repetition::REQUIRED, fields_));
parquet_sink_ = CreateOutputStream();
}
void SetValues();
void WriteParquet() {
// Add writer properties
parquet::WriterProperties::Builder builder;
builder.compression(parquet::Compression::SNAPPY);
builder.created_by("parquet-cpp version 1.3.0");
std::shared_ptr<parquet::WriterProperties> props = builder.build();
// Create a ParquetFileWriter instance
auto file_writer = parquet::ParquetFileWriter::Open(parquet_sink_, schema_, props);
// Append a RowGroup with a specific number of rows.
auto rg_writer = file_writer->AppendRowGroup();
this->SetValues();
// Insert Values
for (int i = 0; i < static_cast<int>(fields_.size()); i++) {
auto column_writer =
static_cast<parquet::TypedColumnWriter<TestType>*>(rg_writer->NextColumn());
column_writer->WriteBatch(NUM_VALUES, nullptr, nullptr, values_.data());
}
}
void VerifyParquetStats() {
ASSERT_OK_AND_ASSIGN(auto pbuffer, parquet_sink_->Finish());
// Create a ParquetReader instance
std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::Open(
std::make_shared<::arrow::io::BufferReader>(pbuffer));
// Get the File MetaData
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
std::shared_ptr<parquet::RowGroupMetaData> rg_metadata = file_metadata->RowGroup(0);
for (int i = 0; i < static_cast<int>(fields_.size()); i++) {
ARROW_SCOPED_TRACE("Statistics for field #", i);
std::shared_ptr<parquet::ColumnChunkMetaData> cc_metadata =
rg_metadata->ColumnChunk(i);
EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin());
EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax());
EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true));
EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true));
}
}
protected:
std::vector<c_type> values_;
std::vector<uint8_t> values_buf_;
std::vector<schema::NodePtr> fields_;
std::shared_ptr<schema::GroupNode> schema_;
std::shared_ptr<::arrow::io::BufferOutputStream> parquet_sink_;
std::vector<EncodedStatistics> stats_;
};
using CompareTestTypes = ::testing::Types<Int32Type, Int64Type, FloatType, DoubleType,
ByteArrayType, FLBAType, Float16LogicalType>;
// TYPE::INT32
template <>
void TestStatisticsSortOrder<Int32Type>::AddNodes(std::string name) {
// UINT_32 logical type to set Unsigned Statistics
fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32,
ConvertedType::UINT_32));
// INT_32 logical type to set Signed Statistics
fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32,
ConvertedType::INT_32));
}
template <>
void TestStatisticsSortOrder<Int32Type>::SetValues() {
for (int i = 0; i < NUM_VALUES; i++) {
values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
}
// Write UINT32 min/max values
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[5]), sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[4]), sizeof(c_type)));
stats_[0].is_max_value_exact = true;
stats_[0].is_min_value_exact = true;
// Write INT32 min/max values
stats_[1]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
stats_[1].is_max_value_exact = true;
stats_[1].is_min_value_exact = true;
}
// TYPE::INT64
template <>
void TestStatisticsSortOrder<Int64Type>::AddNodes(std::string name) {
// UINT_64 logical type to set Unsigned Statistics
fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64,
ConvertedType::UINT_64));
// INT_64 logical type to set Signed Statistics
fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64,
ConvertedType::INT_64));
}
template <>
void TestStatisticsSortOrder<Int64Type>::SetValues() {
for (int i = 0; i < NUM_VALUES; i++) {
values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4};
}
// Write UINT64 min/max values
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[5]), sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[4]), sizeof(c_type)));
stats_[0].is_max_value_exact = true;
stats_[0].is_min_value_exact = true;
// Write INT64 min/max values
stats_[1]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
stats_[1].is_max_value_exact = true;
stats_[1].is_min_value_exact = true;
}
// TYPE::FLOAT
template <>
void TestStatisticsSortOrder<FloatType>::SetValues() {
for (int i = 0; i < NUM_VALUES; i++) {
values_[i] = static_cast<float>(i) -
5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0};
}
// Write Float min/max values
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
stats_[0].is_max_value_exact = true;
stats_[0].is_min_value_exact = true;
}
// TYPE::DOUBLE
template <>
void TestStatisticsSortOrder<DoubleType>::SetValues() {
for (int i = 0; i < NUM_VALUES; i++) {
values_[i] = static_cast<c_type>(i) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0};
}
// Write Double min/max values
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&values_[0]), sizeof(c_type)))
.set_max(std::string(reinterpret_cast<const char*>(&values_[9]), sizeof(c_type)));
stats_[0].is_max_value_exact = true;
stats_[0].is_min_value_exact = true;
}
// TYPE::ByteArray
template <>
void TestStatisticsSortOrder<ByteArrayType>::AddNodes(std::string name) {
// UTF8 logical type to set Unsigned Statistics
fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED,
Type::BYTE_ARRAY, ConvertedType::UTF8));
}
template <>
void TestStatisticsSortOrder<ByteArrayType>::SetValues() {
int max_byte_array_len = 10;
size_t nbytes = NUM_VALUES * max_byte_array_len;
values_buf_.resize(nbytes);
std::vector<std::string> vals = {"c123", "b123", "a123", "d123", "e123",
"f123", "g123", "h123", "i123", "ü123"};
uint8_t* base = &values_buf_.data()[0];
for (int i = 0; i < NUM_VALUES; i++) {
memcpy(base, vals[i].c_str(), vals[i].length());
values_[i].ptr = base;
values_[i].len = static_cast<uint32_t>(vals[i].length());
base += vals[i].length();
}
// Write String min/max values
stats_[0]
.set_min(
std::string(reinterpret_cast<const char*>(vals[2].c_str()), vals[2].length()))
.set_max(
std::string(reinterpret_cast<const char*>(vals[9].c_str()), vals[9].length()));
stats_[0].is_max_value_exact = true;
stats_[0].is_min_value_exact = true;
}
// TYPE::FLBAArray
template <>
void TestStatisticsSortOrder<FLBAType>::AddNodes(std::string name) {
// FLBA has only Unsigned Statistics
fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED,
Type::FIXED_LEN_BYTE_ARRAY,
ConvertedType::NONE, FLBA_LENGTH));
}
template <>
void TestStatisticsSortOrder<FLBAType>::SetValues() {
size_t nbytes = NUM_VALUES * FLBA_LENGTH;
values_buf_.resize(nbytes);
char vals[NUM_VALUES][FLBA_LENGTH] = {"b12345", "a12345", "c12345", "d12345", "e12345",
"f12345", "g12345", "h12345", "z12345", "a12345"};
uint8_t* base = &values_buf_.data()[0];
for (int i = 0; i < NUM_VALUES; i++) {
memcpy(base, &vals[i][0], FLBA_LENGTH);
values_[i].ptr = base;
base += FLBA_LENGTH;
}
// Write FLBA min,max values
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(&vals[1][0]), FLBA_LENGTH))
.set_max(std::string(reinterpret_cast<const char*>(&vals[8][0]), FLBA_LENGTH));
stats_[0].is_max_value_exact = true;
stats_[0].is_min_value_exact = true;
}
template <>
void TestStatisticsSortOrder<Float16LogicalType>::AddNodes(std::string name) {
auto node =
schema::PrimitiveNode::Make(name, Repetition::REQUIRED, LogicalType::Float16(),
Type::FIXED_LEN_BYTE_ARRAY, sizeof(uint16_t));
fields_.push_back(std::move(node));
}
template <>
void TestStatisticsSortOrder<Float16LogicalType>::SetValues() {
constexpr int kValueLen = 2;
constexpr int kNumBytes = NUM_VALUES * kValueLen;
const Float16 f16_vals[NUM_VALUES] = {
Float16::FromFloat(+2.0f), Float16::FromFloat(-4.0f), Float16::FromFloat(+4.0f),
Float16::FromFloat(-2.0f), Float16::FromFloat(-1.0f), Float16::FromFloat(+3.0f),
Float16::FromFloat(+1.0f), Float16::FromFloat(-5.0f), Float16::FromFloat(+0.0f),
Float16::FromFloat(-3.0f),
};
values_buf_.resize(kNumBytes);
uint8_t* ptr = values_buf_.data();
for (int i = 0; i < NUM_VALUES; ++i) {
f16_vals[i].ToLittleEndian(ptr);
values_[i].ptr = ptr;
ptr += kValueLen;
}
stats_[0]
.set_min(std::string(reinterpret_cast<const char*>(values_[7].ptr), kValueLen))
.set_max(std::string(reinterpret_cast<const char*>(values_[2].ptr), kValueLen));
stats_[0].is_max_value_exact = true;
stats_[0].is_min_value_exact = true;
}
TYPED_TEST_SUITE(TestStatisticsSortOrder, CompareTestTypes);
TYPED_TEST(TestStatisticsSortOrder, MinMax) {
this->AddNodes("Column ");
this->SetUpSchema();
this->WriteParquet();
ASSERT_NO_FATAL_FAILURE(this->VerifyParquetStats());
}
template <typename ArrowType>
void TestByteArrayStatisticsFromArrow() {
using TypeTraits = ::arrow::TypeTraits<ArrowType>;
using ArrayType = typename TypeTraits::ArrayType;
auto values = ArrayFromJSON(TypeTraits::type_singleton(),
"[\"c123\", \"b123\", \"a123\", null, "
"null, \"f123\", \"g123\", \"h123\", \"i123\", \"ü123\"]");
const auto& typed_values = static_cast<const ArrayType&>(*values);
NodePtr node = PrimitiveNode::Make("field", Repetition::REQUIRED, Type::BYTE_ARRAY,
ConvertedType::UTF8);
ColumnDescriptor descr(node, 0, 0);
auto stats = MakeStatistics<ByteArrayType>(&descr);
ASSERT_NO_FATAL_FAILURE(stats->Update(*values));
ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min());
ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max());
ASSERT_EQ(2, stats->null_count());
}
TEST(TestByteArrayStatisticsFromArrow, StringType) {
// Part of ARROW-3246. Replicating TestStatisticsSortOrder test but via Arrow
TestByteArrayStatisticsFromArrow<::arrow::StringType>();
}
TEST(TestByteArrayStatisticsFromArrow, LargeStringType) {
TestByteArrayStatisticsFromArrow<::arrow::LargeStringType>();
}
// Ensure UNKNOWN sort order is handled properly
using TestStatisticsSortOrderFLBA = TestStatisticsSortOrder<FLBAType>;
TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) {
this->fields_.push_back(schema::PrimitiveNode::Make(
"Column 0", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
ConvertedType::INTERVAL, FLBA_LENGTH));
this->SetUpSchema();
this->WriteParquet();
ASSERT_OK_AND_ASSIGN(auto pbuffer, parquet_sink_->Finish());
// Create a ParquetReader instance
std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::Open(
std::make_shared<::arrow::io::BufferReader>(pbuffer));
// Get the File MetaData
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
std::shared_ptr<parquet::RowGroupMetaData> rg_metadata = file_metadata->RowGroup(0);
std::shared_ptr<parquet::ColumnChunkMetaData> cc_metadata = rg_metadata->ColumnChunk(0);
// stats should not be set for UNKNOWN sort order
ASSERT_FALSE(cc_metadata->is_stats_set());
}
template <typename T>
static std::string EncodeValue(const T& val) {
return std::string(reinterpret_cast<const char*>(&val), sizeof(val));
}
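// The FLBA overload defaults to a 2-byte length, matching the Float16 values
// these tests store in fixed-length byte arrays.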
static std::string EncodeValue(const FLBA& val, int length = sizeof(uint16_t)) {
return std::string(reinterpret_cast<const char*>(val.ptr), length);
}
template <typename Stats, typename Array, typename T = typename Array::value_type>
void AssertMinMaxAre(Stats stats, const Array& values, T expected_min, T expected_max) {
stats->Update(values.data(), values.size(), 0);
ASSERT_TRUE(stats->HasMinMax());
EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
EXPECT_EQ(stats->is_min_value_exact(), std::make_optional(true));
EXPECT_EQ(stats->is_max_value_exact(), std::make_optional(true));
}
template <typename Stats, typename Array, typename T = typename Stats::T>
void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitmap,
T expected_min, T expected_max) {
auto n_values = values.size();
// Count the set bits of the validity bitmap to get the number of non-null values.
auto non_null_count =
::arrow::internal::CountSetBits(valid_bitmap, /*bit_offset=*/0, n_values);
auto null_count = n_values - non_null_count;
stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count + null_count,
non_null_count, null_count);
ASSERT_TRUE(stats->HasMinMax());
EXPECT_EQ(stats->EncodeMin(), EncodeValue(expected_min));
EXPECT_EQ(stats->EncodeMax(), EncodeValue(expected_max));
EXPECT_EQ(stats->is_min_value_exact(), std::make_optional(true));
EXPECT_EQ(stats->is_max_value_exact(), std::make_optional(true));
}
template <typename Stats, typename Array>
void AssertUnsetMinMax(Stats stats, const Array& values) {
stats->Update(values.data(), values.size(), 0);
ASSERT_FALSE(stats->HasMinMax());
ASSERT_FALSE(stats->is_min_value_exact().has_value());
ASSERT_FALSE(stats->is_max_value_exact().has_value());
}
template <typename Stats, typename Array>
void AssertUnsetMinMax(Stats stats, const Array& values, const uint8_t* valid_bitmap) {
auto n_values = values.size();
// Count the set bits of the validity bitmap to get the number of non-null values.
auto non_null_count =
::arrow::internal::CountSetBits(valid_bitmap, /*bit_offset=*/0, n_values);
auto null_count = n_values - non_null_count;
stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count + null_count,
non_null_count, null_count);
ASSERT_FALSE(stats->HasMinMax());
ASSERT_FALSE(stats->is_min_value_exact().has_value());
ASSERT_FALSE(stats->is_max_value_exact().has_value());
}
template <typename ParquetType, typename T = typename ParquetType::c_type>
void CheckExtrema() {
using UT = typename std::make_unsigned<T>::type;
const T smin = std::numeric_limits<T>::min();
const T smax = std::numeric_limits<T>::max();
const T umin = SafeCopy<T>(std::numeric_limits<UT>::min());
const T umax = SafeCopy<T>(std::numeric_limits<UT>::max());
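// SafeCopy bit-casts the unsigned extremes into the signed physical type T:
// umin becomes 0 and umax becomes -1, mirroring how unsigned columns are
// stored in a signed physical type.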
constexpr int kNumValues = 8;
std::array<T, kNumValues> values{0, smin, smax, umin,
umax, smin + 1, smax - 1, umax - 1};
NodePtr unsigned_node = PrimitiveNode::Make(
"uint", Repetition::OPTIONAL,
LogicalType::Int(sizeof(T) * CHAR_BIT, false /*signed*/), ParquetType::type_num);
ColumnDescriptor unsigned_descr(unsigned_node, 1, 1);
NodePtr signed_node = PrimitiveNode::Make(
"int", Repetition::OPTIONAL,
LogicalType::Int(sizeof(T) * CHAR_BIT, true /*signed*/), ParquetType::type_num);
ColumnDescriptor signed_descr(signed_node, 1, 1);
{
ARROW_SCOPED_TRACE("unsigned statistics: umin = ", umin, ", umax = ", umax,
", node type = ", unsigned_node->logical_type()->ToString(),
", physical type = ", unsigned_descr.physical_type(),
", sort order = ", unsigned_descr.sort_order());
auto unsigned_stats = MakeStatistics<ParquetType>(&unsigned_descr);
AssertMinMaxAre(unsigned_stats, values, umin, umax);
}
{
ARROW_SCOPED_TRACE("signed statistics: smin = ", smin, ", smax = ", smax,
", node type = ", signed_node->logical_type()->ToString(),
", physical type = ", signed_descr.physical_type(),
", sort order = ", signed_descr.sort_order());
auto signed_stats = MakeStatistics<ParquetType>(&signed_descr);
AssertMinMaxAre(signed_stats, values, smin, smax);
}
// With validity bitmap
std::vector<bool> is_valid = {true, false, false, false, false, true, true, true};
std::shared_ptr<Buffer> valid_bitmap;
::arrow::BitmapFromVector(is_valid, &valid_bitmap);
{
ARROW_SCOPED_TRACE("spaced unsigned statistics: umin = ", umin, ", umax = ", umax,
", node type = ", unsigned_node->logical_type()->ToString(),
", physical type = ", unsigned_descr.physical_type(),
", sort order = ", unsigned_descr.sort_order());
auto unsigned_stats = MakeStatistics<ParquetType>(&unsigned_descr);
AssertMinMaxAre(unsigned_stats, values, valid_bitmap->data(), T{0}, umax - 1);
}
{
ARROW_SCOPED_TRACE("spaced signed statistics: smin = ", smin, ", smax = ", smax,
", node type = ", signed_node->logical_type()->ToString(),
", physical type = ", signed_descr.physical_type(),
", sort order = ", signed_descr.sort_order());
auto signed_stats = MakeStatistics<ParquetType>(&signed_descr);
AssertMinMaxAre(signed_stats, values, valid_bitmap->data(), smin + 1, smax - 1);
}
}
TEST(TestStatistic, Int32Extrema) { CheckExtrema<Int32Type>(); }
TEST(TestStatistic, Int64Extrema) { CheckExtrema<Int64Type>(); }
template <typename T>
class TestFloatStatistics : public ::testing::Test {
public:
using ParquetType = typename RebindLogical<T>::ParquetType;
using c_type = typename ParquetType::c_type;
void Init();
void SetUp() override {
this->Init();
ASSERT_NE(EncodeValue(negative_zero_), EncodeValue(positive_zero_));
}
bool signbit(c_type val);
void CheckEq(const c_type& l, const c_type& r);
NodePtr MakeNode(const std::string& name, Repetition::type rep);
template <typename Stats, typename Values>
void CheckMinMaxZeroesSign(Stats stats, const Values& values) {
stats->Update(values.data(), values.size(), /*null_count=*/0);
ASSERT_TRUE(stats->HasMinMax());
this->CheckEq(stats->min(), positive_zero_);
ASSERT_TRUE(this->signbit(stats->min()));
ASSERT_EQ(stats->EncodeMin(), EncodeValue(negative_zero_));
this->CheckEq(stats->max(), positive_zero_);
ASSERT_FALSE(this->signbit(stats->max()));
ASSERT_EQ(stats->EncodeMax(), EncodeValue(positive_zero_));
}
// ARROW-5562: Ensure that -0.0f and 0.0f values are properly handled like in
// parquet-mr
void TestNegativeZeroes() {
NodePtr node = this->MakeNode("f", Repetition::OPTIONAL);
ColumnDescriptor descr(node, 1, 1);
{
std::array<c_type, 2> values{negative_zero_, positive_zero_};
auto stats = MakeStatistics<ParquetType>(&descr);
CheckMinMaxZeroesSign(stats, values);
}
{
std::array<c_type, 2> values{positive_zero_, negative_zero_};
auto stats = MakeStatistics<ParquetType>(&descr);
CheckMinMaxZeroesSign(stats, values);
}
{
std::array<c_type, 2> values{negative_zero_, negative_zero_};
auto stats = MakeStatistics<ParquetType>(&descr);
CheckMinMaxZeroesSign(stats, values);
}
{
std::array<c_type, 2> values{positive_zero_, positive_zero_};
auto stats = MakeStatistics<ParquetType>(&descr);
CheckMinMaxZeroesSign(stats, values);
}
}
// PARQUET-1225: Float NaN values may lead to incorrect min-max
template <typename Values>
void CheckNaNs(ColumnDescriptor* descr, const Values& all_nans, const Values& some_nans,
const Values& other_nans, c_type min, c_type max, uint8_t valid_bitmap,
uint8_t valid_bitmap_no_nans) {
auto some_nan_stats = MakeStatistics<ParquetType>(descr);
// Ingesting only NaNs should not yield a valid min/max
AssertUnsetMinMax(some_nan_stats, all_nans);
// Ingesting a mix of NaNs and non-NaNs should yield a valid min max.
AssertMinMaxAre(some_nan_stats, some_nans, min, max);
// Ingesting only NaNs after a valid min/max has been set should have no effect
AssertMinMaxAre(some_nan_stats, all_nans, min, max);
some_nan_stats = MakeStatistics<ParquetType>(descr);
AssertUnsetMinMax(some_nan_stats, all_nans, &valid_bitmap);
// NaNs should not pollute min max when excluded via null bitmap.
AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap_no_nans, min, max);
// Ingesting NaNs with a null bitmap should not change the result.
AssertMinMaxAre(some_nan_stats, some_nans, &valid_bitmap, min, max);
// An array that doesn't start with NaN
auto other_stats = MakeStatistics<ParquetType>(descr);
AssertMinMaxAre(other_stats, other_nans, min, max);
}
void TestNaNs();
protected:
std::vector<uint8_t> data_buf_;
c_type positive_zero_;
c_type negative_zero_;
};
template <typename T>
void TestFloatStatistics<T>::Init() {
positive_zero_ = c_type{};
negative_zero_ = -positive_zero_;
}
template <>
void TestFloatStatistics<Float16LogicalType>::Init() {
data_buf_.resize(4);
(+Float16(0)).ToLittleEndian(&data_buf_[0]);
positive_zero_ = FLBA{&data_buf_[0]};
(-Float16(0)).ToLittleEndian(&data_buf_[2]);
negative_zero_ = FLBA{&data_buf_[2]};
}
template <typename T>
NodePtr TestFloatStatistics<T>::MakeNode(const std::string& name, Repetition::type rep) {
return PrimitiveNode::Make(name, rep, ParquetType::type_num);
}
template <>
NodePtr TestFloatStatistics<Float16LogicalType>::MakeNode(const std::string& name,
Repetition::type rep) {
return PrimitiveNode::Make(name, rep, LogicalType::Float16(),
Type::FIXED_LEN_BYTE_ARRAY, 2);
}
template <typename T>
void TestFloatStatistics<T>::CheckEq(const c_type& l, const c_type& r) {
ASSERT_EQ(l, r);
}
template <>
void TestFloatStatistics<Float16LogicalType>::CheckEq(const c_type& a, const c_type& b) {
auto l = Float16::FromLittleEndian(a.ptr);
auto r = Float16::FromLittleEndian(b.ptr);
ASSERT_EQ(l, r);
}
template <typename T>
bool TestFloatStatistics<T>::signbit(c_type val) {
return std::signbit(val);
}
template <>
bool TestFloatStatistics<Float16LogicalType>::signbit(c_type val) {
return Float16::FromLittleEndian(val.ptr).signbit();
}
template <typename T>
void TestFloatStatistics<T>::TestNaNs() {
constexpr int kNumValues = 8;
NodePtr node = this->MakeNode("f", Repetition::OPTIONAL);
ColumnDescriptor descr(node, 1, 1);
constexpr c_type nan = std::numeric_limits<c_type>::quiet_NaN();
constexpr c_type min = -4.0f;
constexpr c_type max = 3.0f;
std::array<c_type, kNumValues> all_nans{nan, nan, nan, nan, nan, nan, nan, nan};
std::array<c_type, kNumValues> some_nans{nan, max, -3.0f, -1.0f, nan, 2.0f, min, nan};
std::array<c_type, kNumValues> other_nans{1.5f, max, -3.0f, -1.0f, nan, 2.0f, min, nan};
uint8_t valid_bitmap = 0x7F; // 0b01111111
// NaNs excluded
uint8_t valid_bitmap_no_nans = 0x6E; // 0b01101110
this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap,
valid_bitmap_no_nans);
}
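// BufferedFloat16 owns a 2-byte little-endian copy of a Float16 value so that
// FLBA values built from bytes() remain valid for the object's lifetime.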
struct BufferedFloat16 {
explicit BufferedFloat16(Float16 f16) : f16(f16) {
this->f16.ToLittleEndian(bytes_.data());
}
explicit BufferedFloat16(float f) : BufferedFloat16(Float16::FromFloat(f)) {}
const uint8_t* bytes() const { return bytes_.data(); }
Float16 f16;
std::array<uint8_t, 2> bytes_;
};
template <>
void TestFloatStatistics<Float16LogicalType>::TestNaNs() {
constexpr int kNumValues = 8;
NodePtr node = this->MakeNode("f", Repetition::OPTIONAL);
ColumnDescriptor descr(node, 1, 1);
using F16 = BufferedFloat16;
const auto nan_f16 = F16(std::numeric_limits<Float16>::quiet_NaN());
const auto min_f16 = F16(-4.0f);
const auto max_f16 = F16(+3.0f);
const auto min = FLBA{min_f16.bytes()};
const auto max = FLBA{max_f16.bytes()};
std::array<F16, kNumValues> all_nans_f16 = {nan_f16, nan_f16, nan_f16, nan_f16,
nan_f16, nan_f16, nan_f16, nan_f16};
std::array<F16, kNumValues> some_nans_f16 = {
nan_f16, max_f16, F16(-3.0f), F16(-1.0f), nan_f16, F16(+2.0f), min_f16, nan_f16};
std::array<F16, kNumValues> other_nans_f16 = some_nans_f16;
other_nans_f16[0] = F16(+1.5f); // +1.5
auto prepare_values = [](const auto& values) -> std::vector<FLBA> {
std::vector<FLBA> out(values.size());
std::transform(values.begin(), values.end(), out.begin(),
[](const F16& f16) { return FLBA{f16.bytes()}; });
return out;
};
auto all_nans = prepare_values(all_nans_f16);
auto some_nans = prepare_values(some_nans_f16);
auto other_nans = prepare_values(other_nans_f16);
uint8_t valid_bitmap = 0x7F; // 0b01111111
// NaNs excluded
uint8_t valid_bitmap_no_nans = 0x6E; // 0b01101110
this->CheckNaNs(&descr, all_nans, some_nans, other_nans, min, max, valid_bitmap,
valid_bitmap_no_nans);
}
using FloatingPointTypes = ::testing::Types<FloatType, DoubleType, Float16LogicalType>;
TYPED_TEST_SUITE(TestFloatStatistics, FloatingPointTypes);
TYPED_TEST(TestFloatStatistics, NegativeZeros) { this->TestNegativeZeroes(); }
TYPED_TEST(TestFloatStatistics, NaNs) { this->TestNaNs(); }
// ARROW-7376
TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) {
constexpr int kNumValues = 8;
NodePtr node = PrimitiveNode::Make("nan_float", Repetition::OPTIONAL, Type::FLOAT);
ColumnDescriptor descr(node, 1, 1);
constexpr float nan = std::numeric_limits<float>::quiet_NaN();
std::array<float, kNumValues> nans_but_last{nan, nan, nan, nan, nan, nan, nan, 0.0f};
uint8_t all_but_last_valid = 0x7F; // 0b01111111
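// Only the NaN values are marked valid (the trailing 0.0f is masked out by
// the bitmap), so min/max must remain unset; this is the NaN-and-nulls case
// that previously caused an infinite loop (ARROW-7376).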
auto stats = MakeStatistics<FloatType>(&descr);
AssertUnsetMinMax(stats, nans_but_last, &all_but_last_valid);
}
// Test read statistics for column with UNKNOWN sort order
TEST(TestStatisticsSortOrder, UNKNOWN) {
std::string dir_string(test::get_data_dir());
std::stringstream ss;
ss << dir_string << "/int96_from_spark.parquet";
auto path = ss.str();
// The file contains a single column of INT96 type (deprecated)
// with SortOrder UNKNOWN.
// It contains 6 values, one of which is null.
// The null_count statistic is preserved.
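// Because INT96 has no defined sort order, min/max are not written; the
// assertions below check that only the null count survives.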
auto file_reader = ParquetFileReader::OpenFile(path);
auto rg_reader = file_reader->RowGroup(0);
auto metadata = rg_reader->metadata();
auto column_schema = metadata->schema()->Column(0);
ASSERT_EQ(SortOrder::UNKNOWN, column_schema->sort_order());
auto column_chunk = metadata->ColumnChunk(0);
ASSERT_TRUE(column_chunk->is_stats_set());
std::shared_ptr<EncodedStatistics> enc_stats = column_chunk->encoded_statistics();
ASSERT_TRUE(enc_stats->has_null_count);
ASSERT_FALSE(enc_stats->has_distinct_count);
ASSERT_FALSE(enc_stats->has_min);
ASSERT_FALSE(enc_stats->has_max);
ASSERT_EQ(1, enc_stats->null_count);
ASSERT_FALSE(enc_stats->is_max_value_exact.has_value());
ASSERT_FALSE(enc_stats->is_min_value_exact.has_value());
}
// Test statistics for binary column with UNSIGNED sort order
TEST(TestStatisticsSortOrderMinMax, Unsigned) {
std::string dir_string(test::get_data_dir());
std::stringstream ss;
ss << dir_string << "/binary.parquet";
auto path = ss.str();
// The file is generated by parquet-mr 1.10.0, the first version that
// supports correct statistics for binary data (see PARQUET-1025). It
// contains a single column of binary type. Data is just single-byte values
// from 0x00 to 0x0B.
auto file_reader = ParquetFileReader::OpenFile(path);
auto rg_reader = file_reader->RowGroup(0);
auto metadata = rg_reader->metadata();
auto column_schema = metadata->schema()->Column(0);
ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order());
auto column_chunk = metadata->ColumnChunk(0);
ASSERT_TRUE(column_chunk->is_stats_set());
std::shared_ptr<Statistics> stats = column_chunk->statistics();
ASSERT_NE(stats, nullptr);
ASSERT_EQ(0, stats->null_count());
ASSERT_EQ(12, stats->num_values());
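// For BYTE_ARRAY statistics the encoded min/max hold the raw value bytes (no
// length prefix), so the first byte of each is the single-byte value itself.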
ASSERT_EQ(0x00, stats->EncodeMin()[0]);
ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
std::shared_ptr<EncodedStatistics> enc_stats = column_chunk->encoded_statistics();
ASSERT_FALSE(enc_stats->is_max_value_exact.has_value());
ASSERT_FALSE(enc_stats->is_min_value_exact.has_value());
}
// Test statistics for binary column with truncated max and min values
TEST(TestEncodedStatistics, TruncatedMinMax) {
std::string dir_string(test::get_data_dir());
std::stringstream ss;
ss << dir_string << "/binary_truncated_min_max.parquet";
auto path = ss.str();
// The file is generated by parquet-rs 55.1.0. It contains six columns of
// UTF-8 and binary type, written with statistics_truncate_length set to 2.
// Columns 0 and 1 have both min and max truncated, columns 2 and 3 have
// only the min truncated, and columns 4 and 5 have no truncation, so their
// is_min_value_exact and is_max_value_exact flags are set to true.
// More file details in:
// https://github.com/apache/parquet-testing/tree/master/data#binary-truncated-min-and-max-statistics
auto file_reader = ParquetFileReader::OpenFile(path);
auto rg_reader = file_reader->RowGroup(0);
auto metadata = rg_reader->metadata();
auto column_schema = metadata->schema()->Column(0);
ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order());
ASSERT_EQ(6, metadata->num_columns());
for (int num_column = 0; num_column < metadata->num_columns(); ++num_column) {
auto column_chunk = metadata->ColumnChunk(num_column);
ASSERT_TRUE(column_chunk->is_stats_set());
std::shared_ptr<EncodedStatistics> encoded_statistics =
column_chunk->encoded_statistics();
ASSERT_NE(encoded_statistics, nullptr);
ASSERT_EQ(0, encoded_statistics->null_count);
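// Every column reports the same 2-byte min ("Al"): a truncated min is still
// a valid lower bound, so it is simply cut to its prefix; whether it is
// exact is checked per column below.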
EXPECT_EQ("Al", encoded_statistics->min());
ASSERT_TRUE(encoded_statistics->is_max_value_exact.has_value());
ASSERT_TRUE(encoded_statistics->is_min_value_exact.has_value());
switch (num_column) {
case 2:
// Max could not be truncated: cutting the UTF-8 string to 2 bytes would
// split the leading multi-byte character, so the exact value is kept.
EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max());
ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
break;
case 3:
// Max could not be truncated: the leading 0xFF 0xFF bytes cannot be
// incremented to keep a valid upper bound, so the exact value is kept.
EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max());
ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
break;
case 4:
case 5:
// Min and max are not truncated; both fit in 2 bytes.
EXPECT_EQ("Ke", encoded_statistics->max());
ASSERT_TRUE(encoded_statistics->is_max_value_exact.value());
ASSERT_TRUE(encoded_statistics->is_min_value_exact.value());
break;
default:
// Columns 0 and 1: max truncated to a 2-byte prefix with the last byte
// incremented so it stays a valid upper bound ("Ke..." -> "Kf").
EXPECT_EQ("Kf", encoded_statistics->max());
ASSERT_FALSE(encoded_statistics->is_max_value_exact.value());
ASSERT_FALSE(encoded_statistics->is_min_value_exact.value());
}
}
}
TEST(TestEncodedStatistics, CopySafe) {
EncodedStatistics encoded_statistics;
encoded_statistics.set_max("abc");
ASSERT_TRUE(encoded_statistics.has_max);
encoded_statistics.is_max_value_exact = true;
ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value());
encoded_statistics.set_min("abc");
ASSERT_TRUE(encoded_statistics.has_min);
encoded_statistics.is_min_value_exact = true;
ASSERT_TRUE(encoded_statistics.is_min_value_exact.has_value());
EncodedStatistics copy_statistics = encoded_statistics;
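// EncodedStatistics stores min/max by value, so the copy owns independent
// strings and optionals; mutating it below must not leak back into the
// original.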
copy_statistics.set_max("abcd");
copy_statistics.set_min("a");
copy_statistics.is_max_value_exact = false;
copy_statistics.is_min_value_exact = false;
EXPECT_EQ("abc", encoded_statistics.min());
EXPECT_EQ("abc", encoded_statistics.max());
EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true));
EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true));
}
TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
EncodedStatistics encoded_statistics;
encoded_statistics.set_min("a");
ASSERT_TRUE(encoded_statistics.has_min);
encoded_statistics.set_max("abc");
ASSERT_TRUE(encoded_statistics.has_max);
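// With a 2-byte limit, the 1-byte min fits and is kept, while the 3-byte max
// exceeds the limit and is dropped entirely; oversized values are discarded
// rather than truncated here.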
encoded_statistics.ApplyStatSizeLimits(2);
ASSERT_TRUE(encoded_statistics.has_min);
ASSERT_EQ("a", encoded_statistics.min());
ASSERT_FALSE(encoded_statistics.has_max);
NodePtr node =
PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY);
ColumnDescriptor descr(node, 0, 0);
std::shared_ptr<TypedStatistics<::parquet::ByteArrayType>> statistics =
std::dynamic_pointer_cast<TypedStatistics<::parquet::ByteArrayType>>(
Statistics::Make(&descr, &encoded_statistics,
/*num_values=*/1000));
// GH-43382: HasMinMax should be false if one of min/max is not set.
EXPECT_FALSE(statistics->HasMinMax());
}
} // namespace test
} // namespace parquet