cpp/src/parquet/statistics.cc - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "parquet/statistics.h"

 #include <algorithm>
 #include <cmath>
 #include <cstring>
 #include <limits>
 #include <optional>
 #include <type_traits>
 #include <utility>

 #include "arrow/array.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/float16.h"
 #include "arrow/util/logging_internal.h"
 #include "arrow/util/ubsan.h"
 #include "arrow/visit_data_inline.h"
 #include "parquet/encoding.h"
 #include "parquet/exception.h"
 #include "parquet/platform.h"
 #include "parquet/schema.h"

 using arrow::default_memory_pool;
 using arrow::MemoryPool;
 using arrow::internal::checked_cast;
 using arrow::util::Float16;
 using arrow::util::SafeCopy;
 using arrow::util::SafeLoad;

 namespace parquet {
 namespace {

 // ----------------------------------------------------------------------
 // Comparator implementations

 constexpr int value_length(int value_length, const ByteArray& value) { return value.len; }
 constexpr int value_length(int type_length, const FLBA& value) { return type_length; }

 // Static "constants" for normalizing float16 min/max values. These need to be expressed
 // as pointers because `Float16LogicalType` represents an FLBA.
 struct Float16Constants {
   static constexpr const uint8_t* lowest() { return lowest_.data(); }
   static constexpr const uint8_t* max() { return max_.data(); }
   static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); }
   static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); }

  private:
   using Bytes = std::array<uint8_t, 2>;
   static constexpr Bytes lowest_ =
       std::numeric_limits<Float16>::lowest().ToLittleEndian();
   static constexpr Bytes max_ = std::numeric_limits<Float16>::max().ToLittleEndian();
   static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian();
   static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian();
 };

 template <typename DType, bool is_signed>
 struct CompareHelper {
   using T = typename DType::c_type;

   static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
                 "T is an unsigned numeric");

   constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
   constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }

   // MSVC17 fix, isnan is not overloaded for IntegralType as per C++11
   // standard requirements.
   template <typename T1 = T>
   static ::arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val,
                                                                              T fallback) {
     return std::isnan(val) ? fallback : val;
   }

   template <typename T1 = T>
   static ::arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce(
       T val, T fallback) {
     return val;
   }

   static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }

   static T Min(int type_length, T a, T b) { return a < b ? a : b; }
   static T Max(int type_length, T a, T b) { return a < b ? b : a; }
 };

 template <typename DType>
 struct UnsignedCompareHelperBase {
   using T = typename DType::c_type;
   using UCType = typename std::make_unsigned<T>::type;

   static_assert(!std::is_same<T, UCType>::value, "T is unsigned");
   static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");

   // NOTE: according to the C++ spec, unsigned-to-signed conversion is
   // implementation-defined if the original value does not fit in the signed type
   // (i.e., two's complement cannot be assumed even on mainstream machines,
   // because the compiler may decide otherwise).  Hence the use of `SafeCopy`
   // below for deterministic bit-casting.
   // (see "Integer conversions" in
   //  https://en.cppreference.com/w/cpp/language/implicit_conversion)

   static const T DefaultMin() { return SafeCopy<T>(std::numeric_limits<UCType>::max()); }
   static const T DefaultMax() { return 0; }

   static T Coalesce(T val, T fallback) { return val; }

   static bool Compare(int type_length, T a, T b) {
     return SafeCopy<UCType>(a) < SafeCopy<UCType>(b);
   }

   static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; }
   static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; }
 };

 template <>
 struct CompareHelper<Int32Type, false> : public UnsignedCompareHelperBase<Int32Type> {};

 template <>
 struct CompareHelper<Int64Type, false> : public UnsignedCompareHelperBase<Int64Type> {};

 template <bool is_signed>
 struct CompareHelper<Int96Type, is_signed> {
   using T = typename Int96Type::c_type;
   using msb_type = typename std::conditional<is_signed, int32_t, uint32_t>::type;

   static T DefaultMin() {
     uint32_t kMsbMax = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::max());
     uint32_t kMax = std::numeric_limits<uint32_t>::max();
     return {kMax, kMax, kMsbMax};
   }
   static T DefaultMax() {
     uint32_t kMsbMin = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::min());
     uint32_t kMin = std::numeric_limits<uint32_t>::min();
     return {kMin, kMin, kMsbMin};
   }
   static T Coalesce(T val, T fallback) { return val; }

   static inline bool Compare(int type_length, const T& a, const T& b) {
     if (a.value[2] != b.value[2]) {
       // Only the MSB bit is by Signed comparison. For little-endian, this is the
       // last bit of Int96 type.
       return SafeCopy<msb_type>(a.value[2]) < SafeCopy<msb_type>(b.value[2]);
     } else if (a.value[1] != b.value[1]) {
       return (a.value[1] < b.value[1]);
     }
     return (a.value[0] < b.value[0]);
   }

   static T Min(int type_length, const T& a, const T& b) {
     return Compare(0, a, b) ? a : b;
   }
   static T Max(int type_length, const T& a, const T& b) {
     return Compare(0, a, b) ? b : a;
   }
 };

 template <typename T, bool is_signed>
 struct BinaryLikeComparer {};

 template <typename T>
 struct BinaryLikeComparer<T, /*is_signed=*/false> {
   static bool Compare(int type_length, const T& a, const T& b) {
     int a_length = value_length(type_length, a);
     int b_length = value_length(type_length, b);
     // Unsigned comparison is used for non-numeric types so straight
     // lexicographic comparison makes sense. (a.ptr is always unsigned)....
     return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length);
   }
 };

 template <typename T>
 struct BinaryLikeComparer<T, /*is_signed=*/true> {
   static bool Compare(int type_length, const T& a, const T& b) {
     // Is signed is used for integers encoded as big-endian twos
     // complement integers. (e.g. decimals).
     int a_length = value_length(type_length, a);
     int b_length = value_length(type_length, b);

     // At least of the lengths is zero.
     if (a_length == 0 || b_length == 0) {
       return a_length == 0 && b_length > 0;
     }

     int8_t first_a = *a.ptr;
     int8_t first_b = *b.ptr;
     // We can short circuit for different signed numbers or
     // for equal length bytes arrays that have different first bytes.
     // The equality requirement is necessary for sign extension cases.
     // 0xFF10 should be equal to 0x10 (due to big endian sign extension).
     if ((0x80 & first_a) != (0x80 & first_b) ||
         (a_length == b_length && first_a != first_b)) {
       return first_a < first_b;
     }
     // When the lengths are unequal and the numbers are of the same
     // sign we need to do comparison by sign extending the shorter
     // value first, and once we get to equal sized arrays, lexicographical
     // unsigned comparison of everything but the first byte is sufficient.
     const uint8_t* a_start = a.ptr;
     const uint8_t* b_start = b.ptr;
     if (a_length != b_length) {
       const uint8_t* lead_start = nullptr;
       const uint8_t* lead_end = nullptr;
       if (a_length > b_length) {
         int lead_length = a_length - b_length;
         lead_start = a.ptr;
         lead_end = a.ptr + lead_length;
         a_start += lead_length;
       } else {
         DCHECK_LT(a_length, b_length);
         int lead_length = b_length - a_length;
         lead_start = b.ptr;
         lead_end = b.ptr + lead_length;
         b_start += lead_length;
       }
       // Compare extra bytes to the sign extension of the first
       // byte of the other number.
       uint8_t extension = first_a < 0 ? 0xFF : 0;
       bool not_equal = std::any_of(lead_start, lead_end,
                                    [extension](uint8_t a) { return extension != a; });
       if (not_equal) {
         // Since sign extension are extrema values for unsigned bytes:
         //
         // Four cases exist:
         //    negative values:
         //      b is the longer value.
         //        b must be the lesser value: return false
         //      else:
         //        a must be the lesser value: return true
         //
         //    positive values:
         //      b is the longer value.
         //        values in b must be greater than a: return true
         //      else:
         //        values in a must be greater than b: return false
         bool negative_values = first_a < 0;
         bool b_longer = a_length < b_length;
         return negative_values != b_longer;
       }
     } else {
       a_start++;
       b_start++;
     }
     return std::lexicographical_compare(a_start, a.ptr + a_length, b_start,
                                         b.ptr + b_length);
   }
 };

 template <typename DType, bool is_signed>
 struct BinaryLikeCompareHelperBase {
   using T = typename DType::c_type;

   static T DefaultMin() { return {}; }
   static T DefaultMax() { return {}; }
   static T Coalesce(T val, T fallback) { return val; }

   static inline bool Compare(int type_length, const T& a, const T& b) {
     return BinaryLikeComparer<T, is_signed>::Compare(type_length, a, b);
   }
   static T Min(int type_length, const T& a, const T& b) {
     if (a.ptr == nullptr) return b;
     if (b.ptr == nullptr) return a;
     return Compare(type_length, a, b) ? a : b;
   }

   static T Max(int type_length, const T& a, const T& b) {
     if (a.ptr == nullptr) return b;
     if (b.ptr == nullptr) return a;
     return Compare(type_length, a, b) ? b : a;
   }
 };

 template <bool is_signed>
 struct CompareHelper<ByteArrayType, is_signed>
     : public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {};

 template <bool is_signed>
 struct CompareHelper<FLBAType, is_signed>
     : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};

 template <>
 struct CompareHelper<Float16LogicalType, /*is_signed=*/true> {
   using T = FLBA;

   static T DefaultMin() { return T{Float16Constants::max()}; }
   static T DefaultMax() { return T{Float16Constants::lowest()}; }

   static T Coalesce(T val, T fallback) {
     return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback
                                                                                : val;
   }

   static inline bool Compare(int type_length, const T& a, const T& b) {
     const auto lhs = Float16::FromLittleEndian(a.ptr);
     const auto rhs = Float16::FromLittleEndian(b.ptr);
     // NaN is handled here (same behavior as native float compare)
     return lhs < rhs;
   }

   static T Min(int type_length, const T& a, const T& b) {
     if (a.ptr == nullptr) return b;
     if (b.ptr == nullptr) return a;
     return Compare(type_length, a, b) ? a : b;
   }

   static T Max(int type_length, const T& a, const T& b) {
     if (a.ptr == nullptr) return b;
     if (b.ptr == nullptr) return a;
     return Compare(type_length, a, b) ? b : a;
   }
 };

 using ::std::optional;

 template <typename T>
 ::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
 CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) {
   return min_max;
 }

 std::optional<std::pair<Int96, Int96>> CleanStatistic(std::pair<Int96, Int96> min_max,
                                                       LogicalType::Type::type) {
   return min_max;
 }

 // In case of floating point types, the following rules are applied (as per
 // upstream parquet-mr):
 // - If any of min/max is NaN, return nothing.
 // - If min is 0.0f, replace with -0.0f
 // - If max is -0.0f, replace with 0.0f
 template <typename T>
 ::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>>
 CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) {
   T min = min_max.first;
   T max = min_max.second;

   // Ignore if one of the value is nan.
   if (std::isnan(min) || std::isnan(max)) {
     return ::std::nullopt;
   }

   if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) {
     return ::std::nullopt;
   }

   T zero{};

   if (min == zero && !std::signbit(min)) {
     min = -min;
   }

   if (max == zero && std::signbit(max)) {
     max = -max;
   }

   return {{min, max}};
 }

 optional<std::pair<FLBA, FLBA>> CleanFloat16Statistic(std::pair<FLBA, FLBA> min_max) {
   FLBA min_flba = min_max.first;
   FLBA max_flba = min_max.second;
   Float16 min = Float16::FromLittleEndian(min_flba.ptr);
   Float16 max = Float16::FromLittleEndian(max_flba.ptr);

   if (min.is_nan() || max.is_nan()) {
     return ::std::nullopt;
   }

   if (min == std::numeric_limits<Float16>::max() &&
       max == std::numeric_limits<Float16>::lowest()) {
     return ::std::nullopt;
   }

   if (min.is_zero() && !min.signbit()) {
     min_flba = FLBA{Float16Constants::negative_zero()};
   }
   if (max.is_zero() && max.signbit()) {
     max_flba = FLBA{Float16Constants::positive_zero()};
   }

   return {{min_flba, max_flba}};
 }

 optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max,
                                                LogicalType::Type::type logical_type) {
   if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
     return ::std::nullopt;
   }
   if (logical_type == LogicalType::Type::FLOAT16) {
     return CleanFloat16Statistic(std::move(min_max));
   }
   return min_max;
 }

 optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
     std::pair<ByteArray, ByteArray> min_max, LogicalType::Type::type) {
   if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
     return ::std::nullopt;
   }
   return min_max;
 }

 template <typename T>
 struct RebindLogical {
   using DType = T;
   using c_type = typename DType::c_type;
 };

 template <>
 struct RebindLogical<Float16LogicalType> {
   using DType = FLBAType;
   using c_type = DType::c_type;
 };

 template <bool is_signed, typename DType>
 class TypedComparatorImpl
     : virtual public TypedComparator<typename RebindLogical<DType>::DType> {
  public:
   using T = typename RebindLogical<DType>::c_type;
   using Helper = CompareHelper<DType, is_signed>;

   explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}

   bool CompareInline(const T& a, const T& b) const {
     return Helper::Compare(type_length_, a, b);
   }

   bool Compare(const T& a, const T& b) const override { return CompareInline(a, b); }

   std::pair<T, T> GetMinMax(const T* values, int64_t length) const override {
     DCHECK_GT(length, 0);

     T min = Helper::DefaultMin();
     T max = Helper::DefaultMax();

     for (int64_t i = 0; i < length; i++) {
       const auto val = SafeLoad(values + i);
       min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin()));
       max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax()));
     }

     return {min, max};
   }

   std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
                                   const uint8_t* valid_bits,
                                   int64_t valid_bits_offset) const override {
     DCHECK_GT(length, 0);

     T min = Helper::DefaultMin();
     T max = Helper::DefaultMax();

     ::arrow::internal::VisitSetBitRunsVoid(
         valid_bits, valid_bits_offset, length, [&](int64_t position, int64_t length) {
           for (int64_t i = 0; i < length; i++) {
             const auto val = SafeLoad(values + i + position);
             min = Helper::Min(type_length_, min,
                               Helper::Coalesce(val, Helper::DefaultMin()));
             max = Helper::Max(type_length_, max,
                               Helper::Coalesce(val, Helper::DefaultMax()));
           }
         });

     return {min, max};
   }

   std::pair<T, T> GetMinMax(const ::arrow::Array& values) const override {
     ParquetException::NYI(values.type()->ToString());
   }

  private:
   int type_length_;
 };

 // ARROW-11675: A hand-written version of GetMinMax(), to work around
 // what looks like a MSVC code generation bug.
 // This does not seem to be required for GetMinMaxSpaced().
 template <>
 std::pair<int32_t, int32_t>
 TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
                                                                int64_t length) const {
   DCHECK_GT(length, 0);

   const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
   uint32_t min = std::numeric_limits<uint32_t>::max();
   uint32_t max = std::numeric_limits<uint32_t>::lowest();

   for (int64_t i = 0; i < length; i++) {
     const auto val = unsigned_values[i];
     min = std::min<uint32_t>(min, val);
     max = std::max<uint32_t>(max, val);
   }

   return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
 }

 template <bool is_signed>
 std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper(
     const TypedComparatorImpl<is_signed, ByteArrayType>& comparator,
     const ::arrow::Array& values) {
   using Helper = CompareHelper<ByteArrayType, is_signed>;

   ByteArray min = Helper::DefaultMin();
   ByteArray max = Helper::DefaultMax();
   constexpr int type_length = -1;

   const auto valid_func = [&](ByteArray val) {
     min = Helper::Min(type_length, val, min);
     max = Helper::Max(type_length, val, max);
   };
   const auto null_func = [&]() {};

   if (::arrow::is_binary_like(values.type_id())) {
     ::arrow::VisitArraySpanInline<::arrow::BinaryType>(
         *values.data(), std::move(valid_func), std::move(null_func));
   } else if (::arrow::is_large_binary_like(values.type_id())) {
     ::arrow::VisitArraySpanInline<::arrow::LargeBinaryType>(
         *values.data(), std::move(valid_func), std::move(null_func));
   } else if (::arrow::is_binary_view_like(values.type_id())) {
     ::arrow::VisitArraySpanInline<::arrow::BinaryViewType>(
         *values.data(), std::move(valid_func), std::move(null_func));
   } else {
     throw ParquetException("Only binary-like data supported");
   }

   return {min, max};
 }

 template <>
 std::pair<ByteArray, ByteArray> TypedComparatorImpl<true, ByteArrayType>::GetMinMax(
     const ::arrow::Array& values) const {
   return GetMinMaxBinaryHelper<true>(*this, values);
 }

 template <>
 std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMinMax(
     const ::arrow::Array& values) const {
   return GetMinMaxBinaryHelper<false>(*this, values);
 }

 LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) {
   if (const auto& logical_type = descr->logical_type()) {
     return logical_type->type();
   }
   return LogicalType::Type::NONE;
 }
 LogicalType::Type::type LogicalTypeId(const Statistics& stats) {
   return LogicalTypeId(stats.descr());
 }

 template <typename DType>
 class TypedStatisticsImpl : public TypedStatistics<DType> {
  public:
   using T = typename DType::c_type;

   // Create an empty stats.
   TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
       : descr_(descr),
         pool_(pool),
         min_buffer_(AllocateBuffer(pool_, 0)),
         max_buffer_(AllocateBuffer(pool_, 0)),
         logical_type_(LogicalTypeId(descr_)) {
     if (descr->sort_order() != SortOrder::UNKNOWN) {
       comparator_ = MakeComparator<DType>(descr);
     }
     TypedStatisticsImpl::Reset();
   }

   // Create stats from provided values.
   TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
                       int64_t distinct_count)
       : pool_(default_memory_pool()),
         min_buffer_(AllocateBuffer(pool_, 0)),
         max_buffer_(AllocateBuffer(pool_, 0)) {
     TypedStatisticsImpl::IncrementNumValues(num_values);
     TypedStatisticsImpl::IncrementNullCount(null_count);
     SetDistinctCount(distinct_count);

     Copy(min, &min_, min_buffer_.get());
     Copy(max, &max_, max_buffer_.get());
     has_min_max_ = true;
     statistics_.is_min_value_exact = true;
     statistics_.is_max_value_exact = true;
   }

   // Create stats from a thrift Statistics object.
   TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
                       const std::string& encoded_max, int64_t num_values,
                       int64_t null_count, int64_t distinct_count, bool has_min_max,
                       bool has_null_count, bool has_distinct_count, MemoryPool* pool)
       : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, null_count,
                             distinct_count, has_min_max, has_null_count,
                             has_distinct_count,
                             /*is_min_value_exact=*/std::nullopt,
                             /*is_max_value_exact=*/std::nullopt, pool) {}

   TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
                       const std::string& encoded_max, int64_t num_values,
                       int64_t null_count, int64_t distinct_count, bool has_min_max,
                       bool has_null_count, bool has_distinct_count,
                       std::optional<bool> is_min_value_exact,
                       std::optional<bool> is_max_value_exact, MemoryPool* pool)
       : TypedStatisticsImpl(descr, pool) {
     TypedStatisticsImpl::IncrementNumValues(num_values);
     if (has_null_count) {
       TypedStatisticsImpl::IncrementNullCount(null_count);
     } else {
       has_null_count_ = false;
     }
     if (has_distinct_count) {
       SetDistinctCount(distinct_count);
     } else {
       has_distinct_count_ = false;
     }

     if (has_min_max) {
       PlainDecode(encoded_min, &min_);
       PlainDecode(encoded_max, &max_);
       statistics_.is_min_value_exact = is_min_value_exact;
       statistics_.is_max_value_exact = is_max_value_exact;
     }

     has_min_max_ = has_min_max;
   }

   bool HasDistinctCount() const override { return has_distinct_count_; };
   bool HasMinMax() const override { return has_min_max_; }
   bool HasNullCount() const override { return has_null_count_; };

   void IncrementNullCount(int64_t n) override {
     statistics_.null_count += n;
     has_null_count_ = true;
   }

   void IncrementNumValues(int64_t n) override { num_values_ += n; }

   static bool IsMeaningfulLogicalType(LogicalType::Type::type type) {
     switch (type) {
       case LogicalType::Type::FLOAT16:
         return true;
       default:
         return false;
     }
   }

   bool Equals(const Statistics& raw_other) const override {
     if (physical_type() != raw_other.physical_type()) return false;

     const auto other_logical_type = LogicalTypeId(raw_other);
     // Only compare against logical types that influence the interpretation of the
     // physical type
     if (IsMeaningfulLogicalType(logical_type_)) {
       if (logical_type_ != other_logical_type) return false;
     } else if (IsMeaningfulLogicalType(other_logical_type)) {
       return false;
     }

     const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);

     if (has_min_max_ != other.has_min_max_) return false;
     if (has_min_max_) {
       if (!MinMaxEqual(other)) return false;
     }

     return null_count() == other.null_count() &&
            distinct_count() == other.distinct_count() &&
            num_values() == other.num_values() &&
            is_min_value_exact() == other.is_min_value_exact() &&
            is_max_value_exact() == other.is_max_value_exact();
   }

   bool MinMaxEqual(const TypedStatisticsImpl& other) const;

   void Reset() override {
     ResetCounts();
     ResetHasFlags();
   }

   void SetMinMax(const T& arg_min, const T& arg_max) override {
     SetMinMaxPair({arg_min, arg_max});
   }

   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
     // null_count is always valid when merging page statistics into
     // column chunk statistics.
     if (other.HasNullCount()) {
       this->statistics_.null_count += other.null_count();
     } else {
       this->has_null_count_ = false;
     }
     if (has_distinct_count_ && other.HasDistinctCount() &&
         (distinct_count() == 0 || other.distinct_count() == 0)) {
       // We can merge distinct counts if either side is zero.
       statistics_.distinct_count =
           std::max(statistics_.distinct_count, other.distinct_count());
     } else {
       // Otherwise clear has_distinct_count_ as distinct count cannot be merged.
       this->has_distinct_count_ = false;
     }
     // Do not clear min/max here if the other side does not provide
     // min/max which may happen when other is an empty stats or all
     // its values are null and/or NaN.
     if (other.HasMinMax()) {
       SetMinMax(other.min(), other.max());
     }
   }

   void Update(const T* values, int64_t num_values, int64_t null_count) override;
   void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
                     int64_t num_spaced_values, int64_t num_values,
                     int64_t null_count) override;

   void Update(const ::arrow::Array& values, bool update_counts) override {
     if (update_counts) {
       IncrementNullCount(values.null_count());
       IncrementNumValues(values.length() - values.null_count());
     }

     if (values.null_count() == values.length()) {
       return;
     }

     if (comparator_ == nullptr) return;
     SetMinMaxPair(comparator_->GetMinMax(values));
   }

   const T& min() const override { return min_; }

   const T& max() const override { return max_; }

   Type::type physical_type() const override { return descr_->physical_type(); }

   const ColumnDescriptor* descr() const override { return descr_; }

   std::string EncodeMin() const override {
     std::string s;
     if (HasMinMax()) this->PlainEncode(min_, &s);
     return s;
   }

   std::string EncodeMax() const override {
     std::string s;
     if (HasMinMax()) this->PlainEncode(max_, &s);
     return s;
   }

   EncodedStatistics Encode() override {
     EncodedStatistics s;
     if (HasMinMax()) {
       s.set_min(this->EncodeMin());
       s.set_max(this->EncodeMax());
       s.is_min_value_exact = this->is_min_value_exact();
       s.is_max_value_exact = this->is_max_value_exact();
     }
     if (HasNullCount()) {
       s.set_null_count(this->null_count());
       // num_values_ is reliable and it means number of non-null values.
       s.all_null_value = num_values_ == 0;
     }
     if (HasDistinctCount()) {
       s.set_distinct_count(this->distinct_count());
     }
     return s;
   }

   int64_t null_count() const override { return statistics_.null_count; }
   int64_t distinct_count() const override { return statistics_.distinct_count; }
   int64_t num_values() const override { return num_values_; }
   std::optional<bool> is_min_value_exact() const override {
     return statistics_.is_min_value_exact;
   }
   std::optional<bool> is_max_value_exact() const override {
     return statistics_.is_max_value_exact;
   }

  private:
   const ColumnDescriptor* descr_;
   bool has_min_max_ = false;
   bool has_null_count_ = false;
   bool has_distinct_count_ = false;
   T min_;
   T max_;
   ::arrow::MemoryPool* pool_;
   // Number of non-null values.
   // Please note that num_values_ is reliable when has_null_count_ is set.
   // When has_null_count_ is not set, e.g. a page statistics created from
   // a statistics thrift message which doesn't have the optional null_count,
   // `num_values_` may include null values.
   int64_t num_values_ = 0;
   EncodedStatistics statistics_;
   std::shared_ptr<TypedComparator<DType>> comparator_;
   std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
   LogicalType::Type::type logical_type_ = LogicalType::Type::NONE;

   void PlainEncode(const T& src, std::string* dst) const;
   void PlainDecode(const std::string& src, T* dst) const;

   void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }

   void SetDistinctCount(int64_t n) {
     // distinct count can only be "set", and cannot be incremented.
     statistics_.distinct_count = n;
     has_distinct_count_ = true;
   }

   void ResetCounts() {
     this->statistics_.null_count = 0;
     this->statistics_.distinct_count = 0;
     this->num_values_ = 0;
   }

   void ResetHasFlags() {
     // has_min_max_ will only be set when it meets any valid value.
     this->has_min_max_ = false;
     // has_distinct_count_ will only be set once SetDistinctCount()
     // is called because distinct count calculation is not cheap and
     // disabled by default.
     this->has_distinct_count_ = false;
     // Null count calculation is cheap and enabled by default.
     this->has_null_count_ = true;
   }

   void SetMinMaxPair(std::pair<T, T> min_max) {
     if (comparator_ == nullptr) return;
     // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
     auto maybe_min_max = CleanStatistic(min_max, logical_type_);
     if (!maybe_min_max) return;

     auto min = maybe_min_max.value().first;
     auto max = maybe_min_max.value().second;

     if (!has_min_max_) {
       has_min_max_ = true;
       Copy(min, &min_, min_buffer_.get());
       Copy(max, &max_, max_buffer_.get());
     } else {
       Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get());
       Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get());
     }
     statistics_.is_min_value_exact = true;
     statistics_.is_max_value_exact = true;
   }
 };

 template <>
 inline bool TypedStatisticsImpl<FLBAType>::MinMaxEqual(
     const TypedStatisticsImpl<FLBAType>& other) const {
   uint32_t len = descr_->type_length();
   return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 &&
          std::memcmp(max_.ptr, other.max_.ptr, len) == 0;
 }

 template <typename DType>
 bool TypedStatisticsImpl<DType>::MinMaxEqual(
     const TypedStatisticsImpl<DType>& other) const {
   return min_ == other.min_ && max_ == other.max_;
 }

 template <>
 inline void TypedStatisticsImpl<FLBAType>::Copy(const FLBA& src, FLBA* dst,
                                                 ResizableBuffer* buffer) {
   if (dst->ptr == src.ptr) return;
   uint32_t len = descr_->type_length();
   PARQUET_THROW_NOT_OK(buffer->Resize(len, false));
   std::memcpy(buffer->mutable_data(), src.ptr, len);
   *dst = FLBA(buffer->data());
 }

 template <>
 inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteArray* dst,
                                                      ResizableBuffer* buffer) {
   if (dst->ptr == src.ptr) return;
   PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
   std::memcpy(buffer->mutable_data(), src.ptr, src.len);
   *dst = ByteArray(src.len, buffer->data());
 }

 template <typename DType>
 void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_values,
                                         int64_t null_count) {
   DCHECK_GE(num_values, 0);
   DCHECK_GE(null_count, 0);

   IncrementNullCount(null_count);
   IncrementNumValues(num_values);

   if (num_values == 0 || comparator_ == nullptr) return;
   SetMinMaxPair(comparator_->GetMinMax(values, num_values));
 }

 template <typename DType>
 void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
                                               int64_t valid_bits_offset,
                                               int64_t num_spaced_values,
                                               int64_t num_values, int64_t null_count) {
   DCHECK_GE(num_values, 0);
   DCHECK_GE(null_count, 0);

   IncrementNullCount(null_count);
   IncrementNumValues(num_values);

   if (num_values == 0 || comparator_ == nullptr) return;
   SetMinMaxPair(comparator_->GetMinMaxSpaced(values, num_spaced_values, valid_bits,
                                              valid_bits_offset));
 }

 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) const {
   auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, pool_);
   encoder->Put(&src, 1);
   auto buffer = encoder->FlushValues();
   auto ptr = reinterpret_cast<const char*>(buffer->data());
   dst->assign(ptr, static_cast<size_t>(buffer->size()));
 }

 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) const {
   auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
   decoder->SetData(1, reinterpret_cast<const uint8_t*>(src.c_str()),
                    static_cast<int>(src.size()));
   int decoded_values = decoder->Decode(dst, 1);
   if (decoded_values != 1) {
     throw ParquetException("Failed to decode statistic value from plain encoded string");
   }
 }

 template <>
 void TypedStatisticsImpl<ByteArrayType>::PlainEncode(const T& src,
                                                      std::string* dst) const {
   dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
 }

 template <>
 void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
                                                      T* dst) const {
   dst->len = static_cast<uint32_t>(src.size());
   dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
 }

 std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type,
                                              LogicalType::Type::type logical_type,
                                              SortOrder::type sort_order,
                                              int type_length) {
   if (SortOrder::SIGNED == sort_order) {
     switch (physical_type) {
       case Type::BOOLEAN:
         return std::make_shared<TypedComparatorImpl<true, BooleanType>>();
       case Type::INT32:
         return std::make_shared<TypedComparatorImpl<true, Int32Type>>();
       case Type::INT64:
         return std::make_shared<TypedComparatorImpl<true, Int64Type>>();
       case Type::INT96:
         return std::make_shared<TypedComparatorImpl<true, Int96Type>>();
       case Type::FLOAT:
         return std::make_shared<TypedComparatorImpl<true, FloatType>>();
       case Type::DOUBLE:
         return std::make_shared<TypedComparatorImpl<true, DoubleType>>();
       case Type::BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
       case Type::FIXED_LEN_BYTE_ARRAY:
         if (logical_type == LogicalType::Type::FLOAT16) {
           return std::make_shared<TypedComparatorImpl<true, Float16LogicalType>>(
               type_length);
         }
         return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
       default:
         ParquetException::NYI("Signed Compare not implemented");
     }
   } else if (SortOrder::UNSIGNED == sort_order) {
     switch (physical_type) {
       case Type::INT32:
         return std::make_shared<TypedComparatorImpl<false, Int32Type>>();
       case Type::INT64:
         return std::make_shared<TypedComparatorImpl<false, Int64Type>>();
       case Type::INT96:
         return std::make_shared<TypedComparatorImpl<false, Int96Type>>();
       case Type::BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<false, ByteArrayType>>();
       case Type::FIXED_LEN_BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<false, FLBAType>>(type_length);
       default:
         ParquetException::NYI("Unsigned Compare not implemented");
     }
   } else {
     throw ParquetException("UNKNOWN Sort Order");
   }
   return nullptr;
 }

 }  // namespace

 // ----------------------------------------------------------------------
 // Public factory functions

 std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
                                              SortOrder::type sort_order,
                                              int type_length) {
   return DoMakeComparator(physical_type, LogicalType::Type::NONE, sort_order,
                           type_length);
 }

 std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
   return DoMakeComparator(descr->physical_type(), LogicalTypeId(descr),
                           descr->sort_order(), descr->type_length());
 }

 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
                                              ::arrow::MemoryPool* pool) {
   switch (descr->physical_type()) {
     case Type::BOOLEAN:
       return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
     case Type::INT32:
       return std::make_shared<TypedStatisticsImpl<Int32Type>>(descr, pool);
     case Type::INT64:
       return std::make_shared<TypedStatisticsImpl<Int64Type>>(descr, pool);
     case Type::FLOAT:
       return std::make_shared<TypedStatisticsImpl<FloatType>>(descr, pool);
     case Type::DOUBLE:
       return std::make_shared<TypedStatisticsImpl<DoubleType>>(descr, pool);
     case Type::BYTE_ARRAY:
       return std::make_shared<TypedStatisticsImpl<ByteArrayType>>(descr, pool);
     case Type::FIXED_LEN_BYTE_ARRAY:
       return std::make_shared<TypedStatisticsImpl<FLBAType>>(descr, pool);
     default:
       ParquetException::NYI("Statistics not implemented");
   }
 }

 std::shared_ptr<Statistics> Statistics::Make(Type::type physical_type, const void* min,
                                              const void* max, int64_t num_values,
                                              int64_t null_count, int64_t distinct_count) {
 #define MAKE_STATS(CAP_TYPE, KLASS)                                                    \
   case Type::CAP_TYPE:                                                                 \
     return std::make_shared<TypedStatisticsImpl<KLASS>>(                               \
         *reinterpret_cast<const typename KLASS::c_type*>(min),                         \
         *reinterpret_cast<const typename KLASS::c_type*>(max), num_values, null_count, \
         distinct_count)

   switch (physical_type) {
     MAKE_STATS(BOOLEAN, BooleanType);
     MAKE_STATS(INT32, Int32Type);
     MAKE_STATS(INT64, Int64Type);
     MAKE_STATS(FLOAT, FloatType);
     MAKE_STATS(DOUBLE, DoubleType);
     MAKE_STATS(BYTE_ARRAY, ByteArrayType);
     MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
     default:
       break;
   }
 #undef MAKE_STATS
   DCHECK(false) << "Cannot reach here";
   return nullptr;
 }

 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
                                              const EncodedStatistics* encoded_stats,
                                              int64_t num_values,
                                              ::arrow::MemoryPool* pool) {
   DCHECK(encoded_stats != nullptr);
   return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values,
               encoded_stats->null_count, encoded_stats->distinct_count,
               encoded_stats->has_min && encoded_stats->has_max,
               encoded_stats->has_null_count, encoded_stats->has_distinct_count,
               encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool);
 }

 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
                                              const std::string& encoded_min,
                                              const std::string& encoded_max,
                                              int64_t num_values, int64_t null_count,
                                              int64_t distinct_count, bool has_min_max,
                                              bool has_null_count, bool has_distinct_count,
                                              ::arrow::MemoryPool* pool) {
   return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
                           distinct_count, has_min_max, has_null_count, has_distinct_count,
                           /*is_min_value_exact=*/std::nullopt,
                           /*is_max_value_exact=*/std::nullopt, pool);
 }

 std::shared_ptr<Statistics> Statistics::Make(
     const ColumnDescriptor* descr, const std::string& encoded_min,
     const std::string& encoded_max, int64_t num_values, int64_t null_count,
     int64_t distinct_count, bool has_min_max, bool has_null_count,
     bool has_distinct_count, std::optional<bool> is_min_value_exact,
     std::optional<bool> is_max_value_exact, ::arrow::MemoryPool* pool) {
 #define MAKE_STATS(CAP_TYPE, KLASS)                                              \
   case Type::CAP_TYPE:                                                           \
     return std::make_shared<TypedStatisticsImpl<KLASS>>(                         \
         descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
         has_min_max, has_null_count, has_distinct_count, is_min_value_exact,     \
         is_max_value_exact, pool)

   switch (descr->physical_type()) {
     MAKE_STATS(BOOLEAN, BooleanType);
     MAKE_STATS(INT32, Int32Type);
     MAKE_STATS(INT64, Int64Type);
     MAKE_STATS(INT96, Int96Type);
     MAKE_STATS(FLOAT, FloatType);
     MAKE_STATS(DOUBLE, DoubleType);
     MAKE_STATS(BYTE_ARRAY, ByteArrayType);
     MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
     default:
       break;
   }
 #undef MAKE_STATS
   DCHECK(false) << "Cannot reach here";
   return nullptr;
 }

 }  // namespace parquet