cpp/src/parquet/statistics.cc - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <algorithm>
 #include <cmath>
 #include <cstring>
 #include <type_traits>

 #include "arrow/array.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"

 #include "parquet/encoding.h"
 #include "parquet/exception.h"
 #include "parquet/platform.h"
 #include "parquet/schema.h"
 #include "parquet/statistics.h"

 using arrow::default_memory_pool;
 using arrow::MemoryPool;
 using arrow::internal::checked_cast;

 namespace parquet {

 // ----------------------------------------------------------------------
 // Comparator implementations

 // Default "less than" used by the comparators: defers to operator< of the
 // physical value type. Neither is_signed nor type_length is consulted here;
 // the specializations below cover the types that need special handling.
 template <typename DType, bool is_signed>
 struct CompareHelper {
   using T = typename DType::c_type;

   // Returns true when lhs orders strictly before rhs.
   static inline bool Compare(int /*type_length*/, const T& lhs, const T& rhs) {
     return lhs < rhs;
   }
 };

 template <>
 struct CompareHelper<Int96Type, true> {
   static inline bool Compare(int type_length, const Int96& a, const Int96& b) {
     // Only the MSB bit is by Signed comparison
     // For little-endian, this is the last bit of Int96 type
     const int32_t amsb = static_cast<const int32_t>(a.value[2]);
     const int32_t bmsb = static_cast<const int32_t>(b.value[2]);
     if (amsb != bmsb) {
       return (amsb < bmsb);
     } else if (a.value[1] != b.value[1]) {
       return (a.value[1] < b.value[1]);
     }
     return (a.value[0] < b.value[0]);
   }
 };

 template <>
 struct CompareHelper<ByteArrayType, true> {
   static inline bool Compare(int type_length, const ByteArray& a, const ByteArray& b) {
     const int8_t* aptr = reinterpret_cast<const int8_t*>(a.ptr);
     const int8_t* bptr = reinterpret_cast<const int8_t*>(b.ptr);
     return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len);
   }
 };

 template <>
 struct CompareHelper<FLBAType, true> {
   static inline bool Compare(int type_length, const FLBA& a, const FLBA& b) {
     const int8_t* aptr = reinterpret_cast<const int8_t*>(a.ptr);
     const int8_t* bptr = reinterpret_cast<const int8_t*>(b.ptr);
     return std::lexicographical_compare(aptr, aptr + type_length, bptr,
                                         bptr + type_length);
   }
 };

 // Unsigned ordering for 32-bit integers: both operands are converted to
 // uint32_t before the comparison, so negative inputs order after positive ones.
 template <>
 struct CompareHelper<Int32Type, false> {
   static inline bool Compare(int /*type_length*/, int32_t a, int32_t b) {
     return static_cast<uint32_t>(a) < static_cast<uint32_t>(b);
   }
 };

 // Unsigned ordering for 64-bit integers: both operands are converted to
 // uint64_t before the comparison, so negative inputs order after positive ones.
 template <>
 struct CompareHelper<Int64Type, false> {
   static inline bool Compare(int /*type_length*/, int64_t a, int64_t b) {
     return static_cast<uint64_t>(a) < static_cast<uint64_t>(b);
   }
 };

 template <>
 struct CompareHelper<Int96Type, false> {
   static inline bool Compare(int type_length, const Int96& a, const Int96& b) {
     if (a.value[2] != b.value[2]) {
       return (a.value[2] < b.value[2]);
     } else if (a.value[1] != b.value[1]) {
       return (a.value[1] < b.value[1]);
     }
     return (a.value[0] < b.value[0]);
   }
 };

 template <>
 struct CompareHelper<ByteArrayType, false> {
   static inline bool Compare(int type_length, const ByteArray& a, const ByteArray& b) {
     const uint8_t* aptr = reinterpret_cast<const uint8_t*>(a.ptr);
     const uint8_t* bptr = reinterpret_cast<const uint8_t*>(b.ptr);
     return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len);
   }
 };

 template <>
 struct CompareHelper<FLBAType, false> {
   static inline bool Compare(int type_length, const FLBA& a, const FLBA& b) {
     const uint8_t* aptr = reinterpret_cast<const uint8_t*>(a.ptr);
     const uint8_t* bptr = reinterpret_cast<const uint8_t*>(b.ptr);
     return std::lexicographical_compare(aptr, aptr + type_length, bptr,
                                         bptr + type_length);
   }
 };

 // Post-processes a freshly computed min or max before it is handed back to
 // the caller. The generic version returns the value unchanged.
 template <typename T>
 T CleanStatistic(T val) {
   return val;
 }

 // ARROW-5562: report positive zero when the statistic is a (signed) zero.
 //
 // Only exact zeros are rewritten. Small non-zero magnitudes are legitimate
 // data and must be preserved: rounding them to zero could store a maximum
 // that lies below, or a minimum that lies above, values actually present in
 // the column. NaN compares unequal to zero and is therefore returned as is.
 template <>
 float CleanStatistic(float val) {
   return val == 0.0f ? 0.0f : val;
 }

 // Double-precision counterpart of the float overload: -0.0 becomes +0.0 and
 // every other value (including NaN and tiny non-zero values) is unchanged.
 template <>
 double CleanStatistic(double val) {
   return val == 0.0 ? 0.0 : val;
 }

 // TypedComparator implementation whose ordering is supplied by
 // CompareHelper<DType, is_signed>.
 template <bool is_signed, typename DType>
 class TypedComparatorImpl : virtual public TypedComparator<DType> {
  public:
   using T = typename DType::c_type;

   // type_length is passed to CompareHelper on every comparison and defaults
   // to -1 when the caller does not supply one.
   explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}

   // Non-virtual comparison, usable directly in tight loops.
   // Returns true when a orders strictly before b.
   bool CompareInline(const T& a, const T& b) const {
     return CompareHelper<DType, is_signed>::Compare(type_length_, a, b);
   }

   bool Compare(const T& a, const T& b) override { return CompareInline(a, b); }

   // Scans values[0, length) and writes the smallest and largest element
   // (after CleanStatistic) to out_min / out_max. values[0] is read
   // unconditionally, so the caller must pass at least one element.
   void GetMinMax(const T* values, int64_t length, T* out_min, T* out_max) override {
     T lowest = values[0];
     T highest = values[0];
     for (int64_t pos = 1; pos < length; ++pos) {
       Widen(values[pos], &lowest, &highest);
     }
     *out_min = CleanStatistic<T>(lowest);
     *out_max = CleanStatistic<T>(highest);
   }

   // Same as GetMinMax, but only slots whose validity bit is set take part.
   // The validity bits start at bit valid_bits_offset of valid_bits. The search
   // for the first valid slot is unbounded, so at least one slot must be valid.
   void GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits,
                        int64_t valid_bits_offset, T* out_min, T* out_max) override {
     ::arrow::internal::BitmapReader validity(valid_bits, valid_bits_offset, length);

     // Advance to the first valid slot; it seeds both extremes.
     int64_t pos = 0;
     for (; !validity.IsSet(); validity.Next()) {
       ++pos;
     }

     T lowest = values[pos];
     T highest = values[pos];
     validity.Next();
     for (++pos; pos < length; ++pos, validity.Next()) {
       if (validity.IsSet()) {
         Widen(values[pos], &lowest, &highest);
       }
     }
     *out_min = CleanStatistic<T>(lowest);
     *out_max = CleanStatistic<T>(highest);
   }

   void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) override;

  private:
   // Extends the running [*lowest, *highest] range so it includes candidate.
   // A candidate that is neither below *lowest nor above *highest is ignored.
   void Widen(const T& candidate, T* lowest, T* highest) const {
     if (CompareInline(candidate, *lowest)) {
       *lowest = candidate;
     } else if (CompareInline(*highest, candidate)) {
       *highest = candidate;
     }
   }

   int type_length_;
 };

 // Generic Arrow-array overload: no implementation exists for this physical
 // type, so the array's type name is reported through ParquetException::NYI.
 // The output pointers are never written.
 template <bool is_signed, typename DType>
 void TypedComparatorImpl<is_signed, DType>::GetMinMax(
     const ::arrow::Array& values, typename DType::c_type* /*out_min*/,
     typename DType::c_type* /*out_max*/) {
   const auto type_name = values.type()->ToString();
   ParquetException::NYI(type_name);
 }

 template <bool is_signed>
 void GetMinMaxBinaryHelper(
     const TypedComparatorImpl<is_signed, ByteArrayType>& comparator,
     const ::arrow::Array& values, ByteArray* out_min, ByteArray* out_max) {
   const auto& data = checked_cast<const ::arrow::BinaryArray&>(values);

   ByteArray min, max;
   if (data.null_count() > 0) {
     ::arrow::internal::BitmapReader valid_bits_reader(data.null_bitmap_data(),
                                                       data.offset(), data.length());

     int64_t first_non_null = 0;
     while (!valid_bits_reader.IsSet()) {
       ++first_non_null;
       valid_bits_reader.Next();
     }
     min = data.GetView(first_non_null);
     max = data.GetView(first_non_null);
     for (int64_t i = first_non_null; i < data.length(); i++) {
       ByteArray val = data.GetView(i);
       if (valid_bits_reader.IsSet()) {
         if (comparator.CompareInline(val, min)) {
           min = val;
         } else if (comparator.CompareInline(max, val)) {
           max = val;
         }
       }
       valid_bits_reader.Next();
     }
   } else {
     min = data.GetView(0);
     max = data.GetView(0);
     for (int64_t i = 0; i < data.length(); i++) {
       ByteArray val = data.GetView(i);
       if (comparator.CompareInline(val, min)) {
         min = val;
       } else if (comparator.CompareInline(max, val)) {
         max = val;
       }
     }
   }
   *out_min = min;
   *out_max = max;
 }

 // Arrow-array min/max for BYTE_ARRAY with signed ordering: forwards to
 // GetMinMaxBinaryHelper with this comparator.
 template <>
 void TypedComparatorImpl<true, ByteArrayType>::GetMinMax(const ::arrow::Array& values,
                                                          ByteArray* out_min,
                                                          ByteArray* out_max) {
   const TypedComparatorImpl<true, ByteArrayType>& self = *this;
   GetMinMaxBinaryHelper<true>(self, values, out_min, out_max);
 }

 // Arrow-array min/max for BYTE_ARRAY with unsigned ordering: forwards to
 // GetMinMaxBinaryHelper with this comparator.
 template <>
 void TypedComparatorImpl<false, ByteArrayType>::GetMinMax(const ::arrow::Array& values,
                                                           ByteArray* out_min,
                                                           ByteArray* out_max) {
   const TypedComparatorImpl<false, ByteArrayType>& self = *this;
   GetMinMaxBinaryHelper<false>(self, values, out_min, out_max);
 }

 // Creates the comparator matching a physical type and sort order.
 //
 // type_length is forwarded only to the FIXED_LEN_BYTE_ARRAY comparators.
 // Throws ParquetException("UNKNOWN Sort Order") when sort_order is neither
 // SIGNED nor UNSIGNED, and reports unsupported type/order combinations through
 // ParquetException::NYI (BOOLEAN, FLOAT and DOUBLE have no unsigned comparator).
 std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
                                              SortOrder::type sort_order,
                                              int type_length) {
   const bool want_signed = (SortOrder::SIGNED == sort_order);
   const bool want_unsigned = (SortOrder::UNSIGNED == sort_order);
   if (!want_signed && !want_unsigned) {
     throw ParquetException("UNKNOWN Sort Order");
   }

   if (want_signed) {
     switch (physical_type) {
       case Type::BOOLEAN:
         return std::make_shared<TypedComparatorImpl<true, BooleanType>>();
       case Type::INT32:
         return std::make_shared<TypedComparatorImpl<true, Int32Type>>();
       case Type::INT64:
         return std::make_shared<TypedComparatorImpl<true, Int64Type>>();
       case Type::INT96:
         return std::make_shared<TypedComparatorImpl<true, Int96Type>>();
       case Type::FLOAT:
         return std::make_shared<TypedComparatorImpl<true, FloatType>>();
       case Type::DOUBLE:
         return std::make_shared<TypedComparatorImpl<true, DoubleType>>();
       case Type::BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
       case Type::FIXED_LEN_BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
       default:
         ParquetException::NYI("Signed Compare not implemented");
     }
   } else {
     switch (physical_type) {
       case Type::INT32:
         return std::make_shared<TypedComparatorImpl<false, Int32Type>>();
       case Type::INT64:
         return std::make_shared<TypedComparatorImpl<false, Int64Type>>();
       case Type::INT96:
         return std::make_shared<TypedComparatorImpl<false, Int96Type>>();
       case Type::BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<false, ByteArrayType>>();
       case Type::FIXED_LEN_BYTE_ARRAY:
         return std::make_shared<TypedComparatorImpl<false, FLBAType>>(type_length);
       default:
         ParquetException::NYI("Unsigned Compare not implemented");
     }
   }
   return nullptr;
 }

 // Convenience overload: takes physical type, sort order and type length from
 // the column descriptor and forwards them to the three-argument Make().
 std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
   const auto physical_type = descr->physical_type();
   const auto sort_order = descr->sort_order();
   const auto type_length = descr->type_length();
   return Make(physical_type, sort_order, type_length);
 }

 // ----------------------------------------------------------------------

 // NaN-related helpers used while computing statistics. This primary template
 // serves value types that cannot hold NaN: nothing is ever skipped and no
 // value is ever reported as NaN.
 template <typename T, typename Enable = void>
 struct StatsHelper {
   // Always false for this template.
   bool CanHaveNaN() { return false; }

   // First index to consider; always the start of the buffer.
   inline int64_t GetValueBeginOffset(const T* /*values*/, int64_t /*count*/) {
     return 0;
   }

   // One past the last index to consider; always `count`.
   inline int64_t GetValueEndOffset(const T* /*values*/, int64_t count) {
     return count;
   }

   // Always false for this template.
   inline bool IsNaN(const T /*value*/) { return false; }
 };

 // Specialization selected for floating-point value types, which can hold NaN.
 template <typename T>
 struct StatsHelper<T, typename std::enable_if<std::is_floating_point<T>::value>::type> {
   bool CanHaveNaN() { return true; }

   // Index of the first non-NaN element of values[0, count), or `count` when
   // there is none.
   inline int64_t GetValueBeginOffset(const T* values, int64_t count) {
     int64_t pos = 0;
     while (pos < count && std::isnan(values[pos])) {
       ++pos;
     }
     return pos < count ? pos : count;
   }

   // One past the index of the last non-NaN element of values[0, count), or 0
   // when there is none.
   inline int64_t GetValueEndOffset(const T* values, int64_t count) {
     int64_t last = count - 1;
     while (last >= 0 && std::isnan(values[last])) {
       --last;
     }
     return last >= 0 ? last + 1 : 0;
   }

   inline bool IsNaN(const T value) { return std::isnan(value); }
 };

 // Stores a NaN in *value when the type can represent one. The generic version
 // leaves *value untouched.
 template <typename T>
 void SetNaN(T* /*value*/) {}

 // float: stores the result of std::nanf("").
 template <>
 void SetNaN<float>(float* value) {
   float& target = *value;
   target = std::nanf("");
 }

 // double: stores the result of std::nan("").
 template <>
 void SetNaN<double>(double* value) {
   double& target = *value;
   target = std::nan("");
 }

 // Concrete TypedStatistics for one physical type. Tracks the value count,
 // null count, distinct count and, once set, the running min/max of a column.
 template <typename DType>
 class TypedStatisticsImpl : public TypedStatistics<DType> {
  public:
   using T = typename DType::c_type;

   // Creates empty statistics for `descr`. The comparator is obtained from
   // Comparator::Make(descr) and all counters start at zero.
   TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
       : descr_(descr),
         pool_(pool),
         min_buffer_(AllocateBuffer(pool_, 0)),
         max_buffer_(AllocateBuffer(pool_, 0)) {
     auto comp = Comparator::Make(descr);
     comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
     Reset();
   }

   // Creates statistics from already-typed min/max values and counts, using the
   // default memory pool. This constructor sets neither a descriptor nor a
   // comparator: descr_ stays null and comparator_ stays empty, so members that
   // dereference them (e.g. physical_type(), or SetMinMax() once min/max is
   // set) must not be used on objects built this way.
   TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
                       int64_t distinct_count)
       : pool_(default_memory_pool()),
         min_buffer_(AllocateBuffer(pool_, 0)),
         max_buffer_(AllocateBuffer(pool_, 0)) {
     IncrementNumValues(num_values);
     IncrementNullCount(null_count);
     IncrementDistinctCount(distinct_count);

     Copy(min, &min_, min_buffer_.get());
     Copy(max, &max_, max_buffer_.get());
     has_min_max_ = true;
   }

   // Creates statistics from plain-encoded min/max strings. An empty string is
   // not decoded and leaves the corresponding member as it was. has_min_max is
   // stored as given, independently of whether anything was decoded.
   TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
                       const std::string& encoded_max, int64_t num_values,
                       int64_t null_count, int64_t distinct_count, bool has_min_max,
                       MemoryPool* pool)
       : TypedStatisticsImpl(descr, pool) {
     IncrementNumValues(num_values);
     IncrementNullCount(null_count);
     IncrementDistinctCount(distinct_count);

     if (!encoded_min.empty()) {
       PlainDecode(encoded_min, &min_);
     }
     if (!encoded_max.empty()) {
       PlainDecode(encoded_max, &max_);
     }
     has_min_max_ = has_min_max;
   }

   bool HasMinMax() const override { return has_min_max_; }

   // Zeroes all counters and marks min/max as unset. The stored min_/max_
   // values themselves are not cleared.
   void Reset() override {
     ResetCounts();
     has_min_max_ = false;
   }

   // Folds [arg_min, arg_max] into the stored range. The first call adopts the
   // arguments as they are; later calls keep the smaller minimum and the larger
   // maximum according to comparator_.
   void SetMinMax(const T& arg_min, const T& arg_max) override {
     if (!has_min_max_) {
       has_min_max_ = true;
       Copy(arg_min, &min_, min_buffer_.get());
       Copy(arg_max, &max_, max_buffer_.get());
     } else {
       Copy(comparator_->Compare(min_, arg_min) ? min_ : arg_min, &min_,
            min_buffer_.get());
       Copy(comparator_->Compare(max_, arg_max) ? arg_max : max_, &max_,
            max_buffer_.get());
     }
   }

   // Adds the counters of `other` to this object and, when `other` has a
   // min/max, folds it in through SetMinMax().
   void Merge(const TypedStatistics<DType>& other) override {
     this->MergeCounts(other);
     if (!other.HasMinMax()) return;
     SetMinMax(other.min(), other.max());
   }

   void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
   void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
                     int64_t num_not_null, int64_t num_null) override;

   // Updates counters and min/max from an Arrow array. Arrays made up only of
   // nulls affect the counters only. Value types that can hold NaN are
   // rejected through ParquetException::NYI.
   void Update(const ::arrow::Array& values) override {
     IncrementNullCount(values.null_count());
     IncrementNumValues(values.length() - values.null_count());

     // TODO: support distinct count?
     if (values.null_count() == values.length()) {
       return;
     }

     StatsHelper<T> helper;
     if (helper.CanHaveNaN()) {
       ParquetException::NYI("No NaN handling for Arrow arrays yet");
     }

     T batch_min, batch_max;
     comparator_->GetMinMax(values, &batch_min, &batch_max);
     SetMinMax(batch_min, batch_max);
   }

   const T& min() const override { return min_; }

   const T& max() const override { return max_; }

   Type::type physical_type() const override { return descr_->physical_type(); }

   const ColumnDescriptor* descr() const override { return descr_; }

   // Plain-encoded minimum, or an empty string when no min/max is set.
   std::string EncodeMin() override {
     std::string s;
     if (HasMinMax()) this->PlainEncode(min_, &s);
     return s;
   }

   // Plain-encoded maximum, or an empty string when no min/max is set.
   std::string EncodeMax() override {
     std::string s;
     if (HasMinMax()) this->PlainEncode(max_, &s);
     return s;
   }

   // Packs min/max (when set) and the null count into an EncodedStatistics.
   // The distinct count is not transferred here.
   EncodedStatistics Encode() override {
     EncodedStatistics s;
     if (HasMinMax()) {
       s.set_min(this->EncodeMin());
       s.set_max(this->EncodeMax());
     }
     s.set_null_count(this->null_count());
     return s;
   }

   int64_t null_count() const override { return statistics_.null_count; }
   int64_t distinct_count() const override { return statistics_.distinct_count; }
   int64_t num_values() const override { return num_values_; }

  private:
   // Null unless set by a constructor that receives a descriptor.
   const ColumnDescriptor* descr_ = nullptr;
   bool has_min_max_ = false;
   T min_;
   T max_;
   ::arrow::MemoryPool* pool_;
   int64_t num_values_ = 0;
   // Holds the null and distinct counts.
   EncodedStatistics statistics_;
   std::shared_ptr<TypedComparator<DType>> comparator_;
   // Backing storage handed to Copy(); the generic Copy() ignores it, while the
   // FLBA and ByteArray specializations copy the bytes into it.
   std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;

   void PlainEncode(const T& src, std::string* dst);
   void PlainDecode(const std::string& src, T* dst);

   // Generic copy: plain assignment, the buffer is unused.
   void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }

   void IncrementNullCount(int64_t n) { statistics_.null_count += n; }

   void IncrementNumValues(int64_t n) { num_values_ += n; }

   void IncrementDistinctCount(int64_t n) { statistics_.distinct_count += n; }

   void MergeCounts(const Statistics& other) {
     this->statistics_.null_count += other.null_count();
     this->statistics_.distinct_count += other.distinct_count();
     this->num_values_ += other.num_values();
   }

   void ResetCounts() {
     this->statistics_.null_count = 0;
     this->statistics_.distinct_count = 0;
     this->num_values_ = 0;
   }
 };

 // FLBA copy: duplicates descr_->type_length() bytes of `src` into `buffer`
 // and points *dst at the buffer's data. Nothing happens when *dst already
 // points at the same bytes as `src`.
 template <>
 inline void TypedStatisticsImpl<FLBAType>::Copy(const FLBA& src, FLBA* dst,
                                                 ResizableBuffer* buffer) {
   const bool already_same = (dst->ptr == src.ptr);
   if (!already_same) {
     const uint32_t num_bytes = descr_->type_length();
     PARQUET_THROW_NOT_OK(buffer->Resize(num_bytes, false));
     std::memcpy(buffer->mutable_data(), src.ptr, num_bytes);
     *dst = FLBA(buffer->data());
   }
 }

 // ByteArray copy: duplicates the src.len bytes of `src` into `buffer` and
 // makes *dst a ByteArray of that length over the buffer's data. Nothing
 // happens when *dst already points at the same bytes as `src`.
 template <>
 inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteArray* dst,
                                                      ResizableBuffer* buffer) {
   const bool already_same = (dst->ptr == src.ptr);
   if (!already_same) {
     PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
     std::memcpy(buffer->mutable_data(), src.ptr, src.len);
     *dst = ByteArray(src.len, buffer->data());
   }
 }

 // Updates the counters and the running min/max from a dense buffer of
 // `num_not_null` values; `num_null` only feeds the null counter.
 template <typename DType>
 void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
                                         int64_t num_null) {
   DCHECK_GE(num_not_null, 0);
   DCHECK_GE(num_null, 0);

   IncrementNullCount(num_null);
   IncrementNumValues(num_not_null);
   // TODO: support distinct count?
   if (num_not_null == 0) {
     return;
   }

   // PARQUET-1225: NaNs at the front or back of the buffer are trimmed off so
   // that the element seeding the min/max search is a real number.
   StatsHelper<T> nan_helper;
   const int64_t first = nan_helper.GetValueBeginOffset(values, num_not_null);
   const int64_t past_last = nan_helper.GetValueEndOffset(values, num_not_null);

   const bool all_nan = nan_helper.CanHaveNaN() && past_last < first;
   if (all_nan) {
     // Expose NaN as min/max but leave has_min_max_ untouched, so that a later
     // batch with real values overwrites them.
     if (!has_min_max_) {
       SetNaN(&min_);
       SetNaN(&max_);
     }
     return;
   }

   T lowest, highest;
   comparator_->GetMinMax(values + first, past_last - first, &lowest, &highest);
   SetMinMax(lowest, highest);
 }

 // Updates the counters and the running min/max from a spaced buffer, i.e.
 // one holding num_not_null + num_null slots whose validity is given by the
 // bits of `valid_bits` starting at bit `valid_bits_offset`.
 template <typename DType>
 void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
                                               int64_t valid_bits_offset,
                                               int64_t num_not_null, int64_t num_null) {
   DCHECK_GE(num_not_null, 0);
   DCHECK_GE(num_null, 0);

   IncrementNullCount(num_null);
   IncrementNumValues(num_not_null);
   // TODO: support distinct count?
   if (num_not_null == 0) return;

   // Find first valid entry and use that for min/max
   // As (num_not_null != 0) there must be one
   int64_t length = num_null + num_not_null;
   int64_t i = 0;
   StatsHelper<T> helper;
   if (helper.CanHaveNaN()) {
     ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
                                                       length);
     // PARQUET-1225: Handle NaNs. Advance i to the first slot that is both
     // valid and not NaN.
     for (; i < length; i++) {
       if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) {
         break;
       }
       valid_bits_reader.Next();
     }

     // No valid non-NaN value was found. Since num_not_null > 0, every valid
     // slot holds NaN. The content of the last slot must not be inspected
     // here: it may be a null slot with arbitrary bits, and mistaking it for a
     // real value would send an empty range to GetMinMaxSpaced() below.
     if (i == length) {
       // Don't set has_min_max flag since
       // these values must be over-written by valid stats later
       if (!has_min_max_) {
         SetNaN(&min_);
         SetNaN(&max_);
       }
       return;
     }
   }

   // Find min and max values from the remaining slots; slot i is valid (and
   // not NaN for floating-point types), so it can seed the search.
   T batch_min, batch_max;
   comparator_->GetMinMaxSpaced(values + i, length - i, valid_bits, valid_bits_offset + i,
                                &batch_min, &batch_max);
   SetMinMax(batch_min, batch_max);
 }

 // Serializes one value with a PLAIN encoder created for descr_ / pool_ and
 // stores the resulting bytes in *dst.
 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) {
   auto plain_encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, pool_);
   plain_encoder->Put(&src, 1);
   auto encoded = plain_encoder->FlushValues();
   dst->assign(reinterpret_cast<const char*>(encoded->data()), encoded->size());
 }

 // Reads one value from the plain-encoded bytes in `src` into *dst, using a
 // PLAIN decoder created for descr_.
 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) {
   const uint8_t* encoded = reinterpret_cast<const uint8_t*>(src.c_str());
   const int encoded_size = static_cast<int>(src.size());

   auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
   plain_decoder->SetData(1, encoded, encoded_size);
   plain_decoder->Decode(dst, 1);
 }

 // ByteArray statistics are stored as the raw bytes: *dst receives exactly
 // src.len bytes starting at src.ptr, with no encoder involved.
 template <>
 void TypedStatisticsImpl<ByteArrayType>::PlainEncode(const T& src, std::string* dst) {
   const char* raw_bytes = reinterpret_cast<const char*>(src.ptr);
   dst->assign(raw_bytes, src.len);
 }

 // ByteArray statistics are decoded as a view: *dst is pointed at the
 // characters held by `src` and no bytes are copied, so *dst is only usable
 // while `src` is alive and unmodified.
 template <>
 void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src, T* dst) {
   const uint8_t* raw_bytes = reinterpret_cast<const uint8_t*>(src.c_str());
   dst->len = static_cast<uint32_t>(src.size());
   dst->ptr = raw_bytes;
 }

 // ----------------------------------------------------------------------
 // Public factory functions

 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
                                              ::arrow::MemoryPool* pool) {
   switch (descr->physical_type()) {
     case Type::BOOLEAN:
       return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
     case Type::INT32:
       return std::make_shared<TypedStatisticsImpl<Int32Type>>(descr, pool);
     case Type::INT64:
       return std::make_shared<TypedStatisticsImpl<Int64Type>>(descr, pool);
     case Type::FLOAT:
       return std::make_shared<TypedStatisticsImpl<FloatType>>(descr, pool);
     case Type::DOUBLE:
       return std::make_shared<TypedStatisticsImpl<DoubleType>>(descr, pool);
     case Type::BYTE_ARRAY:
       return std::make_shared<TypedStatisticsImpl<ByteArrayType>>(descr, pool);
     case Type::FIXED_LEN_BYTE_ARRAY:
       return std::make_shared<TypedStatisticsImpl<FLBAType>>(descr, pool);
     default:
       ParquetException::NYI("Statistics not implemented");
   }
 }

 // Creates statistics from raw pointers to a minimum and a maximum value.
 // `min` and `max` are reinterpreted as pointers to the c_type of the given
 // physical type and dereferenced. Physical types without a case below hit the
 // DCHECK and yield nullptr.
 std::shared_ptr<Statistics> Statistics::Make(Type::type physical_type, const void* min,
                                              const void* max, int64_t num_values,
                                              int64_t null_count, int64_t distinct_count) {
 // Expands to the construction of TypedStatisticsImpl<KLASS> from the typed
 // views of min/max plus the counts.
 #define TYPED_STATS_FROM_POINTERS(KLASS)                                   \
   std::make_shared<TypedStatisticsImpl<KLASS>>(                            \
       *reinterpret_cast<const typename KLASS::c_type*>(min),               \
       *reinterpret_cast<const typename KLASS::c_type*>(max), num_values,   \
       null_count, distinct_count)

   switch (physical_type) {
     case Type::BOOLEAN:
       return TYPED_STATS_FROM_POINTERS(BooleanType);
     case Type::INT32:
       return TYPED_STATS_FROM_POINTERS(Int32Type);
     case Type::INT64:
       return TYPED_STATS_FROM_POINTERS(Int64Type);
     case Type::FLOAT:
       return TYPED_STATS_FROM_POINTERS(FloatType);
     case Type::DOUBLE:
       return TYPED_STATS_FROM_POINTERS(DoubleType);
     case Type::BYTE_ARRAY:
       return TYPED_STATS_FROM_POINTERS(ByteArrayType);
     case Type::FIXED_LEN_BYTE_ARRAY:
       return TYPED_STATS_FROM_POINTERS(FLBAType);
     default:
       break;
   }
 #undef TYPED_STATS_FROM_POINTERS
   DCHECK(false) << "Cannot reach here";
   return nullptr;
 }

 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
                                              const std::string& encoded_min,
                                              const std::string& encoded_max,
                                              int64_t num_values, int64_t null_count,
                                              int64_t distinct_count, bool has_min_max,
                                              ::arrow::MemoryPool* pool) {
 #define MAKE_STATS(CAP_TYPE, KLASS)                                              \
   case Type::CAP_TYPE:                                                           \
     return std::make_shared<TypedStatisticsImpl<KLASS>>(                         \
         descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
         has_min_max, pool)

   switch (descr->physical_type()) {
     MAKE_STATS(BOOLEAN, BooleanType);
     MAKE_STATS(INT32, Int32Type);
     MAKE_STATS(INT64, Int64Type);
     MAKE_STATS(FLOAT, FloatType);
     MAKE_STATS(DOUBLE, DoubleType);
     MAKE_STATS(BYTE_ARRAY, ByteArrayType);
     MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
     default:
       break;
   }
 #undef MAKE_STATS
   DCHECK(false) << "Cannot reach here";
   return nullptr;
 }

 }  // namespace parquet