| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| //! Contains definitions for working with Parquet statistics. |
| //! |
//! Though some common methods are available on the enum, use pattern matching to
//! extract the actual min and max values from statistics, as shown below:
| //! |
| //! # Examples |
| //! ```rust |
| //! use parquet::file::statistics::Statistics; |
| //! |
| //! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true); |
| //! assert_eq!(stats.null_count_opt(), Some(3)); |
| //! assert!(stats.is_min_max_deprecated()); |
| //! assert!(stats.min_is_exact()); |
| //! assert!(stats.max_is_exact()); |
| //! |
| //! match stats { |
| //! Statistics::Int32(ref typed) => { |
| //! assert_eq!(typed.min_opt(), Some(&1)); |
| //! assert_eq!(typed.max_opt(), Some(&10)); |
| //! } |
| //! _ => {} |
| //! } |
| //! ``` |
| |
| use std::fmt; |
| |
| use crate::basic::Type; |
| use crate::data_type::private::ParquetValueType; |
| use crate::data_type::*; |
| use crate::errors::{ParquetError, Result}; |
| use crate::file::metadata::thrift_gen::PageStatistics; |
| use crate::util::bit_util::FromBytes; |
| |
| pub(crate) mod private { |
| use super::*; |
| |
| pub trait MakeStatistics { |
| fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics |
| where |
| Self: Sized; |
| } |
| |
| macro_rules! gen_make_statistics { |
| ($value_ty:ty, $stat:ident) => { |
| impl MakeStatistics for $value_ty { |
| fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics |
| where |
| Self: Sized, |
| { |
| Statistics::$stat(statistics) |
| } |
| } |
| }; |
| } |
| |
| gen_make_statistics!(bool, Boolean); |
| gen_make_statistics!(i32, Int32); |
| gen_make_statistics!(i64, Int64); |
| gen_make_statistics!(Int96, Int96); |
| gen_make_statistics!(f32, Float); |
| gen_make_statistics!(f64, Double); |
| gen_make_statistics!(ByteArray, ByteArray); |
| gen_make_statistics!(FixedLenByteArray, FixedLenByteArray); |
| } |
| |
/// Generates an associated constructor `$ctor` on `Statistics` that accepts
/// min/max values of type `$value` and wraps them in `Statistics::$variant`.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
| |
// Forwards the call `$method()` to the typed statistics stored in whichever
// `Statistics` variant `$stats` (a reference) currently holds.
macro_rules! statistics_enum_func {
    ($stats:ident, $method:ident) => {{
        match $stats {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    }};
}
| |
| /// Converts Thrift definition into `Statistics`. |
| pub(crate) fn from_thrift_page_stats( |
| physical_type: Type, |
| thrift_stats: Option<PageStatistics>, |
| ) -> Result<Option<Statistics>> { |
| Ok(match thrift_stats { |
| Some(stats) => { |
| // Number of nulls recorded, when it is not available, we just mark it as 0. |
| // TODO this should be `None` if there is no information about NULLS. |
| // see https://github.com/apache/arrow-rs/pull/6216/files |
| let null_count = stats.null_count.unwrap_or(0); |
| |
| if null_count < 0 { |
| return Err(ParquetError::General(format!( |
| "Statistics null count is negative {null_count}", |
| ))); |
| } |
| |
| // Generic null count. |
| let null_count = Some(null_count as u64); |
| // Generic distinct count (count of distinct values occurring) |
| let distinct_count = stats.distinct_count.map(|value| value as u64); |
| // Whether or not statistics use deprecated min/max fields. |
| let old_format = stats.min_value.is_none() && stats.max_value.is_none(); |
| // Generic min value as bytes. |
| let min = if old_format { |
| stats.min |
| } else { |
| stats.min_value |
| }; |
| // Generic max value as bytes. |
| let max = if old_format { |
| stats.max |
| } else { |
| stats.max_value |
| }; |
| |
| fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> { |
| if let Some(min) = min { |
| if min.len() < len { |
| return Err(ParquetError::General( |
| "Insufficient bytes to parse min statistic".to_string(), |
| )); |
| } |
| } |
| if let Some(max) = max { |
| if max.len() < len { |
| return Err(ParquetError::General( |
| "Insufficient bytes to parse max statistic".to_string(), |
| )); |
| } |
| } |
| Ok(()) |
| } |
| |
| match physical_type { |
| Type::BOOLEAN => check_len(&min, &max, 1), |
| Type::INT32 | Type::FLOAT => check_len(&min, &max, 4), |
| Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8), |
| Type::INT96 => check_len(&min, &max, 12), |
| _ => Ok(()), |
| }?; |
| |
| // Values are encoded using PLAIN encoding definition, except that |
| // variable-length byte arrays do not include a length prefix. |
| // |
| // Instead of using actual decoder, we manually convert values. |
| let res = match physical_type { |
| Type::BOOLEAN => Statistics::boolean( |
| min.map(|data| data[0] != 0), |
| max.map(|data| data[0] != 0), |
| distinct_count, |
| null_count, |
| old_format, |
| ), |
| Type::INT32 => Statistics::int32( |
| min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())), |
| max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())), |
| distinct_count, |
| null_count, |
| old_format, |
| ), |
| Type::INT64 => Statistics::int64( |
| min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())), |
| max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())), |
| distinct_count, |
| null_count, |
| old_format, |
| ), |
| Type::INT96 => { |
| // INT96 statistics may not be correct, because comparison is signed |
| let min = if let Some(data) = min { |
| assert_eq!(data.len(), 12); |
| Some(Int96::try_from_le_slice(&data)?) |
| } else { |
| None |
| }; |
| let max = if let Some(data) = max { |
| assert_eq!(data.len(), 12); |
| Some(Int96::try_from_le_slice(&data)?) |
| } else { |
| None |
| }; |
| Statistics::int96(min, max, distinct_count, null_count, old_format) |
| } |
| Type::FLOAT => Statistics::float( |
| min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), |
| max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), |
| distinct_count, |
| null_count, |
| old_format, |
| ), |
| Type::DOUBLE => Statistics::double( |
| min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), |
| max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), |
| distinct_count, |
| null_count, |
| old_format, |
| ), |
| Type::BYTE_ARRAY => Statistics::ByteArray( |
| ValueStatistics::new( |
| min.map(ByteArray::from), |
| max.map(ByteArray::from), |
| distinct_count, |
| null_count, |
| old_format, |
| ) |
| .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) |
| .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), |
| ), |
| Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray( |
| ValueStatistics::new( |
| min.map(ByteArray::from).map(FixedLenByteArray::from), |
| max.map(ByteArray::from).map(FixedLenByteArray::from), |
| distinct_count, |
| null_count, |
| old_format, |
| ) |
| .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) |
| .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), |
| ), |
| }; |
| |
| Some(res) |
| } |
| None => None, |
| }) |
| } |
| |
| /// Convert Statistics into Thrift definition. |
| pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option<PageStatistics> { |
| let stats = stats?; |
| |
| // record null count if it can fit in i64 |
| let null_count = stats |
| .null_count_opt() |
| .and_then(|value| i64::try_from(value).ok()); |
| |
| // record distinct count if it can fit in i64 |
| let distinct_count = stats |
| .distinct_count_opt() |
| .and_then(|value| i64::try_from(value).ok()); |
| |
| let mut thrift_stats = PageStatistics { |
| max: None, |
| min: None, |
| null_count, |
| distinct_count, |
| max_value: None, |
| min_value: None, |
| is_max_value_exact: None, |
| is_min_value_exact: None, |
| }; |
| |
| // Get min/max if set. |
| let (min, max, min_exact, max_exact) = ( |
| stats.min_bytes_opt().map(|x| x.to_vec()), |
| stats.max_bytes_opt().map(|x| x.to_vec()), |
| Some(stats.min_is_exact()), |
| Some(stats.max_is_exact()), |
| ); |
| if stats.is_min_max_backwards_compatible() { |
| // Copy to deprecated min, max values for compatibility with older readers |
| thrift_stats.min.clone_from(&min); |
| thrift_stats.max.clone_from(&max); |
| } |
| |
| if !stats.is_min_max_deprecated() { |
| thrift_stats.min_value = min; |
| thrift_stats.max_value = max; |
| } |
| |
| thrift_stats.is_min_value_exact = min_exact; |
| thrift_stats.is_max_value_exact = max_exact; |
| |
| Some(thrift_stats) |
| } |
| |
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the thrift
/// `Statistics` structure in a Parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Each variant wraps a [`ValueStatistics`] holding min/max values of the
/// native Rust type for that column type.
///
/// Page level statistics are stored separately, in [ColumnIndexMetaData].
///
/// [ColumnIndexMetaData]: crate::file::page_index::column_index::ColumnIndexMetaData
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for Boolean column (`bool` values)
    Boolean(ValueStatistics<bool>),
    /// Statistics for Int32 column (`i32` values)
    Int32(ValueStatistics<i32>),
    /// Statistics for Int64 column (`i64` values)
    Int64(ValueStatistics<i64>),
    /// Statistics for Int96 column ([`Int96`] values)
    Int96(ValueStatistics<Int96>),
    /// Statistics for Float column (`f32` values)
    Float(ValueStatistics<f32>),
    /// Statistics for Double column (`f64` values)
    Double(ValueStatistics<f64>),
    /// Statistics for ByteArray column ([`ByteArray`] values)
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for FixedLenByteArray column ([`FixedLenByteArray`] values)
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
| |
| impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics { |
| fn from(t: ValueStatistics<T>) -> Self { |
| T::make_statistics(t) |
| } |
| } |
| |
| impl Statistics { |
| /// Creates new statistics for a column type |
| pub fn new<T: ParquetValueType>( |
| min: Option<T>, |
| max: Option<T>, |
| distinct_count: Option<u64>, |
| null_count: Option<u64>, |
| is_deprecated: bool, |
| ) -> Self { |
| Self::from(ValueStatistics::new( |
| min, |
| max, |
| distinct_count, |
| null_count, |
| is_deprecated, |
| )) |
| } |
| |
| statistics_new_func![boolean, Option<bool>, Boolean]; |
| |
| statistics_new_func![int32, Option<i32>, Int32]; |
| |
| statistics_new_func![int64, Option<i64>, Int64]; |
| |
| statistics_new_func![int96, Option<Int96>, Int96]; |
| |
| statistics_new_func![float, Option<f32>, Float]; |
| |
| statistics_new_func![double, Option<f64>, Double]; |
| |
| statistics_new_func![byte_array, Option<ByteArray>, ByteArray]; |
| |
| statistics_new_func![ |
| fixed_len_byte_array, |
| Option<FixedLenByteArray>, |
| FixedLenByteArray |
| ]; |
| |
| /// Returns `true` if statistics have old `min` and `max` fields set. |
| /// This means that the column order is likely to be undefined, which, for old files |
| /// could mean a signed sort order of values. |
| /// |
| /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and |
| /// [`SortOrder`](crate::basic::SortOrder) for more information. |
| pub fn is_min_max_deprecated(&self) -> bool { |
| statistics_enum_func![self, is_min_max_deprecated] |
| } |
| |
| /// Old versions of parquet stored statistics in `min` and `max` fields, ordered |
| /// using signed comparison. This resulted in an undefined ordering for unsigned |
| /// quantities, such as booleans and unsigned integers. |
| /// |
| /// These fields were therefore deprecated in favour of `min_value` and `max_value`, |
| /// which have a type-defined sort order. |
| /// |
| /// However, not all readers have been updated. For backwards compatibility, this method |
| /// returns `true` if the statistics within this have a signed sort order, that is |
| /// compatible with being stored in the deprecated `min` and `max` fields |
| pub fn is_min_max_backwards_compatible(&self) -> bool { |
| statistics_enum_func![self, is_min_max_backwards_compatible] |
| } |
| |
| /// Returns optional value of number of distinct values occurring. |
| /// When it is `None`, the value should be ignored. |
| pub fn distinct_count_opt(&self) -> Option<u64> { |
| statistics_enum_func![self, distinct_count] |
| } |
| |
| /// Returns number of null values for the column, if known. |
| /// Note that this includes all nulls when column is part of the complex type. |
| /// |
| /// Note this API returns Some(0) even if the null count was not present |
| /// in the statistics. |
| /// See <https://github.com/apache/arrow-rs/pull/6216/files> |
| pub fn null_count_opt(&self) -> Option<u64> { |
| statistics_enum_func![self, null_count_opt] |
| } |
| |
| /// Returns `true` if the min value is set, and is an exact min value. |
| pub fn min_is_exact(&self) -> bool { |
| statistics_enum_func![self, min_is_exact] |
| } |
| |
| /// Returns `true` if the max value is set, and is an exact max value. |
| pub fn max_is_exact(&self) -> bool { |
| statistics_enum_func![self, max_is_exact] |
| } |
| |
| /// Returns slice of bytes that represent min value, if min value is known. |
| pub fn min_bytes_opt(&self) -> Option<&[u8]> { |
| statistics_enum_func![self, min_bytes_opt] |
| } |
| |
| /// Returns slice of bytes that represent max value, if max value is known. |
| pub fn max_bytes_opt(&self) -> Option<&[u8]> { |
| statistics_enum_func![self, max_bytes_opt] |
| } |
| |
| /// Returns physical type associated with statistics. |
| pub fn physical_type(&self) -> Type { |
| match self { |
| Statistics::Boolean(_) => Type::BOOLEAN, |
| Statistics::Int32(_) => Type::INT32, |
| Statistics::Int64(_) => Type::INT64, |
| Statistics::Int96(_) => Type::INT96, |
| Statistics::Float(_) => Type::FLOAT, |
| Statistics::Double(_) => Type::DOUBLE, |
| Statistics::ByteArray(_) => Type::BYTE_ARRAY, |
| Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY, |
| } |
| } |
| } |
| |
| impl fmt::Display for Statistics { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match self { |
| Statistics::Boolean(typed) => write!(f, "{typed}"), |
| Statistics::Int32(typed) => write!(f, "{typed}"), |
| Statistics::Int64(typed) => write!(f, "{typed}"), |
| Statistics::Int96(typed) => write!(f, "{typed}"), |
| Statistics::Float(typed) => write!(f, "{typed}"), |
| Statistics::Double(typed) => write!(f, "{typed}"), |
| Statistics::ByteArray(typed) => write!(f, "{typed}"), |
| Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"), |
| } |
| } |
| } |
| |
/// Typed implementation for [`Statistics`].
///
/// Resolves to the [`ValueStatistics`] of the value type associated with the
/// [`DataType`] `T` (its `T` associated type).
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
| |
/// Typed statistics for one column chunk
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    /// Minimum value, or `None` if it is not known.
    min: Option<T>,
    /// Maximum value, or `None` if it is not known.
    max: Option<T>,
    /// Number of distinct values. Distinct count could be omitted in some cases.
    distinct_count: Option<u64>,
    /// Number of null values, or `None` if it is not known.
    null_count: Option<u64>,

    /// Whether the max value is exact, or truncated.
    /// Only meaningful when `max` is set, see [`ValueStatistics::max_is_exact`].
    is_max_value_exact: bool,
    /// Whether the min value is exact, or truncated.
    /// Only meaningful when `min` is set, see [`ValueStatistics::min_is_exact`].
    is_min_value_exact: bool,

    /// If `true` populate the deprecated `min` and `max` fields instead of
    /// `min_value` and `max_value`
    is_min_max_deprecated: bool,

    /// If `true` the statistics are compatible with the deprecated `min` and
    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
    is_min_max_backwards_compatible: bool,
}
| |
| impl<T: ParquetValueType> ValueStatistics<T> { |
| /// Creates new typed statistics. |
| pub fn new( |
| min: Option<T>, |
| max: Option<T>, |
| distinct_count: Option<u64>, |
| null_count: Option<u64>, |
| is_min_max_deprecated: bool, |
| ) -> Self { |
| Self { |
| is_max_value_exact: max.is_some(), |
| is_min_value_exact: min.is_some(), |
| min, |
| max, |
| distinct_count, |
| null_count, |
| is_min_max_deprecated, |
| is_min_max_backwards_compatible: is_min_max_deprecated, |
| } |
| } |
| |
| /// Set whether the stored `min` field represents the exact |
| /// minimum, or just a bound on the minimum value. |
| /// |
| /// see [`Self::min_is_exact`] |
| pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self { |
| Self { |
| is_min_value_exact, |
| ..self |
| } |
| } |
| |
| /// Set whether the stored `max` field represents the exact |
| /// maximum, or just a bound on the maximum value. |
| /// |
| /// see [`Self::max_is_exact`] |
| pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self { |
| Self { |
| is_max_value_exact, |
| ..self |
| } |
| } |
| |
| /// Set whether to write the deprecated `min` and `max` fields |
| /// for compatibility with older parquet writers |
| /// |
| /// This should only be enabled if the field is signed, |
| /// see [`Self::is_min_max_backwards_compatible`] |
| pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self { |
| Self { |
| is_min_max_backwards_compatible: backwards_compatible, |
| ..self |
| } |
| } |
| |
| /// Returns min value of the statistics, if known. |
| pub fn min_opt(&self) -> Option<&T> { |
| self.min.as_ref() |
| } |
| |
| /// Returns max value of the statistics, if known. |
| pub fn max_opt(&self) -> Option<&T> { |
| self.max.as_ref() |
| } |
| |
| /// Returns min value as bytes of the statistics, if min value is known. |
| pub fn min_bytes_opt(&self) -> Option<&[u8]> { |
| self.min_opt().map(AsBytes::as_bytes) |
| } |
| |
| /// Returns max value as bytes of the statistics, if max value is known. |
| pub fn max_bytes_opt(&self) -> Option<&[u8]> { |
| self.max_opt().map(AsBytes::as_bytes) |
| } |
| |
| /// Whether or not min and max values are set. |
| /// Normally both min/max values will be set to `Some(value)` or `None`. |
| pub(crate) fn _internal_has_min_max_set(&self) -> bool { |
| self.min.is_some() && self.max.is_some() |
| } |
| |
| /// Whether or not max value is set, and is an exact value. |
| pub fn max_is_exact(&self) -> bool { |
| self.max.is_some() && self.is_max_value_exact |
| } |
| |
| /// Whether or not min value is set, and is an exact value. |
| pub fn min_is_exact(&self) -> bool { |
| self.min.is_some() && self.is_min_value_exact |
| } |
| |
| /// Returns optional value of number of distinct values occurring. |
| pub fn distinct_count(&self) -> Option<u64> { |
| self.distinct_count |
| } |
| |
| /// Returns null count. |
| pub fn null_count_opt(&self) -> Option<u64> { |
| self.null_count |
| } |
| |
| /// Returns `true` if statistics were created using old min/max fields. |
| fn is_min_max_deprecated(&self) -> bool { |
| self.is_min_max_deprecated |
| } |
| |
| /// Old versions of parquet stored statistics in `min` and `max` fields, ordered |
| /// using signed comparison. This resulted in an undefined ordering for unsigned |
| /// quantities, such as booleans and unsigned integers. |
| /// |
| /// These fields were therefore deprecated in favour of `min_value` and `max_value`, |
| /// which have a type-defined sort order. |
| /// |
| /// However, not all readers have been updated. For backwards compatibility, this method |
| /// returns `true` if the statistics within this have a signed sort order, that is |
| /// compatible with being stored in the deprecated `min` and `max` fields |
| pub fn is_min_max_backwards_compatible(&self) -> bool { |
| self.is_min_max_backwards_compatible |
| } |
| } |
| |
| impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "{{")?; |
| write!(f, "min: ")?; |
| match self.min { |
| Some(ref value) => write!(f, "{value}")?, |
| None => write!(f, "N/A")?, |
| } |
| write!(f, ", max: ")?; |
| match self.max { |
| Some(ref value) => write!(f, "{value}")?, |
| None => write!(f, "N/A")?, |
| } |
| write!(f, ", distinct_count: ")?; |
| match self.distinct_count { |
| Some(value) => write!(f, "{value}")?, |
| None => write!(f, "N/A")?, |
| } |
| write!(f, ", null_count: ")?; |
| match self.null_count { |
| Some(value) => write!(f, "{value}")?, |
| None => write!(f, "N/A")?, |
| } |
| write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?; |
| write!(f, ", max_value_exact: {}", self.is_max_value_exact)?; |
| write!(f, ", min_value_exact: {}", self.is_min_value_exact)?; |
| write!(f, "}}") |
| } |
| } |
| |
| impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!( |
| f, |
| "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \ |
| min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}", |
| self.min, |
| self.max, |
| self.distinct_count, |
| self.null_count, |
| self.is_min_max_deprecated, |
| self.is_min_max_backwards_compatible, |
| self.is_max_value_exact, |
| self.is_min_value_exact |
| ) |
| } |
| } |
| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds Thrift page statistics in which only the null count may be set.
    fn thrift_stats_with_null_count(null_count: Option<i64>) -> PageStatistics {
        PageStatistics {
            max: None,
            min: None,
            null_count,
            distinct_count: None,
            max_value: None,
            min_value: None,
            is_max_value_exact: None,
            is_min_value_exact: None,
        }
    }

    /// Builds boolean statistics with fixed min/max values and the given counts.
    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
        // test is about the counts, so we aren't really testing the min/max values
        let typed = ValueStatistics::new(Some(true), Some(false), distinct_count, null_count, false);
        Statistics::Boolean(typed)
    }

    /// Writes statistics to thrift and reads them back and ensures:
    /// - The statistics are the same
    /// - The statistics written to thrift are the same as the original statistics
    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
        let statistics = make_bool_stats(distinct_count, null_count);

        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
        assert_eq!(
            thrift_stats.distinct_count.map(|c| c as u64),
            distinct_count
        );

        let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
            .unwrap()
            .unwrap();

        if null_count.is_some() {
            assert_eq!(round_tripped, statistics);
            return;
        }

        // TODO: remove this part when we no longer support assuming null_count==None in the
        // thrift means null_count = Some(0)
        assert_ne!(round_tripped, statistics);
        assert!(round_tripped.null_count_opt().is_some());
        assert_eq!(round_tripped.null_count_opt(), Some(0));
        assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
        assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
        assert_eq!(
            round_tripped.distinct_count_opt(),
            statistics.distinct_count_opt()
        );
    }

    #[test]
    fn test_statistics_min_max_bytes() {
        let (low, high) = (-123i32, 234i32);
        let stats = Statistics::int32(Some(low), Some(high), None, Some(1), false);
        assert_eq!(stats.min_bytes_opt(), Some(low.as_bytes()));
        assert_eq!(stats.max_bytes_opt(), Some(high.as_bytes()));

        let stats = Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(1),
            true,
        );
        assert_eq!(stats.min_bytes_opt(), Some(&[1u8, 2, 3][..]));
        assert_eq!(stats.max_bytes_opt(), Some(&[3u8, 4, 5][..]));
    }

    #[test]
    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
    fn test_statistics_negative_null_count() {
        let thrift_stats = thrift_stats_with_null_count(Some(-10));

        from_thrift_page_stats(Type::INT32, Some(thrift_stats)).unwrap();
    }

    #[test]
    fn test_statistics_thrift_none() {
        for physical_type in [Type::INT32, Type::BYTE_ARRAY] {
            assert_eq!(from_thrift_page_stats(physical_type, None).unwrap(), None);
        }
    }

    #[test]
    fn test_statistics_debug() {
        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        let rendered = format!("{stats:?}");
        assert_eq!(
            rendered,
            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
        );

        let stats = Statistics::int32(None, None, None, Some(7), false);
        let rendered = format!("{stats:?}");
        assert_eq!(
            rendered,
            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
        );
    }

    #[test]
    fn test_statistics_display() {
        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            stats.to_string(),
            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let stats = Statistics::int64(None, None, None, Some(7), false);
        assert_eq!(
            stats.to_string(),
            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
             false, max_value_exact: false, min_value_exact: false}"
        );

        let stats = Statistics::int96(
            Some(Int96::from(vec![1, 0, 0])),
            Some(Int96::from(vec![2, 3, 4])),
            None,
            Some(3),
            true,
        );
        assert_eq!(
            stats.to_string(),
            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let typed = ValueStatistics::new(
            Some(ByteArray::from(vec![1u8])),
            Some(ByteArray::from(vec![2u8])),
            Some(5),
            Some(7),
            false,
        )
        .with_max_is_exact(false)
        .with_min_is_exact(false);
        let stats = Statistics::ByteArray(typed);
        assert_eq!(
            stats.to_string(),
            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
        );
    }

    #[test]
    fn test_statistics_partial_eq() {
        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);

        assert_eq!(
            Statistics::int32(Some(12), Some(45), None, Some(11), true),
            expected
        );

        // Changing any single argument must break equality.
        let mismatches = [
            Statistics::int32(Some(11), Some(45), None, Some(11), true),
            Statistics::int32(Some(12), Some(44), None, Some(11), true),
            Statistics::int32(Some(12), Some(45), None, Some(23), true),
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
        ];
        for other in mismatches {
            assert_ne!(other, expected);
        }

        // Statistics of different physical types never compare equal.
        assert_ne!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false),
            Statistics::int64(Some(12), Some(45), None, Some(11), false)
        );

        assert_ne!(
            Statistics::boolean(Some(false), Some(true), None, None, true),
            Statistics::double(Some(1.2), Some(4.5), None, None, true)
        );

        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true
            ),
            Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                None,
                None,
                true,
            )
        );

        // The exactness flags take part in the comparison as well.
        let inexact_max = ValueStatistics::new(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![1, 2, 3])),
            None,
            None,
            true,
        )
        .with_max_is_exact(false);
        assert_ne!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::ByteArray(inexact_max)
        );

        let inexact_min = ValueStatistics::new(
            Some(FixedLenByteArray::from(vec![1, 2, 3])),
            Some(FixedLenByteArray::from(vec![1, 2, 3])),
            None,
            None,
            true,
        )
        .with_min_is_exact(false);
        assert_ne!(
            Statistics::fixed_len_byte_array(
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ),
            Statistics::FixedLenByteArray(inexact_min)
        );
    }

    #[test]
    fn test_statistics_from_thrift() {
        // Converts the statistics to thrift and back, expecting to get the input again.
        fn check_stats(stats: Statistics) {
            let physical_type = stats.physical_type();
            let thrift_stats = page_stats_to_thrift(Some(&stats));
            let decoded = from_thrift_page_stats(physical_type, thrift_stats).unwrap();
            assert_eq!(decoded, Some(stats));
        }

        check_stats(Statistics::boolean(Some(false), Some(true), None, Some(7), true));
        check_stats(Statistics::boolean(Some(false), Some(true), None, Some(7), true));
        check_stats(Statistics::boolean(Some(false), Some(true), None, Some(0), false));
        check_stats(Statistics::boolean(Some(true), Some(true), None, Some(7), true));
        check_stats(Statistics::boolean(Some(false), Some(false), None, Some(7), true));
        check_stats(Statistics::boolean(None, None, None, Some(7), true));

        check_stats(Statistics::int32(Some(-100), Some(500), None, Some(7), true));
        check_stats(Statistics::int32(Some(-100), Some(500), None, Some(0), false));
        check_stats(Statistics::int32(None, None, None, Some(7), true));

        check_stats(Statistics::int64(Some(-100), Some(200), None, Some(7), true));
        check_stats(Statistics::int64(Some(-100), Some(200), None, Some(0), false));
        check_stats(Statistics::int64(None, None, None, Some(7), true));

        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(0), false));
        check_stats(Statistics::float(None, None, None, Some(7), true));

        check_stats(Statistics::double(Some(1.2), Some(3.4), None, Some(7), true));
        check_stats(Statistics::double(Some(1.2), Some(3.4), None, Some(0), false));
        check_stats(Statistics::double(None, None, None, Some(7), true));

        check_stats(Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::byte_array(None, None, None, Some(7), true));

        check_stats(Statistics::fixed_len_byte_array(
            Some(ByteArray::from(vec![1, 2, 3]).into()),
            Some(ByteArray::from(vec![3, 4, 5]).into()),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::fixed_len_byte_array(None, None, None, Some(7), true));
    }

    #[test]
    fn test_count_encoding() {
        // (distinct count, null count) pairs to round trip
        let cases = [
            (None, None),
            (Some(0), Some(0)),
            (Some(100), Some(2000)),
            (Some(1), None),
            (None, Some(1)),
        ];
        for (distinct_count, null_count) in cases {
            statistics_count_test(distinct_count, null_count);
        }
    }

    #[test]
    fn test_count_encoding_distinct_too_large() {
        // statistics are stored using i64, so test trying to store larger values
        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
        assert_eq!(thrift_stats.null_count, Some(100));
    }

    #[test]
    fn test_count_encoding_null_too_large() {
        // statistics are stored using i64, so test trying to store larger values
        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.distinct_count, Some(100));
        assert_eq!(thrift_stats.null_count, None); // can't store u64 max --> null
    }

    #[test]
    fn test_count_decoding_null_invalid() {
        let tstatistics = thrift_stats_with_null_count(Some(-42));
        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Statistics null count is negative -42"
        );
    }
}