| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <cstdint> |
| #include <memory> |
| #include <optional> |
| |
| #include "parquet/platform.h" |
| #include "parquet/types.h" |
| |
| namespace parquet::geospatial { |
| |
| /// \brief The maximum number of dimensions represented by a geospatial type |
| /// (i.e., X, Y, Z, and M) |
| inline constexpr int kMaxDimensions = 4; |
| |
| /// \brief NaN, used to represent bounds for which predicate pushdown cannnot |
| /// be applied (e.g., because a writer did not provide bounds for a given dimension) |
| inline constexpr double kNaN = std::numeric_limits<double>::quiet_NaN(); |
| |
| /// \brief Structure represented encoded statistics to be written to and read from Parquet |
| /// serialized metadata. |
| /// |
| /// See the Parquet Thrift definition and GeoStatistics for the specific definition |
| /// of field values. |
| struct PARQUET_EXPORT EncodedGeoStatistics { |
| bool xy_bounds_present{false}; |
| double xmin{kNaN}; |
| double xmax{kNaN}; |
| double ymin{kNaN}; |
| double ymax{kNaN}; |
| |
| bool z_bounds_present{false}; |
| double zmin{kNaN}; |
| double zmax{kNaN}; |
| |
| bool m_bounds_present{false}; |
| double mmin{kNaN}; |
| double mmax{kNaN}; |
| |
| bool geospatial_types_present() const { return !geospatial_types.empty(); } |
| std::vector<int32_t> geospatial_types; |
| }; |
| |
| class GeoStatisticsImpl; |
| |
| /// \brief Base type for computing geospatial column statistics while writing a file |
| /// or representing them when reading a file |
| /// |
| /// These statistics track the minimum and maximum value (omitting NaN values) of the |
| /// four possible dimensions (X, Y, Z, and M) and the distinct set of geometry |
| /// type/dimension combinations (e.g., point XY, linestring XYZM) present in the data. |
| /// Any of these individual components may be "invalid": for example, when reading a |
| /// Parquet file, information about individual components obtained from the column |
| /// chunk metadata may have been missing or deemed unusable. Orthogonally, |
| /// any of these individual components may be "empty": for example, when using |
| /// GeoStatistics to accumulate bounds whilst writing, if all geometries in a column chunk |
| /// are null, all ranges (X, Y, Z, and M) will be empty. If all geometries in a column |
| /// chunk contain only XY coordinates (the most common case), the Z and M ranges will |
| /// be empty but the X and Y ranges will contain finite bounds. Empty ranges are |
| /// considered "valid" because they are known to represent exactly zero values (in |
| /// contrast to an invalid range, whose contents is completely unknown). These concepts |
| /// are all necessary for this object to accurately represent (1) accumulated or partially |
| /// accumulated statistics during the writing process and (2) deserialized statistics read |
| /// from the column chunk metadata during the reading process. |
| /// |
| /// EXPERIMENTAL |
| class PARQUET_EXPORT GeoStatistics { |
| public: |
| GeoStatistics(); |
| explicit GeoStatistics(const EncodedGeoStatistics& encoded); |
| |
| ~GeoStatistics(); |
| |
| /// \brief Return true if bounds, geometry types, and validity are identical |
| bool Equals(const GeoStatistics& other) const; |
| |
| /// \brief Update these statistics based on previously calculated or decoded statistics |
| /// |
| /// Merging statistics with wraparound X values is not currently supported. Merging |
| /// two GeoStatistics where one or both has a wraparound X range will result in these |
| /// statistics having an X dimension marked as invalid. |
| void Merge(const GeoStatistics& other); |
| |
| /// \brief Update these statistics based on values |
| void Update(const ByteArray* values, int64_t num_values); |
| |
| /// \brief Update these statistics based on the non-null elements of values |
| void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits, |
| int64_t valid_bits_offset, int64_t num_spaced_values, |
| int64_t num_values); |
| |
| /// \brief Update these statistics based on the non-null elements of values |
| /// |
| /// Currently, BinaryArray and LargeBinaryArray input is supported. |
| void Update(const ::arrow::Array& values); |
| |
| /// \brief Return these statistics to an empty state |
| void Reset(); |
| |
| /// \brief Encode the statistics for serializing to Thrift |
| /// |
| /// If invalid WKB was encountered or if the statistics contain NaN |
| /// for any reason, Encode() will return nullopt to indicate that |
| /// statistics should not be written to thrift. |
| std::optional<EncodedGeoStatistics> Encode() const; |
| |
| /// \brief Returns false if invalid WKB was encountered |
| bool is_valid() const; |
| |
| /// \brief Reset existing statistics and populate them from previously-encoded ones |
| void Decode(const EncodedGeoStatistics& encoded); |
| |
| /// \brief Minimum values in XYZM order |
| /// |
| /// For dimensions where dimension_valid() is false, the value will be NaN. For |
| /// dimensions where dimension_empty() is true, the value will be +Inf. |
| /// |
| /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this |
| /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf, |
| /// xmin]. This implementation does not yet generate these types of bounds but they may |
| /// be encountered in statistics when reading a Parquet file. |
| std::array<double, kMaxDimensions> lower_bound() const; |
| |
| /// \brief Maximum values in XYZM order |
| /// |
| /// For dimensions where dimension_valid() is false, the value will be NaN. For |
| /// dimensions where dimension_empty() is true, the value will be -Inf. |
| /// |
| /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this |
| /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf, |
| /// xmin]. This implementation does not yet generate these types of bounds but they may |
| /// be encountered in statistics when reading a Parquet file. |
| std::array<double, kMaxDimensions> upper_bound() const; |
| |
| /// \brief Dimension emptiness in XYZM order |
| /// |
| /// True for a given dimension if and only if zero non-NaN values were encountered |
| /// in that dimension and dimension_valid() is true for that dimension. |
| /// |
| /// When calculating statistics, zero or more of these values may be true because |
| /// this implementation calculates bounds for all dimensions; however, it may be |
| /// true that zero coordinates were encountered in a given dimension. For example, |
| /// dimension_empty() will return four true values if Update() was not called |
| /// or if Update() was called with only null values. If Update() was provided |
| /// one or more geometries with X and Y dimensions but not Z or M dimensions, |
| /// dimension_empty() will return true, true, false, false. |
| /// |
| /// For statistics read from a Parquet file, dimension_empty() will always contain |
| /// false values because there is no mechanism to communicate an empty interval |
| /// in the Thrift metadata. |
| std::array<bool, kMaxDimensions> dimension_empty() const; |
| |
| /// \brief Dimension validity (i.e. presence) in XYZM order |
| /// |
| /// When calculating statistics, this will always be true because this implementation |
| /// calculates statistics for all dimensions. When reading a Parquet file, one or more |
| /// of these values may be false because the file may not have provided bounds for all |
| /// dimensions. |
| /// |
| /// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the |
| /// canonical values of those outputs for the dimensions where dimension_valid() is |
| /// false. |
| std::array<bool, kMaxDimensions> dimension_valid() const; |
| |
| /// \brief Return the geometry type codes |
| /// |
| /// This implementation always returns sorted output with no duplicates. When |
| /// calculating statistics, a value will always be returned (although the returned |
| /// vector may be empty if Update() was never called or was only called with null |
| /// values). When reading a Parquet file, std::nullopt may be returned because |
| /// the file may not have provided this information. |
| std::optional<std::vector<int32_t>> geometry_types() const; |
| |
| /// \brief Return a string representation of these statistics |
| std::string ToString() const; |
| |
| private: |
| std::unique_ptr<GeoStatisticsImpl> impl_; |
| }; |
| |
| } // namespace parquet::geospatial |