blob: 2de9e1d076f6254a56358234d0ff3dd9e3e557b5 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <array>
#include <cmath>
#include <iostream>
#include <limits>
#include <string>
#include <unordered_set>
#include "arrow/util/logging_internal.h"
#include "parquet/platform.h"
namespace parquet::geospatial {
/// \brief Infinity, used to define bounds of empty bounding boxes
constexpr double kInf = std::numeric_limits<double>::infinity();
/// \brief Valid combinations of dimensions allowed by ISO well-known binary
///
/// These values correspond to the 0, 1000, 2000, 3000 component of the WKB integer
/// geometry type (i.e., the value of geometry_type // 1000).
enum class Dimensions {
kXY = 0,
kXYZ = 1,
kXYM = 2,
kXYZM = 3,
kValueMin = 0,
kValueMax = 3
};
/// \brief The supported set of geometry types allowed by ISO well-known binary
///
/// These values correspond to the 1, 2, ..., 7 component of the WKB integer
/// geometry type (i.e., the value of geometry_type % 1000).
enum class GeometryType {
kPoint = 1,
kLinestring = 2,
kPolygon = 3,
kMultiPoint = 4,
kMultiLinestring = 5,
kMultiPolygon = 6,
kGeometryCollection = 7,
kValueMin = 1,
kValueMax = 7
};
/// \brief A collection of intervals representing the encountered ranges of values
/// in each dimension.
///
/// The Parquet specification also supports wraparound bounding boxes in the X
/// dimension; however, this structure assumes min < max always as it is used for
/// the purposes of accumulating this type of bounds.
///
/// This class will ignore any NaN values it visits via UpdateXY[Z[M]](). This is
/// consistent with GEOS and ensures that ranges are accumulated in the same way that
/// other statistics in Parquet are accumulated (e.g., statistics of a double column will
/// return a min/max of all non-NaN values). In WKB specifically, POINT EMPTY is
/// represented by convention as all ordinate values filled with NaN, so this behaviour
/// allows for no special-casing of POINT EMPTY in the WKB reader.
///
/// This class will propagate any NaN values (per dimension) that were explicitly
/// specified via setting mins/maxes directly or by merging another BoundingBox that
/// contained NaN values. This definition ensures that NaN bounds obtained via
/// EncodedGeoStatistics (which may have been written by some other writer that generated
/// NaNs, either on purpose or by accident) are not silently overwritten.
struct PARQUET_EXPORT BoundingBox {
using XY = std::array<double, 2>;
using XYZ = std::array<double, 3>;
using XYM = std::array<double, 3>;
using XYZM = std::array<double, 4>;
BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {}
BoundingBox() : min{kInf, kInf, kInf, kInf}, max{-kInf, -kInf, -kInf, -kInf} {}
BoundingBox(const BoundingBox& other) = default;
BoundingBox& operator=(const BoundingBox&) = default;
/// \brief Update the X and Y bounds to ensure these bounds contain coord
void UpdateXY(::arrow::util::span<const double> coord) {
DCHECK_EQ(coord.size(), 2);
UpdateInternal(coord);
}
/// \brief Update the X, Y, and Z bounds to ensure these bounds contain coord
void UpdateXYZ(::arrow::util::span<const double> coord) {
DCHECK_EQ(coord.size(), 3);
UpdateInternal(coord);
}
/// \brief Update the X, Y, and M bounds to ensure these bounds contain coord
void UpdateXYM(::arrow::util::span<const double> coord) {
DCHECK_EQ(coord.size(), 3);
min[0] = std::min(min[0], coord[0]);
min[1] = std::min(min[1], coord[1]);
min[3] = std::min(min[3], coord[2]);
max[0] = std::max(max[0], coord[0]);
max[1] = std::max(max[1], coord[1]);
max[3] = std::max(max[3], coord[2]);
}
/// \brief Update the X, Y, Z, and M bounds to ensure these bounds contain coord
void UpdateXYZM(::arrow::util::span<const double> coord) {
DCHECK_EQ(coord.size(), 4);
UpdateInternal(coord);
}
/// \brief Reset these bounds to an empty state such that they contain no coordinates
void Reset() {
for (int i = 0; i < 4; i++) {
min[i] = kInf;
max[i] = -kInf;
}
}
/// \brief Update these bounds such they also contain other
void Merge(const BoundingBox& other) {
for (int i = 0; i < 4; i++) {
if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) ||
std::isnan(other.max[i])) {
min[i] = std::numeric_limits<double>::quiet_NaN();
max[i] = std::numeric_limits<double>::quiet_NaN();
} else {
min[i] = std::min(min[i], other.min[i]);
max[i] = std::max(max[i], other.max[i]);
}
}
}
std::string ToString() const;
XYZM min;
XYZM max;
private:
// This works for XY, XYZ, and XYZM
template <typename Coord>
void UpdateInternal(Coord coord) {
for (size_t i = 0; i < coord.size(); i++) {
min[i] = std::min(min[i], coord[i]);
max[i] = std::max(max[i], coord[i]);
}
}
};
inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {
return lhs.min == rhs.min && lhs.max == rhs.max;
}
inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) {
return !(lhs == rhs);
}
inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) {
os << obj.ToString();
return os;
}
class WKBBuffer;
/// \brief Accumulate a BoundingBox and geometry types based on zero or more well-known
/// binary blobs
///
/// Note that this class is NOT appropriate for bounding a GEOGRAPHY,
/// whose bounds are not a function purely of the vertices. Geography bounding
/// is not yet implemented.
class PARQUET_EXPORT WKBGeometryBounder {
public:
/// \brief Accumulate the bounds of a serialized well-known binary geometry
///
/// Throws ParquetException for any parse errors encountered. Bounds for
/// any encountered coordinates are accumulated and the geometry type of
/// the geometry is added to the internal geometry type list.
void MergeGeometry(std::string_view bytes_wkb);
void MergeGeometry(::arrow::util::span<const uint8_t> bytes_wkb);
/// \brief Accumulate the bounds of a previously-calculated BoundingBox
void MergeBox(const BoundingBox& box) { box_.Merge(box); }
/// \brief Accumulate a previously-calculated list of geometry types
void MergeGeometryTypes(::arrow::util::span<const int32_t> geospatial_types) {
geospatial_types_.insert(geospatial_types.begin(), geospatial_types.end());
}
/// \brief Retrieve the accumulated bounds
const BoundingBox& Bounds() const { return box_; }
/// \brief Retrieve the accumulated geometry types
std::vector<int32_t> GeometryTypes() const;
/// \brief Reset the internal bounds and geometry types list to an empty state
void Reset() {
box_.Reset();
geospatial_types_.clear();
}
private:
BoundingBox box_;
std::unordered_set<int32_t> geospatial_types_;
void MergeGeometryInternal(WKBBuffer* src, bool record_wkb_type);
void MergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t n_coords, bool swap);
};
} // namespace parquet::geospatial