// blob: d5c8d6628849d2bfd65006e1706c6b3ea4cd1c1b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "parquet/geospatial/util_internal.h"
#include <sstream>
#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
#include "parquet/exception.h"
namespace parquet::geospatial {
/// \brief Render the bounding box as a multi-line, human-readable string
///
/// The output is a "BoundingBox" header line followed by one line per axis
/// (x, y, z, m), each showing the [min, max] values stored at index 0..3 of
/// the min and max members.
std::string BoundingBox::ToString() const {
  static constexpr const char* kAxisPrefixes[] = {" x: [", " y: [", " z: [", " m: ["};

  std::ostringstream out;
  out << "BoundingBox" << '\n';
  for (int axis = 0; axis < 4; ++axis) {
    out << kAxisPrefixes[axis] << min[axis] << ", " << max[axis] << "]" << '\n';
  }
  return out.str();
}
/// \brief Object to keep track of the low-level consumption of a well-known binary
/// geometry
///
/// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte
/// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t),
/// followed by geometry-specific data. Coordinate sequences are represented by a
/// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates
/// multiplied by the number of dimensions).
///
/// Every read advances the internal cursor and shrinks the number of remaining
/// bytes. Reads that would run past the end of the buffer throw ParquetException.
class WKBBuffer {
 public:
  /// \brief Create an empty buffer (no data, zero bytes remaining)
  WKBBuffer() : data_(nullptr), size_(0) {}

  /// \brief Wrap size bytes starting at data
  ///
  /// The bytes are not copied. The size is stored as a size_t, so callers are
  /// expected to pass a non-negative value.
  WKBBuffer(const uint8_t* data, int64_t size)
      : data_(data), size_(static_cast<size_t>(size)) {}

  /// \brief Consume one byte, throwing ParquetException if none remain
  uint8_t ReadUInt8() { return ReadChecked<uint8_t>(); }

  /// \brief Consume a uint32_t, byte-swapping it when swap is true
  ///
  /// Throws ParquetException if fewer than four bytes remain.
  uint32_t ReadUInt32(bool swap) {
    auto value = ReadChecked<uint32_t>();
    if (swap) {
      return ::arrow::bit_util::ByteSwap(value);
    } else {
      return value;
    }
  }

  /// \brief Consume n_coords coordinates of type Coord, passing each one to visit
  ///
  /// The remaining size is validated once up front for the whole sequence so that
  /// the per-coordinate reads can skip bounds checks. When swap is true every
  /// element of each coordinate is byte-swapped before visit is invoked, which
  /// requires Coord to be iterable with a range-based for loop.
  ///
  /// Throws ParquetException if the sequence does not fit in the remaining bytes.
  template <typename Coord, typename Visit>
  void ReadCoords(uint32_t n_coords, bool swap, Visit&& visit) {
    // Compute the byte count in 64 bits: n_coords comes straight from the (untrusted)
    // input, and where size_t is 32 bits wide the product n_coords * sizeof(Coord)
    // could wrap around, pass the check below and let the unchecked reads run past
    // the end of the buffer.
    const uint64_t total_bytes = static_cast<uint64_t>(n_coords) * sizeof(Coord);
    if (size_ < total_bytes) {
      throw ParquetException("Can't read coordinate sequence of ", total_bytes,
                             " bytes from WKBBuffer with ", size_, " remaining");
    }

    if (swap) {
      Coord coord;
      for (uint32_t i = 0; i < n_coords; i++) {
        coord = ReadUnchecked<Coord>();
        for (auto& c : coord) {
          c = ::arrow::bit_util::ByteSwap(c);
        }

        visit(coord);
      }
    } else {
      for (uint32_t i = 0; i < n_coords; i++) {
        visit(ReadUnchecked<Coord>());
      }
    }
  }

  /// \brief The number of bytes that have not been consumed yet
  size_t size() const { return size_; }

 private:
  // Cursor pointing at the next unread byte
  const uint8_t* data_;
  // Number of unread bytes starting at data_
  size_t size_;

  /// \brief Read a T after verifying that at least sizeof(T) bytes remain
  template <typename T>
  T ReadChecked() {
    if (ARROW_PREDICT_FALSE(size_ < sizeof(T))) {
      throw ParquetException("Can't read ", sizeof(T), " bytes from WKBBuffer with ",
                             size_, " remaining");
    }

    return ReadUnchecked<T>();
  }

  /// \brief Read a T without any bounds check and advance the cursor
  ///
  /// The caller must already have verified that sizeof(T) bytes are available.
  template <typename T>
  T ReadUnchecked() {
    T out = ::arrow::util::SafeLoadAs<T>(data_);
    data_ += sizeof(T);
    size_ -= sizeof(T);
    return out;
  }
};
using GeometryTypeAndDimensions = std::pair<GeometryType, Dimensions>;

namespace {

/// \brief Split a WKB geometry type code into its geometry type and dimensions
///
/// The code is decomposed as (dimensions * 1000 + geometry type); for example,
/// LINESTRING ZM is encoded as 3002. Throws ParquetException when either
/// component falls outside the [kValueMin, kValueMax] range of its enum.
GeometryTypeAndDimensions ParseGeometryType(uint32_t wkb_geometry_type) {
  // WKB geometry type codes are deliberately laid out so that the thousands
  // digit carries the dimensions and the remainder carries the geometry type.
  const uint32_t type_code = wkb_geometry_type % 1000;
  const uint32_t dims_code = wkb_geometry_type / 1000;

  const auto in_range = [](uint32_t value, uint32_t lowest, uint32_t highest) {
    return lowest <= value && value <= highest;
  };

  const bool type_ok =
      in_range(type_code, static_cast<uint32_t>(GeometryType::kValueMin),
               static_cast<uint32_t>(GeometryType::kValueMax));
  const bool dims_ok = in_range(dims_code, static_cast<uint32_t>(Dimensions::kValueMin),
                                static_cast<uint32_t>(Dimensions::kValueMax));

  if (!(type_ok && dims_ok)) {
    throw ParquetException("Invalid WKB geometry type: ", wkb_geometry_type);
  }

  return {static_cast<GeometryType>(type_code), static_cast<Dimensions>(dims_code)};
}

}  // namespace
/// \brief Return the geometry type codes recorded so far, in ascending order
///
/// The result is a copy of the recorded codes; the bounder itself is not modified.
std::vector<int32_t> WKBGeometryBounder::GeometryTypes() const {
  std::vector<int32_t> sorted_codes;
  sorted_codes.assign(geospatial_types_.begin(), geospatial_types_.end());
  std::sort(sorted_codes.begin(), sorted_codes.end());
  return sorted_codes;
}
void WKBGeometryBounder::MergeGeometry(std::string_view bytes_wkb) {
MergeGeometry(::arrow::util::span(reinterpret_cast<const uint8_t*>(bytes_wkb.data()),
bytes_wkb.size()));
}
/// \brief Merge a single WKB-encoded geometry into this bounder
///
/// Parses exactly one top-level geometry from bytes_wkb and records its geometry
/// type code. Throws ParquetException if the bytes cannot be parsed or if any
/// bytes are left over once the geometry has been consumed. The trailing-bytes
/// check runs after parsing, so the bounder may already have been updated by the
/// time the exception is thrown.
void WKBGeometryBounder::MergeGeometry(::arrow::util::span<const uint8_t> bytes_wkb) {
  WKBBuffer src{bytes_wkb.data(), static_cast<int64_t>(bytes_wkb.size())};
  MergeGeometryInternal(&src, /*record_wkb_type=*/true);

  // A well-formed value contains one geometry and nothing else
  if (src.size() != 0) {
    throw ParquetException("Expected zero bytes after consuming WKB but got ",
                           src.size());
  }
}
/// \brief Consume one geometry (including any nested parts) from src
///
/// Reads the endian byte and geometry type code, then the geometry-specific
/// payload, merging every coordinate encountered via MergeSequence. When
/// record_wkb_type is true, the type code of the first (outermost) geometry is
/// added to the set of encountered geometry types; nested parts are never
/// recorded.
///
/// Throws ParquetException (from the buffer reads or from ParseGeometryType) on
/// truncated or invalid input.
void WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool record_wkb_type) {
  // Nested geometries are handled with a counter instead of recursion. Parts are
  // stored back to back right after their parent and each part carries its own
  // endian byte and type code, so reading "the next pending geometry" performs the
  // same reads in the same order as a depth-first recursive descent would. Unlike
  // recursion, the nesting depth of (untrusted) input cannot exhaust the call
  // stack: a nested collection costs only 9 bytes of input per level.
  uint64_t n_pending = 1;
  bool is_outermost = true;

  while (n_pending > 0) {
    --n_pending;

    const uint8_t endian = src->ReadUInt8();
#if ARROW_LITTLE_ENDIAN
    const bool swap = endian != 0x01;
#else
    const bool swap = endian != 0x00;
#endif

    const uint32_t wkb_geometry_type = src->ReadUInt32(swap);
    const auto geometry_type_and_dimensions = ParseGeometryType(wkb_geometry_type);
    const GeometryType geometry_type = geometry_type_and_dimensions.first;
    const Dimensions dimensions = geometry_type_and_dimensions.second;

    // Keep track of geometry types encountered if at the top level
    if (record_wkb_type && is_outermost) {
      geospatial_types_.insert(static_cast<int32_t>(wkb_geometry_type));
    }
    is_outermost = false;

    switch (geometry_type) {
      case GeometryType::kPoint:
        MergeSequence(src, dimensions, 1, swap);
        break;

      case GeometryType::kLinestring: {
        uint32_t n_coords = src->ReadUInt32(swap);
        MergeSequence(src, dimensions, n_coords, swap);
        break;
      }

      case GeometryType::kPolygon: {
        uint32_t n_parts = src->ReadUInt32(swap);
        for (uint32_t i = 0; i < n_parts; i++) {
          uint32_t n_coords = src->ReadUInt32(swap);
          MergeSequence(src, dimensions, n_coords, swap);
        }
        break;
      }

      // These are all encoded the same in WKB, even though this encoding would
      // allow for parts to be of a different geometry type or different dimensions.
      // For the purposes of bounding, this does not cause us problems. The parts
      // are queued rather than recorded because we do not want the child geometry
      // to be added to the geometry_types list (e.g., for a MultiPoint, we only
      // want the code for MultiPoint to be added, not the code for Point).
      case GeometryType::kMultiPoint:
      case GeometryType::kMultiLinestring:
      case GeometryType::kMultiPolygon:
      case GeometryType::kGeometryCollection: {
        uint32_t n_parts = src->ReadUInt32(swap);
        n_pending += n_parts;
        break;
      }
    }
  }
}
/// \brief Consume a coordinate sequence from src and merge it into the bounding box
///
/// Reads n_coords coordinates whose layout is selected by dimensions (XY, XYZ,
/// XYM or XYZM), byte-swapping them when swap is true, and updates box_ with each
/// one. Throws ParquetException for an unrecognized dimensions value or when
/// src does not hold enough bytes.
void WKBGeometryBounder::MergeSequence(WKBBuffer* src, Dimensions dimensions,
                                       uint32_t n_coords, bool swap) {
  if (dimensions == Dimensions::kXY) {
    auto merge_xy = [this](BoundingBox::XY xy) { box_.UpdateXY(xy); };
    src->ReadCoords<BoundingBox::XY>(n_coords, swap, merge_xy);
    return;
  }

  if (dimensions == Dimensions::kXYZ) {
    auto merge_xyz = [this](BoundingBox::XYZ xyz) { box_.UpdateXYZ(xyz); };
    src->ReadCoords<BoundingBox::XYZ>(n_coords, swap, merge_xyz);
    return;
  }

  if (dimensions == Dimensions::kXYM) {
    auto merge_xym = [this](BoundingBox::XYM xym) { box_.UpdateXYM(xym); };
    src->ReadCoords<BoundingBox::XYM>(n_coords, swap, merge_xym);
    return;
  }

  if (dimensions == Dimensions::kXYZM) {
    auto merge_xyzm = [this](BoundingBox::XYZM xyzm) { box_.UpdateXYZM(xyzm); };
    src->ReadCoords<BoundingBox::XYZM>(n_coords, swap, merge_xyzm);
    return;
  }

  throw ParquetException("Unknown dimensions");
}
} // namespace parquet::geospatial