blob: 5d71cb61f29ce041da17290f06ff6983f85f2072 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "arrow/array/builder_binary.h"
#include "arrow/compute/api.h"
#include "arrow/testing/gtest_util.h"
#include "parquet/geospatial/statistics.h"
#include "parquet/test_util.h"
// Positive infinity. The tests in this file expect a dimension that has seen no
// values to report a lower bound of kInf and an upper bound of -kInf.
static constexpr double kInf = std::numeric_limits<double>::infinity();
namespace parquet::geospatial {
// Checks the state of a default-constructed GeoStatistics: valid, every
// dimension empty, an empty (but present) geometry type list, and an encoded
// form that reports nothing as calculated.
TEST(TestGeoStatistics, TestDefaults) {
  GeoStatistics defaults;

  // Geometry types are present but contain no entries
  const auto initial_types = defaults.geometry_types();
  EXPECT_TRUE(initial_types.has_value());
  EXPECT_EQ(initial_types.value().size(), 0);

  // Overall and per-dimension flags
  EXPECT_TRUE(defaults.is_valid());
  EXPECT_THAT(defaults.dimension_empty(), ::testing::ElementsAre(true, true, true, true));
  EXPECT_THAT(defaults.dimension_valid(), ::testing::ElementsAre(true, true, true, true));

  // Empty bounds are represented as [inf, -inf] in every dimension
  const auto lower = defaults.lower_bound();
  const auto upper = defaults.upper_bound();
  for (int dim = 0; dim < kMaxDimensions; ++dim) {
    EXPECT_EQ(lower[dim], kInf);
    EXPECT_EQ(upper[dim], -kInf);
  }

  EXPECT_TRUE(defaults.Equals(GeoStatistics()));
  EXPECT_EQ(defaults.ToString(),
            "<GeoStatistics> x: empty y: empty z: empty m: empty "
            "geometry_types: []");

  // Empty merged with empty is still empty
  defaults.Merge(GeoStatistics());
  EXPECT_TRUE(defaults.Equals(GeoStatistics()));

  // Thrift has no encoding for an empty range, so the encoded form reports
  // the bounds and geometry types as not calculated.
  auto encoded = defaults.Encode();
  EXPECT_FALSE(encoded->xy_bounds_present);
  EXPECT_FALSE(encoded->z_bounds_present);
  EXPECT_FALSE(encoded->m_bounds_present);
  EXPECT_FALSE(encoded->geospatial_types_present());

  // Decoding an "everything uncalculated" encoding gives statistics that are
  // valid as a whole while each individual component is marked invalid.
  GeoStatistics round_tripped;
  round_tripped.Decode(*encoded);
  EXPECT_TRUE(round_tripped.is_valid());
  EXPECT_THAT(round_tripped.dimension_valid(),
              ::testing::ElementsAre(false, false, false, false));
  EXPECT_EQ(round_tripped.geometry_types(), std::nullopt);
}
// Checks that statistics imported from an encoding whose bounds are all NaN
// report every dimension as invalid, that this holds after merging with finite
// statistics in either direction, and that such bounds are omitted on encode.
TEST(TestGeoStatistics, TestImportEncodedWithNaN) {
  double nan_dbl = std::numeric_limits<double>::quiet_NaN();

  // Every bound is flagged as present but carries NaN
  EncodedGeoStatistics encoded_with_nan;
  encoded_with_nan.xy_bounds_present = true;
  encoded_with_nan.xmin = nan_dbl;
  encoded_with_nan.xmax = nan_dbl;
  encoded_with_nan.ymin = nan_dbl;
  encoded_with_nan.ymax = nan_dbl;
  encoded_with_nan.z_bounds_present = true;
  encoded_with_nan.zmin = nan_dbl;
  encoded_with_nan.zmax = nan_dbl;
  encoded_with_nan.m_bounds_present = true;
  encoded_with_nan.mmin = nan_dbl;
  encoded_with_nan.mmax = nan_dbl;

  GeoStatistics stats(encoded_with_nan);
  EXPECT_THAT(stats.dimension_valid(),
              ::testing::ElementsAre(false, false, false, false));
  for (int i = 0; i < kMaxDimensions; i++) {
    EXPECT_TRUE(std::isnan(stats.lower_bound()[i]));
    EXPECT_TRUE(std::isnan(stats.upper_bound()[i]));
  }

  // Ensure that if we merge finite values into nans we get dimensions marked as invalid
  GeoStatistics stats_finite;
  std::string xyzm0 = test::MakeWKBPoint({10, 11, 12, 13}, true, true);
  ByteArray item0{xyzm0};
  stats_finite.Update(&item0, /*num_values=*/1);

  stats.Merge(stats_finite);
  EXPECT_THAT(stats.dimension_valid(),
              ::testing::ElementsAre(false, false, false, false));

  // Ensure that if we merge nans into finite values we also get dimensions marked as
  // invalid. The merge target is stats_finite, so that is the object to inspect
  // (checking stats here would only repeat the assertion above).
  stats_finite.Merge(stats);
  EXPECT_THAT(stats_finite.dimension_valid(),
              ::testing::ElementsAre(false, false, false, false));

  // Ensure that if we encode nans, we mark the ranges as uncalculated
  auto encoded = stats.Encode();
  EXPECT_FALSE(encoded->xy_bounds_present);
  EXPECT_FALSE(encoded->z_bounds_present);
  EXPECT_FALSE(encoded->m_bounds_present);
}
// Exercises Update()/UpdateSpaced() with ByteArray input: bounds and geometry
// types accumulate, encode/decode round-trips, Reset() restores the default
// state, and ingesting invalid WKB permanently invalidates the statistics.
TEST(TestGeoStatistics, TestUpdateByteArray) {
  GeoStatistics stats;

  // Make items with all dimensions to ensure all dimensions are updated
  // and returned properly (POINT XYZM has an integer code of 3001)
  std::string xyzm0 = test::MakeWKBPoint({10, 11, 12, 13}, true, true);
  ByteArray item0{xyzm0};

  stats.Update(&item0, /*num_values=*/1);
  EXPECT_TRUE(stats.is_valid());
  EXPECT_THAT(stats.lower_bound(), ::testing::ElementsAre(10, 11, 12, 13));
  EXPECT_THAT(stats.upper_bound(), ::testing::ElementsAre(10, 11, 12, 13));
  EXPECT_THAT(*stats.geometry_types(), ::testing::ElementsAre(3001));
  EXPECT_THAT(stats.dimension_empty(),
              ::testing::ElementsAre(false, false, false, false));

  // A second point widens the upper bounds only
  std::string xyzm1 = test::MakeWKBPoint({20, 21, 22, 23}, true, true);
  ByteArray item1{xyzm1};

  stats.Update(&item1, /*num_values=*/1);
  EXPECT_TRUE(stats.is_valid());
  EXPECT_THAT(stats.lower_bound(), ::testing::ElementsAre(10, 11, 12, 13));
  EXPECT_THAT(stats.upper_bound(), ::testing::ElementsAre(20, 21, 22, 23));
  EXPECT_THAT(*stats.geometry_types(), ::testing::ElementsAre(3001));

  // Check recreating the statistics with actual values
  auto encoded = stats.Encode();
  EXPECT_TRUE(GeoStatistics(*encoded).Equals(stats));
  EXPECT_EQ(stats.ToString(),
            "<GeoStatistics> x: [10, 20] y: [11, 21] z: [12, 22] m: "
            "[13, 23] geometry_types: [3001]");

  // Check resetting to the original state
  stats.Reset();
  EXPECT_TRUE(stats.Equals(GeoStatistics()));

  // Check UpdateSpaced()

  // A null value that should be skipped
  std::string xyzm2 = test::MakeWKBPoint({-30, -31, -32, -33}, true, true);
  ByteArray item2{xyzm2};

  // A non-null value that shouldn't be skipped
  std::string xyzm3 = test::MakeWKBPoint({30, 31, 32, 33}, true, true);
  ByteArray item3{xyzm3};

  ByteArray items[] = {item0, item1, item2, item3};
  // Validity bitmap with an extra bit on the front to check non-zero bits offset.
  // The bound expectations below require item2 to be the only item treated as null.
  uint8_t validity = 0b00010111;
  GeoStatistics stats_spaced;
  stats_spaced.UpdateSpaced(items, &validity, 1, 4, 4);

  // The object just updated is stats_spaced, so that is the one whose validity
  // matters here (stats was reset above and would pass this check regardless).
  EXPECT_TRUE(stats_spaced.is_valid());
  EXPECT_THAT(stats_spaced.lower_bound(), ::testing::ElementsAre(10, 11, 12, 13));
  EXPECT_THAT(stats_spaced.upper_bound(), ::testing::ElementsAre(30, 31, 32, 33));
  EXPECT_THAT(*stats_spaced.geometry_types(), ::testing::ElementsAre(3001));

  // Check merge
  stats.Merge(stats_spaced);
  EXPECT_TRUE(stats.Equals(stats_spaced));

  // Check ingest of invalid WKB
  ByteArray invalid;
  stats.Update(&invalid, /*num_values=*/1);
  EXPECT_FALSE(stats.is_valid());
  EXPECT_EQ(stats.Encode(), std::nullopt);

  // This should be true even after ingesting more values
  stats.Update(&item0, /*num_values=*/1);
  EXPECT_FALSE(stats.is_valid());
  EXPECT_EQ(stats.Encode(), std::nullopt);
  EXPECT_EQ(stats.ToString(), "<GeoStatistics> invalid");

  // And should cause other statistics to become invalid when merged with them
  stats_spaced.Merge(stats);
  EXPECT_FALSE(stats_spaced.is_valid());
  EXPECT_EQ(stats_spaced.Encode(), std::nullopt);
}
// Ingests an XY point, then an XYZ point, then an XYM point and checks after
// each step which dimensions are populated, which geometry type codes have
// accumulated (1 = POINT, 1001 = POINT Z, 2001 = POINT M), and what survives
// an encode + decode round trip.
TEST(TestGeoStatistics, TestUpdateXYZM) {
  GeoStatistics stats;
  // Trails `stats` by one update so that the two never compare equal
  GeoStatistics stats_not_equal;
  // Reused as the decode target for all three round trips
  GeoStatistics from_encoded;

  // --- Step 1: XY only ---
  std::string wkb_xy = test::MakeWKBPoint({10, 11, 0, 0}, false, false);
  std::string wkb_xyz = test::MakeWKBPoint({10, 11, 12, 0}, true, false);
  std::string wkb_xym = test::MakeWKBPoint({10, 11, 0, 13}, false, true);

  ByteArray point_xy{wkb_xy};
  stats.Update(&point_xy, /*num_values=*/1);
  EXPECT_THAT(stats.lower_bound(), ::testing::ElementsAre(10, 11, kInf, kInf));
  EXPECT_THAT(stats.upper_bound(), ::testing::ElementsAre(10, 11, -kInf, -kInf));
  EXPECT_THAT(*stats.geometry_types(), ::testing::ElementsAre(1));
  EXPECT_THAT(stats.dimension_empty(), ::testing::ElementsAre(false, false, true, true));
  EXPECT_FALSE(stats.Equals(stats_not_equal));
  stats_not_equal.Merge(stats);

  // After encoding + decoding the statistics, the non-empty dimensions must be
  // kept and the previously empty dimensions must now be reported as invalid.
  EncodedGeoStatistics encoded;
  encoded = *stats.Encode();
  EXPECT_TRUE(encoded.xy_bounds_present);
  EXPECT_FALSE(encoded.z_bounds_present);
  EXPECT_FALSE(encoded.m_bounds_present);
  from_encoded.Decode(encoded);
  EXPECT_THAT(from_encoded.dimension_empty(),
              ::testing::ElementsAre(false, false, false, false));
  EXPECT_THAT(from_encoded.dimension_valid(),
              ::testing::ElementsAre(true, true, false, false));

  // --- Step 2: add Z ---
  ByteArray point_xyz{wkb_xyz};
  stats.Update(&point_xyz, /*num_values=*/1);
  EXPECT_THAT(stats.lower_bound(), ::testing::ElementsAre(10, 11, 12, kInf));
  EXPECT_THAT(stats.upper_bound(), ::testing::ElementsAre(10, 11, 12, -kInf));
  EXPECT_THAT(*stats.geometry_types(), ::testing::ElementsAre(1, 1001));
  EXPECT_THAT(stats.dimension_empty(), ::testing::ElementsAre(false, false, false, true));
  EXPECT_FALSE(stats.Equals(stats_not_equal));
  stats_not_equal.Merge(stats);

  encoded = *stats.Encode();
  EXPECT_TRUE(encoded.xy_bounds_present);
  EXPECT_TRUE(encoded.z_bounds_present);
  EXPECT_FALSE(encoded.m_bounds_present);
  from_encoded.Decode(encoded);
  EXPECT_THAT(from_encoded.dimension_empty(),
              ::testing::ElementsAre(false, false, false, false));
  EXPECT_THAT(from_encoded.dimension_valid(),
              ::testing::ElementsAre(true, true, true, false));

  // --- Step 3: add M ---
  ByteArray point_xym{wkb_xym};
  stats.Update(&point_xym, /*num_values=*/1);
  EXPECT_THAT(stats.lower_bound(), ::testing::ElementsAre(10, 11, 12, 13));
  EXPECT_THAT(stats.upper_bound(), ::testing::ElementsAre(10, 11, 12, 13));
  EXPECT_THAT(*stats.geometry_types(), ::testing::ElementsAre(1, 1001, 2001));
  EXPECT_THAT(stats.dimension_empty(),
              ::testing::ElementsAre(false, false, false, false));
  EXPECT_FALSE(stats.Equals(stats_not_equal));

  encoded = *stats.Encode();
  EXPECT_TRUE(encoded.xy_bounds_present);
  EXPECT_TRUE(encoded.z_bounds_present);
  EXPECT_TRUE(encoded.m_bounds_present);
  from_encoded.Decode(encoded);
  EXPECT_THAT(from_encoded.dimension_empty(),
              ::testing::ElementsAre(false, false, false, false));
  EXPECT_THAT(from_encoded.dimension_valid(),
              ::testing::ElementsAre(true, true, true, true));
}
// Feeds the same WKB points to GeoStatistics through a binary array and a
// large binary array and checks that both produce the same XY bounds.
TEST(TestGeoStatistics, TestUpdateArray) {
  // WKB array holding POINT (0 1) ... POINT (14 15), with one null slotted in
  // between POINT (9 10) and POINT (10 11)
  ::arrow::BinaryBuilder wkb_builder;
  for (int idx = 0; idx < 15; ++idx) {
    if (idx == 10) {
      ASSERT_OK(wkb_builder.AppendNull());
    }
    std::string wkb_point = test::MakeWKBPoint(
        {static_cast<double>(idx), static_cast<double>(idx + 1)}, false, false);
    ASSERT_OK(wkb_builder.Append(wkb_point));
  }

  // Produce both a binary array and its large binary counterpart
  ASSERT_OK_AND_ASSIGN(const auto binary_array, wkb_builder.Finish());
  ASSERT_OK_AND_ASSIGN(const auto large_binary_array,
                       ::arrow::compute::Cast(binary_array, ::arrow::large_binary()));

  // Z and M were never supplied, so they stay at the empty [inf, -inf] range
  GeoStatistics stats;
  stats.Update(*binary_array);
  EXPECT_THAT(stats.lower_bound(), ::testing::ElementsAre(0, 1, kInf, kInf));
  EXPECT_THAT(stats.upper_bound(), ::testing::ElementsAre(14, 15, -kInf, -kInf));

  // The large binary path must agree with the binary path
  GeoStatistics stats_large;
  stats_large.Update(*large_binary_array.make_array());
  EXPECT_TRUE(stats_large.Equals(stats));
}
// Checks that updating from an array containing invalid WKB invalidates the
// statistics, and that merging statistics with wraparound X bounds
// (xmin > xmax) with ordinary statistics marks X as invalid in both directions.
TEST(TestGeoStatistics, TestUpdateArrayInvalid) {
  // Build WKB array with invalid WKB (here, an empty string)
  ::arrow::BinaryBuilder builder;
  ASSERT_OK(builder.Append(std::string()));
  ASSERT_OK_AND_ASSIGN(const auto invalid_wkb, builder.Finish());

  // This should result in statistics that are "unset"
  GeoStatistics invalid;
  invalid.Update(*invalid_wkb);
  EXPECT_FALSE(invalid.is_valid());
  EXPECT_EQ(invalid.Encode(), std::nullopt);
  EXPECT_FALSE(invalid.Equals(GeoStatistics()));

  // Make some valid statistics
  EncodedGeoStatistics encoded_valid;
  encoded_valid.xy_bounds_present = true;
  encoded_valid.xmin = 0;
  encoded_valid.xmax = 10;
  encoded_valid.ymin = 20;
  encoded_valid.ymax = 30;
  GeoStatistics valid(encoded_valid);

  // Make some statistics with unsupported wraparound
  EncodedGeoStatistics encoded_unsupported;
  encoded_unsupported.xy_bounds_present = true;
  encoded_unsupported.xmin = 10;
  encoded_unsupported.xmax = 0;
  encoded_unsupported.ymin = 20;
  encoded_unsupported.ymax = 30;
  GeoStatistics unsupported(encoded_unsupported);

  // Check that X values are marked as invalid if merging wraparound
  // and a non-wraparound statistics
  GeoStatistics valid_merge_unsupported(encoded_valid);
  valid_merge_unsupported.Merge(unsupported);
  EXPECT_THAT(valid_merge_unsupported.dimension_valid(),
              ::testing::ElementsAre(false, true, false, false));

  // Same check with the merge direction reversed. The merge target is
  // unsupported_merge_valid, so that is the object to inspect.
  GeoStatistics unsupported_merge_valid(encoded_unsupported);
  unsupported_merge_valid.Merge(valid);
  EXPECT_THAT(unsupported_merge_valid.dimension_valid(),
              ::testing::ElementsAre(false, true, false, false));
}
// Walks Equals() through each way two GeoStatistics can differ (geometry
// types only, bounds only, validity only) and confirms equality in the
// all-empty and all-identical cases.
TEST(TestGeoStatistics, TestEquals) {
  GeoStatistics lhs;
  GeoStatistics rhs;

  // Inputs: a point whose ordinates are all NaN, two distinct finite points,
  // and a default-constructed ByteArray standing in for invalid WKB
  std::string nan_point_wkb = test::MakeWKBPoint({kNaN, kNaN, kNaN, kNaN}, true, true);
  ByteArray nan_point{nan_point_wkb};
  std::string first_point_wkb = test::MakeWKBPoint({10, 11, 12, 13}, true, true);
  ByteArray first_point{first_point_wkb};
  std::string second_point_wkb = test::MakeWKBPoint({14, 15, 16, 17}, true, true);
  ByteArray second_point{second_point_wkb};
  ByteArray invalid_wkb;

  // Case: both sides empty
  EXPECT_TRUE(lhs.Equals(rhs));

  // Case: bounds still empty on both sides, geometry types differ
  rhs.Update(&nan_point, 1);
  EXPECT_EQ(lhs.is_valid(), rhs.is_valid());
  EXPECT_EQ(lhs.upper_bound(), rhs.upper_bound());
  EXPECT_EQ(lhs.lower_bound(), rhs.lower_bound());
  EXPECT_NE(lhs.geometry_types(), rhs.geometry_types());
  EXPECT_FALSE(lhs.Equals(rhs));

  // Back to both sides empty
  rhs.Reset();
  EXPECT_TRUE(lhs.Equals(rhs));

  // Case: same geometry types, different bounds
  lhs.Update(&first_point, 1);
  rhs.Update(&second_point, 1);
  EXPECT_EQ(lhs.is_valid(), rhs.is_valid());
  EXPECT_NE(lhs.upper_bound(), rhs.upper_bound());
  EXPECT_NE(lhs.lower_bound(), rhs.lower_bound());
  EXPECT_EQ(lhs.geometry_types(), rhs.geometry_types());
  EXPECT_FALSE(lhs.Equals(rhs));

  // Back to both sides empty
  lhs.Reset();
  rhs.Reset();
  EXPECT_TRUE(lhs.Equals(rhs));

  // Case: validity is the only difference
  rhs.Update(&invalid_wkb, 1);
  EXPECT_NE(lhs.is_valid(), rhs.is_valid());
  EXPECT_EQ(lhs.upper_bound(), rhs.upper_bound());
  EXPECT_EQ(lhs.lower_bound(), rhs.lower_bound());
  EXPECT_EQ(lhs.geometry_types(), rhs.geometry_types());
  EXPECT_FALSE(lhs.Equals(rhs));

  // Case: identical non-empty statistics
  lhs.Reset();
  rhs.Reset();
  lhs.Update(&first_point, 1);
  rhs.Update(&first_point, 1);
  EXPECT_TRUE(lhs.Equals(rhs));
}
} // namespace parquet::geospatial