# blob: b1da22bf373b13eacb7cd431f429e5cb4cecec23 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import json
from sedonadb.testing import (
DuckDB,
PostGIS,
SedonaDB,
DuckDBSingleThread,
PostGISSingleThread,
SedonaDBSingleThread,
)
class TestBenchBase:
    """Shared base for the engine benchmarks.

    ``setup_class`` builds six synthetic tables through SedonaDB and loads
    each of them into the engines under comparison; ``_get_eng`` maps an
    engine class to the instance that was created during setup.
    """

    def setup_class(self):
        """Create the engine instances and populate the benchmark tables.

        Every generated table carries two geometry columns (``geom1`` and
        ``geom2``) and one numeric column named ``integer`` holding
        ``round(random() * 100)``.
        """
        # Default engine instances.
        # NOTE(review): ``create_or_skip`` presumably skips the class when an
        # engine is unavailable -- confirm against sedonadb.testing.
        self.sedonadb = SedonaDB.create_or_skip()
        self.postgis = PostGIS.create_or_skip()
        self.duckdb = DuckDB.create_or_skip()

        # Single-thread counterparts of the engines above.
        self.sedonadb_single = SedonaDBSingleThread.create_or_skip()
        self.postgis_single = PostGISSingleThread.create_or_skip()
        self.duckdb_single = DuckDBSingleThread.create_or_skip()

        num_geoms = 100_000

        # Table name -> generator options shared by both geometry columns.
        table_specs = {
            "points_simple": {
                "geom_type": "Point",
                "target_rows": num_geoms,
            },
            "segments_large": {
                "geom_type": "LineString",
                "target_rows": num_geoms,
                "vertices_per_linestring_range": [2, 10],
            },
            "polygons_simple": {
                "geom_type": "Polygon",
                "target_rows": num_geoms,
                "vertices_per_linestring_range": [10, 10],
            },
            "polygons_complex": {
                "geom_type": "Polygon",
                "target_rows": num_geoms,
                "vertices_per_linestring_range": [500, 500],
            },
            "collections_simple": {
                "geom_type": "GeometryCollection",
                "target_rows": num_geoms,
                "vertices_per_linestring_range": [10, 10],
            },
            "collections_complex": {
                "geom_type": "GeometryCollection",
                "target_rows": num_geoms,
                "vertices_per_linestring_range": [500, 500],
            },
        }

        for table_name, base_options in table_specs.items():
            # Two geometry sets are generated with different seeds and with
            # bounding boxes that only partially overlap, so that spatial
            # predicates see a realistic mix of hits and misses (the intended
            # geom1/geom2 intersection rate is around 2%).

            # geom1: x range 0..80, i.e. shifted towards the left.
            options1 = {
                **base_options,
                "seed": 42,
                "bounds": [0.0, 0.0, 80.0, 100.0],
                "size_range": [1.0, 15.0],
            }
            # geom2: x range 20..100, i.e. shifted towards the right, with
            # the same size range as geom1.
            options2 = {
                **base_options,
                "seed": 43,
                "bounds": [20.0, 0.0, 100.0, 100.0],
                "size_range": [1.0, 15.0],
            }

            # Pair the two generated sets row by row via row_number().
            # The SQL text is kept flush-left so the query string is unchanged.
            query = f"""
WITH geom1_data AS (
SELECT
geometry as geom1,
row_number() OVER () as id
FROM sd_random_geometry('{json.dumps(options1)}')
),
geom2_data AS (
SELECT
geometry as geom2,
row_number() OVER () as id
FROM sd_random_geometry('{json.dumps(options2)}')
)
SELECT
g1.geom1,
g2.geom2,
round(random() * 100) as integer
FROM geom1_data g1
JOIN geom2_data g2 ON g1.id = g2.id
"""
            tab = self.sedonadb.execute_and_collect(query)

            # self.postgis_single is deliberately absent from this list: it
            # shares the same database with self.postgis.
            load_targets = (
                self.sedonadb,
                self.postgis,
                self.duckdb,
                self.sedonadb_single,
                self.duckdb_single,
            )
            for engine in load_targets:
                engine.create_table_arrow(table_name, tab)

    def _get_eng(self, eng):
        """Return the engine instance created in ``setup_class`` for ``eng``.

        Args:
            eng: One of the six engine classes imported from
                ``sedonadb.testing``.

        Returns:
            The instance attribute that corresponds to ``eng``.

        Raises:
            ValueError: If ``eng`` is not one of the supported engine classes.
        """
        # Engine class -> name of the attribute holding its instance. The
        # attribute is read only for the matching entry.
        attr_by_engine = (
            (SedonaDB, "sedonadb"),
            (PostGIS, "postgis"),
            (DuckDB, "duckdb"),
            (SedonaDBSingleThread, "sedonadb_single"),
            (PostGISSingleThread, "postgis_single"),
            (DuckDBSingleThread, "duckdb_single"),
        )
        for engine_cls, attr_name in attr_by_engine:
            if eng == engine_cls:
                return getattr(self, attr_name)
        raise ValueError(f"Unsupported engine: {eng}")