benchmarks/test_knn.py - sedona-db - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import json
 import pytest
 from test_bench_base import TestBenchBase
 from sedonadb.testing import SedonaDB, PostGIS, DuckDB


 class TestBenchKNN(TestBenchBase):
     def setup_class(self):
         """Setup test data for KNN benchmarks"""
         self.sedonadb = SedonaDB.create_or_skip()
         self.postgis = PostGIS.create_or_skip()
         self.duckdb = DuckDB.create_or_skip()

         # Create building-like polygons (index side - fewer, larger geometries)
         building_options = {
             "geom_type": "Polygon",
             "target_rows": 2_000,
             "vertices_per_linestring_range": [4, 8],
             "size_range": [0.001, 0.01],
             "seed": 42,
         }

         building_query = f"""
             SELECT
                 geometry as geom,
                 round(random() * 1000) as building_id,
                 'Building_' || cast(round(random() * 1000) as varchar) as name
             FROM sd_random_geometry('{json.dumps(building_options)}')
         """
         building_tab = self.sedonadb.execute_and_collect(building_query)
         self.sedonadb.create_table_arrow("knn_buildings", building_tab)
         self.postgis.create_table_arrow("knn_buildings", building_tab)
         self.duckdb.create_table_arrow("knn_buildings", building_tab)

         # Create trip pickup points (probe side)
         trip_options = {
             "geom_type": "Point",
             "target_rows": 10_000,
             "seed": 43,
         }

         trip_query = f"""
             SELECT
                 geometry as geom,
                 round(random() * 100000) as trip_id
             FROM sd_random_geometry('{json.dumps(trip_options)}')
         """
         trip_tab = self.sedonadb.execute_and_collect(trip_query)
         self.sedonadb.create_table_arrow("knn_trips", trip_tab)
         self.postgis.create_table_arrow("knn_trips", trip_tab)
         self.duckdb.create_table_arrow("knn_trips", trip_tab)

         # Create a smaller test dataset for quick tests
         small_building_query = """
             SELECT * FROM knn_buildings LIMIT 1000
         """
         small_building_tab = self.sedonadb.execute_and_collect(small_building_query)
         self.sedonadb.create_table_arrow("knn_buildings_small", small_building_tab)
         self.postgis.create_table_arrow("knn_buildings_small", small_building_tab)
         self.duckdb.create_table_arrow("knn_buildings_small", small_building_tab)

         small_trip_query = """
             SELECT * FROM knn_trips LIMIT 5000
         """
         small_trip_tab = self.sedonadb.execute_and_collect(small_trip_query)
         self.sedonadb.create_table_arrow("knn_trips_small", small_trip_tab)
         self.postgis.create_table_arrow("knn_trips_small", small_trip_tab)
         self.duckdb.create_table_arrow("knn_trips_small", small_trip_tab)

     @pytest.mark.parametrize("k", [1, 5, 10])
     @pytest.mark.parametrize("engine", [SedonaDB, PostGIS, DuckDB])
     @pytest.mark.parametrize("dataset_size", ["small", "large"])
     def test_knn_performance(self, benchmark, k, engine, dataset_size):
         """Benchmark KNN query performance comparing SedonaDB vs PostGIS"""

         if dataset_size == "small":
             trip_table = "knn_trips_small"
             building_table = "knn_buildings_small"
             trip_limit = 100  # Test with 100 trips
         else:
             trip_table = "knn_trips_small"
             building_table = "knn_buildings"
             trip_limit = 1000

         # Get the appropriate engine instance
         eng = self._get_eng(engine)

         def run_knn_query():
             if engine == SedonaDB:
                 # SedonaDB syntax using ST_KNN function
                 query = f"""
                     WITH trip_sample AS (
                         SELECT trip_id, geom as trip_geom
                         FROM {trip_table}
                         LIMIT {trip_limit}
                     ),
                     building_with_geom AS (
                         SELECT building_id, name, geom as building_geom
                         FROM {building_table}
                     )
                     SELECT
                         t.trip_id,
                         b.building_id,
                         b.name,
                         ST_Distance(t.trip_geom, b.building_geom) as distance
                     FROM trip_sample t
                     JOIN building_with_geom b ON ST_KNN(t.trip_geom, b.building_geom, {k}, FALSE)
                     ORDER BY t.trip_id, distance
                 """
             elif engine == PostGIS:
                 # PostGIS syntax using distance operator and window functions
                 query = f"""
                     WITH trip_sample AS (
                         SELECT trip_id, geom as trip_geom
                         FROM {trip_table}
                         LIMIT {trip_limit}
                     ),
                     building_with_geom AS (
                         SELECT building_id, name, geom as building_geom
                         FROM {building_table}
                     ),
                     ranked_neighbors AS (
                         SELECT
                             t.trip_id,
                             b.building_id,
                             b.name,
                             ST_Distance(t.trip_geom, b.building_geom) as distance,
                             ROW_NUMBER() OVER (PARTITION BY t.trip_id ORDER BY t.trip_geom <-> b.building_geom) as rn
                         FROM trip_sample t
                         CROSS JOIN building_with_geom b
                     )
                     SELECT trip_id, building_id, name, distance
                     FROM ranked_neighbors
                     WHERE rn <= {k}
                     ORDER BY trip_id, distance
                 """
             else:  # DuckDB
                 # DuckDB KNN simulation using spatial joins with distance predicates
                 # Since DuckDB doesn't have native KNN, we use a cross join with distance calculation and ranking
                 query = f"""
                     WITH trip_sample AS (
                         SELECT trip_id, geom as trip_geom
                         FROM {trip_table}
                         LIMIT {trip_limit}
                     ),
                     building_with_geom AS (
                         SELECT building_id, name, geom as building_geom
                         FROM {building_table}
                     ),
                     distances_calculated AS (
                         SELECT
                             t.trip_id,
                             b.building_id,
                             b.name,
                             ST_Distance(t.trip_geom, b.building_geom) as distance
                         FROM trip_sample t
                         CROSS JOIN building_with_geom b
                     ),
                     ranked_neighbors AS (
                         SELECT *,
                             ROW_NUMBER() OVER (PARTITION BY trip_id ORDER BY distance ASC) as rn
                         FROM distances_calculated
                     )
                     SELECT trip_id, building_id, name, distance
                     FROM ranked_neighbors
                     WHERE rn <= {k}
                     ORDER BY trip_id, distance
                 """

             result = eng.execute_and_collect(query)
             return len(result)

         # Run the benchmark
         benchmark(run_knn_query)

     @pytest.mark.parametrize("k", [1, 5, 10, 20])
     @pytest.mark.parametrize("engine", [SedonaDB, PostGIS, DuckDB])
     def test_knn_scalability_by_k(self, benchmark, k, engine):
         """Test how KNN performance scales with increasing k values - SedonaDB vs PostGIS"""

         # Get the appropriate engine instance
         eng = self._get_eng(engine)

         def run_knn_query():
             if engine == SedonaDB:
                 # SedonaDB syntax
                 query = f"""
                     WITH trip_sample AS (
                         SELECT trip_id, geom as trip_geom
                         FROM knn_trips_small
                         LIMIT 50  -- Small sample for k scaling test
                     )
                     SELECT
                         COUNT(*) as result_count
                     FROM trip_sample t
                     JOIN knn_buildings_small b ON ST_KNN(t.trip_geom, b.geom, {k}, FALSE)
                 """
             elif engine == PostGIS:
                 # PostGIS syntax
                 query = f"""
                     WITH trip_sample AS (
                         SELECT trip_id, geom as trip_geom
                         FROM knn_trips_small
                         LIMIT 50
                     ),
                     ranked_neighbors AS (
                         SELECT
                             t.trip_id,
                             ROW_NUMBER() OVER (PARTITION BY t.trip_id ORDER BY t.trip_geom <-> b.geom) as rn
                         FROM trip_sample t
                         CROSS JOIN knn_buildings_small b
                     )
                     SELECT COUNT(*) as result_count
                     FROM ranked_neighbors
                     WHERE rn <= {k}
                 """
             else:  # DuckDB
                 # DuckDB KNN simulation
                 query = f"""
                     WITH trip_sample AS (
                         SELECT trip_id, geom as trip_geom
                         FROM knn_trips_small
                         LIMIT 50
                     ),
                     ranked_neighbors AS (
                         SELECT
                             t.trip_id,
                             ROW_NUMBER() OVER (PARTITION BY t.trip_id ORDER BY ST_Distance(t.trip_geom, b.geom) ASC) as rn
                         FROM trip_sample t
                         CROSS JOIN knn_buildings_small b
                     )
                     SELECT COUNT(*) as result_count
                     FROM ranked_neighbors
                     WHERE rn <= {k}
                 """

             result = eng.execute_and_collect(query)
             return result.to_pandas().iloc[0]["result_count"]

         # Run the benchmark
         benchmark(run_knn_query)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	import json
	import pytest
	from test_bench_base import TestBenchBase
	from sedonadb.testing import SedonaDB, PostGIS, DuckDB


	class TestBenchKNN(TestBenchBase):
	def setup_class(self):
	"""Setup test data for KNN benchmarks"""
	self.sedonadb = SedonaDB.create_or_skip()
	self.postgis = PostGIS.create_or_skip()
	self.duckdb = DuckDB.create_or_skip()

	# Create building-like polygons (index side - fewer, larger geometries)
	building_options = {
	"geom_type": "Polygon",
	"target_rows": 2_000,
	"vertices_per_linestring_range": [4, 8],
	"size_range": [0.001, 0.01],
	"seed": 42,
	}

	building_query = f"""
	SELECT
	geometry as geom,
	round(random() * 1000) as building_id,
	'Building_' \|\| cast(round(random() * 1000) as varchar) as name
	FROM sd_random_geometry('{json.dumps(building_options)}')
	"""
	building_tab = self.sedonadb.execute_and_collect(building_query)
	self.sedonadb.create_table_arrow("knn_buildings", building_tab)
	self.postgis.create_table_arrow("knn_buildings", building_tab)
	self.duckdb.create_table_arrow("knn_buildings", building_tab)

	# Create trip pickup points (probe side)
	trip_options = {
	"geom_type": "Point",
	"target_rows": 10_000,
	"seed": 43,
	}

	trip_query = f"""
	SELECT
	geometry as geom,
	round(random() * 100000) as trip_id
	FROM sd_random_geometry('{json.dumps(trip_options)}')
	"""
	trip_tab = self.sedonadb.execute_and_collect(trip_query)
	self.sedonadb.create_table_arrow("knn_trips", trip_tab)
	self.postgis.create_table_arrow("knn_trips", trip_tab)
	self.duckdb.create_table_arrow("knn_trips", trip_tab)

	# Create a smaller test dataset for quick tests
	small_building_query = """
	SELECT * FROM knn_buildings LIMIT 1000
	"""
	small_building_tab = self.sedonadb.execute_and_collect(small_building_query)
	self.sedonadb.create_table_arrow("knn_buildings_small", small_building_tab)
	self.postgis.create_table_arrow("knn_buildings_small", small_building_tab)
	self.duckdb.create_table_arrow("knn_buildings_small", small_building_tab)

	small_trip_query = """
	SELECT * FROM knn_trips LIMIT 5000
	"""
	small_trip_tab = self.sedonadb.execute_and_collect(small_trip_query)
	self.sedonadb.create_table_arrow("knn_trips_small", small_trip_tab)
	self.postgis.create_table_arrow("knn_trips_small", small_trip_tab)
	self.duckdb.create_table_arrow("knn_trips_small", small_trip_tab)

	@pytest.mark.parametrize("k", [1, 5, 10])
	@pytest.mark.parametrize("engine", [SedonaDB, PostGIS, DuckDB])
	@pytest.mark.parametrize("dataset_size", ["small", "large"])
	def test_knn_performance(self, benchmark, k, engine, dataset_size):
	"""Benchmark KNN query performance comparing SedonaDB vs PostGIS"""

	if dataset_size == "small":
	trip_table = "knn_trips_small"
	building_table = "knn_buildings_small"
	trip_limit = 100 # Test with 100 trips
	else:
	trip_table = "knn_trips_small"
	building_table = "knn_buildings"
	trip_limit = 1000

	# Get the appropriate engine instance
	eng = self._get_eng(engine)

	def run_knn_query():
	if engine == SedonaDB:
	# SedonaDB syntax using ST_KNN function
	query = f"""
	WITH trip_sample AS (
	SELECT trip_id, geom as trip_geom
	FROM {trip_table}
	LIMIT {trip_limit}
	),
	building_with_geom AS (
	SELECT building_id, name, geom as building_geom
	FROM {building_table}
	)
	SELECT
	t.trip_id,
	b.building_id,
	b.name,
	ST_Distance(t.trip_geom, b.building_geom) as distance
	FROM trip_sample t
	JOIN building_with_geom b ON ST_KNN(t.trip_geom, b.building_geom, {k}, FALSE)
	ORDER BY t.trip_id, distance
	"""
	elif engine == PostGIS:
	# PostGIS syntax using distance operator and window functions
	query = f"""
	WITH trip_sample AS (
	SELECT trip_id, geom as trip_geom
	FROM {trip_table}
	LIMIT {trip_limit}
	),
	building_with_geom AS (
	SELECT building_id, name, geom as building_geom
	FROM {building_table}
	),
	ranked_neighbors AS (
	SELECT
	t.trip_id,
	b.building_id,
	b.name,
	ST_Distance(t.trip_geom, b.building_geom) as distance,
	ROW_NUMBER() OVER (PARTITION BY t.trip_id ORDER BY t.trip_geom <-> b.building_geom) as rn
	FROM trip_sample t
	CROSS JOIN building_with_geom b
	)
	SELECT trip_id, building_id, name, distance
	FROM ranked_neighbors
	WHERE rn <= {k}
	ORDER BY trip_id, distance
	"""
	else: # DuckDB
	# DuckDB KNN simulation using spatial joins with distance predicates
	# Since DuckDB doesn't have native KNN, we use a cross join with distance calculation and ranking
	query = f"""
	WITH trip_sample AS (
	SELECT trip_id, geom as trip_geom
	FROM {trip_table}
	LIMIT {trip_limit}
	),
	building_with_geom AS (
	SELECT building_id, name, geom as building_geom
	FROM {building_table}
	),
	distances_calculated AS (
	SELECT
	t.trip_id,
	b.building_id,
	b.name,
	ST_Distance(t.trip_geom, b.building_geom) as distance
	FROM trip_sample t
	CROSS JOIN building_with_geom b
	),
	ranked_neighbors AS (
	SELECT *,
	ROW_NUMBER() OVER (PARTITION BY trip_id ORDER BY distance ASC) as rn
	FROM distances_calculated
	)
	SELECT trip_id, building_id, name, distance
	FROM ranked_neighbors
	WHERE rn <= {k}
	ORDER BY trip_id, distance
	"""

	result = eng.execute_and_collect(query)
	return len(result)

	# Run the benchmark
	benchmark(run_knn_query)

	@pytest.mark.parametrize("k", [1, 5, 10, 20])
	@pytest.mark.parametrize("engine", [SedonaDB, PostGIS, DuckDB])
	def test_knn_scalability_by_k(self, benchmark, k, engine):
	"""Test how KNN performance scales with increasing k values - SedonaDB vs PostGIS"""

	# Get the appropriate engine instance
	eng = self._get_eng(engine)

	def run_knn_query():
	if engine == SedonaDB:
	# SedonaDB syntax
	query = f"""
	WITH trip_sample AS (
	SELECT trip_id, geom as trip_geom
	FROM knn_trips_small
	LIMIT 50 -- Small sample for k scaling test
	)
	SELECT
	COUNT(*) as result_count
	FROM trip_sample t
	JOIN knn_buildings_small b ON ST_KNN(t.trip_geom, b.geom, {k}, FALSE)
	"""
	elif engine == PostGIS:
	# PostGIS syntax
	query = f"""
	WITH trip_sample AS (
	SELECT trip_id, geom as trip_geom
	FROM knn_trips_small
	LIMIT 50
	),
	ranked_neighbors AS (
	SELECT
	t.trip_id,
	ROW_NUMBER() OVER (PARTITION BY t.trip_id ORDER BY t.trip_geom <-> b.geom) as rn
	FROM trip_sample t
	CROSS JOIN knn_buildings_small b
	)
	SELECT COUNT(*) as result_count
	FROM ranked_neighbors
	WHERE rn <= {k}
	"""
	else: # DuckDB
	# DuckDB KNN simulation
	query = f"""
	WITH trip_sample AS (
	SELECT trip_id, geom as trip_geom
	FROM knn_trips_small
	LIMIT 50
	),
	ranked_neighbors AS (
	SELECT
	t.trip_id,
	ROW_NUMBER() OVER (PARTITION BY t.trip_id ORDER BY ST_Distance(t.trip_geom, b.geom) ASC) as rn
	FROM trip_sample t
	CROSS JOIN knn_buildings_small b
	)
	SELECT COUNT(*) as result_count
	FROM ranked_neighbors
	WHERE rn <= {k}
	"""

	result = eng.execute_and_collect(query)
	return result.to_pandas().iloc[0]["result_count"]

	# Run the benchmark
	benchmark(run_knn_query)