python/sedonadb/tests/test_sjoin.py - sedona-db - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import pytest
 import json
 from sedonadb.testing import PostGIS, SedonaDB


 @pytest.mark.parametrize(
     "join_type", ["INNER JOIN", "LEFT OUTER JOIN", "RIGHT OUTER JOIN"]
 )
 @pytest.mark.parametrize(
     "on",
     [
         "ST_Intersects(sjoin_point.geometry, sjoin_polygon.geometry)",
         "ST_Within(sjoin_point.geometry, sjoin_polygon.geometry)",
         "ST_Contains(sjoin_polygon.geometry, sjoin_point.geometry)",
         "ST_DWithin(sjoin_point.geometry, sjoin_polygon.geometry, 1.0)",
         "ST_DWithin(sjoin_point.geometry, sjoin_polygon.geometry, sjoin_point.dist / 100)",
         "ST_DWithin(sjoin_point.geometry, sjoin_polygon.geometry, sjoin_polygon.dist / 100)",
     ],
 )
 def test_spatial_join(join_type, on):
     with (
         SedonaDB.create_or_skip() as eng_sedonadb,
         PostGIS.create_or_skip() as eng_postgis,
     ):
         options = json.dumps(
             {
                 "geom_type": "Point",
                 "polygon_hole_rate": 0.5,
                 "num_parts_range": [2, 10],
                 "vertices_per_linestring_range": [2, 10],
                 "seed": 42,
             }
         )
         df_point = eng_sedonadb.execute_and_collect(
             f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
         )
         options = json.dumps(
             {
                 "geom_type": "Polygon",
                 "polygon_hole_rate": 0.5,
                 "num_parts_range": [2, 10],
                 "vertices_per_linestring_range": [2, 10],
                 "seed": 43,
             }
         )
         df_polygon = eng_sedonadb.execute_and_collect(
             f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
         )
         eng_sedonadb.create_table_arrow("sjoin_point", df_point)
         eng_sedonadb.create_table_arrow("sjoin_polygon", df_polygon)
         eng_postgis.create_table_arrow("sjoin_point", df_point)
         eng_postgis.create_table_arrow("sjoin_polygon", df_polygon)

         sql = f"""
                SELECT sjoin_point.id id0, sjoin_polygon.id id1
                FROM sjoin_point {join_type} sjoin_polygon
                ON {on}
                ORDER BY id0, id1
                """

         sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
         assert len(sedonadb_results) > 0
         eng_postgis.assert_query_result(sql, sedonadb_results)


 @pytest.mark.parametrize(
     "join_type", ["INNER JOIN", "LEFT OUTER JOIN", "RIGHT OUTER JOIN"]
 )
 @pytest.mark.parametrize(
     "on",
     [
         "ST_Intersects(sjoin_geog1.geog, sjoin_geog2.geog)",
         "ST_Distance(sjoin_geog1.geog, sjoin_geog2.geog) < 100000",
     ],
 )
 def test_spatial_join_geography(join_type, on):
     with (
         SedonaDB.create_or_skip() as eng_sedonadb,
         PostGIS.create_or_skip() as eng_postgis,
     ):
         # Select two sets of bounding boxes that cross the antimeridian,
         # which would be disjoint on a Euclidean plane. A geography join will produce non-empty results,
         # whereas a geometry join would not.
         west_most_bound = [-190, -10, -170, 10]
         east_most_bound = [170, -10, 190, 10]
         options = json.dumps(
             {
                 "geom_type": "Point",
                 "num_parts_range": [2, 10],
                 "vertices_per_linestring_range": [2, 10],
                 "bounds": west_most_bound,
                 "size_range": [0.1, 5],
                 "seed": 43,
             }
         )
         df_point = eng_sedonadb.execute_and_collect(
             f"SELECT id, ST_SetSRID(ST_GeogFromWKB(ST_AsBinary(geometry)), 4326) geog, dist FROM sd_random_geometry('{options}') LIMIT 100"
         )
         options = json.dumps(
             {
                 "geom_type": "Polygon",
                 "polygon_hole_rate": 0.5,
                 "num_parts_range": [2, 10],
                 "vertices_per_linestring_range": [2, 10],
                 "bounds": east_most_bound,
                 "size_range": [0.1, 5],
                 "seed": 44,
             }
         )
         df_polygon = eng_sedonadb.execute_and_collect(
             f"SELECT id, ST_SetSRID(ST_GeogFromWKB(ST_AsBinary(geometry)), 4326) geog, dist FROM sd_random_geometry('{options}') LIMIT 100"
         )
         eng_sedonadb.create_table_arrow("sjoin_geog1", df_point)
         eng_sedonadb.create_table_arrow("sjoin_geog2", df_polygon)
         eng_postgis.create_table_arrow("sjoin_geog1", df_point)
         eng_postgis.create_table_arrow("sjoin_geog2", df_polygon)

         sql = f"""
                SELECT sjoin_geog1.id id0, sjoin_geog2.id id1
                FROM sjoin_geog1 {join_type} sjoin_geog2
                ON {on}
                ORDER BY id0, id1
                """

         sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
         eng_postgis.assert_query_result(sql, sedonadb_results)


 def test_query_window_in_subquery():
     with (
         SedonaDB.create_or_skip() as eng_sedonadb,
         PostGIS.create_or_skip() as eng_postgis,
     ):
         options = json.dumps(
             {
                 "geom_type": "Point",
                 "seed": 42,
             }
         )
         df_point = eng_sedonadb.execute_and_collect(
             f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
         )
         options = json.dumps(
             {
                 "geom_type": "Polygon",
                 "polygon_hole_rate": 0.5,
                 "num_parts_range": [2, 10],
                 "vertices_per_linestring_range": [2, 10],
                 "size_range": [50, 60],
                 "seed": 43,
             }
         )
         df_polygon = eng_sedonadb.execute_and_collect(
             f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
         )
         eng_sedonadb.create_table_arrow("sjoin_point", df_point)
         eng_sedonadb.create_table_arrow("sjoin_polygon", df_polygon)
         eng_postgis.create_table_arrow("sjoin_point", df_point)
         eng_postgis.create_table_arrow("sjoin_polygon", df_polygon)

         # This should be optimized to a spatial join
         sql = """
                SELECT id FROM sjoin_point AS L
                WHERE ST_Intersects(L.geometry, (SELECT R.geometry FROM sjoin_polygon AS R WHERE R.id = 1))
                ORDER BY id
                """

         # Verify that the physical query plan should contain a SpatialJoinExec
         query_plan = eng_sedonadb.execute_and_collect(f"EXPLAIN {sql}").to_pandas()
         assert "SpatialJoinExec" in query_plan.iloc[1, 1]

         sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
         assert len(sedonadb_results) > 0
         eng_postgis.assert_query_result(sql, sedonadb_results)


 def test_non_optimizable_subquery():
     with (
         SedonaDB.create_or_skip() as eng_sedonadb,
         PostGIS.create_or_skip() as eng_postgis,
     ):
         options = json.dumps(
             {
                 "geom_type": "Point",
                 "seed": 42,
             }
         )
         df_main = eng_sedonadb.execute_and_collect(
             f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
         )
         options = json.dumps(
             {
                 "geom_type": "Point",
                 "seed": 43,
             }
         )
         df_subquery = eng_sedonadb.execute_and_collect(
             f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
         )
         eng_sedonadb.create_table_arrow("sjoin_main", df_main)
         eng_sedonadb.create_table_arrow("sjoin_subquery", df_subquery)
         eng_postgis.create_table_arrow("sjoin_main", df_main)
         eng_postgis.create_table_arrow("sjoin_subquery", df_subquery)

         # This cannot be optimized to a spatial join, but the query result should still be correct
         sql = """
                SELECT id FROM sjoin_main AS L
                WHERE ST_DWithin(L.geometry, ST_Point(10, 10), (SELECT R.dist FROM sjoin_subquery AS R WHERE R.id = 1))
                ORDER BY id
                """
         sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
         assert len(sedonadb_results) > 0
         eng_postgis.assert_query_result(sql, sedonadb_results)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	import pytest
	import json
	from sedonadb.testing import PostGIS, SedonaDB


	@pytest.mark.parametrize(
	"join_type", ["INNER JOIN", "LEFT OUTER JOIN", "RIGHT OUTER JOIN"]
	)
	@pytest.mark.parametrize(
	"on",
	[
	"ST_Intersects(sjoin_point.geometry, sjoin_polygon.geometry)",
	"ST_Within(sjoin_point.geometry, sjoin_polygon.geometry)",
	"ST_Contains(sjoin_polygon.geometry, sjoin_point.geometry)",
	"ST_DWithin(sjoin_point.geometry, sjoin_polygon.geometry, 1.0)",
	"ST_DWithin(sjoin_point.geometry, sjoin_polygon.geometry, sjoin_point.dist / 100)",
	"ST_DWithin(sjoin_point.geometry, sjoin_polygon.geometry, sjoin_polygon.dist / 100)",
	],
	)
	def test_spatial_join(join_type, on):
	with (
	SedonaDB.create_or_skip() as eng_sedonadb,
	PostGIS.create_or_skip() as eng_postgis,
	):
	options = json.dumps(
	{
	"geom_type": "Point",
	"polygon_hole_rate": 0.5,
	"num_parts_range": [2, 10],
	"vertices_per_linestring_range": [2, 10],
	"seed": 42,
	}
	)
	df_point = eng_sedonadb.execute_and_collect(
	f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
	)
	options = json.dumps(
	{
	"geom_type": "Polygon",
	"polygon_hole_rate": 0.5,
	"num_parts_range": [2, 10],
	"vertices_per_linestring_range": [2, 10],
	"seed": 43,
	}
	)
	df_polygon = eng_sedonadb.execute_and_collect(
	f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
	)
	eng_sedonadb.create_table_arrow("sjoin_point", df_point)
	eng_sedonadb.create_table_arrow("sjoin_polygon", df_polygon)
	eng_postgis.create_table_arrow("sjoin_point", df_point)
	eng_postgis.create_table_arrow("sjoin_polygon", df_polygon)

	sql = f"""
	SELECT sjoin_point.id id0, sjoin_polygon.id id1
	FROM sjoin_point {join_type} sjoin_polygon
	ON {on}
	ORDER BY id0, id1
	"""

	sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
	assert len(sedonadb_results) > 0
	eng_postgis.assert_query_result(sql, sedonadb_results)


	@pytest.mark.parametrize(
	"join_type", ["INNER JOIN", "LEFT OUTER JOIN", "RIGHT OUTER JOIN"]
	)
	@pytest.mark.parametrize(
	"on",
	[
	"ST_Intersects(sjoin_geog1.geog, sjoin_geog2.geog)",
	"ST_Distance(sjoin_geog1.geog, sjoin_geog2.geog) < 100000",
	],
	)
	def test_spatial_join_geography(join_type, on):
	with (
	SedonaDB.create_or_skip() as eng_sedonadb,
	PostGIS.create_or_skip() as eng_postgis,
	):
	# Select two sets of bounding boxes that cross the antimeridian,
	# which would be disjoint on a Euclidean plane. A geography join will produce non-empty results,
	# whereas a geometry join would not.
	west_most_bound = [-190, -10, -170, 10]
	east_most_bound = [170, -10, 190, 10]
	options = json.dumps(
	{
	"geom_type": "Point",
	"num_parts_range": [2, 10],
	"vertices_per_linestring_range": [2, 10],
	"bounds": west_most_bound,
	"size_range": [0.1, 5],
	"seed": 43,
	}
	)
	df_point = eng_sedonadb.execute_and_collect(
	f"SELECT id, ST_SetSRID(ST_GeogFromWKB(ST_AsBinary(geometry)), 4326) geog, dist FROM sd_random_geometry('{options}') LIMIT 100"
	)
	options = json.dumps(
	{
	"geom_type": "Polygon",
	"polygon_hole_rate": 0.5,
	"num_parts_range": [2, 10],
	"vertices_per_linestring_range": [2, 10],
	"bounds": east_most_bound,
	"size_range": [0.1, 5],
	"seed": 44,
	}
	)
	df_polygon = eng_sedonadb.execute_and_collect(
	f"SELECT id, ST_SetSRID(ST_GeogFromWKB(ST_AsBinary(geometry)), 4326) geog, dist FROM sd_random_geometry('{options}') LIMIT 100"
	)
	eng_sedonadb.create_table_arrow("sjoin_geog1", df_point)
	eng_sedonadb.create_table_arrow("sjoin_geog2", df_polygon)
	eng_postgis.create_table_arrow("sjoin_geog1", df_point)
	eng_postgis.create_table_arrow("sjoin_geog2", df_polygon)

	sql = f"""
	SELECT sjoin_geog1.id id0, sjoin_geog2.id id1
	FROM sjoin_geog1 {join_type} sjoin_geog2
	ON {on}
	ORDER BY id0, id1
	"""

	sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
	eng_postgis.assert_query_result(sql, sedonadb_results)


	def test_query_window_in_subquery():
	with (
	SedonaDB.create_or_skip() as eng_sedonadb,
	PostGIS.create_or_skip() as eng_postgis,
	):
	options = json.dumps(
	{
	"geom_type": "Point",
	"seed": 42,
	}
	)
	df_point = eng_sedonadb.execute_and_collect(
	f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
	)
	options = json.dumps(
	{
	"geom_type": "Polygon",
	"polygon_hole_rate": 0.5,
	"num_parts_range": [2, 10],
	"vertices_per_linestring_range": [2, 10],
	"size_range": [50, 60],
	"seed": 43,
	}
	)
	df_polygon = eng_sedonadb.execute_and_collect(
	f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
	)
	eng_sedonadb.create_table_arrow("sjoin_point", df_point)
	eng_sedonadb.create_table_arrow("sjoin_polygon", df_polygon)
	eng_postgis.create_table_arrow("sjoin_point", df_point)
	eng_postgis.create_table_arrow("sjoin_polygon", df_polygon)

	# This should be optimized to a spatial join
	sql = """
	SELECT id FROM sjoin_point AS L
	WHERE ST_Intersects(L.geometry, (SELECT R.geometry FROM sjoin_polygon AS R WHERE R.id = 1))
	ORDER BY id
	"""

	# Verify that the physical query plan should contain a SpatialJoinExec
	query_plan = eng_sedonadb.execute_and_collect(f"EXPLAIN {sql}").to_pandas()
	assert "SpatialJoinExec" in query_plan.iloc[1, 1]

	sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
	assert len(sedonadb_results) > 0
	eng_postgis.assert_query_result(sql, sedonadb_results)


	def test_non_optimizable_subquery():
	with (
	SedonaDB.create_or_skip() as eng_sedonadb,
	PostGIS.create_or_skip() as eng_postgis,
	):
	options = json.dumps(
	{
	"geom_type": "Point",
	"seed": 42,
	}
	)
	df_main = eng_sedonadb.execute_and_collect(
	f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
	)
	options = json.dumps(
	{
	"geom_type": "Point",
	"seed": 43,
	}
	)
	df_subquery = eng_sedonadb.execute_and_collect(
	f"SELECT * FROM sd_random_geometry('{options}') LIMIT 100"
	)
	eng_sedonadb.create_table_arrow("sjoin_main", df_main)
	eng_sedonadb.create_table_arrow("sjoin_subquery", df_subquery)
	eng_postgis.create_table_arrow("sjoin_main", df_main)
	eng_postgis.create_table_arrow("sjoin_subquery", df_subquery)

	# This cannot be optimized to a spatial join, but the query result should still be correct
	sql = """
	SELECT id FROM sjoin_main AS L
	WHERE ST_DWithin(L.geometry, ST_Point(10, 10), (SELECT R.dist FROM sjoin_subquery AS R WHERE R.id = 1))
	ORDER BY id
	"""
	sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
	assert len(sedonadb_results) > 0
	eng_postgis.assert_query_result(sql, sedonadb_results)