blob: 758b201464a8b276d58d6fa60269d4d46594041b [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import geoarrow.pyarrow as ga # noqa: F401
import pyarrow as pa
import pytest
import sedonadb
def test_options():
    """Check that turning on the interactive option changes a DataFrame's repr()."""
    marker = "DataFrame object at"
    query = "SELECT 1 as one"
    sd = sedonadb.connect()

    # Before the option is set, repr() contains the object marker
    repr_before = repr(sd.sql(query))
    assert marker in repr_before

    # Once interactive is switched on, the marker is no longer present
    sd.options.interactive = True
    repr_after = repr(sd.sql(query))
    assert marker not in repr_after
def test_read_parquet(con, geoarrow_data):
    """Read GeoParquet from one path and from the result of a .glob() call."""
    wkb_extension = "geoarrow.wkb"

    # Single file
    single_path = geoarrow_data / "quadrangles/files/quadrangles_100k_geo.parquet"
    single = con.read_parquet(single_path).to_arrow_table()
    assert single["geometry"].type.extension_name == wkb_extension

    # Multiple files, passed as whatever geoarrow_data.glob() returns
    matched_paths = geoarrow_data.glob("example/files/*_geo.parquet")
    combined = con.read_parquet(matched_paths).to_arrow_table()
    assert combined["geometry"].type.extension_name == wkb_extension
    assert len(combined) == 244
def test_read_parquet_local_glob(con, geoarrow_data):
    """Read GeoParquet where the wildcard is left inside the path itself."""
    # test_read_parquet expands the pattern with .glob(); here the pattern is
    # handed to read_parquet() unexpanded.
    all_files_pattern = geoarrow_data / "example/files/*_geo.parquet"
    all_files = con.read_parquet(all_files_pattern).to_arrow_table()
    assert all_files["geometry"].type.extension_name == "geoarrow.wkb"
    assert len(all_files) == 244

    # A narrower pattern with the wildcard in the middle of the file name
    polygon_pattern = geoarrow_data / "example/files/example_polygon-*geo.parquet"
    polygons = con.read_parquet(polygon_pattern).to_arrow_table()
    assert len(polygons) == 12
def test_read_parquet_error(con):
    """An empty list of paths must raise SedonaError with a clear message."""
    expected_message = "No table paths were provided"
    no_paths = []
    with pytest.raises(sedonadb._lib.SedonaError, match=expected_message):
        con.read_parquet(no_paths)
def test_dynamic_object_stores():
    """Read a GeoParquet file over https using a newly created connection."""
    url = "https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example/files/example_geometry_geo.parquet"

    # Use a fresh connection so that autoregistration done elsewhere cannot
    # influence whether read_parquet() is able to use an object store.
    fresh_con = sedonadb.connect()

    remote_df = fresh_con.read_parquet(url)
    geometry_field = pa.schema(remote_df).field("geometry")
    assert geometry_field.type.extension_name == "geoarrow.wkb"
def test_read_parquet_options_parameter(con, geoarrow_data):
    """Exercise read_parquet() with the options argument omitted and supplied."""
    path = geoarrow_data / "quadrangles/files/quadrangles_100k_geo.parquet"

    # Baseline (backward compatibility): the options argument is left out
    baseline = con.read_parquet(path).to_arrow_table()
    assert baseline["geometry"].type.extension_name == "geoarrow.wkb"

    # Each variant is expected to read the same number of rows as the baseline:
    # an explicit None, an empty dict, and a dict with a string value that
    # should be tolerated rather than raise.
    option_variants = (None, {}, {"test.option": "value"})
    for options in option_variants:
        result = con.read_parquet(path, options=options).to_arrow_table()
        assert result["geometry"].type.extension_name == "geoarrow.wkb"
        assert len(result) == len(baseline)
def test_read_geoparquet_s3_anonymous_access():
    """Read a GeoParquet file from a public S3 bucket without request signing."""
    bucket_url = "s3://wherobots-examples/data/onboarding_1/nyc_buildings.parquet"
    # Anonymous access: skip request signing and name the region explicitly
    anonymous_options = {"aws.skip_signature": True, "aws.region": "us-west-2"}

    con = sedonadb.connect()
    buildings = con.read_parquet(bucket_url, options=anonymous_options).to_arrow_table()

    assert len(buildings) > 0
    # The geometry column in this dataset is named 'geom', not 'geometry'
    assert "geom" in buildings.schema.names
def test_read_parquet_invalid_aws_option():
    """Check that unrecognized aws.* options raise SedonaError with a helpful message.

    Two cases are covered: a misspelling of an existing option, where the
    message is expected to mention ``aws.skip_signature``, and a completely
    unknown option, where the message is expected to contain
    "Valid options are".
    """
    con = sedonadb.connect()
    url = "s3://wherobots-examples/data/onboarding_1/nyc_buildings.parquet"

    # Misspelled option: the error should name the bad key and mention the
    # correctly spelled one.
    with pytest.raises(
        sedonadb._lib.SedonaError,
        match=r"Unknown AWS option 'aws\.skip_sig'\..*aws\.skip_signature",
    ):
        con.read_parquet(url, options={"aws.skip_sig": "true"})

    # Completely unknown option. The pattern is a raw string with the dot
    # escaped so that "." only matches a literal dot, consistent with the
    # pattern above (an unescaped "." would match any character).
    with pytest.raises(
        sedonadb._lib.SedonaError,
        match=r"Unknown AWS option.*aws\.unknown_option.*Valid options are",
    ):
        con.read_parquet(url, options={"aws.unknown_option": "value"})