| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import abc |
| import asyncio |
| import itertools |
| import os |
| |
| import asyncpg |
| import duckdb |
| import pandas |
| import psycopg |
| import sqlalchemy |
| |
| import adbc_driver_postgresql.dbapi |
| |
| |
class BenchmarkBase(abc.ABC):
    """Shared fixture for the PostgreSQL fetch benchmarks.

    ``setup`` opens one connection per client library under test (ADBC,
    asyncpg, DuckDB's postgres_scanner, and SQLAlchemy) against the
    database named by the ``ADBC_POSTGRESQL_TEST_URI`` environment
    variable; ``teardown`` releases them all.
    """

    async_conn: asyncpg.Connection
    async_runner: asyncio.Runner
    conn: adbc_driver_postgresql.dbapi.Connection
    duck: duckdb.DuckDBPyConnection
    sqlalchemy_connection: sqlalchemy.engine.base.Connection

    def setup(self, *args, **kwargs) -> None:
        """Open one connection per client library before each benchmark."""
        self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]

        # Benchmark parameters (row count, data type) select the table.
        self.table = self._make_table_name(*args, **kwargs)

        # asyncpg needs an event loop; a dedicated Runner keeps the same
        # loop alive for both connect and the timed fetches.
        self.async_runner = asyncio.Runner()
        self.async_conn = self.async_runner.run(asyncpg.connect(dsn=self.uri))

        self.conn = adbc_driver_postgresql.dbapi.connect(self.uri)

        self.duck = duckdb.connect()
        self.duck.sql("INSTALL postgres_scanner")
        self.duck.sql("LOAD postgres_scanner")
        self.duck.sql(f"CALL postgres_attach('{self.uri}')")

        # NOTE(review): this assumes the test URI uses the "postgres://"
        # scheme; a "postgresql://" URI passes through unchanged, in which
        # case SQLAlchemy falls back to its default postgresql driver
        # (psycopg2) — same dialect either way, but worth confirming.
        uri = self.uri.replace("postgres://", "postgresql+psycopg2://")
        self.sqlalchemy_connection = sqlalchemy.create_engine(uri).connect()

    def teardown(self, *args, **kwargs) -> None:
        """Release every connection opened by ``setup``.

        The asyncpg connection must be closed *before* the Runner, since
        closing it is itself a coroutine that needs the Runner's loop.
        (Previously the asyncpg and DuckDB connections were leaked.)
        """
        self.async_runner.run(self.async_conn.close())
        self.async_runner.close()
        self.conn.close()
        self.duck.close()
        self.sqlalchemy_connection.close()

    @abc.abstractmethod
    def _make_table_name(self, *args, **kwargs) -> str:
        """Map the benchmark parameters to the table created by setup_cache."""
        ...

    def time_pandas_adbc(self, row_count: int, data_type: str) -> None:
        """Fetch the whole table into a DataFrame via ADBC."""
        with self.conn.cursor() as cursor:
            cursor.execute(f"SELECT * FROM {self.table}")
            cursor.fetch_df()

    def time_pandas_asyncpg(self, row_count: int, data_type: str) -> None:
        """Fetch the whole table via asyncpg, then build a DataFrame."""
        records = self.async_runner.run(
            self.async_conn.fetch(f"SELECT * FROM {self.table}")
        )
        pandas.DataFrame(records)

    # TODO: fails with 'undefined symbol' (probably need to get it into Conda)
    # def time_pandas_pgeon(self, row_count: int) -> None:
    #     pgeon.copy_query(self.uri, f"SELECT * FROM {self.table}").to_pandas()

    def time_pandas_psycopg2(self, row_count: int, data_type: str) -> None:
        """Fetch the whole table into a DataFrame via SQLAlchemy/psycopg2."""
        pandas.read_sql_table(self.table, self.sqlalchemy_connection)

    def time_pandas_duckdb(self, row_count: int, data_type: str) -> None:
        """Fetch the whole table into a DataFrame via DuckDB postgres_scanner."""
        self.duck.sql(f"SELECT * FROM {self.table}").fetchdf()
| |
| |
class OneColumnSuite(BenchmarkBase):
    """Benchmark the time it takes to fetch a single column of a given type."""

    # Templates run once per parameter combination to (re)build the table.
    SETUP_QUERIES = [
        "DROP TABLE IF EXISTS {table_name}",
        "CREATE TABLE {table_name} (items {data_type})",
        """INSERT INTO {table_name} (items)
        SELECT generated :: {data_type}
        FROM GENERATE_SERIES(1, {row_count}) temp(generated)""",
        # TODO: does an index matter, do we want to force PostgreSQL
        # to update statistics?
    ]

    param_data = {
        "row_count": [10_000, 100_000, 1_000_000],
        "data_type": ["INT", "BIGINT", "FLOAT", "DOUBLE PRECISION"],
    }

    param_names = list(param_data.keys())
    params = list(param_data.values())

    def setup_cache(self) -> None:
        """Create one benchmark table per (row_count, data_type) combination."""
        self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
        with psycopg.connect(self.uri) as conn, conn.cursor() as cursor:
            for rows, dtype in itertools.product(*self.params):
                name = self._make_table_name(rows, dtype)
                for template in self.SETUP_QUERIES:
                    statement = template.format(
                        table_name=name,
                        row_count=rows,
                        data_type=dtype,
                    )
                    cursor.execute(statement)

    def _make_table_name(self, row_count: int, data_type: str) -> str:
        """Derive a unique, lowercase table name from the parameters."""
        suffix = data_type.replace(" ", "_")
        return f"bench_{row_count}_{suffix}".lower()
| |
| |
class MultiColumnSuite(BenchmarkBase):
    """Benchmark the time it takes to fetch multiple columns of a given type."""

    # Templates run once per parameter combination to (re)build the table.
    SETUP_QUERIES = [
        "DROP TABLE IF EXISTS {table_name}",
        """
        CREATE TABLE {table_name} (
            a {data_type},
            b {data_type},
            c {data_type},
            d {data_type}
        )
        """,
        """
        INSERT INTO {table_name} (a, b, c, d)
        SELECT generated :: {data_type},
               generated :: {data_type},
               generated :: {data_type},
               generated :: {data_type}
        FROM GENERATE_SERIES(1, {row_count}) temp(generated)
        """,
    ]

    param_data = {
        "row_count": [10_000, 100_000, 1_000_000],
        "data_type": ["INT", "BIGINT", "FLOAT", "DOUBLE PRECISION"],
    }

    param_names = list(param_data.keys())
    params = list(param_data.values())

    def setup_cache(self) -> None:
        """Create one benchmark table per (row_count, data_type) combination."""
        self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
        with psycopg.connect(self.uri) as conn, conn.cursor() as cursor:
            for rows, dtype in itertools.product(*self.params):
                name = self._make_table_name(rows, dtype)
                for template in self.SETUP_QUERIES:
                    statement = template.format(
                        table_name=name,
                        row_count=rows,
                        data_type=dtype,
                    )
                    cursor.execute(statement)

    def _make_table_name(self, row_count: int, data_type: str) -> str:
        """Derive a unique, lowercase table name from the parameters."""
        suffix = data_type.replace(" ", "_")
        return f"bench_{row_count}_{suffix}".lower()