# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import json
import re
import urllib.parse
from datetime import datetime
from re import Pattern
from typing import Any, Optional, TYPE_CHECKING

import pandas as pd
from apispec import APISpec
from apispec.ext.marshmallow import MarshmallowPlugin
from deprecation import deprecated
from flask_babel import gettext as __
from marshmallow import fields, Schema
from marshmallow.exceptions import ValidationError
from sqlalchemy import column, types
from sqlalchemy.engine.base import Engine
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.sql import sqltypes
from typing_extensions import TypedDict

from superset import sql_parse
from superset.constants import PASSWORD_MASK, TimeGrain
from superset.databases.schemas import encrypted_field_properties, EncryptedString
from superset.databases.utils import make_url_safe
from superset.db_engine_specs.base import BaseEngineSpec, BasicPropertiesType
from superset.db_engine_specs.exceptions import SupersetDBAPIConnectionError
from superset.errors import SupersetError, SupersetErrorType
from superset.exceptions import SupersetException
from superset.sql_parse import Table
from superset.utils import core as utils
from superset.utils.hashing import md5_sha_from_str

try:
    from google.cloud import bigquery
    from google.oauth2 import service_account

    dependencies_installed = True
except ModuleNotFoundError:
    dependencies_installed = False

try:
    import pandas_gbq

    can_upload = True
except ModuleNotFoundError:
    can_upload = False

if TYPE_CHECKING:
    from superset.models.core import Database  # pragma: no cover

CONNECTION_DATABASE_PERMISSIONS_REGEX = re.compile(
    "Access Denied: Project (?P<project_name>.+?): User does not have "
    "bigquery.jobs.create permission in project (?P<project>.+?)"
)

TABLE_DOES_NOT_EXIST_REGEX = re.compile(
    'Table name "(?P<table>.*?)" missing dataset while no default '
    "dataset is set in the request"
)

COLUMN_DOES_NOT_EXIST_REGEX = re.compile(
    r"Unrecognized name: (?P<column>.*?) at \[(?P<location>.+?)\]"
)

SCHEMA_DOES_NOT_EXIST_REGEX = re.compile(
    r"bigquery error: 404 Not found: Dataset (?P<dataset>.*?):"
    r"(?P<schema>.*?) was not found in location"
)

SYNTAX_ERROR_REGEX = re.compile(
    'Syntax error: Expected end of input but got identifier "(?P<syntax_error>.+?)"'
)
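
# Illustrative sketch (not part of the spec): these patterns are matched against
# raw driver error messages, and the named groups feed the translated templates
# in ``custom_errors`` below. For example:
#
#     match = COLUMN_DOES_NOT_EXIST_REGEX.search(
#         "Unrecognized name: user_idd at [4:8]"
#     )
#     match.groupdict()  # {"column": "user_idd", "location": "4:8"}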

ma_plugin = MarshmallowPlugin()


class BigQueryParametersSchema(Schema):
    credentials_info = EncryptedString(
        required=False,
        metadata={"description": "Contents of BigQuery JSON credentials."},
    )
    query = fields.Dict(required=False)


class BigQueryParametersType(TypedDict):
    credentials_info: dict[str, Any]
    query: dict[str, Any]
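
# Illustrative sketch: a hypothetical payload matching the TypedDict above,
# with a made-up project for demonstration.
#
#     parameters: BigQueryParametersType = {
#         "credentials_info": {"project_id": "my-project"},
#         "query": {"location": "US"},
#     }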


class BigQueryEngineSpec(BaseEngineSpec):  # pylint: disable=too-many-public-methods
    """Engine spec for Google's BigQuery

    As contributed by @mxmzdlv on issue #945"""

    engine = "bigquery"
    engine_name = "Google BigQuery"
    max_column_name_length = 128
    disable_ssh_tunneling = True

    parameters_schema = BigQueryParametersSchema()
    default_driver = "bigquery"
    sqlalchemy_uri_placeholder = "bigquery://{project_id}"

    # BigQuery doesn't maintain context when running multiple statements in the
    # same cursor, so we need to run all statements at once
    run_multiple_statements_as_one = True

    allows_hidden_cc_in_orderby = True

    supports_catalog = True

    """
    https://www.python.org/dev/peps/pep-0249/#arraysize
    Raw connections bypass the sqlalchemy-bigquery query execution context and
    deal with the raw DBAPI connection directly. If this value is not set, it
    defaults to 1, as described here:
    https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/dbapi/cursor.html#Cursor
    The default of 5000 is taken from sqlalchemy-bigquery:
    https://github.com/googleapis/python-bigquery-sqlalchemy/blob/4e17259088f89eac155adc19e0985278a29ecf9c/sqlalchemy_bigquery/base.py#L762
    """
    arraysize = 5000

    _date_trunc_functions = {
        "DATE": "DATE_TRUNC",
        "DATETIME": "DATETIME_TRUNC",
        "TIME": "TIME_TRUNC",
        "TIMESTAMP": "TIMESTAMP_TRUNC",
    }

    _time_grain_expressions = {
        None: "{col}",
        TimeGrain.SECOND: "CAST(TIMESTAMP_SECONDS("
        "UNIX_SECONDS(CAST({col} AS TIMESTAMP))"
        ") AS {type})",
        TimeGrain.MINUTE: "CAST(TIMESTAMP_SECONDS("
        "60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 60)"
        ") AS {type})",
        TimeGrain.FIVE_MINUTES: "CAST(TIMESTAMP_SECONDS("
        "5*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 5*60)"
        ") AS {type})",
        TimeGrain.TEN_MINUTES: "CAST(TIMESTAMP_SECONDS("
        "10*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 10*60)"
        ") AS {type})",
        TimeGrain.FIFTEEN_MINUTES: "CAST(TIMESTAMP_SECONDS("
        "15*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 15*60)"
        ") AS {type})",
        TimeGrain.THIRTY_MINUTES: "CAST(TIMESTAMP_SECONDS("
        "30*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 30*60)"
        ") AS {type})",
        TimeGrain.HOUR: "{func}({col}, HOUR)",
        TimeGrain.DAY: "{func}({col}, DAY)",
        TimeGrain.WEEK: "{func}({col}, WEEK)",
        TimeGrain.WEEK_STARTING_MONDAY: "{func}({col}, ISOWEEK)",
        TimeGrain.MONTH: "{func}({col}, MONTH)",
        TimeGrain.QUARTER: "{func}({col}, QUARTER)",
        TimeGrain.YEAR: "{func}({col}, YEAR)",
    }
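
    # Illustrative sketch: for a DATETIME column ``ts`` at the five-minute
    # grain, the template above renders (roughly) to:
    #
    #     CAST(TIMESTAMP_SECONDS(
    #         5*60 * DIV(UNIX_SECONDS(CAST(ts AS TIMESTAMP)), 5*60)
    #     ) AS DATETIME)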

    custom_errors: dict[Pattern[str], tuple[str, SupersetErrorType, dict[str, Any]]] = {
        CONNECTION_DATABASE_PERMISSIONS_REGEX: (
            __(
                "Unable to connect. Verify that the following roles are set "
                'on the service account: "BigQuery Data Viewer", '
                '"BigQuery Metadata Viewer", "BigQuery Job User" '
                "and the following permissions are set "
                '"bigquery.readsessions.create", '
                '"bigquery.readsessions.getData"'
            ),
            SupersetErrorType.CONNECTION_DATABASE_PERMISSIONS_ERROR,
            {},
        ),
        TABLE_DOES_NOT_EXIST_REGEX: (
            __(
                'The table "%(table)s" does not exist. '
                "A valid table must be used to run this query.",
            ),
            SupersetErrorType.TABLE_DOES_NOT_EXIST_ERROR,
            {},
        ),
        COLUMN_DOES_NOT_EXIST_REGEX: (
            __('We can\'t seem to resolve column "%(column)s" at line %(location)s.'),
            SupersetErrorType.COLUMN_DOES_NOT_EXIST_ERROR,
            {},
        ),
        SCHEMA_DOES_NOT_EXIST_REGEX: (
            __(
                'The schema "%(schema)s" does not exist. '
                "A valid schema must be used to run this query."
            ),
            SupersetErrorType.SCHEMA_DOES_NOT_EXIST_ERROR,
            {},
        ),
        SYNTAX_ERROR_REGEX: (
            __(
                "Please check your query for syntax errors at or near "
                '"%(syntax_error)s". Then, try running your query again.'
            ),
            SupersetErrorType.SYNTAX_ERROR,
            {},
        ),
    }

    @classmethod
    def convert_dttm(
        cls, target_type: str, dttm: datetime, db_extra: Optional[dict[str, Any]] = None
    ) -> Optional[str]:
        sqla_type = cls.get_sqla_column_type(target_type)
        if isinstance(sqla_type, types.Date):
            return f"CAST('{dttm.date().isoformat()}' AS DATE)"
        if isinstance(sqla_type, types.TIMESTAMP):
            return f"""CAST('{dttm.isoformat(timespec="microseconds")}' AS TIMESTAMP)"""
        if isinstance(sqla_type, types.DateTime):
            return f"""CAST('{dttm.isoformat(timespec="microseconds")}' AS DATETIME)"""
        if isinstance(sqla_type, types.Time):
            return f"""CAST('{dttm.strftime("%H:%M:%S.%f")}' AS TIME)"""
        return None
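
    # Illustrative sketch, assuming ``get_sqla_column_type`` maps "TIMESTAMP"
    # to ``types.TIMESTAMP``:
    #
    #     BigQueryEngineSpec.convert_dttm("TIMESTAMP", datetime(2023, 1, 1, 12))
    #     # "CAST('2023-01-01T12:00:00.000000' AS TIMESTAMP)"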

    @classmethod
    def fetch_data(
        cls, cursor: Any, limit: Optional[int] = None
    ) -> list[tuple[Any, ...]]:
        data = super().fetch_data(cursor, limit)
        # Support the BigQuery ``Row`` type (google.cloud.bigquery.table.Row),
        # introduced in PR #4071.
        if data and type(data[0]).__name__ == "Row":
            data = [r.values() for r in data]  # type: ignore
        return data

    @staticmethod
    def _mutate_label(label: str) -> str:
        """
        BigQuery field names should start with a letter or underscore and
        contain only alphanumeric characters. Labels that start with a number
        are prefixed with an underscore. Any unsupported characters are
        replaced with underscores, and an md5 hash is appended to the label to
        avoid possible collisions.

        :param label: Expected expression label
        :return: Conditionally mutated label
        """
        label_hashed = "_" + md5_sha_from_str(label)

        # if label starts with a number, add an underscore as the first character
        label_mutated = "_" + label if re.match(r"^\d", label) else label

        # replace non-alphanumeric characters with underscores
        label_mutated = re.sub(r"[^\w]+", "_", label_mutated)
        if label_mutated != label:
            # add the first 5 chars of the md5 hash to the label to avoid collisions
            label_mutated += label_hashed[:6]

        return label_mutated
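
    # Illustrative sketch: a label like "3 count(*)" starts with a digit and
    # contains unsupported characters, so it is rewritten to "_3_count_" plus
    # an underscore and the first five characters of the label's md5 hash,
    # e.g. "_3_count__d41d8" (hash value hypothetical).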

    @classmethod
    def _truncate_label(cls, label: str) -> str:
        """BigQuery requires column names start with either a letter or
        underscore. To make sure this is always the case, an underscore is
        prefixed to the md5 hash of the original label.

        :param label: expected expression label
        :return: truncated label
        """
        return "_" + md5_sha_from_str(label)

    @classmethod
    @deprecated(deprecated_in="3.0")
    def normalize_indexes(cls, indexes: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """
        Normalizes indexes for more consistency across db engines

        :param indexes: Raw indexes as returned by SQLAlchemy
        :return: cleaner, more aligned index definition
        """
        normalized_idxs = []
        # Fixing a bug/behavior observed in pybigquery==0.4.15 where
        # the index's `column_names` == [None]
        # Here we're returning only non-None indexes
        for ix in indexes:
            column_names = ix.get("column_names") or []
            ix["column_names"] = [col for col in column_names if col is not None]
            if ix["column_names"]:
                normalized_idxs.append(ix)
        return normalized_idxs
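
    # Illustrative sketch: [{"name": "partition", "column_names": [None]}]
    # normalizes to [], since its only column name is None, while
    # [{"name": "clustering", "column_names": ["a", None]}] becomes
    # [{"name": "clustering", "column_names": ["a"]}].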

    @classmethod
    def get_indexes(
        cls,
        database: "Database",
        inspector: Inspector,
        table_name: str,
        schema: Optional[str],
    ) -> list[dict[str, Any]]:
        """
        Get the indexes associated with the specified schema/table.

        :param database: The database to inspect
        :param inspector: The SQLAlchemy inspector
        :param table_name: The table to inspect
        :param schema: The schema to inspect
        :returns: The indexes
        """
        return cls.normalize_indexes(inspector.get_indexes(table_name, schema))

    @classmethod
    def extra_table_metadata(
        cls, database: "Database", table_name: str, schema_name: Optional[str]
    ) -> dict[str, Any]:
        indexes = database.get_indexes(table_name, schema_name)
        if not indexes:
            return {}
        partitions_columns = [
            index.get("column_names", [])
            for index in indexes
            if index.get("name") == "partition"
        ]
        cluster_columns = [
            index.get("column_names", [])
            for index in indexes
            if index.get("name") == "clustering"
        ]
        return {
            "partitions": {"cols": partitions_columns},
            "clustering": {"cols": cluster_columns},
        }
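
    # Illustrative sketch of the returned shape for a table partitioned on
    # ``ds`` and clustered on ``user_id``:
    #
    #     {"partitions": {"cols": [["ds"]]}, "clustering": {"cols": [["user_id"]]}}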

    @classmethod
    def epoch_to_dttm(cls) -> str:
        return "TIMESTAMP_SECONDS({col})"

    @classmethod
    def epoch_ms_to_dttm(cls) -> str:
        return "TIMESTAMP_MILLIS({col})"

    @classmethod
    def df_to_sql(
        cls,
        database: "Database",
        table: Table,
        df: pd.DataFrame,
        to_sql_kwargs: dict[str, Any],
    ) -> None:
        """
        Upload data from a Pandas DataFrame to a database.

        Calls `pandas_gbq.to_gbq`, which requires `pandas_gbq` to be installed.

        Note this method does not create metadata for the table.

        :param database: The database to upload the data to
        :param table: The table to upload the data to
        :param df: The dataframe with data to be uploaded
        :param to_sql_kwargs: The kwargs to be passed to the
            `pandas.DataFrame.to_sql` method
        """
        if not can_upload:
            raise SupersetException(
                "Could not import libraries needed to upload data to BigQuery."
            )

        if not table.schema:
            raise SupersetException("The table schema must be defined")

        to_gbq_kwargs = {}
        with cls.get_engine(database) as engine:
            to_gbq_kwargs = {
                "destination_table": str(table),
                "project_id": engine.url.host,
            }

            # Add credentials if they are set on the SQLAlchemy dialect.
            if creds := engine.dialect.credentials_info:
                to_gbq_kwargs[
                    "credentials"
                ] = service_account.Credentials.from_service_account_info(creds)

        # Only pass through supported kwargs.
        supported_kwarg_keys = {"if_exists"}
        for key in supported_kwarg_keys:
            if key in to_sql_kwargs:
                to_gbq_kwargs[key] = to_sql_kwargs[key]

        pandas_gbq.to_gbq(df, **to_gbq_kwargs)
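
    # Illustrative sketch of a hypothetical call, assuming ``db`` is a Database
    # pointing at a BigQuery project and ``df`` is a Pandas DataFrame:
    #
    #     BigQueryEngineSpec.df_to_sql(
    #         db,
    #         Table("my_table", schema="my_dataset"),
    #         df,
    #         to_sql_kwargs={"if_exists": "replace"},
    #     )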

    @classmethod
    def _get_client(cls, engine: Engine) -> Any:
        """
        Return the BigQuery client associated with an engine.
        """
        if not dependencies_installed:
            raise SupersetException(
                "Could not import libraries needed to connect to BigQuery."
            )

        credentials = service_account.Credentials.from_service_account_info(
            engine.dialect.credentials_info
        )
        return bigquery.Client(credentials=credentials)

    @classmethod
    def estimate_query_cost(
        cls,
        database: "Database",
        schema: str,
        sql: str,
        source: Optional[utils.QuerySource] = None,
    ) -> list[dict[str, Any]]:
        """
        Estimate the cost of a multiple statement SQL query.

        :param database: Database instance
        :param schema: Database schema
        :param sql: SQL query with possibly multiple statements
        :param source: Source of the query (eg, "sql_lab")
        """
        extra = database.get_extra() or {}
        if not cls.get_allow_cost_estimate(extra):
            raise SupersetException("Database does not support cost estimation")

        parsed_query = sql_parse.ParsedQuery(sql)
        statements = parsed_query.get_statements()

        costs = []
        for statement in statements:
            processed_statement = cls.process_statement(statement, database)
            costs.append(cls.estimate_statement_cost(processed_statement, database))
        return costs

    @classmethod
    def get_catalog_names(
        cls,
        database: "Database",
        inspector: Inspector,
    ) -> list[str]:
        """
        Get all catalogs.

        In BigQuery, a catalog is called a "project".
        """
        engine: Engine
        with database.get_sqla_engine_with_context() as engine:
            client = cls._get_client(engine)
            projects = client.list_projects()

        return sorted(project.project_id for project in projects)

    @classmethod
    def get_allow_cost_estimate(cls, extra: dict[str, Any]) -> bool:
        return True

    @classmethod
    def estimate_statement_cost(cls, statement: str, cursor: Any) -> dict[str, Any]:
        with cls.get_engine(cursor) as engine:
            client = cls._get_client(engine)
            job_config = bigquery.QueryJobConfig(dry_run=True)
            query_job = client.query(
                statement,
                job_config=job_config,
            )  # Make an API request.

        # Format bytes.
        # TODO: if more db engine specs need to humanize byte counts, this
        # should be extracted into a shared helper function.
        byte_division = 1024
        if hasattr(query_job, "total_bytes_processed"):
            query_bytes_processed = query_job.total_bytes_processed
            if query_bytes_processed // byte_division == 0:
                byte_type = "B"
                total_bytes_processed = query_bytes_processed
            elif query_bytes_processed // (byte_division**2) == 0:
                byte_type = "KB"
                total_bytes_processed = round(query_bytes_processed / byte_division, 2)
            elif query_bytes_processed // (byte_division**3) == 0:
                byte_type = "MB"
                total_bytes_processed = round(
                    query_bytes_processed / (byte_division**2), 2
                )
            else:
                byte_type = "GB"
                total_bytes_processed = round(
                    query_bytes_processed / (byte_division**3), 2
                )

            return {f"{byte_type} Processed": total_bytes_processed}
        return {}
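
    # Illustrative arithmetic for the branches above: 2,621,440 bytes gives
    # 2621440 // 1024 == 2560 and 2621440 // 1024**2 == 2 (both non-zero), but
    # 2621440 // 1024**3 == 0, so the MB branch reports
    # round(2621440 / 1024**2, 2) == 2.5 as {"MB Processed": 2.5}.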

    @classmethod
    def query_cost_formatter(
        cls, raw_cost: list[dict[str, Any]]
    ) -> list[dict[str, str]]:
        return [{k: str(v) for k, v in row.items()} for row in raw_cost]

    @classmethod
    def build_sqlalchemy_uri(
        cls,
        parameters: BigQueryParametersType,
        encrypted_extra: Optional[dict[str, Any]] = None,
    ) -> str:
        query = parameters.get("query", {})
        query_params = urllib.parse.urlencode(query)

        if encrypted_extra:
            credentials_info = encrypted_extra.get("credentials_info")
            if isinstance(credentials_info, str):
                credentials_info = json.loads(credentials_info)
            project_id = credentials_info.get("project_id")
        if not encrypted_extra:
            raise ValidationError("Missing service credentials")

        if project_id:
            return f"{cls.default_driver}://{project_id}/?{query_params}"

        raise ValidationError("Invalid service credentials")
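
    # Illustrative sketch: credentials containing {"project_id": "my-project"}
    # and query {"location": "US"} yield "bigquery://my-project/?location=US".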

    @classmethod
    def get_parameters_from_uri(
        cls,
        uri: str,
        encrypted_extra: Optional[dict[str, Any]] = None,
    ) -> Any:
        value = make_url_safe(uri)

        # Building parameters from encrypted_extra and uri
        if encrypted_extra:
            # ``value.query`` needs to be explicitly converted into a dict (from an
            # ``immutabledict``) so that it can be JSON serialized
            return {**encrypted_extra, "query": dict(value.query)}

        raise ValidationError("Invalid service credentials")

    @classmethod
    def mask_encrypted_extra(cls, encrypted_extra: Optional[str]) -> Optional[str]:
        if encrypted_extra is None:
            return encrypted_extra

        try:
            config = json.loads(encrypted_extra)
        except (json.JSONDecodeError, TypeError):
            return encrypted_extra

        try:
            config["credentials_info"]["private_key"] = PASSWORD_MASK
        except KeyError:
            pass

        return json.dumps(config)
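
    # Illustrative sketch: '{"credentials_info": {"private_key": "secret"}}'
    # comes back with the private key replaced by PASSWORD_MASK, while input
    # that is not valid JSON is passed through untouched.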

    @classmethod
    def unmask_encrypted_extra(
        cls, old: Optional[str], new: Optional[str]
    ) -> Optional[str]:
        """
        Reuse ``private_key`` if available and unchanged.
        """
        if old is None or new is None:
            return new

        try:
            old_config = json.loads(old)
            new_config = json.loads(new)
        except (TypeError, json.JSONDecodeError):
            return new

        if "credentials_info" not in new_config:
            return new

        if "private_key" not in new_config["credentials_info"]:
            return new

        if new_config["credentials_info"]["private_key"] == PASSWORD_MASK:
            new_config["credentials_info"]["private_key"] = old_config[
                "credentials_info"
            ]["private_key"]

        return json.dumps(new_config)

    @classmethod
    def get_dbapi_exception_mapping(cls) -> dict[type[Exception], type[Exception]]:
        # pylint: disable=import-outside-toplevel
        from google.auth.exceptions import DefaultCredentialsError

        return {DefaultCredentialsError: SupersetDBAPIConnectionError}

    @classmethod
    def validate_parameters(
        cls,
        properties: BasicPropertiesType,  # pylint: disable=unused-argument
    ) -> list[SupersetError]:
        return []

    @classmethod
    def parameters_json_schema(cls) -> Any:
        """
        Return configuration parameters as OpenAPI.
        """
        if not cls.parameters_schema:
            return None

        spec = APISpec(
            title="Database Parameters",
            version="1.0.0",
            openapi_version="3.0.0",
            plugins=[ma_plugin],
        )

        ma_plugin.init_spec(spec)
        ma_plugin.converter.add_attribute_function(encrypted_field_properties)
        spec.components.schema(cls.__name__, schema=cls.parameters_schema)
        return spec.to_dict()["components"]["schemas"][cls.__name__]

    @classmethod
    def select_star(  # pylint: disable=too-many-arguments
        cls,
        database: "Database",
        table_name: str,
        engine: Engine,
        schema: Optional[str] = None,
        limit: int = 100,
        show_cols: bool = False,
        indent: bool = True,
        latest_partition: bool = True,
        cols: Optional[list[dict[str, Any]]] = None,
    ) -> str:
        """
        Remove array structures from `SELECT *`.

        BigQuery supports structures and arrays of structures, eg:

            author STRUCT<name STRING, email STRING>
            trailer ARRAY<STRUCT<key STRING, value STRING>>

        When loading metadata for a table each key in the struct is displayed as
        a separate pseudo-column, eg:

            - author
            - author.name
            - author.email
            - trailer
            - trailer.key
            - trailer.value

        When generating the `SELECT *` statement we want to remove any keys from
        structs inside an array, since selecting them results in an error. The
        correct SELECT statement should look like this:

            SELECT
              `author`,
              `author`.`name`,
              `author`.`email`,
              `trailer`
            FROM
              table

        Selecting `trailer.key` or `trailer.value` results in an error, as
        opposed to selecting `author.name`, since they are keys in a structure
        inside an array.

        This method removes any array pseudo-columns.
        """
        if cols:
            # For arrays of structs, remove the child columns, otherwise the
            # query will fail.
            array_prefixes = {
                col["name"] for col in cols if isinstance(col["type"], sqltypes.ARRAY)
            }
            cols = [
                col
                for col in cols
                if "." not in col["name"]
                or col["name"].split(".")[0] not in array_prefixes
            ]

        return super().select_star(
            database,
            table_name,
            engine,
            schema,
            limit,
            show_cols,
            indent,
            latest_partition,
            cols,
        )
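
    # Illustrative sketch: given pseudo-columns ``trailer`` (an ARRAY type) and
    # ``trailer.key``, the filter above keeps ``trailer`` and drops
    # ``trailer.key``, since "trailer" lands in ``array_prefixes``.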

    @classmethod
    def _get_fields(cls, cols: list[dict[str, Any]]) -> list[Any]:
        """
        Label columns using their fully qualified name.

        BigQuery supports columns of type `struct`, which are basically
        dictionaries. When loading metadata for a table with struct columns,
        each key in the struct is displayed as a separate pseudo-column, eg:

            author STRUCT<name STRING, email STRING>

        Will be shown as 3 columns:

            - author
            - author.name
            - author.email

        If we select those fields:

            SELECT `author`, `author`.`name`, `author`.`email` FROM table

        The resulting columns will be called "author", "name", and "email". This
        may result in a clash with other columns. To prevent that, we explicitly
        label the columns using their fully qualified name, so we end up with
        "author", "author__name" and "author__email", respectively.
        """
        return [column(c["name"]).label(c["name"].replace(".", "__")) for c in cols]
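
    # Illustrative sketch: [{"name": "author.name"}] yields an expression that
    # compiles (roughly) to ``author.name AS author__name``.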

    @classmethod
    def parse_error_exception(cls, exception: Exception) -> Exception:
        try:
            return Exception(str(exception).splitlines()[0].strip())
        except Exception:  # pylint: disable=broad-except
            # If anything goes wrong (for example, the message has no lines),
            # return the original exception.
            return exception
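
    # Illustrative sketch (message format hypothetical): an exception whose
    # message is "Syntax error: Unexpected end of script\n\n(job ID: ...)" is
    # reduced to just "Syntax error: Unexpected end of script".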