| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| import hashlib |
| import re |
| from datetime import datetime |
| from typing import Any, Dict, List, Tuple |
| |
| import pandas as pd |
| from sqlalchemy import literal_column |
| |
| from superset.db_engine_specs.base import BaseEngineSpec |
| |
# Maps BigQuery column type names (as reported in cursor descriptions) to the
# pandas dtype used when loading query results into a DataFrame.
# Nullable integers use pandas' extension dtype "Int64" (capital I) so that
# NULLs don't force a float cast; types with no exact pandas equivalent
# (DATE, BYTES, TIME, RECORD, NUMERIC) fall back to "object".
pandas_dtype_map = {
    "STRING": "object",
    "BOOLEAN": "bool",
    "INTEGER": "Int64",
    "FLOAT": "float64",
    "TIMESTAMP": "datetime64[ns]",
    "DATETIME": "datetime64[ns]",
    "DATE": "object",
    "BYTES": "object",
    "TIME": "object",
    "RECORD": "object",
    "NUMERIC": "object",
}
| |
| |
class BigQueryEngineSpec(BaseEngineSpec):
    """Engine spec for Google's BigQuery

    As contributed by @mxmzdlv on issue #945"""

    engine = "bigquery"
    max_column_name_length = 128

    # https://www.python.org/dev/peps/pep-0249/#arraysize
    # raw_connections bypass the pybigquery query execution context and deal with
    # raw dbapi connection directly.
    # If this value is not set, the default value is set to 1, as described here,
    # https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/dbapi/cursor.html#Cursor
    #
    # The default value of 5000 is derived from the pybigquery.
    # https://github.com/mxmzdlv/pybigquery/blob/d214bb089ca0807ca9aaa6ce4d5a01172d40264e/pybigquery/sqlalchemy_bigquery.py#L102
    arraysize = 5000

    # Maps Superset's ISO 8601 duration grains to the equivalent BigQuery
    # TIMESTAMP_TRUNC expression; `None` means "no truncation".
    _time_grain_functions = {
        None: "{col}",
        "PT1S": "TIMESTAMP_TRUNC({col}, SECOND)",
        "PT1M": "TIMESTAMP_TRUNC({col}, MINUTE)",
        "PT1H": "TIMESTAMP_TRUNC({col}, HOUR)",
        "P1D": "TIMESTAMP_TRUNC({col}, DAY)",
        "P1W": "TIMESTAMP_TRUNC({col}, WEEK)",
        "P1M": "TIMESTAMP_TRUNC({col}, MONTH)",
        "P0.25Y": "TIMESTAMP_TRUNC({col}, QUARTER)",
        "P1Y": "TIMESTAMP_TRUNC({col}, YEAR)",
    }

    @classmethod
    def convert_dttm(cls, target_type: str, dttm: datetime) -> str:
        """
        Convert a Python datetime into a quoted literal for use in BigQuery SQL.

        Fractional seconds are now preserved when present (previously they were
        silently truncated); whole-second datetimes render exactly as before.

        :param target_type: target column type name, e.g. "DATE" or "TIMESTAMP"
        :param dttm: the datetime to convert
        :return: single-quoted date/datetime string literal
        """
        tt = target_type.upper()
        if tt == "DATE":
            return "'{}'".format(dttm.strftime("%Y-%m-%d"))
        # Only emit the ".%f" microseconds part when it is non-zero, so output
        # for whole-second values is unchanged from earlier behavior.
        fmt = "%Y-%m-%d %H:%M:%S.%f" if dttm.microsecond else "%Y-%m-%d %H:%M:%S"
        return "'{}'".format(dttm.strftime(fmt))

    @classmethod
    def fetch_data(cls, cursor, limit: int) -> List[Tuple]:
        """
        Fetch rows from the cursor, unpacking BigQuery ``Row`` objects into
        plain value sequences so downstream code sees regular tuples/lists.

        The ``Row`` type is detected by class name rather than isinstance to
        avoid importing the google-cloud-bigquery package here.

        :param cursor: DB-API cursor to read from
        :param limit: maximum number of rows to fetch
        :return: list of row value sequences
        """
        data = super().fetch_data(cursor, limit)
        if data and type(data[0]).__name__ == "Row":
            data = [r.values() for r in data]  # type: ignore
        return data

    @staticmethod
    def _mutate_label(label: str) -> str:
        """
        BigQuery field_name should start with a letter or underscore and contain only
        alphanumeric characters. Labels that start with a number are prefixed with an
        underscore. Any unsupported characters are replaced with underscores and an
        md5 hash is added to the end of the label to avoid possible collisions.

        :param label: Expected expression label
        :return: Conditionally mutated label
        """
        label_hashed = "_" + hashlib.md5(label.encode("utf-8")).hexdigest()

        # if label starts with number, add underscore as first character
        label_mutated = "_" + label if re.match(r"^\d", label) else label

        # replace non-alphanumeric characters with underscores
        label_mutated = re.sub(r"[^\w]+", "_", label_mutated)
        if label_mutated != label:
            # add first 5 chars from md5 hash to label to avoid possible collisions
            label_mutated += label_hashed[:6]

        return label_mutated

    @classmethod
    def _truncate_label(cls, label: str) -> str:
        """BigQuery requires column names start with either a letter or
        underscore. To make sure this is always the case, an underscore is prefixed
        to the md5 hash of the original label.

        :param label: expected expression label
        :return: truncated label
        """
        return "_" + hashlib.md5(label.encode("utf-8")).hexdigest()

    @classmethod
    def extra_table_metadata(
        cls, database, table_name: str, schema_name: str
    ) -> Dict[str, Any]:
        """
        Return BigQuery partitioning and clustering columns for a table.

        The BigQuery dialect reports partition/cluster columns as indexes
        named "partition" and "clustering" respectively; this collects their
        column names into the shape the frontend expects.

        :param database: Superset database object exposing ``get_indexes``
        :param table_name: name of the table
        :param schema_name: name of the schema (dataset)
        :return: dict with "partitions" and "clustering" column lists, or an
            empty dict when no indexes are reported
        """
        indexes = database.get_indexes(table_name, schema_name)
        if not indexes:
            return {}
        partitions_columns = [
            index.get("column_names", [])
            for index in indexes
            if index.get("name") == "partition"
        ]
        cluster_columns = [
            index.get("column_names", [])
            for index in indexes
            if index.get("name") == "clustering"
        ]
        return {
            "partitions": {"cols": partitions_columns},
            "clustering": {"cols": cluster_columns},
        }

    @classmethod
    def _get_fields(cls, cols):
        """
        BigQuery dialect requires us to not use backtick in the fieldname which are
        nested.
        Using literal_column handles that issue.
        https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column
        Also explicility specifying column names so we don't encounter duplicate
        column names in the result.
        """
        return [
            literal_column(c.get("name")).label(c.get("name").replace(".", "__"))
            for c in cols
        ]

    @classmethod
    def epoch_to_dttm(cls) -> str:
        """SQL template converting an epoch-seconds column to a TIMESTAMP."""
        return "TIMESTAMP_SECONDS({col})"

    @classmethod
    def epoch_ms_to_dttm(cls) -> str:
        """SQL template converting an epoch-milliseconds column to a TIMESTAMP."""
        return "TIMESTAMP_MILLIS({col})"

    @classmethod
    def df_to_sql(cls, df: pd.DataFrame, **kwargs):
        """
        Upload data from a Pandas DataFrame to BigQuery. Calls
        `DataFrame.to_gbq()` which requires `pandas_gbq` to be installed.

        :param df: Dataframe with data to be uploaded
        :param kwargs: kwargs to be passed to to_gbq() method. Requires both `schema
            and ``name` to be present in kwargs, which are combined and passed to
            `to_gbq()` as `destination_table`.
        :raises Exception: if pandas_gbq is missing or required kwargs are absent
        """
        try:
            # Imported lazily so the rest of the spec works without pandas_gbq.
            import pandas_gbq
        except ImportError:
            raise Exception(
                "Could not import the library `pandas_gbq`, which is "
                "required to be installed in your environment in order "
                "to upload data to BigQuery"
            )

        if not ("name" in kwargs and "schema" in kwargs):
            raise Exception("name and schema need to be defined in kwargs")
        gbq_kwargs = {}
        # The SQLAlchemy URL host carries the GCP project id for BigQuery.
        gbq_kwargs["project_id"] = kwargs["con"].engine.url.host
        gbq_kwargs["destination_table"] = f"{kwargs.pop('schema')}.{kwargs.pop('name')}"

        # Only pass through supported kwargs
        supported_kwarg_keys = {"if_exists"}
        for key in supported_kwarg_keys:
            if key in kwargs:
                gbq_kwargs[key] = kwargs[key]
        pandas_gbq.to_gbq(df, **gbq_kwargs)

    @classmethod
    def get_pandas_dtype(cls, cursor_description: List[tuple]) -> Dict[str, str]:
        """
        Build a column-name -> pandas dtype mapping from a cursor description.

        :param cursor_description: DB-API description tuples (name, type, ...)
        :return: dtype per column, defaulting to "object" for unknown types
        """
        return {
            col[0]: pandas_dtype_map.get(col[1], "object") for col in cursor_description
        }