| """ Superset wrapper around pandas.DataFrame. |
| |
| TODO(bkyryliuk): add support for the conventions like: *_dim or dim_* |
| dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc. |
| TODO(bkyryliuk): recognize integer encoded enums. |
| |
| """ |
| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| from __future__ import unicode_literals |
| |
| from datetime import datetime, date |
| from past.builtins import basestring |
| |
| import pandas as pd |
| import numpy as np |
| |
| |
| INFER_COL_TYPES_THRESHOLD = 95 |
| INFER_COL_TYPES_SAMPLE_SIZE = 100 |
| |
| |
| class SupersetDataFrame(object): |
| # Mapping numpy dtype.char to generic database types |
| type_map = { |
| 'b': 'BOOL', # boolean |
| 'i': 'INT', # (signed) integer |
| 'u': 'INT', # unsigned integer |
| 'l': 'INT', # 64bit integer |
| 'f': 'FLOAT', # floating-point |
| 'c': 'FLOAT', # complex-floating point |
| 'm': None, # timedelta |
| 'M': 'DATETIME', # datetime |
| 'O': 'OBJECT', # (Python) objects |
| 'S': 'BYTE', # (byte-)string |
| 'U': 'STRING', # Unicode |
| 'V': None, # raw data (void) |
| } |
| |
| def __init__(self, df): |
| self.__df = df.where((pd.notnull(df)), None) |
| |
| @property |
| def size(self): |
| return len(self.__df.index) |
| |
| @property |
| def data(self): |
| return self.__df.to_dict(orient='records') |
| |
| @classmethod |
| def db_type(cls, dtype): |
| """Given a numpy dtype, Returns a generic database type""" |
| return cls.type_map.get(dtype.char) |
| |
| @classmethod |
| def datetime_conversion_rate(cls, data_series): |
| success = 0 |
| total = 0 |
| for value in data_series: |
| total += 1 |
| try: |
| pd.to_datetime(value) |
| success += 1 |
| except Exception: |
| continue |
| return 100 * success / total |
| |
| @classmethod |
| def is_date(cls, dtype): |
| if dtype.name: |
| return dtype.name.startswith('datetime') |
| |
| @classmethod |
| def is_dimension(cls, dtype, column_name): |
| if cls.is_id(column_name): |
| return False |
| return dtype.name in ('object', 'bool') |
| |
| @classmethod |
| def is_id(cls, column_name): |
| return column_name.startswith('id') or column_name.endswith('id') |
| |
| @classmethod |
| def agg_func(cls, dtype, column_name): |
| # consider checking for key substring too. |
| if cls.is_id(column_name): |
| return 'count_distinct' |
| if np.issubdtype(dtype, np.number): |
| return 'sum' |
| |
| @property |
| def columns(self): |
| """Provides metadata about columns for data visualization. |
| |
| :return: dict, with the fields name, type, is_date, is_dim and agg. |
| """ |
| if self.__df.empty: |
| return None |
| |
| columns = [] |
| sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index)) |
| sample = self.__df |
| if sample_size: |
| sample = self.__df.sample(sample_size) |
| for col in self.__df.dtypes.keys(): |
| col_db_type = self.db_type(self.__df.dtypes[col]) |
| column = { |
| 'name': col, |
| 'agg': self.agg_func(self.__df.dtypes[col], col), |
| 'type': col_db_type, |
| 'is_date': self.is_date(self.__df.dtypes[col]), |
| 'is_dim': self.is_dimension(self.__df.dtypes[col], col), |
| } |
| |
| if column['type'] in ('OBJECT', None): |
| v = sample[col].iloc[0] if not sample[col].empty else None |
| if isinstance(v, basestring): |
| column['type'] = 'STRING' |
| elif isinstance(v, int): |
| column['type'] = 'INT' |
| elif isinstance(v, float): |
| column['type'] = 'FLOAT' |
| elif isinstance(v, (datetime, date)): |
| column['type'] = 'DATETIME' |
| column['is_date'] = True |
| column['is_dim'] = False |
| # check if encoded datetime |
| if ( |
| column['type'] == 'STRING' and |
| self.datetime_conversion_rate(sample[col]) > |
| INFER_COL_TYPES_THRESHOLD): |
| column.update({ |
| 'is_date': True, |
| 'is_dim': False, |
| 'agg': None |
| }) |
| # 'agg' is optional attribute |
| if not column['agg']: |
| column.pop('agg', None) |
| columns.append(column) |
| return columns |