| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| from collections import OrderedDict |
| from datetime import date, time |
| |
| import numpy as np |
| import pandas as pd |
| import pyarrow as pa |
| |
| |
def dataframe_with_arrays(include_index=False):
    """
    Build a DataFrame whose cells are numpy arrays, plus a matching schema.

    There is one column per signed/unsigned integer and floating point
    dtype, a column of object arrays holding strings and a column of
    ``datetime64[ms]`` arrays. Every column contains at least one ``None``
    row.

    Parameters
    ----------
    include_index : bool, default False
        When True, an int64 ``__index_level_0__`` field is appended to the
        returned schema. The DataFrame is built the same way either way.

    Returns
    -------
    df : pandas.DataFrame
    schema : pyarrow.Schema
        Arrow schema definition that is in line with the constructed df;
        every data column is declared as a list of its element type.
    """
    # numpy dtype code (doubles as the column name) paired with the Arrow
    # element type of the list column.
    numeric_value_types = [
        ('i1', pa.int8()),
        ('i2', pa.int16()),
        ('i4', pa.int32()),
        ('i8', pa.int64()),
        ('u1', pa.uint8()),
        ('u2', pa.uint16()),
        ('u4', pa.uint32()),
        ('u8', pa.uint64()),
        ('f4', pa.float32()),
        ('f8', pa.float64()),
    ]

    # Four rows per numeric column: lengths 10 and 5, a null row, length 1.
    column_data = OrderedDict(
        (code, [
            np.arange(10, dtype=code),
            np.arange(5, dtype=code),
            None,
            np.arange(1, dtype=code),
        ])
        for code, _ in numeric_value_types
    )

    column_data['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object"),
    ]

    # The literals carry more digits than milliseconds; the arrays are
    # nevertheless created with dtype datetime64[ms].
    first_timestamps = np.array(
        ['2007-07-13T01:23:34.123456789',
         None,
         '2010-08-13T05:46:57.437699912'],
        dtype='datetime64[ms]')
    last_timestamps = np.array(
        ['2007-07-13T02',
         None,
         '2010-08-13T05:46:57.437699912'],
        dtype='datetime64[ms]')
    column_data['datetime64'] = [first_timestamps, None, None, last_timestamps]

    # Schema fields follow the column order: numerics, 'str', 'datetime64'.
    value_types = numeric_value_types + [
        ('str', pa.string()),
        ('datetime64', pa.timestamp('ms')),
    ]
    schema_fields = [
        pa.field(name, pa.list_(value_type))
        for name, value_type in value_types
    ]
    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(column_data), pa.schema(schema_fields)
| |
| |
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Build a DataFrame whose cells are lists, plus a matching schema.

    The columns cover int64, double, binary and string lists as well as
    lists of dates and times for several Arrow temporal types. Rows mix
    populated lists, empty lists and ``None``; the int64 and double
    columns also contain one strided numpy array.

    Parameters
    ----------
    include_index : bool, default False
        When True, an int64 ``__index_level_0__`` field is appended to the
        returned schema. The DataFrame is built the same way either way.
    parquet_compatible : bool, default False
        Exclude types not supported by parquet. When True, the
        ``time64('ns')`` list column is left out.

    Returns
    -------
    df : pandas.DataFrame
    schema : pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # Row data shared by every date column / every time column.
    date_rows = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)],
    ]
    time_rows = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)],
    ]

    column_data = OrderedDict()
    # The last row of the two numeric columns takes every other element of
    # a 20-element array, i.e. a slice with step 2.
    column_data['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2],
    ]
    column_data['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    column_data['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    column_data['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    # (column name, Arrow element type) in schema order.
    value_types = [
        ('int64', pa.int64()),
        ('double', pa.float64()),
        ('bytes_list', pa.binary()),
        ('str_list', pa.string()),
    ]

    # Pair each temporal Arrow type with the rows it should hold.
    temporal_columns = [
        (date_type, date_rows)
        for date_type in (pa.date32(), pa.date64())
    ]
    temporal_columns.extend(
        (time_type, time_rows)
        for time_type in (pa.time32('s'), pa.time32('ms'), pa.time64('us'))
    )
    if not parquet_compatible:
        temporal_columns.append((pa.time64('ns'), time_rows))

    # Temporal columns are named after the string form of their Arrow type.
    for value_type, rows in temporal_columns:
        column_name = '{}_list'.format(value_type)
        value_types.append((column_name, value_type))
        column_data[column_name] = rows

    schema_fields = [
        pa.field(name, pa.list_(value_type))
        for name, value_type in value_types
    ]
    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(column_data), pa.schema(schema_fields)