# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import six
import decimal
import json
import multiprocessing as mp
from collections import OrderedDict
from datetime import date, datetime, time, timedelta
from distutils.version import LooseVersion
import hypothesis as h
import hypothesis.extra.pytz as tzst
import hypothesis.strategies as st
import numpy as np
import numpy.testing as npt
import pytest
import pytz
from pyarrow.pandas_compat import get_logical_type, _pandas_api
import pyarrow as pa
import pandas as pd
import pandas.util.testing as tm
from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
except ImportError:
# Marks all of the tests in this module
pytestmark = pytest.mark.pandas
def _alltypes_example(size=100):
return pd.DataFrame({
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
'uint64': np.arange(size, dtype=np.uint64),
'int8': np.arange(size, dtype=np.int16),
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
# TODO(wesm): Pandas only support ns resolution, Arrow supports s, ms,
# us, ns
'datetime': np.arange("2016-01-01T00:00:00.001", size,
'str': [str(x) for x in range(size)],
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'empty_str': [''] * size
def _check_pandas_roundtrip(df, expected=None, use_threads=True,
                            expected_schema=None,
                            check_dtype=True, schema=None,
                            preserve_index=False,
                            as_batch=False):
    """Convert ``df`` to Arrow and back, asserting the frame is unchanged.

    NOTE(review): ``expected_schema``, ``preserve_index`` and ``as_batch``
    are read by the body but their declarations (and defaults) were cut off
    in this copy of the file -- confirm order/defaults against upstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.
    expected : pandas.DataFrame, optional
        Frame the result must equal. Defaults to ``df`` (restricted to
        ``schema.names`` when ``schema`` is given).
    use_threads : bool
        Forwarded to ``to_pandas``; also selects ``nthreads`` 2 vs 1 for
        ``from_pandas``.
    expected_schema : pyarrow.Schema, optional
        When truthy, the Arrow schema must equal it (metadata ignored).
    check_dtype : bool
        Forwarded to ``assert_frame_equal``.
    schema : pyarrow.Schema, optional
        Forwarded to ``from_pandas``.
    preserve_index : bool
        Controls whether the index type is compared ('equiv') or ignored.
    as_batch : bool
        Use ``pa.RecordBatch`` instead of ``pa.Table``.
    """
    klass = pa.RecordBatch if as_batch else pa.Table
    # NOTE(review): forwarding preserve_index here is reconstructed; the
    # visible call only showed schema= and nthreads=.
    table = klass.from_pandas(df, schema=schema,
                              preserve_index=preserve_index,
                              nthreads=2 if use_threads else 1)
    result = table.to_pandas(use_threads=use_threads)

    if expected_schema:
        # all occurrences of _check_pandas_roundtrip pass expected_schema
        # without the pandas generated key-value metadata
        assert table.schema.equals(expected_schema, check_metadata=False)

    if expected is None:
        expected = df if schema is None else df[schema.names]

    tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
                          check_index_type=('equiv' if preserve_index
                                            else False))
def _check_series_roundtrip(s, type_=None, expected_pa_type=None):
    """Convert Series ``s`` to an Arrow array and back, asserting equality.

    Parameters
    ----------
    s : pandas.Series
    type_ : pyarrow.DataType, optional
        Explicit Arrow type for the conversion; also used as the expected
        Arrow type when ``expected_pa_type`` is not given.
    expected_pa_type : pyarrow.DataType, optional
        Type the resulting Arrow array must have.
    """
    arr = pa.array(s, from_pandas=True, type=type_)

    if type_ is not None and expected_pa_type is None:
        expected_pa_type = type_

    if expected_pa_type is not None:
        assert arr.type == expected_pa_type

    # NOTE(review): `name=s.name` and `arr.type.tz` were stripped from this
    # copy of the file and are restored here -- confirm against upstream.
    result = pd.Series(arr.to_pandas(), name=s.name)
    if pa.types.is_timestamp(arr.type) and arr.type.tz is not None:
        # Re-attach the Arrow type's timezone to the converted values.
        result = (result.dt.tz_localize('utc')
                  .dt.tz_convert(arr.type.tz))

    tm.assert_series_equal(s, result)
def _check_array_roundtrip(values, expected=None, mask=None,
                           type=None):
    """Convert ``values`` to an Arrow array and back, checking nulls/values.

    Parameters
    ----------
    values : array-like
    expected : pandas.Series, optional
        Expected result; derived from ``values`` (and ``mask``) if omitted.
    mask : numpy.ndarray of bool, optional
        Positions to treat as null in addition to pandas nulls.
    type : pyarrow.DataType, optional
        Explicit Arrow type for the conversion.
    """
    arr = pa.array(values, from_pandas=True, mask=mask, type=type)
    result = arr.to_pandas()

    values_nulls = pd.isnull(values)
    if mask is None:
        assert arr.null_count == values_nulls.sum()
    else:
        assert arr.null_count == (mask | values_nulls).sum()

    if expected is None:
        if mask is None:
            expected = pd.Series(values)
        else:
            # NOTE(review): `np.ma.masked_array(values` was stripped from
            # this copy of the file; restored from the dangling `mask=mask))`.
            expected = pd.Series(np.ma.masked_array(values, mask=mask))

    tm.assert_series_equal(pd.Series(result), expected, check_names=False)
def _check_array_from_pandas_roundtrip(np_array, type=None):
    """Convert ``np_array`` to Arrow (pandas null semantics) and back.

    Asserts that the converted result equals the input array element-wise.
    """
    arr = pa.array(np_array, from_pandas=True, type=type)
    result = arr.to_pandas()
    npt.assert_array_equal(result, np_array)
class TestConvertMetadata(object):
    """
    Conversion tests for Pandas metadata & indices.
    """
def test_non_string_columns(self):
    """An integer column label is stored as its string form in Arrow."""
    df = pd.DataFrame({0: [1, 2, 3]})
    table = pa.Table.from_pandas(df)
    assert table.column(0).name == '0'
def test_from_pandas_with_columns(self):
    """``columns=`` selects/reorders like indexing the frame first."""
    df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]},
                      columns=[1, 0])

    table = pa.Table.from_pandas(df, columns=[0, 1])
    expected = pa.Table.from_pandas(df[[0, 1]])
    assert expected.equals(table)

    # Same check through the RecordBatch entry point.
    record_batch_table = pa.RecordBatch.from_pandas(df, columns=[0, 1])
    record_batch_expected = pa.RecordBatch.from_pandas(df[[0, 1]])
    assert record_batch_expected.equals(record_batch_table)
def test_column_index_names_are_preserved(self):
    """A named column index round-trips through Arrow."""
    df = pd.DataFrame({'data': [1, 2, 3]})
    df.columns.names = ['a']
    _check_pandas_roundtrip(df, preserve_index=True)
def test_range_index_shortcut(self):
    """A RangeIndex is kept as metadata and restored with step and name."""
    # ARROW-1639
    index_name = 'foo'
    df = pd.DataFrame({'a': [1, 2, 3, 4]},
                      index=pd.RangeIndex(0, 8, step=2, name=index_name))

    df2 = pd.DataFrame({'a': [4, 5, 6, 7]},
                       index=pd.RangeIndex(0, 4))

    table = pa.Table.from_pandas(df)
    table_no_index_name = pa.Table.from_pandas(df2)

    # The RangeIndex is tracked in the metadata only
    assert len(table.schema) == 1

    result = table.to_pandas()
    tm.assert_frame_equal(result, df)
    assert isinstance(result.index, pd.RangeIndex)
    assert _pandas_api.get_rangeindex_attribute(result.index, 'step') == 2
    # NOTE(review): the left-hand side of this assert (and of the matching
    # one below) was stripped from this copy; `<result>.index.name` is the
    # reconstruction -- confirm against upstream.
    assert result.index.name == index_name

    result2 = table_no_index_name.to_pandas()
    tm.assert_frame_equal(result2, df2)
    assert isinstance(result2.index, pd.RangeIndex)
    assert _pandas_api.get_rangeindex_attribute(result2.index, 'step') == 1
    assert result2.index.name is None
def test_range_index_force_serialization(self):
    """``preserve_index=True`` stores a RangeIndex as a real column."""
    # ARROW-5427: preserve_index=True will force the RangeIndex to
    # be serialized as a column rather than tracked more
    # efficiently as metadata
    df = pd.DataFrame({'a': [1, 2, 3, 4]},
                      index=pd.RangeIndex(0, 8, step=2, name='foo'))

    table = pa.Table.from_pandas(df, preserve_index=True)
    assert table.num_columns == 2
    assert 'foo' in table.column_names

    restored = table.to_pandas()
    tm.assert_frame_equal(restored, df)
def test_rangeindex_doesnt_warn(self):
    """Round-tripping a default RangeIndex must not emit any warning."""
    # ARROW-5606: pandas 0.25 deprecated private _start/stop/step
    # attributes -> can be removed if support < pd 0.25 is dropped
    import warnings

    df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])

    # pytest.warns(None) is deprecated (and rejected by newer pytest);
    # record warnings with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        _check_pandas_roundtrip(df, preserve_index=True)

    assert len(record) == 0
def test_multiindex_columns(self):
    """A frame with two-level MultiIndex columns round-trips."""
    columns = pd.MultiIndex.from_arrays([
        ['one', 'two'], ['X', 'Y']
    ])
    df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
    _check_pandas_roundtrip(df, preserve_index=True)
def test_multiindex_columns_with_dtypes(self):
    """MultiIndex columns mixing string and datetime levels round-trip."""
    columns = pd.MultiIndex.from_arrays(
        [
            ['one', 'two'],
            pd.DatetimeIndex(['2017-08-01', '2017-08-02']),
        ],
        names=['level_1', 'level_2'],
    )
    df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
    _check_pandas_roundtrip(df, preserve_index=True)
def test_multiindex_columns_unicode(self):
    """MultiIndex columns containing non-ASCII labels round-trip."""
    columns = pd.MultiIndex.from_arrays([[u'あ', u'い'], ['X', 'Y']])
    df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
    _check_pandas_roundtrip(df, preserve_index=True)
def test_multiindex_doesnt_warn(self):
    """Round-tripping MultiIndex columns must not emit any warning."""
    # ARROW-3953: pandas 0.24 rename of MultiIndex labels to codes
    import warnings

    columns = pd.MultiIndex.from_arrays([['one', 'two'], ['X', 'Y']])
    df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)

    # pytest.warns(None) is deprecated (and rejected by newer pytest);
    # record warnings with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        _check_pandas_roundtrip(df, preserve_index=True)

    assert len(record) == 0
def test_integer_index_column(self):
    """A frame with default integer column labels round-trips."""
    df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
    _check_pandas_roundtrip(df, preserve_index=True)
def test_index_metadata_field_name(self):
    """Check 'name' vs 'field_name' in the pandas metadata columns."""
    # test None case, and strangely named non-index columns
    # NOTE(review): the `index=pd.MultiIndex.from_arrays(` wrapper was
    # missing in this copy; it is reconstructed from the dangling
    # arrays/names arguments and the two index columns asserted below.
    df = pd.DataFrame(
        [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
        index=pd.MultiIndex.from_arrays(
            [['c', 'b', 'a'], [3, 2, 1]],
            names=[None, 'foo']
        ),
        columns=['a', None, '__index_level_0__'],
    )
    with pytest.warns(UserWarning):
        t = pa.Table.from_pandas(df, preserve_index=True)
    js = t.schema.pandas_metadata

    col1, col2, col3, idx0, foo = js['columns']

    assert col1['name'] == 'a'
    assert col1['name'] == col1['field_name']

    # A None column label keeps name None but gets the field name 'None'.
    assert col2['name'] is None
    assert col2['field_name'] == 'None'

    assert col3['name'] == '__index_level_0__'
    assert col3['name'] == col3['field_name']

    idx0_descr, foo_descr = js['index_columns']
    assert idx0_descr == '__index_level_0__'
    assert idx0['field_name'] == idx0_descr
    assert idx0['name'] is None

    assert foo_descr == 'foo'
    assert foo['field_name'] == foo_descr
    assert foo['name'] == foo_descr
def test_categorical_column_index(self):
    """Metadata for a categorical column index records its categories."""
    df = pd.DataFrame(
        [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
        columns=pd.Index(list('def'), dtype='category')
    )
    t = pa.Table.from_pandas(df, preserve_index=True)
    js = t.schema.pandas_metadata

    column_indexes, = js['column_indexes']
    assert column_indexes['name'] is None
    assert column_indexes['pandas_type'] == 'categorical'
    assert column_indexes['numpy_type'] == 'int8'

    md = column_indexes['metadata']
    assert md['num_categories'] == 3
    assert md['ordered'] is False
def test_string_column_index(self):
    """Metadata for a named string column index (PY2 and PY3 variants)."""
    df = pd.DataFrame(
        [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
        columns=pd.Index(list('def'), name='stringz')
    )
    t = pa.Table.from_pandas(df, preserve_index=True)
    js = t.schema.pandas_metadata

    column_indexes, = js['column_indexes']
    assert column_indexes['name'] == 'stringz'
    assert column_indexes['name'] == column_indexes['field_name']
    assert column_indexes['numpy_type'] == 'object'
    assert column_indexes['pandas_type'] == (
        'bytes' if six.PY2 else 'unicode'
    )

    md = column_indexes['metadata']

    if not six.PY2:
        assert len(md) == 1
        assert md['encoding'] == 'UTF-8'
    else:
        assert md is None or 'encoding' not in md
def test_datetimetz_column_index(self):
    """Metadata for a tz-aware datetime column index records the zone."""
    # NOTE(review): the `columns=pd.date_range(` wrapper was missing in
    # this copy; reconstructed from the dangling start/periods/tz arguments.
    df = pd.DataFrame(
        [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
        columns=pd.date_range(
            start='2017-01-01', periods=3, tz='America/New_York'
        )
    )
    t = pa.Table.from_pandas(df, preserve_index=True)
    js = t.schema.pandas_metadata

    column_indexes, = js['column_indexes']
    assert column_indexes['name'] is None
    assert column_indexes['pandas_type'] == 'datetimetz'
    assert column_indexes['numpy_type'] == 'datetime64[ns]'

    md = column_indexes['metadata']
    assert md['timezone'] == 'America/New_York'
def test_datetimetz_row_index(self):
    """A tz-aware datetime row index round-trips."""
    df = pd.DataFrame({
        'a': pd.date_range(
            start='2017-01-01', periods=3, tz='America/New_York'
        )
    })
    df = df.set_index('a')

    _check_pandas_roundtrip(df, preserve_index=True)
def test_categorical_row_index(self):
    """A categorical row index round-trips."""
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    df['a'] = df.a.astype('category')
    df = df.set_index('a')

    _check_pandas_roundtrip(df, preserve_index=True)
def test_duplicate_column_names_does_not_crash(self):
    """Duplicate column names raise ValueError instead of crashing."""
    df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa'))
    with pytest.raises(ValueError):
        # NOTE(review): the statement under `with` was missing in this copy;
        # converting the frame is the reconstruction -- confirm upstream.
        pa.Table.from_pandas(df)
def test_dictionary_indices_boundscheck(self):
    """Out-of-range dictionary indices raise ArrowInvalid on conversion."""
    # ARROW-1658. No validation of indices leads to segfaults in pandas
    indices = [[0, 1], [0, -1]]

    for inds in indices:
        # safe=False skips index validation when building the array.
        arr = pa.DictionaryArray.from_arrays(inds, ['a'], safe=False)
        batch = pa.RecordBatch.from_arrays([arr], ['foo'])
        table = pa.Table.from_batches([batch, batch, batch])

        # NOTE(review): both `with` bodies were missing in this copy;
        # converting the array and the table are the reconstruction.
        with pytest.raises(pa.ArrowInvalid):
            arr.to_pandas()

        with pytest.raises(pa.ArrowInvalid):
            table.to_pandas()
def test_unicode_with_unicode_column_and_index(self):
    """Non-ASCII column label, index label and value all round-trip."""
    df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])

    _check_pandas_roundtrip(df, preserve_index=True)
def test_mixed_column_names(self):
    """Mixed-type column labels warn and come back as text labels."""
    # mixed type column names are not reconstructed exactly
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

    for cols in [[u'あ', b'a'], [1, '2'], [1, 1.5]]:
        df.columns = pd.Index(cols, dtype=object)

        # assert that the from_pandas raises the warning
        with pytest.warns(UserWarning):
            # NOTE(review): statement reconstructed; it was missing here.
            pa.Table.from_pandas(df)

        expected = df.copy()
        expected.columns = df.columns.astype(six.text_type)
        with pytest.warns(UserWarning):
            # NOTE(review): the trailing argument was cut off in this copy;
            # preserve_index=True is the reconstruction.
            _check_pandas_roundtrip(df, expected=expected,
                                    preserve_index=True)
def test_binary_column_name(self):
    """A bytes column label survives conversion of values and index."""
    column_data = [u'い']
    key = u'あ'.encode('utf8')
    data = {key: column_data}
    df = pd.DataFrame(data)

    # we can't use _check_pandas_roundtrip here because our metadata
    # is always decoded as utf8: even if binary goes in, utf8 comes out
    t = pa.Table.from_pandas(df, preserve_index=True)
    df2 = t.to_pandas()
    assert df.values[0] == df2.values[0]
    assert df.index.values[0] == df2.index.values[0]
    assert df.columns[0] == key
def test_multiindex_duplicate_values(self):
    """A row MultiIndex with repeated level values round-trips."""
    num_rows = 3
    numbers = list(range(num_rows))
    index = pd.MultiIndex.from_arrays(
        [['foo', 'foo', 'bar'], numbers],
        names=['foobar', 'some_numbers'],
    )

    df = pd.DataFrame({'numbers': numbers}, index=index)

    table = pa.Table.from_pandas(df)
    result_df = table.to_pandas()
    tm.assert_frame_equal(result_df, df)
def test_metadata_with_mixed_types(self):
    """A bytes/unicode object column is recorded as 'bytes', not 'mixed'."""
    df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
    table = pa.Table.from_pandas(df)
    js = table.schema.pandas_metadata
    assert 'mixed' not in js
    data_column = js['columns'][0]
    assert data_column['pandas_type'] == 'bytes'
    assert data_column['numpy_type'] == 'object'
def test_ignore_metadata(self):
    """``ignore_metadata=True`` behaves like converting without metadata."""
    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['foo', 'bar', 'baz']},
                      index=['one', 'two', 'three'])
    table = pa.Table.from_pandas(df)

    result = table.to_pandas(ignore_metadata=True)
    # NOTE(review): the trailing `.to_pandas())` was cut off in this copy;
    # it is required for the DataFrame comparison below.
    expected = (table.cast(table.schema.remove_metadata())
                .to_pandas())

    assert result.equals(expected)
def test_list_metadata(self):
    """A list<int64> column is described as 'list[int64]' in metadata."""
    df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
    schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
    table = pa.Table.from_pandas(df, schema=schema)
    js = table.schema.pandas_metadata
    assert 'mixed' not in js
    data_column = js['columns'][0]
    assert data_column['pandas_type'] == 'list[int64]'
    assert data_column['numpy_type'] == 'object'
def test_struct_metadata(self):
    """A column of dicts is described with pandas_type 'object'."""
    df = pd.DataFrame({'dicts': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})
    table = pa.Table.from_pandas(df)
    pandas_metadata = table.schema.pandas_metadata
    assert pandas_metadata['columns'][0]['pandas_type'] == 'object'
def test_decimal_metadata(self):
    """Decimal columns record precision and scale in the metadata."""
    # NOTE(review): the Decimal literals were missing in this copy. The
    # values below (15 integer + 11 fractional digits) are a reconstruction
    # chosen to match the asserted precision 26 / scale 11 -- confirm.
    expected = pd.DataFrame({
        'decimals': [
            decimal.Decimal('394092382910493.12341234678'),
            -decimal.Decimal('314292388910493.12343437128'),
        ]
    })
    table = pa.Table.from_pandas(expected)
    js = table.schema.pandas_metadata
    assert 'mixed' not in js
    data_column = js['columns'][0]
    assert data_column['pandas_type'] == 'decimal'
    assert data_column['numpy_type'] == 'object'
    assert data_column['metadata'] == {'precision': 26, 'scale': 11}
def test_table_column_subset_metadata(self):
    """Removing columns from a table keeps to_pandas() working."""
    # ARROW-1883
    # non-default index
    for index in [
            pd.Index(['a', 'b', 'c'], name='index'),
            pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]:
        df = pd.DataFrame({'a': [1, 2, 3],
                           'b': [.1, .2, .3]}, index=index)
        table = pa.Table.from_pandas(df)

        # Drop data column 'b'; the index column is still present.
        table_subset = table.remove_column(1)
        result = table_subset.to_pandas()
        tm.assert_frame_equal(result, df[['a']])

        # Drop the next column as well; the expected frame has the
        # original index discarded.
        table_subset2 = table_subset.remove_column(1)
        result = table_subset2.to_pandas()
        tm.assert_frame_equal(result, df[['a']].reset_index(drop=True))
def test_empty_list_metadata(self):
    """Columns of only empty lists are described as 'list[empty]'."""
    # Create table with array of empty lists, forced to have type
    # list(string) in pyarrow
    c1 = [["test"], ["a", "b"], None]
    c2 = [[], [], []]
    arrays = OrderedDict([
        ('c1', pa.array(c1, type=pa.list_(pa.string()))),
        ('c2', pa.array(c2, type=pa.list_(pa.string()))),
    ])
    # NOTE(review): the from_arrays arguments were missing in this copy;
    # passing the OrderedDict's values and keys is the reconstruction.
    rb = pa.RecordBatch.from_arrays(
        list(arrays.values()),
        list(arrays.keys())
    )
    tbl = pa.Table.from_batches([rb])

    # First roundtrip changes schema, because pandas cannot preserve the
    # type of empty lists
    df = tbl.to_pandas()
    tbl2 = pa.Table.from_pandas(df)
    md2 = tbl2.schema.pandas_metadata

    # Second roundtrip
    df2 = tbl2.to_pandas()
    expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)]))

    tm.assert_frame_equal(df2, expected)

    assert md2['columns'] == [
        {
            'name': 'c1',
            'field_name': 'c1',
            'metadata': None,
            'numpy_type': 'object',
            'pandas_type': 'list[unicode]',
        },
        {
            'name': 'c2',
            'field_name': 'c2',
            'metadata': None,
            'numpy_type': 'object',
            'pandas_type': 'list[empty]',
        }
    ]
def test_metadata_pandas_version(self):
    """The pandas metadata always carries a 'pandas_version' entry."""
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
    table = pa.Table.from_pandas(df)
    assert table.schema.pandas_metadata['pandas_version'] is not None
class TestConvertPrimitiveTypes(object):
    """
    Conversion tests for primitive (e.g. numeric) types.
    """
def test_float_no_nulls(self):
    """float16/32/64 columns without nulls round-trip with exact schema."""
    data = {}
    fields = []
    dtypes = [('f2', pa.float16()),
              ('f4', pa.float32()),
              ('f8', pa.float64())]
    num_values = 100

    for numpy_dtype, arrow_dtype in dtypes:
        values = np.random.randn(num_values)
        data[numpy_dtype] = values.astype(numpy_dtype)
        fields.append(pa.field(numpy_dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = pa.schema(fields)
    _check_pandas_roundtrip(df, expected_schema=schema)
def test_float_nulls(self):
    """Masked float arrays come back from Arrow with NaN at the nulls."""
    num_values = 100

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    dtypes = [('f2', pa.float16()),
              ('f4', pa.float32()),
              ('f8', pa.float64())]
    names = ['f2', 'f4', 'f8']
    expected_cols = []

    arrays = []
    fields = []
    for name, arrow_dtype in dtypes:
        values = np.random.randn(num_values).astype(name)

        arr = pa.array(values, from_pandas=True, mask=null_mask)
        # NOTE(review): the two append calls and the `columns=` argument
        # below were missing in this copy; `arrays` and `expected_cols`
        # are otherwise never filled.
        arrays.append(arr)
        fields.append(pa.field(name, arrow_dtype))
        values[null_mask] = np.nan

        expected_cols.append(values)

    ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                            columns=names)

    table = pa.Table.from_arrays(arrays, names)
    assert table.schema.equals(pa.schema(fields))
    result = table.to_pandas()
    tm.assert_frame_equal(result, ex_frame)
def test_float_nulls_to_ints(self):
    """Float NaN becomes null when casting to a nullable int schema."""
    # ARROW-2135
    # NOTE(review): the third list element was stripped from this copy;
    # a NaN is required by the `[1, 2, None]` assertion below.
    df = pd.DataFrame({"a": [1.0, 2.0, np.nan]})
    schema = pa.schema([pa.field("a", pa.int16(), nullable=True)])
    table = pa.Table.from_pandas(df, schema=schema, safe=False)
    assert table[0].to_pylist() == [1, 2, None]
    tm.assert_frame_equal(df, table.to_pandas())
def test_float_nulls_to_boolean(self):
    """Floats convert to bool by truthiness, keeping the null."""
    s = pd.Series([0.0, 1.0, 2.0, None, -3.0])
    expected = pd.Series([False, True, True, None, True])
    _check_array_roundtrip(s, expected=expected, type=pa.bool_())
def test_series_from_pandas_false_respected(self):
    """With ``from_pandas=False`` NaN stays a value, not a null."""
    # Check that explicit from_pandas=False is respected
    s = pd.Series([0.0, np.nan])
    arr = pa.array(s, from_pandas=False)
    assert arr.null_count == 0
    assert np.isnan(arr[1].as_py())
def test_integer_no_nulls(self):
    """Integer columns of every width round-trip with exact schema."""
    data = OrderedDict()
    fields = []

    numpy_dtypes = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('longlong', pa.int64()), ('ulonglong', pa.uint64())
    ]
    num_values = 100

    for dtype, arrow_dtype in numpy_dtypes:
        info = np.iinfo(dtype)
        # Clamp the bounds to what np.int_ can represent so randint
        # accepts them for the wider/unsigned dtypes.
        values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                   min(info.max, np.iinfo(np.int_).max),
                                   size=num_values)
        data[dtype] = values.astype(dtype)
        fields.append(pa.field(dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = pa.schema(fields)
    _check_pandas_roundtrip(df, expected_schema=schema)
def test_all_integer_types(self):
    """Every NumPy integer alias converts, as a frame and via pa.array."""
    # Test all Numpy integer aliases
    data = OrderedDict()
    numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
                    'byte', 'ubyte', 'short', 'ushort', 'intc', 'uintc',
                    'int_', 'uint', 'longlong', 'ulonglong']
    for dtype in numpy_dtypes:
        data[dtype] = np.arange(12, dtype=dtype)
    df = pd.DataFrame(data)
    # NOTE(review): the frame check was missing in this copy (`df` was
    # otherwise unused); a plain round-trip is the reconstruction.
    _check_pandas_roundtrip(df)

    # Do the same with pa.array()
    # (for some reason, it doesn't use the same code paths at all)
    for np_arr in data.values():
        arr = pa.array(np_arr)
        assert arr.to_pylist() == np_arr.tolist()
def test_integer_byteorder(self):
    """Native-order ints convert; byteswapped ones raise."""
    # Byteswapped arrays are not supported yet
    int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
    for dt in int_dtypes:
        for order in '=<>':
            data = np.array([1, 2, 42], dtype=order + dt)
            # NOTE(review): `np_arr` is never used inside this loop -- the
            # body converts `data` both times, so the strided view
            # `data[::2]` is not actually exercised. Left unchanged because
            # the behaviour for strided input is not visible here.
            for np_arr in (data, data[::2]):
                if data.dtype.isnative:
                    arr = pa.array(data)
                    assert arr.to_pylist() == data.tolist()
                else:
                    with pytest.raises(NotImplementedError):
                        arr = pa.array(data)
def test_integer_with_nulls(self):
    """Masked integer arrays come back as float64 with NaN at the nulls."""
    # pandas requires upcast to float dtype

    int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
    num_values = 100

    null_mask = np.random.randint(0, 10, size=num_values) < 3

    expected_cols = []
    arrays = []
    for name in int_dtypes:
        # NOTE(review): `name` is only used as the column label; the values
        # are generated with randint's default dtype for every column.
        values = np.random.randint(0, 100, size=num_values)

        arr = pa.array(values, mask=null_mask)
        # NOTE(review): the two append calls and the `columns=` argument
        # below were missing in this copy and are reconstructed.
        arrays.append(arr)

        expected = values.astype('f8')
        expected[null_mask] = np.nan

        expected_cols.append(expected)

    ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                            columns=int_dtypes)

    table = pa.Table.from_arrays(arrays, int_dtypes)
    result = table.to_pandas()

    tm.assert_frame_equal(result, ex_frame)
def test_array_from_pandas_type_cast(self):
    """An explicit ``type=`` casts int64 input to the target int8 type."""
    arr = np.arange(10, dtype='int64')

    target_type = pa.int8()

    result = pa.array(arr, type=target_type)
    expected = pa.array(arr.astype('int8'))
    assert result.equals(expected)
def test_boolean_no_nulls(self):
    """A bool column without nulls round-trips with a bool_ schema."""
    num_values = 100

    df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
    field = pa.field('bools', pa.bool_())
    schema = pa.schema([field])
    _check_pandas_roundtrip(df, expected_schema=schema)
def test_boolean_nulls(self):
    """A masked bool array converts to an object column with None."""
    # pandas requires upcast to object dtype
    num_values = 100

    mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 10, size=num_values) < 5

    arr = pa.array(values, mask=mask)

    expected = values.astype(object)
    expected[mask] = None

    field = pa.field('bools', pa.bool_())
    schema = pa.schema([field])
    ex_frame = pd.DataFrame({'bools': expected})

    table = pa.Table.from_arrays([arr], ['bools'])
    assert table.schema.equals(schema)
    result = table.to_pandas()

    tm.assert_frame_equal(result, ex_frame)
def test_boolean_to_int(self):
    """A bool-dtype Series converts to int64 as 0/1."""
    # test from dtype=bool
    s = pd.Series([True, True, False, True, True] * 2)
    expected = pd.Series([1, 1, 0, 1, 1] * 2)
    _check_array_roundtrip(s, expected=expected, type=pa.int64())
def test_boolean_objects_to_int(self):
    """Object-dtype bools are rejected when an int64 type is requested."""
    # test from dtype=object
    s = pd.Series([True, True, False, True, True] * 2, dtype=object)
    expected = pd.Series([1, 1, 0, 1, 1] * 2)
    expected_msg = 'Expected integer, got bool'
    with pytest.raises(pa.ArrowTypeError, match=expected_msg):
        _check_array_roundtrip(s, expected=expected, type=pa.int64())
def test_boolean_nulls_to_float(self):
    """Bools with a null convert to float64 as 1.0/0.0 plus a null."""
    # test from dtype=object
    s = pd.Series([True, True, False, None, True] * 2)
    expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2)
    _check_array_roundtrip(s, expected=expected, type=pa.float64())
def test_float_object_nulls(self):
    """An object column of floats and None is inferred as float64."""
    arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
    df = pd.DataFrame({'floats': arr})
    expected = pd.DataFrame({'floats': pd.to_numeric(arr)})
    field = pa.field('floats', pa.float64())
    schema = pa.schema([field])
    # NOTE(review): the trailing argument was cut off in this copy;
    # `expected_schema=schema` is the reconstruction (schema was unused).
    _check_pandas_roundtrip(df, expected=expected,
                            expected_schema=schema)
def test_float_with_null_as_integer(self):
    """Floats with NaN convert to every integer type with nulls."""
    # ARROW-2298
    s = pd.Series([np.nan, 1., 2., np.nan])

    types = [pa.int8(), pa.int16(), pa.int32(), pa.int64(),
             pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
    for ty in types:
        result = pa.array(s, type=ty)
        expected = pa.array([None, 1, 2, None], type=ty)
        assert result.equals(expected)

        df = pd.DataFrame({'has_nulls': s})
        schema = pa.schema([pa.field('has_nulls', ty)])
        # NOTE(review): the trailing argument of this call was cut off in
        # this copy; preserve_index=False is a reconstruction -- confirm.
        result = pa.Table.from_pandas(df, schema=schema,
                                      preserve_index=False)
        assert result[0].data.chunk(0).equals(expected)
def test_int_object_nulls(self):
    """An object column of ints and None is inferred as int64."""
    arr = np.array([None, 1, np.int64(3)] * 5, dtype=object)
    df = pd.DataFrame({'ints': arr})
    expected = pd.DataFrame({'ints': pd.to_numeric(arr)})
    field = pa.field('ints', pa.int64())
    schema = pa.schema([field])
    # NOTE(review): the trailing argument was cut off in this copy;
    # `expected_schema=schema` is the reconstruction (schema was unused).
    _check_pandas_roundtrip(df, expected=expected,
                            expected_schema=schema)
def test_boolean_object_nulls(self):
    """An object column of bools and None round-trips as bool_."""
    arr = np.array([False, None, True] * 100, dtype=object)
    df = pd.DataFrame({'bools': arr})
    field = pa.field('bools', pa.bool_())
    schema = pa.schema([field])
    _check_pandas_roundtrip(df, expected_schema=schema)
def test_all_nulls_cast_numeric(self):
    """An all-None object array can be given an explicit numeric type."""
    arr = np.array([None], dtype=object)

    def _check_type(t):
        a2 = pa.array(arr, type=t)
        assert a2.type == t
        assert a2[0].as_py() is None

    # NOTE(review): the invocations of `_check_type` were missing in this
    # copy (the helper was defined but never called). The two types below
    # are a reconstruction -- confirm against upstream.
    _check_type(pa.int32())
    _check_type(pa.float64())
def test_half_floats_from_numpy(self):
    """float16 input keeps NaN by default and nulls it with from_pandas."""
    arr = np.array([1.5, np.nan], dtype=np.float16)
    a = pa.array(arr, type=pa.float16())
    x, y = a.to_pylist()
    assert isinstance(x, np.float16)
    assert x == 1.5
    assert isinstance(y, np.float16)
    assert np.isnan(y)

    # With pandas semantics the NaN is converted to a null.
    a = pa.array(arr, type=pa.float16(), from_pandas=True)
    x, y = a.to_pylist()
    assert isinstance(x, np.float16)
    assert x == 1.5
    assert y is None
@pytest.mark.parametrize('dtype',
                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
def test_array_integer_object_nulls_option(dtype):
    """``integer_object_nulls=True`` yields objects with None for nulls.

    The decorator's first line and the ``else:`` branch were missing in
    this copy of the file and are restored here.
    """
    num_values = 100

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 100, size=num_values, dtype=dtype)

    array = pa.array(values, mask=null_mask)

    if null_mask.any():
        expected = values.astype('O')
        expected[null_mask] = None
    else:
        expected = values

    result = array.to_pandas(integer_object_nulls=True)

    np.testing.assert_equal(result, expected)
@pytest.mark.parametrize('dtype',
                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
def test_table_integer_object_nulls_option(dtype):
    """Table variant of the ``integer_object_nulls=True`` check.

    The decorator's first line and the ``else:`` branch were missing in
    this copy of the file and are restored here.
    """
    num_values = 100

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 100, size=num_values, dtype=dtype)

    array = pa.array(values, mask=null_mask)

    if null_mask.any():
        expected = values.astype('O')
        expected[null_mask] = None
    else:
        expected = values

    expected = pd.DataFrame({dtype: expected})

    table = pa.Table.from_arrays([array], [dtype])
    result = table.to_pandas(integer_object_nulls=True)

    tm.assert_frame_equal(result, expected)
class TestConvertDateTimeLikeTypes(object):
    """
    Conversion tests for datetime- and timestamp-like types (date64, etc.).
    """
# Builds a frame with a 'datetime64' column and the matching
# timestamp('ns') schema.
# NOTE(review): the array literal and whatever check follows the schema are
# not visible in this copy of the file -- restore from upstream.
def test_timestamps_notimezone_no_nulls(self):
df = pd.DataFrame({
'datetime64': np.array([
field = pa.field('datetime64', pa.timestamp('ns'))
schema = pa.schema([field])
# Builds a frame with a 'datetime64' column and the matching
# timestamp('ns') schema; presumably the data contains nulls (per the test
# name) -- TODO confirm.
# NOTE(review): the array literal and whatever check follows the schema are
# not visible in this copy of the file -- restore from upstream.
def test_timestamps_notimezone_nulls(self):
df = pd.DataFrame({
'datetime64': np.array([
field = pa.field('datetime64', pa.timestamp('ns'))
schema = pa.schema([field])
# Builds two frames whose 'datetime64' column is localized to 'US/Eastern'.
# NOTE(review): both array literals and the checks applied to each frame
# are not visible in this copy of the file -- restore from upstream.
def test_timestamps_with_timezone(self):
df = pd.DataFrame({
'datetime64': np.array([
df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern')
# drop-in a null and ns instead of ms
df = pd.DataFrame({
'datetime64': np.array([
df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern')
def test_python_datetime(self):
    """Object columns of ``datetime`` become Arrow timestamp arrays."""
    # ARROW-2106
    # NOTE(review): the base expression before `+ timedelta(...)` was
    # stripped from this copy; a "current local time" call is the
    # reconstruction (today()/now() are equivalent here) -- confirm.
    date_array = [datetime.today() + timedelta(days=x) for x in range(10)]
    df = pd.DataFrame({
        'datetime': pd.Series(date_array, dtype=object)
    })

    table = pa.Table.from_pandas(df)
    assert isinstance(table[0].data.chunk(0), pa.TimestampArray)

    result = table.to_pandas()
    expected_df = pd.DataFrame({
        'datetime': date_array
    })
    tm.assert_frame_equal(expected_df, result)
def test_python_datetime_with_pytz_tzinfo(self):
    """Datetimes carrying various pytz tzinfo objects are converted."""
    for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]:
        values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
        df = pd.DataFrame({'datetime': values})
        # NOTE(review): the check on `df` was missing in this copy; a plain
        # round-trip is the reconstruction -- confirm against upstream.
        _check_pandas_roundtrip(df)
@h.given(st.none() | tzst.timezones())
def test_python_datetime_with_pytz_timezone(self, tz):
    """Property test over ``None`` and hypothesis-generated pytz zones."""
    values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
    df = pd.DataFrame({'datetime': values})
    # NOTE(review): the check on `df` was missing in this copy; a plain
    # round-trip is the reconstruction -- confirm against upstream.
    _check_pandas_roundtrip(df)
@pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since '
                                    'python version 3.2')
def test_python_datetime_with_timezone_tzinfo(self):
    """``datetime.timezone`` offsets come back as pytz fixed offsets."""
    # Imported locally because the name does not exist on Python 2.
    from datetime import timezone

    values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=pytz.utc)]
    df = pd.DataFrame({'datetime': values})
    # NOTE(review): the check on this first frame was missing in this copy;
    # a plain round-trip is the reconstruction -- confirm against upstream.
    _check_pandas_roundtrip(df)

    # datetime.timezone is going to be pytz.FixedOffset
    hours = 1
    tz_timezone = timezone(timedelta(hours=hours))
    tz_pytz = pytz.FixedOffset(hours * 60)
    values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)]
    values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
    df = pd.DataFrame({'datetime': values})
    df_exp = pd.DataFrame({'datetime': values_exp})
    _check_pandas_roundtrip(df, expected=df_exp)
def test_python_datetime_subclass(self):
    """Subclasses of ``datetime`` convert to Arrow timestamps."""

    class MyDatetime(datetime):
        # NOTE(review): the original comment read "see <link>" but the
        # link is missing from this copy of the file.
        nanosecond = 0.0

    date_array = [MyDatetime(2000, 1, 1, 1, 1, 1)]
    df = pd.DataFrame({"datetime": pd.Series(date_array, dtype=object)})

    table = pa.Table.from_pandas(df)
    assert isinstance(table[0].data.chunk(0), pa.TimestampArray)

    result = table.to_pandas()
    expected_df = pd.DataFrame({"datetime": date_array})

    # Normalize the subclass instances to pandas timestamps for comparison.
    expected_df["datetime"] = pd.to_datetime(expected_df["datetime"])

    tm.assert_frame_equal(expected_df, result)
def test_python_date_subclass(self):
    """Subclasses of ``date`` convert to date32 and back to plain dates."""

    class MyDate(date):
        pass

    date_array = [MyDate(2000, 1, 1)]
    df = pd.DataFrame({"date": pd.Series(date_array, dtype=object)})

    table = pa.Table.from_pandas(df)
    assert isinstance(table[0].data.chunk(0), pa.Date32Array)

    result = table.to_pandas()
    expected_df = pd.DataFrame(
        {"date": np.array([date(2000, 1, 1)], dtype=object)}
    )
    tm.assert_frame_equal(expected_df, result)
def test_datetime64_to_date32(self):
    """A pandas column derived from dates converts back to date32."""
    # ARROW-1718
    arr = pa.array([date(2017, 10, 23), None])
    c = pa.Column.from_array("d", arr)
    s = c.to_pandas()

    arr2 = pa.Array.from_pandas(s, type=pa.date32())

    assert arr2.equals(arr.cast('date32'))
# Converts a pandas datetime index to a date64 array (optionally masked)
# and compares it with an array built from the three `date` objects below.
# NOTE(review): parts of the parametrize list and the `pd.to_datetime`
# input values are not visible in this copy of the file -- restore from
# upstream.
@pytest.mark.parametrize('mask', [
np.array([True, False, False]),
def test_pandas_datetime_to_date64(self, mask):
s = pd.to_datetime([
arr = pa.Array.from_pandas(s, type=pa.date64(), mask=mask)
data = np.array([
date(2018, 5, 10),
date(2018, 5, 11),
date(2018, 5, 12)
expected = pa.array(data, mask=mask, type=pa.date64())
assert arr.equals(expected)
# Expects ArrowInvalid (message below) when converting the datetime values
# to date64.
# NOTE(review): parts of the parametrize list and the `pd.to_datetime`
# input values are not visible in this copy of the file -- restore from
# upstream.
@pytest.mark.parametrize('mask', [
np.array([True, False, False])
def test_pandas_datetime_to_date64_failures(self, mask):
s = pd.to_datetime([
expected_msg = 'Timestamp value had non-zero intraday milliseconds'
with pytest.raises(pa.ArrowInvalid, match=expected_msg):
pa.Array.from_pandas(s, type=pa.date64(), mask=mask)
def test_array_types_date_as_object(self):
    """Date arrays convert to objects by default, datetime64 on request."""
    data = [date(2000, 1, 1),
            None,
            date(1970, 1, 1),
            date(2040, 2, 26)]
    expected = np.array(['2000-01-01',
                         None,
                         '1970-01-01',
                         '2040-02-26'], dtype='datetime64')

    objects = [
        # The second value is the expected value for date_as_object=False
        (pa.array(data), expected),
        (pa.chunked_array([data]), expected),
        (pa.column('date', [data]), expected.astype('M8[ns]'))]

    assert objects[0][0].equals(pa.array(expected))

    for obj, expected_datetime64 in objects:
        result = obj.to_pandas()
        expected_obj = expected.astype(object)
        assert result.dtype == expected_obj.dtype
        npt.assert_array_equal(result, expected_obj)

        result = obj.to_pandas(date_as_object=False)
        assert result.dtype == expected_datetime64.dtype
        npt.assert_array_equal(result, expected_datetime64)
def test_table_convert_date_as_object(self):
    """Table.to_pandas honours ``date_as_object`` for date columns."""
    df = pd.DataFrame({
        'date': [date(2000, 1, 1),
                 None,
                 date(1970, 1, 1),
                 date(2040, 2, 26)]})

    table = pa.Table.from_pandas(df, preserve_index=False)

    df_datetime = table.to_pandas(date_as_object=False)
    df_object = table.to_pandas()

    # NOTE(review): the trailing argument of the first comparison was cut
    # off in this copy; `check_dtype=True` mirrors the call that follows.
    tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime,
                          check_dtype=True)
    tm.assert_frame_equal(df, df_object, check_dtype=True)
def test_date_infer(self):
    """A column of ``date`` objects is inferred as date32."""
    df = pd.DataFrame({
        'date': [date(2000, 1, 1),
                 None,
                 date(1970, 1, 1),
                 date(2040, 2, 26)]})
    table = pa.Table.from_pandas(df, preserve_index=False)
    field = pa.field('date', pa.date32())

    # schema's metadata is generated by from_pandas conversion
    expected_schema = pa.schema([field], metadata=table.schema.metadata)
    assert table.schema.equals(expected_schema)

    result = table.to_pandas()
    tm.assert_frame_equal(result, df)
def test_date_mask(self):
    """A mask turns the selected date entries into nulls."""
    # NOTE(review): the dtype argument of this array was cut off in this
    # copy; 'datetime64[D]' mirrors the expected array below -- confirm.
    arr = np.array([date(2017, 4, 3), date(2017, 4, 4)],
                   dtype='datetime64[D]')
    mask = [True, False]
    result = pa.array(arr, mask=np.array(mask))
    expected = np.array([None, date(2017, 4, 4)], dtype='datetime64[D]')
    expected = pa.array(expected, from_pandas=True)
    assert expected.equals(result)
# Converts an object array of dates to date32 and date64, compares both
# with arrays built from integer day / millisecond values plus a null
# mask, then converts the resulting table back to pandas.
# NOTE(review): several lines are not visible in this copy of the file
# (an element of `arr` -- the mask and integer arrays have four entries
# but only three dates are shown --, the tail of `ex_values`, the
# `columns=` arguments and the final object-frame assertion). Restore from
# upstream before relying on this test.
def test_date_objects_typed(self):
arr = np.array([
date(2017, 4, 3),
date(2017, 4, 4),
date(2017, 4, 5)], dtype=object)
arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
# 86400000 ms per day: date64 stores milliseconds instead of days.
arr_i8 = arr_i4.astype('int64') * 86400000
mask = np.array([False, True, False, False])
t32 = pa.date32()
t64 = pa.date64()
a32 = pa.array(arr, type=t32)
a64 = pa.array(arr, type=t64)
a32_expected = pa.array(arr_i4, mask=mask, type=t32)
a64_expected = pa.array(arr_i8, mask=mask, type=t64)
assert a32.equals(a32_expected)
assert a64.equals(a64_expected)
# Test converting back to pandas
colnames = ['date32', 'date64']
table = pa.Table.from_arrays([a32, a64], colnames)
ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
ex_values[1] = pd.NaT.value
ex_datetime64ns = ex_values.astype('datetime64[ns]')
expected_pandas = pd.DataFrame({'date32': ex_datetime64ns,
'date64': ex_datetime64ns},
table_pandas = table.to_pandas(date_as_object=False)
tm.assert_frame_equal(table_pandas, expected_pandas)
table_pandas_objects = table.to_pandas()
ex_objects = ex_values.astype('object')
expected_pandas_objects = pd.DataFrame({'date32': ex_objects,
'date64': ex_objects},
def test_dates_from_integers(self):
    """Integer day / millisecond counts can be typed as date32 / date64."""
    t1 = pa.date32()
    t2 = pa.date64()

    arr = np.array([17259, 17260, 17261], dtype='int32')
    # 86400000 ms per day: date64 stores milliseconds instead of days.
    arr2 = arr.astype('int64') * 86400000

    a1 = pa.array(arr, type=t1)
    a2 = pa.array(arr2, type=t2)

    expected = date(2017, 4, 3)
    assert a1[0].as_py() == expected
    assert a2[0].as_py() == expected
# Expected-failure test for a 'timedelta' column built with np.arange.
# NOTE(review): the rest of the xfail decorator, the remaining np.arange
# arguments and whatever check follows are not visible in this copy of the
# file -- restore from upstream.
@pytest.mark.xfail(reason="not supported ATM",
def test_timedelta(self):
# TODO(jreback): Pandas only support ns resolution
# Arrow supports ??? for resolution
df = pd.DataFrame({
'timedelta': np.arange(start=0, stop=3 * 86400000,
def test_pytime_from_pandas(self):
pytimes = [time(1, 2, 3, 1356),
time(4, 5, 6, 1356)]
# microseconds
t1 = pa.time64('us')
aobjs = np.array(pytimes + [None], dtype=object)
parr = pa.array(aobjs)
assert parr.type == t1
assert parr[0].as_py() == pytimes[0]
assert parr[1].as_py() == pytimes[1]
assert parr[2] is pa.NA
# DataFrame
df = pd.DataFrame({'times': aobjs})
batch = pa.RecordBatch.from_pandas(df)
assert batch[0].equals(parr)
# Test ndarray of int64 values
arr = np.array([_pytime_to_micros(v) for v in pytimes],
a1 = pa.array(arr, type=pa.time64('us'))
assert a1[0].as_py() == pytimes[0]
a2 = pa.array(arr * 1000, type=pa.time64('ns'))
assert a2[0].as_py() == pytimes[0]
a3 = pa.array((arr / 1000).astype('i4'),
assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)
a4 = pa.array((arr / 1000000).astype('i4'),
assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
def test_arrow_time_to_pandas(self):
pytimes = [time(1, 2, 3, 1356),
time(4, 5, 6, 1356),
time(0, 0, 0)]
expected = np.array(pytimes[:2] + [None])
expected_ms = np.array([x.replace(microsecond=1000)
for x in pytimes[:2]] +
expected_s = np.array([x.replace(microsecond=0)
for x in pytimes[:2]] +
arr = np.array([_pytime_to_micros(v) for v in pytimes],
arr = np.array([_pytime_to_micros(v) for v in pytimes],
null_mask = np.array([False, False, True], dtype=bool)
a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
a2 = pa.array(arr * 1000, mask=null_mask,
a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
arr = a1.to_pandas()
assert (arr == expected).all()
arr = a2.to_pandas()
assert (arr == expected).all()
arr = a3.to_pandas()
assert (arr == expected_ms).all()
arr = a4.to_pandas()
assert (arr == expected_s).all()
df = batch.to_pandas()
expected_df = pd.DataFrame({'time64[us]': expected,
'time64[ns]': expected,
'time32[ms]': expected_ms,
'time32[s]': expected_s},
tm.assert_frame_equal(df, expected_df)
def test_numpy_datetime64_columns(self):
datetime64_ns = np.array([
datetime64_us = np.array([
datetime64_ms = np.array([
datetime64_s = np.array([
@pytest.mark.parametrize('dtype', [pa.date32(), pa.date64()])
def test_numpy_datetime64_day_unit(self, dtype):
datetime64_d = np.array([
_check_array_from_pandas_roundtrip(datetime64_d, type=dtype)
def test_array_from_pandas_date_with_mask(self):
m = np.array([True, False, True])
data = pd.Series([
date(1990, 1, 1),
date(1991, 1, 1),
date(1992, 1, 1)
result = pa.Array.from_pandas(data, mask=m)
expected = pd.Series([None, date(1991, 1, 1), None])
assert pa.Array.from_pandas(expected).equals(result)
def test_fixed_offset_timezone(self):
df = pd.DataFrame({
'a': [
pd.Timestamp('2012-11-11 00:00:00+01:00'),
# ----------------------------------------------------------------------
# Conversion tests for string and binary types.
class TestConvertStringLikeTypes(object):
def test_pandas_unicode(self):
    """Round-trip a unicode column that mixes text, None and NaN."""
    repeats = 1000
    base_values = [u'foo', None, u'bar', u'mañana', np.nan]
    frame = pd.DataFrame({'strings': base_values * repeats})

    # The column is expected to come out as an Arrow string field.
    string_schema = pa.schema([pa.field('strings', pa.string())])
    _check_pandas_roundtrip(frame, expected_schema=string_schema)
def test_bytes_to_binary(self):
values = [u'qux', b'foo', None, bytearray(b'barz'), 'qux', np.nan]
df = pd.DataFrame({'strings': values})
table = pa.Table.from_pandas(df)
assert table[0].type == pa.binary()
values2 = [b'qux', b'foo', None, b'barz', b'qux', np.nan]
expected = pd.DataFrame({'strings': values2})
_check_pandas_roundtrip(df, expected)
def test_bytes_exceed_2gb(self):
v1 = b'x' * 100000000
v2 = b'x' * 147483646
# ARROW-2227, hit exactly 2GB on the nose
df = pd.DataFrame({
'strings': [v1] * 20 + [v2] + ['x'] * 20
arr = pa.array(df['strings'])
assert isinstance(arr, pa.ChunkedArray)
assert arr.num_chunks == 2
arr = None
table = pa.Table.from_pandas(df)
assert table[0].data.num_chunks == 2
def test_fixed_size_bytes(self):
values = [b'foo', None, bytearray(b'bar'), None, None, b'hey']
df = pd.DataFrame({'strings': values})
schema = pa.schema([pa.field('strings', pa.binary(3))])
table = pa.Table.from_pandas(df, schema=schema)
assert table.schema[0].type == schema[0].type
assert table.schema[0].name == schema[0].name
result = table.to_pandas()
tm.assert_frame_equal(result, df)
def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
    """A binary(3) schema rejects a column with a 2-byte value."""
    # b'ba' is the only entry whose length differs from 3.
    mixed_width = [b'foo', None, b'ba', None, None, b'hey']
    frame = pd.DataFrame({'strings': mixed_width})
    fixed_width_schema = pa.schema([pa.field('strings', pa.binary(3))])

    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_pandas(frame, schema=fixed_width_schema)
def test_variable_size_bytes(self):
s = pd.Series([b'123', b'', b'a', None])
_check_series_roundtrip(s, type_=pa.binary())
def test_binary_from_bytearray(self):
s = pd.Series([bytearray(b'123'), bytearray(b''), bytearray(b'a'),
# Explicitly set type
_check_series_roundtrip(s, type_=pa.binary())
# Infer type from bytearrays
_check_series_roundtrip(s, expected_pa_type=pa.binary())
def test_table_empty_str(self):
values = ['', '', '', '', '']
df = pd.DataFrame({'strings': values})
field = pa.field('strings', pa.string())
schema = pa.schema([field])
table = pa.Table.from_pandas(df, schema=schema)
result1 = table.to_pandas(strings_to_categorical=False)
expected1 = pd.DataFrame({'strings': values})
tm.assert_frame_equal(result1, expected1, check_dtype=True)
result2 = table.to_pandas(strings_to_categorical=True)
expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
tm.assert_frame_equal(result2, expected2, check_dtype=True)
def test_selective_categoricals(self):
    """``categories`` picks which columns come back as Categorical."""
    values = ['', '', '', '', '']
    source = pd.DataFrame({'strings': values})
    string_schema = pa.schema([pa.field('strings', pa.string())])
    table = pa.Table.from_pandas(source, schema=string_schema)

    as_strings = pd.DataFrame({'strings': values})
    as_categorical = pd.DataFrame({'strings': pd.Categorical(values)})

    # Both list and tuple spellings are exercised, selecting the column
    # and selecting nothing.
    cases = [
        (['strings'], as_categorical),
        ([], as_strings),
        (('strings',), as_categorical),
        (tuple(), as_strings),
    ]
    for categories, expected in cases:
        converted = table.to_pandas(categories=categories)
        tm.assert_frame_equal(converted, expected, check_dtype=True)
def test_to_pandas_categorical_zero_length(self):
# ARROW-3586
array = pa.array([], type=pa.int32())
table = pa.Table.from_arrays(arrays=[array], names=['col'])
# This would segfault under 0.11.0
def test_table_str_to_categorical_without_na(self):
values = ['a', 'a', 'b', 'b', 'c']
df = pd.DataFrame({'strings': values})
field = pa.field('strings', pa.string())
schema = pa.schema([field])
table = pa.Table.from_pandas(df, schema=schema)
result = table.to_pandas(strings_to_categorical=True)
expected = pd.DataFrame({'strings': pd.Categorical(values)})
tm.assert_frame_equal(result, expected, check_dtype=True)
with pytest.raises(pa.ArrowInvalid):
def test_table_str_to_categorical_with_na(self):
values = [None, 'a', 'b', np.nan]
df = pd.DataFrame({'strings': values})
field = pa.field('strings', pa.string())
schema = pa.schema([field])
table = pa.Table.from_pandas(df, schema=schema)
result = table.to_pandas(strings_to_categorical=True)
expected = pd.DataFrame({'strings': pd.Categorical(values)})
tm.assert_frame_equal(result, expected, check_dtype=True)
with pytest.raises(pa.ArrowInvalid):
# Regression test for ARROW-2101
def test_array_of_bytes_to_strings(self):
converted = pa.array(np.array([b'x'], dtype=object), pa.string())
assert converted.type == pa.string()
# Make sure that if an ndarray of bytes is passed to the array
# constructor and the type is string, it will fail if those bytes
# cannot be converted to utf-8
def test_array_of_bytes_to_strings_bad_data(self):
with pytest.raises(
match="was not a utf8 string"):
pa.array(np.array([b'\x80\x81'], dtype=object), pa.string())
def test_numpy_string_array_to_fixed_size_binary(self):
arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
converted = pa.array(arr, type=pa.binary(3))
expected = pa.array(list(arr), type=pa.binary(3))
assert converted.equals(expected)
mask = np.array([True, False, True])
converted = pa.array(arr, type=pa.binary(3), mask=mask)
expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3))
assert converted.equals(expected)
with pytest.raises(pa.lib.ArrowInvalid,
match=r'Got bytestring of length 3 \(expected 4\)'):
arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
pa.array(arr, type=pa.binary(4))
with pytest.raises(
match=r'Got bytestring of length 12 \(expected 3\)'):
arr = np.array([b'foo', b'bar', b'baz'], dtype='|U3')
pa.array(arr, type=pa.binary(3))
class TestConvertDecimalTypes(object):
"""Conversion tests for decimal types."""
decimal32 = [
decimal64 = [
decimal128 = [
@pytest.mark.parametrize(('values', 'expected_type'), [
pytest.param(decimal32, pa.decimal128(7, 3), id='decimal32'),
pytest.param(decimal64, pa.decimal128(12, 6), id='decimal64'),
pytest.param(decimal128, pa.decimal128(26, 11), id='decimal128')
def test_decimal_from_pandas(self, values, expected_type):
expected = pd.DataFrame({'decimals': values})
table = pa.Table.from_pandas(expected, preserve_index=False)
field = pa.field('decimals', expected_type)
# schema's metadata is generated by from_pandas conversion
expected_schema = pa.schema([field], metadata=table.schema.metadata)
assert table.schema.equals(expected_schema)
@pytest.mark.parametrize('values', [
pytest.param(decimal32, id='decimal32'),
pytest.param(decimal64, id='decimal64'),
pytest.param(decimal128, id='decimal128')
def test_decimal_to_pandas(self, values):
expected = pd.DataFrame({'decimals': values})
converted = pa.Table.from_pandas(expected)
df = converted.to_pandas()
tm.assert_frame_equal(df, expected)
def test_decimal_fails_with_truncation(self):
    """Decimals with more fractional digits than the scale are rejected."""
    # Each value carries one more fractional digit than its target scale.
    cases = [
        (decimal.Decimal('1.234'), pa.decimal128(10, 2)),
        (decimal.Decimal('1.2345'), pa.decimal128(10, 3)),
    ]
    for value, decimal_type in cases:
        with pytest.raises(pa.ArrowInvalid):
            pa.array([value], type=decimal_type)
def test_decimal_with_different_precisions(self):
data = [
series = pd.Series(data)
array = pa.array(series)
assert array.to_pylist() == data
assert array.type == pa.decimal128(3, 3)
array = pa.array(data, type=pa.decimal128(12, 5))
expected = [decimal.Decimal('0.01000'), decimal.Decimal('0.00100')]
assert array.to_pylist() == expected
def test_decimal_with_None_explicit_type(self):
series = pd.Series([decimal.Decimal('3.14'), None])
_check_series_roundtrip(series, type_=pa.decimal128(12, 5))
# Test that having all None values still produces decimal array
series = pd.Series([None] * 2)
_check_series_roundtrip(series, type_=pa.decimal128(12, 5))
def test_decimal_with_None_infer_type(self):
series = pd.Series([decimal.Decimal('3.14'), None])
_check_series_roundtrip(series, expected_pa_type=pa.decimal128(3, 2))
def test_strided_objects(self, tmpdir):
# see ARROW-3053
data = {
'a': {0: 'a'},
'b': {0: decimal.Decimal('0.0')}
# This yields strided objects
df = pd.DataFrame.from_dict(data)
class TestConvertListTypes(object):
"""Conversion tests for list<> types."""
def test_column_of_arrays(self):
df, schema = dataframe_with_arrays()
_check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
# schema's metadata is generated by from_pandas conversion
expected_schema = schema.add_metadata(table.schema.metadata)
assert table.schema.equals(expected_schema)
for column in df.columns:
field = schema.field_by_name(column)
_check_array_roundtrip(df[column], type=field.type)
def test_column_of_arrays_to_py(self):
# Test regression in ARROW-1199 not caught in above test
dtype = 'i1'
arr = np.array([
np.arange(10, dtype=dtype),
np.arange(5, dtype=dtype),
np.arange(1, dtype=dtype)
type_ = pa.list_(pa.int8())
parr = pa.array(arr, type=type_)
assert parr[0].as_py() == list(range(10))
assert parr[1].as_py() == list(range(5))
assert parr[2].as_py() is None
assert parr[3].as_py() == [0]
def test_column_of_boolean_list(self):
# ARROW-4370: Table to pandas conversion fails for list of bool
array = pa.array([[True, False], [True]], type=pa.list_(pa.bool_()))
table = pa.Table.from_arrays([array], names=['col1'])
df = table.to_pandas()
expected_df = pd.DataFrame({'col1': [[True, False], [True]]})
tm.assert_frame_equal(df, expected_df)
def test_column_of_decimal_list(self):
array = pa.array([[decimal.Decimal('1'), decimal.Decimal('2')],
type=pa.list_(pa.decimal128(2, 1)))
table = pa.Table.from_arrays([array], names=['col1'])
df = table.to_pandas()
expected_df = pd.DataFrame(
{'col1': [[decimal.Decimal('1'), decimal.Decimal('2')],
tm.assert_frame_equal(df, expected_df)
def test_column_of_lists(self):
df, schema = dataframe_with_lists()
_check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
# schema's metadata is generated by from_pandas conversion
expected_schema = schema.add_metadata(table.schema.metadata)
assert table.schema.equals(expected_schema)
for column in df.columns:
field = schema.field_by_name(column)
_check_array_roundtrip(df[column], type=field.type)
def test_column_of_lists_first_empty(self):
# ARROW-2124
num_lists = [[], [2, 3, 4], [3, 6, 7, 8], [], [2]]
series = pd.Series([np.array(s, dtype=float) for s in num_lists])
arr = pa.array(series)
result = pd.Series(arr.to_pandas())
tm.assert_series_equal(result, series)
def test_column_of_lists_chunked(self):
# ARROW-1357
df = pd.DataFrame({
'lists': np.array([
[1, 2],
[2, 3],
[4, 5],
[6, 7],
[8, 9]
], dtype=object)
schema = pa.schema([
pa.field('lists', pa.list_(pa.int64()))
t1 = pa.Table.from_pandas(df[:2], schema=schema)
t2 = pa.Table.from_pandas(df[2:], schema=schema)
table = pa.concat_tables([t1, t2])
result = table.to_pandas()
tm.assert_frame_equal(result, df)
def test_column_of_lists_chunked2(self):
    """Two concatenated list-typed tables convert to one pandas column."""
    first_rows = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11],
                  [12, 13], [14, 15], [16, 17]]
    second_rows = [[8, 9], [18, 19]]

    # One single-column table per chunk, then concatenate them.
    tables = [pa.Table.from_arrays([pa.array(rows)], names=['a'])
              for rows in (first_rows, second_rows)]
    converted = pa.concat_tables(tables).to_pandas()

    expected = pd.DataFrame({'a': first_rows + second_rows})
    tm.assert_frame_equal(converted, expected)
def test_column_of_lists_strided(self):
df, schema = dataframe_with_lists()
df = pd.concat([df] * 6, ignore_index=True)
arr = df['int64'].values[::3]
assert arr.strides[0] != 8
def test_nested_lists_all_none(self):
data = np.array([[None, None], None], dtype=object)
arr = pa.array(data)
expected = pa.array(list(data))
assert arr.equals(expected)
assert arr.type == pa.list_(pa.null())
data2 = np.array([None, None, [None, None],
np.array([None, None], dtype=object)],
arr = pa.array(data2)
expected = pa.array([None, None, [None, None], [None, None]])
assert arr.equals(expected)
def test_nested_lists_all_empty(self):
# ARROW-2128
data = pd.Series([[], [], []])
arr = pa.array(data)
expected = pa.array(list(data))
assert arr.equals(expected)
assert arr.type == pa.list_(pa.null())
def test_nested_list_first_empty(self):
# ARROW-2711
data = pd.Series([[], [u"a"]])
arr = pa.array(data)
expected = pa.array(list(data))
assert arr.equals(expected)
assert arr.type == pa.list_(pa.string())
def test_nested_smaller_ints(self):
    """Nested int8 and float32 ndarrays keep their element type.

    Covers the type inference bugs tracked as ARROW-1345 and ARROW-2008.
    """
    int8_series = pd.Series([np.array([1, 2, 3], dtype='i1'), None])
    # Convert both the Series and its underlying ndarray.
    from_series = pa.array(int8_series)
    from_values = pa.array(int8_series.values)
    int8_expected = pa.array([[1, 2, 3], None], type=pa.list_(pa.int8()))
    assert from_series.equals(int8_expected)
    assert from_values.equals(int8_expected)

    float32_series = pd.Series([np.array([1, 2, 3], dtype='f4'), None])
    from_float32 = pa.array(float32_series)
    float32_expected = pa.array([[1, 2, 3], None],
                                type=pa.list_(pa.float32()))
    assert from_float32.equals(float32_expected)
def test_infer_lists(self):
data = OrderedDict([
('nan_ints', [[None, 1], [2, 3]]),
('ints', [[0, 1], [2, 3]]),
('strs', [[None, u'b'], [u'c', u'd']]),
('nested_strs', [[[None, u'b'], [u'c', u'd']], None])
df = pd.DataFrame(data)
expected_schema = pa.schema([
pa.field('nan_ints', pa.list_(pa.int64())),
pa.field('ints', pa.list_(pa.int64())),
pa.field('strs', pa.list_(pa.string())),
pa.field('nested_strs', pa.list_(pa.list_(pa.string())))
_check_pandas_roundtrip(df, expected_schema=expected_schema)
def test_infer_numpy_array(self):
data = OrderedDict([
('ints', [
np.array([0, 1], dtype=np.int64),
np.array([2, 3], dtype=np.int64)
df = pd.DataFrame(data)
expected_schema = pa.schema([
pa.field('ints', pa.list_(pa.int64()))
_check_pandas_roundtrip(df, expected_schema=expected_schema)
@pytest.mark.parametrize('t,data,expected', [
[[1, 2], [3], None],
[None, [3], None]
[[u'aaa', u'bb'], [u'c'], None],
[None, [u'c'], None]
[[None, None], [None], None],
[None, [None], None]
def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
m = np.array([True, False, True])
s = pd.Series(data)
result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))
assert pa.Array.from_pandas(expected,
def test_empty_list_roundtrip(self):
empty_list_array = np.empty((3,), dtype=object)
df = pd.DataFrame({'a': np.array(['1', '2', '3']),
'b': empty_list_array})
tbl = pa.Table.from_pandas(df)
result = tbl.to_pandas()
tm.assert_frame_equal(result, df)
def test_array_from_nested_arrays(self):
    """Type inference on object columns of ndarrays matches the schema.

    For every field in the schema returned by ``dataframe_with_arrays``,
    converting the column without an explicit type must produce the same
    array as converting it with ``field.type``.
    """
    df, schema = dataframe_with_arrays()
    for field in schema:
        # Select the DataFrame column that corresponds to this field.
        arr = df[field.name].values
        expected = pa.array(list(arr), type=field.type)
        result = pa.array(arr)
        assert result.type == field.type  # == list<scalar>
        assert result.equals(expected)
class TestConvertStructTypes(object):
"""Conversion tests for struct types."""
def test_pandas_roundtrip(self):
df = pd.DataFrame({'dicts': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})
expected_schema = pa.schema([
('dicts', pa.struct([('a', pa.int64()), ('b', pa.int64())])),
_check_pandas_roundtrip(df, expected_schema=expected_schema)
# specifying schema explicitly in from_pandas
df, schema=expected_schema, expected_schema=expected_schema)
def test_to_pandas(self):
ints = pa.array([None, 2, 3], type=pa.int64())
strs = pa.array([u'a', None, u'c'], type=pa.string())
bools = pa.array([True, False, None], type=pa.bool_())
arr = pa.StructArray.from_arrays(
[ints, strs, bools],
['ints', 'strs', 'bools'])
expected = pd.Series([
{'ints': None, 'strs': u'a', 'bools': True},
{'ints': 2, 'strs': None, 'bools': False},
{'ints': 3, 'strs': u'c', 'bools': None},
series = pd.Series(arr.to_pandas())
tm.assert_series_equal(series, expected)
def test_from_numpy(self):
dt = np.dtype([('x', np.int32),
(('y_title', 'y'), np.bool_)])
ty = pa.struct([pa.field('x', pa.int32()),
pa.field('y', pa.bool_())])
data = np.array([], dtype=dt)
arr = pa.array(data, type=ty)
assert arr.to_pylist() == []
data = np.array([(42, True), (43, False)], dtype=dt)
arr = pa.array(data, type=ty)
assert arr.to_pylist() == [{'x': 42, 'y': True},
{'x': 43, 'y': False}]
# With mask
arr = pa.array(data, mask=np.bool_([False, True]), type=ty)
assert arr.to_pylist() == [{'x': 42, 'y': True}, None]
# Trivial struct type
dt = np.dtype([])
ty = pa.struct([])
data = np.array([], dtype=dt)
arr = pa.array(data, type=ty)
assert arr.to_pylist() == []
data = np.array([(), ()], dtype=dt)
arr = pa.array(data, type=ty)
assert arr.to_pylist() == [{}, {}]
def test_from_numpy_nested(self):
# Note: an object field inside a struct
dt = np.dtype([('x', np.dtype([('xx', np.int8),
('yy', np.bool_)])),
('y', np.int16),
('z', np.object_)])
# Note: itemsize is not a multiple of sizeof(object)
assert dt.itemsize == 12
ty = pa.struct([pa.field('x', pa.struct([pa.field('xx', pa.int8()),
pa.field('yy', pa.bool_())])),
pa.field('y', pa.int16()),
pa.field('z', pa.string())])
data = np.array([], dtype=dt)
arr = pa.array(data, type=ty)
assert arr.to_pylist() == []
data = np.array([
((1, True), 2, 'foo'),
((3, False), 4, 'bar')], dtype=dt)
arr = pa.array(data, type=ty)
assert arr.to_pylist() == [
{'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'},
{'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}]
def test_from_numpy_large(self):
    """Convert a large structured array to a chunked struct array.

    The input is sized at roughly 3 GiB of payload and is expected to
    come back as two chunks. It is checked element by element, first
    with only implicit (NaN) nulls and then with an explicit mask.
    """
    # Exercise rechunking + nulls
    target_size = 3 * 1024**3  # 3 GiB
    dt = np.dtype([('x', np.float64), ('y', 'object')])
    # Size the binary payload so that each record totals 65536 bytes.
    bs = 65536 - dt.itemsize
    block = b'.' * bs
    n = target_size // (bs + dt.itemsize)
    data = np.zeros(n, dtype=dt)
    data['x'] = np.random.random_sample(n)
    data['y'] = block
    # Add implicit nulls
    data['x'][data['x'] < 0.2] = np.nan

    ty = pa.struct([pa.field('x', pa.float64()),
                    pa.field('y', pa.binary(bs))])
    arr = pa.array(data, type=ty, from_pandas=True)
    assert arr.num_chunks == 2

    def iter_chunked_array(arr):
        # Flatten the chunks into one stream of scalars.
        for chunk in arr.iterchunks():
            for item in chunk:
                yield item

    def check(arr, data, mask=None):
        assert len(arr) == len(data)
        xs = data['x']
        ys = data['y']
        for i, obj in enumerate(iter_chunked_array(arr)):
            try:
                d = obj.as_py()
                if mask is not None and mask[i]:
                    # Explicitly masked rows are null as a whole.
                    assert d is None
                else:
                    x = xs[i]
                    if np.isnan(x):
                        # NaN inputs were converted with from_pandas=True
                        # and are expected to read back as None.
                        assert d['x'] is None
                    else:
                        assert d['x'] == x
                    assert d['y'] == ys[i]
            except Exception:
                # Report the offending row, then let the failure surface
                # instead of silently continuing.
                print("Failed at index", i)
                raise

    check(arr, data)
    # Release the first result before building the second large array.
    del arr

    # Now with explicit mask
    mask = np.random.random_sample(n) < 0.2
    arr = pa.array(data, type=ty, mask=mask, from_pandas=True)
    assert arr.num_chunks == 2

    check(arr, data, mask)
    del arr
def test_from_numpy_bad_input(self):
    """Numpy input that does not fit the struct type raises TypeError."""
    struct_type = pa.struct([pa.field('x', pa.int32()),
                             pa.field('y', pa.bool_())])

    # Structured dtype whose second field is named 'z' rather than 'y'.
    wrong_field_dtype = np.dtype([('x', np.int32),
                                  ('z', np.bool_)])
    wrong_fields = np.array([], dtype=wrong_field_dtype)
    # Plain int32 array with no structured dtype at all.
    not_structured = np.int32([])

    cases = [
        (wrong_fields, "Missing field 'y'"),
        (not_structured, "Expected struct array"),
    ]
    for data, message in cases:
        with pytest.raises(TypeError, match=message):
            pa.array(data, type=struct_type)
def test_from_tuples(self):
df = pd.DataFrame({'tuples': [(1, 2), (3, 4)]})
expected_df = pd.DataFrame(
{'tuples': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})
# conversion from tuples works when specifying expected struct type
struct_type = pa.struct([('a', pa.int64()), ('b', pa.int64())])
arr = np.asarray(df['tuples'])
arr, expected=expected_df['tuples'], type=struct_type)
expected_schema = pa.schema([('tuples', struct_type)])
df, expected=expected_df, schema=expected_schema,
class TestZeroCopyConversion(object):
"""Tests that zero-copy conversion works with some types."""
def test_zero_copy_success(self):
result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
npt.assert_array_equal(result, [0, 1, 2])
def test_zero_copy_dictionaries(self):
arr = pa.DictionaryArray.from_arrays(
np.array([0, 0]),
result = arr.to_pandas(zero_copy_only=True)
values = pd.Categorical([5, 5])
tm.assert_series_equal(pd.Series(result), pd.Series(values),
def check_zero_copy_failure(self, arr):
with pytest.raises(pa.ArrowInvalid):
def test_zero_copy_failure_on_object_types(self):
self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
def test_zero_copy_failure_with_int_when_nulls(self):
self.check_zero_copy_failure(pa.array([0, 1, None]))
def test_zero_copy_failure_with_float_when_nulls(self):
self.check_zero_copy_failure(pa.array([0.0, 1.0, None]))
def test_zero_copy_failure_on_bool_types(self):
self.check_zero_copy_failure(pa.array([True, False]))
def test_zero_copy_failure_on_list_types(self):
arr = pa.array([[1, 2], [8, 9]], type=pa.list_(pa.int64()))
def test_zero_copy_failure_on_timestamp_types(self):
arr = np.array(['2007-07-13'], dtype='datetime64[ns]')
# This function must be at the top-level for Python 2.7's multiprocessing
def _non_threaded_conversion():
    """Round-trip the all-types example with ``use_threads=False``.

    Runs once with the helper's default arguments and once with
    ``as_batch=True``.
    """
    frame = _alltypes_example()
    for extra_options in ({}, {'as_batch': True}):
        _check_pandas_roundtrip(frame, use_threads=False, **extra_options)
def _threaded_conversion():
    """Round-trip the all-types example with ``use_threads=True``.

    Runs once with the helper's default arguments and once with
    ``as_batch=True``.
    """
    frame = _alltypes_example()
    for extra_options in ({}, {'as_batch': True}):
        _check_pandas_roundtrip(frame, use_threads=True, **extra_options)
class TestConvertMisc(object):
"""Miscellaneous conversion tests."""
type_pairs = [
(np.int8, pa.int8()),
(np.int16, pa.int16()),
(np.int32, pa.int32()),
(np.int64, pa.int64()),
(np.uint8, pa.uint8()),
(np.uint16, pa.uint16()),
(np.uint32, pa.uint32()),
(np.uint64, pa.uint64()),
(np.float16, pa.float16()),
(np.float32, pa.float32()),
(np.float64, pa.float64()),
# XXX unsupported
# (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
(np.object, pa.string()),
(np.object, pa.binary()),
(np.object, pa.binary(10)),
(np.object, pa.list_(pa.int64())),
def test_all_none_objects(self):
df = pd.DataFrame({'a': [None, None, None]})
def test_all_none_category(self):
df = pd.DataFrame({'a': [None, None, None]})
df['a'] = df['a'].astype('category')
def test_empty_arrays(self):
    """Empty ndarrays round-trip for every pair in ``type_pairs``."""
    for numpy_dtype, arrow_type in self.type_pairs:
        empty = np.array([], dtype=numpy_dtype)
        _check_array_roundtrip(empty, type=arrow_type)
def test_non_threaded_conversion(self):
def test_threaded_conversion_multiprocess(self):
# Parallel conversion should work from child processes too (ARROW-2963)
pool = mp.Pool(2)
def test_category(self):
repeats = 5
v1 = ['foo', None, 'bar', 'qux', np.nan]
v2 = [4, 5, 6, 7, 8]
v3 = [b'foo', None, b'bar', b'qux', np.nan]
arrays = {
'cat_strings': pd.Categorical(v1 * repeats),
'cat_strings_with_na': pd.Categorical(v1 * repeats,
categories=['foo', 'bar']),
'cat_ints': pd.Categorical(v2 * repeats),
'cat_binary': pd.Categorical(v3 * repeats),
'cat_strings_ordered': pd.Categorical(
v1 * repeats, categories=['bar', 'qux', 'foo'],
'ints': v2 * repeats,
'ints2': v2 * repeats,
'strings': v1 * repeats,
'strings2': v1 * repeats,
'strings3': v3 * repeats}
df = pd.DataFrame(arrays)
for k in arrays:
def test_category_implicit_from_pandas(self):
# ARROW-3374
def _check(v):
arr = pa.array(v)
result = arr.to_pandas()
tm.assert_series_equal(pd.Series(result), pd.Series(v))
arrays = [
pd.Categorical(['a', 'b', 'c'], categories=['a', 'b']),
pd.Categorical(['a', 'b', 'c'], categories=['a', 'b'],
for arr in arrays:
def test_empty_category(self):
# ARROW-2443
df = pd.DataFrame({'cat': pd.Categorical([])})
def test_mixed_types_fails(self):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
with pytest.raises(pa.ArrowTypeError):
data = pd.DataFrame({'a': [1, True]})
with pytest.raises(pa.ArrowTypeError):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
expected_msg = 'Conversion failed for column a'
with pytest.raises(pa.ArrowTypeError, match=expected_msg):
def test_strided_data_import(self):
cases = []
columns = ['a', 'b', 'c']
N, K = 100, 3
random_numbers = np.random.randn(N, K).copy() * 100
numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
'f4', 'f8']
for type_name in numeric_dtypes:
# strings
cases.append(np.array([tm.rands(10) for i in range(N * K)],
.reshape(N, K).copy())
# booleans
boolean_objects = (np.array([True, False, True] * N, dtype=object)
.reshape(N, K).copy())
# add some nulls, so dtype comes back as objects
boolean_objects[5] = None
cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
.reshape(N, K).copy())
strided_mask = (random_numbers > 0).astype(bool)[:, 0]
for case in cases:
df = pd.DataFrame(case, columns=columns)
col = df['a']
_check_array_roundtrip(col, mask=strided_mask)
def test_all_nones(self):
    """Object Series holding only None/NaN convert to a NullArray."""
    def _check_series(series):
        null_array = pa.array(series)
        assert isinstance(null_array, pa.NullArray)
        assert len(null_array) == 3
        assert null_array.null_count == 3
        # Every element must be the NA singleton.
        assert all(item is pa.NA for item in null_array)

    all_null_inputs = [
        [None] * 3,
        [np.nan] * 3,
        [None, np.nan, None],
    ]
    for values in all_null_inputs:
        _check_series(pd.Series(values, dtype=object))
def test_partial_schema(self):
data = OrderedDict([
('a', [0, 1, 2, 3, 4]),
('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)),
('c', [-10, -5, 0, 5, 10])
df = pd.DataFrame(data)
partial_schema = pa.schema([
pa.field('c', pa.int64()),
pa.field('a', pa.int64())
_check_pandas_roundtrip(df, schema=partial_schema,
def test_table_batch_empty_dataframe(self):
df = pd.DataFrame({})
_check_pandas_roundtrip(df, as_batch=True)
df2 = pd.DataFrame({}, index=[0, 1, 2])
_check_pandas_roundtrip(df2, preserve_index=True)
_check_pandas_roundtrip(df2, as_batch=True, preserve_index=True)
def test_convert_empty_table(self):
    """Empty Arrow arrays convert to empty ndarrays of the right dtype."""
    # (Arrow type, dtype of the expected empty ndarray)
    cases = [
        (pa.int64(), np.int64),
        (pa.string(), object),
        (pa.list_(pa.int64()), object),
        (pa.struct([pa.field('a', pa.int64())]), object),
    ]
    for arrow_type, numpy_dtype in cases:
        empty = pa.array([], type=arrow_type)
        tm.assert_almost_equal(empty.to_pandas(),
                               np.array([], dtype=numpy_dtype))
def test_non_natural_stride(self):
    """
    ARROW-2172: converting from a Numpy array with a stride that's
    not a multiple of itemsize.
    """
    # int32 + int16 packs each record into 6 bytes, so the 'x' view
    # (itemsize 4) is stepped through with a 6-byte stride.
    dtype = np.dtype([('x', np.int32), ('y', np.int16)])
    data = np.array([(42, -1), (-43, 2)], dtype=dtype)
    assert data.strides == (6,)
    arr = pa.array(data['x'], type=pa.int32())
    assert arr.to_pylist() == [42, -43]
    arr = pa.array(data['y'], type=pa.int16())
    assert arr.to_pylist() == [-1, 2]
def test_safe_unsafe_casts(self):
# ARROW-2799
df = pd.DataFrame({
'A': list('abc'),
'B': np.linspace(0, 1, 3)
schema = pa.schema([
pa.field('A', pa.string()),
pa.field('B', pa.int32())
with pytest.raises(ValueError):
pa.Table.from_pandas(df, schema=schema)
table = pa.Table.from_pandas(df, schema=schema, safe=False)
assert table.column('B').type == pa.int32()
def test_error_sparse(self):
# ARROW-2818
df = pd.DataFrame({'a': pd.SparseArray([1, np.nan, 3])})
with pytest.raises(TypeError, match="Sparse pandas data"):
def test_safe_cast_from_float_with_nans_to_int():
    """A Series with a missing value converts to int32 with safe=True."""
    # TODO(kszucs): write tests for creating Date32 and Date64 arrays, see
    #               ARROW-4258
    series_with_null = pd.Series([1, 2, None, 4])
    converted = pa.Array.from_pandas(series_with_null, type=pa.int32(),
                                     safe=True)
    # The missing entry is expected to become an Arrow null.
    assert converted.equals(pa.array([1, 2, None, 4], type=pa.int32()))
def _fully_loaded_dataframe_example():
index = pd.MultiIndex.from_arrays([
pd.date_range('2000-01-01', periods=5).repeat(2),
np.tile(np.array(['foo', 'bar'], dtype=object), 5)
c1 = pd.date_range('2000-01-01', periods=10)
data = {
0: c1,
1: c1.tz_localize('utc'),
2: c1.tz_localize('US/Eastern'),
3: c1[::2].tz_localize('utc').repeat(2).astype('category'),
4: ['foo', 'bar'] * 5,
5: pd.Series(['foo', 'bar'] * 5).astype('category').values,
6: [True, False] * 5,
7: np.random.randn(10),
8: np.random.randint(0, 100, size=10),
9: pd.period_range('2013', periods=10, freq='M')
if LooseVersion(pd.__version__) >= '0.21':
# There is an issue with pickling IntervalIndex in pandas 0.20.x
data[10] = pd.interval_range(start=1, freq=1, periods=10)
return pd.DataFrame(data, index=index)
@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
    """A table survives Arrow -> pandas -> Arrow for either name type."""
    original = pa.Table.from_pandas(pd.DataFrame(columns=columns))
    roundtripped = pa.Table.from_pandas(original.to_pandas())

    # Data, schema and schema metadata must all be preserved.
    assert original.equals(roundtripped)
    original_schema, roundtripped_schema = (original.schema,
                                            roundtripped.schema)
    assert original_schema.equals(roundtripped_schema)
    assert original_schema.metadata == roundtripped_schema.metadata
def _check_serialize_components_roundtrip(df):
    """Serialize ``df`` to components, rebuild it, and compare frames."""
    context = pa.default_serialization_context()
    serialized = context.serialize(df)
    restored = context.deserialize_components(serialized.to_components())
    tm.assert_frame_equal(df, restored)
# NOTE(review): every numpy 1.x version compares >= '0.16', so this
# condition is always true there and the test never runs. Was '1.16'
# intended? Confirm before changing the bound.
@pytest.mark.skipif(LooseVersion(np.__version__) >= '0.16',
                    reason='Until numpy/numpy#12745 is resolved')
def test_serialize_deserialize_pandas():
    """Round-trip the fully loaded example frame through serialization."""
    # ARROW-1784, serialize and deserialize DataFrame by decomposing
    # BlockManager
    df = _fully_loaded_dataframe_example()
    # Without this call the frame was built but never checked.
    _check_serialize_components_roundtrip(df)
def _pytime_from_micros(val):
    """Build a ``datetime.time`` from a count of microseconds.

    ``val`` is split into hours, minutes, seconds and microseconds by
    successive division by 1000000, 60 and 60.
    """
    total_seconds, microseconds = divmod(val, 1000000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return time(hours, minutes, seconds, microseconds)
def _pytime_to_micros(pytime):
    """Return ``pytime`` as a count of microseconds since midnight.

    Parameters
    ----------
    pytime : datetime.time
        Any object exposing ``hour``, ``minute``, ``second`` and
        ``microsecond`` attributes.

    Returns
    -------
    int
        hours, minutes and seconds scaled to microseconds, plus the
        sub-second ``microsecond`` component.
    """
    return (pytime.hour * 3600000000 +
            pytime.minute * 60000000 +
            pytime.second * 1000000 +
            pytime.microsecond)
def test_convert_unsupported_type_error_message():
# ARROW-1454
df = pd.DataFrame({
't1': pd.date_range('2000-01-01', periods=20),
't2': pd.date_range('2000-05-01', periods=20)
# timedelta64 as yet unsupported
df['diff'] = df.t2 - df.t1
expected_msg = 'Conversion failed for column diff with type timedelta64'
with pytest.raises(pa.ArrowNotImplementedError, match=expected_msg):
# ----------------------------------------------------------------------
# Test object deduplication in to_pandas
def _generate_dedup_example(nunique, repeats):
    """Return ``nunique`` random 10-character strings repeated ``repeats``
    times, as one flat list (the same string objects are reused)."""
    distinct = []
    for _ in range(nunique):
        distinct.append(tm.rands(10))
    return distinct * repeats
def _assert_nunique(obj, expected):
assert len({id(x) for x in obj}) == expected
def test_to_pandas_deduplicate_strings_array_types():
    """Check object deduplication in ``to_pandas`` for array-like types.

    By default only ``nunique`` distinct Python objects may be produced;
    with ``deduplicate_objects=False`` one object per element is expected.
    """
    nunique, repeats = 100, 10
    values = _generate_dedup_example(nunique, repeats)

    binary_arr = pa.array(values, type=pa.binary())
    utf8_arr = pa.array(values, type=pa.utf8())
    chunked = pa.chunked_array([values, values])
    column = pa.column('foo', [values, values])

    for arr in (binary_arr, utf8_arr, chunked, column):
        deduplicated = arr.to_pandas()
        _assert_nunique(deduplicated, nunique)
        not_deduplicated = arr.to_pandas(deduplicate_objects=False)
        _assert_nunique(not_deduplicated, len(arr))
def test_to_pandas_deduplicate_strings_table_types():
    """Check object deduplication in ``to_pandas`` for a RecordBatch and a
    Table that hold the same repeated string column ``'foo'``."""
    nunique, repeats = 100, 10
    values = _generate_dedup_example(nunique, repeats)

    batch = pa.RecordBatch.from_arrays([pa.array(values)], ['foo'])
    table = pa.Table.from_batches([batch])

    for container in (batch, table):
        foo_column = container.to_pandas()['foo']
        _assert_nunique(foo_column, nunique)
def test_to_pandas_deduplicate_integers_as_objects():
    """Check deduplication when integers with nulls become Python objects."""
    nunique = 100
    repeats = 10

    # Python automatically interns smaller integers
    unique_values = list(np.random.randint(10000000, 1000000000, size=nunique))
    unique_values[nunique // 2] = None

    arr = pa.array(unique_values * repeats)

    _assert_nunique(arr.to_pandas(integer_object_nulls=True), nunique)
    # The expected count below was an orphaned continuation line; it belongs
    # to the non-deduplicated conversion, mirroring the string array test.
    _assert_nunique(arr.to_pandas(integer_object_nulls=True,
                                  deduplicate_objects=False),
                    # Account for None
                    (nunique - 1) * repeats + 1)
def test_to_pandas_deduplicate_date_time():
    """Check object deduplication for date and time arrays in to_pandas."""
    nunique = 100
    repeats = 10
    unique_values = list(range(nunique))

    cases = [
        # raw type, array type, to_pandas options
        ('int32', 'date32', {'date_as_object': True}),
        ('int64', 'date64', {'date_as_object': True}),
        ('int32', 'time32[ms]', {}),
        ('int64', 'time64[us]', {})
    ]

    for raw_type, array_type, pandas_options in cases:
        raw_arr = pa.array(unique_values * repeats, type=raw_type)
        casted_arr = raw_arr.cast(array_type)

        # casted_arr and pandas_options were computed but never checked;
        # assert on the converted result so each case verifies something.
        _assert_nunique(casted_arr.to_pandas(**pandas_options),
                        nunique)
# ---------------------------------------------------------------------
def test_table_from_pandas_checks_field_nullability():
    """A schema declaring non-nullable fields must be rejected with
    ``ValueError`` when the frame contains a missing value."""
    # ARROW-2136
    frame = pd.DataFrame({'a': [1.2, 2.1, 3.1],
                          'b': [np.nan, 'string', 'foo']})

    non_nullable_fields = [
        pa.field('a', pa.float64(), nullable=False),
        pa.field('b', pa.utf8(), nullable=False),
    ]
    schema = pa.schema(non_nullable_fields)

    with pytest.raises(ValueError):
        pa.Table.from_pandas(frame, schema=schema)
def test_table_from_pandas_keeps_column_order_of_dataframe():
    """Without an explicit schema the table follows the frame's column
    order, both for the original frame and for a reordered selection."""
    # The OrderedDict/DataFrame and both pa.schema([...]) literals were left
    # unclosed; the brackets are balanced here.
    df1 = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3])
    ]))
    df2 = df1[['floats', 'partition', 'arrays']]

    schema1 = pa.schema([
        ('partition', pa.int64()),
        ('arrays', pa.list_(pa.int64())),
        ('floats', pa.float64()),
    ])
    schema2 = pa.schema([
        ('floats', pa.float64()),
        ('partition', pa.int64()),
        ('arrays', pa.list_(pa.int64()))
    ])

    table1 = pa.Table.from_pandas(df1, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, preserve_index=False)

    assert table1.schema.equals(schema1, check_metadata=False)
    assert table2.schema.equals(schema2, check_metadata=False)
def test_table_from_pandas_keeps_column_order_of_schema():
    """With an explicit schema the table follows the schema's field order,
    regardless of the column order of the frame being converted."""
    # ARROW-3766
    # The OrderedDict/DataFrame and pa.schema([...]) literals were left
    # unclosed; the brackets are balanced here.
    df = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3])
    ]))

    schema = pa.schema([
        ('floats', pa.float64()),
        ('arrays', pa.list_(pa.int32())),
        ('partition', pa.int32())
    ])

    df1 = df[df.partition == 0]
    df2 = df[df.partition == 1][['floats', 'partition', 'arrays']]

    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

    assert table1.schema.equals(schema, check_metadata=False)
    assert table1.schema.equals(table2.schema, check_metadata=False)
def test_table_from_pandas_columns_argument_only_does_filtering():
    """Check the resulting schemas when ``columns=`` is passed to
    ``Table.from_pandas``, once with all columns and once with a subset."""
    # The OrderedDict/DataFrame and both pa.schema([...]) literals were left
    # unclosed; the brackets are balanced here.
    df = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3])
    ]))

    columns1 = ['arrays', 'floats', 'partition']
    schema1 = pa.schema([
        ('arrays', pa.list_(pa.int64())),
        ('floats', pa.float64()),
        ('partition', pa.int64())
    ])

    columns2 = ['floats', 'partition']
    schema2 = pa.schema([
        ('floats', pa.float64()),
        ('partition', pa.int64())
    ])

    table1 = pa.Table.from_pandas(df, columns=columns1, preserve_index=False)
    table2 = pa.Table.from_pandas(df, columns=columns2, preserve_index=False)

    assert table1.schema.equals(schema1, check_metadata=False)
    assert table2.schema.equals(schema2, check_metadata=False)
def test_table_from_pandas_columns_and_schema_are_mutually_exclusive():
    """Passing both ``schema=`` and ``columns=`` must raise ValueError."""
    # The OrderedDict/DataFrame and pa.schema([...]) literals were left
    # unclosed; the brackets are balanced here.
    df = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3])
    ]))
    schema = pa.schema([
        ('partition', pa.int32()),
        ('arrays', pa.list_(pa.int32())),
        ('floats', pa.float64()),
    ])
    columns = ['arrays', 'floats']

    with pytest.raises(ValueError):
        pa.Table.from_pandas(df, schema=schema, columns=columns)
def test_table_from_pandas_keeps_schema_nullability():
    """A field is nullable by default and non-nullable when the supplied
    schema says so."""
    # ARROW-5169
    df = pd.DataFrame({'a': [1, 2, 3, 4]})

    # The schema literal was left unclosed; the list and call are closed here
    schema = pa.schema([
        pa.field('a', pa.int64(), nullable=False),
    ])

    table = pa.Table.from_pandas(df)
    assert table.schema.field_by_name('a').nullable is True
    table = pa.Table.from_pandas(df, schema=schema)
    assert table.schema.field_by_name('a').nullable is False
# ----------------------------------------------------------------------
# RecordBatch, Table
def test_recordbatch_from_to_pandas():
    """Round-trip a mixed-dtype frame through ``pa.RecordBatch``."""
    # The DataFrame literal was left unclosed; it is closed here
    data = pd.DataFrame({
        'c1': np.array([1, 2, 3, 4, 5], dtype='int64'),
        'c2': np.array([1, 2, 3, 4, 5], dtype='uint32'),
        'c3': np.random.randn(5),
        'c4': ['foo', 'bar', None, 'baz', 'qux'],
        'c5': [False, True, False, True, False]
    })

    batch = pa.RecordBatch.from_pandas(data)
    result = batch.to_pandas()
    tm.assert_frame_equal(data, result)
def test_recordbatchlist_to_pandas():
    """Build a table from two record batches and compare its pandas form
    with the concatenation of the two source frames."""
    # Both DataFrame literals were left unclosed; they are closed here
    data1 = pd.DataFrame({
        'c1': np.array([1, 1, 2], dtype='uint32'),
        'c2': np.array([1.0, 2.0, 3.0], dtype='float64'),
        'c3': [True, None, False],
        'c4': ['foo', 'bar', None]
    })

    data2 = pd.DataFrame({
        'c1': np.array([3, 5], dtype='uint32'),
        'c2': np.array([4.0, 5.0], dtype='float64'),
        'c3': [True, True],
        'c4': ['baz', 'qux']
    })

    batch1 = pa.RecordBatch.from_pandas(data1)
    batch2 = pa.RecordBatch.from_pandas(data2)

    table = pa.Table.from_batches([batch1, batch2])
    result = table.to_pandas()
    # The concatenated frame gets a fresh 0..n-1 index before comparison
    data = pd.concat([data1, data2]).reset_index(drop=True)
    tm.assert_frame_equal(data, result)
# ----------------------------------------------------------------------
# Metadata serialization
# The decorator header and the brackets around the case list were missing,
# leaving a bare tuple sequence in front of the test function.
@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    """``get_logical_type`` must map each Arrow type to the given name."""
    assert get_logical_type(type) == expected
# ----------------------------------------------------------------------
# Some nested array tests
def test_array_from_py_float32():
    """Arrays built from Python floats with an explicit float32 type (flat
    and list-of-float32) must equal arrays built from float32 ndarrays."""
    rows = [[1.2, 3.4], [9.0, 42.0]]
    float_type = pa.float32()

    flat = pa.array(rows[0], type=float_type)
    nested = pa.array(rows, type=pa.list_(float_type))

    flat_expected = np.array(rows[0], dtype=np.float32)
    nested_expected = pd.Series(
        [np.array(row, dtype=np.float32) for row in rows])

    assert flat.type == float_type
    assert flat.equals(pa.array(flat_expected))
    assert nested.equals(pa.array(nested_expected))
# ----------------------------------------------------------------------
# Timestamp tests
def test_cast_timestamp_unit():
    """Check timestamp unit handling in from_pandas, array() and cast().

    Covers conversion to microsecond timestamps with and without a time
    zone, and that a lossy unit change raises ``ValueError`` unless
    ``safe=False`` is passed.
    """
    # ARROW-1680
    # The assignment had no right-hand side; a current local timestamp
    # matches the 'tzlocal()' localization applied below.
    val = datetime.now()
    s = pd.Series([val])
    s_nyc = s.dt.tz_localize('tzlocal()').dt.tz_convert('America/New_York')

    us_with_tz = pa.timestamp('us', tz='America/New_York')

    arr = pa.Array.from_pandas(s_nyc, type=us_with_tz)

    # ARROW-1906
    assert arr.type == us_with_tz

    arr2 = pa.Array.from_pandas(s, type=pa.timestamp('us'))

    assert arr[0].as_py() == s_nyc[0]
    assert arr2[0].as_py() == s[0]

    # Disallow truncation
    arr = pa.array([123123], type='int64').cast(pa.timestamp('ms'))
    expected = pa.array([123], type='int64').cast(pa.timestamp('s'))

    target = pa.timestamp('s')
    with pytest.raises(ValueError):
        # The safe (default) cast is the one expected to raise; previously
        # the raises-block wrapped the safe=False cast instead.
        arr.cast(target)

    result = arr.cast(target, safe=False)
    assert result.equals(expected)

    # ARROW-1949
    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])
    expected = pa.array([0, 0, 1], type=pa.timestamp('us'))

    with pytest.raises(ValueError):
        pa.array(series, type=pa.timestamp('us'))

    with pytest.raises(ValueError):
        pa.Array.from_pandas(series, type=pa.timestamp('us'))

    result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
    assert result.equals(expected)

    result = pa.array(series, type=pa.timestamp('us'), safe=False)
    assert result.equals(expected)
# ----------------------------------------------------------------------
# DictionaryArray tests
def test_dictionary_with_pandas():
    """DictionaryArray.to_pandas must match the equivalent Categorical,
    with and without a null mask."""
    indices = np.repeat([0, 1, 2], 2)
    dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
    mask = np.array([False, False, True, False, False, False])

    d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
    d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)

    pandas1 = d1.to_pandas()
    ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)

    tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))

    pandas2 = d2.to_pandas()
    # Masked positions become code -1; the call was left unclosed and lacked
    # the categories argument used for the unmasked case above.
    ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1, indices),
                                           categories=dictionary)

    tm.assert_series_equal(pd.Series(pandas2), pd.Series(ex_pandas2))
def test_variable_dictionary_with_pandas():
    """Chunks with differing dictionaries convert to a Python list but are
    expected to raise NotImplementedError on conversion to pandas."""
    a1 = pa.DictionaryArray.from_arrays([0, 1, 2], ['a', 'b', 'c'])
    a2 = pa.DictionaryArray.from_arrays([0, 1], ['a', 'c'])

    # Both raises-blocks had lost their body; the pandas conversion is the
    # operation under test here.
    a = pa.chunked_array([a1, a2])
    assert a.to_pylist() == ['a', 'b', 'c', 'a', 'c']
    with pytest.raises(NotImplementedError):
        a.to_pandas()

    a = pa.chunked_array([a2, a1])
    assert a.to_pylist() == ['a', 'c', 'a', 'b', 'c']
    with pytest.raises(NotImplementedError):
        a.to_pandas()
# ----------------------------------------------------------------------
# Legacy metadata compatibility tests
def _legacy_column_entry(name, field_name, pandas_type, numpy_type):
    # Build one entry of the 'columns' list of the pandas schema metadata
    return {'name': name,
            'field_name': field_name,
            'pandas_type': pandas_type,
            'numpy_type': numpy_type,
            'metadata': None}


def _legacy_pandas_metadata(index_columns, columns):
    # Build the {b'pandas': <json>} mapping passed to replace_schema_metadata
    return {
        b'pandas': json.dumps(
            {'index_columns': index_columns,
             'column_indexes': [{'name': None,
                                 'field_name': None,
                                 'pandas_type': 'unicode',
                                 'numpy_type': 'object',
                                 'metadata': {'encoding': 'UTF-8'}}],
             'columns': columns,
             'pandas_version': '0.23.4'}
        )}


def test_range_index_pre_0_12():
    """Tables carrying hand-written legacy pandas metadata, in which a
    RangeIndex is stored as a regular int64 column, must convert back to
    frames with the expected RangeIndex / MultiIndex."""
    # Forward compatibility for metadata created from pandas.RangeIndex
    # prior to pyarrow 0.13.0
    a_values = [u'foo', u'bar', None, u'baz']
    b_values = [u'a', u'a', u'b', u'b']
    a_arrow = pa.array(a_values, type='utf8')
    b_arrow = pa.array(b_values, type='utf8')

    rng_index_arrow = pa.array([0, 2, 4, 6], type='int64')

    gen_name_0 = '__index_level_0__'
    gen_name_1 = '__index_level_1__'

    # Metadata entry for the data column 'a', shared by all cases
    a_column = _legacy_column_entry('a', 'a', 'unicode', 'object')

    # Case 1: named RangeIndex
    e1 = pd.DataFrame({
        'a': a_values
    }, index=pd.RangeIndex(0, 8, step=2, name='qux'))
    t1 = pa.Table.from_arrays([a_arrow, rng_index_arrow],
                              names=['a', 'qux'])
    t1 = t1.replace_schema_metadata(_legacy_pandas_metadata(
        ['qux'],
        [a_column,
         _legacy_column_entry('qux', 'qux', 'int64', 'int64')]))
    r1 = t1.to_pandas()
    tm.assert_frame_equal(r1, e1)

    # Case 2: named RangeIndex, but conflicts with an actual column
    e2 = pd.DataFrame({
        'qux': a_values
    }, index=pd.RangeIndex(0, 8, step=2, name='qux'))
    t2 = pa.Table.from_arrays([a_arrow, rng_index_arrow],
                              names=['qux', gen_name_0])
    t2 = t2.replace_schema_metadata(_legacy_pandas_metadata(
        [gen_name_0],
        [a_column,
         _legacy_column_entry('qux', gen_name_0, 'int64', 'int64')]))
    r2 = t2.to_pandas()
    tm.assert_frame_equal(r2, e2)

    # Case 3: unnamed RangeIndex
    e3 = pd.DataFrame({
        'a': a_values
    }, index=pd.RangeIndex(0, 8, step=2, name=None))
    t3 = pa.Table.from_arrays([a_arrow, rng_index_arrow],
                              names=['a', gen_name_0])
    t3 = t3.replace_schema_metadata(_legacy_pandas_metadata(
        [gen_name_0],
        [a_column,
         _legacy_column_entry(None, gen_name_0, 'int64', 'int64')]))
    r3 = t3.to_pandas()
    tm.assert_frame_equal(r3, e3)

    # Case 4: MultiIndex with named RangeIndex
    e4 = pd.DataFrame({
        'a': a_values
    }, index=[pd.RangeIndex(0, 8, step=2, name='qux'), b_values])
    t4 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow],
                              names=['a', 'qux', gen_name_1])
    t4 = t4.replace_schema_metadata(_legacy_pandas_metadata(
        ['qux', gen_name_1],
        [a_column,
         _legacy_column_entry('qux', 'qux', 'int64', 'int64'),
         _legacy_column_entry(None, gen_name_1, 'unicode', 'object')]))
    r4 = t4.to_pandas()
    tm.assert_frame_equal(r4, e4)

    # Case 5: MultiIndex with unnamed RangeIndex
    e5 = pd.DataFrame({
        'a': a_values
    }, index=[pd.RangeIndex(0, 8, step=2, name=None), b_values])
    t5 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow],
                              names=['a', gen_name_0, gen_name_1])
    t5 = t5.replace_schema_metadata(_legacy_pandas_metadata(
        [gen_name_0, gen_name_1],
        [a_column,
         _legacy_column_entry(None, gen_name_0, 'int64', 'int64'),
         _legacy_column_entry(None, gen_name_1, 'unicode', 'object')]))
    r5 = t5.to_pandas()
    tm.assert_frame_equal(r5, e5)