| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
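
"""Tests for converting between pandas data structures (DataFrame, Series,
Index) and Arrow arrays, record batches and tables.
"""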
| |
| import six |
| import decimal |
| import json |
| import multiprocessing as mp |
| |
| from collections import OrderedDict |
| from datetime import date, datetime, time, timedelta |
| from distutils.version import LooseVersion |
| |
| import hypothesis as h |
| import hypothesis.extra.pytz as tzst |
| import hypothesis.strategies as st |
| import numpy as np |
| import numpy.testing as npt |
| import pytest |
| import pytz |
| |
| from pyarrow.pandas_compat import get_logical_type, _pandas_api |
| |
| import pyarrow as pa |
| |
| try: |
| import pandas as pd |
| import pandas.util.testing as tm |
| from .pandas_examples import dataframe_with_arrays, dataframe_with_lists |
| except ImportError: |
| pass |
| |
| |
| # Marks all of the tests in this module |
| pytestmark = pytest.mark.pandas |
| |
| |
| def _alltypes_example(size=100): |
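    """Return a DataFrame exercising most primitive dtypes used in the
    roundtrip tests (ints, floats, bool, datetime and string columns)."""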
| return pd.DataFrame({ |
| 'uint8': np.arange(size, dtype=np.uint8), |
| 'uint16': np.arange(size, dtype=np.uint16), |
| 'uint32': np.arange(size, dtype=np.uint32), |
| 'uint64': np.arange(size, dtype=np.uint64), |
        'int8': np.arange(size, dtype=np.int8),
| 'int16': np.arange(size, dtype=np.int16), |
| 'int32': np.arange(size, dtype=np.int32), |
| 'int64': np.arange(size, dtype=np.int64), |
| 'float32': np.arange(size, dtype=np.float32), |
| 'float64': np.arange(size, dtype=np.float64), |
| 'bool': np.random.randn(size) > 0, |
        # TODO(wesm): Pandas only supports ns resolution; Arrow supports s,
        # ms, us and ns
| 'datetime': np.arange("2016-01-01T00:00:00.001", size, |
| dtype='datetime64[ms]'), |
| 'str': [str(x) for x in range(size)], |
| 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], |
| 'empty_str': [''] * size |
| }) |
| |
| |
| def _check_pandas_roundtrip(df, expected=None, use_threads=True, |
| expected_schema=None, |
| check_dtype=True, schema=None, |
| preserve_index=False, |
| as_batch=False): |
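    """Roundtrip ``df`` through a pa.Table (or pa.RecordBatch when
    ``as_batch`` is True) and compare the result against ``expected``
    (or ``df`` itself). When ``expected_schema`` is given, the generated
    schema is compared to it ignoring metadata.

    Typical usage, mirroring the tests below::

        df = pd.DataFrame({'a': [1, 2, 3]})
        schema = pa.schema([pa.field('a', pa.int64())])
        _check_pandas_roundtrip(df, expected_schema=schema)
    """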
| klass = pa.RecordBatch if as_batch else pa.Table |
| table = klass.from_pandas(df, schema=schema, |
| preserve_index=preserve_index, |
| nthreads=2 if use_threads else 1) |
| result = table.to_pandas(use_threads=use_threads) |
| |
| if expected_schema: |
        # all occurrences of _check_pandas_roundtrip pass expected_schema
        # without the pandas-generated key-value metadata
| assert table.schema.equals(expected_schema, check_metadata=False) |
| |
| if expected is None: |
| expected = df if schema is None else df[schema.names] |
| |
| tm.assert_frame_equal(result, expected, check_dtype=check_dtype, |
| check_index_type=('equiv' if preserve_index |
| else False)) |
| |
| |
| def _check_series_roundtrip(s, type_=None, expected_pa_type=None): |
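    """Roundtrip a pandas Series through pa.array and back, comparing the
    values and (optionally) the inferred Arrow type."""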
| arr = pa.array(s, from_pandas=True, type=type_) |
| |
| if type_ is not None and expected_pa_type is None: |
| expected_pa_type = type_ |
| |
| if expected_pa_type is not None: |
| assert arr.type == expected_pa_type |
| |
| result = pd.Series(arr.to_pandas(), name=s.name) |
| if pa.types.is_timestamp(arr.type) and arr.type.tz is not None: |
| result = (result.dt.tz_localize('utc') |
| .dt.tz_convert(arr.type.tz)) |
| |
| tm.assert_series_equal(s, result) |
| |
| |
| def _check_array_roundtrip(values, expected=None, mask=None, |
| type=None): |
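    """Roundtrip ``values`` through pa.array (honoring ``mask`` and ``type``)
    and verify the null count and the reconstructed values."""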
| arr = pa.array(values, from_pandas=True, mask=mask, type=type) |
| result = arr.to_pandas() |
| |
| values_nulls = pd.isnull(values) |
| if mask is None: |
| assert arr.null_count == values_nulls.sum() |
| else: |
| assert arr.null_count == (mask | values_nulls).sum() |
| |
| if expected is None: |
| if mask is None: |
| expected = pd.Series(values) |
| else: |
| expected = pd.Series(np.ma.masked_array(values, mask=mask)) |
| |
| tm.assert_series_equal(pd.Series(result), expected, check_names=False) |
| |
| |
| def _check_array_from_pandas_roundtrip(np_array, type=None): |
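    """Roundtrip a NumPy array through pa.array and compare element-wise."""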
| arr = pa.array(np_array, from_pandas=True, type=type) |
| result = arr.to_pandas() |
| npt.assert_array_equal(result, np_array) |
| |
| |
| class TestConvertMetadata(object): |
| """ |
| Conversion tests for Pandas metadata & indices. |
| """ |
| |
| def test_non_string_columns(self): |
| df = pd.DataFrame({0: [1, 2, 3]}) |
| table = pa.Table.from_pandas(df) |
| assert table.column(0).name == '0' |
| |
| def test_from_pandas_with_columns(self): |
| df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]}, |
| columns=[1, 0]) |
| |
| table = pa.Table.from_pandas(df, columns=[0, 1]) |
| expected = pa.Table.from_pandas(df[[0, 1]]) |
| assert expected.equals(table) |
| |
| record_batch_table = pa.RecordBatch.from_pandas(df, columns=[0, 1]) |
| record_batch_expected = pa.RecordBatch.from_pandas(df[[0, 1]]) |
| assert record_batch_expected.equals(record_batch_table) |
| |
| def test_column_index_names_are_preserved(self): |
| df = pd.DataFrame({'data': [1, 2, 3]}) |
| df.columns.names = ['a'] |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_range_index_shortcut(self): |
| # ARROW-1639 |
| index_name = 'foo' |
| df = pd.DataFrame({'a': [1, 2, 3, 4]}, |
| index=pd.RangeIndex(0, 8, step=2, name=index_name)) |
| |
| df2 = pd.DataFrame({'a': [4, 5, 6, 7]}, |
| index=pd.RangeIndex(0, 4)) |
| |
| table = pa.Table.from_pandas(df) |
| table_no_index_name = pa.Table.from_pandas(df2) |
| |
| # The RangeIndex is tracked in the metadata only |
| assert len(table.schema) == 1 |
| |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, df) |
| assert isinstance(result.index, pd.RangeIndex) |
| assert _pandas_api.get_rangeindex_attribute(result.index, 'step') == 2 |
| assert result.index.name == index_name |
| |
| result2 = table_no_index_name.to_pandas() |
| tm.assert_frame_equal(result2, df2) |
| assert isinstance(result2.index, pd.RangeIndex) |
| assert _pandas_api.get_rangeindex_attribute(result2.index, 'step') == 1 |
| assert result2.index.name is None |
| |
| def test_range_index_force_serialization(self): |
| # ARROW-5427: preserve_index=True will force the RangeIndex to |
| # be serialized as a column rather than tracked more |
| # efficiently as metadata |
| df = pd.DataFrame({'a': [1, 2, 3, 4]}, |
| index=pd.RangeIndex(0, 8, step=2, name='foo')) |
| |
| table = pa.Table.from_pandas(df, preserve_index=True) |
| assert table.num_columns == 2 |
| assert 'foo' in table.column_names |
| |
| restored = table.to_pandas() |
| tm.assert_frame_equal(restored, df) |
| |
| def test_rangeindex_doesnt_warn(self): |
        # ARROW-5606: pandas 0.25 deprecated the private _start/_stop/_step
        # attributes -> can be removed once support for pandas < 0.25 is
        # dropped
| df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) |
| |
| with pytest.warns(None) as record: |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| assert len(record) == 0 |
| |
| def test_multiindex_columns(self): |
| columns = pd.MultiIndex.from_arrays([ |
| ['one', 'two'], ['X', 'Y'] |
| ]) |
| df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_multiindex_columns_with_dtypes(self): |
| columns = pd.MultiIndex.from_arrays( |
| [ |
| ['one', 'two'], |
| pd.DatetimeIndex(['2017-08-01', '2017-08-02']), |
| ], |
| names=['level_1', 'level_2'], |
| ) |
| df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_multiindex_columns_unicode(self): |
| columns = pd.MultiIndex.from_arrays([[u'あ', u'い'], ['X', 'Y']]) |
| df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_multiindex_doesnt_warn(self): |
| # ARROW-3953: pandas 0.24 rename of MultiIndex labels to codes |
| columns = pd.MultiIndex.from_arrays([['one', 'two'], ['X', 'Y']]) |
| df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) |
| |
| with pytest.warns(None) as record: |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| assert len(record) == 0 |
| |
| def test_integer_index_column(self): |
| df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_index_metadata_field_name(self): |
| # test None case, and strangely named non-index columns |
| df = pd.DataFrame( |
| [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)], |
| index=pd.MultiIndex.from_arrays( |
| [['c', 'b', 'a'], [3, 2, 1]], |
| names=[None, 'foo'] |
| ), |
| columns=['a', None, '__index_level_0__'], |
| ) |
| with pytest.warns(UserWarning): |
| t = pa.Table.from_pandas(df, preserve_index=True) |
| js = t.schema.pandas_metadata |
| |
| col1, col2, col3, idx0, foo = js['columns'] |
| |
| assert col1['name'] == 'a' |
| assert col1['name'] == col1['field_name'] |
| |
| assert col2['name'] is None |
| assert col2['field_name'] == 'None' |
| |
| assert col3['name'] == '__index_level_0__' |
| assert col3['name'] == col3['field_name'] |
| |
| idx0_descr, foo_descr = js['index_columns'] |
| assert idx0_descr == '__index_level_0__' |
| assert idx0['field_name'] == idx0_descr |
| assert idx0['name'] is None |
| |
| assert foo_descr == 'foo' |
| assert foo['field_name'] == foo_descr |
| assert foo['name'] == foo_descr |
| |
| def test_categorical_column_index(self): |
| df = pd.DataFrame( |
| [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)], |
| columns=pd.Index(list('def'), dtype='category') |
| ) |
| t = pa.Table.from_pandas(df, preserve_index=True) |
| js = t.schema.pandas_metadata |
| |
| column_indexes, = js['column_indexes'] |
| assert column_indexes['name'] is None |
| assert column_indexes['pandas_type'] == 'categorical' |
| assert column_indexes['numpy_type'] == 'int8' |
| |
| md = column_indexes['metadata'] |
| assert md['num_categories'] == 3 |
| assert md['ordered'] is False |
| |
| def test_string_column_index(self): |
| df = pd.DataFrame( |
| [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)], |
| columns=pd.Index(list('def'), name='stringz') |
| ) |
| t = pa.Table.from_pandas(df, preserve_index=True) |
| js = t.schema.pandas_metadata |
| |
| column_indexes, = js['column_indexes'] |
| assert column_indexes['name'] == 'stringz' |
| assert column_indexes['name'] == column_indexes['field_name'] |
| assert column_indexes['numpy_type'] == 'object' |
| assert column_indexes['pandas_type'] == ( |
| 'bytes' if six.PY2 else 'unicode' |
| ) |
| |
| md = column_indexes['metadata'] |
| |
| if not six.PY2: |
| assert len(md) == 1 |
| assert md['encoding'] == 'UTF-8' |
| else: |
| assert md is None or 'encoding' not in md |
| |
| def test_datetimetz_column_index(self): |
| df = pd.DataFrame( |
| [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)], |
| columns=pd.date_range( |
| start='2017-01-01', periods=3, tz='America/New_York' |
| ) |
| ) |
| t = pa.Table.from_pandas(df, preserve_index=True) |
| js = t.schema.pandas_metadata |
| |
| column_indexes, = js['column_indexes'] |
| assert column_indexes['name'] is None |
| assert column_indexes['pandas_type'] == 'datetimetz' |
| assert column_indexes['numpy_type'] == 'datetime64[ns]' |
| |
| md = column_indexes['metadata'] |
| assert md['timezone'] == 'America/New_York' |
| |
| def test_datetimetz_row_index(self): |
| df = pd.DataFrame({ |
| 'a': pd.date_range( |
| start='2017-01-01', periods=3, tz='America/New_York' |
| ) |
| }) |
| df = df.set_index('a') |
| |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_categorical_row_index(self): |
| df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]}) |
| df['a'] = df.a.astype('category') |
| df = df.set_index('a') |
| |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_duplicate_column_names_does_not_crash(self): |
| df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa')) |
| with pytest.raises(ValueError): |
| pa.Table.from_pandas(df) |
| |
| def test_dictionary_indices_boundscheck(self): |
| # ARROW-1658. No validation of indices leads to segfaults in pandas |
| indices = [[0, 1], [0, -1]] |
| |
| for inds in indices: |
| arr = pa.DictionaryArray.from_arrays(inds, ['a'], safe=False) |
| batch = pa.RecordBatch.from_arrays([arr], ['foo']) |
| table = pa.Table.from_batches([batch, batch, batch]) |
| |
| with pytest.raises(pa.ArrowInvalid): |
| arr.to_pandas() |
| |
| with pytest.raises(pa.ArrowInvalid): |
| table.to_pandas() |
| |
| def test_unicode_with_unicode_column_and_index(self): |
| df = pd.DataFrame({u'あ': [u'い']}, index=[u'う']) |
| |
| _check_pandas_roundtrip(df, preserve_index=True) |
| |
| def test_mixed_column_names(self): |
| # mixed type column names are not reconstructed exactly |
| df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) |
| |
| for cols in [[u'あ', b'a'], [1, '2'], [1, 1.5]]: |
| df.columns = pd.Index(cols, dtype=object) |
| |
            # assert that from_pandas raises the warning
| with pytest.warns(UserWarning): |
| pa.Table.from_pandas(df) |
| |
| expected = df.copy() |
| expected.columns = df.columns.astype(six.text_type) |
| with pytest.warns(UserWarning): |
| _check_pandas_roundtrip(df, expected=expected, |
| preserve_index=True) |
| |
| def test_binary_column_name(self): |
| column_data = [u'い'] |
| key = u'あ'.encode('utf8') |
| data = {key: column_data} |
| df = pd.DataFrame(data) |
| |
        # we can't use _check_pandas_roundtrip here because our metadata
        # is always decoded as utf8: even if binary goes in, utf8 comes out
| t = pa.Table.from_pandas(df, preserve_index=True) |
| df2 = t.to_pandas() |
| assert df.values[0] == df2.values[0] |
| assert df.index.values[0] == df2.index.values[0] |
| assert df.columns[0] == key |
| |
| def test_multiindex_duplicate_values(self): |
| num_rows = 3 |
| numbers = list(range(num_rows)) |
| index = pd.MultiIndex.from_arrays( |
| [['foo', 'foo', 'bar'], numbers], |
| names=['foobar', 'some_numbers'], |
| ) |
| |
| df = pd.DataFrame({'numbers': numbers}, index=index) |
| |
| table = pa.Table.from_pandas(df) |
| result_df = table.to_pandas() |
| tm.assert_frame_equal(result_df, df) |
| |
| def test_metadata_with_mixed_types(self): |
| df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']}) |
| table = pa.Table.from_pandas(df) |
| js = table.schema.pandas_metadata |
| assert 'mixed' not in js |
| data_column = js['columns'][0] |
| assert data_column['pandas_type'] == 'bytes' |
| assert data_column['numpy_type'] == 'object' |
| |
| def test_ignore_metadata(self): |
| df = pd.DataFrame({'a': [1, 2, 3], 'b': ['foo', 'bar', 'baz']}, |
| index=['one', 'two', 'three']) |
| table = pa.Table.from_pandas(df) |
| |
| result = table.to_pandas(ignore_metadata=True) |
| expected = (table.cast(table.schema.remove_metadata()) |
| .to_pandas()) |
| |
| assert result.equals(expected) |
| |
| def test_list_metadata(self): |
| df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]}) |
| schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))]) |
| table = pa.Table.from_pandas(df, schema=schema) |
| js = table.schema.pandas_metadata |
| assert 'mixed' not in js |
| data_column = js['columns'][0] |
| assert data_column['pandas_type'] == 'list[int64]' |
| assert data_column['numpy_type'] == 'object' |
| |
| def test_struct_metadata(self): |
| df = pd.DataFrame({'dicts': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]}) |
| table = pa.Table.from_pandas(df) |
| pandas_metadata = table.schema.pandas_metadata |
| assert pandas_metadata['columns'][0]['pandas_type'] == 'object' |
| |
| def test_decimal_metadata(self): |
| expected = pd.DataFrame({ |
| 'decimals': [ |
| decimal.Decimal('394092382910493.12341234678'), |
| -decimal.Decimal('314292388910493.12343437128'), |
| ] |
| }) |
| table = pa.Table.from_pandas(expected) |
| js = table.schema.pandas_metadata |
| assert 'mixed' not in js |
| data_column = js['columns'][0] |
| assert data_column['pandas_type'] == 'decimal' |
| assert data_column['numpy_type'] == 'object' |
| assert data_column['metadata'] == {'precision': 26, 'scale': 11} |
| |
| def test_table_column_subset_metadata(self): |
| # ARROW-1883 |
| # non-default index |
| for index in [ |
| pd.Index(['a', 'b', 'c'], name='index'), |
| pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]: |
| df = pd.DataFrame({'a': [1, 2, 3], |
| 'b': [.1, .2, .3]}, index=index) |
| table = pa.Table.from_pandas(df) |
| |
| table_subset = table.remove_column(1) |
| result = table_subset.to_pandas() |
| tm.assert_frame_equal(result, df[['a']]) |
| |
| table_subset2 = table_subset.remove_column(1) |
| result = table_subset2.to_pandas() |
| tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) |
| |
| def test_empty_list_metadata(self): |
| # Create table with array of empty lists, forced to have type |
| # list(string) in pyarrow |
| c1 = [["test"], ["a", "b"], None] |
| c2 = [[], [], []] |
| arrays = OrderedDict([ |
| ('c1', pa.array(c1, type=pa.list_(pa.string()))), |
| ('c2', pa.array(c2, type=pa.list_(pa.string()))), |
| ]) |
| rb = pa.RecordBatch.from_arrays( |
| list(arrays.values()), |
| list(arrays.keys()) |
| ) |
| tbl = pa.Table.from_batches([rb]) |
| |
| # First roundtrip changes schema, because pandas cannot preserve the |
| # type of empty lists |
| df = tbl.to_pandas() |
| tbl2 = pa.Table.from_pandas(df) |
| md2 = tbl2.schema.pandas_metadata |
| |
| # Second roundtrip |
| df2 = tbl2.to_pandas() |
| expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)])) |
| |
| tm.assert_frame_equal(df2, expected) |
| |
| assert md2['columns'] == [ |
| { |
| 'name': 'c1', |
| 'field_name': 'c1', |
| 'metadata': None, |
| 'numpy_type': 'object', |
| 'pandas_type': 'list[unicode]', |
| }, |
| { |
| 'name': 'c2', |
| 'field_name': 'c2', |
| 'metadata': None, |
| 'numpy_type': 'object', |
| 'pandas_type': 'list[empty]', |
| } |
| ] |
| |
| def test_metadata_pandas_version(self): |
| df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]}) |
| table = pa.Table.from_pandas(df) |
| assert table.schema.pandas_metadata['pandas_version'] is not None |
| |
| |
| class TestConvertPrimitiveTypes(object): |
| """ |
| Conversion tests for primitive (e.g. numeric) types. |
| """ |
| |
| def test_float_no_nulls(self): |
| data = {} |
| fields = [] |
| dtypes = [('f2', pa.float16()), |
| ('f4', pa.float32()), |
| ('f8', pa.float64())] |
| num_values = 100 |
| |
| for numpy_dtype, arrow_dtype in dtypes: |
| values = np.random.randn(num_values) |
| data[numpy_dtype] = values.astype(numpy_dtype) |
| fields.append(pa.field(numpy_dtype, arrow_dtype)) |
| |
| df = pd.DataFrame(data) |
| schema = pa.schema(fields) |
| _check_pandas_roundtrip(df, expected_schema=schema) |
| |
| def test_float_nulls(self): |
| num_values = 100 |
| |
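        # mask out roughly 30% of the values as nulls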
| null_mask = np.random.randint(0, 10, size=num_values) < 3 |
| dtypes = [('f2', pa.float16()), |
| ('f4', pa.float32()), |
| ('f8', pa.float64())] |
| names = ['f2', 'f4', 'f8'] |
| expected_cols = [] |
| |
| arrays = [] |
| fields = [] |
| for name, arrow_dtype in dtypes: |
| values = np.random.randn(num_values).astype(name) |
| |
| arr = pa.array(values, from_pandas=True, mask=null_mask) |
| arrays.append(arr) |
| fields.append(pa.field(name, arrow_dtype)) |
| values[null_mask] = np.nan |
| |
| expected_cols.append(values) |
| |
| ex_frame = pd.DataFrame(dict(zip(names, expected_cols)), |
| columns=names) |
| |
| table = pa.Table.from_arrays(arrays, names) |
| assert table.schema.equals(pa.schema(fields)) |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, ex_frame) |
| |
| def test_float_nulls_to_ints(self): |
| # ARROW-2135 |
        df = pd.DataFrame({"a": [1.0, 2.0, np.nan]})
| schema = pa.schema([pa.field("a", pa.int16(), nullable=True)]) |
| table = pa.Table.from_pandas(df, schema=schema, safe=False) |
| assert table[0].to_pylist() == [1, 2, None] |
| tm.assert_frame_equal(df, table.to_pandas()) |
| |
| def test_float_nulls_to_boolean(self): |
| s = pd.Series([0.0, 1.0, 2.0, None, -3.0]) |
| expected = pd.Series([False, True, True, None, True]) |
| _check_array_roundtrip(s, expected=expected, type=pa.bool_()) |
| |
| def test_series_from_pandas_false_respected(self): |
| # Check that explicit from_pandas=False is respected |
| s = pd.Series([0.0, np.nan]) |
| arr = pa.array(s, from_pandas=False) |
| assert arr.null_count == 0 |
| assert np.isnan(arr[1].as_py()) |
| |
| def test_integer_no_nulls(self): |
| data = OrderedDict() |
| fields = [] |
| |
| numpy_dtypes = [ |
| ('i1', pa.int8()), ('i2', pa.int16()), |
| ('i4', pa.int32()), ('i8', pa.int64()), |
| ('u1', pa.uint8()), ('u2', pa.uint16()), |
| ('u4', pa.uint32()), ('u8', pa.uint64()), |
| ('longlong', pa.int64()), ('ulonglong', pa.uint64()) |
| ] |
| num_values = 100 |
| |
| for dtype, arrow_dtype in numpy_dtypes: |
| info = np.iinfo(dtype) |
| values = np.random.randint(max(info.min, np.iinfo(np.int_).min), |
| min(info.max, np.iinfo(np.int_).max), |
| size=num_values) |
| data[dtype] = values.astype(dtype) |
| fields.append(pa.field(dtype, arrow_dtype)) |
| |
| df = pd.DataFrame(data) |
| schema = pa.schema(fields) |
| _check_pandas_roundtrip(df, expected_schema=schema) |
| |
| def test_all_integer_types(self): |
| # Test all Numpy integer aliases |
| data = OrderedDict() |
| numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', |
| 'byte', 'ubyte', 'short', 'ushort', 'intc', 'uintc', |
| 'int_', 'uint', 'longlong', 'ulonglong'] |
| for dtype in numpy_dtypes: |
| data[dtype] = np.arange(12, dtype=dtype) |
| df = pd.DataFrame(data) |
| _check_pandas_roundtrip(df) |
| |
| # Do the same with pa.array() |
| # (for some reason, it doesn't use the same code paths at all) |
| for np_arr in data.values(): |
| arr = pa.array(np_arr) |
| assert arr.to_pylist() == np_arr.tolist() |
| |
| def test_integer_byteorder(self): |
| # Byteswapped arrays are not supported yet |
| int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] |
| for dt in int_dtypes: |
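            # '=' is native, '<' little-endian, '>' big-endian byte order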
| for order in '=<>': |
| data = np.array([1, 2, 42], dtype=order + dt) |
| for np_arr in (data, data[::2]): |
| if data.dtype.isnative: |
| arr = pa.array(data) |
| assert arr.to_pylist() == data.tolist() |
| else: |
| with pytest.raises(NotImplementedError): |
| arr = pa.array(data) |
| |
| def test_integer_with_nulls(self): |
| # pandas requires upcast to float dtype |
| |
| int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] |
| num_values = 100 |
| |
| null_mask = np.random.randint(0, 10, size=num_values) < 3 |
| |
| expected_cols = [] |
| arrays = [] |
| for name in int_dtypes: |
| values = np.random.randint(0, 100, size=num_values) |
| |
| arr = pa.array(values, mask=null_mask) |
| arrays.append(arr) |
| |
| expected = values.astype('f8') |
| expected[null_mask] = np.nan |
| |
| expected_cols.append(expected) |
| |
| ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), |
| columns=int_dtypes) |
| |
| table = pa.Table.from_arrays(arrays, int_dtypes) |
| result = table.to_pandas() |
| |
| tm.assert_frame_equal(result, ex_frame) |
| |
| def test_array_from_pandas_type_cast(self): |
| arr = np.arange(10, dtype='int64') |
| |
| target_type = pa.int8() |
| |
| result = pa.array(arr, type=target_type) |
| expected = pa.array(arr.astype('int8')) |
| assert result.equals(expected) |
| |
| def test_boolean_no_nulls(self): |
| num_values = 100 |
| |
| np.random.seed(0) |
| |
| df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) |
| field = pa.field('bools', pa.bool_()) |
| schema = pa.schema([field]) |
| _check_pandas_roundtrip(df, expected_schema=schema) |
| |
| def test_boolean_nulls(self): |
| # pandas requires upcast to object dtype |
| num_values = 100 |
| np.random.seed(0) |
| |
| mask = np.random.randint(0, 10, size=num_values) < 3 |
| values = np.random.randint(0, 10, size=num_values) < 5 |
| |
| arr = pa.array(values, mask=mask) |
| |
| expected = values.astype(object) |
| expected[mask] = None |
| |
| field = pa.field('bools', pa.bool_()) |
| schema = pa.schema([field]) |
| ex_frame = pd.DataFrame({'bools': expected}) |
| |
| table = pa.Table.from_arrays([arr], ['bools']) |
| assert table.schema.equals(schema) |
| result = table.to_pandas() |
| |
| tm.assert_frame_equal(result, ex_frame) |
| |
| def test_boolean_to_int(self): |
| # test from dtype=bool |
| s = pd.Series([True, True, False, True, True] * 2) |
| expected = pd.Series([1, 1, 0, 1, 1] * 2) |
| _check_array_roundtrip(s, expected=expected, type=pa.int64()) |
| |
| def test_boolean_objects_to_int(self): |
| # test from dtype=object |
| s = pd.Series([True, True, False, True, True] * 2, dtype=object) |
| expected = pd.Series([1, 1, 0, 1, 1] * 2) |
| expected_msg = 'Expected integer, got bool' |
| with pytest.raises(pa.ArrowTypeError, match=expected_msg): |
| _check_array_roundtrip(s, expected=expected, type=pa.int64()) |
| |
| def test_boolean_nulls_to_float(self): |
| # test from dtype=object |
| s = pd.Series([True, True, False, None, True] * 2) |
| expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2) |
| _check_array_roundtrip(s, expected=expected, type=pa.float64()) |
| |
| def test_float_object_nulls(self): |
| arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) |
| df = pd.DataFrame({'floats': arr}) |
| expected = pd.DataFrame({'floats': pd.to_numeric(arr)}) |
| field = pa.field('floats', pa.float64()) |
| schema = pa.schema([field]) |
| _check_pandas_roundtrip(df, expected=expected, |
| expected_schema=schema) |
| |
| def test_float_with_null_as_integer(self): |
| # ARROW-2298 |
| s = pd.Series([np.nan, 1., 2., np.nan]) |
| |
| types = [pa.int8(), pa.int16(), pa.int32(), pa.int64(), |
| pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] |
| for ty in types: |
| result = pa.array(s, type=ty) |
| expected = pa.array([None, 1, 2, None], type=ty) |
| assert result.equals(expected) |
| |
| df = pd.DataFrame({'has_nulls': s}) |
| schema = pa.schema([pa.field('has_nulls', ty)]) |
| result = pa.Table.from_pandas(df, schema=schema, |
| preserve_index=False) |
| assert result[0].data.chunk(0).equals(expected) |
| |
| def test_int_object_nulls(self): |
| arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) |
| df = pd.DataFrame({'ints': arr}) |
| expected = pd.DataFrame({'ints': pd.to_numeric(arr)}) |
| field = pa.field('ints', pa.int64()) |
| schema = pa.schema([field]) |
| _check_pandas_roundtrip(df, expected=expected, |
| expected_schema=schema) |
| |
| def test_boolean_object_nulls(self): |
| arr = np.array([False, None, True] * 100, dtype=object) |
| df = pd.DataFrame({'bools': arr}) |
| field = pa.field('bools', pa.bool_()) |
| schema = pa.schema([field]) |
| _check_pandas_roundtrip(df, expected_schema=schema) |
| |
| def test_all_nulls_cast_numeric(self): |
| arr = np.array([None], dtype=object) |
| |
| def _check_type(t): |
| a2 = pa.array(arr, type=t) |
| assert a2.type == t |
| assert a2[0].as_py() is None |
| |
| _check_type(pa.int32()) |
| _check_type(pa.float64()) |
| |
| def test_half_floats_from_numpy(self): |
| arr = np.array([1.5, np.nan], dtype=np.float16) |
| a = pa.array(arr, type=pa.float16()) |
| x, y = a.to_pylist() |
| assert isinstance(x, np.float16) |
| assert x == 1.5 |
| assert isinstance(y, np.float16) |
| assert np.isnan(y) |
| |
| a = pa.array(arr, type=pa.float16(), from_pandas=True) |
| x, y = a.to_pylist() |
| assert isinstance(x, np.float16) |
| assert x == 1.5 |
| assert y is None |
| |
| |
| @pytest.mark.parametrize('dtype', |
| ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']) |
| def test_array_integer_object_nulls_option(dtype): |
| num_values = 100 |
| |
| null_mask = np.random.randint(0, 10, size=num_values) < 3 |
| values = np.random.randint(0, 100, size=num_values, dtype=dtype) |
| |
| array = pa.array(values, mask=null_mask) |
| |
| if null_mask.any(): |
| expected = values.astype('O') |
| expected[null_mask] = None |
| else: |
| expected = values |
| |
| result = array.to_pandas(integer_object_nulls=True) |
| |
| np.testing.assert_equal(result, expected) |
| |
| |
| @pytest.mark.parametrize('dtype', |
| ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']) |
| def test_table_integer_object_nulls_option(dtype): |
| num_values = 100 |
| |
| null_mask = np.random.randint(0, 10, size=num_values) < 3 |
| values = np.random.randint(0, 100, size=num_values, dtype=dtype) |
| |
| array = pa.array(values, mask=null_mask) |
| |
| if null_mask.any(): |
| expected = values.astype('O') |
| expected[null_mask] = None |
| else: |
| expected = values |
| |
| expected = pd.DataFrame({dtype: expected}) |
| |
| table = pa.Table.from_arrays([array], [dtype]) |
| result = table.to_pandas(integer_object_nulls=True) |
| |
| tm.assert_frame_equal(result, expected) |
| |
| |
| class TestConvertDateTimeLikeTypes(object): |
| """ |
| Conversion tests for datetime- and timestamp-like types (date64, etc.). |
| """ |
| |
| def test_timestamps_notimezone_no_nulls(self): |
| df = pd.DataFrame({ |
| 'datetime64': np.array([ |
| '2007-07-13T01:23:34.123456789', |
| '2006-01-13T12:34:56.432539784', |
| '2010-08-13T05:46:57.437699912'], |
| dtype='datetime64[ns]') |
| }) |
| field = pa.field('datetime64', pa.timestamp('ns')) |
| schema = pa.schema([field]) |
| _check_pandas_roundtrip( |
| df, |
| expected_schema=schema, |
| ) |
| |
| def test_timestamps_notimezone_nulls(self): |
| df = pd.DataFrame({ |
| 'datetime64': np.array([ |
| '2007-07-13T01:23:34.123456789', |
| None, |
| '2010-08-13T05:46:57.437699912'], |
| dtype='datetime64[ns]') |
| }) |
| field = pa.field('datetime64', pa.timestamp('ns')) |
| schema = pa.schema([field]) |
| _check_pandas_roundtrip( |
| df, |
| expected_schema=schema, |
| ) |
| |
| def test_timestamps_with_timezone(self): |
| df = pd.DataFrame({ |
| 'datetime64': np.array([ |
| '2007-07-13T01:23:34.123', |
| '2006-01-13T12:34:56.432', |
| '2010-08-13T05:46:57.437'], |
| dtype='datetime64[ms]') |
| }) |
| df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern') |
| _check_pandas_roundtrip(df) |
| |
| _check_series_roundtrip(df['datetime64']) |
| |
        # drop in a null and use ns resolution instead of ms
| df = pd.DataFrame({ |
| 'datetime64': np.array([ |
| '2007-07-13T01:23:34.123456789', |
| None, |
| '2006-01-13T12:34:56.432539784', |
| '2010-08-13T05:46:57.437699912'], |
| dtype='datetime64[ns]') |
| }) |
| df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern') |
| |
| _check_pandas_roundtrip(df) |
| |
| def test_python_datetime(self): |
| # ARROW-2106 |
| date_array = [datetime.today() + timedelta(days=x) for x in range(10)] |
| df = pd.DataFrame({ |
| 'datetime': pd.Series(date_array, dtype=object) |
| }) |
| |
| table = pa.Table.from_pandas(df) |
| assert isinstance(table[0].data.chunk(0), pa.TimestampArray) |
| |
| result = table.to_pandas() |
| expected_df = pd.DataFrame({ |
| 'datetime': date_array |
| }) |
| tm.assert_frame_equal(expected_df, result) |
| |
| def test_python_datetime_with_pytz_tzinfo(self): |
| for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]: |
| values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)] |
| df = pd.DataFrame({'datetime': values}) |
| _check_pandas_roundtrip(df) |
| |
| @h.given(st.none() | tzst.timezones()) |
| def test_python_datetime_with_pytz_timezone(self, tz): |
| values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)] |
| df = pd.DataFrame({'datetime': values}) |
| _check_pandas_roundtrip(df) |
| |
| @pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since ' |
| 'python version 3.2') |
| def test_python_datetime_with_timezone_tzinfo(self): |
| from datetime import timezone |
| |
| values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=pytz.utc)] |
| df = pd.DataFrame({'datetime': values}) |
| _check_pandas_roundtrip(df) |
| |
        # datetime.timezone offsets roundtrip back as pytz.FixedOffset
| hours = 1 |
| tz_timezone = timezone(timedelta(hours=hours)) |
| tz_pytz = pytz.FixedOffset(hours * 60) |
| values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)] |
| values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)] |
| df = pd.DataFrame({'datetime': values}) |
| df_exp = pd.DataFrame({'datetime': values_exp}) |
| _check_pandas_roundtrip(df, expected=df_exp) |
| |
| def test_python_datetime_subclass(self): |
| |
| class MyDatetime(datetime): |
| # see https://github.com/pandas-dev/pandas/issues/21142 |
| nanosecond = 0.0 |
| |
| date_array = [MyDatetime(2000, 1, 1, 1, 1, 1)] |
| df = pd.DataFrame({"datetime": pd.Series(date_array, dtype=object)}) |
| |
| table = pa.Table.from_pandas(df) |
| assert isinstance(table[0].data.chunk(0), pa.TimestampArray) |
| |
| result = table.to_pandas() |
| expected_df = pd.DataFrame({"datetime": date_array}) |
| |
| # https://github.com/pandas-dev/pandas/issues/21142 |
| expected_df["datetime"] = pd.to_datetime(expected_df["datetime"]) |
| |
| tm.assert_frame_equal(expected_df, result) |
| |
| def test_python_date_subclass(self): |
| |
| class MyDate(date): |
| pass |
| |
| date_array = [MyDate(2000, 1, 1)] |
| df = pd.DataFrame({"date": pd.Series(date_array, dtype=object)}) |
| |
| table = pa.Table.from_pandas(df) |
| assert isinstance(table[0].data.chunk(0), pa.Date32Array) |
| |
| result = table.to_pandas() |
| expected_df = pd.DataFrame( |
| {"date": np.array([date(2000, 1, 1)], dtype=object)} |
| ) |
| tm.assert_frame_equal(expected_df, result) |
| |
| def test_datetime64_to_date32(self): |
| # ARROW-1718 |
| arr = pa.array([date(2017, 10, 23), None]) |
| c = pa.Column.from_array("d", arr) |
| s = c.to_pandas() |
| |
| arr2 = pa.Array.from_pandas(s, type=pa.date32()) |
| |
| assert arr2.equals(arr.cast('date32')) |
| |
| @pytest.mark.parametrize('mask', [ |
| None, |
| np.array([True, False, False]), |
| ]) |
| def test_pandas_datetime_to_date64(self, mask): |
| s = pd.to_datetime([ |
| '2018-05-10T00:00:00', |
| '2018-05-11T00:00:00', |
| '2018-05-12T00:00:00', |
| ]) |
| arr = pa.Array.from_pandas(s, type=pa.date64(), mask=mask) |
| |
| data = np.array([ |
| date(2018, 5, 10), |
| date(2018, 5, 11), |
| date(2018, 5, 12) |
| ]) |
| expected = pa.array(data, mask=mask, type=pa.date64()) |
| |
| assert arr.equals(expected) |
| |
| @pytest.mark.parametrize('mask', [ |
| None, |
| np.array([True, False, False]) |
| ]) |
| def test_pandas_datetime_to_date64_failures(self, mask): |
| s = pd.to_datetime([ |
| '2018-05-10T10:24:01', |
| '2018-05-11T10:24:01', |
| '2018-05-12T10:24:01', |
| ]) |
| |
| expected_msg = 'Timestamp value had non-zero intraday milliseconds' |
| with pytest.raises(pa.ArrowInvalid, match=expected_msg): |
| pa.Array.from_pandas(s, type=pa.date64(), mask=mask) |
| |
| def test_array_types_date_as_object(self): |
| data = [date(2000, 1, 1), |
| None, |
| date(1970, 1, 1), |
| date(2040, 2, 26)] |
| expected = np.array(['2000-01-01', |
| None, |
| '1970-01-01', |
| '2040-02-26'], dtype='datetime64') |
| |
| objects = [ |
| # The second value is the expected value for date_as_object=False |
| (pa.array(data), expected), |
| (pa.chunked_array([data]), expected), |
| (pa.column('date', [data]), expected.astype('M8[ns]'))] |
| |
| assert objects[0][0].equals(pa.array(expected)) |
| |
| for obj, expected_datetime64 in objects: |
| result = obj.to_pandas() |
| expected_obj = expected.astype(object) |
| assert result.dtype == expected_obj.dtype |
| npt.assert_array_equal(result, expected_obj) |
| |
| result = obj.to_pandas(date_as_object=False) |
| assert result.dtype == expected_datetime64.dtype |
| npt.assert_array_equal(result, expected_datetime64) |
| |
| def test_table_convert_date_as_object(self): |
| df = pd.DataFrame({ |
| 'date': [date(2000, 1, 1), |
| None, |
| date(1970, 1, 1), |
| date(2040, 2, 26)]}) |
| |
| table = pa.Table.from_pandas(df, preserve_index=False) |
| |
| df_datetime = table.to_pandas(date_as_object=False) |
| df_object = table.to_pandas() |
| |
| tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime, |
| check_dtype=True) |
| tm.assert_frame_equal(df, df_object, check_dtype=True) |
| |
| def test_date_infer(self): |
| df = pd.DataFrame({ |
| 'date': [date(2000, 1, 1), |
| None, |
| date(1970, 1, 1), |
| date(2040, 2, 26)]}) |
| table = pa.Table.from_pandas(df, preserve_index=False) |
| field = pa.field('date', pa.date32()) |
| |
| # schema's metadata is generated by from_pandas conversion |
| expected_schema = pa.schema([field], metadata=table.schema.metadata) |
| assert table.schema.equals(expected_schema) |
| |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, df) |
| |
| def test_date_mask(self): |
| arr = np.array([date(2017, 4, 3), date(2017, 4, 4)], |
| dtype='datetime64[D]') |
| mask = [True, False] |
| result = pa.array(arr, mask=np.array(mask)) |
| expected = np.array([None, date(2017, 4, 4)], dtype='datetime64[D]') |
| expected = pa.array(expected, from_pandas=True) |
| assert expected.equals(result) |
| |
| def test_date_objects_typed(self): |
| arr = np.array([ |
| date(2017, 4, 3), |
| None, |
| date(2017, 4, 4), |
| date(2017, 4, 5)], dtype=object) |
| |
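        # 17259 days after the Unix epoch is 2017-04-03; date32 stores days,
        # date64 stores milliseconds (hence the 86400000 factor below)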
| arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32') |
| arr_i8 = arr_i4.astype('int64') * 86400000 |
| mask = np.array([False, True, False, False]) |
| |
| t32 = pa.date32() |
| t64 = pa.date64() |
| |
| a32 = pa.array(arr, type=t32) |
| a64 = pa.array(arr, type=t64) |
| |
| a32_expected = pa.array(arr_i4, mask=mask, type=t32) |
| a64_expected = pa.array(arr_i8, mask=mask, type=t64) |
| |
| assert a32.equals(a32_expected) |
| assert a64.equals(a64_expected) |
| |
| # Test converting back to pandas |
| colnames = ['date32', 'date64'] |
| table = pa.Table.from_arrays([a32, a64], colnames) |
| |
| ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04', |
| '2017-04-05'], |
| dtype='datetime64[D]')) |
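        # position 1 was masked above, so mark it as NaT in the expectation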
| ex_values[1] = pd.NaT.value |
| |
| ex_datetime64ns = ex_values.astype('datetime64[ns]') |
| expected_pandas = pd.DataFrame({'date32': ex_datetime64ns, |
| 'date64': ex_datetime64ns}, |
| columns=colnames) |
| table_pandas = table.to_pandas(date_as_object=False) |
| tm.assert_frame_equal(table_pandas, expected_pandas) |
| |
| table_pandas_objects = table.to_pandas() |
| ex_objects = ex_values.astype('object') |
| expected_pandas_objects = pd.DataFrame({'date32': ex_objects, |
| 'date64': ex_objects}, |
| columns=colnames) |
| tm.assert_frame_equal(table_pandas_objects, |
| expected_pandas_objects) |
| |
| def test_dates_from_integers(self): |
| t1 = pa.date32() |
| t2 = pa.date64() |
| |
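        # date32 stores days since the epoch, date64 milliseconds since epoch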
| arr = np.array([17259, 17260, 17261], dtype='int32') |
| arr2 = arr.astype('int64') * 86400000 |
| |
| a1 = pa.array(arr, type=t1) |
| a2 = pa.array(arr2, type=t2) |
| |
| expected = date(2017, 4, 3) |
| assert a1[0].as_py() == expected |
| assert a2[0].as_py() == expected |
| |
| @pytest.mark.xfail(reason="not supported ATM", |
| raises=NotImplementedError) |
| def test_timedelta(self): |
        # TODO(jreback): Pandas only supports ns resolution
        # Arrow supports ??? for resolution
| df = pd.DataFrame({ |
| 'timedelta': np.arange(start=0, stop=3 * 86400000, |
| step=86400000, |
| dtype='timedelta64[ms]') |
| }) |
| pa.Table.from_pandas(df) |
| |
| def test_pytime_from_pandas(self): |
| pytimes = [time(1, 2, 3, 1356), |
| time(4, 5, 6, 1356)] |
| |
| # microseconds |
| t1 = pa.time64('us') |
| |
| aobjs = np.array(pytimes + [None], dtype=object) |
| parr = pa.array(aobjs) |
| assert parr.type == t1 |
| assert parr[0].as_py() == pytimes[0] |
| assert parr[1].as_py() == pytimes[1] |
| assert parr[2] is pa.NA |
| |
| # DataFrame |
| df = pd.DataFrame({'times': aobjs}) |
| batch = pa.RecordBatch.from_pandas(df) |
| assert batch[0].equals(parr) |
| |
| # Test ndarray of int64 values |
| arr = np.array([_pytime_to_micros(v) for v in pytimes], |
| dtype='int64') |
| |
| a1 = pa.array(arr, type=pa.time64('us')) |
| assert a1[0].as_py() == pytimes[0] |
| |
| a2 = pa.array(arr * 1000, type=pa.time64('ns')) |
| assert a2[0].as_py() == pytimes[0] |
| |
| a3 = pa.array((arr / 1000).astype('i4'), |
| type=pa.time32('ms')) |
| assert a3[0].as_py() == pytimes[0].replace(microsecond=1000) |
| |
| a4 = pa.array((arr / 1000000).astype('i4'), |
| type=pa.time32('s')) |
| assert a4[0].as_py() == pytimes[0].replace(microsecond=0) |
| |
| def test_arrow_time_to_pandas(self): |
| pytimes = [time(1, 2, 3, 1356), |
| time(4, 5, 6, 1356), |
| time(0, 0, 0)] |
| |
| expected = np.array(pytimes[:2] + [None]) |
| expected_ms = np.array([x.replace(microsecond=1000) |
| for x in pytimes[:2]] + |
| [None]) |
| expected_s = np.array([x.replace(microsecond=0) |
| for x in pytimes[:2]] + |
| [None]) |
| |
| arr = np.array([_pytime_to_micros(v) for v in pytimes], |
| dtype='int64') |
| |
| null_mask = np.array([False, False, True], dtype=bool) |
| |
| a1 = pa.array(arr, mask=null_mask, type=pa.time64('us')) |
| a2 = pa.array(arr * 1000, mask=null_mask, |
| type=pa.time64('ns')) |
| |
| a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask, |
| type=pa.time32('ms')) |
| a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask, |
| type=pa.time32('s')) |
| |
| names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]'] |
| batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names) |
| arr = a1.to_pandas() |
| assert (arr == expected).all() |
| |
| arr = a2.to_pandas() |
| assert (arr == expected).all() |
| |
| arr = a3.to_pandas() |
| assert (arr == expected_ms).all() |
| |
| arr = a4.to_pandas() |
| assert (arr == expected_s).all() |
| |
| df = batch.to_pandas() |
| expected_df = pd.DataFrame({'time64[us]': expected, |
| 'time64[ns]': expected, |
| 'time32[ms]': expected_ms, |
| 'time32[s]': expected_s}, |
| columns=names) |
| |
| tm.assert_frame_equal(df, expected_df) |
| |
| def test_numpy_datetime64_columns(self): |
| datetime64_ns = np.array([ |
| '2007-07-13T01:23:34.123456789', |
| None, |
| '2006-01-13T12:34:56.432539784', |
| '2010-08-13T05:46:57.437699912'], |
| dtype='datetime64[ns]') |
| _check_array_from_pandas_roundtrip(datetime64_ns) |
| |
| datetime64_us = np.array([ |
| '2007-07-13T01:23:34.123456', |
| None, |
| '2006-01-13T12:34:56.432539', |
| '2010-08-13T05:46:57.437699'], |
| dtype='datetime64[us]') |
| _check_array_from_pandas_roundtrip(datetime64_us) |
| |
| datetime64_ms = np.array([ |
| '2007-07-13T01:23:34.123', |
| None, |
| '2006-01-13T12:34:56.432', |
| '2010-08-13T05:46:57.437'], |
| dtype='datetime64[ms]') |
| _check_array_from_pandas_roundtrip(datetime64_ms) |
| |
| datetime64_s = np.array([ |
| '2007-07-13T01:23:34', |
| None, |
| '2006-01-13T12:34:56', |
| '2010-08-13T05:46:57'], |
| dtype='datetime64[s]') |
| _check_array_from_pandas_roundtrip(datetime64_s) |
| |
| @pytest.mark.parametrize('dtype', [pa.date32(), pa.date64()]) |
| def test_numpy_datetime64_day_unit(self, dtype): |
| datetime64_d = np.array([ |
| '2007-07-13', |
| None, |
| '2006-01-15', |
| '2010-08-19'], |
| dtype='datetime64[D]') |
| _check_array_from_pandas_roundtrip(datetime64_d, type=dtype) |
| |
| def test_array_from_pandas_date_with_mask(self): |
| m = np.array([True, False, True]) |
| data = pd.Series([ |
| date(1990, 1, 1), |
| date(1991, 1, 1), |
| date(1992, 1, 1) |
| ]) |
| |
| result = pa.Array.from_pandas(data, mask=m) |
| |
| expected = pd.Series([None, date(1991, 1, 1), None]) |
| assert pa.Array.from_pandas(expected).equals(result) |
| |
| def test_fixed_offset_timezone(self): |
| df = pd.DataFrame({ |
| 'a': [ |
| pd.Timestamp('2012-11-11 00:00:00+01:00'), |
| pd.NaT |
| ] |
| }) |
| _check_pandas_roundtrip(df) |
| _check_serialize_components_roundtrip(df) |
| |
| # ---------------------------------------------------------------------- |
| # Conversion tests for string and binary types. |
| |
| |
| class TestConvertStringLikeTypes(object): |
| |
| def test_pandas_unicode(self): |
| repeats = 1000 |
| values = [u'foo', None, u'bar', u'mañana', np.nan] |
| df = pd.DataFrame({'strings': values * repeats}) |
| field = pa.field('strings', pa.string()) |
| schema = pa.schema([field]) |
| |
| _check_pandas_roundtrip(df, expected_schema=schema) |
| |
| def test_bytes_to_binary(self): |
| values = [u'qux', b'foo', None, bytearray(b'barz'), 'qux', np.nan] |
| df = pd.DataFrame({'strings': values}) |
| |
| table = pa.Table.from_pandas(df) |
| assert table[0].type == pa.binary() |
| |
| values2 = [b'qux', b'foo', None, b'barz', b'qux', np.nan] |
| expected = pd.DataFrame({'strings': values2}) |
| _check_pandas_roundtrip(df, expected) |
| |
| @pytest.mark.large_memory |
| def test_bytes_exceed_2gb(self): |
| v1 = b'x' * 100000000 |
| v2 = b'x' * 147483646 |
| |
| # ARROW-2227, hit exactly 2GB on the nose |
| df = pd.DataFrame({ |
| 'strings': [v1] * 20 + [v2] + ['x'] * 20 |
| }) |
| arr = pa.array(df['strings']) |
| assert isinstance(arr, pa.ChunkedArray) |
| assert arr.num_chunks == 2 |
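        # release the chunked array before converting the whole DataFrame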
| arr = None |
| |
| table = pa.Table.from_pandas(df) |
| assert table[0].data.num_chunks == 2 |
| |
| def test_fixed_size_bytes(self): |
| values = [b'foo', None, bytearray(b'bar'), None, None, b'hey'] |
| df = pd.DataFrame({'strings': values}) |
| schema = pa.schema([pa.field('strings', pa.binary(3))]) |
| table = pa.Table.from_pandas(df, schema=schema) |
| assert table.schema[0].type == schema[0].type |
| assert table.schema[0].name == schema[0].name |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, df) |
| |
| def test_fixed_size_bytes_does_not_accept_varying_lengths(self): |
| values = [b'foo', None, b'ba', None, None, b'hey'] |
| df = pd.DataFrame({'strings': values}) |
| schema = pa.schema([pa.field('strings', pa.binary(3))]) |
| with pytest.raises(pa.ArrowInvalid): |
| pa.Table.from_pandas(df, schema=schema) |
| |
| def test_variable_size_bytes(self): |
| s = pd.Series([b'123', b'', b'a', None]) |
| _check_series_roundtrip(s, type_=pa.binary()) |
| |
| def test_binary_from_bytearray(self): |
| s = pd.Series([bytearray(b'123'), bytearray(b''), bytearray(b'a'), |
| None]) |
| # Explicitly set type |
| _check_series_roundtrip(s, type_=pa.binary()) |
| # Infer type from bytearrays |
| _check_series_roundtrip(s, expected_pa_type=pa.binary()) |
| |
| def test_table_empty_str(self): |
| values = ['', '', '', '', ''] |
| df = pd.DataFrame({'strings': values}) |
| field = pa.field('strings', pa.string()) |
| schema = pa.schema([field]) |
| table = pa.Table.from_pandas(df, schema=schema) |
| |
| result1 = table.to_pandas(strings_to_categorical=False) |
| expected1 = pd.DataFrame({'strings': values}) |
| tm.assert_frame_equal(result1, expected1, check_dtype=True) |
| |
| result2 = table.to_pandas(strings_to_categorical=True) |
| expected2 = pd.DataFrame({'strings': pd.Categorical(values)}) |
| tm.assert_frame_equal(result2, expected2, check_dtype=True) |
| |
| def test_selective_categoricals(self): |
| values = ['', '', '', '', ''] |
| df = pd.DataFrame({'strings': values}) |
| field = pa.field('strings', pa.string()) |
| schema = pa.schema([field]) |
| table = pa.Table.from_pandas(df, schema=schema) |
| expected_str = pd.DataFrame({'strings': values}) |
| expected_cat = pd.DataFrame({'strings': pd.Categorical(values)}) |
| |
| result1 = table.to_pandas(categories=['strings']) |
| tm.assert_frame_equal(result1, expected_cat, check_dtype=True) |
| result2 = table.to_pandas(categories=[]) |
| tm.assert_frame_equal(result2, expected_str, check_dtype=True) |
| result3 = table.to_pandas(categories=('strings',)) |
| tm.assert_frame_equal(result3, expected_cat, check_dtype=True) |
| result4 = table.to_pandas(categories=tuple()) |
| tm.assert_frame_equal(result4, expected_str, check_dtype=True) |
| |
| def test_to_pandas_categorical_zero_length(self): |
| # ARROW-3586 |
| array = pa.array([], type=pa.int32()) |
| table = pa.Table.from_arrays(arrays=[array], names=['col']) |
| # This would segfault under 0.11.0 |
| table.to_pandas(categories=['col']) |
| |
| def test_table_str_to_categorical_without_na(self): |
| values = ['a', 'a', 'b', 'b', 'c'] |
| df = pd.DataFrame({'strings': values}) |
| field = pa.field('strings', pa.string()) |
| schema = pa.schema([field]) |
| table = pa.Table.from_pandas(df, schema=schema) |
| |
| result = table.to_pandas(strings_to_categorical=True) |
| expected = pd.DataFrame({'strings': pd.Categorical(values)}) |
| tm.assert_frame_equal(result, expected, check_dtype=True) |
| |
| with pytest.raises(pa.ArrowInvalid): |
| table.to_pandas(strings_to_categorical=True, |
| zero_copy_only=True) |
| |
| def test_table_str_to_categorical_with_na(self): |
| values = [None, 'a', 'b', np.nan] |
| df = pd.DataFrame({'strings': values}) |
| field = pa.field('strings', pa.string()) |
| schema = pa.schema([field]) |
| table = pa.Table.from_pandas(df, schema=schema) |
| |
| result = table.to_pandas(strings_to_categorical=True) |
| expected = pd.DataFrame({'strings': pd.Categorical(values)}) |
| tm.assert_frame_equal(result, expected, check_dtype=True) |
| |
| with pytest.raises(pa.ArrowInvalid): |
| table.to_pandas(strings_to_categorical=True, |
| zero_copy_only=True) |
| |
| # Regression test for ARROW-2101 |
| def test_array_of_bytes_to_strings(self): |
| converted = pa.array(np.array([b'x'], dtype=object), pa.string()) |
| assert converted.type == pa.string() |
| |
| # Make sure that if an ndarray of bytes is passed to the array |
| # constructor and the type is string, it will fail if those bytes |
| # cannot be converted to utf-8 |
| def test_array_of_bytes_to_strings_bad_data(self): |
| with pytest.raises( |
| pa.lib.ArrowInvalid, |
| match="was not a utf8 string"): |
| pa.array(np.array([b'\x80\x81'], dtype=object), pa.string()) |
| |
| def test_numpy_string_array_to_fixed_size_binary(self): |
| arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3') |
| |
| converted = pa.array(arr, type=pa.binary(3)) |
| expected = pa.array(list(arr), type=pa.binary(3)) |
| assert converted.equals(expected) |
| |
| mask = np.array([True, False, True]) |
| converted = pa.array(arr, type=pa.binary(3), mask=mask) |
| expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3)) |
| assert converted.equals(expected) |
| |
| with pytest.raises(pa.lib.ArrowInvalid, |
| match=r'Got bytestring of length 3 \(expected 4\)'): |
| arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3') |
| pa.array(arr, type=pa.binary(4)) |
| |
| with pytest.raises( |
| pa.lib.ArrowInvalid, |
| match=r'Got bytestring of length 12 \(expected 3\)'): |
| arr = np.array([b'foo', b'bar', b'baz'], dtype='|U3') |
| pa.array(arr, type=pa.binary(3)) |
| |
| |
| class TestConvertDecimalTypes(object): |
| """ |
    Conversion tests for decimal types.
| """ |
| decimal32 = [ |
| decimal.Decimal('-1234.123'), |
| decimal.Decimal('1234.439') |
| ] |
| decimal64 = [ |
| decimal.Decimal('-129934.123331'), |
| decimal.Decimal('129534.123731') |
| ] |
| decimal128 = [ |
| decimal.Decimal('394092382910493.12341234678'), |
| decimal.Decimal('-314292388910493.12343437128') |
| ] |
| |
| @pytest.mark.parametrize(('values', 'expected_type'), [ |
| pytest.param(decimal32, pa.decimal128(7, 3), id='decimal32'), |
| pytest.param(decimal64, pa.decimal128(12, 6), id='decimal64'), |
| pytest.param(decimal128, pa.decimal128(26, 11), id='decimal128') |
| ]) |
| def test_decimal_from_pandas(self, values, expected_type): |
| expected = pd.DataFrame({'decimals': values}) |
| table = pa.Table.from_pandas(expected, preserve_index=False) |
| field = pa.field('decimals', expected_type) |
| |
| # schema's metadata is generated by from_pandas conversion |
| expected_schema = pa.schema([field], metadata=table.schema.metadata) |
| assert table.schema.equals(expected_schema) |
| |
| @pytest.mark.parametrize('values', [ |
| pytest.param(decimal32, id='decimal32'), |
| pytest.param(decimal64, id='decimal64'), |
| pytest.param(decimal128, id='decimal128') |
| ]) |
| def test_decimal_to_pandas(self, values): |
| expected = pd.DataFrame({'decimals': values}) |
| converted = pa.Table.from_pandas(expected) |
| df = converted.to_pandas() |
| tm.assert_frame_equal(df, expected) |
| |
| def test_decimal_fails_with_truncation(self): |
| data1 = [decimal.Decimal('1.234')] |
| type1 = pa.decimal128(10, 2) |
| with pytest.raises(pa.ArrowInvalid): |
| pa.array(data1, type=type1) |
| |
| data2 = [decimal.Decimal('1.2345')] |
| type2 = pa.decimal128(10, 3) |
| with pytest.raises(pa.ArrowInvalid): |
| pa.array(data2, type=type2) |
| |
| def test_decimal_with_different_precisions(self): |
| data = [ |
| decimal.Decimal('0.01'), |
| decimal.Decimal('0.001'), |
| ] |
| series = pd.Series(data) |
| array = pa.array(series) |
| assert array.to_pylist() == data |
| assert array.type == pa.decimal128(3, 3) |
| |
| array = pa.array(data, type=pa.decimal128(12, 5)) |
| expected = [decimal.Decimal('0.01000'), decimal.Decimal('0.00100')] |
| assert array.to_pylist() == expected |
| |
| def test_decimal_with_None_explicit_type(self): |
| series = pd.Series([decimal.Decimal('3.14'), None]) |
| _check_series_roundtrip(series, type_=pa.decimal128(12, 5)) |
| |
| # Test that having all None values still produces decimal array |
| series = pd.Series([None] * 2) |
| _check_series_roundtrip(series, type_=pa.decimal128(12, 5)) |
| |
| def test_decimal_with_None_infer_type(self): |
| series = pd.Series([decimal.Decimal('3.14'), None]) |
| _check_series_roundtrip(series, expected_pa_type=pa.decimal128(3, 2)) |
| |
| def test_strided_objects(self, tmpdir): |
| # see ARROW-3053 |
| data = { |
| 'a': {0: 'a'}, |
| 'b': {0: decimal.Decimal('0.0')} |
| } |
| |
| # This yields strided objects |
| df = pd.DataFrame.from_dict(data) |
| _check_pandas_roundtrip(df) |
| |
| |
| class TestConvertListTypes(object): |
| """ |
| Conversion tests for list<> types. |
| """ |
| |
| def test_column_of_arrays(self): |
| df, schema = dataframe_with_arrays() |
| _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) |
| table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) |
| |
| # schema's metadata is generated by from_pandas conversion |
| expected_schema = schema.add_metadata(table.schema.metadata) |
| assert table.schema.equals(expected_schema) |
| |
| for column in df.columns: |
| field = schema.field_by_name(column) |
| _check_array_roundtrip(df[column], type=field.type) |
| |
| def test_column_of_arrays_to_py(self): |
| # Test regression in ARROW-1199 not caught in above test |
| dtype = 'i1' |
| arr = np.array([ |
| np.arange(10, dtype=dtype), |
| np.arange(5, dtype=dtype), |
| None, |
| np.arange(1, dtype=dtype) |
| ]) |
| type_ = pa.list_(pa.int8()) |
| parr = pa.array(arr, type=type_) |
| |
| assert parr[0].as_py() == list(range(10)) |
| assert parr[1].as_py() == list(range(5)) |
| assert parr[2].as_py() is None |
| assert parr[3].as_py() == [0] |
| |
| def test_column_of_boolean_list(self): |
| # ARROW-4370: Table to pandas conversion fails for list of bool |
| array = pa.array([[True, False], [True]], type=pa.list_(pa.bool_())) |
| table = pa.Table.from_arrays([array], names=['col1']) |
| df = table.to_pandas() |
| |
| expected_df = pd.DataFrame({'col1': [[True, False], [True]]}) |
| tm.assert_frame_equal(df, expected_df) |
| |
| def test_column_of_decimal_list(self): |
| array = pa.array([[decimal.Decimal('1'), decimal.Decimal('2')], |
| [decimal.Decimal('3.3')]], |
| type=pa.list_(pa.decimal128(2, 1))) |
| table = pa.Table.from_arrays([array], names=['col1']) |
| df = table.to_pandas() |
| |
| expected_df = pd.DataFrame( |
| {'col1': [[decimal.Decimal('1'), decimal.Decimal('2')], |
| [decimal.Decimal('3.3')]]}) |
| tm.assert_frame_equal(df, expected_df) |
| |
| def test_column_of_lists(self): |
| df, schema = dataframe_with_lists() |
| _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) |
| table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) |
| |
| # schema's metadata is generated by from_pandas conversion |
| expected_schema = schema.add_metadata(table.schema.metadata) |
| assert table.schema.equals(expected_schema) |
| |
| for column in df.columns: |
| field = schema.field_by_name(column) |
| _check_array_roundtrip(df[column], type=field.type) |
| |
| def test_column_of_lists_first_empty(self): |
| # ARROW-2124 |
| num_lists = [[], [2, 3, 4], [3, 6, 7, 8], [], [2]] |
| series = pd.Series([np.array(s, dtype=float) for s in num_lists]) |
| arr = pa.array(series) |
| result = pd.Series(arr.to_pandas()) |
| tm.assert_series_equal(result, series) |
| |
| def test_column_of_lists_chunked(self): |
| # ARROW-1357 |
| df = pd.DataFrame({ |
| 'lists': np.array([ |
| [1, 2], |
| None, |
| [2, 3], |
| [4, 5], |
| [6, 7], |
| [8, 9] |
| ], dtype=object) |
| }) |
| |
| schema = pa.schema([ |
| pa.field('lists', pa.list_(pa.int64())) |
| ]) |
| |
| t1 = pa.Table.from_pandas(df[:2], schema=schema) |
| t2 = pa.Table.from_pandas(df[2:], schema=schema) |
| |
| table = pa.concat_tables([t1, t2]) |
| result = table.to_pandas() |
| |
| tm.assert_frame_equal(result, df) |
| |
| def test_column_of_lists_chunked2(self): |
| data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11], |
| [12, 13], [14, 15], [16, 17]] |
| data2 = [[8, 9], [18, 19]] |
| |
| a1 = pa.array(data1) |
| a2 = pa.array(data2) |
| |
| t1 = pa.Table.from_arrays([a1], names=['a']) |
| t2 = pa.Table.from_arrays([a2], names=['a']) |
| |
| concatenated = pa.concat_tables([t1, t2]) |
| |
| result = concatenated.to_pandas() |
| expected = pd.DataFrame({'a': data1 + data2}) |
| |
| tm.assert_frame_equal(result, expected) |
| |
| def test_column_of_lists_strided(self): |
| df, schema = dataframe_with_lists() |
| df = pd.concat([df] * 6, ignore_index=True) |
| |
| arr = df['int64'].values[::3] |
| assert arr.strides[0] != 8 |
| |
| _check_array_roundtrip(arr) |
| |
| def test_nested_lists_all_none(self): |
| data = np.array([[None, None], None], dtype=object) |
| |
| arr = pa.array(data) |
| expected = pa.array(list(data)) |
| assert arr.equals(expected) |
| assert arr.type == pa.list_(pa.null()) |
| |
| data2 = np.array([None, None, [None, None], |
| np.array([None, None], dtype=object)], |
| dtype=object) |
| arr = pa.array(data2) |
| expected = pa.array([None, None, [None, None], [None, None]]) |
| assert arr.equals(expected) |
| |
| def test_nested_lists_all_empty(self): |
| # ARROW-2128 |
| data = pd.Series([[], [], []]) |
| arr = pa.array(data) |
| expected = pa.array(list(data)) |
| assert arr.equals(expected) |
| assert arr.type == pa.list_(pa.null()) |
| |
| def test_nested_list_first_empty(self): |
| # ARROW-2711 |
| data = pd.Series([[], [u"a"]]) |
| arr = pa.array(data) |
| expected = pa.array(list(data)) |
| assert arr.equals(expected) |
| assert arr.type == pa.list_(pa.string()) |
| |
| def test_nested_smaller_ints(self): |
        # ARROW-1345, ARROW-2008: earlier versions had type inference bugs
        # for nested arrays of smaller integer and float types
| data = pd.Series([np.array([1, 2, 3], dtype='i1'), None]) |
| result = pa.array(data) |
| result2 = pa.array(data.values) |
| expected = pa.array([[1, 2, 3], None], type=pa.list_(pa.int8())) |
| assert result.equals(expected) |
| assert result2.equals(expected) |
| |
| data3 = pd.Series([np.array([1, 2, 3], dtype='f4'), None]) |
| result3 = pa.array(data3) |
| expected3 = pa.array([[1, 2, 3], None], type=pa.list_(pa.float32())) |
| assert result3.equals(expected3) |
| |
| def test_infer_lists(self): |
| data = OrderedDict([ |
| ('nan_ints', [[None, 1], [2, 3]]), |
| ('ints', [[0, 1], [2, 3]]), |
| ('strs', [[None, u'b'], [u'c', u'd']]), |
| ('nested_strs', [[[None, u'b'], [u'c', u'd']], None]) |
| ]) |
| df = pd.DataFrame(data) |
| |
| expected_schema = pa.schema([ |
| pa.field('nan_ints', pa.list_(pa.int64())), |
| pa.field('ints', pa.list_(pa.int64())), |
| pa.field('strs', pa.list_(pa.string())), |
| pa.field('nested_strs', pa.list_(pa.list_(pa.string()))) |
| ]) |
| |
| _check_pandas_roundtrip(df, expected_schema=expected_schema) |
| |
| def test_infer_numpy_array(self): |
| data = OrderedDict([ |
| ('ints', [ |
| np.array([0, 1], dtype=np.int64), |
| np.array([2, 3], dtype=np.int64) |
| ]) |
| ]) |
| df = pd.DataFrame(data) |
| expected_schema = pa.schema([ |
| pa.field('ints', pa.list_(pa.int64())) |
| ]) |
| |
| _check_pandas_roundtrip(df, expected_schema=expected_schema) |
| |
| @pytest.mark.parametrize('t,data,expected', [ |
| ( |
| pa.int64, |
| [[1, 2], [3], None], |
| [None, [3], None] |
| ), |
| ( |
| pa.string, |
| [[u'aaa', u'bb'], [u'c'], None], |
| [None, [u'c'], None] |
| ), |
| ( |
| pa.null, |
| [[None, None], [None], None], |
| [None, [None], None] |
| ) |
| ]) |
| def test_array_from_pandas_typed_array_with_mask(self, t, data, expected): |
| m = np.array([True, False, True]) |
| |
| s = pd.Series(data) |
| result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t())) |
| |
| assert pa.Array.from_pandas(expected, |
| type=pa.list_(t())).equals(result) |
| |
| def test_empty_list_roundtrip(self): |
| empty_list_array = np.empty((3,), dtype=object) |
| empty_list_array.fill([]) |
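        # ndarray.fill() stores the same empty-list object in every slot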
| |
| df = pd.DataFrame({'a': np.array(['1', '2', '3']), |
| 'b': empty_list_array}) |
| tbl = pa.Table.from_pandas(df) |
| |
| result = tbl.to_pandas() |
| |
| tm.assert_frame_equal(result, df) |
| |
| def test_array_from_nested_arrays(self): |
| df, schema = dataframe_with_arrays() |
| for field in schema: |
| arr = df[field.name].values |
| expected = pa.array(list(arr), type=field.type) |
| result = pa.array(arr) |
| assert result.type == field.type # == list<scalar> |
| assert result.equals(expected) |
| |
| |
| class TestConvertStructTypes(object): |
| """ |
| Conversion tests for struct types. |
| """ |
| |
| def test_pandas_roundtrip(self): |
| df = pd.DataFrame({'dicts': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]}) |
| |
| expected_schema = pa.schema([ |
| ('dicts', pa.struct([('a', pa.int64()), ('b', pa.int64())])), |
| ]) |
| |
| _check_pandas_roundtrip(df, expected_schema=expected_schema) |
| |
| # specifying schema explicitly in from_pandas |
| _check_pandas_roundtrip( |
| df, schema=expected_schema, expected_schema=expected_schema) |
| |
| def test_to_pandas(self): |
| ints = pa.array([None, 2, 3], type=pa.int64()) |
| strs = pa.array([u'a', None, u'c'], type=pa.string()) |
| bools = pa.array([True, False, None], type=pa.bool_()) |
| arr = pa.StructArray.from_arrays( |
| [ints, strs, bools], |
| ['ints', 'strs', 'bools']) |
| |
| expected = pd.Series([ |
| {'ints': None, 'strs': u'a', 'bools': True}, |
| {'ints': 2, 'strs': None, 'bools': False}, |
| {'ints': 3, 'strs': u'c', 'bools': None}, |
| ]) |
| |
| series = pd.Series(arr.to_pandas()) |
| tm.assert_series_equal(series, expected) |
| |
| def test_from_numpy(self): |
| dt = np.dtype([('x', np.int32), |
| (('y_title', 'y'), np.bool_)]) |
| ty = pa.struct([pa.field('x', pa.int32()), |
| pa.field('y', pa.bool_())]) |
| |
| data = np.array([], dtype=dt) |
| arr = pa.array(data, type=ty) |
| assert arr.to_pylist() == [] |
| |
| data = np.array([(42, True), (43, False)], dtype=dt) |
| arr = pa.array(data, type=ty) |
| assert arr.to_pylist() == [{'x': 42, 'y': True}, |
| {'x': 43, 'y': False}] |
| |
| # With mask |
| arr = pa.array(data, mask=np.bool_([False, True]), type=ty) |
| assert arr.to_pylist() == [{'x': 42, 'y': True}, None] |
| |
| # Trivial struct type |
| dt = np.dtype([]) |
| ty = pa.struct([]) |
| |
| data = np.array([], dtype=dt) |
| arr = pa.array(data, type=ty) |
| assert arr.to_pylist() == [] |
| |
| data = np.array([(), ()], dtype=dt) |
| arr = pa.array(data, type=ty) |
| assert arr.to_pylist() == [{}, {}] |
| |
| def test_from_numpy_nested(self): |
| # Note: an object field inside a struct |
| dt = np.dtype([('x', np.dtype([('xx', np.int8), |
| ('yy', np.bool_)])), |
| ('y', np.int16), |
| ('z', np.object_)]) |
| # Note: itemsize is not a multiple of sizeof(object) |
| assert dt.itemsize == 12 |
| ty = pa.struct([pa.field('x', pa.struct([pa.field('xx', pa.int8()), |
| pa.field('yy', pa.bool_())])), |
| pa.field('y', pa.int16()), |
| pa.field('z', pa.string())]) |
| |
| data = np.array([], dtype=dt) |
| arr = pa.array(data, type=ty) |
| assert arr.to_pylist() == [] |
| |
| data = np.array([ |
| ((1, True), 2, 'foo'), |
| ((3, False), 4, 'bar')], dtype=dt) |
| arr = pa.array(data, type=ty) |
| assert arr.to_pylist() == [ |
| {'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'}, |
| {'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}] |
| |
| @pytest.mark.large_memory |
| def test_from_numpy_large(self): |
| # Exercise rechunking + nulls |
        target_size = 3 * 1024**3  # 3GB
| dt = np.dtype([('x', np.float64), ('y', 'object')]) |
| bs = 65536 - dt.itemsize |
| block = b'.' * bs |
| n = target_size // (bs + dt.itemsize) |
| data = np.zeros(n, dtype=dt) |
| data['x'] = np.random.random_sample(n) |
| data['y'] = block |
| # Add implicit nulls |
| data['x'][data['x'] < 0.2] = np.nan |
| |
| ty = pa.struct([pa.field('x', pa.float64()), |
| pa.field('y', pa.binary(bs))]) |
| arr = pa.array(data, type=ty, from_pandas=True) |
| assert arr.num_chunks == 2 |
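        # The ~3GB of fixed-size binary data cannot fit in a single Arrow
        # array (presumably due to the ~2GB per-chunk capacity limit), so
        # the conversion is expected to produce two chunks.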
| |
| def iter_chunked_array(arr): |
| for chunk in arr.iterchunks(): |
| for item in chunk: |
| yield item |
| |
| def check(arr, data, mask=None): |
| assert len(arr) == len(data) |
| xs = data['x'] |
| ys = data['y'] |
| for i, obj in enumerate(iter_chunked_array(arr)): |
| try: |
| d = obj.as_py() |
| if mask is not None and mask[i]: |
| assert d is None |
| else: |
| x = xs[i] |
| if np.isnan(x): |
| assert d['x'] is None |
| else: |
| assert d['x'] == x |
| assert d['y'] == ys[i] |
| except Exception: |
| print("Failed at index", i) |
| raise |
| |
| check(arr, data) |
| del arr |
| |
| # Now with explicit mask |
| mask = np.random.random_sample(n) < 0.2 |
| arr = pa.array(data, type=ty, mask=mask, from_pandas=True) |
| assert arr.num_chunks == 2 |
| |
| check(arr, data, mask) |
| del arr |
| |
| def test_from_numpy_bad_input(self): |
| ty = pa.struct([pa.field('x', pa.int32()), |
| pa.field('y', pa.bool_())]) |
| dt = np.dtype([('x', np.int32), |
| ('z', np.bool_)]) |
| |
| data = np.array([], dtype=dt) |
| with pytest.raises(TypeError, |
| match="Missing field 'y'"): |
| pa.array(data, type=ty) |
| data = np.int32([]) |
| with pytest.raises(TypeError, |
| match="Expected struct array"): |
| pa.array(data, type=ty) |
| |
| def test_from_tuples(self): |
| df = pd.DataFrame({'tuples': [(1, 2), (3, 4)]}) |
| expected_df = pd.DataFrame( |
| {'tuples': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]}) |
| |
| # conversion from tuples works when specifying expected struct type |
| struct_type = pa.struct([('a', pa.int64()), ('b', pa.int64())]) |
| |
| arr = np.asarray(df['tuples']) |
| _check_array_roundtrip( |
| arr, expected=expected_df['tuples'], type=struct_type) |
| |
| expected_schema = pa.schema([('tuples', struct_type)]) |
| _check_pandas_roundtrip( |
| df, expected=expected_df, schema=expected_schema, |
| expected_schema=expected_schema) |
| |
| |
| class TestZeroCopyConversion(object): |
| """ |
| Tests that zero-copy conversion works with some types. |
| """ |
| |
| def test_zero_copy_success(self): |
| result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True) |
| npt.assert_array_equal(result, [0, 1, 2]) |
| |
| def test_zero_copy_dictionaries(self): |
| arr = pa.DictionaryArray.from_arrays( |
| np.array([0, 0]), |
| np.array([5])) |
| |
| result = arr.to_pandas(zero_copy_only=True) |
| values = pd.Categorical([5, 5]) |
| |
| tm.assert_series_equal(pd.Series(result), pd.Series(values), |
| check_names=False) |
| |
| def check_zero_copy_failure(self, arr): |
| with pytest.raises(pa.ArrowInvalid): |
| arr.to_pandas(zero_copy_only=True) |
| |
| def test_zero_copy_failure_on_object_types(self): |
| self.check_zero_copy_failure(pa.array(['A', 'B', 'C'])) |
| |
| def test_zero_copy_failure_with_int_when_nulls(self): |
| self.check_zero_copy_failure(pa.array([0, 1, None])) |
| |
| def test_zero_copy_failure_with_float_when_nulls(self): |
| self.check_zero_copy_failure(pa.array([0.0, 1.0, None])) |
| |
| def test_zero_copy_failure_on_bool_types(self): |
| self.check_zero_copy_failure(pa.array([True, False])) |
| |
| def test_zero_copy_failure_on_list_types(self): |
| arr = pa.array([[1, 2], [8, 9]], type=pa.list_(pa.int64())) |
| self.check_zero_copy_failure(arr) |
| |
| def test_zero_copy_failure_on_timestamp_types(self): |
| arr = np.array(['2007-07-13'], dtype='datetime64[ns]') |
| self.check_zero_copy_failure(pa.array(arr)) |
| |
| |
| # This function must be at the top-level for Python 2.7's multiprocessing |
| def _non_threaded_conversion(): |
| df = _alltypes_example() |
| _check_pandas_roundtrip(df, use_threads=False) |
| _check_pandas_roundtrip(df, use_threads=False, as_batch=True) |
| |
| |
| def _threaded_conversion(): |
| df = _alltypes_example() |
| _check_pandas_roundtrip(df, use_threads=True) |
| _check_pandas_roundtrip(df, use_threads=True, as_batch=True) |
| |
| |
| class TestConvertMisc(object): |
| """ |
| Miscellaneous conversion tests. |
| """ |
| |
| type_pairs = [ |
| (np.int8, pa.int8()), |
| (np.int16, pa.int16()), |
| (np.int32, pa.int32()), |
| (np.int64, pa.int64()), |
| (np.uint8, pa.uint8()), |
| (np.uint16, pa.uint16()), |
| (np.uint32, pa.uint32()), |
| (np.uint64, pa.uint64()), |
| (np.float16, pa.float16()), |
| (np.float32, pa.float32()), |
| (np.float64, pa.float64()), |
| # XXX unsupported |
| # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])), |
| (np.object, pa.string()), |
| (np.object, pa.binary()), |
| (np.object, pa.binary(10)), |
| (np.object, pa.list_(pa.int64())), |
| ] |
| |
| def test_all_none_objects(self): |
| df = pd.DataFrame({'a': [None, None, None]}) |
| _check_pandas_roundtrip(df) |
| |
| def test_all_none_category(self): |
| df = pd.DataFrame({'a': [None, None, None]}) |
| df['a'] = df['a'].astype('category') |
| _check_pandas_roundtrip(df) |
| |
| def test_empty_arrays(self): |
| for dtype, pa_type in self.type_pairs: |
| arr = np.array([], dtype=dtype) |
| _check_array_roundtrip(arr, type=pa_type) |
| |
| def test_non_threaded_conversion(self): |
| _non_threaded_conversion() |
| |
| def test_threaded_conversion_multiprocess(self): |
| # Parallel conversion should work from child processes too (ARROW-2963) |
| pool = mp.Pool(2) |
| try: |
| pool.apply(_threaded_conversion) |
| finally: |
| pool.close() |
| pool.join() |
| |
| def test_category(self): |
| repeats = 5 |
| v1 = ['foo', None, 'bar', 'qux', np.nan] |
| v2 = [4, 5, 6, 7, 8] |
| v3 = [b'foo', None, b'bar', b'qux', np.nan] |
| |
| arrays = { |
| 'cat_strings': pd.Categorical(v1 * repeats), |
| 'cat_strings_with_na': pd.Categorical(v1 * repeats, |
| categories=['foo', 'bar']), |
| 'cat_ints': pd.Categorical(v2 * repeats), |
| 'cat_binary': pd.Categorical(v3 * repeats), |
| 'cat_strings_ordered': pd.Categorical( |
| v1 * repeats, categories=['bar', 'qux', 'foo'], |
| ordered=True), |
| 'ints': v2 * repeats, |
| 'ints2': v2 * repeats, |
| 'strings': v1 * repeats, |
| 'strings2': v1 * repeats, |
| 'strings3': v3 * repeats} |
| df = pd.DataFrame(arrays) |
| _check_pandas_roundtrip(df) |
| |
| for k in arrays: |
| _check_array_roundtrip(arrays[k]) |
| |
| def test_category_implicit_from_pandas(self): |
| # ARROW-3374 |
| def _check(v): |
| arr = pa.array(v) |
| result = arr.to_pandas() |
| tm.assert_series_equal(pd.Series(result), pd.Series(v)) |
| |
| arrays = [ |
| pd.Categorical(['a', 'b', 'c'], categories=['a', 'b']), |
| pd.Categorical(['a', 'b', 'c'], categories=['a', 'b'], |
| ordered=True) |
| ] |
| for arr in arrays: |
| _check(arr) |
| |
| def test_empty_category(self): |
| # ARROW-2443 |
| df = pd.DataFrame({'cat': pd.Categorical([])}) |
| _check_pandas_roundtrip(df) |
| |
| def test_mixed_types_fails(self): |
| data = pd.DataFrame({'a': ['a', 1, 2.0]}) |
| with pytest.raises(pa.ArrowTypeError): |
| pa.Table.from_pandas(data) |
| |
| data = pd.DataFrame({'a': [1, True]}) |
| with pytest.raises(pa.ArrowTypeError): |
| pa.Table.from_pandas(data) |
| |
| data = pd.DataFrame({'a': ['a', 1, 2.0]}) |
| expected_msg = 'Conversion failed for column a' |
| with pytest.raises(pa.ArrowTypeError, match=expected_msg): |
| pa.Table.from_pandas(data) |
| |
| def test_strided_data_import(self): |
| cases = [] |
| |
| columns = ['a', 'b', 'c'] |
| N, K = 100, 3 |
| random_numbers = np.random.randn(N, K).copy() * 100 |
| |
| numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', |
| 'f4', 'f8'] |
| |
| for type_name in numeric_dtypes: |
| cases.append(random_numbers.astype(type_name)) |
| |
| # strings |
| cases.append(np.array([tm.rands(10) for i in range(N * K)], |
| dtype=object) |
| .reshape(N, K).copy()) |
| |
| # booleans |
| boolean_objects = (np.array([True, False, True] * N, dtype=object) |
| .reshape(N, K).copy()) |
| |
        # add some nulls, so the dtype comes back as object
| boolean_objects[5] = None |
| cases.append(boolean_objects) |
| |
| cases.append(np.arange("2016-01-01T00:00:00.001", N * K, |
| dtype='datetime64[ms]') |
| .reshape(N, K).copy()) |
| |
| strided_mask = (random_numbers > 0).astype(bool)[:, 0] |
| |
| for case in cases: |
| df = pd.DataFrame(case, columns=columns) |
| col = df['a'] |
| |
| _check_pandas_roundtrip(df) |
| _check_array_roundtrip(col) |
| _check_array_roundtrip(col, mask=strided_mask) |
| |
| def test_all_nones(self): |
| def _check_series(s): |
| converted = pa.array(s) |
| assert isinstance(converted, pa.NullArray) |
| assert len(converted) == 3 |
| assert converted.null_count == 3 |
| for item in converted: |
| assert item is pa.NA |
| |
| _check_series(pd.Series([None] * 3, dtype=object)) |
| _check_series(pd.Series([np.nan] * 3, dtype=object)) |
| _check_series(pd.Series([None, np.nan, None], dtype=object)) |
| |
| def test_partial_schema(self): |
| data = OrderedDict([ |
| ('a', [0, 1, 2, 3, 4]), |
| ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)), |
| ('c', [-10, -5, 0, 5, 10]) |
| ]) |
| df = pd.DataFrame(data) |
| |
| partial_schema = pa.schema([ |
| pa.field('c', pa.int64()), |
| pa.field('a', pa.int64()) |
| ]) |
| |
| _check_pandas_roundtrip(df, schema=partial_schema, |
| expected_schema=partial_schema) |
| |
| def test_table_batch_empty_dataframe(self): |
| df = pd.DataFrame({}) |
| _check_pandas_roundtrip(df) |
| _check_pandas_roundtrip(df, as_batch=True) |
| |
| df2 = pd.DataFrame({}, index=[0, 1, 2]) |
| _check_pandas_roundtrip(df2, preserve_index=True) |
| _check_pandas_roundtrip(df2, as_batch=True, preserve_index=True) |
| |
| def test_convert_empty_table(self): |
| arr = pa.array([], type=pa.int64()) |
| tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=np.int64)) |
| arr = pa.array([], type=pa.string()) |
| tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) |
| arr = pa.array([], type=pa.list_(pa.int64())) |
| tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) |
| arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())])) |
| tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) |
| |
| def test_non_natural_stride(self): |
| """ |
| ARROW-2172: converting from a Numpy array with a stride that's |
| not a multiple of itemsize. |
| """ |
| dtype = np.dtype([('x', np.int32), ('y', np.int16)]) |
| data = np.array([(42, -1), (-43, 2)], dtype=dtype) |
| assert data.strides == (6,) |
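        # The record itemsize is 6 bytes (int32 + int16), so the field
        # views data['x'] and data['y'] are strided by 6 bytes, not a
        # multiple of their own itemsize (4 and 2 bytes respectively).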
| arr = pa.array(data['x'], type=pa.int32()) |
| assert arr.to_pylist() == [42, -43] |
| arr = pa.array(data['y'], type=pa.int16()) |
| assert arr.to_pylist() == [-1, 2] |
| |
| def test_safe_unsafe_casts(self): |
| # ARROW-2799 |
| df = pd.DataFrame({ |
| 'A': list('abc'), |
| 'B': np.linspace(0, 1, 3) |
| }) |
| |
| schema = pa.schema([ |
| pa.field('A', pa.string()), |
| pa.field('B', pa.int32()) |
| ]) |
| |
| with pytest.raises(ValueError): |
| pa.Table.from_pandas(df, schema=schema) |
| |
| table = pa.Table.from_pandas(df, schema=schema, safe=False) |
| assert table.column('B').type == pa.int32() |
| |
| def test_error_sparse(self): |
| # ARROW-2818 |
| df = pd.DataFrame({'a': pd.SparseArray([1, np.nan, 3])}) |
| with pytest.raises(TypeError, match="Sparse pandas data"): |
| pa.Table.from_pandas(df) |
| |
| |
| def test_safe_cast_from_float_with_nans_to_int(): |
| # TODO(kszucs): write tests for creating Date32 and Date64 arrays, see |
| # ARROW-4258 and https://github.com/apache/arrow/pull/3395 |
| values = pd.Series([1, 2, None, 4]) |
| arr = pa.Array.from_pandas(values, type=pa.int32(), safe=True) |
| expected = pa.array([1, 2, None, 4], type=pa.int32()) |
| assert arr.equals(expected) |
| |
| |
| def _fully_loaded_dataframe_example(): |
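    # Build a DataFrame exercising a wide range of pandas column types
    # (naive and tz-aware datetimes, categoricals, booleans, floats, ints,
    # periods and, on pandas >= 0.21, intervals) on top of a MultiIndex.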
| index = pd.MultiIndex.from_arrays([ |
| pd.date_range('2000-01-01', periods=5).repeat(2), |
| np.tile(np.array(['foo', 'bar'], dtype=object), 5) |
| ]) |
| |
| c1 = pd.date_range('2000-01-01', periods=10) |
| data = { |
| 0: c1, |
| 1: c1.tz_localize('utc'), |
| 2: c1.tz_localize('US/Eastern'), |
| 3: c1[::2].tz_localize('utc').repeat(2).astype('category'), |
| 4: ['foo', 'bar'] * 5, |
| 5: pd.Series(['foo', 'bar'] * 5).astype('category').values, |
| 6: [True, False] * 5, |
| 7: np.random.randn(10), |
| 8: np.random.randint(0, 100, size=10), |
| 9: pd.period_range('2013', periods=10, freq='M') |
| } |
| |
| if LooseVersion(pd.__version__) >= '0.21': |
| # There is an issue with pickling IntervalIndex in pandas 0.20.x |
| data[10] = pd.interval_range(start=1, freq=1, periods=10) |
| |
| return pd.DataFrame(data, index=index) |
| |
| |
| @pytest.mark.parametrize('columns', ([b'foo'], ['foo'])) |
| def test_roundtrip_with_bytes_unicode(columns): |
| df = pd.DataFrame(columns=columns) |
| table1 = pa.Table.from_pandas(df) |
| table2 = pa.Table.from_pandas(table1.to_pandas()) |
| assert table1.equals(table2) |
| assert table1.schema.equals(table2.schema) |
| assert table1.schema.metadata == table2.schema.metadata |
| |
| |
| def _check_serialize_components_roundtrip(df): |
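    # Serialize the DataFrame into its buffer components and rebuild it,
    # checking that the roundtrip preserves the frame exactly.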
| ctx = pa.default_serialization_context() |
| |
| components = ctx.serialize(df).to_components() |
| deserialized = ctx.deserialize_components(components) |
| |
| tm.assert_frame_equal(df, deserialized) |
| |
| |
@pytest.mark.skipif(LooseVersion(np.__version__) >= '1.16',
                    reason='Until numpy/numpy#12745 is resolved')
| def test_serialize_deserialize_pandas(): |
| # ARROW-1784, serialize and deserialize DataFrame by decomposing |
| # BlockManager |
| df = _fully_loaded_dataframe_example() |
| _check_serialize_components_roundtrip(df) |
| |
| |
| def _pytime_from_micros(val): |
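    # Decompose a count of microseconds since midnight into a datetime.time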
| microseconds = val % 1000000 |
| val //= 1000000 |
| seconds = val % 60 |
| val //= 60 |
| minutes = val % 60 |
| hours = val // 60 |
| return time(hours, minutes, seconds, microseconds) |
| |
| |
| def _pytime_to_micros(pytime): |
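    # Inverse of _pytime_from_micros, e.g.
    # _pytime_to_micros(_pytime_from_micros(3723000004)) == 3723000004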
| return (pytime.hour * 3600000000 + |
| pytime.minute * 60000000 + |
| pytime.second * 1000000 + |
| pytime.microsecond) |
| |
| |
| def test_convert_unsupported_type_error_message(): |
| # ARROW-1454 |
| |
| df = pd.DataFrame({ |
| 't1': pd.date_range('2000-01-01', periods=20), |
| 't2': pd.date_range('2000-05-01', periods=20) |
| }) |
| |
    # timedelta64 is not yet supported
| df['diff'] = df.t2 - df.t1 |
| |
| expected_msg = 'Conversion failed for column diff with type timedelta64' |
| with pytest.raises(pa.ArrowNotImplementedError, match=expected_msg): |
| pa.Table.from_pandas(df) |
| |
| |
| # ---------------------------------------------------------------------- |
| # Test object deduplication in to_pandas |
| |
| |
| def _generate_dedup_example(nunique, repeats): |
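    # Return a list of `nunique` distinct random strings, with the whole
    # set of values repeated `repeats` times.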
| unique_values = [tm.rands(10) for i in range(nunique)] |
| return unique_values * repeats |
| |
| |
| def _assert_nunique(obj, expected): |
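    # Count distinct Python object identities; with deduplication enabled,
    # repeated values should be backed by the same Python objects.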
| assert len({id(x) for x in obj}) == expected |
| |
| |
| def test_to_pandas_deduplicate_strings_array_types(): |
| nunique = 100 |
| repeats = 10 |
| values = _generate_dedup_example(nunique, repeats) |
| |
| for arr in [pa.array(values, type=pa.binary()), |
| pa.array(values, type=pa.utf8()), |
| pa.chunked_array([values, values]), |
| pa.column('foo', [values, values])]: |
| _assert_nunique(arr.to_pandas(), nunique) |
| _assert_nunique(arr.to_pandas(deduplicate_objects=False), len(arr)) |
| |
| |
| def test_to_pandas_deduplicate_strings_table_types(): |
| nunique = 100 |
| repeats = 10 |
| values = _generate_dedup_example(nunique, repeats) |
| |
| arr = pa.array(values) |
| rb = pa.RecordBatch.from_arrays([arr], ['foo']) |
| tbl = pa.Table.from_batches([rb]) |
| |
| for obj in [rb, tbl]: |
| _assert_nunique(obj.to_pandas()['foo'], nunique) |
| _assert_nunique(obj.to_pandas(deduplicate_objects=False)['foo'], |
| len(obj)) |
| |
| |
| def test_to_pandas_deduplicate_integers_as_objects(): |
| nunique = 100 |
| repeats = 10 |
| |
    # Python automatically interns small integers, so use large values
| unique_values = list(np.random.randint(10000000, 1000000000, size=nunique)) |
| unique_values[nunique // 2] = None |
| |
| arr = pa.array(unique_values * repeats) |
| |
| _assert_nunique(arr.to_pandas(integer_object_nulls=True), nunique) |
| _assert_nunique(arr.to_pandas(integer_object_nulls=True, |
| deduplicate_objects=False), |
| # Account for None |
| (nunique - 1) * repeats + 1) |
| |
| |
| def test_to_pandas_deduplicate_date_time(): |
| nunique = 100 |
| repeats = 10 |
| |
| unique_values = list(range(nunique)) |
| |
| cases = [ |
| # raw type, array type, to_pandas options |
| ('int32', 'date32', {'date_as_object': True}), |
| ('int64', 'date64', {'date_as_object': True}), |
| ('int32', 'time32[ms]', {}), |
| ('int64', 'time64[us]', {}) |
| ] |
| |
| for raw_type, array_type, pandas_options in cases: |
| raw_arr = pa.array(unique_values * repeats, type=raw_type) |
| casted_arr = raw_arr.cast(array_type) |
| |
| _assert_nunique(casted_arr.to_pandas(**pandas_options), |
| nunique) |
| _assert_nunique(casted_arr.to_pandas(deduplicate_objects=False, |
| **pandas_options), |
| len(casted_arr)) |
| |
| |
| # --------------------------------------------------------------------- |
| |
| def test_table_from_pandas_checks_field_nullability(): |
| # ARROW-2136 |
| df = pd.DataFrame({'a': [1.2, 2.1, 3.1], |
| 'b': [np.nan, 'string', 'foo']}) |
| schema = pa.schema([pa.field('a', pa.float64(), nullable=False), |
| pa.field('b', pa.utf8(), nullable=False)]) |
| |
| with pytest.raises(ValueError): |
| pa.Table.from_pandas(df, schema=schema) |
| |
| |
| def test_table_from_pandas_keeps_column_order_of_dataframe(): |
| df1 = pd.DataFrame(OrderedDict([ |
| ('partition', [0, 0, 1, 1]), |
| ('arrays', [[0, 1, 2], [3, 4], None, None]), |
| ('floats', [None, None, 1.1, 3.3]) |
| ])) |
| df2 = df1[['floats', 'partition', 'arrays']] |
| |
| schema1 = pa.schema([ |
| ('partition', pa.int64()), |
| ('arrays', pa.list_(pa.int64())), |
| ('floats', pa.float64()), |
| ]) |
| schema2 = pa.schema([ |
| ('floats', pa.float64()), |
| ('partition', pa.int64()), |
| ('arrays', pa.list_(pa.int64())) |
| ]) |
| |
| table1 = pa.Table.from_pandas(df1, preserve_index=False) |
| table2 = pa.Table.from_pandas(df2, preserve_index=False) |
| |
| assert table1.schema.equals(schema1, check_metadata=False) |
| assert table2.schema.equals(schema2, check_metadata=False) |
| |
| |
| def test_table_from_pandas_keeps_column_order_of_schema(): |
| # ARROW-3766 |
| df = pd.DataFrame(OrderedDict([ |
| ('partition', [0, 0, 1, 1]), |
| ('arrays', [[0, 1, 2], [3, 4], None, None]), |
| ('floats', [None, None, 1.1, 3.3]) |
| ])) |
| |
| schema = pa.schema([ |
| ('floats', pa.float64()), |
| ('arrays', pa.list_(pa.int32())), |
| ('partition', pa.int32()) |
| ]) |
| |
| df1 = df[df.partition == 0] |
| df2 = df[df.partition == 1][['floats', 'partition', 'arrays']] |
| |
| table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False) |
| table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False) |
| |
| assert table1.schema.equals(schema, check_metadata=False) |
| assert table1.schema.equals(table2.schema, check_metadata=False) |
| |
| |
| def test_table_from_pandas_columns_argument_only_does_filtering(): |
| df = pd.DataFrame(OrderedDict([ |
| ('partition', [0, 0, 1, 1]), |
| ('arrays', [[0, 1, 2], [3, 4], None, None]), |
| ('floats', [None, None, 1.1, 3.3]) |
| ])) |
| |
| columns1 = ['arrays', 'floats', 'partition'] |
| schema1 = pa.schema([ |
| ('arrays', pa.list_(pa.int64())), |
| ('floats', pa.float64()), |
| ('partition', pa.int64()) |
| ]) |
| |
| columns2 = ['floats', 'partition'] |
| schema2 = pa.schema([ |
| ('floats', pa.float64()), |
| ('partition', pa.int64()) |
| ]) |
| |
| table1 = pa.Table.from_pandas(df, columns=columns1, preserve_index=False) |
| table2 = pa.Table.from_pandas(df, columns=columns2, preserve_index=False) |
| |
| assert table1.schema.equals(schema1, check_metadata=False) |
| assert table2.schema.equals(schema2, check_metadata=False) |
| |
| |
| def test_table_from_pandas_columns_and_schema_are_mutually_exclusive(): |
| df = pd.DataFrame(OrderedDict([ |
| ('partition', [0, 0, 1, 1]), |
| ('arrays', [[0, 1, 2], [3, 4], None, None]), |
| ('floats', [None, None, 1.1, 3.3]) |
| ])) |
| schema = pa.schema([ |
| ('partition', pa.int32()), |
| ('arrays', pa.list_(pa.int32())), |
| ('floats', pa.float64()), |
| ]) |
| columns = ['arrays', 'floats'] |
| |
| with pytest.raises(ValueError): |
| pa.Table.from_pandas(df, schema=schema, columns=columns) |
| |
| |
| def test_table_from_pandas_keeps_schema_nullability(): |
| # ARROW-5169 |
| df = pd.DataFrame({'a': [1, 2, 3, 4]}) |
| |
| schema = pa.schema([ |
| pa.field('a', pa.int64(), nullable=False), |
| ]) |
| |
| table = pa.Table.from_pandas(df) |
| assert table.schema.field_by_name('a').nullable is True |
| table = pa.Table.from_pandas(df, schema=schema) |
| assert table.schema.field_by_name('a').nullable is False |
| |
| |
| # ---------------------------------------------------------------------- |
| # RecordBatch, Table |
| |
| |
| def test_recordbatch_from_to_pandas(): |
| data = pd.DataFrame({ |
| 'c1': np.array([1, 2, 3, 4, 5], dtype='int64'), |
| 'c2': np.array([1, 2, 3, 4, 5], dtype='uint32'), |
| 'c3': np.random.randn(5), |
| 'c4': ['foo', 'bar', None, 'baz', 'qux'], |
| 'c5': [False, True, False, True, False] |
| }) |
| |
| batch = pa.RecordBatch.from_pandas(data) |
| result = batch.to_pandas() |
| tm.assert_frame_equal(data, result) |
| |
| |
| def test_recordbatchlist_to_pandas(): |
| data1 = pd.DataFrame({ |
| 'c1': np.array([1, 1, 2], dtype='uint32'), |
| 'c2': np.array([1.0, 2.0, 3.0], dtype='float64'), |
| 'c3': [True, None, False], |
| 'c4': ['foo', 'bar', None] |
| }) |
| |
| data2 = pd.DataFrame({ |
| 'c1': np.array([3, 5], dtype='uint32'), |
| 'c2': np.array([4.0, 5.0], dtype='float64'), |
| 'c3': [True, True], |
| 'c4': ['baz', 'qux'] |
| }) |
| |
| batch1 = pa.RecordBatch.from_pandas(data1) |
| batch2 = pa.RecordBatch.from_pandas(data2) |
| |
| table = pa.Table.from_batches([batch1, batch2]) |
| result = table.to_pandas() |
| data = pd.concat([data1, data2]).reset_index(drop=True) |
| tm.assert_frame_equal(data, result) |
| |
| |
| # ---------------------------------------------------------------------- |
| # Metadata serialization |
| |
| |
| @pytest.mark.parametrize( |
| ('type', 'expected'), |
| [ |
| (pa.null(), 'empty'), |
| (pa.bool_(), 'bool'), |
| (pa.int8(), 'int8'), |
| (pa.int16(), 'int16'), |
| (pa.int32(), 'int32'), |
| (pa.int64(), 'int64'), |
| (pa.uint8(), 'uint8'), |
| (pa.uint16(), 'uint16'), |
| (pa.uint32(), 'uint32'), |
| (pa.uint64(), 'uint64'), |
| (pa.float16(), 'float16'), |
| (pa.float32(), 'float32'), |
| (pa.float64(), 'float64'), |
| (pa.date32(), 'date'), |
| (pa.date64(), 'date'), |
| (pa.binary(), 'bytes'), |
| (pa.binary(length=4), 'bytes'), |
| (pa.string(), 'unicode'), |
| (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), |
| (pa.decimal128(18, 3), 'decimal'), |
| (pa.timestamp('ms'), 'datetime'), |
| (pa.timestamp('us', 'UTC'), 'datetimetz'), |
| (pa.time32('s'), 'time'), |
| (pa.time64('us'), 'time') |
| ] |
| ) |
| def test_logical_type(type, expected): |
| assert get_logical_type(type) == expected |
| |
| |
| # ---------------------------------------------------------------------- |
# Some nested array tests
| |
| |
| def test_array_from_py_float32(): |
| data = [[1.2, 3.4], [9.0, 42.0]] |
| |
| t = pa.float32() |
| |
| arr1 = pa.array(data[0], type=t) |
| arr2 = pa.array(data, type=pa.list_(t)) |
| |
| expected1 = np.array(data[0], dtype=np.float32) |
| expected2 = pd.Series([np.array(data[0], dtype=np.float32), |
| np.array(data[1], dtype=np.float32)]) |
| |
| assert arr1.type == t |
| assert arr1.equals(pa.array(expected1)) |
| assert arr2.equals(pa.array(expected2)) |
| |
| |
| # ---------------------------------------------------------------------- |
| # Timestamp tests |
| |
| |
| def test_cast_timestamp_unit(): |
| # ARROW-1680 |
| val = datetime.now() |
| s = pd.Series([val]) |
| s_nyc = s.dt.tz_localize('tzlocal()').dt.tz_convert('America/New_York') |
| |
| us_with_tz = pa.timestamp('us', tz='America/New_York') |
| |
| arr = pa.Array.from_pandas(s_nyc, type=us_with_tz) |
| |
| # ARROW-1906 |
| assert arr.type == us_with_tz |
| |
| arr2 = pa.Array.from_pandas(s, type=pa.timestamp('us')) |
| |
| assert arr[0].as_py() == s_nyc[0] |
| assert arr2[0].as_py() == s[0] |
| |
| # Disallow truncation |
| arr = pa.array([123123], type='int64').cast(pa.timestamp('ms')) |
| expected = pa.array([123], type='int64').cast(pa.timestamp('s')) |
| |
| target = pa.timestamp('s') |
| with pytest.raises(ValueError): |
| arr.cast(target) |
| |
| result = arr.cast(target, safe=False) |
| assert result.equals(expected) |
| |
| # ARROW-1949 |
| series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)]) |
| expected = pa.array([0, 0, 1], type=pa.timestamp('us')) |
| |
| with pytest.raises(ValueError): |
| pa.array(series, type=pa.timestamp('us')) |
| |
| with pytest.raises(ValueError): |
| pa.Array.from_pandas(series, type=pa.timestamp('us')) |
| |
| result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False) |
| assert result.equals(expected) |
| |
| result = pa.array(series, type=pa.timestamp('us'), safe=False) |
| assert result.equals(expected) |
| |
| |
| # ---------------------------------------------------------------------- |
| # DictionaryArray tests |
| |
| |
| def test_dictionary_with_pandas(): |
| indices = np.repeat([0, 1, 2], 2) |
| dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) |
| mask = np.array([False, False, True, False, False, False]) |
| |
| d1 = pa.DictionaryArray.from_arrays(indices, dictionary) |
| d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask) |
| |
| pandas1 = d1.to_pandas() |
| ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary) |
| |
| tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1)) |
| |
| pandas2 = d2.to_pandas() |
| ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1, indices), |
| categories=dictionary) |
| |
| tm.assert_series_equal(pd.Series(pandas2), pd.Series(ex_pandas2)) |
| |
| |
| def test_variable_dictionary_with_pandas(): |
| a1 = pa.DictionaryArray.from_arrays([0, 1, 2], ['a', 'b', 'c']) |
| a2 = pa.DictionaryArray.from_arrays([0, 1], ['a', 'c']) |
| |
| a = pa.chunked_array([a1, a2]) |
| assert a.to_pylist() == ['a', 'b', 'c', 'a', 'c'] |
| with pytest.raises(NotImplementedError): |
| a.to_pandas() |
| |
| a = pa.chunked_array([a2, a1]) |
| assert a.to_pylist() == ['a', 'c', 'a', 'b', 'c'] |
| with pytest.raises(NotImplementedError): |
| a.to_pandas() |
| |
| |
| # ---------------------------------------------------------------------- |
| # Legacy metadata compatibility tests |
| |
| |
| def test_range_index_pre_0_12(): |
| # Forward compatibility for metadata created from pandas.RangeIndex |
| # prior to pyarrow 0.13.0 |
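    # Each case below hand-crafts the legacy 'pandas' schema metadata (the
    # index is serialized as an ordinary column) and checks that to_pandas
    # reconstructs the expected RangeIndex-based index.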
| a_values = [u'foo', u'bar', None, u'baz'] |
| b_values = [u'a', u'a', u'b', u'b'] |
| a_arrow = pa.array(a_values, type='utf8') |
| b_arrow = pa.array(b_values, type='utf8') |
| |
| rng_index_arrow = pa.array([0, 2, 4, 6], type='int64') |
| |
| gen_name_0 = '__index_level_0__' |
| gen_name_1 = '__index_level_1__' |
| |
| # Case 1: named RangeIndex |
| e1 = pd.DataFrame({ |
| 'a': a_values |
| }, index=pd.RangeIndex(0, 8, step=2, name='qux')) |
| t1 = pa.Table.from_arrays([a_arrow, rng_index_arrow], |
| names=['a', 'qux']) |
| t1 = t1.replace_schema_metadata({ |
| b'pandas': json.dumps( |
| {'index_columns': ['qux'], |
| 'column_indexes': [{'name': None, |
| 'field_name': None, |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': {'encoding': 'UTF-8'}}], |
| 'columns': [{'name': 'a', |
| 'field_name': 'a', |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': None}, |
| {'name': 'qux', |
| 'field_name': 'qux', |
| 'pandas_type': 'int64', |
| 'numpy_type': 'int64', |
| 'metadata': None}], |
| 'pandas_version': '0.23.4'} |
| )}) |
| r1 = t1.to_pandas() |
| tm.assert_frame_equal(r1, e1) |
| |
| # Case 2: named RangeIndex, but conflicts with an actual column |
| e2 = pd.DataFrame({ |
| 'qux': a_values |
| }, index=pd.RangeIndex(0, 8, step=2, name='qux')) |
| t2 = pa.Table.from_arrays([a_arrow, rng_index_arrow], |
| names=['qux', gen_name_0]) |
| t2 = t2.replace_schema_metadata({ |
| b'pandas': json.dumps( |
| {'index_columns': [gen_name_0], |
| 'column_indexes': [{'name': None, |
| 'field_name': None, |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': {'encoding': 'UTF-8'}}], |
| 'columns': [{'name': 'a', |
| 'field_name': 'a', |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': None}, |
| {'name': 'qux', |
| 'field_name': gen_name_0, |
| 'pandas_type': 'int64', |
| 'numpy_type': 'int64', |
| 'metadata': None}], |
| 'pandas_version': '0.23.4'} |
| )}) |
| r2 = t2.to_pandas() |
| tm.assert_frame_equal(r2, e2) |
| |
| # Case 3: unnamed RangeIndex |
| e3 = pd.DataFrame({ |
| 'a': a_values |
| }, index=pd.RangeIndex(0, 8, step=2, name=None)) |
| t3 = pa.Table.from_arrays([a_arrow, rng_index_arrow], |
| names=['a', gen_name_0]) |
| t3 = t3.replace_schema_metadata({ |
| b'pandas': json.dumps( |
| {'index_columns': [gen_name_0], |
| 'column_indexes': [{'name': None, |
| 'field_name': None, |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': {'encoding': 'UTF-8'}}], |
| 'columns': [{'name': 'a', |
| 'field_name': 'a', |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': None}, |
| {'name': None, |
| 'field_name': gen_name_0, |
| 'pandas_type': 'int64', |
| 'numpy_type': 'int64', |
| 'metadata': None}], |
| 'pandas_version': '0.23.4'} |
| )}) |
| r3 = t3.to_pandas() |
| tm.assert_frame_equal(r3, e3) |
| |
| # Case 4: MultiIndex with named RangeIndex |
| e4 = pd.DataFrame({ |
| 'a': a_values |
| }, index=[pd.RangeIndex(0, 8, step=2, name='qux'), b_values]) |
| t4 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow], |
| names=['a', 'qux', gen_name_1]) |
| t4 = t4.replace_schema_metadata({ |
| b'pandas': json.dumps( |
| {'index_columns': ['qux', gen_name_1], |
| 'column_indexes': [{'name': None, |
| 'field_name': None, |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': {'encoding': 'UTF-8'}}], |
| 'columns': [{'name': 'a', |
| 'field_name': 'a', |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': None}, |
| {'name': 'qux', |
| 'field_name': 'qux', |
| 'pandas_type': 'int64', |
| 'numpy_type': 'int64', |
| 'metadata': None}, |
| {'name': None, |
| 'field_name': gen_name_1, |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': None}], |
| 'pandas_version': '0.23.4'} |
| )}) |
| r4 = t4.to_pandas() |
| tm.assert_frame_equal(r4, e4) |
| |
    # Case 5: MultiIndex with unnamed RangeIndex
| e5 = pd.DataFrame({ |
| 'a': a_values |
| }, index=[pd.RangeIndex(0, 8, step=2, name=None), b_values]) |
| t5 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow], |
| names=['a', gen_name_0, gen_name_1]) |
| t5 = t5.replace_schema_metadata({ |
| b'pandas': json.dumps( |
| {'index_columns': [gen_name_0, gen_name_1], |
| 'column_indexes': [{'name': None, |
| 'field_name': None, |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': {'encoding': 'UTF-8'}}], |
| 'columns': [{'name': 'a', |
| 'field_name': 'a', |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': None}, |
| {'name': None, |
| 'field_name': gen_name_0, |
| 'pandas_type': 'int64', |
| 'numpy_type': 'int64', |
| 'metadata': None}, |
| {'name': None, |
| 'field_name': gen_name_1, |
| 'pandas_type': 'unicode', |
| 'numpy_type': 'object', |
| 'metadata': None}], |
| 'pandas_version': '0.23.4'} |
| )}) |
| r5 = t5.to_pandas() |
| tm.assert_frame_equal(r5, e5) |