# Source: python/pyarrow/tests/test_feather.py - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import io
 import os
 import sys
 import tempfile
 import pytest
 import hypothesis as h
 import hypothesis.strategies as st

 import numpy as np

 import pyarrow as pa
 import pyarrow.tests.strategies as past
 from pyarrow.feather import (read_feather, write_feather, read_table,
                              FeatherDataset)


 try:
     from pandas.testing import assert_frame_equal
     import pandas as pd
     import pyarrow.pandas_compat
 except ImportError:
     pass


 @pytest.fixture(scope='module')
 def datadir(base_datadir):
     """Return the 'feather' subdirectory of the base test data directory."""
     feather_dir = base_datadir / 'feather'
     return feather_dir


 def random_path(prefix='feather_'):
     """Return a unique path in the system temp directory without creating it.

     The file name is ``prefix`` followed by a random UUID in hex form.
     ``tempfile.mktemp`` is deprecated because its names are easier to guess
     and race-prone, so the name is generated here instead.  Callers are
     responsible for writing (and later removing) the file.
     """
     import uuid

     return os.path.join(tempfile.gettempdir(), prefix + uuid.uuid4().hex)


 @pytest.fixture(scope="module", params=[1, 2])
 def version(request):
     """Parametrize the requesting test over Feather versions 1 and 2."""
     feather_version = request.param
     yield feather_version


 @pytest.fixture(scope="module", params=[None, "uncompressed", "lz4", "zstd"])
 def compression(request):
     """Yield each compression setting, skipping codecs that are unavailable."""
     codec = request.param
     # Only the real codecs need an availability check.
     needs_codec = codec in ('lz4', 'zstd')
     if needs_codec and not pa.Codec.is_available(codec):
         pytest.skip(f'{codec} is not available')
     yield codec


 TEST_FILES = None


 def setup_module(module):
     global TEST_FILES
     TEST_FILES = []


 def teardown_module(module):
     """Remove every path recorded in TEST_FILES, ignoring OS errors."""
     for leftover in TEST_FILES:
         try:
             os.remove(leftover)
         except OSError:
             # Best-effort cleanup: the file may never have been written.
             pass


 @pytest.mark.pandas
 def test_file_not_exist():
     """Reading a path that does not exist raises ArrowIOError."""
     missing = 'test_invalid_file'
     with pytest.raises(pa.ArrowIOError):
         read_feather(missing)


 def _check_pandas_roundtrip(df, expected=None, path=None,
                             columns=None, use_threads=False,
                             version=None, compression=None,
                             compression_level=None):
     """Write ``df`` as Feather, read it back and compare the frames.

     The file goes to ``path`` (a fresh ``random_path()`` when omitted) and
     is recorded in TEST_FILES for cleanup.  The frame read back, restricted
     to ``columns`` when given, must equal ``expected``, which defaults to
     ``df`` itself.  Raises ``Exception`` if no file exists after writing.
     """
     target = random_path() if path is None else path
     TEST_FILES.append(target)

     write_feather(df, target, compression=compression,
                   compression_level=compression_level, version=version)
     if not os.path.exists(target):
         raise Exception('file not written')

     roundtripped = read_feather(target, columns, use_threads=use_threads)
     reference = df if expected is None else expected
     assert_frame_equal(roundtripped, reference)


 def _check_arrow_roundtrip(table, path=None, compression=None):
     """Write ``table`` as Feather and assert ``read_table`` returns an equal one.

     The file goes to ``path`` (a fresh ``random_path()`` when omitted) and
     is recorded in TEST_FILES for cleanup.  Raises ``Exception`` if no file
     exists after writing.
     """
     target = random_path() if path is None else path
     TEST_FILES.append(target)

     write_feather(table, target, compression=compression)
     if not os.path.exists(target):
         raise Exception('file not written')

     assert read_table(target).equals(table)


 def _assert_error_on_write(df, exc, path=None, version=2):
     """Assert that writing ``df`` as Feather raises ``exc``.

     The target is ``path`` (a fresh ``random_path()`` when omitted); it is
     recorded in TEST_FILES so anything written is cleaned up afterwards.
     """
     target = random_path() if path is None else path
     TEST_FILES.append(target)

     with pytest.raises(exc):
         write_feather(df, target, version=version)


 def test_dataset(version):
     """A table split row-wise over five files reads back via FeatherDataset."""
     n_rows, n_cols = 100, 100
     num_files = 5
     rows_per_file = n_rows // num_files

     paths = [random_path() for _ in range(num_files)]
     columns = {
         "col_" + str(i): np.random.randn(n_rows)
         for i in range(n_cols)
     }
     table = pa.table(columns)

     TEST_FILES.extend(paths)
     for index, path in enumerate(paths):
         # Each file receives one contiguous slice of the rows.
         start = index * rows_per_file
         stop = (index + 1) * rows_per_file
         write_feather(table[start:stop], path, version=version)

     combined = FeatherDataset(paths).read_table()
     assert combined.equals(table)


 @pytest.mark.pandas
 def test_float_no_nulls(version):
     """'f4' and 'f8' columns without nulls survive a pandas round trip."""
     num_values = 100
     data = {
         dtype: np.random.randn(num_values).astype(dtype)
         for dtype in ('f4', 'f8')
     }
     _check_pandas_roundtrip(pd.DataFrame(data), version=version)


 @pytest.mark.pandas
 def test_read_table(version):
     """read_table and read_feather return the data that was written."""
     shape = (100, 100)
     path = random_path()
     TEST_FILES.append(path)

     values = np.random.randint(0, 100, size=shape)
     names = ['col_' + str(i) for i in range(100)]
     table = pa.Table.from_arrays(values, names)
     write_feather(table, path, version=version)

     # Read with default options, then without memory mapping.
     assert read_table(path).equals(table)
     assert read_table(path, memory_map=False).equals(table)

     frame = read_feather(path, memory_map=False)
     assert_frame_equal(table.to_pandas(), frame)


 @pytest.mark.pandas
 def test_float_nulls(version):
     """Float columns containing nulls survive Arrow and pandas round trips.

     The helpers create and register their own files, so no path is
     reserved here.
     """
     num_values = 100
     dtypes = ['f4', 'f8']

     # One mask shared by both columns; it is passed as ``mask=`` to pa.array.
     null_mask = np.random.randint(0, 10, size=num_values) < 3
     arrays = [
         pa.array(np.random.randn(num_values).astype(name), mask=null_mask)
         for name in dtypes
     ]

     table = pa.table(arrays, names=dtypes)
     _check_arrow_roundtrip(table)

     df = table.to_pandas()
     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 def test_integer_no_nulls(version):
     """Integer columns of each listed dtype, without nulls, round-trip."""
     numpy_dtypes = ['i1', 'i2', 'i4', 'i8',
                     'u1', 'u2', 'u4', 'u8']
     num_values = 100

     columns = {}
     arrays = []
     for dtype in numpy_dtypes:
         raw = np.random.randint(0, 100, size=num_values)
         columns[dtype] = raw.astype(dtype)
         arrays.append(raw.astype(dtype))

     _check_pandas_roundtrip(pd.DataFrame(columns), version=version)
     _check_arrow_roundtrip(pa.table(arrays, names=numpy_dtypes))


 @pytest.mark.pandas
 def test_platform_numpy_integers(version):
     """A column of NumPy's 'longlong' dtype survives a pandas round trip."""
     num_values = 100
     data = {
         dtype: np.random.randint(0, 100, size=num_values).astype(dtype)
         for dtype in ['longlong']
     }
     _check_pandas_roundtrip(pd.DataFrame(data), version=version)


 @pytest.mark.pandas
 def test_integer_with_nulls(version):
     """Integer columns of each listed dtype with nulls survive round trips.

     pandas requires an upcast to a float dtype for such columns, so the
     pandas round trip is run on ``table.to_pandas()``.
     """
     int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
     num_values = 100

     null_mask = np.random.randint(0, 10, size=num_values) < 3
     arrays = []
     for name in int_dtypes:
         values = np.random.randint(0, 100, size=num_values)
         # Cast so each column really has the dtype it is named after;
         # the values are below 100 and therefore fit every listed dtype.
         arrays.append(pa.array(values.astype(name), mask=null_mask))

     table = pa.table(arrays, names=int_dtypes)
     _check_arrow_roundtrip(table)

     df = table.to_pandas()
     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 def test_boolean_no_nulls(version):
     """A boolean column without nulls survives a pandas round trip."""
     num_values = 100
     np.random.seed(0)

     flags = np.random.randn(num_values) > 0
     df = pd.DataFrame({'bools': flags})
     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 def test_boolean_nulls(version):
     """A boolean column with nulls survives Arrow and pandas round trips.

     pandas requires an upcast to object dtype for such a column.  The
     helpers create and register their own files, so no path is reserved
     here.
     """
     num_values = 100
     np.random.seed(0)

     mask = np.random.randint(0, 10, size=num_values) < 3
     values = np.random.randint(0, 10, size=num_values) < 5

     table = pa.table([pa.array(values, mask=mask)], names=['bools'])
     _check_arrow_roundtrip(table)

     df = table.to_pandas()
     _check_pandas_roundtrip(df, version=version)


 def test_buffer_bounds_error(version):
     """ARROW-1676: float64 columns of many lengths with a leading null.

     Tables with one null followed by 16 to 255 values are round-tripped.
     The helper creates and registers its own file for every table, so no
     path is reserved here.
     """
     for i in range(16, 256):
         table = pa.Table.from_arrays(
             [pa.array([None] + list(range(i)), type=pa.float64())],
             names=["arr"]
         )
         _check_arrow_roundtrip(table)


 def test_boolean_object_nulls(version):
     """An object array mixing booleans and None round-trips through Arrow."""
     repeats = 100
     values = np.array([False, None, True] * repeats, dtype=object)
     table = pa.Table.from_arrays([values], names=["arr"])
     _check_arrow_roundtrip(table)


 @pytest.mark.pandas
 def test_delete_partial_file_on_error(version):
     """After a failing write, no file is left behind at the target path."""
     if sys.platform == 'win32':
         pytest.skip('Windows hangs on to file handle for some reason')

     class CustomClass:
         pass

     # strings will fail
     bad_strings = [b'foo', None, 'bar', CustomClass(), np.nan]
     df = pd.DataFrame(
         {'numbers': range(5), 'strings': bad_strings},
         columns=['numbers', 'strings'])

     path = random_path()
     try:
         write_feather(df, path, version=version)
     except Exception:
         # Only the absence of the file afterwards is under test.
         pass

     assert not os.path.exists(path)


 @pytest.mark.pandas
 def test_strings(version):
     """String columns with mixed bytes/str values and nulls round-trip."""
     repeats = 1000

     # Mixed bytes, unicode, strings coerced to binary
     mixed = [b'foo', None, 'bar', 'qux', np.nan]
     as_binary = [b'foo', None, b'bar', b'qux', np.nan]
     df = pd.DataFrame({'strings': mixed * repeats})
     expected = pd.DataFrame({'strings': as_binary * repeats})
     _check_pandas_roundtrip(df, expected, version=version)

     # embedded nulls are ok
     null_cases = (
         ['foo', None, 'bar', 'qux', None],
         ['foo', None, 'bar', 'qux', np.nan],
     )
     for values in null_cases:
         df = pd.DataFrame({'strings': values * repeats})
         expected = pd.DataFrame({'strings': values * repeats})
         _check_pandas_roundtrip(df, expected, version=version)


 @pytest.mark.pandas
 def test_empty_strings(version):
     """A column holding only empty strings survives a pandas round trip."""
     empties = [''] * 10
     _check_pandas_roundtrip(pd.DataFrame({'strings': empties}),
                             version=version)


 @pytest.mark.pandas
 def test_all_none(version):
     """A column holding only None survives a pandas round trip."""
     nothing = [None] * 10
     _check_pandas_roundtrip(pd.DataFrame({'all_none': nothing}),
                             version=version)


 @pytest.mark.pandas
 def test_all_null_category(version):
     """ARROW-1188: a categorical column holding only nulls round-trips."""
     df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
     categorical_b = df.B.astype("category")
     df = df.assign(B=categorical_b)
     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 def test_multithreaded_read(version):
     """A 100-column frame reads back correctly with use_threads=True."""
     data = {f'c{i}': [''] * 10 for i in range(100)}
     _check_pandas_roundtrip(pd.DataFrame(data), use_threads=True,
                             version=version)


 @pytest.mark.pandas
 def test_nan_as_null(version):
     """String column containing NaN objects survives a pandas round trip.

     ``dtype=object`` is required: without it NumPy converts the mixed
     str/float input to a string array, turning every NaN into the text
     'nan', so no NaN value would be exercised at all.
     """
     # Create a nan that is not numpy.nan
     values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10, dtype=object)
     df = pd.DataFrame({'strings': values})
     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 def test_category(version):
     """A categorical string column with nulls survives a pandas round trip."""
     repeats = 1000

     raw = ['foo', None, 'bar', 'qux', np.nan]
     df = pd.DataFrame({'strings': raw * repeats})
     df['strings'] = df['strings'].astype('category')

     # The expected frame uses None in place of NaN.
     normalized = ['foo', None, 'bar', 'qux', None]
     expected = pd.DataFrame(
         {'strings': pd.Categorical(normalized * repeats)})
     _check_pandas_roundtrip(df, expected, version=version)


 @pytest.mark.pandas
 def test_timestamp(version):
     """Naive and timezone-aware timestamp columns survive a round trip."""
     naive = pd.date_range('2016-03-28', periods=10)
     df = pd.DataFrame({'naive': naive})

     as_utc = df.naive.dt.tz_localize('utc')
     df['with_tz'] = as_utc.dt.tz_convert('America/Los_Angeles')

     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 def test_timestamp_with_nulls(version):
     """Timestamp columns containing a null survive a pandas round trip."""
     stamps = [pd.Timestamp(2016, 1, 1), None, pd.Timestamp(2016, 1, 3)]
     df = pd.DataFrame({'test': stamps})
     df['with_tz'] = df.test.dt.tz_localize('utc')

     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 @pytest.mark.xfail(reason="not supported", raises=TypeError)
 def test_timedelta_with_nulls_v1():
     """Timedelta round trip with version=1; expected to fail (TypeError)."""
     deltas = [pd.Timedelta('1 day'), None, pd.Timedelta('3 day')]
     df = pd.DataFrame({'test': deltas})
     _check_pandas_roundtrip(df, version=1)


 @pytest.mark.pandas
 def test_timedelta_with_nulls():
     """A timedelta column containing a null round-trips with version=2."""
     deltas = [pd.Timedelta('1 day'), None, pd.Timedelta('3 day')]
     df = pd.DataFrame({'test': deltas})
     _check_pandas_roundtrip(df, version=2)


 @pytest.mark.pandas
 def test_out_of_float64_timestamp_with_nulls(version):
     """Large integer timestamps next to a null survive a round trip."""
     stamps = pd.DatetimeIndex(
         [1451606400000000001, None, 14516064000030405])
     df = pd.DataFrame({'test': stamps})
     df['with_tz'] = df.test.dt.tz_localize('utc')
     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.pandas
 def test_non_string_columns(version):
     """Integer column labels come back as their string form."""
     data = {
         0: [1, 2, 3, 4],
         1: [True, False, True, False],
     }
     df = pd.DataFrame(data)

     stringified = df.rename(columns=str)
     _check_pandas_roundtrip(df, stringified, version=version)


 @pytest.mark.pandas
 @pytest.mark.skipif(not os.path.supports_unicode_filenames,
                     reason='unicode filenames not supported')
 def test_unicode_filename(version):
     """GH #209: a path containing a non-ASCII character can be used."""
     encoded_name = b'Besa_Kavaj\xc3\xab.feather'
     name = encoded_name.decode('utf-8')
     df = pd.DataFrame({'foo': [1, 2, 3, 4]})

     unicode_path = random_path(prefix=name)
     _check_pandas_roundtrip(df, path=unicode_path, version=version)


 @pytest.mark.pandas
 def test_read_columns(version):
     """Reading with ``columns`` returns only the requested columns."""
     df = pd.DataFrame({
         'foo': [1, 2, 3, 4],
         'boo': [5, 6, 7, 8],
         'woo': [1, 3, 5, 7]
     })
     wanted = ['boo', 'woo']

     _check_pandas_roundtrip(df, df[wanted], version=version,
                             columns=wanted)


 def test_overwritten_file(version):
     """Writing to an already written path yields the new table on read."""
     path = random_path()
     TEST_FILES.append(path)

     num_values = 100
     np.random.seed(0)
     values = np.random.randint(0, 10, size=num_values)

     # First write one table, then a shorter one with another column name.
     write_feather(pa.table({'ints': values}), path)

     replacement = pa.table({'more_ints': values[0:num_values//2]})
     _check_arrow_roundtrip(replacement, path=path)


 @pytest.mark.pandas
 def test_filelike_objects(version):
     """Feather data can be written to and read from an io.BytesIO."""
     sink = io.BytesIO()

     # the copy makes it non-strided
     grid = np.arange(12).reshape(4, 3)
     df = pd.DataFrame(grid, columns=['a', 'b', 'c']).copy()
     write_feather(df, sink, version=version)

     # Rewind so reading starts at the beginning of the buffer.
     sink.seek(0)
     assert_frame_equal(read_feather(sink), df)


 @pytest.mark.pandas
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
 @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
 def test_sparse_dataframe(version):
     """GH #221: a sparse frame reads back equal to its dense form."""
     if not pa.pandas_compat._pandas_api.has_sparse:
         pytest.skip("version of pandas does not support SparseDataFrame")

     dense_source = pd.DataFrame({'A': [0, 1, 2], 'B': [1, 0, 1]})
     sparse = dense_source.to_sparse(fill_value=1)
     expected = sparse.to_dense()
     _check_pandas_roundtrip(sparse, expected, version=version)


 @pytest.mark.pandas
 def test_duplicate_columns_pandas():
     """Writing a DataFrame with duplicate column names raises ValueError."""
     # https://github.com/wesm/feather/issues/53
     # not currently able to handle duplicate columns
     grid = np.arange(12).reshape(4, 3)
     df = pd.DataFrame(grid, columns=list('aaa')).copy()
     _assert_error_on_write(df, ValueError)


 def test_duplicate_columns():
     """Duplicate column names round-trip by default but fail with version=1."""
     columns = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
     table = pa.table(columns, names=['a', 'a', 'b'])

     # only works for version 2
     _check_arrow_roundtrip(table)
     _assert_error_on_write(table, ValueError, version=1)


 @pytest.mark.pandas
 def test_unsupported():
     """Columns of custom or mixed Python objects are rejected on write."""
     # https://github.com/wesm/feather/issues/240
     # serializing actual python objects

     class A:
         pass

     failing_cases = [
         # custom python objects
         ({'a': [A(), A()]}, ValueError),
         # non-strings
         ({'a': ['a', 1, 2.0]}, TypeError),
     ]
     for data, error in failing_cases:
         _assert_error_on_write(pd.DataFrame(data), error)


 @pytest.mark.pandas
 def test_v2_set_chunksize():
     """With chunksize=250, 1000 rows are stored as four record batches."""
     table = pa.table(pd.DataFrame({'A': np.arange(1000)}))

     sink = io.BytesIO()
     write_feather(table, sink, chunksize=250, version=2)

     # Inspect the written bytes as an Arrow IPC file.
     reader = pa.ipc.open_file(pa.BufferReader(sink.getvalue()))
     assert reader.num_record_batches == 4
     assert len(reader.get_batch(0)) == 250


 @pytest.mark.pandas
 @pytest.mark.lz4
 @pytest.mark.snappy
 @pytest.mark.zstd
 def test_v2_compression_options():
     """Check which compression/chunksize options are accepted or rejected."""
     df = pd.DataFrame({'A': np.arange(1000)})

     # (compression, compression_level) pairs that must round-trip.
     supported = (
         ('uncompressed', None),
         ('lz4', None),
         ('zstd', 1),
         ('zstd', 10),
     )
     for codec, level in supported:
         _check_pandas_roundtrip(df, compression=codec,
                                 compression_level=level)

     buf = io.BytesIO()

     # (exception, message pattern, write_feather options) that must fail.
     rejected = [
         # LZ4 doesn't support compression_level
         (pa.ArrowInvalid,
          "doesn't support setting a compression level",
          dict(compression='lz4', compression_level=10)),
         # Trying to compress with V1
         (ValueError,
          "Feather V1 files do not support compression option",
          dict(compression='lz4', version=1)),
         # Trying to set chunksize with V1
         (ValueError,
          "Feather V1 files do not support chunksize option",
          dict(chunksize=4096, version=1)),
         # Unsupported compressor
         (ValueError,
          'compression="snappy" not supported',
          dict(compression='snappy')),
     ]
     for error, pattern, options in rejected:
         with pytest.raises(error, match=pattern):
             write_feather(df, buf, **options)


 def test_v2_lz4_default_compression():
     """ARROW-8750: the default output is smaller than uncompressed output."""
     # ARROW-8750: Make sure that the compression=None option selects lz4 if
     # it's available
     if not pa.Codec.is_available('lz4_frame'):
         pytest.skip("LZ4 compression support is not built in C++")

     # some highly compressible data
     t = pa.table([np.repeat(0, 100000)], names=['f0'])

     def written_size(**options):
         # Number of bytes write_feather produces for ``t``.
         sink = io.BytesIO()
         write_feather(t, sink, **options)
         return len(sink.getvalue())

     default_size = written_size()
     uncompressed_size = written_size(compression='uncompressed')
     assert default_size < uncompressed_size


 def test_v1_unsupported_types():
     """Writing a list column with version=1 raises a descriptive TypeError."""
     table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0'])
     expected_message = ("Unsupported Feather V1 type: "
                         "list<item: int64>. "
                         "Use V2 format to serialize all Arrow types.")

     sink = io.BytesIO()
     with pytest.raises(TypeError, match=expected_message):
         write_feather(table, sink, version=1)


 @pytest.mark.slow
 @pytest.mark.pandas
 def test_large_dataframe(version):
     """A single-column frame with 400 million rows survives a round trip."""
     num_rows = 400000000
     df = pd.DataFrame({'A': np.arange(num_rows)})
     _check_pandas_roundtrip(df, version=version)


 @pytest.mark.large_memory
 @pytest.mark.pandas
 def test_chunked_binary_error_message():
     """ARROW-3058: version 1 reports a clear error for >2GB binary columns."""
     # ARROW-3058: As Feather does not yet support chunked columns, we at least
     # make sure it's clear to the user what is going on

     # 2^31 + 1 bytes
     one_mebibyte = b'x' * (1 << 20)
     values = [b'x'] + [one_mebibyte] * 2 * (1 << 10)
     df = pd.DataFrame({'byte_col': values})

     # Works fine with version 2
     sink = io.BytesIO()
     write_feather(df, sink, version=2)
     result = read_feather(pa.BufferReader(sink.getvalue()))
     assert_frame_equal(result, df)

     message = ("'byte_col' exceeds 2GB maximum "
                "capacity of a Feather binary column. This restriction "
                "may be lifted in the future")
     with pytest.raises(ValueError, match=message):
         write_feather(df, io.BytesIO(), version=1)


 def test_feather_without_pandas(tempdir, version):
     """ARROW-8345: an Arrow table round-trips through a file in tempdir."""
     table = pa.table([pa.array([1, 2, 3])], names=['f0'])
     target = str(tempdir / "data.feather")
     _check_arrow_roundtrip(table, target)


 @pytest.mark.pandas
 def test_read_column_selection(version):
     """ARROW-8641: columns can be selected by name or index, in any order."""
     df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c'])

     # (selection passed to the reader, column names expected back)
     cases = [
         # select columns as string names or integer indices
         (['a', 'c'], ['a', 'c']),
         ([0, 2], ['a', 'c']),
         # different order is followed
         (['b', 'a'], ['b', 'a']),
         ([1, 0], ['b', 'a']),
     ]
     for selection, expected_names in cases:
         _check_pandas_roundtrip(
             df, columns=selection, expected=df[expected_names],
             version=version)


 def test_read_column_duplicated_selection(tempdir, version):
     """Selecting a column twice returns it twice, by name or by index."""
     path = str(tempdir / "data.feather")
     table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'c'])
     write_feather(table, path, version=version)

     expected = pa.table([[1, 2, 3], [4, 5, 6], [1, 2, 3]],
                         names=['a', 'b', 'a'])
     # duplicated columns in the column selection
     selections = (['a', 'b', 'a'], [0, 1, 0])
     for selection in selections:
         assert read_table(path, columns=selection).equals(expected)


 def test_read_column_duplicated_in_file(tempdir):
     """Duplicate names in a V2 file: index selection works, names raise."""
     # duplicated columns in feather file (only works for feather v2)
     path = str(tempdir / "data.feather")
     table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'a'])
     write_feather(table, path, version=2)

     # no selection works fine
     assert read_table(path).equals(table)

     # selection with indices works
     by_index = read_table(path, columns=[0, 2])
     assert by_index.column_names == ['a', 'a']

     # selection with column names errors
     with pytest.raises(ValueError):
         read_table(path, columns=['a', 'b'])


 def test_nested_types(compression):
     """Struct, list and nested-list columns round-trip with each codec."""
     # https://issues.apache.org/jira/browse/ARROW-8860
     struct_column = pa.StructArray.from_arrays(
         [[0, 1, 2], [1, 2, 3]], names=["f1", "f2"])
     _check_arrow_roundtrip(pa.table({'col': struct_column}),
                            compression=compression)

     list_columns = (
         pa.array([[1, 2], [3, 4]]),
         pa.array([[[1, 2], [3, 4]], [[5, 6], None]]),
     )
     for column in list_columns:
         _check_arrow_roundtrip(pa.table({'col': column}),
                                compression=compression)


 @h.given(past.all_tables, st.sampled_from(["uncompressed", "lz4", "zstd"]))
 def test_roundtrip(table, compression):
     """Round-trip hypothesis-generated tables with a sampled compression."""
     _check_arrow_roundtrip(table=table, compression=compression)


 @pytest.mark.lz4
 def test_feather_v017_experimental_compression_backward_compatibility(datadir):
     """ARROW-11163: a stored 0.17.0 lz4-compressed file is still readable."""
     # ARROW-11163 - ensure newer pyarrow versions can read the old feather
     # files from version 0.17.0 with experimental compression support (before
     # it was officially added to IPC format in 1.0.0)

     # file generated with:
     #     table = pa.table({'a': range(5)})
     #     from pyarrow import feather
     #     feather.write_feather(
     #         table, "v0.17.0.version=2-compression=lz4.feather",
     #         compression="lz4", version=2)
     filename = "v0.17.0.version=2-compression=lz4.feather"
     expected = pa.table({'a': range(5)})

     result = read_table(datadir / filename)
     assert result.equals(expected)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import io
	import os
	import sys
	import tempfile
	import pytest
	import hypothesis as h
	import hypothesis.strategies as st

	import numpy as np

	import pyarrow as pa
	import pyarrow.tests.strategies as past
	from pyarrow.feather import (read_feather, write_feather, read_table,
	FeatherDataset)


	try:
	from pandas.testing import assert_frame_equal
	import pandas as pd
	import pyarrow.pandas_compat
	except ImportError:
	pass


	@pytest.fixture(scope='module')
	def datadir(base_datadir):
	    """Return the 'feather' subdirectory of the base test data directory."""
	    return base_datadir / 'feather'


	def random_path(prefix='feather_'):
	    """Return a unique path in the system temp directory without creating it.

	    The file name is ``prefix`` followed by a random UUID in hex form.
	    ``tempfile.mktemp`` is deprecated because its names are easier to guess
	    and race-prone, so the name is generated here instead.  Callers are
	    responsible for writing (and later removing) the file.
	    """
	    import uuid

	    return os.path.join(tempfile.gettempdir(), prefix + uuid.uuid4().hex)


	@pytest.fixture(scope="module", params=[1, 2])
	def version(request):
	    """Parametrize the requesting test over Feather versions 1 and 2."""
	    yield request.param


	@pytest.fixture(scope="module", params=[None, "uncompressed", "lz4", "zstd"])
	def compression(request):
	    """Yield each compression setting, skipping codecs that are unavailable."""
	    codec = request.param
	    # Only the real codecs need an availability check.
	    if codec in ('lz4', 'zstd') and not pa.Codec.is_available(codec):
	        pytest.skip(f'{codec} is not available')
	    yield codec


	TEST_FILES = None


	def setup_module(module):
	global TEST_FILES
	TEST_FILES = []


	def teardown_module(module):
	    """Remove every path recorded in TEST_FILES, ignoring OS errors."""
	    for path in TEST_FILES:
	        try:
	            os.remove(path)
	        except OSError:
	            # Best-effort cleanup: the file may never have been written.
	            pass


	@pytest.mark.pandas
	def test_file_not_exist():
	    """Reading a path that does not exist raises ArrowIOError."""
	    with pytest.raises(pa.ArrowIOError):
	        read_feather('test_invalid_file')


	def _check_pandas_roundtrip(df, expected=None, path=None,
	                            columns=None, use_threads=False,
	                            version=None, compression=None,
	                            compression_level=None):
	    """Write ``df`` as Feather, read it back and compare the frames.

	    The file goes to ``path`` (a fresh ``random_path()`` when omitted) and
	    is recorded in TEST_FILES for cleanup.  The frame read back, restricted
	    to ``columns`` when given, must equal ``expected``, which defaults to
	    ``df`` itself.  Raises ``Exception`` if no file exists after writing.
	    """
	    if path is None:
	        path = random_path()

	    TEST_FILES.append(path)
	    write_feather(df, path, compression=compression,
	                  compression_level=compression_level, version=version)
	    if not os.path.exists(path):
	        raise Exception('file not written')

	    result = read_feather(path, columns, use_threads=use_threads)
	    if expected is None:
	        expected = df

	    assert_frame_equal(result, expected)


	def _check_arrow_roundtrip(table, path=None, compression=None):
	    """Write ``table`` as Feather and assert ``read_table`` returns an equal one.

	    The file goes to ``path`` (a fresh ``random_path()`` when omitted) and
	    is recorded in TEST_FILES for cleanup.  Raises ``Exception`` if no file
	    exists after writing.
	    """
	    if path is None:
	        path = random_path()

	    TEST_FILES.append(path)
	    write_feather(table, path, compression=compression)
	    if not os.path.exists(path):
	        raise Exception('file not written')

	    result = read_table(path)
	    assert result.equals(table)


	def _assert_error_on_write(df, exc, path=None, version=2):
	    """Assert that writing ``df`` as Feather raises ``exc``.

	    The target is ``path`` (a fresh ``random_path()`` when omitted); it is
	    recorded in TEST_FILES so anything written is cleaned up afterwards.
	    """
	    if path is None:
	        path = random_path()

	    TEST_FILES.append(path)

	    with pytest.raises(exc):
	        write_feather(df, path, version=version)


	def test_dataset(version):
	    """A table split row-wise over five files reads back via FeatherDataset."""
	    num_values = (100, 100)
	    num_files = 5
	    rows_per_file = num_values[0] // num_files

	    paths = [random_path() for _ in range(num_files)]
	    data = {
	        "col_" + str(i): np.random.randn(num_values[0])
	        for i in range(num_values[1])
	    }
	    table = pa.table(data)

	    TEST_FILES.extend(paths)
	    for index, path in enumerate(paths):
	        # Each file receives one contiguous slice of the rows.
	        start = index * rows_per_file
	        stop = (index + 1) * rows_per_file
	        write_feather(table[start:stop], path, version=version)

	    result = FeatherDataset(paths).read_table()
	    assert result.equals(table)


	@pytest.mark.pandas
	def test_float_no_nulls(version):
	    """'f4' and 'f8' columns without nulls survive a pandas round trip."""
	    numpy_dtypes = ['f4', 'f8']
	    num_values = 100

	    data = {}
	    for dtype in numpy_dtypes:
	        values = np.random.randn(num_values)
	        data[dtype] = values.astype(dtype)

	    df = pd.DataFrame(data)
	    _check_pandas_roundtrip(df, version=version)


	@pytest.mark.pandas
	def test_read_table(version):
	    """read_table and read_feather return the data that was written."""
	    num_values = (100, 100)
	    path = random_path()

	    TEST_FILES.append(path)

	    values = np.random.randint(0, 100, size=num_values)
	    columns = ['col_' + str(i) for i in range(100)]
	    table = pa.Table.from_arrays(values, columns)

	    write_feather(table, path, version=version)

	    result = read_table(path)
	    assert result.equals(table)

	    # Test without memory mapping
	    result = read_table(path, memory_map=False)
	    assert result.equals(table)

	    result = read_feather(path, memory_map=False)
	    assert_frame_equal(table.to_pandas(), result)


	@pytest.mark.pandas
	def test_float_nulls(version):
	    """Float columns containing nulls survive Arrow and pandas round trips.

	    The helpers create and register their own files, so no path is
	    reserved here.
	    """
	    num_values = 100
	    dtypes = ['f4', 'f8']

	    # One mask shared by both columns; it is passed as ``mask=`` to pa.array.
	    null_mask = np.random.randint(0, 10, size=num_values) < 3
	    arrays = [
	        pa.array(np.random.randn(num_values).astype(name), mask=null_mask)
	        for name in dtypes
	    ]

	    table = pa.table(arrays, names=dtypes)
	    _check_arrow_roundtrip(table)

	    df = table.to_pandas()
	    _check_pandas_roundtrip(df, version=version)


	@pytest.mark.pandas
	def test_integer_no_nulls(version):
	    """Integer columns of each listed dtype, without nulls, round-trip."""
	    data, arr = {}, []

	    numpy_dtypes = ['i1', 'i2', 'i4', 'i8',
	                    'u1', 'u2', 'u4', 'u8']
	    num_values = 100

	    for dtype in numpy_dtypes:
	        values = np.random.randint(0, 100, size=num_values)
	        data[dtype] = values.astype(dtype)
	        arr.append(values.astype(dtype))

	    df = pd.DataFrame(data)
	    _check_pandas_roundtrip(df, version=version)

	    table = pa.table(arr, names=numpy_dtypes)
	    _check_arrow_roundtrip(table)


	@pytest.mark.pandas
	def test_platform_numpy_integers(version):
	    """A column of NumPy's 'longlong' dtype survives a pandas round trip."""
	    numpy_dtypes = ['longlong']
	    num_values = 100

	    data = {}
	    for dtype in numpy_dtypes:
	        values = np.random.randint(0, 100, size=num_values)
	        data[dtype] = values.astype(dtype)

	    df = pd.DataFrame(data)
	    _check_pandas_roundtrip(df, version=version)


	@pytest.mark.pandas
	def test_integer_with_nulls(version):
	    """Integer columns of each listed dtype with nulls survive round trips.

	    pandas requires an upcast to a float dtype for such columns, so the
	    pandas round trip is run on ``table.to_pandas()``.
	    """
	    int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
	    num_values = 100

	    null_mask = np.random.randint(0, 10, size=num_values) < 3
	    arrays = []
	    for name in int_dtypes:
	        values = np.random.randint(0, 100, size=num_values)
	        # Cast so each column really has the dtype it is named after;
	        # the values are below 100 and therefore fit every listed dtype.
	        arrays.append(pa.array(values.astype(name), mask=null_mask))

	    table = pa.table(arrays, names=int_dtypes)
	    _check_arrow_roundtrip(table)

	    df = table.to_pandas()
	    _check_pandas_roundtrip(df, version=version)


	@pytest.mark.pandas
	def test_boolean_no_nulls(version):
	    """A boolean column without nulls survives a pandas round trip."""
	    num_values = 100

	    np.random.seed(0)

	    df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
	    _check_pandas_roundtrip(df, version=version)


	@pytest.mark.pandas
	def test_boolean_nulls(version):
	    """A boolean column with nulls survives Arrow and pandas round trips.

	    pandas requires an upcast to object dtype for such a column.  The
	    helpers create and register their own files, so no path is reserved
	    here.
	    """
	    num_values = 100
	    np.random.seed(0)

	    mask = np.random.randint(0, 10, size=num_values) < 3
	    values = np.random.randint(0, 10, size=num_values) < 5

	    table = pa.table([pa.array(values, mask=mask)], names=['bools'])
	    _check_arrow_roundtrip(table)

	    df = table.to_pandas()
	    _check_pandas_roundtrip(df, version=version)


	def test_buffer_bounds_error(version):
	    """ARROW-1676: float64 columns of many lengths with a leading null.

	    Tables with one null followed by 16 to 255 values are round-tripped.
	    The helper creates and registers its own file for every table, so no
	    path is reserved here.
	    """
	    for i in range(16, 256):
	        table = pa.Table.from_arrays(
	            [pa.array([None] + list(range(i)), type=pa.float64())],
	            names=["arr"]
	        )
	        _check_arrow_roundtrip(table)


	def test_boolean_object_nulls(version):
	    """An object array mixing booleans and None round-trips through Arrow."""
	    repeats = 100
	    table = pa.Table.from_arrays(
	        [np.array([False, None, True] * repeats, dtype=object)],
	        names=["arr"]
	    )
	    _check_arrow_roundtrip(table)


	@pytest.mark.pandas
	def test_delete_partial_file_on_error(version):
	    """After a failing write, no file is left behind at the target path."""
	    if sys.platform == 'win32':
	        pytest.skip('Windows hangs on to file handle for some reason')

	    class CustomClass:
	        pass

	    # strings will fail
	    df = pd.DataFrame(
	        {
	            'numbers': range(5),
	            'strings': [b'foo', None, 'bar', CustomClass(), np.nan]},
	        columns=['numbers', 'strings'])

	    path = random_path()
	    try:
	        write_feather(df, path, version=version)
	    except Exception:
	        # Only the absence of the file afterwards is under test.
	        pass

	    assert not os.path.exists(path)


	@pytest.mark.pandas
	def test_strings(version):
	    """Check string columns that mix bytes, str and missing values."""
	    repeats = 1000

	    def frame_of(pattern):
	        # One 'strings' column holding ``pattern`` repeated ``repeats`` times.
	        return pd.DataFrame({'strings': pattern * repeats})

	    # Mixed bytes, unicode, strings coerced to binary
	    mixed = [b'foo', None, 'bar', 'qux', np.nan]
	    all_binary = [b'foo', None, b'bar', b'qux', np.nan]
	    _check_pandas_roundtrip(frame_of(mixed), frame_of(all_binary),
	                            version=version)

	    # embedded nulls are ok; the second pattern ends in NaN instead of None
	    for pattern in (['foo', None, 'bar', 'qux', None],
	                    ['foo', None, 'bar', 'qux', np.nan]):
	        _check_pandas_roundtrip(frame_of(pattern), frame_of(pattern),
	                                version=version)


	@pytest.mark.pandas
	def test_empty_strings(version):
	    """Check a column made up solely of empty strings."""
	    blanks = [''] * 10
	    frame = pd.DataFrame({'strings': blanks})
	    _check_pandas_roundtrip(frame, version=version)


	@pytest.mark.pandas
	def test_all_none(version):
	    """Check a column in which every value is ``None``."""
	    nothing = [None] * 10
	    frame = pd.DataFrame({'all_none': nothing})
	    _check_pandas_roundtrip(frame, version=version)


	@pytest.mark.pandas
	def test_all_null_category(version):
	    """Check a categorical column whose values are all ``None``."""
	    # ARROW-1188
	    frame = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
	    as_category = frame.B.astype("category")
	    frame = frame.assign(B=as_category)
	    _check_pandas_roundtrip(frame, version=version)


	@pytest.mark.pandas
	def test_multithreaded_read(version):
	    """Check a 100-column frame with ``use_threads=True``."""
	    columns = {}
	    for i in range(100):
	        # Columns are named c0 .. c99, each holding ten empty strings.
	        columns[f'c{i}'] = [''] * 10
	    frame = pd.DataFrame(columns)
	    _check_pandas_roundtrip(frame, use_threads=True, version=version)


	@pytest.mark.pandas
	def test_nan_as_null(version):
	    """Check a string column whose missing values are float NaNs.

	    The expected frame spells the missing entries as ``None``.
	    """
	    # Create a nan that is not numpy.nan
	    # ``dtype=object`` is required: without it NumPy promotes the mixed
	    # str/float list to a unicode array and every NaN becomes the literal
	    # string 'nan', so no missing value would be exercised at all.
	    values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10, dtype=object)
	    df = pd.DataFrame({'strings': values})
	    expected = pd.DataFrame({'strings': ['foo', None, None, 'bar'] * 10})
	    _check_pandas_roundtrip(df, expected, version=version)


	@pytest.mark.pandas
	def test_category(version):
	    """Check a categorical string column that contains missing values."""
	    repeats = 1000

	    raw = ['foo', None, 'bar', 'qux', np.nan] * repeats
	    frame = pd.DataFrame({'strings': raw})
	    frame['strings'] = frame['strings'].astype('category')

	    # The expected frame is built directly from a Categorical and uses
	    # None for every missing entry.
	    wanted = pd.Categorical(['foo', None, 'bar', 'qux', None] * repeats)
	    expected = pd.DataFrame({'strings': wanted})
	    _check_pandas_roundtrip(frame, expected, version=version)


	@pytest.mark.pandas
	def test_timestamp(version):
	    """Check a naive timestamp column next to a timezone-aware one."""
	    naive = pd.date_range('2016-03-28', periods=10)
	    frame = pd.DataFrame({'naive': naive})

	    # Localize to UTC first, then convert to the target zone.
	    as_utc = frame.naive.dt.tz_localize('utc')
	    frame['with_tz'] = as_utc.dt.tz_convert('America/Los_Angeles')

	    _check_pandas_roundtrip(frame, version=version)


	@pytest.mark.pandas
	def test_timestamp_with_nulls(version):
	    """Check naive and UTC timestamp columns that contain a ``None``."""
	    stamps = [pd.Timestamp(2016, 1, 1), None, pd.Timestamp(2016, 1, 3)]
	    frame = pd.DataFrame({'test': stamps})
	    frame['with_tz'] = frame.test.dt.tz_localize('utc')

	    _check_pandas_roundtrip(frame, version=version)


	@pytest.mark.pandas
	@pytest.mark.xfail(reason="not supported", raises=TypeError)
	def test_timedelta_with_nulls_v1():
	    """Timedelta column with a ``None`` under version 1 (marked xfail)."""
	    deltas = [pd.Timedelta('1 day'), None, pd.Timedelta('3 day')]
	    frame = pd.DataFrame({'test': deltas})
	    _check_pandas_roundtrip(frame, version=1)


	@pytest.mark.pandas
	def test_timedelta_with_nulls():
	    """Check a timedelta column containing a ``None`` under version 2."""
	    deltas = [pd.Timedelta('1 day'), None, pd.Timedelta('3 day')]
	    frame = pd.DataFrame({'test': deltas})
	    _check_pandas_roundtrip(frame, version=2)


	@pytest.mark.pandas
	def test_out_of_float64_timestamp_with_nulls(version):
	    """Check timestamps built from large integers alongside a ``None``."""
	    raw = [1451606400000000001, None, 14516064000030405]
	    frame = pd.DataFrame({'test': pd.DatetimeIndex(raw)})
	    frame['with_tz'] = frame.test.dt.tz_localize('utc')
	    _check_pandas_roundtrip(frame, version=version)


	@pytest.mark.pandas
	def test_non_string_columns(version):
	    """Integer column labels are expected to come back as strings."""
	    frame = pd.DataFrame({0: [1, 2, 3, 4],
	                          1: [True, False, True, False]})

	    # ``rename(columns=str)`` maps every label through ``str``.
	    with_str_labels = frame.rename(columns=str)
	    _check_pandas_roundtrip(frame, with_str_labels, version=version)


	@pytest.mark.pandas
	@pytest.mark.skipif(not os.path.supports_unicode_filenames,
	                    reason='unicode filenames not supported')
	def test_unicode_filename(version):
	    """Check a path whose prefix contains a non-ASCII character."""
	    # GH #209
	    prefix = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8')
	    frame = pd.DataFrame({'foo': [1, 2, 3, 4]})
	    target = random_path(prefix=prefix)
	    _check_pandas_roundtrip(frame, path=target, version=version)


	@pytest.mark.pandas
	def test_read_columns(version):
	    """Passing ``columns=`` is expected to yield just those columns."""
	    frame = pd.DataFrame({
	        'foo': [1, 2, 3, 4],
	        'boo': [5, 6, 7, 8],
	        'woo': [1, 3, 5, 7]
	    })
	    wanted = ['boo', 'woo']

	    _check_pandas_roundtrip(frame, frame[wanted], version=version,
	                            columns=wanted)


	def test_overwritten_file(version):
	    """Run the Arrow round-trip check on a path that already holds a file."""
	    target = random_path()
	    TEST_FILES.append(target)

	    np.random.seed(0)
	    n_rows = 100
	    ints = np.random.randint(0, 10, size=n_rows)

	    # First, write the full-length column to the path.
	    write_feather(pa.table({'ints': ints}), target)

	    # Then hand the same path to the round-trip check with a differently
	    # named column that is only half as long.
	    shorter = pa.table({'more_ints': ints[0:n_rows // 2]})
	    _check_arrow_roundtrip(shorter, path=target)


	@pytest.mark.pandas
	def test_filelike_objects(version):
	    """Write to and read back from an in-memory ``io.BytesIO`` buffer."""
	    # the copy makes it non-strided
	    frame = pd.DataFrame(np.arange(12).reshape(4, 3),
	                         columns=['a', 'b', 'c']).copy()

	    stream = io.BytesIO()
	    write_feather(frame, stream, version=version)

	    # Rewind so that reading starts at the beginning of the buffer.
	    stream.seek(0)
	    assert_frame_equal(read_feather(stream), frame)


	@pytest.mark.pandas
	@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
	@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
	def test_sparse_dataframe(version):
	    """A sparse frame is expected to come back as its dense equivalent."""
	    if not pa.pandas_compat._pandas_api.has_sparse:
	        pytest.skip("version of pandas does not support SparseDataFrame")
	    # GH #221
	    dense_source = pd.DataFrame({'A': [0, 1, 2], 'B': [1, 0, 1]})
	    sparse = dense_source.to_sparse(fill_value=1)
	    _check_pandas_roundtrip(sparse, sparse.to_dense(), version=version)


	@pytest.mark.pandas
	def test_duplicate_columns_pandas():
	    """Writing a frame with duplicate column names must raise ValueError."""
	    # https://github.com/wesm/feather/issues/53
	    # not currently able to handle duplicate columns
	    labels = ['a', 'a', 'a']
	    frame = pd.DataFrame(np.arange(12).reshape(4, 3), columns=labels).copy()
	    _assert_error_on_write(frame, ValueError)


	def test_duplicate_columns():
	    """Duplicate names in a table: default check passes, version 1 errors."""
	    # only works for version 2
	    columns = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
	    dup_table = pa.table(columns, names=['a', 'a', 'b'])
	    _check_arrow_roundtrip(dup_table)
	    _assert_error_on_write(dup_table, ValueError, version=1)


	@pytest.mark.pandas
	def test_unsupported():
	    """Columns of custom objects or mixed scalars must fail to write."""
	    # https://github.com/wesm/feather/issues/240
	    # serializing actual python objects

	    # custom python objects
	    class A:
	        pass

	    custom_objects = pd.DataFrame({'a': [A(), A()]})
	    _assert_error_on_write(custom_objects, ValueError)

	    # non-strings
	    mixed_scalars = pd.DataFrame({'a': ['a', 1, 2.0]})
	    _assert_error_on_write(mixed_scalars, TypeError)


	@pytest.mark.pandas
	def test_v2_set_chunksize():
	    """``chunksize=250`` on 1000 rows must yield four 250-row batches."""
	    table = pa.table(pd.DataFrame({'A': np.arange(1000)}))

	    sink = io.BytesIO()
	    write_feather(table, sink, chunksize=250, version=2)

	    # Inspect the written bytes with the IPC file reader.
	    reader = pa.ipc.open_file(pa.BufferReader(sink.getvalue()))
	    assert reader.num_record_batches == 4
	    assert len(reader.get_batch(0)) == 250


	@pytest.mark.pandas
	@pytest.mark.lz4
	@pytest.mark.snappy
	@pytest.mark.zstd
	def test_v2_compression_options():
	    """Exercise accepted and rejected compression-related write options."""
	    frame = pd.DataFrame({'A': np.arange(1000)})

	    # (compression, compression_level) pairs passed to the round-trip check
	    accepted = (
	        ('uncompressed', None),
	        ('lz4', None),
	        ('zstd', 1),
	        ('zstd', 10),
	    )
	    for codec, level in accepted:
	        _check_pandas_roundtrip(frame, compression=codec,
	                                compression_level=level)

	    # (exception type, message pattern, write_feather keyword arguments)
	    rejected = (
	        # LZ4 doesn't support compression_level
	        (pa.ArrowInvalid,
	         "doesn't support setting a compression level",
	         {'compression': 'lz4', 'compression_level': 10}),
	        # Trying to compress with V1
	        (ValueError,
	         "Feather V1 files do not support compression option",
	         {'compression': 'lz4', 'version': 1}),
	        # Trying to set chunksize with V1
	        (ValueError,
	         "Feather V1 files do not support chunksize option",
	         {'chunksize': 4096, 'version': 1}),
	        # Unsupported compressor
	        (ValueError,
	         'compression="snappy" not supported',
	         {'compression': 'snappy'}),
	    )

	    # A single buffer is shared by every failing call.
	    sink = io.BytesIO()
	    for error_type, pattern, options in rejected:
	        with pytest.raises(error_type, match=pattern):
	            write_feather(frame, sink, **options)


	def test_v2_lz4_default_compression():
	    """A default write must be smaller than an explicitly uncompressed one."""
	    # ARROW-8750: Make sure that the compression=None option selects lz4 if
	    # it's available
	    if not pa.Codec.is_available('lz4_frame'):
	        pytest.skip("LZ4 compression support is not built in C++")

	    # some highly compressible data
	    zeros = pa.table([np.repeat(0, 100000)], names=['f0'])

	    def written_size(**options):
	        # Number of bytes produced by writing ``zeros`` with ``options``.
	        sink = io.BytesIO()
	        write_feather(zeros, sink, **options)
	        return len(sink.getvalue())

	    default_size = written_size()
	    uncompressed_size = written_size(compression='uncompressed')
	    assert default_size < uncompressed_size


	def test_v1_unsupported_types():
	    """Writing a list-typed column with ``version=1`` must raise TypeError."""
	    list_table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0'])

	    # Pattern the error message is matched against.
	    expected_message = ("Unsupported Feather V1 type: "
	                        "list<item: int64>. "
	                        "Use V2 format to serialize all Arrow types.")
	    sink = io.BytesIO()
	    with pytest.raises(TypeError, match=expected_message):
	        write_feather(list_table, sink, version=1)


	@pytest.mark.slow
	@pytest.mark.pandas
	def test_large_dataframe(version):
	    """Check a single ``np.arange`` column of 400 million rows."""
	    n_rows = 400000000
	    frame = pd.DataFrame({'A': np.arange(n_rows)})
	    _check_pandas_roundtrip(frame, version=version)


	@pytest.mark.large_memory
	@pytest.mark.pandas
	def test_chunked_binary_error_message():
	    """Version 1 must raise a descriptive error for an oversized column."""
	    # ARROW-3058: As Feather does not yet support chunked columns, we at least
	    # make sure it's clear to the user what is going on

	    # 2^31 + 1 bytes
	    one_mib = b'x' * (1 << 20)
	    payload = [b'x'] + [one_mib] * 2 * (1 << 10)
	    frame = pd.DataFrame({'byte_col': payload})

	    # Works fine with version 2
	    sink = io.BytesIO()
	    write_feather(frame, sink, version=2)
	    restored = read_feather(pa.BufferReader(sink.getvalue()))
	    assert_frame_equal(restored, frame)

	    # Pattern the version 1 error message is matched against.
	    too_big = ("'byte_col' exceeds 2GB maximum "
	               "capacity of a Feather binary column. This restriction "
	               "may be lifted in the future")
	    with pytest.raises(ValueError, match=too_big):
	        write_feather(frame, io.BytesIO(), version=1)


	def test_feather_without_pandas(tempdir, version):
	    """Check a small Arrow table at an explicit path, using no pandas."""
	    # ARROW-8345
	    ints = pa.table([pa.array([1, 2, 3])], names=['f0'])
	    target = str(tempdir / "data.feather")
	    _check_arrow_roundtrip(ints, target)


	@pytest.mark.pandas
	def test_read_column_selection(version):
	    """Columns may be selected by name or by position, in the given order."""
	    # ARROW-8641
	    frame = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c'])

	    # (``columns`` argument, labels of the expected frame)
	    selections = [
	        # select columns as string names or integer indices
	        (['a', 'c'], ['a', 'c']),
	        ([0, 2], ['a', 'c']),
	        # different order is followed
	        (['b', 'a'], ['b', 'a']),
	        ([1, 0], ['b', 'a']),
	    ]
	    for requested, resulting in selections:
	        _check_pandas_roundtrip(
	            frame, columns=requested, expected=frame[resulting],
	            version=version)


	def test_read_column_duplicated_selection(tempdir, version):
	    """A column requested twice must appear twice in the table read back."""
	    # duplicated columns in the column selection
	    source = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
	                      names=['a', 'b', 'c'])
	    target = str(tempdir / "data.feather")
	    write_feather(source, target, version=version)

	    wanted = pa.table([[1, 2, 3], [4, 5, 6], [1, 2, 3]],
	                      names=['a', 'b', 'a'])
	    by_name = ['a', 'b', 'a']
	    by_index = [0, 1, 0]
	    for selection in (by_name, by_index):
	        loaded = read_table(target, columns=selection)
	        assert loaded.equals(wanted)


	def test_read_column_duplicated_in_file(tempdir):
	    """Read a version 2 file that holds two columns with the same name."""
	    # duplicated columns in feather file (only works for feather v2)
	    dup_table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
	                         names=['a', 'b', 'a'])
	    target = str(tempdir / "data.feather")
	    write_feather(dup_table, target, version=2)

	    # no selection works fine
	    assert read_table(target).equals(dup_table)

	    # selection with indices works
	    by_index = read_table(target, columns=[0, 2])
	    assert by_index.column_names == ['a', 'a']

	    # selection with column names errors
	    with pytest.raises(ValueError):
	        read_table(target, columns=['a', 'b'])


	def test_nested_types(compression):
	    """Check struct, list and list-of-list columns for each compression."""
	    # https://issues.apache.org/jira/browse/ARROW-8860
	    struct_col = pa.StructArray.from_arrays(
	        [[0, 1, 2], [1, 2, 3]], names=["f1", "f2"])
	    list_col = pa.array([[1, 2], [3, 4]])
	    nested_list_col = pa.array([[[1, 2], [3, 4]], [[5, 6], None]])

	    # Each column is wrapped in its own single-column table named 'col'.
	    for column in (struct_col, list_col, nested_list_col):
	        _check_arrow_roundtrip(pa.table({'col': column}),
	                               compression=compression)


	@h.given(past.all_tables, st.sampled_from(["uncompressed", "lz4", "zstd"]))
	def test_roundtrip(table, compression):
	    """Property-based check driven by hypothesis.

	    ``table`` is drawn from ``past.all_tables`` and ``compression`` from the
	    three listed codec names; each drawn pair is handed to the Arrow
	    round-trip check.
	    """
	    _check_arrow_roundtrip(table, compression=compression)


	@pytest.mark.lz4
	def test_feather_v017_experimental_compression_backward_compatibility(datadir):
	    """Read a stored LZ4-compressed file and compare it to a known table."""
	    # ARROW-11163 - ensure newer pyarrow versions can read the old feather
	    # files from version 0.17.0 with experimental compression support (before
	    # it was officially added to IPC format in 1.0.0)

	    # file generated with:
	    #     table = pa.table({'a': range(5)})
	    #     from pyarrow import feather
	    #     feather.write_feather(
	    #         table, "v0.17.0.version=2-compression=lz4.feather",
	    #         compression="lz4", version=2)
	    wanted = pa.table({'a': range(5)})
	    legacy_file = datadir / "v0.17.0.version=2-compression=lz4.feather"
	    loaded = read_table(legacy_file)
	    assert loaded.equals(wanted)