| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import io |
| import os |
| import sys |
| import tempfile |
| import pytest |
| import hypothesis as h |
| import hypothesis.strategies as st |
| |
| import numpy as np |
| |
| import pyarrow as pa |
| import pyarrow.tests.strategies as past |
| from pyarrow.feather import (read_feather, write_feather, read_table, |
| FeatherDataset) |
| |
| |
| try: |
| from pandas.testing import assert_frame_equal |
| import pandas as pd |
| import pyarrow.pandas_compat |
| except ImportError: |
| pass |
| |
| |
| @pytest.fixture(scope='module') |
| def datadir(base_datadir): |
| return base_datadir / 'feather' |
| |
| |
| def random_path(prefix='feather_'): |
| return tempfile.mktemp(prefix=prefix) |
| |
| |
| @pytest.fixture(scope="module", params=[1, 2]) |
| def version(request): |
| yield request.param |
| |
| |
| @pytest.fixture(scope="module", params=[None, "uncompressed", "lz4", "zstd"]) |
| def compression(request): |
| if request.param in ['lz4', 'zstd'] and not pa.Codec.is_available( |
| request.param): |
| pytest.skip(f'{request.param} is not available') |
| yield request.param |
| |
| |
| TEST_FILES = None |
| |
| |
| def setup_module(module): |
| global TEST_FILES |
| TEST_FILES = [] |
| |
| |
| def teardown_module(module): |
| for path in TEST_FILES: |
| try: |
| os.remove(path) |
| except os.error: |
| pass |
| |
| |
| @pytest.mark.pandas |
| def test_file_not_exist(): |
| with pytest.raises(pa.ArrowIOError): |
| read_feather('test_invalid_file') |
| |
| |
| def _check_pandas_roundtrip(df, expected=None, path=None, |
| columns=None, use_threads=False, |
| version=None, compression=None, |
| compression_level=None): |
| if path is None: |
| path = random_path() |
| |
| TEST_FILES.append(path) |
| write_feather(df, path, compression=compression, |
| compression_level=compression_level, version=version) |
| if not os.path.exists(path): |
| raise Exception('file not written') |
| |
| result = read_feather(path, columns, use_threads=use_threads) |
| if expected is None: |
| expected = df |
| |
| assert_frame_equal(result, expected) |
| |
| |
| def _check_arrow_roundtrip(table, path=None, compression=None): |
| if path is None: |
| path = random_path() |
| |
| TEST_FILES.append(path) |
| write_feather(table, path, compression=compression) |
| if not os.path.exists(path): |
| raise Exception('file not written') |
| |
| result = read_table(path) |
| assert result.equals(table) |
| |
| |
| def _assert_error_on_write(df, exc, path=None, version=2): |
| # check that we are raising the exception |
| # on writing |
| |
| if path is None: |
| path = random_path() |
| |
| TEST_FILES.append(path) |
| |
| def f(): |
| write_feather(df, path, version=version) |
| |
| pytest.raises(exc, f) |
| |
| |
| def test_dataset(version): |
| num_values = (100, 100) |
| num_files = 5 |
| paths = [random_path() for i in range(num_files)] |
| data = { |
| "col_" + str(i): np.random.randn(num_values[0]) |
| for i in range(num_values[1]) |
| } |
| table = pa.table(data) |
| |
| TEST_FILES.extend(paths) |
| for index, path in enumerate(paths): |
| rows = ( |
| index * (num_values[0] // num_files), |
| (index + 1) * (num_values[0] // num_files), |
| ) |
| |
| write_feather(table[rows[0]: rows[1]], path, version=version) |
| |
| data = FeatherDataset(paths).read_table() |
| assert data.equals(table) |
| |
| |
| @pytest.mark.pandas |
| def test_float_no_nulls(version): |
| data = {} |
| numpy_dtypes = ['f4', 'f8'] |
| num_values = 100 |
| |
| for dtype in numpy_dtypes: |
| values = np.random.randn(num_values) |
| data[dtype] = values.astype(dtype) |
| |
| df = pd.DataFrame(data) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_read_table(version): |
| num_values = (100, 100) |
| path = random_path() |
| |
| TEST_FILES.append(path) |
| |
| values = np.random.randint(0, 100, size=num_values) |
| columns = ['col_' + str(i) for i in range(100)] |
| table = pa.Table.from_arrays(values, columns) |
| |
| write_feather(table, path, version=version) |
| |
| result = read_table(path) |
| assert result.equals(table) |
| |
| # Test without memory mapping |
| result = read_table(path, memory_map=False) |
| assert result.equals(table) |
| |
| result = read_feather(path, memory_map=False) |
| assert_frame_equal(table.to_pandas(), result) |
| |
| |
| @pytest.mark.pandas |
| def test_float_nulls(version): |
| num_values = 100 |
| |
| path = random_path() |
| TEST_FILES.append(path) |
| |
| null_mask = np.random.randint(0, 10, size=num_values) < 3 |
| dtypes = ['f4', 'f8'] |
| expected_cols = [] |
| |
| arrays = [] |
| for name in dtypes: |
| values = np.random.randn(num_values).astype(name) |
| arrays.append(pa.array(values, mask=null_mask)) |
| |
| values[null_mask] = np.nan |
| |
| expected_cols.append(values) |
| |
| table = pa.table(arrays, names=dtypes) |
| _check_arrow_roundtrip(table) |
| |
| df = table.to_pandas() |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_integer_no_nulls(version): |
| data, arr = {}, [] |
| |
| numpy_dtypes = ['i1', 'i2', 'i4', 'i8', |
| 'u1', 'u2', 'u4', 'u8'] |
| num_values = 100 |
| |
| for dtype in numpy_dtypes: |
| values = np.random.randint(0, 100, size=num_values) |
| data[dtype] = values.astype(dtype) |
| arr.append(values.astype(dtype)) |
| |
| df = pd.DataFrame(data) |
| _check_pandas_roundtrip(df, version=version) |
| |
| table = pa.table(arr, names=numpy_dtypes) |
| _check_arrow_roundtrip(table) |
| |
| |
| @pytest.mark.pandas |
| def test_platform_numpy_integers(version): |
| data = {} |
| |
| numpy_dtypes = ['longlong'] |
| num_values = 100 |
| |
| for dtype in numpy_dtypes: |
| values = np.random.randint(0, 100, size=num_values) |
| data[dtype] = values.astype(dtype) |
| |
| df = pd.DataFrame(data) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_integer_with_nulls(version): |
| # pandas requires upcast to float dtype |
| path = random_path() |
| TEST_FILES.append(path) |
| |
| int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] |
| num_values = 100 |
| |
| arrays = [] |
| null_mask = np.random.randint(0, 10, size=num_values) < 3 |
| expected_cols = [] |
| for name in int_dtypes: |
| values = np.random.randint(0, 100, size=num_values) |
| arrays.append(pa.array(values, mask=null_mask)) |
| |
| expected = values.astype('f8') |
| expected[null_mask] = np.nan |
| |
| expected_cols.append(expected) |
| |
| table = pa.table(arrays, names=int_dtypes) |
| _check_arrow_roundtrip(table) |
| |
| df = table.to_pandas() |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_boolean_no_nulls(version): |
| num_values = 100 |
| |
| np.random.seed(0) |
| |
| df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_boolean_nulls(version): |
| # pandas requires upcast to object dtype |
| path = random_path() |
| TEST_FILES.append(path) |
| |
| num_values = 100 |
| np.random.seed(0) |
| |
| mask = np.random.randint(0, 10, size=num_values) < 3 |
| values = np.random.randint(0, 10, size=num_values) < 5 |
| |
| table = pa.table([pa.array(values, mask=mask)], names=['bools']) |
| _check_arrow_roundtrip(table) |
| |
| df = table.to_pandas() |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| def test_buffer_bounds_error(version): |
| # ARROW-1676 |
| path = random_path() |
| TEST_FILES.append(path) |
| |
| for i in range(16, 256): |
| table = pa.Table.from_arrays( |
| [pa.array([None] + list(range(i)), type=pa.float64())], |
| names=["arr"] |
| ) |
| _check_arrow_roundtrip(table) |
| |
| |
| def test_boolean_object_nulls(version): |
| repeats = 100 |
| table = pa.Table.from_arrays( |
| [np.array([False, None, True] * repeats, dtype=object)], |
| names=["arr"] |
| ) |
| _check_arrow_roundtrip(table) |
| |
| |
| @pytest.mark.pandas |
| def test_delete_partial_file_on_error(version): |
| if sys.platform == 'win32': |
| pytest.skip('Windows hangs on to file handle for some reason') |
| |
| class CustomClass: |
| pass |
| |
| # strings will fail |
| df = pd.DataFrame( |
| { |
| 'numbers': range(5), |
| 'strings': [b'foo', None, 'bar', CustomClass(), np.nan]}, |
| columns=['numbers', 'strings']) |
| |
| path = random_path() |
| try: |
| write_feather(df, path, version=version) |
| except Exception: |
| pass |
| |
| assert not os.path.exists(path) |
| |
| |
| @pytest.mark.pandas |
| def test_strings(version): |
| repeats = 1000 |
| |
| # Mixed bytes, unicode, strings coerced to binary |
| values = [b'foo', None, 'bar', 'qux', np.nan] |
| df = pd.DataFrame({'strings': values * repeats}) |
| |
| ex_values = [b'foo', None, b'bar', b'qux', np.nan] |
| expected = pd.DataFrame({'strings': ex_values * repeats}) |
| _check_pandas_roundtrip(df, expected, version=version) |
| |
| # embedded nulls are ok |
| values = ['foo', None, 'bar', 'qux', None] |
| df = pd.DataFrame({'strings': values * repeats}) |
| expected = pd.DataFrame({'strings': values * repeats}) |
| _check_pandas_roundtrip(df, expected, version=version) |
| |
| values = ['foo', None, 'bar', 'qux', np.nan] |
| df = pd.DataFrame({'strings': values * repeats}) |
| expected = pd.DataFrame({'strings': values * repeats}) |
| _check_pandas_roundtrip(df, expected, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_empty_strings(version): |
| df = pd.DataFrame({'strings': [''] * 10}) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_all_none(version): |
| df = pd.DataFrame({'all_none': [None] * 10}) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_all_null_category(version): |
| # ARROW-1188 |
| df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)}) |
| df = df.assign(B=df.B.astype("category")) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_multithreaded_read(version): |
| data = {'c{}'.format(i): [''] * 10 |
| for i in range(100)} |
| df = pd.DataFrame(data) |
| _check_pandas_roundtrip(df, use_threads=True, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_nan_as_null(version): |
| # Create a nan that is not numpy.nan |
| values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10) |
| df = pd.DataFrame({'strings': values}) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_category(version): |
| repeats = 1000 |
| values = ['foo', None, 'bar', 'qux', np.nan] |
| df = pd.DataFrame({'strings': values * repeats}) |
| df['strings'] = df['strings'].astype('category') |
| |
| values = ['foo', None, 'bar', 'qux', None] |
| expected = pd.DataFrame({'strings': pd.Categorical(values * repeats)}) |
| _check_pandas_roundtrip(df, expected, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_timestamp(version): |
| df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)}) |
| df['with_tz'] = (df.naive.dt.tz_localize('utc') |
| .dt.tz_convert('America/Los_Angeles')) |
| |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_timestamp_with_nulls(version): |
| df = pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1), |
| None, |
| pd.Timestamp(2016, 1, 3)]}) |
| df['with_tz'] = df.test.dt.tz_localize('utc') |
| |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| @pytest.mark.xfail(reason="not supported", raises=TypeError) |
| def test_timedelta_with_nulls_v1(): |
| df = pd.DataFrame({'test': [pd.Timedelta('1 day'), |
| None, |
| pd.Timedelta('3 day')]}) |
| _check_pandas_roundtrip(df, version=1) |
| |
| |
| @pytest.mark.pandas |
| def test_timedelta_with_nulls(): |
| df = pd.DataFrame({'test': [pd.Timedelta('1 day'), |
| None, |
| pd.Timedelta('3 day')]}) |
| _check_pandas_roundtrip(df, version=2) |
| |
| |
| @pytest.mark.pandas |
| def test_out_of_float64_timestamp_with_nulls(version): |
| df = pd.DataFrame( |
| {'test': pd.DatetimeIndex([1451606400000000001, |
| None, 14516064000030405])}) |
| df['with_tz'] = df.test.dt.tz_localize('utc') |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_non_string_columns(version): |
| df = pd.DataFrame({0: [1, 2, 3, 4], |
| 1: [True, False, True, False]}) |
| |
| expected = df.rename(columns=str) |
| _check_pandas_roundtrip(df, expected, version=version) |
| |
| |
| @pytest.mark.pandas |
| @pytest.mark.skipif(not os.path.supports_unicode_filenames, |
| reason='unicode filenames not supported') |
| def test_unicode_filename(version): |
| # GH #209 |
| name = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8') |
| df = pd.DataFrame({'foo': [1, 2, 3, 4]}) |
| _check_pandas_roundtrip(df, path=random_path(prefix=name), |
| version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_read_columns(version): |
| df = pd.DataFrame({ |
| 'foo': [1, 2, 3, 4], |
| 'boo': [5, 6, 7, 8], |
| 'woo': [1, 3, 5, 7] |
| }) |
| expected = df[['boo', 'woo']] |
| |
| _check_pandas_roundtrip(df, expected, version=version, |
| columns=['boo', 'woo']) |
| |
| |
| def test_overwritten_file(version): |
| path = random_path() |
| TEST_FILES.append(path) |
| |
| num_values = 100 |
| np.random.seed(0) |
| |
| values = np.random.randint(0, 10, size=num_values) |
| |
| table = pa.table({'ints': values}) |
| write_feather(table, path) |
| |
| table = pa.table({'more_ints': values[0:num_values//2]}) |
| _check_arrow_roundtrip(table, path=path) |
| |
| |
| @pytest.mark.pandas |
| def test_filelike_objects(version): |
| buf = io.BytesIO() |
| |
| # the copy makes it non-strided |
| df = pd.DataFrame(np.arange(12).reshape(4, 3), |
| columns=['a', 'b', 'c']).copy() |
| write_feather(df, buf, version=version) |
| |
| buf.seek(0) |
| |
| result = read_feather(buf) |
| assert_frame_equal(result, df) |
| |
| |
| @pytest.mark.pandas |
| @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") |
| @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") |
| def test_sparse_dataframe(version): |
| if not pa.pandas_compat._pandas_api.has_sparse: |
| pytest.skip("version of pandas does not support SparseDataFrame") |
| # GH #221 |
| data = {'A': [0, 1, 2], |
| 'B': [1, 0, 1]} |
| df = pd.DataFrame(data).to_sparse(fill_value=1) |
| expected = df.to_dense() |
| _check_pandas_roundtrip(df, expected, version=version) |
| |
| |
| @pytest.mark.pandas |
| def test_duplicate_columns_pandas(): |
| |
| # https://github.com/wesm/feather/issues/53 |
| # not currently able to handle duplicate columns |
| df = pd.DataFrame(np.arange(12).reshape(4, 3), |
| columns=list('aaa')).copy() |
| _assert_error_on_write(df, ValueError) |
| |
| |
| def test_duplicate_columns(): |
| # only works for version 2 |
| table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'a', 'b']) |
| _check_arrow_roundtrip(table) |
| _assert_error_on_write(table, ValueError, version=1) |
| |
| |
| @pytest.mark.pandas |
| def test_unsupported(): |
| # https://github.com/wesm/feather/issues/240 |
| # serializing actual python objects |
| |
| # custom python objects |
| class A: |
| pass |
| |
| df = pd.DataFrame({'a': [A(), A()]}) |
| _assert_error_on_write(df, ValueError) |
| |
| # non-strings |
| df = pd.DataFrame({'a': ['a', 1, 2.0]}) |
| _assert_error_on_write(df, TypeError) |
| |
| |
| @pytest.mark.pandas |
| def test_v2_set_chunksize(): |
| df = pd.DataFrame({'A': np.arange(1000)}) |
| table = pa.table(df) |
| |
| buf = io.BytesIO() |
| write_feather(table, buf, chunksize=250, version=2) |
| |
| result = buf.getvalue() |
| |
| ipc_file = pa.ipc.open_file(pa.BufferReader(result)) |
| assert ipc_file.num_record_batches == 4 |
| assert len(ipc_file.get_batch(0)) == 250 |
| |
| |
| @pytest.mark.pandas |
| @pytest.mark.lz4 |
| @pytest.mark.snappy |
| @pytest.mark.zstd |
| def test_v2_compression_options(): |
| df = pd.DataFrame({'A': np.arange(1000)}) |
| |
| cases = [ |
| # compression, compression_level |
| ('uncompressed', None), |
| ('lz4', None), |
| ('zstd', 1), |
| ('zstd', 10) |
| ] |
| |
| for compression, compression_level in cases: |
| _check_pandas_roundtrip(df, compression=compression, |
| compression_level=compression_level) |
| |
| buf = io.BytesIO() |
| |
| # LZ4 doesn't support compression_level |
| with pytest.raises(pa.ArrowInvalid, |
| match="doesn't support setting a compression level"): |
| write_feather(df, buf, compression='lz4', compression_level=10) |
| |
| # Trying to compress with V1 |
| with pytest.raises( |
| ValueError, |
| match="Feather V1 files do not support compression option"): |
| write_feather(df, buf, compression='lz4', version=1) |
| |
| # Trying to set chunksize with V1 |
| with pytest.raises( |
| ValueError, |
| match="Feather V1 files do not support chunksize option"): |
| write_feather(df, buf, chunksize=4096, version=1) |
| |
| # Unsupported compressor |
| with pytest.raises(ValueError, |
| match='compression="snappy" not supported'): |
| write_feather(df, buf, compression='snappy') |
| |
| |
| def test_v2_lz4_default_compression(): |
| # ARROW-8750: Make sure that the compression=None option selects lz4 if |
| # it's available |
| if not pa.Codec.is_available('lz4_frame'): |
| pytest.skip("LZ4 compression support is not built in C++") |
| |
| # some highly compressible data |
| t = pa.table([np.repeat(0, 100000)], names=['f0']) |
| |
| buf = io.BytesIO() |
| write_feather(t, buf) |
| default_result = buf.getvalue() |
| |
| buf = io.BytesIO() |
| write_feather(t, buf, compression='uncompressed') |
| uncompressed_result = buf.getvalue() |
| |
| assert len(default_result) < len(uncompressed_result) |
| |
| |
| def test_v1_unsupported_types(): |
| table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0']) |
| |
| buf = io.BytesIO() |
| with pytest.raises(TypeError, |
| match=("Unsupported Feather V1 type: " |
| "list<item: int64>. " |
| "Use V2 format to serialize all Arrow types.")): |
| write_feather(table, buf, version=1) |
| |
| |
| @pytest.mark.slow |
| @pytest.mark.pandas |
| def test_large_dataframe(version): |
| df = pd.DataFrame({'A': np.arange(400000000)}) |
| _check_pandas_roundtrip(df, version=version) |
| |
| |
| @pytest.mark.large_memory |
| @pytest.mark.pandas |
| def test_chunked_binary_error_message(): |
| # ARROW-3058: As Feather does not yet support chunked columns, we at least |
| # make sure it's clear to the user what is going on |
| |
| # 2^31 + 1 bytes |
| values = [b'x'] + [ |
| b'x' * (1 << 20) |
| ] * 2 * (1 << 10) |
| df = pd.DataFrame({'byte_col': values}) |
| |
| # Works fine with version 2 |
| buf = io.BytesIO() |
| write_feather(df, buf, version=2) |
| result = read_feather(pa.BufferReader(buf.getvalue())) |
| assert_frame_equal(result, df) |
| |
| with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum " |
| "capacity of a Feather binary column. This restriction " |
| "may be lifted in the future"): |
| write_feather(df, io.BytesIO(), version=1) |
| |
| |
| def test_feather_without_pandas(tempdir, version): |
| # ARROW-8345 |
| table = pa.table([pa.array([1, 2, 3])], names=['f0']) |
| path = str(tempdir / "data.feather") |
| _check_arrow_roundtrip(table, path) |
| |
| |
| @pytest.mark.pandas |
| def test_read_column_selection(version): |
| # ARROW-8641 |
| df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c']) |
| |
| # select columns as string names or integer indices |
| _check_pandas_roundtrip( |
| df, columns=['a', 'c'], expected=df[['a', 'c']], version=version) |
| _check_pandas_roundtrip( |
| df, columns=[0, 2], expected=df[['a', 'c']], version=version) |
| |
| # different order is followed |
| _check_pandas_roundtrip( |
| df, columns=['b', 'a'], expected=df[['b', 'a']], version=version) |
| _check_pandas_roundtrip( |
| df, columns=[1, 0], expected=df[['b', 'a']], version=version) |
| |
| |
| def test_read_column_duplicated_selection(tempdir, version): |
| # duplicated columns in the column selection |
| table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'c']) |
| path = str(tempdir / "data.feather") |
| write_feather(table, path, version=version) |
| |
| expected = pa.table([[1, 2, 3], [4, 5, 6], [1, 2, 3]], |
| names=['a', 'b', 'a']) |
| for col_selection in [['a', 'b', 'a'], [0, 1, 0]]: |
| result = read_table(path, columns=col_selection) |
| assert result.equals(expected) |
| |
| |
| def test_read_column_duplicated_in_file(tempdir): |
| # duplicated columns in feather file (only works for feather v2) |
| table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'a']) |
| path = str(tempdir / "data.feather") |
| write_feather(table, path, version=2) |
| |
| # no selection works fine |
| result = read_table(path) |
| assert result.equals(table) |
| |
| # selection with indices works |
| result = read_table(path, columns=[0, 2]) |
| assert result.column_names == ['a', 'a'] |
| |
| # selection with column names errors |
| with pytest.raises(ValueError): |
| read_table(path, columns=['a', 'b']) |
| |
| |
| def test_nested_types(compression): |
| # https://issues.apache.org/jira/browse/ARROW-8860 |
| table = pa.table({'col': pa.StructArray.from_arrays( |
| [[0, 1, 2], [1, 2, 3]], names=["f1", "f2"])}) |
| _check_arrow_roundtrip(table, compression=compression) |
| |
| table = pa.table({'col': pa.array([[1, 2], [3, 4]])}) |
| _check_arrow_roundtrip(table, compression=compression) |
| |
| table = pa.table({'col': pa.array([[[1, 2], [3, 4]], [[5, 6], None]])}) |
| _check_arrow_roundtrip(table, compression=compression) |
| |
| |
| @h.given(past.all_tables, st.sampled_from(["uncompressed", "lz4", "zstd"])) |
| def test_roundtrip(table, compression): |
| _check_arrow_roundtrip(table, compression=compression) |
| |
| |
| @pytest.mark.lz4 |
| def test_feather_v017_experimental_compression_backward_compatibility(datadir): |
| # ARROW-11163 - ensure newer pyarrow versions can read the old feather |
| # files from version 0.17.0 with experimental compression support (before |
| # it was officially added to IPC format in 1.0.0) |
| |
| # file generated with: |
| # table = pa.table({'a': range(5)}) |
| # from pyarrow import feather |
| # feather.write_feather( |
| # table, "v0.17.0.version=2-compression=lz4.feather", |
| # compression="lz4", version=2) |
| expected = pa.table({'a': range(5)}) |
| result = read_table(datadir / "v0.17.0.version=2-compression=lz4.feather") |
| assert result.equals(expected) |