# (git blob f26f7ca95b6938c2d5f23b1bf9c253d3b611b0a6)
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
import os
import sys
import tempfile
import unittest
import pytest
from numpy.testing import assert_array_equal
import numpy as np
import pyarrow as pa
from pyarrow.feather import (read_feather, write_feather,
read_table, FeatherReader, FeatherDataset)
from pyarrow.lib import FeatherWriter
try:
from pandas.util.testing import assert_frame_equal
import pandas as pd
except ImportError:
pass
# TODO(wesm): The Feather tests currently are tangled with pandas
# dependency. We should isolate the pandas-depending parts and mark those with
# pytest.mark.pandas
# Until that isolation happens, mark every test in this module as pandas-only.
pytestmark = pytest.mark.pandas
def random_path(prefix='feather_'):
    """Return a unique temporary file path without creating the file.

    NOTE: ``tempfile.mktemp`` is used on purpose — several tests require
    a path that does not yet exist on disk (e.g. to check that a failed
    write leaves no file behind).
    """
    path = tempfile.mktemp(prefix=prefix)
    return path
class TestFeatherReader(unittest.TestCase):
    """Round-trip and error-handling tests for the Feather format.

    The common pattern is: write a pandas DataFrame to a temporary
    Feather file, read it back, and assert the result equals what was
    written (optionally with an adjusted ``expected`` frame when the
    round trip is lossy, e.g. unicode coerced to binary).  Temporary
    paths are tracked in ``self.test_files`` and removed in tearDown.
    """

    def setUp(self):
        # Paths written during a test; cleaned up in tearDown.
        self.test_files = []

    def tearDown(self):
        # Best-effort cleanup: ignore files that were never created or
        # were already removed (os.error is an alias of OSError).
        for path in self.test_files:
            try:
                os.remove(path)
            except os.error:
                pass

    def test_file_not_exist(self):
        # Opening a nonexistent path must raise ArrowIOError, not crash.
        with pytest.raises(pa.ArrowIOError):
            FeatherReader('test_invalid_file')

    def _get_null_counts(self, path, columns=None):
        """Return the per-column null counts for the file at *path*.

        If *columns* is given, only columns whose name is in that
        collection are included, in file order.
        """
        reader = FeatherReader(path)
        counts = []
        for i in range(reader.num_columns):
            col = reader.get_column(i)
            if columns is None or col.name in columns:
                counts.append(col.null_count)
        return counts

    def _check_pandas_roundtrip(self, df, expected=None, path=None,
                                columns=None, null_counts=None,
                                use_threads=False):
        """Write *df* to Feather, read it back and compare.

        *expected* defaults to *df* itself; pass a different frame when
        the round trip is intentionally lossy.  *null_counts* defaults
        to all-zero and is checked against the file's column metadata.
        """
        if path is None:
            path = random_path()
        self.test_files.append(path)
        write_feather(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')
        result = read_feather(path, columns, use_threads=use_threads)
        if expected is None:
            expected = df
        assert_frame_equal(result, expected)
        if null_counts is None:
            null_counts = np.zeros(len(expected.columns))
        np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                      null_counts)

    def _assert_error_on_write(self, df, exc, path=None):
        # check that we are raising the exception
        # on writing
        if path is None:
            path = random_path()
        self.test_files.append(path)

        def f():
            write_feather(df, path)
        pytest.raises(exc, f)

    def test_dataset(self):
        """Multiple files read through FeatherDataset concatenate back
        into the original frame."""
        num_values = (100, 100)
        num_files = 5
        paths = [random_path() for i in range(num_files)]
        df = pd.DataFrame(np.random.randn(*num_values),
                          columns=['col_' + str(i)
                                   for i in range(num_values[1])])

        self.test_files.extend(paths)
        for index, path in enumerate(paths):
            # Each file holds one contiguous row slice of df.
            rows = (index * (num_values[0] // num_files),
                    (index + 1) * (num_values[0] // num_files))

            writer = FeatherWriter()
            writer.open(path)

            for col in range(num_values[1]):
                writer.write_array(df.columns[col],
                                   df.iloc[rows[0]:rows[1], col])

            writer.close()

        data = FeatherDataset(paths).read_pandas()
        assert_frame_equal(data, df)

    def test_num_columns_attr(self):
        # num_columns reflects the written frame, including zero columns.
        df0 = pd.DataFrame({})
        df1 = pd.DataFrame({
            'foo': [1, 2, 3, 4, 5]
        })
        df2 = pd.DataFrame({
            'foo': [1, 2, 3, 4, 5],
            'bar': [1, 2, 3, 4, 5]
        })
        for df, ncols in zip([df0, df1, df2], [0, 1, 2]):
            path = random_path()
            self.test_files.append(path)
            write_feather(df, path)
            reader = FeatherReader(path)
            assert reader.num_columns == ncols

    def test_num_rows_attr(self):
        # num_rows reflects the written frame; an empty frame reads as 0.
        df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
        path = random_path()
        self.test_files.append(path)
        write_feather(df, path)
        reader = FeatherReader(path)
        assert reader.num_rows == len(df)

        df = pd.DataFrame({})
        path = random_path()
        self.test_files.append(path)
        write_feather(df, path)
        reader = FeatherReader(path)
        assert reader.num_rows == 0

    def test_float_no_nulls(self):
        # float32/float64 columns round-trip exactly when null-free.
        data = {}
        numpy_dtypes = ['f4', 'f8']
        num_values = 100

        for dtype in numpy_dtypes:
            values = np.random.randn(num_values)
            data[dtype] = values.astype(dtype)

        df = pd.DataFrame(data)
        self._check_pandas_roundtrip(df)

    def test_read_table(self):
        """read_table returns a pyarrow Table equal to the written data."""
        num_values = (100, 100)
        path = random_path()
        self.test_files.append(path)

        writer = FeatherWriter()
        writer.open(path)

        values = np.random.randint(0, 100, size=num_values)

        for i in range(100):
            writer.write_array('col_' + str(i), values[:, i])

        writer.close()

        data = pd.DataFrame(values,
                            columns=['col_' + str(i) for i in range(100)])
        table = pa.Table.from_pandas(data)

        result = read_table(path)

        assert_frame_equal(table.to_pandas(), result.to_pandas())

    def test_float_nulls(self):
        """Floats written with a null mask read back as NaN, and the
        file's null counts match the mask."""
        num_values = 100

        path = random_path()
        self.test_files.append(path)
        writer = FeatherWriter()
        writer.open(path)

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = ['f4', 'f8']
        expected_cols = []
        null_counts = []
        for name in dtypes:
            values = np.random.randn(num_values).astype(name)
            writer.write_array(name, values, null_mask)

            # Masked entries surface as NaN on the pandas side.
            values[null_mask] = np.nan

            expected_cols.append(values)
            null_counts.append(null_mask.sum())

        writer.close()

        ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
                                columns=dtypes)

        result = read_feather(path)
        assert_frame_equal(result, ex_frame)

        assert_array_equal(self._get_null_counts(path), null_counts)

    def test_integer_no_nulls(self):
        # All fixed-width integer dtypes round-trip when null-free.
        data = {}

        numpy_dtypes = ['i1', 'i2', 'i4', 'i8',
                        'u1', 'u2', 'u4', 'u8']
        num_values = 100

        for dtype in numpy_dtypes:
            values = np.random.randint(0, 100, size=num_values)
            data[dtype] = values.astype(dtype)

        df = pd.DataFrame(data)
        self._check_pandas_roundtrip(df)

    def test_platform_numpy_integers(self):
        # Platform-specific aliases like 'longlong' must also be accepted.
        data = {}

        numpy_dtypes = ['longlong']
        num_values = 100

        for dtype in numpy_dtypes:
            values = np.random.randint(0, 100, size=num_values)
            data[dtype] = values.astype(dtype)

        df = pd.DataFrame(data)
        self._check_pandas_roundtrip(df)

    def test_integer_with_nulls(self):
        # pandas requires upcast to float dtype
        path = random_path()
        self.test_files.append(path)

        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
        num_values = 100

        writer = FeatherWriter()
        writer.open(path)

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        expected_cols = []
        for name in int_dtypes:
            values = np.random.randint(0, 100, size=num_values)
            writer.write_array(name, values, null_mask)

            # Integers with nulls come back as float64 with NaN holes.
            expected = values.astype('f8')
            expected[null_mask] = np.nan

            expected_cols.append(expected)

        ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                                columns=int_dtypes)

        writer.close()

        result = read_feather(path)
        assert_frame_equal(result, ex_frame)

    def test_boolean_no_nulls(self):
        # Plain boolean columns round-trip exactly.
        num_values = 100

        np.random.seed(0)

        df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
        self._check_pandas_roundtrip(df)

    def test_boolean_nulls(self):
        # pandas requires upcast to object dtype
        path = random_path()
        self.test_files.append(path)

        num_values = 100
        np.random.seed(0)

        writer = FeatherWriter()
        writer.open(path)

        mask = np.random.randint(0, 10, size=num_values) < 3
        values = np.random.randint(0, 10, size=num_values) < 5
        writer.write_array('bools', values, mask)

        expected = values.astype(object)
        expected[mask] = None

        writer.close()

        ex_frame = pd.DataFrame({'bools': expected})

        result = read_feather(path)
        assert_frame_equal(result, ex_frame)

    def test_buffer_bounds_error(self):
        # ARROW-1676
        path = random_path()
        self.test_files.append(path)

        # Sweep a range of lengths to exercise buffer-boundary handling
        # for a float64 column with a leading null.
        for i in range(16, 256):
            values = pa.array([None] + list(range(i)), type=pa.float64())

            writer = FeatherWriter()
            writer.open(path)

            writer.write_array('arr', values)
            writer.close()

            result = read_feather(path)
            expected = pd.DataFrame({'arr': values.to_pandas()})
            assert_frame_equal(result, expected)

        self._check_pandas_roundtrip(expected, null_counts=[1])

    def test_boolean_object_nulls(self):
        # object-dtype booleans with None round-trip, preserving nulls.
        repeats = 100
        arr = np.array([False, None, True] * repeats, dtype=object)
        df = pd.DataFrame({'bools': arr})
        self._check_pandas_roundtrip(df, null_counts=[1 * repeats])

    def test_delete_partial_file_on_error(self):
        """A failed write must not leave a partial file on disk."""
        if sys.platform == 'win32':
            pytest.skip('Windows hangs on to file handle for some reason')

        class CustomClass(object):
            pass

        # strings will fail
        df = pd.DataFrame(
            {
                'numbers': range(5),
                'strings': [b'foo', None, u'bar', CustomClass(), np.nan]},
            columns=['numbers', 'strings'])

        path = random_path()
        try:
            write_feather(df, path)
        except Exception:
            pass

        assert not os.path.exists(path)

    def test_strings(self):
        repeats = 1000

        # Mixed bytes, unicode, strings coerced to binary
        values = [b'foo', None, u'bar', 'qux', np.nan]
        df = pd.DataFrame({'strings': values * repeats})

        ex_values = [b'foo', None, b'bar', b'qux', np.nan]
        expected = pd.DataFrame({'strings': ex_values * repeats})
        self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats])

        # embedded nulls are ok
        values = ['foo', None, 'bar', 'qux', None]
        df = pd.DataFrame({'strings': values * repeats})
        expected = pd.DataFrame({'strings': values * repeats})
        self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats])

        # np.nan is treated as a null just like None.
        values = ['foo', None, 'bar', 'qux', np.nan]
        df = pd.DataFrame({'strings': values * repeats})
        expected = pd.DataFrame({'strings': values * repeats})
        self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats])

    def test_empty_strings(self):
        # Empty strings are values, not nulls.
        df = pd.DataFrame({'strings': [''] * 10})
        self._check_pandas_roundtrip(df)

    def test_all_none(self):
        # An all-null column round-trips with a full null count.
        df = pd.DataFrame({'all_none': [None] * 10})
        self._check_pandas_roundtrip(df, null_counts=[10])

    def test_all_null_category(self):
        # ARROW-1188
        df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
        df = df.assign(B=df.B.astype("category"))
        self._check_pandas_roundtrip(df, null_counts=[0, 3])

    def test_multithreaded_read(self):
        # Many columns so the threaded read path actually fans out.
        data = {'c{0}'.format(i): [''] * 10
                for i in range(100)}
        df = pd.DataFrame(data)
        self._check_pandas_roundtrip(df, use_threads=True)

    def test_nan_as_null(self):
        # Create a nan that is not numpy.nan
        # (np.nan * 2 is a distinct NaN object; both must read as null)
        values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10)
        df = pd.DataFrame({'strings': values})
        self._check_pandas_roundtrip(df)

    def test_category(self):
        # Categorical columns round-trip as Categorical; NaN categories
        # normalize to None.
        repeats = 1000
        values = ['foo', None, u'bar', 'qux', np.nan]
        df = pd.DataFrame({'strings': values * repeats})
        df['strings'] = df['strings'].astype('category')

        values = ['foo', None, 'bar', 'qux', None]
        expected = pd.DataFrame({'strings': pd.Categorical(values * repeats)})
        self._check_pandas_roundtrip(df, expected,
                                     null_counts=[2 * repeats])

    def test_timestamp(self):
        # Both naive and tz-aware timestamps round-trip.
        df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)})
        df['with_tz'] = (df.naive.dt.tz_localize('utc')
                         .dt.tz_convert('America/Los_Angeles'))
        self._check_pandas_roundtrip(df)

    def test_timestamp_with_nulls(self):
        df = pd.DataFrame({'test': [pd.datetime(2016, 1, 1),
                                    None,
                                    pd.datetime(2016, 1, 3)]})
        df['with_tz'] = df.test.dt.tz_localize('utc')

        self._check_pandas_roundtrip(df, null_counts=[1, 1])

    @pytest.mark.xfail(reason="not supported ATM",
                       raises=NotImplementedError)
    def test_timedelta_with_nulls(self):
        df = pd.DataFrame({'test': [pd.Timedelta('1 day'),
                                    None,
                                    pd.Timedelta('3 day')]})

        self._check_pandas_roundtrip(df, null_counts=[1, 1])

    def test_out_of_float64_timestamp_with_nulls(self):
        # Nanosecond timestamps beyond float64's exact integer range.
        df = pd.DataFrame(
            {'test': pd.DatetimeIndex([1451606400000000001,
                                       None, 14516064000030405])})
        df['with_tz'] = df.test.dt.tz_localize('utc')
        self._check_pandas_roundtrip(df, null_counts=[1, 1])

    def test_non_string_columns(self):
        # Non-string column labels are stringified on write.
        df = pd.DataFrame({0: [1, 2, 3, 4],
                           1: [True, False, True, False]})

        expected = df.rename(columns=str)
        self._check_pandas_roundtrip(df, expected)

    @pytest.mark.skipif(not os.path.supports_unicode_filenames,
                        reason='unicode filenames not supported')
    def test_unicode_filename(self):
        # GH #209
        name = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8')
        df = pd.DataFrame({'foo': [1, 2, 3, 4]})
        self._check_pandas_roundtrip(df, path=random_path(prefix=name))

    def test_read_columns(self):
        # Column selection on read returns only the requested columns.
        data = {'foo': [1, 2, 3, 4],
                'boo': [5, 6, 7, 8],
                'woo': [1, 3, 5, 7]}
        columns = list(data.keys())[1:3]
        df = pd.DataFrame(data)
        expected = pd.DataFrame({c: data[c] for c in columns})
        self._check_pandas_roundtrip(df, expected, columns=columns)

    def test_overwritten_file(self):
        """Writing a shorter frame over an existing file must not leave
        stale trailing data from the first write."""
        path = random_path()
        self.test_files.append(path)

        num_values = 100
        np.random.seed(0)

        values = np.random.randint(0, 10, size=num_values)
        write_feather(pd.DataFrame({'ints': values}), path)

        df = pd.DataFrame({'ints': values[0: num_values//2]})
        self._check_pandas_roundtrip(df, path=path)

    def test_filelike_objects(self):
        # Writing to / reading from an in-memory buffer works like a file.
        from io import BytesIO

        buf = BytesIO()

        # the copy makes it non-strided
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=['a', 'b', 'c']).copy()
        write_feather(df, buf)

        buf.seek(0)

        result = read_feather(buf)
        assert_frame_equal(result, df)

    def test_sparse_dataframe(self):
        # GH #221
        data = {'A': [0, 1, 2],
                'B': [1, 0, 1]}
        df = pd.DataFrame(data).to_sparse(fill_value=1)
        expected = df.to_dense()
        self._check_pandas_roundtrip(df, expected)

    def test_duplicate_columns(self):

        # https://github.com/wesm/feather/issues/53
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list('aaa')).copy()
        self._assert_error_on_write(df, ValueError)

    def test_unsupported(self):
        # https://github.com/wesm/feather/issues/240
        # serializing actual python objects

        # period
        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
        self._assert_error_on_write(df, ValueError)

        # non-strings
        df = pd.DataFrame({'a': ['a', 1, 2.0]})
        self._assert_error_on_write(df, TypeError)

    @pytest.mark.slow
    def test_large_dataframe(self):
        # 400M rows; exercises large-buffer handling (marked slow).
        df = pd.DataFrame({'A': np.arange(400000000)})
        self._check_pandas_roundtrip(df)
@pytest.mark.large_memory
def test_chunked_binary_error_message():
    # ARROW-3058: As Feather does not yet support chunked columns, we at least
    # make sure it's clear to the user what is going on

    # Build one binary column totalling 2^31 + 1 bytes: a single byte
    # plus 2048 chunks of 1 MiB each.
    big_values = [b'x' * (1 << 20) for _ in range(2 * (1 << 10))]
    df = pd.DataFrame({'byte_col': [b'x'] + big_values})

    expected_message = ("'byte_col' exceeds 2GB maximum "
                        "capacity of a Feather binary column. This restriction "
                        "may be lifted in the future")
    with pytest.raises(ValueError, match=expected_message):
        write_feather(df, io.BytesIO())