| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import io |
| import json |
| |
| try: |
| import numpy as np |
| except ImportError: |
| np = None |
| import pytest |
| |
| import pyarrow as pa |
| from pyarrow.fs import LocalFileSystem, SubTreeFileSystem |
| from pyarrow.util import guid |
| from pyarrow.vendored.version import Version |
| |
| try: |
| import pyarrow.parquet as pq |
| from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, |
| _write_table) |
| except ImportError: |
| pq = None |
| |
| |
| try: |
| import pandas as pd |
| import pandas.testing as tm |
| |
| from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe, |
| alltypes_sample) |
| except ImportError: |
| pd = tm = None |
| |
| |
| # Marks all of the tests in this module |
| # Ignore these with pytest ... -m 'not parquet' |
| pytestmark = pytest.mark.parquet |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_parquet_custom_metadata(tempdir): |
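    # The pandas schema metadata (including the RangeIndex description)
    # should survive writing and be readable back via pq.read_metadata().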
| df = alltypes_sample(size=10000) |
| |
| filename = tempdir / 'pandas_roundtrip.parquet' |
| arrow_table = pa.Table.from_pandas(df) |
| assert b'pandas' in arrow_table.schema.metadata |
| |
| _write_table(arrow_table, filename) |
| |
| metadata = pq.read_metadata(filename).metadata |
| assert b'pandas' in metadata |
| |
| js = json.loads(metadata[b'pandas'].decode('utf8')) |
| assert js['index_columns'] == [{'kind': 'range', |
| 'name': None, |
| 'start': 0, 'stop': 10000, |
| 'step': 1}] |
| |
| |
| @pytest.mark.pandas |
| def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): |
| # ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch |
| schema = pa.schema([ |
| pa.field('int', pa.int16()), |
| pa.field('float', pa.float32()), |
| pa.field('string', pa.string()) |
| ]) |
| df1 = pd.DataFrame({ |
| 'int': np.arange(3, dtype=np.uint8), |
| 'float': np.arange(3, dtype=np.float32), |
| 'string': ['ABBA', 'EDDA', 'ACDC'] |
| }) |
| df2 = pd.DataFrame({ |
| 'int': [4, 5], |
| 'float': [1.1, None], |
| 'string': [None, None] |
| }) |
| table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False) |
| table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False) |
| |
| assert not table1.schema.equals(table2.schema, check_metadata=True) |
| assert table1.schema.equals(table2.schema) |
| |
    with pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema) as writer:
        writer.write_table(table1)
        writer.write_table(table2)
| |
| |
| @pytest.mark.pandas |
| def test_attributes_metadata_persistence(tempdir): |
| # GH-45382: Add support for pandas DataFrame.attrs |
    # When the Parquet file is written, DataFrame.attrs are serialized into
    # JSON along with the rest of the pandas metadata.
| |
| filename = tempdir / "metadata_persistence.parquet" |
| df = alltypes_sample(size=10000) |
| df.attrs = { |
| 'float16': 'half-precision', |
| 'float32': 'single precision', |
| 'float64': 'double precision', |
        'description': 'Attributes Persistence Test DataFrame',
| } |
| |
| table = pa.Table.from_pandas(df) |
| assert b'attributes' in table.schema.metadata[b'pandas'] |
| |
| _write_table(table, filename) |
| metadata = pq.read_metadata(filename).metadata |
| js = json.loads(metadata[b'pandas'].decode('utf8')) |
| assert 'attributes' in js |
| assert js['attributes'] == df.attrs |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_parquet_column_multiindex(tempdir): |
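    # MultiIndex columns are recorded in the pandas schema metadata and
    # should be reconstructed by pq.read_pandas().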
| df = alltypes_sample(size=10) |
| df.columns = pd.MultiIndex.from_tuples( |
| list(zip(df.columns, df.columns[::-1])), |
| names=['level_1', 'level_2'] |
| ) |
| |
| filename = tempdir / 'pandas_roundtrip.parquet' |
| arrow_table = pa.Table.from_pandas(df) |
| assert arrow_table.schema.pandas_metadata is not None |
| |
| _write_table(arrow_table, filename) |
| |
| table_read = pq.read_pandas(filename) |
| df_read = table_read.to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_parquet_2_roundtrip_read_pandas_no_index_written(tempdir): |
| df = alltypes_sample(size=10000) |
| |
| filename = tempdir / 'pandas_roundtrip.parquet' |
| arrow_table = pa.Table.from_pandas(df, preserve_index=False) |
| js = arrow_table.schema.pandas_metadata |
| assert not js['index_columns'] |
| # ARROW-2170 |
    # While index_columns should be empty, 'columns' still needs to be
    # populated.
| assert js['columns'] |
| |
| _write_table(arrow_table, filename) |
| table_read = pq.read_pandas(filename) |
| |
| js = table_read.schema.pandas_metadata |
| assert not js['index_columns'] |
| |
| read_metadata = table_read.schema.metadata |
| assert arrow_table.schema.metadata == read_metadata |
| |
| df_read = table_read.to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_parquet_native_file_roundtrip(): |
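    # Roundtrip through Arrow's in-memory native files
    # (BufferOutputStream / BufferReader) rather than a filesystem path.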
| df = _test_dataframe(10000) |
| arrow_table = pa.Table.from_pandas(df) |
| imos = pa.BufferOutputStream() |
| _write_table(arrow_table, imos, version='2.6') |
| buf = imos.getvalue() |
| reader = pa.BufferReader(buf) |
| df_read = _read_table(reader).to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| |
| @pytest.mark.pandas |
| def test_read_pandas_column_subset(): |
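    # Reading only a subset of columns with pq.read_pandas should still
    # apply the pandas metadata for the selected columns.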
| df = _test_dataframe(10000) |
| arrow_table = pa.Table.from_pandas(df) |
| imos = pa.BufferOutputStream() |
| _write_table(arrow_table, imos, version='2.6') |
| buf = imos.getvalue() |
| reader = pa.BufferReader(buf) |
| df_read = pq.read_pandas( |
| reader, columns=['strings', 'uint8'], |
| ).to_pandas() |
| tm.assert_frame_equal(df[['strings', 'uint8']], df_read) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_parquet_empty_roundtrip(): |
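    # A zero-row DataFrame should roundtrip with its schema intact.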
| df = _test_dataframe(0) |
| arrow_table = pa.Table.from_pandas(df) |
| imos = pa.BufferOutputStream() |
| _write_table(arrow_table, imos, version='2.6') |
| buf = imos.getvalue() |
| reader = pa.BufferReader(buf) |
| df_read = _read_table(reader).to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_can_write_nested_data(): |
| data = { |
| "agg_col": [ |
| {"page_type": 1}, |
| {"record_type": 1}, |
| {"non_consecutive_home": 0}, |
| ], |
| "uid_first": "1001" |
| } |
| df = pd.DataFrame(data=data) |
| arrow_table = pa.Table.from_pandas(df) |
| imos = pa.BufferOutputStream() |
    # Writing nested data succeeds with the default 2.x format version.
| _write_table(arrow_table, imos) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_parquet_pyfile_roundtrip(tempdir): |
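    # Write through a Python file object and read back from io.BytesIO.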
| filename = tempdir / 'pandas_pyfile_roundtrip.parquet' |
| size = 5 |
| df = pd.DataFrame({ |
| 'int64': np.arange(size, dtype=np.int64), |
| 'float32': np.arange(size, dtype=np.float32), |
| 'float64': np.arange(size, dtype=np.float64), |
| 'bool': np.random.randn(size) > 0, |
| 'strings': ['foo', 'bar', None, 'baz', 'qux'] |
| }) |
| |
| arrow_table = pa.Table.from_pandas(df) |
| |
| with filename.open('wb') as f: |
| _write_table(arrow_table, f, version="2.6") |
| |
| data = io.BytesIO(filename.read_bytes()) |
| |
| table_read = _read_table(data) |
| df_read = table_read.to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_parquet_configuration_options(tempdir): |
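    # Exercise the main writer options (dictionary encoding, statistics,
    # compression codecs) and check that each configuration roundtrips.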
| size = 10000 |
| np.random.seed(0) |
| df = pd.DataFrame({ |
| 'uint8': np.arange(size, dtype=np.uint8), |
| 'uint16': np.arange(size, dtype=np.uint16), |
| 'uint32': np.arange(size, dtype=np.uint32), |
| 'uint64': np.arange(size, dtype=np.uint64), |
        'int8': np.arange(size, dtype=np.int8),
| 'int16': np.arange(size, dtype=np.int16), |
| 'int32': np.arange(size, dtype=np.int32), |
| 'int64': np.arange(size, dtype=np.int64), |
| 'float32': np.arange(size, dtype=np.float32), |
| 'float64': np.arange(size, dtype=np.float64), |
| 'bool': np.random.randn(size) > 0 |
| }) |
| filename = tempdir / 'pandas_roundtrip.parquet' |
| arrow_table = pa.Table.from_pandas(df) |
| |
| for use_dictionary in [True, False]: |
| _write_table(arrow_table, filename, version='2.6', |
| use_dictionary=use_dictionary) |
| table_read = _read_table(filename) |
| df_read = table_read.to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| for write_statistics in [True, False]: |
| _write_table(arrow_table, filename, version='2.6', |
| write_statistics=write_statistics) |
| table_read = _read_table(filename) |
| df_read = table_read.to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']: |
| if (compression != 'NONE' and |
| not pa.lib.Codec.is_available(compression)): |
| continue |
| _write_table(arrow_table, filename, version='2.6', |
| compression=compression) |
| table_read = _read_table(filename) |
| df_read = table_read.to_pandas() |
| tm.assert_frame_equal(df, df_read) |
| |
| |
| @pytest.mark.pandas |
| def test_spark_flavor_preserves_pandas_metadata(): |
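    # flavor='spark' sanitizes the schema for Spark compatibility but should
    # still preserve the pandas index metadata on roundtrip.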
| df = _test_dataframe(size=100) |
| df.index = np.arange(0, 10 * len(df), 10) |
| df.index.name = 'foo' |
| |
| result = _roundtrip_pandas_dataframe(df, {'flavor': 'spark'}) |
| tm.assert_frame_equal(result, df) |
| |
| |
| @pytest.mark.pandas |
| def test_index_column_name_duplicate(tempdir): |
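    # 'time' is used both as a regular column and as the index
    # (set_index(..., drop=False)); both should roundtrip.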
| data = { |
| 'close': { |
| pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, |
| pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998, |
| }, |
| 'time': { |
| pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp( |
| '2017-06-30 01:31:00' |
| ), |
| pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp( |
| '2017-06-30 01:32:00' |
| ), |
| } |
| } |
| path = str(tempdir / 'data.parquet') |
| |
| # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units |
| # so we need to cast the pandas dtype. Pandas v1 will always silently |
| # coerce to [ns] due to lack of non-[ns] support. |
| dfx = pd.DataFrame(data, dtype='datetime64[us]').set_index('time', drop=False) |
| |
| tdfx = pa.Table.from_pandas(dfx) |
| _write_table(tdfx, path) |
| arrow_table = _read_table(path) |
| result_df = arrow_table.to_pandas() |
| tm.assert_frame_equal(result_df, dfx) |
| |
| |
| @pytest.mark.pandas |
| def test_multiindex_duplicate_values(tempdir): |
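    # A MultiIndex with duplicated values in one of its levels should
    # roundtrip.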
| num_rows = 3 |
| numbers = list(range(num_rows)) |
| index = pd.MultiIndex.from_arrays( |
| [['foo', 'foo', 'bar'], numbers], |
| names=['foobar', 'some_numbers'], |
| ) |
| |
| df = pd.DataFrame({'numbers': numbers}, index=index) |
| table = pa.Table.from_pandas(df) |
| |
| filename = tempdir / 'dup_multi_index_levels.parquet' |
| |
| _write_table(table, filename) |
| result_table = _read_table(filename) |
| assert table.equals(result_table) |
| |
| result_df = result_table.to_pandas() |
| tm.assert_frame_equal(result_df, df) |
| |
| |
| @pytest.mark.pandas |
| def test_backwards_compatible_index_naming(datadir): |
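    # Files written by pyarrow 0.7.1 should still read back with the
    # expected index handling.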
| expected_string = b"""\ |
| carat cut color clarity depth table price x y z |
| 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 |
| 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 |
| 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 |
| 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 |
| 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 |
| 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 |
| 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 |
| 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 |
| 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 |
| 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" |
| expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}', |
| index_col=None, header=0, engine='python') |
| table = _read_table(datadir / 'v0.7.1.parquet') |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, expected) |
| |
| |
| @pytest.mark.pandas |
| def test_backwards_compatible_index_multi_level_named(datadir): |
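    # Same as above, but the 0.7.1 file stores a fully named MultiIndex.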
| expected_string = b"""\ |
| carat cut color clarity depth table price x y z |
| 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 |
| 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 |
| 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 |
| 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 |
| 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 |
| 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 |
| 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 |
| 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 |
| 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 |
| 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" |
| expected = pd.read_csv( |
| io.BytesIO(expected_string), sep=r'\s{2,}', |
| index_col=['cut', 'color', 'clarity'], |
| header=0, engine='python' |
| ).sort_index() |
| |
| table = _read_table(datadir / 'v0.7.1.all-named-index.parquet') |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, expected) |
| |
| |
| @pytest.mark.pandas |
| def test_backwards_compatible_index_multi_level_some_named(datadir): |
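    # As above, but only some of the MultiIndex levels are named.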
| expected_string = b"""\ |
| carat cut color clarity depth table price x y z |
| 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 |
| 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 |
| 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 |
| 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 |
| 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 |
| 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 |
| 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 |
| 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 |
| 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 |
| 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" |
| expected = pd.read_csv( |
| io.BytesIO(expected_string), |
| sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], |
| header=0, engine='python' |
| ).sort_index() |
| expected.index = expected.index.set_names(['cut', None, 'clarity']) |
| |
| table = _read_table(datadir / 'v0.7.1.some-named-index.parquet') |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, expected) |
| |
| |
| @pytest.mark.pandas |
| def test_backwards_compatible_column_metadata_handling(datadir): |
    if Version(pd.__version__) >= Version("2.2.0"):
| # TODO: regression in pandas |
| # https://github.com/pandas-dev/pandas/issues/56775 |
| pytest.skip("Regression in pandas 2.2.0") |
| expected = pd.DataFrame( |
| {'a': [1, 2, 3], 'b': [.1, .2, .3], |
| 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) |
| expected.index = pd.MultiIndex.from_arrays( |
| [['a', 'b', 'c'], |
| pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')], |
| names=['index', None]) |
| |
| path = datadir / 'v0.7.1.column-metadata-handling.parquet' |
| table = _read_table(path) |
| result = table.to_pandas() |
| tm.assert_frame_equal(result, expected) |
| |
    table = _read_table(path, columns=['a'])
| result = table.to_pandas() |
| tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) |
| |
| |
| @pytest.mark.pandas |
| def test_categorical_index_survives_roundtrip(): |
| # ARROW-3652, addressed by ARROW-3246 |
| df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2']) |
| df['c1'] = df['c1'].astype('category') |
| df = df.set_index(['c1']) |
| |
| table = pa.Table.from_pandas(df) |
| bos = pa.BufferOutputStream() |
| pq.write_table(table, bos) |
| ref_df = pq.read_pandas(bos.getvalue()).to_pandas() |
| assert isinstance(ref_df.index, pd.CategoricalIndex) |
| assert ref_df.index.equals(df.index) |
| |
| |
| @pytest.mark.pandas |
| def test_categorical_order_survives_roundtrip(): |
| # ARROW-6302 |
| df = pd.DataFrame({"a": pd.Categorical( |
| ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)}) |
| |
| table = pa.Table.from_pandas(df) |
| bos = pa.BufferOutputStream() |
| pq.write_table(table, bos) |
| |
| contents = bos.getvalue() |
| result = pq.read_pandas(contents).to_pandas() |
| |
| tm.assert_frame_equal(result, df) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_categorical_na_type_row_groups(): |
| # ARROW-5085 |
| df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) |
| df_category = df.astype({"col": "category", "int": "category"}) |
| table = pa.Table.from_pandas(df) |
| table_cat = pa.Table.from_pandas(df_category) |
| buf = pa.BufferOutputStream() |
| |
    # Writing with many small row groups (chunk_size=10) should not raise.
| pq.write_table(table_cat, buf, version='2.6', chunk_size=10) |
| result = pq.read_table(buf.getvalue()) |
| |
| # Result is non-categorical |
| assert result[0].equals(table[0]) |
| assert result[1].equals(table[1]) |
| |
| |
| @pytest.mark.pandas |
| def test_pandas_categorical_roundtrip(): |
| # ARROW-5480, this was enabled by ARROW-3246 |
| |
| # Have one of the categories unobserved and include a null (-1) |
| codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32') |
| categories = ['foo', 'bar', 'baz'] |
| df = pd.DataFrame({'x': pd.Categorical.from_codes( |
| codes, categories=categories)}) |
| |
| buf = pa.BufferOutputStream() |
| pq.write_table(pa.table(df), buf) |
| |
| result = pq.read_table(buf.getvalue()).to_pandas() |
| assert result.x.dtype == 'category' |
| assert (result.x.cat.categories == categories).all() |
| tm.assert_frame_equal(result, df) |
| |
| |
| @pytest.mark.pandas |
| def test_categories_with_string_pyarrow_dtype(tempdir): |
| # gh-33727: writing to parquet should not fail |
| if Version(pd.__version__) < Version("1.3.0"): |
| pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0") |
| |
| df1 = pd.DataFrame({"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]") |
| df1 = df1.astype("category") |
| |
| df2 = pd.DataFrame({"x": ["foo", "bar", "foo"]}) |
| df2 = df2.astype("category") |
| |
| # categories should be converted to pa.Array |
| assert pa.array(df1["x"]).to_pylist() == pa.array(df2["x"]).to_pylist() |
| assert pa.array(df1["x"].cat.categories.values).to_pylist() == pa.array( |
| df2["x"].cat.categories.values).to_pylist() |
| |
| path = str(tempdir / 'cat.parquet') |
| pq.write_table(pa.table(df1), path) |
| result = pq.read_table(path).to_pandas() |
| |
| tm.assert_frame_equal(result, df2) |
| |
| |
| @pytest.mark.pandas |
| def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): |
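    # Nullable extension dtypes (here Int64) should be preserved whether the
    # table is written partitioned, unpartitioned, or as a single file.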
| df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]}) |
| df['col'] = df['col'].astype("Int64") |
| table = pa.table(df) |
| |
| pq.write_to_dataset( |
| table, str(tempdir / "case1"), partition_cols=['part'], |
| ) |
| result = pq.read_table(str(tempdir / "case1")).to_pandas() |
| tm.assert_frame_equal(result[["col"]], df[["col"]]) |
| |
| pq.write_to_dataset(table, str(tempdir / "case2")) |
| result = pq.read_table(str(tempdir / "case2")).to_pandas() |
| tm.assert_frame_equal(result[["col"]], df[["col"]]) |
| |
| pq.write_table(table, str(tempdir / "data.parquet")) |
| result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() |
| tm.assert_frame_equal(result[["col"]], df[["col"]]) |
| |
| |
| @pytest.mark.pandas |
| def test_write_to_dataset_pandas_preserve_index(tempdir): |
| # ARROW-8251 - preserve pandas index in roundtrip |
| |
| df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]}) |
| df.index = pd.Index(['a', 'b', 'c'], name="idx") |
| table = pa.table(df) |
| df_cat = df[["col", "part"]].copy() |
| df_cat["part"] = df_cat["part"].astype("category") |
| |
| pq.write_to_dataset( |
| table, str(tempdir / "case1"), partition_cols=['part'], |
| ) |
| result = pq.read_table(str(tempdir / "case1")).to_pandas() |
| tm.assert_frame_equal(result, df_cat) |
| |
| pq.write_to_dataset(table, str(tempdir / "case2")) |
| result = pq.read_table(str(tempdir / "case2")).to_pandas() |
| tm.assert_frame_equal(result, df) |
| |
| pq.write_table(table, str(tempdir / "data.parquet")) |
| result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() |
| tm.assert_frame_equal(result, df) |
| |
| |
| @pytest.mark.pandas |
| @pytest.mark.parametrize('preserve_index', [True, False, None]) |
| @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"]) |
| def test_dataset_read_pandas_common_metadata( |
| tempdir, preserve_index, metadata_fname |
| ): |
| # ARROW-1103 |
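    # The per-file schema metadata is stripped below, so the pandas index
    # information must come from the _metadata / _common_metadata sidecar.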
| nfiles = 5 |
| size = 5 |
| |
| dirpath = tempdir / guid() |
| dirpath.mkdir() |
| |
| test_data = [] |
| frames = [] |
| paths = [] |
| for i in range(nfiles): |
| df = _test_dataframe(size, seed=i) |
| df.index = pd.Index( |
| np.arange(i * size, (i + 1) * size, dtype="int64"), name='index' |
| ) |
| |
| path = dirpath / f'{i}.parquet' |
| |
| table = pa.Table.from_pandas(df, preserve_index=preserve_index) |
| |
| # Obliterate metadata |
| table = table.replace_schema_metadata(None) |
| assert table.schema.metadata is None |
| |
| _write_table(table, path) |
| test_data.append(table) |
| frames.append(df) |
| paths.append(path) |
| |
| # Write _metadata common file |
| table_for_metadata = pa.Table.from_pandas( |
| df, preserve_index=preserve_index |
| ) |
| pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname) |
| |
| dataset = pq.ParquetDataset(dirpath) |
| columns = ['uint8', 'strings'] |
| result = dataset.read_pandas(columns=columns).to_pandas() |
| expected = pd.concat([x[columns] for x in frames]) |
| expected.index.name = ( |
| df.index.name if preserve_index is not False else None) |
| tm.assert_frame_equal(result, expected) |
| |
| |
| @pytest.mark.pandas |
| def test_read_pandas_passthrough_keywords(tempdir): |
| # ARROW-11464 - previously not all keywords were passed through (such as |
| # the filesystem keyword) |
| df = pd.DataFrame({'a': [1, 2, 3]}) |
| |
| filename = tempdir / 'data.parquet' |
| _write_table(df, filename) |
| |
| result = pq.read_pandas( |
| 'data.parquet', |
| filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem()) |
| ) |
| assert result.equals(pa.table(df)) |
| |
| |
| @pytest.mark.pandas |
| def test_read_pandas_map_fields(tempdir): |
| # ARROW-10140 - table created from Pandas with mapping fields |
| df = pd.DataFrame({ |
| 'col1': pd.Series([ |
| [('id', 'something'), ('value2', 'else')], |
| [('id', 'something2'), ('value', 'else2')], |
| ]), |
| 'col2': pd.Series(['foo', 'bar']) |
| }) |
| |
| filename = tempdir / 'data.parquet' |
| |
| udt = pa.map_(pa.string(), pa.string()) |
| schema = pa.schema([pa.field('col1', udt), pa.field('col2', pa.string())]) |
| arrow_table = pa.Table.from_pandas(df, schema) |
| |
| _write_table(arrow_table, filename) |
| |
| result = pq.read_pandas(filename).to_pandas() |
| tm.assert_frame_equal(result, df) |