ARROW-12299: [Python] Recognize new filesystems in pq.write_to_dataset
This adds similar logic as we have in ParquetDataset: if a new-style filesystem is passed (which is not supported in the legacy implementation), automatically default to the new implementation.
Closes #9967 from jorisvandenbossche/ARROW-12299
Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 37bca4e..1b0a336 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -1845,7 +1845,7 @@
def write_to_dataset(table, root_path, partition_cols=None,
partition_filename_cb=None, filesystem=None,
- use_legacy_dataset=True, **kwargs):
+ use_legacy_dataset=None, **kwargs):
"""Wrapper around parquet.write_table for writing a Table to
Parquet format by partitions.
For each combination of partition columns and values,
@@ -1879,7 +1879,8 @@
A callback function that takes the partition key(s) as an argument
and allow you to override the partition filename. If nothing is
passed, the filename will consist of a uuid.
- use_legacy_dataset : bool, default True
+ use_legacy_dataset : bool
+ Default is True unless a ``pyarrow.fs`` filesystem is passed.
Set to False to enable the new code path (experimental, using the
new Arrow Dataset API). This is more efficient when using partition
columns, but does not (yet) support `partition_filename_cb` and
@@ -1891,6 +1892,14 @@
file metadata instances of dataset pieces. The file paths in the
ColumnChunkMetaData will be set relative to `root_path`.
"""
+ if use_legacy_dataset is None:
+ # if a new filesystem is passed -> default to new implementation
+ if isinstance(filesystem, FileSystem):
+ use_legacy_dataset = False
+ # otherwise the default is still True
+ else:
+ use_legacy_dataset = True
+
if not use_legacy_dataset:
import pyarrow.dataset as ds
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
index a0d417b..fb58fc4 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -1377,6 +1377,17 @@
assert sorted(expected_basenames) == sorted(output_basenames)
+@pytest.mark.pandas
+def test_write_to_dataset_filesystem(tempdir):
+ df = pd.DataFrame({'A': [1, 2, 3]})
+ table = pa.Table.from_pandas(df)
+ path = str(tempdir)
+
+ pq.write_to_dataset(table, path, filesystem=fs.LocalFileSystem())
+ result = pq.read_table(path)
+ assert result.equals(table)
+
+
# TODO(dataset) support pickling
def _make_dataset_for_pickling(tempdir, N=100):
path = tempdir / 'data.parquet'