| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| from pathlib import Path |
| |
| import pyarrow as pa |
| import pytest |
| from datafusion import column |
| from datafusion.io import read_avro, read_csv, read_json, read_parquet |
| |
| from .utils import range_table |
| |
| |
| def test_read_json_global_ctx(ctx): |
| path = Path(__file__).parent.resolve() |
| |
| # Default |
    test_data_path = path / "data_test_context" / "data.json"
| df = read_json(test_data_path) |
| result = df.collect() |
| |
| assert result[0].column(0) == pa.array(["a", "b", "c"]) |
| assert result[0].column(1) == pa.array([1, 2, 3]) |
| |
| # Schema |
| schema = pa.schema( |
| [ |
| pa.field("A", pa.string(), nullable=True), |
| ] |
| ) |
| df = read_json(test_data_path, schema=schema) |
| result = df.collect() |
| |
| assert result[0].column(0) == pa.array(["a", "b", "c"]) |
| assert result[0].schema == schema |
| |
| # File extension |
    test_data_path = path / "data_test_context" / "data.json"
| df = read_json(test_data_path, file_extension=".json") |
| result = df.collect() |
| |
| assert result[0].column(0) == pa.array(["a", "b", "c"]) |
| assert result[0].column(1) == pa.array([1, 2, 3]) |
| |
| |
| def test_read_parquet_global(): |
| parquet_df = read_parquet(path="parquet/data/alltypes_plain.parquet") |
| parquet_df.show() |
| assert parquet_df is not None |
| |
    # read_parquet should also accept a pathlib.Path, not only a str
    path = Path.cwd() / "parquet/data/alltypes_plain.parquet"
| parquet_df = read_parquet(path=path) |
| assert parquet_df is not None |
| |
| |
| def test_read_csv(): |
| csv_df = read_csv(path="testing/data/csv/aggregate_test_100.csv") |
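    # show() executes the plan, so this doubles as a smoke test for the read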
| csv_df.select(column("c1")).show() |
| |
| |
| def test_read_csv_list(): |
| csv_df = read_csv(path=["testing/data/csv/aggregate_test_100.csv"]) |
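    # Reading the same file twice should yield exactly double the rows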
| expected = csv_df.count() * 2 |
| |
| double_csv_df = read_csv( |
| path=[ |
| "testing/data/csv/aggregate_test_100.csv", |
| "testing/data/csv/aggregate_test_100.csv", |
| ] |
| ) |
| actual = double_csv_df.count() |
| |
| double_csv_df.select(column("c1")).show() |
| assert actual == expected |
| |
| |
| def test_read_avro(): |
| avro_df = read_avro(path="testing/data/avro/alltypes_plain.avro") |
| avro_df.show() |
| assert avro_df is not None |
| |
| path = Path.cwd() / "testing/data/avro/alltypes_plain.avro" |
| avro_df = read_avro(path=path) |
| assert avro_df is not None |
| |
| |
| def test_arrow_c_stream_large_dataset(ctx): |
| """DataFrame streaming yields batches incrementally using Arrow APIs. |
| |
| This test constructs a DataFrame that would be far larger than available |
| memory if materialized. Use the public API |
| ``pa.RecordBatchReader.from_stream(df)`` (which is same as |
| ``pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())``) |
| to read record batches incrementally without collecting the full dataset, |
| so reading a handful of batches should not exhaust process memory. |
| """ |
    # Create a very large DataFrame using range; 2**40 rows of int64 would be
    # roughly 8 TiB if collected
| df = range_table(ctx, 0, 1 << 40) |
| |
| reader = pa.RecordBatchReader.from_stream(df) |
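    # from_stream() wires the reader to the DataFrame's __arrow_c_stream__
    # capsule; batches are produced lazily as the reader is consumed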
| |
    # Track RSS before consuming batches.
    # RSS is a practical measure of RAM usage visible to the OS. It excludes
    # memory that has been swapped out and gives a reasonably cross-platform
    # indicator (psutil normalizes the per-OS data sources).
| psutil = pytest.importorskip("psutil") |
| process = psutil.Process() |
| start_rss = process.memory_info().rss |
| |
| for _ in range(5): |
| batch = reader.read_next_batch() |
| assert batch is not None |
| assert len(batch) > 0 |
| current_rss = process.memory_info().rss |
| # Ensure memory usage hasn't grown substantially (>50MB) |
| assert current_rss - start_rss < 50 * 1024 * 1024 |
| |
| |
| def test_table_from_arrow_c_stream(ctx, fail_collect): |
| df = range_table(ctx, 0, 10) |
| |
    # pa.table(df) consumes the DataFrame through __arrow_c_stream__; the
    # fail_collect fixture guards that no eager DataFrame.collect() occurs
    table = pa.table(df)
| assert table.shape == (10, 1) |
| assert table.column_names == ["value"] |