| |
| <!DOCTYPE html> |
| |
| <html> |
| <head> |
| <!-- Core metadata: encoding, a single viewport declaration, and the page title. --> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>Tabular Datasets — Apache Arrow v4.0.1</title> |
| |
| <!-- Theme stylesheets. --> |
| <link href="../_static/css/theme.css" rel="stylesheet" /> |
| <link href="../_static/css/index.c5995385ac14fb8791e8eb36b4908be2.css" rel="stylesheet" /> |
| |
| <!-- Font Awesome icons; the two web fonts are preloaded (crossorigin is required for font preloads). --> |
| <link rel="stylesheet" |
| href="../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| <!-- Syntax-highlighting, base and project override stylesheets. --> |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/basic.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/theme_overrides.css" /> |
| |
| <link rel="preload" as="script" href="../_static/js/index.1c5a1a01449ed65a7b51.js"> |
| |
| <!-- Documentation scripts; load order is kept exactly as generated. --> |
| <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| |
| <!-- Canonical URL (declared once), favicon, and document relationship links. --> |
| <link rel="canonical" href="https://arrow.apache.org/docs/python/dataset.html" /> |
| <link rel="shortcut icon" href="../_static/favicon.ico"/> |
| <link rel="index" title="Index" href="../genindex.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="CUDA Integration" href="cuda.html" /> |
| <link rel="prev" title="Reading and Writing the Apache Parquet Format" href="parquet.html" /> |
| <meta name="docsearch:language" content="en" /> |
| |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <div class="container-fluid" id="banner"></div> |
| |
| |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| |
| <!-- Only show if we have sidebars configured, else just a small margin --> |
| <div class="col-12 col-md-3 bd-sidebar"> |
| <a class="navbar-brand" href="../index.html"> |
| <img src="../_static/arrow.png" class="logo" alt="logo"> |
| </a> |
| |
| <form class="bd-search d-flex align-items-center" action="../search.html" method="get"> |
| <i class="icon fas fa-search"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| <div class="bd-toc-item active"> |
| |
| <p class="caption"> |
| <span class="caption-text"> |
| Specifications and Protocols |
| </span> |
| </p> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../format/Versioning.html"> |
| Format Versioning and Stability |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../format/Columnar.html"> |
| Arrow Columnar Format |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../format/Flight.html"> |
| Arrow Flight RPC |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../format/Integration.html"> |
| Integration Testing |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../format/CDataInterface.html"> |
| The Arrow C data interface |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../format/CStreamInterface.html"> |
| The Arrow C stream interface |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../format/Other.html"> |
| Other Data Structures |
| </a> |
| </li> |
| </ul> |
| <p class="caption"> |
| <span class="caption-text"> |
| Libraries |
| </span> |
| </p> |
| <ul class="current nav bd-sidenav"> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../status.html"> |
| Implementation Status |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://arrow.apache.org/docs/c_glib/"> |
| C/GLib |
| </a> |
| </li> |
| <li class="toctree-l1 has-children"> |
| <a class="reference internal" href="../cpp/index.html"> |
| C++ |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/> |
| <label for="toctree-checkbox-1"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l2 has-children"> |
| <a class="reference internal" href="../cpp/getting_started.html"> |
| User Guide |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/> |
| <label for="toctree-checkbox-2"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/overview.html"> |
| High-Level Overview |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/conventions.html"> |
| Conventions |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/cmake.html"> |
| Using Arrow C++ in your own project |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/memory.html"> |
| Memory Management |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/arrays.html"> |
| Arrays |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/datatypes.html"> |
| Data Types |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/tables.html"> |
| Tabular Data |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/compute.html"> |
| Compute Functions |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/io.html"> |
| Input / output and filesystems |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/ipc.html"> |
| Reading and writing the Arrow IPC format |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/parquet.html"> |
| Reading and writing Parquet files |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/csv.html"> |
| Reading CSV files |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/json.html"> |
| Reading JSON files |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/dataset.html"> |
| Tabular Datasets |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/flight.html"> |
| Arrow Flight RPC |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l2 has-children"> |
| <a class="reference internal" href="../cpp/examples/index.html"> |
| Examples |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" type="checkbox"/> |
| <label for="toctree-checkbox-3"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/examples/cmake_minimal_build.html"> |
| Minimal build using CMake |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/examples/dataset_documentation_example.html"> |
| Arrow Datasets example |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/examples/row_columnar_conversion.html"> |
| Row to columnar conversion |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/examples/tuple_range_conversion.html"> |
| std::tuple-like ranges to Arrow |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l2 has-children"> |
| <a class="reference internal" href="../cpp/api.html"> |
| API Reference |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" type="checkbox"/> |
| <label for="toctree-checkbox-4"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/support.html"> |
| Programming Support |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/memory.html"> |
| Memory (management) |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/datatype.html"> |
| Data Types |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/array.html"> |
| Arrays |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/scalar.html"> |
| Scalars |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/builder.html"> |
| Array Builders |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/table.html"> |
| Two-dimensional Datasets |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/c_abi.html"> |
| C Interfaces |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/compute.html"> |
| Compute Functions |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/tensor.html"> |
| Tensors |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/utilities.html"> |
| Utilities |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/io.html"> |
| Input / output |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/ipc.html"> |
| Arrow IPC |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/formats.html"> |
| File Formats |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/cuda.html"> |
| CUDA support |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/flight.html"> |
| Arrow Flight RPC |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/filesystem.html"> |
| Filesystems |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="../cpp/api/dataset.html"> |
| Dataset |
| </a> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md"> |
| C# |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow"> |
| Go |
| </a> |
| </li> |
| <li class="toctree-l1 has-children"> |
| <a class="reference internal" href="../java/index.html"> |
| Java |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" type="checkbox"/> |
| <label for="toctree-checkbox-5"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../java/vector.html"> |
| ValueVector |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../java/vector_schema_root.html"> |
| VectorSchemaRoot |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../java/ipc.html"> |
| Reading/Writing IPC formats |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference external" href="https://arrow.apache.org/docs/java/reference/"> |
| Reference (javadoc) |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://arrow.apache.org/docs/js/"> |
| JavaScript |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://github.com/apache/arrow/blob/master/julia/Arrow/README.md"> |
| Julia |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md"> |
| MATLAB |
| </a> |
| </li> |
| <li class="toctree-l1 current active has-children"> |
| <a class="reference internal" href="index.html"> |
| Python |
| </a> |
| <input checked="" class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" type="checkbox"/> |
| <label for="toctree-checkbox-6"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul class="current"> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="install.html"> |
| Installing PyArrow |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="memory.html"> |
| Memory and IO Interfaces |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="data.html"> |
| Data Types and In-Memory Data Model |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="compute.html"> |
| Compute Functions |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="ipc.html"> |
| Streaming, Serialization, and IPC |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="filesystems.html"> |
| Filesystem Interface |
| </a> |
| </li> |
| <li class="toctree-l2 has-children"> |
| <a class="reference internal" href="filesystems_deprecated.html"> |
| Filesystem Interface (legacy) |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" type="checkbox"/> |
| <label for="toctree-checkbox-7"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.hdfs.connect.html"> |
| pyarrow.hdfs.connect |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.cat.html"> |
| pyarrow.HadoopFileSystem.cat |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.chmod.html"> |
| pyarrow.HadoopFileSystem.chmod |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.chown.html"> |
| pyarrow.HadoopFileSystem.chown |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.delete.html"> |
| pyarrow.HadoopFileSystem.delete |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.df.html"> |
| pyarrow.HadoopFileSystem.df |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.disk_usage.html"> |
| pyarrow.HadoopFileSystem.disk_usage |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.download.html"> |
| pyarrow.HadoopFileSystem.download |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.exists.html"> |
| pyarrow.HadoopFileSystem.exists |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.get_capacity.html"> |
| pyarrow.HadoopFileSystem.get_capacity |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.get_space_used.html"> |
| pyarrow.HadoopFileSystem.get_space_used |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.info.html"> |
| pyarrow.HadoopFileSystem.info |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.ls.html"> |
| pyarrow.HadoopFileSystem.ls |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.mkdir.html"> |
| pyarrow.HadoopFileSystem.mkdir |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.open.html"> |
| pyarrow.HadoopFileSystem.open |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.rename.html"> |
| pyarrow.HadoopFileSystem.rename |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.rm.html"> |
| pyarrow.HadoopFileSystem.rm |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HadoopFileSystem.upload.html"> |
| pyarrow.HadoopFileSystem.upload |
| </a> |
| </li> |
| <li class="toctree-l3"> |
| <a class="reference internal" href="generated/pyarrow.HdfsFile.html"> |
| pyarrow.HdfsFile |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="plasma.html"> |
| The Plasma In-Memory Object Store |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="numpy.html"> |
| NumPy Integration |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="pandas.html"> |
| Pandas Integration |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="timestamps.html"> |
| Timestamps |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="csv.html"> |
| Reading CSV files |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="feather.html"> |
| Feather File Format |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="json.html"> |
| Reading JSON files |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="parquet.html"> |
| Reading and Writing the Apache Parquet Format |
| </a> |
| </li> |
| <li class="toctree-l2 current active"> |
| <a class="current reference internal" href="#"> |
| Tabular Datasets |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="cuda.html"> |
| CUDA Integration |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="extending_types.html"> |
| Extending pyarrow |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="extending.html"> |
| Using pyarrow from C++ and Cython Code |
| </a> |
| </li> |
| <li class="toctree-l2 has-children"> |
| <a class="reference internal" href="api.html"> |
| API Reference |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" type="checkbox"/> |
| <label for="toctree-checkbox-8"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/datatypes.html"> |
| Data Types and Schemas |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-9" name="toctree-checkbox-9" type="checkbox"/> |
| <label for="toctree-checkbox-9"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.null.html"> |
| pyarrow.null |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.bool_.html"> |
| pyarrow.bool_ |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.int8.html"> |
| pyarrow.int8 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.int16.html"> |
| pyarrow.int16 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.int32.html"> |
| pyarrow.int32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.int64.html"> |
| pyarrow.int64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.uint8.html"> |
| pyarrow.uint8 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.uint16.html"> |
| pyarrow.uint16 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.uint32.html"> |
| pyarrow.uint32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.uint64.html"> |
| pyarrow.uint64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.float16.html"> |
| pyarrow.float16 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.float32.html"> |
| pyarrow.float32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.float64.html"> |
| pyarrow.float64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.time32.html"> |
| pyarrow.time32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.time64.html"> |
| pyarrow.time64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.timestamp.html"> |
| pyarrow.timestamp |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.date32.html"> |
| pyarrow.date32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.date64.html"> |
| pyarrow.date64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.binary.html"> |
| pyarrow.binary |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.string.html"> |
| pyarrow.string |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.utf8.html"> |
| pyarrow.utf8 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.large_binary.html"> |
| pyarrow.large_binary |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.large_string.html"> |
| pyarrow.large_string |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.large_utf8.html"> |
| pyarrow.large_utf8 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.decimal128.html"> |
| pyarrow.decimal128 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.list_.html"> |
| pyarrow.list_ |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.large_list.html"> |
| pyarrow.large_list |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.map_.html"> |
| pyarrow.map_ |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.struct.html"> |
| pyarrow.struct |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dictionary.html"> |
| pyarrow.dictionary |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.field.html"> |
| pyarrow.field |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.schema.html"> |
| pyarrow.schema |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.from_numpy_dtype.html"> |
| pyarrow.from_numpy_dtype |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.DataType.html"> |
| pyarrow.DataType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.DictionaryType.html"> |
| pyarrow.DictionaryType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ListType.html"> |
| pyarrow.ListType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.MapType.html"> |
| pyarrow.MapType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.StructType.html"> |
| pyarrow.StructType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UnionType.html"> |
| pyarrow.UnionType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.TimestampType.html"> |
| pyarrow.TimestampType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Time32Type.html"> |
| pyarrow.Time32Type |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Time64Type.html"> |
| pyarrow.Time64Type |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.FixedSizeBinaryType.html"> |
| pyarrow.FixedSizeBinaryType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Decimal128Type.html"> |
| pyarrow.Decimal128Type |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Field.html"> |
| pyarrow.Field |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Schema.html"> |
| pyarrow.Schema |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ExtensionType.html"> |
| pyarrow.ExtensionType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.PyExtensionType.html"> |
| pyarrow.PyExtensionType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.register_extension_type.html"> |
| pyarrow.register_extension_type |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.unregister_extension_type.html"> |
| pyarrow.unregister_extension_type |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_boolean.html"> |
| pyarrow.types.is_boolean |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_integer.html"> |
| pyarrow.types.is_integer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_signed_integer.html"> |
| pyarrow.types.is_signed_integer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_unsigned_integer.html"> |
| pyarrow.types.is_unsigned_integer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_int8.html"> |
| pyarrow.types.is_int8 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_int16.html"> |
| pyarrow.types.is_int16 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_int32.html"> |
| pyarrow.types.is_int32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_int64.html"> |
| pyarrow.types.is_int64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_uint8.html"> |
| pyarrow.types.is_uint8 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_uint16.html"> |
| pyarrow.types.is_uint16 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_uint32.html"> |
| pyarrow.types.is_uint32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_uint64.html"> |
| pyarrow.types.is_uint64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_floating.html"> |
| pyarrow.types.is_floating |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_float16.html"> |
| pyarrow.types.is_float16 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_float32.html"> |
| pyarrow.types.is_float32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_float64.html"> |
| pyarrow.types.is_float64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_decimal.html"> |
| pyarrow.types.is_decimal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_list.html"> |
| pyarrow.types.is_list |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_large_list.html"> |
| pyarrow.types.is_large_list |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_struct.html"> |
| pyarrow.types.is_struct |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_union.html"> |
| pyarrow.types.is_union |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_nested.html"> |
| pyarrow.types.is_nested |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_temporal.html"> |
| pyarrow.types.is_temporal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_timestamp.html"> |
| pyarrow.types.is_timestamp |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_date.html"> |
| pyarrow.types.is_date |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_date32.html"> |
| pyarrow.types.is_date32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_date64.html"> |
| pyarrow.types.is_date64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_time.html"> |
| pyarrow.types.is_time |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_time32.html"> |
| pyarrow.types.is_time32 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_time64.html"> |
| pyarrow.types.is_time64 |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_null.html"> |
| pyarrow.types.is_null |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_binary.html"> |
| pyarrow.types.is_binary |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_unicode.html"> |
| pyarrow.types.is_unicode |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_string.html"> |
| pyarrow.types.is_string |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_large_binary.html"> |
| pyarrow.types.is_large_binary |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_large_unicode.html"> |
| pyarrow.types.is_large_unicode |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_large_string.html"> |
| pyarrow.types.is_large_string |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_fixed_size_binary.html"> |
| pyarrow.types.is_fixed_size_binary |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_map.html"> |
| pyarrow.types.is_map |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.types.is_dictionary.html"> |
| pyarrow.types.is_dictionary |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/arrays.html"> |
| Arrays and Scalars |
| </a> |
| <!-- Toggle control for the "Arrays and Scalars" subtree: the input carries the accessible name; the chevron icon is decorative. --> |
| <input class="toctree-checkbox" id="toctree-checkbox-10" name="toctree-checkbox-10" type="checkbox" aria-label="Toggle Arrays and Scalars section"/> |
| <label for="toctree-checkbox-10"> |
| <i class="fas fa-chevron-down" aria-hidden="true"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.array.html"> |
| pyarrow.array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.nulls.html"> |
| pyarrow.nulls |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Array.html"> |
| pyarrow.Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.BooleanArray.html"> |
| pyarrow.BooleanArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.FloatingPointArray.html"> |
| pyarrow.FloatingPointArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.IntegerArray.html"> |
| pyarrow.IntegerArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int8Array.html"> |
| pyarrow.Int8Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int16Array.html"> |
| pyarrow.Int16Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int32Array.html"> |
| pyarrow.Int32Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int64Array.html"> |
| pyarrow.Int64Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.NullArray.html"> |
| pyarrow.NullArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.NumericArray.html"> |
| pyarrow.NumericArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt8Array.html"> |
| pyarrow.UInt8Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt16Array.html"> |
| pyarrow.UInt16Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt32Array.html"> |
| pyarrow.UInt32Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt64Array.html"> |
| pyarrow.UInt64Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.BinaryArray.html"> |
| pyarrow.BinaryArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.StringArray.html"> |
| pyarrow.StringArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.FixedSizeBinaryArray.html"> |
| pyarrow.FixedSizeBinaryArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.LargeBinaryArray.html"> |
| pyarrow.LargeBinaryArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.LargeStringArray.html"> |
| pyarrow.LargeStringArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Time32Array.html"> |
| pyarrow.Time32Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Time64Array.html"> |
| pyarrow.Time64Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Date32Array.html"> |
| pyarrow.Date32Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Date64Array.html"> |
| pyarrow.Date64Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.TimestampArray.html"> |
| pyarrow.TimestampArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Decimal128Array.html"> |
| pyarrow.Decimal128Array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.DictionaryArray.html"> |
| pyarrow.DictionaryArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ListArray.html"> |
| pyarrow.ListArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.LargeListArray.html"> |
| pyarrow.LargeListArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.StructArray.html"> |
| pyarrow.StructArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UnionArray.html"> |
| pyarrow.UnionArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ExtensionArray.html"> |
| pyarrow.ExtensionArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.scalar.html"> |
| pyarrow.scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.NA.html"> |
| pyarrow.NA |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Scalar.html"> |
| pyarrow.Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.BooleanScalar.html"> |
| pyarrow.BooleanScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int8Scalar.html"> |
| pyarrow.Int8Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int16Scalar.html"> |
| pyarrow.Int16Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int32Scalar.html"> |
| pyarrow.Int32Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Int64Scalar.html"> |
| pyarrow.Int64Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt8Scalar.html"> |
| pyarrow.UInt8Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt16Scalar.html"> |
| pyarrow.UInt16Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt32Scalar.html"> |
| pyarrow.UInt32Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UInt64Scalar.html"> |
| pyarrow.UInt64Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.FloatScalar.html"> |
| pyarrow.FloatScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.DoubleScalar.html"> |
| pyarrow.DoubleScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.BinaryScalar.html"> |
| pyarrow.BinaryScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.StringScalar.html"> |
| pyarrow.StringScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.FixedSizeBinaryScalar.html"> |
| pyarrow.FixedSizeBinaryScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.LargeBinaryScalar.html"> |
| pyarrow.LargeBinaryScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.LargeStringScalar.html"> |
| pyarrow.LargeStringScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Time32Scalar.html"> |
| pyarrow.Time32Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Time64Scalar.html"> |
| pyarrow.Time64Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Date32Scalar.html"> |
| pyarrow.Date32Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Date64Scalar.html"> |
| pyarrow.Date64Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.TimestampScalar.html"> |
| pyarrow.TimestampScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Decimal128Scalar.html"> |
| pyarrow.Decimal128Scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.DictionaryScalar.html"> |
| pyarrow.DictionaryScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ListScalar.html"> |
| pyarrow.ListScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.LargeListScalar.html"> |
| pyarrow.LargeListScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.StructScalar.html"> |
| pyarrow.StructScalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.UnionScalar.html"> |
| pyarrow.UnionScalar |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/memory.html"> |
| Buffers and Memory |
| </a> |
| <!-- Toggle control for the "Buffers and Memory" subtree: the input carries the accessible name; the chevron icon is decorative. --> |
| <input class="toctree-checkbox" id="toctree-checkbox-11" name="toctree-checkbox-11" type="checkbox" aria-label="Toggle Buffers and Memory section"/> |
| <label for="toctree-checkbox-11"> |
| <i class="fas fa-chevron-down" aria-hidden="true"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.allocate_buffer.html"> |
| pyarrow.allocate_buffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.py_buffer.html"> |
| pyarrow.py_buffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.foreign_buffer.html"> |
| pyarrow.foreign_buffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Buffer.html"> |
| pyarrow.Buffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ResizableBuffer.html"> |
| pyarrow.ResizableBuffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compress.html"> |
| pyarrow.compress |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.decompress.html"> |
| pyarrow.decompress |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.MemoryPool.html"> |
| pyarrow.MemoryPool |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.default_memory_pool.html"> |
| pyarrow.default_memory_pool |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.jemalloc_memory_pool.html"> |
| pyarrow.jemalloc_memory_pool |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.mimalloc_memory_pool.html"> |
| pyarrow.mimalloc_memory_pool |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.system_memory_pool.html"> |
| pyarrow.system_memory_pool |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.jemalloc_set_decay_ms.html"> |
| pyarrow.jemalloc_set_decay_ms |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.set_memory_pool.html"> |
| pyarrow.set_memory_pool |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.log_memory_allocations.html"> |
| pyarrow.log_memory_allocations |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.total_allocated_bytes.html"> |
| pyarrow.total_allocated_bytes |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/compute.html"> |
| Compute Functions |
| </a> |
| <!-- Toggle control for the "Compute Functions" subtree: the input carries the accessible name; the chevron icon is decorative. --> |
| <input class="toctree-checkbox" id="toctree-checkbox-12" name="toctree-checkbox-12" type="checkbox" aria-label="Toggle Compute Functions section"/> |
| <label for="toctree-checkbox-12"> |
| <i class="fas fa-chevron-down" aria-hidden="true"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.count.html"> |
| pyarrow.compute.count |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.mean.html"> |
| pyarrow.compute.mean |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.min_max.html"> |
| pyarrow.compute.min_max |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.mode.html"> |
| pyarrow.compute.mode |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.stddev.html"> |
| pyarrow.compute.stddev |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.sum.html"> |
| pyarrow.compute.sum |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.variance.html"> |
| pyarrow.compute.variance |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.add.html"> |
| pyarrow.compute.add |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.add_checked.html"> |
| pyarrow.compute.add_checked |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.divide.html"> |
| pyarrow.compute.divide |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.divide_checked.html"> |
| pyarrow.compute.divide_checked |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.multiply.html"> |
| pyarrow.compute.multiply |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.multiply_checked.html"> |
| pyarrow.compute.multiply_checked |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.subtract.html"> |
| pyarrow.compute.subtract |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.subtract_checked.html"> |
| pyarrow.compute.subtract_checked |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.power.html"> |
| pyarrow.compute.power |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.power_checked.html"> |
| pyarrow.compute.power_checked |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.equal.html"> |
| pyarrow.compute.equal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.greater.html"> |
| pyarrow.compute.greater |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.greater_equal.html"> |
| pyarrow.compute.greater_equal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.less.html"> |
| pyarrow.compute.less |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.less_equal.html"> |
| pyarrow.compute.less_equal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.not_equal.html"> |
| pyarrow.compute.not_equal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.and_.html"> |
| pyarrow.compute.and_ |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.and_kleene.html"> |
| pyarrow.compute.and_kleene |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.all.html"> |
| pyarrow.compute.all |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.any.html"> |
| pyarrow.compute.any |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.invert.html"> |
| pyarrow.compute.invert |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.or_.html"> |
| pyarrow.compute.or_ |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.or_kleene.html"> |
| pyarrow.compute.or_kleene |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.xor.html"> |
| pyarrow.compute.xor |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_alnum.html"> |
| pyarrow.compute.ascii_is_alnum |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_alpha.html"> |
| pyarrow.compute.ascii_is_alpha |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_decimal.html"> |
| pyarrow.compute.ascii_is_decimal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_lower.html"> |
| pyarrow.compute.ascii_is_lower |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_printable.html"> |
| pyarrow.compute.ascii_is_printable |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_space.html"> |
| pyarrow.compute.ascii_is_space |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_upper.html"> |
| pyarrow.compute.ascii_is_upper |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_alnum.html"> |
| pyarrow.compute.utf8_is_alnum |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_alpha.html"> |
| pyarrow.compute.utf8_is_alpha |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_decimal.html"> |
| pyarrow.compute.utf8_is_decimal |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_digit.html"> |
| pyarrow.compute.utf8_is_digit |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_lower.html"> |
| pyarrow.compute.utf8_is_lower |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_numeric.html"> |
| pyarrow.compute.utf8_is_numeric |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_printable.html"> |
| pyarrow.compute.utf8_is_printable |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_space.html"> |
| pyarrow.compute.utf8_is_space |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_upper.html"> |
| pyarrow.compute.utf8_is_upper |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_is_title.html"> |
| pyarrow.compute.ascii_is_title |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_is_title.html"> |
| pyarrow.compute.utf8_is_title |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.string_is_ascii.html"> |
| pyarrow.compute.string_is_ascii |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_lower.html"> |
| pyarrow.compute.ascii_lower |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.ascii_upper.html"> |
| pyarrow.compute.ascii_upper |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_lower.html"> |
| pyarrow.compute.utf8_lower |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.utf8_upper.html"> |
| pyarrow.compute.utf8_upper |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.index_in.html"> |
| pyarrow.compute.index_in |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.is_in.html"> |
| pyarrow.compute.is_in |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.match_substring.html"> |
| pyarrow.compute.match_substring |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.match_substring_regex.html"> |
| pyarrow.compute.match_substring_regex |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.cast.html"> |
| pyarrow.compute.cast |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.strptime.html"> |
| pyarrow.compute.strptime |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.filter.html"> |
| pyarrow.compute.filter |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.take.html"> |
| pyarrow.compute.take |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.dictionary_encode.html"> |
| pyarrow.compute.dictionary_encode |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.unique.html"> |
| pyarrow.compute.unique |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.value_counts.html"> |
| pyarrow.compute.value_counts |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.partition_nth_indices.html"> |
| pyarrow.compute.partition_nth_indices |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.sort_indices.html"> |
| pyarrow.compute.sort_indices |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.binary_length.html"> |
| pyarrow.compute.binary_length |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.fill_null.html"> |
| pyarrow.compute.fill_null |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.is_null.html"> |
| pyarrow.compute.is_null |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.is_valid.html"> |
| pyarrow.compute.is_valid |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.list_value_length.html"> |
| pyarrow.compute.list_value_length |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.list_flatten.html"> |
| pyarrow.compute.list_flatten |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.compute.list_parent_indices.html"> |
| pyarrow.compute.list_parent_indices |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/files.html"> |
| Streams and File Access |
| </a> |
| <!-- Toggle control for the "Streams and File Access" subtree: the input carries the accessible name; the chevron icon is decorative. --> |
| <input class="toctree-checkbox" id="toctree-checkbox-13" name="toctree-checkbox-13" type="checkbox" aria-label="Toggle Streams and File Access section"/> |
| <label for="toctree-checkbox-13"> |
| <i class="fas fa-chevron-down" aria-hidden="true"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.input_stream.html"> |
| pyarrow.input_stream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.output_stream.html"> |
| pyarrow.output_stream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.memory_map.html"> |
| pyarrow.memory_map |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.create_memory_map.html"> |
| pyarrow.create_memory_map |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.NativeFile.html"> |
| pyarrow.NativeFile |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.OSFile.html"> |
| pyarrow.OSFile |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.PythonFile.html"> |
| pyarrow.PythonFile |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.BufferReader.html"> |
| pyarrow.BufferReader |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.BufferOutputStream.html"> |
| pyarrow.BufferOutputStream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.FixedSizeBufferWriter.html"> |
| pyarrow.FixedSizeBufferWriter |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.MemoryMappedFile.html"> |
| pyarrow.MemoryMappedFile |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.CompressedInputStream.html"> |
| pyarrow.CompressedInputStream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.CompressedOutputStream.html"> |
| pyarrow.CompressedOutputStream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.hdfs.connect.html"> |
| pyarrow.hdfs.connect |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.LocalFileSystem.html"> |
| pyarrow.LocalFileSystem |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/tables.html"> |
| Tables and Tensors |
| </a> |
| <!-- Toggle control for the "Tables and Tensors" subtree: the input carries the accessible name; the chevron icon is decorative. --> |
| <input class="toctree-checkbox" id="toctree-checkbox-14" name="toctree-checkbox-14" type="checkbox" aria-label="Toggle Tables and Tensors section"/> |
| <label for="toctree-checkbox-14"> |
| <i class="fas fa-chevron-down" aria-hidden="true"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.chunked_array.html"> |
| pyarrow.chunked_array |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.concat_arrays.html"> |
| pyarrow.concat_arrays |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.concat_tables.html"> |
| pyarrow.concat_tables |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.record_batch.html"> |
| pyarrow.record_batch |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.table.html"> |
| pyarrow.table |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ChunkedArray.html"> |
| pyarrow.ChunkedArray |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.RecordBatch.html"> |
| pyarrow.RecordBatch |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Table.html"> |
| pyarrow.Table |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.Tensor.html"> |
| pyarrow.Tensor |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/ipc.html"> |
| Serialization and IPC |
| </a> |
| <!-- Toggle control for the "Serialization and IPC" subtree: the input carries the accessible name; the chevron icon is decorative. --> |
| <input class="toctree-checkbox" id="toctree-checkbox-15" name="toctree-checkbox-15" type="checkbox" aria-label="Toggle Serialization and IPC section"/> |
| <label for="toctree-checkbox-15"> |
| <i class="fas fa-chevron-down" aria-hidden="true"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.new_file.html"> |
| pyarrow.ipc.new_file |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.open_file.html"> |
| pyarrow.ipc.open_file |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.new_stream.html"> |
| pyarrow.ipc.new_stream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.open_stream.html"> |
| pyarrow.ipc.open_stream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.read_message.html"> |
| pyarrow.ipc.read_message |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.read_record_batch.html"> |
| pyarrow.ipc.read_record_batch |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.get_record_batch_size.html"> |
| pyarrow.ipc.get_record_batch_size |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.read_tensor.html"> |
| pyarrow.ipc.read_tensor |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.write_tensor.html"> |
| pyarrow.ipc.write_tensor |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.get_tensor_size.html"> |
| pyarrow.ipc.get_tensor_size |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.Message.html"> |
| pyarrow.ipc.Message |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.MessageReader.html"> |
| pyarrow.ipc.MessageReader |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.RecordBatchFileReader.html"> |
| pyarrow.ipc.RecordBatchFileReader |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.RecordBatchFileWriter.html"> |
| pyarrow.ipc.RecordBatchFileWriter |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.RecordBatchStreamReader.html"> |
| pyarrow.ipc.RecordBatchStreamReader |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.ipc.RecordBatchStreamWriter.html"> |
| pyarrow.ipc.RecordBatchStreamWriter |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.serialize.html"> |
| pyarrow.serialize |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.serialize_to.html"> |
| pyarrow.serialize_to |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.deserialize.html"> |
| pyarrow.deserialize |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.deserialize_components.html"> |
| pyarrow.deserialize_components |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.deserialize_from.html"> |
| pyarrow.deserialize_from |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.read_serialized.html"> |
| pyarrow.read_serialized |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.SerializedPyObject.html"> |
| pyarrow.SerializedPyObject |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.SerializationContext.html"> |
| pyarrow.SerializationContext |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/flight.html"> |
| Arrow Flight |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-16" name="toctree-checkbox-16" type="checkbox"/> |
| <label for="toctree-checkbox-16"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.Action.html"> |
| pyarrow.flight.Action |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.ActionType.html"> |
| pyarrow.flight.ActionType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.DescriptorType.html"> |
| pyarrow.flight.DescriptorType |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.FlightDescriptor.html"> |
| pyarrow.flight.FlightDescriptor |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.FlightEndpoint.html"> |
| pyarrow.flight.FlightEndpoint |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.FlightInfo.html"> |
| pyarrow.flight.FlightInfo |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.Location.html"> |
| pyarrow.flight.Location |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.Ticket.html"> |
| pyarrow.flight.Ticket |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.Result.html"> |
| pyarrow.flight.Result |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.FlightCallOptions.html"> |
| pyarrow.flight.FlightCallOptions |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.FlightClient.html"> |
| pyarrow.flight.FlightClient |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.ClientMiddlewareFactory.html"> |
| pyarrow.flight.ClientMiddlewareFactory |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.ClientMiddleware.html"> |
| pyarrow.flight.ClientMiddleware |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.FlightServerBase.html"> |
| pyarrow.flight.FlightServerBase |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.GeneratorStream.html"> |
| pyarrow.flight.GeneratorStream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.RecordBatchStream.html"> |
| pyarrow.flight.RecordBatchStream |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.ServerMiddlewareFactory.html"> |
| pyarrow.flight.ServerMiddlewareFactory |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.ServerMiddleware.html"> |
| pyarrow.flight.ServerMiddleware |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.ClientAuthHandler.html"> |
| pyarrow.flight.ClientAuthHandler |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.ServerAuthHandler.html"> |
| pyarrow.flight.ServerAuthHandler |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.FlightMethod.html"> |
| pyarrow.flight.FlightMethod |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.flight.CallInfo.html"> |
| pyarrow.flight.CallInfo |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/formats.html"> |
| Tabular File Formats |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-17" name="toctree-checkbox-17" type="checkbox"/> |
| <label for="toctree-checkbox-17"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.csv.ReadOptions.html"> |
| pyarrow.csv.ReadOptions |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.csv.ParseOptions.html"> |
| pyarrow.csv.ParseOptions |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.csv.ConvertOptions.html"> |
| pyarrow.csv.ConvertOptions |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.csv.read_csv.html"> |
| pyarrow.csv.read_csv |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.csv.open_csv.html"> |
| pyarrow.csv.open_csv |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.csv.CSVStreamingReader.html"> |
| pyarrow.csv.CSVStreamingReader |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.feather.read_feather.html"> |
| pyarrow.feather.read_feather |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.feather.read_table.html"> |
| pyarrow.feather.read_table |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.feather.write_feather.html"> |
| pyarrow.feather.write_feather |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.json.ReadOptions.html"> |
| pyarrow.json.ReadOptions |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.json.ParseOptions.html"> |
| pyarrow.json.ParseOptions |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.json.read_json.html"> |
| pyarrow.json.read_json |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.ParquetDataset.html"> |
| pyarrow.parquet.ParquetDataset |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.ParquetFile.html"> |
| pyarrow.parquet.ParquetFile |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.ParquetWriter.html"> |
| pyarrow.parquet.ParquetWriter |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.read_table.html"> |
| pyarrow.parquet.read_table |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.read_metadata.html"> |
| pyarrow.parquet.read_metadata |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.read_pandas.html"> |
| pyarrow.parquet.read_pandas |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.read_schema.html"> |
| pyarrow.parquet.read_schema |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.write_metadata.html"> |
| pyarrow.parquet.write_metadata |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.write_table.html"> |
| pyarrow.parquet.write_table |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.parquet.write_to_dataset.html"> |
| pyarrow.parquet.write_to_dataset |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.orc.ORCFile.html"> |
| pyarrow.orc.ORCFile |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/filesystems.html"> |
| Filesystems |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-18" name="toctree-checkbox-18" type="checkbox"/> |
| <label for="toctree-checkbox-18"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.FileInfo.html"> |
| pyarrow.fs.FileInfo |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.FileSelector.html"> |
| pyarrow.fs.FileSelector |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.FileSystem.html"> |
| pyarrow.fs.FileSystem |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.LocalFileSystem.html"> |
| pyarrow.fs.LocalFileSystem |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.S3FileSystem.html"> |
| pyarrow.fs.S3FileSystem |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html"> |
| pyarrow.fs.HadoopFileSystem |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.SubTreeFileSystem.html"> |
| pyarrow.fs.SubTreeFileSystem |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.PyFileSystem.html"> |
| pyarrow.fs.PyFileSystem |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.FileSystemHandler.html"> |
| pyarrow.fs.FileSystemHandler |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.fs.FSSpecHandler.html"> |
| pyarrow.fs.FSSpecHandler |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/dataset.html"> |
| Dataset |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-19" name="toctree-checkbox-19" type="checkbox"/> |
| <label for="toctree-checkbox-19"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.dataset.html"> |
| pyarrow.dataset.dataset |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.parquet_dataset.html"> |
| pyarrow.dataset.parquet_dataset |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.partitioning.html"> |
| pyarrow.dataset.partitioning |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.field.html"> |
| pyarrow.dataset.field |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.scalar.html"> |
| pyarrow.dataset.scalar |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.FileFormat.html"> |
| pyarrow.dataset.FileFormat |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.ParquetFileFormat.html"> |
| pyarrow.dataset.ParquetFileFormat |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.Partitioning.html"> |
| pyarrow.dataset.Partitioning |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.PartitioningFactory.html"> |
| pyarrow.dataset.PartitioningFactory |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.DirectoryPartitioning.html"> |
| pyarrow.dataset.DirectoryPartitioning |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.HivePartitioning.html"> |
| pyarrow.dataset.HivePartitioning |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html"> |
| pyarrow.dataset.Dataset |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.FileSystemDataset.html"> |
| pyarrow.dataset.FileSystemDataset |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.FileSystemFactoryOptions.html"> |
| pyarrow.dataset.FileSystemFactoryOptions |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.FileSystemDatasetFactory.html"> |
| pyarrow.dataset.FileSystemDatasetFactory |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.UnionDataset.html"> |
| pyarrow.dataset.UnionDataset |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.Scanner.html"> |
| pyarrow.dataset.Scanner |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.dataset.Expression.html"> |
| pyarrow.dataset.Expression |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/plasma.html"> |
| Plasma In-Memory Object Store |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-20" name="toctree-checkbox-20" type="checkbox"/> |
| <label for="toctree-checkbox-20"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.plasma.ObjectID.html"> |
| pyarrow.plasma.ObjectID |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.plasma.PlasmaClient.html"> |
| pyarrow.plasma.PlasmaClient |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.plasma.PlasmaBuffer.html"> |
| pyarrow.plasma.PlasmaBuffer |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/cuda.html"> |
| CUDA Integration |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-21" name="toctree-checkbox-21" type="checkbox"/> |
| <label for="toctree-checkbox-21"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.Context.html"> |
| pyarrow.cuda.Context |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.CudaBuffer.html"> |
| pyarrow.cuda.CudaBuffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.new_host_buffer.html"> |
| pyarrow.cuda.new_host_buffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.HostBuffer.html"> |
| pyarrow.cuda.HostBuffer |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.BufferReader.html"> |
| pyarrow.cuda.BufferReader |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.BufferWriter.html"> |
| pyarrow.cuda.BufferWriter |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.serialize_record_batch.html"> |
| pyarrow.cuda.serialize_record_batch |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.read_record_batch.html"> |
| pyarrow.cuda.read_record_batch |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.read_message.html"> |
| pyarrow.cuda.read_message |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cuda.IpcMemHandle.html"> |
| pyarrow.cuda.IpcMemHandle |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l3 has-children"> |
| <a class="reference internal" href="api/misc.html"> |
| Miscellaneous |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-22" name="toctree-checkbox-22" type="checkbox"/> |
| <label for="toctree-checkbox-22"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.cpu_count.html"> |
| pyarrow.cpu_count |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.set_cpu_count.html"> |
| pyarrow.set_cpu_count |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.get_include.html"> |
| pyarrow.get_include |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.get_libraries.html"> |
| pyarrow.get_libraries |
| </a> |
| </li> |
| <li class="toctree-l4"> |
| <a class="reference internal" href="generated/pyarrow.get_library_dirs.html"> |
| pyarrow.get_library_dirs |
| </a> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="getting_involved.html"> |
| Getting Involved |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="benchmarks.html"> |
| Benchmarks |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://arrow.apache.org/docs/r/"> |
| R |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md"> |
| Ruby |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference external" href="https://docs.rs/crate/arrow/"> |
| Rust |
| </a> |
| </li> |
| </ul> |
| <p class="caption"> |
| <span class="caption-text"> |
| Development |
| </span> |
| </p> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../developers/contributing.html"> |
| Contributing to Apache Arrow |
| </a> |
| </li> |
| <li class="toctree-l1 has-children"> |
| <a class="reference internal" href="../developers/cpp/index.html"> |
| C++ Development |
| </a> |
| <input class="toctree-checkbox" id="toctree-checkbox-23" name="toctree-checkbox-23" type="checkbox"/> |
| <label for="toctree-checkbox-23"> |
| <i class="fas fa-chevron-down"> |
| </i> |
| </label> |
| <ul> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../developers/cpp/building.html"> |
| Building Arrow C++ |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../developers/cpp/development.html"> |
| Development Guidelines |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../developers/cpp/windows.html"> |
| Developing on Windows |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../developers/cpp/conventions.html"> |
| Conventions |
| </a> |
| </li> |
| <li class="toctree-l2"> |
| <a class="reference internal" href="../developers/cpp/fuzzing.html"> |
| Fuzzing Arrow C++ |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../developers/python.html"> |
| Python Development |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../developers/archery.html"> |
| Daily Development using Archery |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../developers/crossbow.html"> |
| Packaging and Testing with Crossbow |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../developers/docker.html"> |
| Running Docker Builds |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../developers/benchmarks.html"> |
| Benchmarks |
| </a> |
| </li> |
| <li class="toctree-l1"> |
| <a class="reference internal" href="../developers/documentation.html"> |
| Building the Documentation |
| </a> |
| </li> |
| </ul> |
| |
| |
| </div> |
| </nav> |
| </div> |
| |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| |
| <div class="toc-item"> |
| |
| <div class="tocsection onthispage pt-5 pb-3"> |
| <i class="fas fa-list"></i> On this page |
| </div> |
| |
| <nav id="bd-toc-nav"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#reading-datasets"> |
| Reading Datasets |
| </a> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#dataset-discovery"> |
| Dataset discovery |
| </a> |
| </li> |
| <li class="toc-h3 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#reading-different-file-formats"> |
| Reading different file formats |
| </a> |
| </li> |
| <li class="toc-h3 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#customizing-file-formats"> |
| Customizing file formats |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#filtering-data"> |
| Filtering data |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#projecting-columns"> |
| Projecting columns |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#reading-partitioned-data"> |
| Reading partitioned data |
| </a> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#different-partitioning-schemes"> |
| Different partitioning schemes |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#reading-from-cloud-storage"> |
| Reading from cloud storage |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#reading-from-minio"> |
| Reading from Minio |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#working-with-parquet-datasets"> |
| Working with Parquet Datasets |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#manual-specification-of-the-dataset"> |
| Manual specification of the Dataset |
| </a> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"> |
| <a class="reference internal nav-link" href="#manual-scheduling"> |
| Manual scheduling |
| </a> |
| </li> |
| </ul> |
| |
| </nav> |
| </div> |
| |
| <div class="toc-item"> |
| |
| </div> |
| |
| |
| </div> |
| |
| |
| |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <div class="section" id="tabular-datasets"> |
| <span id="dataset"></span><h1>Tabular Datasets<a class="headerlink" href="#tabular-datasets" title="Permalink to this headline">¶</a></h1> |
| <div class="admonition warning"> |
| <p class="admonition-title">Warning</p> |
| <p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module is experimental (specifically the classes), |
| and a stable API is not yet guaranteed.</p> |
| </div> |
| <p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module provides functionality to efficiently work with |
| tabular, potentially larger than memory, and multi-file datasets. This includes:</p> |
| <ul class="simple"> |
| <li><p>A unified interface that supports different sources and file formats |
| (Parquet, Feather / Arrow IPC, and CSV files) and different file systems |
| (local, cloud).</p></li> |
| <li><p>Discovery of sources (crawling directories, handling directory-based partitioned |
| datasets, basic schema normalization, ...)</p></li> |
| <li><p>Optimized reading with predicate pushdown (filtering rows), projection |
| (selecting and deriving columns), and optionally parallel reading.</p></li> |
| </ul> |
| <p>Currently, only Parquet, Feather / Arrow IPC, and CSV files are supported. The |
| goal is to expand this in the future to other file formats and data sources |
| (e.g. database connections).</p> |
| <p>For those familiar with the existing <a class="reference internal" href="generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset" title="pyarrow.parquet.ParquetDataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.parquet.ParquetDataset</span></code></a> for |
| reading Parquet datasets: <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code>’s goal is similar but not specific |
| to the Parquet format and not tied to Python: the same datasets API is exposed |
| in the R bindings of Arrow. In addition, <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> boasts improved |
| performance and new features (e.g. filtering within files rather than only on |
| partition keys).</p> |
| <div class="section" id="reading-datasets"> |
| <h2>Reading Datasets<a class="headerlink" href="#reading-datasets" title="Permalink to this headline">¶</a></h2> |
| <p>For the examples below, let’s create a small dataset consisting |
| of a directory with two parquet files:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">tempfile</span> |
| |
| <span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pathlib</span> |
| |
| <span class="gp">In [3]: </span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span> |
| |
| <span class="gp">In [4]: </span><span class="kn">import</span> <span class="nn">pyarrow.parquet</span> <span class="kn">as</span> <span class="nn">pq</span> |
| |
| <span class="gp">In [5]: </span><span class="n">base</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="n">tempfile</span><span class="o">.</span><span class="n">gettempdir</span><span class="p">())</span> |
| |
| <span class="gp">In [6]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset"</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span> |
| |
| <span class="go"># creating an Arrow Table</span> |
| <span class="gp">In [7]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">'a'</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'b'</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'c'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span> |
| |
| <span class="go"># writing it into two parquet files</span> |
| <span class="gp">In [8]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset/data1.parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [9]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">10</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset/data2.parquet"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <div class="section" id="dataset-discovery"> |
| <h3>Dataset discovery<a class="headerlink" href="#dataset-discovery" title="Permalink to this headline">¶</a></h3> |
| <p>A <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object can be created with the <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function. We |
| can pass it the path to the directory containing the data files:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [10]: </span><span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="kn">as</span> <span class="nn">ds</span> |
| |
| <span class="gp">In [11]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset"</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [12]: </span><span class="n">dataset</span> |
| <span class="gh">Out[12]: </span><span class="go"><pyarrow._dataset.FileSystemDataset at 0x7f06a26cbdb0></span> |
| </pre></div> |
| </div> |
| <p>In addition to searching a base directory, <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> accepts a path to a |
| single file or a list of file paths.</p> |
| <p>Creating a <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object does not begin reading the data itself. If |
| needed, it only crawls the directory to find all the files:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [13]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span> |
| <span class="gh">Out[13]: </span><span class="go">['/tmp/parquet_dataset/data1.parquet', '/tmp/parquet_dataset/data2.parquet']</span> |
| </pre></div> |
| </div> |
| <p>… and infers the dataset’s schema (by default from the first file):</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [14]: </span><span class="k">print</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span><span class="n">show_field_metadata</span><span class="o">=</span><span class="bp">False</span><span class="p">))</span> |
| <span class="go">a: int64</span> |
| <span class="go">b: double</span> |
| <span class="go">c: int64</span> |
| </pre></div> |
| </div> |
| <p>Using the <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-meth docutils literal notranslate"><span class="pre">Dataset.to_table()</span></code></a> method we can read the dataset (or a portion |
| of it) into a pyarrow Table (note that depending on the size of your dataset |
| this can require a lot of memory, see below on filtering / iterative loading):</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [15]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span> |
| <span class="gh">Out[15]: </span><span class="go"></span> |
| <span class="go">pyarrow.Table</span> |
| <span class="go">a: int64</span> |
| <span class="go">b: double</span> |
| <span class="go">c: int64</span> |
| |
| <span class="go"># converting to pandas to see the contents of the scanned table</span> |
| <span class="gp">In [16]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[16]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 0 -0.561748 1</span> |
| <span class="go">1 1 0.275964 2</span> |
| <span class="go">2 2 0.992622 1</span> |
| <span class="go">3 3 -0.238940 2</span> |
| <span class="go">4 4 -0.473575 1</span> |
| <span class="go">5 5 1.865719 2</span> |
| <span class="go">6 6 0.237635 1</span> |
| <span class="go">7 7 0.578001 2</span> |
| <span class="go">8 8 0.776791 1</span> |
| <span class="go">9 9 -0.115339 2</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="reading-different-file-formats"> |
| <h3>Reading different file formats<a class="headerlink" href="#reading-different-file-formats" title="Permalink to this headline">¶</a></h3> |
| <p>The above examples use Parquet files as dataset source but the Dataset API |
| provides a consistent interface across multiple file formats and filesystems. |
| Currently, Parquet, Feather / Arrow IPC, and CSV file formats are supported; |
| more formats are planned in the future.</p> |
| <p>If we save the table as Feather files instead of Parquet files:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [17]: </span><span class="kn">import</span> <span class="nn">pyarrow.feather</span> <span class="kn">as</span> <span class="nn">feather</span> |
| |
| <span class="gp">In [18]: </span><span class="n">feather</span><span class="o">.</span><span class="n">write_feather</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"data.feather"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>…then we can read the Feather file using the same functions, but specifying |
| <code class="docutils literal notranslate"><span class="pre">format="feather"</span></code>:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [19]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"data.feather"</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">"feather"</span><span class="p">)</span> |
| |
| <span class="gp">In [20]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="gh">Out[20]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 0 -0.561748 1</span> |
| <span class="go">1 1 0.275964 2</span> |
| <span class="go">2 2 0.992622 1</span> |
| <span class="go">3 3 -0.238940 2</span> |
| <span class="go">4 4 -0.473575 1</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="customizing-file-formats"> |
| <h3>Customizing file formats<a class="headerlink" href="#customizing-file-formats" title="Permalink to this headline">¶</a></h3> |
| <p>The format name as a string, like:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>is shorthand for a default-constructed <a class="reference internal" href="generated/pyarrow.dataset.ParquetFileFormat.html#pyarrow.dataset.ParquetFileFormat" title="pyarrow.dataset.ParquetFileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">ParquetFileFormat</span></code></a>:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">())</span> |
| </pre></div> |
| </div> |
| <p>The <a class="reference internal" href="generated/pyarrow.dataset.FileFormat.html#pyarrow.dataset.FileFormat" title="pyarrow.dataset.FileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileFormat</span></code></a> objects can be customized using keywords. For example:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">parquet_format</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(</span><span class="n">read_options</span><span class="o">=</span><span class="p">{</span><span class="s1">'dictionary_columns'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'a'</span><span class="p">]})</span> |
| <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">parquet_format</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>This will configure column <code class="docutils literal notranslate"><span class="pre">"a"</span></code> to be dictionary-encoded on scan.</p> |
| </div> |
| </div> |
| <div class="section" id="filtering-data"> |
| <h2>Filtering data<a class="headerlink" href="#filtering-data" title="Permalink to this headline">¶</a></h2> |
| <p>To avoid reading all data when only needing a subset, the <code class="docutils literal notranslate"><span class="pre">columns</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">filter</span></code> keywords can be used.</p> |
| <p>The <code class="docutils literal notranslate"><span class="pre">columns</span></code> keyword can be used to only read the specified columns:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [21]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset"</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [22]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">'a'</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">])</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[22]: </span><span class="go"></span> |
| <span class="go"> a b</span> |
| <span class="go">0 0 -0.561748</span> |
| <span class="go">1 1 0.275964</span> |
| <span class="go">2 2 0.992622</span> |
| <span class="go">3 3 -0.238940</span> |
| <span class="go">4 4 -0.473575</span> |
| <span class="go">5 5 1.865719</span> |
| <span class="go">6 6 0.237635</span> |
| <span class="go">7 7 0.578001</span> |
| <span class="go">8 8 0.776791</span> |
| <span class="go">9 9 -0.115339</span> |
| </pre></div> |
| </div> |
| <p>With the <code class="docutils literal notranslate"><span class="pre">filter</span></code> keyword, rows which do not match the filter predicate will |
| not be included in the returned table. The keyword expects a boolean |
| <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> referencing at least one of the columns:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [23]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span> <span class="o">>=</span> <span class="mi">7</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[23]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 7 0.578001 2</span> |
| <span class="go">1 8 0.776791 1</span> |
| <span class="go">2 9 -0.115339 2</span> |
| |
| <span class="gp">In [24]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'c'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[24]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 1 0.275964 2</span> |
| <span class="go">1 3 -0.238940 2</span> |
| <span class="go">2 5 1.865719 2</span> |
| <span class="go">3 7 0.578001 2</span> |
| <span class="go">4 9 -0.115339 2</span> |
| </pre></div> |
| </div> |
| <p>The easiest way to construct those <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> objects is by using the |
| <a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> helper function. Any column - not just partition columns - can be |
| referenced using the <a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> function (which creates a |
| <code class="xref py py-class docutils literal notranslate"><span class="pre">FieldExpression</span></code>). Operator overloads are provided to compose filters |
| including the comparisons (equal, greater/less than, etc.), set membership |
| testing, and boolean combinations (<code class="docutils literal notranslate"><span class="pre">&</span></code>, <code class="docutils literal notranslate"><span class="pre">|</span></code>, <code class="docutils literal notranslate"><span class="pre">~</span></code>):</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [25]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">3</span> |
| <span class="gh">Out[25]: </span><span class="go"><pyarrow.dataset.Expression (a != 3)></span> |
| |
| <span class="gp">In [26]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span><span class="o">.</span><span class="n">isin</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">])</span> |
| <span class="gh">Out[26]: </span><span class="go"></span> |
| <span class="go"><pyarrow.dataset.Expression is_in(a, value_set=[</span> |
| <span class="go"> 1,</span> |
| <span class="go"> 2,</span> |
| <span class="go"> 3</span> |
| <span class="go">], skip_nulls)></span> |
| |
| <span class="gp">In [27]: </span><span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span> <span class="o">></span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'b'</span><span class="p">))</span> <span class="o">&</span> <span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'b'</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">)</span> |
| <span class="gh">Out[27]: </span><span class="go"><pyarrow.dataset.Expression ((a > b) and (b > 1))></span> |
| </pre></div> |
| </div> |
| <p>Note that <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> objects can <strong>not</strong> be combined by Python logical |
| operators <code class="docutils literal notranslate"><span class="pre">and</span></code>, <code class="docutils literal notranslate"><span class="pre">or</span></code> and <code class="docutils literal notranslate"><span class="pre">not</span></code>.</p> |
| </div> |
| <div class="section" id="projecting-columns"> |
| <h2>Projecting columns<a class="headerlink" href="#projecting-columns" title="Permalink to this headline">¶</a></h2> |
| <p>The <code class="docutils literal notranslate"><span class="pre">columns</span></code> keyword can be used to read a subset of the columns of the |
| dataset by passing it a list of column names. The keyword can also be used |
| for more complex projections in combination with expressions.</p> |
| <p>In this case, we pass it a dictionary with the keys being the resulting |
| column names and the values the expression that is used to construct the column |
| values:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [28]: </span><span class="n">projection</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="gp"> ....: </span> <span class="s2">"a_renamed"</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">"a"</span><span class="p">),</span> |
| <span class="gp"> ....: </span> <span class="s2">"b_as_float32"</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">"b"</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"float32"</span><span class="p">),</span> |
| <span class="gp"> ....: </span> <span class="s2">"c_1"</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">"c"</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="gp"> ....: </span><span class="p">}</span> |
| <span class="gp"> ....: </span> |
| |
| <span class="gp">In [29]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="n">projection</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="gh">Out[29]: </span><span class="go"></span> |
| <span class="go"> a_renamed b_as_float32 c_1</span> |
| <span class="go">0 0 -0.561748 True</span> |
| <span class="go">1 1 0.275964 False</span> |
| <span class="go">2 2 0.992622 True</span> |
| <span class="go">3 3 -0.238940 False</span> |
| <span class="go">4 4 -0.473575 True</span> |
| </pre></div> |
| </div> |
| <p>The dictionary also determines the column selection (only the keys in the |
| dictionary will be present as columns in the resulting table). If you want |
| to include a derived column in <em>addition</em> to the existing columns, you can |
| build up the dictionary from the dataset schema:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [30]: </span><span class="n">projection</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">names</span><span class="p">}</span> |
| |
| <span class="gp">In [31]: </span><span class="n">projection</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">"b_large"</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">"b"</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">})</span> |
| |
| <span class="gp">In [32]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="n">projection</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="gh">Out[32]: </span><span class="go"></span> |
| <span class="go"> a b c b_large</span> |
| <span class="go">0 0 -0.561748 1 False</span> |
| <span class="go">1 1 0.275964 2 False</span> |
| <span class="go">2 2 0.992622 1 False</span> |
| <span class="go">3 3 -0.238940 2 False</span> |
| <span class="go">4 4 -0.473575 1 False</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="reading-partitioned-data"> |
| <h2>Reading partitioned data<a class="headerlink" href="#reading-partitioned-data" title="Permalink to this headline">¶</a></h2> |
| <p>Above, a dataset consisting of a flat directory with files was shown. However, a |
| dataset can exploit a nested directory structure defining a partitioned dataset, |
| where the sub-directory names hold information about which subset of the data is |
| stored in that directory.</p> |
| <p>For example, a dataset partitioned by year and month may look like this on disk:</p> |
| <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>dataset_name/ |
| year=2007/ |
| month=01/ |
| data0.parquet |
| data1.parquet |
| ... |
| month=02/ |
| data0.parquet |
| data1.parquet |
| ... |
| month=03/ |
| ... |
| year=2008/ |
| month=01/ |
| ... |
| ... |
| </pre></div> |
| </div> |
| <p>The above partitioning scheme is using “/key=value/” directory names, as found |
| in Apache Hive.</p> |
| <p>Let’s create a small partitioned dataset. The <a class="reference internal" href="generated/pyarrow.parquet.write_to_dataset.html#pyarrow.parquet.write_to_dataset" title="pyarrow.parquet.write_to_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">write_to_dataset()</span></code></a> |
| function can write such hive-like partitioned datasets.</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [33]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">'a'</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'b'</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'c'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">,</span> |
| <span class="gp"> ....: </span> <span class="s1">'part'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'a'</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span> <span class="o">+</span> <span class="p">[</span><span class="s1">'b'</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span> |
| <span class="gp"> ....: </span> |
| |
| <span class="gp">In [34]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_to_dataset</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_partitioned"</span><span class="p">),</span> |
| <span class="gp"> ....: </span> <span class="n">partition_cols</span><span class="o">=</span><span class="p">[</span><span class="s1">'part'</span><span class="p">])</span> |
| <span class="gp"> ....: </span> |
| </pre></div> |
| </div> |
| <p>The above created a directory with two subdirectories (“part=a” and “part=b”), |
| and the Parquet files written in those directories no longer include the “part” |
| column.</p> |
| <p>Reading this dataset with <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a>, we now specify that the dataset |
| should use a hive-like partitioning scheme with the <code class="docutils literal notranslate"><span class="pre">partitioning</span></code> keyword:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [35]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_partitioned"</span><span class="p">),</span> <span class="n">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">,</span> |
| <span class="gp"> ....: </span> <span class="n">partitioning</span><span class="o">=</span><span class="s2">"hive"</span><span class="p">)</span> |
| <span class="gp"> ....: </span> |
| |
| <span class="gp">In [36]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span> |
| <span class="gh">Out[36]: </span><span class="go"></span> |
| <span class="go">['/tmp/parquet_dataset_partitioned/part=a/6778b16fc0284a1ab96768217808b72b.parquet',</span> |
| <span class="go"> '/tmp/parquet_dataset_partitioned/part=b/3cafd61610c84ba595ba0bdef7cd97a6.parquet']</span> |
| </pre></div> |
| </div> |
| <p>Although the partition fields are not included in the actual Parquet files, |
| they will be added back to the resulting table when scanning this dataset:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [37]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span> |
| <span class="gh">Out[37]: </span><span class="go"></span> |
| <span class="go"> a b c part</span> |
| <span class="go">0 0 1.227143 1 a</span> |
| <span class="go">1 1 -0.052853 2 a</span> |
| <span class="go">2 2 1.673034 1 a</span> |
| </pre></div> |
| </div> |
| <p>We can now filter on the partition keys, which avoids loading files |
| altogether if they do not match the filter:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [38]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">"part"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"b"</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[38]: </span><span class="go"></span> |
| <span class="go"> a b c part</span> |
| <span class="go">0 5 0.834894 2 b</span> |
| <span class="go">1 6 -0.446624 1 b</span> |
| <span class="go">2 7 -1.198733 2 b</span> |
| <span class="go">3 8 1.609412 1 b</span> |
| <span class="go">4 9 -0.775587 2 b</span> |
| </pre></div> |
| </div> |
| <div class="section" id="different-partitioning-schemes"> |
| <h3>Different partitioning schemes<a class="headerlink" href="#different-partitioning-schemes" title="Permalink to this headline">¶</a></h3> |
| <p>The above example uses a hive-like directory scheme, such as “/year=2009/month=11/day=15”. |
| We specified this by passing the <code class="docutils literal notranslate"><span class="pre">partitioning="hive"</span></code> keyword. In this case, |
| the types of the partition keys are inferred from the file paths.</p> |
| <p>It is also possible to explicitly define the schema of the partition keys |
| using the <a class="reference internal" href="generated/pyarrow.dataset.partitioning.html#pyarrow.dataset.partitioning" title="pyarrow.dataset.partitioning"><code class="xref py py-func docutils literal notranslate"><span class="pre">partitioning()</span></code></a> function. For example:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span> |
| <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">"year"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int16</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"month"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int8</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"day"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int32</span><span class="p">())]),</span> |
| <span class="n">flavor</span><span class="o">=</span><span class="s2">"hive"</span> |
| <span class="p">)</span> |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="n">part</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>“Directory partitioning” is also supported, where the segments in the file path |
| represent the values of the partition keys without including the name (the |
| field names are implicit in the segment’s index). For example, given field names |
| “year”, “month”, and “day”, one path might be “/2019/11/15”.</p> |
| <p>Since the names are not included in the file paths, these must be specified |
| when constructing a directory partitioning:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span><span class="n">field_names</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">,</span> <span class="s2">"day"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| <p>Directory partitioning also supports providing a full schema rather than inferring |
| types from file paths.</p> |
| </div> |
| </div> |
| <div class="section" id="reading-from-cloud-storage"> |
| <h2>Reading from cloud storage<a class="headerlink" href="#reading-from-cloud-storage" title="Permalink to this headline">¶</a></h2> |
| <p>In addition to local files, pyarrow also supports reading from cloud storage. |
| Currently, <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HDFS</span></code></a> and |
| <a class="reference internal" href="generated/pyarrow.fs.S3FileSystem.html#pyarrow.fs.S3FileSystem" title="pyarrow.fs.S3FileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">Amazon</span> <span class="pre">S3-compatible</span> <span class="pre">storage</span></code></a> are supported.</p> |
| <p>When passing a file URI, the file system will be inferred. For example, |
| specifying an S3 path:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">"s3://ursa-labs-taxi-data/"</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| <p>Typically, you will want to customize the connection parameters. To do so, |
| create a file system object and pass it to the <code class="docutils literal notranslate"><span class="pre">filesystem</span></code> keyword:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span> |
| |
| <span class="n">s3</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">region</span><span class="o">=</span><span class="s2">"us-east-2"</span><span class="p">)</span> |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">"ursa-labs-taxi-data/"</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">s3</span><span class="p">,</span> |
| <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| <p>The currently available classes are <a class="reference internal" href="generated/pyarrow.fs.S3FileSystem.html#pyarrow.fs.S3FileSystem" title="pyarrow.fs.S3FileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">S3FileSystem</span></code></a> and |
| <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HadoopFileSystem</span></code></a>. See the <a class="reference internal" href="filesystems.html#filesystem"><span class="std std-ref">Filesystem Interface</span></a> docs for more |
| details.</p> |
| </div> |
| <div class="section" id="reading-from-minio"> |
| <h2>Reading from MinIO<a class="headerlink" href="#reading-from-minio" title="Permalink to this headline">¶</a></h2> |
| <p>In addition to cloud storage, pyarrow also supports reading from a |
| <a class="reference external" href="https://github.com/minio/minio">MinIO</a> object storage instance emulating S3 |
| APIs. Paired with <a class="reference external" href="https://github.com/shopify/toxiproxy">toxiproxy</a>, this is |
| useful for testing or benchmarking.</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span> |
| |
| <span class="c1"># By default, MinIO will listen for unencrypted HTTP traffic.</span> |
| <span class="n">minio</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">scheme</span><span class="o">=</span><span class="s2">"http"</span><span class="p">,</span> <span class="n">endpoint</span><span class="o">=</span><span class="s2">"localhost:9000"</span><span class="p">)</span> |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">"ursa-labs-taxi-data/"</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">minio</span><span class="p">,</span> |
| <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="working-with-parquet-datasets"> |
| <h2>Working with Parquet Datasets<a class="headerlink" href="#working-with-parquet-datasets" title="Permalink to this headline">¶</a></h2> |
| <p>While the Datasets API provides a unified interface to different file formats, |
| some specific methods exist for Parquet Datasets.</p> |
| <p>Some processing frameworks such as Dask (optionally) use a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file |
| with partitioned datasets which includes information about the schema and the |
| row group metadata of the full dataset. Using such a file can make creating a |
| Parquet Dataset more efficient, since it does not need to infer the |
| schema and crawl the directories for all Parquet files (this is especially the |
| case for filesystems where accessing files is expensive). The |
| <a class="reference internal" href="generated/pyarrow.dataset.parquet_dataset.html#pyarrow.dataset.parquet_dataset" title="pyarrow.dataset.parquet_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">parquet_dataset()</span></code></a> function allows creating a Dataset from a partitioned |
| dataset with a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">parquet_dataset</span><span class="p">(</span><span class="s2">"/path/to/dir/_metadata"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>By default, the constructed <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object for Parquet datasets maps |
| each fragment to a single Parquet file. If you want fragments mapping to each |
| row group of a Parquet file, you can use the <code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> method of |
| the fragments:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">fragments</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">get_fragments</span><span class="p">())</span> |
| <span class="n">fragments</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">split_by_row_group</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| <p>This method returns a list of new Fragments mapping to each row group of |
| the original Fragment (Parquet file). Both <code class="docutils literal notranslate"><span class="pre">get_fragments()</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> accept an optional filter expression to get a |
| filtered list of fragments.</p> |
| </div> |
| <div class="section" id="manual-specification-of-the-dataset"> |
| <h2>Manual specification of the Dataset<a class="headerlink" href="#manual-specification-of-the-dataset" title="Permalink to this headline">¶</a></h2> |
| <p>The <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function allows easy creation of a Dataset viewing a directory, |
| crawling all subdirectories for files and partitioning information. However, |
| sometimes discovery is not required and the dataset’s files and partitions |
| are already known (for example, when this information is stored in metadata). |
| In this case it is possible to create a Dataset explicitly without any |
| automatic discovery or inference.</p> |
| <p>For the example here, we are going to use a dataset where the file names contain |
| additional partitioning information:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="go"># creating a dummy dataset: directory with two files</span> |
| <span class="gp">In [39]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">'col1'</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">),</span> <span class="s1">'col2'</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">3</span><span class="p">)})</span> |
| |
| <span class="gp">In [40]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span> |
| |
| <span class="gp">In [41]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span> <span class="o">/</span> <span class="s2">"data_2018.parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [42]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span> <span class="o">/</span> <span class="s2">"data_2019.parquet"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>To create a Dataset from a list of files, we need to specify the paths, schema, |
| format, filesystem, and partition expressions manually:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [43]: </span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span> |
| |
| <span class="gp">In [44]: </span><span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">"year"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"col1"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"col2"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">())])</span> |
| |
| <span class="gp">In [45]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">FileSystemDataset</span><span class="o">.</span><span class="n">from_paths</span><span class="p">(</span> |
| <span class="gp"> ....: </span> <span class="p">[</span><span class="s2">"data_2018.parquet"</span><span class="p">,</span> <span class="s2">"data_2019.parquet"</span><span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(),</span> |
| <span class="gp"> ....: </span> <span class="n">filesystem</span><span class="o">=</span><span class="n">fs</span><span class="o">.</span><span class="n">SubTreeFileSystem</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span><span class="p">),</span> <span class="n">fs</span><span class="o">.</span><span class="n">LocalFileSystem</span><span class="p">()),</span> |
| <span class="gp"> ....: </span> <span class="n">partitions</span><span class="o">=</span><span class="p">[</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'year'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2018</span><span class="p">,</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'year'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">])</span> |
| <span class="gp"> ....: </span> |
| </pre></div> |
| </div> |
| <p>Since we specified the “partition expressions” for our files, this information |
| is materialized as columns when reading the data and can be used for filtering:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [46]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[46]: </span><span class="go"></span> |
| <span class="go"> year col1 col2</span> |
| <span class="go">0 2018 0 0.495000</span> |
| <span class="go">1 2018 1 1.753592</span> |
| <span class="go">2 2018 2 -0.569114</span> |
| <span class="go">3 2019 0 0.495000</span> |
| <span class="go">4 2019 1 1.753592</span> |
| <span class="go">5 2019 2 -0.569114</span> |
| |
| <span class="gp">In [47]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'year'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[47]: </span><span class="go"></span> |
| <span class="go"> year col1 col2</span> |
| <span class="go">0 2019 0 0.495000</span> |
| <span class="go">1 2019 1 1.753592</span> |
| <span class="go">2 2019 2 -0.569114</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="manual-scheduling"> |
| <h2>Manual scheduling<a class="headerlink" href="#manual-scheduling" title="Permalink to this headline">¶</a></h2> |
| <p>The <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_table()</span></code></a> method loads all selected data into memory |
| at once resulting in a pyarrow Table. Alternatively, a dataset can also be |
| scanned one RecordBatch at a time in an iterative manner using the |
| <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scan" title="pyarrow.dataset.Dataset.scan"><code class="xref py py-func docutils literal notranslate"><span class="pre">scan()</span></code></a> method:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">scan_task</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">scan</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="o">...</span><span class="p">],</span> <span class="nb">filter</span><span class="o">=...</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">record_batch</span> <span class="ow">in</span> <span class="n">scan_task</span><span class="o">.</span><span class="n">execute</span><span class="p">():</span> |
| <span class="c1"># process the record batch</span> |
| </pre></div> |
| </div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| <a class='left-prev' id="prev-link" href="parquet.html" title="previous page">Reading and Writing the Apache Parquet Format</a> |
| <a class='right-next' id="next-link" href="cuda.html" title="next page">CUDA Integration</a> |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| <script src="../_static/js/index.1c5a1a01449ed65a7b51.js"></script> |
| |
| |
| <!-- Matomo --> |
| <script> |
| /* Reuse window._paq if it already exists; otherwise start an empty array that the pushes below append to. */ |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| /* We explicitly disable cookie tracking to avoid privacy issues */ |
| _paq.push(['disableCookies']); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| /* Queue the tracker endpoint (matomo.php under analytics.apache.org) and site id '20', then load matomo.js from the same base URL. */ |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '20']); |
| /* Create an async script element and insert it before the first script element already in the document. */ |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| |
| <div class="footer-item"> |
| <p class="copyright"> |
| © Copyright 2016-2019 Apache Software Foundation.<br/> |
| </p> |
| </div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 4.0.2.<br/> |
| </p> |
| </div> |
| |
| </div> |
| </footer> |
| <script type="text/javascript" src="/docs/_static/versionwarning.js"></script> </body> |
| </html> |