blob: 431b3817ec942898ee21012a5eb452487b39d094 [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Tabular Datasets &#8212; Apache Arrow v4.0.1</title>
<link href="../_static/css/theme.css" rel="stylesheet" />
<link href="../_static/css/index.c5995385ac14fb8791e8eb36b4908be2.css" rel="stylesheet" />
<link rel="stylesheet"
href="../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../_static/basic.css" />
<link rel="stylesheet" type="text/css" href="../_static/theme_overrides.css" />
<link rel="preload" as="script" href="../_static/js/index.1c5a1a01449ed65a7b51.js">
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<link rel="canonical" href="https://arrow.apache.org/docs/python/dataset.html" />
<link rel="shortcut icon" href="../_static/favicon.ico"/>
<link rel="canonical" href="https://arrow.apache.org/docs/python/dataset.html" />
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="CUDA Integration" href="cuda.html" />
<link rel="prev" title="Reading and Writing the Apache Parquet Format" href="parquet.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en" />
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<div class="container-fluid" id="banner"></div>
<div class="container-xl">
<div class="row">
<!-- Only show if we have sidebars configured, else just a small margin -->
<div class="col-12 col-md-3 bd-sidebar">
<a class="navbar-brand" href="../index.html">
<img src="../_static/arrow.png" class="logo" alt="logo">
</a>
<form class="bd-search d-flex align-items-center" action="../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<p class="caption">
<span class="caption-text">
Specifications and Protocols
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="../format/Versioning.html">
Format Versioning and Stability
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../format/Columnar.html">
Arrow Columnar Format
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../format/Flight.html">
Arrow Flight RPC
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../format/Integration.html">
Integration Testing
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../format/CDataInterface.html">
The Arrow C data interface
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../format/CStreamInterface.html">
The Arrow C stream interface
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../format/Other.html">
Other Data Structures
</a>
</li>
</ul>
<p class="caption">
<span class="caption-text">
Libraries
</span>
</p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="../status.html">
Implementation Status
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://arrow.apache.org/docs/c_glib/">
C/GLib
</a>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="../cpp/index.html">
C++
</a>
<input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/>
<label for="toctree-checkbox-1">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2 has-children">
<a class="reference internal" href="../cpp/getting_started.html">
User Guide
</a>
<input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/>
<label for="toctree-checkbox-2">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/overview.html">
High-Level Overview
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/conventions.html">
Conventions
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/cmake.html">
Using Arrow C++ in your own project
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/memory.html">
Memory Management
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/arrays.html">
Arrays
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/datatypes.html">
Data Types
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/tables.html">
Tabular Data
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/compute.html">
Compute Functions
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/io.html">
Input / output and filesystems
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/ipc.html">
Reading and writing the Arrow IPC format
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/parquet.html">
Reading and writing Parquet files
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/csv.html">
Reading CSV files
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/json.html">
Reading JSON files
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/dataset.html">
Tabular Datasets
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/flight.html">
Arrow Flight RPC
</a>
</li>
</ul>
</li>
<li class="toctree-l2 has-children">
<a class="reference internal" href="../cpp/examples/index.html">
Examples
</a>
<input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" type="checkbox"/>
<label for="toctree-checkbox-3">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/examples/cmake_minimal_build.html">
Minimal build using CMake
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/examples/dataset_documentation_example.html">
Arrow Datasets example
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/examples/row_columnar_conversion.html">
Row to columnar conversion
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/examples/tuple_range_conversion.html">
std::tuple-like ranges to Arrow
</a>
</li>
</ul>
</li>
<li class="toctree-l2 has-children">
<a class="reference internal" href="../cpp/api.html">
API Reference
</a>
<input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" type="checkbox"/>
<label for="toctree-checkbox-4">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/support.html">
Programming Support
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/memory.html">
Memory (management)
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/datatype.html">
Data Types
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/array.html">
Arrays
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/scalar.html">
Scalars
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/builder.html">
Array Builders
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/table.html">
Two-dimensional Datasets
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/c_abi.html">
C Interfaces
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/compute.html">
Compute Functions
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/tensor.html">
Tensors
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/utilities.html">
Utilities
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/io.html">
Input / output
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/ipc.html">
Arrow IPC
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/formats.html">
File Formats
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/cuda.html">
CUDA support
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/flight.html">
Arrow Flight RPC
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/filesystem.html">
Filesystems
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../cpp/api/dataset.html">
Dataset
</a>
</li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">
C#
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">
Go
</a>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="../java/index.html">
Java
</a>
<input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" type="checkbox"/>
<label for="toctree-checkbox-5">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2">
<a class="reference internal" href="../java/vector.html">
ValueVector
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../java/vector_schema_root.html">
VectorSchemaRoot
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../java/ipc.html">
Reading/Writing IPC formats
</a>
</li>
<li class="toctree-l2">
<a class="reference external" href="https://arrow.apache.org/docs/java/reference/">
Reference (javadoc)
</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://arrow.apache.org/docs/js/">
JavaScript
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/arrow/blob/master/julia/Arrow/README.md">
Julia
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">
MATLAB
</a>
</li>
<li class="toctree-l1 current active has-children">
<a class="reference internal" href="index.html">
Python
</a>
<input checked="" class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" type="checkbox"/>
<label for="toctree-checkbox-6">
<i class="fas fa-chevron-down">
</i>
</label>
<ul class="current">
<li class="toctree-l2">
<a class="reference internal" href="install.html">
Installing PyArrow
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="memory.html">
Memory and IO Interfaces
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="data.html">
Data Types and In-Memory Data Model
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="compute.html">
Compute Functions
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="ipc.html">
Streaming, Serialization, and IPC
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="filesystems.html">
Filesystem Interface
</a>
</li>
<li class="toctree-l2 has-children">
<a class="reference internal" href="filesystems_deprecated.html">
Filesystem Interface (legacy)
</a>
<input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" type="checkbox"/>
<label for="toctree-checkbox-7">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.hdfs.connect.html">
pyarrow.hdfs.connect
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.cat.html">
pyarrow.HadoopFileSystem.cat
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.chmod.html">
pyarrow.HadoopFileSystem.chmod
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.chown.html">
pyarrow.HadoopFileSystem.chown
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.delete.html">
pyarrow.HadoopFileSystem.delete
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.df.html">
pyarrow.HadoopFileSystem.df
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.disk_usage.html">
pyarrow.HadoopFileSystem.disk_usage
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.download.html">
pyarrow.HadoopFileSystem.download
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.exists.html">
pyarrow.HadoopFileSystem.exists
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.get_capacity.html">
pyarrow.HadoopFileSystem.get_capacity
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.get_space_used.html">
pyarrow.HadoopFileSystem.get_space_used
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.info.html">
pyarrow.HadoopFileSystem.info
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.ls.html">
pyarrow.HadoopFileSystem.ls
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.mkdir.html">
pyarrow.HadoopFileSystem.mkdir
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.open.html">
pyarrow.HadoopFileSystem.open
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.rename.html">
pyarrow.HadoopFileSystem.rename
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.rm.html">
pyarrow.HadoopFileSystem.rm
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HadoopFileSystem.upload.html">
pyarrow.HadoopFileSystem.upload
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="generated/pyarrow.HdfsFile.html">
pyarrow.HdfsFile
</a>
</li>
</ul>
</li>
<li class="toctree-l2">
<a class="reference internal" href="plasma.html">
The Plasma In-Memory Object Store
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="numpy.html">
NumPy Integration
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="pandas.html">
Pandas Integration
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="timestamps.html">
Timestamps
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="csv.html">
Reading CSV files
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="feather.html">
Feather File Format
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="json.html">
Reading JSON files
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="parquet.html">
Reading and Writing the Apache Parquet Format
</a>
</li>
<li class="toctree-l2 current active">
<a class="current reference internal" href="#">
Tabular Datasets
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="cuda.html">
CUDA Integration
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="extending_types.html">
Extending pyarrow
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="extending.html">
Using pyarrow from C++ and Cython Code
</a>
</li>
<li class="toctree-l2 has-children">
<a class="reference internal" href="api.html">
API Reference
</a>
<input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" type="checkbox"/>
<label for="toctree-checkbox-8">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/datatypes.html">
Data Types and Schemas
</a>
<input class="toctree-checkbox" id="toctree-checkbox-9" name="toctree-checkbox-9" type="checkbox"/>
<label for="toctree-checkbox-9">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.null.html">
pyarrow.null
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.bool_.html">
pyarrow.bool_
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.int8.html">
pyarrow.int8
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.int16.html">
pyarrow.int16
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.int32.html">
pyarrow.int32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.int64.html">
pyarrow.int64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.uint8.html">
pyarrow.uint8
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.uint16.html">
pyarrow.uint16
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.uint32.html">
pyarrow.uint32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.uint64.html">
pyarrow.uint64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.float16.html">
pyarrow.float16
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.float32.html">
pyarrow.float32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.float64.html">
pyarrow.float64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.time32.html">
pyarrow.time32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.time64.html">
pyarrow.time64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.timestamp.html">
pyarrow.timestamp
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.date32.html">
pyarrow.date32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.date64.html">
pyarrow.date64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.binary.html">
pyarrow.binary
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.string.html">
pyarrow.string
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.utf8.html">
pyarrow.utf8
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.large_binary.html">
pyarrow.large_binary
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.large_string.html">
pyarrow.large_string
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.large_utf8.html">
pyarrow.large_utf8
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.decimal128.html">
pyarrow.decimal128
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.list_.html">
pyarrow.list_
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.large_list.html">
pyarrow.large_list
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.map_.html">
pyarrow.map_
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.struct.html">
pyarrow.struct
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dictionary.html">
pyarrow.dictionary
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.field.html">
pyarrow.field
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.schema.html">
pyarrow.schema
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.from_numpy_dtype.html">
pyarrow.from_numpy_dtype
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.DataType.html">
pyarrow.DataType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.DictionaryType.html">
pyarrow.DictionaryType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ListType.html">
pyarrow.ListType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.MapType.html">
pyarrow.MapType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.StructType.html">
pyarrow.StructType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UnionType.html">
pyarrow.UnionType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.TimestampType.html">
pyarrow.TimestampType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Time32Type.html">
pyarrow.Time32Type
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Time64Type.html">
pyarrow.Time64Type
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.FixedSizeBinaryType.html">
pyarrow.FixedSizeBinaryType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Decimal128Type.html">
pyarrow.Decimal128Type
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Field.html">
pyarrow.Field
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Schema.html">
pyarrow.Schema
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ExtensionType.html">
pyarrow.ExtensionType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.PyExtensionType.html">
pyarrow.PyExtensionType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.register_extension_type.html">
pyarrow.register_extension_type
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.unregister_extension_type.html">
pyarrow.unregister_extension_type
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_boolean.html">
pyarrow.types.is_boolean
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_integer.html">
pyarrow.types.is_integer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_signed_integer.html">
pyarrow.types.is_signed_integer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_unsigned_integer.html">
pyarrow.types.is_unsigned_integer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_int8.html">
pyarrow.types.is_int8
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_int16.html">
pyarrow.types.is_int16
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_int32.html">
pyarrow.types.is_int32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_int64.html">
pyarrow.types.is_int64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_uint8.html">
pyarrow.types.is_uint8
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_uint16.html">
pyarrow.types.is_uint16
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_uint32.html">
pyarrow.types.is_uint32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_uint64.html">
pyarrow.types.is_uint64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_floating.html">
pyarrow.types.is_floating
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_float16.html">
pyarrow.types.is_float16
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_float32.html">
pyarrow.types.is_float32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_float64.html">
pyarrow.types.is_float64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_decimal.html">
pyarrow.types.is_decimal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_list.html">
pyarrow.types.is_list
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_large_list.html">
pyarrow.types.is_large_list
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_struct.html">
pyarrow.types.is_struct
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_union.html">
pyarrow.types.is_union
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_nested.html">
pyarrow.types.is_nested
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_temporal.html">
pyarrow.types.is_temporal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_timestamp.html">
pyarrow.types.is_timestamp
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_date.html">
pyarrow.types.is_date
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_date32.html">
pyarrow.types.is_date32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_date64.html">
pyarrow.types.is_date64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_time.html">
pyarrow.types.is_time
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_time32.html">
pyarrow.types.is_time32
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_time64.html">
pyarrow.types.is_time64
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_null.html">
pyarrow.types.is_null
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_binary.html">
pyarrow.types.is_binary
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_unicode.html">
pyarrow.types.is_unicode
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_string.html">
pyarrow.types.is_string
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_large_binary.html">
pyarrow.types.is_large_binary
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_large_unicode.html">
pyarrow.types.is_large_unicode
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_large_string.html">
pyarrow.types.is_large_string
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_fixed_size_binary.html">
pyarrow.types.is_fixed_size_binary
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_map.html">
pyarrow.types.is_map
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.types.is_dictionary.html">
pyarrow.types.is_dictionary
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/arrays.html">
Arrays and Scalars
</a>
<input class="toctree-checkbox" id="toctree-checkbox-10" name="toctree-checkbox-10" type="checkbox"/>
<label for="toctree-checkbox-10">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.array.html">
pyarrow.array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.nulls.html">
pyarrow.nulls
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Array.html">
pyarrow.Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.BooleanArray.html">
pyarrow.BooleanArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.FloatingPointArray.html">
pyarrow.FloatingPointArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.IntegerArray.html">
pyarrow.IntegerArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int8Array.html">
pyarrow.Int8Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int16Array.html">
pyarrow.Int16Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int32Array.html">
pyarrow.Int32Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int64Array.html">
pyarrow.Int64Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.NullArray.html">
pyarrow.NullArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.NumericArray.html">
pyarrow.NumericArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt8Array.html">
pyarrow.UInt8Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt16Array.html">
pyarrow.UInt16Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt32Array.html">
pyarrow.UInt32Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt64Array.html">
pyarrow.UInt64Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.BinaryArray.html">
pyarrow.BinaryArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.StringArray.html">
pyarrow.StringArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.FixedSizeBinaryArray.html">
pyarrow.FixedSizeBinaryArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.LargeBinaryArray.html">
pyarrow.LargeBinaryArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.LargeStringArray.html">
pyarrow.LargeStringArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Time32Array.html">
pyarrow.Time32Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Time64Array.html">
pyarrow.Time64Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Date32Array.html">
pyarrow.Date32Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Date64Array.html">
pyarrow.Date64Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.TimestampArray.html">
pyarrow.TimestampArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Decimal128Array.html">
pyarrow.Decimal128Array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.DictionaryArray.html">
pyarrow.DictionaryArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ListArray.html">
pyarrow.ListArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.LargeListArray.html">
pyarrow.LargeListArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.StructArray.html">
pyarrow.StructArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UnionArray.html">
pyarrow.UnionArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ExtensionArray.html">
pyarrow.ExtensionArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.scalar.html">
pyarrow.scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.NA.html">
pyarrow.NA
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Scalar.html">
pyarrow.Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.BooleanScalar.html">
pyarrow.BooleanScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int8Scalar.html">
pyarrow.Int8Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int16Scalar.html">
pyarrow.Int16Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int32Scalar.html">
pyarrow.Int32Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Int64Scalar.html">
pyarrow.Int64Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt8Scalar.html">
pyarrow.UInt8Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt16Scalar.html">
pyarrow.UInt16Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt32Scalar.html">
pyarrow.UInt32Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UInt64Scalar.html">
pyarrow.UInt64Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.FloatScalar.html">
pyarrow.FloatScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.DoubleScalar.html">
pyarrow.DoubleScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.BinaryScalar.html">
pyarrow.BinaryScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.StringScalar.html">
pyarrow.StringScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.FixedSizeBinaryScalar.html">
pyarrow.FixedSizeBinaryScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.LargeBinaryScalar.html">
pyarrow.LargeBinaryScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.LargeStringScalar.html">
pyarrow.LargeStringScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Time32Scalar.html">
pyarrow.Time32Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Time64Scalar.html">
pyarrow.Time64Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Date32Scalar.html">
pyarrow.Date32Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Date64Scalar.html">
pyarrow.Date64Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.TimestampScalar.html">
pyarrow.TimestampScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Decimal128Scalar.html">
pyarrow.Decimal128Scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.DictionaryScalar.html">
pyarrow.DictionaryScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ListScalar.html">
pyarrow.ListScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.LargeListScalar.html">
pyarrow.LargeListScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.StructScalar.html">
pyarrow.StructScalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.UnionScalar.html">
pyarrow.UnionScalar
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/memory.html">
Buffers and Memory
</a>
<input class="toctree-checkbox" id="toctree-checkbox-11" name="toctree-checkbox-11" type="checkbox"/>
<label for="toctree-checkbox-11">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.allocate_buffer.html">
pyarrow.allocate_buffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.py_buffer.html">
pyarrow.py_buffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.foreign_buffer.html">
pyarrow.foreign_buffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Buffer.html">
pyarrow.Buffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ResizableBuffer.html">
pyarrow.ResizableBuffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compress.html">
pyarrow.compress
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.decompress.html">
pyarrow.decompress
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.MemoryPool.html">
pyarrow.MemoryPool
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.default_memory_pool.html">
pyarrow.default_memory_pool
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.jemalloc_memory_pool.html">
pyarrow.jemalloc_memory_pool
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.mimalloc_memory_pool.html">
pyarrow.mimalloc_memory_pool
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.system_memory_pool.html">
pyarrow.system_memory_pool
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.jemalloc_set_decay_ms.html">
pyarrow.jemalloc_set_decay_ms
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.set_memory_pool.html">
pyarrow.set_memory_pool
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.log_memory_allocations.html">
pyarrow.log_memory_allocations
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.total_allocated_bytes.html">
pyarrow.total_allocated_bytes
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/compute.html">
Compute Functions
</a>
<input class="toctree-checkbox" id="toctree-checkbox-12" name="toctree-checkbox-12" type="checkbox"/>
<label for="toctree-checkbox-12">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.count.html">
pyarrow.compute.count
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.mean.html">
pyarrow.compute.mean
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.min_max.html">
pyarrow.compute.min_max
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.mode.html">
pyarrow.compute.mode
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.stddev.html">
pyarrow.compute.stddev
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.sum.html">
pyarrow.compute.sum
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.variance.html">
pyarrow.compute.variance
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.add.html">
pyarrow.compute.add
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.add_checked.html">
pyarrow.compute.add_checked
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.divide.html">
pyarrow.compute.divide
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.divide_checked.html">
pyarrow.compute.divide_checked
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.multiply.html">
pyarrow.compute.multiply
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.multiply_checked.html">
pyarrow.compute.multiply_checked
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.subtract.html">
pyarrow.compute.subtract
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.subtract_checked.html">
pyarrow.compute.subtract_checked
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.power.html">
pyarrow.compute.power
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.power_checked.html">
pyarrow.compute.power_checked
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.equal.html">
pyarrow.compute.equal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.greater.html">
pyarrow.compute.greater
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.greater_equal.html">
pyarrow.compute.greater_equal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.less.html">
pyarrow.compute.less
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.less_equal.html">
pyarrow.compute.less_equal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.not_equal.html">
pyarrow.compute.not_equal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.and_.html">
pyarrow.compute.and_
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.and_kleene.html">
pyarrow.compute.and_kleene
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.all.html">
pyarrow.compute.all
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.any.html">
pyarrow.compute.any
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.invert.html">
pyarrow.compute.invert
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.or_.html">
pyarrow.compute.or_
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.or_kleene.html">
pyarrow.compute.or_kleene
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.xor.html">
pyarrow.compute.xor
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_alnum.html">
pyarrow.compute.ascii_is_alnum
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_alpha.html">
pyarrow.compute.ascii_is_alpha
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_decimal.html">
pyarrow.compute.ascii_is_decimal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_lower.html">
pyarrow.compute.ascii_is_lower
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_printable.html">
pyarrow.compute.ascii_is_printable
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_space.html">
pyarrow.compute.ascii_is_space
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_upper.html">
pyarrow.compute.ascii_is_upper
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_alnum.html">
pyarrow.compute.utf8_is_alnum
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_alpha.html">
pyarrow.compute.utf8_is_alpha
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_decimal.html">
pyarrow.compute.utf8_is_decimal
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_digit.html">
pyarrow.compute.utf8_is_digit
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_lower.html">
pyarrow.compute.utf8_is_lower
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_numeric.html">
pyarrow.compute.utf8_is_numeric
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_printable.html">
pyarrow.compute.utf8_is_printable
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_space.html">
pyarrow.compute.utf8_is_space
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_upper.html">
pyarrow.compute.utf8_is_upper
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_is_title.html">
pyarrow.compute.ascii_is_title
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_is_title.html">
pyarrow.compute.utf8_is_title
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.string_is_ascii.html">
pyarrow.compute.string_is_ascii
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_lower.html">
pyarrow.compute.ascii_lower
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.ascii_upper.html">
pyarrow.compute.ascii_upper
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_lower.html">
pyarrow.compute.utf8_lower
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.utf8_upper.html">
pyarrow.compute.utf8_upper
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.index_in.html">
pyarrow.compute.index_in
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.is_in.html">
pyarrow.compute.is_in
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.match_substring.html">
pyarrow.compute.match_substring
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.match_substring_regex.html">
pyarrow.compute.match_substring_regex
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.cast.html">
pyarrow.compute.cast
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.strptime.html">
pyarrow.compute.strptime
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.filter.html">
pyarrow.compute.filter
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.take.html">
pyarrow.compute.take
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.dictionary_encode.html">
pyarrow.compute.dictionary_encode
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.unique.html">
pyarrow.compute.unique
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.value_counts.html">
pyarrow.compute.value_counts
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.partition_nth_indices.html">
pyarrow.compute.partition_nth_indices
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.sort_indices.html">
pyarrow.compute.sort_indices
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.binary_length.html">
pyarrow.compute.binary_length
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.fill_null.html">
pyarrow.compute.fill_null
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.is_null.html">
pyarrow.compute.is_null
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.is_valid.html">
pyarrow.compute.is_valid
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.list_value_length.html">
pyarrow.compute.list_value_length
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.list_flatten.html">
pyarrow.compute.list_flatten
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.compute.list_parent_indices.html">
pyarrow.compute.list_parent_indices
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/files.html">
Streams and File Access
</a>
<input class="toctree-checkbox" id="toctree-checkbox-13" name="toctree-checkbox-13" type="checkbox"/>
<label for="toctree-checkbox-13">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.input_stream.html">
pyarrow.input_stream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.output_stream.html">
pyarrow.output_stream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.memory_map.html">
pyarrow.memory_map
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.create_memory_map.html">
pyarrow.create_memory_map
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.NativeFile.html">
pyarrow.NativeFile
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.OSFile.html">
pyarrow.OSFile
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.PythonFile.html">
pyarrow.PythonFile
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.BufferReader.html">
pyarrow.BufferReader
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.BufferOutputStream.html">
pyarrow.BufferOutputStream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.FixedSizeBufferWriter.html">
pyarrow.FixedSizeBufferWriter
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.MemoryMappedFile.html">
pyarrow.MemoryMappedFile
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.CompressedInputStream.html">
pyarrow.CompressedInputStream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.CompressedOutputStream.html">
pyarrow.CompressedOutputStream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.hdfs.connect.html">
pyarrow.hdfs.connect
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.LocalFileSystem.html">
pyarrow.LocalFileSystem
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/tables.html">
Tables and Tensors
</a>
<input class="toctree-checkbox" id="toctree-checkbox-14" name="toctree-checkbox-14" type="checkbox"/>
<label for="toctree-checkbox-14">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.chunked_array.html">
pyarrow.chunked_array
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.concat_arrays.html">
pyarrow.concat_arrays
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.concat_tables.html">
pyarrow.concat_tables
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.record_batch.html">
pyarrow.record_batch
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.table.html">
pyarrow.table
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ChunkedArray.html">
pyarrow.ChunkedArray
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.RecordBatch.html">
pyarrow.RecordBatch
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Table.html">
pyarrow.Table
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.Tensor.html">
pyarrow.Tensor
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/ipc.html">
Serialization and IPC
</a>
<input class="toctree-checkbox" id="toctree-checkbox-15" name="toctree-checkbox-15" type="checkbox"/>
<label for="toctree-checkbox-15">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.new_file.html">
pyarrow.ipc.new_file
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.open_file.html">
pyarrow.ipc.open_file
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.new_stream.html">
pyarrow.ipc.new_stream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.open_stream.html">
pyarrow.ipc.open_stream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.read_message.html">
pyarrow.ipc.read_message
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.read_record_batch.html">
pyarrow.ipc.read_record_batch
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.get_record_batch_size.html">
pyarrow.ipc.get_record_batch_size
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.read_tensor.html">
pyarrow.ipc.read_tensor
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.write_tensor.html">
pyarrow.ipc.write_tensor
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.get_tensor_size.html">
pyarrow.ipc.get_tensor_size
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.Message.html">
pyarrow.ipc.Message
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.MessageReader.html">
pyarrow.ipc.MessageReader
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.RecordBatchFileReader.html">
pyarrow.ipc.RecordBatchFileReader
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.RecordBatchFileWriter.html">
pyarrow.ipc.RecordBatchFileWriter
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.RecordBatchStreamReader.html">
pyarrow.ipc.RecordBatchStreamReader
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.ipc.RecordBatchStreamWriter.html">
pyarrow.ipc.RecordBatchStreamWriter
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.serialize.html">
pyarrow.serialize
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.serialize_to.html">
pyarrow.serialize_to
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.deserialize.html">
pyarrow.deserialize
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.deserialize_components.html">
pyarrow.deserialize_components
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.deserialize_from.html">
pyarrow.deserialize_from
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.read_serialized.html">
pyarrow.read_serialized
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.SerializedPyObject.html">
pyarrow.SerializedPyObject
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.SerializationContext.html">
pyarrow.SerializationContext
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/flight.html">
Arrow Flight
</a>
<input class="toctree-checkbox" id="toctree-checkbox-16" name="toctree-checkbox-16" type="checkbox"/>
<label for="toctree-checkbox-16">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.Action.html">
pyarrow.flight.Action
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.ActionType.html">
pyarrow.flight.ActionType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.DescriptorType.html">
pyarrow.flight.DescriptorType
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.FlightDescriptor.html">
pyarrow.flight.FlightDescriptor
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.FlightEndpoint.html">
pyarrow.flight.FlightEndpoint
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.FlightInfo.html">
pyarrow.flight.FlightInfo
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.Location.html">
pyarrow.flight.Location
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.Ticket.html">
pyarrow.flight.Ticket
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.Result.html">
pyarrow.flight.Result
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.FlightCallOptions.html">
pyarrow.flight.FlightCallOptions
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.FlightClient.html">
pyarrow.flight.FlightClient
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.ClientMiddlewareFactory.html">
pyarrow.flight.ClientMiddlewareFactory
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.ClientMiddleware.html">
pyarrow.flight.ClientMiddleware
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.FlightServerBase.html">
pyarrow.flight.FlightServerBase
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.GeneratorStream.html">
pyarrow.flight.GeneratorStream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.RecordBatchStream.html">
pyarrow.flight.RecordBatchStream
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.ServerMiddlewareFactory.html">
pyarrow.flight.ServerMiddlewareFactory
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.ServerMiddleware.html">
pyarrow.flight.ServerMiddleware
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.ClientAuthHandler.html">
pyarrow.flight.ClientAuthHandler
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.ServerAuthHandler.html">
pyarrow.flight.ServerAuthHandler
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.FlightMethod.html">
pyarrow.flight.FlightMethod
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.flight.CallInfo.html">
pyarrow.flight.CallInfo
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/formats.html">
Tabular File Formats
</a>
<input class="toctree-checkbox" id="toctree-checkbox-17" name="toctree-checkbox-17" type="checkbox"/>
<label for="toctree-checkbox-17">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.csv.ReadOptions.html">
pyarrow.csv.ReadOptions
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.csv.ParseOptions.html">
pyarrow.csv.ParseOptions
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.csv.ConvertOptions.html">
pyarrow.csv.ConvertOptions
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.csv.read_csv.html">
pyarrow.csv.read_csv
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.csv.open_csv.html">
pyarrow.csv.open_csv
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.csv.CSVStreamingReader.html">
pyarrow.csv.CSVStreamingReader
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.feather.read_feather.html">
pyarrow.feather.read_feather
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.feather.read_table.html">
pyarrow.feather.read_table
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.feather.write_feather.html">
pyarrow.feather.write_feather
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.json.ReadOptions.html">
pyarrow.json.ReadOptions
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.json.ParseOptions.html">
pyarrow.json.ParseOptions
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.json.read_json.html">
pyarrow.json.read_json
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.ParquetDataset.html">
pyarrow.parquet.ParquetDataset
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.ParquetFile.html">
pyarrow.parquet.ParquetFile
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.ParquetWriter.html">
pyarrow.parquet.ParquetWriter
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.read_table.html">
pyarrow.parquet.read_table
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.read_metadata.html">
pyarrow.parquet.read_metadata
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.read_pandas.html">
pyarrow.parquet.read_pandas
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.read_schema.html">
pyarrow.parquet.read_schema
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.write_metadata.html">
pyarrow.parquet.write_metadata
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.write_table.html">
pyarrow.parquet.write_table
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.parquet.write_to_dataset.html">
pyarrow.parquet.write_to_dataset
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.orc.ORCFile.html">
pyarrow.orc.ORCFile
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/filesystems.html">
Filesystems
</a>
<input class="toctree-checkbox" id="toctree-checkbox-18" name="toctree-checkbox-18" type="checkbox"/>
<label for="toctree-checkbox-18">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.FileInfo.html">
pyarrow.fs.FileInfo
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.FileSelector.html">
pyarrow.fs.FileSelector
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.FileSystem.html">
pyarrow.fs.FileSystem
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.LocalFileSystem.html">
pyarrow.fs.LocalFileSystem
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.S3FileSystem.html">
pyarrow.fs.S3FileSystem
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html">
pyarrow.fs.HadoopFileSystem
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.SubTreeFileSystem.html">
pyarrow.fs.SubTreeFileSystem
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.PyFileSystem.html">
pyarrow.fs.PyFileSystem
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.FileSystemHandler.html">
pyarrow.fs.FileSystemHandler
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.fs.FSSpecHandler.html">
pyarrow.fs.FSSpecHandler
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/dataset.html">
Dataset
</a>
<input class="toctree-checkbox" id="toctree-checkbox-19" name="toctree-checkbox-19" type="checkbox"/>
<label for="toctree-checkbox-19">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.dataset.html">
pyarrow.dataset.dataset
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.parquet_dataset.html">
pyarrow.dataset.parquet_dataset
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.partitioning.html">
pyarrow.dataset.partitioning
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.field.html">
pyarrow.dataset.field
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.scalar.html">
pyarrow.dataset.scalar
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.FileFormat.html">
pyarrow.dataset.FileFormat
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.ParquetFileFormat.html">
pyarrow.dataset.ParquetFileFormat
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.Partitioning.html">
pyarrow.dataset.Partitioning
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.PartitioningFactory.html">
pyarrow.dataset.PartitioningFactory
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.DirectoryPartitioning.html">
pyarrow.dataset.DirectoryPartitioning
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.HivePartitioning.html">
pyarrow.dataset.HivePartitioning
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.Dataset.html">
pyarrow.dataset.Dataset
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.FileSystemDataset.html">
pyarrow.dataset.FileSystemDataset
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.FileSystemFactoryOptions.html">
pyarrow.dataset.FileSystemFactoryOptions
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.FileSystemDatasetFactory.html">
pyarrow.dataset.FileSystemDatasetFactory
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.UnionDataset.html">
pyarrow.dataset.UnionDataset
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.Scanner.html">
pyarrow.dataset.Scanner
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.dataset.Expression.html">
pyarrow.dataset.Expression
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/plasma.html">
Plasma In-Memory Object Store
</a>
<input class="toctree-checkbox" id="toctree-checkbox-20" name="toctree-checkbox-20" type="checkbox"/>
<label for="toctree-checkbox-20">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.plasma.ObjectID.html">
pyarrow.plasma.ObjectID
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.plasma.PlasmaClient.html">
pyarrow.plasma.PlasmaClient
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.plasma.PlasmaBuffer.html">
pyarrow.plasma.PlasmaBuffer
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/cuda.html">
CUDA Integration
</a>
<input class="toctree-checkbox" id="toctree-checkbox-21" name="toctree-checkbox-21" type="checkbox"/>
<label for="toctree-checkbox-21">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.Context.html">
pyarrow.cuda.Context
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.CudaBuffer.html">
pyarrow.cuda.CudaBuffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.new_host_buffer.html">
pyarrow.cuda.new_host_buffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.HostBuffer.html">
pyarrow.cuda.HostBuffer
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.BufferReader.html">
pyarrow.cuda.BufferReader
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.BufferWriter.html">
pyarrow.cuda.BufferWriter
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.serialize_record_batch.html">
pyarrow.cuda.serialize_record_batch
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.read_record_batch.html">
pyarrow.cuda.read_record_batch
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.read_message.html">
pyarrow.cuda.read_message
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cuda.IpcMemHandle.html">
pyarrow.cuda.IpcMemHandle
</a>
</li>
</ul>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="api/misc.html">
Miscellaneous
</a>
<input class="toctree-checkbox" id="toctree-checkbox-22" name="toctree-checkbox-22" type="checkbox"/>
<label for="toctree-checkbox-22">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.cpu_count.html">
pyarrow.cpu_count
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.set_cpu_count.html">
pyarrow.set_cpu_count
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.get_include.html">
pyarrow.get_include
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.get_libraries.html">
pyarrow.get_libraries
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="generated/pyarrow.get_library_dirs.html">
pyarrow.get_library_dirs
</a>
</li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2">
<a class="reference internal" href="getting_involved.html">
Getting Involved
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="benchmarks.html">
Benchmarks
</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://arrow.apache.org/docs/r/">
R
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">
Ruby
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://docs.rs/crate/arrow/">
Rust
</a>
</li>
</ul>
<p class="caption">
<span class="caption-text">
Development
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="../developers/contributing.html">
Contributing to Apache Arrow
</a>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="../developers/cpp/index.html">
C++ Development
</a>
<input class="toctree-checkbox" id="toctree-checkbox-23" name="toctree-checkbox-23" type="checkbox"/>
<label for="toctree-checkbox-23">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2">
<a class="reference internal" href="../developers/cpp/building.html">
Building Arrow C++
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../developers/cpp/development.html">
Development Guidelines
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../developers/cpp/windows.html">
Developing on Windows
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../developers/cpp/conventions.html">
Conventions
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../developers/cpp/fuzzing.html">
Fuzzing Arrow C++
</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../developers/python.html">
Python Development
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../developers/archery.html">
Daily Development using Archery
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../developers/crossbow.html">
Packaging and Testing with Crossbow
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../developers/docker.html">
Running Docker Builds
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../developers/benchmarks.html">
Benchmarks
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../developers/documentation.html">
Building the Documentation
</a>
</li>
</ul>
</div>
</nav>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<div class="toc-item">
<div class="tocsection onthispage pt-5 pb-3">
<i class="fas fa-list"></i> On this page
</div>
<nav id="bd-toc-nav">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#reading-datasets">
Reading Datasets
</a>
<ul class="visible nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#dataset-discovery">
Dataset discovery
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#reading-different-file-formats">
Reading different file formats
</a>
</li>
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#customizing-file-formats">
Customizing file formats
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#filtering-data">
Filtering data
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#projecting-columns">
Projecting columns
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#reading-partitioned-data">
Reading partitioned data
</a>
<ul class="visible nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#different-partitioning-schemes">
Different partitioning schemes
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#reading-from-cloud-storage">
Reading from cloud storage
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#reading-from-minio">
Reading from Minio
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#working-with-parquet-datasets">
Working with Parquet Datasets
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#manual-specification-of-the-dataset">
Manual specification of the Dataset
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#manual-scheduling">
Manual scheduling
</a>
</li>
</ul>
</nav>
</div>
<div class="toc-item">
</div>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<div class="section" id="tabular-datasets">
<span id="dataset"></span><h1>Tabular Datasets<a class="headerlink" href="#tabular-datasets" title="Permalink to this headline"></a></h1>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module is experimental (specifically the classes),
and a stable API is not yet guaranteed.</p>
</div>
<p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module provides functionality to efficiently work with
tabular, potentially larger than memory, and multi-file datasets. This includes:</p>
<ul class="simple">
<li><p>A unified interface that supports different sources and file formats
(Parquet, Feather / Arrow IPC, and CSV files) and different file systems
(local, cloud).</p></li>
<li><p>Discovery of sources (crawling directories, handle directory-based partitioned
datasets, basic schema normalization, ..)</p></li>
<li><p>Optimized reading with predicate pushdown (filtering rows), projection
(selecting and deriving columns), and optionally parallel reading.</p></li>
</ul>
<p>Currently, only Parquet, Feather / Arrow IPC, and CSV files are supported. The
goal is to expand this in the future to other file formats and data sources
(e.g. database connections).</p>
<p>For those familiar with the existing <a class="reference internal" href="generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset" title="pyarrow.parquet.ParquetDataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.parquet.ParquetDataset</span></code></a> for
reading Parquet datasets: <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code>’s goal is similar but not specific
to the Parquet format and not tied to Python: the same datasets API is exposed
in the R bindings or Arrow. In addition <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> boasts improved
performance and new features (e.g. filtering within files rather than only on
partition keys).</p>
<div class="section" id="reading-datasets">
<h2>Reading Datasets<a class="headerlink" href="#reading-datasets" title="Permalink to this headline"></a></h2>
<p>For the examples below, let’s create a small dataset consisting
of a directory with two parquet files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">tempfile</span>
<span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pathlib</span>
<span class="gp">In [3]: </span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span>
<span class="gp">In [4]: </span><span class="kn">import</span> <span class="nn">pyarrow.parquet</span> <span class="kn">as</span> <span class="nn">pq</span>
<span class="gp">In [5]: </span><span class="n">base</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="n">tempfile</span><span class="o">.</span><span class="n">gettempdir</span><span class="p">())</span>
<span class="gp">In [6]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="go"># creating an Arrow Table</span>
<span class="gp">In [7]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">&#39;a&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;b&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;c&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span>
<span class="go"># writing it into two parquet files</span>
<span class="gp">In [8]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset/data1.parquet&quot;</span><span class="p">)</span>
<span class="gp">In [9]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">10</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset/data2.parquet&quot;</span><span class="p">)</span>
</pre></div>
</div>
<div class="section" id="dataset-discovery">
<h3>Dataset discovery<a class="headerlink" href="#dataset-discovery" title="Permalink to this headline"></a></h3>
<p>A <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object can be created with the <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function. We
can pass it the path to the directory containing the data files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [10]: </span><span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="kn">as</span> <span class="nn">ds</span>
<span class="gp">In [11]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">)</span>
<span class="gp">In [12]: </span><span class="n">dataset</span>
<span class="gh">Out[12]: </span><span class="go">&lt;pyarrow._dataset.FileSystemDataset at 0x7f06a26cbdb0&gt;</span>
</pre></div>
</div>
<p>In addition to searching a base directory, <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> accepts a path to a
single file or a list of file paths.</p>
<p>Creating a <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object does not begin reading the data itself. If
needed, it only crawls the directory to find all the files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [13]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span>
<span class="gh">Out[13]: </span><span class="go">[&#39;/tmp/parquet_dataset/data1.parquet&#39;, &#39;/tmp/parquet_dataset/data2.parquet&#39;]</span>
</pre></div>
</div>
<p>… and infers the dataset’s schema (by default from the first file):</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [14]: </span><span class="k">print</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span><span class="n">show_field_metadata</span><span class="o">=</span><span class="bp">False</span><span class="p">))</span>
<span class="go">a: int64</span>
<span class="go">b: double</span>
<span class="go">c: int64</span>
</pre></div>
</div>
<p>Using the <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-meth docutils literal notranslate"><span class="pre">Dataset.to_table()</span></code></a> method we can read the dataset (or a portion
of it) into a pyarrow Table (note that depending on the size of your dataset
this can require a lot of memory, see below on filtering / iterative loading):</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [15]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span>
<span class="gh">Out[15]: </span><span class="go"></span>
<span class="go">pyarrow.Table</span>
<span class="go">a: int64</span>
<span class="go">b: double</span>
<span class="go">c: int64</span>
<span class="go"># converting to pandas to see the contents of the scanned table</span>
<span class="gp">In [16]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[16]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 0 -0.561748 1</span>
<span class="go">1 1 0.275964 2</span>
<span class="go">2 2 0.992622 1</span>
<span class="go">3 3 -0.238940 2</span>
<span class="go">4 4 -0.473575 1</span>
<span class="go">5 5 1.865719 2</span>
<span class="go">6 6 0.237635 1</span>
<span class="go">7 7 0.578001 2</span>
<span class="go">8 8 0.776791 1</span>
<span class="go">9 9 -0.115339 2</span>
</pre></div>
</div>
</div>
<div class="section" id="reading-different-file-formats">
<h3>Reading different file formats<a class="headerlink" href="#reading-different-file-formats" title="Permalink to this headline"></a></h3>
<p>The above examples use Parquet files as dataset source but the Dataset API
provides a consistent interface across multiple file formats and filesystems.
Currently, Parquet, Feather / Arrow IPC, and CSV file formats are supported;
more formats are planned in the future.</p>
<p>If we save the table as Feather files instead of Parquet files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [17]: </span><span class="kn">import</span> <span class="nn">pyarrow.feather</span> <span class="kn">as</span> <span class="nn">feather</span>
<span class="gp">In [18]: </span><span class="n">feather</span><span class="o">.</span><span class="n">write_feather</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;data.feather&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>…then we can read the Feather file using the same functions, but with specifying
<code class="docutils literal notranslate"><span class="pre">format=&quot;feather&quot;</span></code>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [19]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;data.feather&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;feather&quot;</span><span class="p">)</span>
<span class="gp">In [20]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[20]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 0 -0.561748 1</span>
<span class="go">1 1 0.275964 2</span>
<span class="go">2 2 0.992622 1</span>
<span class="go">3 3 -0.238940 2</span>
<span class="go">4 4 -0.473575 1</span>
</pre></div>
</div>
</div>
<div class="section" id="customizing-file-formats">
<h3>Customizing file formats<a class="headerlink" href="#customizing-file-formats" title="Permalink to this headline"></a></h3>
<p>The format name as a string, like:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>is short hand for a default constructed <a class="reference internal" href="generated/pyarrow.dataset.ParquetFileFormat.html#pyarrow.dataset.ParquetFileFormat" title="pyarrow.dataset.ParquetFileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">ParquetFileFormat</span></code></a>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileForma</span><span class="p">())</span>
</pre></div>
</div>
<p>The <a class="reference internal" href="generated/pyarrow.dataset.FileFormat.html#pyarrow.dataset.FileFormat" title="pyarrow.dataset.FileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileFormat</span></code></a> objects can be customized using keywords. For example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">parquet_format</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(</span><span class="n">read_options</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;dictionary_columns&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">]})</span>
<span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">parquet_format</span><span class="p">)</span>
</pre></div>
</div>
<p>Will configure column <code class="docutils literal notranslate"><span class="pre">&quot;a&quot;</span></code> to be dictionary encoded on scan.</p>
</div>
</div>
<div class="section" id="filtering-data">
<h2>Filtering data<a class="headerlink" href="#filtering-data" title="Permalink to this headline"></a></h2>
<p>To avoid reading all data when only needing a subset, the <code class="docutils literal notranslate"><span class="pre">columns</span></code> and
<code class="docutils literal notranslate"><span class="pre">filter</span></code> keywords can be used.</p>
<p>The <code class="docutils literal notranslate"><span class="pre">columns</span></code> keyword can be used to only read the specified columns:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [21]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">)</span>
<span class="gp">In [22]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="s1">&#39;b&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[22]: </span><span class="go"></span>
<span class="go"> a b</span>
<span class="go">0 0 -0.561748</span>
<span class="go">1 1 0.275964</span>
<span class="go">2 2 0.992622</span>
<span class="go">3 3 -0.238940</span>
<span class="go">4 4 -0.473575</span>
<span class="go">5 5 1.865719</span>
<span class="go">6 6 0.237635</span>
<span class="go">7 7 0.578001</span>
<span class="go">8 8 0.776791</span>
<span class="go">9 9 -0.115339</span>
</pre></div>
</div>
<p>With the <code class="docutils literal notranslate"><span class="pre">filter</span></code> keyword, rows which do not match the filter predicate will
not be included in the returned table. The keyword expects a boolean
<a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> referencing at least one of the columns:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [23]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="mi">7</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[23]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 7 0.578001 2</span>
<span class="go">1 8 0.776791 1</span>
<span class="go">2 9 -0.115339 2</span>
<span class="gp">In [24]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;c&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[24]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 1 0.275964 2</span>
<span class="go">1 3 -0.238940 2</span>
<span class="go">2 5 1.865719 2</span>
<span class="go">3 7 0.578001 2</span>
<span class="go">4 9 -0.115339 2</span>
</pre></div>
</div>
<p>The easiest way to construct those <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> objects is by using the
<a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> helper function. Any column - not just partition columns - can be
referenced using the <a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> function (which creates a
<code class="xref py py-class docutils literal notranslate"><span class="pre">FieldExpression</span></code>). Operator overloads are provided to compose filters
including the comparisons (equal, larger/less than, etc), set membership
testing, and boolean combinations (<code class="docutils literal notranslate"><span class="pre">&amp;</span></code>, <code class="docutils literal notranslate"><span class="pre">|</span></code>, <code class="docutils literal notranslate"><span class="pre">~</span></code>):</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [25]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">3</span>
<span class="gh">Out[25]: </span><span class="go">&lt;pyarrow.dataset.Expression (a != 3)&gt;</span>
<span class="gp">In [26]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">isin</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">])</span>
<span class="gh">Out[26]: </span><span class="go"></span>
<span class="go">&lt;pyarrow.dataset.Expression is_in(a, value_set=[</span>
<span class="go"> 1,</span>
<span class="go"> 2,</span>
<span class="go"> 3</span>
<span class="go">], skip_nulls)&gt;</span>
<span class="gp">In [27]: </span><span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span> <span class="o">&gt;</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;b&#39;</span><span class="p">))</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;b&#39;</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">)</span>
<span class="gh">Out[27]: </span><span class="go">&lt;pyarrow.dataset.Expression ((a &gt; b) and (b &gt; 1))&gt;</span>
</pre></div>
</div>
<p>Note that <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> objects can <strong>not</strong> be combined by python logical
operators <code class="docutils literal notranslate"><span class="pre">and</span></code>, <code class="docutils literal notranslate"><span class="pre">or</span></code> and <code class="docutils literal notranslate"><span class="pre">not</span></code>.</p>
</div>
<div class="section" id="projecting-columns">
<h2>Projecting columns<a class="headerlink" href="#projecting-columns" title="Permalink to this headline"></a></h2>
<p>The <code class="docutils literal notranslate"><span class="pre">columns</span></code> keyword can be used to read a subset of the columns of the
dataset by passing it a list of column names. The keyword can also be used
for more complex projections in combination with expressions.</p>
<p>In this case, we pass it a dictionary with the keys being the resulting
column names and the values the expression that is used to construct the column
values:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [28]: </span><span class="n">projection</span> <span class="o">=</span> <span class="p">{</span>
<span class="gp"> ....: </span> <span class="s2">&quot;a_renamed&quot;</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">),</span>
<span class="gp"> ....: </span> <span class="s2">&quot;b_as_float32&quot;</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">&quot;b&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;float32&quot;</span><span class="p">),</span>
<span class="gp"> ....: </span> <span class="s2">&quot;c_1&quot;</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">&quot;c&quot;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span>
<span class="gp"> ....: </span><span class="p">}</span>
<span class="gp"> ....: </span>
<span class="gp">In [29]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="n">projection</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[29]: </span><span class="go"></span>
<span class="go"> a_renamed b_as_float32 c_1</span>
<span class="go">0 0 -0.561748 True</span>
<span class="go">1 1 0.275964 False</span>
<span class="go">2 2 0.992622 True</span>
<span class="go">3 3 -0.238940 False</span>
<span class="go">4 4 -0.473575 True</span>
</pre></div>
</div>
<p>The dictionary also determines the column selection (only the keys in the
dictionary will be present as columns in the resulting table). If you want
to include a derived column in <em>addition</em> to the existing columns, you can
build up the dictionary from the dataset schema:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [30]: </span><span class="n">projection</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">names</span><span class="p">}</span>
<span class="gp">In [31]: </span><span class="n">projection</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;b_large&quot;</span><span class="p">:</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">&quot;b&quot;</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">})</span>
<span class="gp">In [32]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="n">projection</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[32]: </span><span class="go"></span>
<span class="go"> a b c b_large</span>
<span class="go">0 0 -0.561748 1 False</span>
<span class="go">1 1 0.275964 2 False</span>
<span class="go">2 2 0.992622 1 False</span>
<span class="go">3 3 -0.238940 2 False</span>
<span class="go">4 4 -0.473575 1 False</span>
</pre></div>
</div>
</div>
<div class="section" id="reading-partitioned-data">
<h2>Reading partitioned data<a class="headerlink" href="#reading-partitioned-data" title="Permalink to this headline"></a></h2>
<p>Above, a dataset consisting of a flat directory with files was shown. However, a
dataset can exploit a nested directory structure defining a partitioned dataset,
where the sub-directory names hold information about which subset of the data is
stored in that directory.</p>
<p>For example, a dataset partitioned by year and month may look like on disk:</p>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>dataset_name/
year=2007/
month=01/
data0.parquet
data1.parquet
...
month=02/
data0.parquet
data1.parquet
...
month=03/
...
year=2008/
month=01/
...
...
</pre></div>
</div>
<p>The above partitioning scheme is using “/key=value/” directory names, as found
in Apache Hive.</p>
<p>Let’s create a small partitioned dataset. The <a class="reference internal" href="generated/pyarrow.parquet.write_to_dataset.html#pyarrow.parquet.write_to_dataset" title="pyarrow.parquet.write_to_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">write_to_dataset()</span></code></a>
function can write such hive-like partitioned datasets.</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [33]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">&#39;a&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;b&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;c&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">,</span>
<span class="gp"> ....: </span> <span class="s1">&#39;part&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span> <span class="o">+</span> <span class="p">[</span><span class="s1">&#39;b&#39;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span>
<span class="gp"> ....: </span>
<span class="gp">In [34]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_to_dataset</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_partitioned&quot;</span><span class="p">),</span>
<span class="gp"> ....: </span> <span class="n">partition_cols</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;part&#39;</span><span class="p">])</span>
<span class="gp"> ....: </span>
</pre></div>
</div>
<p>The above created a directory with two subdirectories (“part=a” and “part=b”),
and the Parquet files written in those directories no longer include the “part”
column.</p>
<p>Reading this dataset with <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a>, we now specify that the dataset
should use a hive-like partitioning scheme with the <cite>partitioning</cite> keyword:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [35]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_partitioned&quot;</span><span class="p">),</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">,</span>
<span class="gp"> ....: </span> <span class="n">partitioning</span><span class="o">=</span><span class="s2">&quot;hive&quot;</span><span class="p">)</span>
<span class="gp"> ....: </span>
<span class="gp">In [36]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span>
<span class="gh">Out[36]: </span><span class="go"></span>
<span class="go">[&#39;/tmp/parquet_dataset_partitioned/part=a/6778b16fc0284a1ab96768217808b72b.parquet&#39;,</span>
<span class="go"> &#39;/tmp/parquet_dataset_partitioned/part=b/3cafd61610c84ba595ba0bdef7cd97a6.parquet&#39;]</span>
</pre></div>
</div>
<p>Although the partition fields are not included in the actual Parquet files,
they will be added back to the resulting table when scanning this dataset:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [37]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[37]: </span><span class="go"></span>
<span class="go"> a b c part</span>
<span class="go">0 0 1.227143 1 a</span>
<span class="go">1 1 -0.052853 2 a</span>
<span class="go">2 2 1.673034 1 a</span>
</pre></div>
</div>
<p>We can now filter on the partition keys, which avoids loading files
altogether if they do not match the filter:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [38]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">&quot;part&quot;</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;b&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[38]: </span><span class="go"></span>
<span class="go"> a b c part</span>
<span class="go">0 5 0.834894 2 b</span>
<span class="go">1 6 -0.446624 1 b</span>
<span class="go">2 7 -1.198733 2 b</span>
<span class="go">3 8 1.609412 1 b</span>
<span class="go">4 9 -0.775587 2 b</span>
</pre></div>
</div>
<div class="section" id="different-partitioning-schemes">
<h3>Different partitioning schemes<a class="headerlink" href="#different-partitioning-schemes" title="Permalink to this headline"></a></h3>
<p>The above example uses a hive-like directory scheme, such as “/year=2009/month=11/day=15”.
We specified this passing the <code class="docutils literal notranslate"><span class="pre">partitioning=&quot;hive&quot;</span></code> keyword. In this case,
the types of the partition keys are inferred from the file paths.</p>
<p>It is also possible to explicitly define the schema of the partition keys
using the <a class="reference internal" href="generated/pyarrow.dataset.partitioning.html#pyarrow.dataset.partitioning" title="pyarrow.dataset.partitioning"><code class="xref py py-func docutils literal notranslate"><span class="pre">partitioning()</span></code></a> function. For example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span>
<span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int16</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;month&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int8</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;day&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int32</span><span class="p">())]),</span>
<span class="n">flavor</span><span class="o">=</span><span class="s2">&quot;hive&quot;</span>
<span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="n">part</span><span class="p">)</span>
</pre></div>
</div>
<p>“Directory partitioning” is also supported, where the segments in the file path
represent the values of the partition keys without including the name (the
field name are implicit in the segment’s index). For example, given field names
“year”, “month”, and “day”, one path might be “/2019/11/15”.</p>
<p>Since the names are not included in the file paths, these must be specified
when constructing a directory partitioning:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span><span class="n">field_names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">,</span> <span class="s2">&quot;day&quot;</span><span class="p">])</span>
</pre></div>
</div>
<p>Directory partitioning also supports providing a full schema rather than inferring
types from file paths.</p>
</div>
</div>
<div class="section" id="reading-from-cloud-storage">
<h2>Reading from cloud storage<a class="headerlink" href="#reading-from-cloud-storage" title="Permalink to this headline"></a></h2>
<p>In addition to local files, pyarrow also supports reading from cloud storage.
Currently, <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HDFS</span></code></a> and
<a class="reference internal" href="generated/pyarrow.fs.S3FileSystem.html#pyarrow.fs.S3FileSystem" title="pyarrow.fs.S3FileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">Amazon</span> <span class="pre">S3-compatible</span> <span class="pre">storage</span></code></a> are supported.</p>
<p>When passing a file URI, the file system will be inferred. For example,
specifying a S3 path:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">&quot;s3://ursa-labs-taxi-data/&quot;</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">])</span>
</pre></div>
</div>
<p>Typically, you will want to customize the connection parameters, and then
a file system object can be created and passed to the <code class="docutils literal notranslate"><span class="pre">filesystem</span></code> keyword:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span>
<span class="n">s3</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">region</span><span class="o">=</span><span class="s2">&quot;us-east-2&quot;</span><span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">&quot;ursa-labs-taxi-data/&quot;</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">s3</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">])</span>
</pre></div>
</div>
<p>The currently available classes are <a class="reference internal" href="generated/pyarrow.fs.S3FileSystem.html#pyarrow.fs.S3FileSystem" title="pyarrow.fs.S3FileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">S3FileSystem</span></code></a> and
<a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HadoopFileSystem</span></code></a>. See the <a class="reference internal" href="filesystems.html#filesystem"><span class="std std-ref">Filesystem Interface</span></a> docs for more
details.</p>
</div>
<div class="section" id="reading-from-minio">
<h2>Reading from Minio<a class="headerlink" href="#reading-from-minio" title="Permalink to this headline"></a></h2>
<p>In addition to cloud storage, pyarrow also supports reading from a
<a class="reference external" href="https://github.com/minio/minio">MinIO</a> object storage instance emulating S3
APIs. Paired with <a class="reference external" href="https://github.com/shopify/toxiproxy">toxiproxy</a>, this is
useful for testing or benchmarking.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span>
<span class="c1"># By default, MinIO will listen for unencrypted HTTP traffic.</span>
<span class="n">minio</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">scheme</span><span class="o">=</span><span class="s2">&quot;http&quot;</span><span class="p">,</span> <span class="n">endpoint</span><span class="o">=</span><span class="s2">&quot;localhost:9000&quot;</span><span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">&quot;ursa-labs-taxi-data/&quot;</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">minio</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="section" id="working-with-parquet-datasets">
<h2>Working with Parquet Datasets<a class="headerlink" href="#working-with-parquet-datasets" title="Permalink to this headline"></a></h2>
<p>While the Datasets API provides a unified interface to different file formats,
some specific methods exist for Parquet Datasets.</p>
<p>Some processing frameworks such as Dask (optionally) use a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file
with partitioned datasets which includes information about the schema and the
row group metadata of the full dataset. Using such file can give a more
efficient creation of a parquet Dataset, since it does not need to infer the
schema and crawl the directories for all Parquet files (this is especially the
case for filesystems where accessing files is expensive). The
<a class="reference internal" href="generated/pyarrow.dataset.parquet_dataset.html#pyarrow.dataset.parquet_dataset" title="pyarrow.dataset.parquet_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">parquet_dataset()</span></code></a> function allows to create a Dataset from a partitioned
dataset with a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">parquet_dataset</span><span class="p">(</span><span class="s2">&quot;/path/to/dir/_metadata&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>By default, the constructed <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object for Parquet datasets maps
each fragment to a single Parquet file. If you want fragments mapping to each
row group of a Parquet file, you can use the <code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> method of
the fragments:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">fragments</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">get_fragments</span><span class="p">())</span>
<span class="n">fragments</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">split_by_row_group</span><span class="p">()</span>
</pre></div>
</div>
<p>This method returns a list of new Fragments mapping to each row group of
the original Fragment (Parquet file). Both <code class="docutils literal notranslate"><span class="pre">get_fragments()</span></code> and
<code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> accept an optional filter expression to get a
filtered list of fragments.</p>
</div>
<div class="section" id="manual-specification-of-the-dataset">
<h2>Manual specification of the Dataset<a class="headerlink" href="#manual-specification-of-the-dataset" title="Permalink to this headline"></a></h2>
<p>The <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function allows easy creation of a Dataset viewing a directory,
crawling all subdirectories for files and partitioning information. However
sometimes discovery is not required and the dataset’s files and partitions
are already known (for example, when this information is stored in metadata).
In this case it is possible to create a Dataset explicitly without any
automatic discovery or inference.</p>
<p>For the example here, we are going to use a dataset where the file names contain
additional partitioning information:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="go"># creating a dummy dataset: directory with two files</span>
<span class="gp">In [39]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">&#39;col1&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">),</span> <span class="s1">&#39;col2&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">3</span><span class="p">)})</span>
<span class="gp">In [40]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="gp">In [41]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span> <span class="o">/</span> <span class="s2">&quot;data_2018.parquet&quot;</span><span class="p">)</span>
<span class="gp">In [42]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span> <span class="o">/</span> <span class="s2">&quot;data_2019.parquet&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>To create a Dataset from a list of files, we need to specify the paths, schema,
format, filesystem, and partition expressions manually:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [43]: </span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span>
<span class="gp">In [44]: </span><span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;col1&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;col2&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">())])</span>
<span class="gp">In [45]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">FileSystemDataset</span><span class="o">.</span><span class="n">from_paths</span><span class="p">(</span>
<span class="gp"> ....: </span> <span class="p">[</span><span class="s2">&quot;data_2018.parquet&quot;</span><span class="p">,</span> <span class="s2">&quot;data_2019.parquet&quot;</span><span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(),</span>
<span class="gp"> ....: </span> <span class="n">filesystem</span><span class="o">=</span><span class="n">fs</span><span class="o">.</span><span class="n">SubTreeFileSystem</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span><span class="p">),</span> <span class="n">fs</span><span class="o">.</span><span class="n">LocalFileSystem</span><span class="p">()),</span>
<span class="gp"> ....: </span> <span class="n">partitions</span><span class="o">=</span><span class="p">[</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;year&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2018</span><span class="p">,</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;year&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">])</span>
<span class="gp"> ....: </span>
</pre></div>
</div>
<p>Since we specified the “partition expressions” for our files, this information
is materialized as columns when reading the data and can be used for filtering:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [46]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[46]: </span><span class="go"></span>
<span class="go"> year col1 col2</span>
<span class="go">0 2018 0 0.495000</span>
<span class="go">1 2018 1 1.753592</span>
<span class="go">2 2018 2 -0.569114</span>
<span class="go">3 2019 0 0.495000</span>
<span class="go">4 2019 1 1.753592</span>
<span class="go">5 2019 2 -0.569114</span>
<span class="gp">In [47]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;year&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[47]: </span><span class="go"></span>
<span class="go"> year col1 col2</span>
<span class="go">0 2019 0 0.495000</span>
<span class="go">1 2019 1 1.753592</span>
<span class="go">2 2019 2 -0.569114</span>
</pre></div>
</div>
</div>
<div class="section" id="manual-scheduling">
<h2>Manual scheduling<a class="headerlink" href="#manual-scheduling" title="Permalink to this headline"></a></h2>
<p>The <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_table()</span></code></a> method loads all selected data into memory
at once resulting in a pyarrow Table. Alternatively, a dataset can also be
scanned one RecordBatch at a time in an iterative manner using the
<a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scan" title="pyarrow.dataset.Dataset.scan"><code class="xref py py-func docutils literal notranslate"><span class="pre">scan()</span></code></a> method:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">scan_task</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">scan</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="o">...</span><span class="p">],</span> <span class="nb">filter</span><span class="o">=...</span><span class="p">):</span>
<span class="k">for</span> <span class="n">record_batch</span> <span class="ow">in</span> <span class="n">scan_task</span><span class="o">.</span><span class="n">execute</span><span class="p">():</span>
<span class="c1"># process the record batch</span>
</pre></div>
</div>
</div>
</div>
</div>
<div class='prev-next-bottom'>
<a class='left-prev' id="prev-link" href="parquet.html" title="previous page">Reading and Writing the Apache Parquet Format</a>
<a class='right-next' id="next-link" href="cuda.html" title="next page">CUDA Integration</a>
</div>
</main>
</div>
</div>
<script src="../_static/js/index.1c5a1a01449ed65a7b51.js"></script>
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p class="copyright">
&copy; Copyright 2016-2019 Apache Software Foundation.<br/>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 4.0.2.<br/>
</p>
</div>
</div>
</footer>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script> </body>
</html>