blob: dc12832c97fbfbc98733d27398708138b8ddf099 [file] [log] [blame]
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset.R
\name{Partitioning}
\alias{Partitioning}
\alias{DirectoryPartitioning}
\alias{HivePartitioning}
\alias{DirectoryPartitioningFactory}
\alias{HivePartitioningFactory}
\title{Define Partitioning for a Dataset}
\description{
Pass a \code{Partitioning} object to a \link{FileSystemDatasetFactory}'s \verb{$create()}
method to indicate how the file paths should be interpreted to define
partitioning.

\code{DirectoryPartitioning} describes how to interpret raw path segments, in
order. For example, \code{schema(year = int16(), month = int8())} would define
partitions for file paths like "2019/01/file.parquet",
"2019/02/file.parquet", etc.

\code{HivePartitioning} is for Hive-style partitioning, which embeds field
names and values in path segments, such as
"/year=2019/month=2/data.parquet". Because fields are named in the path
segments, order does not matter.

\code{PartitioningFactory} subclasses instruct the \code{DatasetFactory} to detect
partition features from the file paths.
}
\section{Factory}{
Both the \code{DirectoryPartitioning$create()} and \code{HivePartitioning$create()}
methods take a \link{Schema} as their single input argument. The helper
function \code{\link[=hive_partition]{hive_partition(...)}} is shorthand for
\code{HivePartitioning$create(schema(...))}.

With \code{DirectoryPartitioningFactory$create()}, you can provide just the
names of the path segments (in the example above, \code{c("year", "month")}), and
the \code{DatasetFactory} will infer the data types for those partition variables.

\code{HivePartitioningFactory$create()} takes no arguments: both variable names
and their types can be inferred from the file paths. \code{hive_partition()} with
no arguments returns a \code{HivePartitioningFactory}.
}