% blob: 225aab28dd256e8c2d06710e36c978288fd6b023 [file] [log] [blame]
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset-write.R
\name{write_dataset}
\alias{write_dataset}
\title{Write a dataset}
\usage{
write_dataset(
dataset,
path,
format = c("parquet", "feather", "arrow", "ipc"),
partitioning = dplyr::group_vars(dataset),
basename_template = paste0("part-{i}.", as.character(format)),
hive_style = TRUE,
...
)
}
\arguments{
\item{dataset}{\link{Dataset}, \link{RecordBatch}, \link{Table}, \code{arrow_dplyr_query}, or
\code{data.frame}. If an \code{arrow_dplyr_query} or \code{grouped_df},
\code{schema} and \code{partitioning} will be taken from the result of any \code{select()}
and \code{group_by()} operations done on the dataset. \code{filter()} queries will be
applied to restrict written rows.
Note that \code{select()}-ed columns may not be renamed.}
\item{path}{string path, URI, or \code{SubTreeFileSystem} referencing a directory
to write to (directory will be created if it does not exist)}
\item{format}{a string identifier of the file format. Default is to use
"parquet" (see \link{FileFormat})}
\item{partitioning}{\code{Partitioning} or a character vector of columns to
use as partition keys (to be written as path segments). Default is to
use the current \code{group_by()} columns.}
\item{basename_template}{string template for the names of files to be written.
Must contain \code{"{i}"}, which will be replaced with an autoincremented
integer to generate basenames of datafiles. For example, \code{"part-{i}.feather"}
will yield \verb{"part-0.feather", ...}.}
\item{hive_style}{logical: write partition segments as Hive-style
(\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.}
\item{...}{additional format-specific arguments. For available Parquet
options, see \code{\link[=write_parquet]{write_parquet()}}. The available Feather options are
\itemize{
\item \code{use_legacy_format}: logical; write data formatted so that Arrow library
versions 0.14 and lower can read it. Default is \code{FALSE}. You can also
enable this by setting the environment variable \code{ARROW_PRE_0_15_IPC_FORMAT=1}.
\item \code{metadata_version}: A string like "V5" or the equivalent integer indicating
the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
unless the environment variable \code{ARROW_PRE_1_0_METADATA_VERSION=1} is set, in
which case it will be V4.
\item \code{codec}: A \link{Codec} which will be used to compress body buffers of written
files. Default (NULL) will not compress body buffers.
\item \code{null_fallback}: character to be used in place of missing values (\code{NA} or
\code{NULL}) when using Hive-style partitioning. See \code{\link[=hive_partition]{hive_partition()}}.
}}
}
\value{
The input \code{dataset}, invisibly
}
\description{
This function allows you to write a dataset. By writing to more efficient
binary storage formats, and by specifying relevant partitioning, you can
make it much faster to read and query.
}