% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset-write.R
\name{write_dataset}
\alias{write_dataset}
\title{Write a dataset}
\usage{
write_dataset(
  dataset,
  path,
  format = c("parquet", "feather", "arrow", "ipc", "csv", "tsv", "txt", "text"),
  partitioning = dplyr::group_vars(dataset),
  basename_template = paste0("part-{i}.", as.character(format)),
  hive_style = TRUE,
  existing_data_behavior = c("overwrite", "error", "delete_matching"),
  max_partitions = 1024L,
  max_open_files = 900L,
  max_rows_per_file = 0L,
  min_rows_per_group = 0L,
  max_rows_per_group = bitwShiftL(1, 20),
  ...
)
}
\arguments{
\item{dataset}{\link{Dataset}, \link{RecordBatch}, \link{Table}, \code{arrow_dplyr_query}, or
\code{data.frame}. If an \code{arrow_dplyr_query}, the query will be evaluated and
the result will be written. This means that you can use \code{select()}, \code{filter()},
\code{mutate()}, etc. to transform the data before it is written, if you need to.}
\item{path}{string path, URI, or \code{SubTreeFileSystem} referencing a directory
to write to (directory will be created if it does not exist)}
\item{format}{a string identifier of the file format. Default is to use
"parquet" (see \link{FileFormat})}
\item{partitioning}{\code{Partitioning} or a character vector of columns to
use as partition keys (to be written as path segments). Default is to
use the current \code{group_by()} columns.}
\item{basename_template}{string template for the names of files to be written.
Must contain \code{"{i}"}, which will be replaced with an autoincremented
integer to generate basenames of datafiles. For example, \code{"part-{i}.arrow"}
will yield \verb{"part-0.arrow", ...}.
If not specified, it defaults to \code{"part-{i}.<default extension>"}.}
\item{hive_style}{logical: write partition segments as Hive-style
(\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.}
\item{existing_data_behavior}{The behavior to use when there is already data
in the destination directory. Must be one of "overwrite", "error", or
"delete_matching".
\itemize{
\item "overwrite" (the default): any new files created will overwrite
existing files with the same names
\item "error": the operation will fail if the destination directory is
not empty
\item "delete_matching": the writer will delete any existing partitions
to which data is about to be written, and will leave alone partitions
to which no data is written.
}}
\item{max_partitions}{maximum number of partitions any batch may be
written into. Default is 1024L.}
\item{max_open_files}{maximum number of files that can be left open
during a write operation. If greater than 0, this limits how many files
can be open at once; if an attempt is made to open too many files, the
least recently used file will be closed. If this setting is set too low,
you may end up fragmenting your data into many small files. The default
is 900, which also leaves some files available to the scanner before
hitting the default Linux open-file limit of 1024.}
\item{max_rows_per_file}{maximum number of rows per file. If greater
than 0, this limits how many rows are placed in any single file.
Default is 0L (no limit).}
\item{min_rows_per_group}{write row groups to disk once this number of
rows has accumulated. Default is 0L.}
\item{max_rows_per_group}{maximum number of rows allowed in a single
group; when this limit is exceeded, the group is split and the next set
of rows is written to the next group. This value must be greater than
\code{min_rows_per_group}. Default is 1024 * 1024.}
\item{...}{additional format-specific arguments. For available Parquet
options, see \code{\link[=write_parquet]{write_parquet()}}. The available Feather options are:
\itemize{
\item \code{use_legacy_format} logical: write data formatted so that Arrow library
versions 0.14 and lower can read it. Default is \code{FALSE}. You can also
enable this by setting the environment variable \code{ARROW_PRE_0_15_IPC_FORMAT=1}.
\item \code{metadata_version}: A string like "V5" or the equivalent integer indicating
the Arrow IPC MetadataVersion. Default (\code{NULL}) will use the latest version,
unless the environment variable \code{ARROW_PRE_1_0_METADATA_VERSION=1}, in
which case it will be V4.
\item \code{codec}: A \link{Codec} which will be used to compress body buffers of written
files. Default (\code{NULL}) will not compress body buffers.
\item \code{null_fallback}: character to be used in place of missing values (\code{NA} or
\code{NULL}) when using Hive-style partitioning. See \code{\link[=hive_partition]{hive_partition()}}.
}}
}
\value{
The input \code{dataset}, invisibly
}
\description{
This function allows you to write a dataset. By writing to more efficient
binary storage formats, and by specifying relevant partitioning, you can
make the dataset much faster to read and query.
}
\examples{
\dontshow{if (arrow_with_dataset() & arrow_with_parquet() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# You can write datasets partitioned by the values in a column (here: "cyl").
# This creates a structure of the form cyl=X/part-Z.parquet.
one_level_tree <- tempfile()
write_dataset(mtcars, one_level_tree, partitioning = "cyl")
list.files(one_level_tree, recursive = TRUE)
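# If you write again to a directory that already contains data, the
# `existing_data_behavior` argument controls what happens. A sketch:
# "delete_matching" replaces only the partitions being rewritten (here
# cyl=4), leaving cyl=6 and cyl=8 untouched.
write_dataset(
  mtcars[mtcars$cyl == 4, ],
  one_level_tree,
  partitioning = "cyl",
  existing_data_behavior = "delete_matching"
)
list.files(one_level_tree, recursive = TRUE)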
# You can also partition by the values in multiple columns
# (here: "cyl" and "gear").
# This creates a structure of the form cyl=X/gear=Y/part-Z.parquet.
two_levels_tree <- tempfile()
write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear"))
list.files(two_levels_tree, recursive = TRUE)
# In the partitioned examples above we would have:
# X = {4,6,8}, the number of cylinders.
# Y = {3,4,5}, the number of forward gears.
# Z = {0,1,2}, the number of saved parts, starting from 0.
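# You can customize the names of the written files with `basename_template`;
# "{i}" is replaced with an auto-incremented integer. A sketch (the
# `named_tree` path and "chunk-" prefix are just illustrative):
named_tree <- tempfile()
write_dataset(mtcars, named_tree,
  partitioning = "cyl",
  basename_template = "chunk-{i}.parquet"
)
list.files(named_tree, recursive = TRUE)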
# You can obtain the same result as the two_levels_tree example above using
# arrow with a dplyr pipeline. This will be the same as two_levels_tree, but
# the output directory will be different.
library(dplyr)
two_levels_tree_2 <- tempfile()
mtcars \%>\%
  group_by(cyl, gear) \%>\%
  write_dataset(two_levels_tree_2)
list.files(two_levels_tree_2, recursive = TRUE)
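# Since the data is evaluated before it is written, you can also transform
# it in the same pipeline. A sketch using filter() and select() on a data
# frame; the same works on an arrow Dataset, where the query is evaluated
# before writing.
subset_tree <- tempfile()
mtcars \%>\%
  filter(mpg > 20) \%>\%
  select(mpg, cyl, gear) \%>\%
  group_by(cyl) \%>\%
  write_dataset(subset_tree)
list.files(subset_tree, recursive = TRUE)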
# And you can also turn off the Hive-style directory naming where the column
# name is included with the values by using `hive_style = FALSE`.
# Write a structure X/Y/part-Z.parquet.
two_levels_tree_no_hive <- tempfile()
mtcars \%>\%
  group_by(cyl, gear) \%>\%
  write_dataset(two_levels_tree_no_hive, hive_style = FALSE)
list.files(two_levels_tree_no_hive, recursive = TRUE)
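# Other formats and file-size limits are available as well. A sketch:
# write Feather (IPC) files instead of Parquet, capping each file at 10
# rows with `max_rows_per_file` so larger partitions are split across
# part-0.feather, part-1.feather, and so on.
feather_tree <- tempfile()
write_dataset(
  mtcars,
  feather_tree,
  format = "feather",
  partitioning = "cyl",
  max_rows_per_file = 10
)
list.files(feather_tree, recursive = TRUE)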
\dontshow{\}) # examplesIf}
}