% blob: c89c709dfb0d05079025aad9605b669b6bf2c3a7 [file] [log] [blame]
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/parquet.R
\name{write_parquet}
\alias{write_parquet}
\title{Write Parquet file to disk}
\usage{
write_parquet(
x,
sink,
chunk_size = NULL,
version = NULL,
compression = default_parquet_compression(),
compression_level = NULL,
use_dictionary = NULL,
write_statistics = NULL,
data_page_size = NULL,
use_deprecated_int96_timestamps = FALSE,
coerce_timestamps = NULL,
allow_truncated_timestamps = FALSE,
properties = NULL,
arrow_properties = NULL
)
}
\arguments{
\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
system (\code{SubTreeFileSystem})}
\item{chunk_size}{chunk size in number of rows. If \code{NULL}, the total number of rows is used.}
\item{version}{parquet version, "1.0" or "2.0". Default "1.0". Numeric values
are coerced to character.}
\item{compression}{compression algorithm. Default "snappy". See details.}
\item{compression_level}{compression level. Meaning depends on compression algorithm.}
\item{use_dictionary}{Specify if we should use dictionary encoding. Default \code{TRUE}}
\item{write_statistics}{Specify if we should write statistics. Default \code{TRUE}}
\item{data_page_size}{Set a target threshold for the approximate encoded
size of data pages within a column chunk (in bytes). Default 1 MiB.}
\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format. Default \code{FALSE}.}
\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be
\code{NULL}, "ms" or "us". Default \code{NULL} (no casting).}
\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a
particular resolution. E.g., if microsecond or nanosecond data is lost when coercing
to "ms", do not raise an exception.}
\item{properties}{A \code{ParquetWriterProperties} object, used instead of the options
enumerated in this function's signature. Providing \code{properties} as an argument
is deprecated; if you need to assemble \code{ParquetWriterProperties} outside
of \code{write_parquet()}, use \code{ParquetFileWriter} instead.}
\item{arrow_properties}{A \code{ParquetArrowWriterProperties} object. Like
\code{properties}, this argument is deprecated.}
}
\value{
the input \code{x} invisibly.
}
\description{
\href{https://parquet.apache.org/}{Parquet} is a columnar storage file format.
This function enables you to write Parquet files from R.
}
\details{
Due to features of the format, Parquet files cannot be appended to.
If you want to use the Parquet format but also want the ability to extend
your dataset, you can write to additional Parquet files and then treat
the whole directory of files as a \link{Dataset} you can query.
See \code{vignette("dataset", package = "arrow")} for examples of this.
The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} and
\code{write_statistics} support various patterns:
\itemize{
\item The default \code{NULL} leaves the parameter unspecified, and the C++ library
uses an appropriate default for each column (defaults listed above)
\item A single, unnamed, value (e.g. a single string for \code{compression}) applies to all columns
\item An unnamed vector, of the same size as the number of columns, to specify a
value for each column, in positional order
\item A named vector, to specify the value for the named columns; the default
value for the setting is used for any column that is not named
}
The \code{compression} argument can be any of the following (case insensitive):
"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
are almost always included. See \code{\link[=codec_is_available]{codec_is_available()}}.
The default "snappy" is used if available, otherwise "uncompressed". To
disable compression, set \code{compression = "uncompressed"}.
Note that "uncompressed" columns may still have dictionary encoding.
}
\examples{
\dontrun{
tf1 <- tempfile(fileext = ".parquet")
write_parquet(data.frame(x = 1:5), tf1)
# using compression
if (codec_is_available("gzip")) {
tf2 <- tempfile(fileext = ".gz.parquet")
write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
}
}
}