r/man/write_parquet.Rd - arrow - Git at Google

 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/parquet.R
 \name{write_parquet}
 \alias{write_parquet}
 \title{Write Parquet file to disk}
 \usage{
 write_parquet(
   x,
   sink,
   chunk_size = NULL,
   version = "2.4",
   compression = default_parquet_compression(),
   compression_level = NULL,
   use_dictionary = NULL,
   write_statistics = NULL,
   data_page_size = NULL,
   use_deprecated_int96_timestamps = FALSE,
   coerce_timestamps = NULL,
   allow_truncated_timestamps = FALSE
 )
 }
 \arguments{
 \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}

 \item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
 system (\code{SubTreeFileSystem})}

 \item{chunk_size}{how many rows of data to write to disk at once. This
 directly corresponds to how many rows will be in each row group in
 parquet. If \code{NULL}, a best guess will be made for optimal size (based on
 the number of columns and number of rows), though if the data has fewer
 than 250 million cells (rows x cols), then the total number of rows is
 used.}

 \item{version}{parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
 "2.6", or "latest" (currently equivalent to 2.6). Numeric values are
 coerced to character.}

 \item{compression}{compression algorithm. Default "snappy". See details.}

 \item{compression_level}{compression level. Meaning depends on the compression
 algorithm.}

 \item{use_dictionary}{logical: use dictionary encoding? Default \code{TRUE}}

 \item{write_statistics}{logical: include statistics? Default \code{TRUE}}

 \item{data_page_size}{Set a target threshold for the approximate encoded
 size of data pages within a column chunk (in bytes). Default 1 MiB.}

 \item{use_deprecated_int96_timestamps}{logical: write timestamps to INT96
 Parquet format, which has been deprecated? Default \code{FALSE}.}

 \item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be
 \code{NULL}, "ms" or "us". Default \code{NULL} (no casting).}

 \item{allow_truncated_timestamps}{logical: Allow loss of data when coercing
 timestamps to a particular resolution. E.g. if microsecond or nanosecond
 data is lost when coercing to "ms", do not raise an exception. Default
 \code{FALSE}.}
 }
 \value{
 the input \code{x} invisibly.
 }
 \description{
 \href{https://parquet.apache.org/}{Parquet} is a columnar storage file format.
 This function enables you to write Parquet files from R.
 }
 \details{
 Due to features of the format, Parquet files cannot be appended to.
 If you want to use the Parquet format but also want the ability to extend
 your dataset, you can write to additional Parquet files and then treat
 the whole directory of files as a \link{Dataset} you can query.
 See the \href{https://arrow.apache.org/docs/r/articles/dataset.html}{dataset
 article} for examples of this.

 The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} and
 \code{write_statistics} support various patterns:
 \itemize{
 \item The default \code{NULL} leaves the parameter unspecified, and the C++ library
 uses an appropriate default for each column (defaults listed above)
 \item A single, unnamed, value (e.g. a single string for \code{compression}) applies to all columns
 \item An unnamed vector, of the same size as the number of columns, to specify a
 value for each column, in positional order
 \item A named vector, to specify the value for the named columns; the default
 value for the setting is used for any column that is not named
 }

 The \code{compression} argument can be any of the following (case insensitive):
 "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
 Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
 are almost always included. See \code{\link[=codec_is_available]{codec_is_available()}}.
 The default "snappy" is used if available, otherwise "uncompressed". To
 disable compression, set \code{compression = "uncompressed"}.
 Note that "uncompressed" columns may still have dictionary encoding.
 }
 \examples{
 \dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 tf1 <- tempfile(fileext = ".parquet")
 write_parquet(data.frame(x = 1:5), tf1)

 # using compression
 if (codec_is_available("gzip")) {
   tf2 <- tempfile(fileext = ".gz.parquet")
   write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
 }
 \dontshow{\}) # examplesIf}
 }
 \seealso{
 \link{ParquetFileWriter} for a lower-level interface to Parquet writing.
 }
	% Generated by roxygen2: do not edit by hand
	% Please edit documentation in R/parquet.R
	\name{write_parquet}
	\alias{write_parquet}
	\title{Write Parquet file to disk}
	\usage{
	write_parquet(
	x,
	sink,
	chunk_size = NULL,
	version = "2.4",
	compression = default_parquet_compression(),
	compression_level = NULL,
	use_dictionary = NULL,
	write_statistics = NULL,
	data_page_size = NULL,
	use_deprecated_int96_timestamps = FALSE,
	coerce_timestamps = NULL,
	allow_truncated_timestamps = FALSE
	)
	}
	\arguments{
	\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}

	\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
	system (\code{SubTreeFileSystem})}

	\item{chunk_size}{how many rows of data to write to disk at once. This
	directly corresponds to how many rows will be in each row group in
	parquet. If \code{NULL}, a best guess will be made for optimal size (based on
	the number of columns and number of rows), though if the data has fewer
	than 250 million cells (rows x cols), then the total number of rows is
	used.}

	\item{version}{parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
	"2.6", or "latest" (currently equivalent to 2.6). Numeric values are
	coerced to character.}

	\item{compression}{compression algorithm. Default "snappy". See details.}

	\item{compression_level}{compression level. Meaning depends on the compression
	algorithm.}

	\item{use_dictionary}{logical: use dictionary encoding? Default \code{TRUE}}

	\item{write_statistics}{logical: include statistics? Default \code{TRUE}}

	\item{data_page_size}{Set a target threshold for the approximate encoded
	size of data pages within a column chunk (in bytes). Default 1 MiB.}

	\item{use_deprecated_int96_timestamps}{logical: write timestamps to INT96
	Parquet format, which has been deprecated? Default \code{FALSE}.}

	\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be
	\code{NULL}, "ms" or "us". Default \code{NULL} (no casting).}

	\item{allow_truncated_timestamps}{logical: Allow loss of data when coercing
	timestamps to a particular resolution. E.g. if microsecond or nanosecond
	data is lost when coercing to "ms", do not raise an exception. Default
	\code{FALSE}.}
	}
	\value{
	the input \code{x} invisibly.
	}
	\description{
	\href{https://parquet.apache.org/}{Parquet} is a columnar storage file format.
	This function enables you to write Parquet files from R.
	}
	\details{
	Due to features of the format, Parquet files cannot be appended to.
	If you want to use the Parquet format but also want the ability to extend
	your dataset, you can write to additional Parquet files and then treat
	the whole directory of files as a \link{Dataset} you can query.
	See the \href{https://arrow.apache.org/docs/r/articles/dataset.html}{dataset
	article} for examples of this.

	The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} and
	\code{write_statistics} support various patterns:
	\itemize{
	\item The default \code{NULL} leaves the parameter unspecified, and the C++ library
	uses an appropriate default for each column (defaults listed above)
	\item A single, unnamed, value (e.g. a single string for \code{compression}) applies to all columns
	\item An unnamed vector, of the same size as the number of columns, to specify a
	value for each column, in positional order
	\item A named vector, to specify the value for the named columns; the default
	value for the setting is used for any column that is not named
	}

	The \code{compression} argument can be any of the following (case insensitive):
	"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
	Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
	are almost always included. See \code{\link[=codec_is_available]{codec_is_available()}}.
	The default "snappy" is used if available, otherwise "uncompressed". To
	disable compression, set \code{compression = "uncompressed"}.
	Note that "uncompressed" columns may still have dictionary encoding.
	}
	\examples{
	\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
	tf1 <- tempfile(fileext = ".parquet")
	write_parquet(data.frame(x = 1:5), tf1)

	# using compression
	if (codec_is_available("gzip")) {
	tf2 <- tempfile(fileext = ".gz.parquet")
	write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
	}
	\dontshow{\}) # examplesIf}
	}
	\seealso{
	\link{ParquetFileWriter} for a lower-level interface to Parquet writing.
	}