r/man/FileFormat.Rd - arrow - Git at Google

 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/dataset-format.R
 \name{FileFormat}
 \alias{FileFormat}
 \alias{ParquetFileFormat}
 \alias{IpcFileFormat}
 \title{Dataset file formats}
 \description{
 A \code{FileFormat} holds information about how to read and parse the files
 included in a \code{Dataset}. There are subclasses corresponding to the supported
 file formats (\code{ParquetFileFormat} and \code{IpcFileFormat}).
 }
 \section{Factory}{

 \code{FileFormat$create()} takes the following arguments:
 \itemize{
 \item \code{format}: A string identifier of the file format. Currently supported values:
 \itemize{
 \item "parquet"
 \item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
 only version 2 files are supported
 \item "csv"/"text", aliases for the same thing (because comma is the default
 delimiter for text files)
 \item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
 }
 \item \code{...}: Additional format-specific options

 \code{format = "parquet"}:
 \itemize{
 \item \code{dict_columns}: Names of columns which should be read as dictionaries.
 \item Any Parquet options from \link{FragmentScanOptions}.
 }

 \code{format = "text"}: see \link{CsvParseOptions}. Note that you can specify them either
 with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
 \code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
 Not all \code{readr} options are currently supported; please file an issue if
 you encounter one that \code{arrow} should support. Also, the following options are
 supported. From \link{CsvReadOptions}:
 \itemize{
 \item \code{skip_rows}
 \item \code{column_names}. Note that if a \link{Schema} is specified, \code{column_names} must match those specified in the schema.
 \item \code{autogenerate_column_names}
 }

 From \link{CsvFragmentScanOptions} (these values can be overridden at scan time):
 \itemize{
 \item \code{convert_options}: a \link{CsvConvertOptions}
 \item \code{block_size}
 }
 }

 It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat}).
 }

 \examples{
 \dontshow{if (arrow_with_dataset()) withAutoprint(\{ # examplesIf}
 ## Semi-colon delimited files
 # Set up directory for examples
 tf <- tempfile()
 dir.create(tf)
 on.exit(unlink(tf))
 write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)

 # Create FileFormat object
 format <- FileFormat$create(format = "text", delimiter = ";")

 open_dataset(tf, format = format)
 \dontshow{\}) # examplesIf}
 }
	% Generated by roxygen2: do not edit by hand
	% Please edit documentation in R/dataset-format.R
	\name{FileFormat}
	\alias{FileFormat}
	\alias{ParquetFileFormat}
	\alias{IpcFileFormat}
	\title{Dataset file formats}
	\description{
	A \code{FileFormat} holds information about how to read and parse the files
	included in a \code{Dataset}. There are subclasses corresponding to the supported
	file formats (\code{ParquetFileFormat} and \code{IpcFileFormat}).
	}
	\section{Factory}{

	\code{FileFormat$create()} takes the following arguments:
	\itemize{
	\item \code{format}: A string identifier of the file format. Currently supported values:
	\itemize{
	\item "parquet"
	\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
	only version 2 files are supported
	\item "csv"/"text", aliases for the same thing (because comma is the default
	delimiter for text files)
	\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
	}
	\item \code{...}: Additional format-specific options

	\code{format = "parquet"}:
	\itemize{
	\item \code{dict_columns}: Names of columns which should be read as dictionaries.
	\item Any Parquet options from \link{FragmentScanOptions}.
	}

	\code{format = "text"}: see \link{CsvParseOptions}. Note that you can specify them either
	with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
	\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
	Not all \code{readr} options are currently supported; please file an issue if
	you encounter one that \code{arrow} should support. Also, the following options are
	supported. From \link{CsvReadOptions}:
	\itemize{
	\item \code{skip_rows}
	\item \code{column_names}. Note that if a \link{Schema} is specified, \code{column_names} must match those specified in the schema.
	\item \code{autogenerate_column_names}
	}

	From \link{CsvFragmentScanOptions} (these values can be overridden at scan time):
	\itemize{
	\item \code{convert_options}: a \link{CsvConvertOptions}
	\item \code{block_size}
	}
	}

	It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat}).
	}

	\examples{
	\dontshow{if (arrow_with_dataset()) withAutoprint(\{ # examplesIf}
	## Semi-colon delimited files
	# Set up directory for examples
	tf <- tempfile()
	dir.create(tf)
	on.exit(unlink(tf))
	write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)

	# Create FileFormat object
	format <- FileFormat$create(format = "text", delimiter = ";")

	open_dataset(tf, format = format)
	\dontshow{\}) # examplesIf}
	}