r/man/dataset_factory.Rd - arrow - Git at Google

 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/dataset-factory.R
 \name{dataset_factory}
 \alias{dataset_factory}
 \title{Create a DatasetFactory}
 \usage{
 dataset_factory(
   x,
   filesystem = NULL,
   format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
   partitioning = NULL,
   ...
 )
 }
 \arguments{
 \item{x}{A string path to a directory containing data files, a vector of one
 or more string paths to data files, or a list of \code{DatasetFactory} objects
 whose datasets should be combined. If this argument is specified it will be
 used to construct a \code{UnionDatasetFactory} and other arguments will be
 ignored.}

 \item{filesystem}{A \link{FileSystem} object; if omitted, the \code{FileSystem} will
 be detected from \code{x}}

 \item{format}{A \link{FileFormat} object, or a string identifier of the format of
 the files in \code{x}. Currently supported values:
 \itemize{
 \item "parquet"
 \item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
 only version 2 files are supported
 \item "csv"/"text", aliases for the same thing (because comma is the default
 delimiter for text files)
 \item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
 }

 Default is "parquet", unless a \code{delimiter} is also specified, in which case
 it is assumed to be "text".}

 \item{partitioning}{One of
 \itemize{
 \item A \code{Schema}, in which case the file paths relative to \code{x} will be
 parsed, and path segments will be matched with the schema fields. For
 example, \code{schema(year = int16(), month = int8())} would create partitions
 for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
 \item A character vector that defines the field names corresponding to those
 path segments (that is, you're providing the names that would correspond
 to a \code{Schema} but the types will be autodetected)
 \item A \code{HivePartitioning} or \code{HivePartitioningFactory}, as returned
 by \code{\link[=hive_partition]{hive_partition()}} which parses explicit or autodetected fields from
 Hive-style path segments
 \item \code{NULL} for no partitioning
 }}

 \item{...}{Additional format-specific options, passed to
 \code{FileFormat$create()}. For CSV options, note that you can specify them either
 with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
 \code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
 Not all \code{readr} options are currently supported; please file an issue if you
 encounter one that \code{arrow} should support.}
 }
 \value{
 A \code{DatasetFactory} object. Pass this to \code{\link[=open_dataset]{open_dataset()}},
 in a list potentially with other \code{DatasetFactory} objects, to create
 a \code{Dataset}.
 }
 \description{
 A \link{Dataset} can be constructed using one or more \link{DatasetFactory}s.
 This function helps you construct a \code{DatasetFactory} that you can pass to
 \code{\link[=open_dataset]{open_dataset()}}.
 }
 \details{
 If you only have a single \code{DatasetFactory} (for example, you have a
 single directory containing Parquet files), you can call \code{open_dataset()}
 directly. Use \code{dataset_factory()} when you
 want to combine different directories, file systems, or file formats.
 }
	% Generated by roxygen2: do not edit by hand
	% Please edit documentation in R/dataset-factory.R
	\name{dataset_factory}
	\alias{dataset_factory}
	\title{Create a DatasetFactory}
	\usage{
	dataset_factory(
	x,
	filesystem = NULL,
	format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
	partitioning = NULL,
	...
	)
	}
	\arguments{
	\item{x}{A string path to a directory containing data files, a vector of one
	or more string paths to data files, or a list of \code{DatasetFactory} objects
	whose datasets should be combined. If this argument is specified it will be
	used to construct a \code{UnionDatasetFactory} and other arguments will be
	ignored.}

	\item{filesystem}{A \link{FileSystem} object; if omitted, the \code{FileSystem} will
	be detected from \code{x}}

	\item{format}{A \link{FileFormat} object, or a string identifier of the format of
	the files in \code{x}. Currently supported values:
	\itemize{
	\item "parquet"
	\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
	only version 2 files are supported
	\item "csv"/"text", aliases for the same thing (because comma is the default
	delimiter for text files)
	\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
	}

	Default is "parquet", unless a \code{delimiter} is also specified, in which case
	it is assumed to be "text".}

	\item{partitioning}{One of
	\itemize{
	\item A \code{Schema}, in which case the file paths relative to \code{x} will be
	parsed, and path segments will be matched with the schema fields. For
	example, \code{schema(year = int16(), month = int8())} would create partitions
	for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
	\item A character vector that defines the field names corresponding to those
	path segments (that is, you're providing the names that would correspond
	to a \code{Schema} but the types will be autodetected)
	\item A \code{HivePartitioning} or \code{HivePartitioningFactory}, as returned
	by \code{\link[=hive_partition]{hive_partition()}} which parses explicit or autodetected fields from
	Hive-style path segments
	\item \code{NULL} for no partitioning
	}}

	\item{...}{Additional format-specific options, passed to
	\code{FileFormat$create()}. For CSV options, note that you can specify them either
	with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
	\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
	Not all \code{readr} options are currently supported; please file an issue if you
	encounter one that \code{arrow} should support.}
	}
	\value{
	A \code{DatasetFactory} object. Pass this to \code{\link[=open_dataset]{open_dataset()}},
	in a list potentially with other \code{DatasetFactory} objects, to create
	a \code{Dataset}.
	}
	\description{
	A \link{Dataset} can be constructed using one or more \link{DatasetFactory}s.
	This function helps you construct a \code{DatasetFactory} that you can pass to
	\code{\link[=open_dataset]{open_dataset()}}.
	}
	\details{
	If you only have a single \code{DatasetFactory} (for example, you have a
	single directory containing Parquet files), you can call \code{open_dataset()}
	directly. Use \code{dataset_factory()} when you
	want to combine different directories, file systems, or file formats.
	}