% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset.R
\name{open_delim_dataset}
\alias{open_delim_dataset}
\alias{open_csv_dataset}
\alias{open_tsv_dataset}
\title{Open a multi-file dataset of CSV or other delimiter-separated format}
\usage{
open_delim_dataset(
  sources,
  schema = NULL,
  partitioning = hive_partition(),
  hive_style = NA,
  unify_schemas = NULL,
  factory_options = list(),
  delim = ",",
  quote = "\\"",
  escape_double = TRUE,
  escape_backslash = FALSE,
  col_names = TRUE,
  col_types = NULL,
  na = c("", "NA"),
  skip_empty_rows = TRUE,
  skip = 0L,
  convert_options = NULL,
  read_options = NULL,
  timestamp_parsers = NULL
)

open_csv_dataset(
  sources,
  schema = NULL,
  partitioning = hive_partition(),
  hive_style = NA,
  unify_schemas = NULL,
  factory_options = list(),
  quote = "\\"",
  escape_double = TRUE,
  escape_backslash = FALSE,
  col_names = TRUE,
  col_types = NULL,
  na = c("", "NA"),
  skip_empty_rows = TRUE,
  skip = 0L,
  convert_options = NULL,
  read_options = NULL,
  timestamp_parsers = NULL
)

open_tsv_dataset(
  sources,
  schema = NULL,
  partitioning = hive_partition(),
  hive_style = NA,
  unify_schemas = NULL,
  factory_options = list(),
  quote = "\\"",
  escape_double = TRUE,
  escape_backslash = FALSE,
  col_names = TRUE,
  col_types = NULL,
  na = c("", "NA"),
  skip_empty_rows = TRUE,
  skip = 0L,
  convert_options = NULL,
  read_options = NULL,
  timestamp_parsers = NULL
)
}
\arguments{
\item{sources}{One of:
\itemize{
\item a string path or URI to a directory containing data files
\item a \link{FileSystem} that references a directory containing data files
(such as what is returned by \code{\link[=s3_bucket]{s3_bucket()}})
\item a string path or URI to a single file
\item a character vector of paths or URIs to individual data files
\item a list of \code{Dataset} objects as created by this function
\item a list of \code{DatasetFactory} objects as created by \code{\link[=dataset_factory]{dataset_factory()}}.
}
When \code{sources} is a vector of file URIs, they must all use the same protocol
and point to files located in the same file system and having the same
format.}
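For example, \code{sources} can be given in any of these forms (a sketch;
the local paths and bucket name are hypothetical):

\preformatted{open_csv_dataset("data/")                # directory of data files
open_csv_dataset(c("a.csv", "b.csv"))    # vector of individual files
open_csv_dataset(s3_bucket("my-bucket")$path("csv-data"))  # remote FileSystem
}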
\item{schema}{\link{Schema} for the \code{Dataset}. If \code{NULL} (the default), the schema
will be inferred from the data sources.}
\item{partitioning}{When \code{sources} is a directory path/URI, one of:
\itemize{
\item a \code{Schema}, in which case the file paths relative to \code{sources} will be
parsed, and path segments will be matched with the schema fields.
\item a character vector that defines the field names corresponding to those
path segments (that is, you're providing the names that would correspond
to a \code{Schema} but the types will be autodetected)
\item a \code{Partitioning} or \code{PartitioningFactory}, such as returned
by \code{\link[=hive_partition]{hive_partition()}}
\item \code{NULL} for no partitioning
}
The default is to autodetect Hive-style partitions unless
\code{hive_style = FALSE}. See the "Partitioning" section for details.
When \code{sources} is not a directory path/URI, \code{partitioning} is ignored.}
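For instance, with hypothetical \code{year}/\code{month} partition fields
(a sketch):

\preformatted{# Hive-style paths such as data/year=2009/month=1/part-0.csv
open_csv_dataset("data/", partitioning = hive_partition(year = int32(), month = int32()))

# Bare path segments such as data/2009/1/part-0.csv: give the field
# names and let the types be autodetected
open_csv_dataset("data/", partitioning = c("year", "month"))
}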
\item{hive_style}{Logical: should \code{partitioning} be interpreted as
Hive-style? Default is \code{NA}, which means to inspect the file paths for
Hive-style partitioning and behave accordingly.}
\item{unify_schemas}{logical: should all data fragments (files, \code{Dataset}s)
be scanned in order to create a unified schema from them? If \code{FALSE}, only
the first fragment will be inspected for its schema. Use this fast path
when you know and trust that all fragments have an identical schema.
The default is \code{FALSE} when creating a dataset from a directory path/URI or
vector of file paths/URIs (because there may be many files and scanning may
be slow) but \code{TRUE} when \code{sources} is a list of \code{Dataset}s (because there
should be few \code{Dataset}s in the list and their \code{Schema}s are already in
memory).}
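A minimal sketch (hypothetical directory):

\preformatted{# Scan every file so the unified schema covers fields that
# appear in only some of them
open_csv_dataset("data/", unify_schemas = TRUE)
}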
\item{factory_options}{list of optional FileSystemFactoryOptions:
\itemize{
\item \code{partition_base_dir}: string path segment prefix to ignore when
discovering partition information with DirectoryPartitioning. Not
meaningful (ignored with a warning) for HivePartitioning, nor is it
valid when providing a vector of file paths.
\item \code{exclude_invalid_files}: logical: should files that are not valid data
files be excluded? Default is \code{FALSE} because checking all files up
front incurs I/O and thus will be slower, especially on remote
filesystems. If \code{FALSE} and there are invalid files, there will be an
error at scan time. This is the only FileSystemFactoryOption that is
valid for both when providing a directory path in which to discover
files and when providing a vector of file paths.
\item \code{selector_ignore_prefixes}: character vector of file prefixes to ignore
when discovering files in a directory. If invalid files can be excluded
by a common filename prefix this way, you can avoid the I/O cost of
\code{exclude_invalid_files}. Not valid when providing a vector of file paths
(but if you're providing the file list, you can filter invalid files
yourself).
}}
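For example (a sketch; the prefixes are hypothetical):

\preformatted{# Ignore dotfiles and underscore-prefixed sidecar files during discovery
open_csv_dataset(
  "data/",
  factory_options = list(selector_ignore_prefixes = c(".", "_"))
)
}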
\item{delim}{Single character used to separate fields within a record.}
\item{quote}{Single character used to quote strings.}
\item{escape_double}{Does the file escape quotes by doubling them?
i.e. If this option is \code{TRUE}, the value \verb{""""} represents
a single quote, \verb{\\"}.}
\item{escape_backslash}{Does the file use backslashes to escape special
characters? This is more general than \code{escape_double} as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like \verb{\\\\n}.}
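A sketch contrasting the two quoting conventions (\code{files} is a
hypothetical vector of paths):

\preformatted{open_csv_dataset(files)  # default: quotes are escaped by doubling them
open_csv_dataset(files, escape_double = FALSE, escape_backslash = TRUE)
}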
\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
column names and will not be included in the data frame. If \code{FALSE}, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}
\item{col_types}{A compact string representation of the column types,
an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the data.}
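For example (a sketch; \code{files} and the column layout are hypothetical):

\preformatted{# Compact string, one letter per column (integer, double, character);
# names are supplied explicitly and the header row skipped
open_csv_dataset(files, col_types = "idc", col_names = c("x", "y", "z"), skip = 1)

# The same types as an explicit Schema
open_csv_dataset(files, col_types = schema(x = int32(), y = float64(), z = string()),
                 col_names = c("x", "y", "z"), skip = 1)
}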
\item{na}{A character vector of strings to interpret as missing values.}
\item{skip_empty_rows}{Should blank rows be ignored altogether? If
\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be
filled with missing values.}
\item{skip}{Number of lines to skip before reading data.}
\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
parser is specified, the CSV conversion logic will try parsing values
starting from the beginning of this vector. Possible values are:
\itemize{
\item \code{NULL}: the default, which uses the ISO-8601 parser
\item a character vector of \link[base:strptime]{strptime} parse strings
\item a list of \link{TimestampParser} objects
}}
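For example (a sketch; the formats are hypothetical):

\preformatted{# Try day-first dates before falling back to ISO-style dates
open_csv_dataset(files, timestamp_parsers = c("\%d/\%m/\%Y", "\%Y-\%m-\%d"))
}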
}
\description{
A wrapper around \link{open_dataset} which explicitly includes parameters mirroring \code{\link[=read_csv_arrow]{read_csv_arrow()}},
\code{\link[=read_delim_arrow]{read_delim_arrow()}}, and \code{\link[=read_tsv_arrow]{read_tsv_arrow()}} to allow easy switching
between functions for opening single files and functions for opening datasets.
}
\section{Options currently supported by \code{\link[=read_delim_arrow]{read_delim_arrow()}} which are not supported here}{
\itemize{
\item \code{file} (instead, please specify files in \code{sources})
\item \code{col_select} (instead, subset columns after dataset creation)
\item \code{quoted_na}
\item \code{as_data_frame} (instead, convert to data frame after dataset creation)
\item \code{parse_options}
}
}
\examples{
\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# Set up directory for examples
tf <- tempfile()
dir.create(tf)
df <- data.frame(x = c("1", "2", "NULL"))
file_path <- file.path(tf, "file1.txt")
write.table(df, file_path, sep = ",", row.names = FALSE)
read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
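# A sketch of the same call through the general delimited reader
open_delim_dataset(file_path, delim = ",", na = c("", "NA", "NULL"), col_names = "y", skip = 1)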
unlink(tf)
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[=open_dataset]{open_dataset()}}
}