| % Generated by roxygen2: do not edit by hand |
| % Please edit documentation in R/dataset.R |
| \name{open_delim_dataset} |
| \alias{open_delim_dataset} |
| \alias{open_csv_dataset} |
| \alias{open_tsv_dataset} |
| \title{Open a multi-file dataset of CSV or other delimiter-separated format} |
| \usage{ |
| open_delim_dataset( |
| sources, |
| schema = NULL, |
| partitioning = hive_partition(), |
| hive_style = NA, |
| unify_schemas = NULL, |
| factory_options = list(), |
| delim = ",", |
| quote = "\\"", |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| col_names = TRUE, |
| col_types = NULL, |
| na = c("", "NA"), |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| convert_options = NULL, |
| read_options = NULL, |
| timestamp_parsers = NULL |
| ) |
| |
| open_csv_dataset( |
| sources, |
| schema = NULL, |
| partitioning = hive_partition(), |
| hive_style = NA, |
| unify_schemas = NULL, |
| factory_options = list(), |
| quote = "\\"", |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| col_names = TRUE, |
| col_types = NULL, |
| na = c("", "NA"), |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| convert_options = NULL, |
| read_options = NULL, |
| timestamp_parsers = NULL |
| ) |
| |
| open_tsv_dataset( |
| sources, |
| schema = NULL, |
| partitioning = hive_partition(), |
| hive_style = NA, |
| unify_schemas = NULL, |
| factory_options = list(), |
| quote = "\\"", |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| col_names = TRUE, |
| col_types = NULL, |
| na = c("", "NA"), |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| convert_options = NULL, |
| read_options = NULL, |
| timestamp_parsers = NULL |
| ) |
| } |
| \arguments{ |
| \item{sources}{One of: |
| \itemize{ |
| \item a string path or URI to a directory containing data files |
| \item a \link{FileSystem} that references a directory containing data files |
| (such as what is returned by \code{\link[=s3_bucket]{s3_bucket()}}) |
| \item a string path or URI to a single file |
| \item a character vector of paths or URIs to individual data files |
| \item a list of \code{Dataset} objects as created by this function |
| \item a list of \code{DatasetFactory} objects as created by \code{\link[=dataset_factory]{dataset_factory()}}. |
| } |
| |
| When \code{sources} is a vector of file URIs, they must all use the same protocol |
| and point to files located in the same file system and having the same |
| format.} |
| |
| \item{schema}{\link{Schema} for the \code{Dataset}. If \code{NULL} (the default), the schema |
| will be inferred from the data sources.} |
| |
| \item{partitioning}{When \code{sources} is a directory path/URI, one of: |
| \itemize{ |
| \item a \code{Schema}, in which case the file paths relative to \code{sources} will be |
| parsed, and path segments will be matched with the schema fields. |
| \item a character vector that defines the field names corresponding to those |
| path segments (that is, you're providing the names that would correspond |
| to a \code{Schema} but the types will be autodetected) |
| \item a \code{Partitioning} or \code{PartitioningFactory}, such as returned |
| by \code{\link[=hive_partition]{hive_partition()}} |
| \item \code{NULL} for no partitioning |
| } |
| |
| The default is to autodetect Hive-style partitions unless |
| \code{hive_style = FALSE}. See the "Partitioning" section for details. |
| When \code{sources} is not a directory path/URI, \code{partitioning} is ignored.} |
| |
| \item{hive_style}{Logical: should \code{partitioning} be interpreted as |
| Hive-style? Default is \code{NA}, which means to inspect the file paths for |
| Hive-style partitioning and behave accordingly.} |
| |
| \item{unify_schemas}{logical: should all data fragments (files, \code{Dataset}s) |
| be scanned in order to create a unified schema from them? If \code{FALSE}, only |
| the first fragment will be inspected for its schema. Use this fast path |
| when you know and trust that all fragments have an identical schema. |
| The default is \code{FALSE} when creating a dataset from a directory path/URI or |
| vector of file paths/URIs (because there may be many files and scanning may |
| be slow) but \code{TRUE} when \code{sources} is a list of \code{Dataset}s (because there |
| should be few \code{Dataset}s in the list and their \code{Schema}s are already in |
| memory).} |
| |
| \item{factory_options}{list of optional FileSystemFactoryOptions: |
| \itemize{ |
| \item \code{partition_base_dir}: string path segment prefix to ignore when |
| discovering partition information with DirectoryPartitioning. Not |
| meaningful (ignored with a warning) for HivePartitioning, nor is it |
| valid when providing a vector of file paths. |
| \item \code{exclude_invalid_files}: logical: should files that are not valid data |
| files be excluded? Default is \code{FALSE} because checking all files up |
| front incurs I/O and thus will be slower, especially on remote |
| filesystems. If \code{FALSE} and there are invalid files, there will be an |
| error at scan time. This is the only FileSystemFactoryOption that is |
| valid both when providing a directory path in which to discover |
| files and when providing a vector of file paths. |
| \item \code{selector_ignore_prefixes}: character vector of file prefixes to ignore |
| when discovering files in a directory. If invalid files can be excluded |
| by a common filename prefix this way, you can avoid the I/O cost of |
| \code{exclude_invalid_files}. Not valid when providing a vector of file paths |
| (but if you're providing the file list, you can filter invalid files |
| yourself). |
| }} |
| |
| \item{delim}{Single character used to separate fields within a record.} |
| |
| \item{quote}{Single character used to quote strings.} |
| |
| \item{escape_double}{Does the file escape quotes by doubling them? |
| That is, if this option is \code{TRUE}, the value \verb{""""} represents |
| a single quote, \verb{\\"}.} |
| |
| \item{escape_backslash}{Does the file use backslashes to escape special |
| characters? This is more general than \code{escape_double} as backslashes |
| can be used to escape the delimiter character, the quote character, or |
| to add special characters like \verb{\\\\n}.} |
| |
| \item{col_names}{If \code{TRUE}, the first row of the input will be used as the |
| column names and will not be included in the data frame. If \code{FALSE}, column |
| names will be generated by Arrow, starting with "f0", "f1", ..., "fN". |
| Alternatively, you can specify a character vector of column names.} |
| |
| \item{col_types}{A compact string representation of the column types, |
| an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the data.} |
| |
| \item{na}{A character vector of strings to interpret as missing values.} |
| |
| \item{skip_empty_rows}{Should blank rows be ignored altogether? If |
| \code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be |
| filled with missing values.} |
| |
| \item{skip}{Number of lines to skip before reading data.} |
| |
| \item{convert_options}{see \link[=CsvReadOptions]{file reader options}} |
| |
| \item{read_options}{see \link[=CsvReadOptions]{file reader options}} |
| |
| \item{timestamp_parsers}{User-defined timestamp parsers. If more than one |
| parser is specified, the CSV conversion logic will try parsing values |
| starting from the beginning of this vector. Possible values are: |
| \itemize{ |
| \item \code{NULL}: the default, which uses the ISO-8601 parser |
| \item a character vector of \link[base:strptime]{strptime} parse strings |
| \item a list of \link{TimestampParser} objects |
| }} |
| } |
| \description{ |
| A wrapper around \link{open_dataset} which explicitly includes parameters mirroring \code{\link[=read_csv_arrow]{read_csv_arrow()}}, |
| \code{\link[=read_delim_arrow]{read_delim_arrow()}}, and \code{\link[=read_tsv_arrow]{read_tsv_arrow()}} to allow for easy switching between functions |
| for opening single files and functions for opening datasets. |
| } |
| \section{Options currently supported by \code{\link[=read_delim_arrow]{read_delim_arrow()}} which are not supported here}{ |
| |
| \itemize{ |
| \item \code{file} (instead, please specify files in \code{sources}) |
| \item \code{col_select} (instead, subset columns after dataset creation) |
| \item \code{quoted_na} |
| \item \code{as_data_frame} (instead, convert to data frame after dataset creation) |
| \item \code{parse_options} |
| } |
| } |
| |
| \examples{ |
| \dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} |
| # Set up directory for examples |
| tf <- tempfile() |
| dir.create(tf) |
| df <- data.frame(x = c("1", "2", "NULL")) |
| |
| file_path <- file.path(tf, "file1.txt") |
| write.table(df, file_path, sep = ",", row.names = FALSE) |
| |
| read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1) |
| open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1) |
| |
| unlink(tf, recursive = TRUE) |
| \dontshow{\}) # examplesIf} |
| } |
| \seealso{ |
| \code{\link[=open_dataset]{open_dataset()}} |
| } |