| % Generated by roxygen2: do not edit by hand |
| % Please edit documentation in R/csv.R |
| \name{read_delim_arrow} |
| \alias{read_delim_arrow} |
| \alias{read_csv_arrow} |
| \alias{read_tsv_arrow} |
| \title{Read a CSV or other delimited file with Arrow} |
| \usage{ |
| read_delim_arrow( |
| file, |
| delim = ",", |
| quote = "\\"", |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| schema = NULL, |
| col_names = TRUE, |
| col_types = NULL, |
| col_select = NULL, |
| na = c("", "NA"), |
| quoted_na = TRUE, |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| parse_options = NULL, |
| convert_options = NULL, |
| read_options = NULL, |
| as_data_frame = TRUE, |
| timestamp_parsers = NULL |
| ) |
| |
| read_csv_arrow( |
| file, |
| quote = "\\"", |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| schema = NULL, |
| col_names = TRUE, |
| col_types = NULL, |
| col_select = NULL, |
| na = c("", "NA"), |
| quoted_na = TRUE, |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| parse_options = NULL, |
| convert_options = NULL, |
| read_options = NULL, |
| as_data_frame = TRUE, |
| timestamp_parsers = NULL |
| ) |
| |
| read_tsv_arrow( |
| file, |
| quote = "\\"", |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| schema = NULL, |
| col_names = TRUE, |
| col_types = NULL, |
| col_select = NULL, |
| na = c("", "NA"), |
| quoted_na = TRUE, |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| parse_options = NULL, |
| convert_options = NULL, |
| read_options = NULL, |
| as_data_frame = TRUE, |
| timestamp_parsers = NULL |
| ) |
| } |
| \arguments{ |
| \item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, |
| or a \code{FileSystem} with path (\code{SubTreeFileSystem}). |
| If a file name, a memory-mapped Arrow \link{InputStream} will be opened and |
| closed when finished; compression will be detected from the file extension |
| and handled automatically. If an input stream is provided, it will be left |
| open.} |
| |
| \item{delim}{Single character used to separate fields within a record.} |
| |
| \item{quote}{Single character used to quote strings.} |
| |
| \item{escape_double}{Does the file escape quotes by doubling them? |
| i.e. If this option is \code{TRUE}, the value \verb{""""} represents |
| a single quote, \verb{\\"}.} |
| |
| \item{escape_backslash}{Does the file use backslashes to escape special |
| characters? This is more general than \code{escape_double} as backslashes |
| can be used to escape the delimiter character, the quote character, or |
| to add special characters like \verb{\\\\n}.} |
| |
| \item{schema}{\link{Schema} that describes the table. If provided, it will be |
| used to satisfy both \code{col_names} and \code{col_types}.} |
| |
| \item{col_names}{If \code{TRUE}, the first row of the input will be used as the |
| column names and will not be included in the data frame. If \code{FALSE}, column |
| names will be generated by Arrow, starting with "f0", "f1", ..., "fN". |
| Alternatively, you can specify a character vector of column names.} |
| |
| \item{col_types}{A compact string representation of the column types, or |
| \code{NULL} (the default) to infer types from the data.} |
| |
| \item{col_select}{A character vector of column names to keep, as in the |
| "select" argument to \code{data.table::fread()}, or a |
| \link[tidyselect:vars_select]{tidy selection specification} |
| of columns, as used in \code{dplyr::select()}.} |
| |
| \item{na}{A character vector of strings to interpret as missing values.} |
| |
| \item{quoted_na}{Should missing values inside quotes be treated as missing |
| values (the default) or as strings? (Note that this is different from |
| the Arrow C++ default for the corresponding convert option, |
| \code{strings_can_be_null}.)} |
| |
| \item{skip_empty_rows}{Should blank rows be ignored altogether? If |
| \code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be |
| filled with missings.} |
| |
| \item{skip}{Number of lines to skip before reading data.} |
| |
| \item{parse_options}{see \link[=CsvReadOptions]{file reader options}. |
| If given, this overrides any |
| parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).} |
| |
| \item{convert_options}{see \link[=CsvReadOptions]{file reader options}} |
| |
| \item{read_options}{see \link[=CsvReadOptions]{file reader options}} |
| |
| \item{as_data_frame}{Should the function return a \code{data.frame} (default) or |
| an Arrow \link{Table}?} |
| |
| \item{timestamp_parsers}{User-defined timestamp parsers. If more than one |
| parser is specified, the CSV conversion logic will try parsing values |
| starting from the beginning of this vector. Possible values are: |
| \itemize{ |
| \item \code{NULL}: the default, which uses the ISO-8601 parser |
| \item a character vector of \link[base:strptime]{strptime} parse strings |
| \item a list of \link{TimestampParser} objects |
| }} |
| } |
| \value{ |
| A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}. |
| } |
| \description{ |
| These functions use the Arrow C++ CSV reader to read into a \code{data.frame}. |
| Arrow C++ options have been mapped to argument names that follow those of |
| \code{readr::read_delim()}, and \code{col_select} was inspired by \code{vroom::vroom()}. |
| } |
| \details{ |
| \code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around |
| \code{read_delim_arrow()} that specify a delimiter. |
| |
| Note that not all \code{readr} options are currently implemented here. Please file |
| an issue if you encounter one that \code{arrow} should support. |
| |
| If you need to control Arrow-specific reader parameters that don't have an |
| equivalent in \code{readr::read_csv()}, you can either provide them in the |
| \code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can |
| use \link{CsvTableReader} directly for lower-level access. |
| } |
| \section{Specifying column types and names}{ |
| |
| |
| By default, the CSV reader will infer the column names and data types from the file, but there |
| are a few ways you can specify them directly. |
| |
| One way is to provide an Arrow \link{Schema} in the \code{schema} argument, |
| which is an ordered map of column name to type. |
| When provided, it satisfies both the \code{col_names} and \code{col_types} arguments. |
| This is good if you know all of this information up front. |
| |
| You can also pass a \code{Schema} to the \code{col_types} argument. If you do this, |
| column names will still be inferred from the file unless you also specify |
| \code{col_names}. In either case, the column names in the \code{Schema} must match the |
| data's column names, whether they are explicitly provided or inferred. That |
| said, this \code{Schema} does not have to reference all columns: those omitted |
| will have their types inferred. |
| |
| Alternatively, you can declare column types by providing the compact string representation |
| that \code{readr} uses to the \code{col_types} argument. This means you provide a |
| single string, one character per column, where the characters map to Arrow |
| types analogously to the \code{readr} type mapping: |
| \itemize{ |
| \item "c": \code{utf8()} |
| \item "i": \code{int32()} |
| \item "n": \code{float64()} |
| \item "d": \code{float64()} |
| \item "l": \code{bool()} |
| \item "f": \code{dictionary()} |
| \item "D": \code{date32()} |
| \item "T": \code{time32()} |
| \item "t": \code{timestamp()} |
| \item "_": \code{null()} |
| \item "-": \code{null()} |
| \item "?": infer the type from the data |
| } |
| |
| If you use the compact string representation for \code{col_types}, you must also |
| specify \code{col_names}. |
| |
| Regardless of how types are specified, all columns with a \code{null()} type will |
| be dropped. |
| |
| Note that if you are specifying column names, whether by \code{schema} or |
| \code{col_names}, and the CSV file has a header row that would otherwise be used |
| to identify column names, you'll need to add \code{skip = 1} to skip that row. |
| } |
| |
| \examples{ |
| \donttest{ |
| tf <- tempfile() |
| on.exit(unlink(tf)) |
| write.csv(mtcars, file = tf) |
| df <- read_csv_arrow(tf) |
| dim(df) |
| # Can select columns |
| df <- read_csv_arrow(tf, col_select = starts_with("d")) |
| } |
| } |