| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| #' Read a CSV or other delimited file with Arrow |
| #' |
#' These functions use the Arrow C++ CSV reader to read a file into a `data.frame`.
| #' Arrow C++ options have been mapped to argument names that follow those of |
| #' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`. |
| #' |
| #' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around |
| #' `read_delim_arrow()` that specify a delimiter. |
| #' |
| #' Note that not all `readr` options are currently implemented here. Please file |
| #' an issue if you encounter one that `arrow` should support. |
| #' |
| #' If you need to control Arrow-specific reader parameters that don't have an |
| #' equivalent in `readr::read_csv()`, you can either provide them in the |
| #' `parse_options`, `convert_options`, or `read_options` arguments, or you can |
| #' use [CsvTableReader] directly for lower-level access. |
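#'
#' For example, a minimal sketch that overrides an Arrow-specific parse option
#' (the file name here is hypothetical):
#'
#' ```
#' read_delim_arrow(
#'   "pipe_delimited.txt",
#'   parse_options = CsvParseOptions$create(delimiter = "|")
#' )
#' ```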
| #' |
| #' @section Specifying column types and names: |
| #' |
| #' By default, the CSV reader will infer the column names and data types from the file, but there |
| #' are a few ways you can specify them directly. |
| #' |
| #' One way is to provide an Arrow [Schema] in the `schema` argument, |
| #' which is an ordered map of column name to type. |
| #' When provided, it satisfies both the `col_names` and `col_types` arguments. |
| #' This is good if you know all of this information up front. |
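#'
#' For example, a sketch assuming a headerless file with an integer column and
#' a string column (the file name is hypothetical):
#'
#' ```
#' read_csv_arrow("no_header.csv", schema = schema(x = int32(), y = utf8()))
#' ```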
| #' |
| #' You can also pass a `Schema` to the `col_types` argument. If you do this, |
| #' column names will still be inferred from the file unless you also specify |
| #' `col_names`. In either case, the column names in the `Schema` must match the |
| #' data's column names, whether they are explicitly provided or inferred. That |
| #' said, this `Schema` does not have to reference all columns: those omitted |
| #' will have their types inferred. |
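#'
#' For instance, a minimal sketch that fixes the type of one column (assuming
#' the file has a column named `x`) and lets the others be inferred:
#'
#' ```
#' read_csv_arrow("file.csv", col_types = schema(x = float64()))
#' ```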
| #' |
#' Alternatively, you can pass the compact string representation that `readr`
#' uses to the `col_types` argument. This means you provide a
| #' single string, one character per column, where the characters map to Arrow |
| #' types analogously to the `readr` type mapping: |
| #' |
| #' * "c": `utf8()` |
| #' * "i": `int32()` |
| #' * "n": `float64()` |
| #' * "d": `float64()` |
| #' * "l": `bool()` |
| #' * "f": `dictionary()` |
| #' * "D": `date32()` |
| #' * "T": `time32()` |
| #' * "t": `timestamp()` |
| #' * "_": `null()` |
| #' * "-": `null()` |
| #' * "?": infer the type from the data |
| #' |
| #' If you use the compact string representation for `col_types`, you must also |
| #' specify `col_names`. |
| #' |
| #' Regardless of how types are specified, all columns with a `null()` type will |
| #' be dropped. |
| #' |
| #' Note that if you are specifying column names, whether by `schema` or |
| #' `col_names`, and the CSV file has a header row that would otherwise be used |
#' to identify column names, you'll need to add `skip = 1` to skip that row.
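#'
#' Putting these together, a sketch for a hypothetical three-column file with a
#' header row, where the `"_"` drops the third column:
#'
#' ```
#' read_csv_arrow(
#'   "file.csv",
#'   col_types = "id_",
#'   col_names = c("a", "b", "c"),
#'   skip = 1
#' )
#' ```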
| #' |
| #' @param file A character file name or URI, `raw` vector, an Arrow input stream, |
| #' or a `FileSystem` with path (`SubTreeFileSystem`). |
| #' If a file name, a memory-mapped Arrow [InputStream] will be opened and |
| #' closed when finished; compression will be detected from the file extension |
| #' and handled automatically. If an input stream is provided, it will be left |
| #' open. |
| #' @param delim Single character used to separate fields within a record. |
| #' @param quote Single character used to quote strings. |
| #' @param escape_double Does the file escape quotes by doubling them? |
| #' i.e. If this option is `TRUE`, the value `""""` represents |
| #' a single quote, `\"`. |
| #' @param escape_backslash Does the file use backslashes to escape special |
| #' characters? This is more general than `escape_double` as backslashes |
| #' can be used to escape the delimiter character, the quote character, or |
| #' to add special characters like `\\n`. |
| #' @param schema [Schema] that describes the table. If provided, it will be |
| #' used to satisfy both `col_names` and `col_types`. |
| #' @param col_names If `TRUE`, the first row of the input will be used as the |
#' column names and will not be included in the data frame. If `FALSE`, column
#' names will be autogenerated by Arrow as "f0", "f1", ..., "fN".
| #' Alternatively, you can specify a character vector of column names. |
#' @param col_types A compact string representation of the column types, an
#' Arrow [Schema], or `NULL` (the default) to infer types from the data.
| #' @param col_select A character vector of column names to keep, as in the |
| #' "select" argument to `data.table::fread()`, or a |
| #' [tidy selection specification][tidyselect::vars_select()] |
| #' of columns, as used in `dplyr::select()`. |
| #' @param na A character vector of strings to interpret as missing values. |
#' @param quoted_na Should missing values inside quotes be treated as missing
#' values (the default) or strings? (Note that this is different from the
#' Arrow C++ default for the corresponding convert option,
#' `strings_can_be_null`.)
| #' @param skip_empty_rows Should blank rows be ignored altogether? If |
| #' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be |
| #' filled with missings. |
| #' @param skip Number of lines to skip before reading data. |
| #' @param timestamp_parsers User-defined timestamp parsers. If more than one |
| #' parser is specified, the CSV conversion logic will try parsing values |
| #' starting from the beginning of this vector. Possible values are: |
| #' - `NULL`: the default, which uses the ISO-8601 parser |
| #' - a character vector of [strptime][base::strptime()] parse strings |
| #' - a list of [TimestampParser] objects |
| #' @param parse_options see [file reader options][CsvReadOptions]. |
| #' If given, this overrides any |
| #' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.). |
| #' @param convert_options see [file reader options][CsvReadOptions] |
| #' @param read_options see [file reader options][CsvReadOptions] |
| #' @param as_data_frame Should the function return a `data.frame` (default) or |
| #' an Arrow [Table]? |
| #' |
#' @return A `data.frame`, or an Arrow [Table] if `as_data_frame = FALSE`.
| #' @export |
| #' @examples |
| #' \donttest{ |
| #' tf <- tempfile() |
| #' on.exit(unlink(tf)) |
| #' write.csv(mtcars, file = tf) |
| #' df <- read_csv_arrow(tf) |
| #' dim(df) |
| #' # Can select columns |
| #' df <- read_csv_arrow(tf, col_select = starts_with("d")) |
| #' } |
| read_delim_arrow <- function(file, |
| delim = ",", |
| quote = '"', |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| schema = NULL, |
| col_names = TRUE, |
| col_types = NULL, |
| col_select = NULL, |
| na = c("", "NA"), |
| quoted_na = TRUE, |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| parse_options = NULL, |
| convert_options = NULL, |
| read_options = NULL, |
| as_data_frame = TRUE, |
| timestamp_parsers = NULL) { |
| if (inherits(schema, "Schema")) { |
| col_names <- names(schema) |
| col_types <- schema |
| } |
| if (is.null(parse_options)) { |
| parse_options <- readr_to_csv_parse_options( |
| delim, |
| quote, |
| escape_double, |
| escape_backslash, |
| skip_empty_rows |
| ) |
| } |
| if (is.null(read_options)) { |
| read_options <- readr_to_csv_read_options(skip, col_names) |
| } |
| if (is.null(convert_options)) { |
| convert_options <- readr_to_csv_convert_options( |
| na, |
| quoted_na, |
| col_types = col_types, |
| col_names = read_options$column_names, |
| timestamp_parsers = timestamp_parsers |
| ) |
| } |
| |
| if (!inherits(file, "InputStream")) { |
| file <- make_readable_file(file) |
| on.exit(file$close()) |
| } |
| reader <- CsvTableReader$create( |
| file, |
| read_options = read_options, |
| parse_options = parse_options, |
| convert_options = convert_options |
| ) |
| |
| tab <- reader$Read() |
| |
| # TODO: move this into convert_options using include_columns |
| col_select <- enquo(col_select) |
| if (!quo_is_null(col_select)) { |
| tab <- tab[vars_select(names(tab), !!col_select)] |
| } |
| |
| if (isTRUE(as_data_frame)) { |
| tab <- as.data.frame(tab) |
| } |
| |
| tab |
| } |
| |
| #' @rdname read_delim_arrow |
| #' @export |
| read_csv_arrow <- function(file, |
| quote = '"', |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| schema = NULL, |
| col_names = TRUE, |
| col_types = NULL, |
| col_select = NULL, |
| na = c("", "NA"), |
| quoted_na = TRUE, |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| parse_options = NULL, |
| convert_options = NULL, |
| read_options = NULL, |
| as_data_frame = TRUE, |
| timestamp_parsers = NULL) { |
| |
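  # Rewrite this call as a call to read_delim_arrow() with delim = ",", then
  # evaluate it in the caller's frame so that arguments (including tidyselect
  # expressions in `col_select`) are evaluated where the user wrote them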
| mc <- match.call() |
| mc$delim <- "," |
| mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow")) |
| eval.parent(mc) |
| } |
| |
| #' @rdname read_delim_arrow |
| #' @export |
| read_tsv_arrow <- function(file, |
| quote = '"', |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| schema = NULL, |
| col_names = TRUE, |
| col_types = NULL, |
| col_select = NULL, |
| na = c("", "NA"), |
| quoted_na = TRUE, |
| skip_empty_rows = TRUE, |
| skip = 0L, |
| parse_options = NULL, |
| convert_options = NULL, |
| read_options = NULL, |
| as_data_frame = TRUE, |
| timestamp_parsers = NULL) { |
| |
| mc <- match.call() |
| mc$delim <- "\t" |
| mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow")) |
| eval.parent(mc) |
| } |
| |
| #' @title Arrow CSV and JSON table reader classes |
| #' @rdname CsvTableReader |
| #' @name CsvTableReader |
| #' @docType class |
| #' @usage NULL |
| #' @format NULL |
| #' @description `CsvTableReader` and `JsonTableReader` wrap the Arrow C++ CSV |
| #' and JSON table readers. See their usage in [read_csv_arrow()] and |
| #' [read_json_arrow()], respectively. |
| #' |
| #' @section Factory: |
| #' |
| #' The `CsvTableReader$create()` and `JsonTableReader$create()` factory methods |
| #' take the following arguments: |
| #' |
| #' - `file` An Arrow [InputStream] |
| #' - `convert_options` (CSV only), `parse_options`, `read_options`: see |
| #' [CsvReadOptions] |
| #' - `...` additional parameters. |
| #' |
| #' @section Methods: |
| #' |
| #' - `$Read()`: returns an Arrow Table. |
| #' |
| #' @include arrow-package.R |
| #' @export |
| CsvTableReader <- R6Class("CsvTableReader", inherit = ArrowObject, |
| public = list( |
| Read = function() shared_ptr(Table, csv___TableReader__Read(self)) |
| ) |
| ) |
| CsvTableReader$create <- function(file, |
| read_options = CsvReadOptions$create(), |
| parse_options = CsvParseOptions$create(), |
| convert_options = CsvConvertOptions$create(), |
| ...) { |
| assert_is(file, "InputStream") |
| shared_ptr( |
| CsvTableReader, |
| csv___TableReader__Make(file, read_options, parse_options, convert_options) |
| ) |
| } |
| |
| #' @title File reader options |
| #' @rdname CsvReadOptions |
| #' @name CsvReadOptions |
| #' @docType class |
| #' @usage NULL |
| #' @format NULL |
| #' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`, |
| #' `JsonReadOptions`, `JsonParseOptions`, and `TimestampParser` are containers for various |
#' file reading options. See their usage in [read_csv_arrow()] and
#' [read_json_arrow()].
| #' |
| #' @section Factory: |
| #' |
| #' The `CsvReadOptions$create()` and `JsonReadOptions$create()` factory methods |
| #' take the following arguments: |
| #' |
| #' - `use_threads` Whether to use the global CPU thread pool |
#' - `block_size` Block size we request from the IO layer; also determines
#' the size of chunks when `use_threads` is `TRUE`. NB: if `use_threads` is
#' `FALSE`, JSON input must end with an empty line.
| #' |
| #' `CsvReadOptions$create()` further accepts these additional arguments: |
| #' |
| #' - `skip_rows` Number of lines to skip before reading data (default 0) |
| #' - `column_names` Character vector to supply column names. If length-0 |
| #' (the default), the first non-skipped row will be parsed to generate column |
| #' names, unless `autogenerate_column_names` is `TRUE`. |
| #' - `autogenerate_column_names` Logical: generate column names instead of |
| #' using the first non-skipped row (the default)? If `TRUE`, column names will |
| #' be "f0", "f1", ..., "fN". |
| #' |
| #' `CsvParseOptions$create()` takes the following arguments: |
| #' |
| #' - `delimiter` Field delimiting character (default `","`) |
| #' - `quoting` Logical: are strings quoted? (default `TRUE`) |
| #' - `quote_char` Quoting character, if `quoting` is `TRUE` |
| #' - `double_quote` Logical: are quotes inside values double-quoted? (default `TRUE`) |
| #' - `escaping` Logical: whether escaping is used (default `FALSE`) |
| #' - `escape_char` Escaping character, if `escaping` is `TRUE` |
| #' - `newlines_in_values` Logical: are values allowed to contain CR (`0x0d`) |
| #' and LF (`0x0a`) characters? (default `FALSE`) |
| #' - `ignore_empty_lines` Logical: should empty lines be ignored (default) or |
| #' generate a row of missing values (if `FALSE`)? |
| #' |
| #' `JsonParseOptions$create()` accepts only the `newlines_in_values` argument. |
| #' |
| #' `CsvConvertOptions$create()` takes the following arguments: |
| #' |
| #' - `check_utf8` Logical: check UTF8 validity of string columns? (default `TRUE`) |
| #' - `null_values` character vector of recognized spellings for null values. |
| #' Analogous to the `na.strings` argument to |
| #' [`read.csv()`][utils::read.csv()] or `na` in `readr::read_csv()`. |
| #' - `strings_can_be_null` Logical: can string / binary columns have |
| #' null values? Similar to the `quoted_na` argument to `readr::read_csv()`. |
| #' (default `FALSE`) |
| #' - `true_values` character vector of recognized spellings for `TRUE` values |
| #' - `false_values` character vector of recognized spellings for `FALSE` values |
| #' - `col_types` A `Schema` or `NULL` to infer types |
| #' - `auto_dict_encode` Logical: Whether to try to automatically |
| #' dictionary-encode string / binary data (think `stringsAsFactors`). Default `FALSE`. |
| #' This setting is ignored for non-inferred columns (those in `col_types`). |
| #' - `auto_dict_max_cardinality` If `auto_dict_encode`, string/binary columns |
| #' are dictionary-encoded up to this number of unique values (default 50), |
| #' after which it switches to regular encoding. |
| #' - `include_columns` If non-empty, indicates the names of columns from the |
#' CSV file that should actually be read and converted (in the vector's order).
| #' - `include_missing_columns` Logical: if `include_columns` is provided, should |
| #' columns named in it but not found in the data be included as a column of |
| #' type `null()`? The default (`FALSE`) means that the reader will instead |
| #' raise an error. |
| #' - `timestamp_parsers` User-defined timestamp parsers. If more than one |
| #' parser is specified, the CSV conversion logic will try parsing values |
| #' starting from the beginning of this vector. Possible values are |
| #' (a) `NULL`, the default, which uses the ISO-8601 parser; |
| #' (b) a character vector of [strptime][base::strptime()] parse strings; or |
| #' (c) a list of [TimestampParser] objects. |
| #' |
| #' `TimestampParser$create()` takes an optional `format` string argument. |
| #' See [`strptime()`][base::strptime()] for example syntax. |
| #' The default is to use an ISO-8601 format parser. |
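#'
#' For example, a sketch combining several of these options (the values are
#' illustrative, not the defaults):
#'
#' ```
#' read_csv_arrow(
#'   "file.csv",
#'   convert_options = CsvConvertOptions$create(
#'     null_values = c("", "NULL"),
#'     auto_dict_encode = TRUE,
#'     timestamp_parsers = list(TimestampParser$create(format = "%d/%m/%Y"))
#'   )
#' )
#' ```
#'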
| #' @section Active bindings: |
| #' |
| #' - `column_names`: from `CsvReadOptions` |
| #' |
| #' @export |
| CsvReadOptions <- R6Class("CsvReadOptions", |
| inherit = ArrowObject, |
| active = list( |
| column_names = function() csv___ReadOptions__column_names(self) |
| ) |
| ) |
| CsvReadOptions$create <- function(use_threads = option_use_threads(), |
| block_size = 1048576L, |
| skip_rows = 0L, |
| column_names = character(0), |
| autogenerate_column_names = FALSE) { |
| |
| shared_ptr(CsvReadOptions, csv___ReadOptions__initialize( |
| list( |
| use_threads = use_threads, |
| block_size = block_size, |
| skip_rows = skip_rows, |
| column_names = column_names, |
| autogenerate_column_names = autogenerate_column_names |
| ) |
| )) |
| } |
| |
readr_to_csv_read_options <- function(skip, col_names) {
| if (isTRUE(col_names)) { |
| # C++ default to parse is 0-length string array |
| col_names <- character(0) |
| } |
| if (identical(col_names, FALSE)) { |
| CsvReadOptions$create(skip_rows = skip, autogenerate_column_names = TRUE) |
| } else { |
| CsvReadOptions$create(skip_rows = skip, column_names = col_names) |
| } |
| } |
| |
| #' @rdname CsvReadOptions |
| #' @usage NULL |
| #' @format NULL |
| #' @docType class |
| #' @export |
| CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject) |
| CsvParseOptions$create <- function(delimiter = ",", |
| quoting = TRUE, |
| quote_char = '"', |
| double_quote = TRUE, |
| escaping = FALSE, |
| escape_char = '\\', |
| newlines_in_values = FALSE, |
| ignore_empty_lines = TRUE) { |
| |
| shared_ptr(CsvParseOptions, csv___ParseOptions__initialize( |
| list( |
| delimiter = delimiter, |
| quoting = quoting, |
| quote_char = quote_char, |
| double_quote = double_quote, |
| escaping = escaping, |
| escape_char = escape_char, |
| newlines_in_values = newlines_in_values, |
| ignore_empty_lines = ignore_empty_lines |
| ) |
| )) |
| } |
| |
| readr_to_csv_parse_options <- function(delim = ",", |
| quote = '"', |
| escape_double = TRUE, |
| escape_backslash = FALSE, |
| skip_empty_rows = TRUE) { |
| # This function translates from the readr argument list to the arrow arg names |
| # TODO: validate inputs |
| CsvParseOptions$create( |
| delimiter = delim, |
| quoting = nzchar(quote), |
| quote_char = quote, |
| double_quote = escape_double, |
| escaping = escape_backslash, |
| escape_char = '\\', |
| newlines_in_values = escape_backslash, |
| ignore_empty_lines = skip_empty_rows |
| ) |
| } |
| |
| #' @rdname CsvReadOptions |
| #' @usage NULL |
| #' @format NULL |
| #' @docType class |
| #' @export |
| TimestampParser <- R6Class("TimestampParser", inherit = ArrowObject, |
| public = list( |
| kind = function() TimestampParser__kind(self), |
| format = function() TimestampParser__format(self) |
| ) |
| ) |
| TimestampParser$create <- function(format = NULL) { |
| if (is.null(format)) { |
| shared_ptr(TimestampParser, TimestampParser__MakeISO8601()) |
| } else { |
| shared_ptr(TimestampParser, TimestampParser__MakeStrptime(format)) |
| } |
| } |
| |
| #' @rdname CsvReadOptions |
| #' @usage NULL |
| #' @format NULL |
| #' @docType class |
| #' @export |
| CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject) |
| CsvConvertOptions$create <- function(check_utf8 = TRUE, |
| null_values = c("", "NA"), |
| true_values = c("T", "true", "TRUE"), |
                                     false_values = c("F", "false", "FALSE"),
| strings_can_be_null = FALSE, |
| col_types = NULL, |
| auto_dict_encode = FALSE, |
| auto_dict_max_cardinality = 50L, |
| include_columns = character(), |
| include_missing_columns = FALSE, |
| timestamp_parsers = NULL) { |
| |
| if (!is.null(col_types) && !inherits(col_types, "Schema")) { |
| abort(c( |
| "Unsupported `col_types` specification.", |
| i = "`col_types` must be NULL, or a <Schema>." |
| )) |
| } |
| |
| shared_ptr(CsvConvertOptions, csv___ConvertOptions__initialize( |
| list( |
| check_utf8 = check_utf8, |
| null_values = null_values, |
| strings_can_be_null = strings_can_be_null, |
| col_types = col_types, |
| true_values = true_values, |
| false_values = false_values, |
| auto_dict_encode = auto_dict_encode, |
| auto_dict_max_cardinality = auto_dict_max_cardinality, |
| include_columns = include_columns, |
| include_missing_columns = include_missing_columns, |
| timestamp_parsers = timestamp_parsers |
| ) |
| )) |
| } |
| |
| readr_to_csv_convert_options <- function(na, |
| quoted_na, |
| col_types = NULL, |
| col_names = NULL, |
| timestamp_parsers = NULL) { |
| include_columns <- character() |
| |
| if (is.character(col_types)) { |
| if (length(col_types) != 1L) { |
| abort("`col_types` is a character vector that is not of size 1") |
| } |
| n <- nchar(col_types) |
| specs <- substring(col_types, seq_len(n), seq_len(n)) |
| if (!is_bare_character(col_names, n)) { |
| abort("Compact specification for `col_types` requires `col_names`") |
| } |
| |
| col_types <- set_names(nm = col_names, map2(specs, col_names, ~{ |
| switch(.x, |
| "c" = utf8(), |
| "i" = int32(), |
| "n" = float64(), |
| "d" = float64(), |
| "l" = bool(), |
| "f" = dictionary(), |
| "D" = date32(), |
| "T" = time32(), |
| "t" = timestamp(), |
| "_" = null(), |
| "-" = null(), |
| "?" = NULL, |
| abort("Unsupported compact specification: '", .x,"' for column '", .y, "'") |
| ) |
| })) |
| # To "guess" types, omit them from col_types |
| col_types <- keep(col_types, ~!is.null(.x)) |
| col_types <- schema(!!!col_types) |
| } |
| |
| if (!is.null(col_types)) { |
| assert_is(col_types, "Schema") |
| # If any columns are null(), drop them |
| # (by specifying the other columns in include_columns) |
| nulls <- map_lgl(col_types$fields, ~.$type$Equals(null())) |
| if (any(nulls)) { |
| include_columns <- setdiff(col_names, names(col_types)[nulls]) |
| } |
| } |
| CsvConvertOptions$create( |
| null_values = na, |
| strings_can_be_null = quoted_na, |
| col_types = col_types, |
| timestamp_parsers = timestamp_parsers, |
| include_columns = include_columns |
| ) |
| } |