| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
#' Read a CSV or other delimited file with Arrow
#'
#' These functions use the Arrow C++ CSV reader to read into a `data.frame`.
#' Arrow C++ options have been mapped to argument names that follow those of
#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
#'
#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
#' `read_delim_arrow()` that specify a delimiter.
#'
#' Note that not all `readr` options are currently implemented here. Please file
#' an issue if you encounter one that `arrow` should support.
#'
#' If you need to control Arrow-specific reader parameters that don't have an
#' equivalent in `readr::read_csv()`, you can either provide them in the
#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
#' use [CsvTableReader] directly for lower-level access.
#'
#' @param file A character file name, `raw` vector, or an Arrow input stream.
#'   If a file name, a memory-mapped Arrow [InputStream] will be opened and
#'   closed when finished; compression will be detected from the file extension
#'   and handled automatically. If an input stream is provided, it will be left
#'   open.
#' @param delim Single character used to separate fields within a record.
#' @param quote Single character used to quote strings.
#' @param escape_double Does the file escape quotes by doubling them?
#'   i.e. If this option is `TRUE`, the value `""""` represents
#'   a single quote, `\"`.
#' @param escape_backslash Does the file use backslashes to escape special
#'   characters? This is more general than `escape_double` as backslashes
#'   can be used to escape the delimiter character, the quote character, or
#'   to add special characters like `\\n`.
#' @param col_names If `TRUE`, the first row of the input will be used as the
#'   column names and will not be included in the data frame. If `FALSE`, column
#'   names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
#'   Alternatively, you can specify a character vector of column names.
#' @param col_select A character vector of column names to keep, as in the
#'   "select" argument to `data.table::fread()`, or a
#'   [tidy selection specification][tidyselect::vars_select()]
#'   of columns, as used in `dplyr::select()`.
#' @param na A character vector of strings to interpret as missing values.
#' @param quoted_na Should missing values inside quotes be treated as missing
#'   values (the default) or strings. (Note that this is different from
#'   the Arrow C++ default for the corresponding convert option,
#'   `strings_can_be_null`.)
#' @param skip_empty_rows Should blank rows be ignored altogether? If
#'   `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
#'   filled with missings.
#' @param skip Number of lines to skip before reading data.
#' @param parse_options see [file reader options][CsvReadOptions].
#'   If given, this overrides any
#'   parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
#' @param convert_options see [file reader options][CsvReadOptions].
#'   If given, this is used instead of the conversion options derived from
#'   `na` and `quoted_na`.
#' @param read_options see [file reader options][CsvReadOptions].
#'   If given, this is used instead of the read options derived from
#'   `skip` and `col_names`.
#' @param as_data_frame Should the function return a `data.frame` (default) or
#'   an Arrow [Table]?
#'
#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
#' @export
#' @examples
#' \donttest{
#'   tf <- tempfile()
#'   on.exit(unlink(tf))
#'   write.csv(mtcars, file = tf)
#'   df <- read_csv_arrow(tf)
#'   dim(df)
#'   # Can select columns
#'   df <- read_csv_arrow(tf, col_select = starts_with("d"))
#' }
read_delim_arrow <- function(file,
                             delim = ",",
                             quote = '"',
                             escape_double = TRUE,
                             escape_backslash = FALSE,
                             col_names = TRUE,
                             # col_types = TRUE,
                             col_select = NULL,
                             na = c("", "NA"),
                             quoted_na = TRUE,
                             skip_empty_rows = TRUE,
                             skip = 0L,
                             parse_options = NULL,
                             convert_options = NULL,
                             read_options = NULL,
                             as_data_frame = TRUE) {

  # An explicitly supplied options object always wins; the readr-style
  # arguments are only translated when the corresponding object is NULL.
  if (is.null(parse_options)) {
    parse_options <- readr_to_csv_parse_options(
      delim,
      quote,
      escape_double,
      escape_backslash,
      skip_empty_rows
    )
  }

  if (is.null(read_options)) {
    read_options <- readr_to_csv_read_options(skip, col_names)
  }
  if (is.null(convert_options)) {
    # TODO: col_types (needs wiring in CsvConvertOptions)
    convert_options <- readr_to_csv_convert_options(na, quoted_na)
  }

  if (!inherits(file, "InputStream")) {
    # The stream is opened here, so it is closed here; a stream handed in by
    # the caller is left open. `add = TRUE` keeps any other exit handlers.
    file <- make_readable_file(file)
    on.exit(file$close(), add = TRUE)
  }
  reader <- CsvTableReader$create(
    file,
    read_options = read_options,
    parse_options = parse_options,
    convert_options = convert_options
  )

  # `col_select` is captured unevaluated so tidyselect helpers can be used
  tab <- reader$Read()$select(!!enquo(col_select))

  if (isTRUE(as_data_frame)) {
    tab <- as.data.frame(tab)
  }

  tab
}
| |
| #' @rdname read_delim_arrow |
| #' @export |
read_csv_arrow <- function(file,
                           quote = '"',
                           escape_double = TRUE,
                           escape_backslash = FALSE,
                           col_names = TRUE,
                           # col_types = TRUE,
                           col_select = NULL,
                           na = c("", "NA"),
                           quoted_na = TRUE,
                           skip_empty_rows = TRUE,
                           skip = 0L,
                           parse_options = NULL,
                           convert_options = NULL,
                           read_options = NULL,
                           as_data_frame = TRUE) {
  # Re-issue the caller's own call, still unevaluated, against
  # read_delim_arrow() with the comma delimiter added, and evaluate it in the
  # frame this function was called from.
  caller_env <- parent.frame()
  delim_reader <- get("read_delim_arrow", envir = asNamespace("arrow"))

  forwarded <- match.call()
  forwarded[[1]] <- delim_reader
  forwarded$delim <- ","

  eval(forwarded, caller_env)
}
| |
| #' @rdname read_delim_arrow |
| #' @export |
read_tsv_arrow <- function(file,
                           quote = '"',
                           escape_double = TRUE,
                           escape_backslash = FALSE,
                           col_names = TRUE,
                           # col_types = TRUE,
                           col_select = NULL,
                           na = c("", "NA"),
                           quoted_na = TRUE,
                           skip_empty_rows = TRUE,
                           skip = 0L,
                           parse_options = NULL,
                           convert_options = NULL,
                           read_options = NULL,
                           as_data_frame = TRUE) {
  # Re-issue the caller's own call, still unevaluated, against
  # read_delim_arrow() with the tab delimiter added, and evaluate it in the
  # frame this function was called from.
  caller_env <- parent.frame()
  delim_reader <- get("read_delim_arrow", envir = asNamespace("arrow"))

  forwarded <- match.call()
  forwarded[[1]] <- delim_reader
  forwarded$delim <- "\t"

  eval(forwarded, caller_env)
}
| |
| #' @title Arrow CSV and JSON table reader classes |
| #' @rdname CsvTableReader |
| #' @name CsvTableReader |
| #' @docType class |
| #' @usage NULL |
| #' @format NULL |
| #' @description `CsvTableReader` and `JsonTableReader` wrap the Arrow C++ CSV |
| #' and JSON table readers. See their usage in [read_csv_arrow()] and |
| #' [read_json_arrow()], respectively. |
| #' |
| #' @section Factory: |
| #' |
| #' The `CsvTableReader$create()` and `JsonTableReader$create()` factory methods |
| #' take the following arguments: |
| #' |
| #' - `file` A character path to a local file, or an Arrow input stream |
| #' - `convert_options` (CSV only), `parse_options`, `read_options`: see |
| #' [CsvReadOptions] |
| #' - `...` additional parameters. |
| #' |
| #' @section Methods: |
| #' |
| #' - `$Read()`: returns an Arrow Table. |
| #' |
| #' @include arrow-package.R |
| #' @export |
CsvTableReader <- R6Class(
  classname = "CsvTableReader",
  inherit = ArrowObject,
  public = list(
    # Calls csv___TableReader__Read() on this reader and wraps the result
    # as a Table via shared_ptr().
    Read = function() {
      shared_ptr(Table, csv___TableReader__Read(self))
    }
  )
)
# Factory for CsvTableReader. `file` is passed through make_readable_file()
# before being handed to csv___TableReader__Make() together with the three
# option objects, each of which defaults to a freshly created instance.
CsvTableReader$create <- function(file,
                                  read_options = CsvReadOptions$create(),
                                  parse_options = CsvParseOptions$create(),
                                  convert_options = CsvConvertOptions$create(),
                                  ...) {
  # NOTE: `...` is accepted but not used anywhere in this function.
  input <- make_readable_file(file)
  reader_ptr <- csv___TableReader__Make(
    input,
    read_options,
    parse_options,
    convert_options
  )
  shared_ptr(CsvTableReader, reader_ptr)
}
| |
| #' @title File reader options |
| #' @rdname CsvReadOptions |
| #' @name CsvReadOptions |
| #' @docType class |
| #' @usage NULL |
| #' @format NULL |
| #' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`, |
| #' `JsonReadOptions`, and `JsonParseOptions` are containers for various |
| #' file reading options. See their usage in [read_csv_arrow()] and |
| #' [read_json_arrow()], respectively. |
| #' |
| #' @section Factory: |
| #' |
| #' The `CsvReadOptions$create()` and `JsonReadOptions$create()` factory methods |
| #' take the following arguments: |
| #' |
| #' - `use_threads` Whether to use the global CPU thread pool |
| #' - `block_size` Block size we request from the IO layer; also determines |
| #' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input |
| #' must end with an empty line. |
| #' |
| #' `CsvReadOptions$create()` further accepts these additional arguments: |
| #' |
| #' - `skip_rows` Number of lines to skip before reading data (default 0) |
| #' - `column_names` Character vector to supply column names. If length-0 |
| #' (the default), the first non-skipped row will be parsed to generate column |
| #' names, unless `autogenerate_column_names` is `TRUE`. |
| #' - `autogenerate_column_names` Logical: generate column names instead of |
| #' using the first non-skipped row (the default)? If `TRUE`, column names will |
| #' be "f0", "f1", ..., "fN". |
| #' |
| #' `CsvParseOptions$create()` takes the following arguments: |
| #' |
| #' - `delimiter` Field delimiting character (default `","`) |
| #' - `quoting` Logical: are strings quoted? (default `TRUE`) |
| #' - `quote_char` Quoting character, if `quoting` is `TRUE` |
| #' - `double_quote` Logical: are quotes inside values double-quoted? (default `TRUE`) |
| #' - `escaping` Logical: whether escaping is used (default `FALSE`) |
| #' - `escape_char` Escaping character, if `escaping` is `TRUE` |
| #' - `newlines_in_values` Logical: are values allowed to contain CR (`0x0d`) |
| #' and LF (`0x0a`) characters? (default `FALSE`) |
| #' - `ignore_empty_lines` Logical: should empty lines be ignored (default) or |
| #' generate a row of missing values (if `FALSE`)? |
| #' |
| #' `JsonParseOptions$create()` accepts only the `newlines_in_values` argument. |
| #' |
| #' `CsvConvertOptions$create()` takes the following arguments: |
| #' |
| #' - `check_utf8` Logical: check UTF8 validity of string columns? (default `TRUE`) |
| #' - `null_values` character vector of recognized spellings for null values. |
| #' Analogous to the `na.strings` argument to |
| #' [`read.csv()`][utils::read.csv()] or `na` in `readr::read_csv()`. |
| #' - `strings_can_be_null` Logical: can string / binary columns have |
| #' null values? Similar to the `quoted_na` argument to `readr::read_csv()`. |
| #' (default `FALSE`) |
| #' |
| #' @section Methods: |
| #' |
| #' These classes have no implemented methods. They are containers for the |
| #' options. |
| #' |
| #' @export |
CsvReadOptions <- R6Class(
  classname = "CsvReadOptions",
  inherit = ArrowObject
)

# Factory for CsvReadOptions: collects the settings into a named list, passes
# it to csv___ReadOptions__initialize(), and wraps the result as a
# CsvReadOptions via shared_ptr().
CsvReadOptions$create <- function(use_threads = option_use_threads(),
                                  block_size = 1048576L,
                                  skip_rows = 0L,
                                  column_names = character(0),
                                  autogenerate_column_names = FALSE) {
  settings <- list(
    use_threads = use_threads,
    block_size = block_size,
    skip_rows = skip_rows,
    column_names = column_names,
    autogenerate_column_names = autogenerate_column_names
  )
  shared_ptr(CsvReadOptions, csv___ReadOptions__initialize(settings))
}
| |
# Translate readr's `skip` / `col_names` into a CsvReadOptions object.
#
# `col_names` is handled in three ways:
# - FALSE: ask Arrow to autogenerate the column names
# - TRUE: pass a 0-length character vector (the C++ default, meaning
#   "parse the names from the file")
# - anything else: pass the value through as `column_names`
readr_to_csv_read_options <- function(skip, col_names) {
  if (identical(col_names, FALSE)) {
    return(
      CsvReadOptions$create(skip_rows = skip, autogenerate_column_names = TRUE)
    )
  }

  header_names <- if (isTRUE(col_names)) character(0) else col_names
  CsvReadOptions$create(skip_rows = skip, column_names = header_names)
}
| |
| #' @rdname CsvReadOptions |
| #' @usage NULL |
| #' @format NULL |
| #' @docType class |
| #' @export |
CsvParseOptions <- R6Class(
  classname = "CsvParseOptions",
  inherit = ArrowObject
)

# Factory for CsvParseOptions: collects the settings into a named list, passes
# it to csv___ParseOptions__initialize(), and wraps the result as a
# CsvParseOptions via shared_ptr().
CsvParseOptions$create <- function(delimiter = ",",
                                   quoting = TRUE,
                                   quote_char = '"',
                                   double_quote = TRUE,
                                   escaping = FALSE,
                                   escape_char = '\\',
                                   newlines_in_values = FALSE,
                                   ignore_empty_lines = TRUE) {
  settings <- list(
    delimiter = delimiter,
    quoting = quoting,
    quote_char = quote_char,
    double_quote = double_quote,
    escaping = escaping,
    escape_char = escape_char,
    newlines_in_values = newlines_in_values,
    ignore_empty_lines = ignore_empty_lines
  )
  shared_ptr(CsvParseOptions, csv___ParseOptions__initialize(settings))
}
| |
# Translate the readr-style parsing arguments into a CsvParseOptions object.
# TODO: validate inputs
readr_to_csv_parse_options <- function(delim = ",",
                                       quote = '"',
                                       escape_double = TRUE,
                                       escape_backslash = FALSE,
                                       skip_empty_rows = TRUE) {
  # An empty `quote` string switches quoting off altogether
  uses_quoting <- nzchar(quote)
  # `escape_backslash` drives both `escaping` and `newlines_in_values`
  backslash_escapes <- escape_backslash

  CsvParseOptions$create(
    delimiter = delim,
    quoting = uses_quoting,
    quote_char = quote,
    double_quote = escape_double,
    escaping = backslash_escapes,
    escape_char = '\\',
    newlines_in_values = backslash_escapes,
    ignore_empty_lines = skip_empty_rows
  )
}
| |
| #' @rdname CsvReadOptions |
| #' @usage NULL |
| #' @format NULL |
| #' @docType class |
| #' @export |
CsvConvertOptions <- R6Class(
  classname = "CsvConvertOptions",
  inherit = ArrowObject
)

# Factory for CsvConvertOptions: collects the settings into a named list,
# passes it to csv___ConvertOptions__initialize(), and wraps the result as a
# CsvConvertOptions via shared_ptr().
CsvConvertOptions$create <- function(check_utf8 = TRUE,
                                     null_values = c("", "NA"),
                                     strings_can_be_null = FALSE) {
  # TODO: there are more conversion options available:
  # // Optional per-column types (disabling type inference on those columns)
  # std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
  # // Recognized spellings for boolean values
  # std::vector<std::string> true_values;
  # std::vector<std::string> false_values;
  settings <- list(
    check_utf8 = check_utf8,
    null_values = null_values,
    strings_can_be_null = strings_can_be_null
  )
  shared_ptr(CsvConvertOptions, csv___ConvertOptions__initialize(settings))
}
| |
# Translate the readr-style conversion arguments into a CsvConvertOptions
# object: `na` becomes `null_values`, `quoted_na` becomes
# `strings_can_be_null`.
readr_to_csv_convert_options <- function(na, quoted_na) {
  CsvConvertOptions$create(
    null_values = na,
    strings_can_be_null = quoted_na
  )
}