# blob: 569033ff7f895b296060ba6c7206a540974b2f5a [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Dataset file formats
#'
#' @description
#' A `FileFormat` holds information about how to read and parse the files
#' included in a `Dataset`. There are subclasses corresponding to the supported
#' file formats (`ParquetFileFormat`, `IpcFileFormat`, and `CsvFileFormat`).
#'
#' @section Factory:
#' `FileFormat$create()` takes the following arguments:
#' * `format`: A string identifier of the file format. Currently supported values:
#' * "parquet"
#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
#' only version 2 files are supported
#' * "csv"/"text", aliases for the same thing (because comma is the default
#' delimiter for text files)
#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
#' * `...`: Additional format-specific options
#'
#' `format = "parquet"`:
#' * `dict_columns`: Names of columns which should be read as dictionaries.
#' * Any Parquet options from [FragmentScanOptions].
#'
#' `format = "text"`: see [CsvParseOptions]. Note that you can specify them either
#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
#' Not all `readr` options are currently supported; please file an issue if
#' you encounter one that `arrow` should support. Also, the following options are
#' supported. From [CsvReadOptions]:
#' * `skip_rows`
#' * `column_names`
#' * `autogenerate_column_names`
#' From [CsvFragmentScanOptions] (these values can be overridden at scan time):
#' * `convert_options`: a [CsvConvertOptions]
#' * `block_size`
#'
#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
#' @rdname FileFormat
#' @name FileFormat
#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows"
#' ## Semi-colon delimited files
#' # Set up directory for examples
#' tf <- tempfile()
#' dir.create(tf)
#' on.exit(unlink(tf))
#' write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)
#'
#' # Create FileFormat object
#' format <- FileFormat$create(format = "text", delimiter = ";")
#'
#' open_dataset(tf, format = format)
#' @export
FileFormat <- R6Class(
  classname = "FileFormat",
  inherit = ArrowObject,
  active = list(
    # Active binding: the type name of this `FileFormat`, as returned by
    # dataset___FileFormat__type_name().
    type = function() {
      dataset___FileFormat__type_name(self)
    }
  )
)
# Factory: build the FileFormat subclass instance matching `format`.
#
# `format` is a string identifier; `...` carries format-specific options.
# Supplying a `delim` or `delimiter` option selects the CSV format regardless
# of `format`. Stops with an error for an unsupported `format`.
FileFormat$create <- function(format, ...) {
  dot_names <- names(list(...))
  wants_csv <- format %in% c("csv", "text") ||
    any(dot_names %in% c("delim", "delimiter"))

  if (wants_csv) {
    return(CsvFileFormat$create(...))
  }
  if (format == "tsv") {
    # TSV is CSV with a tab delimiter
    return(CsvFileFormat$create(delimiter = "\t", ...))
  }
  if (format == "parquet") {
    return(ParquetFileFormat$create(...))
  }
  if (format %in% c("ipc", "arrow", "feather")) {
    # All three names are aliases; no options from `...` are forwarded
    return(dataset___IpcFileFormat__Make())
  }
  stop("Unsupported file format: ", format, call. = FALSE)
}
#' @export
as.character.FileFormat <- function(x, ...) {
  # Returns the format's type name as a string, except that "ipc" is
  # reported as "feather" (slight hack: every other type passes through).
  # `x` must expose a `type` element; `...` is ignored.
  out <- x$type
  # The type is a single string, so use a scalar `if` rather than the
  # vectorised ifelse()
  if (identical(out, "ipc")) {
    "feather"
  } else {
    out
  }
}
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
ParquetFileFormat <- R6Class(
  classname = "ParquetFileFormat",
  inherit = FileFormat
)
# Factory for ParquetFileFormat.
#
# `...` is forwarded in full to ParquetFragmentScanOptions$create();
# `dict_columns` names the columns that should be read as dictionaries.
ParquetFileFormat$create <- function(...,
                                     dict_columns = character(0)) {
  scan_opts <- ParquetFragmentScanOptions$create(...)
  dataset___ParquetFileFormat__Make(scan_opts, dict_columns)
}
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
IpcFileFormat <- R6Class(
  # Adds no members of its own; everything is inherited from FileFormat
  classname = "IpcFileFormat",
  inherit = FileFormat
)
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
CsvFileFormat <- R6Class(
  classname = "CsvFileFormat",
  inherit = FileFormat
)
# Factory for CsvFileFormat.
#
# By default the parse, convert and read options are each derived from `...`
# by the matching csv_file_format_*_options() helper; any of the three can
# instead be supplied directly through `opts`, `convert_options` or
# `read_options`.
CsvFileFormat$create <- function(...,
                                 opts = csv_file_format_parse_options(...),
                                 convert_options = csv_file_format_convert_options(...),
                                 read_options = csv_file_format_read_options(...)) {
  csv_format <- dataset___CsvFileFormat__Make(opts, convert_options, read_options)
  csv_format
}
# Build the CSV parse options from `...`, whose names may follow either the
# Arrow C++ naming or the readr-style naming (but not a mix of the two).
# Arguments whose names exactly match a formal of CsvConvertOptions$create or
# CsvReadOptions$create are dropped first, since they are not parse options.
# Stops with an error when an option is unsupported here, unrecognized,
# ambiguously abbreviated, or when the two naming styles are mixed.
csv_file_format_parse_options <- function(...) {
  args <- list(...)
  # Discard anything addressed to CsvConvertOptions/CsvReadOptions
  args[names(formals(CsvConvertOptions$create))] <- NULL
  args[names(formals(CsvReadOptions$create))] <- NULL
  given <- names(args)

  readr_names <- names(formals(readr_to_csv_parse_options))

  # Fully spelled-out readr-style options that read_delim_arrow() (and its
  # wrappers) accept but that cannot be translated here yet
  delim_only <- setdiff(names(formals(read_delim_arrow)), readr_names)
  not_yet <- given[given %in% delim_only]
  if (length(not_yet)) {
    stop(
      "The following ",
      ngettext(length(not_yet), "option is ", "options are "),
      "supported in \"read_delim_arrow\" functions ",
      "but not yet supported here: ",
      oxford_paste(not_yet),
      call. = FALSE
    )
  }

  # Names (full or partial) that match neither the Arrow C++ option names nor
  # the readr-style option names
  arrow_names <- names(formals(CsvParseOptions$create))
  hits_arrow <- !is.na(pmatch(given, arrow_names))
  hits_readr <- !is.na(pmatch(given, readr_names))
  unknown <- given[!(hits_arrow | hits_readr)]
  if (length(unknown)) {
    stop(
      "Unrecognized ",
      ngettext(length(unknown), "option", "options"),
      ": ",
      oxford_paste(unknown),
      call. = FALSE
    )
  }

  # Partial names (such as "del") that could mean either the Arrow C++ option
  # ("delimiter") or the readr-style one ("delim")
  unclear <- given[is.na(pmatch(given, c(arrow_names, readr_names)))]
  if (length(unclear)) {
    stop(
      "Ambiguous ",
      ngettext(length(unclear), "option", "options"),
      ": ",
      oxford_paste(unclear),
      ". Use full argument names",
      call. = FALSE
    )
  }

  if (!any(hits_readr)) {
    # Every option carries an Arrow C++ name
    return(do.call(CsvParseOptions$create, args))
  }
  # At least one readr-style name was given, so all of them must be
  if (!all(hits_readr)) {
    stop(
      "Use either Arrow parse options or readr parse options, not both",
      call. = FALSE
    )
  }
  do.call(readr_to_csv_parse_options, args)
}
# Build the CSV convert options from `...`: every argument whose name exactly
# matches a parse option (Arrow C++ or readr-style naming) or a read option is
# removed, and whatever remains is passed to CsvConvertOptions$create().
csv_file_format_convert_options <- function(...) {
  args <- list(...)
  # Factories whose formals belong to CsvParseOptions/CsvReadOptions
  other_factories <- list(
    CsvParseOptions$create,
    readr_to_csv_parse_options,
    CsvReadOptions$create
  )
  for (factory in other_factories) {
    args[names(formals(factory))] <- NULL
  }
  do.call(CsvConvertOptions$create, args)
}
# Build the CSV read options from `...`: every argument whose name exactly
# matches a parse option (Arrow C++ or readr-style naming) or a convert option
# is removed, and whatever remains is passed to CsvReadOptions$create().
csv_file_format_read_options <- function(...) {
  args <- list(...)
  # Factories whose formals belong to CsvParseOptions/CsvConvertOptions
  other_factories <- list(
    CsvParseOptions$create,
    readr_to_csv_parse_options,
    CsvConvertOptions$create
  )
  for (factory in other_factories) {
    args[names(formals(factory))] <- NULL
  }
  do.call(CsvReadOptions$create, args)
}
#' Format-specific scan options
#'
#' @description
#' A `FragmentScanOptions` holds options specific to a `FileFormat` and a scan
#' operation.
#'
#' @section Factory:
#' `FragmentScanOptions$create()` takes the following arguments:
#' * `format`: A string identifier of the file format. Currently supported values:
#' * "parquet"
#' * "csv"/"text"/"tsv", all of which produce CSV scan options.
#' * `...`: Additional format-specific options
#'
#' `format = "parquet"`:
#' * `use_buffered_stream`: Read files through buffered input streams rather than
#' loading entire row groups at once. This may be enabled
#' to reduce memory overhead. Disabled by default.
#' * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB.
#' * `pre_buffer`: Pre-buffer the raw Parquet data. This can improve performance
#' on high-latency filesystems. Disabled by default.
#'
#' `format = "text"`: see [CsvConvertOptions]. Note that options can only be
#' specified with the Arrow C++ library naming. Also, "block_size" from
#' [CsvReadOptions] may be given.
#'
#' It returns the appropriate subclass of `FragmentScanOptions`
#' (e.g. `CsvFragmentScanOptions`).
#' @rdname FragmentScanOptions
#' @name FragmentScanOptions
#' @export
FragmentScanOptions <- R6Class(
  classname = "FragmentScanOptions",
  inherit = ArrowObject,
  active = list(
    # Active binding: the type name of this `FragmentScanOptions`, as returned
    # by dataset___FragmentScanOptions__type_name().
    type = function() {
      dataset___FragmentScanOptions__type_name(self)
    }
  )
)
# Factory: build the FragmentScanOptions subclass instance matching `format`.
#
# `format` is a string identifier: "csv", "text" and "tsv" all yield CSV scan
# options, and "parquet" yields Parquet scan options. `...` is forwarded
# untouched to the chosen subclass factory. Stops with an error for any other
# `format`.
FragmentScanOptions$create <- function(format, ...) {
  if (format %in% c("csv", "text", "tsv")) {
    CsvFragmentScanOptions$create(...)
  } else if (format == "parquet") {
    ParquetFragmentScanOptions$create(...)
  } else {
    stop("Unsupported file format: ", format, call. = FALSE)
  }
}
#' @export
as.character.FragmentScanOptions <- function(x, ...) {
  # The character form of scan options is simply their type name;
  # `...` is ignored.
  type_name <- x$type
  type_name
}
#' @usage NULL
#' @format NULL
#' @rdname FragmentScanOptions
#' @export
CsvFragmentScanOptions <- R6Class(
  classname = "CsvFragmentScanOptions",
  inherit = FragmentScanOptions
)
# Factory for CsvFragmentScanOptions.
#
# By default the convert and read options are each derived from `...` by the
# matching csv_file_format_*_options() helper; either can instead be supplied
# directly through `convert_opts` or `read_opts`.
CsvFragmentScanOptions$create <- function(...,
                                          convert_opts = csv_file_format_convert_options(...),
                                          read_opts = csv_file_format_read_options(...)) {
  csv_scan_opts <- dataset___CsvFragmentScanOptions__Make(convert_opts, read_opts)
  csv_scan_opts
}
#' @usage NULL
#' @format NULL
#' @rdname FragmentScanOptions
#' @export
ParquetFragmentScanOptions <- R6Class(
  classname = "ParquetFragmentScanOptions",
  inherit = FragmentScanOptions
)
# Factory for ParquetFragmentScanOptions; the three arguments are handed to
# dataset___ParquetFragmentScanOptions__Make() in the order declared.
# NOTE(review): the default `buffer_size` is 8196, whereas 8 KiB would be
# 8192 -- confirm whether 8196 is intentional.
ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE,
                                              buffer_size = 8196,
                                              pre_buffer = FALSE) {
  parquet_scan_opts <- dataset___ParquetFragmentScanOptions__Make(
    use_buffered_stream,
    buffer_size,
    pre_buffer
  )
  parquet_scan_opts
}
#' Format-specific write options
#'
#' @description
#' A `FileWriteOptions` holds write options specific to a `FileFormat`.
FileWriteOptions <- R6Class(
  classname = "FileWriteOptions",
  inherit = ArrowObject,
  public = list(
    # Apply format-specific write settings to this object, choosing the
    # updater from `self$type`. `table` is only used for the "parquet" type;
    # `...` carries the format-specific settings. A type other than
    # "parquet", "ipc" or "csv" leaves the object untouched.
    # Returns `self` invisibly.
    update = function(table, ...) {
      switch(self$type,
        parquet = {
          writer_props <- ParquetWriterProperties$create(table, ...)
          arrow_props <- ParquetArrowWriterProperties$create(...)
          dataset___ParquetFileWriteOptions__update(
            self,
            writer_props,
            arrow_props
          )
        },
        ipc = {
          dots <- list(...)
          legacy_format <- get_ipc_use_legacy_format(dots$use_legacy_format)
          metadata_version <- get_ipc_metadata_version(dots$metadata_version)
          # The codec is only passed along when one was actually supplied
          if (is.null(dots$codec)) {
            dataset___IpcFileWriteOptions__update1(
              self,
              legacy_format,
              metadata_version
            )
          } else {
            dataset___IpcFileWriteOptions__update2(
              self,
              legacy_format,
              dots$codec,
              metadata_version
            )
          }
        },
        csv = {
          dataset___CsvFileWriteOptions__update(
            self,
            CsvWriteOptions$create(...)
          )
        }
      )
      invisible(self)
    }
  ),
  active = list(
    # Active binding: the type name of these write options, as returned by
    # dataset___FileWriteOptions__type_name().
    type = function() {
      dataset___FileWriteOptions__type_name(self)
    }
  )
)
# Factory: start from the default write options of `format` and apply the
# settings given in `...` through the `update()` method.
#
# `format` may be a `FileFormat` object or anything FileFormat$create()
# accepts as its `format` argument. Returns whatever `update()` returns.
FileWriteOptions$create <- function(format, ...) {
  file_format <- if (inherits(format, "FileFormat")) {
    format
  } else {
    FileFormat$create(format)
  }
  write_opts <- dataset___FileFormat__DefaultWriteOptions(file_format)
  write_opts$update(...)
}