blob: 8300e415e2c9e7da8bbb1c1d77c887352484e63e [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Dataset file formats
#'
#' @description
#' A `FileFormat` holds information about how to read and parse the files
#' included in a `Dataset`. There are subclasses corresponding to the supported
#' file formats (`ParquetFileFormat`, `IpcFileFormat`, and `CsvFileFormat`).
#'
#' @section Factory:
#' `FileFormat$create()` takes the following arguments:
#' * `format`: A string identifier of the file format. Currently supported values:
#' * "parquet"
#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
#' only version 2 files are supported
#' * "csv"/"text", aliases for the same thing (because comma is the default
#' delimiter for text files)
#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
#' * `...`: Additional format-specific options
#'
#' `format = "parquet"`:
#' * `use_buffered_stream`: Read files through buffered input streams rather than
#' loading entire row groups at once. This may be enabled
#' to reduce memory overhead. Disabled by default.
#' * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB.
#' * `dict_columns`: Names of columns which should be read as dictionaries.
#'
#' `format = "text"`: see [CsvReadOptions]. Note that you can specify them either
#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
#'
#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`).
#' @rdname FileFormat
#' @name FileFormat
#' @export
FileFormat <- R6Class("FileFormat",
  inherit = ArrowObject,
  public = list(
    # Select the R6 class that matches this object's type name and hand it,
    # together with this object's pointer, to shared_ptr(). Type names other
    # than "parquet", "ipc" and "csv" have no dedicated subclass here, so the
    # object itself is returned for them.
    ..dispatch = function() {
      switch(self$type,
        parquet = shared_ptr(ParquetFileFormat, self$pointer()),
        ipc = shared_ptr(IpcFileFormat, self$pointer()),
        csv = shared_ptr(CsvFileFormat, self$pointer()),
        self
      )
    }
  ),
  active = list(
    # Read-only field: the format's type name, as returned by
    # dataset___FileFormat__type_name().
    type = function() dataset___FileFormat__type_name(self)
  )
)
FileFormat$create <- function(format, ...) {
  # Factory: build the FileFormat subclass selected by the string `format`,
  # forwarding `...` as format-specific options. Stops with an error for an
  # unrecognised `format`.
  dot_names <- names(list(...))
  has_delimiter_opt <- any(c("delim", "delimiter") %in% dot_names)

  # A delimiter option selects the CSV format regardless of `format`.
  if (format %in% c("csv", "text") || has_delimiter_opt) {
    return(CsvFileFormat$create(...))
  }
  # "tsv" is CSV with a tab delimiter.
  if (format == "tsv") {
    return(CsvFileFormat$create(delimiter = "\t", ...))
  }
  if (format == "parquet") {
    return(ParquetFileFormat$create(...))
  }
  # "ipc", "arrow" and "feather" are aliases for the same thing; `...` is not
  # used for this format.
  if (format %in% c("ipc", "arrow", "feather")) {
    return(shared_ptr(IpcFileFormat, dataset___IpcFileFormat__Make()))
  }
  stop("Unsupported file format: ", format, call. = FALSE)
}
#' @export
as.character.FileFormat <- function(x, ...) {
  # String form of a FileFormat: its type name, with the special case that
  # "ipc" is reported as "feather".
  #
  # x:   an object exposing a `type` field (expected to be a single string).
  # ...: ignored; present for compatibility with the as.character() generic.
  # Returns the (possibly remapped) type name.
  type_name <- x$type
  # Plain if/else instead of ifelse(): the comparison is on one value, and
  # ifelse() is intended for elementwise selection over vectors. identical()
  # always yields a single TRUE/FALSE, so the condition is safe for `if`.
  if (identical(type_name, "ipc")) "feather" else type_name
}
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
ParquetFileFormat <- R6Class(
  "ParquetFileFormat",
  inherit = FileFormat
)
# Factory for ParquetFileFormat. The three arguments are passed, in order, to
# dataset___ParquetFileFormat__Make(), and the result is handed to
# shared_ptr() together with the ParquetFileFormat class.
#   use_buffered_stream: defaults to FALSE
#   buffer_size:         defaults to 8196
#   dict_columns:        defaults to an empty character vector
# NOTE(review): 8196 is presumably meant to be 8192 (8 KiB) -- confirm before
# changing, since it is the caller-visible default.
ParquetFileFormat$create <- function(use_buffered_stream = FALSE,
                                     buffer_size = 8196,
                                     dict_columns = character(0)) {
  format_ptr <- dataset___ParquetFileFormat__Make(
    use_buffered_stream,
    buffer_size,
    dict_columns
  )
  shared_ptr(ParquetFileFormat, format_ptr)
}
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
IpcFileFormat <- R6Class(
  # Declares no members of its own; everything comes from FileFormat.
  "IpcFileFormat",
  inherit = FileFormat
)
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
CsvFileFormat <- R6Class(
  "CsvFileFormat",
  inherit = FileFormat
)
# Factory for CsvFileFormat.
#   ...:  parsing options; only used to compute the default value of `opts`.
#   opts: parse options passed to dataset___CsvFileFormat__Make(). By default
#         they are built from `...` by csv_file_format_parse_options().
CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...)) {
  format_ptr <- dataset___CsvFileFormat__Make(opts)
  shared_ptr(CsvFileFormat, format_ptr)
}
csv_file_format_parse_options <- function(...) {
  # Build CSV parse options from `...`, accepting either the readr spelling of
  # the option names or the arrow spelling.
  readr_spellings <- c(
    "delim", "quote", "escape_double", "escape_backslash", "skip_empty_rows"
  )
  supplied <- names(list(...))

  # No readr-style name present: treat the arguments as arrow-style options.
  if (!any(readr_spellings %in% supplied)) {
    return(CsvParseOptions$create(...))
  }
  # At least one readr-style name present: let the readr translator handle
  # the whole argument set.
  readr_to_csv_parse_options(...)
}
#' Format-specific write options
#'
#' @description
#' A `FileWriteOptions` holds write options specific to a `FileFormat`.
FileWriteOptions <- R6Class("FileWriteOptions",
  inherit = ArrowObject,
  public = list(
    # Apply the format-specific write settings given in `...` to this object.
    # Only the "parquet" and "ipc" types are handled; for any other type the
    # arguments are left unused. Returns `self` invisibly.
    update = function(...) {
      format_type <- self$type
      if (format_type == "parquet") {
        # The same `...` is offered to both property builders.
        writer_props <- ParquetWriterProperties$create(...)
        arrow_props <- ParquetArrowWriterProperties$create(...)
        dataset___ParquetFileWriteOptions__update(
          self, writer_props, arrow_props
        )
      } else if (format_type == "ipc") {
        settings <- list(...)
        legacy_format <- get_ipc_use_legacy_format(settings$use_legacy_format)
        metadata_version <- get_ipc_metadata_version(settings$metadata_version)
        # Two update entry points: one without a codec, one with.
        if (is.null(settings$codec)) {
          dataset___IpcFileWriteOptions__update1(
            self, legacy_format, metadata_version
          )
        } else {
          dataset___IpcFileWriteOptions__update2(
            self, legacy_format, settings$codec, metadata_version
          )
        }
      }
      invisible(self)
    }
  ),
  active = list(
    # Read-only field: the type name returned by
    # dataset___FileWriteOptions__type_name().
    type = function() dataset___FileWriteOptions__type_name(self)
  )
)
FileWriteOptions$create <- function(format, ...) {
  # Factory: start from the default write options of `format` and apply `...`
  # to them via update().
  #   format: a FileFormat object, or a value accepted by FileFormat$create().
  #   ...:    format-specific write settings, forwarded to update().
  # The result is whatever update() returns.
  file_format <- if (inherits(format, "FileFormat")) {
    format
  } else {
    FileFormat$create(format)
  }
  defaults_ptr <- dataset___FileFormat__DefaultWriteOptions(file_format)
  write_opts <- shared_ptr(FileWriteOptions, defaults_ptr)
  write_opts$update(...)
}