| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
#' Dataset file formats
#'
#' @description
#' A `FileFormat` holds information about how to read and parse the files
#' included in a `Dataset`. There are subclasses corresponding to the supported
#' file formats (`ParquetFileFormat`, `IpcFileFormat`, and `CsvFileFormat`).
#'
#' @section Factory:
#' `FileFormat$create()` takes the following arguments:
#' * `format`: A string identifier of the file format. Currently supported values:
#'   * "parquet"
#'   * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
#'     only version 2 files are supported
#'   * "csv"/"text", aliases for the same thing (because comma is the default
#'     delimiter for text files)
#'   * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
#' * `...`: Additional format-specific options
#'
#'   `format = "parquet"`:
#'   * `use_buffered_stream`: Read files through buffered input streams rather than
#'                            loading entire row groups at once. This may be enabled
#'                            to reduce memory overhead. Disabled by default.
#'   * `buffer_size`: Size of buffered stream, if enabled. Default is 8196 bytes
#'                    (roughly 8KB).
#'   * `dict_columns`: Names of columns which should be read as dictionaries.
#'
#'   `format = "text"`: see [CsvReadOptions]. Note that you can specify them either
#'   with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#'   `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
#'
#'   Supplying a `delim` or `delimiter` option selects the text format,
#'   whatever value `format` has.
#'
#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
#' @rdname FileFormat
#' @name FileFormat
#' @export
FileFormat <- R6Class("FileFormat", inherit = ArrowObject,
  public = list(
    # Return this object as the R6 subclass that matches its type name.
    # The subclass generator and `self$pointer()` are handed to `shared_ptr()`;
    # a type name with no matching subclass yields `self` unchanged.
    ..dispatch = function() {
      # `switch()` without a default gives NULL when nothing matches
      subclass <- switch(self$type,
        parquet = ParquetFileFormat,
        ipc = IpcFileFormat,
        csv = CsvFileFormat
      )
      if (is.null(subclass)) {
        self
      } else {
        shared_ptr(subclass, self$pointer())
      }
    }
  ),
  active = list(
    # @description
    # Return the `FileFormat`'s type
    type = function() {
      dataset___FileFormat__type_name(self)
    }
  )
)
# Factory: build the `FileFormat` subclass selected by `format`.
#
# `format` is a string identifier ("csv"/"text", "tsv", "parquet", or one of
# "ipc"/"arrow"/"feather"); `...` holds format-specific options. An
# unrecognized `format` raises an error. Options in `...` are forwarded to the
# CSV and Parquet factories; the IPC branch does not use them.
FileFormat$create <- function(format, ...) {
  opt_names <- names(list(...))
  has_delimiter_opt <- any(opt_names %in% c("delim", "delimiter"))

  # A delimiter option selects delimited text, whatever `format` says
  if (format %in% c("csv", "text") || has_delimiter_opt) {
    return(CsvFileFormat$create(...))
  }
  if (format == "tsv") {
    return(CsvFileFormat$create(delimiter = "\t", ...))
  }
  if (format == "parquet") {
    return(ParquetFileFormat$create(...))
  }
  # These are aliases for the same thing
  if (format %in% c("ipc", "arrow", "feather")) {
    return(shared_ptr(IpcFileFormat, dataset___IpcFileFormat__Make()))
  }
  stop("Unsupported file format: ", format, call. = FALSE)
}
| |
#' @export
as.character.FileFormat <- function(x, ...) {
  # Convert a FileFormat to its user-facing name: the value of `x$type`,
  # except that "ipc" is reported as "feather". `...` is unused.
  type_name <- x$type
  # Slight hack: special case IPC -> feather, otherwise is just the type_name.
  # The type is a single string, so a scalar `if` is used rather than the
  # vectorized `ifelse()`, which would return a logical NA for a missing type.
  if (isTRUE(type_name == "ipc")) {
    "feather"
  } else {
    type_name
  }
}
| |
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
ParquetFileFormat <- R6Class(
  classname = "ParquetFileFormat",
  inherit = FileFormat
)

# Factory: pass the three reader options to
# `dataset___ParquetFileFormat__Make()` and wrap the result with
# `shared_ptr()` as a `ParquetFileFormat`.
# NOTE(review): the default `buffer_size` is 8196, not 8192 (8 KiB) -- confirm
# this is intended.
ParquetFileFormat$create <- function(use_buffered_stream = FALSE,
                                     buffer_size = 8196,
                                     dict_columns = character(0)) {
  format_ptr <- dataset___ParquetFileFormat__Make(
    use_buffered_stream,
    buffer_size,
    dict_columns
  )
  shared_ptr(ParquetFileFormat, format_ptr)
}
| |
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
IpcFileFormat <- R6Class(
  classname = "IpcFileFormat",
  inherit = FileFormat
)
| |
#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' @export
CsvFileFormat <- R6Class(
  classname = "CsvFileFormat",
  inherit = FileFormat
)

# Factory: pass `opts` to `dataset___CsvFileFormat__Make()` and wrap the result
# with `shared_ptr()` as a `CsvFileFormat`. By default `opts` is built from
# `...` via `csv_file_format_parse_options()`; when `opts` is supplied
# explicitly, `...` is not used here.
CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...)) {
  format_ptr <- dataset___CsvFileFormat__Make(opts)
  shared_ptr(CsvFileFormat, format_ptr)
}
| |
# Build CSV parse options from `...`, accepting either option spelling.
#
# If any argument name in `...` is one of the readr-style names listed below,
# all of `...` is forwarded to `readr_to_csv_parse_options()`; otherwise it is
# forwarded to `CsvParseOptions$create()`.
csv_file_format_parse_options <- function(...) {
  # Support both the readr spelling of options and the arrow spelling
  readr_opts <- c(
    "delim", "quote", "escape_double", "escape_backslash", "skip_empty_rows"
  )
  supplied <- names(list(...))
  uses_readr_names <- length(intersect(readr_opts, supplied)) > 0

  if (!uses_readr_names) {
    return(CsvParseOptions$create(...))
  }
  readr_to_csv_parse_options(...)
}
| |
#' Format-specific write options
#'
#' @description
#' A `FileWriteOptions` holds write options specific to a `FileFormat`.
FileWriteOptions <- R6Class("FileWriteOptions", inherit = ArrowObject,
  public = list(
    # Pass the options in `...` to the update routine for this object's type
    # and return `self` invisibly. Only the "parquet" and "ipc" types are
    # handled; for any other type `...` is ignored.
    update = function(...) {
      format_type <- self$type
      if (format_type == "parquet") {
        # Both property objects are built from the same `...`
        writer_props <- ParquetWriterProperties$create(...)
        arrow_writer_props <- ParquetArrowWriterProperties$create(...)
        dataset___ParquetFileWriteOptions__update(
          self,
          writer_props,
          arrow_writer_props
        )
      } else if (format_type == "ipc") {
        args <- list(...)
        use_legacy_format <- get_ipc_use_legacy_format(args$use_legacy_format)
        metadata_version <- get_ipc_metadata_version(args$metadata_version)
        # The codec is only passed along when one was supplied
        if (is.null(args$codec)) {
          dataset___IpcFileWriteOptions__update1(
            self,
            use_legacy_format,
            metadata_version
          )
        } else {
          dataset___IpcFileWriteOptions__update2(
            self,
            use_legacy_format,
            args$codec,
            metadata_version
          )
        }
      }
      invisible(self)
    }
  ),
  active = list(
    # The type name reported by dataset___FileWriteOptions__type_name()
    type = function() {
      dataset___FileWriteOptions__type_name(self)
    }
  )
)
# Factory: obtain the default write options for `format` and then apply the
# options given in `...` through `$update()`.
#
# `format` is either a `FileFormat` object or a value accepted by
# `FileFormat$create()`. Returns the resulting `FileWriteOptions` object.
FileWriteOptions$create <- function(format, ...) {
  if (!inherits(format, "FileFormat")) {
    format <- FileFormat$create(format)
  }
  write_options <- shared_ptr(
    FileWriteOptions,
    dataset___FileFormat__DefaultWriteOptions(format)
  )
  write_options$update(...)
  # `update()` returns `invisible(self)`; return the object explicitly so the
  # factory's result is visible, like the other `$create()` factories here.
  write_options
}