| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
#' Write data in the Feather format
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' This function writes both the original, limited specification of the format
#' and the version 2 specification, which is the Apache Arrow IPC file format.
#'
#' @param x `data.frame`, [RecordBatch], or [Table]
#' @param sink A string file path or [OutputStream]
#' @param version integer Feather file version. Version 2 is the current
#' version. Version 1 is the more limited legacy format.
#' @param chunk_size For V2 files, the number of rows that each chunk of data
#' should have in the file. Use a smaller `chunk_size` when you need faster
#' random row access. Default is 64K. This option is not supported for V1.
#' @param compression Name of compression codec to use, if any. Default is
#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
#' "uncompressed". "zstd" is the other available codec and generally has better
#' compression ratios in exchange for slower read and write performance.
#' See [codec_is_available()]. This option is not supported for V1, where the
#' default resolves to "uncompressed".
#' @param compression_level If `compression` is "zstd", you may
#' specify an integer compression level. If omitted, the compression codec's
#' default compression level is used.
#'
#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
#' the stream will be left open.
#' @export
#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
#' @examples
#' \donttest{
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_feather(mtcars, tf)
#' }
#' @include arrow-package.R
write_feather <- function(x,
                          sink,
                          version = 2,
                          chunk_size = 65536L,
                          compression = c("default", "lz4", "uncompressed", "zstd"),
                          compression_level = NULL) {
  # Handle and validate options before touching data
  version <- as.integer(version)
  assert_that(version %in% 1:2)
  compression <- match.arg(compression)
  chunk_size <- as.integer(chunk_size)
  assert_that(chunk_size > 0)
  if (compression == "default") {
    # "default" only resolves to lz4 for V2 files, and only when the codec is
    # available; everything else is written uncompressed
    if (version == 2 && codec_is_available("lz4")) {
      compression <- "lz4"
    } else {
      compression <- "uncompressed"
    }
  }
  if (is.null(compression_level)) {
    # Use -1 as sentinel for "default"
    compression_level <- -1L
  }
  compression_level <- as.integer(compression_level)

  # Now make sure that options make sense together
  if (version == 1) {
    # V1 accepts none of the tuning options, so any non-default value is an error
    if (chunk_size != 65536L) {
      stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
    }
    if (compression != "uncompressed") {
      stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
    }
    if (compression_level != -1L) {
      stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
    }
  }
  if (compression != "zstd" && compression_level != -1L) {
    stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
  }
  # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug:
  version <- version + 1L

  # "lz4" is a convenience alias for the "lz4_frame" codec name
  if (compression == "lz4") {
    compression <- "lz4_frame"
  }

  compression <- compression_from_name(compression)

  # Keep the caller's object so it can be returned unchanged
  x_out <- x
  if (is.data.frame(x) || inherits(x, "RecordBatch")) {
    x <- Table$create(x)
  }
  assert_is(x, "Table")

  if (is.string(sink)) {
    # A stream opened here is owned by this function and is closed on exit;
    # a stream passed in by the caller is left open.
    sink <- FileOutputStream$create(sink)
    on.exit(sink$close(), add = TRUE)
  }
  assert_is(sink, "OutputStream")
  ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
  invisible(x_out)
}
| |
#' Read a Feather file
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' This function reads both the original, limited specification of the format
#' and the version 2 specification, which is the Apache Arrow IPC file format.
#'
#' @inheritParams read_ipc_stream
#' @inheritParams read_delim_arrow
#' @param ... additional parameters, passed to [FeatherReader$create()][FeatherReader]
#'
#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
#' Arrow [Table] otherwise
#'
#' @export
#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
#' @examples
#' \donttest{
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_feather(mtcars, tf)
#' df <- read_feather(tf)
#' dim(df)
#' # Can select columns
#' df <- read_feather(tf, col_select = starts_with("d"))
#' }
read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
  # Anything that is not already an InputStream is opened here, so this
  # function owns that handle and closes it on exit; a stream supplied by the
  # caller is left open.
  if (!inherits(file, "InputStream")) {
    file <- make_readable_file(file)
    on.exit(file$close(), add = TRUE)
  }
  reader <- FeatherReader$create(file, ...)

  # Capture `col_select` unevaluated so selection helpers such as
  # starts_with() can be resolved against the file's column names
  col_select <- enquo(col_select)
  columns <- if (quo_is_null(col_select)) {
    # No selection requested: NULL is handed to $Read() unchanged
    NULL
  } else {
    # Only look up the column names when a selection actually needs them
    vars_select(ipc___feather___Reader__column_names(reader), !!col_select)
  }

  out <- reader$Read(columns)

  if (isTRUE(as_data_frame)) {
    out <- as.data.frame(out)
  }
  out
}
| |
#' @title FeatherReader class
#' @rdname FeatherReader
#' @name FeatherReader
#' @docType class
#' @usage NULL
#' @format NULL
#' @description This class enables you to interact with Feather files. Create
#' one to connect to a file or other InputStream, and call `Read()` on it to
#' make an `arrow::Table`. See its usage in [`read_feather()`].
#'
#' @section Factory:
#'
#' The `FeatherReader$create()` factory method instantiates the object and
#' takes the following arguments:
#'
#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
#' - `mmap` Logical, default `TRUE`. Accepted for compatibility but currently
#'    not used by the factory.
#' - `...` Additional arguments, currently ignored
#'
#' @section Methods:
#'
#' - `$Read(columns)`: Returns a `Table` of the selected columns; `columns` is
#'    a vector of integer indices
#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
#'    file version
#'
#' @export
#' @include arrow-package.R
FeatherReader <- R6Class(
  "FeatherReader",
  inherit = ArrowObject,
  public = list(
    # Read the requested columns and wrap the resulting pointer as a Table
    Read = function(columns) {
      table_ptr <- ipc___feather___Reader__Read(self, columns)
      shared_ptr(Table, table_ptr)
    }
  ),
  active = list(
    # versions are officially 2 for V1 and 3 for V2 :shrug:
    # so subtract 1 to report the user-facing Feather version
    version = function() {
      raw_version <- ipc___feather___Reader__version(self)
      raw_version - 1L
    }
  )
)
| |
# Factory: open a FeatherReader on an existing RandomAccessFile.
# `mmap` and `...` are part of the signature but are not referenced in the
# body, so they have no effect here.
FeatherReader$create <- function(file, mmap = TRUE, ...) {
  assert_is(file, "RandomAccessFile")
  reader_ptr <- ipc___feather___Reader__Open(file)
  shared_ptr(FeatherReader, reader_ptr)
}