blob: 3e390018c825fc812caf47b1bb40505961335b8f [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Write a Feather file (an Arrow IPC file)
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' [write_feather()] can write both the Feather Version 1 (V1),
#' a legacy version available starting in 2016, and the Version 2 (V2),
#' which is the Apache Arrow IPC file format.
#' The default version is V2.
#' V1 files are distinct from Arrow IPC files and lack many feathures,
#' such as the ability to store all Arrow data tyeps, and compression support.
#' [write_ipc_file()] can only write V2 files.
#'
#' @param x `data.frame`, [RecordBatch], or [Table]
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param version integer Feather file version, Version 1 or Version 2. Version 2 is the default.
#' @param chunk_size For V2 files, the number of rows that each chunk of data
#' should have in the file. Use a smaller `chunk_size` when you need faster
#' random row access. Default is 64K. This option is not supported for V1.
#' @param compression Name of compression codec to use, if any. Default is
#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
#' "uncompressed". "zstd" is the other available codec and generally has better
#' compression ratios in exchange for slower read and write performance.
#' "lz4" is shorthand for the "lz4_frame" codec.
#' See [codec_is_available()] for details.
#' `TRUE` and `FALSE` can also be used in place of "default" and "uncompressed".
#' This option is not supported for V1.
#' @param compression_level If `compression` is "zstd", you may
#' specify an integer compression level. If omitted, the compression codec's
#' default compression level is used.
#'
#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
#' the stream will be left open.
#' @export
#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
#' @seealso [Schema] for information about schemas and metadata handling.
#' @examples
#' # We recommend the ".arrow" extension for Arrow IPC files (Feather V2).
#' tf1 <- tempfile(fileext = ".feather")
#' tf2 <- tempfile(fileext = ".arrow")
#' tf3 <- tempfile(fileext = ".arrow")
#' on.exit({
#' unlink(tf1)
#' unlink(tf2)
#' unlink(tf3)
#' })
#' write_feather(mtcars, tf1, version = 1)
#' write_feather(mtcars, tf2)
#' write_ipc_file(mtcars, tf3)
#' @include arrow-object.R
write_feather <- function(x,
sink,
version = 2,
chunk_size = 65536L,
compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"),
compression_level = NULL) {
# Handle and validate options before touching data
version <- as.integer(version)
assert_that(version %in% 1:2)
if (isTRUE(compression)) compression <- "default"
if (isFALSE(compression)) compression <- "uncompressed"
# TODO(ARROW-17221): if (missing(compression)), we could detect_compression(sink) here
compression <- match.arg(compression)
chunk_size <- as.integer(chunk_size)
assert_that(chunk_size > 0)
if (compression == "default") {
if (version == 2 && codec_is_available("lz4")) {
compression <- "lz4"
} else {
compression <- "uncompressed"
}
}
if (is.null(compression_level)) {
# Use -1 as sentinal for "default"
compression_level <- -1L
}
compression_level <- as.integer(compression_level)
# Now make sure that options make sense together
if (version == 1) {
if (chunk_size != 65536L) {
stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
}
if (compression != "uncompressed") {
stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
}
if (compression_level != -1L) {
stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
}
}
if (compression != "zstd" && compression_level != -1L) {
stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
}
# Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug:
version <- version + 1L
# "lz4" is the convenience
if (compression == "lz4") {
compression <- "lz4_frame"
}
compression <- compression_from_name(compression)
x_out <- x
x <- as_writable_table(x)
if (!inherits(sink, "OutputStream")) {
sink <- make_output_stream(sink)
on.exit(sink$close())
}
ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
invisible(x_out)
}
#' @rdname write_feather
#' @export
write_ipc_file <- function(x,
sink,
chunk_size = 65536L,
compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"),
compression_level = NULL) {
mc <- match.call()
mc$version <- 2
mc[[1]] <- get("write_feather", envir = asNamespace("arrow"))
eval.parent(mc)
}
#' Read a Feather file (an Arrow IPC file)
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' [read_feather()] can read both the Feather Version 1 (V1), a legacy version available starting in 2016,
#' and the Version 2 (V2), which is the Apache Arrow IPC file format.
#' [read_ipc_file()] is an alias of [read_feather()].
#'
#' @inheritParams read_ipc_stream
#' @inheritParams read_delim_arrow
#' @inheritParams make_readable_file
#'
#' @return A `tibble` if `as_data_frame` is `TRUE` (the default), or an
#' Arrow [Table] otherwise
#'
#' @export
#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
#' @examples
#' # We recommend the ".arrow" extension for Arrow IPC files (Feather V2).
#' tf <- tempfile(fileext = ".arrow")
#' on.exit(unlink(tf))
#' write_feather(mtcars, tf)
#' df <- read_feather(tf)
#' dim(df)
#' # Can select columns
#' df <- read_feather(tf, col_select = starts_with("d"))
read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) {
if (!inherits(file, "RandomAccessFile")) {
# Compression is handled inside the IPC file format, so we don't need
# to detect from the file extension and wrap in a CompressedInputStream
# TODO: Why is this the only read_format() functions that allows passing
# mmap to make_readable_file?
file <- make_readable_file(file, mmap)
on.exit(file$close())
}
reader <- FeatherReader$create(file)
col_select <- enquo(col_select)
columns <- if (!quo_is_null(col_select)) {
sim_df <- as.data.frame(reader$schema)
indices <- eval_select(col_select, sim_df)
names(reader)[indices]
}
out <- tryCatch(
reader$Read(columns),
error = read_compressed_error
)
if (isTRUE(as_data_frame)) {
df <- out$to_data_frame()
out <- apply_arrow_r_metadata(df, out$metadata$r)
}
out
}
#' @rdname read_feather
#' @export
read_ipc_file <- read_feather
#' @title FeatherReader class
#' @rdname FeatherReader
#' @name FeatherReader
#' @docType class
#' @usage NULL
#' @format NULL
#' @description This class enables you to interact with Feather files. Create
#' one to connect to a file or other InputStream, and call `Read()` on it to
#' make an `arrow::Table`. See its usage in [`read_feather()`].
#'
#' @section Factory:
#'
#' The `FeatherReader$create()` factory method instantiates the object and
#' takes the following argument:
#'
#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
#'
#' @section Methods:
#'
#' - `$Read(columns)`: Returns a `Table` of the selected columns, a vector of
#' integer indices
#' - `$column_names`: Active binding, returns the column names in the Feather file
#' - `$schema`: Active binding, returns the schema of the Feather file
#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
#' file version
#'
#' @export
#' @include arrow-object.R
FeatherReader <- R6Class("FeatherReader",
inherit = ArrowObject,
public = list(
Read = function(columns) {
ipc___feather___Reader__Read(self, columns)
},
print = function(...) {
cat("FeatherReader:\n")
print(self$schema)
invisible(self)
}
),
active = list(
# versions are officially 2 for V1 and 3 for V2 :shrug:
version = function() ipc___feather___Reader__version(self) - 1L,
column_names = function() names(self$schema),
schema = function() ipc___feather___Reader__schema(self)
)
)
#' @export
names.FeatherReader <- function(x) x$column_names
FeatherReader$create <- function(file) {
assert_is(file, "RandomAccessFile")
ipc___feather___Reader__Open(file)
}