# r/R/feather.R - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 #' Write data in the Feather format
 #'
 #' Feather provides binary columnar serialization for data frames.
 #' It is designed to make reading and writing data frames efficient,
 #' and to make sharing data across data analysis languages easy.
 #' This function writes both the original, limited specification of the format
 #' and the version 2 specification, which is the Apache Arrow IPC file format.
 #'
 #' @param x `data.frame`, [RecordBatch], or [Table]
 #' @param sink A string file path or [OutputStream]
 #' @param version integer Feather file version. Version 2 is the current.
 #' Version 1 is the more limited legacy format.
 #' @param chunk_size For V2 files, the number of rows that each chunk of data
 #' should have in the file. Use a smaller `chunk_size` when you need faster
 #' random row access. Default is 64K. This option is not supported for V1.
 #' @param compression Name of compression codec to use, if any. Default is
 #' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
 #' "uncompressed". "zstd" is the other available codec and generally has better
 #' compression ratios in exchange for slower read and write performance.
 #' See [codec_is_available()]. This option is not supported for V1.
 #' @param compression_level If `compression` is "zstd", you may
 #' specify an integer compression level. If omitted, the compression codec's
 #' default compression level is used.
 #'
 #' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
 #' the stream will be left open.
 #' @export
 #' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
 #' @examples
 #' \donttest{
 #' tf <- tempfile()
 #' on.exit(unlink(tf))
 #' write_feather(mtcars, tf)
 #' }
 #' @include arrow-package.R
 write_feather <- function(x,
                           sink,
                           version = 2,
                           chunk_size = 65536L,
                           compression = c("default", "lz4", "uncompressed", "zstd"),
                           compression_level = NULL) {
   # Step 1: coerce and check each option on its own; no data is touched yet.
   version <- as.integer(version)
   assert_that(version %in% 1:2)
   compression <- match.arg(compression)
   chunk_size <- as.integer(chunk_size)
   assert_that(chunk_size > 0)

   # "default" resolves to lz4 only for V2 files when the codec is available;
   # in every other case it means no compression.
   if (compression == "default") {
     lz4_ok <- version == 2 && codec_is_available("lz4")
     compression <- if (lz4_ok) "lz4" else "uncompressed"
   }

   # A missing level is encoded as -1, the sentinel for "default"
   compression_level <- as.integer(
     if (is.null(compression_level)) -1L else compression_level
   )

   # Step 2: reject option combinations that cannot work together.
   if (version == 1) {
     # V1 takes none of the tuning options, so each must still be at its default
     if (chunk_size != 65536L) {
       stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
     } else if (compression != "uncompressed") {
       stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
     } else if (compression_level != -1L) {
       stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
     }
   }
   if (compression != "zstd" && compression_level != -1L) {
     stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
   }

   # Step 3: translate the options into what the writer call expects.
   # The file version passed down is off by one: 2 means V1 and 3 means V2.
   file_version <- version + 1L
   # The user-facing name "lz4" maps to the "lz4_frame" codec
   codec_name <- if (compression == "lz4") "lz4_frame" else compression
   codec <- compression_from_name(codec_name)

   # Remember the caller's object so it can be returned as given
   x_out <- x
   if (is.data.frame(x) || inherits(x, "RecordBatch")) {
     x <- Table$create(x)
   }
   assert_is(x, "Table")

   # A string path is opened here and closed on exit; any other sink is
   # used as passed in and is not closed by this function.
   if (is.string(sink)) {
     sink <- FileOutputStream$create(sink)
     on.exit(sink$close())
   }
   assert_is(sink, "OutputStream")

   ipc___WriteFeather__Table(sink, x, file_version, chunk_size, codec, compression_level)
   invisible(x_out)
 }

 #' Read a Feather file
 #'
 #' Feather provides binary columnar serialization for data frames.
 #' It is designed to make reading and writing data frames efficient,
 #' and to make sharing data across data analysis languages easy.
 #' This function reads both the original, limited specification of the format
 #' and the version 2 specification, which is the Apache Arrow IPC file format.
 #'
 #' @inheritParams read_ipc_stream
 #' @inheritParams read_delim_arrow
 #' @param ... additional parameters, passed to [FeatherReader$create()][FeatherReader]
 #'
 #' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
 #' Arrow [Table] otherwise
 #'
 #' @export
 #' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
 #' @examples
 #' \donttest{
 #' tf <- tempfile()
 #' on.exit(unlink(tf))
 #' write_feather(mtcars, tf)
 #' df <- read_feather(tf)
 #' dim(df)
 #' # Can select columns
 #' df <- read_feather(tf, col_select = starts_with("d"))
 #' }
 read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
   # Anything that is not already an InputStream is opened here, and what
   # was opened here is closed again when the function exits.
   if (!inherits(file, "InputStream")) {
     file <- make_readable_file(file)
     on.exit(file$close())
   }
   reader <- FeatherReader$create(file, ...)

   # Resolve the (unevaluated) column selection against the file's columns;
   # with no selection, NULL is handed to Read().
   available <- ipc___feather___Reader__column_names(reader)
   selection <- enquo(col_select)
   columns <- NULL
   if (!quo_is_null(selection)) {
     columns <- vars_select(available, !!selection)
   }

   result <- reader$Read(columns)
   if (isTRUE(as_data_frame)) {
     return(as.data.frame(result))
   }
   result
 }

 #' @title FeatherReader class
 #' @rdname FeatherReader
 #' @name FeatherReader
 #' @docType class
 #' @usage NULL
 #' @format NULL
 #' @description This class enables you to interact with Feather files. Create
 #' one to connect to a file or other InputStream, and call `Read()` on it to
 #' make an `arrow::Table`. See its usage in [`read_feather()`].
 #'
 #' @section Factory:
 #'
 #' The `FeatherReader$create()` factory method instantiates the object and
 #' takes the following arguments:
 #'
 #' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
 #' - `mmap` Logical: whether to memory-map the file (default `TRUE`);
 #'   accepted by the factory but currently not used when opening the reader
 #' - `...` Additional arguments, currently ignored
 #'
 #' @section Methods:
 #'
 #' - `$Read(columns)`: Returns a `Table` of the selected columns, a vector of
 #'   integer indices
 #' - `$version`: Active binding, returns `1` or `2`, according to the Feather
 #'   file version
 #'
 #' @export
 #' @include arrow-package.R
 FeatherReader <- R6Class(
   "FeatherReader",
   inherit = ArrowObject,
   public = list(
     # Read the requested columns and return them through shared_ptr()
     # together with the Table class.
     Read = function(columns) {
       table_ptr <- ipc___feather___Reader__Read(self, columns)
       shared_ptr(Table, table_ptr)
     }
   ),
   active = list(
     # The reader reports 2 for V1 and 3 for V2, so subtract one to get
     # the user-facing Feather version.
     version = function() {
       raw_version <- ipc___feather___Reader__version(self)
       raw_version - 1L
     }
   )
 )

 FeatherReader$create <- function(file, mmap = TRUE, ...) {
   # Only `file` is consulted below; `mmap` and `...` are accepted but unused.
   assert_is(file, "RandomAccessFile")
   reader_ptr <- ipc___feather___Reader__Open(file)
   shared_ptr(FeatherReader, reader_ptr)
 }
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	#' Write data in the Feather format
	#'
	#' Feather provides binary columnar serialization for data frames.
	#' It is designed to make reading and writing data frames efficient,
	#' and to make sharing data across data analysis languages easy.
	#' This function writes both the original, limited specification of the format
	#' and the version 2 specification, which is the Apache Arrow IPC file format.
	#'
	#' @param x `data.frame`, [RecordBatch], or [Table]
	#' @param sink A string file path or [OutputStream]
	#' @param version integer Feather file version. Version 2 is the current.
	#' Version 1 is the more limited legacy format.
	#' @param chunk_size For V2 files, the number of rows that each chunk of data
	#' should have in the file. Use a smaller `chunk_size` when you need faster
	#' random row access. Default is 64K. This option is not supported for V1.
	#' @param compression Name of compression codec to use, if any. Default is
	#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
	#' "uncompressed". "zstd" is the other available codec and generally has better
	#' compression ratios in exchange for slower read and write performance.
	#' See [codec_is_available()]. This option is not supported for V1.
	#' @param compression_level If `compression` is "zstd", you may
	#' specify an integer compression level. If omitted, the compression codec's
	#' default compression level is used.
	#'
	#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
	#' the stream will be left open.
	#' @export
	#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
	#' @examples
	#' \donttest{
	#' tf <- tempfile()
	#' on.exit(unlink(tf))
	#' write_feather(mtcars, tf)
	#' }
	#' @include arrow-package.R
	write_feather <- function(x,
	                          sink,
	                          version = 2,
	                          chunk_size = 65536L,
	                          compression = c("default", "lz4", "uncompressed", "zstd"),
	                          compression_level = NULL) {
	  # Handle and validate options before touching data
	  version <- as.integer(version)
	  assert_that(version %in% 1:2)
	  compression <- match.arg(compression)
	  chunk_size <- as.integer(chunk_size)
	  assert_that(chunk_size > 0)
	  # "default" means lz4 for V2 files when that codec is available,
	  # and no compression otherwise
	  if (compression == "default") {
	    if (version == 2 && codec_is_available("lz4")) {
	      compression <- "lz4"
	    } else {
	      compression <- "uncompressed"
	    }
	  }
	  if (is.null(compression_level)) {
	    # Use -1 as sentinel for "default"
	    compression_level <- -1L
	  }
	  compression_level <- as.integer(compression_level)

	  # Now make sure that options make sense together
	  if (version == 1) {
	    # V1 supports none of these options, so each must be at its default
	    if (chunk_size != 65536L) {
	      stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
	    }
	    if (compression != "uncompressed") {
	      stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
	    }
	    if (compression_level != -1L) {
	      stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
	    }
	  }
	  if (compression != "zstd" && compression_level != -1L) {
	    stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
	  }

	  # Finally, add 1 to version because 2 means V1 and 3 means V2
	  version <- version + 1L

	  # "lz4" is the convenience name for the "lz4_frame" codec
	  if (compression == "lz4") {
	    compression <- "lz4_frame"
	  }
	  compression <- compression_from_name(compression)

	  # Keep the caller's object so it can be returned invisibly as given
	  x_out <- x
	  if (is.data.frame(x) || inherits(x, "RecordBatch")) {
	    x <- Table$create(x)
	  }
	  assert_is(x, "Table")

	  # A string path is opened here and closed on exit; any other sink is
	  # used as passed in and is not closed by this function.
	  if (is.string(sink)) {
	    sink <- FileOutputStream$create(sink)
	    on.exit(sink$close(), add = TRUE)
	  }
	  assert_is(sink, "OutputStream")
	  ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
	  invisible(x_out)
	}

	#' Read a Feather file
	#'
	#' Feather provides binary columnar serialization for data frames.
	#' It is designed to make reading and writing data frames efficient,
	#' and to make sharing data across data analysis languages easy.
	#' This function reads both the original, limited specification of the format
	#' and the version 2 specification, which is the Apache Arrow IPC file format.
	#'
	#' @inheritParams read_ipc_stream
	#' @inheritParams read_delim_arrow
	#' @param ... additional parameters, passed to [FeatherReader$create()][FeatherReader]
	#'
	#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
	#' Arrow [Table] otherwise
	#'
	#' @export
	#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
	#' @examples
	#' \donttest{
	#' tf <- tempfile()
	#' on.exit(unlink(tf))
	#' write_feather(mtcars, tf)
	#' df <- read_feather(tf)
	#' dim(df)
	#' # Can select columns
	#' df <- read_feather(tf, col_select = starts_with("d"))
	#' }
	read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
	  # Non-stream input is turned into a readable file that is closed on exit
	  if (!inherits(file, "InputStream")) {
	    file <- make_readable_file(file)
	    on.exit(file$close())
	  }
	  reader <- FeatherReader$create(file, ...)

	  column_names <- ipc___feather___Reader__column_names(reader)
	  col_quo <- enquo(col_select)
	  # With no selection, NULL is passed on to Read()
	  columns <- NULL
	  if (!quo_is_null(col_quo)) {
	    columns <- vars_select(column_names, !!col_quo)
	  }

	  tbl <- reader$Read(columns)
	  if (isTRUE(as_data_frame)) as.data.frame(tbl) else tbl
	}

	#' @title FeatherReader class
	#' @rdname FeatherReader
	#' @name FeatherReader
	#' @docType class
	#' @usage NULL
	#' @format NULL
	#' @description This class enables you to interact with Feather files. Create
	#' one to connect to a file or other InputStream, and call `Read()` on it to
	#' make an `arrow::Table`. See its usage in [`read_feather()`].
	#'
	#' @section Factory:
	#'
	#' The `FeatherReader$create()` factory method instantiates the object and
	#' takes the following arguments:
	#'
	#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
	#' - `mmap` Logical: whether to memory-map the file (default `TRUE`);
	#' accepted by the factory but currently not used when opening the reader
	#' - `...` Additional arguments, currently ignored
	#'
	#' @section Methods:
	#'
	#' - `$Read(columns)`: Returns a `Table` of the selected columns, a vector of
	#' integer indices
	#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
	#' file version
	#'
	#' @export
	#' @include arrow-package.R
	FeatherReader <- R6Class("FeatherReader",
	  inherit = ArrowObject,
	  public = list(
	    # Reads the given columns; the result goes through shared_ptr() with
	    # the Table class.
	    Read = function(columns) {
	      shared_ptr(
	        Table,
	        ipc___feather___Reader__Read(self, columns)
	      )
	    }
	  ),
	  active = list(
	    # The reader reports 2 for V1 and 3 for V2; shift by one so callers
	    # see the Feather version number.
	    version = function() {
	      ipc___feather___Reader__version(self) - 1L
	    }
	  )
	)

	FeatherReader$create <- function(file, mmap = TRUE, ...) {
	  # `mmap` and `...` are part of the signature but are not used below
	  assert_is(file, "RandomAccessFile")
	  opened <- ipc___feather___Reader__Open(file)
	  shared_ptr(FeatherReader, opened)
	}