blob: 5347dc9e697422ebbffce55b457a24b663491cb9 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Convert an object to a nanoarrow array
#'
#' In nanoarrow an 'array' refers to the `struct ArrowArray` definition
#' in the Arrow C data interface. At the R level, we attach a
#' [schema][as_nanoarrow_schema] such that functionally the nanoarrow_array
#' class can be used in a similar way as an `arrow::Array`. Note that in
#' nanoarrow an `arrow::RecordBatch` and a non-nullable `arrow::StructArray`
#' are represented identically.
#'
#' @param x An object to convert to a array
#' @param schema An optional schema used to enforce conversion to a particular
#' type. Defaults to [infer_nanoarrow_schema()].
#' @param ... Passed to S3 methods
#'
#' @return An object of class 'nanoarrow_array'
#' @export
#'
#' @examples
#' (array <- as_nanoarrow_array(1:5))
#' as.vector(array)
#'
#' (array <- as_nanoarrow_array(data.frame(x = 1:5)))
#' as.data.frame(array)
#'
as_nanoarrow_array <- function(x, ..., schema = NULL) {
UseMethod("as_nanoarrow_array")
}
# See as-array.R for S3 method implementations
#' @export
as.vector.nanoarrow_array <- function(x, mode = "any") {
stopifnot(identical(mode, "any"))
convert_array(x, to = infer_nanoarrow_ptype(x))
}
#' @export
as.data.frame.nanoarrow_array <- function(x, ...) {
schema <- infer_nanoarrow_schema(x)
if (schema$format != "+s") {
stop(
sprintf(
"Can't convert array with type %s to data.frame()",
nanoarrow_schema_formatted(schema)
)
)
}
.Call(nanoarrow_c_convert_array, x, NULL)
}
# exported in zzz.R
as_tibble.nanoarrow_array <- function(x, ...) {
tibble::as_tibble(as.data.frame.nanoarrow_array(x), ...)
}
#' @export
infer_nanoarrow_schema.nanoarrow_array <- function(x, ...) {
.Call(nanoarrow_c_infer_schema_array, x) %||%
stop("nanoarrow_array() has no associated schema")
}
#' @importFrom utils str
#' @export
str.nanoarrow_array <- function(object, ...) {
cat(sprintf("%s\n", format(object, .recursive = FALSE)))
if (nanoarrow_pointer_is_valid(object)) {
# Use the str() of the list version but remove the first
# line of the output ("List of 6")
info <- nanoarrow_array_proxy_safe(object)
raw_str_output <- utils::capture.output(str(info, ...))
cat(paste0(raw_str_output[-1], collapse = "\n"))
cat("\n")
}
invisible(object)
}
#' @export
print.nanoarrow_array <- function(x, ...) {
str(x, ...)
invisible(x)
}
#' @export
format.nanoarrow_array <- function(x, ..., .recursive = TRUE) {
if (nanoarrow_pointer_is_valid(x)) {
schema <- .Call(nanoarrow_c_infer_schema_array, x)
if (is.null(schema)) {
sprintf("<nanoarrow_array <unknown schema>[%s]>", x$length)
} else {
sprintf(
"<nanoarrow_array %s[%s]>",
nanoarrow_schema_formatted(schema, .recursive),
x$length
)
}
} else {
"<nanoarrow_array[invalid pointer]>"
}
}
# This is the list()-like interface to nanoarrow_array that allows $ and [[
# to make nice auto-complete for the array fields
#' @export
length.nanoarrow_array <- function(x, ...) {
6L
}
#' @export
names.nanoarrow_array <- function(x, ...) {
c("length", "null_count", "offset", "buffers", "children", "dictionary")
}
#' @export
`[[.nanoarrow_array` <- function(x, i, ...) {
nanoarrow_array_proxy_safe(x)[[i]]
}
#' @export
`$.nanoarrow_array` <- function(x, i, ...) {
nanoarrow_array_proxy_safe(x)[[i]]
}
#' @export
`[[<-.nanoarrow_array` <- function(x, i, value) {
if (is.numeric(i) && isTRUE(i %in% 1:6)) {
i <- names.nanoarrow_array()[[i]]
}
if (is.character(i) && (length(i) == 1L) && !is.na(i)) {
new_values <- list(value)
names(new_values) <- i
return(nanoarrow_array_modify(x, new_values))
}
stop("`i` must be character(1) or integer(1) %in% 1:6")
}
#' @export
`$<-.nanoarrow_array` <- function(x, i, value) {
new_values <- list(value)
names(new_values) <- i
nanoarrow_array_modify(x, new_values)
}
# A version of nanoarrow_array_proxy() that is less likely to error for invalid
# arrays and/or schemas
nanoarrow_array_proxy_safe <- function(array, recursive = FALSE) {
schema <- .Call(nanoarrow_c_infer_schema_array, array)
tryCatch(
nanoarrow_array_proxy(array, schema = schema, recursive = recursive),
error = function(...) nanoarrow_array_proxy(array, recursive = recursive)
)
}
nanoarrow_array_proxy <- function(array, schema = NULL, recursive = FALSE) {
if (!is.null(schema)) {
array_view <- .Call(nanoarrow_c_array_view, array, schema)
result <- .Call(nanoarrow_c_array_proxy, array, array_view, recursive)
names(result$children) <- names(schema$children)
if (!recursive) {
# Pass on some information from the schema if we have it
result$children <- Map(
nanoarrow_array_set_schema,
result$children,
schema$children
)
if (!is.null(result$dictionary)) {
nanoarrow_array_set_schema(result$dictionary, schema$dictionary)
}
}
} else {
result <- .Call(nanoarrow_c_array_proxy, array, NULL, recursive)
}
result
}
#' Modify nanoarrow arrays
#'
#' Create a new array or from an existing array, modify one or more parameters.
#' When importing an array from elsewhere, `nanoarrow_array_set_schema()` is
#' useful to attach the data type information to the array (without this
#' information there is little that nanoarrow can do with the array since its
#' content cannot be otherwise interpreted). `nanoarrow_array_modify()` can
#' create a shallow copy and modify various parameters to create a new array,
#' including setting children and buffers recursively. These functions power the
#' `$<-` operator, which can modify one parameter at a time.
#'
#' @param array A [nanoarrow_array][as_nanoarrow_array].
#' @param schema A [nanoarrow_schema][as_nanoarrow_schema] to attach to this
#' `array`.
#' @param new_values A named `list()` of values to replace.
#' @param validate Use `FALSE` to skip validation. Skipping validation may
#' result in creating an array that will crash R.
#'
#' @return
#' - `nanoarrow_array_init()` returns a possibly invalid but initialized
#' array with a given `schema`.
#' - `nanoarrow_array_set_schema()` returns `array`, invisibly. Note that
#' `array` is modified in place by reference.
#' - `nanoarrow_array_modify()` returns a shallow copy of `array` with the
#' modified parameters such that the original array remains valid.
#' @export
#'
#' @examples
#' nanoarrow_array_init(na_string())
#'
#' # Modify an array using $ and <-
#' array <- as_nanoarrow_array(1:5)
#' array$length <- 4
#' as.vector(array)
#'
#' # Modify potentially more than one component at a time
#' array <- as_nanoarrow_array(1:5)
#' as.vector(nanoarrow_array_modify(array, list(length = 4)))
#'
#' # Attach a schema to an array
#' array <- as_nanoarrow_array(-1L)
#' nanoarrow_array_set_schema(array, na_uint32())
#' as.vector(array)
#'
nanoarrow_array_init <- function(schema) {
.Call(nanoarrow_c_array_init, schema)
}
#' @rdname nanoarrow_array_init
#' @export
nanoarrow_array_set_schema <- function(array, schema, validate = TRUE) {
.Call(nanoarrow_c_array_set_schema, array, schema, as.logical(validate)[1])
invisible(array)
}
#' @rdname nanoarrow_array_init
#' @export
nanoarrow_array_modify <- function(array, new_values, validate = TRUE) {
array <- as_nanoarrow_array(array)
if (length(new_values) == 0) {
return(array)
}
# Make sure new_values has names to iterate over
new_names <- names(new_values)
if (is.null(new_names) || all(new_names == "", na.rm = TRUE)) {
stop("`new_values` must be named")
}
# Make a copy and modify it. This is a deep copy in the sense that all
# children are modifiable; however, it's a shallow copy in the sense that
# none of the buffers are copied.
schema <- .Call(nanoarrow_c_infer_schema_array, array)
array_copy <- array_shallow_copy(array, schema, validate = validate)
for (i in seq_along(new_values)) {
nm <- new_names[i]
value <- new_values[[i]]
switch(
nm,
length = .Call(nanoarrow_c_array_set_length, array_copy, as.double(value)),
null_count = .Call(nanoarrow_c_array_set_null_count, array_copy, as.double(value)),
offset = .Call(nanoarrow_c_array_set_offset, array_copy, as.double(value)),
buffers = {
value <- lapply(value, as_nanoarrow_buffer)
.Call(nanoarrow_c_array_set_buffers, array_copy, value)
},
children = {
value <- lapply(value, as_nanoarrow_array)
value_copy <- lapply(value, array_shallow_copy, validate = validate)
.Call(nanoarrow_c_array_set_children, array_copy, value_copy)
if (!is.null(schema)) {
schema <- nanoarrow_schema_modify(
schema,
list(children = lapply(value, infer_nanoarrow_schema)),
validate = validate
)
}
},
dictionary = {
if (!is.null(value)) {
value <- as_nanoarrow_array(value)
value_copy <- array_shallow_copy(value, validate = validate)
} else {
value_copy <- NULL
}
.Call(nanoarrow_c_array_set_dictionary, array_copy, value_copy)
if (!is.null(schema) && !is.null(value)) {
schema <- nanoarrow_schema_modify(
schema,
list(dictionary = infer_nanoarrow_schema(value)),
validate = validate
)
} else if (!is.null(schema)) {
schema <- nanoarrow_schema_modify(
schema,
list(dictionary = NULL),
validate = validate
)
}
},
stop(sprintf("Can't modify array[[%s]]: does not exist", deparse(nm)))
)
}
if (!is.null(schema) && validate) {
array_copy <- .Call(nanoarrow_c_array_validate_after_modify, array_copy, schema)
}
if (!is.null(schema)) {
nanoarrow_array_set_schema(array_copy, schema, validate = validate)
}
array_copy
}
array_shallow_copy <- function(array, schema = NULL, validate = TRUE) {
array_copy <- nanoarrow_allocate_array()
nanoarrow_pointer_export(array, array_copy)
schema <- schema %||% .Call(nanoarrow_c_infer_schema_array, array)
# For validation, use some of the infrastructure we already have in place
# to make sure array_copy knows how long each buffer is
if (!is.null(schema) && validate) {
copy_buffers_recursive(array, array_copy)
}
array_copy
}
copy_buffers_recursive <- function(array, array_copy) {
proxy <- nanoarrow_array_proxy_safe(array)
proxy_copy <- nanoarrow_array_proxy(array_copy)
.Call(nanoarrow_c_array_set_buffers, array_copy, proxy$buffers)
for (i in seq_along(proxy$children)) {
copy_buffers_recursive(proxy$children[[i]], proxy_copy$children[[i]])
}
if (!is.null(proxy$dictionary)) {
copy_buffers_recursive(proxy$dictionary, proxy_copy$dictionary)
}
}