blob: 6de82d175a28dfad4c859deb4abfe8292b563166 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Convert an Array into an R vector
#'
#' Converts `array` to the type specified by `to`. This is a low-level interface;
#' most users should use `as.data.frame()` or `as.vector()` unless finer-grained
#' control is needed over the conversion. This function is an S3 generic
#' dispatching on `to`: developers may implement their own S3 methods for
#' custom vector types.
#'
#' @param array A [nanoarrow_array][as_nanoarrow_array].
#' @param to A target prototype object describing the type to which `array`
#' should be converted, or `NULL` to use the default conversion as
#' returned by [infer_nanoarrow_ptype()]. Alternatively, a function can be
#' passed to perform an alternative calculation of the default ptype as
#' a function of `array` and the default inference of the prototype.
#' @param ... Passed to S3 methods
#'
#' @return An R vector of type `to`.
#' @export
#'
#' @details
#' Conversions are implemented for the following R vector types:
#'
#' - [logical()]: Any numeric type can be converted to [logical()] in addition
#' to the bool type. For numeric types, any non-zero value is considered `TRUE`.
#' - [integer()]: Any numeric type can be converted to [integer()]; however,
#' a warning will be signaled if any value is outside the range of the
#' 32-bit integer.
#' - [double()]: Any numeric type can be converted to [double()]. This
#' conversion currently does not warn for values that may not roundtrip
#' through a floating-point double (e.g., very large uint64 and int64 values).
#' - [character()]: String and large string types can be converted to
#' [character()]. The conversion does not check for valid UTF-8: if you need
#' finer-grained control over encodings, use `to = blob::blob()`.
#' - [factor()]: Dictionary-encoded arrays of strings can be converted to
#' `factor()`; however, this must be specified explicitly (i.e.,
#' `convert_array(array, factor())`) because arrays arriving
#' in chunks can have dictionaries that contain different levels. Use
#' `convert_array(array, factor(levels = c(...)))` to materialize an array
#' into a vector with known levels.
#' - [Date][as.Date()]: Only the date32 type can be converted to an R Date vector.
#' - [hms::hms()]: Time32 and time64 types can be converted to [hms::hms()].
#' - [difftime()]: Time32, time64, and duration types can be converted to
#' R [difftime()] vectors. The value is converted to match the [units()]
#' attribute of `to`.
#' - [blob::blob()]: String, large string, binary, and large binary types can
#' be converted to [blob::blob()].
#' - [vctrs::list_of()]: List, large list, and fixed-size list types can be
#' converted to [vctrs::list_of()].
#' - [data.frame()]: Struct types can be converted to [data.frame()].
#' - [vctrs::unspecified()]: Any type can be converted to [vctrs::unspecified()];
#' however, a warning will be raised if any non-null values are encountered.
#'
#' In addition to the above conversions, a null array may be converted to any
#' target prototype except [data.frame()]. Extension arrays are currently
#' converted as their storage type.
#'
#' @examples
#' array <- as_nanoarrow_array(data.frame(x = 1:5))
#' str(convert_array(array))
#' str(convert_array(array, to = data.frame(x = double())))
#'
convert_array <- function(array, to = NULL, ...) {
# Fail fast unless `array` inherits from "nanoarrow_array".
stopifnot(inherits(array, "nanoarrow_array"))
# S3 dispatch happens on the target prototype `to`, not on `array`, so
# methods are named convert_array.<class of to>.
UseMethod("convert_array", to)
}
# Fallback method used when no more specific method exists for `to`.
#
# Regular calls (`.from_c = FALSE`): if `to` is a function it is called with
# the array and the inferred ptype to obtain the real target, then the array
# is passed to the C converter.
#
# Calls with `.from_c = TRUE` must not go back to the C converter for the same
# array/target pair; only the extension and dictionary fallbacks are tried
# before signalling a "can't convert" error.
#' @export
convert_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
  if (.from_c) {
    # Use the C entry points directly instead of the user-facing wrappers:
    # this path is performance-sensitive.
    array_schema <- .Call(nanoarrow_c_infer_schema_array, array)
    schema_info <- .Call(nanoarrow_c_schema_parse, array_schema)
    ext_name <- schema_info$extension_name

    # Arrays carrying an extension name are converted by the spec that
    # resolve_nanoarrow_extension() returns for that name.
    if (!is.null(ext_name)) {
      ext_spec <- resolve_nanoarrow_extension(ext_name)
      return(convert_array_extension(ext_spec, array, to, ...))
    }

    # With neither an extension nor a dictionary there is nothing left to try.
    dict <- array$dictionary
    if (is.null(dict)) {
      stop_cant_convert_array(array, to)
    }

    # Dictionary decoding is the same for every target type: convert the
    # dictionary values to `to`, convert the dictionary-less array to integer
    # positions, then look the values up (the + 1L shifts the positions to
    # R's 1-based indexing).
    dict_values <- .Call(nanoarrow_c_convert_array, dict, to)
    array$dictionary <- NULL
    positions <- .Call(nanoarrow_c_convert_array, array, integer())
    return(vec_slice2(dict_values, positions + 1L))
  }

  # A function-valued `to` computes the target from the array and the
  # default inferred prototype.
  if (is.function(to)) {
    to <- to(array, infer_nanoarrow_ptype(array))
  }

  .Call(nanoarrow_c_convert_array, array, to)
}
# Entry point for the C code: passing named arguments from C is verbose, so
# this thin wrapper exists instead. Data frame columns are first attempted
# with the internal C conversions (to avoid R evaluation overhead); only when
# those fail is convert_array() called so that S3-defined conversions get a
# chance. `.from_c = TRUE` tells the default method that the internal C
# conversions have already been tried.
convert_fallback_other <- function(array, offset, length, to) {
  # A non-NULL offset means only a slice of the array is wanted. The slice is
  # expressed by modifying offset/length on a shallow copy.
  if (!is.null(offset)) {
    slice_spec <- list(offset = offset, length = length)
    array <- nanoarrow_array_modify(array, slice_spec, validate = FALSE)
  }

  # Convert this single chunk. `.from_c = TRUE` keeps methods from handing the
  # same array straight back to the C conversions. The value returned to C is
  # checked enough to avoid a segfault but not necessarily for correctness
  # (e.g., factors whose levels don't correspond to `to`); C may use it as-is
  # or copy it into a slice of another vector.
  convert_array(array, to, .from_c = TRUE)
}
# Method for double() targets. Arrays whose parsed schema type is
# "decimal128" are converted by way of an arrow Array (after asserting that
# arrow is installed); every other type is delegated to the next method.
#' @export
convert_array.double <- function(array, to, ...) {
  array_schema <- infer_nanoarrow_schema(array)
  is_decimal128 <- nanoarrow_schema_parse(array_schema)$type == "decimal128"

  # Anything that is not decimal128 needs no special handling here.
  if (!is_decimal128) {
    return(NextMethod())
  }

  # The string passed here describes the attempted action to
  # assert_arrow_installed().
  assert_arrow_installed(
    sprintf(
      "convert %s array to object of type double",
      nanoarrow_schema_formatted(array_schema)
    )
  )

  as_arrow_array.nanoarrow_array(array)$as_vector()
}
# Method for vctrs partial-frame targets. The array's inferred prototype must
# be a data frame; it is combined with `to` via vctrs::vec_ptype_common() and
# the resulting prototype is what the C converter receives.
#' @export
convert_array.vctrs_partial_frame <- function(array, to, ...) {
  inferred <- infer_nanoarrow_ptype(array)

  if (is.data.frame(inferred)) {
    combined <- vctrs::vec_ptype_common(inferred, to)
    return(.Call(nanoarrow_c_convert_array, array, combined))
  }

  # The inferred prototype is not a data frame, so a partial frame cannot
  # describe it.
  stop_cant_convert_array(array, to)
}
# Method for factor() targets.
#
# `to` is a factor prototype. A prototype with no levels (i.e., `factor()`)
# is the sentinel for "auto levels": the levels are taken from the data.
# Otherwise the result uses exactly the levels of `to`.
#
# * Dictionary-encoded arrays: the dictionary supplies the data levels and
#   the indices supply the codes. Every dictionary value must be present in
#   levels(to), otherwise an error is raised.
# * Non-dictionary arrays: the array is converted to character() and then
#   to a factor. With auto levels the levels are those chosen by
#   factor(strings), i.e. the sorted unique non-missing values.
#
# All attributes of `to` (e.g., class for ordered factors) are copied onto
# the result. Returns a factor with one element per array element.
#' @export
convert_array.factor <- function(array, to, ...) {
  # Evaluate the sentinel once, before levels(to) is modified below.
  auto_levels <- identical(levels(to), character())

  if (!is.null(array$dictionary)) {
    dict_levels <- convert_array(array$dictionary, character())
    array$dictionary <- NULL
    # With the dictionary removed the array converts to its indices; the
    # + 1L shifts them to R's 1-based factor codes.
    codes <- convert_array(array, integer()) + 1L

    if (auto_levels) {
      levels(to) <- dict_levels
    }

    target_levels <- levels(to)
    if (identical(dict_levels, target_levels)) {
      # The dictionary already is the level set: codes can be used directly.
      fct_data <- codes
    } else if (all(dict_levels %in% target_levels)) {
      # Same values but a different order/superset: remap the codes.
      fct_data <- match(dict_levels, target_levels)[codes]
    } else {
      stop("Error converting to factor: some levels in data do not exist in levels")
    }
  } else {
    strings <- convert_array(array, character())

    if (auto_levels) {
      # Let factor() derive the levels from the data. (Passing the bare
      # symbol `levels` here would pass the base function, not a vector.)
      fct_data <- factor(strings)
      levels(to) <- levels(fct_data)
    } else {
      fct_data <- factor(strings, levels = levels(to))
    }
  }

  # Restore other attributes (e.g., ordered, labels)
  attributes(fct_data) <- attributes(to)
  fct_data
}
# Signal the "can't convert" error for `array` by inferring its schema and
# delegating to stop_cant_convert_schema(). `n` is decremented so that the
# call recorded on the error skips this wrapper's own frame.
stop_cant_convert_array <- function(array, to, n = 0) {
  array_schema <- infer_nanoarrow_schema(array)
  stop_cant_convert_schema(array_schema, to, n = n - 1)
}
# Signal an error stating that an array with the given `schema` cannot be
# converted to an R vector of the class of `to`.
#
# The message names the field (`schema$name`) when it is present and
# non-empty, and otherwise refers to a generic "array". The call attached to
# the condition is sys.call(n - 1): with the default n = 0 that is the call
# of whichever function invoked this helper; more negative values of `n`
# select calls further up the stack.
stop_cant_convert_schema <- function(schema, to, n = 0) {
  type_label <- nanoarrow_schema_formatted(schema)
  target_class <- class(to)[1]
  field_name <- schema$name
  is_unnamed <- is.null(field_name) || identical(field_name, "")

  msg <- if (is_unnamed) {
    sprintf(
      "Can't convert array <%s> to R vector of type %s",
      type_label,
      target_class
    )
  } else {
    sprintf(
      "Can't convert `%s` <%s> to R vector of type %s",
      field_name,
      type_label,
      target_class
    )
  }

  # Resolve the reported call from this function's own frame.
  reported_call <- sys.call(n - 1)
  stop(simpleError(msg, call = reported_call))
}