| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| #' Convert an object to a nanoarrow schema |
| #' |
| #' In nanoarrow a 'schema' refers to a `struct ArrowSchema` as defined in the |
| #' Arrow C Data interface. This data structure can be used to represent an |
| #' [arrow::schema()], an [arrow::field()], or an `arrow::DataType`. Note that |
| #' in nanoarrow, an [arrow::schema()] and a non-nullable [arrow::struct()] |
| #' are represented identically. |
| #' |
| #' @param x An object to convert to a schema |
| #' @param recursive Use `TRUE` to include a `children` member when parsing |
| #' schemas. |
| #' @param new_values New schema component to assign |
| #' @param validate Use `FALSE` to skip schema validation |
| #' @param ... Passed to S3 methods |
| #' |
| #' @return An object of class 'nanoarrow_schema' |
| #' @export |
| #' |
| #' @examples |
| #' infer_nanoarrow_schema(integer()) |
| #' infer_nanoarrow_schema(data.frame(x = integer())) |
| #' |
| as_nanoarrow_schema <- function(x, ...) { |
| UseMethod("as_nanoarrow_schema") |
| } |
| |
| #' @export |
| as_nanoarrow_schema.nanoarrow_schema <- function(x, ...) { |
| x |
| } |
| |
| #' @rdname as_nanoarrow_schema |
| #' @export |
| infer_nanoarrow_schema <- function(x, ...) { |
| UseMethod("infer_nanoarrow_schema") |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.default <- function(x, ...) { |
| cls <- paste(class(x), collapse = "/") |
| stop(sprintf("Can't infer Arrow type for object of class %s", cls)) |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.raw <- function(x, ...) { |
| na_uint8() |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.logical <- function(x, ...) { |
| na_bool() |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.integer <- function(x, ...) { |
| na_int32() |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.double <- function(x, ...) { |
| na_double() |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.character <- function(x, ...) { |
| if (length(x) > 0 && sum(nchar(x, type = "bytes"), na.rm = TRUE) > .Machine$integer.max) { |
| na_large_string() |
| } else { |
| na_string() |
| } |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.integer64 <- function(x, ...) { |
| na_int64() |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.factor <- function(x, ...) { |
| na_dictionary( |
| infer_nanoarrow_schema(levels(x)), |
| na_int32(), |
| ordered = is.ordered(x) |
| ) |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.POSIXct <- function(x, ...) { |
| tz <- attr(x, "tzone") |
| if (is.null(tz) || identical(tz, "")) { |
| tz <- Sys.timezone() |
| } |
| |
| na_timestamp(timezone = tz) |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.POSIXlt <- function(x, ...) { |
| infer_nanoarrow_schema(new_data_frame(x, length(x))) |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.Date <- function(x, ...) { |
| na_date32() |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.difftime <- function(x, ...) { |
| # A balance between safety for large time ranges (not overflowing) |
| # and safety for small time ranges (not truncating) |
| na_duration(unit = "us") |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.data.frame <- function(x, ...) { |
| na_struct(lapply(x, infer_nanoarrow_schema), nullable = FALSE) |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.hms <- function(x, ...) { |
| # As a default, ms is safer than s and less likely to truncate |
| na_time32(unit = "ms") |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.blob <- function(x, ...) { |
| if (length(x) > 0 && sum(lengths(x)) > .Machine$integer.max) { |
| na_large_binary() |
| } else { |
| na_binary() |
| } |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.vctrs_unspecified <- function(x, ...) { |
| na_na() |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.vctrs_list_of <- function(x, ...) { |
| child_type <- infer_nanoarrow_schema(attr(x, "ptype")) |
| if (length(x) > 0 && sum(lengths(x)) > .Machine$integer.max) { |
| na_large_list(child_type) |
| } else { |
| na_list(child_type) |
| } |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.AsIs <- function(x, ...) { |
| # NextMethod() goes directly to `default` |
| class(x) <- class(x)[-1] |
| infer_nanoarrow_schema(x) |
| } |
| |
| #' @export |
| infer_nanoarrow_schema.list <- function(x, ...) { |
| # TODO: Move this to C |
| is_null <- vapply(x, is.null, logical(1)) |
| |
| if (all(is_null)) { |
| return(na_list(na_na())) |
| } |
| |
| is_raw <- vapply(x, is.raw, logical(1)) |
| if (!all(is_raw | is_null)) { |
| return(NextMethod()) |
| } |
| |
| if (length(x) > 0 && sum(lengths(x)) > .Machine$integer.max) { |
| na_large_binary() |
| } else { |
| na_binary() |
| } |
| } |
| |
| #' @rdname as_nanoarrow_schema |
| #' @export |
| nanoarrow_schema_parse <- function(x, recursive = FALSE) { |
| parsed <- .Call(nanoarrow_c_schema_parse, as_nanoarrow_schema(x)) |
| parsed_null <- vapply(parsed, is.null, logical(1)) |
| result <- parsed[!parsed_null] |
| |
| if (recursive && length(x$children) > 0) { |
| result$children <- lapply(x$children, nanoarrow_schema_parse, TRUE) |
| } |
| |
| result |
| } |
| |
| #' @rdname as_nanoarrow_schema |
| #' @export |
| nanoarrow_schema_modify <- function(x, new_values, validate = TRUE) { |
| schema <- as_nanoarrow_schema(x) |
| |
| if (length(new_values) == 0) { |
| return(schema) |
| } |
| |
| # Make sure new_values has names to iterate over |
| new_names <- names(new_values) |
| if (is.null(new_names) || all(new_names == "", na.rm = TRUE)) { |
| stop("`new_values` must be named") |
| } |
| |
| # Make a deep copy and modify it. Possibly not as efficient as it could be |
| # but it's unclear to what degree performance is an issue for R-level |
| # schema modification. |
| schema_deep_copy <- nanoarrow_allocate_schema() |
| nanoarrow_pointer_export(schema, schema_deep_copy) |
| |
| for (i in seq_along(new_values)) { |
| nm <- new_names[i] |
| value <- new_values[[i]] |
| |
| switch( |
| nm, |
| format = .Call( |
| nanoarrow_c_schema_set_format, |
| schema_deep_copy, |
| as.character(value) |
| ), |
| name = { |
| if (!is.null(value)) { |
| value <- as.character(value) |
| } |
| |
| .Call(nanoarrow_c_schema_set_name, schema_deep_copy, value) |
| }, |
| flags = .Call( |
| nanoarrow_c_schema_set_flags, |
| schema_deep_copy, |
| as.integer(value) |
| ), |
| metadata = .Call( |
| nanoarrow_c_schema_set_metadata, |
| schema_deep_copy, |
| as.list(value) |
| ), |
| children = { |
| if (!is.null(value)) { |
| value <- lapply(value, as_nanoarrow_schema) |
| } |
| |
| .Call(nanoarrow_c_schema_set_children, schema_deep_copy, value) |
| }, |
| dictionary = { |
| if (!is.null(value)) { |
| value <- as_nanoarrow_schema(value) |
| } |
| |
| .Call(nanoarrow_c_schema_set_dictionary, schema_deep_copy, value) |
| }, |
| stop(sprintf("Can't modify schema[[%s]]: does not exist", deparse(nm))) |
| ) |
| } |
| |
| if (validate) { |
| nanoarrow_schema_parse(schema_deep_copy, recursive = FALSE) |
| } |
| |
| schema_deep_copy |
| } |
| |
| nanoarrow_schema_identical <- function(x, y) { |
| identical(x, y) || |
| identical( |
| nanoarrow_schema_proxy(x, recursive = TRUE), |
| nanoarrow_schema_proxy(y, recursive = TRUE) |
| ) |
| } |
| |
| #' @importFrom utils str |
| #' @export |
| str.nanoarrow_schema <- function(object, ...) { |
| cat(sprintf("%s\n", format(object, .recursive = FALSE))) |
| |
| if (nanoarrow_pointer_is_valid(object)) { |
| # Use the str() of the list version but remove the first |
| # line of the output ("List of 6") |
| info <- nanoarrow_schema_proxy(object) |
| raw_str_output <- utils::capture.output(str(info, ...)) |
| cat(paste0(raw_str_output[-1], collapse = "\n")) |
| cat("\n") |
| } |
| |
| invisible(object) |
| } |
| |
| #' @export |
| print.nanoarrow_schema <- function(x, ...) { |
| str(x, ...) |
| invisible(x) |
| } |
| |
| #' @export |
| format.nanoarrow_schema <- function(x, ..., .recursive = TRUE) { |
| sprintf( |
| "<nanoarrow_schema %s>", |
| nanoarrow_schema_formatted(x, .recursive) |
| ) |
| } |
| |
| # This is the list()-like interface to nanoarrow_schema that allows $ and [[ |
| # to make nice auto-complete for the schema fields |
| |
| #' @export |
| length.nanoarrow_schema <- function(x, ...) { |
| 6L |
| } |
| |
| #' @export |
| names.nanoarrow_schema <- function(x, ...) { |
| c("format", "name", "metadata", "flags", "children", "dictionary") |
| } |
| |
| #' @export |
| `[[.nanoarrow_schema` <- function(x, i, ...) { |
| nanoarrow_schema_proxy(x)[[i]] |
| } |
| |
| #' @export |
| `$.nanoarrow_schema` <- function(x, i, ...) { |
| nanoarrow_schema_proxy(x)[[i]] |
| } |
| |
| #' @export |
| `[[<-.nanoarrow_schema` <- function(x, i, value) { |
| if (is.numeric(i) && isTRUE(i %in% 1:6)) { |
| i <- names.nanoarrow_schema()[[i]] |
| } |
| |
| if (is.character(i) && (length(i) == 1L) && !is.na(i)) { |
| new_values <- list(value) |
| names(new_values) <- i |
| return(nanoarrow_schema_modify(x, new_values)) |
| } |
| |
| stop("`i` must be character(1) or integer(1) %in% 1:6") |
| } |
| |
| #' @export |
| `$<-.nanoarrow_schema` <- function(x, i, value) { |
| new_values <- list(value) |
| names(new_values) <- i |
| nanoarrow_schema_modify(x, new_values) |
| } |
| |
| nanoarrow_schema_formatted <- function(x, recursive = TRUE) { |
| .Call(nanoarrow_c_schema_format, x, as.logical(recursive)[1]) |
| } |
| |
| nanoarrow_schema_proxy <- function(schema, recursive = FALSE) { |
| result <- .Call(nanoarrow_c_schema_to_list, schema) |
| if (recursive && !is.null(schema$children)) { |
| result$children <- lapply( |
| schema$children, |
| nanoarrow_schema_proxy, |
| recursive = TRUE |
| ) |
| } |
| |
| if (recursive && !is.null(schema$dictionary)) { |
| result$dictionary <- nanoarrow_schema_proxy(schema$dictionary, recursive = TRUE) |
| } |
| |
| result$metadata <- list_of_raw_to_metadata(result$metadata) |
| |
| result |
| } |
| |
| list_of_raw_to_metadata <- function(metadata) { |
| lapply(metadata, function(x) { |
| if (is.character(x) || any(x == 0)) { |
| x |
| } else { |
| x_str <- iconv(list(x), from = "UTF-8", to = "UTF-8", mark = TRUE)[[1]] |
| if (is.na(x_str)) x else x_str |
| } |
| }) |
| } |