| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| #' @include arrow-package.R |
| #' @title Schema class |
| #' |
| #' @description A `Schema` is a list of [Field]s, which map names to |
| #' Arrow [data types][data-type]. Create a `Schema` when you |
| #' want to convert an R `data.frame` to Arrow but don't want to rely on the |
| #' default mapping of R types to Arrow types, such as when you want to choose a |
| #' specific numeric precision, or when creating a [Dataset] and you want to |
| #' ensure a specific schema rather than inferring it from the various files. |
| #' |
| #' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method |
| #' (active binding) that lets you access their schema. |
| #' |
| #' @usage NULL |
| #' @format NULL |
| #' @docType class |
| #' @section Methods: |
| #' |
| #' - `$ToString()`: convert to a string |
| #' - `$field(i)`: returns the field at index `i` (0-based) |
| #' - `$GetFieldByName(x)`: returns the field with name `x` |
| #' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value |
| #' `metadata` set. Note that all list elements in `metadata` will be coerced |
| #' to `character`. |
| #' |
| #' @section Active bindings: |
| #' |
| #' - `$names`: returns the field names (called in `names(Schema)`) |
| #' - `$num_fields`: returns the number of fields (called in `length(Schema)`) |
| #' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for |
| #' iterating over |
| #' - `$HasMetadata`: logical: does this `Schema` have extra metadata? |
| #' - `$metadata`: returns the key-value metadata as a named list. |
| #' Modify or replace by assigning in (`sch$metadata <- new_metadata`). |
| #' All list elements are coerced to string. |
| #' |
| #' @section R Metadata: |
| #' |
| #' When converting a data.frame to an Arrow Table or RecordBatch, attributes |
| #' from the `data.frame` are saved alongside tables so that the object can be |
| #' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata |
| #' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or |
| #' at the column (e.g. `attributes(df$col_a)`) or for list columns only: |
| #' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows |
| #' for storing `haven` columns in a table and being able to faithfully |
| #' re-create them when pulled back into R. This metadata is separate from the |
| #' schema (column names and types) which is compatible with other Arrow |
| #' clients. The R metadata is only read by R and is ignored by other clients |
| #' (e.g. Pandas has its own custom metadata). This metadata is stored in |
| #' `$metadata$r`. |
| #' |
| #' Since Schema metadata keys and values must be strings, this metadata is |
| #' saved by serializing R's attribute list structure to a string. If the |
| #' serialized metadata exceeds 100Kb in size, by default it is compressed |
| #' starting in version 3.0.0. To disable this compression (e.g. for tables |
| #' that are compatible with Arrow versions before 3.0.0 and include large |
| #' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`. |
| #' Files with compressed metadata are readable by older versions of arrow, but |
| #' the metadata is dropped. |
| #' |
| #' @rdname Schema |
| #' @name Schema |
| #' @examples |
| #' \donttest{ |
| #' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) |
| #' tab1 <- Table$create(df) |
| #' tab1$schema |
| #' tab2 <- Table$create(df, schema = schema(col1 = int8(), col2 = float32())) |
| #' tab2$schema |
| #' } |
| #' @export |
| Schema <- R6Class("Schema", |
| inherit = ArrowObject, |
| public = list( |
| ToString = function() { |
| fields <- print_schema_fields(self) |
| if (self$HasMetadata) { |
| fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata") |
| } |
| fields |
| }, |
| field = function(i) Schema__field(self, i), |
| GetFieldByName = function(x) Schema__GetFieldByName(self, x), |
| AddField = function(i, field) { |
| assert_is(field, "Field") |
| Schema__AddField(self, i, field) |
| }, |
| SetField = function(i, field) { |
| assert_is(field, "Field") |
| Schema__SetField(self, i, field) |
| }, |
| RemoveField = function(i) Schema__RemoveField(self, i), |
| serialize = function() Schema__serialize(self), |
| WithMetadata = function(metadata = NULL) { |
| metadata <- prepare_key_value_metadata(metadata) |
| Schema__WithMetadata(self, metadata) |
| }, |
| Equals = function(other, check_metadata = FALSE, ...) { |
| inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata)) |
| } |
| ), |
| active = list( |
| names = function() { |
| Schema__field_names(self) |
| }, |
| num_fields = function() Schema__num_fields(self), |
| fields = function() Schema__fields(self), |
| HasMetadata = function() Schema__HasMetadata(self), |
| metadata = function(new_metadata) { |
| if (missing(new_metadata)) { |
| Schema__metadata(self) |
| } else { |
| # Set the metadata |
| out <- self$WithMetadata(new_metadata) |
| # $WithMetadata returns a new object but we're modifying in place, |
| # so swap in that new C++ object pointer into our R6 object |
| self$set_pointer(out$pointer()) |
| self |
| } |
| } |
| ) |
| ) |
| Schema$create <- function(...) schema_(.fields(list2(...))) |
| |
| prepare_key_value_metadata <- function(metadata) { |
| # key-value-metadata must be a named character vector; |
| # this function validates and coerces |
| if (is.null(metadata)) { |
| # NULL to remove metadata, so equivalent to setting an empty list |
| metadata <- empty_named_list() |
| } |
| if (is.null(names(metadata))) { |
| stop( |
| "Key-value metadata must be a named list or character vector", |
| call. = FALSE |
| ) |
| } |
| map_chr(metadata, as.character) |
| } |
| |
| print_schema_fields <- function(s) { |
| # Alternative to Schema__ToString that doesn't print metadata |
| paste(map_chr(s$fields, ~.$ToString()), collapse = "\n") |
| } |
| |
| #' @param ... named list of [data types][data-type] |
| #' @export |
| #' @rdname Schema |
| schema <- Schema$create |
| |
| #' @export |
| names.Schema <- function(x) x$names |
| |
| #' @export |
| length.Schema <- function(x) x$num_fields |
| |
| #' @export |
| `[[.Schema` <- function(x, i, ...) { |
| if (is.character(i)) { |
| x$GetFieldByName(i) |
| } else if (is.numeric(i)) { |
| x$field(i - 1) |
| } else { |
| stop("'i' must be character or numeric, not ", class(i), call. = FALSE) |
| } |
| } |
| |
| #' @export |
| `[[<-.Schema` <- function(x, i, value) { |
| assert_that(length(i) == 1) |
| if (is.character(i)) { |
| field_names <- names(x) |
| if (anyDuplicated(field_names)) { |
| stop("Cannot update field by name with duplicates", call. = FALSE) |
| } |
| |
| # If i is character, it's the field name |
| if (!is.null(value) && !inherits(value, "Field")) { |
| value <- field(i, as_type(value, "value")) |
| } |
| |
| # No match means we're adding to the end |
| i <- match(i, field_names, nomatch = length(field_names) + 1L) |
| } else { |
| assert_that(is.numeric(i), !is.na(i), i > 0) |
| # If i is numeric and we have a type, |
| # we need to grab the existing field name for the new one |
| if (!is.null(value) && !inherits(value, "Field")) { |
| value <- field(names(x)[i], as_type(value, "value")) |
| } |
| } |
| |
| i <- as.integer(i - 1L) |
| if (i >= length(x)) { |
| if (!is.null(value)) { |
| x <- x$AddField(i, value) |
| } |
| } else if (is.null(value)) { |
| x <- x$RemoveField(i) |
| } else { |
| x <- x$SetField(i, value) |
| } |
| x |
| } |
| |
| #' @export |
| `$<-.Schema` <- `$<-.ArrowTabular` |
| |
| #' @export |
| `[.Schema` <- function(x, i, ...) { |
| if (is.logical(i)) { |
| i <- rep_len(i, length(x)) # For R recycling behavior |
| i <- which(i) |
| } |
| if (is.numeric(i)) { |
| if (all(i < 0)) { |
| # in R, negative i means "everything but i" |
| i <- setdiff(seq_len(length(x)), -1 * i) |
| } |
| } |
| fields <- map(i, ~x[[.]]) |
| invalid <- map_lgl(fields, is.null) |
| if (any(invalid)) { |
| stop( |
| "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "), |
| oxford_paste(i[invalid]), |
| call. = FALSE |
| ) |
| } |
| schema_(fields) |
| } |
| |
| #' @export |
| `$.Schema` <- function(x, name, ...) { |
| assert_that(is.string(name)) |
| if (name %in% ls(x)) { |
| get(name, x) |
| } else { |
| x$GetFieldByName(name) |
| } |
| } |
| |
| #' @export |
| as.list.Schema <- function(x, ...) x$fields |
| |
| #' read a Schema from a stream |
| #' |
| #' @param stream a `Message`, `InputStream`, or `Buffer` |
| #' @param ... currently ignored |
| #' @return A [Schema] |
| #' @export |
| read_schema <- function(stream, ...) { |
| if (inherits(stream, "Message")) { |
| return(ipc___ReadSchema_Message(stream)) |
| } else { |
| if (!inherits(stream, "InputStream")) { |
| stream <- BufferReader$create(stream) |
| on.exit(stream$close()) |
| } |
| return(ipc___ReadSchema_InputStream(stream)) |
| } |
| } |
| |
| #' Combine and harmonize schemas |
| #' |
| #' @param ... [Schema]s to unify |
| #' @param schemas Alternatively, a list of schemas |
| #' @return A `Schema` with the union of fields contained in the inputs |
| #' @export |
| #' @examples |
| #' \dontrun{ |
| #' a <- schema(b = double(), c = bool()) |
| #' z <- schema(b = double(), k = utf8()) |
| #' unify_schemas(a, z) |
| #' } |
| unify_schemas <- function(..., schemas = list(...)) { |
| arrow__UnifySchemas(schemas) |
| } |
| |
| #' @export |
| print.arrow_r_metadata <- function(x, ...) { |
| utils::str(x) |
| utils::str(.unserialize_arrow_r_metadata(x)) |
| invisible(x) |
| } |