r/R/schema.R - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 #' @include arrow-package.R
 #' @title Schema class
 #'
 #' @description A `Schema` is a list of [Field]s, which map names to
 #' Arrow [data types][data-type]. Create a `Schema` when you
 #' want to convert an R `data.frame` to Arrow but don't want to rely on the
 #' default mapping of R types to Arrow types, such as when you want to choose a
 #' specific numeric precision, or when creating a [Dataset] and you want to
 #' ensure a specific schema rather than inferring it from the various files.
 #'
 #' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method
 #' (active binding) that lets you access their schema.
 #'
 #' @usage NULL
 #' @format NULL
 #' @docType class
 #' @section Methods:
 #'
 #' - `$ToString()`: convert to a string
 #' - `$field(i)`: returns the field at index `i` (0-based)
 #' - `$GetFieldByName(x)`: returns the field with name `x`
 #' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value
 #'    `metadata` set. Note that all list elements in `metadata` will be coerced
 #'    to `character`.
 #'
 #' @section Active bindings:
 #'
 #' - `$names`: returns the field names (called in `names(Schema)`)
 #' - `$num_fields`: returns the number of fields (called in `length(Schema)`)
 #' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for
 #'   iterating over
 #' - `$HasMetadata`: logical: does this `Schema` have extra metadata?
 #' - `$metadata`: returns the key-value metadata as a named list.
 #'    Modify or replace by assigning in (`sch$metadata <- new_metadata`).
 #'    All list elements are coerced to string.
 #'
 #' @section R Metadata:
 #'
 #'   When converting a data.frame to an Arrow Table or RecordBatch, attributes
 #'   from the `data.frame` are saved alongside tables so that the object can be
 #'   reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
 #'   can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or
 #'   at the column (e.g. `attributes(df$col_a)`) or for list columns only:
 #'   element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
 #'   for storing `haven` columns in a table and being able to faithfully
 #'   re-create them when pulled back into R. This metadata is separate from the
 #'   schema (column names and types) which is compatible with other Arrow
 #'   clients. The R metadata is only read by R and is ignored by other clients
 #'   (e.g. Pandas has its own custom metadata). This metadata is stored in
 #'   `$metadata$r`.
 #'
 #'   Since Schema metadata keys and values must be strings, this metadata is
 #'   saved by serializing R's attribute list structure to a string. If the
 #'   serialized metadata exceeds 100Kb in size, by default it is compressed
 #'   starting in version 3.0.0. To disable this compression (e.g. for tables
 #'   that are compatible with Arrow versions before 3.0.0 and include large
 #'   amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`.
 #'   Files with compressed metadata are readable by older versions of arrow, but
 #'   the metadata is dropped.
 #'
 #' @rdname Schema
 #' @name Schema
 #' @examples
 #' \donttest{
 #' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
 #' tab1 <- Table$create(df)
 #' tab1$schema
 #' tab2 <- Table$create(df, schema = schema(col1 = int8(), col2 = float32()))
 #' tab2$schema
 #' }
 #' @export
 Schema <- R6Class("Schema",
   inherit = ArrowObject,
   public = list(
     ToString = function() {
       fields <- print_schema_fields(self)
       if (self$HasMetadata) {
         fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata")
       }
       fields
     },
     field = function(i) Schema__field(self, i),
     GetFieldByName = function(x) Schema__GetFieldByName(self, x),
     AddField = function(i, field) {
       assert_is(field, "Field")
       Schema__AddField(self, i, field)
     },
     SetField = function(i, field) {
       assert_is(field, "Field")
       Schema__SetField(self, i, field)
     },
     RemoveField = function(i) Schema__RemoveField(self, i),
     serialize = function() Schema__serialize(self),
     WithMetadata = function(metadata = NULL) {
       metadata <- prepare_key_value_metadata(metadata)
       Schema__WithMetadata(self, metadata)
     },
     Equals = function(other, check_metadata = FALSE, ...) {
       inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata))
     }
   ),
   active = list(
     names = function() {
       Schema__field_names(self)
     },
     num_fields = function() Schema__num_fields(self),
     fields = function() Schema__fields(self),
     HasMetadata = function() Schema__HasMetadata(self),
     metadata = function(new_metadata) {
       if (missing(new_metadata)) {
         Schema__metadata(self)
       } else {
         # Set the metadata
         out <- self$WithMetadata(new_metadata)
         # $WithMetadata returns a new object but we're modifying in place,
         # so swap in that new C++ object pointer into our R6 object
         self$set_pointer(out$pointer())
         self
       }
     }
   )
 )
 Schema$create <- function(...) schema_(.fields(list2(...)))

 prepare_key_value_metadata <- function(metadata) {
   # key-value-metadata must be a named character vector;
   # this function validates and coerces
   if (is.null(metadata)) {
     # NULL to remove metadata, so equivalent to setting an empty list
     metadata <- empty_named_list()
   }
   if (is.null(names(metadata))) {
     stop(
       "Key-value metadata must be a named list or character vector",
       call. = FALSE
     )
   }
   map_chr(metadata, as.character)
 }

 print_schema_fields <- function(s) {
   # Alternative to Schema__ToString that doesn't print metadata
   paste(map_chr(s$fields, ~.$ToString()), collapse = "\n")
 }

 #' @param ... named list of [data types][data-type]
 #' @export
 #' @rdname Schema
 schema <- Schema$create

 #' @export
 names.Schema <- function(x) x$names

 #' @export
 length.Schema <- function(x) x$num_fields

 #' @export
 `[[.Schema` <- function(x, i, ...) {
   if (is.character(i)) {
     x$GetFieldByName(i)
   } else if (is.numeric(i)) {
     x$field(i - 1)
   } else {
     stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
   }
 }

 #' @export
 `[[<-.Schema` <- function(x, i, value) {
   assert_that(length(i) == 1)
   if (is.character(i)) {
     field_names <- names(x)
     if (anyDuplicated(field_names)) {
       stop("Cannot update field by name with duplicates", call. = FALSE)
     }

     # If i is character, it's the field name
     if (!is.null(value) && !inherits(value, "Field")) {
       value <- field(i, as_type(value, "value"))
     }

     # No match means we're adding to the end
     i <- match(i, field_names, nomatch = length(field_names) + 1L)
   } else {
     assert_that(is.numeric(i), !is.na(i), i > 0)
     # If i is numeric and we have a type,
     # we need to grab the existing field name for the new one
     if (!is.null(value) && !inherits(value, "Field")) {
       value <- field(names(x)[i], as_type(value, "value"))
     }
   }

   i <- as.integer(i - 1L)
   if (i >= length(x)) {
     if (!is.null(value)) {
       x <- x$AddField(i, value)
     }
   } else if (is.null(value)) {
     x <- x$RemoveField(i)
   } else {
     x <- x$SetField(i, value)
   }
   x
 }

 #' @export
 `$<-.Schema` <- `$<-.ArrowTabular`

 #' @export
 `[.Schema` <- function(x, i, ...) {
   if (is.logical(i)) {
     i <- rep_len(i, length(x)) # For R recycling behavior
     i <- which(i)
   }
   if (is.numeric(i)) {
     if (all(i < 0)) {
       # in R, negative i means "everything but i"
       i <- setdiff(seq_len(length(x)), -1 * i)
     }
   }
   fields <- map(i, ~x[[.]])
   invalid <- map_lgl(fields, is.null)
   if (any(invalid)) {
     stop(
       "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "),
       oxford_paste(i[invalid]),
       call. = FALSE
     )
   }
   schema_(fields)
 }

 #' @export
 `$.Schema` <- function(x, name, ...) {
   assert_that(is.string(name))
   if (name %in% ls(x)) {
     get(name, x)
   } else {
     x$GetFieldByName(name)
   }
 }

 #' @export
 as.list.Schema <- function(x, ...) x$fields

 #' read a Schema from a stream
 #'
 #' @param stream a `Message`, `InputStream`, or `Buffer`
 #' @param ... currently ignored
 #' @return A [Schema]
 #' @export
 read_schema <- function(stream, ...) {
   if (inherits(stream, "Message")) {
     return(ipc___ReadSchema_Message(stream))
   } else {
     if (!inherits(stream, "InputStream")) {
       stream <- BufferReader$create(stream)
       on.exit(stream$close())
     }
     return(ipc___ReadSchema_InputStream(stream))
   }
 }

 #' Combine and harmonize schemas
 #'
 #' @param ... [Schema]s to unify
 #' @param schemas Alternatively, a list of schemas
 #' @return A `Schema` with the union of fields contained in the inputs
 #' @export
 #' @examples
 #' \dontrun{
 #' a <- schema(b = double(), c = bool())
 #' z <- schema(b = double(), k = utf8())
 #' unify_schemas(a, z)
 #' }
 unify_schemas <- function(..., schemas = list(...)) {
   arrow__UnifySchemas(schemas)
 }

 #' @export
 print.arrow_r_metadata <- function(x, ...) {
   utils::str(x)
   utils::str(.unserialize_arrow_r_metadata(x))
   invisible(x)
 }
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	#' @include arrow-package.R
	#' @title Schema class
	#'
	#' @description A `Schema` is a list of [Field]s, which map names to
	#' Arrow [data types][data-type]. Create a `Schema` when you
	#' want to convert an R `data.frame` to Arrow but don't want to rely on the
	#' default mapping of R types to Arrow types, such as when you want to choose a
	#' specific numeric precision, or when creating a [Dataset] and you want to
	#' ensure a specific schema rather than inferring it from the various files.
	#'
	#' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method
	#' (active binding) that lets you access their schema.
	#'
	#' @usage NULL
	#' @format NULL
	#' @docType class
	#' @section Methods:
	#'
	#' - `$ToString()`: convert to a string
	#' - `$field(i)`: returns the field at index `i` (0-based)
	#' - `$GetFieldByName(x)`: returns the field with name `x`
	#' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value
	#' `metadata` set. Note that all list elements in `metadata` will be coerced
	#' to `character`.
	#'
	#' @section Active bindings:
	#'
	#' - `$names`: returns the field names (called in `names(Schema)`)
	#' - `$num_fields`: returns the number of fields (called in `length(Schema)`)
	#' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for
	#' iterating over
	#' - `$HasMetadata`: logical: does this `Schema` have extra metadata?
	#' - `$metadata`: returns the key-value metadata as a named list.
	#' Modify or replace by assigning in (`sch$metadata <- new_metadata`).
	#' All list elements are coerced to string.
	#'
	#' @section R Metadata:
	#'
	#' When converting a data.frame to an Arrow Table or RecordBatch, attributes
	#' from the `data.frame` are saved alongside tables so that the object can be
	#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
	#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or
	#' at the column (e.g. `attributes(df$col_a)`) or for list columns only:
	#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
	#' for storing `haven` columns in a table and being able to faithfully
	#' re-create them when pulled back into R. This metadata is separate from the
	#' schema (column names and types) which is compatible with other Arrow
	#' clients. The R metadata is only read by R and is ignored by other clients
	#' (e.g. Pandas has its own custom metadata). This metadata is stored in
	#' `$metadata$r`.
	#'
	#' Since Schema metadata keys and values must be strings, this metadata is
	#' saved by serializing R's attribute list structure to a string. If the
	#' serialized metadata exceeds 100Kb in size, by default it is compressed
	#' starting in version 3.0.0. To disable this compression (e.g. for tables
	#' that are compatible with Arrow versions before 3.0.0 and include large
	#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`.
	#' Files with compressed metadata are readable by older versions of arrow, but
	#' the metadata is dropped.
	#'
	#' @rdname Schema
	#' @name Schema
	#' @examples
	#' \donttest{
	#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
	#' tab1 <- Table$create(df)
	#' tab1$schema
	#' tab2 <- Table$create(df, schema = schema(col1 = int8(), col2 = float32()))
	#' tab2$schema
	#' }
	#' @export
	Schema <- R6Class("Schema",
	inherit = ArrowObject,
	public = list(
	ToString = function() {
	fields <- print_schema_fields(self)
	if (self$HasMetadata) {
	fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata")
	}
	fields
	},
	field = function(i) Schema__field(self, i),
	GetFieldByName = function(x) Schema__GetFieldByName(self, x),
	AddField = function(i, field) {
	assert_is(field, "Field")
	Schema__AddField(self, i, field)
	},
	SetField = function(i, field) {
	assert_is(field, "Field")
	Schema__SetField(self, i, field)
	},
	RemoveField = function(i) Schema__RemoveField(self, i),
	serialize = function() Schema__serialize(self),
	WithMetadata = function(metadata = NULL) {
	metadata <- prepare_key_value_metadata(metadata)
	Schema__WithMetadata(self, metadata)
	},
	Equals = function(other, check_metadata = FALSE, ...) {
	inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata))
	}
	),
	active = list(
	names = function() {
	Schema__field_names(self)
	},
	num_fields = function() Schema__num_fields(self),
	fields = function() Schema__fields(self),
	HasMetadata = function() Schema__HasMetadata(self),
	metadata = function(new_metadata) {
	if (missing(new_metadata)) {
	Schema__metadata(self)
	} else {
	# Set the metadata
	out <- self$WithMetadata(new_metadata)
	# $WithMetadata returns a new object but we're modifying in place,
	# so swap in that new C++ object pointer into our R6 object
	self$set_pointer(out$pointer())
	self
	}
	}
	)
	)
	Schema$create <- function(...) schema_(.fields(list2(...)))

	prepare_key_value_metadata <- function(metadata) {
	# key-value-metadata must be a named character vector;
	# this function validates and coerces
	if (is.null(metadata)) {
	# NULL to remove metadata, so equivalent to setting an empty list
	metadata <- empty_named_list()
	}
	if (is.null(names(metadata))) {
	stop(
	"Key-value metadata must be a named list or character vector",
	call. = FALSE
	)
	}
	map_chr(metadata, as.character)
	}

	print_schema_fields <- function(s) {
	# Alternative to Schema__ToString that doesn't print metadata
	paste(map_chr(s$fields, ~.$ToString()), collapse = "\n")
	}

	#' @param ... named list of [data types][data-type]
	#' @export
	#' @rdname Schema
	schema <- Schema$create

	#' @export
	names.Schema <- function(x) x$names

	#' @export
	length.Schema <- function(x) x$num_fields

	#' @export
	`[[.Schema` <- function(x, i, ...) {
	if (is.character(i)) {
	x$GetFieldByName(i)
	} else if (is.numeric(i)) {
	x$field(i - 1)
	} else {
	stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
	}
	}

	#' @export
	`[[<-.Schema` <- function(x, i, value) {
	assert_that(length(i) == 1)
	if (is.character(i)) {
	field_names <- names(x)
	if (anyDuplicated(field_names)) {
	stop("Cannot update field by name with duplicates", call. = FALSE)
	}

	# If i is character, it's the field name
	if (!is.null(value) && !inherits(value, "Field")) {
	value <- field(i, as_type(value, "value"))
	}

	# No match means we're adding to the end
	i <- match(i, field_names, nomatch = length(field_names) + 1L)
	} else {
	assert_that(is.numeric(i), !is.na(i), i > 0)
	# If i is numeric and we have a type,
	# we need to grab the existing field name for the new one
	if (!is.null(value) && !inherits(value, "Field")) {
	value <- field(names(x)[i], as_type(value, "value"))
	}
	}

	i <- as.integer(i - 1L)
	if (i >= length(x)) {
	if (!is.null(value)) {
	x <- x$AddField(i, value)
	}
	} else if (is.null(value)) {
	x <- x$RemoveField(i)
	} else {
	x <- x$SetField(i, value)
	}
	x
	}

	#' @export
	`$<-.Schema` <- `$<-.ArrowTabular`

	#' @export
	`[.Schema` <- function(x, i, ...) {
	if (is.logical(i)) {
	i <- rep_len(i, length(x)) # For R recycling behavior
	i <- which(i)
	}
	if (is.numeric(i)) {
	if (all(i < 0)) {
	# in R, negative i means "everything but i"
	i <- setdiff(seq_len(length(x)), -1 * i)
	}
	}
	fields <- map(i, ~x[[.]])
	invalid <- map_lgl(fields, is.null)
	if (any(invalid)) {
	stop(
	"Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "),
	oxford_paste(i[invalid]),
	call. = FALSE
	)
	}
	schema_(fields)
	}

	#' @export
	`$.Schema` <- function(x, name, ...) {
	assert_that(is.string(name))
	if (name %in% ls(x)) {
	get(name, x)
	} else {
	x$GetFieldByName(name)
	}
	}

	#' @export
	as.list.Schema <- function(x, ...) x$fields

	#' read a Schema from a stream
	#'
	#' @param stream a `Message`, `InputStream`, or `Buffer`
	#' @param ... currently ignored
	#' @return A [Schema]
	#' @export
	read_schema <- function(stream, ...) {
	if (inherits(stream, "Message")) {
	return(ipc___ReadSchema_Message(stream))
	} else {
	if (!inherits(stream, "InputStream")) {
	stream <- BufferReader$create(stream)
	on.exit(stream$close())
	}
	return(ipc___ReadSchema_InputStream(stream))
	}
	}

	#' Combine and harmonize schemas
	#'
	#' @param ... [Schema]s to unify
	#' @param schemas Alternatively, a list of schemas
	#' @return A `Schema` with the union of fields contained in the inputs
	#' @export
	#' @examples
	#' \dontrun{
	#' a <- schema(b = double(), c = bool())
	#' z <- schema(b = double(), k = utf8())
	#' unify_schemas(a, z)
	#' }
	unify_schemas <- function(..., schemas = list(...)) {
	arrow__UnifySchemas(schemas)
	}

	#' @export
	print.arrow_r_metadata <- function(x, ...) {
	utils::str(x)
	utils::str(.unserialize_arrow_r_metadata(x))
	invisible(x)
	}