blob: e5b12b80ae61591efd535a66cfaed34fae58e9ad [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' @include arrow-object.R
#' @title ExtensionArray class
#'
#' @usage NULL
#' @format NULL
#' @docType class
#'
#' @section Methods:
#'
#' The `ExtensionArray` class inherits from `Array`, but also provides
#' access to the underlying storage of the extension.
#'
#' - `$storage()`: Returns the underlying [Array] used to store
#' values.
#'
#' The `ExtensionArray` is not intended to be subclassed for extension
#' types.
#'
#' @rdname ExtensionArray
#' @name ExtensionArray
#' @export
ExtensionArray <- R6Class("ExtensionArray",
inherit = Array,
public = list(
storage = function() {
ExtensionArray__storage(self)
},
as_vector = function() {
self$type$as_vector(self)
}
)
)
ExtensionArray$create <- function(x, type) {
assert_is(type, "ExtensionType")
if (inherits(x, "ExtensionArray") && type$Equals(x$type)) {
return(x)
}
storage <- Array$create(x, type = type$storage_type())
type$WrapArray(storage)
}
#' @title ExtensionType class
#'
#' @usage NULL
#' @format NULL
#' @docType class
#'
#' @section Methods:
#'
#' The `ExtensionType` class inherits from `DataType`, but also defines
#' extra methods specific to extension types:
#'
#' - `$storage_type()`: Returns the underlying [DataType] used to store
#' values.
#' - `$storage_id()`: Returns the [Type] identifier corresponding to the
#' `$storage_type()`.
#' - `$extension_name()`: Returns the extension name.
#' - `$extension_metadata()`: Returns the serialized version of the extension
#' metadata as a [raw()] vector.
#' - `$extension_metadata_utf8()`: Returns the serialized version of the
#' extension metadata as a UTF-8 encoded string.
#' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray]
#' with this extension type.
#'
#' In addition, subclasses may override the following methods to customize
#' the behaviour of extension classes.
#'
#' - `$deserialize_instance()`: This method is called when a new [ExtensionType]
#' is initialized and is responsible for parsing and validating
#' the serialized extension_metadata (a [raw()] vector)
#' such that its contents can be inspected by fields and/or methods
#' of the R6 ExtensionType subclass. Implementations must also check the
#' `storage_type` to make sure it is compatible with the extension type.
#' - `$as_vector(extension_array)`: Convert an [Array] or [ChunkedArray] to an R
#' vector. This method is called by [as.vector()] on [ExtensionArray]
#' objects, when a [RecordBatch] containing an [ExtensionArray] is
#' converted to a [data.frame()], or when a [ChunkedArray] (e.g., a column
#' in a [Table]) is converted to an R vector. The default method returns the
#' converted storage array.
#' - `$ToString()` Return a string representation that will be printed
#' to the console when this type or an Array of this type is printed.
#'
#' @rdname ExtensionType
#' @name ExtensionType
#' @export
ExtensionType <- R6Class("ExtensionType",
inherit = DataType,
public = list(
# In addition to the initialization that occurs for all
# ArrowObject instances, we call deserialize_instance(), which can
# be overridden to populate custom fields
initialize = function(xp) {
super$initialize(xp)
self$deserialize_instance()
},
# Because of how C++ shared_ptr<> objects are converted to R objects,
# the initial object that is instantiated will be of this class
# (ExtensionType), but the R6Class object that was registered is
# available from C++. We need this in order to produce the correct
# R6 subclass when a shared_ptr<ExtensionType> is returned to R.
r6_class = function() {
ExtensionType__r6_class(self)
},
storage_type = function() {
ExtensionType__storage_type(self)
},
storage_id = function() {
self$storage_type()$id
},
extension_name = function() {
ExtensionType__extension_name(self)
},
extension_metadata = function() {
ExtensionType__Serialize(self)
},
# To make sure this conversion is done properly
extension_metadata_utf8 = function() {
metadata_utf8 <- rawToChar(self$extension_metadata())
Encoding(metadata_utf8) <- "UTF-8"
metadata_utf8
},
WrapArray = function(array) {
assert_is(array, "Array")
ExtensionType__MakeArray(self, array$data())
},
deserialize_instance = function() {
# Do nothing by default but allow other classes to override this method
# to populate R6 class members.
},
ExtensionEquals = function(other) {
inherits(other, "ExtensionType") &&
identical(other$extension_name(), self$extension_name()) &&
identical(other$extension_metadata(), self$extension_metadata())
},
as_vector = function(extension_array) {
if (inherits(extension_array, "ChunkedArray")) {
# Converting one array at a time so that users don't have to remember
# to implement two methods. Converting all the storage arrays to
# a ChunkedArray and then converting is probably faster
# (VctrsExtensionType does this).
storage_vectors <- lapply(
seq_len(extension_array$num_chunks) - 1L,
function(i) self$as_vector(extension_array$chunk(i))
)
vctrs::vec_c(!!!storage_vectors)
} else if (inherits(extension_array, "ExtensionArray")) {
extension_array$storage()$as_vector()
} else {
abort(
c(
"`extension_array` must be a ChunkedArray or ExtensionArray",
i = sprintf(
"Got object of type %s",
paste(class(extension_array), collapse = " / ")
)
)
)
}
},
ToString = function() {
# metadata is probably valid UTF-8 (e.g., JSON), but might not be
# and it's confusing to error when printing the object. This heuristic
# isn't perfect (but subclasses should override this method anyway)
metadata_raw <- self$extension_metadata()
if (as.raw(0x00) %in% metadata_raw) {
if (length(metadata_raw) > 20) {
sprintf(
"<%s %s...>",
class(self)[1],
paste(format(head(metadata_raw, 20)), collapse = " ")
)
} else {
sprintf(
"<%s %s>",
class(self)[1],
paste(format(metadata_raw), collapse = " ")
)
}
} else {
paste0(class(self)[1], " <", self$extension_metadata_utf8(), ">")
}
}
)
)
# ExtensionType$new() is what gets used by the generated wrapper code to
# create an R6 object when a shared_ptr<DataType> is returned to R and
# that object has type_id() EXTENSION_TYPE. Rather than add complexity
# to the wrapper code, we modify ExtensionType$new() to do what we need
# it to do here (which is to return an instance of a custom R6
# type whose .deserialize_instance method is called to populate custom fields).
ExtensionType$.default_new <- ExtensionType$new
ExtensionType$new <- function(xp) {
super <- ExtensionType$.default_new(xp)
r6_class <- super$r6_class()
if (identical(r6_class$classname, "ExtensionType")) {
super
} else {
r6_class$new(xp)
}
}
ExtensionType$create <- function(storage_type,
extension_name,
extension_metadata = raw(),
type_class = ExtensionType) {
if (is.string(extension_metadata)) {
extension_metadata <- charToRaw(enc2utf8(extension_metadata))
}
assert_that(is.string(extension_name), is.raw(extension_metadata))
assert_is(storage_type, "DataType")
assert_is(type_class, "R6ClassGenerator")
ExtensionType__initialize(
storage_type,
extension_name,
extension_metadata,
type_class
)
}
#' Extension types
#'
#' Extension arrays are wrappers around regular Arrow [Array] objects
#' that provide some customized behaviour and/or storage. A common use-case
#' for extension types is to define a customized conversion between an
#' an Arrow [Array] and an R object when the default conversion is slow
#' or loses metadata important to the interpretation of values in the array.
#' For most types, the built-in
#' [vctrs extension type][vctrs_extension_type] is probably sufficient.
#'
#' These functions create, register, and unregister [ExtensionType]
#' and [ExtensionArray] objects. To use an extension type you will have to:
#'
#' - Define an [R6::R6Class] that inherits from [ExtensionType] and reimplement
#' one or more methods (e.g., `deserialize_instance()`).
#' - Make a type constructor function (e.g., `my_extension_type()`) that calls
#' [new_extension_type()] to create an R6 instance that can be used as a
#' [data type][data-type] elsewhere in the package.
#' - Make an array constructor function (e.g., `my_extension_array()`) that
#' calls [new_extension_array()] to create an [Array] instance of your
#' extension type.
#' - Register a dummy instance of your extension type created using
#' you constructor function using [register_extension_type()].
#'
#' If defining an extension type in an R package, you will probably want to
#' use [reregister_extension_type()] in that package's [.onLoad()] hook
#' since your package will probably get reloaded in the same R session
#' during its development and [register_extension_type()] will error if
#' called twice for the same `extension_name`. For an example of an
#' extension type that uses most of these features, see
#' [vctrs_extension_type()].
#'
#' @param storage_type The [data type][data-type] of the underlying storage
#' array.
#' @param storage_array An [Array] object of the underlying storage.
#' @param extension_type An [ExtensionType] instance.
#' @param extension_name The extension name. This should be namespaced using
#' "dot" syntax (i.e., "some_package.some_type"). The namespace "arrow"
#' is reserved for extension types defined by the Apache Arrow libraries.
#' @param extension_metadata A [raw()] or [character()] vector containing the
#' serialized version of the type. Character vectors must be length 1 and
#' are converted to UTF-8 before converting to [raw()].
#' @param type_class An [R6::R6Class] whose `$new()` class method will be
#' used to construct a new instance of the type.
#'
#' @return
#' - `new_extension_type()` returns an [ExtensionType] instance according
#' to the `type_class` specified.
#' - `new_extension_array()` returns an [ExtensionArray] whose `$type`
#' corresponds to `extension_type`.
#' - `register_extension_type()`, `unregister_extension_type()`
#' and `reregister_extension_type()` return `NULL`, invisibly.
#' @export
#'
#' @examples
#' # Create the R6 type whose methods control how Array objects are
#' # converted to R objects, how equality between types is computed,
#' # and how types are printed.
#' QuantizedType <- R6::R6Class(
#' "QuantizedType",
#' inherit = ExtensionType,
#' public = list(
#' # methods to access the custom metadata fields
#' center = function() private$.center,
#' scale = function() private$.scale,
#'
#' # called when an Array of this type is converted to an R vector
#' as_vector = function(extension_array) {
#' if (inherits(extension_array, "ExtensionArray")) {
#' unquantized_arrow <-
#' (extension_array$storage()$cast(float64()) / private$.scale) +
#' private$.center
#'
#' as.vector(unquantized_arrow)
#' } else {
#' super$as_vector(extension_array)
#' }
#' },
#'
#' # populate the custom metadata fields from the serialized metadata
#' deserialize_instance = function() {
#' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]])
#' private$.center <- vals[1]
#' private$.scale <- vals[2]
#' }
#' ),
#' private = list(
#' .center = NULL,
#' .scale = NULL
#' )
#' )
#'
#' # Create a helper type constructor that calls new_extension_type()
#' quantized <- function(center = 0, scale = 1, storage_type = int32()) {
#' new_extension_type(
#' storage_type = storage_type,
#' extension_name = "arrow.example.quantized",
#' extension_metadata = paste(center, scale, sep = ";"),
#' type_class = QuantizedType
#' )
#' }
#'
#' # Create a helper array constructor that calls new_extension_array()
#' quantized_array <- function(x, center = 0, scale = 1,
#' storage_type = int32()) {
#' type <- quantized(center, scale, storage_type)
#' new_extension_array(
#' Array$create((x - center) * scale, type = storage_type),
#' type
#' )
#' }
#'
#' # Register the extension type so that Arrow knows what to do when
#' # it encounters this extension type
#' reregister_extension_type(quantized())
#'
#' # Create Array objects and use them!
#' (vals <- runif(5, min = 19, max = 21))
#'
#' (array <- quantized_array(
#' vals,
#' center = 20,
#' scale = 2^15 - 1,
#' storage_type = int16()
#' )
#' )
#'
#' array$type$center()
#' array$type$scale()
#'
#' as.vector(array)
new_extension_type <- function(storage_type,
extension_name,
extension_metadata = raw(),
type_class = ExtensionType) {
ExtensionType$create(
storage_type,
extension_name,
extension_metadata,
type_class
)
}
#' @rdname new_extension_type
#' @export
new_extension_array <- function(storage_array, extension_type) {
ExtensionArray$create(storage_array, extension_type)
}
#' @rdname new_extension_type
#' @export
register_extension_type <- function(extension_type) {
assert_is(extension_type, "ExtensionType")
arrow__RegisterRExtensionType(extension_type)
}
#' @rdname new_extension_type
#' @export
reregister_extension_type <- function(extension_type) {
tryCatch(
register_extension_type(extension_type),
error = function(e) {
unregister_extension_type(extension_type$extension_name())
register_extension_type(extension_type)
}
)
}
#' @rdname new_extension_type
#' @export
unregister_extension_type <- function(extension_name) {
arrow__UnregisterRExtensionType(extension_name)
}
#' @importFrom utils capture.output
VctrsExtensionType <- R6Class("VctrsExtensionType",
inherit = ExtensionType,
public = list(
ptype = function() private$.ptype,
ToString = function() {
paste0(capture.output(print(self$ptype())), collapse = "\n")
},
deserialize_instance = function() {
private$.ptype <- safe_unserialize(self$extension_metadata())
attributes(private$.ptype) <- safe_r_metadata(attributes(private$.ptype))
},
ExtensionEquals = function(other) {
inherits(other, "VctrsExtensionType") && identical(self$ptype(), other$ptype())
},
as_vector = function(extension_array) {
if (inherits(extension_array, "ChunkedArray")) {
# rather than convert one array at a time, use more Arrow
# machinery to convert the whole ChunkedArray at once
storage_arrays <- lapply(
seq_len(extension_array$num_chunks) - 1L,
function(i) extension_array$chunk(i)$storage()
)
storage <- chunked_array(!!!storage_arrays, type = self$storage_type())
vctrs::vec_restore(storage$as_vector(), self$ptype())
} else if (inherits(extension_array, "Array")) {
vctrs::vec_restore(
super$as_vector(extension_array),
self$ptype()
)
} else {
super$as_vector(extension_array)
}
}
),
private = list(
.ptype = NULL
)
)
#' Extension type for generic typed vectors
#'
#' Most common R vector types are converted automatically to a suitable
#' Arrow [data type][data-type] without the need for an extension type. For
#' vector types whose conversion is not suitably handled by default, you can
#' create a [vctrs_extension_array()], which passes [vctrs::vec_data()] to
#' `Array$create()` and calls [vctrs::vec_restore()] when the [Array] is
#' converted back into an R vector.
#'
#' @param x A vctr (i.e., [vctrs::vec_is()] returns `TRUE`).
#' @param ptype A [vctrs::vec_ptype()], which is usually a zero-length
#' version of the object with the appropriate attributes set. This value
#' will be serialized using [serialize()], so it should not refer to any
#' R object that can't be saved/reloaded.
#' @inheritParams new_extension_type
#'
#' @return
#' - `vctrs_extension_array()` returns an [ExtensionArray] instance with a
#' `vctrs_extension_type()`.
#' - `vctrs_extension_type()` returns an [ExtensionType] instance for the
#' extension name "arrow.r.vctrs".
#' @export
#'
#' @examples
#' (array <- vctrs_extension_array(as.POSIXlt("2022-01-02 03:45", tz = "UTC")))
#' array$type
#' as.vector(array)
#'
#' temp_feather <- tempfile()
#' write_feather(arrow_table(col = array), temp_feather)
#' read_feather(temp_feather)
#' unlink(temp_feather)
vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x),
storage_type = NULL) {
if (inherits(x, "ExtensionArray") && inherits(x$type, "VctrsExtensionType")) {
return(x)
}
vctrs::vec_assert(x)
storage <- Array$create(vctrs::vec_data(x), type = storage_type)
type <- vctrs_extension_type(ptype, storage$type)
new_extension_array(storage, type)
}
#' @rdname vctrs_extension_array
#' @export
vctrs_extension_type <- function(x,
storage_type = infer_type(vctrs::vec_data(x))) {
ptype <- vctrs::vec_ptype(x)
# Make sure there are no unsupported objects buried in there
attributes(ptype) <- safe_r_metadata(attributes(ptype))
new_extension_type(
storage_type = storage_type,
extension_name = "arrow.r.vctrs",
extension_metadata = serialize(ptype, NULL, ascii = TRUE),
type_class = VctrsExtensionType
)
}