r/R/table.R - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 #' @include record-batch.R
 #' @title Table class
 #' @description A Table is a sequence of [chunked arrays][ChunkedArray]. They
 #' have a similar interface to [record batches][RecordBatch], but they can be
 #' composed from multiple record batches or chunked arrays.
 #' @usage NULL
 #' @format NULL
 #' @docType class
 #'
 #' @section Factory:
 #'
 #' The `Table$create()` function takes the following arguments:
 #'
 #' * `...` arrays, chunked arrays, or R vectors, with names; alternatively,
 #'    an unnamed series of [record batches][RecordBatch] may also be provided,
 #'    which will be stacked as rows in the table.
 #' * `schema` a [Schema], or `NULL` (the default) to infer the schema from
 #'    the data in `...`
 #'
 #' @section S3 Methods and Usage:
 #' Tables are data-frame-like, and many methods you expect to work on
 #' a `data.frame` are implemented for `Table`. This includes `[`, `[[`,
 #' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull
 #' the data from an Arrow table into R with `as.data.frame()`. See the
 #' examples.
 #'
 #' A caveat about the `$` method: because `Table` is an `R6` object,
 #' `$` is also used to access the object's methods (see below). Methods take
 #' precedence over the table's columns. So, `tab$Slice` would return the
 #' "Slice" method function even if there were a column in the table called
 #' "Slice".
 #'
 #' @section R6 Methods:
 #' In addition to the more R-friendly S3 methods, a `Table` object has
 #' the following R6 methods that map onto the underlying C++ methods:
 #'
 #' - `$column(i)`: Extract a `ChunkedArray` by integer position from the table
 #' - `$ColumnNames()`: Get all column names (called by `names(tab)`)
 #' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name
 #' - `$field(i)`: Extract a `Field` from the table schema by integer position
 #' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers.
 #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
 #'    indicated integer offset and going for the given length, or to the end
 #'    of the table if `NULL`, the default.
 #' - `$Take(i)`: return an `Table` with rows at positions given by
 #'    integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be
 #'    coerced to an R vector before taking.
 #' - `$Filter(i, keep_na = TRUE)`: return an `Table` with rows at positions where logical
 #'    vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`.
 #' - `$serialize(output_stream, ...)`: Write the table to the given
 #'    [OutputStream]
 #' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter
 #'    the schema of the record batch.
 #'
 #' There are also some active bindings:
 #' - `$num_columns`
 #' - `$num_rows`
 #' - `$schema`
 #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
 #'    Modify or replace by assigning in (`tab$metadata <- new_metadata`).
 #'    All list elements are coerced to string.
 #' - `$columns`: Returns a list of `ChunkedArray`s
 #' @rdname Table
 #' @name Table
 #' @examples
 #' \donttest{
 #' tab <- Table$create(name = rownames(mtcars), mtcars)
 #' dim(tab)
 #' dim(head(tab))
 #' names(tab)
 #' tab$mpg
 #' tab[["cyl"]]
 #' as.data.frame(tab[4:8, c("gear", "hp", "wt")])
 #' }
 #' @export
 Table <- R6Class("Table", inherit = ArrowObject,
   public = list(
     column = function(i) {
       shared_ptr(ChunkedArray, Table__column(self, i))
     },
     ColumnNames = function() Table__ColumnNames(self),
     GetColumnByName = function(name) {
       assert_is(name, "character")
       assert_that(length(name) == 1)
       shared_ptr(ChunkedArray, Table__GetColumnByName(self, name))
     },
     field = function(i) shared_ptr(Field, Table__field(self, i)),

     serialize = function(output_stream, ...) write_table(self, output_stream, ...),
     ToString = function() ToString_tabular(self),

     cast = function(target_schema, safe = TRUE, options = cast_options(safe)) {
       assert_is(target_schema, "Schema")
       assert_is(options, "CastOptions")
       assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas")
       shared_ptr(Table, Table__cast(self, target_schema, options))
     },

     SelectColumns = function(indices) {
       shared_ptr(Table, Table__SelectColumns(self, indices))
     },

     Slice = function(offset, length = NULL) {
       if (is.null(length)) {
         shared_ptr(Table, Table__Slice1(self, offset))
       } else {
         shared_ptr(Table, Table__Slice2(self, offset, length))
       }
     },
     Take = function(i) {
       if (is.numeric(i)) {
         i <- as.integer(i)
       }
       if (is.integer(i)) {
         i <- Array$create(i)
       }
       shared_ptr(Table, call_function("take", self, i))
     },
     Filter = function(i, keep_na = TRUE) {
       if (is.logical(i)) {
         i <- Array$create(i)
       }
       shared_ptr(Table, call_function("filter", self, i, options = list(keep_na = keep_na)))
     },

     Equals = function(other, check_metadata = FALSE, ...) {
       inherits(other, "Table") && Table__Equals(self, other, isTRUE(check_metadata))
     },

     Validate = function() {
       Table__Validate(self)
     },

     ValidateFull = function() {
       Table__ValidateFull(self)
     }
   ),

   active = list(
     num_columns = function() Table__num_columns(self),
     num_rows = function() Table__num_rows(self),
     schema = function() shared_ptr(Schema, Table__schema(self)),
     metadata = function(new) {
       if (missing(new)) {
         # Get the metadata (from the schema)
         self$schema$metadata
       } else {
         # Set the metadata
         new <- prepare_key_value_metadata(new)
         out <- Table__ReplaceSchemaMetadata(self, new)
         # ReplaceSchemaMetadata returns a new object but we're modifying in place,
         # so swap in that new C++ object pointer into our R6 object
         self$set_pointer(out)
         self
       }
     },
     columns = function() map(Table__columns(self), shared_ptr, class = ChunkedArray)
   )
 )

 arrow_attributes <- function(x, only_top_level = FALSE) {
   att <- attributes(x)

   removed_attributes <- character()
   if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) {
     removed_attributes <- c("class", "row.names", "names")
   } else if (inherits(x, "data.frame")) {
     removed_attributes <- c("row.names", "names")
   } else if (inherits(x, "factor")) {
     removed_attributes <- c("class", "levels")
   } else if (inherits(x, "integer64") || inherits(x, "Date")) {
     removed_attributes <- c("class")
   } else if (inherits(x, "POSIXct")) {
     removed_attributes <- c("class", "tzone")
   } else if (inherits(x, "hms") || inherits(x, "difftime")) {
     removed_attributes <- c("class", "units")
   }

   att <- att[setdiff(names(att), removed_attributes)]
   if (isTRUE(only_top_level)) {
     return(att)
   }

   if (is.data.frame(x)) {
     columns <- map(x, arrow_attributes)
     if (length(att) || !all(map_lgl(columns, is.null))) {
       list(attributes = att, columns = columns)
     }
   } else if (length(att)) {
     list(attributes = att, columns = NULL)
   } else {
     NULL
   }
 }

 Table$create <- function(..., schema = NULL) {
   dots <- list2(...)
   # making sure there are always names
   if (is.null(names(dots))) {
     names(dots) <- rep_len("", length(dots))
   }
   stopifnot(length(dots) > 0)
   if (all_record_batches(dots)) {
     shared_ptr(Table, Table__from_record_batches(dots, schema))
   } else {
     shared_ptr(Table, Table__from_dots(dots, schema))
   }
 }

 #' @export
 as.data.frame.Table <- function(x, row.names = NULL, optional = FALSE, ...) {
   df <- Table__to_dataframe(x, use_threads = option_use_threads())
   if (!is.null(r_metadata <- x$metadata$r)) {
     df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata))
   }
   df
 }

 #' @export
 as.list.Table <- as.list.RecordBatch

 #' @export
 row.names.Table <- row.names.RecordBatch

 #' @export
 dimnames.Table <- dimnames.RecordBatch

 #' @export
 dim.Table <- function(x) c(x$num_rows, x$num_columns)

 #' @export
 names.Table <- function(x) x$ColumnNames()

 #' @export
 `[.Table` <- `[.RecordBatch`

 #' @export
 `[[.Table` <- `[[.RecordBatch`

 #' @export
 `$.Table` <- `$.RecordBatch`

 #' @export
 head.Table <- head.RecordBatch

 #' @export
 tail.Table <- tail.RecordBatch
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	#' @include record-batch.R
	#' @title Table class
	#' @description A Table is a sequence of [chunked arrays][ChunkedArray]. They
	#' have a similar interface to [record batches][RecordBatch], but they can be
	#' composed from multiple record batches or chunked arrays.
	#' @usage NULL
	#' @format NULL
	#' @docType class
	#'
	#' @section Factory:
	#'
	#' The `Table$create()` function takes the following arguments:
	#'
	#' * `...` arrays, chunked arrays, or R vectors, with names; alternatively,
	#' an unnamed series of [record batches][RecordBatch] may also be provided,
	#' which will be stacked as rows in the table.
	#' * `schema` a [Schema], or `NULL` (the default) to infer the schema from
	#' the data in `...`
	#'
	#' @section S3 Methods and Usage:
	#' Tables are data-frame-like, and many methods you expect to work on
	#' a `data.frame` are implemented for `Table`. This includes `[`, `[[`,
	#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull
	#' the data from an Arrow table into R with `as.data.frame()`. See the
	#' examples.
	#'
	#' A caveat about the `$` method: because `Table` is an `R6` object,
	#' `$` is also used to access the object's methods (see below). Methods take
	#' precedence over the table's columns. So, `tab$Slice` would return the
	#' "Slice" method function even if there were a column in the table called
	#' "Slice".
	#'
	#' @section R6 Methods:
	#' In addition to the more R-friendly S3 methods, a `Table` object has
	#' the following R6 methods that map onto the underlying C++ methods:
	#'
	#' - `$column(i)`: Extract a `ChunkedArray` by integer position from the table
	#' - `$ColumnNames()`: Get all column names (called by `names(tab)`)
	#' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name
	#' - `$field(i)`: Extract a `Field` from the table schema by integer position
	#' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers.
	#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
	#' indicated integer offset and going for the given length, or to the end
	#' of the table if `NULL`, the default.
	#' - `$Take(i)`: return an `Table` with rows at positions given by
	#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be
	#' coerced to an R vector before taking.
	#' - `$Filter(i, keep_na = TRUE)`: return an `Table` with rows at positions where logical
	#' vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`.
	#' - `$serialize(output_stream, ...)`: Write the table to the given
	#' [OutputStream]
	#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter
	#' the schema of the record batch.
	#'
	#' There are also some active bindings:
	#' - `$num_columns`
	#' - `$num_rows`
	#' - `$schema`
	#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
	#' Modify or replace by assigning in (`tab$metadata <- new_metadata`).
	#' All list elements are coerced to string.
	#' - `$columns`: Returns a list of `ChunkedArray`s
	#' @rdname Table
	#' @name Table
	#' @examples
	#' \donttest{
	#' tab <- Table$create(name = rownames(mtcars), mtcars)
	#' dim(tab)
	#' dim(head(tab))
	#' names(tab)
	#' tab$mpg
	#' tab[["cyl"]]
	#' as.data.frame(tab[4:8, c("gear", "hp", "wt")])
	#' }
	#' @export
	Table <- R6Class("Table", inherit = ArrowObject,
	public = list(
	column = function(i) {
	shared_ptr(ChunkedArray, Table__column(self, i))
	},
	ColumnNames = function() Table__ColumnNames(self),
	GetColumnByName = function(name) {
	assert_is(name, "character")
	assert_that(length(name) == 1)
	shared_ptr(ChunkedArray, Table__GetColumnByName(self, name))
	},
	field = function(i) shared_ptr(Field, Table__field(self, i)),

	serialize = function(output_stream, ...) write_table(self, output_stream, ...),
	ToString = function() ToString_tabular(self),

	cast = function(target_schema, safe = TRUE, options = cast_options(safe)) {
	assert_is(target_schema, "Schema")
	assert_is(options, "CastOptions")
	assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas")
	shared_ptr(Table, Table__cast(self, target_schema, options))
	},

	SelectColumns = function(indices) {
	shared_ptr(Table, Table__SelectColumns(self, indices))
	},

	Slice = function(offset, length = NULL) {
	if (is.null(length)) {
	shared_ptr(Table, Table__Slice1(self, offset))
	} else {
	shared_ptr(Table, Table__Slice2(self, offset, length))
	}
	},
	Take = function(i) {
	if (is.numeric(i)) {
	i <- as.integer(i)
	}
	if (is.integer(i)) {
	i <- Array$create(i)
	}
	shared_ptr(Table, call_function("take", self, i))
	},
	Filter = function(i, keep_na = TRUE) {
	if (is.logical(i)) {
	i <- Array$create(i)
	}
	shared_ptr(Table, call_function("filter", self, i, options = list(keep_na = keep_na)))
	},

	Equals = function(other, check_metadata = FALSE, ...) {
	inherits(other, "Table") && Table__Equals(self, other, isTRUE(check_metadata))
	},

	Validate = function() {
	Table__Validate(self)
	},

	ValidateFull = function() {
	Table__ValidateFull(self)
	}
	),

	active = list(
	num_columns = function() Table__num_columns(self),
	num_rows = function() Table__num_rows(self),
	schema = function() shared_ptr(Schema, Table__schema(self)),
	metadata = function(new) {
	if (missing(new)) {
	# Get the metadata (from the schema)
	self$schema$metadata
	} else {
	# Set the metadata
	new <- prepare_key_value_metadata(new)
	out <- Table__ReplaceSchemaMetadata(self, new)
	# ReplaceSchemaMetadata returns a new object but we're modifying in place,
	# so swap in that new C++ object pointer into our R6 object
	self$set_pointer(out)
	self
	}
	},
	columns = function() map(Table__columns(self), shared_ptr, class = ChunkedArray)
	)
	)

	arrow_attributes <- function(x, only_top_level = FALSE) {
	att <- attributes(x)

	removed_attributes <- character()
	if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) {
	removed_attributes <- c("class", "row.names", "names")
	} else if (inherits(x, "data.frame")) {
	removed_attributes <- c("row.names", "names")
	} else if (inherits(x, "factor")) {
	removed_attributes <- c("class", "levels")
	} else if (inherits(x, "integer64") \|\| inherits(x, "Date")) {
	removed_attributes <- c("class")
	} else if (inherits(x, "POSIXct")) {
	removed_attributes <- c("class", "tzone")
	} else if (inherits(x, "hms") \|\| inherits(x, "difftime")) {
	removed_attributes <- c("class", "units")
	}

	att <- att[setdiff(names(att), removed_attributes)]
	if (isTRUE(only_top_level)) {
	return(att)
	}

	if (is.data.frame(x)) {
	columns <- map(x, arrow_attributes)
	if (length(att) \|\| !all(map_lgl(columns, is.null))) {
	list(attributes = att, columns = columns)
	}
	} else if (length(att)) {
	list(attributes = att, columns = NULL)
	} else {
	NULL
	}
	}

	Table$create <- function(..., schema = NULL) {
	dots <- list2(...)
	# making sure there are always names
	if (is.null(names(dots))) {
	names(dots) <- rep_len("", length(dots))
	}
	stopifnot(length(dots) > 0)
	if (all_record_batches(dots)) {
	shared_ptr(Table, Table__from_record_batches(dots, schema))
	} else {
	shared_ptr(Table, Table__from_dots(dots, schema))
	}
	}

	#' @export
	as.data.frame.Table <- function(x, row.names = NULL, optional = FALSE, ...) {
	df <- Table__to_dataframe(x, use_threads = option_use_threads())
	if (!is.null(r_metadata <- x$metadata$r)) {
	df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata))
	}
	df
	}

	#' @export
	as.list.Table <- as.list.RecordBatch

	#' @export
	row.names.Table <- row.names.RecordBatch

	#' @export
	dimnames.Table <- dimnames.RecordBatch

	#' @export
	dim.Table <- function(x) c(x$num_rows, x$num_columns)

	#' @export
	names.Table <- function(x) x$ColumnNames()

	#' @export
	`[.Table` <- `[.RecordBatch`

	#' @export
	`[[.Table` <- `[[.RecordBatch`

	#' @export
	`$.Table` <- `$.RecordBatch`

	#' @export
	head.Table <- head.RecordBatch

	#' @export
	tail.Table <- tail.RecordBatch