r/R/arrow-tabular.R - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 #' @include arrow-datum.R

 # Base class for RecordBatch and Table for S3 method dispatch only.
 # Does not exist in C++ class hierarchy
 ArrowTabular <- R6Class("ArrowTabular", inherit = ArrowObject,
   public = list(
     ToString = function() ToString_tabular(self),
     Take = function(i) {
       if (is.numeric(i)) {
         i <- as.integer(i)
       }
       if (is.integer(i)) {
         i <- Array$create(i)
       }
       assert_that(is.Array(i))
       call_function("take", self, i)
     },
     Filter = function(i, keep_na = TRUE) {
       if (is.logical(i)) {
         i <- Array$create(i)
       }
       assert_that(is.Array(i, "bool"))
       call_function("filter", self, i, options = list(keep_na = keep_na))
     },
     SortIndices = function(names, descending = FALSE) {
       assert_that(is.character(names))
       assert_that(length(names) > 0)
       assert_that(!any(is.na(names)))
       if (length(descending) == 1L) {
         descending <- rep_len(descending, length(names))
       }
       assert_that(is.logical(descending))
       assert_that(identical(length(names), length(descending)))
       assert_that(!any(is.na(descending)))
       call_function(
         "sort_indices",
         self,
         # cpp11 does not support logical vectors so convert to integer
         options = list(names = names, orders = as.integer(descending))
       )
     }
   )
 )

 #' @export
 as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
   tryCatch(
     df <- x$to_data_frame(),
     error = handle_embedded_nul_error
   )
   if (!is.null(r_metadata <- x$metadata$r)) {
     df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata))
   }
   df
 }

 #' @export
 `names<-.ArrowTabular` <- function(x, value) x$RenameColumns(value)

 #' @importFrom methods as
 #' @export
 `[.ArrowTabular` <- function(x, i, j, ..., drop = FALSE) {
   if (nargs() == 2L) {
     # List-like column extraction (x[i])
     return(x[, i])
   }
   if (!missing(j)) {
     # Selecting columns is cheaper than filtering rows, so do it first.
     # That way, if we're filtering too, we have fewer arrays to filter/slice/take
     if (is.character(j)) {
       j_new <- match(j, names(x))
       if (any(is.na(j_new))) {
         stop("Column not found: ", oxford_paste(j[is.na(j_new)]), call. = FALSE)
       }
       j <- j_new
     }
     if (is_integerish(j)) {
       if (any(is.na(j))) {
         stop("Column indices cannot be NA", call. = FALSE)
       }
       if (length(j) && all(j < 0)) {
         # in R, negative j means "everything but j"
         j <- setdiff(seq_len(x$num_columns), -1 * j)
       }
       x <- x$SelectColumns(as.integer(j) - 1L)
     }

     if (drop && ncol(x) == 1L) {
       x <- x$column(0)
     }
   }
   if (!missing(i)) {
     x <- filter_rows(x, i, ...)
   }
   x
 }

 #' @export
 `[[.ArrowTabular` <- function(x, i, ...) {
   if (is.character(i)) {
     x$GetColumnByName(i)
   } else if (is.numeric(i)) {
     x$column(i - 1)
   } else {
     stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
   }
 }

 #' @export
 `$.ArrowTabular` <- function(x, name, ...) {
   assert_that(is.string(name))
   if (name %in% ls(x)) {
     get(name, x)
   } else {
     x$GetColumnByName(name)
   }
 }

 #' @export
 `[[<-.ArrowTabular` <- function(x, i, value) {
   if (!is.character(i) & !is.numeric(i)) {
     stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
   }
   assert_that(length(i) == 1, !is.na(i))

   if (is.null(value)) {
     if (is.character(i)) {
       i <- match(i, names(x))
     }
     x <- x$RemoveColumn(i - 1L)
   } else {
     if (!is.character(i)) {
       # get or create a/the column name
       if (i <= x$num_columns) {
         i <- names(x)[i]
       } else {
         i <- as.character(i)
       }
     }

     # auto-magic recycling on non-ArrowObjects
     if (!inherits(value, "ArrowObject")) {
       value <- vctrs::vec_recycle(value, x$num_rows)
     }

     # construct the field
     if (inherits(x, "RecordBatch") && !inherits(value, "Array")) {
       value <- Array$create(value)
     } else if (inherits(x, "Table") && !inherits(value, "ChunkedArray")) {
       value <- ChunkedArray$create(value)
     }
     new_field <- field(i, value$type)

     if (i %in% names(x)) {
       i <- match(i, names(x)) - 1L
       x <- x$SetColumn(i, new_field, value)
     } else {
       i <- x$num_columns
       x <- x$AddColumn(i, new_field, value)
     }
   }
   x
 }

 #' @export
 `$<-.ArrowTabular` <- function(x, i, value) {
   assert_that(is.string(i))
   # We need to check if `i` is in names in case it is an active binding (e.g.
   # `metadata`, in which case we use assign to change the active binding instead
   # of the column in the table)
   if (i %in% ls(x)) {
     assign(i, value, x)
   } else {
     x[[i]] <- value
   }
   x
 }

 #' @export
 dim.ArrowTabular <- function(x) c(x$num_rows, x$num_columns)

 #' @export
 as.list.ArrowTabular <- function(x, ...) as.list(as.data.frame(x, ...))

 #' @export
 row.names.ArrowTabular <- function(x) as.character(seq_len(nrow(x)))

 #' @export
 dimnames.ArrowTabular <- function(x) list(row.names(x), names(x))

 #' @export
 head.ArrowTabular <- head.ArrowDatum

 #' @export
 tail.ArrowTabular <- tail.ArrowDatum

 #' @export
 na.fail.ArrowTabular <- function(object, ...) {
   for (col in seq_len(object$num_columns)) {
     if (object$column(col - 1L)$null_count > 0) {
       stop("missing values in object", call. = FALSE)
     }
   }
   object
 }

 #' @export
 na.omit.ArrowTabular <- function(object, ...) {
   not_na <- map(object$columns, ~call_function("is_valid", .x))
   not_na_agg <- Reduce("&", not_na)
   object$Filter(not_na_agg)
 }

 #' @export
 na.exclude.ArrowTabular <- na.omit.ArrowTabular

 ToString_tabular <- function(x, ...) {
   # Generic to work with both RecordBatch and Table
   sch <- unlist(strsplit(x$schema$ToString(), "\n"))
   sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch)
   dims <- sprintf("%s rows x %s columns", nrow(x), ncol(x))
   paste(c(dims, sch), collapse = "\n")
 }

 #' @export
 length.ArrowTabular <- function(x) x$num_columns
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	#' @include arrow-datum.R

	# Base class for RecordBatch and Table for S3 method dispatch only.
	# Does not exist in C++ class hierarchy
	ArrowTabular <- R6Class("ArrowTabular", inherit = ArrowObject,
	public = list(
	ToString = function() ToString_tabular(self),
	Take = function(i) {
	if (is.numeric(i)) {
	i <- as.integer(i)
	}
	if (is.integer(i)) {
	i <- Array$create(i)
	}
	assert_that(is.Array(i))
	call_function("take", self, i)
	},
	Filter = function(i, keep_na = TRUE) {
	if (is.logical(i)) {
	i <- Array$create(i)
	}
	assert_that(is.Array(i, "bool"))
	call_function("filter", self, i, options = list(keep_na = keep_na))
	},
	SortIndices = function(names, descending = FALSE) {
	assert_that(is.character(names))
	assert_that(length(names) > 0)
	assert_that(!any(is.na(names)))
	if (length(descending) == 1L) {
	descending <- rep_len(descending, length(names))
	}
	assert_that(is.logical(descending))
	assert_that(identical(length(names), length(descending)))
	assert_that(!any(is.na(descending)))
	call_function(
	"sort_indices",
	self,
	# cpp11 does not support logical vectors so convert to integer
	options = list(names = names, orders = as.integer(descending))
	)
	}
	)
	)

	#' @export
	as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
	tryCatch(
	df <- x$to_data_frame(),
	error = handle_embedded_nul_error
	)
	if (!is.null(r_metadata <- x$metadata$r)) {
	df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata))
	}
	df
	}

	#' @export
	`names<-.ArrowTabular` <- function(x, value) x$RenameColumns(value)

	#' @importFrom methods as
	#' @export
	`[.ArrowTabular` <- function(x, i, j, ..., drop = FALSE) {
	if (nargs() == 2L) {
	# List-like column extraction (x[i])
	return(x[, i])
	}
	if (!missing(j)) {
	# Selecting columns is cheaper than filtering rows, so do it first.
	# That way, if we're filtering too, we have fewer arrays to filter/slice/take
	if (is.character(j)) {
	j_new <- match(j, names(x))
	if (any(is.na(j_new))) {
	stop("Column not found: ", oxford_paste(j[is.na(j_new)]), call. = FALSE)
	}
	j <- j_new
	}
	if (is_integerish(j)) {
	if (any(is.na(j))) {
	stop("Column indices cannot be NA", call. = FALSE)
	}
	if (length(j) && all(j < 0)) {
	# in R, negative j means "everything but j"
	j <- setdiff(seq_len(x$num_columns), -1 * j)
	}
	x <- x$SelectColumns(as.integer(j) - 1L)
	}

	if (drop && ncol(x) == 1L) {
	x <- x$column(0)
	}
	}
	if (!missing(i)) {
	x <- filter_rows(x, i, ...)
	}
	x
	}

	#' @export
	`[[.ArrowTabular` <- function(x, i, ...) {
	if (is.character(i)) {
	x$GetColumnByName(i)
	} else if (is.numeric(i)) {
	x$column(i - 1)
	} else {
	stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
	}
	}

	#' @export
	`$.ArrowTabular` <- function(x, name, ...) {
	assert_that(is.string(name))
	if (name %in% ls(x)) {
	get(name, x)
	} else {
	x$GetColumnByName(name)
	}
	}

	#' @export
	`[[<-.ArrowTabular` <- function(x, i, value) {
	if (!is.character(i) & !is.numeric(i)) {
	stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
	}
	assert_that(length(i) == 1, !is.na(i))

	if (is.null(value)) {
	if (is.character(i)) {
	i <- match(i, names(x))
	}
	x <- x$RemoveColumn(i - 1L)
	} else {
	if (!is.character(i)) {
	# get or create a/the column name
	if (i <= x$num_columns) {
	i <- names(x)[i]
	} else {
	i <- as.character(i)
	}
	}

	# auto-magic recycling on non-ArrowObjects
	if (!inherits(value, "ArrowObject")) {
	value <- vctrs::vec_recycle(value, x$num_rows)
	}

	# construct the field
	if (inherits(x, "RecordBatch") && !inherits(value, "Array")) {
	value <- Array$create(value)
	} else if (inherits(x, "Table") && !inherits(value, "ChunkedArray")) {
	value <- ChunkedArray$create(value)
	}
	new_field <- field(i, value$type)

	if (i %in% names(x)) {
	i <- match(i, names(x)) - 1L
	x <- x$SetColumn(i, new_field, value)
	} else {
	i <- x$num_columns
	x <- x$AddColumn(i, new_field, value)
	}
	}
	x
	}

	#' @export
	`$<-.ArrowTabular` <- function(x, i, value) {
	assert_that(is.string(i))
	# We need to check if `i` is in names in case it is an active binding (e.g.
	# `metadata`, in which case we use assign to change the active binding instead
	# of the column in the table)
	if (i %in% ls(x)) {
	assign(i, value, x)
	} else {
	x[[i]] <- value
	}
	x
	}

	#' @export
	dim.ArrowTabular <- function(x) c(x$num_rows, x$num_columns)

	#' @export
	as.list.ArrowTabular <- function(x, ...) as.list(as.data.frame(x, ...))

	#' @export
	row.names.ArrowTabular <- function(x) as.character(seq_len(nrow(x)))

	#' @export
	dimnames.ArrowTabular <- function(x) list(row.names(x), names(x))

	#' @export
	head.ArrowTabular <- head.ArrowDatum

	#' @export
	tail.ArrowTabular <- tail.ArrowDatum

	#' @export
	na.fail.ArrowTabular <- function(object, ...) {
	for (col in seq_len(object$num_columns)) {
	if (object$column(col - 1L)$null_count > 0) {
	stop("missing values in object", call. = FALSE)
	}
	}
	object
	}

	#' @export
	na.omit.ArrowTabular <- function(object, ...) {
	not_na <- map(object$columns, ~call_function("is_valid", .x))
	not_na_agg <- Reduce("&", not_na)
	object$Filter(not_na_agg)
	}

	#' @export
	na.exclude.ArrowTabular <- na.omit.ArrowTabular

	ToString_tabular <- function(x, ...) {
	# Generic to work with both RecordBatch and Table
	sch <- unlist(strsplit(x$schema$ToString(), "\n"))
	sch <- sub("(.): (.)", "$\\1 <\\2>", sch)
	dims <- sprintf("%s rows x %s columns", nrow(x), ncol(x))
	paste(c(dims, sch), collapse = "\n")
	}

	#' @export
	length.ArrowTabular <- function(x) x$num_columns