| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| context("RecordBatch") |
| |
| test_that("RecordBatch", { |
| # Note that we're reusing `tbl` and `batch` throughout the tests in this file |
| tbl <- tibble::tibble( |
| int = 1:10, |
| dbl = as.numeric(1:10), |
| lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), |
| chr = letters[1:10], |
| fct = factor(letters[1:10]) |
| ) |
| batch <- record_batch(tbl) |
| |
| expect_equal(batch, batch) |
| expect_equal( |
| batch$schema, |
| schema( |
| int = int32(), dbl = float64(), |
| lgl = boolean(), chr = utf8(), |
| fct = dictionary(int8(), utf8()) |
| ) |
| ) |
| expect_equal(batch$num_columns, 5L) |
| expect_equal(batch$num_rows, 10L) |
| expect_equal(batch$column_name(0), "int") |
| expect_equal(batch$column_name(1), "dbl") |
| expect_equal(batch$column_name(2), "lgl") |
| expect_equal(batch$column_name(3), "chr") |
| expect_equal(batch$column_name(4), "fct") |
| expect_equal(names(batch), c("int", "dbl", "lgl", "chr", "fct")) |
| |
| # input validation |
| expect_error(batch$column_name(NA), "'i' cannot be NA") |
| expect_error(batch$column_name(-1), "subscript out of bounds") |
| expect_error(batch$column_name(1000), "subscript out of bounds") |
| expect_error(batch$column_name(1:2)) |
| expect_error(batch$column_name("one")) |
| |
| col_int <- batch$column(0) |
| expect_true(inherits(col_int, 'Array')) |
| expect_equal(col_int$as_vector(), tbl$int) |
| expect_equal(col_int$type, int32()) |
| |
| col_dbl <- batch$column(1) |
| expect_true(inherits(col_dbl, 'Array')) |
| expect_equal(col_dbl$as_vector(), tbl$dbl) |
| expect_equal(col_dbl$type, float64()) |
| |
| col_lgl <- batch$column(2) |
| expect_true(inherits(col_dbl, 'Array')) |
| expect_equal(col_lgl$as_vector(), tbl$lgl) |
| expect_equal(col_lgl$type, boolean()) |
| |
| col_chr <- batch$column(3) |
| expect_true(inherits(col_chr, 'Array')) |
| expect_equal(col_chr$as_vector(), tbl$chr) |
| expect_equal(col_chr$type, utf8()) |
| |
| col_fct <- batch$column(4) |
| expect_true(inherits(col_fct, 'Array')) |
| expect_equal(col_fct$as_vector(), tbl$fct) |
| expect_equal(col_fct$type, dictionary(int8(), utf8())) |
| |
| # input validation |
| expect_error(batch$column(NA), "'i' cannot be NA") |
| expect_error(batch$column(-1), "subscript out of bounds") |
| expect_error(batch$column(1000), "subscript out of bounds") |
| expect_error(batch$column(1:2)) |
| expect_error(batch$column("one")) |
| |
| batch2 <- batch$RemoveColumn(0) |
| expect_equal( |
| batch2$schema, |
| schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8())) |
| ) |
| expect_equal(batch2$column(0), batch$column(1)) |
| expect_data_frame(batch2, tbl[,-1]) |
| |
| # input validation |
| expect_error(batch$RemoveColumn(NA), "'i' cannot be NA") |
| expect_error(batch$RemoveColumn(-1), "subscript out of bounds") |
| expect_error(batch$RemoveColumn(1000), "subscript out of bounds") |
| expect_error(batch$RemoveColumn(1:2)) |
| expect_error(batch$RemoveColumn("one")) |
| }) |
| |
| test_that("RecordBatch S3 methods", { |
| tab <- RecordBatch$create(example_data) |
| for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names", "as.list")) { |
| fun <- get(f) |
| expect_identical(fun(tab), fun(example_data), info = f) |
| } |
| }) |
| |
| test_that("RecordBatch$Slice", { |
| batch3 <- batch$Slice(5) |
| expect_data_frame(batch3, tbl[6:10,]) |
| |
| batch4 <- batch$Slice(5, 2) |
| expect_data_frame(batch4, tbl[6:7,]) |
| |
| # Input validation |
| expect_error(batch$Slice("ten")) |
| expect_error(batch$Slice(NA_integer_), "Slice 'offset' cannot be NA") |
| expect_error(batch$Slice(NA), "Slice 'offset' cannot be NA") |
| expect_error(batch$Slice(10, "ten")) |
| expect_error(batch$Slice(10, NA_integer_), "Slice 'length' cannot be NA") |
| expect_error(batch$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA") |
| expect_error(batch$Slice(c(10, 10))) |
| expect_error(batch$Slice(10, c(10, 10))) |
| expect_error(batch$Slice(1000), "Slice 'offset' greater than array length") |
| expect_error(batch$Slice(-1), "Slice 'offset' cannot be negative") |
| expect_error(batch4$Slice(10, 10), "Slice 'offset' greater than array length") |
| expect_error(batch$Slice(10, -1), "Slice 'length' cannot be negative") |
| expect_error(batch$Slice(-1, 10), "Slice 'offset' cannot be negative") |
| }) |
| |
| test_that("[ on RecordBatch", { |
| expect_data_frame(batch[6:7,], tbl[6:7,]) |
| expect_data_frame(batch[c(6, 7),], tbl[6:7,]) |
| expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4]) |
| expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) |
| expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr) |
| expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) |
| expect_data_frame( |
| batch[rep(c(FALSE, TRUE), 5),], |
| tbl[c(2, 4, 6, 8, 10),] |
| ) |
| # bool Array |
| expect_data_frame(batch[batch$lgl,], tbl[tbl$lgl,]) |
| # int Array |
| expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) |
| |
| # input validation |
| expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"') |
| expect_error(batch[, c(6, NA)], 'Column indices cannot be NA') |
| expect_error(batch[, c(2, -2)], 'Invalid column index') |
| }) |
| |
| test_that("[[ and $ on RecordBatch", { |
| expect_vector(batch[["int"]], tbl$int) |
| expect_vector(batch$int, tbl$int) |
| expect_vector(batch[[4]], tbl$chr) |
| expect_null(batch$qwerty) |
| expect_null(batch[["asdf"]]) |
| expect_error(batch[[c(4, 3)]]) |
| expect_error(batch[[NA]], "'i' must be character or numeric, not logical") |
| expect_error(batch[[NULL]], "'i' must be character or numeric, not NULL") |
| expect_error(batch[[c("asdf", "jkl;")]], 'name is not a string', fixed = TRUE) |
| }) |
| |
| test_that("[[<- assignment", { |
| tbl <- tibble::tibble( |
| int = 1:10, |
| dbl = as.numeric(1:10), |
| lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), |
| chr = letters[1:10], |
| fct = factor(letters[1:10]) |
| ) |
| batch <- RecordBatch$create(tbl) |
| |
| # can remove a column |
| batch[["chr"]] <- NULL |
| expect_data_frame(batch, tbl[-4]) |
| |
| # can remove a column by index |
| batch[[4]] <- NULL |
| expect_data_frame(batch, tbl[1:3]) |
| |
| # can add a named column |
| batch[["new"]] <- letters[10:1] |
| expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) |
| |
| # can replace a column by index |
| batch[[2]] <- as.numeric(10:1) |
| expect_vector(batch[[2]], as.numeric(10:1)) |
| |
| # can add a column by index |
| batch[[5]] <- as.numeric(10:1) |
| expect_vector(batch[[5]], as.numeric(10:1)) |
| expect_vector(batch[["5"]], as.numeric(10:1)) |
| |
| # can replace a column |
| batch[["int"]] <- 10:1 |
| expect_vector(batch[["int"]], 10:1) |
| |
| # can use $ |
| batch$new <- NULL |
| expect_null(as.vector(batch$new)) |
| expect_identical(dim(batch), c(10L, 4L)) |
| |
| batch$int <- 1:10 |
| expect_vector(batch$int, 1:10) |
| |
| # recycling |
| batch[["atom"]] <- 1L |
| expect_vector(batch[["atom"]], rep(1L, 10)) |
| |
| expect_error( |
| batch[["atom"]] <- 1:6, |
| "Can't recycle input of size 6 to size 10." |
| ) |
| |
| # assign Arrow array |
| array <- Array$create(c(10:1)) |
| batch$array <- array |
| expect_vector(batch$array, 10:1) |
| |
| # nonsense indexes |
| expect_error(batch[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical") |
| expect_error(batch[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL") |
| expect_error(batch[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) |
| expect_error(batch[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) |
| expect_error(batch[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) |
| expect_error(batch[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE) |
| }) |
| |
| test_that("head and tail on RecordBatch", { |
| tbl <- tibble::tibble( |
| int = 1:10, |
| dbl = as.numeric(1:10), |
| lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), |
| chr = letters[1:10], |
| fct = factor(letters[1:10]) |
| ) |
| batch <- RecordBatch$create(tbl) |
| expect_data_frame(head(batch), head(tbl)) |
| expect_data_frame(head(batch, 4), head(tbl, 4)) |
| expect_data_frame(head(batch, 40), head(tbl, 40)) |
| expect_data_frame(head(batch, -4), head(tbl, -4)) |
| expect_data_frame(head(batch, -40), head(tbl, -40)) |
| expect_data_frame(tail(batch), tail(tbl)) |
| expect_data_frame(tail(batch, 4), tail(tbl, 4)) |
| expect_data_frame(tail(batch, 40), tail(tbl, 40)) |
| expect_data_frame(tail(batch, -4), tail(tbl, -4)) |
| expect_data_frame(tail(batch, -40), tail(tbl, -40)) |
| }) |
| |
| test_that("RecordBatch print method", { |
| expect_output( |
| print(batch), |
| paste( |
| "RecordBatch", |
| "10 rows x 5 columns", |
| "$int <int32>", |
| "$dbl <double>", |
| "$lgl <bool>", |
| "$chr <string>", |
| "$fct <dictionary<values=string, indices=int8>>", |
| sep = "\n" |
| ), |
| fixed = TRUE |
| ) |
| }) |
| |
| test_that("RecordBatch with 0 rows are supported", { |
| tbl <- tibble::tibble( |
| int = integer(), |
| dbl = numeric(), |
| lgl = logical(), |
| chr = character(), |
| fct = factor(character(), levels = c("a", "b")) |
| ) |
| |
| batch <- record_batch(tbl) |
| expect_equal(batch$num_columns, 5L) |
| expect_equal(batch$num_rows, 0L) |
| expect_equal( |
| batch$schema, |
| schema( |
| int = int32(), |
| dbl = float64(), |
| lgl = boolean(), |
| chr = utf8(), |
| fct = dictionary(int8(), utf8()) |
| ) |
| ) |
| }) |
| |
| test_that("RecordBatch cast (ARROW-3741)", { |
| batch <- record_batch(x = 1:10, y = 1:10) |
| |
| expect_error(batch$cast(schema(x = int32()))) |
| expect_error(batch$cast(schema(x = int32(), z = int32()))) |
| |
| s2 <- schema(x = int16(), y = int64()) |
| batch2 <- batch$cast(s2) |
| expect_equal(batch2$schema, s2) |
| expect_equal(batch2$column(0L)$type, int16()) |
| expect_equal(batch2$column(1L)$type, int64()) |
| }) |
| |
| test_that("record_batch() handles schema= argument", { |
| s <- schema(x = int32(), y = int32()) |
| batch <- record_batch(x = 1:10, y = 1:10, schema = s) |
| expect_equal(s, batch$schema) |
| |
| s <- schema(x = int32(), y = float64()) |
| batch <- record_batch(x = 1:10, y = 1:10, schema = s) |
| expect_equal(s, batch$schema) |
| |
| s <- schema(x = int32(), y = utf8()) |
| expect_error(record_batch(x = 1:10, y = 1:10, schema = s)) |
| }) |
| |
| test_that("record_batch(schema=) does some basic consistency checking of the schema", { |
| s <- schema(x = int32()) |
| expect_error(record_batch(x = 1:10, y = 1:10, schema = s)) |
| expect_error(record_batch(z = 1:10, schema = s)) |
| }) |
| |
| test_that("RecordBatch dim() and nrow() (ARROW-3816)", { |
| batch <- record_batch(x = 1:10, y = 1:10) |
| expect_equal(dim(batch), c(10L, 2L)) |
| expect_equal(nrow(batch), 10L) |
| }) |
| |
| test_that("record_batch() handles Array", { |
| batch <- record_batch(x = 1:10, y = Array$create(1:10)) |
| expect_equal(batch$schema, schema(x = int32(), y = int32())) |
| }) |
| |
| test_that("record_batch() handles data frame columns", { |
| tib <- tibble::tibble(x = 1:10, y = 1:10) |
| # because tib is named here, this becomes a struct array |
| batch <- record_batch(a = 1:10, b = tib) |
| expect_equivalent( |
| batch$schema, |
| schema( |
| a = int32(), |
| b = struct(x = int32(), y = int32()) |
| ) |
| ) |
| out <- as.data.frame(batch) |
| expect_equivalent(out, tibble::tibble(a = 1:10, b = tib)) |
| |
| # if not named, columns from tib are auto spliced |
| batch2 <- record_batch(a = 1:10, tib) |
| expect_equal( |
| batch2$schema, |
| schema(a = int32(), x = int32(), y = int32()) |
| ) |
| out <- as.data.frame(batch2) |
| expect_equivalent(out, tibble::tibble(a = 1:10, !!!tib)) |
| }) |
| |
| test_that("record_batch() handles data frame columns with schema spec", { |
| tib <- tibble::tibble(x = 1:10, y = 1:10) |
| tib_float <- tib |
| tib_float$y <- as.numeric(tib_float$y) |
| schema <- schema(a = int32(), b = struct(x = int16(), y = float64())) |
| batch <- record_batch(a = 1:10, b = tib, schema = schema) |
| expect_equivalent(batch$schema, schema) |
| out <- as.data.frame(batch) |
| expect_equivalent(out, tibble::tibble(a = 1:10, b = tib_float)) |
| |
| schema <- schema(a = int32(), b = struct(x = int16(), y = utf8())) |
| expect_error(record_batch(a = 1:10, b = tib, schema = schema)) |
| }) |
| |
| test_that("record_batch() auto splices (ARROW-5718)", { |
| df <- tibble::tibble(x = 1:10, y = letters[1:10]) |
| batch1 <- record_batch(df) |
| batch2 <- record_batch(!!!df) |
| expect_equal(batch1, batch2) |
| expect_equal(batch1$schema, schema(x = int32(), y = utf8())) |
| expect_data_frame(batch1, df) |
| |
| batch3 <- record_batch(df, z = 1:10) |
| batch4 <- record_batch(!!!df, z = 1:10) |
| expect_equal(batch3, batch4) |
| expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32())) |
| expect_equivalent(as.data.frame(batch3), cbind(df, data.frame(z = 1:10))) |
| |
| s <- schema(x = float64(), y = utf8()) |
| batch5 <- record_batch(df, schema = s) |
| batch6 <- record_batch(!!!df, schema = s) |
| expect_equal(batch5, batch6) |
| expect_equal(batch5$schema, s) |
| expect_equivalent(as.data.frame(batch5), df) |
| |
| s2 <- schema(x = float64(), y = utf8(), z = int16()) |
| batch7 <- record_batch(df, z = 1:10, schema = s2) |
| batch8 <- record_batch(!!!df, z = 1:10, schema = s2) |
| expect_equal(batch7, batch8) |
| expect_equal(batch7$schema, s2) |
| expect_equivalent(as.data.frame(batch7), cbind(df, data.frame(z = 1:10))) |
| }) |
| |
| test_that("record_batch() only auto splice data frames", { |
| expect_error( |
| record_batch(1:10), |
| regexp = "only data frames are allowed as unnamed arguments to be auto spliced" |
| ) |
| }) |
| |
| test_that("record_batch() handles null type (ARROW-7064)", { |
| batch <- record_batch(a = 1:10, n = vctrs::unspecified(10)) |
| expect_equivalent(batch$schema, schema(a = int32(), n = null())) |
| }) |
| |
| test_that("record_batch() scalar recycling", { |
| skip("Not implemented (ARROW-11705)") |
| expect_data_frame( |
| record_batch(a = 1:10, b = 5), |
| tibble::tibble(a = 1:10, b = 5) |
| ) |
| }) |
| |
| test_that("RecordBatch$Equals", { |
| df <- tibble::tibble(x = 1:10, y = letters[1:10]) |
| a <- record_batch(df) |
| b <- record_batch(df) |
| expect_equal(a, b) |
| expect_true(a$Equals(b)) |
| expect_false(a$Equals(df)) |
| }) |
| |
| test_that("RecordBatch$Equals(check_metadata)", { |
| df <- tibble::tibble(x = 1:2, y = c("a", "b")) |
| rb1 <- record_batch(df) |
| rb2 <- record_batch(df, schema = rb1$schema$WithMetadata(list(some="metadata"))) |
| |
| expect_r6_class(rb1, "RecordBatch") |
| expect_r6_class(rb2, "RecordBatch") |
| expect_false(rb1$schema$HasMetadata) |
| expect_true(rb2$schema$HasMetadata) |
| expect_identical(rb2$schema$metadata, list(some = "metadata")) |
| |
| expect_true(rb1 == rb2) |
| expect_true(rb1$Equals(rb2)) |
| expect_false(rb1$Equals(rb2, check_metadata = TRUE)) |
| |
| expect_failure(expect_equal(rb1, rb2)) # expect_equal has check_metadata=TRUE |
| expect_equivalent(rb1, rb2) # expect_equivalent has check_metadata=FALSE |
| |
| expect_false(rb1$Equals(24)) # Not a RecordBatch |
| }) |
| |
| test_that("RecordBatch name assignment", { |
| rb <- record_batch(x = 1:10, y = 1:10) |
| expect_identical(names(rb), c("x", "y")) |
| names(rb) <- c("a", "b") |
| expect_identical(names(rb), c("a", "b")) |
| expect_error(names(rb) <- "f") |
| expect_error(names(rb) <- letters) |
| expect_error(names(rb) <- character(0)) |
| expect_error(names(rb) <- NULL) |
| expect_error(names(rb) <- c(TRUE, FALSE)) |
| }) |
| |
| test_that("record_batch() with different length arrays", { |
| msg <- "All arrays must have the same length" |
| expect_error(record_batch(a=1:5, b = 42), msg) |
| expect_error(record_batch(a=1:5, b = 1:6), msg) |
| }) |
| |
| test_that("Handling string data with embedded nuls", { |
| raws <- structure(list( |
| as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), |
| as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), |
| as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 |
| as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), |
| as.raw(c(0x74, 0x76))), |
| class = c("arrow_binary", "vctrs_vctr", "list")) |
| batch_with_nul <- record_batch(a = 1:5, b = raws) |
| batch_with_nul$b <- batch_with_nul$b$cast(utf8()) |
| expect_error( |
| as.data.frame(batch_with_nul), |
| "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)", |
| fixed = TRUE |
| ) |
| |
| withr::with_options(list(arrow.skip_nul = TRUE), { |
| expect_warning( |
| expect_equivalent( |
| as.data.frame(batch_with_nul)$b, |
| c("person", "woman", "man", "camera", "tv") |
| ), |
| "Stripping '\\0' (nul) from character vector", |
| fixed = TRUE |
| ) |
| }) |
| }) |