blob: e72067a6d5fac28f30caf1f88d6f7d1adcdbb87f [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
context("ChunkedArray")
expect_chunked_roundtrip <- function(x, type) {
a <- ChunkedArray$create(!!!x)
flat_x <- unlist(x, recursive = FALSE)
attributes(flat_x) <- attributes(x[[1]])
expect_type_equal(a$type, type)
expect_identical(a$num_chunks, length(x))
expect_identical(length(a), length(flat_x))
if (!inherits(type, "ListType")) {
# TODO: revisit how missingness works with ListArrays
# R list objects don't handle missingness the same way as other vectors.
# Is there some vctrs thing we should do on the roundtrip back to R?
expect_identical(as.vector(is.na(a)), is.na(flat_x))
}
expect_equal(as.vector(a), flat_x)
expect_equal(as.vector(a$chunk(0)), x[[1]])
if (length(flat_x)) {
a_sliced <- a$Slice(1)
x_sliced <- flat_x[-1]
expect_type_equal(a_sliced$type, type)
expect_identical(length(a_sliced), length(x_sliced))
if (!inherits(type, "ListType")) {
expect_identical(as.vector(is.na(a_sliced)), is.na(x_sliced))
}
expect_equal(as.vector(a_sliced), x_sliced)
}
invisible(a)
}
test_that("ChunkedArray", {
x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32())
y <- x$Slice(8)
expect_equal(y$type, int32())
expect_equal(y$num_chunks, 3L)
expect_equal(length(y), 17L)
expect_equal(as.vector(y), c(9:10, 1:10, 1:5))
z <- x$Slice(8, 5)
expect_equal(z$type, int32())
expect_equal(z$num_chunks, 2L)
expect_equal(z$length(), 5L)
expect_equal(z$as_vector(), c(9:10, 1:3))
expect_chunked_roundtrip(list(c(1,2,3), c(4,5,6)), float64())
# input validation
expect_error(x$chunk(14), "subscript out of bounds")
expect_error(x$chunk("one"))
expect_error(x$chunk(NA_integer_), "'i' cannot be NA")
expect_error(x$chunk(-1), "subscript out of bounds")
expect_error(x$Slice("ten"))
expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA")
expect_error(x$Slice(NA), "Slice 'offset' cannot be NA")
expect_error(x$Slice(10, "ten"))
expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
expect_error(x$Slice(c(10, 10)))
expect_error(x$Slice(10, c(10, 10)))
expect_error(x$Slice(1000), "Slice 'offset' greater than array length")
expect_error(x$Slice(-1), "Slice 'offset' cannot be negative")
expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length")
expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative")
expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative")
expect_warning(x$Slice(10, 15), NA)
expect_warning(
overslice <- x$Slice(10, 16),
"Slice 'length' greater than available length"
)
expect_equal(length(overslice), 15)
expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
})
test_that("print ChunkedArray", {
verify_output(test_path("test-chunked-array.txt"), {
chunked_array(c(1,2,3), c(4,5,6))
chunked_array(1:30, c(4,5,6))
chunked_array(1:30)
chunked_array(factor(c("a", "b")), factor(c("c", "d")))
})
})
test_that("ChunkedArray handles !!! splicing", {
data <- list(1, 2, 3)
x <- chunked_array(!!!data)
expect_equal(x$type, float64())
expect_equal(x$num_chunks, 3L)
})
test_that("ChunkedArray handles NA", {
data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L))
x <- chunked_array(!!!data)
expect_equal(x$type, int32())
expect_equal(x$num_chunks, 3L)
expect_equal(length(x), 25L)
expect_equal(as.vector(x), c(1:10, c(NA, 2:10), c(1:3, NA, 5)))
chunks <- x$chunks
expect_equal(as.vector(is.na(chunks[[2]])), is.na(data[[2]]))
expect_equal(as.vector(is.na(x)), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]])))
})
test_that("ChunkedArray handles NaN", {
data <- list(as.numeric(1:10), c(NaN, 2:10), c(1:3, NaN, 5L))
x <- chunked_array(!!!data)
expect_equal(x$type, float64())
expect_equal(x$num_chunks, 3L)
expect_equal(length(x), 25L)
expect_equal(as.vector(x), c(1:10, c(NaN, 2:10), c(1:3, NaN, 5)))
chunks <- x$chunks
expect_equal(as.vector(is.nan(chunks[[2]])), is.nan(data[[2]]))
expect_equal(as.vector(is.nan(x)), c(is.nan(data[[1]]), is.nan(data[[2]]), is.nan(data[[3]])))
})
test_that("ChunkedArray supports logical vectors (ARROW-3341)", {
# with NA
data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE))
expect_chunked_roundtrip(data, bool())
# without NA
data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE))
expect_chunked_roundtrip(data, bool())
})
test_that("ChunkedArray supports character vectors (ARROW-3339)", {
data <- list(
c("itsy", NA, "spider"),
c("Climbed", "up", "the", "water", "spout"),
c("Down", "came", "the", "rain"),
"And washed the spider out. "
)
expect_chunked_roundtrip(data, utf8())
})
test_that("ChunkedArray supports factors (ARROW-3716)", {
f <- factor(c("itsy", "bitsy", "spider", "spider"))
expect_chunked_roundtrip(list(f, f, f), dictionary(int8()))
})
test_that("ChunkedArray supports dates (ARROW-3716)", {
d <- Sys.Date() + 1:10
expect_chunked_roundtrip(list(d, d), date32())
})
test_that("ChunkedArray supports POSIXct (ARROW-3716)", {
times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC"))
})
test_that("ChunkedArray supports integer64 (ARROW-3716)", {
x <- bit64::as.integer64(1:10) + MAX_INT
expect_chunked_roundtrip(list(x, x), int64())
# Also with a first chunk that would downcast
zero <- Array$create(0L)$cast(int64())
expect_type_equal(zero, int64())
ca <- ChunkedArray$create(zero, x)
expect_type_equal(ca, int64())
expect_s3_class(as.vector(ca), "integer64")
expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x))
})
test_that("ChunkedArray supports difftime", {
time <- hms::hms(56, 34, 12)
expect_chunked_roundtrip(list(time, time), time32("s"))
})
test_that("integer types casts for ChunkedArray (ARROW-3741)", {
int_types <- c(int8(), int16(), int32(), int64())
uint_types <- c(uint8(), uint16(), uint32(), uint64())
float_types <- c(float32(), float64()) # float16() not really supported in C++ yet
all_types <- c(
int_types,
uint_types,
float_types
)
a <- chunked_array(1:10, 1:10)
for (type in c(int_types, uint_types)) {
casted <- a$cast(type)
expect_r6_class(casted, "ChunkedArray")
expect_type_equal(casted$type, type)
}
# Also test casting to double(), not actually a type, a base R function but should be alias for float64
dbl <- a$cast(double())
expect_r6_class(dbl, "ChunkedArray")
expect_type_equal(dbl$type, float64())
})
test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", {
num_int32 <- 12L
num_int64 <- bit64::as.integer64(10)
for (type in all_types) {
expect_type_equal(chunked_array(num_int32, type = type)$type, type)
expect_type_equal(chunked_array(num_int64, type = type)$type, type)
}
# also test creating with double() "type"
expect_type_equal(chunked_array(num_int32, type = double())$type, float64())
})
test_that("ChunkedArray$create() aborts on overflow", {
expect_error(chunked_array(128L, type = int8())$type)
expect_error(chunked_array(-129L, type = int8())$type)
expect_error(chunked_array(256L, type = uint8())$type)
expect_error(chunked_array(-1L, type = uint8())$type)
expect_error(chunked_array(32768L, type = int16())$type)
expect_error(chunked_array(-32769L, type = int16())$type)
expect_error(chunked_array(65536L, type = uint16())$type)
expect_error(chunked_array(-1L, type = uint16())$type)
expect_error(chunked_array(65536L, type = uint16())$type)
expect_error(chunked_array(-1L, type = uint16())$type)
expect_error(chunked_array(bit64::as.integer64(2^31), type = int32()))
expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32()))
})
test_that("chunked_array() convert doubles to integers", {
for (type in c(int_types, uint_types)) {
a <- chunked_array(10, type = type)
expect_type_equal(a$type, type)
if (type != uint64()) {
# exception for unsigned integer 64 that
# wa cannot handle yet
expect_true(as.vector(a) == 10)
}
}
})
test_that("chunked_array() uses the first ... to infer type", {
a <- chunked_array(10, 10L)
expect_type_equal(a$type, float64())
})
test_that("chunked_array() handles downcasting", {
a <- chunked_array(10L, 10)
expect_type_equal(a$type, int32())
expect_equal(as.vector(a), c(10L, 10L))
})
test_that("chunked_array() makes chunks of the same type", {
a <- chunked_array(10L, bit64::as.integer64(13), type = int64())
for(chunk in a$chunks) {
expect_type_equal(chunk$type, int64())
}
})
test_that("chunked_array() handles 0 chunks if given a type", {
for (type in all_types) {
a <- chunked_array(type = type)
expect_type_equal(a$type, as_type(type))
expect_equal(length(a), 0L)
}
})
test_that("chunked_array() can ingest arrays (ARROW-3815)", {
expect_equal(
as.vector(chunked_array(1:5, Array$create(6:10))),
1:10
)
})
test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", {
df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
a <- chunked_array(df, df)
expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8()))
expect_equivalent(a$as_vector(), rbind(df, df))
})
test_that("ChunkedArray$View() (ARROW-6542)", {
a <- ChunkedArray$create(1:3, 1:4)
b <- a$View(float32())
expect_equal(b$type, float32())
expect_equal(length(b), 7L)
expect_true(all(
sapply(b$chunks, function(.x) .x$type == float32())
))
# Input validation
expect_error(a$View("not a type"), "type must be a DataType, not character")
})
test_that("ChunkedArray$Validate()", {
a <- ChunkedArray$create(1:10)
expect_error(a$Validate(), NA)
})
test_that("[ ChunkedArray", {
one_chunk <- chunked_array(2:11)
x <- chunked_array(1:10, 31:40, 51:55)
# Slice
expect_vector(x[8:12], c(8:10, 31:32))
# Take from same chunk
expect_vector(x[c(11, 15, 12)], c(31, 35, 32))
# Take from multiple chunks (calls Concatenate)
expect_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3))
# Take with Array (note these are 0-based)
take1 <- Array$create(c(10L, 14L, 11L))
expect_vector(x[take1], c(31, 35, 32))
# Take with ChunkedArray
take2 <- ChunkedArray$create(c(10L, 14L), 11L)
expect_vector(x[take2], c(31, 35, 32))
# Filter (with recycling)
expect_vector(
one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
c(3, 6, 8, 11)
)
# Filter where both are 1-chunk
expect_vector(
one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))],
c(3, 6, 8, 11)
)
# Filter multi-chunk with logical (-> Array)
expect_vector(
x[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55)
)
# Filter with a chunked array with different sized chunks
p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE)
p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)
filt <- ChunkedArray$create(p1, p2, p2)
expect_vector(
x[filt],
c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55)
)
})
test_that("ChunkedArray head/tail", {
vec <- 11:20
a <- ChunkedArray$create(11:15, 16:20)
expect_vector(head(a), head(vec))
expect_vector(head(a, 4), head(vec, 4))
expect_vector(head(a, 40), head(vec, 40))
expect_vector(head(a, -4), head(vec, -4))
expect_vector(head(a, -40), head(vec, -40))
expect_vector(tail(a), tail(vec))
expect_vector(tail(a, 4), tail(vec, 4))
expect_vector(tail(a, 40), tail(vec, 40))
expect_vector(tail(a, -40), tail(vec, -40))
})
test_that("ChunkedArray$Equals", {
vec <- 11:20
a <- ChunkedArray$create(vec[1:5], vec[6:10])
b <- ChunkedArray$create(vec[1:5], vec[6:10])
expect_equal(a, b)
expect_true(a$Equals(b))
expect_false(a$Equals(vec))
})
test_that("Converting a chunked array unifies factors (ARROW-8374)", {
f1 <- factor(c("a"), levels = c("a", "b"))
f2 <- factor(c("c"), levels = c("c", "d"))
f3 <- factor(NA, levels = "a")
f4 <- factor()
res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))
ca <- ChunkedArray$create(f1, f2, f3, f4)
expect_identical(ca$as_vector(), res)
})
test_that("Handling string data with embedded nuls", {
raws <- structure(list(
as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
as.raw(c(0x74, 0x76))),
class = c("arrow_binary", "vctrs_vctr", "list"))
chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8())
expect_error(
as.vector(chunked_array_with_nul),
"embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)",
fixed = TRUE
)
withr::with_options(list(arrow.skip_nul = TRUE), {
expect_warning(
expect_identical(
as.vector(chunked_array_with_nul),
c("person", "woman", "man", "fan", "camera", "tv")
),
"Stripping '\\0' (nul) from character vector",
fixed = TRUE
)
})
})