blob: c4ab042380f6afe054b565eb26d3860c87d0a597 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
library(dplyr)
library(stringr)
tbl <- example_data
# Add some better string data
tbl$verses <- verses[[1]]
# c(" a ", " b ", " c ", ...) increasing padding
# nchar = 3 5 7 9 11 13 15 17 19 21
tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both")
tbl$some_negative <- tbl$int * (-1)^(1:nrow(tbl))
test_that("filter() on is.na()", {
expect_dplyr_equal(
input %>%
filter(is.na(lgl)) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
})
test_that("filter() with NAs in selection", {
expect_dplyr_equal(
input %>%
filter(lgl) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
})
test_that("Filter returning an empty Table should not segfault (ARROW-8354)", {
expect_dplyr_equal(
input %>%
filter(false) %>%
select(chr, int, lgl) %>%
collect(),
tbl
)
})
test_that("filtering with expression", {
char_sym <- "b"
expect_dplyr_equal(
input %>%
filter(chr == char_sym) %>%
select(string = chr, int) %>%
collect(),
tbl
)
})
test_that("filtering with arithmetic", {
expect_dplyr_equal(
input %>%
filter(dbl + 1 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(dbl / 2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(dbl / 2L > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(int / 2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(int / 2L > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(dbl %/% 2 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
})
test_that("filtering with expression + autocasting", {
expect_dplyr_equal(
input %>%
filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L
select(string = chr, int, dbl) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(int + 1 > 3) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl
)
})
test_that("More complex select/filter", {
expect_dplyr_equal(
input %>%
filter(dbl > 2, chr == "d" | chr == "f") %>%
select(chr, int, lgl) %>%
filter(int < 5) %>%
select(int, chr) %>%
collect(),
tbl
)
})
test_that("filter() with %in%", {
expect_dplyr_equal(
input %>%
filter(dbl > 2, chr %in% c("d", "f")) %>%
collect(),
tbl
)
})
test_that("Negative scalar values", {
expect_dplyr_equal(
input %>%
filter(some_negative > -2) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(some_negative %in% -1) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(int == -some_negative) %>%
collect(),
tbl
)
})
test_that("filter() with between()", {
expect_dplyr_equal(
input %>%
filter(between(dbl, 1, 2)) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(between(dbl, 0.5, 2)) %>%
collect(),
tbl
)
expect_identical(
tbl %>%
record_batch() %>%
filter(between(dbl, int, dbl2)) %>%
collect(),
tbl %>%
filter(dbl >= int, dbl <= dbl2)
)
expect_error(
tbl %>%
record_batch() %>%
filter(between(dbl, 1, "2")) %>%
collect()
)
expect_error(
tbl %>%
record_batch() %>%
filter(between(dbl, 1, NA)) %>%
collect()
)
expect_error(
tbl %>%
record_batch() %>%
filter(between(chr, 1, 2)) %>%
collect()
)
})
test_that("filter() with string ops", {
skip_if_not_available("utf8proc")
skip_if(getRversion() < "3.4.0", "R < 3.4")
# Extra instrumentation to ensure that we're calling Arrow compute here
# because many base R string functions implicitly call as.character,
# which means they still work on Arrays but actually force data into R
# 1) wrapper that raises a warning if as.character is called. Can't wrap
# the whole test because as.character apparently gets called in other
# (presumably legitimate) places
# 2) Wrap the test in expect_warning(expr, NA) to catch the warning
with_no_as_character <- function(expr) {
trace(
"as.character",
tracer = quote(warning("as.character was called")),
print = FALSE,
where = toupper
)
on.exit(untrace("as.character", where = toupper))
force(expr)
}
expect_warning(
expect_dplyr_equal(
input %>%
filter(dbl > 2, with_no_as_character(toupper(chr)) %in% c("D", "F")) %>%
collect(),
tbl
),
NA)
expect_dplyr_equal(
input %>%
filter(dbl > 2, str_length(verses) > 25) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(dbl > 2, str_length(str_trim(padded_strings, "left")) > 5) %>%
collect(),
tbl
)
})
test_that("filter environment scope", {
# "object 'b_var' not found"
expect_dplyr_error(input %>% filter(chr == b_var), tbl)
b_var <- "b"
expect_dplyr_equal(
input %>%
filter(chr == b_var) %>%
collect(),
tbl
)
# Also for functions
# 'could not find function "isEqualTo"' because we haven't defined it yet
expect_dplyr_error(input %>% filter(isEqualTo(int, 4)), tbl)
skip("Need to substitute in user defined function too")
# TODO: fix this: this isEqualTo function is eagerly evaluating; it should
# instead yield array_expressions. Probably bc the parent env of the function
# has the Ops.Array methods defined; we need to move it so that the parent
# env is the data mask we use in the dplyr eval
isEqualTo <- function(x, y) x == y & !is.na(x)
expect_dplyr_equal(
input %>%
select(-fct) %>% # factor levels aren't identical
filter(isEqualTo(int, 4)) %>%
collect(),
tbl
)
})
test_that("Filtering on a column that doesn't exist errors correctly", {
with_language("fr", {
# expect_warning(., NA) because the usual behavior when it hits a filter
# that it can't evaluate is to raise a warning, collect() to R, and retry
# the filter. But we want this to error the first time because it's
# a user error, not solvable by retrying in R
expect_warning(
expect_error(
tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
"objet 'not_a_col' introuvable"
),
NA
)
})
with_language("en", {
expect_warning(
expect_error(
tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
"object 'not_a_col' not found"
),
NA
)
})
})
test_that("Filtering with a function that doesn't have an Array/expr method still works", {
expect_warning(
expect_dplyr_equal(
input %>%
filter(int > 2, pnorm(dbl) > .99) %>%
collect(),
tbl
),
'Filter expression not implemented in Arrow: pnorm(dbl) > 0.99; pulling data into R',
fixed = TRUE
)
})
test_that("Calling Arrow compute functions 'directly'", {
expect_equal(
tbl %>%
record_batch() %>%
filter(arrow_add(dbl, 1) > 3L) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl %>%
filter(dbl + 1 > 3L) %>%
select(string = chr, int, dbl)
)
expect_dplyr_equal(
tbl %>%
record_batch() %>%
filter(arrow_greater(arrow_add(dbl, 1), 3L)) %>%
select(string = chr, int, dbl) %>%
collect(),
tbl %>%
filter(dbl + 1 > 3L) %>%
select(string = chr, int, dbl)
)
})
test_that("filter() with .data pronoun", {
expect_dplyr_equal(
input %>%
filter(.data$dbl > 4) %>%
select(.data$chr, .data$int, .data$lgl) %>%
collect(),
tbl
)
expect_dplyr_equal(
input %>%
filter(is.na(.data$lgl)) %>%
select(.data$chr, .data$int, .data$lgl) %>%
collect(),
tbl
)
# and the .env pronoun too!
chr <- 4
expect_dplyr_equal(
input %>%
filter(.data$dbl > .env$chr) %>%
select(.data$chr, .data$int, .data$lgl) %>%
collect(),
tbl
)
skip("test now faulty - code no longer gives error & outputs a empty tibble")
# but there is an error if we don't override the masking with `.env`
expect_dplyr_error(
input %>%
filter(.data$dbl > chr) %>%
select(.data$chr, .data$int, .data$lgl) %>%
collect(),
tbl
)
})