blob: d7df83cc7a64fbd2fa3cb44bfbc8abe0cbbea64d [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Skip what follows when "utf8proc" is not available.
# NOTE(review): skip_if_not_available() is defined outside this file;
# presumably it checks an optional feature of the Arrow build -- confirm.
skip_if_not_available("utf8proc")
# dplyr provides the verbs (filter, mutate, transmute, collect) and stringr
# the str_*() functions and pattern modifiers exercised by the tests below.
library(dplyr)
library(stringr)
test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
  # Case-sensitive literal match: only "Foo" contains a lowercase "o".
  strings <- tibble(x = c("Foo", "bar"))

  expect_dplyr_equal(
    input %>%
      filter(grepl("o", x, fixed = TRUE)) %>%
      collect(),
    strings
  )
})
test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", {
  strings <- tibble(x = c("Foo", "bar"))

  # sub(): literal replacement of the first occurrence
  expect_dplyr_equal(
    input %>%
      transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>%
      collect(),
    strings
  )

  # gsub(): literal replacement of every occurrence
  expect_dplyr_equal(
    input %>%
      transmute(x = gsub("o", "u", x, fixed = TRUE)) %>%
      collect(),
    strings
  )
})
# Many of the remaining tests require RE2, so skip what follows when "re2"
# is not available.
skip_if_not_available("re2")
test_that("grepl", {
  strings <- tibble(x = c("Foo", "bar"))

  # Run every expectation once with literal matching and once with regex
  # matching.
  for (fixed in c(TRUE, FALSE)) {
    # ignore.case left at its default
    expect_dplyr_equal(
      input %>%
        filter(grepl("Foo", x, fixed = fixed)) %>%
        collect(),
      strings
    )

    # result used as a column value rather than as a filter predicate
    expect_dplyr_equal(
      input %>%
        transmute(
          x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)
        ) %>%
        collect(),
      strings
    )

    # ignore.case spelled out explicitly
    expect_dplyr_equal(
      input %>%
        filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      strings
    )
  }
})
test_that("grepl with ignore.case = TRUE and fixed = TRUE", {
  df <- tibble(x = c("Foo", "bar"))

  # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE,
  # so we can't use expect_dplyr_equal() for these tests

  # "O" matches the lowercase "o" in "Foo" when case is ignored
  expect_equal(
    df %>%
      Table$create() %>%
      filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = "Foo")
  )

  # With fixed = TRUE the pattern "^B.+" is a literal string, so no row
  # matches. The predicate is passed unnamed because dplyr::filter() does not
  # accept named inputs.
  expect_equal(
    df %>%
      Table$create() %>%
      filter(grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = character(0))
  )
})
test_that("str_detect", {
  strings <- tibble(x = c("Foo", "bar"))

  # regex(): case-sensitive anchor match
  expect_dplyr_equal(
    input %>%
      filter(str_detect(x, regex("^F"))) %>%
      collect(),
    strings
  )

  # regex(): case-insensitive match, plain and negated
  expect_dplyr_equal(
    input %>%
      transmute(
        x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))
      ) %>%
      collect(),
    strings
  )
  expect_dplyr_equal(
    input %>%
      transmute(
        x = str_detect(
          x,
          regex("^f[A-Z]{2}", ignore_case = TRUE),
          negate = TRUE
        )
      ) %>%
      collect(),
    strings
  )

  # fixed(): case-sensitive literal match, lowercase then uppercase pattern
  expect_dplyr_equal(
    input %>%
      filter(str_detect(x, fixed("o"))) %>%
      collect(),
    strings
  )
  expect_dplyr_equal(
    input %>%
      filter(str_detect(x, fixed("O"))) %>%
      collect(),
    strings
  )

  # fixed(): case-insensitive literal match, plain and negated
  expect_dplyr_equal(
    input %>%
      filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>%
      collect(),
    strings
  )
  expect_dplyr_equal(
    input %>%
      filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>%
      collect(),
    strings
  )
})
test_that("sub and gsub", {
  df <- tibble(x = c("Foo", "bar"))

  # Run every expectation once with literal matching and once with regex
  # matching.
  for (fixed in c(TRUE, FALSE)) {
    # sub() with ignore.case left at its default
    expect_dplyr_equal(
      input %>%
        transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>%
        collect(),
      df
    )
    # sub() with a pattern that contains regex metacharacters
    expect_dplyr_equal(
      input %>%
        transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      df
    )
    # sub() with ignore.case spelled out explicitly
    expect_dplyr_equal(
      input %>%
        transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      df
    )
    # gsub() replaces every occurrence; without this expectation the test
    # would only exercise sub() despite its title
    expect_dplyr_equal(
      input %>%
        transmute(x = gsub("o", "u", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      df
    )
  }
})
test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", {
  strings <- tibble(x = c("Foo", "bar"))

  # expect_dplyr_equal() cannot be used here: with fixed = TRUE, base::sub()
  # and base::gsub() warn and ignore ignore.case = TRUE, so results are
  # compared against hand-written expectations instead.

  # sub(): only the first case-insensitive match is replaced
  first_replaced <- strings %>%
    Table$create() %>%
    transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
    collect()
  expect_equal(first_replaced, tibble(x = c("Fuo", "bar")))

  # gsub(): every match is replaced
  all_replaced <- strings %>%
    Table$create() %>%
    transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
    collect()
  expect_equal(all_replaced, tibble(x = c("Fuu", "bar")))

  # a regex-looking pattern is taken literally, so the data is unchanged
  literal_pattern <- strings %>%
    Table$create() %>%
    transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>%
    collect()
  expect_equal(literal_pattern, strings)
})
test_that("str_replace and str_replace_all", {
  strings <- tibble(x = c("Foo", "bar"))

  # bare string pattern vs. the same pattern wrapped in regex()
  expect_dplyr_equal(
    input %>%
      transmute(x = str_replace_all(x, "^F", "baz")) %>%
      collect(),
    strings
  )
  expect_dplyr_equal(
    input %>%
      transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
      collect(),
    strings
  )

  # str_replace() with a regex, case-sensitive then case-insensitive
  expect_dplyr_equal(
    input %>%
      mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
      collect(),
    strings
  )
  expect_dplyr_equal(
    input %>%
      transmute(
        x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")
      ) %>%
      collect(),
    strings
  )

  # fixed() patterns: lowercase, uppercase, and uppercase ignoring case
  expect_dplyr_equal(
    input %>%
      transmute(x = str_replace_all(x, fixed("o"), "u")) %>%
      collect(),
    strings
  )
  expect_dplyr_equal(
    input %>%
      transmute(x = str_replace(x, fixed("O"), "u")) %>%
      collect(),
    strings
  )
  expect_dplyr_equal(
    input %>%
      transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>%
      collect(),
    strings
  )
})
test_that("strsplit and str_split", {
  phrases <- tibble(x = c("Foo and bar", "baz and qux and quux"))

  # base::strsplit(): plain separator, then a literal (fixed = TRUE) pattern
  # that contains regex metacharacters
  expect_dplyr_equal(
    input %>%
      mutate(x = strsplit(x, "and")) %>%
      collect(),
    phrases
  )
  expect_dplyr_equal(
    input %>%
      mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
      collect(),
    phrases
  )

  # stringr::str_split(): without and with a limit on the number of pieces
  expect_dplyr_equal(
    input %>%
      mutate(x = str_split(x, "and")) %>%
      collect(),
    phrases
  )
  expect_dplyr_equal(
    input %>%
      mutate(x = str_split(x, "and", n = 2)) %>%
      collect(),
    phrases
  )

  # stringr::str_split() with explicit fixed() and regex() modifiers
  expect_dplyr_equal(
    input %>%
      mutate(x = str_split(x, fixed("and"), n = 2)) %>%
      collect(),
    phrases
  )
  expect_dplyr_equal(
    input %>%
      mutate(x = str_split(x, regex("and"), n = 2)) %>%
      collect(),
    phrases
  )
})
test_that("arrow_*_split_whitespace functions", {
  # input separated by ASCII whitespace only
  ascii_input <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
  # input separated by non-ASCII whitespace only
  utf8_input <- tibble(
    x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux")
  )
  # both inputs are expected to split into the same words
  expected_words <- tibble(
    x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))
  )

  # default option values
  ascii_default <- ascii_input %>%
    Table$create() %>%
    mutate(x = arrow_ascii_split_whitespace(x)) %>%
    collect()
  expect_equivalent(ascii_default, expected_words)

  utf8_default <- utf8_input %>%
    Table$create() %>%
    mutate(x = arrow_utf8_split_whitespace(x)) %>%
    collect()
  expect_equivalent(utf8_default, expected_words)

  # non-default option values: a single split, taken from the end
  ascii_reverse <- ascii_input %>%
    Table$create() %>%
    mutate(
      x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
    ) %>%
    collect()
  expect_equivalent(
    ascii_reverse,
    tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux")))
  )

  utf8_reverse <- utf8_input %>%
    Table$create() %>%
    mutate(
      x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
    ) %>%
    collect()
  expect_equivalent(
    utf8_reverse,
    tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux")))
  )
})
test_that("errors and warnings in string splitting", {
  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))

  # These conditions generate an error, but abandon_ship() catches the error,
  # issues a warning, and pulls the data into R

  # regex pattern passed to base::strsplit()
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = strsplit(x, "and.*", fixed = FALSE)) %>%
      collect(),
    regexp = "not supported"
  )
  # regex pattern passed to str_split() as a bare string; the message is
  # pinned like the sibling expectations so an unrelated warning cannot
  # satisfy this check
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_split(x, "and.?")) %>%
      collect(),
    regexp = "not supported"
  )
  # regex pattern passed to str_split() through regex()
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_split(x, regex("and.?"), n = 2)) %>%
      collect(),
    regexp = "not supported"
  )
  # case-insensitive fixed pattern
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_split(x, fixed("and", ignore_case = TRUE))) %>%
      collect(),
    regexp = "not supported"
  )
  # coll() pattern modifier
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_split(x, coll("and.?"))) %>%
      collect(),
    regexp = "not supported"
  )
  # boundary() pattern modifier
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_split(x, boundary(type = "word"))) %>%
      collect(),
    regexp = "not supported"
  )
  # n = 0
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_split(x, "and", n = 0)) %>%
      collect(),
    regexp = "not supported"
  )

  # This condition generates a warning
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_split(x, fixed("and"), simplify = TRUE)) %>%
      collect(),
    regexp = "ignored"
  )
})
test_that("errors and warnings in string detection and replacement", {
  df <- tibble(x = c("Foo", "bar"))

  # These conditions generate an error, but abandon_ship() catches the error,
  # issues a warning, and pulls the data into R

  # boundary() pattern modifier in a filter
  expect_warning(
    df %>%
      Table$create() %>%
      filter(str_detect(x, boundary(type = "character"))) %>%
      collect(),
    regexp = "not implemented"
  )
  # coll() pattern modifier in a replacement; the replacement is "o" with an
  # acute accent, written as an escape so it survives any file encoding
  expect_warning(
    df %>%
      Table$create() %>%
      mutate(x = str_replace_all(x, coll("o", locale = "en"), "\u00f3")) %>%
      collect(),
    regexp = "not supported"
  )

  # This condition generates a warning
  # (the expectation is checked without calling collect())
  expect_warning(
    df %>%
      Table$create() %>%
      transmute(x = str_replace_all(x, regex("o", multiline = TRUE), "u")),
    "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
  )
})
test_that("backreferences in pattern in string detection", {
  # Skipped unconditionally; the expectation below is kept for reference.
  skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")

  strings <- tibble(x = c("Foo", "bar"))

  # "\\1" refers back to the vowel captured by the first group
  expect_dplyr_equal(
    input %>%
      filter(str_detect(x, regex("F([aeiou])\\1"))) %>%
      collect(),
    strings
  )
})
test_that("backreferences (substitutions) in string replacement", {
  df <- tibble(x = c("Foo", "bar"))

  # base::sub(): both capture groups are referenced in the replacement, in
  # reverse order
  expect_dplyr_equal(
    input %>%
      transmute(
        desc = sub(
          "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?",
          "path `\\2` on server `\\1`",
          url
        )
      ) %>%
      collect(),
    tibble(url = "https://arrow.apache.org/docs/r/")
  )

  # stringr::str_replace() with a bare string pattern
  expect_dplyr_equal(
    input %>%
      transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>%
      collect(),
    df
  )

  # stringr::str_replace() with a case-insensitive regex() pattern
  # (this expectation used to appear twice verbatim; one copy is enough)
  expect_dplyr_equal(
    input %>%
      transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
      collect(),
    df
  )
})
test_that("edge cases in string detection and replacement", {
  # Case-insensitive fixed match/replace: "\\E" in the search string and
  # backslashes in the replacement string must be interpreted literally.
  # expect_dplyr_equal() is not used for these two checks because
  # base::sub() and base::grepl() do not support ignore.case = TRUE when
  # fixed = TRUE.
  backslashes <- tibble(x = "\\Q\\e\\D")

  detected <- backslashes %>%
    Table$create() %>%
    filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>%
    collect()
  expect_equal(detected, tibble(x = "\\Q\\e\\D"))

  replaced <- backslashes %>%
    Table$create() %>%
    transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>%
    collect()
  expect_equal(replaced, tibble(x = "\\Q\\L\\D"))

  # A "(?i)" prefix supplied by the user must not break the "(?i)" prefix
  # that is added for case-insensitive regex match/replace.
  expect_dplyr_equal(
    input %>%
      filter(
        grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)
      ) %>%
      collect(),
    tibble(x = "ABC")
  )
  expect_dplyr_equal(
    input %>%
      transmute(
        x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)
      ) %>%
      collect(),
    tibble(x = "ABC")
  )
})