| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| skip_if_not_available("utf8proc") |
| skip_if_not_available("acero") |
| skip_on_cran() |
| |
| library(dplyr, warn.conflicts = FALSE) |
| library(lubridate) |
| library(stringr) |
| library(stringi) |
| |
| tbl <- example_data |
| # Add some better string data |
| tbl$verses <- verses[[1]] |
| # c(" a ", " b ", " c ", ...) increasing padding |
| # nchar = 3 5 7 9 11 13 15 17 19 21 |
| tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") |
| tbl$some_grouping <- rep(c(1, 2), 5) |
| |
| test_that("paste, paste0, and str_c", { |
| df <- tibble( |
| v = c("A", "B", "C"), |
| w = c("a", "b", "c"), |
| x = c("d", NA_character_, "f"), |
| y = c(NA_character_, "h", "i"), |
| z = c(1.1, 2.2, NA) |
| ) |
| x <- Expression$field_ref("x") |
| y <- Expression$field_ref("y") |
| |
| # no NAs in data |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| a = paste(v, w), |
| a2 = base::paste(v, w) |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste(v, w, sep = "-")) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| a = paste0(v, w), |
| a2 = base::paste0(v, w) |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| a = str_c(v, w), |
| a2 = stringr::str_c(v, w) |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(str_c(v, w, sep = "+")) %>% |
| collect(), |
| df |
| ) |
| |
| # NAs in data |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste(x, y)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste(x, y, sep = "-")) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(str_c(x, y)) %>% |
| collect(), |
| df |
| ) |
| |
| # non-character column in dots |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste0(x, y, z)) %>% |
| collect(), |
| df |
| ) |
| |
| # literal string in dots |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste(x, "foo", y)) %>% |
| collect(), |
| df |
| ) |
| |
| # literal NA in dots |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste(x, NA, y)) %>% |
| collect(), |
| df |
| ) |
| |
| # expressions in dots |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste0(x, toupper(y), as.character(z))) %>% |
| collect(), |
| df |
| ) |
| |
| # sep is literal NA |
| # errors in paste() (consistent with base::paste()) |
| expect_error( |
| call_binding("paste", x, y, sep = NA_character_), |
| "Invalid separator" |
| ) |
| # In next release of stringr (late 2022), str_c also errors |
| expect_error( |
| call_binding("str_c", x, y, sep = NA_character_), |
| "`sep` must be a single string, not `NA`." |
| ) |
| |
| # sep passed in dots to paste0 (which doesn't take a sep argument) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(paste0(x, y, sep = "-")) %>% |
| collect(), |
| df |
| ) |
| |
| # known differences |
| |
| # arrow allows the separator to be an array |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| transmute(result = paste(x, y, sep = w)) %>% |
| collect(), |
| df %>% |
| transmute(result = paste(x, w, y, sep = "")) |
| ) |
| |
| # expected errors |
| |
| # collapse argument not supported |
| expect_error( |
| call_binding("paste", x, y, collapse = ""), |
| "collapse" |
| ) |
| expect_error( |
| call_binding("paste0", x, y, collapse = ""), |
| "collapse" |
| ) |
| expect_error( |
| call_binding("str_c", x, y, collapse = ""), |
| "collapse" |
| ) |
| |
| # literal vectors of length != 1 not supported |
| expect_error( |
| call_binding("paste", x, character(0), y), |
| "Literal vectors of length != 1 not supported in string concatenation" |
| ) |
| expect_error( |
| call_binding("paste", x, c(",", ";"), y), |
| "Literal vectors of length != 1 not supported in string concatenation" |
| ) |
| }) |
| |
| test_that("grepl with ignore.case = FALSE and fixed = TRUE", { |
| df <- tibble(x = c("Foo", "bar", NA_character_)) |
| compare_dplyr_binding( |
| .input %>% |
| filter(grepl("o", x, fixed = TRUE)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = grepl("o", x, fixed = TRUE)) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", { |
| df <- tibble(x = c("Foo", "bar")) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = gsub("o", "u", x, fixed = TRUE)) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| # many of the remainder of these tests require RE2 |
| skip_if_not_available("re2") |
| |
| test_that("grepl", { |
| df <- tibble(x = c("Foo", "bar", NA_character_)) |
| |
| for (fixed in c(TRUE, FALSE)) { |
| compare_dplyr_binding( |
| .input %>% |
| filter(grepl("Foo", x, fixed = fixed)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>% |
| collect(), |
| df |
| ) |
| # with namespacing |
| compare_dplyr_binding( |
| .input %>% |
| filter(base::grepl("Foo", x, fixed = fixed)) %>% |
| collect(), |
| df |
| ) |
| } |
| }) |
| |
| test_that("grepl with ignore.case = TRUE and fixed = TRUE", { |
| df <- tibble(x = c("Foo", "bar", NA_character_)) |
| |
| # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE, |
| # so we can't use compare_dplyr_binding() for these tests |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>% |
| collect(), |
| tibble(x = "Foo") |
| ) |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| filter(grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>% |
| collect(), |
| tibble(x = character(0)) |
| ) |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| mutate( |
| a = grepl("O", x, ignore.case = TRUE, fixed = TRUE) |
| ) %>% |
| collect(), |
| tibble( |
| x = c("Foo", "bar", NA_character_), |
| a = c(TRUE, FALSE, FALSE) |
| ) |
| ) |
| }) |
| |
| test_that("str_detect", { |
| df <- tibble(x = c("Foo", "bar", NA_character_)) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_detect(x, regex("^F"))) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| a = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE)), |
| a2 = stringr::str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE)) |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_detect(x, fixed("o"))) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_detect(x, fixed("O"))) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("sub and gsub", { |
| df <- tibble(x = c("Foo", "bar")) |
| |
| for (fixed in c(TRUE, FALSE)) { |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>% |
| collect(), |
| df |
| ) |
| } |
| }) |
| |
| test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", { |
| df <- tibble(x = c("Foo", "bar")) |
| |
| # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when |
| # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>% |
| collect(), |
| tibble(x = c("Fuo", "bar")) |
| ) |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>% |
| collect(), |
| tibble(x = c("Fuu", "bar")) |
| ) |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>% |
| collect(), |
| df # unchanged |
| ) |
| }) |
| |
| test_that("sub and gsub with namespacing", { |
| compare_dplyr_binding( |
| .input %>% |
| mutate(verses_new = base::gsub("o", "u", verses, fixed = TRUE)) %>% |
| collect(), |
| tbl |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(verses_new = base::sub("o", "u", verses, fixed = TRUE)) %>% |
| collect(), |
| tbl |
| ) |
| }) |
| |
| test_that("str_replace and str_replace_all", { |
| df <- tibble(x = c("Foo", "bar")) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_replace_all(x, "^F", "baz")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_replace_all(x, regex("^F"), "baz")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| x = str_replace_all(x, fixed("o"), "u"), |
| x2 = stringr::str_replace_all(x, fixed("o"), "u") |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| x = str_replace(x, fixed("O"), "u"), |
| x2 = stringr::str_replace(x, fixed("O"), "u") |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("strsplit and str_split", { |
| df <- tibble(x = c("Foo and bar", "baz and qux and quux")) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = strsplit(x, "and")) %>% |
| collect(), |
| df, |
| # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray) |
| # has type information in it, but it's just a bare list from R/dplyr. |
| ignore_attr = TRUE |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% |
| collect(), |
| df, |
| ignore_attr = TRUE |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| a = strsplit(x, " +and +"), |
| a2 = base::strsplit(x, " +and +") |
| ) %>% |
| collect(), |
| df, |
| ignore_attr = TRUE |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| a = str_split(x, "and"), |
| a2 = stringr::str_split(x, "and") |
| ) %>% |
| collect(), |
| df, |
| ignore_attr = TRUE |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_split(x, "and", n = 2)) %>% |
| collect(), |
| df, |
| ignore_attr = TRUE |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_split(x, fixed("and"), n = 2)) %>% |
| collect(), |
| df, |
| ignore_attr = TRUE |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_split(x, regex("and"), n = 2)) %>% |
| collect(), |
| df, |
| ignore_attr = TRUE |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_split(x, "Foo|bar", n = 2)) %>% |
| collect(), |
| df, |
| ignore_attr = TRUE |
| ) |
| }) |
| |
| test_that("strrep and str_dup", { |
| df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!")) |
| for (times in 0:8) { |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = strrep(x, times)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_dup(x, times)) %>% |
| collect(), |
| df |
| ) |
| } |
| }) |
| |
| test_that("str_to_lower, str_to_upper, and str_to_title", { |
| df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!")) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| x_lower = str_to_lower(x), |
| x_upper = str_to_upper(x), |
| x_title = str_to_title(x), |
| x_lower_nmspc = stringr::str_to_lower(x), |
| x_upper_nmspc = stringr::str_to_upper(x), |
| x_title_nmspc = stringr::str_to_title(x) |
| ) %>% |
| collect(), |
| df |
| ) |
| |
| # Error checking a single function because they all use the same code path. |
| expect_error( |
| call_binding("str_to_lower", "Apache Arrow", locale = "sp"), |
| "Providing a value for 'locale' other than the default ('en') is not supported in Arrow", |
| fixed = TRUE |
| ) |
| }) |
| |
| test_that("arrow_*_split_whitespace functions", { |
| # use only ASCII whitespace characters |
| df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) |
| |
| # use only non-ASCII whitespace characters |
| df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux")) |
| |
| df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))) |
| |
| # use default option values |
| expect_equal( |
| df_ascii %>% |
| Table$create() %>% |
| mutate(x = arrow_ascii_split_whitespace(x)) %>% |
| collect(), |
| df_split, |
| ignore_attr = TRUE |
| ) |
| expect_equal( |
| df_utf8 %>% |
| Table$create() %>% |
| mutate(x = arrow_utf8_split_whitespace(x)) %>% |
| collect(), |
| df_split, |
| ignore_attr = TRUE |
| ) |
| |
| # specify non-default option values |
| expect_equal( |
| df_ascii %>% |
| Table$create() %>% |
| mutate( |
| x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) |
| ) %>% |
| collect(), |
| tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))), |
| ignore_attr = TRUE |
| ) |
| expect_equal( |
| df_utf8 %>% |
| Table$create() %>% |
| mutate( |
| x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) |
| ) %>% |
| collect(), |
| tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))), |
| ignore_attr = TRUE |
| ) |
| }) |
| |
| test_that("errors and warnings in string splitting", { |
| # These conditions generate an error, but abandon_ship() catches the error, |
| # issues a warning, and pulls the data into R (if computing on InMemoryDataset) |
| # Elsewhere we test that abandon_ship() works, |
| # so here we can just call the functions directly |
| |
| x <- Expression$field_ref("x") |
| expect_error( |
| call_binding("str_split", x, fixed("and", ignore_case = TRUE)), |
| "Case-insensitive string splitting not supported in Arrow" |
| ) |
| expect_error( |
| call_binding("str_split", x, coll("and.?")), |
| "Pattern modifier `coll()` not supported in Arrow", |
| fixed = TRUE |
| ) |
| expect_error( |
| call_binding("str_split", x, boundary(type = "word")), |
| "Pattern modifier `boundary()` not supported in Arrow", |
| fixed = TRUE |
| ) |
| expect_error( |
| call_binding("str_split", x, "and", n = 0), |
| "Splitting strings into zero parts not supported in Arrow" |
| ) |
| |
| # This condition generates a warning |
| expect_warning( |
| call_binding("str_split", x, fixed("and"), simplify = TRUE), |
| "Argument 'simplify = TRUE' will be ignored" |
| ) |
| }) |
| |
| test_that("errors and warnings in string detection and replacement", { |
| x <- Expression$field_ref("x") |
| |
| expect_error( |
| call_binding("str_detect", x, boundary(type = "character")), |
| "Pattern modifier `boundary()` not supported in Arrow", |
| fixed = TRUE |
| ) |
| expect_error( |
| call_binding("str_replace_all", x, coll("o", locale = "en"), "ó"), |
| "Pattern modifier `coll()` not supported in Arrow", |
| fixed = TRUE |
| ) |
| |
| # This condition generates a warning |
| expect_warning( |
| call_binding("str_replace_all", x, regex("o", multiline = TRUE), "u"), |
| "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" |
| ) |
| }) |
| |
| test_that("backreferences in pattern in string detection", { |
| skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)") |
| df <- tibble(x = c("Foo", "bar")) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_detect(x, regex("F([aeiou])\\1"))) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("backreferences (substitutions) in string replacement", { |
| df <- tibble(x = c("Foo", "bar")) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute(desc = sub( |
| "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?", |
| "path `\\2` on server `\\1`", |
| url |
| )) %>% |
| collect(), |
| tibble(url = "https://arrow.apache.org/docs/r/") |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("edge cases in string detection and replacement", { |
| # in case-insensitive fixed match/replace, test that "\\E" in the search |
| # string and backslashes in the replacement string are interpreted literally. |
| # this test does not use compare_dplyr_binding() because base::sub() and |
| # base::grepl() do not support ignore.case = TRUE when fixed = TRUE. |
| expect_equal( |
| tibble(x = c("\\Q\\e\\D")) %>% |
| Table$create() %>% |
| filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>% |
| collect(), |
| tibble(x = c("\\Q\\e\\D")) |
| ) |
| expect_equal( |
| tibble(x = c("\\Q\\e\\D")) %>% |
| Table$create() %>% |
| transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>% |
| collect(), |
| tibble(x = c("\\Q\\L\\D")) |
| ) |
| |
| # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's |
| # added in case-insensitive regex match/replace |
| compare_dplyr_binding( |
| .input %>% |
| filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>% |
| collect(), |
| tibble(x = c("ABC")) |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>% |
| collect(), |
| tibble(x = c("ABC")) |
| ) |
| }) |
| |
| test_that("arrow_find_substring and arrow_find_substring_regex", { |
| df <- tibble(x = c("Foo and Bar", "baz and qux and quux")) |
| |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>% |
| collect(), |
| tibble(x = c(-1, 0)) |
| ) |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| mutate(x = arrow_find_substring( |
| x, |
| options = list(pattern = "b", ignore_case = TRUE) |
| )) %>% |
| collect(), |
| tibble(x = c(8, 0)) |
| ) |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| mutate(x = arrow_find_substring_regex( |
| x, |
| options = list(pattern = "^[fb]") |
| )) %>% |
| collect(), |
| tibble(x = c(-1, 0)) |
| ) |
| expect_equal( |
| df %>% |
| Table$create() %>% |
| mutate(x = arrow_find_substring_regex( |
| x, |
| options = list(pattern = "[AEIOU]", ignore_case = TRUE) |
| )) %>% |
| collect(), |
| tibble(x = c(1, 1)) |
| ) |
| }) |
| |
| test_that("stri_reverse and arrow_ascii_reverse functions", { |
| df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) |
| |
| df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux")) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = stri_reverse(x)) %>% |
| collect(), |
| df_utf8 |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = stri_reverse(x)) %>% |
| collect(), |
| df_ascii |
| ) |
| |
| expect_equal( |
| df_ascii %>% |
| Table$create() %>% |
| mutate(x = arrow_ascii_reverse(x)) %>% |
| collect(), |
| tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab")) |
| ) |
| |
| expect_error( |
| df_utf8 %>% |
| Table$create() %>% |
| mutate(x = arrow_ascii_reverse(x)) %>% |
| collect(), |
| "Invalid: Non-ASCII sequence in input" |
| ) |
| }) |
| |
| test_that("str_like", { |
| df <- tibble(x = c("Foo and bar", "baz and qux and quux")) |
| |
| # No match - entire string |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_like(x, "baz")) %>% |
| collect(), |
| df |
| ) |
| # with namespacing |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = stringr::str_like(x, "baz")) %>% |
| collect(), |
| df |
| ) |
| |
| # Match - entire string |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_like(x, "Foo and bar")) %>% |
| collect(), |
| df |
| ) |
| |
| # Wildcard |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>% |
| collect(), |
| df |
| ) |
| |
| # Ignore case |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>% |
| collect(), |
| df |
| ) |
| |
| # Single character |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_like(x, "_a%")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_like(x, "%baz%")) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("str_pad", { |
| df <- tibble(x = c("Foo and bar", "baz and qux and quux")) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_pad(x, width = 31)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_pad(x, width = 30, side = "right")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| a = str_pad(x, width = 31, side = "both"), |
| a2 = stringr::str_pad(x, width = 31, side = "both") |
| ) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("substr with string()", { |
| df <- tibble(x = "Apache Arrow") |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, 1, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, 0, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, -1, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, 6, 1)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, -1, -2)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, 9, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, 1, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = substr(x, 8, 12)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| y = substr(x, -5, -1), |
| y2 = base::substr(x, -5, -1) |
| ) %>% |
| collect(), |
| df |
| ) |
| |
| expect_error( |
| call_binding("substr", "Apache Arrow", c(1, 2), 3), |
| "`start` must be length 1 - other lengths are not supported in Arrow" |
| ) |
| |
| expect_error( |
| call_binding("substr", "Apache Arrow", 1, c(2, 3)), |
| "`stop` must be length 1 - other lengths are not supported in Arrow" |
| ) |
| }) |
| |
| test_that("substr with binary()", { |
| batch <- record_batch(x = list(charToRaw("Apache Arrow"))) |
| |
| # Check a field reference input |
| expect_identical( |
| batch %>% |
| transmute(y = substr(x, 1, 3)) %>% |
| collect() %>% |
| # because of the arrow_binary class |
| mutate(y = unclass(y)), |
| tibble::tibble(y = list(charToRaw("Apa"))) |
| ) |
| |
| # Check a Scalar input |
| scalar <- Scalar$create(batch$x) |
| expect_identical( |
| batch %>% |
| transmute(y = substr(scalar, 1, 3)) %>% |
| collect() %>% |
| # because of the arrow_binary class |
| mutate(y = unclass(y)), |
| tibble::tibble(y = list(charToRaw("Apa"))) |
| ) |
| }) |
| |
| test_that("substring", { |
| # binding for substring just calls call_binding("substr", ...), |
| # tested extensively above |
| df <- tibble(x = "Apache Arrow") |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| y = substring(x, 1, 6), |
| y2 = base::substring(x, 1, 6) |
| ) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("str_sub", { |
| df <- tibble(x = "Apache Arrow") |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, 1, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, 0, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, -1, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, 6, 1)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, -1, -2)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, -1, 3)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, 9, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, 1, 6)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(y = str_sub(x, 8, 12)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| y = str_sub(x, -5, -1), |
| y2 = stringr::str_sub(x, -5, -1) |
| ) %>% |
| collect(), |
| df |
| ) |
| |
| expect_error( |
| call_binding("str_sub", "Apache Arrow", c(1, 2), 3), |
| "`start` must be length 1 - other lengths are not supported in Arrow" |
| ) |
| |
| expect_error( |
| call_binding("str_sub", "Apache Arrow", 1, c(2, 3)), |
| "`end` must be length 1 - other lengths are not supported in Arrow" |
| ) |
| }) |
| |
| test_that("str_starts, str_ends, startsWith, endsWith", { |
| df <- tibble(x = c("Foo", "bar", "baz", "qux", NA_character_)) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_starts(x, "b.*")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_starts(x, "b.*", negate = TRUE)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_starts(x, fixed("b.*"))) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_starts(x, fixed("b"))) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| a = str_starts(x, "b.*"), |
| a2 = stringr::str_starts(x, "b.*"), |
| b = str_starts(x, "b.*", negate = TRUE), |
| c = str_starts(x, fixed("b")), |
| d = str_starts(x, fixed("b"), negate = TRUE) |
| ) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_ends(x, "r")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_ends(x, "r", negate = TRUE)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_ends(x, fixed("r$"))) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(str_ends(x, fixed("r"))) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| a = str_ends(x, "r"), |
| a2 = stringr::str_ends(x, "r"), |
| b = str_ends(x, "r", negate = TRUE), |
| c = str_ends(x, fixed("r")), |
| d = str_ends(x, fixed("r"), negate = TRUE) |
| ) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(startsWith(x, "b")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(endsWith(x, "r")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(startsWith(x, "b.*")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| filter(endsWith(x, "r$")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| a = startsWith(x, "b"), |
| b = endsWith(x, "r"), |
| a2 = base::startsWith(x, "b"), |
| b2 = base::endsWith(x, "r") |
| ) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("str_count", { |
| df <- tibble( |
| cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"), |
| dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..") |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| a_count = str_count(cities, pattern = "a"), |
| a_count_nmspc = stringr::str_count(cities, pattern = "a") |
| ) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(p_count = str_count(cities, pattern = "d")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(p_count = str_count(cities, |
| pattern = regex("d", ignore_case = TRUE) |
| )) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(e_count = str_count(cities, pattern = "u")) %>% |
| collect(), |
| df |
| ) |
| |
| # call_binding("str_count", ) is not vectorised over pattern |
| compare_dplyr_binding( |
| .input %>% |
| mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>% |
| collect(), |
| df, |
| warning = TRUE |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(dots_count = str_count(dots, ".")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(dots_count = str_count(dots, fixed("."))) %>% |
| collect(), |
| df |
| ) |
| }) |
| |
| test_that("base::tolower and base::toupper", { |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| verse_to_upper = toupper(verses), |
| verse_to_lower = tolower(verses), |
| verse_to_upper_nmspc = base::toupper(verses), |
| verse_to_lower_nmspc = base::tolower(verses) |
| ) %>% |
| collect(), |
| tbl |
| ) |
| }) |
| |
| test_that("namespaced unary and binary string functions", { |
| # str_length and stringi::stri_reverse |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| verse_length = stringr::str_length(verses), |
| reverses_verse = stringi::stri_reverse(verses) |
| ) %>% |
| collect(), |
| tbl |
| ) |
| |
| # stringr::str_dup and base::strrep |
| df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!")) |
| for (times in 0:8) { |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = base::strrep(x, times)) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = stringr::str_dup(x, times)) %>% |
| collect(), |
| df |
| ) |
| } |
| }) |
| |
| test_that("nchar with namespacing", { |
| compare_dplyr_binding( |
| .input %>% |
| mutate(verses_nchar = base::nchar(verses)) %>% |
| collect(), |
| tbl |
| ) |
| }) |
| |
| test_that("str_trim()", { |
| compare_dplyr_binding( |
| .input %>% |
| mutate( |
| left_trim_padded_string = str_trim(padded_strings, "left"), |
| right_trim_padded_string = str_trim(padded_strings, "right"), |
| both_trim_padded_string = str_trim(padded_strings, "both"), |
| left_trim_padded_string_nmspc = stringr::str_trim(padded_strings, "left"), |
| right_trim_padded_string_nmspc = stringr::str_trim(padded_strings, "right"), |
| both_trim_padded_string_nmspc = stringr::str_trim(padded_strings, "both") |
| ) %>% |
| collect(), |
| tbl |
| ) |
| }) |
| |
| test_that("str_remove and str_remove_all", { |
| df <- tibble(x = c("Foo", "bar")) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_remove_all(x, "^F")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_remove_all(x, regex("^F"))) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| mutate(x = str_remove(x, "^F[a-z]{2}")) %>% |
| collect(), |
| df |
| ) |
| |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_remove(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| x = str_remove_all(x, fixed("o")), |
| x2 = stringr::str_remove_all(x, fixed("o")) |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute( |
| x = str_remove(x, fixed("O")), |
| x2 = stringr::str_remove(x, fixed("O")) |
| ) %>% |
| collect(), |
| df |
| ) |
| compare_dplyr_binding( |
| .input %>% |
| transmute(x = str_remove(x, fixed("O", ignore_case = TRUE))) %>% |
| collect(), |
| df |
| ) |
| }) |