ARROW-11468: [R] Allow user to pass schema to read_json_arrow()
A couple of things I wanted to check are expected behaviour:
1. If I specify in the schema that a numeric column should be a string column, I get the error `Error: Invalid: JSON parse error: Column(/third_col) changed from string to number in row 0`
(e.g. if I run the following)
```
tf <- tempfile()
writeLines('
{ "hello": 3.5, "world": 2, "third_col": 99}
{ "hello": 3.25, "world": 5, "third_col": 98}
{ "hello": 3.125, "world": 8, "third_col": 97 }
{ "hello": 0.0, "world": 10, "third_col": 96}
', tf)
read_json_arrow(tf, schema = schema(third_col = utf8(), world = float64()))
```
2. As can be seen in the tests output (will delete the `print` statements before this is merged), table columns are returned in the order specified in the schema and then the columns not mentioned in the schema.
Closes #9950 from thisisnic/ARROW-11468
Authored-by: Nic Crane <thisisnic@gmail.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 9811dc9..a33cf22 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1052,8 +1052,12 @@
.Call(`_arrow_json___ReadOptions__initialize`, use_threads, block_size)
}
-json___ParseOptions__initialize <- function(newlines_in_values){
- .Call(`_arrow_json___ParseOptions__initialize`, newlines_in_values)
+json___ParseOptions__initialize1 <- function(newlines_in_values){
+ .Call(`_arrow_json___ParseOptions__initialize1`, newlines_in_values)
+}
+
+json___ParseOptions__initialize2 <- function(newlines_in_values, explicit_schema){
+ .Call(`_arrow_json___ParseOptions__initialize2`, newlines_in_values, explicit_schema)
}
json___TableReader__Make <- function(input, read_options, parse_options){
diff --git a/r/R/json.R b/r/R/json.R
index cc16774..89595a5 100644
--- a/r/R/json.R
+++ b/r/R/json.R
@@ -20,6 +20,7 @@
#' Using [JsonTableReader]
#'
#' @inheritParams read_delim_arrow
+#' @param schema [Schema] that describes the table.
#' @param ... Additional options passed to `JsonTableReader$create()`
#'
#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
@@ -38,12 +39,13 @@
read_json_arrow <- function(file,
col_select = NULL,
as_data_frame = TRUE,
+ schema = NULL,
...) {
if (!inherits(file, "InputStream")) {
file <- make_readable_file(file)
on.exit(file$close())
}
- tab <- JsonTableReader$create(file, ...)$Read()
+ tab <- JsonTableReader$create(file, schema = schema, ...)$Read()
col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
@@ -69,7 +71,8 @@
)
JsonTableReader$create <- function(file,
read_options = JsonReadOptions$create(),
- parse_options = JsonParseOptions$create(),
+ parse_options = JsonParseOptions$create(schema = schema),
+ schema = NULL,
...) {
assert_is(file, "InputStream")
json___TableReader__Make(file, read_options, parse_options)
@@ -91,6 +94,11 @@
#' @docType class
#' @export
JsonParseOptions <- R6Class("JsonParseOptions", inherit = ArrowObject)
-JsonParseOptions$create <- function(newlines_in_values = FALSE) {
- json___ParseOptions__initialize(newlines_in_values)
+JsonParseOptions$create <- function(newlines_in_values = FALSE, schema = NULL) {
+ if (is.null(schema)) {
+ json___ParseOptions__initialize1(newlines_in_values)
+ } else {
+ json___ParseOptions__initialize2(newlines_in_values, schema)
+ }
+
}
diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd
index 8111891..83765b2 100644
--- a/r/man/read_json_arrow.Rd
+++ b/r/man/read_json_arrow.Rd
@@ -4,7 +4,13 @@
\alias{read_json_arrow}
\title{Read a JSON file}
\usage{
-read_json_arrow(file, col_select = NULL, as_data_frame = TRUE, ...)
+read_json_arrow(
+ file,
+ col_select = NULL,
+ as_data_frame = TRUE,
+ schema = NULL,
+ ...
+)
}
\arguments{
\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream,
@@ -22,6 +28,8 @@
\item{as_data_frame}{Should the function return a \code{data.frame} (default) or
an Arrow \link{Table}?}
+\item{schema}{\link{Schema} that describes the table.}
+
\item{...}{Additional options passed to \code{JsonTableReader$create()}}
}
\value{
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 4c2ebed..6dc50c3 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -4115,16 +4115,32 @@
// json.cpp
#if defined(ARROW_R_WITH_ARROW)
-std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize(bool newlines_in_values);
-extern "C" SEXP _arrow_json___ParseOptions__initialize(SEXP newlines_in_values_sexp){
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize1(bool newlines_in_values);
+extern "C" SEXP _arrow_json___ParseOptions__initialize1(SEXP newlines_in_values_sexp){
BEGIN_CPP11
arrow::r::Input<bool>::type newlines_in_values(newlines_in_values_sexp);
- return cpp11::as_sexp(json___ParseOptions__initialize(newlines_in_values));
+ return cpp11::as_sexp(json___ParseOptions__initialize1(newlines_in_values));
END_CPP11
}
#else
-extern "C" SEXP _arrow_json___ParseOptions__initialize(SEXP newlines_in_values_sexp){
- Rf_error("Cannot call json___ParseOptions__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+extern "C" SEXP _arrow_json___ParseOptions__initialize1(SEXP newlines_in_values_sexp){
+ Rf_error("Cannot call json___ParseOptions__initialize1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// json.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize2(bool newlines_in_values, const std::shared_ptr<arrow::Schema>& explicit_schema);
+extern "C" SEXP _arrow_json___ParseOptions__initialize2(SEXP newlines_in_values_sexp, SEXP explicit_schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type newlines_in_values(newlines_in_values_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type explicit_schema(explicit_schema_sexp);
+ return cpp11::as_sexp(json___ParseOptions__initialize2(newlines_in_values, explicit_schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_json___ParseOptions__initialize2(SEXP newlines_in_values_sexp, SEXP explicit_schema_sexp){
+ Rf_error("Cannot call json___ParseOptions__initialize2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
}
#endif
@@ -6835,7 +6851,8 @@
{ "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1},
{ "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2},
{ "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2},
- { "_arrow_json___ParseOptions__initialize", (DL_FUNC) &_arrow_json___ParseOptions__initialize, 1},
+ { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1},
+ { "_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2},
{ "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3},
{ "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1},
{ "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0},
diff --git a/r/src/json.cpp b/r/src/json.cpp
index 87d4062..edc5e07 100644
--- a/r/src/json.cpp
+++ b/r/src/json.cpp
@@ -31,7 +31,7 @@
}
// [[arrow::export]]
-std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize(
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize1(
bool newlines_in_values) {
auto res =
std::make_shared<arrow::json::ParseOptions>(arrow::json::ParseOptions::Defaults());
@@ -40,6 +40,16 @@
}
// [[arrow::export]]
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize2(
+ bool newlines_in_values, const std::shared_ptr<arrow::Schema>& explicit_schema) {
+ auto res =
+ std::make_shared<arrow::json::ParseOptions>(arrow::json::ParseOptions::Defaults());
+ res->newlines_in_values = newlines_in_values;
+ res->explicit_schema = explicit_schema;
+ return res;
+}
+
+// [[arrow::export]]
std::shared_ptr<arrow::json::TableReader> json___TableReader__Make(
const std::shared_ptr<arrow::io::InputStream>& input,
const std::shared_ptr<arrow::json::ReadOptions>& read_options,
diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R
index b0b508b..ad5ff8a 100644
--- a/r/tests/testthat/test-json.R
+++ b/r/tests/testthat/test-json.R
@@ -86,6 +86,94 @@
expect_equal(names(tab2), c("hello", "world"))
})
+test_that("read_json_arrow(schema=) with empty schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97 }
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, schema = schema())
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ hello = c(3.5, 3.25, 3.125, 0),
+ world = c(2L, 5L, 8L, 10L),
+ third_col = c(99L,98L,97L,96L)
+ )
+ )
+})
+
+test_that("read_json_arrow(schema=) with partial schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97 }
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, schema = schema(third_col = float64(), world = float64()))
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ third_col = c(99,98,97,96),
+ world = c(2, 5, 8, 10),
+ hello = c(3.5, 3.25, 3.125, 0)
+ )
+ )
+
+ tf2 <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": "99"}
+ { "hello": 3.25, "world": 5, "third_col": "98"}
+ { "hello": 3.125, "world": 8, "third_col": "97"}
+ ', tf2)
+
+ tab2 <- read_json_arrow(tf2, schema = schema(third_col = string(), world = float64()))
+
+ expect_identical(
+ tab2,
+ tibble::tibble(
+ third_col = c("99","98","97"),
+ world = c(2, 5, 8),
+ hello = c(3.5, 3.25, 3.125)
+ )
+ )
+})
+
+test_that("read_json_arrow(schema=) with full schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97}
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(
+ tf,
+ schema = schema(
+ hello = float64(),
+ third_col = float64(),
+ world = float64()
+ )
+ )
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ hello = c(3.5, 3.25, 3.125, 0),
+ third_col = c(99,98,97,96),
+ world = c(2, 5, 8, 10)
+ )
+ )
+})
+
test_that("Can read json file with nested columns (ARROW-5503)", {
tf <- tempfile()
on.exit(unlink(tf))