commit	e4f0754e03c84e66b3d93db11c902d99db74fa31	[log] [tgz]
author	Dewey Dunnington <dewey@dunnington.ca>	Fri Feb 23 16:07:22 2024 -0400
committer	GitHub <noreply@github.com>	Fri Feb 23 16:07:22 2024 -0400
tree	e90de3ae3288d3e2e673df89aa95d7e5460fc4e2
parent	c66ddc35a9ccf0374aadc2d3a8821431ed0c9ca6 [diff]

feat(r): Add bindings for IPC reader (#390)

This PR adds bindings to nanoarrow's IPC reader from R. The entrypoint
for a user is `read_nanoarrow()`, which accepts raw vectors,
connections, and file paths (thin wrapper around connections). It also
fixes a number of compiler warnings in the IPC extension.

The implementation is not particularly complicated from the R side, but
the main drawback of adding IPC support is that the flatbuffers
implementation (flatcc) actively does not care about gcc compiler
warnings (whereas CRAN actively cares about them).

These are all slower than the arrow package, which has more tools at its
disposal to prevent copies.

``` r
library(arrow, warn.conflicts = FALSE)
library(nanoarrow)

# Basic read example
tf <- tempfile()
write_ipc_stream(dplyr::starwars, tf)
read_nanoarrow(tf) |> tibble::as_tibble()
#> # A tibble: 87 × 14
#>    name     height  mass hair_color skin_color eye_color birth_year sex   gender
#>    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
#>  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
#>  2 C-3PO       167    75 <NA>       gold       yellow         112   none  mascu…
#>  3 R2-D2        96    32 <NA>       white, bl… red             33   none  mascu…
#>  4 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
#>  5 Leia Or…    150    49 brown      light      brown           19   fema… femin…
#>  6 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
#>  7 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
#>  8 R5-D4        97    32 <NA>       white, red red             NA   none  mascu…
#>  9 Biggs D…    183    84 black      light      brown           24   male  mascu…
#> 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
#> # ℹ 77 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list<chr>>,
#> #   vehicles <list<chr>>, starships <list<chr>>

df_bigish <- nanoarrow:::vec_gen(data.frame(x = character()), n = 1e6)
write_ipc_stream(df_bigish, tf)

# Wrapper because mmap is apparently not passed through from read_ipc_stream()
# and this is pretty significant
read_ipc_stream_wrap <- function(f, ..., mmap) {
  # Open `f` ourselves so the caller's `mmap` choice is applied when the
  # input is created; everything in `...` is forwarded to the reader as-is.
  input <- arrow:::make_readable_file(f, mmap = mmap, random_access = FALSE)
  arrow::read_ipc_stream(input, ...)
}

tf_raw <- brio::read_file_raw(tf)

# Slower than arrow for raw vector input because of C implementation,
# which doesn't currently share the global buffer (just shares buffers
# between columns within a single batch)
bench::mark(
  nanoarrow = read_nanoarrow(tf_raw) |> collect_array_stream(),
  arrow = read_ipc_stream(buffer(tf_raw), as_data_frame = FALSE),
  check = FALSE
)
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 nanoarrow    1.27ms   1.84ms      439.    41.5KB     2.18
#> 2 arrow      509.26µs 528.65µs     1821.     3.6MB    79.3


# Slower than arrow, maybe because of the C implementation, but definitely
# because it uses base::readBin(), which necessitates an extra copy
bench::mark(
  nanoarrow = read_nanoarrow(tf) |> collect_array_stream(),
  arrow_mmap = read_ipc_stream_wrap(tf, mmap = TRUE, as_data_frame = FALSE),
  arrow = read_ipc_stream_wrap(tf, mmap = FALSE, as_data_frame = FALSE),
  check = FALSE
)
#> # A tibble: 3 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 nanoarrow    5.18ms   5.66ms      174.    16.2MB   189.  
#> 2 arrow_mmap 613.48µs  640.5µs     1526.   528.6KB    13.9 
#> 3 arrow        2.18ms   2.84ms      339.   551.6KB     4.06
```

<sup>Created on 2024-02-19 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>

17 files changed

tree: e90de3ae3288d3e2e673df89aa95d7e5460fc4e2

README.md

nanoarrow

The nanoarrow library is a set of helper functions to interpret and generate Arrow C Data Interface and Arrow C Stream Interface structures. The library is in active early development and users should update regularly from the main branch of this repository.

Whereas the current suite of Arrow implementations provide the basis for a comprehensive data analysis toolkit, this library is intended to support clients that wish to produce or interpret Arrow C Data and/or Arrow C Stream structures where linking to a higher level Arrow binding is difficult or impossible.

Using the C library

The nanoarrow C library is intended to be copied and vendored. This can be done using CMake or by using the bundled nanoarrow.h/nanoarrow.c distribution available in the dist/ directory in this repository. Examples of both can be found in the examples/ directory in this repository.

A simple producer example:

#include "nanoarrow.h"

int make_simple_array(struct ArrowArray* array_out, struct ArrowSchema* schema_out) {
  struct ArrowError error;
  array_out->release = NULL;
  schema_out->release = NULL;

  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(array_out, NANOARROW_TYPE_INT32));

  NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array_out));
  NANOARROW_RETURN_NOT_OK(ArrowArrayAppendInt(array_out, 1));
  NANOARROW_RETURN_NOT_OK(ArrowArrayAppendInt(array_out, 2));
  NANOARROW_RETURN_NOT_OK(ArrowArrayAppendInt(array_out, 3));
  NANOARROW_RETURN_NOT_OK(ArrowArrayFinishBuildingDefault(array_out, &error));

  NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(schema_out, NANOARROW_TYPE_INT32));

  return NANOARROW_OK;
}

A simple consumer example:

#include <errno.h>
#include <stdio.h>

#include "nanoarrow.h"

// Print every element of an int32 array to stdout, one value per line.
//
// array:  the array to print; array->length elements are read.
// schema: the schema used to initialize the view over `array`.
//
// Returns NANOARROW_OK on success, EINVAL when the view's storage type is not
// int32 (after printing a diagnostic), or the error code from initializing the
// view or attaching the array to it.
int print_simple_array(struct ArrowArray* array, struct ArrowSchema* schema) {
  struct ArrowError error;
  struct ArrowArrayView array_view;
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, &error));

  if (array_view.storage_type != NANOARROW_TYPE_INT32) {
    printf("Array has storage that is not int32\n");
    // Stop here rather than printing values from an unexpected storage type;
    // the view was initialized above, so reset it before returning.
    ArrowArrayViewReset(&array_view);
    return EINVAL;
  }

  int result = ArrowArrayViewSetArray(&array_view, array, &error);
  if (result != NANOARROW_OK) {
    ArrowArrayViewReset(&array_view);
    return result;
  }

  for (int64_t i = 0; i < array->length; i++) {
    printf("%d\n", (int)ArrowArrayViewGetIntUnsafe(&array_view, i));
  }

  ArrowArrayViewReset(&array_view);
  return NANOARROW_OK;
}