blob: 48b8c327a11c878ccc476fc730b28da245b4ed29 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#define R_NO_REMAP
#include <R.h>
#include <Rinternals.h>
#include "nanoarrow.h"
// Needed for the list_of materializer
#include "convert.h"
#include "materialize.h"
#include "materialize_blob.h"
#include "materialize_chr.h"
#include "materialize_date.h"
#include "materialize_dbl.h"
#include "materialize_difftime.h"
#include "materialize_int.h"
#include "materialize_lgl.h"
#include "materialize_posixct.h"
#include "materialize_unspecified.h"
SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len) {
switch (vector_type) {
case VECTOR_TYPE_LGL:
return Rf_allocVector(LGLSXP, len);
case VECTOR_TYPE_INT:
return Rf_allocVector(INTSXP, len);
case VECTOR_TYPE_DBL:
return Rf_allocVector(REALSXP, len);
case VECTOR_TYPE_CHR:
return Rf_allocVector(STRSXP, len);
default:
return R_NilValue;
}
}
// A version of Rf_getAttrib(x, sym) != R_NilValue that never
// expands the row.names attribute
static int has_attrib_safe(SEXP x, SEXP sym) {
for (SEXP atts = ATTRIB(x); atts != R_NilValue; atts = CDR(atts)) {
if (TAG(atts) == sym) return TRUE;
}
return FALSE;
}
R_xlen_t nanoarrow_data_frame_size(SEXP x) {
if (Rf_length(x) > 0) {
// This both avoids materializing the row.names attribute and
// makes this work with struct-style vctrs that don't have a
// row.names attribute but that always have one or more element
return Rf_xlength(VECTOR_ELT(x, 0));
} else {
// Since ALTREP was introduced, materializing the row.names attribute is
// usually deferred such that values in the form c(NA, -nrow), 1:nrow, or
// as.character(1:nrow) are never actually computed when the length is
// taken.
return Rf_xlength(Rf_getAttrib(x, R_RowNamesSymbol));
}
}
void nanoarrow_set_rownames(SEXP x, R_xlen_t len) {
// If len fits in the integer range, we can use the c(NA, -nrow)
// shortcut for the row.names attribute. R expands this when
// the actual value is accessed (even from Rf_getAttrib()).
// If len does not fit in the integer range, we need
// as.character(seq_len(nrow)) (which returns a deferred ALTREP
// string conversion of an ALTREP sequence in recent R). Manipulating
// data frames with more than INT_MAX rows is not supported in most
// places but column access still works.
if (len <= INT_MAX) {
SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2));
INTEGER(rownames)[0] = NA_INTEGER;
INTEGER(rownames)[1] = -len;
Rf_setAttrib(x, R_RowNamesSymbol, rownames);
UNPROTECT(1);
} else {
SEXP length_dbl = PROTECT(Rf_ScalarReal(len));
SEXP seq_len_symbol = PROTECT(Rf_install("seq_len"));
SEXP seq_len_call = PROTECT(Rf_lang2(seq_len_symbol, length_dbl));
SEXP rownames_call = PROTECT(Rf_lang2(R_AsCharacterSymbol, seq_len_call));
Rf_setAttrib(x, R_RowNamesSymbol, Rf_eval(rownames_call, R_BaseNamespace));
UNPROTECT(4);
}
}
int nanoarrow_ptype_is_data_frame(SEXP ptype) {
return Rf_isObject(ptype) && TYPEOF(ptype) == VECSXP &&
(Rf_inherits(ptype, "data.frame") ||
(Rf_xlength(ptype) > 0 && has_attrib_safe(ptype, R_NamesSymbol)));
}
SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) {
SEXP result;
if (Rf_isObject(ptype)) {
if (nanoarrow_ptype_is_data_frame(ptype)) {
R_xlen_t num_cols = Rf_xlength(ptype);
result = PROTECT(Rf_allocVector(VECSXP, num_cols));
for (R_xlen_t i = 0; i < num_cols; i++) {
SET_VECTOR_ELT(result, i,
nanoarrow_materialize_realloc(VECTOR_ELT(ptype, i), len));
}
// Set attributes from ptype
Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype, R_NamesSymbol));
Rf_copyMostAttrib(ptype, result);
// ...except rownames
if (Rf_inherits(ptype, "data.frame")) {
nanoarrow_set_rownames(result, len);
}
} else {
result = PROTECT(Rf_allocVector(TYPEOF(ptype), len));
Rf_copyMostAttrib(ptype, result);
}
} else {
result = PROTECT(Rf_allocVector(TYPEOF(ptype), len));
}
UNPROTECT(1);
return result;
}
// Used in union building to pre-set all values to null
static void fill_vec_with_nulls(SEXP x, R_xlen_t offset, R_xlen_t len) {
if (nanoarrow_ptype_is_data_frame(x)) {
for (R_xlen_t i = 0; i < Rf_xlength(x); i++) {
fill_vec_with_nulls(VECTOR_ELT(x, i), offset, len);
}
return;
}
switch (TYPEOF(x)) {
case LGLSXP:
case INTSXP: {
int* values = INTEGER(x);
for (R_xlen_t i = 0; i < len; i++) {
values[offset + i] = NA_INTEGER;
}
return;
}
case REALSXP: {
double* values = REAL(x);
for (R_xlen_t i = 0; i < len; i++) {
values[offset + i] = NA_REAL;
}
return;
}
case STRSXP:
for (R_xlen_t i = 0; i < len; i++) {
SET_STRING_ELT(x, offset + i, NA_STRING);
}
return;
case VECSXP:
for (R_xlen_t i = 0; i < len; i++) {
SET_VECTOR_ELT(x, offset + i, R_NilValue);
}
return;
default:
Rf_error("Attempt to fill vector with nulls with unsupported type");
}
}
static int nanoarrow_materialize_data_frame(struct RConverter* converter,
SEXP converter_xptr) {
if (converter->ptype_view.vector_type != VECTOR_TYPE_DATA_FRAME) {
return EINVAL;
}
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
switch (converter->array_view.storage_type) {
case NANOARROW_TYPE_STRUCT:
for (R_xlen_t i = 0; i < converter->n_children; i++) {
converter->children[i]->src.offset = converter->src.offset;
converter->children[i]->src.length = converter->src.length;
converter->children[i]->dst.offset = converter->dst.offset;
converter->children[i]->dst.length = converter->dst.length;
SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i);
NANOARROW_RETURN_NOT_OK(
nanoarrow_materialize(converter->children[i], child_converter_xptr));
}
return NANOARROW_OK;
case NANOARROW_TYPE_DENSE_UNION:
case NANOARROW_TYPE_SPARSE_UNION:
// Pre-fill everything with nulls
fill_vec_with_nulls(converter->dst.vec_sexp, converter->dst.offset,
converter->dst.length);
// Fill in the possibly non-null values one at a time
for (R_xlen_t i = 0; i < converter->dst.length; i++) {
int64_t child_index = ArrowArrayViewUnionChildIndex(&converter->array_view,
converter->src.offset + i);
int64_t child_offset = ArrowArrayViewUnionChildOffset(&converter->array_view,
converter->src.offset + i);
converter->children[child_index]->src.offset = child_offset;
converter->children[child_index]->src.length = 1;
converter->children[child_index]->dst.offset = converter->dst.offset + i;
converter->children[child_index]->dst.length = 1;
SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, child_index);
NANOARROW_RETURN_NOT_OK(nanoarrow_materialize(converter->children[child_index],
child_converter_xptr));
}
return NANOARROW_OK;
default:
return ENOTSUP;
}
}
static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr,
int64_t offset, int64_t length) {
if (nanoarrow_converter_reserve(converter_xptr, length) != NANOARROW_OK) {
nanoarrow_converter_stop(converter_xptr);
}
converter->src.offset = offset;
converter->src.length = length;
converter->dst.offset = 0;
converter->dst.length = length;
if (nanoarrow_converter_materialize_n(converter_xptr, length) != length) {
return EINVAL;
}
NANOARROW_RETURN_NOT_OK(nanoarrow_converter_finalize(converter_xptr));
return NANOARROW_OK;
}
static int nanoarrow_materialize_list_of(struct RConverter* converter,
SEXP converter_xptr) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
struct RConverter* child_converter = converter->children[0];
SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);
struct ArrayViewSlice* src = &converter->src;
struct VectorSlice* dst = &converter->dst;
const int32_t* offsets = src->array_view->buffer_views[1].data.as_int32;
const int64_t* large_offsets = src->array_view->buffer_views[1].data.as_int64;
int64_t raw_src_offset = src->array_view->array->offset + src->offset;
int64_t offset;
int64_t length;
switch (src->array_view->storage_type) {
case NANOARROW_TYPE_NA:
return NANOARROW_OK;
case NANOARROW_TYPE_LIST:
for (int64_t i = 0; i < dst->length; i++) {
if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
offset = offsets[raw_src_offset + i];
length = offsets[raw_src_offset + i + 1] - offset;
NANOARROW_RETURN_NOT_OK(materialize_list_element(
child_converter, child_converter_xptr, offset, length));
SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i,
nanoarrow_converter_release_result(child_converter_xptr));
}
}
break;
case NANOARROW_TYPE_LARGE_LIST:
for (int64_t i = 0; i < dst->length; i++) {
if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
offset = large_offsets[raw_src_offset + i];
length = large_offsets[raw_src_offset + i + 1] - offset;
NANOARROW_RETURN_NOT_OK(materialize_list_element(
child_converter, child_converter_xptr, offset, length));
SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i,
nanoarrow_converter_release_result(child_converter_xptr));
}
}
break;
case NANOARROW_TYPE_FIXED_SIZE_LIST:
length = src->array_view->layout.child_size_elements;
for (int64_t i = 0; i < dst->length; i++) {
if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
offset = (raw_src_offset + i) * length;
NANOARROW_RETURN_NOT_OK(materialize_list_element(
child_converter, child_converter_xptr, offset, length));
SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i,
nanoarrow_converter_release_result(child_converter_xptr));
}
}
break;
default:
return EINVAL;
}
return NANOARROW_OK;
}
int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr) {
struct ArrayViewSlice* src = &converter->src;
struct VectorSlice* dst = &converter->dst;
struct MaterializeOptions* options = converter->options;
switch (converter->ptype_view.vector_type) {
case VECTOR_TYPE_UNSPECIFIED:
return nanoarrow_materialize_unspecified(src, dst, options);
case VECTOR_TYPE_LGL:
return nanoarrow_materialize_lgl(src, dst, options);
case VECTOR_TYPE_INT:
return nanoarrow_materialize_int(src, dst, options);
case VECTOR_TYPE_DBL:
return nanoarrow_materialize_dbl(converter);
case VECTOR_TYPE_CHR:
return nanoarrow_materialize_chr(src, dst, options);
case VECTOR_TYPE_POSIXCT:
return nanoarrow_materialize_posixct(converter);
case VECTOR_TYPE_DATE:
return nanoarrow_materialize_date(converter);
case VECTOR_TYPE_DIFFTIME:
return nanoarrow_materialize_difftime(converter);
case VECTOR_TYPE_BLOB:
return nanoarrow_materialize_blob(src, dst, options);
case VECTOR_TYPE_LIST_OF:
return nanoarrow_materialize_list_of(converter, converter_xptr);
case VECTOR_TYPE_DATA_FRAME:
return nanoarrow_materialize_data_frame(converter, converter_xptr);
default:
return ENOTSUP;
}
}