blob: 4711c4b730a47fca92ed6e132a3a0116421aae2b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <memory>
#include "./arrow_types.h"
#include "./arrow_vctrs.h"
#if defined(ARROW_R_WITH_ARROW)
#include <arrow/array/array_base.h>
#include <arrow/builder.h>
#include <arrow/chunked_array.h>
#include <arrow/util/bitmap_writer.h>
#include <arrow/visitor_inline.h>
using arrow::internal::checked_cast;
namespace arrow {
namespace r {
template <typename T>
inline bool is_na(T value) {
return false;
}
template <>
inline bool is_na<int64_t>(int64_t value) {
return value == NA_INT64;
}
template <>
inline bool is_na<double>(double value) {
return ISNA(value);
}
template <>
inline bool is_na<int>(int value) {
return value == NA_INTEGER;
}
struct VectorToArrayConverter {
Status Visit(const arrow::NullType& type) {
auto* null_builder = checked_cast<NullBuilder*>(builder);
return null_builder->AppendNulls(XLENGTH(x));
}
Status Visit(const arrow::BooleanType& type) {
ARROW_RETURN_IF(TYPEOF(x) != LGLSXP, Status::RError("Expecting a logical vector"));
R_xlen_t n = XLENGTH(x);
auto* bool_builder = checked_cast<BooleanBuilder*>(builder);
auto* p = LOGICAL(x);
RETURN_NOT_OK(bool_builder->Reserve(n));
for (R_xlen_t i = 0; i < n; i++) {
auto value = p[i];
if (value == NA_LOGICAL) {
bool_builder->UnsafeAppendNull();
} else {
bool_builder->UnsafeAppend(value == 1);
}
}
return Status::OK();
}
Status Visit(const arrow::Int32Type& type) {
ARROW_RETURN_IF(TYPEOF(x) != INTSXP, Status::RError("Expecting an integer vector"));
auto* int_builder = checked_cast<Int32Builder*>(builder);
R_xlen_t n = XLENGTH(x);
const auto* data = INTEGER(x);
RETURN_NOT_OK(int_builder->Reserve(n));
for (R_xlen_t i = 0; i < n; i++) {
const auto value = data[i];
if (value == NA_INTEGER) {
int_builder->UnsafeAppendNull();
} else {
int_builder->UnsafeAppend(value);
}
}
return Status::OK();
}
Status Visit(const arrow::Int64Type& type) {
ARROW_RETURN_IF(TYPEOF(x) != REALSXP, Status::RError("Expecting a numeric vector"));
ARROW_RETURN_IF(Rf_inherits(x, "integer64"),
Status::RError("Expecting a vector that inherits integer64"));
auto* int_builder = checked_cast<Int64Builder*>(builder);
R_xlen_t n = XLENGTH(x);
const auto* data = (REAL(x));
RETURN_NOT_OK(int_builder->Reserve(n));
for (R_xlen_t i = 0; i < n; i++) {
const auto value = arrow::util::SafeCopy<int64_t>(data[i]);
if (value == NA_INT64) {
int_builder->UnsafeAppendNull();
} else {
int_builder->UnsafeAppend(value);
}
}
return Status::OK();
}
Status Visit(const arrow::DoubleType& type) {
ARROW_RETURN_IF(TYPEOF(x) != REALSXP, Status::RError("Expecting a numeric vector"));
auto* double_builder = checked_cast<DoubleBuilder*>(builder);
R_xlen_t n = XLENGTH(x);
const auto* data = (REAL(x));
RETURN_NOT_OK(double_builder->Reserve(n));
for (R_xlen_t i = 0; i < n; i++) {
const auto value = data[i];
if (ISNA(value)) {
double_builder->UnsafeAppendNull();
} else {
double_builder->UnsafeAppend(value);
}
}
return Status::OK();
}
Status Visit(const arrow::BinaryType& type) {
if (!(Rf_inherits(x, "vctrs_list_of") &&
TYPEOF(Rf_getAttrib(x, symbols::ptype)) == RAWSXP)) {
return Status::RError("Expecting a list of raw vectors");
}
return Status::OK();
}
Status Visit(const arrow::FixedSizeBinaryType& type) {
if (!(Rf_inherits(x, "vctrs_list_of") &&
TYPEOF(Rf_getAttrib(x, symbols::ptype)) == RAWSXP)) {
return Status::RError("Expecting a list of raw vectors");
}
return Status::OK();
}
template <typename T>
arrow::enable_if_base_binary<T, Status> Visit(const T& type) {
using BuilderType = typename TypeTraits<T>::BuilderType;
ARROW_RETURN_IF(TYPEOF(x) != STRSXP, Status::RError("Expecting a character vector"));
auto* binary_builder = checked_cast<BuilderType*>(builder);
R_xlen_t n = XLENGTH(x);
RETURN_NOT_OK(builder->Reserve(n));
for (R_xlen_t i = 0; i < n; i++) {
SEXP s = STRING_ELT(x, i);
if (s == NA_STRING) {
RETURN_NOT_OK(binary_builder->AppendNull());
continue;
} else {
// Make sure we're ingesting UTF-8
s = Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8);
}
RETURN_NOT_OK(binary_builder->Append(CHAR(s), LENGTH(s)));
}
return Status::OK();
}
template <typename T>
arrow::enable_if_base_list<T, Status> Visit(const T& type) {
using BuilderType = typename TypeTraits<T>::BuilderType;
ARROW_RETURN_IF(TYPEOF(x) != VECSXP, Status::RError("Expecting a list vector"));
auto* list_builder = checked_cast<BuilderType*>(builder);
auto* value_builder = list_builder->value_builder();
auto value_type = type.value_type();
R_xlen_t n = XLENGTH(x);
RETURN_NOT_OK(builder->Reserve(n));
for (R_xlen_t i = 0; i < n; i++) {
SEXP vector = VECTOR_ELT(x, i);
if (Rf_isNull(vector)) {
RETURN_NOT_OK(list_builder->AppendNull());
continue;
}
RETURN_NOT_OK(list_builder->Append());
// Recurse.
VectorToArrayConverter converter{vector, value_builder};
Status status = arrow::VisitTypeInline(*value_type, &converter);
if (!status.ok()) {
return Status::RError("Cannot convert list element ", (i + 1),
" to an Array of type `", value_type->ToString(),
"` : ", status.message());
}
}
return Status::OK();
}
Status Visit(const FixedSizeListType& type) {
ARROW_RETURN_IF(TYPEOF(x) != VECSXP, Status::RError("Expecting a list vector"));
auto* fixed_size_list_builder = checked_cast<FixedSizeListBuilder*>(builder);
auto* value_builder = fixed_size_list_builder->value_builder();
auto value_type = type.value_type();
int list_size = type.list_size();
R_xlen_t n = XLENGTH(x);
RETURN_NOT_OK(builder->Reserve(n));
for (R_xlen_t i = 0; i < n; i++) {
SEXP vector = VECTOR_ELT(x, i);
if (Rf_isNull(vector)) {
RETURN_NOT_OK(fixed_size_list_builder->AppendNull());
continue;
}
RETURN_NOT_OK(fixed_size_list_builder->Append());
auto vect_type = arrow::r::InferArrowType(vector);
if (!value_type->Equals(vect_type)) {
return Status::RError("FixedSizeList vector expecting elements vector of type ",
value_type->ToString(), " but got ", vect_type->ToString());
}
int vector_size = vctrs::short_vec_size(vector);
if (vector_size != list_size) {
return Status::RError("FixedSizeList vector expecting elements vector of size ",
list_size, ", not ", vector_size);
}
// Recurse.
VectorToArrayConverter converter{vector, value_builder};
RETURN_NOT_OK(arrow::VisitTypeInline(*value_type, &converter));
}
return Status::OK();
}
template <typename T>
arrow::enable_if_t<is_struct_type<T>::value, Status> Visit(const T& type) {
using BuilderType = typename TypeTraits<T>::BuilderType;
ARROW_RETURN_IF(!Rf_inherits(x, "data.frame"),
Status::RError("Expecting a data frame"));
auto* struct_builder = checked_cast<BuilderType*>(builder);
int64_t n = vctrs::short_vec_size(x);
RETURN_NOT_OK(struct_builder->Reserve(n));
RETURN_NOT_OK(struct_builder->AppendValues(n, NULLPTR));
int num_fields = struct_builder->num_fields();
// Visit each column of the data frame using the associated
// field builder
for (R_xlen_t i = 0; i < num_fields; i++) {
auto column_builder = struct_builder->field_builder(i);
SEXP x_i = VECTOR_ELT(x, i);
int64_t n_i = vctrs::short_vec_size(x_i);
if (n_i != n) {
SEXP name_i = STRING_ELT(Rf_getAttrib(x, R_NamesSymbol), i);
return Status::RError("Degenerated data frame. Column '", CHAR(name_i),
"' has size ", n_i, " instead of the number of rows: ", n);
}
VectorToArrayConverter converter{x_i, column_builder};
RETURN_NOT_OK(arrow::VisitTypeInline(*column_builder->type().get(), &converter));
}
return Status::OK();
}
template <typename T>
arrow::enable_if_t<std::is_same<DictionaryType, T>::value, Status> Visit(
const T& type) {
// TODO: perhaps this replaces MakeFactorArrayImpl ?
ARROW_RETURN_IF(!Rf_isFactor(x), Status::RError("Expecting a factor"));
int64_t n = vctrs::short_vec_size(x);
auto* dict_builder = checked_cast<StringDictionaryBuilder*>(builder);
RETURN_NOT_OK(dict_builder->Reserve(n));
SEXP levels = Rf_getAttrib(x, R_LevelsSymbol);
auto memo = VectorToArrayConverter::Visit(levels, utf8());
RETURN_NOT_OK(dict_builder->InsertMemoValues(*memo));
int* p_values = INTEGER(x);
for (int64_t i = 0; i < n; i++, ++p_values) {
int v = *p_values;
if (v == NA_INTEGER) {
RETURN_NOT_OK(dict_builder->AppendNull());
} else {
RETURN_NOT_OK(dict_builder->Append(CHAR(STRING_ELT(levels, v - 1))));
}
}
return Status::OK();
}
Status Visit(const arrow::DataType& type) {
return Status::NotImplemented("Converting vector to arrow type ", type.ToString(),
" not implemented");
}
static std::shared_ptr<Array> Visit(SEXP x, const std::shared_ptr<DataType>& type) {
std::unique_ptr<ArrayBuilder> builder;
StopIfNotOk(MakeBuilder(arrow::default_memory_pool(), type, &builder));
VectorToArrayConverter converter{x, builder.get()};
StopIfNotOk(arrow::VisitTypeInline(*type, &converter));
std::shared_ptr<Array> result;
StopIfNotOk(builder->Finish(&result));
return result;
}
SEXP x;
arrow::ArrayBuilder* builder;
};
template <typename Type>
std::shared_ptr<Array> MakeFactorArrayImpl(Rcpp::IntegerVector_ factor,
const std::shared_ptr<arrow::DataType>& type) {
using value_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
auto n = factor.size();
std::shared_ptr<Buffer> indices_buffer =
ValueOrStop(AllocateBuffer(n * sizeof(value_type)));
std::vector<std::shared_ptr<Buffer>> buffers{nullptr, indices_buffer};
int64_t null_count = 0;
R_xlen_t i = 0;
auto p_factor = factor.begin();
auto p_indices = reinterpret_cast<value_type*>(indices_buffer->mutable_data());
for (; i < n; i++, ++p_indices, ++p_factor) {
if (*p_factor == NA_INTEGER) break;
*p_indices = *p_factor - 1;
}
if (i < n) {
// there are NA's so we need a null buffer
auto null_buffer = ValueOrStop(AllocateBuffer(BitUtil::BytesForBits(n)));
internal::FirstTimeBitmapWriter null_bitmap_writer(null_buffer->mutable_data(), 0, n);
// catch up
for (R_xlen_t j = 0; j < i; j++, null_bitmap_writer.Next()) {
null_bitmap_writer.Set();
}
// resume offset filling
for (; i < n; i++, ++p_indices, ++p_factor, null_bitmap_writer.Next()) {
if (*p_factor == NA_INTEGER) {
null_bitmap_writer.Clear();
null_count++;
} else {
null_bitmap_writer.Set();
*p_indices = *p_factor - 1;
}
}
null_bitmap_writer.Finish();
buffers[0] = std::move(null_buffer);
}
auto array_indices_data =
ArrayData::Make(std::make_shared<Type>(), n, std::move(buffers), null_count, 0);
auto array_indices = MakeArray(array_indices_data);
SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
auto dict = VectorToArrayConverter::Visit(levels, utf8());
return ValueOrStop(DictionaryArray::FromArrays(type, array_indices, dict));
}
std::shared_ptr<Array> MakeFactorArray(Rcpp::IntegerVector_ factor,
const std::shared_ptr<arrow::DataType>& type) {
const auto& dict_type = checked_cast<const arrow::DictionaryType&>(*type);
switch (dict_type.index_type()->id()) {
case Type::INT8:
return MakeFactorArrayImpl<arrow::Int8Type>(factor, type);
case Type::INT16:
return MakeFactorArrayImpl<arrow::Int16Type>(factor, type);
case Type::INT32:
return MakeFactorArrayImpl<arrow::Int32Type>(factor, type);
case Type::INT64:
return MakeFactorArrayImpl<arrow::Int64Type>(factor, type);
default:
Rcpp::stop(tfm::format("Cannot convert to dictionary with index_type %s",
dict_type.index_type()->ToString()));
}
}
std::shared_ptr<Array> MakeStructArray(SEXP df, const std::shared_ptr<DataType>& type) {
int n = type->num_fields();
std::vector<std::shared_ptr<Array>> children(n);
for (int i = 0; i < n; i++) {
children[i] = Array__from_vector(VECTOR_ELT(df, i), type->field(i)->type(), true);
}
int64_t rows = n ? children[0]->length() : 0;
return std::make_shared<StructArray>(type, rows, children);
}
template <typename T>
int64_t time_cast(T value);
template <>
inline int64_t time_cast<int>(int value) {
return static_cast<int64_t>(value) * 1000;
}
template <>
inline int64_t time_cast<double>(double value) {
return static_cast<int64_t>(value * 1000);
}
} // namespace r
} // namespace arrow
// ---------------- new api
namespace arrow {
namespace internal {
template <typename T, typename Target,
typename std::enable_if<std::is_signed<Target>::value, Target>::type = 0>
Status int_cast(T x, Target* out) {
if (static_cast<int64_t>(x) < std::numeric_limits<Target>::min() ||
static_cast<int64_t>(x) > std::numeric_limits<Target>::max()) {
return Status::Invalid("Value is too large to fit in C integer type");
}
*out = static_cast<Target>(x);
return Status::OK();
}
template <typename T>
struct usigned_type;
template <typename T, typename Target,
typename std::enable_if<std::is_unsigned<Target>::value, Target>::type = 0>
Status int_cast(T x, Target* out) {
// we need to compare between unsigned integers
uint64_t x64 = x;
if (x64 < 0 || x64 > std::numeric_limits<Target>::max()) {
return Status::Invalid("Value is too large to fit in C integer type");
}
*out = static_cast<Target>(x);
return Status::OK();
}
template <typename Int>
Status double_cast(Int x, double* out) {
*out = static_cast<double>(x);
return Status::OK();
}
template <>
Status double_cast<int64_t>(int64_t x, double* out) {
constexpr int64_t kDoubleMax = 1LL << 53;
constexpr int64_t kDoubleMin = -(1LL << 53);
if (x < kDoubleMin || x > kDoubleMax) {
return Status::Invalid("integer value ", x, " is outside of the range exactly",
" representable by a IEEE 754 double precision value");
}
*out = static_cast<double>(x);
return Status::OK();
}
// used for int and int64_t
template <typename T>
Status float_cast(T x, float* out) {
constexpr int64_t kHalfFloatMax = 1LL << 24;
constexpr int64_t kHalfFloatMin = -(1LL << 24);
int64_t x64 = static_cast<int64_t>(x);
if (x64 < kHalfFloatMin || x64 > kHalfFloatMax) {
return Status::Invalid("integer value ", x, " is outside of the range exactly",
" representable by a IEEE 754 half precision value");
}
*out = static_cast<float>(x);
return Status::OK();
}
template <>
Status float_cast<double>(double x, float* out) {
// TODO: is there some sort of floating point overflow ?
*out = static_cast<float>(x);
return Status::OK();
}
} // namespace internal
namespace r {
class VectorConverter;
Status GetConverter(const std::shared_ptr<DataType>& type,
std::unique_ptr<VectorConverter>* out);
class VectorConverter {
public:
virtual ~VectorConverter() = default;
virtual Status Init(ArrayBuilder* builder) = 0;
virtual Status Ingest(SEXP obj) = 0;
virtual Status GetResult(std::shared_ptr<arrow::Array>* result) {
return builder_->Finish(result);
}
ArrayBuilder* builder() const { return builder_; }
protected:
ArrayBuilder* builder_;
};
class NullVectorConverter : public VectorConverter {
public:
using BuilderType = NullBuilder;
~NullVectorConverter() {}
Status Init(ArrayBuilder* builder) override {
builder_ = builder;
typed_builder_ = checked_cast<BuilderType*>(builder_);
return Status::OK();
}
Status Ingest(SEXP obj) override {
RETURN_NOT_OK(typed_builder_->AppendNulls(XLENGTH(obj)));
return Status::OK();
}
protected:
BuilderType* typed_builder_;
};
template <typename Type, typename Enable = void>
struct Unbox {};
// unboxer for int type
template <typename Type>
struct Unbox<Type, enable_if_integer<Type>> {
using BuilderType = typename TypeTraits<Type>::BuilderType;
using ArrayType = typename TypeTraits<Type>::ArrayType;
using CType = typename ArrayType::value_type;
static inline Status Ingest(BuilderType* builder, SEXP obj) {
switch (TYPEOF(obj)) {
case INTSXP:
return IngestRange<int>(builder, INTEGER(obj), XLENGTH(obj));
case REALSXP:
if (Rf_inherits(obj, "integer64")) {
return IngestRange<int64_t>(builder, reinterpret_cast<int64_t*>(REAL(obj)),
XLENGTH(obj));
}
return IngestRange(builder, REAL(obj), XLENGTH(obj));
// TODO: handle raw and logical
default:
break;
}
return Status::Invalid(
tfm::format("Cannot convert R vector of type %s to integer Arrow array",
Rcpp::type2name(obj)));
}
template <typename T>
static inline Status IngestRange(BuilderType* builder, T* p, R_xlen_t n) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (is_na<T>(*p)) {
builder->UnsafeAppendNull();
} else {
CType value = 0;
RETURN_NOT_OK(internal::int_cast(*p, &value));
builder->UnsafeAppend(value);
}
}
return Status::OK();
}
};
template <>
struct Unbox<DoubleType> {
static inline Status Ingest(DoubleBuilder* builder, SEXP obj) {
switch (TYPEOF(obj)) {
// TODO: handle RAW
case INTSXP:
return IngestIntRange<int>(builder, INTEGER(obj), XLENGTH(obj), NA_INTEGER);
case REALSXP:
if (Rf_inherits(obj, "integer64")) {
return IngestIntRange<int64_t>(builder, reinterpret_cast<int64_t*>(REAL(obj)),
XLENGTH(obj), NA_INT64);
}
return IngestDoubleRange(builder, REAL(obj), XLENGTH(obj));
}
return Status::Invalid("Cannot convert R object to double type");
}
template <typename T>
static inline Status IngestIntRange(DoubleBuilder* builder, T* p, R_xlen_t n, T na) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (*p == NA_INTEGER) {
builder->UnsafeAppendNull();
} else {
double value = 0;
RETURN_NOT_OK(internal::double_cast(*p, &value));
builder->UnsafeAppend(value);
}
}
return Status::OK();
}
static inline Status IngestDoubleRange(DoubleBuilder* builder, double* p, R_xlen_t n) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (ISNA(*p)) {
builder->UnsafeAppendNull();
} else {
builder->UnsafeAppend(*p);
}
}
return Status::OK();
}
};
template <>
struct Unbox<FloatType> {
static inline Status Ingest(FloatBuilder* builder, SEXP obj) {
switch (TYPEOF(obj)) {
// TODO: handle RAW
case INTSXP:
return IngestIntRange<int>(builder, INTEGER(obj), XLENGTH(obj), NA_INTEGER);
case REALSXP:
if (Rf_inherits(obj, "integer64")) {
return IngestIntRange<int64_t>(builder, reinterpret_cast<int64_t*>(REAL(obj)),
XLENGTH(obj), NA_INT64);
}
return IngestDoubleRange(builder, REAL(obj), XLENGTH(obj));
}
return Status::Invalid("Cannot convert R object to double type");
}
template <typename T>
static inline Status IngestIntRange(FloatBuilder* builder, T* p, R_xlen_t n, T na) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (*p == NA_INTEGER) {
builder->UnsafeAppendNull();
} else {
float value = 0;
RETURN_NOT_OK(internal::float_cast(*p, &value));
builder->UnsafeAppend(value);
}
}
return Status::OK();
}
static inline Status IngestDoubleRange(FloatBuilder* builder, double* p, R_xlen_t n) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (ISNA(*p)) {
builder->UnsafeAppendNull();
} else {
float value;
RETURN_NOT_OK(internal::float_cast(*p, &value));
builder->UnsafeAppend(value);
}
}
return Status::OK();
}
};
template <>
struct Unbox<BooleanType> {
static inline Status Ingest(BooleanBuilder* builder, SEXP obj) {
switch (TYPEOF(obj)) {
case LGLSXP: {
R_xlen_t n = XLENGTH(obj);
RETURN_NOT_OK(builder->Resize(n));
int* p = LOGICAL(obj);
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (*p == NA_LOGICAL) {
builder->UnsafeAppendNull();
} else {
builder->UnsafeAppend(*p == 1);
}
}
return Status::OK();
}
default:
break;
}
// TODO: include more information about the R object and the target type
return Status::Invalid("Cannot convert R object to boolean type");
}
};
template <>
struct Unbox<Date32Type> {
static inline Status Ingest(Date32Builder* builder, SEXP obj) {
switch (TYPEOF(obj)) {
case INTSXP:
if (Rf_inherits(obj, "Date")) {
return IngestIntRange(builder, INTEGER(obj), XLENGTH(obj));
}
break;
case REALSXP:
if (Rf_inherits(obj, "Date")) {
return IngestDoubleRange(builder, REAL(obj), XLENGTH(obj));
}
break;
default:
break;
}
return Status::Invalid("Cannot convert R object to date32 type");
}
static inline Status IngestIntRange(Date32Builder* builder, int* p, R_xlen_t n) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (*p == NA_INTEGER) {
builder->UnsafeAppendNull();
} else {
builder->UnsafeAppend(*p);
}
}
return Status::OK();
}
static inline Status IngestDoubleRange(Date32Builder* builder, double* p, R_xlen_t n) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (ISNA(*p)) {
builder->UnsafeAppendNull();
} else {
builder->UnsafeAppend(static_cast<int>(*p));
}
}
return Status::OK();
}
};
template <>
struct Unbox<Date64Type> {
constexpr static int64_t kMillisecondsPerDay = 86400000;
static inline Status Ingest(Date64Builder* builder, SEXP obj) {
switch (TYPEOF(obj)) {
case INTSXP:
// number of days since epoch
if (Rf_inherits(obj, "Date")) {
return IngestDateInt32Range(builder, INTEGER(obj), XLENGTH(obj));
}
break;
case REALSXP:
// (fractional number of days since epoch)
if (Rf_inherits(obj, "Date")) {
return IngestDateDoubleRange<kMillisecondsPerDay>(builder, REAL(obj),
XLENGTH(obj));
}
// number of seconds since epoch
if (Rf_inherits(obj, "POSIXct")) {
return IngestDateDoubleRange<1000>(builder, REAL(obj), XLENGTH(obj));
}
}
return Status::Invalid("Cannot convert R object to date64 type");
}
// ingest a integer vector that represents number of days since epoch
static inline Status IngestDateInt32Range(Date64Builder* builder, int* p, R_xlen_t n) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (*p == NA_INTEGER) {
builder->UnsafeAppendNull();
} else {
builder->UnsafeAppend(*p * kMillisecondsPerDay);
}
}
return Status::OK();
}
// ingest a numeric vector that represents (fractional) number of days since epoch
template <int64_t MULTIPLIER>
static inline Status IngestDateDoubleRange(Date64Builder* builder, double* p,
R_xlen_t n) {
RETURN_NOT_OK(builder->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (ISNA(*p)) {
builder->UnsafeAppendNull();
} else {
builder->UnsafeAppend(static_cast<int64_t>(*p * MULTIPLIER));
}
}
return Status::OK();
}
};
template <typename Type, class Derived>
class TypedVectorConverter : public VectorConverter {
public:
using BuilderType = typename TypeTraits<Type>::BuilderType;
Status Init(ArrayBuilder* builder) override {
builder_ = builder;
typed_builder_ = checked_cast<BuilderType*>(builder_);
return Status::OK();
}
Status Ingest(SEXP obj) override { return Unbox<Type>::Ingest(typed_builder_, obj); }
protected:
BuilderType* typed_builder_;
};
template <typename Type>
class NumericVectorConverter
: public TypedVectorConverter<Type, NumericVectorConverter<Type>> {};
class BooleanVectorConverter
: public TypedVectorConverter<BooleanType, BooleanVectorConverter> {};
class Date32Converter : public TypedVectorConverter<Date32Type, Date32Converter> {};
class Date64Converter : public TypedVectorConverter<Date64Type, Date64Converter> {};
inline int64_t get_time_multiplier(TimeUnit::type unit) {
switch (unit) {
case TimeUnit::SECOND:
return 1;
case TimeUnit::MILLI:
return 1000;
case TimeUnit::MICRO:
return 1000000;
case TimeUnit::NANO:
return 1000000000;
default:
return 0;
}
}
template <typename Type>
class TimeConverter : public VectorConverter {
using BuilderType = typename TypeTraits<Type>::BuilderType;
public:
explicit TimeConverter(TimeUnit::type unit)
: unit_(unit), multiplier_(get_time_multiplier(unit)) {}
Status Init(ArrayBuilder* builder) override {
builder_ = builder;
typed_builder_ = checked_cast<BuilderType*>(builder);
return Status::OK();
}
Status Ingest(SEXP obj) override {
if (valid_R_object(obj)) {
int difftime_multiplier;
RETURN_NOT_OK(GetDifftimeMultiplier(obj, &difftime_multiplier));
return Ingest_POSIXct(REAL(obj), XLENGTH(obj), difftime_multiplier);
}
return Status::Invalid("Cannot convert R object to timestamp type");
}
protected:
TimeUnit::type unit_;
BuilderType* typed_builder_;
int64_t multiplier_;
Status Ingest_POSIXct(double* p, R_xlen_t n, int difftime_multiplier) {
RETURN_NOT_OK(typed_builder_->Resize(n));
for (R_xlen_t i = 0; i < n; i++, ++p) {
if (ISNA(*p)) {
typed_builder_->UnsafeAppendNull();
} else {
typed_builder_->UnsafeAppend(
static_cast<int64_t>(*p * multiplier_ * difftime_multiplier));
}
}
return Status::OK();
}
virtual bool valid_R_object(SEXP obj) = 0;
// only used for Time32 and Time64
virtual Status GetDifftimeMultiplier(SEXP obj, int* res) {
std::string unit(CHAR(STRING_ELT(Rf_getAttrib(obj, symbols::units), 0)));
if (unit == "secs") {
*res = 1;
} else if (unit == "mins") {
*res = 60;
} else if (unit == "hours") {
*res = 3600;
} else if (unit == "days") {
*res = 86400;
} else if (unit == "weeks") {
*res = 604800;
} else {
return Status::Invalid("unknown difftime unit");
}
return Status::OK();
}
};
class TimestampConverter : public TimeConverter<TimestampType> {
public:
explicit TimestampConverter(TimeUnit::type unit) : TimeConverter<TimestampType>(unit) {}
protected:
bool valid_R_object(SEXP obj) override {
return TYPEOF(obj) == REALSXP && Rf_inherits(obj, "POSIXct");
}
Status GetDifftimeMultiplier(SEXP obj, int* res) override {
*res = 1;
return Status::OK();
}
};
class Time32Converter : public TimeConverter<Time32Type> {
public:
explicit Time32Converter(TimeUnit::type unit) : TimeConverter<Time32Type>(unit) {}
protected:
bool valid_R_object(SEXP obj) override {
return TYPEOF(obj) == REALSXP && Rf_inherits(obj, "difftime");
}
};
class Time64Converter : public TimeConverter<Time64Type> {
public:
explicit Time64Converter(TimeUnit::type unit) : TimeConverter<Time64Type>(unit) {}
protected:
bool valid_R_object(SEXP obj) override {
return TYPEOF(obj) == REALSXP && Rf_inherits(obj, "difftime");
}
};
template <typename Builder>
class BinaryVectorConverter : public VectorConverter {
public:
~BinaryVectorConverter() {}
Status Init(ArrayBuilder* builder) {
typed_builder_ = checked_cast<Builder*>(builder);
return Status::OK();
}
Status Ingest(SEXP obj) {
ARROW_RETURN_IF(TYPEOF(obj) != VECSXP, Status::RError("Expecting a list"));
R_xlen_t n = XLENGTH(obj);
// Reserve enough space before appending
int64_t size = 0;
for (R_xlen_t i = 0; i < n; i++) {
SEXP obj_i = VECTOR_ELT(obj, i);
if (!Rf_isNull(obj_i)) {
ARROW_RETURN_IF(TYPEOF(obj_i) != RAWSXP,
Status::RError("Expecting a raw vector"));
size += XLENGTH(obj_i);
}
}
RETURN_NOT_OK(typed_builder_->Reserve(size));
// append
for (R_xlen_t i = 0; i < n; i++) {
SEXP obj_i = VECTOR_ELT(obj, i);
if (Rf_isNull(obj_i)) {
RETURN_NOT_OK(typed_builder_->AppendNull());
} else {
RETURN_NOT_OK(typed_builder_->Append(RAW(obj_i), XLENGTH(obj_i)));
}
}
return Status::OK();
}
Status GetResult(std::shared_ptr<arrow::Array>* result) {
return typed_builder_->Finish(result);
}
private:
Builder* typed_builder_;
};
class FixedSizeBinaryVectorConverter : public VectorConverter {
public:
~FixedSizeBinaryVectorConverter() {}
Status Init(ArrayBuilder* builder) {
typed_builder_ = checked_cast<FixedSizeBinaryBuilder*>(builder);
return Status::OK();
}
Status Ingest(SEXP obj) {
ARROW_RETURN_IF(TYPEOF(obj) != VECSXP, Status::RError("Expecting a list"));
R_xlen_t n = XLENGTH(obj);
// Reserve enough space before appending
int32_t byte_width = typed_builder_->byte_width();
for (R_xlen_t i = 0; i < n; i++) {
SEXP obj_i = VECTOR_ELT(obj, i);
if (!Rf_isNull(obj_i)) {
ARROW_RETURN_IF(TYPEOF(obj_i) != RAWSXP,
Status::RError("Expecting a raw vector"));
ARROW_RETURN_IF(XLENGTH(obj_i) != byte_width,
Status::RError("Expecting a raw vector of ", byte_width,
" bytes, not ", XLENGTH(obj_i)));
}
}
RETURN_NOT_OK(typed_builder_->Reserve(n * byte_width));
// append
for (R_xlen_t i = 0; i < n; i++) {
SEXP obj_i = VECTOR_ELT(obj, i);
if (Rf_isNull(obj_i)) {
RETURN_NOT_OK(typed_builder_->AppendNull());
} else {
RETURN_NOT_OK(typed_builder_->Append(RAW(obj_i)));
}
}
return Status::OK();
}
Status GetResult(std::shared_ptr<arrow::Array>* result) {
return typed_builder_->Finish(result);
}
private:
FixedSizeBinaryBuilder* typed_builder_;
};
template <typename Builder>
class StringVectorConverter : public VectorConverter {
public:
~StringVectorConverter() {}
Status Init(ArrayBuilder* builder) {
typed_builder_ = checked_cast<Builder*>(builder);
return Status::OK();
}
Status Ingest(SEXP obj) {
ARROW_RETURN_IF(TYPEOF(obj) != STRSXP,
Status::RError("Expecting a character vector"));
R_xlen_t n = XLENGTH(obj);
// Reserve enough space before appending
int64_t size = 0;
for (R_xlen_t i = 0; i < n; i++) {
SEXP string_i = STRING_ELT(obj, i);
if (string_i != NA_STRING) {
size += XLENGTH(Rf_mkCharCE(Rf_translateCharUTF8(string_i), CE_UTF8));
}
}
RETURN_NOT_OK(typed_builder_->Reserve(size));
// append
for (R_xlen_t i = 0; i < n; i++) {
SEXP string_i = STRING_ELT(obj, i);
if (string_i == NA_STRING) {
RETURN_NOT_OK(typed_builder_->AppendNull());
} else {
// Make sure we're ingesting UTF-8
string_i = Rf_mkCharCE(Rf_translateCharUTF8(string_i), CE_UTF8);
RETURN_NOT_OK(typed_builder_->Append(CHAR(string_i), XLENGTH(string_i)));
}
}
return Status::OK();
}
Status GetResult(std::shared_ptr<arrow::Array>* result) {
return typed_builder_->Finish(result);
}
private:
Builder* typed_builder_;
};
#define NUMERIC_CONVERTER(TYPE_ENUM, TYPE) \
case Type::TYPE_ENUM: \
*out = \
std::unique_ptr<NumericVectorConverter<TYPE>>(new NumericVectorConverter<TYPE>); \
return Status::OK()
#define SIMPLE_CONVERTER_CASE(TYPE_ENUM, TYPE) \
case Type::TYPE_ENUM: \
*out = std::unique_ptr<TYPE>(new TYPE); \
return Status::OK()
#define TIME_CONVERTER_CASE(TYPE_ENUM, DATA_TYPE, TYPE) \
case Type::TYPE_ENUM: \
*out = \
std::unique_ptr<TYPE>(new TYPE(checked_cast<DATA_TYPE*>(type.get())->unit())); \
return Status::OK()
Status GetConverter(const std::shared_ptr<DataType>& type,
std::unique_ptr<VectorConverter>* out) {
switch (type->id()) {
SIMPLE_CONVERTER_CASE(BINARY, BinaryVectorConverter<arrow::BinaryBuilder>);
SIMPLE_CONVERTER_CASE(LARGE_BINARY, BinaryVectorConverter<arrow::LargeBinaryBuilder>);
SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryVectorConverter);
SIMPLE_CONVERTER_CASE(BOOL, BooleanVectorConverter);
SIMPLE_CONVERTER_CASE(STRING, StringVectorConverter<arrow::StringBuilder>);
SIMPLE_CONVERTER_CASE(LARGE_STRING, StringVectorConverter<arrow::LargeStringBuilder>);
NUMERIC_CONVERTER(INT8, Int8Type);
NUMERIC_CONVERTER(INT16, Int16Type);
NUMERIC_CONVERTER(INT32, Int32Type);
NUMERIC_CONVERTER(INT64, Int64Type);
NUMERIC_CONVERTER(UINT8, UInt8Type);
NUMERIC_CONVERTER(UINT16, UInt16Type);
NUMERIC_CONVERTER(UINT32, UInt32Type);
NUMERIC_CONVERTER(UINT64, UInt64Type);
// TODO: not sure how to handle half floats
// the python code uses npy_half
// NUMERIC_CONVERTER(HALF_FLOAT, HalfFloatType);
NUMERIC_CONVERTER(FLOAT, FloatType);
NUMERIC_CONVERTER(DOUBLE, DoubleType);
SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
// TODO: probably after we merge ARROW-3628
// case Type::DECIMAL:
TIME_CONVERTER_CASE(TIME32, Time32Type, Time32Converter);
TIME_CONVERTER_CASE(TIME64, Time64Type, Time64Converter);
TIME_CONVERTER_CASE(TIMESTAMP, TimestampType, TimestampConverter);
case Type::NA:
*out = std::unique_ptr<NullVectorConverter>(new NullVectorConverter);
return Status::OK();
default:
break;
}
return Status::NotImplemented("type not implemented");
}
static inline std::shared_ptr<arrow::DataType> IndexTypeForFactors(int n_factors) {
if (n_factors < INT8_MAX) {
return arrow::int8();
} else if (n_factors < INT16_MAX) {
return arrow::int16();
} else {
return arrow::int32();
}
}
std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP factor) {
SEXP factors = Rf_getAttrib(factor, R_LevelsSymbol);
auto index_type = IndexTypeForFactors(Rf_length(factors));
bool is_ordered = Rf_inherits(factor, "ordered");
return dictionary(index_type, arrow::utf8(), is_ordered);
}
template <int VectorType>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector(SEXP x) {
Rcpp::stop("Unknown vector type: ", VectorType);
}
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<ENVSXP>(SEXP x) {
if (Rf_inherits(x, "Array")) {
Rcpp::ConstReferenceSmartPtrInputParameter<std::shared_ptr<arrow::Array>> array(x);
return static_cast<std::shared_ptr<arrow::Array>>(array)->type();
}
Rcpp::stop("Unrecognized vector instance for type ENVSXP");
}
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<LGLSXP>(SEXP x) {
return Rf_inherits(x, "vctrs_unspecified") ? null() : boolean();
}
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<INTSXP>(SEXP x) {
if (Rf_isFactor(x)) {
return InferArrowTypeFromFactor(x);
} else if (Rf_inherits(x, "Date")) {
return date32();
} else if (Rf_inherits(x, "POSIXct")) {
auto tzone_sexp = Rf_getAttrib(x, symbols::tzone);
if (Rf_isNull(tzone_sexp)) {
return timestamp(TimeUnit::MICRO);
} else {
return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(tzone_sexp, 0)));
}
}
return int32();
}
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<REALSXP>(SEXP x) {
if (Rf_inherits(x, "Date")) {
return date32();
}
if (Rf_inherits(x, "POSIXct")) {
auto tzone_sexp = Rf_getAttrib(x, symbols::tzone);
if (Rf_isNull(tzone_sexp)) {
return timestamp(TimeUnit::MICRO);
} else {
return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(tzone_sexp, 0)));
}
}
if (Rf_inherits(x, "integer64")) {
return int64();
}
if (Rf_inherits(x, "difftime")) {
return time32(TimeUnit::SECOND);
}
return float64();
}
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<STRSXP>(SEXP x) {
// See how big the character vector is
R_xlen_t n = XLENGTH(x);
int64_t size = 0;
for (R_xlen_t i = 0; i < n; i++) {
SEXP string_i = STRING_ELT(x, i);
if (string_i != NA_STRING) {
size += XLENGTH(Rf_mkCharCE(Rf_translateCharUTF8(string_i), CE_UTF8));
}
if (size > arrow::kBinaryMemoryLimit) {
// Exceeds 2GB capacity of utf8 type, so use large
return large_utf8();
}
}
return utf8();
}
static inline std::shared_ptr<arrow::DataType> InferArrowTypeFromDataFrame(SEXP x) {
R_xlen_t n = XLENGTH(x);
SEXP names = Rf_getAttrib(x, R_NamesSymbol);
std::vector<std::shared_ptr<arrow::Field>> fields(n);
for (R_xlen_t i = 0; i < n; i++) {
// Make sure we're ingesting UTF-8
const auto* field_name =
CHAR(Rf_mkCharCE(Rf_translateCharUTF8(STRING_ELT(names, i)), CE_UTF8));
fields[i] = arrow::field(field_name, InferArrowType(VECTOR_ELT(x, i)));
}
return arrow::struct_(std::move(fields));
}
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<VECSXP>(SEXP x) {
if (Rf_inherits(x, "data.frame") || Rf_inherits(x, "POSIXlt")) {
return InferArrowTypeFromDataFrame(x);
} else {
// some known special cases
if (Rf_inherits(x, "arrow_fixed_size_binary")) {
SEXP byte_width = Rf_getAttrib(x, symbols::byte_width);
if (Rf_isNull(byte_width) || TYPEOF(byte_width) != INTSXP ||
XLENGTH(byte_width) != 1) {
Rcpp::stop("malformed arrow_fixed_size_binary object");
}
return arrow::fixed_size_binary(INTEGER(byte_width)[0]);
}
if (Rf_inherits(x, "arrow_binary")) {
return arrow::binary();
}
if (Rf_inherits(x, "arrow_large_binary")) {
return arrow::large_binary();
}
SEXP ptype = Rf_getAttrib(x, symbols::ptype);
if (Rf_isNull(ptype)) {
if (XLENGTH(x) == 0) {
Rcpp::stop(
"Requires at least one element to infer the values' type of a list vector");
}
ptype = VECTOR_ELT(x, 0);
}
return arrow::list(InferArrowType(ptype));
}
}
std::shared_ptr<arrow::DataType> InferArrowType(SEXP x) {
switch (TYPEOF(x)) {
case ENVSXP:
return InferArrowTypeFromVector<ENVSXP>(x);
case LGLSXP:
return InferArrowTypeFromVector<LGLSXP>(x);
case INTSXP:
return InferArrowTypeFromVector<INTSXP>(x);
case REALSXP:
return InferArrowTypeFromVector<REALSXP>(x);
case RAWSXP:
return int8();
case STRSXP:
return InferArrowTypeFromVector<STRSXP>(x);
case VECSXP:
return InferArrowTypeFromVector<VECSXP>(x);
default:
break;
}
Rcpp::stop("Cannot infer type from vector");
}
// in some situations we can just use the memory of the R object in an RBuffer
// instead of going through ArrayBuilder, etc ...
bool can_reuse_memory(SEXP x, const std::shared_ptr<arrow::DataType>& type) {
switch (type->id()) {
case Type::INT32:
return TYPEOF(x) == INTSXP && !OBJECT(x);
case Type::DOUBLE:
return TYPEOF(x) == REALSXP && !OBJECT(x);
case Type::INT8:
return TYPEOF(x) == RAWSXP && !OBJECT(x);
case Type::INT64:
return TYPEOF(x) == REALSXP && Rf_inherits(x, "integer64");
default:
break;
}
return false;
}
// this is only used on some special cases when the arrow Array can just use the memory of
// the R object, via an RBuffer, hence be zero copy
template <int RTYPE, typename Type>
std::shared_ptr<Array> MakeSimpleArray(SEXP x) {
using value_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
Rcpp::Vector<RTYPE, Rcpp::NoProtectStorage> vec(x);
auto n = vec.size();
auto p_vec_start = reinterpret_cast<value_type*>(vec.begin());
auto p_vec_end = p_vec_start + n;
std::vector<std::shared_ptr<Buffer>> buffers{nullptr,
std::make_shared<RBuffer<RTYPE>>(vec)};
int null_count = 0;
auto first_na = std::find_if(p_vec_start, p_vec_end, is_na<value_type>);
if (first_na < p_vec_end) {
auto null_bitmap = ValueOrStop(AllocateBuffer(BitUtil::BytesForBits(n)));
internal::FirstTimeBitmapWriter bitmap_writer(null_bitmap->mutable_data(), 0, n);
// first loop to clear all the bits before the first NA
auto j = std::distance(p_vec_start, first_na);
int i = 0;
for (; i < j; i++, bitmap_writer.Next()) {
bitmap_writer.Set();
}
auto p_vec = first_na;
// then finish
for (; i < n; i++, bitmap_writer.Next(), ++p_vec) {
if (is_na<value_type>(*p_vec)) {
bitmap_writer.Clear();
null_count++;
} else {
bitmap_writer.Set();
}
}
bitmap_writer.Finish();
buffers[0] = std::move(null_bitmap);
}
auto data = ArrayData::Make(std::make_shared<Type>(), LENGTH(x), std::move(buffers),
null_count, 0 /*offset*/);
// return the right Array class
return std::make_shared<typename TypeTraits<Type>::ArrayType>(data);
}
std::shared_ptr<arrow::Array> Array__from_vector_reuse_memory(SEXP x) {
auto type = TYPEOF(x);
if (type == INTSXP) {
return MakeSimpleArray<INTSXP, Int32Type>(x);
} else if (type == REALSXP && Rf_inherits(x, "integer64")) {
return MakeSimpleArray<REALSXP, Int64Type>(x);
} else if (type == REALSXP) {
return MakeSimpleArray<REALSXP, DoubleType>(x);
} else if (type == RAWSXP) {
return MakeSimpleArray<RAWSXP, UInt8Type>(x);
}
Rcpp::stop("Unreachable: you might need to fix can_reuse_memory()");
}
bool CheckCompatibleFactor(SEXP obj, const std::shared_ptr<arrow::DataType>& type) {
if (!Rf_inherits(obj, "factor")) {
return false;
}
const auto& dict_type = checked_cast<const arrow::DictionaryType&>(*type);
return dict_type.value_type()->Equals(utf8());
}
arrow::Status CheckCompatibleStruct(SEXP obj,
const std::shared_ptr<arrow::DataType>& type) {
if (!Rf_inherits(obj, "data.frame")) {
return Status::RError("Conversion to struct arrays requires a data.frame");
}
// check the number of columns
int num_fields = type->num_fields();
if (XLENGTH(obj) != num_fields) {
return Status::RError("Number of fields in struct (", num_fields,
") incompatible with number of columns in the data frame (",
XLENGTH(obj), ")");
}
// check the names of each column
//
// the columns themselves are not checked against the
// types of the fields, because Array__from_vector will error
// when not compatible.
SEXP names = Rf_getAttrib(obj, R_NamesSymbol);
SEXP name_i;
for (int i = 0; i < num_fields; i++) {
name_i = Rf_mkCharCE(Rf_translateCharUTF8(STRING_ELT(names, i)), CE_UTF8);
if (type->field(i)->name() != CHAR(name_i)) {
return Status::RError("Field name in position ", i, " (", type->field(i)->name(),
") does not match the name of the column of the data frame (",
CHAR(name_i), ")");
}
}
return Status::OK();
}
std::shared_ptr<arrow::Array> Array__from_vector(
SEXP x, const std::shared_ptr<arrow::DataType>& type, bool type_inferred) {
// short circuit if `x` is already an Array
if (Rf_inherits(x, "Array")) {
return Rcpp::ConstReferenceSmartPtrInputParameter<std::shared_ptr<arrow::Array>>(x);
}
// special case when we can just use the data from the R vector
// directly. This still needs to handle the null bitmap
if (arrow::r::can_reuse_memory(x, type)) {
return arrow::r::Array__from_vector_reuse_memory(x);
}
// factors only when type has been inferred
if (type->id() == Type::DICTIONARY) {
if (type_inferred || arrow::r::CheckCompatibleFactor(x, type)) {
// TODO: use VectorToArrayConverter instead, but it does not appear to work
// correctly with ordered dictionary yet
//
// return VectorToArrayConverter::Visit(x, type);
return arrow::r::MakeFactorArray(x, type);
}
Rcpp::stop("Object incompatible with dictionary type");
}
if (type->id() == Type::LIST || type->id() == Type::LARGE_LIST ||
type->id() == Type::FIXED_SIZE_LIST) {
return VectorToArrayConverter::Visit(x, type);
}
// struct types
if (type->id() == Type::STRUCT) {
if (!type_inferred) {
StopIfNotOk(arrow::r::CheckCompatibleStruct(x, type));
}
// TODO: when the type has been infered, we could go through
// VectorToArrayConverter:
//
// else {
// return VectorToArrayConverter::Visit(df, type);
// }
return arrow::r::MakeStructArray(x, type);
}
// general conversion with converter and builder
std::unique_ptr<arrow::r::VectorConverter> converter;
StopIfNotOk(arrow::r::GetConverter(type, &converter));
// Create ArrayBuilder for type
std::unique_ptr<arrow::ArrayBuilder> type_builder;
StopIfNotOk(arrow::MakeBuilder(arrow::default_memory_pool(), type, &type_builder));
StopIfNotOk(converter->Init(type_builder.get()));
// ingest R data and grab the result array
StopIfNotOk(converter->Ingest(x));
std::shared_ptr<arrow::Array> result;
StopIfNotOk(converter->GetResult(&result));
return result;
}
} // namespace r
} // namespace arrow
// [[arrow::export]]
std::shared_ptr<arrow::DataType> Array__infer_type(SEXP x) {
return arrow::r::InferArrowType(x);
}
// [[arrow::export]]
std::shared_ptr<arrow::Array> Array__from_vector(SEXP x, SEXP s_type) {
// the type might be NULL, in which case we need to infer it from the data
// we keep track of whether it was inferred or supplied
bool type_inferred = Rf_isNull(s_type);
std::shared_ptr<arrow::DataType> type;
if (type_inferred) {
type = arrow::r::InferArrowType(x);
} else {
type = arrow::r::extract<arrow::DataType>(s_type);
}
return arrow::r::Array__from_vector(x, type, type_inferred);
}
// [[arrow::export]]
std::shared_ptr<arrow::ChunkedArray> ChunkedArray__from_list(Rcpp::List chunks,
SEXP s_type) {
std::vector<std::shared_ptr<arrow::Array>> vec;
// the type might be NULL, in which case we need to infer it from the data
// we keep track of whether it was inferred or supplied
bool type_inferred = Rf_isNull(s_type);
R_xlen_t n = XLENGTH(chunks);
std::shared_ptr<arrow::DataType> type;
if (type_inferred) {
if (n == 0) {
Rcpp::stop("type must be specified for empty list");
}
type = arrow::r::InferArrowType(VECTOR_ELT(chunks, 0));
} else {
type = arrow::r::extract<arrow::DataType>(s_type);
}
if (n == 0) {
std::shared_ptr<arrow::Array> array;
std::unique_ptr<arrow::ArrayBuilder> type_builder;
StopIfNotOk(arrow::MakeBuilder(arrow::default_memory_pool(), type, &type_builder));
StopIfNotOk(type_builder->Finish(&array));
vec.push_back(array);
} else {
// the first - might differ from the rest of the loop
// because we might have inferred the type from the first element of the list
//
// this only really matters for dictionary arrays
vec.push_back(
arrow::r::Array__from_vector(VECTOR_ELT(chunks, 0), type, type_inferred));
for (R_xlen_t i = 1; i < n; i++) {
vec.push_back(arrow::r::Array__from_vector(VECTOR_ELT(chunks, i), type, false));
}
}
return std::make_shared<arrow::ChunkedArray>(std::move(vec));
}
// [[arrow::export]]
std::shared_ptr<arrow::Array> DictionaryArray__FromArrays(
const std::shared_ptr<arrow::DataType>& type,
const std::shared_ptr<arrow::Array>& indices,
const std::shared_ptr<arrow::Array>& dict) {
return ValueOrStop(arrow::DictionaryArray::FromArrays(type, indices, dict));
}
#endif