blob: e11e1f7d56f944e71552b74a1178a5d322690627 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow_types.h"
using namespace Rcpp;
using namespace arrow;
namespace arrow {
namespace r {
template <int RTYPE, typename Vec = Rcpp::Vector<RTYPE>>
class SimpleRBuffer : public Buffer {
public:
SimpleRBuffer(Vec vec)
: Buffer(reinterpret_cast<const uint8_t*>(vec.begin()),
vec.size() * sizeof(typename Vec::stored_type)),
vec_(vec) {}
private:
// vec_ holds the memory
Vec vec_;
};
template <int RTYPE, typename Type>
std::shared_ptr<Array> SimpleArray(SEXP x) {
Rcpp::Vector<RTYPE> vec(x);
auto n = vec.size();
std::vector<std::shared_ptr<Buffer>> buffers{
nullptr, std::make_shared<SimpleRBuffer<RTYPE>>(vec)};
int null_count = 0;
if (RTYPE != RAWSXP) {
std::shared_ptr<Buffer> null_bitmap;
auto first_na = std::find_if(vec.begin(), vec.end(), Rcpp::Vector<RTYPE>::is_na);
if (first_na < vec.end()) {
R_ERROR_NOT_OK(AllocateBuffer(ceil((double)n / 8), &null_bitmap));
internal::FirstTimeBitmapWriter bitmap_writer(null_bitmap->mutable_data(), 0, n);
// first loop to clear all the bits before the first NA
auto j = std::distance(vec.begin(), first_na);
int i = 0;
for (; i < j; i++, bitmap_writer.Next()) {
bitmap_writer.Set();
}
// then finish
for (; i < n; i++, bitmap_writer.Next()) {
if (Rcpp::Vector<RTYPE>::is_na(vec[i])) {
bitmap_writer.Clear();
null_count++;
} else {
bitmap_writer.Set();
}
}
bitmap_writer.Finish();
buffers[0] = std::move(null_bitmap);
}
}
auto data = ArrayData::Make(
std::make_shared<Type>(), LENGTH(x), std::move(buffers), null_count, 0 /*offset*/
);
// return the right Array class
return std::make_shared<typename TypeTraits<Type>::ArrayType>(data);
}
std::shared_ptr<arrow::Array> MakeBooleanArray(LogicalVector_ vec) {
R_xlen_t n = vec.size();
// allocate a buffer for the data
std::shared_ptr<Buffer> data_bitmap;
R_ERROR_NOT_OK(AllocateBuffer(ceil((double)n / 8), &data_bitmap));
auto data_bitmap_data = data_bitmap->mutable_data();
internal::FirstTimeBitmapWriter bitmap_writer(data_bitmap_data, 0, n);
R_xlen_t null_count = 0;
// loop until the first no null
R_xlen_t i = 0;
for (; i < n; i++, bitmap_writer.Next()) {
if (vec[i] == 0) {
bitmap_writer.Clear();
} else if (vec[i] == NA_LOGICAL) {
break;
} else {
bitmap_writer.Set();
}
}
std::shared_ptr<arrow::Buffer> null_bitmap(nullptr);
if (i < n) {
// there has been a null before the end, so we need
// to collect that information in a null bitmap
R_ERROR_NOT_OK(AllocateBuffer(ceil((double)n / 8), &null_bitmap));
auto null_bitmap_data = null_bitmap->mutable_data();
internal::FirstTimeBitmapWriter null_bitmap_writer(null_bitmap_data, 0, n);
// catch up on the initial `i` bits
for (R_xlen_t j = 0; j < i; j++, null_bitmap_writer.Next()) {
null_bitmap_writer.Set();
}
// finish both bitmaps
for (; i < n; i++, bitmap_writer.Next(), null_bitmap_writer.Next()) {
if (vec[i] == 0) {
bitmap_writer.Clear();
null_bitmap_writer.Set();
} else if (vec[i] == NA_LOGICAL) {
null_bitmap_writer.Clear();
null_count++;
} else {
bitmap_writer.Set();
null_bitmap_writer.Set();
}
}
null_bitmap_writer.Finish();
}
bitmap_writer.Finish();
auto data =
ArrayData::Make(boolean(), n, {std::move(null_bitmap), std::move(data_bitmap)},
null_count, 0 /*offset*/
);
// return the right Array class
return MakeArray(data);
}
std::shared_ptr<Array> MakeStringArray(StringVector_ vec) {
R_xlen_t n = vec.size();
std::shared_ptr<Buffer> null_buffer(nullptr);
std::shared_ptr<Buffer> offset_buffer;
R_ERROR_NOT_OK(AllocateBuffer((n + 1) * sizeof(int32_t), &offset_buffer));
R_xlen_t i = 0;
int current_offset = 0;
int64_t null_count = 0;
auto p_offset = reinterpret_cast<int32_t*>(offset_buffer->mutable_data());
*p_offset = 0;
for (++p_offset; i < n; i++, ++p_offset) {
SEXP s = STRING_ELT(vec, i);
if (s == NA_STRING) {
// break as we are going to need a null_bitmap buffer
break;
}
*p_offset = current_offset += LENGTH(s);
}
if (i < n) {
R_ERROR_NOT_OK(AllocateBuffer(ceil((double)n / 8), &null_buffer));
internal::FirstTimeBitmapWriter null_bitmap_writer(null_buffer->mutable_data(), 0, n);
// catch up
for (R_xlen_t j = 0; j < i; j++, null_bitmap_writer.Next()) {
null_bitmap_writer.Set();
}
// resume offset filling
for (; i < n; i++, ++p_offset, null_bitmap_writer.Next()) {
SEXP s = STRING_ELT(vec, i);
if (s == NA_STRING) {
null_bitmap_writer.Clear();
*p_offset = current_offset;
null_count++;
} else {
null_bitmap_writer.Set();
*p_offset = current_offset += LENGTH(s);
}
}
null_bitmap_writer.Finish();
}
// ----- data buffer
std::shared_ptr<Buffer> value_buffer;
R_ERROR_NOT_OK(AllocateBuffer(current_offset, &value_buffer));
p_offset = reinterpret_cast<int32_t*>(offset_buffer->mutable_data());
auto p_data = reinterpret_cast<char*>(value_buffer->mutable_data());
for (R_xlen_t i = 0; i < n; i++) {
SEXP s = STRING_ELT(vec, i);
if (s != NA_STRING) {
auto ni = LENGTH(s);
std::copy_n(CHAR(s), ni, p_data);
p_data += ni;
}
}
auto data = ArrayData::Make(arrow::utf8(), n,
{null_buffer, offset_buffer, value_buffer}, null_count, 0);
return MakeArray(data);
}
} // namespace r
} // namespace arrow
// [[Rcpp::export]]
std::shared_ptr<arrow::Array> Array__from_vector(SEXP x) {
switch (TYPEOF(x)) {
case LGLSXP:
return arrow::r::MakeBooleanArray(x);
case INTSXP:
if (Rf_isFactor(x)) {
break;
}
return arrow::r::SimpleArray<INTSXP, arrow::Int32Type>(x);
case REALSXP:
// TODO: Dates, ...
return arrow::r::SimpleArray<REALSXP, arrow::DoubleType>(x);
case RAWSXP:
return arrow::r::SimpleArray<RAWSXP, arrow::Int8Type>(x);
case STRSXP:
return arrow::r::MakeStringArray(x);
default:
break;
}
stop("not handled");
return nullptr;
}
template <int RTYPE>
inline SEXP simple_Array_to_Vector(const std::shared_ptr<arrow::Array>& array) {
using stored_type = typename Rcpp::Vector<RTYPE>::stored_type;
auto start = reinterpret_cast<const stored_type*>(
array->data()->buffers[1]->data() + array->offset() * sizeof(stored_type));
size_t n = array->length();
Rcpp::Vector<RTYPE> vec(start, start + n);
if (array->null_count() && RTYPE != RAWSXP) {
// TODO: not sure what to do with RAWSXP since
// R raw vector do not have a concept of missing data
arrow::internal::BitmapReader bitmap_reader(array->null_bitmap()->data(),
array->offset(), n);
for (size_t i = 0; i < n; i++, bitmap_reader.Next()) {
if (bitmap_reader.IsNotSet()) {
vec[i] = Rcpp::Vector<RTYPE>::get_na();
}
}
}
return vec;
}
inline SEXP BooleanArray_to_Vector(const std::shared_ptr<arrow::Array>& array) {
size_t n = array->length();
LogicalVector vec(n);
// process the data
arrow::internal::BitmapReader data_reader(array->data()->buffers[1]->data(),
array->offset(), n);
for (size_t i = 0; i < n; i++, data_reader.Next()) {
vec[i] = data_reader.IsSet();
}
// then the null bitmap if needed
if (array->null_count()) {
arrow::internal::BitmapReader null_reader(array->null_bitmap()->data(),
array->offset(), n);
for (size_t i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsNotSet()) {
vec[i] = LogicalVector::get_na();
}
}
}
return vec;
}
inline SEXP StringArray_to_Vector(const std::shared_ptr<arrow::Array>& array) {
auto n = array->length();
Rcpp::CharacterVector res(n);
const auto& buffers = array->data()->buffers;
auto p_offset = reinterpret_cast<const int32_t*>(buffers[1]->data()) + array->offset();
auto p_data = reinterpret_cast<const char*>(buffers[2]->data()) + *p_offset;
if (array->null_count()) {
// need to watch for nulls
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), array->offset(),
n);
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsSet()) {
auto diff = p_offset[i + 1] - p_offset[i];
SET_STRING_ELT(res, i, Rf_mkCharLenCE(p_data, diff, CE_UTF8));
p_data += diff;
} else {
SET_STRING_ELT(res, i, NA_STRING);
}
}
} else {
// no need to check for nulls
// TODO: altrep mark this as no na
for (int i = 0; i < n; i++) {
auto diff = p_offset[i + 1] - p_offset[i];
SET_STRING_ELT(res, i, Rf_mkCharLenCE(p_data, diff, CE_UTF8));
p_data += diff;
}
}
return res;
}
// [[Rcpp::export]]
SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array) {
switch (array->type_id()) {
case Type::BOOL:
return BooleanArray_to_Vector(array);
case Type::INT8:
return simple_Array_to_Vector<RAWSXP>(array);
case Type::INT32:
return simple_Array_to_Vector<INTSXP>(array);
case Type::DOUBLE:
return simple_Array_to_Vector<REALSXP>(array);
case Type::STRING:
return StringArray_to_Vector(array);
default:
break;
}
stop(tfm::format("cannot handle Array of type %d", array->type_id()));
return R_NilValue;
}
// [[Rcpp::export]]
std::shared_ptr<arrow::Array> Array__Slice1(const std::shared_ptr<arrow::Array>& array,
int offset) {
return array->Slice(offset);
}
// [[Rcpp::export]]
std::shared_ptr<arrow::Array> Array__Slice2(const std::shared_ptr<arrow::Array>& array,
int offset, int length) {
return array->Slice(offset, length);
}
// [[Rcpp::export]]
bool Array__IsNull(const std::shared_ptr<arrow::Array>& x, int i) { return x->IsNull(i); }
// [[Rcpp::export]]
bool Array__IsValid(const std::shared_ptr<arrow::Array>& x, int i) {
return x->IsValid(i);
}
// [[Rcpp::export]]
int Array__length(const std::shared_ptr<arrow::Array>& x) { return x->length(); }
// [[Rcpp::export]]
int Array__offset(const std::shared_ptr<arrow::Array>& x) { return x->offset(); }
// [[Rcpp::export]]
int Array__null_count(const std::shared_ptr<arrow::Array>& x) { return x->null_count(); }
// [[Rcpp::export]]
std::shared_ptr<arrow::DataType> Array__type(const std::shared_ptr<arrow::Array>& x) {
return x->type();
}
// [[Rcpp::export]]
std::string Array__ToString(const std::shared_ptr<arrow::Array>& x) {
return x->ToString();
}
// [[Rcpp::export]]
arrow::Type::type Array__type_id(const std::shared_ptr<arrow::Array>& x) {
return x->type_id();
}
// [[Rcpp::export]]
bool Array__Equals(const std::shared_ptr<arrow::Array>& lhs,
const std::shared_ptr<arrow::Array>& rhs) {
return lhs->Equals(rhs);
}
// [[Rcpp::export]]
bool Array__ApproxEquals(const std::shared_ptr<arrow::Array>& lhs,
const std::shared_ptr<arrow::Array>& rhs) {
return lhs->ApproxEquals(rhs);
}
// [[Rcpp::export]]
std::shared_ptr<arrow::ArrayData> Array__data(
const std::shared_ptr<arrow::Array>& array) {
return array->data();
}
// [[Rcpp::export]]
bool Array__RangeEquals(const std::shared_ptr<arrow::Array>& self,
const std::shared_ptr<arrow::Array>& other, int start_idx,
int end_idx, int other_start_idx) {
return self->RangeEquals(*other, start_idx, end_idx, other_start_idx);
}
// [[Rcpp::export]]
LogicalVector Array__Mask(const std::shared_ptr<arrow::Array>& array) {
if (array->null_count() == 0) {
return LogicalVector(array->length(), true);
}
auto n = array->length();
LogicalVector res(no_init(n));
arrow::internal::BitmapReader bitmap_reader(array->null_bitmap()->data(),
array->offset(), n);
for (size_t i = 0; i < array->length(); i++, bitmap_reader.Next()) {
res[i] = bitmap_reader.IsSet();
}
return res;
}