blob: 616be0467f9402926eb2927d84966fc6e93103b0 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <memory>
#include <string>
#include "./arrow_types.h"
#include "./arrow_vctrs.h"
#include <arrow/array/array_base.h>
#include <arrow/chunked_array.h>
namespace arrow {
namespace r {
// Picks the signed integer type used for dictionary indices from the number
// of factor levels: int8 when the count is below INT8_MAX, int16 when it is
// below INT16_MAX, and int32 for anything larger.
static inline std::shared_ptr<arrow::DataType> IndexTypeForFactors(int n_factors) {
  if (n_factors >= INT16_MAX) {
    return arrow::int32();
  }
  if (n_factors >= INT8_MAX) {
    return arrow::int16();
  }
  return arrow::int8();
}
// Builds the dictionary type for an R factor: the index type is chosen from
// the length of the "levels" attribute, the value type is utf8, and the
// dictionary is flagged as ordered when the object inherits from "ordered".
std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP factor) {
  const bool ordered = Rf_inherits(factor, "ordered");
  SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
  return dictionary(IndexTypeForFactors(Rf_length(levels)), arrow::utf8(), ordered);
}
// Primary template: reached only for SEXP type codes that have no explicit
// specialization. Always raises an R error that names the offending type code.
template <int VectorType>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector(SEXP x) {
  // Assemble the full message up front. Passing VectorType as an extra
  // argument to a format string without a placeholder would silently drop it.
  // The resulting text contains no format specifiers, so it is safe to hand
  // to cpp11::stop() as the format string itself.
  const std::string message = "Unknown vector type: " + std::to_string(VectorType);
  cpp11::stop(message);
}
// Environments are only accepted when they inherit from "Array"; in that
// case the type is read from the arrow::Array extracted from the object.
// Any other environment raises an R error.
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<ENVSXP>(SEXP x) {
  if (!Rf_inherits(x, "Array")) {
    cpp11::stop("Unrecognized vector instance for type ENVSXP");
  }
  const auto array = cpp11::as_cpp<std::shared_ptr<arrow::Array>>(x);
  return array->type();
}
// Logical vectors map to boolean, except objects inheriting from
// "vctrs_unspecified", which map to the null type.
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<LGLSXP>(SEXP x) {
  if (Rf_inherits(x, "vctrs_unspecified")) {
    return null();
  }
  return boolean();
}
// Integer vectors: factors become dictionaries, "Date" becomes date32,
// "POSIXct" becomes a microsecond timestamp, and everything else is int32.
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<INTSXP>(SEXP x) {
  if (Rf_isFactor(x)) {
    return InferArrowTypeFromFactor(x);
  }
  if (Rf_inherits(x, "Date")) {
    return date32();
  }
  if (!Rf_inherits(x, "POSIXct")) {
    return int32();
  }

  // POSIXct: prefer the time zone recorded on the object itself.
  SEXP tz_attr = Rf_getAttrib(x, symbols::tzone);
  if (!Rf_isNull(tz_attr)) {
    return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(tz_attr, 0)));
  }

  // No "tzone" attribute: fall back to whatever base::Sys.timezone() reports.
  // The call result is held in a cpp11::sexp while its string is read.
  auto sys_timezone = cpp11::package("base")["Sys.timezone"];
  const cpp11::sexp local_tz = sys_timezone();
  return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(local_tz, 0)));
}
// Double vectors: the class of the object decides the Arrow type. Classes are
// tested in a fixed order (Date, POSIXct, integer64, hms, difftime); a plain
// double vector maps to float64.
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<REALSXP>(SEXP x) {
  if (Rf_inherits(x, "Date")) {
    return date32();
  } else if (Rf_inherits(x, "POSIXct")) {
    // Use the object's own "tzone" attribute when present, otherwise the
    // value returned by base::Sys.timezone().
    SEXP tz_attr = Rf_getAttrib(x, symbols::tzone);
    if (!Rf_isNull(tz_attr)) {
      return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(tz_attr, 0)));
    }
    auto sys_timezone = cpp11::package("base")["Sys.timezone"];
    const cpp11::sexp local_tz = sys_timezone();
    return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(local_tz, 0)));
  } else if (Rf_inherits(x, "integer64")) {
    return int64();
  } else if (Rf_inherits(x, "hms")) {
    return time32(TimeUnit::SECOND);
  } else if (Rf_inherits(x, "difftime")) {
    return duration(TimeUnit::SECOND);
  }
  return float64();
}
// Character vectors map to utf8 unless the running total of element sizes
// goes above arrow::kBinaryMemoryLimit, in which case large_utf8 is used.
// The scan runs inside cpp11::unwind_protect and stops as soon as the limit
// is crossed.
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<STRSXP>(SEXP x) {
  return cpp11::unwind_protect([&] {
    const R_xlen_t length = XLENGTH(x);
    int64_t total_bytes = 0;
    R_xlen_t pos = 0;
    while (pos < length) {
      total_bytes += arrow::r::unsafe::r_string_size(STRING_ELT(x, pos));
      if (total_bytes > arrow::kBinaryMemoryLimit) {
        // Too much data for the utf8 type (2GB capacity), so switch to the
        // large variant.
        return large_utf8();
      }
      ++pos;
    }
    return utf8();
  });
}
// Builds a struct type from a named list: one field per element, named after
// the matching entry of the "names" attribute and typed by recursively
// calling InferArrowType() on the element.
static inline std::shared_ptr<arrow::DataType> InferArrowTypeFromDataFrame(
    cpp11::list x) {
  const R_xlen_t n_columns = x.size();
  cpp11::strings column_names(x.attr(R_NamesSymbol));

  std::vector<std::shared_ptr<arrow::Field>> fields;
  fields.reserve(n_columns);
  for (R_xlen_t col = 0; col < n_columns; ++col) {
    fields.push_back(arrow::field(column_names[col], InferArrowType(x[col])));
  }
  return arrow::struct_(std::move(fields));
}
// Generic (list) vectors:
//  - "data.frame" and "POSIXlt" objects become struct types;
//  - "arrow_fixed_size_binary", "arrow_binary" and "arrow_large_binary"
//    objects become the corresponding binary types;
//  - anything else becomes a list type whose value type is inferred from the
//    "ptype" attribute or, when that is absent, from the first element.
template <>
std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<VECSXP>(SEXP x) {
  const bool struct_like = Rf_inherits(x, "data.frame") || Rf_inherits(x, "POSIXlt");
  if (struct_like) {
    return InferArrowTypeFromDataFrame(x);
  }

  // Known special-cased list classes.
  if (Rf_inherits(x, "arrow_fixed_size_binary")) {
    SEXP width = Rf_getAttrib(x, symbols::byte_width);
    // The width must be present and be a length-one integer vector.
    const bool well_formed =
        !Rf_isNull(width) && TYPEOF(width) == INTSXP && XLENGTH(width) == 1;
    if (!well_formed) {
      cpp11::stop("malformed arrow_fixed_size_binary object");
    }
    return arrow::fixed_size_binary(INTEGER(width)[0]);
  }
  if (Rf_inherits(x, "arrow_binary")) {
    return arrow::binary();
  }
  if (Rf_inherits(x, "arrow_large_binary")) {
    return arrow::large_binary();
  }

  // Plain list: find a prototype object to infer the value type from.
  SEXP element_proto = Rf_getAttrib(x, symbols::ptype);
  if (Rf_isNull(element_proto)) {
    if (XLENGTH(x) == 0) {
      cpp11::stop(
          "Requires at least one element to infer the values' type of a list vector");
    }
    element_proto = VECTOR_ELT(x, 0);
  }
  return arrow::list(InferArrowType(element_proto));
}
// Infers the Arrow data type for an arbitrary R object.
//
// Objects recognised as Arrow ALTREP vectors report the type of the array
// they wrap. Objects for which can_convert_native() is true, and all
// data.frames, are dispatched on their SEXP type to the C++ helpers in this
// file. Everything else is delegated to the R-level infer_type() function of
// the "arrow" package, whose result must inherit from "DataType".
std::shared_ptr<arrow::DataType> InferArrowType(SEXP x) {
  if (arrow::r::altrep::is_arrow_altrep(x)) {
    return arrow::r::altrep::vec_to_arrow_altrep_bypass(x)->type();
  }

  // data.frames always take the C++ route: inference recurses per column, so
  // columns that cannot be converted natively still reach the R fallback.
  const bool handled_natively = can_convert_native(x) || Rf_inherits(x, "data.frame");
  if (!handled_natively) {
    auto infer_type = cpp11::package("arrow")["infer_type"];
    cpp11::sexp type_result =
        infer_type(x, cpp11::named_arg("from_array_infer_type") = true);
    if (!Rf_inherits(type_result, "DataType")) {
      cpp11::stop("type() did not return an object of type DataType");
    }
    return cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(type_result);
  }

  switch (TYPEOF(x)) {
    case ENVSXP:
      return InferArrowTypeFromVector<ENVSXP>(x);
    case LGLSXP:
      return InferArrowTypeFromVector<LGLSXP>(x);
    case INTSXP:
      return InferArrowTypeFromVector<INTSXP>(x);
    case REALSXP:
      return InferArrowTypeFromVector<REALSXP>(x);
    case RAWSXP:
      return uint8();
    case STRSXP:
      return InferArrowTypeFromVector<STRSXP>(x);
    case VECSXP:
      return InferArrowTypeFromVector<VECSXP>(x);
    default:
      cpp11::stop("Cannot infer type from vector");
  }
}
} // namespace r
} // namespace arrow
// [[arrow::export]]
std::shared_ptr<arrow::DataType> Array__infer_type(SEXP x) {
  // Exported entry point: forwards to arrow::r::InferArrowType() and returns
  // its result unchanged.
  auto inferred_type = arrow::r::InferArrowType(x);
  return inferred_type;
}