// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/tensor/converter_internal.h"
#include <algorithm>
#include <cstdint>
#include <memory>
#include <numeric>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/visitor_inline.h"
namespace arrow {

class MemoryPool;

namespace internal {
namespace {
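
// Advance a multi-dimensional coordinate to the next position in row-major
// order, carrying into the next-higher dimension whenever a dimension
// reaches its extent.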
template <typename c_index_type>
inline void IncrementRowMajorIndex(std::vector<c_index_type>& coord,
const std::vector<int64_t>& shape) {
const int64_t ndim = shape.size();
++coord[ndim - 1];
if (coord[ndim - 1] == shape[ndim - 1]) {
int64_t d = ndim - 1;
while (d > 0 && coord[d] == shape[d]) {
coord[d] = 0;
++coord[d - 1];
--d;
}
}
}
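
// Scan a contiguous row-major tensor once, appending the coordinates of each
// nonzero element (ndim entries) to `indices` and its value to `values`.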
template <typename c_index_type, typename c_value_type>
void ConvertRowMajorTensor(const Tensor& tensor, c_index_type* indices,
c_value_type* values, const int64_t size) {
const auto ndim = tensor.ndim();
const auto& shape = tensor.shape();
const c_value_type* tensor_data =
reinterpret_cast<const c_value_type*>(tensor.raw_data());
constexpr c_value_type zero = 0;
std::vector<c_index_type> coord(ndim, 0);
for (int64_t n = tensor.size(); n > 0; --n) {
const c_value_type x = *tensor_data;
if (ARROW_PREDICT_FALSE(x != zero)) {
std::copy(coord.begin(), coord.end(), indices);
*values++ = x;
indices += ndim;
}
IncrementRowMajorIndex(coord, shape);
++tensor_data;
}
}
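
// Convert a contiguous column-major (Fortran-order) tensor: collect the
// nonzero entries with a single linear scan of the buffer, then sort them
// into canonical row-major index order.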
template <typename c_index_type, typename c_value_type>
void ConvertColumnMajorTensor(const Tensor& tensor, c_index_type* out_indices,
                              c_value_type* out_values, const int64_t size) {
  const auto ndim = tensor.ndim();
  std::vector<c_index_type> indices(ndim * size);
  std::vector<c_value_type> values(size);

  // A column-major buffer read linearly is laid out like a row-major tensor of
  // the reversed shape, so generate coordinates against the reversed shape and
  // store each tuple reversed to obtain coordinates of the original shape.
  const std::vector<int64_t> reversed_shape(tensor.shape().rbegin(),
                                            tensor.shape().rend());
  const auto* tensor_data = reinterpret_cast<const c_value_type*>(tensor.raw_data());
  constexpr c_value_type zero = 0;
  std::vector<c_index_type> coord(ndim, 0);
  int64_t nnz = 0;
  for (int64_t n = tensor.size(); n > 0; --n) {
    const c_value_type x = *tensor_data++;
    if (ARROW_PREDICT_FALSE(x != zero)) {
      std::copy(coord.rbegin(), coord.rend(), &indices[nnz * ndim]);
      values[nnz] = x;
      ++nnz;
    }
    IncrementRowMajorIndex(coord, reversed_shape);
  }

  // sort the nonzero entries into lexicographic (row-major) index order
  std::vector<int64_t> order(size);
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](const int64_t xi, const int64_t yi) {
    const int64_t x_offset = xi * ndim;
    const int64_t y_offset = yi * ndim;
    for (int j = 0; j < ndim; ++j) {
      const auto x = indices[x_offset + j];
      const auto y = indices[y_offset + j];
      if (x < y) return true;
      if (x > y) return false;
    }
    return false;
  });

  // transfer the result in sorted order
  for (int64_t i = 0; i < size; ++i) {
    const int64_t xi = order[i];
    out_values[i] = values[xi];
    std::copy_n(&indices[xi * ndim], ndim, out_indices);
    out_indices += ndim;
  }
}
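
// Convert a non-contiguous (strided) tensor by visiting every logical
// coordinate and reading each value through Tensor::Value, which applies the
// strides.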
template <typename c_index_type, typename c_value_type>
void ConvertStridedTensor(const Tensor& tensor, c_index_type* indices,
c_value_type* values, const int64_t size) {
using ValueType = typename CTypeTraits<c_value_type>::ArrowType;
const auto& shape = tensor.shape();
const auto ndim = tensor.ndim();
std::vector<int64_t> coord(ndim, 0);
constexpr c_value_type zero = 0;
c_value_type x;
int64_t i;
for (int64_t n = tensor.size(); n > 0; --n) {
x = tensor.Value<ValueType>(coord);
if (ARROW_PREDICT_FALSE(x != zero)) {
*values++ = x;
for (i = 0; i < ndim; ++i) {
*indices++ = static_cast<c_index_type>(coord[i]);
}
}
IncrementRowMajorIndex(coord, shape);
}
}
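
// The macros below are expanded through DISPATCH to call the conversion
// routine instantiated for the index and value element widths selected at
// runtime.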
#define CONVERT_TENSOR(func, index_type, value_type, indices, values, size) \
func<index_type, value_type>(tensor_, reinterpret_cast<index_type*>(indices), \
reinterpret_cast<value_type*>(values), size)
// Using ARROW_EXPAND is necessary to expand __VA_ARGS__ correctly on VC++.
#define CONVERT_ROW_MAJOR_TENSOR(index_type, value_type, ...) \
ARROW_EXPAND(CONVERT_TENSOR(ConvertRowMajorTensor, index_type, value_type, __VA_ARGS__))
#define CONVERT_COLUMN_MAJOR_TENSOR(index_type, value_type, ...) \
ARROW_EXPAND( \
CONVERT_TENSOR(ConvertColumnMajorTensor, index_type, value_type, __VA_ARGS__))
#define CONVERT_STRIDED_TENSOR(index_type, value_type, ...) \
ARROW_EXPAND(CONVERT_TENSOR(ConvertStridedTensor, index_type, value_type, __VA_ARGS__))
// ----------------------------------------------------------------------
// SparseTensorConverter for SparseCOOIndex
class SparseCOOTensorConverter : private SparseTensorConverterMixin {
using SparseTensorConverterMixin::AssignIndex;
using SparseTensorConverterMixin::IsNonZero;
public:
SparseCOOTensorConverter(const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool)
: tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
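
  // Count the nonzero elements, allocate exactly sized index and value
  // buffers, then fill them with the conversion routine matching the tensor's
  // memory layout.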
Status Convert() {
RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
tensor_.shape()));
const int index_elsize = GetByteWidth(*index_value_type_);
const int value_elsize = GetByteWidth(*tensor_.type());
const int64_t ndim = tensor_.ndim();
ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
ARROW_ASSIGN_OR_RAISE(auto indices_buffer,
AllocateBuffer(index_elsize * ndim * nonzero_count, pool_));
uint8_t* indices = indices_buffer->mutable_data();
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * nonzero_count, pool_));
uint8_t* values = values_buffer->mutable_data();
const uint8_t* tensor_data = tensor_.raw_data();
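    // 0-D and 1-D tensors are handled directly: the element's position is its
    // only coordinate, so no layout-specific conversion is needed.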
if (ndim <= 1) {
const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
for (int64_t i = 0; i < count; ++i) {
if (std::any_of(tensor_data, tensor_data + value_elsize, IsNonZero)) {
AssignIndex(indices, i, index_elsize);
std::copy_n(tensor_data, value_elsize, values);
indices += index_elsize;
values += value_elsize;
}
tensor_data += value_elsize;
}
} else if (tensor_.is_row_major()) {
DISPATCH(CONVERT_ROW_MAJOR_TENSOR, index_elsize, value_elsize, indices, values,
nonzero_count);
} else if (tensor_.is_column_major()) {
DISPATCH(CONVERT_COLUMN_MAJOR_TENSOR, index_elsize, value_elsize, indices, values,
nonzero_count);
} else {
DISPATCH(CONVERT_STRIDED_TENSOR, index_elsize, value_elsize, indices, values,
nonzero_count);
}
    // Make the results: wrap the index buffer in a (nonzero_count x ndim)
    // row-major tensor of the requested index type and build the
    // SparseCOOIndex from it.
const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
std::vector<int64_t> indices_strides;
RETURN_NOT_OK(internal::ComputeRowMajorStrides(
checked_cast<const FixedWidthType&>(*index_value_type_), indices_shape,
&indices_strides));
auto coords = std::make_shared<Tensor>(index_value_type_, std::move(indices_buffer),
indices_shape, indices_strides);
ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true));
data = std::move(values_buffer);
return Status::OK();
}
std::shared_ptr<SparseCOOIndex> sparse_index;
std::shared_ptr<Buffer> data;
private:
const Tensor& tensor_;
const std::shared_ptr<DataType>& index_value_type_;
MemoryPool* pool_;
};
} // namespace
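
// Store an index value into a slot of the given byte width; unsupported
// widths are silently ignored.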
void SparseTensorConverterMixin::AssignIndex(uint8_t* indices, int64_t val,
const int elsize) {
switch (elsize) {
case 1:
*indices = static_cast<uint8_t>(val);
break;
case 2:
*reinterpret_cast<uint16_t*>(indices) = static_cast<uint16_t>(val);
break;
case 4:
*reinterpret_cast<uint32_t*>(indices) = static_cast<uint32_t>(val);
break;
case 8:
*reinterpret_cast<int64_t*>(indices) = val;
break;
default:
break;
}
}
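
// Read an index value of the given byte width back as an int64_t; unsupported
// widths yield 0.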
int64_t SparseTensorConverterMixin::GetIndexValue(const uint8_t* value_ptr,
const int elsize) {
switch (elsize) {
case 1:
return *value_ptr;
case 2:
return *reinterpret_cast<const uint16_t*>(value_ptr);
case 4:
return *reinterpret_cast<const uint32_t*>(value_ptr);
case 8:
return *reinterpret_cast<const int64_t*>(value_ptr);
default:
return 0;
}
}
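
// Convert a dense Tensor into a SparseCOOIndex plus a buffer holding its
// nonzero values.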
Status MakeSparseCOOTensorFromTensor(const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool,
std::shared_ptr<SparseIndex>* out_sparse_index,
std::shared_ptr<Buffer>* out_data) {
SparseCOOTensorConverter converter(tensor, index_value_type, pool);
RETURN_NOT_OK(converter.Convert());
*out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
*out_data = converter.data;
return Status::OK();
}
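
// Reconstruct a dense row-major Tensor from a SparseCOOTensor by zero-filling
// a buffer and scattering each nonzero value to the offset computed from its
// coordinates and the row-major strides.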
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCOOTensor(
MemoryPool* pool, const SparseCOOTensor* sparse_tensor) {
const auto& sparse_index =
checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
const auto& coords = sparse_index.indices();
const auto* coords_data = coords->raw_data();
const int index_elsize = GetByteWidth(*coords->type());
const auto& value_type = checked_cast<const FixedWidthType&>(*sparse_tensor->type());
const int value_elsize = GetByteWidth(value_type);
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * sparse_tensor->size(), pool));
auto values = values_buffer->mutable_data();
std::fill_n(values, value_elsize * sparse_tensor->size(), 0);
std::vector<int64_t> strides;
RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides));
const auto* raw_data = sparse_tensor->raw_data();
const int ndim = sparse_tensor->ndim();
for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) {
int64_t offset = 0;
for (int j = 0; j < ndim; ++j) {
auto index = static_cast<int64_t>(
SparseTensorConverterMixin::GetIndexValue(coords_data, index_elsize));
offset += index * strides[j];
coords_data += index_elsize;
}
std::copy_n(raw_data, value_elsize, values + offset);
raw_data += value_elsize;
}
return std::make_shared<Tensor>(sparse_tensor->type(), std::move(values_buffer),
sparse_tensor->shape(), strides,
sparse_tensor->dim_names());
}
} // namespace internal
} // namespace arrow