// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/tensor/converter.h"
#include <algorithm>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/sort.h"
#include "arrow/visitor_inline.h"
namespace arrow {
class MemoryPool;
namespace internal {
namespace {
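// Advance `coord` to the next element of the tensor, odometer-style, visiting
// axes in the order given by `axis_order` (the last entry of `axis_order` is
// the fastest-varying axis).  When an axis overflows its extent in `shape`, it
// is reset to zero and the carry propagates to the next outer axis.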
inline void IncrementIndex(std::vector<int64_t>& coord, const std::vector<int64_t>& shape,
const std::vector<int64_t>& axis_order) {
const int64_t ndim = shape.size();
const int64_t last_axis = axis_order[ndim - 1];
++coord[last_axis];
if (coord[last_axis] == shape[last_axis]) {
int64_t d = ndim - 1;
while (d > 0 && coord[axis_order[d]] == shape[axis_order[d]]) {
coord[axis_order[d]] = 0;
++coord[axis_order[d - 1]];
--d;
}
}
}
// ----------------------------------------------------------------------
// SparseTensorConverter for SparseCSFIndex
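//
// The Compressed Sparse Fiber (CSF) index stores the non-zero coordinates as a
// prefix tree with one level per dimension (ordered by axis_order).  Each level
// keeps an indices buffer holding the coordinate values of its nodes, and every
// non-leaf level keeps an indptr buffer delimiting the range of children each
// node owns in the next level.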
class SparseCSFTensorConverter : private SparseTensorConverterMixin {
using SparseTensorConverterMixin::AssignIndex;
using SparseTensorConverterMixin::IsNonZero;
public:
SparseCSFTensorConverter(const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool)
: tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
Status Convert() {
RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
tensor_.shape()));
const int index_elsize = GetByteWidth(*index_value_type_);
const int value_elsize = GetByteWidth(*tensor_.type());
const int64_t ndim = tensor_.ndim();
    // Sorting the axes by ascending dimension size is a good heuristic for the
    // CSF axis order, but it is not necessarily optimal.
std::vector<int64_t> axis_order = internal::ArgSort(tensor_.shape());
ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * nonzero_count, pool_));
auto* values = values_buffer->mutable_data();
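    // Per-level state for the traversal:
    //   counts[i]        number of nodes emitted so far at level i
    //   coord            coordinate of the element currently being scanned
    //   previous_coord   coordinate of the previously recorded non-zero element
    //   indptr/indices   one builder per level (indptr only for non-leaf levels)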
std::vector<int64_t> counts(ndim, 0);
std::vector<int64_t> coord(ndim, 0);
std::vector<int64_t> previous_coord(ndim, -1);
std::vector<BufferBuilder> indptr_buffer_builders(ndim - 1);
std::vector<BufferBuilder> indices_buffer_builders(ndim);
const auto* tensor_data = tensor_.raw_data();
uint8_t index_buffer[sizeof(int64_t)];
if (ndim <= 1) {
return Status::NotImplemented("TODO for ndim <= 1");
} else {
const auto& shape = tensor_.shape();
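      // Scan every element of the dense tensor in axis_order-major order.  For
      // each non-zero element, copy its value and append index entries to every
      // level at or below the outermost level whose coordinate changed since
      // the previous non-zero element.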
for (int64_t n = tensor_.size(); n > 0; n--) {
const auto offset = tensor_.CalculateValueOffset(coord);
const auto xp = tensor_data + offset;
if (std::any_of(xp, xp + value_elsize, IsNonZero)) {
bool tree_split = false;
std::copy_n(xp, value_elsize, values);
values += value_elsize;
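          // Walk the levels from outermost to innermost.  The first level whose
          // coordinate differs from the previous non-zero element starts a new
          // branch of the tree: that level and every deeper level get a new
          // node, and each new non-leaf node records where its children begin
          // in the next level's indices.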
for (int64_t i = 0; i < ndim; ++i) {
int64_t dimension = axis_order[i];
tree_split = tree_split || (coord[dimension] != previous_coord[dimension]);
if (tree_split) {
if (i < ndim - 1) {
AssignIndex(index_buffer, counts[i + 1], index_elsize);
RETURN_NOT_OK(
indptr_buffer_builders[i].Append(index_buffer, index_elsize));
}
AssignIndex(index_buffer, coord[dimension], index_elsize);
RETURN_NOT_OK(
indices_buffer_builders[i].Append(index_buffer, index_elsize));
++counts[i];
}
}
previous_coord = coord;
}
IncrementIndex(coord, shape, axis_order);
}
}
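    // Close the last child range of every non-leaf level by appending a final
    // indptr entry equal to the total number of nodes in the next level.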
for (int64_t column = 0; column < ndim - 1; ++column) {
AssignIndex(index_buffer, counts[column + 1], index_elsize);
RETURN_NOT_OK(indptr_buffer_builders[column].Append(index_buffer, index_elsize));
}
    // Finalize the buffers and assemble the sparse index and the value data.
data = std::move(values_buffer);
std::vector<std::shared_ptr<Buffer>> indptr_buffers(ndim - 1);
std::vector<std::shared_ptr<Buffer>> indices_buffers(ndim);
std::vector<int64_t> indptr_shapes(counts.begin(), counts.end() - 1);
std::vector<int64_t> indices_shapes = counts;
for (int64_t column = 0; column < ndim; ++column) {
RETURN_NOT_OK(
indices_buffer_builders[column].Finish(&indices_buffers[column], true));
}
for (int64_t column = 0; column < ndim - 1; ++column) {
RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true));
}
ARROW_ASSIGN_OR_RAISE(
sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order,
indptr_buffers, indices_buffers));
return Status::OK();
}
std::shared_ptr<SparseCSFIndex> sparse_index;
std::shared_ptr<Buffer> data;
private:
const Tensor& tensor_;
const std::shared_ptr<DataType>& index_value_type_;
MemoryPool* pool_;
};
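// Rebuilds a dense, row-major Tensor from a SparseCSFTensor by recursively
// expanding the CSF tree and scattering the stored values into a
// zero-initialized buffer.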
class TensorBuilderFromSparseCSFTensor : private SparseTensorConverterMixin {
using SparseTensorConverterMixin::GetIndexValue;
MemoryPool* pool_;
const SparseCSFTensor* sparse_tensor_;
const SparseCSFIndex* sparse_index_;
const std::vector<std::shared_ptr<Tensor>>& indptr_;
const std::vector<std::shared_ptr<Tensor>>& indices_;
const std::vector<int64_t>& axis_order_;
const std::vector<int64_t>& shape_;
const int64_t non_zero_length_;
const int ndim_;
const int64_t tensor_size_;
const FixedWidthType& value_type_;
const int value_elsize_;
const uint8_t* raw_data_;
std::vector<int64_t> strides_;
std::shared_ptr<Buffer> values_buffer_;
uint8_t* values_;
public:
TensorBuilderFromSparseCSFTensor(const SparseCSFTensor* sparse_tensor, MemoryPool* pool)
: pool_(pool),
sparse_tensor_(sparse_tensor),
sparse_index_(
checked_cast<const SparseCSFIndex*>(sparse_tensor->sparse_index().get())),
indptr_(sparse_index_->indptr()),
indices_(sparse_index_->indices()),
axis_order_(sparse_index_->axis_order()),
shape_(sparse_tensor->shape()),
non_zero_length_(sparse_tensor->non_zero_length()),
ndim_(sparse_tensor->ndim()),
tensor_size_(sparse_tensor->size()),
value_type_(checked_cast<const FixedWidthType&>(*sparse_tensor->type())),
value_elsize_(GetByteWidth(value_type_)),
raw_data_(sparse_tensor->raw_data()) {}
int ElementSize(const std::shared_ptr<Tensor>& tensor) const {
return GetByteWidth(*tensor->type());
}
Result<std::shared_ptr<Tensor>> Build() {
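    // Allocate a zero-filled row-major buffer, then scatter the non-zero
    // values into it starting from the root level of the CSF tree.  The root
    // level has indptr_[0]->size() - 1 nodes.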
RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_));
ARROW_ASSIGN_OR_RAISE(values_buffer_,
AllocateBuffer(value_elsize_ * tensor_size_, pool_));
values_ = values_buffer_->mutable_data();
std::fill_n(values_, value_elsize_ * tensor_size_, 0);
const int64_t start = 0;
const int64_t stop = indptr_[0]->size() - 1;
ExpandValues(0, 0, start, stop);
return std::make_shared<Tensor>(sparse_tensor_->type(), std::move(values_buffer_),
shape_, strides_, sparse_tensor_->dim_names());
}
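  // Recursively expands the nodes [start, stop) of level `dim`.  At the leaf
  // level the values are copied directly to their dense offsets; at inner
  // levels each node's child range is read from the level's indptr and
  // expanded one level deeper, accumulating the dense offset along the way.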
void ExpandValues(const int64_t dim, const int64_t dim_offset, const int64_t start,
const int64_t stop) {
const auto& cur_indices = indices_[dim];
const int indices_elsize = ElementSize(cur_indices);
const auto* indices_data = cur_indices->raw_data() + start * indices_elsize;
if (dim == ndim_ - 1) {
for (auto i = start; i < stop; ++i) {
const int64_t index =
SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize);
const int64_t offset = dim_offset + index * strides_[axis_order_[dim]];
std::copy_n(raw_data_ + i * value_elsize_, value_elsize_, values_ + offset);
indices_data += indices_elsize;
}
} else {
const auto& cur_indptr = indptr_[dim];
const int indptr_elsize = ElementSize(cur_indptr);
const auto* indptr_data = cur_indptr->raw_data() + start * indptr_elsize;
for (int64_t i = start; i < stop; ++i) {
const int64_t index =
SparseTensorConverterMixin::GetIndexValue(indices_data, indices_elsize);
const int64_t offset = dim_offset + index * strides_[axis_order_[dim]];
const int64_t next_start = GetIndexValue(indptr_data, indptr_elsize);
const int64_t next_stop =
GetIndexValue(indptr_data + indptr_elsize, indptr_elsize);
ExpandValues(dim + 1, offset, next_start, next_stop);
indices_data += indices_elsize;
indptr_data += indptr_elsize;
}
}
}
};
} // namespace
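// Converts a dense tensor into the CSF sparse index and the corresponding
// buffer of non-zero values used by SparseCSFTensor.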
Status MakeSparseCSFTensorFromTensor(const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool,
std::shared_ptr<SparseIndex>* out_sparse_index,
std::shared_ptr<Buffer>* out_data) {
SparseCSFTensorConverter converter(tensor, index_value_type, pool);
RETURN_NOT_OK(converter.Convert());
*out_sparse_index = checked_pointer_cast<SparseIndex>(converter.sparse_index);
*out_data = converter.data;
return Status::OK();
}
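// Converts a SparseCSFTensor back into an equivalent dense Tensor.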
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
MemoryPool* pool, const SparseCSFTensor* sparse_tensor) {
TensorBuilderFromSparseCSFTensor builder(sparse_tensor, pool);
return builder.Build();
}
} // namespace internal
} // namespace arrow