// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/tensor/converter.h"
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/visitor_inline.h"
namespace arrow {

class MemoryPool;

namespace internal {
namespace {

// ----------------------------------------------------------------------
// SparseTensorConverter for SparseCSRIndex and SparseCSCIndex (CSX matrices)
class SparseCSXMatrixConverter : private SparseTensorConverterMixin {
using SparseTensorConverterMixin::AssignIndex;
using SparseTensorConverterMixin::IsNonZero;
public:
SparseCSXMatrixConverter(SparseMatrixCompressedAxis axis, const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool)
: axis_(axis), tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
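
// Builds indptr, indices, and the packed non-zero values for the requested
// compressed axis.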
Status Convert() {
RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
tensor_.shape()));
const int index_elsize = GetByteWidth(*index_value_type_);
const int value_elsize = GetByteWidth(*tensor_.type());
const int64_t ndim = tensor_.ndim();
if (ndim > 2) {
return Status::Invalid("Invalid tensor dimension");
}
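// The compressed (major) axis is the row axis for CSR and the column axis
// for CSC; the remaining axis is the minor axis.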
const int major_axis = static_cast<int>(axis_);
const int64_t n_major = tensor_.shape()[major_axis];
const int64_t n_minor = tensor_.shape()[1 - major_axis];
ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
std::shared_ptr<Buffer> indptr_buffer;
std::shared_ptr<Buffer> indices_buffer;
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * nonzero_count, pool_));
auto* values = values_buffer->mutable_data();
const auto* tensor_data = tensor_.raw_data();
if (ndim <= 1) {
return Status::NotImplemented("TODO for ndim <= 1");
} else {
ARROW_ASSIGN_OR_RAISE(indptr_buffer,
AllocateBuffer(index_elsize * (n_major + 1), pool_));
auto* indptr = indptr_buffer->mutable_data();
ARROW_ASSIGN_OR_RAISE(indices_buffer,
AllocateBuffer(index_elsize * nonzero_count, pool_));
auto* indices = indices_buffer->mutable_data();
std::vector<int64_t> coords(2);
int64_t k = 0;
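// The first indptr entry is always zero.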
std::fill_n(indptr, index_elsize, 0);
indptr += index_elsize;
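// Walk the matrix one major-axis slice at a time. Each non-zero element
// contributes its value to `values` and its minor-axis index to `indices`;
// after every slice the running non-zero count k is appended to indptr.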
for (int64_t i = 0; i < n_major; ++i) {
for (int64_t j = 0; j < n_minor; ++j) {
if (axis_ == SparseMatrixCompressedAxis::ROW) {
coords = {i, j};
} else {
coords = {j, i};
}
const int64_t offset = tensor_.CalculateValueOffset(coords);
if (std::any_of(tensor_data + offset, tensor_data + offset + value_elsize,
IsNonZero)) {
std::copy_n(tensor_data + offset, value_elsize, values);
values += value_elsize;
AssignIndex(indices, j, index_elsize);
indices += index_elsize;
k++;
}
}
AssignIndex(indptr, k, index_elsize);
indptr += index_elsize;
}
}
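// Wrap the indptr and indices buffers as 1-D index tensors and build the
// CSR or CSC sparse index.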
std::vector<int64_t> indptr_shape({n_major + 1});
std::shared_ptr<Tensor> indptr_tensor =
std::make_shared<Tensor>(index_value_type_, indptr_buffer, indptr_shape);
std::vector<int64_t> indices_shape({nonzero_count});
std::shared_ptr<Tensor> indices_tensor =
std::make_shared<Tensor>(index_value_type_, indices_buffer, indices_shape);
if (axis_ == SparseMatrixCompressedAxis::ROW) {
sparse_index = std::make_shared<SparseCSRIndex>(indptr_tensor, indices_tensor);
} else {
sparse_index = std::make_shared<SparseCSCIndex>(indptr_tensor, indices_tensor);
}
data = std::move(values_buffer);
return Status::OK();
}
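// Conversion results: the sparse index (CSR or CSC) and the packed
// non-zero values.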
std::shared_ptr<SparseIndex> sparse_index;
std::shared_ptr<Buffer> data;
private:
SparseMatrixCompressedAxis axis_;
const Tensor& tensor_;
const std::shared_ptr<DataType>& index_value_type_;
MemoryPool* pool_;
};
} // namespace
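
// Converts a dense 2-D tensor into CSR/CSC form: a sparse index
// (indptr + indices) plus a buffer of packed non-zero values.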
Status MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis axis,
const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool,
std::shared_ptr<SparseIndex>* out_sparse_index,
std::shared_ptr<Buffer>* out_data) {
SparseCSXMatrixConverter converter(axis, tensor, index_value_type, pool);
RETURN_NOT_OK(converter.Convert());
*out_sparse_index = converter.sparse_index;
*out_data = converter.data;
return Status::OK();
}
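
// Rebuilds a dense row-major tensor from CSR/CSC indptr, indices, and the
// packed non-zero values in raw_data.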
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSXMatrix(
SparseMatrixCompressedAxis axis, MemoryPool* pool,
const std::shared_ptr<Tensor>& indptr, const std::shared_ptr<Tensor>& indices,
const int64_t non_zero_length, const std::shared_ptr<DataType>& value_type,
const std::vector<int64_t>& shape, const int64_t tensor_size, const uint8_t* raw_data,
const std::vector<std::string>& dim_names) {
const auto* indptr_data = indptr->raw_data();
const auto* indices_data = indices->raw_data();
const int indptr_elsize = GetByteWidth(*indptr->type());
const int indices_elsize = GetByteWidth(*indices->type());
const auto& fw_value_type = checked_cast<const FixedWidthType&>(*value_type);
const int value_elsize = GetByteWidth(fw_value_type);
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * tensor_size, pool));
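// Start from an all-zero dense buffer; only the stored non-zero values are
// written below.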
auto values = values_buffer->mutable_data();
std::fill_n(values, value_elsize * tensor_size, 0);
std::vector<int64_t> strides;
RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides));
const auto nc = shape[1];
int64_t offset = 0;
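// indptr[i] and indptr[i + 1] delimit the stored values for major index i;
// raw_data holds the non-zero values packed in the same order.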
for (int64_t i = 0; i < indptr->size() - 1; ++i) {
const auto start =
SparseTensorConverterMixin::GetIndexValue(indptr_data, indptr_elsize);
const auto stop = SparseTensorConverterMixin::GetIndexValue(
indptr_data + indptr_elsize, indptr_elsize);
for (int64_t j = start; j < stop; ++j) {
const auto index = SparseTensorConverterMixin::GetIndexValue(
indices_data + j * indices_elsize, indices_elsize);
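// Compute the row-major byte offset of the dense element: for CSR, i is
// the row and index the column; for CSC, i is the column and index the row.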
switch (axis) {
case SparseMatrixCompressedAxis::ROW:
offset = (index + i * nc) * value_elsize;
break;
case SparseMatrixCompressedAxis::COLUMN:
offset = (i + index * nc) * value_elsize;
break;
}
std::copy_n(raw_data, value_elsize, values + offset);
raw_data += value_elsize;
}
indptr_data += indptr_elsize;
}
return std::make_shared<Tensor>(value_type, std::move(values_buffer), shape, strides,
dim_names);
}
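
// Densifies a CSR matrix (values compressed along rows).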
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSRMatrix(
MemoryPool* pool, const SparseCSRMatrix* sparse_tensor) {
const auto& sparse_index =
internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
const auto& indptr = sparse_index.indptr();
const auto& indices = sparse_index.indices();
const auto non_zero_length = sparse_tensor->non_zero_length();
return MakeTensorFromSparseCSXMatrix(
SparseMatrixCompressedAxis::ROW, pool, indptr, indices, non_zero_length,
sparse_tensor->type(), sparse_tensor->shape(), sparse_tensor->size(),
sparse_tensor->raw_data(), sparse_tensor->dim_names());
}
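
// Densifies a CSC matrix (values compressed along columns).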
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSCMatrix(
MemoryPool* pool, const SparseCSCMatrix* sparse_tensor) {
const auto& sparse_index =
internal::checked_cast<const SparseCSCIndex&>(*sparse_tensor->sparse_index());
const auto& indptr = sparse_index.indptr();
const auto& indices = sparse_index.indices();
const auto non_zero_length = sparse_tensor->non_zero_length();
return MakeTensorFromSparseCSXMatrix(
SparseMatrixCompressedAxis::COLUMN, pool, indptr, indices, non_zero_length,
sparse_tensor->type(), sparse_tensor->shape(), sparse_tensor->size(),
sparse_tensor->raw_data(), sparse_tensor->dim_names());
}
} // namespace internal
} // namespace arrow