// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/tensor/converter.h"
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/visitor_inline.h"
namespace arrow {

class MemoryPool;

namespace internal {
namespace {

// ----------------------------------------------------------------------
// SparseTensorConverter for SparseCSRIndex and SparseCSCIndex (CSX matrices)
class SparseCSXMatrixConverter : private SparseTensorConverterMixin {
using SparseTensorConverterMixin::AssignIndex;
using SparseTensorConverterMixin::IsNonZero;
public:
SparseCSXMatrixConverter(SparseMatrixCompressedAxis axis, const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool)
: axis_(axis), tensor_(tensor), index_value_type_(index_value_type), pool_(pool) {}
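
// Builds indptr, indices, and the packed non-zero values for the requested
// compressed axis.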
Status Convert() {
RETURN_NOT_OK(::arrow::internal::CheckSparseIndexMaximumValue(index_value_type_,
tensor_.shape()));
const int index_elsize = GetByteWidth(*index_value_type_);
const int value_elsize = GetByteWidth(*tensor_.type());
const int64_t ndim = tensor_.ndim();
if (ndim > 2) {
return Status::Invalid("Invalid tensor dimension");
}
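// The compressed (major) axis is the row axis for CSR and the column axis
// for CSC; the remaining axis is the minor axis.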
const int major_axis = static_cast<int>(axis_);
const int64_t n_major = tensor_.shape()[major_axis];
const int64_t n_minor = tensor_.shape()[1 - major_axis];
ARROW_ASSIGN_OR_RAISE(int64_t nonzero_count, tensor_.CountNonZero());
std::shared_ptr<Buffer> indptr_buffer;
std::shared_ptr<Buffer> indices_buffer;
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * nonzero_count, pool_));
auto* values = values_buffer->mutable_data();
const auto* tensor_data = tensor_.raw_data();
if (ndim <= 1) {
return Status::NotImplemented("TODO for ndim <= 1");
} else {
ARROW_ASSIGN_OR_RAISE(indptr_buffer,
AllocateBuffer(index_elsize * (n_major + 1), pool_));
auto* indptr = indptr_buffer->mutable_data();
ARROW_ASSIGN_OR_RAISE(indices_buffer,
AllocateBuffer(index_elsize * nonzero_count, pool_));
auto* indices = indices_buffer->mutable_data();
std::vector<int64_t> coords(2);
int64_t k = 0;
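// The first indptr entry is always zero.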
std::fill_n(indptr, index_elsize, 0);
indptr += index_elsize;
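// Walk the matrix one major-axis slice at a time. Each non-zero element
// contributes its value to `values` and its minor-axis index to `indices`;
// after every slice the running non-zero count k is appended to indptr.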
for (int64_t i = 0; i < n_major; ++i) {
for (int64_t j = 0; j < n_minor; ++j) {
if (axis_ == SparseMatrixCompressedAxis::ROW) {
coords = {i, j};
} else {
coords = {j, i};
}
const int64_t offset = tensor_.CalculateValueOffset(coords);
if (std::any_of(tensor_data + offset, tensor_data + offset + value_elsize,
IsNonZero)) {
std::copy_n(tensor_data + offset, value_elsize, values);
values += value_elsize;
AssignIndex(indices, j, index_elsize);
indices += index_elsize;
k++;
}
}
AssignIndex(indptr, k, index_elsize);
indptr += index_elsize;
}
}
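// Wrap the indptr and indices buffers as 1-D index tensors and build the
// CSR or CSC sparse index.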
std::vector<int64_t> indptr_shape({n_major + 1});
std::shared_ptr<Tensor> indptr_tensor =
std::make_shared<Tensor>(index_value_type_, indptr_buffer, indptr_shape);
std::vector<int64_t> indices_shape({nonzero_count});
std::shared_ptr<Tensor> indices_tensor =
std::make_shared<Tensor>(index_value_type_, indices_buffer, indices_shape);
if (axis_ == SparseMatrixCompressedAxis::ROW) {
sparse_index = std::make_shared<SparseCSRIndex>(indptr_tensor, indices_tensor);
} else {
sparse_index = std::make_shared<SparseCSCIndex>(indptr_tensor, indices_tensor);
}
data = std::move(values_buffer);
return Status::OK();
}
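// Conversion results: the sparse index (CSR or CSC) and the packed
// non-zero values.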
std::shared_ptr<SparseIndex> sparse_index;
std::shared_ptr<Buffer> data;
private:
SparseMatrixCompressedAxis axis_;
const Tensor& tensor_;
const std::shared_ptr<DataType>& index_value_type_;
MemoryPool* pool_;
};
} // namespace
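
// Converts a dense 2-D tensor into CSR/CSC form: a sparse index
// (indptr + indices) plus a buffer of packed non-zero values.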
Status MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis axis,
const Tensor& tensor,
const std::shared_ptr<DataType>& index_value_type,
MemoryPool* pool,
std::shared_ptr<SparseIndex>* out_sparse_index,
std::shared_ptr<Buffer>* out_data) {
SparseCSXMatrixConverter converter(axis, tensor, index_value_type, pool);
RETURN_NOT_OK(converter.Convert());
*out_sparse_index = converter.sparse_index;
*out_data = converter.data;
return Status::OK();
}
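
// Rebuilds a dense row-major tensor from CSR/CSC indptr, indices, and the
// packed non-zero values in raw_data.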
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSXMatrix(
SparseMatrixCompressedAxis axis, MemoryPool* pool,
const std::shared_ptr<Tensor>& indptr, const std::shared_ptr<Tensor>& indices,
const int64_t non_zero_length, const std::shared_ptr<DataType>& value_type,
const std::vector<int64_t>& shape, const int64_t tensor_size, const uint8_t* raw_data,
const std::vector<std::string>& dim_names) {
const auto* indptr_data = indptr->raw_data();
const auto* indices_data = indices->raw_data();
const int indptr_elsize = GetByteWidth(*indptr->type());
const int indices_elsize = GetByteWidth(*indices->type());
const auto& fw_value_type = checked_cast<const FixedWidthType&>(*value_type);
const int value_elsize = GetByteWidth(fw_value_type);
ARROW_ASSIGN_OR_RAISE(auto values_buffer,
AllocateBuffer(value_elsize * tensor_size, pool));
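// Start from an all-zero dense buffer; only the stored non-zero values are
// written below.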
auto values = values_buffer->mutable_data();
std::fill_n(values, value_elsize * tensor_size, 0);
std::vector<int64_t> strides;
RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides));
const auto nc = shape[1];
int64_t offset = 0;
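// indptr[i] and indptr[i + 1] delimit the stored values for major index i;
// raw_data holds the non-zero values packed in the same order.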
for (int64_t i = 0; i < indptr->size() - 1; ++i) {
const auto start =
SparseTensorConverterMixin::GetIndexValue(indptr_data, indptr_elsize);
const auto stop = SparseTensorConverterMixin::GetIndexValue(
indptr_data + indptr_elsize, indptr_elsize);
for (int64_t j = start; j < stop; ++j) {
const auto index = SparseTensorConverterMixin::GetIndexValue(
indices_data + j * indices_elsize, indices_elsize);
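// Compute the row-major byte offset of the dense element: for CSR, i is
// the row and index the column; for CSC, i is the column and index the row.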
switch (axis) {
case SparseMatrixCompressedAxis::ROW:
offset = (index + i * nc) * value_elsize;
break;
case SparseMatrixCompressedAxis::COLUMN:
offset = (i + index * nc) * value_elsize;
break;
}
std::copy_n(raw_data, value_elsize, values + offset);
raw_data += value_elsize;
}
indptr_data += indptr_elsize;
}
return std::make_shared<Tensor>(value_type, std::move(values_buffer), shape, strides,
dim_names);
}
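
// Densifies a CSR matrix (values compressed along rows).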
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSRMatrix(
MemoryPool* pool, const SparseCSRMatrix* sparse_tensor) {
const auto& sparse_index =
internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
const auto& indptr = sparse_index.indptr();
const auto& indices = sparse_index.indices();
const auto non_zero_length = sparse_tensor->non_zero_length();
return MakeTensorFromSparseCSXMatrix(
SparseMatrixCompressedAxis::ROW, pool, indptr, indices, non_zero_length,
sparse_tensor->type(), sparse_tensor->shape(), sparse_tensor->size(),
sparse_tensor->raw_data(), sparse_tensor->dim_names());
}
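
// Densifies a CSC matrix (values compressed along columns).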
Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSCMatrix(
MemoryPool* pool, const SparseCSCMatrix* sparse_tensor) {
const auto& sparse_index =
internal::checked_cast<const SparseCSCIndex&>(*sparse_tensor->sparse_index());
const auto& indptr = sparse_index.indptr();
const auto& indices = sparse_index.indices();
const auto non_zero_length = sparse_tensor->non_zero_length();
return MakeTensorFromSparseCSXMatrix(
SparseMatrixCompressedAxis::COLUMN, pool, indptr, indices, non_zero_length,
sparse_tensor->type(), sparse_tensor->shape(), sparse_tensor->size(),
sparse_tensor->raw_data(), sparse_tensor->dim_names());
}
} // namespace internal
} // namespace arrow