/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file cast_storage-inl.h
* \brief CPU implementation of the cast_storage operator for dense and sparse tensors
*/
#ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_
#define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_
#include <dmlc/timer.h>
#include <mxnet/ndarray.h>
#include <vector>
#include <algorithm>
#include "../mxnet_op.h"
#include "../operator_common.h"
#include "../../src/operator/tensor/init_op.h"
#ifdef __CUDACC__
#include "./cast_storage-inl.cuh"
#endif // __CUDACC__
#if MXNET_USE_MKLDNN == 1
#include "../nn/mkldnn/mkldnn_base-inl.h"
#endif
namespace mxnet {
namespace op {
/*!
* \brief CPU kernel for marking the row_idx entries of an RSP tensor, one row per thread.
*/
struct MarkRspRowIdx {
// i represents the row index of the tensor data
template<typename DType, typename RType>
MSHADOW_CINLINE static void Map(int i,
RType* row_idx,
const DType* data,
const nnvm::dim_t row_length) {
using nnvm::dim_t;
dim_t j = 0;
dim_t offset = i * row_length;
for (; j < row_length; ++j) {
if (data[offset+j] != 0) {
break;
}
}
if (row_length == j) {
row_idx[i] = 0; // mark as zero for zero row
} else {
row_idx[i] = 1; // mark as one for non-zero row
}
}
};
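// Example (illustrative): for a 3x2 dense tensor [[0, 0], [5, 0], [0, 0]]
// with row_length = 2, this kernel produces row_idx = [0, 1, 0]; only
// row 1 holds non-zero data.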
/*!
* \brief CPU implementation of casting a dns tensor to rsp type.
*/
inline void CastStorageDnsRspImpl(const OpContext& ctx,
const cpu& cpu_dev,
const TBlob& dns,
NDArray* rsp) {
using namespace rowsparse;
using namespace mshadow;
using nnvm::dim_t;
CHECK(rsp != nullptr);
CHECK_EQ(rsp->storage_type(), kRowSparseStorage);
CHECK_EQ(dns.shape_, rsp->shape());
mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(kIdx), RType, { // row idx type
const dim_t num_rows = dns.shape_[0];
const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim());
rsp->CheckAndAllocAuxData(kIdx, Shape1(num_rows));
TBlob row_idx_blob = rsp->aux_data(kIdx);
RType* row_idx = row_idx_blob.dptr<RType>();
dim_t num_threads = num_rows;
mxnet_op::Kernel<MarkRspRowIdx, cpu>::Launch(s, num_threads,
row_idx, dns.dptr<DType>(), row_length);
dim_t nnr = 0;
nnr = common::ParallelAccumulate(row_idx, num_rows, nnr);
rsp->set_aux_shape(kIdx, Shape1(nnr));
if (0 == nnr) return;
auto storage_shape = dns.shape_;
storage_shape[0] = nnr;
rsp->CheckAndAllocData(storage_shape);
auto dns_data = dns.get_with_shape<cpu, 2, DType>(Shape2(num_rows, row_length), s);
auto rsp_data = rsp->data().get_with_shape<cpu, 2, DType>(Shape2(nnr, row_length), s);
dim_t idx = 0;
for (dim_t i = 0; i < num_rows; ++i) {
if (row_idx[i] > 0) {
row_idx[idx] = i;
Copy(rsp_data[idx], dns_data[i], s);
++idx;
}
}
});
});
}
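// Example (illustrative): casting the 3x2 dense tensor [[0, 0], [5, 6], [0, 0]]
// to row_sparse yields nnr = 1, row_idx = [1], and a 1x2 data blob [[5, 6]].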
// TODO(haibin): using memcpy instead of assigning each individual element would be much faster
struct CastStorageRspDnsKernel {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i,
const nnvm::dim_t row_length,
const IType* idx,
const DType *data,
DType* dns) {
using nnvm::dim_t;
IType rid = idx[i];
dim_t dns_offset = rid * row_length;
dim_t rsp_offset = i * row_length;
for (dim_t col = 0; col < row_length; col++) {
dns[dns_offset + col] = data[rsp_offset + col];
}
}
};
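// Example (illustrative): thread i copies rsp row i into dns row idx[i];
// with idx = [0, 2], rsp rows 0 and 1 land in dns rows 0 and 2.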
/*!
* \brief This function assumes that the memory for dns has already been
* allocated, since the shape is known at the binding stage.
*/
template<typename xpu>
void CastStorageRspDnsImpl(const OpContext& ctx,
const NDArray& rsp,
TBlob* dns) {
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
CHECK_EQ(rsp.storage_type(), kRowSparseStorage);
using nnvm::dim_t;
MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, {
MSHADOW_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, {
// assign zeros
mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(s, dns->Size(), dns->dptr<DType>());
if (rsp.storage_initialized()) {
// copy over row by row
auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D<xpu, IType>(s).dptr_;
auto in_data = rsp.data().dptr<DType>();
auto out_data = dns->dptr<DType>();
auto shape = rsp.shape();
const dim_t num_rows = rsp.aux_shape(rowsparse::kIdx).Size();
const dim_t row_length = shape.ProdShape(1, shape.ndim());
const dim_t num_threads = num_rows;
mxnet_op::Kernel<CastStorageRspDnsKernel, xpu>::Launch(s, num_threads,
row_length, in_idx, in_data, out_data);
}
});
});
}
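// Example (illustrative): an rsp tensor of shape 3x2 with row_idx = [1] and
// data = [[5, 6]] casts back to the dense tensor [[0, 0], [5, 6], [0, 0]].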
/*!
* \brief CPU kernel for initializing the indptr in a csr matrix.
*/
struct FillCsrIndPtr {
/*!
* \brief Counts the non-zeros in the i-th row of the dns tensor and stores the count in indptr[i+1].
* \param i the i-th row of the dns tensor
* \param indptr the indptr of the csr tensor
* \param dns the dns tensor
* \param num_rows number of rows of the dns tensor
* \param num_cols number of columns of the dns tensor
*/
template<typename DType, typename IType>
MSHADOW_CINLINE static void Map(int i,
IType* indptr,
const DType* dns,
const nnvm::dim_t num_rows,
const nnvm::dim_t num_cols) {
using nnvm::dim_t;
indptr[i+1] = 0;
const dim_t offset = i * num_cols;
for (dim_t j = 0; j < num_cols; ++j) {
if (dns[offset+j] != 0) {
++indptr[i+1];
}
}
}
};
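// Example (illustrative): for a dns matrix [[0, 2], [3, 0], [0, 0]], this
// kernel writes the per-row counts indptr[1..3] = [1, 1, 0]; the prefix sum
// in CastStorageDnsCsrImpl below turns them into indptr = [0, 1, 2, 2].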
/*!
* \brief CPU kernel for initializing the col_idx and value arrays of the csr matrix.
*/
struct FillCsrColIdxAndVals {
/*!
* \brief Writes the values and column indices of the i-th row's non-zeros, starting at position indptr[i].
* \param i the i-th row of the dns tensor
* \param val value array of the csr tensor
* \param col_idx column idx array of the csr tensor
* \param indptr indptr array of the csr tensor
* \param dns dns tensor
* \param num_rows number of rows of the dns tensor
* \param num_cols number of columns of the dns tensor
*/
template<typename DType, typename IType, typename CType>
MSHADOW_CINLINE static void Map(int i,
DType* val,
CType* col_idx,
const IType* indptr,
const DType* dns,
const nnvm::dim_t num_rows,
const nnvm::dim_t num_cols) {
using nnvm::dim_t;
const dim_t offset = i * num_cols;
IType k = indptr[i];
for (dim_t j = 0; j < num_cols; ++j) {
if (dns[offset+j] != 0) {
val[k] = dns[offset+j];
col_idx[k] = j;
++k;
}
}
}
};
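// Example (illustrative): continuing the example above with
// indptr = [0, 1, 2, 2], this kernel fills val = [2, 3] and col_idx = [1, 0].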
/*!
* \brief CPU implementation of casting a dns matrix to csr type.
*/
inline void CastStorageDnsCsrImpl(const OpContext& ctx,
const cpu& cpu_dev,
const TBlob& dns,
NDArray* csr) {
CHECK(csr != nullptr);
CHECK_EQ(csr->storage_type(), kCSRStorage);
CHECK_EQ(dns.shape_.ndim(), 2);
CHECK_EQ(dns.shape_, csr->shape());
using mshadow::Shape1;
using nnvm::dim_t;
mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type
MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col idx type
const dim_t num_rows = dns.shape_[0];
const dim_t num_cols = dns.shape_[1];
csr->CheckAndAllocAuxData(csr::kIndPtr, mshadow::Shape1(num_rows+1));
IType* indptr = csr->aux_data(csr::kIndPtr).dptr<IType>();
DType* dns_data = dns.dptr<DType>();
dim_t num_threads = num_rows;
mxnet_op::Kernel<FillCsrIndPtr, cpu>::Launch(s, num_threads,
indptr, dns_data, num_rows, num_cols);
// single thread to accumulate indptr
// indptr[num_rows] indicates the number of non-zero elements
indptr[0] = 0;
for (dim_t i = 0; i < num_rows; ++i) {
indptr[i+1] += indptr[i];
}
// allocate column idx array and value array
csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast<index_t>(indptr[num_rows])));
csr->CheckAndAllocData(Shape1(static_cast<index_t>(indptr[num_rows])));
// fill col_idx and value arrays of the csr
mxnet_op::Kernel<FillCsrColIdxAndVals, cpu>::Launch(s, num_threads,
csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(),
indptr, dns_data, num_rows, num_cols);
});
});
});
}
/*!
* \brief Kernel for copying the data of a csr tensor into its corresponding dns matrix.
*/
struct CopyCsrDataToDns {
/*!
* \brief Scatters the non-zeros of the i-th csr row into the i-th row of the dns tensor.
* \param i the i-th row of the dns tensor
* \param dns_data data blob of the dns tensor
* \param col_idx column idx array of the csr tensor
* \param indptr indptr array of the csr tensor
* \param csr_data data blob of the csr tensor
* \param num_cols number of columns of the dns tensor
*/
template<typename DType, typename IType, typename CType>
MSHADOW_XINLINE static void Map(int i,
DType* dns_data,
const CType* col_idx,
const IType* indptr,
const DType* csr_data,
const nnvm::dim_t num_cols) {
const nnvm::dim_t offset = i * num_cols;
for (IType j = indptr[i]; j < indptr[i+1]; ++j) {
dns_data[offset+col_idx[j]] = csr_data[j];
}
}
};
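// Example (illustrative): with indptr = [0, 1, 2, 2], col_idx = [1, 0], and
// csr_data = [2, 3], a zero-initialized 3x2 dns buffer becomes
// [[0, 2], [3, 0], [0, 0]].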
/*!
* \brief Casts a csr matrix to dns format.
*/
template<typename xpu>
void CastStorageCsrDnsImpl(const OpContext& ctx,
const NDArray& csr,
TBlob* dns) {
CHECK(dns != nullptr);
CHECK_EQ(csr.storage_type(), kCSRStorage);
CHECK_EQ(dns->shape_.ndim(), 2);
CHECK_EQ(dns->shape_, csr.shape());
using nnvm::dim_t;
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIndPtr), IType, { // indptr type
MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIdx), CType, { // col idx type
const dim_t num_rows = dns->shape_[0];
const dim_t num_cols = dns->shape_[1];
DType* dns_data = dns->dptr<DType>();
dim_t num_threads = dns->shape_.Size();
mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(s, num_threads, dns_data);
if (!csr.storage_initialized()) return;
const IType* indptr = csr.aux_data(csr::kIndPtr).dptr<IType>();
const CType* col_idx = csr.aux_data(csr::kIdx).dptr<CType>();
const DType* csr_data = csr.data().dptr<DType>();
num_threads = num_rows;
mxnet_op::Kernel<CopyCsrDataToDns, xpu>::Launch(s, num_threads,
dns_data, col_idx, indptr, csr_data, num_cols);
});
});
});
}
/*!
* \brief Casts a csr matrix to another csr.
*/
template <typename xpu>
void CastStorageCsrCsrImpl(const OpContext& ctx, const NDArray& csr,
NDArray* output) {
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
if (!csr.storage_initialized()) {
FillZerosCsrImpl(s, *output);
return;
}
mxnet::ShapeVector aux_shapes({csr.aux_shape(csr::kIndPtr), csr.aux_shape(csr::kIdx)});
output->CheckAndAlloc(aux_shapes);
const TBlob& val = output->data();
const TBlob& indptr = output->aux_data(csr::kIndPtr);
const TBlob& idx = output->aux_data(csr::kIdx);
mxnet_op::copy(s, val, csr.data());
mxnet_op::copy(s, indptr, csr.aux_data(csr::kIndPtr));
mxnet_op::copy(s, idx, csr.aux_data(csr::kIdx));
}
/*!
* \brief Casts an rsp matrix to another rsp.
*/
template <typename xpu>
void CastStorageRspRspImpl(const OpContext& ctx, const NDArray& rsp,
NDArray* output) {
CHECK_EQ(rsp.storage_type(), output->storage_type())
<< "Copying with different storage type";
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
if (!rsp.storage_initialized()) {
FillZerosRspImpl(s, *output);
return;
}
auto aux_shape = rsp.aux_shape(rowsparse::kIdx);
output->CheckAndAlloc({aux_shape});
const TBlob& val = output->data();
const TBlob& idx = output->aux_data(rowsparse::kIdx);
const TBlob& from_val = rsp.data();
const TBlob& from_idx = rsp.aux_data(rowsparse::kIdx);
mxnet_op::copy(s, val, from_val);
mxnet_op::copy(s, idx, from_idx);
}
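/*!
* \brief Dispatches the storage cast according to the input and output storage
* types. Supported conversions: dns <-> rsp, dns <-> csr, rsp -> rsp,
* csr -> csr, and (when built with MKLDNN) dns -> dns layout conversion.
*/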
template<typename xpu>
void CastStorageComputeImpl(const OpContext& ctx,
const NDArray& input,
const NDArray& output) {
const auto src_stype = input.storage_type();
const auto dst_stype = output.storage_type();
if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) {
TBlob ret = output.data();
CastStorageRspDnsImpl<xpu>(ctx, input, &ret);
} else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) {
NDArray ret = output; // get rid of the const qualifier
CastStorageDnsRspImpl(ctx, xpu(), input.data(), &ret);
} else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) {
NDArray ret = output; // get rid of the const qualifier
CastStorageDnsCsrImpl(ctx, xpu(), input.data(), &ret);
} else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) {
TBlob ret = output.data();
CastStorageCsrDnsImpl<xpu>(ctx, input, &ret);
} else if (src_stype == kCSRStorage && dst_stype == kCSRStorage) {
NDArray ret = output;
CastStorageCsrCsrImpl<xpu>(ctx, input, &ret);
} else if (src_stype == kRowSparseStorage && dst_stype == kRowSparseStorage) {
NDArray ret = output;
CastStorageRspRspImpl<xpu>(ctx, input, &ret);
#if MXNET_USE_MKLDNN == 1
} else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) {
CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type);
// If either of them uses the MKLDNN layout, copy through MKLDNN memory.
if (input.IsMKLDNNData() || output.IsMKLDNNData()) {
NDArray tmp_input = input;
// If the input data is MKLDNN and is a view, we need to reorder the input
// data first.
if (input.IsMKLDNNData() && input.IsView())
tmp_input = input.Reorder2Default();
const mkldnn::memory *in_mem = tmp_input.GetMKLDNNData();
const_cast<NDArray &>(output).CopyFrom(*in_mem);
MKLDNNStream::Get()->Submit();
} else {
mxnet_op::copy(ctx.get_stream<xpu>(), output.data(), input.data());
}
#endif
} else {
LOG(FATAL) << "Storage type conversion from " << src_stype << " to " << dst_stype << " is not implemented";
}
}
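/*!
* \brief Parameter of the cast_storage operator: the target storage type (stype).
*/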
struct CastStorageParam : public dmlc::Parameter<CastStorageParam> {
int stype;
DMLC_DECLARE_PARAMETER(CastStorageParam) {
DMLC_DECLARE_FIELD(stype)
.add_enum("default", kDefaultStorage)
.add_enum("row_sparse", kRowSparseStorage)
.add_enum("csr", kCSRStorage)
.describe("Output storage type.");
}
};
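/*!
* \brief Infers the output storage type from the stype parameter and selects
* the dispatch mode: FCompute for dns -> dns (FComputeEx on CPU with MKLDNN)
* and FComputeEx for all sparse conversions.
*/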
inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 1U);
CHECK_NE(in_attrs->at(0), kUndefinedStorage)
<< "src ndarray's storage type must be specified";
const CastStorageParam& param = nnvm::get<CastStorageParam>(attrs.parsed);
CHECK_NE(param.stype, kUndefinedStorage)
<< "dst ndarray's storage type must be specified";
const auto& in_stype = in_attrs->at(0);
const auto& param_stype = static_cast<NDArrayStorageType>(param.stype);
bool dispatched = false;
// dns -> dns, dns -> rsp, dns -> csr
if (!dispatched && in_stype == kDefaultStorage && param_stype == kDefaultStorage) {
// dns -> dns
DispatchMode mode = DispatchMode::kFCompute;
#if MXNET_USE_MKLDNN == 1
// If MKLDNN is enabled and the array resides in CPU memory, it may be
// stored in the MKLDNN layout, so the layout must be converted explicitly.
if (dev_mask == kCPU)
mode = DispatchMode::kFComputeEx;
#endif
dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, mode);
}
if (!dispatched && in_stype == kDefaultStorage &&
(param_stype == kRowSparseStorage || param_stype == kCSRStorage)) {
// dns -> rsp, dns -> csr
dispatched = storage_type_assign(out_attrs, param_stype,
dispatch_mode, DispatchMode::kFComputeEx);
}
if (!dispatched && in_stype == kRowSparseStorage &&
(param_stype == kRowSparseStorage || param_stype == kDefaultStorage)) {
// rsp -> rsp, rsp -> dns
dispatched = storage_type_assign(out_attrs, param_stype,
dispatch_mode, DispatchMode::kFComputeEx);
}
if (!dispatched && in_stype == kCSRStorage &&
(param_stype == kCSRStorage || param_stype == kDefaultStorage)) {
// csr -> csr, csr -> dns
dispatched = storage_type_assign(out_attrs, param_stype,
dispatch_mode, DispatchMode::kFComputeEx);
}
return dispatched;
}
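/*!
* \brief FComputeEx entry point of the cast_storage operator; forwards the
* single input to CastStorageComputeImpl. Only kWriteTo (or kNullOp) requests
* are supported.
*/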
template<typename xpu>
void CastStorageComputeEx(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 1);
CHECK_EQ(outputs.size(), 1);
if (req[0] == kNullOp) return;
CHECK_EQ(req[0], kWriteTo) << "CastStorageComputeEx expects req[0] == kWriteTo";
CastStorageComputeImpl<xpu>(ctx, inputs[0], outputs[0]);
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_