| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file broadcast_reduce_op.h |
| * \brief Function definition of broadcast and reduce operators |
| */ |
| #ifndef MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ |
| #define MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ |
| |
#include <algorithm>
#include <vector>
#include <string>
#include <unordered_map>
#include <sstream>
| #include "../../common/utils.h" |
| #include "../nn/moments-inl.h" |
| #include "../tensor/broadcast_reduce_op.h" |
| #include "../tensor/elemwise_binary_broadcast_op.h" |
| #include "../../api/operator/op_utils.h" |
| |
| namespace mxnet { |
| namespace op { |
| |
| struct NumpyReduceAxesParam : public dmlc::Parameter<NumpyReduceAxesParam> { |
| dmlc::optional<mxnet::Tuple<int>> axis; |
| dmlc::optional<int> dtype; |
| bool keepdims; |
| dmlc::optional<double> initial; |
| DMLC_DECLARE_PARAMETER(NumpyReduceAxesParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(dmlc::optional<mxnet::Tuple<int>>()) |
| .describe( |
| "Axis or axes along which a sum is performed. The default, axis=None, will sum " |
| "all of the elements of the input array. If axis is negative it counts from the " |
| "last to the first axis."); |
| DMLC_DECLARE_FIELD(dtype) |
| .add_enum("float16", mshadow::kFloat16) |
| .add_enum("float32", mshadow::kFloat32) |
| .add_enum("float64", mshadow::kFloat64) |
| .add_enum("int8", mshadow::kInt8) |
| .add_enum("int32", mshadow::kInt32) |
| .add_enum("int64", mshadow::kInt64) |
| .add_enum("bool", mshadow::kBool) |
| .set_default(dmlc::optional<int>()) |
| .describe( |
| "The type of the returned array and of the accumulator in which the elements are " |
| "summed. The dtype of a is used by default unless a has an integer dtype of less " |
| "precision than the default platform integer. In that case, if a is signed then " |
| "the platform integer is used while if a is unsigned then an unsigned integer of " |
| "the same precision as the platform integer is used."); |
    DMLC_DECLARE_FIELD(keepdims).set_default(false).describe(
        "If this is set to `True`, the reduced axes are left "
        "in the result as dimensions with size one.");
| DMLC_DECLARE_FIELD(initial) |
| .set_default(dmlc::optional<double>()) |
| .describe("Starting value for the sum."); |
| } |
| |
| bool operator==(const NumpyReduceAxesParam& other) const { |
| return this->axis == other.axis && this->dtype == other.dtype && |
| this->keepdims == other.keepdims && this->initial == other.initial; |
| } |
| |
| void SetAttrDict(std::unordered_map<std::string, std::string>* dict) { |
| std::ostringstream axis_s, dtype_s, keepdims_s, initial_s; |
| axis_s << axis; |
| dtype_s << dtype; |
| keepdims_s << keepdims; |
| initial_s << initial; |
| (*dict)["axis"] = axis_s.str(); |
| if (dtype.has_value()) { |
| (*dict)["dtype"] = MXNetTypeWithBool2String(dtype.value()); |
| } else { |
| (*dict)["dtype"] = dtype_s.str(); |
| } |
| (*dict)["keepdims"] = keepdims_s.str(); |
| (*dict)["initial"] = initial_s.str(); |
| } |
| }; |
| |
| struct NumpyReduceAxesNoDTypeParam : public dmlc::Parameter<NumpyReduceAxesNoDTypeParam> { |
| dmlc::optional<mxnet::Tuple<int>> axis; |
| bool keepdims; |
| dmlc::optional<double> initial; |
| DMLC_DECLARE_PARAMETER(NumpyReduceAxesNoDTypeParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(dmlc::optional<mxnet::Tuple<int>>()) |
| .describe( |
| "Axis or axes along which a sum is performed. The default, axis=None, will sum " |
| "all of the elements of the input array. If axis is negative it counts from the " |
| "last to the first axis."); |
    DMLC_DECLARE_FIELD(keepdims).set_default(false).describe(
        "If this is set to `True`, the reduced axes are left "
        "in the result as dimensions with size one.");
| DMLC_DECLARE_FIELD(initial) |
| .set_default(dmlc::optional<double>()) |
| .describe("Starting value for the sum."); |
| } |
| void SetAttrDict(std::unordered_map<std::string, std::string>* dict) { |
| std::ostringstream axis_s, keepdims_s, initial_s; |
| axis_s << axis; |
| keepdims_s << keepdims; |
| initial_s << initial; |
| (*dict)["axis"] = axis_s.str(); |
| (*dict)["keepdims"] = keepdims_s.str(); |
| (*dict)["initial"] = initial_s.str(); |
| } |
| }; |
| |
| struct NumpyReduceAxesBoolParam : public dmlc::Parameter<NumpyReduceAxesBoolParam> { |
| dmlc::optional<mxnet::Tuple<int>> axis; |
| bool keepdims; |
| DMLC_DECLARE_PARAMETER(NumpyReduceAxesBoolParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(dmlc::optional<mxnet::Tuple<int>>()) |
| .describe( |
| "Axis or axes along which a sum is performed. The default, axis=None, will sum " |
| "all of the elements of the input array. If axis is negative it counts from the " |
| "last to the first axis."); |
    DMLC_DECLARE_FIELD(keepdims).set_default(false).describe(
        "If this is set to `True`, the reduced axes are left "
        "in the result as dimensions with size one.");
| } |
| void SetAttrDict(std::unordered_map<std::string, std::string>* dict) { |
| std::ostringstream axis_s, keepdims_s; |
| axis_s << axis; |
| keepdims_s << keepdims; |
| (*dict)["axis"] = axis_s.str(); |
| (*dict)["keepdims"] = keepdims_s.str(); |
| } |
| }; |
| |
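/*!
 * \brief Compute the output shape of a reduction over `axis` for an input of
 *        shape `ishape`. Negative axes are normalized and duplicate axes are
 *        rejected; axis = () returns the input shape unchanged.
 *        Illustrative values: ishape = (2, 3, 4) with axis = (0, 2) gives
 *        (3,) when keepdims is false and (1, 3, 1) when keepdims is true.
 */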
| inline TShape NumpyReduceAxesShapeImpl(const TShape& ishape, |
| const dmlc::optional<mxnet::Tuple<int>>& axis, |
| bool keepdims) { |
| // If input is a scalar, output should be a scalar too |
| if (ishape.ndim() == 0) { |
| if (axis.has_value()) { |
| const mxnet::Tuple<int>& axes = axis.value(); |
| if (axes.ndim() > 0) { |
| CHECK_EQ(axes.ndim(), 1); |
| CHECK(axes[0] == 0 || axes[0] == -1); |
| } |
| } |
| return TShape(0, -1); |
| } |
| |
| // axis=None, do global reduction |
| if (!axis.has_value()) { |
| if (keepdims) { |
| return TShape(ishape.ndim(), 1); |
| } else { |
| return TShape(0, -1); |
| } |
| } |
| |
  // axis = () returns the input unchanged (identity)
| if (axis.value().ndim() == 0) { |
| return ishape; |
| } |
| |
| // axis has value |
| mxnet::Tuple<int> axes(axis.value()); |
| for (index_t i = 0; i < axes.ndim(); i++) { |
| if (axes[i] < 0) { |
| axes[i] += ishape.ndim(); |
| } |
| } |
| std::sort(axes.begin(), axes.end()); |
| |
| for (index_t i = 1; i < axes.ndim(); i++) { |
| CHECK_LT(axes[i - 1], axes[i]) << "Reduction axes have duplicates " << axes; |
| } |
  CHECK_LT(axes[axes.ndim() - 1], ishape.ndim())
      << "Reduction axis " << axes[axes.ndim() - 1] << " exceeds input dimensions " << ishape;
  CHECK_GE(axes[0], 0) << "Reduction axis " << axis.value() << " exceeds input dimensions "
                       << ishape;
| |
| TShape oshape; |
| if (keepdims) { |
| oshape = TShape(ishape); |
| } else { |
| oshape = TShape(ishape.ndim() - axes.ndim(), -1); |
| } |
| |
| if (keepdims) { |
| for (index_t i = 0; i < axes.ndim(); ++i) { |
| oshape[axes[i]] = 1; |
| } |
| } else { |
| for (index_t i = 0, j = 0, k = 0; i < ishape.ndim(); ++i) { |
| if (j < axes.ndim() && i == axes[j]) { |
| ++j; |
| continue; |
| } |
| oshape[k++] = ishape[i]; |
| } |
| } |
| return oshape; |
| } |
| |
| inline bool NumpyReduceAxesShape(const nnvm::NodeAttrs& attrs, |
| std::vector<TShape>* in_attrs, |
| std::vector<TShape>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| if (!shape_is_known(in_attrs->at(0))) { |
| return false; |
| } |
| const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed); |
| SHAPE_ASSIGN_CHECK( |
| *out_attrs, 0, NumpyReduceAxesShapeImpl((*in_attrs)[0], param.axis, param.keepdims)); |
| return shape_is_known(out_attrs->at(0)); |
| } |
| |
| inline bool NumpyReduceAxesBoolShape(const nnvm::NodeAttrs& attrs, |
| std::vector<TShape>* in_attrs, |
| std::vector<TShape>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| if (!shape_is_known(in_attrs->at(0))) { |
| return false; |
| } |
| const NumpyReduceAxesBoolParam& param = nnvm::get<NumpyReduceAxesBoolParam>(attrs.parsed); |
| SHAPE_ASSIGN_CHECK( |
| *out_attrs, 0, NumpyReduceAxesShapeImpl((*in_attrs)[0], param.axis, param.keepdims)); |
| return shape_is_known(out_attrs->at(0)); |
| } |
| |
| inline bool NumpyReduceAxesNoDTypeShape(const nnvm::NodeAttrs& attrs, |
| std::vector<TShape>* in_attrs, |
| std::vector<TShape>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| if (!shape_is_known(in_attrs->at(0))) { |
| return false; |
| } |
| const NumpyReduceAxesNoDTypeParam& param = nnvm::get<NumpyReduceAxesNoDTypeParam>(attrs.parsed); |
  // reject reductions over zero-size axes: max/min have no identity element
  bool is_all_reduced_axes_not_zero = true;
  const TShape& ishape = (*in_attrs)[0];
  if (param.axis.has_value()) {
    const mxnet::Tuple<int>& axes = param.axis.value();
    for (int i = 0; i < axes.ndim(); ++i) {
      if ((axes[i] >= 0) && (ishape[axes[i]] == 0)) {
        is_all_reduced_axes_not_zero = false;
        break;
      }
    }
  } else {
    if (ishape.Size() == 0) {
      // global reduction is only valid when the input has at least one element
      is_all_reduced_axes_not_zero = false;
    }
  }
  CHECK(is_all_reduced_axes_not_zero)
      << "zero-size array to reduction operation maximum which has no identity";
| SHAPE_ASSIGN_CHECK( |
| *out_attrs, 0, NumpyReduceAxesShapeImpl((*in_attrs)[0], param.axis, param.keepdims)); |
| return shape_is_known(out_attrs->at(0)); |
| } |
| |
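/*!
 * \brief Decide whether the reduction needs a higher-precision accumulator:
 *        only when the caller hints that safe accumulation may be required and
 *        either the input and output types differ or the input type is not
 *        already a 32- or 64-bit float.
 */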
| template <bool safe_acc_hint = false> |
| inline bool NeedSafeAcc(int itype, int otype) { |
| bool rule = (itype != otype) || (itype != mshadow::kFloat32 && itype != mshadow::kFloat64); |
| return safe_acc_hint && rule; |
| } |
| |
| namespace mxnet_op { |
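/*! \brief Kernel that fills an output with NaN, e.g. for the mean of an empty slice. */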
| struct set_to_nan { |
| template <typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out) { |
| out[i] = DType(nanf("")); |
| } |
| }; |
| |
| } // namespace mxnet_op |
| |
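/*!
 * \brief Reduce `input` along `axis` into `output` with a TVM-generated
 *        kernel; declared here and defined in the corresponding source file.
 */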
| void TVMOpReduce(const OpContext& ctx, |
| const TBlob& input, |
| const dmlc::optional<mxnet::Tuple<int>>& axis, |
| const TBlob& output, |
| const OpReqType req, |
| const std::string& reducer_name); |
| |
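/*!
 * \brief Forward pass shared by the numpy reduce operators.
 * \tparam reducer       the reduction op, e.g. mshadow_op::sum
 * \tparam safe_acc_hint whether a higher-precision accumulator may be needed
 * \tparam normalize     divide the result by the number of reduced elements
 *                       (turning a sum into a mean)
 * \tparam OP            element-wise transform applied before reduction
 */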
| template <typename xpu, |
| typename reducer, |
| bool safe_acc_hint = false, |
| bool normalize = false, |
| typename OP = op::mshadow_op::identity> |
| void NumpyReduceAxesCompute(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| if (req[0] == kNullOp) |
| return; |
| const auto& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed); |
| if (param.initial.has_value()) { |
| LOG(FATAL) << "initial is not supported yet"; |
| } |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| if (outputs[0].shape_.Size() == 0) |
| return; |
| if (inputs[0].shape_.Size() == 0 && outputs[0].shape_.Size() != 0) { |
| using namespace mxnet_op; |
| if (normalize) { |
| LOG(WARNING) << "WARNING: Mean of empty slice."; |
| if (mxnet::common::is_float(outputs[0].type_flag_)) { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Kernel<set_to_nan, xpu>::Launch(s, outputs[0].shape_.Size(), outputs[0].dptr<DType>()); |
| }); |
| } else { |
| LOG(WARNING) << "WARNING: nan is outside the range of" |
| << "representable values of type 'int'"; |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Kernel<set_zero, xpu>::Launch(s, outputs[0].shape_.Size(), outputs[0].dptr<DType>()); |
| }); |
| } |
| } else if (std::is_same<reducer, mshadow_op::sum>::value) { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Kernel<set_zero, xpu>::Launch(s, outputs[0].shape_.Size(), outputs[0].dptr<DType>()); |
| }); |
| } else { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Kernel<set_one, xpu>::Launch(s, outputs[0].shape_.Size(), outputs[0].dptr<DType>()); |
| }); |
| } |
| return; |
| } |
| CHECK_NE(req[0], kWriteInplace) << "Reduce does not support write in-place"; |
| #if MXNET_USE_TVM_OP |
| // If boolean ndarray, use the kernel generated by TVM |
| if (inputs[0].type_flag_ == mshadow::kBool) { |
| std::string reducer_name; |
| if (std::is_same<reducer, mshadow_op::sum>::value) { |
| reducer_name = "sum"; |
| } else { |
| LOG(FATAL) << "Only reduce op: `sum` is supported for boolean ndarrays"; |
| } |
| TVMOpReduce(ctx, inputs[0], param.axis, outputs[0], req[0], reducer_name); |
| if (normalize) { |
| using namespace mshadow::expr; |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { |
| auto out = outputs[0].FlatTo2D<xpu, OType>(s); |
| out /= scalar<OType>(inputs[0].Size() / outputs[0].Size()); |
| }); |
| } |
| return; |
| } |
| #endif |
| if (param.axis.has_value() && param.axis.value().ndim() == 0) { |
| UnaryOp::IdentityCompute<xpu>(attrs, ctx, inputs, req, outputs); |
| } |
| TShape small; |
| if (param.keepdims) { |
| small = outputs[0].shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(inputs[0].shape_, param.axis, true); |
| } |
| |
| if (NeedSafeAcc<safe_acc_hint>(inputs[0].type_flag_, outputs[0].type_flag_)) { |
| ReduceAxesComputeImpl<xpu, reducer, true, normalize, OP>(ctx, inputs, req, outputs, small); |
| } else { |
| ReduceAxesComputeImpl<xpu, reducer, false, normalize, OP>(ctx, inputs, req, outputs, small); |
| } |
| } |
| |
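/*! \brief Forward pass for reductions such as max/min that keep the input dtype. */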
| template <typename xpu, typename reducer, typename OP = op::mshadow_op::identity> |
| void NumpyReduceAxesNoDTypeCompute(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| const NumpyReduceAxesNoDTypeParam& param = nnvm::get<NumpyReduceAxesNoDTypeParam>(attrs.parsed); |
| if (param.initial.has_value()) { |
| LOG(FATAL) << "initial is not supported yet"; |
| } |
| if (inputs[0].shape_.Size() == 0U || outputs[0].shape_.Size() == 0U) |
| return; // zero-size tensor |
| if (param.axis.has_value() && param.axis.value().ndim() == 0) { |
| UnaryOp::IdentityCompute<xpu>(attrs, ctx, inputs, req, outputs); |
| } |
| TShape small; |
| if (param.keepdims) { |
| small = outputs[0].shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(inputs[0].shape_, param.axis, true); |
| } |
| ReduceAxesComputeImpl<xpu, reducer, false, false, OP>(ctx, inputs, req, outputs, small); |
| } |
| |
| template <typename xpu, typename reducer, typename OP = op::mshadow_op::NonZero, int init> |
| void NumpyReduceAxesBoolCompute(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| const NumpyReduceAxesBoolParam& param = nnvm::get<NumpyReduceAxesBoolParam>(attrs.parsed); |
| mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); |
| if (outputs[0].shape_.Size() == 0) |
| return; |
| if (inputs[0].shape_.Size() == 0 && outputs[0].shape_.Size() != 0) { |
| using namespace mxnet_op; |
| if (init == 0) { |
| Kernel<set_false, xpu>::Launch(s, outputs[0].shape_.Size(), outputs[0].dptr<bool>()); |
| } else { |
| Kernel<set_true, xpu>::Launch(s, outputs[0].shape_.Size(), outputs[0].dptr<bool>()); |
| } |
| return; |
| } |
| if (param.axis.has_value() && param.axis.value().ndim() == 0) { |
| UnaryOp::IdentityCompute<xpu>(attrs, ctx, inputs, req, outputs); |
| } |
| TShape small; |
| if (param.keepdims) { |
| small = outputs[0].shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(inputs[0].shape_, param.axis, true); |
| } |
| ReduceAxesComputeBoolImpl<xpu, reducer, false, false, OP>(ctx, inputs, req, outputs, small); |
| } |
| |
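/*!
 * \brief Forward pass for argmax/argmin. When axis is None, the input is
 *        viewed as the 2D shape (1, size) and searched along axis 1.
 */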
| template <typename xpu, typename reducer> |
| void NumpySearchAxisCompute(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| const ReduceAxisParam& param = nnvm::get<ReduceAxisParam>(attrs.parsed); |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| int axis = inputs[0].ndim(); |
| TBlob input = inputs[0]; |
| if (param.axis.has_value()) { |
| axis = param.axis.value(); |
| } else { |
| // If global reduction, reshape the input tensor into 2D shape (1, inputs[0].shape_.Size()) |
| // and search on axis = 1. |
| mxnet::TShape shape_2d(2, 1); |
| shape_2d[1] = input.shape_.Size(); |
| input = TBlob(input.dptr_, shape_2d, input.dev_mask(), input.type_flag_, input.dev_id()); |
| axis = 1; |
| } |
| axis = CheckAxis(axis, input.shape_.ndim()); |
| if (inputs[0].shape_.ndim() != 0) { |
| if (param.axis.has_value()) { |
| // cannot do argmax in an empty dimension |
| CHECK_NE(inputs[0].shape_[axis], 0) |
| << "searching input tensor of shape " << inputs[0].shape_ << " along axis = " << axis |
| << " of zero dim-size is not allowed"; |
| } else { |
| // cannot do argmax on an empty array |
| CHECK_NE(inputs[0].shape_.Size(), 0U) << "attempt to search an empty sequence"; |
| } |
| } |
| if (input.shape_.Size() == 0U) |
| return; // zero-size tensor |
| mxnet::TShape shape = AxisShapeCompact(input.shape_, &axis, false); |
| MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { |
| Tensor<xpu, 2, int64_t> out = |
| outputs[0].get_with_shape<xpu, 2, int64_t>(Shape2(shape[0], shape[2]), s); |
| Tensor<xpu, 3, DType> in = input.get_with_shape<xpu, 3, DType>(shape.get<3>(), s); |
| CHECK(req[0] != kAddTo) << "AddTo is not supported"; |
| ASSIGN_DISPATCH(out, req[0], tcast<int64_t>(reduce_with_axis<reducer, true>(in, 1))); |
| }); |
| } |
| |
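/*! \brief Kernel that extracts the stored index from each (value, index) intermediate. */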
| struct arg_min_max_parse { |
| template <typename DType, typename OType> |
| MSHADOW_XINLINE static void Map(index_t i, OType* out_data, const DType* in_data) { |
| out_data[i] = in_data[i].idx; |
| } |
| }; |
| |
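/*!
 * \brief CPU reduction that tracks both the running extremum and its index,
 *        writing (value, index) pairs into the intermediate output.
 */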
| template <typename Reducer, int NDim, typename DType, typename OType> |
| void NumpyArgMinMaxReduce(mshadow::Stream<cpu>* s, |
| const TBlob& in_data, |
| const TBlob& out_data, |
| const mshadow::Tensor<cpu, 1, char>& workspace) { |
| using namespace mshadow; |
| Shape<NDim> rshape, rstride; |
| broadcast::diff<NDim>(out_data.shape_.get<NDim>(), in_data.shape_.get<NDim>(), &rshape, &rstride); |
| size_t N = out_data.shape_.Size(), M = rshape.Size(); |
| broadcast::seq_reduce_compute<Reducer, |
| NDim, |
| OType, |
| DType, |
| OType, |
| mxnet::op::mshadow_op::identity, |
| mxnet::op::mshadow_op::arg_min_max_set_index<OType, index_t>>( |
| N, |
| M, |
| false, |
| in_data.dptr<DType>(), |
| static_cast<OType*>(out_data.dptr_), |
| in_data.shape_.get<NDim>(), |
| out_data.shape_.get<NDim>(), |
| rshape, |
| rstride); |
| } |
| |
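/*!
 * \brief Forward pass for numpy argmax/argmin: reduces into an intermediate
 *        (value, index) workspace, then copies the indices to the int64 output.
 */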
| template <typename Reducer, typename xpu, typename IType> |
| void NumpyArgMinMaxCompute(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| if (req[0] == kNullOp) |
| return; |
| // parse param |
| const auto& param = nnvm::get<ReduceAxisParam>(attrs.parsed); |
| mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); |
| TBlob out = outputs[0]; |
| TBlob in = inputs[0]; |
| // do some shape checks |
| if (in.shape_.ndim() != 0) { |
| if (param.axis.has_value()) { |
| // cannot do argmax in an empty dimension |
| int axis = param.axis.value(); |
| axis = CheckAxis(axis, in.shape_.ndim()); |
| CHECK_NE(in.shape_[axis], 0) |
| << "searching input tensor of shape " << inputs[0].shape_ << " along axis = " << axis |
| << " of zero dim-size is not allowed"; |
| } else { |
| // cannot do argmax on an empty array |
| CHECK_NE(in.shape_.Size(), 0U) << "attempt to search an empty sequence"; |
| } |
| } |
| if (in.shape_.Size() == 0U) |
| return; // zero-size tensor |
| // prepare shape |
| dmlc::optional<mxnet::Tuple<int>> axes; |
| if (param.axis.has_value()) { |
| mxnet::Tuple<int> t({param.axis.value()}); |
| axes = dmlc::optional<mxnet::Tuple<int>>(t); |
| } |
| TShape small; |
| if (param.keepdims) { |
| small = outputs[0].shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(in.shape_, axes, true); |
| } |
| mxnet::TShape src_shape, dst_shape; |
| BroadcastReduceShapeCompact(in.shape_, small, &src_shape, &dst_shape); |
| const TBlob in_data = in.reshape(src_shape); |
| // request a work space |
| size_t workspace_size = broadcast::ReduceWorkspaceSize(s, dst_shape, req[0], src_shape); |
| MSHADOW_TYPE_SWITCH_WITH_BOOL(in.type_flag_, DType, { |
| // define OType |
| typedef mxnet::op::mshadow_op::IndexedNum<IType, DType> OType; |
| // switch dim |
| BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { |
| constexpr size_t align_size = 1024; |
| const size_t aligned_first_workspace_size = |
| ((workspace_size + align_size - 1) / align_size) * align_size; |
| workspace_size = aligned_first_workspace_size + sizeof(OType) * out.shape_.Size(); |
| Tensor<xpu, 1, char> workspace = |
| ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(workspace_size), s); |
| // set up intermediate output |
| TBlob intermediate = out; |
| intermediate.dptr_ = |
| reinterpret_cast<int64_t*>(workspace.dptr_ + aligned_first_workspace_size); |
| // reshape the input and intermediate output tensor |
| const TBlob intermediate_out_data = intermediate.reshape(dst_shape); |
| NumpyArgMinMaxReduce<Reducer, NDim, DType, OType>( |
| s, in_data, intermediate_out_data, workspace); |
| // parse the indices from the intermediate tensor back to the actual output tensor |
| using namespace mxnet_op; |
| Kernel<arg_min_max_parse, xpu>::Launch(s, |
| out.shape_.Size(), |
| outputs[0].dptr<int64_t>(), |
| static_cast<OType*>(intermediate_out_data.dptr_)); |
| }); |
| }); |
| } |
| |
| #if MXNET_USE_CUDA |
| |
| struct NumpyArgMinMaxRTCCompute { |
| std::string reducer; |
| |
| void operator()(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs); |
| }; |
| |
| #endif |
| |
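/*!
 * \brief Backward pass for reductions whose gradient needs only the output
 *        gradient: broadcast ograd back to the input shape and, when
 *        normalize is true (mean), divide by the number of reduced elements.
 */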
| template <typename xpu, bool normalize = false> |
| inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| CHECK_NE(outputs[0].type_flag_, kBool) << "reduce operators do not support gradient calculation " |
| "for input tensors of boolean type."; |
| if (outputs[0].shape_.Size() == 0) |
| return; |
| const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed); |
| TShape small; |
| if (param.keepdims) { |
| small = inputs[0].shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(outputs[0].shape_, param.axis, true); |
| } |
| |
| BroadcastComputeImpl<xpu>(attrs, ctx, inputs, req, outputs, small); |
| |
| if (normalize) { |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, IType, { |
| Tensor<xpu, 1, IType> igrad = outputs[0].FlatTo1D<xpu, IType>(s); |
| igrad /= scalar<IType>(outputs[0].Size() / inputs[0].Size()); |
| }); |
| } |
| } |
| |
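/*! \brief Backward pass for reductions whose gradient needs the input and output values. */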
| template <typename xpu, typename OP, bool normalize = false> |
| void NumpyReduceAxesBackwardUseInOut(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed); |
| TShape small; |
| if (param.keepdims) { |
| small = inputs[0].shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(outputs[0].shape_, param.axis, true); |
| } |
| ReduceAxesBackwardUseInOutImpl<xpu, OP, normalize>(ctx, small, inputs, req, outputs); |
| } |
| |
| struct NumpyMomentsParam : public dmlc::Parameter<NumpyMomentsParam> { |
| dmlc::optional<mxnet::Tuple<int>> axis; |
| dmlc::optional<int> dtype; |
| bool keepdims; |
| int ddof; |
| DMLC_DECLARE_PARAMETER(NumpyMomentsParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(dmlc::optional<mxnet::Tuple<int>>()) |
| .describe( |
| "Axis or axes along which a sum is performed. The default, axis=None, will sum " |
| "all of the elements of the input array. If axis is negative it counts from the " |
| "last to the first axis."); |
| DMLC_DECLARE_FIELD(dtype) |
| .add_enum("float16", mshadow::kFloat16) |
| .add_enum("float32", mshadow::kFloat32) |
| .add_enum("float64", mshadow::kFloat64) |
| .add_enum("int8", mshadow::kInt8) |
| .add_enum("int32", mshadow::kInt32) |
| .add_enum("int64", mshadow::kInt64) |
| .set_default(dmlc::optional<int>()) |
| .describe( |
| "The type of the returned array and of the accumulator in which the elements are " |
| "summed. The dtype of a is used by default unless a has an integer dtype of less " |
| "precision than the default platform integer. In that case, if a is signed then " |
| "the platform integer is used while if a is unsigned then an unsigned integer of " |
| "the same precision as the platform integer is used."); |
    DMLC_DECLARE_FIELD(keepdims).set_default(false).describe(
        "If this is set to `True`, the reduced axes are left "
        "in the result as dimensions with size one.");
    DMLC_DECLARE_FIELD(ddof).set_default(0).describe(
        "Delta degrees of freedom. The divisor used in calculations is N - ddof, "
        "where N represents the number of elements.");
| } |
| |
| void SetAttrDict(std::unordered_map<std::string, std::string>* dict) { |
| std::ostringstream axis_s, dtype_s, keepdims_s, ddof_s; |
| axis_s << axis; |
| keepdims_s << keepdims; |
| ddof_s << ddof; |
| (*dict)["axis"] = axis_s.str(); |
| dtype_s << dtype; |
| if (dtype.has_value()) { |
| (*dict)["dtype"] = MXNetTypeWithBool2String(dtype.value()); |
| } else { |
| (*dict)["dtype"] = dtype_s.str(); |
| } |
| (*dict)["keepdims"] = keepdims_s.str(); |
| (*dict)["ddof"] = ddof_s.str(); |
| } |
| }; |
| |
| struct NumpyWeightedAverageParam : public dmlc::Parameter<NumpyWeightedAverageParam> { |
| dmlc::optional<mxnet::Tuple<int>> axis; |
| bool returned; |
| bool weighted; |
| |
| DMLC_DECLARE_PARAMETER(NumpyWeightedAverageParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(dmlc::optional<mxnet::Tuple<int>>()) |
| .describe( |
| "Axis or axes along which a average is performed. " |
| "The default, axis=None, will average " |
| "all of the elements of the input array. If axis is negative it counts from the " |
| "last to the first axis."); |
    DMLC_DECLARE_FIELD(returned).set_default(false).describe(
        "If True, the tuple (average, sum_of_weights) is returned, "
        "otherwise only the average is returned. "
        "If weights=None, sum_of_weights is equivalent to "
        "the number of elements over which the average is taken.");
    DMLC_DECLARE_FIELD(weighted).set_default(true).describe(
        "Auxiliary flag to deal with None weights.");
| } |
| |
| void SetAttrDict(std::unordered_map<std::string, std::string>* dict) { |
| std::ostringstream axis_s, returned_s, weighted_s; |
| axis_s << axis; |
| returned_s << returned; |
| weighted_s << weighted; |
| (*dict)["axis"] = axis_s.str(); |
| (*dict)["returned"] = returned_s.str(); |
| (*dict)["weighted"] = weighted_s.str(); |
| } |
| }; |
| |
| inline bool NumpyWeightedAverageShape(const nnvm::NodeAttrs& attrs, |
| std::vector<TShape>* in_attrs, |
| std::vector<TShape>* out_attrs) { |
| const auto& param = nnvm::get<NumpyWeightedAverageParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), (param.weighted ? 2U : 1U)); |
| CHECK_EQ(out_attrs->size(), 2U); |
| if (!shape_is_known(in_attrs->at(0))) { |
| return false; |
| } |
| |
| const TShape& a_shape = (*in_attrs)[0]; |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, NumpyReduceAxesShapeImpl(a_shape, param.axis, false)); |
| |
| if (param.weighted) { |
| const TShape& w_shape = (*in_attrs)[1]; |
| if (w_shape.ndim() != a_shape.ndim()) { |
| CHECK_EQ(w_shape.ndim(), 1U) << "1D weights expected when shapes of a and weights differ."; |
| CHECK_EQ(param.axis.has_value(), true) |
| << "Axis must be specified when shapes of a and weights differ."; |
| mxnet::Tuple<int> axes(param.axis.value()); |
| CHECK_EQ(axes.ndim(), 1U) << "Axis must be int when shapes of a and weights differ."; |
| int red_axis = axes[0] < 0 ? axes[0] + a_shape.ndim() : axes[0]; |
| CHECK_EQ(a_shape[red_axis], w_shape[0]) |
| << "Length of weights not compatible with specified axis."; |
| SHAPE_ASSIGN_CHECK( |
| *out_attrs, |
| 1, |
| NumpyReduceAxesShapeImpl(w_shape, dmlc::optional<mxnet::Tuple<int>>(), false)); |
| } else { |
| for (int i = 0; i < w_shape.ndim(); i++) { |
| CHECK_EQ(w_shape[i], a_shape[i]); |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 1, NumpyReduceAxesShapeImpl(w_shape, param.axis, false)); |
| } |
| } else { |
| SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape(0, -1)); |
| } |
| |
| return shape_is_known(out_attrs->at(0)) && shape_is_known(out_attrs->at(1)); |
| } |
| |
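/*!
 * \brief Gradient kernels for np.average. With s = sum(w) and
 *        avg = sum(a * w) / s, the partial derivatives are
 *        d(avg)/d(a_i) = w_i / s and
 *        d(avg)/d(w_i) = (a_i * s - sum(a * w)) / s^2.
 *        avg_grad_a_kernel handles a; the two kernels after it handle the
 *        gradient of N-D and 1-D weights respectively.
 */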
| template <int req, int NDim, bool onedim = false> |
| struct avg_grad_a_kernel { |
| template <typename DType> |
| MSHADOW_XINLINE static void Map(int i, |
| DType* out, |
| const DType* w, |
| const DType* scl, |
| const DType* ograd, |
| mshadow::Shape<NDim> small, |
| mshadow::Shape<NDim> big) { |
| // partial a = w / sum(w) |
| size_t big_idx = i; |
| size_t small_idx = i; |
| size_t big_stride = 1; |
| size_t small_stride = 1; |
| size_t red_axis_idx = 0; |
| for (int axis = NDim - 1; axis >= 0; --axis) { |
| size_t axis_idx = big_idx % big[axis]; |
| small_idx -= axis_idx * big_stride; |
| if (small[axis] != 1) { |
| small_idx += axis_idx * small_stride; |
| } else if (onedim && small[axis] != big[axis]) { |
| red_axis_idx = axis_idx; |
| } |
| big_idx /= big[axis]; |
| big_stride *= big[axis]; |
| small_stride *= small[axis]; |
| } |
| if (onedim) { |
| KERNEL_ASSIGN(out[i], req, (ograd[small_idx] * (w[red_axis_idx] / *scl))); |
| } else { |
| KERNEL_ASSIGN(out[i], req, (ograd[small_idx] * (w[i] / scl[small_idx]))); |
| } |
| } |
| }; |
| |
| template <int req, int NDim> |
| struct avg_grad_w_kernel { |
| template <typename DType> |
| MSHADOW_XINLINE static void Map(int i, |
| DType* out, |
| const DType* a, |
| const DType* scl, |
| const DType* sum_of_wa, |
| const DType* ograd, |
| mshadow::Shape<NDim> small, |
| mshadow::Shape<NDim> big) { |
| // partial w = (a * sum(w) - sum(a*w)) / (sum(w) * sum(w)) |
| size_t big_idx = i; |
| size_t small_idx = i; |
| size_t big_stride = 1; |
| size_t small_stride = 1; |
| for (int axis = NDim - 1; axis >= 0; --axis) { |
| size_t axis_idx = big_idx % big[axis]; |
| small_idx -= axis_idx * big_stride; |
| if (small[axis] != 1) { |
| small_idx += axis_idx * small_stride; |
| } |
| big_idx /= big[axis]; |
| big_stride *= big[axis]; |
| small_stride *= small[axis]; |
| } |
| DType ret = |
| ograd[small_idx] * |
| (((a[i] * scl[small_idx] - sum_of_wa[small_idx]) / scl[small_idx]) / scl[small_idx]); |
| KERNEL_ASSIGN(out[i], req, ret); |
| } |
| }; |
| |
| template <int req, int NDim> |
| struct avg_grad_w_1D_kernel { |
| template <typename DType> |
| MSHADOW_XINLINE static void Map(int i, |
| DType* out, |
| const DType* a, |
| const DType* scl, |
| const DType* sum_of_wa, |
| const DType* ograd, |
| mshadow::Shape<NDim> big, |
| const int red_axis) { |
| DType scl_val = *scl; |
| size_t tail = 1; |
| size_t head = 1; |
| for (int axis = NDim - 1; axis > red_axis; --axis) { |
| tail *= big[axis]; |
| } |
| for (int axis = 0; axis < red_axis; ++axis) { |
| head *= big[axis]; |
| } |
| DType ret = 0; |
| for (size_t j = 0; j < head; ++j) { |
| for (size_t k = 0; k < tail; ++k) { |
| size_t a_idx = j * (tail * big[red_axis]) + i * tail + k; |
| size_t small_idx = j * tail + k; |
| ret += (ograd[small_idx] * |
| (((a[a_idx] * scl_val - sum_of_wa[small_idx]) / scl_val) / scl_val)); |
| } |
| } |
| KERNEL_ASSIGN(out[i], req, ret); |
| } |
| }; |
| |
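/*!
 * \brief Shared implementation for np.average: computes sum(a * w) and sum(w)
 *        in temporary space, then produces either the average (forward) or,
 *        when back is true, the gradients with respect to a and weights.
 */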
| template <typename xpu, bool back = false> |
| void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs, |
| const dmlc::optional<mxnet::Tuple<int>>& axis) { |
| using namespace mshadow; |
| using namespace mxnet_op; |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| const TBlob& data = inputs[0]; |
| TShape small1 = NumpyReduceAxesShapeImpl(data.shape_, axis, true); |
| // Reshape weights |
| TShape small2 = small1; |
| TBlob weights = inputs[1]; |
| |
| bool one_dim = weights.shape_.ndim() != data.shape_.ndim(); |
| |
| int red_axis = -1; |
| |
| if (one_dim) { |
| CHECK_EQ(weights.shape_.ndim(), 1U) |
| << "1D weights expected when shapes of a and weights differ."; |
| CHECK_EQ(axis.has_value(), true) |
| << "Axis must be specified when shapes of a and weights differ."; |
| Tuple<int> axes(axis.value()); |
| CHECK_EQ(axes.ndim(), 1U) << "Axis must be int when shapes of a and weights differ."; |
| red_axis = axes[0] < 0 ? axes[0] + data.shape_.ndim() : axes[0]; |
| CHECK_EQ(weights.shape_[0], data.shape_[red_axis]) |
| << "Length of weights not compatible with specified axis."; |
| TShape new_w_shape(data.shape_.ndim(), 1); |
| new_w_shape[red_axis] = weights.shape_[0]; |
| weights = weights.reshape(new_w_shape); |
| small2 = TShape(new_w_shape.ndim(), 1); |
| } |
| TBlob wa; |
| TBlob sum_of_wa; |
| Tensor<xpu, 1, char> workspace; |
| MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { |
| // Get temp space |
| size_t temp_data_size = data.shape_.Size() * sizeof(DType); |
| size_t temp_sum_size = small1.Size() * sizeof(DType); |
| TShape src_shape, dst_shape; |
| BroadcastReduceShapeCompact(data.shape_, small1, &src_shape, &dst_shape); |
| size_t workspace_size = 0; |
| workspace_size = broadcast::ReduceWorkspaceSize(s, dst_shape, {kWriteTo}, src_shape); |
| size_t temp_mem_size = temp_data_size + temp_sum_size + workspace_size; |
| Tensor<xpu, 1, char> temp_mem = |
| ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(temp_mem_size), s); |
| auto* temp_data_ptr = reinterpret_cast<DType*>(temp_mem.dptr_); |
| auto* temp_sum_ptr = reinterpret_cast<DType*>(temp_mem.dptr_ + temp_data_size); |
| char* workspace_ptr = temp_mem.dptr_ + temp_data_size + temp_sum_size; |
| workspace = Tensor<xpu, 1, char>(workspace_ptr, Shape1(workspace_size), s); |
| |
| // Compute weighted data |
| wa = TBlob(temp_data_ptr, data.shape_, xpu::kDevMask); |
| sum_of_wa = TBlob(temp_sum_ptr, small1, xpu::kDevMask); |
| }); |
| #if !defined(__CUDACC__) |
| BinaryBroadcastCompute<xpu, mshadow_op::mul>(attrs, ctx, {data, weights}, {kWriteTo}, {wa}); |
| |
| // Compute sum of weighted data |
| ReduceAxesComputeImpl<xpu, mshadow_op::sum, true>( |
| ctx, {wa}, {kWriteTo}, {sum_of_wa}, small1, &workspace); |
| #else |
| BinaryBroadcastRTCCompute{"mul"}(attrs, ctx, {data, weights}, {kWriteTo}, {wa}); // NOLINT |
| |
| // Compute sum of weighted data |
| ReduceAxesRTCComputeImpl( |
| ctx, {wa}, {kWriteTo}, {sum_of_wa}, small1, "red::sum{}", &workspace, false, "identity"); |
| #endif |
| if (!back) { |
| const TBlob& avg = outputs[0]; |
| const TBlob& sum_of_weights = outputs[1]; |
    // Compute the sum of weights
| TBlob scl = sum_of_weights.reshape(small2); |
| #if !defined(__CUDACC__) |
| ReduceAxesComputeImpl<xpu, mshadow_op::sum, true>( |
| ctx, {weights}, {kWriteTo}, {scl}, small2, &workspace); |
| // Compute avg and assign output |
| BinaryBroadcastCompute<xpu, mshadow_op::div>( |
| attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); |
| #else |
| ReduceAxesRTCComputeImpl( |
| ctx, {weights}, {kWriteTo}, {scl}, small2, "red::sum{}", &workspace, false, "identity"); |
| // Compute avg and assign output |
| BinaryBroadcastRTCCompute{"div"}( // NOLINT |
| attrs, |
| ctx, |
| {sum_of_wa, scl}, |
| req, |
| {avg.reshape(small1)}); |
| #endif |
| } else { |
| MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { |
| // Compute and assign the derivatives of a and weights |
| const TBlob& igrad_a = outputs[0]; |
| const TBlob& igrad_w = outputs[1]; |
| const TBlob& scl = inputs[2]; |
| const TBlob& ograd = inputs[3]; |
| MXNET_NDIM_SWITCH(igrad_a.shape_.ndim(), NDim, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], req_a, { |
| if (one_dim) { |
| // 1D weights |
| Kernel<avg_grad_a_kernel<req_a, NDim, true>, xpu>::Launch(s, |
| igrad_a.shape_.Size(), |
| igrad_a.dptr<DType>(), |
| weights.dptr<DType>(), |
| scl.dptr<DType>(), |
| ograd.dptr<DType>(), |
| small1.get<NDim>(), |
| igrad_a.shape_.get<NDim>()); |
| } else { |
| Kernel<avg_grad_a_kernel<req_a, NDim, false>, xpu>::Launch(s, |
| igrad_a.shape_.Size(), |
| igrad_a.dptr<DType>(), |
| weights.dptr<DType>(), |
| scl.dptr<DType>(), |
| ograd.dptr<DType>(), |
| small1.get<NDim>(), |
| igrad_a.shape_.get<NDim>()); |
| } |
| }); |
| MXNET_ASSIGN_REQ_SWITCH(req[1], req_w, { |
| if (one_dim) { |
| Kernel<avg_grad_w_1D_kernel<req_w, NDim>, xpu>::Launch(s, |
| igrad_w.shape_.Size(), |
| igrad_w.dptr<DType>(), |
| data.dptr<DType>(), |
| scl.dptr<DType>(), |
| sum_of_wa.dptr<DType>(), |
| ograd.dptr<DType>(), |
| data.shape_.get<NDim>(), |
| red_axis); |
| } else { |
| Kernel<avg_grad_w_kernel<req_w, NDim>, xpu>::Launch(s, |
| igrad_w.shape_.Size(), |
| igrad_w.dptr<DType>(), |
| data.dptr<DType>(), |
| scl.dptr<DType>(), |
| sum_of_wa.dptr<DType>(), |
| ograd.dptr<DType>(), |
| small1.get<NDim>(), |
| igrad_w.shape_.get<NDim>()); |
| } |
| }); |
| }) |
| }); |
| } |
| } |
| |
| template <typename xpu> |
| void NumpyWeightedAverageForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| if (req[0] == kNullOp) |
| return; |
| CHECK_NE(req[0], kWriteInplace) << "Average does not support write in-place"; |
| const auto& param = nnvm::get<NumpyWeightedAverageParam>(attrs.parsed); |
| const TBlob& data = inputs[0]; |
| TShape small; |
| MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { |
| if (!param.weighted) { |
| small = NumpyReduceAxesShapeImpl(data.shape_, param.axis, true); |
      // Compute the sum of weights, which equals the product of the sizes of the reduced axes
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| auto ret = outputs[1].FlatTo1D<xpu, DType>(s); |
| ret = scalar<DType>(data.shape_.Size() / small.Size()); |
| } |
| }); |
| if (!param.weighted) { |
| // Compute mean |
| #if !defined(__CUDACC__) |
| ReduceAxesComputeImpl<xpu, mshadow_op::sum, true, true>(ctx, inputs, req, {outputs[0]}, small); |
| #else |
| ReduceAxesRTCComputeImpl(ctx, inputs, req, {outputs[0]}, small, "red::sum{}", nullptr, true); |
| #endif |
| } else { |
| NumpyWeightedAverageComputeImpl<xpu>(attrs, ctx, inputs, req, outputs, param.axis); |
| } |
| } |
| |
| template <typename xpu> |
| void NumpyWeightedAverageBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| const auto& param = nnvm::get<NumpyWeightedAverageParam>(attrs.parsed); |
| if (req[0] == kNullOp && !param.weighted) |
| return; |
| CHECK_EQ(inputs.size(), (param.weighted ? 6U : 5U)); |
| CHECK_EQ(outputs.size(), (param.weighted ? 2U : 1U)); |
| const TBlob& ograd = inputs[0]; |
| const TBlob& data = inputs[2]; |
| MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { |
| if (!param.weighted) { |
| TShape small = NumpyReduceAxesShapeImpl(outputs[0].shape_, param.axis, true); |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| auto ograd_tensor = ograd.FlatTo1D<xpu, DType>(s); |
| ograd_tensor /= scalar<DType>(data.shape_.Size() / small.Size()); |
| BroadcastComputeImpl<xpu>(attrs, ctx, {ograd}, req, {outputs[0]}, small); |
| } else { |
| const TBlob& weights = inputs[3]; |
| const TBlob& scl = inputs[5]; |
| NumpyWeightedAverageComputeImpl<xpu, true>( |
| attrs, ctx, {data, weights, scl, ograd}, req, outputs, param.axis); |
| } |
| }); |
| } |
| |
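/*!
 * \brief Forward pass for np.var (sqrt = false) and np.std (sqrt = true):
 *        first reduces to the mean, then reduces the squared deviations with
 *        divisor N - ddof; std additionally takes the square root.
 */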
| template <typename xpu, bool sqrt> |
| void NumpyMomentsForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| using namespace mshadow_op; |
| using namespace mxnet_op; |
| |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(req.size(), 2U); |
| CHECK_EQ(outputs.size(), 2U); |
| |
| const NumpyMomentsParam& param = nnvm::get<NumpyMomentsParam>(attrs.parsed); |
| |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| |
| const TBlob& data = inputs[0]; |
| const TBlob& moment = outputs[0]; |
| const TBlob& mean = outputs[1]; |
| |
| mxnet::TShape small; |
| if (param.keepdims) { |
| small = moment.shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(data.shape_, param.axis, true); |
| } |
| |
| mxnet::TShape src_shape, dst_shape; |
| BroadcastReduceShapeCompact(data.shape_, small, &src_shape, &dst_shape); |
| |
| // Get workspace and temp space for data - mean |
| size_t workspace_size = broadcast::ReduceWorkspaceSize(s, dst_shape, req[0], src_shape); |
| size_t temp_data_size = data.shape_.Size() * common::mshadow_type_info(inputs[0].type_flag_).size; |
| size_t temp_mem_size = temp_data_size + workspace_size; |
| Tensor<xpu, 1, char> temp_mem = |
| ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(temp_mem_size), s); |
| char* workspace_ptr = temp_mem.dptr_ + temp_data_size; |
| Tensor<xpu, 1, char> workspace(workspace_ptr, Shape1(workspace_size), s); |
| // Compute mean |
| #if !defined(__CUDACC__) |
| ReduceAxesComputeImpl<xpu, mshadow_op::sum, true, true>( |
| ctx, inputs, {kWriteTo}, {mean}, small, &workspace); |
| #else |
| ReduceAxesRTCComputeImpl( |
| ctx, inputs, {kWriteTo}, {mean}, small, "red::sum{}", &workspace, true, "identity"); |
| #endif |
| // Compute data - mean |
| Shape<6> data_shape, mean_shape; |
| for (int i = 0; i < 6; ++i) { |
| data_shape[i] = (i < data.shape_.ndim()) ? data.shape_[i] : 1; |
| mean_shape[i] = (i < small.ndim()) ? small[i] : 1; |
| } |
| #if !defined(__CUDACC__) |
| MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { |
| DType* temp_data_ptr = reinterpret_cast<DType*>(temp_mem.dptr_); |
| Kernel<VarBroadcastKernel, xpu>::Launch(s, |
| data_shape.Size(), |
| temp_data_ptr, |
| data.dptr<DType>(), |
| mean.dptr<DType>(), |
| data_shape, |
| mean_shape); |
| Tensor<xpu, 1, DType> temp_data_tensor(temp_data_ptr, Shape1(data.shape_.Size()), s); |
| TBlob temp_data_blob = TBlob(temp_data_tensor).reshape(data.shape_); |
| ReduceAxesComputeImpl<xpu, mshadow_op::sum, true, true>( |
| ctx, {temp_data_blob}, {req[0]}, {moment}, small, &workspace, param.ddof); |
| if (sqrt && req[0] != kNullOp) { |
| Tensor<xpu, 1, OType> moment_tensor = moment.FlatTo1D<xpu, OType>(s); |
| moment_tensor = F<mshadow_op::square_root>(moment_tensor); |
| } |
| }); |
| }); |
| #else |
| MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { |
| DType* temp_data_ptr = reinterpret_cast<DType*>(temp_mem.dptr_); |
| Kernel<VarBroadcastKernel, xpu>::Launch(s, |
| data_shape.Size(), |
| temp_data_ptr, |
| data.dptr<DType>(), |
| mean.dptr<DType>(), |
| data_shape, |
| mean_shape); |
| Tensor<xpu, 1, DType> temp_data_tensor(temp_data_ptr, Shape1(data.shape_.Size()), s); |
| TBlob temp_data_blob = TBlob(temp_data_tensor).reshape(data.shape_); |
| ReduceAxesRTCComputeImpl(ctx, |
| {temp_data_blob}, |
| {req[0]}, |
| {moment}, |
| small, |
| "red::sum{}", |
| &workspace, |
| true, |
| "identity", |
| param.ddof); |
| if (sqrt && req[0] != kNullOp) { |
| UnaryRTCCompute{"sqrt"}({}, ctx, {moment}, {kWriteInplace}, {moment}); // NOLINT |
| } |
| }); |
| #endif |
| } |
| |
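/*!
 * \brief Forward pass for np.broadcast_to: the input shape is right-aligned
 *        against the output shape and padded with leading 1s, so e.g. a (3,)
 *        input broadcast to (2, 3) is first viewed as (1, 3).
 */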
| template <typename xpu> |
| void NumpyBroadcastToForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| if (outputs[0].shape_.Size() == 0U) |
| return; // zero-size tensor |
| TShape expanded_ishape(outputs[0].shape_.ndim(), 1); |
| const TShape& ishape = inputs[0].shape_; |
| CHECK_LE(ishape.ndim(), expanded_ishape.ndim()) << "output ndim cannot be less than input ndim"; |
| const int ndim_delta = expanded_ishape.ndim() - ishape.ndim(); |
| for (int i = 0; i < ishape.ndim(); ++i) { |
| expanded_ishape[i + ndim_delta] = ishape[i]; |
| } |
| BroadcastComputeImpl<xpu>( |
| attrs, ctx, {inputs[0].reshape(expanded_ishape)}, req, outputs, expanded_ishape); |
| } |
| |
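/*!
 * \brief Backward pass for np.broadcast_to: the gradient sums ograd over the
 *        broadcast axes, using safe accumulation when needed.
 */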
| template <typename xpu> |
| void NumpyBroadcastToBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| if (inputs[0].shape_.Size() == 0U) |
| return; // zero-size ograd |
| TShape expanded_igrad_shape(inputs[0].shape_.ndim(), 1); |
| const TShape& igrad_shape = outputs[0].shape_; |
| CHECK_LE(igrad_shape.ndim(), expanded_igrad_shape.ndim()) |
| << "output ndim cannot be less than input ndim"; |
| const int ndim_delta = expanded_igrad_shape.ndim() - igrad_shape.ndim(); |
| for (int i = 0; i < igrad_shape.ndim(); ++i) { |
| expanded_igrad_shape[i + ndim_delta] = igrad_shape[i]; |
| } |
| #if !defined(__CUDACC__) |
| if (NeedSafeAcc<true>(inputs[0].type_flag_, outputs[0].type_flag_)) { |
| ReduceAxesComputeImpl<xpu, mshadow_op::sum, true>( |
| ctx, inputs, req, {outputs[0].reshape(expanded_igrad_shape)}, expanded_igrad_shape); |
| } else { |
| ReduceAxesComputeImpl<xpu, mshadow_op::sum, false>( |
| ctx, inputs, req, {outputs[0].reshape(expanded_igrad_shape)}, expanded_igrad_shape); |
| } |
| #else |
| ReduceAxesRTCComputeImpl(ctx, |
| inputs, |
| req, |
| {outputs[0].reshape(expanded_igrad_shape)}, |
| expanded_igrad_shape, |
| "red::sum{}", |
| nullptr, |
| false); |
| #endif |
| } |
| |
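/*! \brief Backward pass for dtype-preserving reductions such as max/min. */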
| template <typename xpu, typename OP> |
| void NumpyReduceAxesNoDTypeBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| const NumpyReduceAxesNoDTypeParam& param = nnvm::get<NumpyReduceAxesNoDTypeParam>(attrs.parsed); |
| TShape small; |
| if (param.keepdims) { |
| small = inputs[0].shape_; |
| } else { |
| small = NumpyReduceAxesShapeImpl(outputs[0].shape_, param.axis, true); |
| } |
| ReduceAxesBackwardUseInOutImpl<xpu, OP, false>(ctx, small, inputs, req, outputs); |
| } |
| |
| } // namespace op |
| } // namespace mxnet |
| |
| namespace std { |
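/*!
 * \brief Hash for NumpyReduceAxesParam so that it can be used as a key in
 *        unordered containers.
 */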
| template <> |
| struct hash<mxnet::op::NumpyReduceAxesParam> { |
| size_t operator()(const mxnet::op::NumpyReduceAxesParam& val) { |
| size_t ret = 0; |
| ret = dmlc::HashCombine(ret, val.axis); |
| ret = dmlc::HashCombine(ret, val.dtype); |
| ret = dmlc::HashCombine(ret, val.keepdims); |
| ret = dmlc::HashCombine(ret, val.initial); |
| return ret; |
| } |
| }; |
| } // namespace std |
| #endif // MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ |