/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file test_legacy_op.h
* \brief operator unit test utility functions
* \author Chris Olivier
*
* These classes offer a framework for developing, testing, and debugging operators
* in C++. They work in both CPU and GPU modes and also provide a timing
* infrastructure for measuring individual operator performance.
*
* Operator data can be validated against general logic, against stored scalar
* values (which can be generated from an existing operator via
* LegacyOperatorExecutor::dumpC()), and against each other (i.e. checking that
* the GPU, CPU, MKL, and CUDNN variants of an operator produce the same output
* given the same input).
*
* test_util.h: General testing utility functionality
* test_perf.h: Performance-related classes
* test_op.h: Operator-specific testing classes
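*
* A minimal usage sketch (assuming the OperatorRunner interface declared in
* test_op_runner.h; "MyOpProp" and the shape/kwargs values are hypothetical
* placeholders for the operator under test):
*
* \code
* mxnet::test::op::LegacyOpRunner<MyOpProp, float, float> runner;
* runner.RunBidirectional(false,                          // isGPU
*                         {mxnet::TShape({1, 1, 2, 1})},  // input shape(s)
*                         {{"some_param", "some_value"}}, // operator kwargs
*                         1);                             // pass count
* \endcode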
*/
#ifndef TEST_LEGACY_OP_H_
#define TEST_LEGACY_OP_H_
#include <list>
#include <string>
#include <algorithm>
#include <map>
#include <vector>
#include "profiler/vtune.h"
#include "../../../include/mxnet/operator.h"
#include "./test_op.h"
#include "./test_op_runner.h"
namespace mxnet {
namespace test {
namespace op {
/*!
* \brief Manage test blobs and context, and universal test logic.
* Creates an operator from its "Prop" class and sets up the operator
* and resources for both forward and backward passes.
* \tparam DType primary data type of the operator's blobs
* \tparam AccReal real type used for accumulation and for secondary blobs
*/
template <typename DType, typename AccReal>
class LegacyOperatorExecutor : public OperatorDataInitializer<DType>,
public OperatorExecutorTiming {
public:
typedef DType DataType;
typedef AccReal AccRealType;
/*! \brief Manage test blobs and context */
LegacyOperatorExecutor(const bool isGPU, const mxnet::ShapeVector& topShapes)
#if !MXNET_USE_CUDA
: isGPU_(false)
#else
: isGPU_(isGPU)
#endif
, initializeForward_(0)   // unit testing may call inits in any order based
, initializeBackward_(0)  // upon its use case (i.e. it may not run the forward pass first)
, initializeCallback_(0) {
opContext_.is_train = true;
opContext_.run_ctx.stream = nullptr;
CHECK(!topShapes.empty());
shape_input_vec_ = topShapes;
}
inline mxnet::Context getContext() {
return isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{};
}
/*! \brief Initialize forward blob data values */
virtual void resetForward() {}
/*! \brief Initialize backward blob data values */
virtual void resetBackward() {}
/*! \brief Create the operator and allocate the input, auxiliary, and output blobs */
template <typename OperatorPropertyType>
bool initForward(const OperatorPropertyType& opProp, std::vector<int>* in_type) {
if (!initializeForward_++) {
shape_input_vec_.resize(opProp.ListArguments().size());
op_.reset(opProp.CreateOperatorEx(getContext(), &shape_input_vec_, in_type));
if (op_) {
const size_t output_count = opProp.ListOutputs().size();
const size_t aux_count = opProp.ListAuxiliaryStates().size();
// Figure out what sort of blobs we need to allocate
mxnet::ShapeVector out_shape, aux_shape;
out_shape.resize(output_count);
aux_shape.resize(aux_count);
opProp.InferShape(&shape_input_vec_, &out_shape, &aux_shape);
std::vector<int> out_type(output_count, -1), aux_type(aux_count, -1);
opProp.InferType(in_type, &out_type, &aux_type);
// Allocate input blobs
for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) {
int type;
if (x < in_type->size()) {
type = (*in_type)[x];
} else {
type = x ? mshadow::DataType<AccReal>::kFlag : mshadow::DataType<DType>::kFlag;
}
allocateBlob(&c_.blob_input_vec_, shape_input_vec_[x], false, type);
}
// Allocate aux blobs (scratch, hidden, etc.)
for (size_t x = 0, n = aux_shape.size(); x < n; ++x) {
CHECK(x < aux_type.size());
allocateBlob(&c_.blob_aux_states_, aux_shape[x], false, aux_type[x]);
}
// Allocate output blobs
for (size_t x = 0, n = out_shape.size(); x < n; ++x) {
CHECK(x < out_type.size());
allocateBlob(&c_.blob_output_vec_, out_shape[x], false, out_type[x]);
}
// Request any temporary workspace resources needed by the forward pass
mxnet::ShapeVector inputShapes;
for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) {
inputShapes.emplace_back(shape_input_vec_[x]);
}
allocateResources(opProp.ForwardResource(inputShapes));
resetForward();
return true;
}
return false;
} else {
return true;
}
}
/*! \brief Allocate the input- and output-gradient blobs for the backward pass */
template <typename OperatorPropertyType>
bool initBackward(const OperatorPropertyType& opProp, std::vector<int>* in_type) {
initForward(opProp, in_type);
if (!initializeBackward_++) {
for (size_t x = 0, n = static_cast<size_t>(opProp.NumVisibleOutputs()); x < n; ++x) {
CHECK_LT(x, c_.blob_output_vec_.size());
allocateBlob(&c_.blob_out_grad_,
c_.blob_output_vec_[x].shape_,
false,
c_.blob_output_vec_[x].type_flag_);
}
for (size_t x = 0, n = c_.blob_input_vec_.size(); x < n; ++x) {
allocateBlob(&c_.blob_in_grad_,
c_.blob_input_vec_[x].shape_,
false,
c_.blob_input_vec_[x].type_flag_);
}
// Request any temporary workspace resources needed by the backward pass
mxnet::ShapeVector ishapes;
allocateResources(opProp.BackwardResource(ishapes));
resetBackward();
return true;
} else {
return true;
}
}
/*! \brief Run operator forward */
void forward(const size_t count = 1) {
const std::vector<OpReqType> req(c_.blob_output_vec_.size(), kWriteTo);
// Possibly move data to/from CPU and GPU (outside of timing scope)
MXNET_CUDA_ONLY(
std::unique_ptr<GPUOpData> gpuData(isGPU_ ? new GPUOpData(c_, &opContext_) : nullptr));
perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, "Forward", count);
if (!isGPU_) {
mxnet::profiler::vtune::VTuneResume profile; // VTune sample only this scope
for (size_t x = 0; x < count; ++x) {
op()->Forward(
opContext_, c_.blob_input_vec_, req, c_.blob_output_vec_, c_.blob_aux_states_);
}
} else {
for (size_t x = 0; x < count; ++x) {
MXNET_CUDA_ONLY(op()->Forward(opContext_,
gpuData->blob_input_vec_,
req,
gpuData->blob_output_vec_,
gpuData->blob_aux_states_));
}
}
}
/*! \brief Run operator backwards */
void backward(const size_t count = 1) {
const std::vector<OpReqType> req(c_.blob_in_grad_.size(), kWriteTo);
// Possibly move data to/from CPU and GPU (outside of timing scope)
MXNET_CUDA_ONLY(
std::unique_ptr<GPUOpData> gpuData(isGPU_ ? new GPUOpData(c_, &opContext_) : nullptr));
perf::TimingItem timeB(&OperatorExecutorTiming::GetTiming(), Backward, "Backward", count);
if (!isGPU_) {
mxnet::profiler::vtune::VTuneResume profile; // VTune sample only this scope
for (size_t x = 0; x < count; ++x) {
op()->Backward(opContext_,
c_.blob_out_grad_,
c_.blob_input_vec_,
c_.blob_output_vec_,
req,
c_.blob_in_grad_,
c_.blob_aux_states_);
}
} else {
for (size_t x = 0; x < count; ++x) {
MXNET_CUDA_ONLY(op()->Backward(opContext_,
gpuData->blob_out_grad_,
gpuData->blob_input_vec_,
gpuData->blob_output_vec_,
req,
gpuData->blob_in_grad_,
gpuData->blob_aux_states_));
}
}
}
/*!
* \brief Test if operator has a backward pass
* \return true if this operator has a backward pass
*/
MSHADOW_CINLINE bool HasBackward() const {
return true;
}
/*! \brief Getter functions for the operator */
inline Operator* op() {
return op_.get();
}
inline const Operator* op() const {
return op_.get();
}
enum BlobVectorType { kInput, kOutput, kAux, kInGrad, kOutGrad, kBlobVectorTypeCount };
#define CASE_STR(__v$) \
case (__v$): \
return #__v$
/*! \brief Convert BlobVectorType enum into a string */
static inline const char* bvt2String(const BlobVectorType bvt) {
switch (bvt) {
CASE_STR(kInput);
CASE_STR(kOutput);
CASE_STR(kAux);
CASE_STR(kInGrad);
CASE_STR(kOutGrad);
default:
CHECK(false);
return "";
}
}
#undef CASE_STR
/*! \brief Return a particular blob in a test data set */
inline const std::vector<TBlob>& getBlobVect(const BlobVectorType bvt) const {
switch (bvt) {
case kInput:
return c_.blob_input_vec_;
case kOutput:
return c_.blob_output_vec_;
case kAux:
return c_.blob_aux_states_;
case kInGrad:
return c_.blob_in_grad_;
case kOutGrad:
return c_.blob_out_grad_;
default:
CHECK(false);
return c_.blob_input_vec_;
}
}
/*! \brief Dump an operator's data set as compilable C++ data code for runtime validation.
* When writing an operator test, you can generate a "known good" operator data state as C++
* code with this function, and then use load() to restore those blob states into this
* class (LegacyOperatorExecutor).
* After that, you can compare it against the "actual" state of the operator that you
* are testing. See the sketch following this function.
*/
template <typename Stream>
inline void dumpC(Stream* _os, const std::string& label) {
Stream& os = *_os;
os << "static const std::vector< std::vector< std::vector<float> > > ___" << label
<< "_data_shape_";
const mxnet::TShape& shape = shape_input_vec_[0];
for (size_t i = 0, n = shape.ndim(); i < n; ++i) {
os << shape[i] << "_";
}
os << "__ =" << std::endl << "{" << std::endl;
for (size_t x = 0; x < kBlobVectorTypeCount; ++x) {
os << " { /* " << bvt2String(BlobVectorType(x)) << " */" << std::endl;
const std::vector<TBlob>& blobVect = getBlobVect(BlobVectorType(x));
for (size_t i = 0, n = blobVect.size(); i < n; ++i) {
os << " { ";
test::dump<DType>(&os, blobVect[i]);
os << " }";
if (i < n - 1) {
os << ",";
}
os << std::endl;
}
os << " }";
if (x < kBlobVectorTypeCount - 1) {
os << ",";
}
os << std::endl;
}
os << "};" << std::endl;
}
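/*
* A minimal sketch of the dump/load round trip ("executor" and the "MyOp" label
* are illustrative): run a known-good operator, emit its blob state with
* dumpC(), paste the generated initializer into the test source, and later
* restore that state with load() to compare against the operator under test:
*
*   executor.dumpC(&std::cout, "MyOp");  // emits ___MyOp_data_shape_<dims>__
*   // ... paste the emitted 'static const' initializer into the test ...
*   executor.load(___MyOp_data_shape_1_1_2_1__);  // restore all blob vectors
*/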
static inline void copy(const TBlob& blob,
const DType array[],
const size_t start,
const size_t end) {
const size_t blobSize = blob.Size();
DType* p = blob.dptr<DType>();
for (size_t i = 0, n = end - start; i < n; ++i) {
CHECK_LT(i, blobSize);
p[i] = array[i + start];
}
}
/*! \brief Runtime load of the C++ data code generated by dumpC() */
void load(const std::vector<std::vector<std::vector<DType>>>& cData) {
for (size_t i = 0, ni = cData.size(); i < ni; ++i) {
for (size_t j = 0, nj = cData[i].size(); j < nj; ++j) {
const TBlob& blob = getBlobVect(BlobVectorType(i))[j];
const size_t sourceDataSize = cData[i][j].size();
CHECK_EQ(sourceDataSize, blob.Size());
const DType* sourceData = &cData[i][j][0];
copy(blob, sourceData, 0, sourceDataSize);
}
}
}
/*! \brief Runtime load of the C++ data code generated by dumpC() */
void load(const std::vector<std::vector<std::vector<DType>>>& cData, const BlobVectorType type) {
CHECK_LT(type, cData.size());
for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) {
const TBlob& blob = getBlobVect(type)[j];
const size_t sourceDataSize = cData[type][j].size();
CHECK_EQ(sourceDataSize, blob.Size());
const DType* sourceData = &cData[type][j][0];
copy(blob, sourceData, 0, sourceDataSize);
}
}
/*! \brief Runtime load of the C++ data code generated by dumpC() */
void load(const std::vector<std::vector<std::vector<DType>>>& cData,
const BlobVectorType type,
const int idx) {
CHECK_LT(type, cData.size());
CHECK_LT(idx, cData[type].size());
const TBlob& blob = getBlobVect(type)[idx];
const size_t sourceDataSize = cData[type][idx].size();
CHECK_EQ(sourceDataSize, blob.Size());
const DType* sourceData = &cData[type][idx][0];
copy(blob, sourceData, 0, sourceDataSize);
}
// void FillRandom() {
// for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) {
// std::vector<TBlob> *data_vect = this->c_.all_blob_vects_[j];
// if (data_vect) {
// for (size_t i = 0, n = data_vect->size(); i < n; ++i) {
// OperatorDataInitializer<DType>::FillRandom((*data_vect)[i]);
// }
// }
// }
// }
std::vector<TBlob>& inputs() {
return c_.blob_input_vec_;
}
const std::vector<TBlob>& inputs() const {
return c_.blob_input_vec_;
}
std::vector<TBlob>& outputs() {
return c_.blob_output_vec_;
}
const std::vector<TBlob>& outputs() const {
return c_.blob_output_vec_;
}
std::vector<TBlob>& bwd_inputs() {
return c_.blob_out_grad_;
}
std::vector<TBlob>& bwd_outputs() {
return c_.blob_in_grad_;
}
/*! \brief Input and output blobs */
OpContext opContext_;
mxnet::ShapeVector shape_input_vec_;
struct OpData {
std::vector<TBlob> blob_input_vec_;
std::vector<TBlob> blob_output_vec_;
std::vector<TBlob> blob_aux_states_;
std::vector<TBlob> blob_in_grad_;
std::vector<TBlob> blob_out_grad_; // Loss gradient w.r.t. the outputs, flowing back from the layers above
std::vector<std::vector<TBlob>*> all_blob_vects_;
inline OpData() {
all_blob_vects_.emplace_back(&blob_input_vec_);
all_blob_vects_.emplace_back(&blob_output_vec_);
all_blob_vects_.emplace_back(&blob_aux_states_);
all_blob_vects_.emplace_back(&blob_in_grad_);
all_blob_vects_.emplace_back(&blob_out_grad_); // loss gradient w.r.t. the outputs
}
virtual ~OpData() {}
};
#if MXNET_USE_CUDA
class GPUOpData : public OpData {
GPUOpData() = delete;
GPUOpData(const GPUOpData& o) = delete;
public:
inline GPUOpData(const OpData& cpuData, OpContext* opContext)
: cpuData_(cpuData), allocGPUStream_(opContext) {
// Copy CPU->GPU
CHECK_EQ(gpuBlobs_.size(), 0U);
CHECK_EQ(cpuData_.all_blob_vects_.size(), this->all_blob_vects_.size());
for (size_t bvt = 0, nbvt = cpuData_.all_blob_vects_.size(); bvt < nbvt; ++bvt) {
std::vector<TBlob>& bv_src = *cpuData_.all_blob_vects_[bvt];
std::vector<TBlob>& bvt_dest = *this->all_blob_vects_[bvt];
for (size_t i = 0, n = bv_src.size(); i < n; ++i) {
const TBlob& srcBlob = bv_src[i];
TBlob* destBlob =
allocateBlob(&gpuBlobs_, &bvt_dest, srcBlob.shape_, true, srcBlob.type_flag_);
Context cpu_ctx, gpu_ctx;
cpu_ctx.dev_type = Context::kCPU;
gpu_ctx.dev_type = Context::kGPU;
cpu_ctx.dev_id = gpu_ctx.dev_id = 0;
mxnet::ndarray::Copy<cpu, gpu>(
srcBlob, destBlob, cpu_ctx, gpu_ctx, allocGPUStream_.opContext_.run_ctx);
}
}
cudaDeviceSynchronize();
}
inline ~GPUOpData() {
// Copy GPU->CPU
cudaDeviceSynchronize();
for (size_t bvt = 0, nbvt = this->all_blob_vects_.size(); bvt < nbvt; ++bvt) {
std::vector<TBlob>& bv_src = *this->all_blob_vects_[bvt];
std::vector<TBlob>& bvt_dest = *cpuData_.all_blob_vects_[bvt];
for (size_t i = 0, n = bv_src.size(); i < n; ++i) {
const TBlob& srcBlob = bv_src[i];
TBlob* destBlob = &bvt_dest[i];
Context cpu_ctx, gpu_ctx;
cpu_ctx.dev_type = Context::kCPU;
gpu_ctx.dev_type = Context::kGPU;
cpu_ctx.dev_id = gpu_ctx.dev_id = 0;
mxnet::ndarray::Copy<gpu, cpu>(
srcBlob, destBlob, gpu_ctx, cpu_ctx, allocGPUStream_.opContext_.run_ctx);
}
}
gpuBlobs_.clear(); // Force deallocation of the GPU blob data
cudaDeviceSynchronize();
}
private:
/*! \brief Reference to the src/dest CPU data */
const OpData& cpuData_;
/*! \brief The GPU-allocated blobs */
std::list<std::unique_ptr<test::StandaloneBlob>> gpuBlobs_;
/*! \brief Scoped GPU stream */
GPUStreamScope allocGPUStream_;
};
#endif // MXNET_USE_CUDA
protected:
OpData c_;
/*! \brief Allocate the operator's resource requests */
void allocateResources(const std::vector<ResourceRequest>& reqs) {
std::map<Context, Resource> cached_temp;
Context ctx;
ctx.dev_type = isGPU_ ? Context::kGPU : Context::kCPU;
ctx.dev_id = 0;
for (const ResourceRequest& req : reqs) {
switch (req.type) {
case ResourceRequest::kTempSpace: {
if (cached_temp.count(ctx) != 0) {
opContext_.requested.emplace_back(cached_temp.at(ctx));
} else {
Resource r = ResourceManager::Get()->Request(ctx, req);
opContext_.requested.emplace_back(r);
cached_temp[ctx] = r;
}
break;
}
case ResourceRequest::kRandom: {
opContext_.requested.emplace_back(ResourceManager::Get()->Request(ctx, req));
break;
}
case ResourceRequest::kParallelRandom: {
Resource rm = ResourceManager::Get()->Request(ctx, req);
if (ctx.dev_mask() == Context::kCPU) {
common::random::RandGenerator<cpu, DType>::AllocState(
rm.get_parallel_random<cpu, DType>());
}
opContext_.requested.emplace_back(rm);
break;
}
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
case ResourceRequest::kCuDNNDropoutDesc: {
opContext_.requested.push_back(ResourceManager::Get()->Request(ctx, req));
break;
}
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
default:
LOG(FATAL) << "resource type " << req.type << " is not yet supported";
}
}
}
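/*
* For example, an operator property that declares a temp-space requirement
* (a sketch; the override shown is the standard OperatorProperty hook):
*
*   std::vector<ResourceRequest> ForwardResource(
*       const mxnet::ShapeVector& in_shape) const override {
*     return {ResourceRequest(ResourceRequest::kTempSpace)};
*   }
*
* will have a matching temp-space Resource appended to opContext_.requested
* above (and cached per context, so repeated requests share one resource).
*/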
/*! \brief Locally allocate a managed TBlob and insert it into the supplied vector */
static TBlob* allocateBlob(std::list<std::unique_ptr<test::StandaloneBlob>>* standalone_blobs,
std::vector<TBlob>* dest,
const mxnet::TShape& shape,
const bool isGPU,
const int dtype) {
test::StandaloneBlob* blob = new test::StandaloneBlob(shape, isGPU, dtype);
CHECK_NE(blob, static_cast<TBlob*>(nullptr));
standalone_blobs->emplace_back(std::unique_ptr<test::StandaloneBlob>(blob));
(*dest).emplace_back(*blob);
return blob;
}
/*! \brief Locally allocate a managed TBlob and insert it into the supplied vector */
inline TBlob* allocateBlob(std::vector<TBlob>* dest,
const mxnet::TShape& shape,
const bool isGPU,
const int dtype) {
return allocateBlob(&standalone_blobs_, dest, shape, isGPU, dtype);
}
/*! \brief Performance timing categories */
enum TimingId { Forward, Backward };
/*! \brief The operator */
std::unique_ptr<Operator> op_;
/*! \brief Is this for a GPU? */
const bool isGPU_;
/*! \brief Ensure that forward initialization happens only once */
std::atomic<int> initializeForward_;
/*! \brief Ensure that backward initialization happens only once */
std::atomic<int> initializeBackward_;
/*! \brief Ensure that the callback is initialized only once */
std::atomic<int> initializeCallback_;
/*! \brief scoped lifecycle management of allocated blobs */
std::list<std::unique_ptr<test::StandaloneBlob>> standalone_blobs_;
};
template <typename OperatorProp, typename DType, typename AccReal>
using LegacyOpRunner =
mxnet::test::OperatorRunner<OperatorProp, LegacyOperatorExecutor<DType, AccReal>>;
} // namespace op
} // namespace test
} // namespace mxnet
#endif // TEST_LEGACY_OP_H_