blob: 9f5a2e04c54e5df180163dce8b740b42c44bfaba [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file test_tune.h
* \brief operator tuning tester
* \author Chris Olivier
*/
#ifndef TEST_TUNE_H_
#define TEST_TUNE_H_
#ifndef _WIN32
#include <sys/time.h>
#else
#include <Windows.h>
#endif
#include <dmlc/logging.h>
#include <algorithm>
#include <atomic>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "../../src/operator/operator_tune-inl.h"
#include "./test_util.h"
#include "./test_op.h"
#include "./test_core_op.h"
namespace mxnet {
namespace test {
namespace tune {
/*!
* \brief Tuning tests, which check whether the correct tuning mode is selected by Auto
* \note This class makes no attempt at being performant (i.e. it does all sorts of slow
* deep copies and that sort of thing), so don't insert any of this code in the main
* trunk unless you've verified the performance characteristics for that chunk of code
* \tparam DType Data type to test
*/
template<typename DType>
class TuningTester {
public:
using kwargs_t = test::op::kwargs_t;
using bool_mode_pair = std::pair<bool, ::mxnet::op::tune::TuningMode>;
using shape_vect = mxnet::ShapeVector;
using shape_vec_to_bool_map = std::map<shape_vect, bool_mode_pair, test::less_shapevect>;
private:
using ShapesToPerfTimingMap =
std::map<shape_vect, test::perf::timing_map_t, test::less_shapevect>;
/*!
* \brief Run timing test on various data shapes and sizes
* \param isGPU true if the GPU should be used for the timing test
* \param op_kwargs operator parameters
* \param op_name The operator's registered name (with nnvm)
* \param backward_op_name The backward operator's registered name (with nnvm)
* \return ShapesToPerfTimingMap map holsing timing data for shapes
*/
ShapesToPerfTimingMap RunCoreOpTimingTest(const bool isGPU,
const kwargs_t &op_kwargs,
const std::vector<shape_vect>& shapes,
const char *op_name,
const char *backward_op_name = "") {
ShapesToPerfTimingMap res;
const kwargs_t kwargs = test::op::CoreOpExecutor<DType>::ArgsWithOpName(
op_kwargs, op_name, backward_op_name);
// prime code and cache before the performance runs
test::op::CoreOperatorRunner<DType> runner;
runner.set_total_iterations(total_iterations_);
runner.set_verbose(false);
runner.RunBidirectional(false, {{10, 3, 18, 128}}, kwargs, 1);
// Do the performance runs
const char *pu = isGPU ? "GPU" : "CPU";
for (const mxnet::ShapeVector &this_run_shapes : shapes) {
test::perf::timing_map_t tmap = runner.TimingTest(std::string(op_name) + " Operator " + pu,
isGPU, false, kwargs,
0, calls_per_iteration_,
this_run_shapes);
CHECK(res.find(this_run_shapes) == res.end());
res[this_run_shapes] = tmap;
}
return res;
}
using tuned_timing_t = std::map<
shape_vect,
std::map<::mxnet::op::tune::TuningMode, test::perf::timing_map_t>, test::less_shapevect>;
using modesort_t = std::multimap<double, ::mxnet::op::tune::TuningMode>;
/*!
* \brief Check if the tuning succeeded
* \param mode_sort modesort_t structure produced by 'CalculateModeSort'
* \param closeness_factor fraction of largest standard time (omp, no omp) which is an acceptable
* range
* \return a pair <bool, TuningMode> consisting of true or false signifying if the test appears to
* have made the correct decision, and the TuningMode which was closest in timing to
* the Auto mode.
*/
static bool_mode_pair CheckCorrectTuning(const modesort_t &mode_sort,
const double closeness_factor = 0.25) {
CHECK_EQ(mode_sort.size(), 3U);
// Determine fastest normal mode
::mxnet::op::tune::TuningMode fastest_standard_mode = ::mxnet::op::tune::kAuto;
for (auto i = mode_sort.begin(), e = mode_sort.end(); i != e; ++i) {
if (i->second != ::mxnet::op::tune::kAuto) {
fastest_standard_mode = i->second;
break;
}
}
CHECK_NE(fastest_standard_mode, ::mxnet::op::tune::kAuto);
// We should be closest to the faster of kNeverOMP and kAlwaysOMP
// Take into account some variance, especially if kNeverOMP and kAlwaysOMP are close together
std::map<::mxnet::op::tune::TuningMode, double> mode2time;
for (auto i = mode_sort.begin(), e = mode_sort.end(); i != e; ++i) {
mode2time[i->second] = i->first;
}
const double time_auto = mode2time[::mxnet::op::tune::kAuto];
const double time_no_omp = mode2time[::mxnet::op::tune::kNeverOMP];
const double time_omp = mode2time[::mxnet::op::tune::kAlwaysOMP];
// Figure out which one we are closest to and return that to help in the analysis
::mxnet::op::tune::TuningMode closest_to;
if (fabs(time_auto - time_no_omp) < fabs(time_auto - time_omp)) {
closest_to = ::mxnet::op::tune::kNeverOMP;
} else {
closest_to = ::mxnet::op::tune::kAlwaysOMP;
}
// If difference between OMP and no OMP is < closeness_factor of largest of the two,
// then we just want to make sure we are close to both of these
const double fastest_standard_time = std::min(time_no_omp, time_omp);
const double allowed_difference = closeness_factor * fastest_standard_time;
const double mustbe_asfast = fastest_standard_time + allowed_difference;
return { time_auto <= mustbe_asfast || closest_to == fastest_standard_mode,
closest_to };
}
public:
/*!
* \brief Given timing statistics, determine if 'Auto' mode made the correct choice.
* \param direction Compute direction for which to check (Forward or Backward)
* \param verbose If true, print the statistical info
* \return A map of shape vectors to a pair <bool, TuningMode> consisting of true or false
* signifying if the test appears to have made the correct decision, and the TuningMode
* which was closest in timing to the Auto mode.
*/
shape_vec_to_bool_map CalculateModeSort(const test::op::TimingDirection direction,
bool verbose = true) const {
if (test::csv) {
verbose = false;
}
shape_vec_to_bool_map results;
// Incredibly inefficient method of grouping the results
for (const auto &i : timing_) {
// print shapes
const shape_vect &shapes = i.first;
if (verbose || test::csv) {
if (!test::csv) {
for (size_t x = 0, n = shapes.size(); x < n; ++x) {
const mxnet::TShape &shape = shapes[x];
if (x) {
std::cout << ", ";
}
std::cout << shape;
}
const mxnet::TShape &lhs_shape = shapes[0];
std::cout << " lhs=" << test::pretty_num(lhs_shape.Size()) << " items";
std::cout << "\t(" << TimingDirectionAsString(direction) << ")" << std::endl;
} else {
std::cout << test::pretty_num(shapes[0].Size()) << ",";
}
}
const auto &mode2timing = i.second;
modesort_t mode_sort;
for (const auto &j : mode2timing) {
const ::mxnet::op::tune::TuningMode mode = j.first;
const test::perf::timing_map_t &tm = j.second;
if (tm.find(direction) != tm.end()) {
const test::perf::TimingInstrument::Info &info = tm.find(direction)->second;
double duration = info.TimeEach();
mode_sort.insert({duration, mode});
if (test::csv) {
std::cout << TimingDirectionAsString(direction) << ","
<< ::mxnet::op::tune::TuningModeToString(mode) << ","
<< duration << ",";
}
}
}
if (test::csv) {
std::cout << std::endl << std::flush;
}
if (!mode_sort.empty()) {
// Now we have modes sorted by performance, fastest to slowest
const bool_mode_pair result = CheckCorrectTuning(mode_sort);
if (verbose && !test::csv) {
for (const auto &k : mode_sort) {
std::cout << "\t" << ::mxnet::op::tune::TuningModeToString(k.second)
<< ": " << k.first << " ms";
if (k.second == ::mxnet::op::tune::kAuto) {
std::cout << " (" << ::mxnet::op::tune::TuningModeToString(result.second) << ")";
}
std::cout << std::endl;
}
std::cout << std::flush;
if (!result.first) {
std::cout << "*** WARNING: Wrong OMP state selected ***" << std::endl << std::flush;
}
}
CHECK(results.find(shapes) == results.end()) << "Duplicate entry for set of shapes";
results[shapes] = result;
}
}
return results;
}
/*!
* \brief Perform execution runs for a given forward (and optionally backward) operator
* \param kwargs Parameters for the operator
* \param op_name Name by which the operator is registered with nnvm
* \param backward_op_name Backward operator name
*/
void TestTunedOperator(const kwargs_t &kwargs,
const bool verbose,
const std::vector<shape_vect>& shapevec_vectors,
const char *op_name,
const char *backward_op_name = COREOP_BWD_OP_NAME_VALUE_NONE) {
timing_.clear();
using namespace mxnet::op;
tuned_timing_t timing;
for (int x = 0; x < 1; ++x) {
for (auto mode : {::mxnet::op::tune::kNeverOMP,
::mxnet::op::tune::kAuto,
::mxnet::op::tune::kAlwaysOMP
}) {
if (verbose && !test::csv) {
std::cout << std::endl << ::mxnet::op::tune::TuningModeToString(mode)
<< std::endl << std::flush;
}
mxnet::op::OperatorTune<DType>::set_tuning_mode(mode);
const ShapesToPerfTimingMap shapes2perfmap = RunCoreOpTimingTest(false,
kwargs,
shapevec_vectors,
op_name,
backward_op_name);
for (const auto &item : shapes2perfmap) {
const shape_vect &shapes = item.first;
const test::perf::timing_map_t &tm = item.second;
timing_[shapes][mode] = tm;
}
}
}
}
/*!
* \brief Calculate the success rate of the run based upon Auto being close to the faster
* OMP/non-OMP attempt
* \param modes List of directions to use in calculation (Forward, Backward). Empty list means all
* \param verbose Whether to print info
* \return Success rate ratio (#success/#TOTAL) (0.0-1.0)
*/
float CalculateSuccessRate(std::vector<test::op::TimingDirection> directions = {},
bool verbose = true) const {
size_t count = 0, success = 0;
if (directions.empty()) {
directions = {test::op::kForward, test::op::kBackward};
}
for (const test::op::TimingDirection direction : directions) {
typename test::tune::TuningTester<DType>::shape_vec_to_bool_map res_fwd =
CalculateModeSort(direction, verbose);
for (auto iter = res_fwd.begin(), e = res_fwd.end(); iter != e; ++iter) {
++count;
if (iter->second.first) {
++success;
}
}
}
if (count) {
return static_cast<float>(success) / static_cast<float>(count);
}
return 1.0f; // nothing ventured, nothing failed (glass-is-half-full angle)
}
void set_calls_per_iteration(size_t calls_per_iterations) {
calls_per_iteration_ = calls_per_iterations;
}
size_t calls_per_iteration(size_t calls_per_iterations) const {
return calls_per_iteration_;
}
void set_total_iterations(size_t iterations) { total_iterations_ = iterations; }
size_t total_iterations(size_t iterations) const { return total_iterations_; }
private:
/*! \brief Number of iterations */
size_t total_iterations_ = 10;
/*! \brief Calls per iteration */
size_t calls_per_iteration_ = 50;
/*! \brief Raw timing data */
tuned_timing_t timing_;
};
} // namespace tune
} // namespace test
} // namespace mxnet
#endif // TEST_TUNE_H_