blob: 777fb47f9945fb3ada56afcff3e771af82c3ebeb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/**
* Copyright (c) 2015 by Contributors
*/
#ifndef MXNET_KVSTORE_GPU_TOPOLOGY_H_
#define MXNET_KVSTORE_GPU_TOPOLOGY_H_
#if MXNET_USE_CUDA
#include <cuda_runtime_api.h>
#include <cuda.h>
#endif
#include <iostream>
#include <vector>
#include <algorithm>
#include <utility>
#include <limits>
#include <random>
#include <stack>
#include <queue>
#include <string>
#include <unordered_set>
#include <unordered_map>
#define MXNET_KVSTORE_MAXDEPTH 16
namespace mxnet {
namespace kvstore {
static bool kLogTree = dmlc::GetEnv("MXNET_KVSTORE_LOGTREE", false);
template <typename T>
inline void PrintVector(const std::string& str, const std::vector<T>& vec) {
LOG(INFO) << str << ":";
std::string output;
for (unsigned i = 0; i < vec.size(); ++i)
output += std::to_string(vec[i]) + " ";
LOG(INFO) << output;
}
template <typename T>
inline void PrintMatrix(const std::string& str, const std::vector<T>& matrix,
int num_rows, int num_cols) {
LOG(INFO) << str << ":";
int count = 0;
for (int row = 0; row < num_rows; ++row) {
std::string output;
for (int col = 0; col < num_cols; ++col) {
output += std::to_string(static_cast<int>(matrix[count++])) + " ";
}
LOG(INFO) << output;
}
}
inline void PrintTopo(const std::string& str, const std::vector<size_t>& topo_row,
std::vector<size_t> scan_row) {
LOG(INFO) << str << ":";
int depth = scan_row.size()-1;
for (int row = 0; row < depth; ++row) {
int start = scan_row[row];
int end = scan_row[row+1];
std::string output;
for (; start < end; start++) {
for (int i = 0; i < (2 << (depth-row-2))+1; ++i) {
output += " ";
}
output += std::to_string(topo_row[start]);
}
LOG(INFO) << output;
}
}
/**
* \brief Uses BFS to find whether undirected graph is connected or not given its
* adjacency matrix
* Note: only consider matrix values > 1, because we care about whether it is
* connected using only NVLink connections
*/
template <typename T>
inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
int source = 0;
std::vector<bool> visited(num_gpus, false);
std::queue<int> work_list;
work_list.push(source);
visited[source] = true;
while (!work_list.empty()) {
int curr = work_list.front();
work_list.pop();
for (int i = 0; i < num_gpus; ++i) {
int neighbour = matrix[curr*num_gpus + i];
if (i != curr && neighbour > 1 && visited[i] == false) {
visited[i] = true;
work_list.push(i);
}
}
}
for (int i = 0; i < num_gpus; ++i) {
if (visited[i] == false)
return false;
}
return true;
}
/**
* \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
* \param devs is a vector of GPU contexts
* \param p2p_matrix is adjacency matrix of P2P connections where
* 0: no P2P connection
* 1: P2P connection
* \param matrix is adjacency matrix of link topology graph
* where edge weight represents relative performance of NVIDIA GPUs
* 0: Self-connection
* 1: PCI-E
* 2: 1 NVLink connection
* 3: 2 NVLink connections
*/
template <typename T>
inline void GetP2PWeight(const std::vector<Context>& devs,
const std::vector<int>& p2p_matrix,
std::vector<T>* matrix) {
int num_gpus = devs.size();
int count = 0;
std::vector<int> zero_dev_id(num_gpus, -1);
for (auto d : devs) {
zero_dev_id[count] = d.dev_id;
count++;
}
#if MXNET_USE_CUDA
cudaDeviceP2PAttr attr;
attr = cudaDevP2PAttrPerformanceRank;
std::vector<int> max(num_gpus, 0);
for (int row = 0; row < num_gpus; ++row) {
for (int col = 0; col < num_gpus; ++col) {
if (row == col) {
(*matrix)[row*num_gpus+col] = 0;
} else {
int value;
int row_gpu = zero_dev_id[row];
int col_gpu = zero_dev_id[col];
cudaDeviceGetP2PAttribute(&value, attr, row_gpu, col_gpu);
if (value > max[row])
max[row] = value;
(*matrix)[row*num_gpus+col] = static_cast<T>(value)+1;
}
}
}
// Check that all P2P connections are detected by GetP2PAttribute
// If yes, then continue as before
// If not, then treat fallback to using p2p_matrix (from EnableP2P)
//
// We have observed that with CUDA 9.0 p3.16xlarge:
//
// 0 2 2 3 3 1 1 1 . v v v v . . .
// 2 0 3 2 1 3 1 1 v . v v . v . .
// 2 3 0 3 1 1 2 1 v v . v . . v .
// 3 2 3 0 1 1 1 2 v v v . . . . v
// 3 1 1 1 0 2 2 3 v . . . . v v v
// 1 3 1 1 2 0 3 2 . v . . v . v v
// 1 1 2 1 2 3 0 3 . . v . v v . v
// 1 1 1 2 3 2 3 0 . . . v v v v .
//
// matrix p2p_matrix
//
// Here, they are correctly detected, because the 2s and 3s correspond to
// links that have P2P connections between them. However for CUDA 9.2 p3.16xlarge:
//
// 0 2 2 1 1 1 1 1 . v v v v . . .
// 2 0 1 2 1 1 1 1 v . v v . v . .
// 2 1 0 1 1 1 2 1 v v . v . . v .
// 1 2 1 0 1 1 1 2 v v v . . . . v
// 1 1 1 1 0 2 2 1 v . . . . v v v
// 1 1 1 1 2 0 1 2 . v . . v . v v
// 1 1 2 1 2 1 0 1 . . v . v v . v
// 1 1 1 2 1 2 1 0 . . . v v v v .
//
// matrix p2p_matrix
//
// The fastest connections (3 - double NVLink) are not recognized as being any
if (kLogTree) {
PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
}
// different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
bool matrix_correct = true;
for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
matrix_correct = false;
break;
}
}
if (!matrix_correct) {
LOG(WARNING) << "cudaDeviceGetP2PAttribute incorrect. "
<< "Falling back to cudaDeviceEnablePeerAccess for topology detection";
for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
if (p2p_matrix[i] > 0)
(*matrix)[i] = 2;
else
(*matrix)[i] = 1;
}
}
// If all GPUs are connected by NVLink, then we can use NVLink only
// to communicate instead of going over PCI-E, so we set PCI-E links to 0
//
// Otherwise, we will make distinction between PCI-E GPUDirect links and
// PCI-E through CPU links, which are slower and show queueing effect (i.e.
// The most packets there are, the slower).
//
// For the latter links, we will set links that were 0 to 1/num_gpus to
// account for this queuing effect.
bool connected = IsConnected(*matrix, num_gpus);
if (connected) {
for (auto& matrix_value : *matrix) {
matrix_value = (matrix_value == 1) ? 0 : matrix_value;
}
} else {
for (auto& matrix_value : *matrix) {
matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
}
}
if (kLogTree)
PrintMatrix("Weight", *matrix, num_gpus, num_gpus);
#else
LOG(WARNING) << "GPU required for link topology";
#endif
}
/**
* \brief Dense matrix-vector multiplication
* Assume: matrix is square
* y = A*x (no accumulate)
*/
template <typename T>
inline void gemv(const std::vector<T>& A, const std::vector<int>& x,
std::vector<T>* y) {
int nrows = x.size();
int count = 0;
for (int row=0; row < nrows; ++row) {
(*y)[row] = 0;
for (int col=0; col < nrows; ++col) {
(*y)[row] += A[count]*static_cast<T>(x[col]);
count++;
}
}
}
/**
* \brief Element-wise multiplication between 2 dense vectors
* w = w * alpha*u
*/
template <typename T>
inline void ewisemult(const std::vector<int>& u, T alpha, std::vector<T>* w) {
int nelem = u.size();
for (int i=0; i < nelem; ++i) {
(*w)[i] *= alpha*static_cast<T>(u[i]);
}
}
/**
* \brief Computes best 2 nodes a,b to swap given objective function:
* g = max_{a \in A, b \in B} D(a) + D(b) - 2*W(a,b)
*
* Optimization: Only need to look at upper triangular since weight matrix is
* symmetric
*/
template <typename T>
inline void FindBestMove(const std::vector<T>& W,
const std::vector<int>& P_temp,
const std::vector<T>& D,
const std::unordered_set<int>& used,
int* a, int* b, T* g) {
int nrows = P_temp.size();
*g = 0;
*a = -1;
*b = -1;
for (int row=0; row < nrows; ++row) {
if (P_temp[row] == 0 || used.find(row) != used.end()) continue;
for (int col=row+1; col < nrows; ++col) {
if (P_temp[col] == 0 || P_temp[row] == P_temp[col]) continue;
T cost = D[row]+D[col]-2*W[row*nrows+col];
if (cost > *g) {
*g = cost;
*a = row;
*b = col;
}
}
}
}
/**
* \brief Performs partition on each existing partition in graph W if partition has
* more than 4 elements in it
* \param stop returns true if no partitions with >=4 elements found
* returns false otherwise
* \param cluster_pairs stores the mapping that tells us which 2 clusters are
* the output of partitioning one large cluster
*/
template <typename T>
inline bool KernighanLin(const std::vector<T>& W, std::vector<int>* P,
int* num_partitions,
std::vector<std::pair<int, int>>* cluster_pairs,
std::mt19937* gen) {
std::vector<int> histogram(*num_partitions, 0);
std::vector<int> P_temp(P->size(), 0);
std::vector<int> P_temp2(P->size(), 0);
std::vector<T> D(P->size(), 0);
std::vector<T> D_temp(P->size(), 0);
// 0) For every partition, determine if it can be partitioned further.
// To do this, we must do a histogram of each partition:
for (int partition : *P) {
histogram[partition]++;
}
bool stop = true;
for (unsigned color=0; color < histogram.size(); ++color) {
int partition_size = histogram[color];
// Save cluster in preparation for push to topo in GenerateBinaryTree()
if (partition_size <= 2) {
cluster_pairs->push_back(
std::pair<int, int>(static_cast<int>(color), -partition_size));
// Do Kernighan-Lin if clustering is necessary
} else {
stop = false;
// 1) If it has more than 4 elements, we can partition further.
// Assign random balanced partition of it
// -balanced is more important than random, so allocate first half to A
// and rest to B
int first_partition = 0;
int target_partition = partition_size/2;
std::vector<int> cluster_list;
for (unsigned i = 0; i < P->size(); ++i) {
// Required to shift from [0,1] to {-1,1}
// 1 means vertex i is in Cluster A
// -1 means vertex i is in Cluster B
if ((*P)[i] == static_cast<int>(color)) {
cluster_list.push_back(i);
} else {
P_temp[i] = 0;
}
}
// 1b) Shuffle using random generator
std::shuffle(cluster_list.begin(), cluster_list.end(), *gen);
for (int cluster : cluster_list) {
if (first_partition < target_partition) {
int dest = cluster;
P_temp[dest] = 1;
first_partition++;
} else {
int dest = cluster;
P_temp[dest] = -1;
}
}
// 2) Do iterations of Kernighan-Lin until convergence
T g_max = 0;
int g_k = -1;
unsigned count = 0;
do {
count++;
P_temp2 = P_temp;
// a) Compute difference between external and internal costs of all
// elements in vector D
gemv(W, P_temp, &D);
ewisemult(P_temp, -1.f, &D);
// av and bv are used to hold candidates for moving
// gv stores the score associated with move
std::vector<int> av;
std::vector<int> bv;
std::vector<T> gv;
std::unordered_set<int> used;
for (int iter=0; iter < partition_size/2; ++iter) {
// b) Find best move by looking through upper triangular of W matrix
int a, b;
T g;
FindBestMove(W, P_temp, D, used, &a, &b, &g);
if (g > 0) {
} else {
g_max = 0;
break;
}
// c) Store best move to av, bv, gv
av.push_back(a);
bv.push_back(b);
gv.push_back(g);
// d) Eliminate best move from consideration in vector P_temp
P_temp[a] *= -1;
P_temp[b] *= -1;
used.insert(a);
used.insert(b);
// e) Update D using P_temp
gemv(W, P_temp, &D);
ewisemult(P_temp, -1.f, &D);
D[a] = 0;
D[b] = 0;
}
// 3) Find when to stop by doing linear scan through gv
// Recompute score g_max
for (unsigned k = 0; k < gv.size(); ++k) {
if (k > 0)
gv[k] += gv[k-1];
if (gv[k] > g_max) {
g_max = gv[k];
g_k = k + 1;
}
}
// 4) If move is "good", commit moves by updating P_temp and P_temp2
// Otherwise, rollback changes to P_temp2
if (g_max > 0) {
for (int i = 0; i < g_k; i++) {
int a = av[i];
int b = bv[i];
int temp = P_temp2[a];
P_temp2[a] = P_temp2[b];
P_temp2[b] = temp;
P_temp = P_temp2;
}
} else {
P_temp = P_temp2;
}
} while (g_max > 0 && count <= P->size());
// 5) Update P using P_temp
int moves = 0;
for (unsigned i=0; i < P->size(); ++i) {
if (P_temp[i] == -1) {
(*P)[i] = *num_partitions;
moves++;
}
}
cluster_pairs->push_back(std::pair<int, int>(static_cast<int>(color),
static_cast<int>(*num_partitions)));
(*num_partitions)++;
}
}
return stop;
}
/**
* \brief Returns root of a given color if found in roots
* Returns -1 if it is not found
*/
inline int GetRoot(const std::vector<int>& P, int color,
const std::unordered_set<int>& roots) {
for (auto root : roots) {
if (P[root] == color)
return root;
}
return -1;
}
/**
* \brief Returns root of a given color if found in roots
* Returns -1 if it is not found
*/
inline int GetChild(const std::vector<int>& P, int color, int parent) {
for (unsigned i = 0; i < P.size(); ++i) {
if (P[i] == color && static_cast<int>(i) != parent)
return i;
}
return -1;
}
// Computes highest weighted edge a-b
//
// Contraints:
// -vertex a must be parent
// -vertex b must be in dest_cluster
//
// @output: b is vector of candidates if a tie happens
// g is weight of edge
// Optimization: Only need to look at row a in matrix
template <typename T>
inline void FindBestEdge(const std::vector<T>& W, const std::vector<int>& P,
int parent, int dest_cluster, std::vector<int>* b, T* g) {
int nrows = P.size();
int row = parent;
*g = 0;
b->push_back(-1);
for (int col=0; col < nrows; ++col) {
if (col == row || P[col] != dest_cluster) continue;
T cost = W[row*nrows+col];
if (cost > *g) {
b->clear();
}
if (cost >= *g) {
b->push_back(col);
*g = cost;
}
}
}
// Given a vector of color pairs, appends to binary tree matrix topo
// @input: W gives the link topology
// P gives the result of KL partitioning
// cluster_pairs gives pairing between clusters, an edge is found
// between each pairing
// roots gives source vertices
// gen gives random number generation to break ties
// @output: cluster_pairs
// topo_row says where new edges are appended to
// scan_row says where we should start looking for topo_row
template <typename T>
inline int KLGenerateBinaryTree(const std::vector<T>& W,
const std::vector<int>& P,
std::vector<std::pair<int, int>>* cluster_pairs,
std::unordered_set<int>* roots,
std::vector<size_t>* topo_row,
std::vector<size_t>* scan_row,
std::mt19937* gen) {
std::unordered_set<int> new_roots;
std::unordered_map<int, int> new_topo;
int reset = 0;
for (unsigned i = 0; i < cluster_pairs->size(); ++i) {
if (i == 0)
scan_row->push_back(topo_row->size());
int parent, child = -1;
if ((*cluster_pairs)[i].second == -2) {
// Root must be color of pair.first
int color = (*cluster_pairs)[i].first;
parent = GetRoot(P, color, *roots);
if (parent == -1) return 1;
child = GetChild(P, color, parent);
} else if ((*cluster_pairs)[i].second == -1) {
int color = (*cluster_pairs)[i].first;
parent = GetRoot(P, color, *roots);
if (parent == -1) return 1;
child = parent;
} else {
// Root must exist in either first or second element of pair
int color = (*cluster_pairs)[i].first;
parent = GetRoot(P, color, *roots);
color = (parent == -1) ? (*cluster_pairs)[i].second : color;
parent = (parent == -1) ? GetRoot(P, color, *roots) : parent;
int from_cluster = color;
int dest_cluster = (from_cluster == (*cluster_pairs)[i].first) ?
(*cluster_pairs)[i].second : (*cluster_pairs)[i].first;
std::vector<int> candidates;
T weight;
FindBestEdge(W, P, parent, dest_cluster, &candidates, &weight);
// If no candidates
if (candidates[0] != -1) {
std::shuffle(candidates.begin(), candidates.end(), *gen);
child = candidates[0];
}
if (child == -1) {
new_roots.insert(parent);
return 1;
} else {
new_roots.insert(parent);
new_roots.insert(child);
}
}
new_topo[parent] = child;
}
int depth = scan_row->size();
int start = (*scan_row)[depth-2];
int end = (*scan_row)[depth-1];
for (int i = start; i < end; ++i) {
int parent = (*topo_row)[i];
int child;
// If not first, check previous level whether or not we are encountering
// this root for the first time in this level of the tree
if (i != start && parent == static_cast<int>((*topo_row)[i-1]))
child = parent;
else
child = new_topo[parent];
topo_row->push_back(parent);
topo_row->push_back(child);
}
cluster_pairs->clear();
roots->clear();
*roots = std::move(new_roots);
return reset;
}
// @input: n is the number of nodes in a balanced binary tree
// @output: returns how many levels of binary tree there are
inline int ComputeDepth(int n) {
for (int depth = 0; depth < MXNET_KVSTORE_MAXDEPTH; ++depth) {
int num = 2 << depth;
if (n <= num)
return depth+1;
}
return 0;
}
// Checks whether a given state forms a spanning tree that satisfies:
// -balanced
// -binary
// -each edge in tree corresponds to link in network topology
// -each edge in tree does not form self-loop
template <typename T>
inline bool IsValid(const std::vector<T>& W, const std::vector<int>& state,
int num_elements, int row, int depth) {
// At each level of tree, check whether edge:
// -corresponds to link in network topology
// -corresponds to self-loop
for (int i = 0; i < depth; ++i) {
int stride = 1 << i;
for (int j = 0; j+stride < row; j += 2*stride) {
int from = state[j];
int dest = state[j+stride];
if (W[from*num_elements + dest] == static_cast<T>(0) && from != dest) {
return false;
}
}
}
// If we encounter GPU for first time, increment found_vec.
// Otherwise, do nothing
std::unordered_set<int> found;
std::vector<int> found_vec(num_elements, 0);
for (auto val : state) {
if (val == -1)
continue;
if (val < num_elements) {
if (found.find(val) == found.end()) {
found.insert(val);
found_vec[val] = 1;
}
} else {
return false;
}
}
// modifier is maximum number of repeats a single GPU can take
// e.g. 5 GPUs in 3-level binary tree => one GPU can repeat 3x
// GPU0 GPU0 GPU0 GPU0 GPU1 GPU2 GPU3 GPU4
int modifier = (1 << depth) - num_elements;
int num_found = found.size();
// So we know we have an invalid state if we find:
// -only 4 unique GPUs
// -9 unique GPUs
if (row < num_elements) {
if (num_found > row || num_found < row - modifier) {
return false;
}
// If we are at last recursive level, we can apply a more stringent check:
// -if some GPU is not found, then we are in invalid state
} else if (row == static_cast<int>(state.size())) {
for (int i = 0; i < num_elements; ++i) {
if (found_vec[i] == 0) {
return false;
}
}
}
return true;
}
// This function takes a spanning tree encoded as state (result), which may have
// repeated GPUs representing NO-SENDs and converts it into a unique format.
// This has the effect of recognizing redundant sends, grouping them together,
// so that the Reduce call knows not to perform a CopyFromTo.
//
// Initial result: [3 0 0 4 1 2 5 6]
// Final result: [3 3 0 4 1 2 5 6]
//
// Initial:
// 3
// 3 1
// 3 0 1 5
// 3 0 0 4 1 2 5 6 // GPU3 will make redundant send to GPU0
//
// Final:
// 3
// 3 1
// 3 0 1 5
// 3 3 0 4 1 2 5 6 // GPU3 knows not to make redundant send to itself
inline void Postprocess(std::vector<int>* result, int num_elements, int depth) {
for (int level = depth - 1; level >= 0; --level) {
int stride = 1 << level;
std::vector<int> histogram_above(num_elements, 0);
for (unsigned i = 0; i < result->size(); i += 2*stride) {
int val = (*result)[i];
histogram_above[val]++;
}
std::vector<int> histogram(num_elements, 0);
for (unsigned i = 0; i < result->size(); i += stride) {
int val = (*result)[i];
histogram[val]++;
}
for (int i = result->size()-stride; i-stride >= 0; i -= 2*stride) {
int from = (*result)[i];
int dest = (*result)[i-stride];
if ((histogram[from] > 1 || histogram_above[from] >= 1) && from != dest) {
(*result)[i] = dest;
histogram[from]--;
}
}
}
}
// Given a spanning tree encoded as a state (result) and weight of each edge
// in the link topology graph, compute its weight.
// @input: penalty controls whether or not penalties are applied to tree
// -usually turned on when backtracking to get better solutions
// -usually turned off when outside the penalty to get weight of tree
template <typename T>
inline T ComputeTreeWeight(const std::vector<T>& W, const std::vector<int>& result,
int num_elements, int depth, bool penalty) {
T weight = 0.f;
std::unordered_set<int> links_used;
for (int i = 0; i < depth; ++i) {
int stride = 1 << i;
std::vector<bool> nodes_used(num_elements, false);
for (unsigned j = 0; j+stride < result.size(); j += 2*stride) {
int from = result[j];
int dest = result[j+stride];
if (from != dest) {
weight += W[from*num_elements+dest];
// Penalize: (1) use of redundant edges in a single tree
// (2) repeated use of a GPU in a single tree at the same
// level above the leaf level
if (links_used.find(from*num_elements+dest) != links_used.end()
&& penalty) {
weight -= 100;
}
links_used.insert(from*num_elements+dest);
links_used.insert(dest*num_elements+from);
}
nodes_used[from] = true;
if (i > 0 && nodes_used[dest] && penalty) {
weight -= 10;
}
nodes_used[dest] = true;
}
}
return weight;
}
/**
* \brief Given a spanning tree encoded as result, which was convenient for performing
* backtracking, convert it topology_ and scan_ in the classic "binary tree
* stored in an array" format. For binary trees scan_ is redundant, but this
* additional data structure leaves future generalization to k-radix trees.
*
* Initial result: [3 3 0 4 1 2 5 6]
* topology_: [3 3 1 3 0 1 5 3 3 0 4 1 2 5 6]
* scan_: [0 1 3 7 15]
*
* topology_ is stored in the classic "binary tree stored in an array" format
* e.g. 3
* 3 1
* 3 0 1 5
* 3 3 0 4 1 2 5 6
*
* Returns false if invalid tree in result
* Otherwise returns true
*/
inline bool FormTopology(const std::vector<int>& result,
std::vector<size_t>* topo_row,
std::vector<size_t>* scan_row,
int depth) {
for (int result_value : result)
if (result_value == -1)
return false;
scan_row->push_back(topo_row->size());
for (int i = depth; i > 0; --i) {
int stride = 1 << i;
for (unsigned j = 0; j < result.size(); j += stride) {
int from = result[j];
topo_row->push_back(from);
}
scan_row->push_back(topo_row->size());
}
// Insert at the end, result vector
topo_row->insert(topo_row->end(), result.begin(), result.end());
scan_row->push_back(topo_row->size());
return true;
}
/**
* \brief Recursive function that finds a spanning tree, which fulfills the following
* conditions:
* -balanced
* -binary
* -maximum weight
*/
template <typename T>
inline bool RecursiveBacktrack(const std::vector<T>& W,
std::vector<int>* state,
std::vector<int>* best_result,
T* best_result_weight,
int row,
int num_elements,
int depth,
bool optimal) {
if (row == static_cast<int>(state->size())) {
std::vector<int> result = *state;
Postprocess(&result, num_elements, depth);
T weight = ComputeTreeWeight(W, result, num_elements, depth, true);
// Save this spanning tree if it is highest weight tree found sofar
if (weight > *best_result_weight) {
std::swap(*best_result_weight, weight);
*best_result = result;
}
return !optimal;
}
// If not last recursive level, try to find valid tree for next level
bool stop = false;
for (int j = 0; j < num_elements; ++j) {
(*state)[row] = j;
if (IsValid(W, state, num_elements, row+1, depth))
stop = RecursiveBacktrack(W, state, best_result, best_result_weight,
row+1, num_elements, depth, optimal);
(*state)[row] = -1;
if (stop)
return stop;
}
return stop;
}
template <typename T>
inline void IterativeBacktrack(const std::vector<T>& W,
std::vector<int>* state,
std::vector<int>* best_result,
T* best_result_weight,
int row,
int num_elements,
int depth,
bool optimal) {
std::stack<int> state_stack;
row = 1;
int pos = 0;
state_stack.push(pos);
while (true) {
// If there is no valid position, 2 cases:
// a) if stack is empty, break and stop search
// b) if stack is not empty, pop stack and set current position to next
// position backtrack to previous row
while (!state_stack.empty() && pos >= num_elements) {
pos = state_stack.top();
pos++;
state_stack.pop();
(*state)[state_stack.size()+1] = -1;
row--;
}
if (state_stack.empty()) break;
(*state)[row] = pos;
// If there is a valid position push the position to stack, set current
// position to 0 and move to next row
if (IsValid(W, *state, num_elements, row+1, depth)) {
state_stack.push(pos);
pos = 0;
row++;
} else {
pos++;
(*state)[row] = -1;
}
// If stack has size N, a solution is found
// Pop stack, set current position to next position
// Backtrack to find next solution
if (row == static_cast<int>(state->size())) {
std::vector<int> result = *state;
Postprocess(&result, num_elements, depth);
T weight = ComputeTreeWeight(W, result, num_elements, depth, true);
// Save this spanning tree if it is highest weight tree found so far
if (weight > *best_result_weight) {
std::swap(*best_result_weight, weight);
*best_result = result;
}
if (!optimal) break;
pos = state_stack.top();
pos++;
state_stack.pop();
(*state)[state_stack.size()] = -1;
row--;
}
}
}
/**
* \brief Apply penalty factor alpha to each link in link topology graph that is used
* by the spanning tree
*/
template <typename T>
inline void UpdateWeight(std::vector<T>* W, const std::vector<size_t>& topo_row,
int num_elements, float alpha) {
for (unsigned i = 1; i < topo_row.size() - 1; i += 2) {
unsigned parent = topo_row[i];
unsigned child = topo_row[i+1];
if (!(parent >= num_elements*num_elements ||
child >= num_elements*num_elements) && (parent != child)) {
(*W)[parent*num_elements+child] *= alpha;
(*W)[child*num_elements+parent] *= alpha;
}
}
}
/**
* \brief Do brute-force backtracking approach if Kernighan-Lin fails to find a binary
* tree of height Log P.
*
* Constraints:
* 1) minimize depth (balance)
* 2) maximize edge weight
* 3) tree is binary
*/
template <typename T>
inline bool BacktrackGenerateBinaryTree(std::vector<T>* W,
int num_elements,
int root,
std::vector<size_t>* topo_row,
std::vector<size_t>* scan_row) {
// Clear before starting
topo_row->clear();
scan_row->clear();
// Compute depth
// num_elements: depth
// 5: 3 8
// 6: 3 8
// 7: 3 8
// 8: 3 8
// 9: 4 16
int depth = ComputeDepth(num_elements);
int depth_leaves = 1 << depth;
// State vector
// -1 means unplaced
std::vector<int> state(depth_leaves, -1);
std::vector<int> result(depth_leaves, -1);
T result_weight = std::numeric_limits<T>::lowest();
// Place root and try all combinations
state[0] = root;
// Seek optimal solution until depth <= 3 i.e. 8 GPUs
// For larger numbers of GPUs, settle for first tree found (non-optimal), but
// this saves a lot of runtime, because Backtrack is exponential time
if (depth <= 3) {
IterativeBacktrack(*W, &state, &result, &result_weight, 1, num_elements,
depth, true);
} else {
IterativeBacktrack(*W, &state, &result, &result_weight, 1, num_elements,
depth, false);
}
return FormTopology(result, topo_row, scan_row, depth);
}
/**
* \brief ComputeTreesFromRoot does the same thing as ComputeTrees, with the only
* exception being it will do it from a fixed GPU as root
*/
template <typename T>
inline void ComputeTreesFromRoot(std::vector<T>* W,
int num_elements,
int root,
float alpha,
bool backtrack,
std::vector<size_t>* topo,
std::vector<size_t>* scan) {
int num_partitions = 1;
// Initialize partition array to indicate which partition each element belongs
// to beginning with 0
std::vector<int> P(num_elements, 0);
// Initialize vector of pairs that will tell us edges between what 2 clusters
// we should be looking to build the tree from
std::vector<std::pair<int, int>> cluster_pairs;
// Initialize vector of roots that will tell us edges between
std::unordered_set<int> roots;
roots.insert(root);
// Will be used to obtain a seed for the random number engine
// RNG: Standard mersenne_twister_engine seeded with rd()
// -use 0 for testing (TODO: remove this)
// std::random_device rd;
// std::mt19937 gen(rd());
std::mt19937 gen(1);
// Temporary variables for rewinding
std::vector<int> P_temp;
int num_partitions_temp;
std::unordered_set<int> roots_temp;
std::vector<size_t> topo_temp;
std::vector<size_t> scan_temp;
// Determine number of partition levels
// If first partition, determine root of maximal spanning tree
bool stop = false;
int reset = 1;
int level = 0;
while (!backtrack && (!stop || reset)) {
if (reset == 1) {
cluster_pairs.clear();
P_temp = P;
num_partitions_temp = num_partitions;
roots_temp = roots;
topo_temp = *topo;
scan_temp = *scan;
}
// Run Kernighan-Lin to generate partition
stop = KernighanLin(*W, &P_temp, &num_partitions_temp, &cluster_pairs,
&gen);
// Use partitions found and a given root to find best inter-cluster edge for
// each pair of clusters, and returns them as roots of next cluster
// If reset is true, then rewind back to previous clustering
reset = KLGenerateBinaryTree(*W, P_temp, &cluster_pairs, &roots_temp,
&topo_temp, &scan_temp, &gen);
if (reset)
level++;
if (level > 10) break;
}
bool success = true;
if (reset == 1) {
LOG(INFO) << "No valid binary tree found from root " << root << ", try backtracking";
success = BacktrackGenerateBinaryTree(W, num_elements, root, topo, scan);
} else {
*topo = topo_temp;
*scan = scan_temp;
scan->push_back(topo->size());
}
if (success)
UpdateWeight(W, *topo, num_elements, alpha);
else
LOG(FATAL) << "No valid binary tree found from root " << root << " using backtracking";
}
/**
* \brief ComputeTrees computes balanced binary spanning trees of maximum edge weight
* given a link topology graph stored in adjacency matrix format
* \param W is the link topology matrix
* \param num_elements is the number of GPUs
* \param alpha is the link usage penalty
* \param backtrack is whether or not we use backtracking to generate trees
* \param topo stores the trees generated
* \param scan stores the start of each level of each tree
*/
template <typename T>
inline void ComputeTrees(const std::vector<T>& W,
int num_elements,
float alpha,
bool backtrack,
std::vector<std::vector<size_t>>* topo,
std::vector<std::vector<size_t>>* scan) {
std::vector<T> W_copy = W;
topo->clear();
scan->clear();
for (int i = 0; i < num_elements; ++i) {
topo->push_back(std::vector<size_t>());
scan->push_back(std::vector<size_t>());
(*topo)[i].push_back(i);
(*scan)[i].push_back(0);
ComputeTreesFromRoot(&W_copy, num_elements, i, alpha, backtrack,
&((*topo)[i]), &((*scan)[i]));
}
// Note: must sum up adj matrix to show link usage before we readjust topo
// from 0, 1, ..., n_gpus format to dev_id format, which will cause segfault
std::vector<int> adj(W.size(), 0);
for (int row = 0; row < num_elements; ++row) {
for (unsigned col = 1; col < (*topo)[0].size(); col += 2) {
int from = std::min((*topo)[row][col], (*topo)[row][col+1]);
int dest = std::max((*topo)[row][col], (*topo)[row][col+1]);
if (from != dest) {
adj.at(from*num_elements+dest) += 1;
adj.at(dest*num_elements+from) += 1;
}
}
}
std::vector<std::vector<size_t>> topo_temp(num_elements,
std::vector<size_t>());
if (kLogTree) {
for (int i = 0; i < num_elements; ++i)
PrintTopo("Tree "+std::to_string(i), (*topo)[i], (*scan)[i]);
PrintMatrix("W", W, num_elements, num_elements);
PrintMatrix("Links", adj, num_elements, num_elements);
}
}
} // namespace kvstore
} // namespace mxnet
#endif // MXNET_KVSTORE_GPU_TOPOLOGY_H_