tests/cpp/kvstore/gpu_topology_test.cc - mxnet - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file gpu_topology_test.cc
  * \brief gpu topology tests
  */

 #if MXNET_USE_CUDA

 #include <gtest/gtest.h>
 #include <mxnet/base.h>
 #include <mxnet/kvstore.h>
 #include "../src/kvstore/gpu_topology.h"

 void GenerateMatrix(std::vector<float>* W, int num_gpus, std::mt19937* gen) {
   std::uniform_real_distribution<> dis(0., 1.);
   for (int row = 0; row < num_gpus; ++row) {
     for (int col = row + 1; col < num_gpus; ++col) {
       double sample = dis(*gen);
       if (sample < 0.33) {
         (*W)[row * num_gpus + col] = 1.;
         (*W)[col * num_gpus + row] = 1.;
       } else if (sample < 0.66f) {
         (*W)[row * num_gpus + col] = 2.;
         (*W)[col * num_gpus + row] = 2.;
       } else {
         (*W)[row * num_gpus + col] = 3.;
         (*W)[col * num_gpus + row] = 3.;
       }
     }
   }
 }

 bool IsSatisfactory(const std::vector<float>& W, int num_gpus, int depth) {
   for (int row = 0; row < num_gpus; ++row) {
     int out_edges = 0;
     for (int col = 0; col < num_gpus; ++col) {
       if (W[row * num_gpus + col] > 0.f)
         out_edges++;
     }
     if (out_edges < depth)
       return false;
   }
   return true;
 }

 // Generates random link topology matrix using random number generator
 void TestComputeTreesRandomized(int num_gpus, float alpha, int backtrack, std::mt19937* gen) {
   std::uniform_real_distribution<> dis(0.f, 1.f);
   bool satisfied = false;
   std::vector<float> W(num_gpus * num_gpus, 0.f);
   int depth = mxnet::kvstore::ComputeDepth(num_gpus);
   GenerateMatrix(&W, num_gpus, gen);
   satisfied = IsSatisfactory(W, num_gpus, depth);
   if (mxnet::kvstore::kLogTree && !satisfied) {
     LOG(ERROR) << " topology connectivity not satisfied "
                   "(out edges per node less than tree depth)";
     mxnet::kvstore::PrintMatrix("W", W, num_gpus, num_gpus);
   }

   std::vector<std::vector<size_t>> topo;
   std::vector<std::vector<size_t>> scan;
   mxnet::kvstore::ComputeTrees(W, num_gpus, alpha, backtrack, &topo, &scan);

   unsigned correct_topo_size = (1 << (depth + 1)) - 1;
   unsigned correct_scan_size = depth + 2;
   ASSERT_EQ(topo.size(), static_cast<unsigned>(num_gpus));
   for (unsigned i = 0; i < topo.size(); ++i) {
     ASSERT_EQ(correct_topo_size, topo[i].size());
     ASSERT_EQ(correct_scan_size, scan[i].size());
   }
 }

 // Permutes matrix W using permutation vector P and stores output in matrix A
 // Assumption: W is square and symmetric
 void PermuteMatrix(const std::vector<int>& W, const std::vector<int>& P, std::vector<int>* A) {
   int nrows = P.size();
   std::vector<int> temp(nrows * nrows, 0);

   int count = 0;
   for (int row = 0; row < nrows; ++row) {
     for (int col = 0; col < nrows; ++col) {
       int row_start = P[row];
       temp[count]   = W[row_start * nrows + col];
       count++;
     }
   }

   count = 0;
   for (int row = 0; row < nrows; ++row) {
     for (int col = 0; col < nrows; ++col) {
       int col_index = P[col];
       (*A)[count]   = temp[row * nrows + col_index];
       count++;
     }
   }
 }

 TEST(GpuTopology, TestFormTopology) {
   std::vector<int> state0 = {3, 2, 1, 5, 0, 0, 4, 6};
   std::vector<size_t> topo0;
   std::vector<size_t> scan0;
   std::vector<int> correct0      = {3, 3, 0, 3, 1, 0, 4, 3, 2, 1, 5, 0, 0, 4, 6};
   std::vector<int> correct_scan0 = {0, 1, 3, 7, 15};
   mxnet::kvstore::FormTopology(state0, &topo0, &scan0, 3);
   ASSERT_EQ(topo0.size(), correct0.size());
   for (unsigned i = 0; i < correct0.size(); ++i)
     ASSERT_EQ(static_cast<int>(topo0[i]), correct0[i]);
   ASSERT_EQ(scan0.size(), correct_scan0.size());
   for (unsigned i = 0; i < correct_scan0.size(); ++i)
     ASSERT_EQ(static_cast<int>(scan0[i]), correct_scan0[i]);

   std::vector<int> state1 = {3, 2, 0, 4, 1, 1, 5, 6};
   std::vector<size_t> topo1;
   std::vector<size_t> scan1;
   std::vector<int> correct1      = {3, 3, 1, 3, 0, 1, 5, 3, 2, 0, 4, 1, 1, 5, 6};
   std::vector<int> correct_scan1 = {0, 1, 3, 7, 15};
   mxnet::kvstore::FormTopology(state1, &topo1, &scan1, 3);
   ASSERT_EQ(topo1.size(), correct1.size());
   for (unsigned i = 0; i < correct1.size(); ++i)
     ASSERT_EQ(static_cast<int>(topo1[i]), correct1[i]);
   ASSERT_EQ(scan1.size(), correct_scan1.size());
   for (unsigned i = 0; i < correct_scan1.size(); ++i)
     ASSERT_EQ(static_cast<int>(scan1[i]), correct_scan1[i]);
 }

 TEST(GpuTopology, TestComputeTreeWeight) {
   std::vector<int> W = {0, 2, 2, 3, 3, 0, 0, 2, 0, 3, 2, 0, 3, 0, 2, 3, 0, 3, 0, 0, 2, 3, 2, 3, 0,
                         0, 0, 0, 3, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 3, 0, 0, 2, 0, 2, 3, 0};

   std::vector<int> state0 = {3, 2, 1, 5, 0, 0, 4, 6};
   ASSERT_EQ(mxnet::kvstore::ComputeTreeWeight(W, state0, 7, 3, false), 16);

   std::vector<int> state1 = {3, 2, 0, 4, 1, 1, 5, 6};
   ASSERT_EQ(mxnet::kvstore::ComputeTreeWeight(W, state1, 7, 3, false), 17);
 }

 TEST(GpuTopology, TestPostprocess) {
   std::vector<int> result0  = {3, 0, 0, 4, 1, 2, 5, 6};
   std::vector<int> correct0 = {3, 3, 0, 4, 1, 2, 5, 6};
   mxnet::kvstore::Postprocess(&result0, 7, 3);
   for (unsigned i = 0; i < correct0.size(); ++i)
     ASSERT_EQ(result0[i], correct0[i]);

   std::vector<int> result1  = {2, 0, 0, 4, 1, 3, 5, 1};
   std::vector<int> correct1 = {2, 2, 0, 4, 1, 3, 5, 5};
   mxnet::kvstore::Postprocess(&result1, 6, 3);
   for (unsigned i = 0; i < correct1.size(); ++i)
     ASSERT_EQ(result1[i], correct1[i]);

   std::vector<int> result2  = {5, 4, 1, 3, 1, 0, 2, 0};
   std::vector<int> correct2 = {5, 4, 5, 3, 1, 0, 2, 2};
   mxnet::kvstore::Postprocess(&result2, 6, 3);
   for (unsigned i = 0; i < correct2.size(); ++i)
     ASSERT_EQ(result2[i], correct2[i]);

   std::vector<int> result3  = {10, 10, 0, 0, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9};
   std::vector<int> correct3 = {10, 10, 10, 10, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9};
   mxnet::kvstore::Postprocess(&result3, 11, 4);
   for (unsigned i = 0; i < correct3.size(); ++i)
     ASSERT_EQ(result3[i], correct3[i]);
 }

 TEST(GpuTopology, TestDepth) {
   ASSERT_EQ(mxnet::kvstore::ComputeDepth(2), 1);
   ASSERT_EQ(mxnet::kvstore::ComputeDepth(3), 2);
   ASSERT_EQ(mxnet::kvstore::ComputeDepth(8), 3);
   ASSERT_EQ(mxnet::kvstore::ComputeDepth(7), 3);
   ASSERT_EQ(mxnet::kvstore::ComputeDepth(5), 3);
   ASSERT_EQ(mxnet::kvstore::ComputeDepth(4), 2);
   ASSERT_EQ(mxnet::kvstore::ComputeDepth(16), 4);
 }

 TEST(GpuTopology, TestIsValid) {
   std::vector<int> W = {0, 2, 2, 3, 3, 0, 0, 2, 0, 3, 2, 0, 3, 0, 2, 3, 0, 3, 0, 0, 2, 3, 2, 3, 0,
                         0, 0, 0, 3, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 3, 0, 0, 2, 0, 2, 3, 0};

   std::vector<int> state0 = {3, 2, 1, 5, 0, 0, 4, 6};
   ASSERT_EQ(mxnet::kvstore::IsValid(W, state0, 7, 7, 3), true);

   // 3 connects to 1 first
   std::vector<int> state1 = {3, 2, 0, 4, 1, 1, 5, 6};
   ASSERT_EQ(mxnet::kvstore::IsValid(W, state1, 7, 7, 3), true);

   // 3 does not connect to 5
   std::vector<int> state2 = {3, 2, 5, 1, 0, 4, 2, 5};
   ASSERT_EQ(mxnet::kvstore::IsValid(W, state2, 7, 7, 3), false);

   // 7 exceeds number of GPUs
   std::vector<int> state3 = {3, 7, 2, 6, 0, 1, 4, 5};
   ASSERT_EQ(mxnet::kvstore::IsValid(W, state3, 7, 7, 3), false);

   // Test -1
   std::vector<int> state4 = {3, -1, 2, 6, 0, 1, 4, 5};
   ASSERT_EQ(mxnet::kvstore::IsValid(W, state4, 7, 7, 3), true);

   // Test -1
   std::vector<int> state5 = {3, -1, 2, 6, 0, 1, 4, -1};
   ASSERT_EQ(mxnet::kvstore::IsValid(W, state5, 7, 8, 3), false);

   // Test 1 row
   std::vector<int> state6 = {3, -1, -1, -1, -1, -1, -1, -1};
   ASSERT_EQ(mxnet::kvstore::IsValid(W, state6, 7, 1, 3), true);
 }

 // gemvTest
 TEST(GpuTopology, TestGemv) {
   std::vector<int> A = {0, 2, 2, 3, 3, 1, 1, 1,   // 13
                         2, 0, 3, 2, 1, 3, 1, 1,   // 13
                         2, 3, 0, 3, 1, 1, 2, 1,   // 13
                         3, 2, 3, 0, 1, 1, 1, 2,   // 13
                         3, 1, 1, 1, 0, 2, 2, 3,   // 13
                         1, 3, 1, 1, 2, 0, 3, 2,   // 13
                         1, 1, 2, 1, 2, 3, 0, 3,   // 13
                         1, 1, 1, 2, 3, 2, 3, 0};  // 13
   std::vector<int> x(8, 1);
   std::vector<int> y(8, 0);
   std::iota(y.begin(), y.end(), 0);
   std::vector<int> correct_y(8, 13);
   mxnet::kvstore::gemv(A, x, &y);

   ASSERT_EQ(y.size(), correct_y.size());
   for (unsigned i = 0; i < y.size(); ++i)
     ASSERT_EQ(y[i], correct_y[i]);
 }

 // ewisemultTest
 TEST(GpuTopology, TestEwisemult) {
   std::vector<int> x(8, 1);
   std::vector<int> y(8, 0);
   std::iota(y.begin(), y.end(), 0);
   int alpha                  = 5;
   std::vector<int> correct_y = {0, 5, 10, 15, 20, 25, 30, 35};
   mxnet::kvstore::ewisemult(x, alpha, &y);

   ASSERT_EQ(y.size(), correct_y.size());
   for (unsigned i = 0; i < y.size(); ++i)
     ASSERT_EQ(y[i], correct_y[i]);
 }

 // FindBestMoveTest
 TEST(GpuTopology, TestFindBestMove) {
   std::vector<int> W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
                         2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
                         2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
   std::vector<int> P(8, 0);
   std::iota(P.begin(), P.end(), 1);
   std::unordered_set<int> used;

   std::vector<int> D1 = {20, 0, 0, 0, 0, 0, 0, 20};
   int a1, b1, g1;
   int correct_a1 = 0;
   int correct_b1 = 7;
   int correct_g1 = 38;
   mxnet::kvstore::FindBestMove(W, P, D1, used, &a1, &b1, &g1);
   ASSERT_EQ(a1, correct_a1);
   ASSERT_EQ(b1, correct_b1);
   ASSERT_EQ(g1, correct_g1);

   // -1, -1, 0 indicates no best edge found
   std::vector<int> D2 = {0, 0, 0, 0, 0, 0, 0, 0};
   int a2, b2, g2;
   int correct_a2 = -1;
   int correct_b2 = -1;
   int correct_g2 = 0;
   mxnet::kvstore::FindBestMove(W, P, D2, used, &a2, &b2, &g2);
   ASSERT_EQ(a2, correct_a2);
   ASSERT_EQ(b2, correct_b2);
   ASSERT_EQ(g2, correct_g2);
 }

 // GetRootTest
 TEST(GpuTopology, TestGetRoot) {
   std::vector<int> P = {0, 0, 1, 1, 2, 2, 3, 3};

   // Test when roots are non-empty, and matches color
   std::unordered_set<int> roots1 = {0, 2, 4, 6};
   std::vector<int> color1        = {0, 1, 2, 3};
   for (unsigned i = 0; i < color1.size(); ++i) {
     int root1         = mxnet::kvstore::GetRoot(P, color1[i], roots1);
     int correct_root1 = 2 * i;
     ASSERT_EQ(root1, correct_root1);
   }

   // Test when roots is empty
   std::unordered_set<int> roots2;
   int color2        = 0;
   int correct_root2 = -1;
   int root2         = mxnet::kvstore::GetRoot(P, color2, roots2);
   ASSERT_EQ(root2, correct_root2);

   // Test when roots is non-empty, but no root matches color
   std::unordered_set<int> roots3 = {0};
   int color3                     = 1;
   int correct_root3              = -1;
   int root3                      = mxnet::kvstore::GetRoot(P, color3, roots3);
   ASSERT_EQ(root3, correct_root3);

   std::vector<int> P2            = {0, 1, 1, 0, 2, 3, 3, 2};
   std::unordered_set<int> roots4 = roots1;
   int color4                     = 0;
   int correct_root4              = 0;
   int root4                      = mxnet::kvstore::GetRoot(P, color4, roots4);
   ASSERT_EQ(root4, correct_root4);
 }

 // GetChildTest
 TEST(GpuTopology, TestGetChild) {
   std::vector<int> P = {0, 0, 1, 2, 2, 2, 3, 3};

   // Test when color is not found
   int color1         = 4;
   int parent1        = 4;
   int correct_child1 = -1;
   int child1         = mxnet::kvstore::GetChild(P, color1, parent1);
   ASSERT_EQ(child1, correct_child1);

   // Test when color is found, but is equal to parent
   int color2         = 1;
   int parent2        = 2;
   int correct_child2 = -1;
   int child2         = mxnet::kvstore::GetChild(P, color2, parent2);
   ASSERT_EQ(child2, correct_child2);

   // Test when color is found and not equal to parent
   int color3         = 3;
   int parent3        = 6;
   int correct_child3 = 7;
   int child3         = mxnet::kvstore::GetChild(P, color3, parent3);
   ASSERT_EQ(child3, correct_child3);
 }

 // FindBestEdgeTest
 TEST(GpuTopology, TestFindBestEdge) {
   std::vector<int> W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
                         2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
                         2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
   std::vector<int> P(8, 0);
   std::unordered_set<int> used;

   int parent1 = 3;
   int dest1   = 0;
   std::vector<int> b1;
   int g1;
   std::vector<int> correct_b1 = {0, 2};
   int correct_g1              = 3;
   mxnet::kvstore::FindBestEdge(W, P, parent1, dest1, &b1, &g1);
   ASSERT_EQ(b1.size(), correct_b1.size());
   for (unsigned i = 0; i < b1.size(); ++i)
     ASSERT_EQ(b1[i], correct_b1[i]);
   ASSERT_EQ(g1, correct_g1);

   // {-1}, 0 indicates no best edge found
   int parent2 = 4;
   int dest2   = 1;
   std::vector<int> b2;
   int g2;
   std::vector<int> correct_b2 = {-1};
   int correct_g2              = 0;
   mxnet::kvstore::FindBestEdge(W, P, parent2, dest2, &b2, &g2);
   ASSERT_EQ(b2.size(), correct_b2.size());
   for (unsigned i = 0; i < b2.size(); ++i)
     ASSERT_EQ(b2[i], correct_b2[i]);
   ASSERT_EQ(g2, correct_g2);
 }

 // KLGenerateBinaryTreeTest
 TEST(GpuTopology, TestKLGenerateBinaryTree1) {
   std::vector<int> W = {0, 2, 3, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
                         2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 3, 3, 1, 3, 1, 1,
                         2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
   std::vector<int> P = {0, 1, 1, 0, 2, 3, 3, 2};
   std::vector<std::pair<int, int>> cluster_pairs;
   cluster_pairs.push_back(std::pair<int, int>(0, -2));
   cluster_pairs.push_back(std::pair<int, int>(1, -2));
   cluster_pairs.push_back(std::pair<int, int>(2, -2));
   cluster_pairs.push_back(std::pair<int, int>(3, -2));
   std::unordered_set<int> roots = {0, 2, 4, 6};
   std::vector<size_t> topo      = {0, 2, 4, 6};
   std::vector<size_t> scan(2, 0);
   std::mt19937 gen(1);
   mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, &scan, &gen);
   std::vector<size_t> correct_topo = {0, 2, 4, 6, 0, 3, 2, 1, 4, 7, 6, 5};
   std::vector<size_t> correct_scan = {0, 0, 4};
   ASSERT_EQ(topo.size(), correct_topo.size());
   for (unsigned i = 0; i < topo.size(); ++i)
     ASSERT_EQ(topo[i], correct_topo[i]);
   ASSERT_EQ(scan.size(), correct_scan.size());
   for (unsigned i = 0; i < scan.size(); ++i)
     ASSERT_EQ(scan[i], correct_scan[i]);
 }

 TEST(GpuTopology, TestKLGenerateBinaryTree2) {
   std::vector<int> W = {0, 2, 3, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
                         2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 3, 3, 1, 3, 1, 1,
                         2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
   std::vector<int> P = {0, 1, 1, 0, 2, 3, 3, 2};
   std::vector<std::pair<int, int>> cluster_pairs;
   cluster_pairs.push_back(std::pair<int, int>(0, -2));
   cluster_pairs.push_back(std::pair<int, int>(1, -2));
   cluster_pairs.push_back(std::pair<int, int>(2, -2));
   cluster_pairs.push_back(std::pair<int, int>(3, -2));
   std::unordered_set<int> roots = {0, 2, 4, 6};
   std::vector<size_t> topo      = {0, 6, 4, 2};
   std::vector<size_t> scan(2, 0);
   std::mt19937 gen(1);
   mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, &scan, &gen);
   std::vector<size_t> correct_topo = {0, 6, 4, 2, 0, 3, 6, 5, 4, 7, 2, 1};
   std::vector<size_t> correct_scan = {0, 0, 4};
   ASSERT_EQ(topo.size(), correct_topo.size());
   for (unsigned i = 0; i < topo.size(); ++i)
     ASSERT_EQ(topo[i], correct_topo[i]);
   ASSERT_EQ(scan.size(), correct_scan.size());
   for (unsigned i = 0; i < scan.size(); ++i)
     ASSERT_EQ(scan[i], correct_scan[i]);
 }

 // UpdateWeightTest
 TEST(GpuTopology, TestUpdateWeight) {
   std::vector<float> W         = {0.f, 1.f, 1.f, 0.f};
   std::vector<size_t> topo     = {1, 1, 0};
   int num_gpus                 = 2;
   float alpha                  = 0.7;
   std::vector<float> correct_W = {0.f, 0.7f, 0.7f, 0.f};
   mxnet::kvstore::UpdateWeight(&W, topo, num_gpus, alpha);
   ASSERT_EQ(W.size(), correct_W.size());
   for (unsigned i = 0; i < W.size(); ++i) {
     ASSERT_EQ(W[i], correct_W[i]);
   }
 }

 // ComputeTreesFromRoot
 TEST(GpuTopology, TestComputeTreesFromRoot1) {
   std::vector<float> W       = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
                           2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
                           2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
   int num_gpus               = 8;
   int root                   = 0;
   float alpha                = 0.7;
   bool backtrack             = true;
   unsigned correct_topo_size = 15;
   unsigned correct_scan_size = 5;
   std::vector<size_t> topo;
   std::vector<size_t> scan;

   mxnet::kvstore::ComputeTreesFromRoot(&W, num_gpus, root, alpha, backtrack, &topo, &scan);

   ASSERT_EQ(topo.size(), correct_topo_size);
   ASSERT_EQ(scan.size(), correct_scan_size);
 }

 // IsConnected
 // Test on graph that is "disconnected" by NVLink
 TEST(GpuTopology, TestIsConnected1) {
   std::vector<float> W = {0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0};
   int num_gpus         = 4;

   bool connected = mxnet::kvstore::IsConnected(W, num_gpus);

   bool correct_connected = false;
   ASSERT_EQ(connected, correct_connected);
 }

 // IsConnected
 // Test on graph that is "disconnected" by NVLink
 TEST(GpuTopology, TestIsConnected2) {
   std::vector<float> W = {1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1};
   int num_gpus         = 4;

   bool connected = mxnet::kvstore::IsConnected(W, num_gpus);

   bool correct_connected = false;
   ASSERT_EQ(connected, correct_connected);
 }

 // IsConnected
 // Test on graph that is "disconnected" by NVLink
 TEST(GpuTopology, TestIsConnected3) {
   std::vector<float> W = {1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1};
   int num_gpus         = 4;

   bool connected = mxnet::kvstore::IsConnected(W, num_gpus);

   bool correct_connected = true;
   ASSERT_EQ(connected, correct_connected);
 }

 // ComputeTreesTest with backtracking
 TEST(GpuTopology, TestComputeTrees1) {
   std::mt19937 gen(1);
   float alpha    = 0.7;
   bool backtrack = true;
   for (int num_gpus = 2; num_gpus <= 8; ++num_gpus) {
     LOG(INFO) << "Testing " << num_gpus << " x " << num_gpus;
     for (int i = 0; i < 5; ++i) {
       TestComputeTreesRandomized(num_gpus, alpha, backtrack, &gen);
     }
   }
 }

 // ComputeTreesTest with Kernighan-Lin
 TEST(GpuTopology, TestComputeTrees2) {
   std::mt19937 gen(1);
   float alpha    = 0.7;
   bool backtrack = false;
   // Do 5 randomized tests per GPU count from 2 to 16
   for (int num_gpus = 2; num_gpus <= 16; ++num_gpus) {
     LOG(INFO) << "Testing " << num_gpus << " x " << num_gpus;
     for (int i = 0; i < 5; ++i) {
       TestComputeTreesRandomized(num_gpus, alpha, backtrack, &gen);
     }
   }
 }

 TEST(GpuTopology, TestPermuteMatrix) {
   std::vector<int> W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
                         2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
                         2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};

   std::vector<int> P1 = {0, 1, 2, 3, 4, 5, 6, 7};
   std::vector<int> A(8 * 8, 0);
   PermuteMatrix(W, P1, &A);
   for (unsigned i = 0; i < W.size(); ++i)
     ASSERT_EQ(A[i], W[i]);
 }

 TEST(GpuTopology, TestKernighanLin1) {
   std::vector<float> W = {0, 1, 2, 3, 2, 4, 1, 0, 1, 4, 2, 1, 2, 1, 0, 3, 2, 1,
                           3, 4, 3, 0, 4, 3, 2, 2, 2, 4, 0, 2, 4, 1, 1, 3, 2, 0};
   std::vector<int> P(6, 0);
   std::vector<std::pair<int, int>> cluster_pairs;
   int num_partitions = 1;
   std::mt19937 gen(1);
   bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, &cluster_pairs, &gen);

   std::vector<std::pair<int, int>> correct_pairs;
   correct_pairs.push_back(std::pair<int, int>(0, 1));
   std::vector<int> correct_P = {0, 1, 0, 1, 1, 0};
   ASSERT_EQ(stop, false);
   ASSERT_EQ(num_partitions, 2);
   ASSERT_EQ(cluster_pairs.size(), correct_pairs.size());
   for (unsigned i = 0; i < cluster_pairs.size(); ++i) {
     ASSERT_EQ(cluster_pairs[i].first, correct_pairs[i].first);
     ASSERT_EQ(cluster_pairs[i].second, correct_pairs[i].second);
   }
   ASSERT_EQ(P.size(), correct_P.size());
   unsigned error = 0;
   for (unsigned i = 0; i < P.size(); ++i) {
     if (P[i] != correct_P[i])
       error++;
   }
   EXPECT_TRUE(error == 0 || error == P.size())
       << "Where real value: " << error << " not equal neither: " << 0 << " nor: " << P.size()
       << ".";
 }

 TEST(GpuTopology, TestKernighanLin2) {
   std::vector<float> W = {0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
                           1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
                           1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0};
   std::vector<int> P(8, 0);
   std::vector<std::pair<int, int>> cluster_pairs;
   int num_partitions = 1;
   std::mt19937 gen(1);
   bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, &cluster_pairs, &gen);

   std::vector<std::pair<int, int>> correct_pairs;
   correct_pairs.push_back(std::pair<int, int>(0, 1));
   std::vector<int> correct_P = {0, 0, 1, 1, 0, 0, 1, 1};
   ASSERT_EQ(stop, false);
   ASSERT_EQ(num_partitions, 2);
   ASSERT_EQ(cluster_pairs.size(), correct_pairs.size());
   for (unsigned i = 0; i < cluster_pairs.size(); ++i) {
     ASSERT_EQ(cluster_pairs[i].first, correct_pairs[i].first);
     ASSERT_EQ(cluster_pairs[i].second, correct_pairs[i].second);
   }
   ASSERT_EQ(P.size(), correct_P.size());
   unsigned error = 0;
   for (unsigned i = 0; i < P.size(); ++i) {
     if (P[i] != correct_P[i])
       error++;
   }
   EXPECT_TRUE(error == 0 || error == P.size())
       << "Where real value: " << error << " not equal neither: " << 0 << " nor: " << P.size()
       << ".";
 }

 #endif  // MXNET_USE_CUDA
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* \file gpu_topology_test.cc
	* \brief gpu topology tests
	*/

	#if MXNET_USE_CUDA

	#include <gtest/gtest.h>
	#include <mxnet/base.h>
	#include <mxnet/kvstore.h>
	#include "../src/kvstore/gpu_topology.h"

	void GenerateMatrix(std::vector<float>* W, int num_gpus, std::mt19937* gen) {
	std::uniform_real_distribution<> dis(0., 1.);
	for (int row = 0; row < num_gpus; ++row) {
	for (int col = row + 1; col < num_gpus; ++col) {
	double sample = dis(*gen);
	if (sample < 0.33) {
	(W)[row num_gpus + col] = 1.;
	(W)[col num_gpus + row] = 1.;
	} else if (sample < 0.66f) {
	(W)[row num_gpus + col] = 2.;
	(W)[col num_gpus + row] = 2.;
	} else {
	(W)[row num_gpus + col] = 3.;
	(W)[col num_gpus + row] = 3.;
	}
	}
	}
	}

	bool IsSatisfactory(const std::vector<float>& W, int num_gpus, int depth) {
	for (int row = 0; row < num_gpus; ++row) {
	int out_edges = 0;
	for (int col = 0; col < num_gpus; ++col) {
	if (W[row * num_gpus + col] > 0.f)
	out_edges++;
	}
	if (out_edges < depth)
	return false;
	}
	return true;
	}

	// Generates random link topology matrix using random number generator
	void TestComputeTreesRandomized(int num_gpus, float alpha, int backtrack, std::mt19937* gen) {
	std::uniform_real_distribution<> dis(0.f, 1.f);
	bool satisfied = false;
	std::vector<float> W(num_gpus * num_gpus, 0.f);
	int depth = mxnet::kvstore::ComputeDepth(num_gpus);
	GenerateMatrix(&W, num_gpus, gen);
	satisfied = IsSatisfactory(W, num_gpus, depth);
	if (mxnet::kvstore::kLogTree && !satisfied) {
	LOG(ERROR) << " topology connectivity not satisfied "
	"(out edges per node less than tree depth)";
	mxnet::kvstore::PrintMatrix("W", W, num_gpus, num_gpus);
	}

	std::vector<std::vector<size_t>> topo;
	std::vector<std::vector<size_t>> scan;
	mxnet::kvstore::ComputeTrees(W, num_gpus, alpha, backtrack, &topo, &scan);

	unsigned correct_topo_size = (1 << (depth + 1)) - 1;
	unsigned correct_scan_size = depth + 2;
	ASSERT_EQ(topo.size(), static_cast<unsigned>(num_gpus));
	for (unsigned i = 0; i < topo.size(); ++i) {
	ASSERT_EQ(correct_topo_size, topo[i].size());
	ASSERT_EQ(correct_scan_size, scan[i].size());
	}
	}

	// Permutes matrix W using permutation vector P and stores output in matrix A
	// Assumption: W is square and symmetric
	void PermuteMatrix(const std::vector<int>& W, const std::vector<int>& P, std::vector<int>* A) {
	int nrows = P.size();
	std::vector<int> temp(nrows * nrows, 0);

	int count = 0;
	for (int row = 0; row < nrows; ++row) {
	for (int col = 0; col < nrows; ++col) {
	int row_start = P[row];
	temp[count] = W[row_start * nrows + col];
	count++;
	}
	}

	count = 0;
	for (int row = 0; row < nrows; ++row) {
	for (int col = 0; col < nrows; ++col) {
	int col_index = P[col];
	(A)[count] = temp[row nrows + col_index];
	count++;
	}
	}
	}

	TEST(GpuTopology, TestFormTopology) {
	std::vector<int> state0 = {3, 2, 1, 5, 0, 0, 4, 6};
	std::vector<size_t> topo0;
	std::vector<size_t> scan0;
	std::vector<int> correct0 = {3, 3, 0, 3, 1, 0, 4, 3, 2, 1, 5, 0, 0, 4, 6};
	std::vector<int> correct_scan0 = {0, 1, 3, 7, 15};
	mxnet::kvstore::FormTopology(state0, &topo0, &scan0, 3);
	ASSERT_EQ(topo0.size(), correct0.size());
	for (unsigned i = 0; i < correct0.size(); ++i)
	ASSERT_EQ(static_cast<int>(topo0[i]), correct0[i]);
	ASSERT_EQ(scan0.size(), correct_scan0.size());
	for (unsigned i = 0; i < correct_scan0.size(); ++i)
	ASSERT_EQ(static_cast<int>(scan0[i]), correct_scan0[i]);

	std::vector<int> state1 = {3, 2, 0, 4, 1, 1, 5, 6};
	std::vector<size_t> topo1;
	std::vector<size_t> scan1;
	std::vector<int> correct1 = {3, 3, 1, 3, 0, 1, 5, 3, 2, 0, 4, 1, 1, 5, 6};
	std::vector<int> correct_scan1 = {0, 1, 3, 7, 15};
	mxnet::kvstore::FormTopology(state1, &topo1, &scan1, 3);
	ASSERT_EQ(topo1.size(), correct1.size());
	for (unsigned i = 0; i < correct1.size(); ++i)
	ASSERT_EQ(static_cast<int>(topo1[i]), correct1[i]);
	ASSERT_EQ(scan1.size(), correct_scan1.size());
	for (unsigned i = 0; i < correct_scan1.size(); ++i)
	ASSERT_EQ(static_cast<int>(scan1[i]), correct_scan1[i]);
	}

	TEST(GpuTopology, TestComputeTreeWeight) {
	std::vector<int> W = {0, 2, 2, 3, 3, 0, 0, 2, 0, 3, 2, 0, 3, 0, 2, 3, 0, 3, 0, 0, 2, 3, 2, 3, 0,
	0, 0, 0, 3, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 3, 0, 0, 2, 0, 2, 3, 0};

	std::vector<int> state0 = {3, 2, 1, 5, 0, 0, 4, 6};
	ASSERT_EQ(mxnet::kvstore::ComputeTreeWeight(W, state0, 7, 3, false), 16);

	std::vector<int> state1 = {3, 2, 0, 4, 1, 1, 5, 6};
	ASSERT_EQ(mxnet::kvstore::ComputeTreeWeight(W, state1, 7, 3, false), 17);
	}

	TEST(GpuTopology, TestPostprocess) {
	std::vector<int> result0 = {3, 0, 0, 4, 1, 2, 5, 6};
	std::vector<int> correct0 = {3, 3, 0, 4, 1, 2, 5, 6};
	mxnet::kvstore::Postprocess(&result0, 7, 3);
	for (unsigned i = 0; i < correct0.size(); ++i)
	ASSERT_EQ(result0[i], correct0[i]);

	std::vector<int> result1 = {2, 0, 0, 4, 1, 3, 5, 1};
	std::vector<int> correct1 = {2, 2, 0, 4, 1, 3, 5, 5};
	mxnet::kvstore::Postprocess(&result1, 6, 3);
	for (unsigned i = 0; i < correct1.size(); ++i)
	ASSERT_EQ(result1[i], correct1[i]);

	std::vector<int> result2 = {5, 4, 1, 3, 1, 0, 2, 0};
	std::vector<int> correct2 = {5, 4, 5, 3, 1, 0, 2, 2};
	mxnet::kvstore::Postprocess(&result2, 6, 3);
	for (unsigned i = 0; i < correct2.size(); ++i)
	ASSERT_EQ(result2[i], correct2[i]);

	std::vector<int> result3 = {10, 10, 0, 0, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9};
	std::vector<int> correct3 = {10, 10, 10, 10, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9};
	mxnet::kvstore::Postprocess(&result3, 11, 4);
	for (unsigned i = 0; i < correct3.size(); ++i)
	ASSERT_EQ(result3[i], correct3[i]);
	}

	TEST(GpuTopology, TestDepth) {
	ASSERT_EQ(mxnet::kvstore::ComputeDepth(2), 1);
	ASSERT_EQ(mxnet::kvstore::ComputeDepth(3), 2);
	ASSERT_EQ(mxnet::kvstore::ComputeDepth(8), 3);
	ASSERT_EQ(mxnet::kvstore::ComputeDepth(7), 3);
	ASSERT_EQ(mxnet::kvstore::ComputeDepth(5), 3);
	ASSERT_EQ(mxnet::kvstore::ComputeDepth(4), 2);
	ASSERT_EQ(mxnet::kvstore::ComputeDepth(16), 4);
	}

	TEST(GpuTopology, TestIsValid) {
	std::vector<int> W = {0, 2, 2, 3, 3, 0, 0, 2, 0, 3, 2, 0, 3, 0, 2, 3, 0, 3, 0, 0, 2, 3, 2, 3, 0,
	0, 0, 0, 3, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 3, 0, 0, 2, 0, 2, 3, 0};

	std::vector<int> state0 = {3, 2, 1, 5, 0, 0, 4, 6};
	ASSERT_EQ(mxnet::kvstore::IsValid(W, state0, 7, 7, 3), true);

	// 3 connects to 1 first
	std::vector<int> state1 = {3, 2, 0, 4, 1, 1, 5, 6};
	ASSERT_EQ(mxnet::kvstore::IsValid(W, state1, 7, 7, 3), true);

	// 3 does not connect to 5
	std::vector<int> state2 = {3, 2, 5, 1, 0, 4, 2, 5};
	ASSERT_EQ(mxnet::kvstore::IsValid(W, state2, 7, 7, 3), false);

	// 7 exceeds number of GPUs
	std::vector<int> state3 = {3, 7, 2, 6, 0, 1, 4, 5};
	ASSERT_EQ(mxnet::kvstore::IsValid(W, state3, 7, 7, 3), false);

	// Test -1
	std::vector<int> state4 = {3, -1, 2, 6, 0, 1, 4, 5};
	ASSERT_EQ(mxnet::kvstore::IsValid(W, state4, 7, 7, 3), true);

	// Test -1
	std::vector<int> state5 = {3, -1, 2, 6, 0, 1, 4, -1};
	ASSERT_EQ(mxnet::kvstore::IsValid(W, state5, 7, 8, 3), false);

	// Test 1 row
	std::vector<int> state6 = {3, -1, -1, -1, -1, -1, -1, -1};
	ASSERT_EQ(mxnet::kvstore::IsValid(W, state6, 7, 1, 3), true);
	}

	// gemvTest
	TEST(GpuTopology, TestGemv) {
	std::vector<int> A = {0, 2, 2, 3, 3, 1, 1, 1, // 13
	2, 0, 3, 2, 1, 3, 1, 1, // 13
	2, 3, 0, 3, 1, 1, 2, 1, // 13
	3, 2, 3, 0, 1, 1, 1, 2, // 13
	3, 1, 1, 1, 0, 2, 2, 3, // 13
	1, 3, 1, 1, 2, 0, 3, 2, // 13
	1, 1, 2, 1, 2, 3, 0, 3, // 13
	1, 1, 1, 2, 3, 2, 3, 0}; // 13
	std::vector<int> x(8, 1);
	std::vector<int> y(8, 0);
	std::iota(y.begin(), y.end(), 0);
	std::vector<int> correct_y(8, 13);
	mxnet::kvstore::gemv(A, x, &y);

	ASSERT_EQ(y.size(), correct_y.size());
	for (unsigned i = 0; i < y.size(); ++i)
	ASSERT_EQ(y[i], correct_y[i]);
	}

	// ewisemultTest
	TEST(GpuTopology, TestEwisemult) {
	std::vector<int> x(8, 1);
	std::vector<int> y(8, 0);
	std::iota(y.begin(), y.end(), 0);
	int alpha = 5;
	std::vector<int> correct_y = {0, 5, 10, 15, 20, 25, 30, 35};
	mxnet::kvstore::ewisemult(x, alpha, &y);

	ASSERT_EQ(y.size(), correct_y.size());
	for (unsigned i = 0; i < y.size(); ++i)
	ASSERT_EQ(y[i], correct_y[i]);
	}

	// FindBestMoveTest
	TEST(GpuTopology, TestFindBestMove) {
	std::vector<int> W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
	2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
	2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
	std::vector<int> P(8, 0);
	std::iota(P.begin(), P.end(), 1);
	std::unordered_set<int> used;

	std::vector<int> D1 = {20, 0, 0, 0, 0, 0, 0, 20};
	int a1, b1, g1;
	int correct_a1 = 0;
	int correct_b1 = 7;
	int correct_g1 = 38;
	mxnet::kvstore::FindBestMove(W, P, D1, used, &a1, &b1, &g1);
	ASSERT_EQ(a1, correct_a1);
	ASSERT_EQ(b1, correct_b1);
	ASSERT_EQ(g1, correct_g1);

	// -1, -1, 0 indicates no best edge found
	std::vector<int> D2 = {0, 0, 0, 0, 0, 0, 0, 0};
	int a2, b2, g2;
	int correct_a2 = -1;
	int correct_b2 = -1;
	int correct_g2 = 0;
	mxnet::kvstore::FindBestMove(W, P, D2, used, &a2, &b2, &g2);
	ASSERT_EQ(a2, correct_a2);
	ASSERT_EQ(b2, correct_b2);
	ASSERT_EQ(g2, correct_g2);
	}

	// GetRootTest
	TEST(GpuTopology, TestGetRoot) {
	std::vector<int> P = {0, 0, 1, 1, 2, 2, 3, 3};

	// Test when roots are non-empty, and matches color
	std::unordered_set<int> roots1 = {0, 2, 4, 6};
	std::vector<int> color1 = {0, 1, 2, 3};
	for (unsigned i = 0; i < color1.size(); ++i) {
	int root1 = mxnet::kvstore::GetRoot(P, color1[i], roots1);
	int correct_root1 = 2 * i;
	ASSERT_EQ(root1, correct_root1);
	}

	// Test when roots is empty
	std::unordered_set<int> roots2;
	int color2 = 0;
	int correct_root2 = -1;
	int root2 = mxnet::kvstore::GetRoot(P, color2, roots2);
	ASSERT_EQ(root2, correct_root2);

	// Test when roots is non-empty, but no root matches color
	std::unordered_set<int> roots3 = {0};
	int color3 = 1;
	int correct_root3 = -1;
	int root3 = mxnet::kvstore::GetRoot(P, color3, roots3);
	ASSERT_EQ(root3, correct_root3);

	std::vector<int> P2 = {0, 1, 1, 0, 2, 3, 3, 2};
	std::unordered_set<int> roots4 = roots1;
	int color4 = 0;
	int correct_root4 = 0;
	int root4 = mxnet::kvstore::GetRoot(P, color4, roots4);
	ASSERT_EQ(root4, correct_root4);
	}

	// GetChildTest
	TEST(GpuTopology, TestGetChild) {
	std::vector<int> P = {0, 0, 1, 2, 2, 2, 3, 3};

	// Test when color is not found
	int color1 = 4;
	int parent1 = 4;
	int correct_child1 = -1;
	int child1 = mxnet::kvstore::GetChild(P, color1, parent1);
	ASSERT_EQ(child1, correct_child1);

	// Test when color is found, but is equal to parent
	int color2 = 1;
	int parent2 = 2;
	int correct_child2 = -1;
	int child2 = mxnet::kvstore::GetChild(P, color2, parent2);
	ASSERT_EQ(child2, correct_child2);

	// Test when color is found and not equal to parent
	int color3 = 3;
	int parent3 = 6;
	int correct_child3 = 7;
	int child3 = mxnet::kvstore::GetChild(P, color3, parent3);
	ASSERT_EQ(child3, correct_child3);
	}

	// FindBestEdgeTest
	TEST(GpuTopology, TestFindBestEdge) {
	std::vector<int> W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
	2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
	2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
	std::vector<int> P(8, 0);
	std::unordered_set<int> used;

	int parent1 = 3;
	int dest1 = 0;
	std::vector<int> b1;
	int g1;
	std::vector<int> correct_b1 = {0, 2};
	int correct_g1 = 3;
	mxnet::kvstore::FindBestEdge(W, P, parent1, dest1, &b1, &g1);
	ASSERT_EQ(b1.size(), correct_b1.size());
	for (unsigned i = 0; i < b1.size(); ++i)
	ASSERT_EQ(b1[i], correct_b1[i]);
	ASSERT_EQ(g1, correct_g1);

	// {-1}, 0 indicates no best edge found
	int parent2 = 4;
	int dest2 = 1;
	std::vector<int> b2;
	int g2;
	std::vector<int> correct_b2 = {-1};
	int correct_g2 = 0;
	mxnet::kvstore::FindBestEdge(W, P, parent2, dest2, &b2, &g2);
	ASSERT_EQ(b2.size(), correct_b2.size());
	for (unsigned i = 0; i < b2.size(); ++i)
	ASSERT_EQ(b2[i], correct_b2[i]);
	ASSERT_EQ(g2, correct_g2);
	}

	// KLGenerateBinaryTreeTest
	TEST(GpuTopology, TestKLGenerateBinaryTree1) {
	std::vector<int> W = {0, 2, 3, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
	2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 3, 3, 1, 3, 1, 1,
	2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
	std::vector<int> P = {0, 1, 1, 0, 2, 3, 3, 2};
	std::vector<std::pair<int, int>> cluster_pairs;
	cluster_pairs.push_back(std::pair<int, int>(0, -2));
	cluster_pairs.push_back(std::pair<int, int>(1, -2));
	cluster_pairs.push_back(std::pair<int, int>(2, -2));
	cluster_pairs.push_back(std::pair<int, int>(3, -2));
	std::unordered_set<int> roots = {0, 2, 4, 6};
	std::vector<size_t> topo = {0, 2, 4, 6};
	std::vector<size_t> scan(2, 0);
	std::mt19937 gen(1);
	mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, &scan, &gen);
	std::vector<size_t> correct_topo = {0, 2, 4, 6, 0, 3, 2, 1, 4, 7, 6, 5};
	std::vector<size_t> correct_scan = {0, 0, 4};
	ASSERT_EQ(topo.size(), correct_topo.size());
	for (unsigned i = 0; i < topo.size(); ++i)
	ASSERT_EQ(topo[i], correct_topo[i]);
	ASSERT_EQ(scan.size(), correct_scan.size());
	for (unsigned i = 0; i < scan.size(); ++i)
	ASSERT_EQ(scan[i], correct_scan[i]);
	}

	TEST(GpuTopology, TestKLGenerateBinaryTree2) {
	std::vector<int> W = {0, 2, 3, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
	2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 3, 3, 1, 3, 1, 1,
	2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
	std::vector<int> P = {0, 1, 1, 0, 2, 3, 3, 2};
	std::vector<std::pair<int, int>> cluster_pairs;
	cluster_pairs.push_back(std::pair<int, int>(0, -2));
	cluster_pairs.push_back(std::pair<int, int>(1, -2));
	cluster_pairs.push_back(std::pair<int, int>(2, -2));
	cluster_pairs.push_back(std::pair<int, int>(3, -2));
	std::unordered_set<int> roots = {0, 2, 4, 6};
	std::vector<size_t> topo = {0, 6, 4, 2};
	std::vector<size_t> scan(2, 0);
	std::mt19937 gen(1);
	mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, &scan, &gen);
	std::vector<size_t> correct_topo = {0, 6, 4, 2, 0, 3, 6, 5, 4, 7, 2, 1};
	std::vector<size_t> correct_scan = {0, 0, 4};
	ASSERT_EQ(topo.size(), correct_topo.size());
	for (unsigned i = 0; i < topo.size(); ++i)
	ASSERT_EQ(topo[i], correct_topo[i]);
	ASSERT_EQ(scan.size(), correct_scan.size());
	for (unsigned i = 0; i < scan.size(); ++i)
	ASSERT_EQ(scan[i], correct_scan[i]);
	}

	// UpdateWeightTest
	TEST(GpuTopology, TestUpdateWeight) {
	std::vector<float> W = {0.f, 1.f, 1.f, 0.f};
	std::vector<size_t> topo = {1, 1, 0};
	int num_gpus = 2;
	float alpha = 0.7;
	std::vector<float> correct_W = {0.f, 0.7f, 0.7f, 0.f};
	mxnet::kvstore::UpdateWeight(&W, topo, num_gpus, alpha);
	ASSERT_EQ(W.size(), correct_W.size());
	for (unsigned i = 0; i < W.size(); ++i) {
	ASSERT_EQ(W[i], correct_W[i]);
	}
	}

	// ComputeTreesFromRoot
	TEST(GpuTopology, TestComputeTreesFromRoot1) {
	std::vector<float> W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
	2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
	2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};
	int num_gpus = 8;
	int root = 0;
	float alpha = 0.7;
	bool backtrack = true;
	unsigned correct_topo_size = 15;
	unsigned correct_scan_size = 5;
	std::vector<size_t> topo;
	std::vector<size_t> scan;

	mxnet::kvstore::ComputeTreesFromRoot(&W, num_gpus, root, alpha, backtrack, &topo, &scan);

	ASSERT_EQ(topo.size(), correct_topo_size);
	ASSERT_EQ(scan.size(), correct_scan_size);
	}

	// IsConnected
	// Test on graph that is "disconnected" by NVLink
	TEST(GpuTopology, TestIsConnected1) {
	std::vector<float> W = {0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0};
	int num_gpus = 4;

	bool connected = mxnet::kvstore::IsConnected(W, num_gpus);

	bool correct_connected = false;
	ASSERT_EQ(connected, correct_connected);
	}

	// IsConnected
	// Test on graph that is "disconnected" by NVLink
	TEST(GpuTopology, TestIsConnected2) {
	std::vector<float> W = {1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1};
	int num_gpus = 4;

	bool connected = mxnet::kvstore::IsConnected(W, num_gpus);

	bool correct_connected = false;
	ASSERT_EQ(connected, correct_connected);
	}

	// IsConnected
	// Test on graph that is "disconnected" by NVLink
	TEST(GpuTopology, TestIsConnected3) {
	std::vector<float> W = {1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1};
	int num_gpus = 4;

	bool connected = mxnet::kvstore::IsConnected(W, num_gpus);

	bool correct_connected = true;
	ASSERT_EQ(connected, correct_connected);
	}

	// ComputeTreesTest with backtracking
	TEST(GpuTopology, TestComputeTrees1) {
	std::mt19937 gen(1);
	float alpha = 0.7;
	bool backtrack = true;
	for (int num_gpus = 2; num_gpus <= 8; ++num_gpus) {
	LOG(INFO) << "Testing " << num_gpus << " x " << num_gpus;
	for (int i = 0; i < 5; ++i) {
	TestComputeTreesRandomized(num_gpus, alpha, backtrack, &gen);
	}
	}
	}

	// ComputeTreesTest with Kernighan-Lin
	TEST(GpuTopology, TestComputeTrees2) {
	std::mt19937 gen(1);
	float alpha = 0.7;
	bool backtrack = false;
	// Do 5 randomized tests per GPU count from 2 to 16
	for (int num_gpus = 2; num_gpus <= 16; ++num_gpus) {
	LOG(INFO) << "Testing " << num_gpus << " x " << num_gpus;
	for (int i = 0; i < 5; ++i) {
	TestComputeTreesRandomized(num_gpus, alpha, backtrack, &gen);
	}
	}
	}

	TEST(GpuTopology, TestPermuteMatrix) {
	std::vector<int> W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1,
	2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1,
	2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0};

	std::vector<int> P1 = {0, 1, 2, 3, 4, 5, 6, 7};
	std::vector<int> A(8 * 8, 0);
	PermuteMatrix(W, P1, &A);
	for (unsigned i = 0; i < W.size(); ++i)
	ASSERT_EQ(A[i], W[i]);
	}

	TEST(GpuTopology, TestKernighanLin1) {
	std::vector<float> W = {0, 1, 2, 3, 2, 4, 1, 0, 1, 4, 2, 1, 2, 1, 0, 3, 2, 1,
	3, 4, 3, 0, 4, 3, 2, 2, 2, 4, 0, 2, 4, 1, 1, 3, 2, 0};
	std::vector<int> P(6, 0);
	std::vector<std::pair<int, int>> cluster_pairs;
	int num_partitions = 1;
	std::mt19937 gen(1);
	bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, &cluster_pairs, &gen);

	std::vector<std::pair<int, int>> correct_pairs;
	correct_pairs.push_back(std::pair<int, int>(0, 1));
	std::vector<int> correct_P = {0, 1, 0, 1, 1, 0};
	ASSERT_EQ(stop, false);
	ASSERT_EQ(num_partitions, 2);
	ASSERT_EQ(cluster_pairs.size(), correct_pairs.size());
	for (unsigned i = 0; i < cluster_pairs.size(); ++i) {
	ASSERT_EQ(cluster_pairs[i].first, correct_pairs[i].first);
	ASSERT_EQ(cluster_pairs[i].second, correct_pairs[i].second);
	}
	ASSERT_EQ(P.size(), correct_P.size());
	unsigned error = 0;
	for (unsigned i = 0; i < P.size(); ++i) {
	if (P[i] != correct_P[i])
	error++;
	}
	EXPECT_TRUE(error == 0 \|\| error == P.size())
	<< "Where real value: " << error << " not equal neither: " << 0 << " nor: " << P.size()
	<< ".";
	}

	TEST(GpuTopology, TestKernighanLin2) {
	std::vector<float> W = {0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
	1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
	1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0};
	std::vector<int> P(8, 0);
	std::vector<std::pair<int, int>> cluster_pairs;
	int num_partitions = 1;
	std::mt19937 gen(1);
	bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, &cluster_pairs, &gen);

	std::vector<std::pair<int, int>> correct_pairs;
	correct_pairs.push_back(std::pair<int, int>(0, 1));
	std::vector<int> correct_P = {0, 0, 1, 1, 0, 0, 1, 1};
	ASSERT_EQ(stop, false);
	ASSERT_EQ(num_partitions, 2);
	ASSERT_EQ(cluster_pairs.size(), correct_pairs.size());
	for (unsigned i = 0; i < cluster_pairs.size(); ++i) {
	ASSERT_EQ(cluster_pairs[i].first, correct_pairs[i].first);
	ASSERT_EQ(cluster_pairs[i].second, correct_pairs[i].second);
	}
	ASSERT_EQ(P.size(), correct_P.size());
	unsigned error = 0;
	for (unsigned i = 0; i < P.size(); ++i) {
	if (P[i] != correct_P[i])
	error++;
	}
	EXPECT_TRUE(error == 0 \|\| error == P.size())
	<< "Where real value: " << error << " not equal neither: " << 0 << " nor: " << P.size()
	<< ".";
	}

	#endif // MXNET_USE_CUDA