src/core/device/cuda_gpu.cc - singa - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 #include "singa/singa_config.h"
 #ifdef USE_CUDA
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <curand.h>

 #include <chrono>
 #include <iostream>

 #include "singa/core/device.h"
 #include "singa/utils/cuda_utils.h"
 namespace singa {

 const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
                                    cudaMemcpyDeviceToHost,
                                    cudaMemcpyDeviceToDevice};

 CudaGPU::~CudaGPU() {
   if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
   if (ctx_.curand_generator)
     CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
 #ifdef USE_CUDNN
   if (ctx_.cudnn_handle) {
     auto status = cudnnDestroy(ctx_.cudnn_handle);
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
   }
 #endif

   // Explicitly destroys and cleans up all resources associated with current
   // device
   cudaDeviceReset();
   // the returned code incidate "driver shutting down" after reset
 }
 const int kNumCudaStream = 1;

 CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
   MemPoolConf conf;
   conf.add_device(id);
   pool_ = std::make_shared<CnMemPool>(conf);
   Setup();
 }

 CudaGPU::CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool)
     : Device(id, kNumCudaStream) {
   CHECK(pool != nullptr);
   pool_ = pool;
   Setup();
 }

 void CudaGPU::Setup() {
   lang_ = kCuda;
   ctx_.stream = NULL;  // use the default sync stream

   // TODO(wangwei) create one handle for each steam?
   // Preserse for future use instead of default sync stream, for concurrency
   // cudaStreamCreate(&ctx_.stream);

 #ifdef USE_DIST
   CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.s, cudaStreamNonBlocking));
   CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.c1, cudaStreamNonBlocking));
   CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.c2, cudaStreamNonBlocking));
 #endif  // USE_DIST

   CUDA_CHECK(cudaSetDevice(id_));
   // use curandCreateGeneratorHost for CudaHost device
   CURAND_CHECK(
       curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
   CURAND_CHECK(curandSetStream(ctx_.curand_generator, ctx_.stream));
   auto seed = std::chrono::system_clock::now().time_since_epoch().count();
   SetRandSeed(seed);
   // TODO(wangwei) if one generator per stream, then need diff offset per gen?
   CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
   CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
   CUBLAS_CHECK(cublasSetStream(ctx_.cublas_handle, ctx_.stream));

 #ifdef USE_CUDNN
   // TODO(wangwei) create one handle for each stream?
   auto status = cudnnCreate(&ctx_.cudnn_handle);
   CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
   cudnnSetStream(ctx_.cudnn_handle, ctx_.stream);
 #endif  // USE_CUDNN
 }

 void CudaGPU::SetRandSeed(unsigned seed) {
   CHECK(ctx_.curand_generator);
   CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
 }

 void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }

 void CudaGPU::SyncBeforeCountingTime() {
   // synchronization before counting time
   bool previous_state = graph_enabled();
   graph_enabled_ = false;
   Sync();
   graph_enabled_ = previous_state;
 }

 void CudaGPU::EvaluateTimeElapsed(Node* node) {
   float totalTime;

   cudaEventElapsedTime(&totalTime, node->start_, node->end_);

   cudaEventDestroy(node->start_);
   cudaEventDestroy(node->end_);

   node->time_elapsed_inc(totalTime * 0.001);
 }

 void CudaGPU::TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
                                   Node* node) {
   // time profiling using cudaEvent
   cudaEventCreate(&(node->start_));
   cudaEventCreate(&(node->end_));

 #ifdef USE_DIST
   if (node->op_name().find("Dist") != std::string::npos) {
     if (node->op_name().find("Dist_s") != std::string::npos)
       cudaEventRecord(node->start_, ctx_.s);
     else if (node->op_name().find("Dist_c1") != std::string::npos)
       cudaEventRecord(node->start_, ctx_.c1);
     else if (node->op_name().find("Dist_c2") != std::string::npos)
       cudaEventRecord(node->start_, ctx_.c2);
     else if (node->op_name().find("Dist_c1c2") != std::string::npos)
       cudaEventRecord(node->start_, ctx_.c1);
   } else {
     cudaEventRecord(node->start_, ctx_.stream);
   }
 #else
   cudaEventRecord(node->start_, ctx_.stream);
 #endif  // USE_DIST

   fn(&ctx_);

 #ifdef USE_DIST
   if (node->op_name().find("Dist") != std::string::npos) {
     if (node->op_name().find("Dist_s") != std::string::npos)
       cudaEventRecord(node->end_, ctx_.s);
     else if (node->op_name().find("Dist_c1") != std::string::npos)
       cudaEventRecord(node->end_, ctx_.c1);
     else if (node->op_name().find("Dist_c2") != std::string::npos)
       cudaEventRecord(node->end_, ctx_.c2);
     else if (node->op_name().find("Dist_c1c2") != std::string::npos)
       cudaEventRecord(node->end_, ctx_.c2);
   } else {
     cudaEventRecord(node->end_, ctx_.stream);
   }
 #else
   cudaEventRecord(node->end_, ctx_.stream);
 #endif  // USE_DIST
 }

 void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
                          CopyDirection direction, Context* ctx) {
   // cudaMemcpy(dst, src, nBytes, copyKind[direction]);
   cudaMemcpyAsync(dst, src, nBytes, copyKind[direction], ctx_.stream);
 }

 size_t CudaGPU::GetAllocatedMem() {
   if (pool_ != nullptr) {
     auto ret = pool_->GetMemUsage();
     return ret.second - ret.first;
   }
   LOG(ERROR) << "The memory pool is not set";
   return 0u;
 }

 /// Allocate gpu memory.
 void* CudaGPU::Malloc(int size) {
   void* ptr = nullptr;
   if (size > 0) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Malloc((void**)&ptr, size);
     // Comment out for future analysis: without cnmem
     // CUDA_CHECK(cudaMemsetAsync(ptr, 0, size, ctx_.stream));
   }
   return ptr;
 }

 /// Free gpu memory.
 void CudaGPU::Free(void* ptr) {
   if (ptr != nullptr) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Free(ptr);
   }
 }

 void CudaGPU::Sync() {
   Exec([this](Context* ctx) { CUDA_CHECK(cudaDeviceSynchronize()); }, {}, {},
        "Waiting");
 }

 }  // namespace singa
 #endif  // USE_CUDA
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	#include "singa/singa_config.h"
	#ifdef USE_CUDA
	#include <cublas_v2.h>
	#include <cuda.h>
	#include <cuda_runtime.h>
	#include <curand.h>

	#include <chrono>
	#include <iostream>

	#include "singa/core/device.h"
	#include "singa/utils/cuda_utils.h"
	namespace singa {

	const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
	cudaMemcpyDeviceToHost,
	cudaMemcpyDeviceToDevice};

	CudaGPU::~CudaGPU() {
	if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
	if (ctx_.curand_generator)
	CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
	#ifdef USE_CUDNN
	if (ctx_.cudnn_handle) {
	auto status = cudnnDestroy(ctx_.cudnn_handle);
	CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
	}
	#endif

	// Explicitly destroys and cleans up all resources associated with current
	// device
	cudaDeviceReset();
	// the returned code incidate "driver shutting down" after reset
	}
	const int kNumCudaStream = 1;

	CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
	MemPoolConf conf;
	conf.add_device(id);
	pool_ = std::make_shared<CnMemPool>(conf);
	Setup();
	}

	CudaGPU::CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool)
	: Device(id, kNumCudaStream) {
	CHECK(pool != nullptr);
	pool_ = pool;
	Setup();
	}

	void CudaGPU::Setup() {
	lang_ = kCuda;
	ctx_.stream = NULL; // use the default sync stream

	// TODO(wangwei) create one handle for each steam?
	// Preserse for future use instead of default sync stream, for concurrency
	// cudaStreamCreate(&ctx_.stream);

	#ifdef USE_DIST
	CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.s, cudaStreamNonBlocking));
	CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.c1, cudaStreamNonBlocking));
	CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.c2, cudaStreamNonBlocking));
	#endif // USE_DIST

	CUDA_CHECK(cudaSetDevice(id_));
	// use curandCreateGeneratorHost for CudaHost device
	CURAND_CHECK(
	curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
	CURAND_CHECK(curandSetStream(ctx_.curand_generator, ctx_.stream));
	auto seed = std::chrono::system_clock::now().time_since_epoch().count();
	SetRandSeed(seed);
	// TODO(wangwei) if one generator per stream, then need diff offset per gen?
	CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
	CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
	CUBLAS_CHECK(cublasSetStream(ctx_.cublas_handle, ctx_.stream));

	#ifdef USE_CUDNN
	// TODO(wangwei) create one handle for each stream?
	auto status = cudnnCreate(&ctx_.cudnn_handle);
	CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
	cudnnSetStream(ctx_.cudnn_handle, ctx_.stream);
	#endif // USE_CUDNN
	}

	void CudaGPU::SetRandSeed(unsigned seed) {
	CHECK(ctx_.curand_generator);
	CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
	}

	void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }

	void CudaGPU::SyncBeforeCountingTime() {
	// synchronization before counting time
	bool previous_state = graph_enabled();
	graph_enabled_ = false;
	Sync();
	graph_enabled_ = previous_state;
	}

	void CudaGPU::EvaluateTimeElapsed(Node* node) {
	float totalTime;

	cudaEventElapsedTime(&totalTime, node->start_, node->end_);

	cudaEventDestroy(node->start_);
	cudaEventDestroy(node->end_);

	node->time_elapsed_inc(totalTime * 0.001);
	}

	void CudaGPU::TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
	Node* node) {
	// time profiling using cudaEvent
	cudaEventCreate(&(node->start_));
	cudaEventCreate(&(node->end_));

	#ifdef USE_DIST
	if (node->op_name().find("Dist") != std::string::npos) {
	if (node->op_name().find("Dist_s") != std::string::npos)
	cudaEventRecord(node->start_, ctx_.s);
	else if (node->op_name().find("Dist_c1") != std::string::npos)
	cudaEventRecord(node->start_, ctx_.c1);
	else if (node->op_name().find("Dist_c2") != std::string::npos)
	cudaEventRecord(node->start_, ctx_.c2);
	else if (node->op_name().find("Dist_c1c2") != std::string::npos)
	cudaEventRecord(node->start_, ctx_.c1);
	} else {
	cudaEventRecord(node->start_, ctx_.stream);
	}
	#else
	cudaEventRecord(node->start_, ctx_.stream);
	#endif // USE_DIST

	fn(&ctx_);

	#ifdef USE_DIST
	if (node->op_name().find("Dist") != std::string::npos) {
	if (node->op_name().find("Dist_s") != std::string::npos)
	cudaEventRecord(node->end_, ctx_.s);
	else if (node->op_name().find("Dist_c1") != std::string::npos)
	cudaEventRecord(node->end_, ctx_.c1);
	else if (node->op_name().find("Dist_c2") != std::string::npos)
	cudaEventRecord(node->end_, ctx_.c2);
	else if (node->op_name().find("Dist_c1c2") != std::string::npos)
	cudaEventRecord(node->end_, ctx_.c2);
	} else {
	cudaEventRecord(node->end_, ctx_.stream);
	}
	#else
	cudaEventRecord(node->end_, ctx_.stream);
	#endif // USE_DIST
	}

	void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
	CopyDirection direction, Context* ctx) {
	// cudaMemcpy(dst, src, nBytes, copyKind[direction]);
	cudaMemcpyAsync(dst, src, nBytes, copyKind[direction], ctx_.stream);
	}

	size_t CudaGPU::GetAllocatedMem() {
	if (pool_ != nullptr) {
	auto ret = pool_->GetMemUsage();
	return ret.second - ret.first;
	}
	LOG(ERROR) << "The memory pool is not set";
	return 0u;
	}

	/// Allocate gpu memory.
	void* CudaGPU::Malloc(int size) {
	void* ptr = nullptr;
	if (size > 0) {
	CUDA_CHECK(cudaSetDevice(id_));
	pool_->Malloc((void**)&ptr, size);
	// Comment out for future analysis: without cnmem
	// CUDA_CHECK(cudaMemsetAsync(ptr, 0, size, ctx_.stream));
	}
	return ptr;
	}

	/// Free gpu memory.
	void CudaGPU::Free(void* ptr) {
	if (ptr != nullptr) {
	CUDA_CHECK(cudaSetDevice(id_));
	pool_->Free(ptr);
	}
	}

	void CudaGPU::Sync() {
	Exec([this](Context* ctx) { CUDA_CHECK(cudaDeviceSynchronize()); }, {}, {},
	"Waiting");
	}

	} // namespace singa
	#endif // USE_CUDA