blob: 6025a5e86684873da519948b4ce84e3605a42600 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "singa/singa_config.h"
#ifdef USE_CUDA
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <chrono>
#include <iostream>
#include "singa/core/device.h"
#include "singa/utils/cuda_utils.h"
namespace singa {
const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
cudaMemcpyDeviceToHost,
cudaMemcpyDeviceToDevice};
CudaGPU::~CudaGPU() {
if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
if (ctx_.curand_generator)
CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
#ifdef USE_CUDNN
if (ctx_.cudnn_handle) {
auto status = cudnnDestroy(ctx_.cudnn_handle);
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
}
#endif
// Explicitly destroys and cleans up all resources associated with current
// device
cudaDeviceReset();
// the returned code incidate "driver shutting down" after reset
}
const int kNumCudaStream = 1;
CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
MemPoolConf conf;
conf.add_device(id);
pool_ = std::make_shared<CnMemPool>(conf);
Setup();
}
CudaGPU::CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool)
: Device(id, kNumCudaStream) {
CHECK(pool != nullptr);
pool_ = pool;
Setup();
}
void CudaGPU::Setup() {
lang_ = kCuda;
ctx_.stream = NULL; // use the default sync stream
// TODO(wangwei) create one handle for each steam?
// Preserse for future use instead of default sync stream, for concurrency
// cudaStreamCreate(&ctx_.stream);
#ifdef USE_DIST
CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.s, cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.c1, cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&ctx_.c2, cudaStreamNonBlocking));
#endif // USE_DIST
CUDA_CHECK(cudaSetDevice(id_));
// use curandCreateGeneratorHost for CudaHost device
CURAND_CHECK(
curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CHECK(curandSetStream(ctx_.curand_generator, ctx_.stream));
auto seed = std::chrono::system_clock::now().time_since_epoch().count();
SetRandSeed(seed);
// TODO(wangwei) if one generator per stream, then need diff offset per gen?
CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
CUBLAS_CHECK(cublasSetStream(ctx_.cublas_handle, ctx_.stream));
#ifdef USE_CUDNN
// TODO(wangwei) create one handle for each stream?
auto status = cudnnCreate(&ctx_.cudnn_handle);
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
cudnnSetStream(ctx_.cudnn_handle, ctx_.stream);
#endif // USE_CUDNN
}
void CudaGPU::SetRandSeed(unsigned seed) {
CHECK(ctx_.curand_generator);
CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
}
void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }
void CudaGPU::SyncBeforeCountingTime() {
// synchronization before counting time
bool previous_state = graph_enabled();
graph_enabled_ = false;
Sync();
graph_enabled_ = previous_state;
}
void CudaGPU::EvaluateTimeElapsed(Node* node) {
float totalTime;
cudaEventElapsedTime(&totalTime, node->start_, node->end_);
cudaEventDestroy(node->start_);
cudaEventDestroy(node->end_);
node->time_elapsed_inc(totalTime * 0.001);
}
void CudaGPU::TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
Node* node) {
// time profiling using cudaEvent
cudaEventCreate(&(node->start_));
cudaEventCreate(&(node->end_));
#ifdef USE_DIST
if (node->op_name().find("Dist") != std::string::npos) {
if (node->op_name().find("Dist_s") != std::string::npos)
cudaEventRecord(node->start_, ctx_.s);
else if (node->op_name().find("Dist_c1") != std::string::npos)
cudaEventRecord(node->start_, ctx_.c1);
else if (node->op_name().find("Dist_c2") != std::string::npos)
cudaEventRecord(node->start_, ctx_.c2);
else if (node->op_name().find("Dist_c1c2") != std::string::npos)
cudaEventRecord(node->start_, ctx_.c1);
} else {
cudaEventRecord(node->start_, ctx_.stream);
}
#else
cudaEventRecord(node->start_, ctx_.stream);
#endif // USE_DIST
fn(&ctx_);
#ifdef USE_DIST
if (node->op_name().find("Dist") != std::string::npos) {
if (node->op_name().find("Dist_s") != std::string::npos)
cudaEventRecord(node->end_, ctx_.s);
else if (node->op_name().find("Dist_c1") != std::string::npos)
cudaEventRecord(node->end_, ctx_.c1);
else if (node->op_name().find("Dist_c2") != std::string::npos)
cudaEventRecord(node->end_, ctx_.c2);
else if (node->op_name().find("Dist_c1c2") != std::string::npos)
cudaEventRecord(node->end_, ctx_.c2);
} else {
cudaEventRecord(node->end_, ctx_.stream);
}
#else
cudaEventRecord(node->end_, ctx_.stream);
#endif // USE_DIST
}
void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
CopyDirection direction, Context* ctx) {
// cudaMemcpy(dst, src, nBytes, copyKind[direction]);
cudaMemcpyAsync(dst, src, nBytes, copyKind[direction], ctx_.stream);
}
size_t CudaGPU::GetAllocatedMem() {
if (pool_ != nullptr) {
auto ret = pool_->GetMemUsage();
return ret.second - ret.first;
}
LOG(ERROR) << "The memory pool is not set";
return 0u;
}
/// Allocate gpu memory.
void* CudaGPU::Malloc(int size) {
void* ptr = nullptr;
if (size > 0) {
CUDA_CHECK(cudaSetDevice(id_));
pool_->Malloc((void**)&ptr, size);
// Comment out for future analysis: without cnmem
// CUDA_CHECK(cudaMemsetAsync(ptr, 0, size, ctx_.stream));
}
return ptr;
}
/// Free gpu memory.
void CudaGPU::Free(void* ptr) {
if (ptr != nullptr) {
CUDA_CHECK(cudaSetDevice(id_));
pool_->Free(ptr);
}
}
void CudaGPU::Sync() {
Exec([this](Context* ctx) { CUDA_CHECK(cudaDeviceSynchronize()); }, {}, {},
"Waiting");
}
} // namespace singa
#endif // USE_CUDA