blob: 810d41f5b190a0d00639aa244e2a9d97357c3ee3 [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
#include <type_traits>
#include <vector>
#include <string>
#include <functional>
#include <memory>
#include "singa/singa_config.h"
#include "singa/core/common.h"
#include "singa/core/memory.h"
#include "singa/core/scheduler.h"
#include "singa/proto/core.pb.h"
#ifdef USE_CUDA
#include "singa/utils/cuda_utils.h"
#endif // USE_CUDA
// cl2.hpp includes cl.h, do not re-include.
#include <unordered_map>
#include <CL/cl2.hpp>
#include "singa/utils/opencl_utils.h"
#endif // USE_OPENCL
using std::vector;
using std::string;
using std::function;
using std::shared_ptr;
namespace singa {
/// Abstract base class for devices that allocate memory and execute Tensor
/// operations.
/// There are three types of devices distinguished by their programming
/// languages, namely cpp, cuda and opencl.
/// Concrete devices implement the pure virtual hooks declared here:
/// SetRandSeed, DoExec, CopyToFrom, Malloc and Free.
class Device {
// Device() = default;
/// Virtual destructor with an empty body.
virtual ~Device() {}
/// Constructor with device ID and number of executors (e.g., cuda streams).
/// NOTE(review): an earlier version of this comment also listed a
/// "max mem size to use (in MB)" argument; the signature has no such
/// parameter.
Device(int id, int num_executors);
/// Set the random seed. Pure virtual; each device type provides its own
/// implementation.
virtual void SetRandSeed(unsigned seed) = 0;
/// Called by Tensor.
/// Returns a pointer to a Block; `size` is presumably in bytes -- TODO confirm.
Block* NewBlock(int size);
/// Called by Tensor.
/// Counterpart of NewBlock for a block that is no longer needed.
void FreeBlock(Block* block);
/// Return the size (bytes) of memory in use.
/// The base implementation always returns 0.
/// TODO(wangwei) override this function for all devices.
virtual size_t GetAllocatedMem() {
return 0u;
/// Copy data within or across devices.
/// Copies nBytes from `src` to `dst` in the given `direction`; the offsets
/// are presumably byte offsets into the respective blocks -- TODO confirm.
void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
CopyDirection direction, int dst_offset, int src_offset);
/// Copy nBytes from the host pointer `src` into the block `dst`.
/// `dst_offset` defaults to 0; presumably a byte offset into `dst` -- TODO
/// confirm.
void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
size_t dst_offset = 0);
/// Submit the operation to the device, which may execute it right now or
/// delay it depending on the scheduler.
/// `fn` is the operation, invoked with a Context pointer. `read_blocks` and
/// `write_blocks` are presumably the blocks the operation reads and writes --
/// TODO confirm. `use_rand_generator` defaults to false.
void Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
const vector<Block*> write_blocks,
bool use_rand_generator = false);
// Wait for one event.
// void WaitFor();
/// Wait for all operations submitted to this device.
void Sync();
/// Return the programming language for this device.
LangType lang() const {
return lang_;
/// Return the host device stored in host_. Virtual so that subclasses can
/// supply a different host.
virtual std::shared_ptr<Device> host() const { return host_;}
/// Return a pointer to this device's context.
/// The index `k` is currently ignored: only the single ctx_ member exists
/// (see the TODO on ctx_).
Context* context(int k) {
return &ctx_;
/// Return the device ID.
int id() const { return id_; }
/// Default constructor with an empty body; members with in-class
/// initializers (id_, num_executors_, seed_) keep those values.
Device() {};
/// Execute one operation on one executor.
virtual void DoExec(function<void(Context*)>&& fn, int executor) = 0;
/// Low-level copy of nBytes from `src` to `dst` in the given `direction`,
/// using the context `ctx`. Pure virtual.
virtual void CopyToFrom(void* dst, const void* src, size_t nBytes,
CopyDirection direction, Context* ctx) = 0;
/// Allocate device memory.
virtual void* Malloc(int size) = 0;
/// Free device memory.
virtual void Free(void* ptr) = 0;
/// Device ID, returned by id().
int id_ = 0;
/// Number of executors (e.g., cuda streams).
int num_executors_ = 0;
/// Random seed; presumably recorded by SetRandSeed -- TODO confirm.
unsigned seed_ = 0;
// Scheduler* scheduler_ = nullptr;
// VirtualMemory* vm_ = nullptr;
/// Programming language type, could be kCpp, kCuda, kOpencl
LangType lang_;
// SafeQueue<Operation> op_queue_;
// SafeQueue<Operation> op_log_;
/// The host device
std::shared_ptr<Device> host_;
// TODO(wangwei) define multiple contexts, one per executor
/// The single context of this device, handed out by context().
Context ctx_;
/// A singleton device used as the host for all devices. This is only a
/// declaration (extern); the object is defined elsewhere.
/// NOTE(review): presumably a CppCPU instance -- the original comment called
/// it a "CppDevice", a name that is not declared in this header; confirm.
extern std::shared_ptr<Device> defaultDevice;
/// Represent a CPU device which may have multiple threads/executors.
/// It runs cpp code.
class CppCPU : public Device {
/// Destructor with an empty body.
~CppCPU() {};
/// Return the global defaultDevice as the host, instead of the host_ member
/// used by the base class.
std::shared_ptr<Device> host() const override { return defaultDevice;}
/// Set the random seed for this CPU device.
void SetRandSeed(unsigned seed) override;
/// Execute one operation on one executor (overrides Device::DoExec).
void DoExec(function<void(Context*)>&& fn, int executor) override;
/// Copy nBytes from `src` to `dst` in the given `direction` (overrides
/// Device::CopyToFrom).
void CopyToFrom(void* dst, const void* src, size_t nBytes,
CopyDirection direction, Context* ctx) override;
/// Allocate cpu memory.
void* Malloc(int size) override;
/// Free cpu memory.
void Free(void* ptr) override;
// Implement Device using OpenCL libs.
// class OpenclDevice : public Device { };
#ifdef USE_CUDA
// Represent a Nvidia GPU which runs cuda code.
// Only compiled when USE_CUDA is defined.
class CudaGPU : public Device {
/// Construct the device using default mem pool setting.
/// `id` defaults to 0.
CudaGPU(int id = 0);
/// Construct the device given the physical device ID and memory pool.
CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool);
/// Set the random seed for this GPU device.
void SetRandSeed(unsigned seed) override;
/// Return the size of memory in use (overrides the base version, which
/// always returns 0).
size_t GetAllocatedMem() override;
/// Execute one operation on one executor (overrides Device::DoExec).
void DoExec(function<void(Context*)>&& fn, int executor) override;
/// Copy nBytes from `src` to `dst` in the given `direction` (overrides
/// Device::CopyToFrom).
void CopyToFrom(void* dst, const void* src, size_t nBytes,
CopyDirection direction, Context* ctx) override;
/// Allocate memory on this device (overrides Device::Malloc).
/// NOTE(review): this comment previously read "Allocate cpu memory", which
/// looks copied from CppCPU; presumably the allocation goes through pool_ --
/// confirm against the implementation.
void* Malloc(int size) override;
/// Free memory previously obtained from Malloc (overrides Device::Free).
void Free(void* ptr) override;
/// Setup helper for this device.
/// NOTE(review): presumably shared by both constructors; the implementation
/// is not visible in this header -- confirm.
void Setup();
/// Device memory pool; the two-argument constructor accepts one.
shared_ptr<DeviceMemPool> pool_;
/// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
#endif // USE_CUDA
// Implement Device using OpenCL libs.
class OpenclDevice : public singa::Device {
// TODO: Constructor arguments to consider:
// Path to kernel sources?
// Select only certain device types?
/// Construct the device; `id` defaults to 0 and `num_executors` to 1.
OpenclDevice(int id = 0, int num_executors = 1);
/// Get the specified kernel.
/// `kname` is the kernel name. `status` is an optional out-parameter
/// (defaults to nullptr), presumably receiving an OpenCL status code --
/// TODO confirm.
cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr);
/// Get the command queue associated with this device.
/// Returns the cmdq member by value.
cl::CommandQueue GetCmdQ() { return cmdq; }
/// Prints information about all Devices in each Platform.
void PrintAllDeviceInfo();
/// Prints status about CL source code builds.
void PrintClBuildInfo(cl::Program &p);
// Overridden, inherited methods
void SetRandSeed(unsigned seed) override;
// NOTE(review): the CopyDataFromHostPtr declaration below ends with a stray
// block-comment terminator, which suggests that it and CopyDataToFrom were
// commented out and the opening marker is not visible here -- confirm.
void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
CopyDirection direction, int dst_offset = 0,
int src_offset = 0);
void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0,
size_t dst_offset = 0) override;*/
/// The OpenCL device that this object represents.
/// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
/// object.
cl::Device this_device;
/// Each OpenclDevice has one OpenCL context. It is created along with the
/// creation of this object.
cl::Context ocl_ctx;
/// The CommandQueue that is associated with this device.
/// Since each OpenclDevice contains only one cl::Device and one cl::Context,
/// it naturally also contains one cl::CommandQueue that is associated
/// with said Device and Context.
cl::CommandQueue cmdq;
/// The kernels that have been compiled on this device, stored in a map
/// with string keys (presumably the kernel names passed to GetKernel).
std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
/// Searches the given paths for all .cl files and builds
/// OpenCL programs, then stores them in the Kernels map.
/// `kdir` defaults to cl_src_path.
void BuildPrograms(const std::string &kdir = cl_src_path);
// Overridden, inherited methods.
void DoExec(function<void(Context*)>&& fn, int executor) override;
// Unlike the base declaration, `ctx` has a default value (nullptr) here.
void CopyToFrom(void* dst, const void* src, size_t nBytes,
CopyDirection direction, Context* ctx = nullptr) override;
/// Allocates memory on this OpenCL device
/// by creating and returning an empty cl::Buffer object
/// with the indicated size.
void* Malloc(int size) override;
/// Converts the void pointer into a Buffer object, then deletes the object.
/// This has the effect of freeing up device memory.
void Free(void* ptr) override;
/// Copies a data block from host to device.
/// src: a pointer to an array of data.
/// dst: a pointer to a cl::Buffer object.
/// size: amount of data to copy (presumably in bytes -- TODO confirm).
void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size);
/// Reads a data block from device to host.
/// src: a pointer to a cl::Buffer object.
/// dst: a pointer to a malloc'ed empty array.
/// size: amount of data to copy (presumably in bytes -- TODO confirm).
void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size);
/// Duplicates a block of data on the device.
/// src: a pointer to the original cl::Buffer object.
/// dst: a pointer to the new cl::Buffer object to copy the data into.
/// size: amount of data to copy (presumably in bytes -- TODO confirm).
void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size);
/// Default value of BuildPrograms' `kdir` argument (the directory searched
/// for .cl kernel sources). Declared here; defined elsewhere.
static const std::string cl_src_path;
#endif // USE_OPENCL
/// This class queries all available computing devices on a given machine,
/// grouped according to manufacturer or device drivers. All methods should be
/// static.
/// If CUDA or OPENCL are not enabled, then the respective related methods should
/// return something that indicates their absence (for example, 0 devices);
/// however they should always be available regardless of compile-time switches.
class Platform {
/// Return the default host device (the global defaultDevice).
static std::shared_ptr<Device> GetDefaultDevice() {
return defaultDevice;
#ifdef USE_CUDA
/// Return the number of total available GPUs
static int GetNumGPUs();
/// Return the device IDs of available GPUs.
/// TODO(wangwei) return the IDs according to free memory in descending order
static const std::vector<int> GetGPUIDs();
/// Return the memory of the GPU `device` as a pair -- presumably
/// <free, total>, matching the overload below; TODO confirm.
static const std::pair<size_t, size_t> GetGPUMemSize(const int device);
/// Return the memory of a GPU <free, total>
/// This overload takes no device argument and returns a vector of such
/// pairs (presumably one per available GPU -- TODO confirm).
static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize();
/// Return a string containing all hardware info, e.g., version, memory size.
/// `verbose` defaults to false.
static const std::string DeviceQuery(int id, bool verbose = false);
/// Create a set of CudaGPU Device using 'num_devices' free GPUs.
/// `init_size` defaults to 0; presumably the initial memory pool size --
/// TODO confirm.
static const std::vector<std::shared_ptr<Device>>
CreateCudaGPUs(const size_t num_devices, size_t init_size = 0);
/// Create a set of CudaGPU Device using given GPU IDs.
/// `init_size` defaults to 0; presumably the initial memory pool size --
/// TODO confirm.
static const std::vector<std::shared_ptr<Device>>
CreateCudaGPUsOn(const std::vector<int> &devices, size_t init_size = 0);
#endif // USE_CUDA
/// Create a \p num_devices set of valid OpenCL devices, regardless of
/// platforms. If there are fewer valid devices than requested, then this
/// method will return as many as possible. If OpenCL is not in use, this
/// method will return an empty array.
/// NOTE(review): declared without `static`, unlike the other methods of this
/// class and contrary to the class-level comment -- confirm intent.
const std::vector<std::shared_ptr<Device> > CreateOpenclDevices(
const size_t num_devices);
/// Create a set of valid OpenCL devices, regardless of platforms, assigning
/// \p id to each device in sequence.
/// If there are fewer valid devices than requested, then this method will
/// return as many as possible.
/// If OpenCL is not in use, this method will return an empty array.
/// NOTE(review): also declared without `static` -- confirm intent.
const std::vector<std::shared_ptr<Device> >
CreateOpenclDevices(const vector<int> &id);
/// This function is implemented by Caffe.
/// This function checks the availability of GPU #device_id.
/// It attempts to create a context on the device by calling cudaFree(0).
/// cudaSetDevice() alone is not sufficient to check the availability.
/// It lazily records device_id, however, does not initialize a
/// context. So it does not know if the host thread has the permission to use
/// the device or not.
/// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
/// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
/// even if the device is exclusively occupied by another process or thread.
/// Cuda operations that initialize the context are needed to check
/// the permission. cudaFree(0) is one of those with no side effect,
/// except the context initialization.
static bool CheckDevice(const int device_id);
/// The OpenCL platform handle held by this class.
cl::Platform clPlatform;
#endif // USE_OPENCL
} // namespace singa