include/singa/core/device.h - singa - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef SINGA_CORE_DEVICE_H_
 #define SINGA_CORE_DEVICE_H_

 #include <chrono>
 #include <functional>
 #include <map>
 #include <memory>
 #include <mutex>
 #include <string>
 #include <type_traits>
 #include <unordered_set>
 #include <vector>

 #include "singa/core/common.h"
 #include "singa/core/memory.h"
 #include "singa/core/scheduler.h"
 #include "singa/proto/core.pb.h"
 #include "singa/singa_config.h"
 #include "singa/utils/safe_queue.h"

 #ifdef USE_CUDA
 #include "singa/utils/cuda_utils.h"
 #endif  // USE_CUDA

 #ifdef USE_OPENCL
 #include "singa/utils/opencl_utils.h"
 #endif  // USE_OPENCL

 using std::function;
 using std::shared_ptr;
 using std::string;
 using std::vector;

 namespace singa {

 /// Abstract interface for a device that allocates memory and executes Tensor
 /// operations.
 /// There are three types of devices distinguished by their programming
 /// languages, namely cpp, cuda and opencl.
 class Device {
  public:
   // Device() = default;
   virtual ~Device();
   /// Constructor with device ID and number of executors (e.g., cuda streams).
   Device(int id, int num_executors);

   void Reset();

   /// Set the random seed; every concrete device supplies its own version.
   virtual void SetRandSeed(unsigned seed) = 0;

   /// Set the graph_enabled_ flag reported by graph_enabled().
   void EnableGraph(bool enable) { graph_enabled_ = enable; }

   /// Set the class-wide (static) lazy_alloc_ flag.
   static void EnableLazyAlloc(bool enable) { lazy_alloc_ = enable; }

   /// Called by Tensor.
   Block* NewBlock(int size);

   /// Called by Tensor.
   void FreeBlock(Block* block);

   /// Return the size (bytes) of memory in use
   /// TODO(wangwei) override this function for all devices.
   virtual size_t GetAllocatedMem() { return 0u; }

   /// Copy data within or across devices.
   virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                               CopyDirection direction, int dst_offset,
                               int src_offset, Context* ctx);

   void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
                            size_t dst_offset = 0, Context* ctx = nullptr);
   /// Submit the operation to the device, which may execute it right now or
   /// delay it depending on the scheduler.
   /// Note: both block lists are taken by value, so every call copies them.
   void Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
             const vector<Block*> write_blocks, string op_name = "no_name",
             bool use_rand_generator = false);

   void RunGraph(bool serial = false);

   /// Reset the computational graph. Does nothing when no graph is attached
   /// (graph_ defaults to nullptr) instead of dereferencing a null pointer.
   void ResetGraph() {
     if (graph_ != nullptr) graph_->Reset();
   }

   // Wait for one event.
   // void WaitFor();

   /// wait for all operations submitted to this device.
   virtual void Sync();

   int id() const { return id_; }

   /// Return the programming language for this device.
   LangType lang() const { return lang_; }

   /// Return the device context. The executor index is currently ignored
   /// because there is only one context (see the TODO on ctx_).
   Context* context(int /*k*/) { return &ctx_; }

   bool graph_enabled() const { return graph_enabled_; }

   /// Verbosity of the time profiling function:
   /// verbosity == 0 (default) -> no logging
   /// verbosity == 1 -> display forward and backward propagation time
   /// verbosity == 2 -> display each operation time (OP_ID, op name, time)
   int verbosity() const { return verbosity_; }
   /// the number of initial iteration that is skipped for time profiling
   int skip_iteration() const { return skip_iteration_; }

   /// Return the host device recorded in host_; subclasses may override.
   virtual std::shared_ptr<Device> host() const { return host_; }

   void PrintTimeProfiling();
   void SetVerbosity(int verbosity) { verbosity_ = verbosity; }
   void SetSkipIteration(int skip_iteration) {
     skip_iteration_ = skip_iteration;
   }

  protected:
   /// Execute one operation on one executor.
   virtual void DoExec(function<void(Context*)>&& fn, int executor) = 0;
   virtual void TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
                                    Node* node) = 0;
   virtual void EvaluateTimeElapsed(Node* node) = 0;

   virtual void CopyToFrom(void* dst, const void* src, size_t nBytes,
                           CopyDirection direction, Context* ctx) = 0;

   /// Allocate device memory.
   virtual void* Malloc(int size) = 0;

   /// Free device memory.
   virtual void Free(void* ptr) = 0;

  private:
   // Kept private so that outside code must use Device(int, int).
   Device() {}

  protected:
   friend class Block;
   friend class Graph;

   int id_ = 0;
   int num_executors_ = 0;
   unsigned seed_ = 0;
   bool graph_enabled_ = false;
   int verbosity_ = 0;
   int skip_iteration_ = 5;
   /// The computational graph
   Graph* graph_ = nullptr;
   /// Programming language type, could be kCpp, kCuda, kOpencl.
   /// Value-initialized so lang() never reads an indeterminate value.
   LangType lang_{};
   /// The host device
   std::shared_ptr<Device> host_;
   // TODO(wangwei) define multiple contexts, one per executor
   Context ctx_;
   // Scheduler* scheduler_ = nullptr;
   // VirtualMemory* vm_ = nullptr;
   // SafeQueue<Operation> op_queue_;
   // SafeQueue<Operation> op_log_;

   static bool lazy_alloc_;
 };

 /// Shared handle to the default host device; only declared here, the
 /// definition lives in another translation unit.
 /// The original note calls it a singleton "CppDevice" serving as the host for
 /// all devices (presumably a CppCPU instance -- TODO confirm at the
 /// definition).
 extern std::shared_ptr<Device> defaultDevice;

 /// Represent a CPU device which may have multiple threads/executors.
 /// It runs cpp code.
 class CppCPU : public Device {
  public:
   ~CppCPU();
   CppCPU();

   std::shared_ptr<Device> host() const override { return defaultDevice; }
   void SetRandSeed(unsigned seed) override;

  protected:
   void DoExec(function<void(Context*)>&& fn, int executor) override;
   void TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
                            Node* node) override;
   void EvaluateTimeElapsed(Node* node) override;

   void CopyToFrom(void* dst, const void* src, size_t nBytes,
                   CopyDirection direction, Context* ctx) override;

   /// Allocate cpu memory.
   void* Malloc(int size) override;

   /// Free cpu memory.
   void Free(void* ptr) override;
 };

 // Implement Device using OpenCL libs.
 // class OpenclDevice : public Device { };

 #ifdef USE_CUDA
 // Represent a Nvidia GPU which runs cuda code.
 class CudaGPU : public Device {
  public:
   ~CudaGPU();
   /// Construct the device using default mem pool setting.
   CudaGPU(int id = 0);
   /// Construct the device given the physical device ID and memory pool.
   CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool);

   void SetRandSeed(unsigned seed) override;
   size_t GetAllocatedMem() override;
   void Sync() override;

  protected:
   void DoExec(function<void(Context*)>&& fn, int executor) override;
   void TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
                            Node* node) override;
   void EvaluateTimeElapsed(Node* node) override;

   void SyncBeforeCountingTime();

   void CopyToFrom(void* dst, const void* src, size_t nBytes,
                   CopyDirection direction, Context* ctx) override;

   /// Allocate cpu memory.
   void* Malloc(int size) override;

   /// Free cpu memory.
   void Free(void* ptr) override;

  private:
   void Setup();

  private:
   shared_ptr<DeviceMemPool> pool_;
 };

 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.

 #endif  // USE_CUDA

 #ifdef USE_OPENCL

 // Implement Device using OpenCL libs.
 class OpenclDevice : public singa::Device {
  public:
   // TODO: Constructor arguments to consider:
   // Path to kernel sources?
   // Select only certain device types?
   OpenclDevice(int id = 0, int num_executors = 1);
   ~OpenclDevice();

   // Overridden, inherited methods
   void SetRandSeed(unsigned seed) override;

   virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                               CopyDirection direction, int dst_offset = 0,
                               int src_offset = 0,
                               Context* ctx = nullptr) override;

  protected:
   /// The OpenCL device that this object represents.
   /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
   /// object.
   viennacl::ocl::device this_device;

   /// Each OpenclDevice has one OpenCL context. It is created along with the
   /// creation of this object.
   viennacl::ocl::context vcl_ctx;

   /// Searches the given paths for all .cl files and builds
   /// OpenCL programs, then stores them in the Kernels map.
   void BuildPrograms();

   // Overridden, inherited methods.

   void DoExec(function<void(Context*)>&& fn, int executor) override;

   void CopyToFrom(void* dst, const void* src, size_t nBytes,
                   CopyDirection direction, Context* ctx = nullptr) override;

   /// Allocates memory on this OpenCL device
   /// by creating and returning an empty cl::Buffer object.
   /// with the indicated size.
   void* Malloc(int size) override;

   /// Converts the void pointer into a Buffer object, then deletes the object.
   /// This has the effect of freeing up device memory.
   void Free(void* ptr) override;

  private:
   static const std::string cl_src_path;
 };
 #endif  // USE_OPENCL

 /// This class queries all available calculating devices on a given machine
 /// grouped according to manufacturer or device drivers. All methods should be
 /// static.
 /// If CUDA or OPENCL are not enabled, then the respective related methods
 /// should return something that indicates their absence (for example, 0
 /// devices); however they should always be available regardless of
 /// compile-time switches.
 /// NOTE(review): as written, the CUDA and OpenCL methods below are only
 /// declared when USE_CUDA / USE_OPENCL is defined.
 class Platform {
  public:
   /// Return the default host device
   static std::shared_ptr<Device> GetDefaultDevice() {
     // cannot reset cpu device, which leads to error
     // defaultDevice->Reset();
     return defaultDevice;
   }

 #ifdef USE_CUDA
   /// Return the number of total available GPUs
   static int GetNumGPUs();

   /// Return the device IDs of available GPUs.
   /// TODO(wangwei) return the IDs according to free memory in descending order
   static const std::vector<int> GetGPUIDs();

   /// Return the memory of a GPU <free, total>
   static const std::pair<size_t, size_t> GetGPUMemSize(const int device);

   /// Overload without a device argument; returns a list of <free, total>
   /// pairs (presumably one per GPU -- confirm against the implementation).
   static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize();

   /// Return a string containing all hardware info, e.g., version, memory size.
   static const std::string DeviceQuery(int id, bool verbose = false);

   /// Create a set of CudaGPU Device using 'num_devices' free GPUs.
   static const std::vector<std::shared_ptr<Device>> CreateCudaGPUs(
       const size_t num_devices, size_t init_size = 0);

   /// Create a set of CudaGPU Device using given GPU IDs.
   static const std::vector<std::shared_ptr<Device>> CreateCudaGPUsOn(
       const std::vector<int>& devices, size_t init_size = 0);

   static std::vector<std::shared_ptr<Device>> UsedDevice;
   /// This function is implemented by Caffe (http://caffe.berkeleyvision.org/).
   /// This function checks the availability of GPU #device_id.
   /// It attempts to create a context on the device by calling cudaFree(0).
   /// cudaSetDevice() alone is not sufficient to check the availability.
   /// It lazily records device_id, however, does not initialize a
   /// context. So it does not know if the host thread has the permission to use
   /// the device or not.
   ///
   /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
   /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
   /// even if the device is exclusively occupied by another process or thread.
   /// Cuda operations that initialize the context are needed to check
   /// the permission. cudaFree(0) is one of those with no side effect,
   /// except the context initialization.
   static bool CheckDevice(const int device_id);
   static std::mutex mtx_;
 #endif  // USE_CUDA

 #ifdef USE_OPENCL

   // The two counters below are static, in line with the class contract that
   // all methods are static; existing calls made on an instance still compile.
   // The `const int` return type is kept so that out-of-line definitions keep
   // matching.
   static const int GetNumOpenclPlatforms();

   static const int GetNumOpenclDevices();

   static const std::shared_ptr<Device> GetDefaultOpenclDevice();

 /// Create a \p num_devices set of valid OpenCL devices, regardless of
 /// platforms.  If there are fewer valid devices than requested, then this
 /// method will return as many as possible. If OpenCL is not in use, this
 /// method will return an empty array.
 //  static const std::vector<std::shared_ptr<Device>>
 //  CreateOpenclDevices(const size_t num_devices);

 /// Create a set of valid OpenCL devices, regardless of platforms, assigning
 /// \p id to each device in sequence.
 /// If there are fewer valid devices than requested, then this method will
 /// return as many as possible.
 /// If OpenCL is not in use, this method will return an empty array.
 //  const std::vector<std::shared_ptr<Device>>
 //  CreateOpenclDevices(const vector<int> &id);
 #endif  // USE_OPENCL
 };

 }  // namespace singa

 #endif  // SINGA_CORE_DEVICE_H_
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef SINGA_CORE_DEVICE_H_
	#define SINGA_CORE_DEVICE_H_

	#include <chrono>
	#include <functional>
	#include <map>
	#include <memory>
	#include <mutex>
	#include <string>
	#include <type_traits>
	#include <unordered_set>
	#include <vector>

	#include "singa/core/common.h"
	#include "singa/core/memory.h"
	#include "singa/core/scheduler.h"
	#include "singa/proto/core.pb.h"
	#include "singa/singa_config.h"
	#include "singa/utils/safe_queue.h"

	#ifdef USE_CUDA
	#include "singa/utils/cuda_utils.h"
	#endif // USE_CUDA

	#ifdef USE_OPENCL
	#include "singa/utils/opencl_utils.h"
	#endif // USE_OPENCL

	using std::function;
	using std::shared_ptr;
	using std::string;
	using std::vector;

	namespace singa {

	/// Abstract interface for a device that allocates memory and executes Tensor
	/// operations.
	/// There are three types of devices distinguished by their programming
	/// languages, namely cpp, cuda and opencl.
	class Device {
	 public:
	  // Device() = default;
	  virtual ~Device();
	  /// Constructor with device ID and number of executors (e.g., cuda streams).
	  Device(int id, int num_executors);

	  void Reset();

	  /// Set the random seed; every concrete device supplies its own version.
	  virtual void SetRandSeed(unsigned seed) = 0;

	  /// Set the graph_enabled_ flag reported by graph_enabled().
	  void EnableGraph(bool enable) { graph_enabled_ = enable; }

	  /// Set the class-wide (static) lazy_alloc_ flag.
	  static void EnableLazyAlloc(bool enable) { lazy_alloc_ = enable; }

	  /// Called by Tensor.
	  Block* NewBlock(int size);

	  /// Called by Tensor.
	  void FreeBlock(Block* block);

	  /// Return the size (bytes) of memory in use
	  /// TODO(wangwei) override this function for all devices.
	  virtual size_t GetAllocatedMem() { return 0u; }

	  /// Copy data within or across devices.
	  virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
	                              CopyDirection direction, int dst_offset,
	                              int src_offset, Context* ctx);

	  void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
	                           size_t dst_offset = 0, Context* ctx = nullptr);
	  /// Submit the operation to the device, which may execute it right now or
	  /// delay it depending on the scheduler.
	  /// The callback takes a Context* and the block lists hold Block*, matching
	  /// DoExec and write_blocks.
	  /// Note: both block lists are taken by value, so every call copies them.
	  void Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
	            const vector<Block*> write_blocks, string op_name = "no_name",
	            bool use_rand_generator = false);

	  void RunGraph(bool serial = false);

	  /// Reset the computational graph. Does nothing when no graph is attached
	  /// (graph_ defaults to nullptr) instead of dereferencing a null pointer.
	  void ResetGraph() {
	    if (graph_ != nullptr) graph_->Reset();
	  }

	  // Wait for one event.
	  // void WaitFor();

	  /// wait for all operations submitted to this device.
	  virtual void Sync();

	  int id() const { return id_; }

	  /// Return the programming language for this device.
	  LangType lang() const { return lang_; }

	  /// Return the device context. The executor index is currently ignored
	  /// because there is only one context (see the TODO on ctx_).
	  Context* context(int /*k*/) { return &ctx_; }

	  bool graph_enabled() const { return graph_enabled_; }

	  /// Verbosity of the time profiling function:
	  /// verbosity == 0 (default) -> no logging
	  /// verbosity == 1 -> display forward and backward propagation time
	  /// verbosity == 2 -> display each operation time (OP_ID, op name, time)
	  int verbosity() const { return verbosity_; }
	  /// the number of initial iteration that is skipped for time profiling
	  int skip_iteration() const { return skip_iteration_; }

	  /// Return the host device recorded in host_; subclasses may override.
	  virtual std::shared_ptr<Device> host() const { return host_; }

	  void PrintTimeProfiling();
	  void SetVerbosity(int verbosity) { verbosity_ = verbosity; }
	  void SetSkipIteration(int skip_iteration) {
	    skip_iteration_ = skip_iteration;
	  }

	 protected:
	  /// Execute one operation on one executor.
	  virtual void DoExec(function<void(Context*)>&& fn, int executor) = 0;
	  virtual void TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
	                                   Node* node) = 0;
	  virtual void EvaluateTimeElapsed(Node* node) = 0;

	  virtual void CopyToFrom(void* dst, const void* src, size_t nBytes,
	                          CopyDirection direction, Context* ctx) = 0;

	  /// Allocate device memory.
	  virtual void* Malloc(int size) = 0;

	  /// Free device memory.
	  virtual void Free(void* ptr) = 0;

	 private:
	  // Kept private so that outside code must use Device(int, int).
	  Device() {}

	 protected:
	  friend class Block;
	  friend class Graph;

	  int id_ = 0;
	  int num_executors_ = 0;
	  unsigned seed_ = 0;
	  bool graph_enabled_ = false;
	  int verbosity_ = 0;
	  int skip_iteration_ = 5;
	  /// The computational graph
	  Graph* graph_ = nullptr;
	  /// Programming language type, could be kCpp, kCuda, kOpencl.
	  /// Value-initialized so lang() never reads an indeterminate value.
	  LangType lang_{};
	  /// The host device
	  std::shared_ptr<Device> host_;
	  // TODO(wangwei) define multiple contexts, one per executor
	  Context ctx_;
	  // Scheduler* scheduler_ = nullptr;
	  // VirtualMemory* vm_ = nullptr;
	  // SafeQueue<Operation> op_queue_;
	  // SafeQueue<Operation> op_log_;

	  static bool lazy_alloc_;
	};

	/// Shared handle to the default host device; only declared here, the
	/// definition lives in another translation unit.
	/// The original note calls it a singleton "CppDevice" serving as the host for
	/// all devices (presumably a CppCPU instance -- TODO confirm at the
	/// definition).
	extern std::shared_ptr<Device> defaultDevice;

	/// Represent a CPU device which may have multiple threads/executors.
	/// It runs cpp code.
	class CppCPU : public Device {
	public:
	~CppCPU();
	CppCPU();

	std::shared_ptr<Device> host() const override { return defaultDevice; }
	void SetRandSeed(unsigned seed) override;

	protected:
	void DoExec(function<void(Context*)>&& fn, int executor) override;
	void TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
	Node* node) override;
	void EvaluateTimeElapsed(Node* node) override;

	void CopyToFrom(void* dst, const void* src, size_t nBytes,
	CopyDirection direction, Context* ctx) override;

	/// Allocate cpu memory.
	void* Malloc(int size) override;

	/// Free cpu memory.
	void Free(void* ptr) override;
	};

	// Implement Device using OpenCL libs.
	// class OpenclDevice : public Device { };

	#ifdef USE_CUDA
	// Represent a Nvidia GPU which runs cuda code.
	class CudaGPU : public Device {
	public:
	~CudaGPU();
	/// Construct the device using default mem pool setting.
	CudaGPU(int id = 0);
	/// Construct the device given the physical device ID and memory pool.
	CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool);

	void SetRandSeed(unsigned seed) override;
	size_t GetAllocatedMem() override;
	void Sync() override;

	protected:
	void DoExec(function<void(Context*)>&& fn, int executor) override;
	void TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
	Node* node) override;
	void EvaluateTimeElapsed(Node* node) override;

	void SyncBeforeCountingTime();

	void CopyToFrom(void* dst, const void* src, size_t nBytes,
	CopyDirection direction, Context* ctx) override;

	/// Allocate cpu memory.
	void* Malloc(int size) override;

	/// Free cpu memory.
	void Free(void* ptr) override;

	private:
	void Setup();

	private:
	shared_ptr<DeviceMemPool> pool_;
	};

	/// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.

	#endif // USE_CUDA

	#ifdef USE_OPENCL

	// Implement Device using OpenCL libs.
	class OpenclDevice : public singa::Device {
	public:
	// TODO: Constructor arguments to consider:
	// Path to kernel sources?
	// Select only certain device types?
	OpenclDevice(int id = 0, int num_executors = 1);
	~OpenclDevice();

	// Overridden, inherited methods
	void SetRandSeed(unsigned seed) override;

	virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
	CopyDirection direction, int dst_offset = 0,
	int src_offset = 0,
	Context* ctx = nullptr) override;

	protected:
	/// The OpenCL device that this object represents.
	/// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
	/// object.
	viennacl::ocl::device this_device;

	/// Each OpenclDevice has one OpenCL context. It is created along with the
	/// creation of this object.
	viennacl::ocl::context vcl_ctx;

	/// Searches the given paths for all .cl files and builds
	/// OpenCL programs, then stores them in the Kernels map.
	void BuildPrograms();

	// Overridden, inherited methods.

	void DoExec(function<void(Context*)>&& fn, int executor) override;

	void CopyToFrom(void* dst, const void* src, size_t nBytes,
	CopyDirection direction, Context* ctx = nullptr) override;

	/// Allocates memory on this OpenCL device
	/// by creating and returning an empty cl::Buffer object.
	/// with the indicated size.
	void* Malloc(int size) override;

	/// Converts the void pointer into a Buffer object, then deletes the object.
	/// This has the effect of freeing up device memory.
	void Free(void* ptr) override;

	private:
	static const std::string cl_src_path;
	};
	#endif // USE_OPENCL

	/// This class queries all available calculating devices on a given machine
	/// grouped according to manufacturer or device drivers. All methods should be
	/// static.
	/// If CUDA or OPENCL are not enabled, then the respective related methods
	/// should return something that indicates their absence (for example, 0
	/// devices); however they should always be available regardless of
	/// compile-time switches.
	/// NOTE(review): as written, the CUDA and OpenCL methods below are only
	/// declared when USE_CUDA / USE_OPENCL is defined.
	class Platform {
	 public:
	  /// Return the default host device
	  static std::shared_ptr<Device> GetDefaultDevice() {
	    // cannot reset cpu device, which leads to error
	    // defaultDevice->Reset();
	    return defaultDevice;
	  }

	#ifdef USE_CUDA
	  /// Return the number of total available GPUs
	  static int GetNumGPUs();

	  /// Return the device IDs of available GPUs.
	  /// TODO(wangwei) return the IDs according to free memory in descending order
	  static const std::vector<int> GetGPUIDs();

	  /// Return the memory of a GPU <free, total>
	  static const std::pair<size_t, size_t> GetGPUMemSize(const int device);

	  /// Overload without a device argument; returns a list of <free, total>
	  /// pairs (presumably one per GPU -- confirm against the implementation).
	  static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize();

	  /// Return a string containing all hardware info, e.g., version, memory size.
	  static const std::string DeviceQuery(int id, bool verbose = false);

	  /// Create a set of CudaGPU Device using 'num_devices' free GPUs.
	  static const std::vector<std::shared_ptr<Device>> CreateCudaGPUs(
	      const size_t num_devices, size_t init_size = 0);

	  /// Create a set of CudaGPU Device using given GPU IDs.
	  static const std::vector<std::shared_ptr<Device>> CreateCudaGPUsOn(
	      const std::vector<int>& devices, size_t init_size = 0);

	  static std::vector<std::shared_ptr<Device>> UsedDevice;
	  /// This function is implemented by Caffe (http://caffe.berkeleyvision.org/).
	  /// This function checks the availability of GPU #device_id.
	  /// It attempts to create a context on the device by calling cudaFree(0).
	  /// cudaSetDevice() alone is not sufficient to check the availability.
	  /// It lazily records device_id, however, does not initialize a
	  /// context. So it does not know if the host thread has the permission to use
	  /// the device or not.
	  ///
	  /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
	  /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
	  /// even if the device is exclusively occupied by another process or thread.
	  /// Cuda operations that initialize the context are needed to check
	  /// the permission. cudaFree(0) is one of those with no side effect,
	  /// except the context initialization.
	  static bool CheckDevice(const int device_id);
	  static std::mutex mtx_;
	#endif  // USE_CUDA

	#ifdef USE_OPENCL

	  // The two counters below are static, in line with the class contract that
	  // all methods are static; existing calls made on an instance still compile.
	  // The `const int` return type is kept so that out-of-line definitions keep
	  // matching.
	  static const int GetNumOpenclPlatforms();

	  static const int GetNumOpenclDevices();

	  static const std::shared_ptr<Device> GetDefaultOpenclDevice();

	  /// Create a \p num_devices set of valid OpenCL devices, regardless of
	  /// platforms. If there are fewer valid devices than requested, then this
	  /// method will return as many as possible. If OpenCL is not in use, this
	  /// method will return an empty array.
	  // static const std::vector<std::shared_ptr<Device>>
	  // CreateOpenclDevices(const size_t num_devices);

	  /// Create a set of valid OpenCL devices, regardless of platforms, assigning
	  /// \p id to each device in sequence.
	  /// If there are fewer valid devices than requested, then this method will
	  /// return as many as possible.
	  /// If OpenCL is not in use, this method will return an empty array.
	  // const std::vector<std::shared_ptr<Device>>
	  // CreateOpenclDevices(const vector<int> &id);
	#endif  // USE_OPENCL
	};

	} // namespace singa

	#endif // SINGA_CORE_DEVICE_H_