cpp/src/arrow/gpu/cuda_context.h - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include <cstdint>
 #include <memory>
 #include <string>

 #include "arrow/device.h"
 #include "arrow/result.h"
 #include "arrow/util/visibility.h"

 namespace arrow {
 namespace cuda {

 // Forward declaration
 class CudaContext;
 class CudaDevice;
 class CudaDeviceManager;
 class CudaBuffer;
 class CudaHostBuffer;
 class CudaIpcMemHandle;
 class CudaMemoryManager;

 // XXX Should CudaContext be merged into CudaMemoryManager?

 class ARROW_EXPORT CudaDeviceManager {
  public:
   static Result<CudaDeviceManager*> Instance();

   /// \brief Get a CudaDevice instance for a particular device
   /// \param[in] device_number the CUDA device number
   Result<std::shared_ptr<CudaDevice>> GetDevice(int device_number);

   /// \brief Get the CUDA driver context for a particular device
   /// \param[in] device_number the CUDA device number
   /// \return cached context
   Result<std::shared_ptr<CudaContext>> GetContext(int device_number);

   /// \brief Get the shared CUDA driver context for a particular device
   /// \param[in] device_number the CUDA device number
   /// \param[in] handle CUDA context handle created by another library
   /// \return shared context
   Result<std::shared_ptr<CudaContext>> GetSharedContext(int device_number, void* handle);

   /// \brief Allocate host memory with fast access to given GPU device
   /// \param[in] device_number the CUDA device number
   /// \param[in] nbytes number of bytes
   /// \return Host buffer or Status
   Result<std::shared_ptr<CudaHostBuffer>> AllocateHost(int device_number, int64_t nbytes);

   /// \brief Free host memory
   ///
   /// The given memory pointer must have been allocated with AllocateHost.
   Status FreeHost(void* data, int64_t nbytes);

   int num_devices() const;

  private:
   CudaDeviceManager();
   static std::unique_ptr<CudaDeviceManager> instance_;

   class Impl;
   std::unique_ptr<Impl> impl_;

   friend class CudaContext;
   friend class CudaDevice;
 };

 /// \brief Device implementation for CUDA
 ///
 /// Each CudaDevice instance is tied to a particular CUDA device
 /// (identified by its logical device number).
 class ARROW_EXPORT CudaDevice : public Device {
  public:
   const char* type_name() const override;
   std::string ToString() const override;
   bool Equals(const Device&) const override;
   std::shared_ptr<MemoryManager> default_memory_manager() override;

   /// \brief Return a CudaDevice instance for a particular device
   /// \param[in] device_number the CUDA device number
   static Result<std::shared_ptr<CudaDevice>> Make(int device_number);

   /// \brief Return the device logical number
   int device_number() const;

   /// \brief Return the GPU model name
   std::string device_name() const;

   /// \brief Return total memory on this device
   int64_t total_memory() const;

   /// \brief Return a raw CUDA device handle
   ///
   /// The returned value can be used to expose this device to other libraries.
   /// It should be interpreted as `CUdevice`.
   int handle() const;

   /// \brief Get a CUDA driver context for this device
   ///
   /// The returned context is associated with the primary CUDA context for the
   /// device.  This is the recommended way of getting a context for a device,
   /// as it allows interoperating transparently with any library using the
   /// primary CUDA context API.
   Result<std::shared_ptr<CudaContext>> GetContext();

   /// \brief Get a CUDA driver context for this device, using an existing handle
   ///
   /// The handle is not owned: it will not be released when the CudaContext
   /// is destroyed.  This function should only be used if you need interoperation
   /// with a library that uses a non-primary context.
   ///
   /// \param[in] handle CUDA context handle created by another library
   Result<std::shared_ptr<CudaContext>> GetSharedContext(void* handle);

   /// \brief Allocate a host-residing, GPU-accessible buffer
   ///
   /// The buffer is allocated using this device's primary context.
   ///
   /// \param[in] size The buffer size in bytes
   Result<std::shared_ptr<CudaHostBuffer>> AllocateHostBuffer(int64_t size);

  protected:
   struct Impl;

   friend class CudaContext;
   /// \cond FALSE
   // (note: emits warning on Doxygen < 1.8.15)
   friend class CudaDeviceManager::Impl;
   /// \endcond

   explicit CudaDevice(Impl);
   std::unique_ptr<Impl> impl_;
 };

 /// \brief Return whether a device instance is a CudaDevice
 ARROW_EXPORT
 bool IsCudaDevice(const Device& device);

 /// \brief Cast a device instance to a CudaDevice
 ///
 /// An error is returned if the device is not a CudaDevice.
 ARROW_EXPORT
 Result<std::shared_ptr<CudaDevice>> AsCudaDevice(const std::shared_ptr<Device>& device);

 /// \brief MemoryManager implementation for CUDA
 class ARROW_EXPORT CudaMemoryManager : public MemoryManager {
  public:
   Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
       std::shared_ptr<Buffer> buf) override;
   Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(
       std::shared_ptr<Buffer> buf) override;

   Result<std::shared_ptr<Buffer>> AllocateBuffer(int64_t size) override;

   /// \brief The CudaDevice instance tied to this MemoryManager
   ///
   /// This is a useful shorthand returning a concrete-typed pointer, avoiding
   /// having to cast the `device()` result.
   std::shared_ptr<CudaDevice> cuda_device() const;

  protected:
   using MemoryManager::MemoryManager;
   static std::shared_ptr<CudaMemoryManager> Make(const std::shared_ptr<Device>& device);

   Result<std::shared_ptr<Buffer>> CopyBufferFrom(
       const std::shared_ptr<Buffer>& buf,
       const std::shared_ptr<MemoryManager>& from) override;
   Result<std::shared_ptr<Buffer>> CopyBufferTo(
       const std::shared_ptr<Buffer>& buf,
       const std::shared_ptr<MemoryManager>& to) override;
   Result<std::shared_ptr<Buffer>> ViewBufferFrom(
       const std::shared_ptr<Buffer>& buf,
       const std::shared_ptr<MemoryManager>& from) override;
   Result<std::shared_ptr<Buffer>> ViewBufferTo(
       const std::shared_ptr<Buffer>& buf,
       const std::shared_ptr<MemoryManager>& to) override;

   friend class CudaDevice;
 };

 /// \brief Return whether a MemoryManager instance is a CudaMemoryManager
 ARROW_EXPORT
 bool IsCudaMemoryManager(const MemoryManager& mm);

 /// \brief Cast a MemoryManager instance to a CudaMemoryManager
 ///
 /// An error is returned if the MemoryManager is not a CudaMemoryManager.
 ARROW_EXPORT
 Result<std::shared_ptr<CudaMemoryManager>> AsCudaMemoryManager(
     const std::shared_ptr<MemoryManager>& mm);

 /// \class CudaContext
 /// \brief Object-oriented interface to the low-level CUDA driver API
 class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext> {
  public:
   ~CudaContext();

   Status Close();

   /// \brief Allocate CUDA memory on GPU device for this context
   /// \param[in] nbytes number of bytes
   /// \return the allocated buffer
   Result<std::shared_ptr<CudaBuffer>> Allocate(int64_t nbytes);

   /// \brief Release CUDA memory on GPU device for this context
   /// \param[in] device_ptr the buffer address
   /// \param[in] nbytes number of bytes
   /// \return Status
   Status Free(void* device_ptr, int64_t nbytes);

   /// \brief Create a view of CUDA memory on GPU device of this context
   /// \param[in] data the starting device address
   /// \param[in] nbytes number of bytes
   /// \return the view buffer
   ///
   /// \note The caller is responsible for allocating and freeing the
   /// memory as well as ensuring that the memory belongs to the CUDA
   /// context that this CudaContext instance holds.
   Result<std::shared_ptr<CudaBuffer>> View(uint8_t* data, int64_t nbytes);

   /// \brief Open existing CUDA IPC memory handle
   /// \param[in] ipc_handle opaque pointer to CUipcMemHandle (driver API)
   /// \return a CudaBuffer referencing the IPC segment
   Result<std::shared_ptr<CudaBuffer>> OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle);

   /// \brief Close memory mapped with IPC buffer
   /// \param[in] buffer a CudaBuffer referencing
   /// \return Status
   Status CloseIpcBuffer(CudaBuffer* buffer);

   /// \brief Block until the all device tasks are completed.
   Status Synchronize(void);

   int64_t bytes_allocated() const;

   /// \brief Expose CUDA context handle to other libraries
   void* handle() const;

   /// \brief Return the default memory manager tied to this context's device
   std::shared_ptr<CudaMemoryManager> memory_manager() const;

   /// \brief Return the device instance associated with this context
   std::shared_ptr<CudaDevice> device() const;

   /// \brief Return the logical device number
   int device_number() const;

   /// \brief Return the device address that is reachable from kernels
   /// running in the context
   /// \param[in] addr device or host memory address
   /// \return the device address
   ///
   /// The device address is defined as a memory address accessible by
   /// device. While it is often a device memory address, it can be
   /// also a host memory address, for instance, when the memory is
   /// allocated as host memory (using cudaMallocHost or cudaHostAlloc)
   /// or as managed memory (using cudaMallocManaged) or the host
   /// memory is page-locked (using cudaHostRegister).
   Result<uintptr_t> GetDeviceAddress(uint8_t* addr);
   Result<uintptr_t> GetDeviceAddress(uintptr_t addr);

  private:
   CudaContext();

   Result<std::shared_ptr<CudaIpcMemHandle>> ExportIpcBuffer(void* data, int64_t size);
   Status CopyHostToDevice(void* dst, const void* src, int64_t nbytes);
   Status CopyHostToDevice(uintptr_t dst, const void* src, int64_t nbytes);
   Status CopyDeviceToHost(void* dst, const void* src, int64_t nbytes);
   Status CopyDeviceToHost(void* dst, uintptr_t src, int64_t nbytes);
   Status CopyDeviceToDevice(void* dst, const void* src, int64_t nbytes);
   Status CopyDeviceToDevice(uintptr_t dst, uintptr_t src, int64_t nbytes);
   Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx, void* dst,
                                    const void* src, int64_t nbytes);
   Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
                                    uintptr_t dst, uintptr_t src, int64_t nbytes);

   class Impl;
   std::unique_ptr<Impl> impl_;

   friend class CudaBuffer;
   friend class CudaBufferReader;
   friend class CudaBufferWriter;
   friend class CudaDevice;
   friend class CudaMemoryManager;
   /// \cond FALSE
   // (note: emits warning on Doxygen < 1.8.15)
   friend class CudaDeviceManager::Impl;
   /// \endcond
 };

 }  // namespace cuda
 }  // namespace arrow
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include <cstdint>
	#include <memory>
	#include <string>

	#include "arrow/device.h"
	#include "arrow/result.h"
	#include "arrow/util/visibility.h"

	namespace arrow {
	namespace cuda {

	// Forward declaration
	class CudaContext;
	class CudaDevice;
	class CudaDeviceManager;
	class CudaBuffer;
	class CudaHostBuffer;
	class CudaIpcMemHandle;
	class CudaMemoryManager;

	// XXX Should CudaContext be merged into CudaMemoryManager?

	class ARROW_EXPORT CudaDeviceManager {
	public:
	static Result<CudaDeviceManager*> Instance();

	/// \brief Get a CudaDevice instance for a particular device
	/// \param[in] device_number the CUDA device number
	Result<std::shared_ptr<CudaDevice>> GetDevice(int device_number);

	/// \brief Get the CUDA driver context for a particular device
	/// \param[in] device_number the CUDA device number
	/// \return cached context
	Result<std::shared_ptr<CudaContext>> GetContext(int device_number);

	/// \brief Get the shared CUDA driver context for a particular device
	/// \param[in] device_number the CUDA device number
	/// \param[in] handle CUDA context handle created by another library
	/// \return shared context
	Result<std::shared_ptr<CudaContext>> GetSharedContext(int device_number, void* handle);

	/// \brief Allocate host memory with fast access to given GPU device
	/// \param[in] device_number the CUDA device number
	/// \param[in] nbytes number of bytes
	/// \return Host buffer or Status
	Result<std::shared_ptr<CudaHostBuffer>> AllocateHost(int device_number, int64_t nbytes);

	/// \brief Free host memory
	///
	/// The given memory pointer must have been allocated with AllocateHost.
	Status FreeHost(void* data, int64_t nbytes);

	int num_devices() const;

	private:
	CudaDeviceManager();
	static std::unique_ptr<CudaDeviceManager> instance_;

	class Impl;
	std::unique_ptr<Impl> impl_;

	friend class CudaContext;
	friend class CudaDevice;
	};

	/// \brief Device implementation for CUDA
	///
	/// Each CudaDevice instance is tied to a particular CUDA device
	/// (identified by its logical device number).
	class ARROW_EXPORT CudaDevice : public Device {
	public:
	const char* type_name() const override;
	std::string ToString() const override;
	bool Equals(const Device&) const override;
	std::shared_ptr<MemoryManager> default_memory_manager() override;

	/// \brief Return a CudaDevice instance for a particular device
	/// \param[in] device_number the CUDA device number
	static Result<std::shared_ptr<CudaDevice>> Make(int device_number);

	/// \brief Return the device logical number
	int device_number() const;

	/// \brief Return the GPU model name
	std::string device_name() const;

	/// \brief Return total memory on this device
	int64_t total_memory() const;

	/// \brief Return a raw CUDA device handle
	///
	/// The returned value can be used to expose this device to other libraries.
	/// It should be interpreted as `CUdevice`.
	int handle() const;

	/// \brief Get a CUDA driver context for this device
	///
	/// The returned context is associated with the primary CUDA context for the
	/// device. This is the recommended way of getting a context for a device,
	/// as it allows interoperating transparently with any library using the
	/// primary CUDA context API.
	Result<std::shared_ptr<CudaContext>> GetContext();

	/// \brief Get a CUDA driver context for this device, using an existing handle
	///
	/// The handle is not owned: it will not be released when the CudaContext
	/// is destroyed. This function should only be used if you need interoperation
	/// with a library that uses a non-primary context.
	///
	/// \param[in] handle CUDA context handle created by another library
	Result<std::shared_ptr<CudaContext>> GetSharedContext(void* handle);

	/// \brief Allocate a host-residing, GPU-accessible buffer
	///
	/// The buffer is allocated using this device's primary context.
	///
	/// \param[in] size The buffer size in bytes
	Result<std::shared_ptr<CudaHostBuffer>> AllocateHostBuffer(int64_t size);

	protected:
	struct Impl;

	friend class CudaContext;
	/// \cond FALSE
	// (note: emits warning on Doxygen < 1.8.15)
	friend class CudaDeviceManager::Impl;
	/// \endcond

	explicit CudaDevice(Impl);
	std::unique_ptr<Impl> impl_;
	};

	/// \brief Return whether a device instance is a CudaDevice
	ARROW_EXPORT
	bool IsCudaDevice(const Device& device);

	/// \brief Cast a device instance to a CudaDevice
	///
	/// An error is returned if the device is not a CudaDevice.
	ARROW_EXPORT
	Result<std::shared_ptr<CudaDevice>> AsCudaDevice(const std::shared_ptr<Device>& device);

	/// \brief MemoryManager implementation for CUDA
	class ARROW_EXPORT CudaMemoryManager : public MemoryManager {
	public:
	Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
	std::shared_ptr<Buffer> buf) override;
	Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(
	std::shared_ptr<Buffer> buf) override;

	Result<std::shared_ptr<Buffer>> AllocateBuffer(int64_t size) override;

	/// \brief The CudaDevice instance tied to this MemoryManager
	///
	/// This is a useful shorthand returning a concrete-typed pointer, avoiding
	/// having to cast the `device()` result.
	std::shared_ptr<CudaDevice> cuda_device() const;

	protected:
	using MemoryManager::MemoryManager;
	static std::shared_ptr<CudaMemoryManager> Make(const std::shared_ptr<Device>& device);

	Result<std::shared_ptr<Buffer>> CopyBufferFrom(
	const std::shared_ptr<Buffer>& buf,
	const std::shared_ptr<MemoryManager>& from) override;
	Result<std::shared_ptr<Buffer>> CopyBufferTo(
	const std::shared_ptr<Buffer>& buf,
	const std::shared_ptr<MemoryManager>& to) override;
	Result<std::shared_ptr<Buffer>> ViewBufferFrom(
	const std::shared_ptr<Buffer>& buf,
	const std::shared_ptr<MemoryManager>& from) override;
	Result<std::shared_ptr<Buffer>> ViewBufferTo(
	const std::shared_ptr<Buffer>& buf,
	const std::shared_ptr<MemoryManager>& to) override;

	friend class CudaDevice;
	};

	/// \brief Return whether a MemoryManager instance is a CudaMemoryManager
	ARROW_EXPORT
	bool IsCudaMemoryManager(const MemoryManager& mm);

	/// \brief Cast a MemoryManager instance to a CudaMemoryManager
	///
	/// An error is returned if the MemoryManager is not a CudaMemoryManager.
	ARROW_EXPORT
	Result<std::shared_ptr<CudaMemoryManager>> AsCudaMemoryManager(
	const std::shared_ptr<MemoryManager>& mm);

	/// \class CudaContext
	/// \brief Object-oriented interface to the low-level CUDA driver API
	class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext> {
	public:
	~CudaContext();

	Status Close();

	/// \brief Allocate CUDA memory on GPU device for this context
	/// \param[in] nbytes number of bytes
	/// \return the allocated buffer
	Result<std::shared_ptr<CudaBuffer>> Allocate(int64_t nbytes);

	/// \brief Release CUDA memory on GPU device for this context
	/// \param[in] device_ptr the buffer address
	/// \param[in] nbytes number of bytes
	/// \return Status
	Status Free(void* device_ptr, int64_t nbytes);

	/// \brief Create a view of CUDA memory on GPU device of this context
	/// \param[in] data the starting device address
	/// \param[in] nbytes number of bytes
	/// \return the view buffer
	///
	/// \note The caller is responsible for allocating and freeing the
	/// memory as well as ensuring that the memory belongs to the CUDA
	/// context that this CudaContext instance holds.
	Result<std::shared_ptr<CudaBuffer>> View(uint8_t* data, int64_t nbytes);

	/// \brief Open existing CUDA IPC memory handle
	/// \param[in] ipc_handle opaque pointer to CUipcMemHandle (driver API)
	/// \return a CudaBuffer referencing the IPC segment
	Result<std::shared_ptr<CudaBuffer>> OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle);

	/// \brief Close memory mapped with IPC buffer
	/// \param[in] buffer a CudaBuffer referencing
	/// \return Status
	Status CloseIpcBuffer(CudaBuffer* buffer);

	/// \brief Block until the all device tasks are completed.
	Status Synchronize(void);

	int64_t bytes_allocated() const;

	/// \brief Expose CUDA context handle to other libraries
	void* handle() const;

	/// \brief Return the default memory manager tied to this context's device
	std::shared_ptr<CudaMemoryManager> memory_manager() const;

	/// \brief Return the device instance associated with this context
	std::shared_ptr<CudaDevice> device() const;

	/// \brief Return the logical device number
	int device_number() const;

	/// \brief Return the device address that is reachable from kernels
	/// running in the context
	/// \param[in] addr device or host memory address
	/// \return the device address
	///
	/// The device address is defined as a memory address accessible by
	/// device. While it is often a device memory address, it can be
	/// also a host memory address, for instance, when the memory is
	/// allocated as host memory (using cudaMallocHost or cudaHostAlloc)
	/// or as managed memory (using cudaMallocManaged) or the host
	/// memory is page-locked (using cudaHostRegister).
	Result<uintptr_t> GetDeviceAddress(uint8_t* addr);
	Result<uintptr_t> GetDeviceAddress(uintptr_t addr);

	private:
	CudaContext();

	Result<std::shared_ptr<CudaIpcMemHandle>> ExportIpcBuffer(void* data, int64_t size);
	Status CopyHostToDevice(void* dst, const void* src, int64_t nbytes);
	Status CopyHostToDevice(uintptr_t dst, const void* src, int64_t nbytes);
	Status CopyDeviceToHost(void* dst, const void* src, int64_t nbytes);
	Status CopyDeviceToHost(void* dst, uintptr_t src, int64_t nbytes);
	Status CopyDeviceToDevice(void* dst, const void* src, int64_t nbytes);
	Status CopyDeviceToDevice(uintptr_t dst, uintptr_t src, int64_t nbytes);
	Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx, void* dst,
	const void* src, int64_t nbytes);
	Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
	uintptr_t dst, uintptr_t src, int64_t nbytes);

	class Impl;
	std::unique_ptr<Impl> impl_;

	friend class CudaBuffer;
	friend class CudaBufferReader;
	friend class CudaBufferWriter;
	friend class CudaDevice;
	friend class CudaMemoryManager;
	/// \cond FALSE
	// (note: emits warning on Doxygen < 1.8.15)
	friend class CudaDeviceManager::Impl;
	/// \endcond
	};

	} // namespace cuda
	} // namespace arrow