blob: 246883c9b99f93b0b3e774039e2aa0645fa1b706 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include "arrow/device.h"
#include "arrow/result.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace cuda {
// Forward declarations of the CUDA classes referenced in this header
// (kept in alphabetical order).
class CudaBuffer;
class CudaContext;
class CudaDevice;
class CudaDeviceManager;
class CudaHostBuffer;
class CudaIpcMemHandle;
class CudaMemoryManager;
// XXX Should CudaContext be merged into CudaMemoryManager?
/// \brief Entry point for obtaining CUDA devices, driver contexts and
/// GPU-friendly host memory, addressed by CUDA device number
///
/// The constructor is private; outside code obtains the manager through
/// Instance().
class ARROW_EXPORT CudaDeviceManager {
public:
/// \brief Access the CudaDeviceManager instance
/// \return a raw pointer to the manager, or an error
///
/// \note NOTE(review): the pointer is presumably owned by the static
/// `instance_` member and must not be deleted by callers -- confirm against
/// the implementation.
static Result<CudaDeviceManager*> Instance();
/// \brief Get a CudaDevice instance for a particular device
/// \param[in] device_number the CUDA device number
/// \return the CudaDevice, or an error
Result<std::shared_ptr<CudaDevice>> GetDevice(int device_number);
/// \brief Get the CUDA driver context for a particular device
/// \param[in] device_number the CUDA device number
/// \return cached context
Result<std::shared_ptr<CudaContext>> GetContext(int device_number);
/// \brief Get the shared CUDA driver context for a particular device
/// \param[in] device_number the CUDA device number
/// \param[in] handle CUDA context handle created by another library
/// \return shared context
Result<std::shared_ptr<CudaContext>> GetSharedContext(int device_number, void* handle);
/// \brief Allocate host memory with fast access to given GPU device
/// \param[in] device_number the CUDA device number
/// \param[in] nbytes number of bytes
/// \return Host buffer or Status
Result<std::shared_ptr<CudaHostBuffer>> AllocateHost(int device_number, int64_t nbytes);
/// \brief Free host memory
///
/// The given memory pointer must have been allocated with AllocateHost.
///
/// \param[in] data pointer to the host memory to release
/// \param[in] nbytes number of bytes (NOTE(review): presumably the size that
/// was passed to AllocateHost -- confirm)
/// \return Status
Status FreeHost(void* data, int64_t nbytes);
/// \brief Return the number of CUDA devices known to this manager
int num_devices() const;
private:
// Private so that the manager can only be created from within the class
// (or by its friends); see Instance().
CudaDeviceManager();
// NOTE(review): presumably holds the manager handed out by Instance().
static std::unique_ptr<CudaDeviceManager> instance_;
// Opaque implementation type, only forward-declared in this header.
class Impl;
std::unique_ptr<Impl> impl_;
// These classes are granted access to the private members above.
friend class CudaContext;
friend class CudaDevice;
};
/// \brief Device implementation for CUDA
///
/// Each CudaDevice instance is tied to a particular CUDA device
/// (identified by its logical device number).
class ARROW_EXPORT CudaDevice : public Device {
public:
// Overrides of the arrow::Device interface.
const char* type_name() const override;
std::string ToString() const override;
bool Equals(const Device&) const override;
std::shared_ptr<MemoryManager> default_memory_manager() override;
/// \brief Return a CudaDevice instance for a particular device
/// \param[in] device_number the CUDA device number
/// \return the CudaDevice, or an error
static Result<std::shared_ptr<CudaDevice>> Make(int device_number);
/// \brief Return the device logical number
int device_number() const;
/// \brief Return the GPU model name
std::string device_name() const;
/// \brief Return total memory on this device
///
/// NOTE(review): presumably expressed in bytes -- confirm.
int64_t total_memory() const;
/// \brief Return a raw CUDA device handle
///
/// The returned value can be used to expose this device to other libraries.
/// It should be interpreted as `CUdevice`.
int handle() const;
/// \brief Get a CUDA driver context for this device
///
/// The returned context is associated with the primary CUDA context for the
/// device. This is the recommended way of getting a context for a device,
/// as it allows interoperating transparently with any library using the
/// primary CUDA context API.
///
/// \return the context, or an error
Result<std::shared_ptr<CudaContext>> GetContext();
/// \brief Get a CUDA driver context for this device, using an existing handle
///
/// The handle is not owned: it will not be released when the CudaContext
/// is destroyed. This function should only be used if you need interoperation
/// with a library that uses a non-primary context.
///
/// \param[in] handle CUDA context handle created by another library
/// \return the context wrapping `handle`, or an error
Result<std::shared_ptr<CudaContext>> GetSharedContext(void* handle);
/// \brief Allocate a host-residing, GPU-accessible buffer
///
/// The buffer is allocated using this device's primary context.
///
/// \param[in] size The buffer size in bytes
/// \return the allocated host buffer, or an error
Result<std::shared_ptr<CudaHostBuffer>> AllocateHostBuffer(int64_t size);
protected:
// Opaque implementation type, only forward-declared in this header.
struct Impl;
// CudaContext is granted access to the protected members.
friend class CudaContext;
/// \cond FALSE
// (note: emits warning on Doxygen < 1.8.15)
friend class CudaDeviceManager::Impl;
/// \endcond
// Not public: outside code obtains instances through Make().
explicit CudaDevice(Impl);
std::unique_ptr<Impl> impl_;
};
/// \brief Return whether a device instance is a CudaDevice
/// \param[in] device the device to inspect
/// \return true if `device` is a CudaDevice, false otherwise
ARROW_EXPORT
bool IsCudaDevice(const Device& device);
/// \brief Cast a device instance to a CudaDevice
///
/// An error is returned if the device is not a CudaDevice.
///
/// \param[in] device the device to cast
/// \return the device typed as a CudaDevice, or an error
ARROW_EXPORT
Result<std::shared_ptr<CudaDevice>> AsCudaDevice(const std::shared_ptr<Device>& device);
/// \brief MemoryManager implementation for CUDA
class ARROW_EXPORT CudaMemoryManager : public MemoryManager {
public:
// Overrides of the public arrow::MemoryManager interface: buffer I/O
// and buffer allocation.
Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
std::shared_ptr<Buffer> buf) override;
Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(
std::shared_ptr<Buffer> buf) override;
Result<std::shared_ptr<Buffer>> AllocateBuffer(int64_t size) override;
/// \brief The CudaDevice instance tied to this MemoryManager
///
/// This is a useful shorthand returning a concrete-typed pointer, avoiding
/// having to cast the `device()` result.
std::shared_ptr<CudaDevice> cuda_device() const;
protected:
// Reuse the constructors declared by MemoryManager.
using MemoryManager::MemoryManager;
/// \brief Create a CudaMemoryManager for the given device
/// \param[in] device the device the new manager is created for
///
/// NOTE(review): `device` is presumably expected to be a CudaDevice -- confirm.
static std::shared_ptr<CudaMemoryManager> Make(const std::shared_ptr<Device>& device);
// Overrides of the arrow::MemoryManager copy/view hooks. The second argument
// names the other memory manager involved: the source (`from`) or the
// destination (`to`).
Result<std::shared_ptr<Buffer>> CopyBufferFrom(
const std::shared_ptr<Buffer>& buf,
const std::shared_ptr<MemoryManager>& from) override;
Result<std::shared_ptr<Buffer>> CopyBufferTo(
const std::shared_ptr<Buffer>& buf,
const std::shared_ptr<MemoryManager>& to) override;
Result<std::shared_ptr<Buffer>> ViewBufferFrom(
const std::shared_ptr<Buffer>& buf,
const std::shared_ptr<MemoryManager>& from) override;
Result<std::shared_ptr<Buffer>> ViewBufferTo(
const std::shared_ptr<Buffer>& buf,
const std::shared_ptr<MemoryManager>& to) override;
// CudaDevice is granted access to the protected members (e.g. Make()).
friend class CudaDevice;
};
/// \brief Return whether a MemoryManager instance is a CudaMemoryManager
/// \param[in] mm the memory manager to inspect
/// \return true if `mm` is a CudaMemoryManager, false otherwise
ARROW_EXPORT
bool IsCudaMemoryManager(const MemoryManager& mm);
/// \brief Cast a MemoryManager instance to a CudaMemoryManager
///
/// An error is returned if the MemoryManager is not a CudaMemoryManager.
///
/// \param[in] mm the memory manager to cast
/// \return the memory manager typed as a CudaMemoryManager, or an error
ARROW_EXPORT
Result<std::shared_ptr<CudaMemoryManager>> AsCudaMemoryManager(
const std::shared_ptr<MemoryManager>& mm);
/// \class CudaContext
/// \brief Object-oriented interface to the low-level CUDA driver API
///
/// The constructor is private; contexts are handed out as
/// std::shared_ptr<CudaContext> by CudaDevice and CudaDeviceManager
/// (see their GetContext / GetSharedContext methods).
class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext> {
public:
~CudaContext();
/// \brief Close this context
/// \return Status
///
/// NOTE(review): what exactly is released on Close() is decided by the
/// implementation, which is not visible from this header.
Status Close();
/// \brief Allocate CUDA memory on GPU device for this context
/// \param[in] nbytes number of bytes
/// \return the allocated buffer
Result<std::shared_ptr<CudaBuffer>> Allocate(int64_t nbytes);
/// \brief Release CUDA memory on GPU device for this context
/// \param[in] device_ptr the buffer address
/// \param[in] nbytes number of bytes
/// \return Status
Status Free(void* device_ptr, int64_t nbytes);
/// \brief Create a view of CUDA memory on GPU device of this context
/// \param[in] data the starting device address
/// \param[in] nbytes number of bytes
/// \return the view buffer
///
/// \note The caller is responsible for allocating and freeing the
/// memory as well as ensuring that the memory belongs to the CUDA
/// context that this CudaContext instance holds.
Result<std::shared_ptr<CudaBuffer>> View(uint8_t* data, int64_t nbytes);
/// \brief Open existing CUDA IPC memory handle
/// \param[in] ipc_handle the IPC memory handle to open (corresponds to a
/// CUipcMemHandle in the driver API)
/// \return a CudaBuffer referencing the IPC segment
Result<std::shared_ptr<CudaBuffer>> OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle);
/// \brief Close memory mapped with IPC buffer
/// \param[in] buffer a CudaBuffer referencing the IPC segment to close
/// \return Status
Status CloseIpcBuffer(CudaBuffer* buffer);
/// \brief Block until all device tasks are completed.
/// \return Status
Status Synchronize(void);
/// \brief Return the number of bytes currently allocated
///
/// NOTE(review): presumably counts memory obtained through this context
/// (e.g. via Allocate) -- confirm against the implementation.
int64_t bytes_allocated() const;
/// \brief Expose CUDA context handle to other libraries
void* handle() const;
/// \brief Return the default memory manager tied to this context's device
std::shared_ptr<CudaMemoryManager> memory_manager() const;
/// \brief Return the device instance associated with this context
std::shared_ptr<CudaDevice> device() const;
/// \brief Return the logical device number
int device_number() const;
/// \brief Return the device address that is reachable from kernels
/// running in the context
/// \param[in] addr device or host memory address
/// \return the device address
///
/// The device address is defined as a memory address accessible by
/// device. While it is often a device memory address, it can be
/// also a host memory address, for instance, when the memory is
/// allocated as host memory (using cudaMallocHost or cudaHostAlloc)
/// or as managed memory (using cudaMallocManaged) or the host
/// memory is page-locked (using cudaHostRegister).
Result<uintptr_t> GetDeviceAddress(uint8_t* addr);
/// \brief Overload of GetDeviceAddress taking the address as an integer
/// \param[in] addr device or host memory address
/// \return the device address
Result<uintptr_t> GetDeviceAddress(uintptr_t addr);
private:
// Private: only this class and the friends listed below can create contexts.
CudaContext();
// Internal helpers reserved for the friend classes declared below.
// ExportIpcBuffer produces a CudaIpcMemHandle for the memory region
// [data, data + size).
Result<std::shared_ptr<CudaIpcMemHandle>> ExportIpcBuffer(void* data, int64_t size);
// Memory copy helpers; the function name gives the direction of the copy.
// Each comes in a pointer flavor and a flavor where the device-side
// addresses are passed as uintptr_t integers.
Status CopyHostToDevice(void* dst, const void* src, int64_t nbytes);
Status CopyHostToDevice(uintptr_t dst, const void* src, int64_t nbytes);
Status CopyDeviceToHost(void* dst, const void* src, int64_t nbytes);
Status CopyDeviceToHost(void* dst, uintptr_t src, int64_t nbytes);
Status CopyDeviceToDevice(void* dst, const void* src, int64_t nbytes);
Status CopyDeviceToDevice(uintptr_t dst, uintptr_t src, int64_t nbytes);
// Copies towards memory belonging to another context (`dst_ctx`).
Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx, void* dst,
const void* src, int64_t nbytes);
Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
uintptr_t dst, uintptr_t src, int64_t nbytes);
// Opaque implementation type, only forward-declared in this header.
class Impl;
std::unique_ptr<Impl> impl_;
// Classes allowed to use the private constructor and helpers above.
friend class CudaBuffer;
friend class CudaBufferReader;
friend class CudaBufferWriter;
friend class CudaDevice;
friend class CudaMemoryManager;
/// \cond FALSE
// (note: emits warning on Doxygen < 1.8.15)
friend class CudaDeviceManager::Impl;
/// \endcond
};
} // namespace cuda
} // namespace arrow