blob: 4efd3889407e16c15b0f30c0723c6aecac21f05f [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include "arrow/buffer.h"
#include "arrow/io/concurrency.h"
#include "arrow/type_fwd.h"
namespace arrow {
namespace cuda {
class CudaContext;
class CudaIpcMemHandle;
/// \class CudaBuffer
/// \brief An Arrow buffer located on a GPU device
///
/// Be careful using this in any Arrow code which may not be GPU-aware
class ARROW_EXPORT CudaBuffer : public Buffer {
 public:
  // XXX deprecate?
  /// \brief Construct from a raw device memory pointer
  /// \param[in] data device memory address
  /// \param[in] size buffer size in bytes
  /// \param[in] context CUDA context the device memory belongs to
  /// \param[in] own_data if true, the device memory is released when the
  ///            buffer is destroyed (see also ExportForIpc)
  /// \param[in] is_ipc if true, the memory was obtained through CUDA IPC
  CudaBuffer(uint8_t* data, int64_t size, const std::shared_ptr<CudaContext>& context,
             bool own_data = false, bool is_ipc = false);
  /// \brief Construct from a device address expressed as an integer
  ///
  /// Same semantics as the pointer overload above.
  CudaBuffer(uintptr_t address, int64_t size, const std::shared_ptr<CudaContext>& context,
             bool own_data = false, bool is_ipc = false);
  /// \brief Construct a sliced view over [offset, offset + size) of a parent
  /// buffer; the parent keeps the memory alive
  CudaBuffer(const std::shared_ptr<CudaBuffer>& parent, const int64_t offset,
             const int64_t size);
  ~CudaBuffer();
  /// \brief Convert back generic buffer into CudaBuffer
  /// \param[in] buffer buffer to convert
  /// \return CudaBuffer or Status
  ///
  /// \note This function returns an error if the buffer isn't backed
  /// by GPU memory
  static Result<std::shared_ptr<CudaBuffer>> FromBuffer(std::shared_ptr<Buffer> buffer);
  /// \brief Copy memory from GPU device to CPU host
  /// \param[in] position start position inside buffer to copy bytes from
  /// \param[in] nbytes number of bytes to copy
  /// \param[out] out start address of the host memory area to copy to
  /// \return Status
  Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const;
  /// \brief Copy memory to device at position
  /// \param[in] position start position to copy bytes to
  /// \param[in] data the host data to copy
  /// \param[in] nbytes number of bytes to copy
  /// \return Status
  Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes);
  /// \brief Copy memory from device to device at position
  /// \param[in] position start position inside buffer to copy bytes to
  /// \param[in] data start address of the device memory area to copy from
  /// \param[in] nbytes number of bytes to copy
  /// \return Status
  ///
  /// \note It is assumed that both source and destination device
  /// memories have been allocated within the same context.
  Status CopyFromDevice(const int64_t position, const void* data, int64_t nbytes);
  /// \brief Copy memory from another device to device at position
  /// \param[in] src_ctx context of the source device memory
  /// \param[in] position start position inside buffer to copy bytes to
  /// \param[in] data start address of the another device memory area to copy from
  /// \param[in] nbytes number of bytes to copy
  /// \return Status
  Status CopyFromAnotherDevice(const std::shared_ptr<CudaContext>& src_ctx,
                               const int64_t position, const void* data, int64_t nbytes);
  /// \brief Expose this device buffer as IPC memory which can be used in other processes
  /// \return Handle or Status
  ///
  /// \note After calling this function, this device memory will not be freed
  /// when the CudaBuffer is destructed
  virtual Result<std::shared_ptr<CudaIpcMemHandle>> ExportForIpc();
  /// \brief Return the CUDA context this buffer's memory belongs to
  const std::shared_ptr<CudaContext>& context() const { return context_; }
 protected:
  // Context the device memory was allocated (or imported) in
  std::shared_ptr<CudaContext> context_;
  // If true, the device memory is released on destruction
  // (cleared by ExportForIpc, per the note above)
  bool own_data_;
  // If true, the memory was obtained through CUDA IPC
  bool is_ipc_;
  /// \brief Release the underlying device memory if owned
  /// (implementation in the .cc file; presumably a no-op for non-owned memory)
  virtual Status Close();
};
/// \class CudaHostBuffer
/// \brief Device-accessible CPU memory created using cudaHostAlloc
class ARROW_EXPORT CudaHostBuffer : public MutableBuffer {
 public:
  // Inherit all of MutableBuffer's constructors
  using MutableBuffer::MutableBuffer;
  // NOTE(review): presumably releases the pinned (cudaHostAlloc'd) memory —
  // confirm in the implementation file
  ~CudaHostBuffer();
  /// \brief Return a device address the GPU can read this memory from.
  Result<uintptr_t> GetDeviceAddress(const std::shared_ptr<CudaContext>& ctx);
};
/// \class CudaIpcHandle
/// \brief A container for a CUDA IPC handle
class ARROW_EXPORT CudaIpcMemHandle {
 public:
  ~CudaIpcMemHandle();
  /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process)
  /// \param[in] opaque_handle a CUipcMemHandle as a const void*
  /// \return Handle or Status
  static Result<std::shared_ptr<CudaIpcMemHandle>> FromBuffer(const void* opaque_handle);
  /// \brief Write CudaIpcMemHandle to a Buffer
  /// \param[in] pool a MemoryPool to allocate memory from
  /// \return Buffer or Status
  Result<std::shared_ptr<Buffer>> Serialize(
      MemoryPool* pool = default_memory_pool()) const;
 private:
  /// \brief Wrap an existing opaque CUDA IPC handle (no size information)
  explicit CudaIpcMemHandle(const void* handle);
  /// \brief Wrap an opaque CUDA IPC handle along with the size in bytes
  /// of the exported memory region
  CudaIpcMemHandle(int64_t memory_size, const void* cu_handle);
  // Pimpl keeps CUDA driver types out of this public header
  struct CudaIpcMemHandleImpl;
  std::unique_ptr<CudaIpcMemHandleImpl> impl_;
  /// \brief Opaque pointer to the wrapped CUipcMemHandle
  const void* handle() const;
  /// \brief Size in bytes of the exported memory region
  int64_t memory_size() const;
  // Friends get access to the private constructors and accessors above
  friend CudaBuffer;
  friend CudaContext;
};
/// \class CudaBufferReader
/// \brief File interface for zero-copy read from CUDA buffers
///
/// CAUTION: reading to a Buffer returns a Buffer pointing to device memory.
/// It will generally not be compatible with Arrow code expecting a buffer
/// pointing to CPU memory.
/// Reading to a raw pointer, though, copies device memory into the host
/// memory pointed to.
class ARROW_EXPORT CudaBufferReader
    : public ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader> {
 public:
  /// \brief Construct a reader over a device buffer
  /// \param[in] buffer a buffer backed by GPU memory
  explicit CudaBufferReader(const std::shared_ptr<Buffer>& buffer);
  bool closed() const override;
  bool supports_zero_copy() const override;
  /// \brief Return the device buffer being read from
  std::shared_ptr<CudaBuffer> buffer() const { return buffer_; }
 protected:
  friend ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader>;
  // Unsynchronized implementations called by the concurrency wrapper,
  // which provides the public Read/ReadAt/Seek/Tell interface.
  Status DoClose();
  /// Read into a caller-supplied pointer: copies device memory to host memory
  Result<int64_t> DoRead(int64_t nbytes, void* buffer);
  /// Read into a Buffer: zero-copy, the result points to device memory
  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
  Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
  Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
  Result<int64_t> DoTell() const;
  Status DoSeek(int64_t position);
  Result<int64_t> DoGetSize();
  /// \brief Return an error if the reader has been closed
  Status CheckClosed() const {
    if (!is_open_) {
      return Status::Invalid("Operation forbidden on closed CudaBufferReader");
    }
    return Status::OK();
  }
  std::shared_ptr<CudaBuffer> buffer_;
  std::shared_ptr<CudaContext> context_;
  // Device address of the buffer's data; const as it never changes after construction
  const uintptr_t address_;
  int64_t size_;
  // Current read offset within the buffer
  int64_t position_;
  bool is_open_;
};
/// \class CudaBufferWriter
/// \brief File interface for writing to CUDA buffers, with optional buffering
class ARROW_EXPORT CudaBufferWriter : public io::WritableFile {
 public:
  /// \brief Construct a writer targeting the given device buffer
  explicit CudaBufferWriter(const std::shared_ptr<CudaBuffer>& buffer);
  ~CudaBufferWriter() override;
  /// \brief Close writer and flush buffered bytes to GPU
  Status Close() override;
  bool closed() const override;
  /// \brief Flush buffered bytes to GPU
  Status Flush() override;
  Status Seek(int64_t position) override;
  Status Write(const void* data, int64_t nbytes) override;
  Status WriteAt(int64_t position, const void* data, int64_t nbytes) override;
  Result<int64_t> Tell() const override;
  /// \brief Set CPU buffer size to limit calls to cudaMemcpy
  /// \param[in] buffer_size the size of CPU buffer to allocate
  /// \return Status
  ///
  /// By default writes are unbuffered
  Status SetBufferSize(const int64_t buffer_size);
  /// \brief Returns size of host (CPU) buffer, 0 for unbuffered
  int64_t buffer_size() const;
  /// \brief Returns number of bytes buffered on host
  int64_t num_bytes_buffered() const;
 private:
  // Pimpl keeps implementation details out of this public header
  class CudaBufferWriterImpl;
  std::unique_ptr<CudaBufferWriterImpl> impl_;
};
/// \brief Allocate CUDA-accessible memory on CPU host
///
/// The GPU will benefit from fast access to this CPU-located buffer,
/// including fast memory copy.
///
/// \param[in] device_number device to expose host memory
/// \param[in] size number of bytes
/// \return Host buffer or Status
ARROW_EXPORT
Result<std::shared_ptr<CudaHostBuffer>> AllocateCudaHostBuffer(int device_number,
                                                               const int64_t size);
/// \brief Low-level: get a device address through which the CPU data can be accessed
/// \param[in] cpu_data host memory address
/// \param[in] ctx CUDA context for the device that will access the memory
/// \return device address or Status
// ARROW_EXPORT added for consistency with AllocateCudaHostBuffer above;
// without it the symbol is not exported under hidden-visibility/DLL builds.
ARROW_EXPORT
Result<uintptr_t> GetDeviceAddress(const uint8_t* cpu_data,
                                   const std::shared_ptr<CudaContext>& ctx);
/// \brief Low-level: get a CPU address through which the device data can be accessed
/// \param[in] device_ptr device memory address
/// \return host address or Status
ARROW_EXPORT
Result<uint8_t*> GetHostAddress(uintptr_t device_ptr);
} // namespace cuda
} // namespace arrow