/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
| |
| package org.apache.sysds.runtime.instructions.gpu.context; |
| |
| import static jcuda.jcublas.JCublas2.cublasCreate; |
| import static jcuda.jcublas.JCublas2.cublasDestroy; |
| import static jcuda.jcudnn.JCudnn.cudnnCreate; |
| import static jcuda.jcudnn.JCudnn.cudnnDestroy; |
| import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate; |
| import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy; |
| import static jcuda.jcusparse.JCusparse.cusparseCreate; |
| import static jcuda.jcusparse.JCusparse.cusparseDestroy; |
| import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync; |
| import static jcuda.runtime.JCuda.cudaGetDeviceCount; |
| import static jcuda.runtime.JCuda.cudaSetDevice; |
| import static jcuda.runtime.JCuda.cudaSetDeviceFlags; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.sysds.api.DMLScript; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.controlprogram.caching.MatrixObject; |
| import org.apache.sysds.utils.GPUStatistics; |
| |
| import jcuda.Pointer; |
| import jcuda.jcublas.cublasHandle; |
| import jcuda.jcudnn.cudnnHandle; |
| import jcuda.jcusolver.cusolverDnHandle; |
| import jcuda.jcusparse.cusparseHandle; |
| import jcuda.runtime.JCuda; |
| import jcuda.runtime.cudaDeviceProp; |
| |
| /** |
| * Represents a context per GPU accessible through the same JVM. |
| * Each context holds cublas, cusparse, cudnn... handles which are separate for each GPU. |
| */ |
| public class GPUContext { |
| |
| protected static final Log LOG = LogFactory.getLog(GPUContext.class.getName()); |
| /** |
| * The minimum CUDA Compute capability needed for SystemDS. |
| * After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported. |
| * If SystemDS needs to run on an older card, this logic can be revisited. |
| */ |
| final int MAJOR_REQUIRED = 3; |
| final int MINOR_REQUIRED = 0; |
| /** |
| * active device assigned to this GPUContext instance |
| */ |
| private final int deviceNum; |
| |
| /** |
| * cudnnHandle for Deep Neural Network operations on the GPU |
| */ |
| private cudnnHandle cudnnHandle; |
| /** |
| * cublasHandle for BLAS operations on the GPU |
| */ |
| private cublasHandle cublasHandle; |
| /** |
| * cusparseHandle for certain sparse BLAS operations on the GPU |
| */ |
| private cusparseHandle cusparseHandle; |
| /** |
| * cusolverDnHandle for invoking solve() function on dense matrices on the GPU |
| */ |
| private volatile cusolverDnHandle cusolverDnHandle; |
| /** |
| * to launch custom CUDA kernel, specific to the active GPU for this GPUContext |
| */ |
| private JCudaKernels kernels; |
| |
| private final GPUMemoryManager memoryManager; |
| |
| public GPUMemoryManager getMemoryManager() { |
| return memoryManager; |
| } |
| |
| protected GPUContext(int deviceNum) { |
| this.deviceNum = deviceNum; |
| |
| cudaSetDevice(deviceNum); |
| |
| cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); |
| |
| long start = -1; |
| if (DMLScript.STATISTICS) |
| start = System.nanoTime(); |
| initializeCudaLibraryHandles(); |
| |
| |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; |
| |
| memoryManager = new GPUMemoryManager(this); |
| } |
| |
| /** |
| * Returns which device is currently being used. |
| * |
| * @return the current device for the calling host thread |
| */ |
| public static int cudaGetDevice() { |
| int[] device = new int[1]; |
| JCuda.cudaGetDevice(device); |
| return device[0]; |
| } |
| |
| /** |
| * Print information of memory usage. |
| * |
| * @param opcode opcode of caller |
| */ |
| public void printMemoryInfo(String opcode) { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(opcode + ": " + memoryManager.toString()); |
| } |
| } |
| |
| private void initializeCudaLibraryHandles() throws DMLRuntimeException { |
| // We don't need to explicitly delete the handles if we are planning to create them again. |
| // This has a huge performance impact on scripts that has large number of layers (i.e. FunctionCallCP) for example ResNet. |
| // If this is absolutely required for parfor, please add appropriate safeguard for non-parfor scripts. |
| // deleteCudaLibraryHandles(); |
| if (cudnnHandle == null) { |
| cudnnHandle = new cudnnHandle(); |
| cudnnCreate(cudnnHandle); |
| } |
| |
| if (cublasHandle == null) { |
| cublasHandle = new cublasHandle(); |
| cublasCreate(cublasHandle); |
| } |
| // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host |
| // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot. |
| // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); |
| |
| if (cusparseHandle == null) { |
| cusparseHandle = new cusparseHandle(); |
| cusparseCreate(cusparseHandle); |
| } |
| |
| if (kernels == null) { |
| kernels = new JCudaKernels(); |
| } |
| } |
| |
| /** |
| * Returns which device is assigned to this GPUContext instance. |
| * |
| * @return active device assigned to this GPUContext instance |
| */ |
| public int getDeviceNum() { |
| return deviceNum; |
| } |
| |
| /** |
| * Sets the device for the calling thread. |
| * This method must be called after |
| * {@link org.apache.sysds.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)} |
| * If in a multithreaded environment like parfor, this method must be called when in the |
| * appropriate thread. |
| * |
| */ |
| public void initializeThread() { |
| cudaSetDevice(deviceNum); |
| initializeCudaLibraryHandles(); |
| } |
| |
| /** |
| * Invokes memory manager's malloc method |
| * |
| * @param instructionName name of instruction for which to record per instruction performance statistics, null if |
| * you don't want to record |
| * @param size size of data (in bytes) to allocate |
| * @param initialize if cudaMemset() should be called |
| * @return jcuda pointer |
| */ |
| public Pointer allocate(String instructionName, long size, boolean initialize) { |
| return memoryManager.malloc(instructionName, size, initialize); |
| } |
| |
| /** |
| * Default behavior for gpu memory allocation (init to zero) |
| * |
| * @param instructionName Name of the instruction calling allocate |
| * @param size size in bytes |
| * @return jcuda pointer |
| */ |
| public Pointer allocate(String instructionName, long size) { |
| return memoryManager.malloc(instructionName, size, true); |
| } |
| |
| /** |
| * Does cudaFree calls, lazily. |
| * |
| * @param instructionName name of the instruction for which to record per instruction free time, null if you do not |
| * want to record |
| * @param toFree {@link Pointer} instance to be freed |
| * @param eager true if to be done eagerly |
| */ |
| public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) { |
| memoryManager.free(instructionName, toFree, eager); |
| } |
| |
| |
| /** |
| * Gets the available memory on GPU that SystemDS can use. |
| * |
| * @return the available memory in bytes |
| */ |
| public long getAvailableMemory() { |
| return memoryManager.allocator.getAvailableMemory(); |
| } |
| |
| /** |
| * Makes sure that GPU that SystemDS is trying to use has the minimum compute capability needed. |
| */ |
| public void ensureComputeCapability() { |
| int[] devices = { -1 }; |
| cudaGetDeviceCount(devices); |
| if (devices[0] == -1) { |
| throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices"); |
| } |
| boolean isComputeCapable = true; |
| for (int i = 0; i < devices[0]; i++) { |
| cudaDeviceProp properties = GPUContextPool.getGPUProperties(i); |
| int major = properties.major; |
| int minor = properties.minor; |
| if (major < MAJOR_REQUIRED) { |
| isComputeCapable = false; |
| } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) { |
| isComputeCapable = false; |
| } |
| } |
| if (!isComputeCapable) { |
| throw new DMLRuntimeException( |
| "One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." |
| + MINOR_REQUIRED); |
| } |
| } |
| |
| /** |
| * Instantiates a new {@link GPUObject} initialized with the given {@link org.apache.sysds.runtime.controlprogram.caching.MatrixObject MatrixObject}. |
| * |
| * @param mo a {@link org.apache.sysds.runtime.controlprogram.caching.MatrixObject MatrixObject} that represents a matrix |
| * @return a new {@link GPUObject} instance |
| */ |
| public GPUObject createGPUObject(MatrixObject mo) { |
| GPUObject ret = new GPUObject(this, mo); |
| getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret); |
| return ret; |
| } |
| |
| /** |
| * Shallow copy the given source {@link GPUObject} to a new {@link GPUObject} and |
| * assign that to the given {@link MatrixObject}. |
| * This copy doesn't memcopy the device memory. |
| * |
| * @param source a {@link GPUObject} which is the source of the copy |
| * @param mo a {@link MatrixObject} to associate with the new {@link GPUObject} |
| * @return a new {@link GPUObject} instance |
| */ |
| public GPUObject shallowCopyGPUObject(GPUObject source, MatrixObject mo) { |
| GPUObject ret = new GPUObject(this, source, mo); |
| getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret); |
| |
| return ret; |
| } |
| |
| /** |
| * Gets the device properties for the active GPU (set with cudaSetDevice()). |
| * |
| * @return the device properties |
| */ |
| public cudaDeviceProp getGPUProperties() { |
| return GPUContextPool.getGPUProperties(deviceNum); |
| } |
| |
| /** |
| * Gets the maximum number of threads per block for "active" GPU. |
| * |
| * @return the maximum number of threads per block |
| */ |
| public int getMaxThreadsPerBlock() { |
| cudaDeviceProp deviceProps = getGPUProperties(); |
| return deviceProps.maxThreadsPerBlock; |
| } |
| |
| /** |
| * Gets the maximum number of blocks supported by the active cuda device. |
| * |
| * @return the maximum number of blocks supported |
| */ |
| public int getMaxBlocks() { |
| cudaDeviceProp deviceProp = getGPUProperties(); |
| return deviceProp.maxGridSize[0]; |
| } |
| |
| /** |
| * Gets the shared memory per block supported by the active cuda device. |
| * |
| * @return the shared memory per block |
| */ |
| public long getMaxSharedMemory() { |
| cudaDeviceProp deviceProp = getGPUProperties(); |
| return deviceProp.sharedMemPerBlock; |
| } |
| |
| /** |
| * Gets the warp size supported by the active cuda device. |
| * |
| * @return the warp size |
| */ |
| public int getWarpSize() { |
| cudaDeviceProp deviceProp = getGPUProperties(); |
| return deviceProp.warpSize; |
| } |
| |
| /** |
| * Returns the cudnnHandle for Deep Neural Network operations on the GPU. |
| * |
| * @return cudnnHandle for current thread |
| */ |
| public cudnnHandle getCudnnHandle() { |
| return cudnnHandle; |
| } |
| |
| /** |
| * Returns cublasHandle for BLAS operations on the GPU. |
| * |
| * @return cublasHandle for current thread |
| */ |
| public cublasHandle getCublasHandle() { |
| return cublasHandle; |
| } |
| |
| /** |
| * Returns cusparseHandle for certain sparse BLAS operations on the GPU. |
| * |
| * @return cusparseHandle for current thread |
| */ |
| public cusparseHandle getCusparseHandle() { |
| return cusparseHandle; |
| } |
| |
| /** |
| * Returns cusolverDnHandle for invoking solve() function on dense matrices on the GPU. |
| * |
| * @return cusolverDnHandle for current thread |
| */ |
| public cusolverDnHandle getCusolverDnHandle() { |
| if (cusolverDnHandle == null) { |
| synchronized(this) { |
| if (cusolverDnHandle == null) { |
| // Since cusolverDnHandle handle is rarely used and occupies unnecessary memory, it is only initialized when needed. |
| cusolverDnHandle = new cusolverDnHandle(); |
| cusolverDnCreate(cusolverDnHandle); |
| } |
| } |
| } |
| return cusolverDnHandle; |
| } |
| |
| /** |
| * Returns utility class used to launch custom CUDA kernel, specific to the active GPU for this GPUContext. |
| * |
| * @return {@link JCudaKernels} for current thread |
| */ |
| public JCudaKernels getKernels() { |
| return kernels; |
| } |
| |
| /** |
| * Destroys this GPUContext object. |
| * |
| */ |
| public void destroy() { |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("GPU : this context was destroyed, this = " + this); |
| } |
| clearMemory(); |
| |
| deleteCudaLibraryHandles(); |
| } |
| |
| /** |
| * Deletes CUDA library handles |
| */ |
| private void deleteCudaLibraryHandles() { |
| if (cudnnHandle != null) |
| cudnnDestroy(cudnnHandle); |
| |
| if (cublasHandle != null) |
| cublasDestroy(cublasHandle); |
| |
| if (cusparseHandle != null) |
| cusparseDestroy(cusparseHandle); |
| |
| if (cusolverDnHandle != null) |
| cusolverDnDestroy(cusolverDnHandle); |
| |
| cudnnHandle = null; |
| cublasHandle = null; |
| cusparseHandle = null; |
| cusolverDnHandle = null; |
| } |
| |
| /** |
| * Clears all memory used by this {@link GPUContext}. |
| * Be careful to ensure that no memory is currently being used in the temporary memory before invoking this. |
| * If memory is being used between MLContext invocations, they are pointed to by a {@link GPUObject} instance |
| * which would be part of the {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance will |
| * cause the memory associated with that block on the GPU to be freed up. |
| */ |
| public void clearMemory() { |
| memoryManager.clearMemory(); |
| } |
| |
| public void clearTemporaryMemory() { |
| memoryManager.clearTemporaryMemory(); |
| } |
| |
| @Override |
| public String toString() { |
| return "GPUContext{" + "deviceNum=" + deviceNum + '}'; |
| } |
| } |