| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.instructions.gpu.context; |
| |
| import jcuda.Pointer; |
| import jcuda.jcusparse.cusparseDirection; |
| import jcuda.jcusparse.cusparseHandle; |
| import jcuda.jcusparse.cusparseMatDescr; |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.sysds.api.DMLScript; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.controlprogram.caching.MatrixObject; |
| import org.apache.sysds.runtime.data.SparseBlock; |
| import org.apache.sysds.runtime.data.SparseBlockCOO; |
| import org.apache.sysds.runtime.data.SparseBlockCSR; |
| import org.apache.sysds.runtime.data.SparseBlockMCSR; |
| import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA; |
| import org.apache.sysds.runtime.matrix.data.MatrixBlock; |
| import org.apache.sysds.utils.GPUStatistics; |
| |
| import java.util.concurrent.atomic.AtomicLong; |
| import java.util.concurrent.atomic.LongAdder; |
| |
| import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T; |
| import static jcuda.runtime.JCuda.cudaMemcpy; |
| import static jcuda.runtime.JCuda.cudaMemset; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; |
| |
| /** |
| * Handle to a matrix block on the GPU |
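| * <p> |
| * A GPUObject holds at most one device representation at a time: a dense |
| * pointer ({@code jcudaDenseMatrixPtr}) or a CSR sparse pointer |
| * ({@code jcudaSparseMatrixPtr}). A minimal, non-normative usage sketch |
| * (assumes a valid {@link GPUContext} {@code gCtx} and a {@link MatrixObject} |
| * {@code mo}; the accessor shown is illustrative): |
| * <pre>{@code |
| * GPUObject gObj = mo.getGPUObject(gCtx); // handle for the matrix block |
| * gObj.acquireDeviceRead("op"); // copies host data to device if needed |
| * Pointer d = gObj.getDensePointer(); // device pointer for kernel calls |
| * gObj.releaseInput(); // release the read lock |
| * gObj.acquireHostRead("op"); // copy back to host if dirty |
| * }</pre> |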
| */ |
| public class GPUObject { |
| |
| private static final Log LOG = LogFactory.getLog(GPUObject.class.getName()); |
| |
| /** |
| * GPUContext that owns this GPUObject |
| */ |
| private final GPUContext gpuContext; |
| |
| /** |
| * Pointer to the underlying dense matrix block on GPU |
| */ |
| Pointer jcudaDenseMatrixPtr = null; |
| |
| /** |
| * Pointer to the underlying sparse matrix block on GPU |
| */ |
| private CSRPointer jcudaSparseMatrixPtr = null; |
| |
| /** |
| * Whether the block attached to this {@link GPUObject} is dirty on the device and needs to be copied back to the host |
| */ |
| protected boolean dirty = false; |
| |
| /** |
| * Number of read locks on this object (i.e., this GPUObject is being read by a current instruction) |
| */ |
| protected LongAdder readLocks = new LongAdder(); |
| |
| /** |
| * Whether a write lock is held on this object (i.e., this GPUObject is being written by a current instruction) |
| */ |
| protected boolean writeLock = false; |
| |
| /** |
| * Timestamp, needed by {@link GPUContext#evict(long)} |
| */ |
| AtomicLong timestamp = new AtomicLong(); |
| |
| /** |
| * Whether this block is in sparse format |
| */ |
| protected boolean isSparse = false; |
| |
| /** |
| * Enclosing {@link MatrixObject} instance |
| */ |
| MatrixObject mat = null; |
| |
| /** |
| * Shadow buffer instance |
| */ |
| final ShadowBuffer shadowBuffer; |
| |
| // ---------------------------------------------------------------------- |
| // Methods used to access, set and check jcudaDenseMatrixPtr |
| |
| /** |
| * Pointer to dense matrix |
| * |
| * @return a pointer to the dense matrix |
| */ |
| public Pointer getDensePointer() { |
| if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) { |
| shadowBuffer.moveToDevice(); |
| } |
| return jcudaDenseMatrixPtr; |
| } |
| |
| /** |
| * Checks if the dense pointer is null |
| * |
| * @return true if the dense pointer is null |
| */ |
| public boolean isDensePointerNull() { |
| return jcudaDenseMatrixPtr == null; |
| } |
| |
| /** |
| * Removes the dense pointer and potential soft reference |
| */ |
| public void clearDensePointer() { |
| jcudaDenseMatrixPtr = null; |
| shadowBuffer.clearShadowPointer(); |
| } |
| |
| public MatrixObject getMatrixObject() { |
| return mat; |
| } |
| |
| /** |
| * Convenience method to directly set the dense matrix pointer on GPU |
| * |
| * @param densePtr dense pointer |
| */ |
| public void setDensePointer(Pointer densePtr) { |
| if (!this.isDensePointerNull()) { |
| throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU"); |
| } |
| this.jcudaDenseMatrixPtr = densePtr; |
| this.isSparse = false; |
| if(LOG.isDebugEnabled()) { |
| LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr)); |
| } |
| if (getJcudaSparseMatrixPtr() != null) { |
| getJcudaSparseMatrixPtr().deallocate(); |
| jcudaSparseMatrixPtr = null; |
| } |
| } |
| |
| public void setDirty(boolean flag) { |
| dirty = flag; |
| } |
| // ---------------------------------------------------------------------- |
| |
| |
| @Override |
| public Object clone() { |
| GPUObject me = this; |
| GPUObject that = new GPUObject(me.gpuContext, me.mat); |
| that.dirty = me.dirty; |
| // The only place clone() is currently called from: LibMatrixCUDA's solve |
| that.readLocks.reset(); |
| that.writeLock = false; |
| |
| that.timestamp = new AtomicLong(me.timestamp.get()); |
| that.isSparse = me.isSparse; |
| |
| try { |
| if (!me.isDensePointerNull()) { |
| long rows = me.mat.getNumRows(); |
| long cols = me.mat.getNumColumns(); |
| long size = rows * cols * LibMatrixCUDA.sizeOfDataType; |
| that.setDensePointer(allocate(size, false)); |
| cudaMemcpy(that.getDensePointer(), me.getDensePointer(), size, cudaMemcpyDeviceToDevice); |
| } |
| |
| if (me.getJcudaSparseMatrixPtr() != null) { |
| long rows = mat.getNumRows(); |
| that.jcudaSparseMatrixPtr = me.jcudaSparseMatrixPtr.clone((int) rows); |
| } |
| |
| } catch (DMLRuntimeException e) { |
| throw new RuntimeException(e); |
| } |
| |
| return that; |
| } |
| |
| private Pointer allocate(long size, boolean initialize) { |
| return getGPUContext().allocate(null, size, initialize); |
| } |
| |
| private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { |
| getGPUContext().cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE); |
| } |
| |
| GPUContext getGPUContext() { return gpuContext; } |
| |
| /** |
| * Transposes a dense matrix on the GPU by calling the cublasDgeam operation |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param densePtr Pointer to dense matrix on the GPU |
| * @param m rows in output matrix |
| * @param n columns in output matrix |
| * @param lda leading dimension of the input matrix |
| * @param ldc leading dimension of the output matrix |
| * @return transposed matrix |
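| * <p> |
| * A minimal sketch of the intended call pattern, mirroring the call in |
| * {@link #denseRowMajorToColumnMajor()}; the caller owns and must free the |
| * returned pointer ({@code aPtr}, {@code m} and {@code n} are assumed here): |
| * <pre>{@code |
| * Pointer t = GPUObject.transpose(gCtx, aPtr, m, n, n, m); // lda=n, ldc=m |
| * // ... use t ... |
| * gCtx.cudaFreeHelper(null, t, DMLScript.EAGER_CUDA_FREE); |
| * }</pre> |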
| */ |
| public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx); |
| } |
| Pointer alpha = LibMatrixCUDA.one(); |
| Pointer beta = LibMatrixCUDA.zero(); |
| Pointer A = densePtr; |
| Pointer C = gCtx.allocate(null, getDatatypeSizeOf((long) m * n), false); |
| |
| // Transpose the matrix to get a dense matrix |
| LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), |
| lda, C, ldc); |
| return C; |
| } |
| |
| |
| /** |
| * Convenience method to convert a column-major dense matrix to a row-major sparse (CSR) matrix on the GPU. |
| * Since the allocated matrix is temporary, bookkeeping is not updated. |
| * Note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT. |
| * Caller is responsible for deallocating memory on GPU. |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param cusparseHandle handle to cusparse library |
| * @param densePtr [in] dense matrix pointer on the GPU in column major |
| * @param rows number of rows |
| * @param cols number of columns |
| * @return CSR (compressed sparse row) pointer |
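| * <p> |
| * A minimal sketch, mirroring the use in {@link #denseToSparse()}; the |
| * column-major input typically comes from a prior transpose: |
| * <pre>{@code |
| * CSRPointer csr = GPUObject.columnMajorDenseToRowMajorSparse( |
| * gCtx, gCtx.getCusparseHandle(), colMajorPtr, rows, cols); |
| * // ... use csr ...; the caller must eventually call csr.deallocate() |
| * }</pre> |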
| */ |
| public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cusparseHandle cusparseHandle, |
| Pointer densePtr, int rows, int cols) { |
| cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor(); |
| Pointer nnzPerRowPtr = null; |
| Pointer nnzTotalDevHostPtr = null; |
| |
| nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows), false); |
| nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1), false); |
| |
| // Output is in dense vector format, convert it to CSR |
| LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, |
| nnzPerRowPtr, nnzTotalDevHostPtr); |
| //cudaDeviceSynchronize(); |
| int[] nnzC = { -1 }; |
| |
| cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); |
| if (nnzC[0] == -1) { |
| throw new DMLRuntimeException( |
| "cusparse<t>nnz did not calculate the correct number of nnz during the dense-to-sparse conversion on the GPU"); |
| } |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0] |
| + ", GPUContext=" + gCtx); |
| } |
| |
| CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows); |
| LibMatrixCUDA.cudaSupportFunctions.cusparsedense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, |
| C.colInd); |
| //cudaDeviceSynchronize(); |
| |
| gCtx.cudaFreeHelper(null, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE); |
| gCtx.cudaFreeHelper(null, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE); |
| |
| return C; |
| } |
| |
| /** |
| * Convenience method to directly examine the Sparse matrix on GPU |
| * |
| * @return CSR (compressed sparse row) pointer |
| */ |
| public CSRPointer getSparseMatrixCudaPointer() { |
| return getJcudaSparseMatrixPtr(); |
| } |
| |
| /** |
| * Convenience method to directly set the sparse matrix on GPU |
| * Needed for operations like {@code cusparseDcsrgemm} |
| * |
| * @param sparseMatrixPtr CSR (compressed sparse row) pointer |
| */ |
| public void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) { |
| if (this.jcudaSparseMatrixPtr != null) { |
| throw new DMLRuntimeException("jcudaSparseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU"); |
| } |
| this.jcudaSparseMatrixPtr = sparseMatrixPtr; |
| this.isSparse = true; |
| if (!isDensePointerNull() && !shadowBuffer.isBuffered()) { |
| cudaFreeHelper(getDensePointer()); |
| clearDensePointer(); |
| } |
| } |
| |
| /** |
| * Converts this GPUObject from dense to sparse format. |
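| * Internally, the dense data is first transposed to column-major via |
| * {@link #denseRowMajorToColumnMajor()} and then converted to CSR via |
| * {@link #columnMajorDenseToRowMajorSparse(GPUContext, cusparseHandle, Pointer, int, int)}; |
| * the old dense pointer is freed once the CSR pointer is set. |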
| */ |
| public void denseToSparse() { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| long t0 = 0; |
| if (DMLScript.STATISTICS) |
| t0 = System.nanoTime(); |
| cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle(); |
| if (cusparseHandle == null) |
| throw new DMLRuntimeException("Expected cusparse to be initialized"); |
| int rows = toIntExact(mat.getNumRows()); |
| int cols = toIntExact(mat.getNumColumns()); |
| |
| if ((isDensePointerNull() && !shadowBuffer.isBuffered()) || !isAllocated()) |
| throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call"); |
| |
| denseRowMajorToColumnMajor(); |
| setSparseMatrixCudaPointer( |
| columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getDensePointer(), rows, |
| cols)); |
| // TODO: What if mat.getNnz() is -1 ? |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaDenseToSparseCount.add(1); |
| } |
| |
| /** |
| * Convenience method. Converts Row Major Dense Matrix to Column Major Dense Matrix |
| */ |
| public void denseRowMajorToColumnMajor() { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| int m = toIntExact(mat.getNumRows()); |
| int n = toIntExact(mat.getNumColumns()); |
| int lda = n; |
| int ldc = m; |
| if (!isAllocated()) { |
| throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated"); |
| } |
| |
| Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc); |
| cudaFreeHelper(getDensePointer()); |
| clearDensePointer(); |
| setDensePointer(tmp); |
| } |
| |
| /** |
| * Convenience method. Converts Column Major Dense Matrix to Row Major Dense Matrix |
| */ |
| public void denseColumnMajorToRowMajor() { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| |
| int n = toIntExact(mat.getNumRows()); |
| int m = toIntExact(mat.getNumColumns()); |
| int lda = n; |
| int ldc = m; |
| if (!isAllocated()) { |
| throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated"); |
| } |
| |
| Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc); |
| cudaFreeHelper(getDensePointer()); |
| clearDensePointer(); |
| setDensePointer(tmp); |
| } |
| |
| /** |
| * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format) |
| */ |
| public void sparseToDense() { |
| sparseToDense(null); |
| } |
| |
| /** |
| * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format) |
| * Also records per-instruction invocations of sparseToDense. |
| * |
| * @param instructionName Name of the instruction for which statistics are recorded in {@link GPUStatistics} |
| */ |
| public void sparseToDense(String instructionName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| long start = 0, end = 0; |
| if (DMLScript.STATISTICS) |
| start = System.nanoTime(); |
| if (getJcudaSparseMatrixPtr() == null || !isAllocated()) |
| throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); |
| |
| sparseToColumnMajorDense(); |
| denseColumnMajorToRowMajor(); |
| if (DMLScript.STATISTICS) |
| end = System.nanoTime(); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaSparseToDenseTime.add(end - start); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaSparseToDenseCount.add(1); |
| } |
| |
| /** |
| * More efficient method to convert sparse to dense but returns dense in column major format |
| */ |
| public void sparseToColumnMajorDense() { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| if (getJcudaSparseMatrixPtr() == null || !isAllocated()) |
| throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); |
| |
| cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle(); |
| if (cusparseHandle == null) |
| throw new DMLRuntimeException("Expected cusparse to be initialized"); |
| int rows = toIntExact(mat.getNumRows()); |
| int cols = toIntExact(mat.getNumColumns()); |
| setDensePointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null)); |
| } |
| |
| /** |
| * Initializes this GPUObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param mat2 the matrix object that owns this {@link GPUObject} |
| */ |
| GPUObject(GPUContext gCtx, MatrixObject mat2) { |
| gpuContext = gCtx; |
| this.mat = mat2; |
| this.shadowBuffer = new ShadowBuffer(this); |
| } |
| |
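| /** |
| * Creates a GPUObject wrapping an already-allocated dense device pointer. |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param mat the enclosing {@link MatrixObject} |
| * @param ptr pointer to an allocated dense matrix on the device |
| */ |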
| public GPUObject(GPUContext gCtx, MatrixObject mat, Pointer ptr) { |
| gpuContext = gCtx; |
| this.mat = mat; |
| setDensePointer(ptr); |
| isSparse = false; |
| this.shadowBuffer = new ShadowBuffer(this); |
| } |
| |
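| /** |
| * Shallow-copy constructor: shares the device pointers of {@code that} |
| * (no device-side copy is performed) and attaches them to a new enclosing |
| * {@link MatrixObject}. |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param that the GPUObject whose device data is shared |
| * @param mat the enclosing {@link MatrixObject} |
| */ |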
| public GPUObject(GPUContext gCtx, GPUObject that, MatrixObject mat) { |
| dirty = that.dirty; |
| readLocks.reset(); |
| writeLock = false; |
| timestamp = new AtomicLong(that.timestamp.get()); |
| isSparse = that.isSparse; |
| if (!that.isDensePointerNull()) |
| setDensePointer(that.getDensePointer()); |
| if (that.getJcudaSparseMatrixPtr() != null) |
| setSparseMatrixCudaPointer(that.getSparseMatrixCudaPointer()); |
| gpuContext = gCtx; |
| this.mat = mat; |
| shadowBuffer = new ShadowBuffer(this); |
| } |
| |
| public boolean isSparse() { |
| return isSparse; |
| } |
| |
| private static long getDatatypeSizeOf(long numElems) { |
| return numElems * LibMatrixCUDA.sizeOfDataType; |
| } |
| |
| private static long getIntSizeOf(long numElems) { |
| return numElems * jcuda.Sizeof.INT; |
| } |
| |
| public boolean isAllocated() { |
| boolean eitherAllocated = shadowBuffer.isBuffered() || !isDensePointerNull() || getJcudaSparseMatrixPtr() != null; |
| return eitherAllocated; |
| } |
| |
| /** |
| * Allocates a sparse and empty {@link GPUObject}. |
| * This is used for operation results that are known to contain no non-zero values. |
| */ |
| public void allocateSparseAndEmpty() { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows())); |
| } |
| |
| /** |
| * Allocates a dense matrix of size obtained from the attached matrix metadata |
| * and fills it up with a single value |
| * |
| * @param v value to fill up the dense matrix |
| */ |
| public void allocateAndFillDense(double v) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| long rows = mat.getNumRows(); |
| long cols = mat.getNumColumns(); |
| int numElems = toIntExact(rows * cols); |
| long size = getDatatypeSizeOf(numElems); |
| setDensePointer(allocate(size,false)); |
| getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations |
| (numElems), getDensePointer(), v, numElems); |
| } |
| |
| /** |
| * Returns whether this {@link GPUObject} is sparse and empty. |
| * Being allocated is a prerequisite to being sparse and empty. |
| * |
| * @return true if sparse and empty |
| */ |
| public boolean isSparseAndEmpty() { |
| boolean isSparseAndAllocated = isAllocated() && LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat); |
| boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0; |
| return isEmptyAndSparseAndAllocated; |
| } |
| |
| |
| /** |
| * Returns the number of non-zero values of this GPU object. |
| * Being allocated is a prerequisite for computing nnz. |
| * Note: if the matrix is in dense format, it explicitly re-computes the number of nonzeros. |
| * |
| * @param instName instruction name |
| * @param recomputeDenseNNZ recompute NNZ if dense |
| * @return the number of nonzeroes |
| */ |
| public long getNnz(String instName, boolean recomputeDenseNNZ) { |
| if(isAllocated()) { |
| if(LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) { |
| return getJcudaSparseMatrixPtr().nnz; |
| } |
| else { |
| if(!recomputeDenseNNZ) |
| return -1; |
| |
| GPUContext gCtx = getGPUContext(); |
| cusparseHandle cusparseHandle = gCtx.getCusparseHandle(); |
| cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor(); |
| if (cusparseHandle == null) |
| throw new DMLRuntimeException("Expected cusparse to be initialized"); |
| int rows = toIntExact(mat.getNumRows()); |
| int cols = toIntExact(mat.getNumColumns()); |
| Pointer nnzPerRowPtr = null; |
| Pointer nnzTotalDevHostPtr = null; |
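| // Compute the per-row nnz counts and the total nnz on the device via |
| // cusparse<t>nnz, then copy the single total count back to the host. |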
| nnzPerRowPtr = gCtx.allocate(instName, getIntSizeOf(rows), false); |
| nnzTotalDevHostPtr = gCtx.allocate(instName, getIntSizeOf(1), false); |
| LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getDensePointer(), rows, |
| nnzPerRowPtr, nnzTotalDevHostPtr); |
| int[] nnzC = { -1 }; |
| cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); |
| if (nnzC[0] == -1) { |
| throw new DMLRuntimeException( |
| "cusparseDnnz did not calculate the correct number of nnz on the GPU"); |
| } |
| gCtx.cudaFreeHelper(instName, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE); |
| gCtx.cudaFreeHelper(instName, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE); |
| return nnzC[0]; |
| } |
| } |
| else |
| throw new DMLRuntimeException("Expected the GPU object to be allocated"); |
| } |
| |
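| /** |
| * Acquires a read lock on this object, first transferring the data from |
| * host to device if it is not already allocated there. |
| * |
| * @param opcode opcode of the instruction performing the read |
| * @return true if the data was transferred from host to device |
| */ |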
| public boolean acquireDeviceRead(String opcode) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : acquireDeviceRead on " + this); |
| } |
| boolean transferred = false; |
| if (!isAllocated()) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" |
| + getGPUContext()); |
| } |
| copyFromHostToDevice(opcode); |
| transferred = true; |
| } |
| addReadLock(); |
| if (!isAllocated()) |
| throw new DMLRuntimeException("Expected device data to be allocated"); |
| return transferred; |
| } |
| |
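| /** |
| * Acquires this object for dense modification on the device, allocating |
| * (and, in this overload, initializing) a dense block if none exists, and |
| * marks the device copy dirty. |
| * |
| * @return true if a new dense block was allocated |
| */ |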
| public boolean acquireDeviceModifyDense() { |
| return acquireDeviceModifyDense(true); |
| } |
| |
| public boolean acquireDeviceModifyDense(boolean initialize) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| boolean allocated = false; |
| if (!isAllocated()) { |
| mat.setDirty(true); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this); |
| } |
| // Dense block, size = numRows * numCols |
| allocateDenseMatrixOnDevice(initialize); |
| allocated = true; |
| } |
| dirty = true; |
| if (!isAllocated()) |
| throw new DMLRuntimeException("Expected device data to be allocated"); |
| return allocated; |
| } |
| |
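| /** |
| * Sparse counterpart of {@link #acquireDeviceModifyDense()}: allocates an |
| * empty CSR block on the device if none exists and marks the device copy |
| * dirty. |
| * |
| * @return true if a new sparse block was allocated |
| */ |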
| public boolean acquireDeviceModifySparse() { |
| return acquireDeviceModifySparse(true); |
| } |
| |
| public boolean acquireDeviceModifySparse(boolean initialize) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| boolean allocated = false; |
| isSparse = true; |
| if (!isAllocated()) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this); |
| } |
| mat.setDirty(true); |
| allocateSparseMatrixOnDevice(initialize); |
| allocated = true; |
| } |
| dirty = true; |
| if (!isAllocated()) |
| throw new DMLRuntimeException("Expected device data to be allocated"); |
| return allocated; |
| } |
| |
| /** |
| * If the data is allocated on the GPU and is dirty, it is copied back to the host memory |
| * |
| * @param instName name of the instruction |
| * @return true if a copy to host happened, false otherwise |
| */ |
| public boolean acquireHostRead(String instName) { |
| boolean copied = false; |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| if (isAllocated() && dirty) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" |
| + getGPUContext()); |
| } |
| // TODO: Future optimization: |
| // For now, we are deleting the device data when copied from device to host. |
| // This can be optimized later by treating acquiredModify+release as a new state |
| copyFromDeviceToHost(instName, false, true); |
| copied = true; |
| } |
| return copied; |
| } |
| |
| public boolean isLocked() { |
| return writeLock || readLocks.longValue() > 0; |
| } |
| |
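| /** |
| * Adds a read lock; any number of readers may hold the lock, but not while |
| * the write lock is held. |
| */ |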
| public void addReadLock() { |
| if(writeLock) |
| throw new DMLRuntimeException("Attempting to add a read lock when writeLock="+ writeLock); |
| else |
| readLocks.increment(); |
| } |
| |
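| /** |
| * Adds the (single) write lock; fails if any read lock or the write lock is |
| * already held. |
| */ |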
| public void addWriteLock() { |
| if(readLocks.longValue() > 0) |
| throw new DMLRuntimeException("Attempting to add a write lock when readLocks="+ readLocks.longValue()); |
| else if(writeLock) |
| throw new DMLRuntimeException("Attempting to add a write lock when writeLock="+ writeLock); |
| else |
| writeLock = true; |
| } |
| |
| public void releaseReadLock() { |
| readLocks.decrement(); |
| if(readLocks.longValue() < 0) |
| throw new DMLRuntimeException("Attempting to release a read lock when readLocks="+ readLocks.longValue()); |
| } |
| |
| public void releaseWriteLock() { |
| if(writeLock) |
| writeLock = false; |
| else |
| throw new DMLRuntimeException("Internal state error : Attempting to release write lock on a GPUObject, which was already released"); |
| } |
| |
| public void resetReadWriteLock() { |
| readLocks.reset(); |
| writeLock = false; |
| } |
| |
| /** |
| * Updates the timestamp used by the eviction policy whenever a lock is released |
| */ |
| private void updateReleaseLocks() { |
| timestamp.set(System.nanoTime()); |
| } |
| |
| /** |
| * Releases input allocated on GPU |
| */ |
| public void releaseInput() { |
| releaseReadLock(); |
| updateReleaseLocks(); |
| if (!isAllocated()) |
| throw new DMLRuntimeException("Attempting to release an input before allocating it"); |
| } |
| |
| /** |
| * releases output allocated on GPU |
| */ |
| public void releaseOutput() { |
| releaseWriteLock(); |
| updateReleaseLocks(); |
| // Currently, there is no convenient way to acquireDeviceModify independently of dense/sparse format. |
| // Hence, allowing resetting releaseOutput again. |
| // Ideally, we would want to throw CacheException("Attempting to release an output that was not acquired via acquireDeviceModify") if !isDirty() |
| dirty = true; |
| if (!isAllocated()) |
| throw new DMLRuntimeException("Attempting to release an output before allocating it"); |
| } |
| |
| void allocateDenseMatrixOnDevice(boolean initialize) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| if(isAllocated()) |
| throw new DMLRuntimeException("Internal error - trying to allocated dense matrix to a GPUObject that is already allocated"); |
| long rows = mat.getNumRows(); |
| long cols = mat.getNumColumns(); |
| if(rows <= 0) |
| throw new DMLRuntimeException("Internal error - invalid number of rows when allocating dense matrix:" + rows); |
| if(cols <= 0) |
| throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix:" + cols); |
| long size = getDatatypeSizeOf(rows * cols); |
| Pointer tmp = allocate(size, initialize); |
| setDensePointer(tmp); |
| } |
| |
| void allocateSparseMatrixOnDevice(boolean initialize) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| if(isAllocated()) |
| throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated"); |
| long rows = mat.getNumRows(); |
| long nnz = mat.getNnz(); |
| if(rows <= 0) |
| throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix"); |
| if(nnz < 0) |
| throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix"); |
| CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows, initialize); |
| setSparseMatrixCudaPointer(tmp); |
| } |
| |
| void allocateSparseMatrixOnDevice(long numVals, boolean initialize) { |
| // This method is called when #values > nnz |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| if(isAllocated()) |
| throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated"); |
| long rows = mat.getNumRows(); |
| long nnz = mat.getNnz(); |
| if(rows <= 0) |
| throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix"); |
| if(nnz < 0) |
| throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix"); |
| CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), numVals, rows, initialize); |
| setSparseMatrixCudaPointer(tmp); |
| } |
| |
| // Method to find the estimated size of this GPU Object in the device |
| public long getSizeOnDevice() { |
| long GPUSize = 0; |
| long rlen = mat.getNumRows(); |
| long clen = mat.getNumColumns(); |
| long nnz = mat.getNnz(); |
| |
| if (LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) { |
| GPUSize = CSRPointer.estimateSize(nnz, rlen); |
| } else { |
| GPUSize = getDatatypeSizeOf(rlen * clen); |
| } |
| return GPUSize; |
| } |
| |
| // Method to find the allocated size of this GPU Object in the device |
| public long getAllocatedSize() { |
| return gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(getDensePointer()); |
| } |
| |
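| /** |
| * Copies the matrix block from host to device, converting MCSR/COO sparse |
| * blocks to CSR on the host first (CSR is the device-side sparse format). |
| * |
| * @param opcode opcode of the instruction triggering the transfer |
| */ |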
| void copyFromHostToDevice(String opcode) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| long start = 0; |
| if (DMLScript.STATISTICS) |
| start = System.nanoTime(); |
| |
| MatrixBlock tmp = mat.acquireRead(); |
| |
| if (tmp.isInSparseFormat()) { |
| int[] rowPtr = null; |
| int[] colInd = null; |
| double[] values = null; |
| |
| // Only recompute non-zeros if unknown, else this will incur a huge penalty |
| if(tmp.getNonZeros() < 0) { |
| tmp.recomputeNonZeros(); |
| } |
| long nnz = tmp.getNonZeros(); |
| mat.getDataCharacteristics().setNonZeros(nnz); |
| |
| SparseBlock block = tmp.getSparseBlock(); |
| boolean copyToDevice = true; |
| if (block == null && tmp.getNonZeros() == 0) { |
| copyToDevice = false; |
| } else if (block == null && tmp.getNonZeros() != 0) { |
| throw new DMLRuntimeException("Expected CP sparse block to be not null."); |
| } |
| else { |
| // CSR is the preferred format for cuSparse GEMM |
| // Converts MCSR and COO to CSR |
| SparseBlockCSR csrBlock = null; |
| long t0 = 0; |
| if (block instanceof SparseBlockCSR) { |
| csrBlock = (SparseBlockCSR) block; |
| } else if (block instanceof SparseBlockCOO) { |
| // TODO - should we do this on the GPU using cusparse<t>coo2csr() ? |
| if (DMLScript.STATISTICS) |
| t0 = System.nanoTime(); |
| SparseBlockCOO cooBlock = (SparseBlockCOO) block; |
| csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), |
| cooBlock.indexes(), cooBlock.values()); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaSparseConversionCount.increment(); |
| } else if (block instanceof SparseBlockMCSR) { |
| if (DMLScript.STATISTICS) |
| t0 = System.nanoTime(); |
| SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block; |
| csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size())); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaSparseConversionCount.increment(); |
| } else { |
| throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations"); |
| } |
| rowPtr = csrBlock.rowPointers(); |
| colInd = csrBlock.indexes(); |
| values = csrBlock.values(); |
| } |
| |
| if (values != null && values.length > tmp.getNonZeros()) |
| allocateSparseMatrixOnDevice(values.length, false); |
| else |
| allocateSparseMatrixOnDevice(false); |
| |
| if (copyToDevice) { |
| CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(), |
| tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaToDevCount.add(3); |
| } |
| } else { |
| double[] data = tmp.getDenseBlockValues(); |
| |
| if (data == null && tmp.getSparseBlock() != null) |
| throw new DMLRuntimeException("Incorrect sparsity calculation"); |
| else if (data == null && tmp.getNonZeros() != 0) |
| throw new DMLRuntimeException("MatrixBlock is not allocated"); |
| |
| allocateDenseMatrixOnDevice(false); |
| |
| if (tmp.getNonZeros() == 0) { |
| // Minor optimization: no need to allocate an empty array on the CPU |
| // data = new double[tmp.getNumRows() * tmp.getNumColumns()]; |
| cudaMemset(getDensePointer(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns())); |
| } |
| else { |
| // Copy dense block |
| // H2D statistics now only measure the time taken to do the actual copy |
| LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getDensePointer(), opcode); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaToDevCount.add(1); |
| } |
| } |
| |
| mat.release(); |
| |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaToDevTime.add(System.nanoTime() - start); |
| } |
| |
| public static int toIntExact(long l) { |
| if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { |
| throw new DMLRuntimeException("Cannot be cast to int:" + l); |
| } |
| return (int) l; |
| } |
| |
| |
| /** |
| * Copies the data from device to host. |
| * Currently, eagerDelete and isEviction are provided separately for finer control in different scenarios. |
| * In the future, eagerDelete could be forced to true if isEviction is true, and to false otherwise. |
| * |
| * @param instName opcode of the instruction for fine-grained statistics |
| * @param isEviction is called for eviction |
| * @param eagerDelete whether to perform eager deletion of the device data. |
| * @throws DMLRuntimeException if error occurs |
| */ |
| protected void copyFromDeviceToHost(String instName, boolean isEviction, boolean eagerDelete) throws DMLRuntimeException { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| if(shadowBuffer.isBuffered()) { |
| if(isEviction) { |
| // If already copied to shadow buffer as part of previous eviction, do nothing. |
| return; |
| } |
| else { |
| // If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction), |
| // then copy from shadow buffer to MatrixObject. |
| shadowBuffer.moveToHost(); |
| return; |
| } |
| } |
| else if(shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) { |
| // Perform shadow buffering if (1) single precision, (2) during eviction, (3) eagerDelete is true |
| // (4) for dense matrices, and (5) if the given matrix can fit into the shadow buffer. |
| shadowBuffer.moveFromDevice(instName); |
| return; |
| } |
| else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) { |
| throw new DMLRuntimeException( |
| "Cannot copy from device to host as JCuda dense/sparse pointer is not allocated"); |
| } |
| else if (!isDensePointerNull() && getJcudaSparseMatrixPtr() != null) { |
| throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated"); |
| } |
| else if(getJcudaSparseMatrixPtr() != null && !LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) { |
| throw new DMLRuntimeException( |
| "Block not in sparse format on host yet the device sparse matrix pointer is not null"); |
| } |
| else if(getJcudaSparseMatrixPtr() != null && isSparseAndEmpty()) { |
| mat.acquireModify(new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0L)); // empty block |
| mat.release(); |
| return; |
| } |
| boolean sparse = isDensePointerNull(); |
| MatrixBlock tmp = null; |
| long start = DMLScript.STATISTICS ? System.nanoTime() : 0; |
| if (!isDensePointerNull()) { |
| tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); |
| tmp.allocateDenseBlock(); |
| LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), |
| getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction); |
| if(eagerDelete) |
| clearData(instName, true); |
| tmp.recomputeNonZeros(); |
| } else { |
| int rows = toIntExact(mat.getNumRows()); |
| int cols = toIntExact(mat.getNumColumns()); |
| int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz); |
| double[] values = new double[nnz]; |
| LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName, isEviction); |
| int[] rowPtr = new int[rows + 1]; |
| int[] colInd = new int[nnz]; |
| CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd); |
| if(eagerDelete) |
| clearData(instName, true); |
| SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz); |
| tmp = new MatrixBlock(rows, cols, nnz, sparseBlock); |
| } |
| mat.acquireModify(tmp); |
| mat.release(); |
| if (DMLScript.STATISTICS && !isEviction) { |
| // Eviction time measure in malloc |
| long totalTime = System.nanoTime() - start; |
| int count = sparse ? 3 : 1; |
| GPUStatistics.cudaFromDevTime.add(totalTime); |
| GPUStatistics.cudaFromDevCount.add(count); |
| } |
| dirty = false; |
| } |
| |
| /** |
| * Clears the data associated with this {@link GPUObject} instance |
| * |
| * @param opcode opcode of the instruction |
| * @param eager whether the memory is freed eagerly (synchronously) or lazily |
| * @throws DMLRuntimeException if error occurs |
| */ |
| synchronized public void clearData(String opcode, boolean eager) throws DMLRuntimeException { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext()); |
| } |
| if (!isDensePointerNull()) { |
| getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager); |
| } |
| if (getJcudaSparseMatrixPtr() != null) { |
| getJcudaSparseMatrixPtr().deallocate(eager); |
| } |
| clearDensePointer(); |
| shadowBuffer.clearShadowPointer(); |
| jcudaSparseMatrixPtr = null; |
| resetReadWriteLock(); |
| getGPUContext().getMemoryManager().removeGPUObject(this); |
| } |
| |
| public void clearGPUObject() { |
| if(LOG.isTraceEnabled()) |
| LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext()); |
| |
| clearDensePointer(); |
| shadowBuffer.clearShadowPointer(); |
| jcudaSparseMatrixPtr = null; |
| resetReadWriteLock(); |
| getGPUContext().getMemoryManager().removeGPUObject(this); |
| } |
| |
| /** |
| * Pointer to the sparse CSR matrix on the device |
| * |
| * @return the CSR (compressed sparse row) pointer, or null if not allocated in sparse format |
| */ |
| public CSRPointer getJcudaSparseMatrixPtr() { |
| return jcudaSparseMatrixPtr; |
| } |
| |
| /** |
| * Whether this block is dirty on the GPU |
| * |
| * @return true if the device copy has been modified and not yet copied back to the host |
| */ |
| public boolean isDirty() { |
| return dirty; |
| } |
| |
| @Override |
| public String toString() { |
| final StringBuilder sb = new StringBuilder("GPUObject{"); |
| sb.append("dirty=").append(dirty); |
| sb.append(", readLocks=").append(readLocks.longValue()); |
| sb.append(", writeLock=").append(writeLock); |
| sb.append(", sparse? ").append(isSparse); |
| sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]"); |
| if(!isDensePointerNull()) |
| sb.append(", densePtr=").append(getDensePointer()); |
| if(jcudaSparseMatrixPtr != null) |
| sb.append(", sparsePtr=").append(jcudaSparseMatrixPtr); |
| sb.append('}'); |
| return sb.toString(); |
| } |
| |
| private static long getPointerAddressInternal(Pointer p) { |
| // WORKAROUND until a method like CUdeviceptr#getAddress exists in jCuda |
| class PointerWithAddress extends Pointer |
| { |
| PointerWithAddress(Pointer other) |
| { |
| super(other); |
| } |
| long getAddress() |
| { |
| return getNativePointer() + getByteOffset(); |
| } |
| } |
| return new PointerWithAddress(p).getAddress(); |
| } |
| |
| public long getDensePointerAddress() { |
| return getPointerAddressInternal(getDensePointer()); |
| } |
| |
| public static long getPointerAddress(Pointer p) { |
| return (p == null) ? 0 : getPointerAddressInternal(p); |
| } |
| } |