| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.instructions.gpu.context; |
| |
| import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr; |
| import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase; |
| import static jcuda.jcusparse.JCusparse.cusparseSetMatType; |
| import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode; |
| import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz; |
| import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz; |
| import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO; |
| import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL; |
| import static jcuda.runtime.JCuda.cudaMemcpy; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.sysds.api.DMLScript; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA; |
| import org.apache.sysds.utils.GPUStatistics; |
| import org.apache.sysds.utils.Statistics; |
| |
| import jcuda.Pointer; |
| import jcuda.Sizeof; |
| import jcuda.jcublas.cublasHandle; |
| import jcuda.jcusparse.cusparseHandle; |
| import jcuda.jcusparse.cusparseMatDescr; |
| import jcuda.jcusparse.cusparsePointerMode; |
| |
| /** |
| * Compressed Sparse Row (CSR) format for CUDA |
| * Generalized matrix multiply, among other operations, is implemented for the CSR format by the cuSparse library |
| * |
| * Since we assume that the matrix is stored with zero-based indexing (i.e. CUSPARSE_INDEX_BASE_ZERO), |
| * the matrix |
| * 1.0 4.0 0.0 0.0 0.0 |
| * 0.0 2.0 3.0 0.0 0.0 |
| * 5.0 0.0 0.0 7.0 8.0 |
| * 0.0 0.0 9.0 0.0 6.0 |
| * |
| * is stored as |
| * val = 1.0 4.0 2.0 3.0 5.0 7.0 8.0 9.0 6.0 |
| * rowPtr = 0    2    4    7    9 |
| * colInd = 0    1    1    2    0    3    4    2    4 |
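| * |
| * A minimal usage sketch for the example above (assumes a valid GPUContext gCtx |
| * and double-precision data): |
| * <pre>{@code |
| * int[]    rowPtr = {0, 2, 4, 7, 9}; |
| * int[]    colInd = {0, 1, 1, 2, 0, 3, 4, 2, 4}; |
| * double[] values = {1.0, 4.0, 2.0, 3.0, 5.0, 7.0, 8.0, 9.0, 6.0}; |
| * CSRPointer ptr = CSRPointer.allocateEmpty(gCtx, 9, 4); |
| * CSRPointer.copyToDevice(gCtx, ptr, 4, 9, rowPtr, colInd, values); |
| * }</pre> |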
| */ |
| public class CSRPointer { |
| |
| private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName()); |
| |
| private static final double ULTRA_SPARSITY_TURN_POINT = 0.00004; |
| public static cusparseMatDescr matrixDescriptor; |
| /** |
| * {@link GPUContext} instance to track the GPU to do work on |
| */ |
| private final GPUContext gpuContext; |
| /** |
| * Number of non-zeros |
| */ |
| public long nnz; |
| |
| /** |
| * double array of non-zero values |
| */ |
| public Pointer val; |
| |
| /** |
| * integer array of row pointers (size rows+1): the start offset of each row, with a final entry equal to nnz |
| */ |
| public Pointer rowPtr; |
| |
| /** |
| * integer array of column indices, one entry per non-zero value |
| */ |
| public Pointer colInd; |
| |
| /** |
| * descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported |
| */ |
| public cusparseMatDescr descr; |
| |
| /** |
| * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)} |
| * |
| * @param gCtx a valid {@link GPUContext} |
| */ |
| private CSRPointer(GPUContext gCtx) { |
| gpuContext = gCtx; |
| val = new Pointer(); |
| rowPtr = new Pointer(); |
| colInd = new Pointer(); |
| allocateMatDescrPointer(); |
| } |
| |
| private static long getDataTypeSizeOf(long numElems) { |
| return numElems * LibMatrixCUDA.sizeOfDataType; |
| } |
| |
| private static long getIntSizeOf(long numElems) { |
| return numElems * Sizeof.INT; |
| } |
| |
| public static int toIntExact(long l) { |
| if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { |
| throw new DMLRuntimeException("Cannot be cast to int:" + l); |
| } |
| return (int) l; |
| } |
| |
| /** |
| * @return Singleton default matrix descriptor object |
| * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO) |
| */ |
| public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() { |
| if (matrixDescriptor == null) { |
| // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java |
| matrixDescriptor = new cusparseMatDescr(); |
| cusparseCreateMatDescr(matrixDescriptor); |
| cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); |
| cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO); |
| } |
| return matrixDescriptor; |
| } |
| |
| /** |
| * Estimates the size of a CSR matrix in GPU memory. |
| * The size of the Pointer objects themselves is negligible and is not included. |
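| * |
| * For example, with double-precision values (8 bytes per value), nnz2=9 and rows=4, |
| * the estimate is 9*8 (val) + 5*4 (rowPtr) + 9*4 (colInd) + 4*4 (descr) = 144 bytes. |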
| * |
| * @param nnz2 number of non-zeroes |
| * @param rows number of rows |
| * @return size estimate |
| */ |
| public static long estimateSize(long nnz2, long rows) { |
| long sizeofValArray = getDataTypeSizeOf(nnz2); |
| long sizeofRowPtrArray = getIntSizeOf(rows + 1); |
| long sizeofColIndArray = getIntSizeOf(nnz2); |
| long sizeofDescr = getIntSizeOf(4); |
| // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as: |
| // typedef struct { |
| // cusparseMatrixType_t MatrixType; |
| // cusparseFillMode_t FillMode; |
| // cusparseDiagType_t DiagType; |
| // cusparseIndexBase_t IndexBase; |
| // } cusparseMatDescr_t; |
| long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr; |
| return tot; |
| } |
| |
| /** |
| * Static method to copy a CSR sparse matrix from Host to Device |
| * |
| * @param gCtx GPUContext |
| * @param dest [output] pre-allocated destination CSR matrix (on GPU) |
| * @param rows number of rows |
| * @param nnz number of non-zeroes |
| * @param rowPtr integer array of row pointers |
| * @param colInd integer array of column indices |
| * @param values double array of non-zero values |
| */ |
| public static void copyToDevice(GPUContext gCtx, CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { |
| CSRPointer r = dest; |
| long t0 = 0; |
| if (DMLScript.STATISTICS) |
| t0 = System.nanoTime(); |
| r.nnz = nnz; |
| if(rows < 0) throw new DMLRuntimeException("Incorrect input parameter: rows=" + rows); |
| if(nnz < 0) throw new DMLRuntimeException("Incorrect input parameter: nnz=" + nnz); |
| if(rowPtr.length < rows + 1) throw new DMLRuntimeException("The length of rowPtr needs to be greater than or equal to " + (rows + 1)); |
| if(colInd.length < nnz) throw new DMLRuntimeException("The length of colInd needs to be greater than or equal to " + nnz); |
| if(values.length < nnz) throw new DMLRuntimeException("The length of values needs to be greater than or equal to " + nnz); |
| LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null); |
| cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice); |
| cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0); |
| if (DMLScript.STATISTICS) |
| GPUStatistics.cudaToDevCount.add(3); |
| } |
| |
| /** |
| * Static method to copy a CSR sparse matrix from Device to host |
| * |
| * @param src [input] source location (on GPU) |
| * @param rows [input] number of rows |
| * @param nnz [input] number of non-zeroes |
| * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1) |
| * @param colInd [output] pre-allocated integer array of column indices of size nnz |
| */ |
| public static void copyPtrToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd) { |
| CSRPointer r = src; |
| cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost); |
| cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost); |
| } |
| |
| /** |
| * Estimates the number of non-zero elements in the result of the sparse cusparseDgeam operation |
| * C = alpha * A + beta * B, and allocates the output matrix C accordingly |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param handle a valid {@link cusparseHandle} |
| * @param A Sparse Matrix A on GPU |
| * @param B Sparse Matrix B on GPU |
| * @param m Rows in A |
| * @param n Columns in B |
| * @return CSR (compressed sparse row) pointer |
| */ |
| public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) { |
| if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) |
| throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); |
| CSRPointer C = new CSRPointer(gCtx); |
| step1AllocateRowPointers(gCtx, handle, C, m); |
| step2GatherNNZGeam(gCtx, handle, A, B, C, m, n); |
| step3AllocateValNInd(gCtx, handle, C); |
| return C; |
| } |
| |
| /** |
| * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B |
| * and returns the {@link CSRPointer} to C with the appropriate GPU memory. |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param handle a valid {@link cusparseHandle} |
| * @param A Sparse Matrix A on GPU |
| * @param transA 'T' if A is to be transposed, 'N' otherwise |
| * @param B Sparse Matrix B on GPU |
| * @param transB 'T' if B is to be transposed, 'N' otherwise |
| * @param m Rows in A |
| * @param n Columns in B |
| * @param k Columns in A / Rows in B |
| * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU |
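| * |
| * A minimal sketch of the full multiply (double precision; assumes the legacy |
| * cusparseDcsrgemm API exposed by this version of JCuda). This method sizes and |
| * allocates C; the subsequent csrgemm call completes it: |
| * <pre>{@code |
| * CSRPointer C = allocateForMatrixMultiply(gCtx, handle, A, transA, B, transB, m, n, k); |
| * JCusparse.cusparseDcsrgemm(handle, transA, transB, m, n, k, |
| *     A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd, |
| *     B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd, |
| *     C.descr, C.val, C.rowPtr, C.colInd); |
| * }</pre> |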
| */ |
| public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, |
| CSRPointer B, int transB, int m, int n, int k) { |
| // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at |
| // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java |
| CSRPointer C = new CSRPointer(gCtx); |
| step1AllocateRowPointers(gCtx, handle, C, m); |
| step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k); |
| step3AllocateValNInd(gCtx, handle, C); |
| return C; |
| } |
| |
| /** |
| * Factory method to allocate an empty CSR Sparse matrix on the GPU |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param nnz2 number of non-zeroes |
| * @param rows number of rows |
| * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU |
| */ |
| public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) { |
| LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); |
| if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"); |
| if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or equal to 0 when trying to allocate sparse data on GPU"); |
| CSRPointer r = new CSRPointer(gCtx); |
| r.nnz = nnz2; |
| if (nnz2 == 0) { |
| // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object |
| // with no memory allocated on the GPU. |
| return r; |
| } |
| // increment the cudaCount by 1 for the allocation of all 3 arrays |
| r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2)); |
| r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1)); |
| r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2)); |
| return r; |
| } |
| |
| /** |
| * Allocates the row-pointer array of C, with rowsC+1 elements |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param handle a valid {@link cusparseHandle} |
| * @param C Output matrix |
| * @param rowsC number of rows in C |
| */ |
| private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) { |
| LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx); |
| cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); |
| //cudaDeviceSynchronize; |
| // Do not increment the cudaCount of allocations on GPU |
| C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1)); |
| } |
| |
| /** |
| * Determines the total number of non-zero elements for the cusparseDgeam operation. |
| * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param handle a valid {@link cusparseHandle} |
| * @param A Sparse Matrix A on GPU |
| * @param B Sparse Matrix B on GPU |
| * @param C Output Sparse Matrix C on GPU |
| * @param m Rows in C |
| * @param n Columns in C |
| */ |
| private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) { |
| LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx); |
| int[] CnnzArray = { -1 }; |
| cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz), |
| B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray)); |
| //cudaDeviceSynchronize; |
| if (CnnzArray[0] != -1) { |
| C.nnz = CnnzArray[0]; |
| } else { |
| int[] baseArray = { 0 }; |
| cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), |
| cudaMemcpyDeviceToHost); |
| cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); |
| C.nnz = CnnzArray[0] - baseArray[0]; |
| } |
| } |
| |
| /** |
| * Determines the total number of non-zero elements for the cusparseDgemm operation. |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param handle a valid {@link cusparseHandle} |
| * @param A Sparse Matrix A on GPU |
| * @param transA op - whether A is transposed |
| * @param B Sparse Matrix B on GPU |
| * @param transB op - whether B is transposed |
| * @param C Output Sparse Matrix C on GPU |
| * @param m Number of rows of sparse matrix op(A) and C |
| * @param n Number of columns of sparse matrix op(B) and C |
| * @param k Number of columns of sparse matrix op(A) / rows of sparse matrix op(B) |
| */ |
| private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, |
| CSRPointer B, int transB, CSRPointer C, int m, int n, int k) { |
| LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx); |
| int[] CnnzArray = { -1 }; |
| if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { |
| throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); |
| } |
| cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, |
| toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray)); |
| //cudaDeviceSynchronize; |
| if (CnnzArray[0] != -1) { |
| C.nnz = CnnzArray[0]; |
| } else { |
| int[] baseArray = { 0 }; |
| cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), |
| cudaMemcpyDeviceToHost); |
| cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); |
| C.nnz = CnnzArray[0] - baseArray[0]; |
| } |
| } |
| |
| /** |
| * Allocate val and index pointers. |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param handle a valid {@link cusparseHandle} |
| * @param C Output sparse matrix on GPU |
| */ |
| private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) { |
| LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx); |
| // Increment cudaCount by one when all three arrays of CSR sparse array are allocated |
| C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz)); |
| C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz)); |
| } |
| |
| // ============================================================================================== |
| |
| // The following methods estimate the memory needed for sparse matrices that are |
| // results of operations on other sparse matrices using the cuSparse Library. |
| // The operation is C = op(A) binaryOperation op(B), where C is the output and A & B are the inputs |
| // op = whether to transpose or not |
| // binaryOperation = +, - or * (matrix multiply); these are the operations cuSparse supports |
| |
| // From CuSparse Manual, |
| // Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach |
| // to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1 |
| // elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC |
| // and the total number of nonzero elements. In the second step, the user gathers nnzC |
| //(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr) |
| // or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of |
| // nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam() |
| // to complete matrix C. |
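| |
| // A hedged end-to-end sketch of the two-step geam pattern above (double precision, |
| // host pointer mode; alpha/beta values are illustrative and assume the legacy |
| // cusparseDcsrgeam API exposed by this version of JCuda): |
| //   CSRPointer C = allocateForDgeam(gCtx, handle, A, B, m, n); // steps 1-3 |
| //   Pointer alpha = Pointer.to(new double[] { 1.0 }); |
| //   Pointer beta = Pointer.to(new double[] { 1.0 }); |
| //   JCusparse.cusparseDcsrgeam(handle, m, n, |
| //       alpha, A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd, |
| //       beta, B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd, |
| //       C.descr, C.val, C.rowPtr, C.colInd); // completes matrix C |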
| |
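| /** |
| * Performs a deep device-to-device copy of this CSR matrix. |
| * Note that the rowPtr array of a CSR matrix with r rows has r+1 entries, |
| * so the caller must size the rows argument accordingly. |
| * |
| * @param rows number of rowPtr entries to allocate and copy |
| * @return a new {@link CSRPointer} with copies of val, rowPtr and colInd |
| */ |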
| public CSRPointer clone(int rows) { |
| CSRPointer me = this; |
| CSRPointer that = new CSRPointer(me.getGPUContext()); |
| that.allocateMatDescrPointer(); |
| that.nnz = me.nnz; |
| that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType); |
| that.rowPtr = allocate(rows * Sizeof.INT); |
| that.colInd = allocate(that.nnz * Sizeof.INT); |
| cudaMemcpy(that.val, me.val, that.nnz * LibMatrixCUDA.sizeOfDataType, cudaMemcpyDeviceToDevice); |
| cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.INT, cudaMemcpyDeviceToDevice); |
| cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice); |
| return that; |
| } |
| |
| private Pointer allocate(long size) { |
| return getGPUContext().allocate(null, size); |
| } |
| |
| private GPUContext getGPUContext() { |
| return gpuContext; |
| } |
| |
| // ============================================================================================== |
| |
| /** |
| * Check for ultra sparsity |
| * |
| * @param rows number of rows |
| * @param cols number of columns |
| * @return true if ultra sparse |
| */ |
| public boolean isUltraSparse(int rows, int cols) { |
| double sp = ((double) nnz / rows / cols); |
| return sp < ULTRA_SPARSITY_TURN_POINT; |
| } |
| |
| /** |
| * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL, |
| * the default that works for DGEMM. |
| */ |
| private void allocateMatDescrPointer() { |
| this.descr = getDefaultCuSparseMatrixDescriptor(); |
| } |
| |
| /** |
| * Copies this CSR matrix on the GPU to a dense column-major matrix |
| * on the GPU. This is a temporary matrix for operations such as |
| * cusparseDcsrmv. |
| * Since the allocated matrix is temporary, bookkeeping is not updated. |
| * The caller is responsible for calling "free" on the returned Pointer object |
| * |
| * @param cusparseHandle a valid {@link cusparseHandle} |
| * @param cublasHandle a valid {@link cublasHandle} |
| * @param rows number of rows in this CSR matrix |
| * @param cols number of columns in this CSR matrix |
| * @param instName name of the invoking instruction to record {@link Statistics}. |
| * @return A {@link Pointer} to the allocated dense matrix (in column-major format) |
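| * |
| * A minimal usage sketch (csr, the handles and the owning GPUContext gCtx are |
| * assumed to be valid; instName may be null): |
| * <pre>{@code |
| * Pointer dense = csr.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, rows, cols, null); |
| * // ... use the dense column-major matrix ... |
| * gCtx.cudaFreeHelper(null, dense, DMLScript.EAGER_CUDA_FREE); |
| * }</pre> |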
| */ |
| public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, |
| int cols, String instName) { |
| LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" |
| + getGPUContext()); |
| long size = getDataTypeSizeOf((long) rows * cols); |
| Pointer A = allocate(size); |
| // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned. |
| if (val != null && rowPtr != null && colInd != null && nnz > 0) { |
| // Note: cusparseDcsr2dense method cannot handle empty blocks |
| LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows); |
| //cudaDeviceSynchronize; |
| } else { |
| LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null"); |
| } |
| return A; |
| } |
| |
| /** |
| * Calls cudaFree on the allocated {@link Pointer} instances, eagerly or lazily depending on {@link DMLScript#EAGER_CUDA_FREE} |
| * |
| */ |
| public void deallocate() { |
| deallocate(DMLScript.EAGER_CUDA_FREE); |
| } |
| |
| /** |
| * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances |
| * |
| * @param eager whether to do eager or lazy cudaFrees |
| */ |
| public void deallocate(boolean eager) { |
| if (nnz > 0) { |
| if (val != null) |
| getGPUContext().cudaFreeHelper(null, val, eager); |
| if (rowPtr != null) |
| getGPUContext().cudaFreeHelper(null, rowPtr, eager); |
| if (colInd != null) |
| getGPUContext().cudaFreeHelper(null, colInd, eager); |
| } |
| val = null; |
| rowPtr = null; |
| colInd = null; |
| } |
| |
| @Override |
| public String toString() { |
| return "CSRPointer{" + "nnz=" + nnz + '}'; |
| } |
| } |