/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.instructions.gpu.context;
import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysds.utils.GPUStatistics;
import org.apache.sysds.utils.Statistics;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.cublasHandle;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import jcuda.jcusparse.cusparsePointerMode;
/**
 * Compressed Sparse Row (CSR) format for CUDA.
 * Generalized matrix multiply, among other operations, is implemented for the CSR format by the cuSparse library.
*
* Since we assume that the matrix is stored with zero-based indexing (i.e. CUSPARSE_INDEX_BASE_ZERO),
* the matrix
* 1.0 4.0 0.0 0.0 0.0
* 0.0 2.0 3.0 0.0 0.0
* 5.0 0.0 0.0 7.0 8.0
* 0.0 0.0 9.0 0.0 6.0
*
* is stored as
 * val    = 1.0 4.0 2.0 3.0 5.0 7.0 8.0 9.0 6.0
 * rowPtr = 0 2 4 7 9
 * colInd = 0 1 1 2 0 3 4 2 4
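 *
 * A minimal usage sketch for the matrix above, assuming a valid {@link GPUContext} gCtx
 * (the variable names are illustrative):
 * <pre>{@code
 * int rows = 4;
 * long nnz = 9;
 * int[] rowPtr = { 0, 2, 4, 7, 9 };
 * int[] colInd = { 0, 1, 1, 2, 0, 3, 4, 2, 4 };
 * double[] values = { 1.0, 4.0, 2.0, 3.0, 5.0, 7.0, 8.0, 9.0, 6.0 };
 * CSRPointer csr = CSRPointer.allocateEmpty(gCtx, nnz, rows);
 * CSRPointer.copyToDevice(gCtx, csr, rows, nnz, rowPtr, colInd, values);
 * // ... use csr in cuSparse operations ...
 * csr.deallocate();
 * }</pre>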
*/
public class CSRPointer {
private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
private static final double ULTRA_SPARSITY_TURN_POINT = 0.00004;
public static cusparseMatDescr matrixDescriptor;
/**
* {@link GPUContext} instance to track the GPU to do work on
*/
private final GPUContext gpuContext;
/**
* Number of non zeroes
*/
public long nnz;
/**
* double array of non zero values
*/
public Pointer val;
/**
 * integer array of rows + 1 elements: the start offset of each row, followed by the end offset of the last row (i.e. nnz)
*/
public Pointer rowPtr;
/**
* integer array of nnz values' column indices
*/
public Pointer colInd;
/**
 * matrix descriptor; only CUSPARSE_MATRIX_TYPE_GENERAL is supported
*/
public cusparseMatDescr descr;
/**
* Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
*
* @param gCtx a valid {@link GPUContext}
*/
private CSRPointer(GPUContext gCtx) {
gpuContext = gCtx;
val = new Pointer();
rowPtr = new Pointer();
colInd = new Pointer();
allocateMatDescrPointer();
}
private static long getDataTypeSizeOf(long numElems) {
return numElems * LibMatrixCUDA.sizeOfDataType;
}
private static long getIntSizeOf(long numElems) {
return numElems * Sizeof.INT;
}
public static int toIntExact(long l) {
if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
throw new DMLRuntimeException("Cannot be cast to int:" + l);
}
return (int) l;
}
/**
* @return Singleton default matrix descriptor object
* (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
*/
public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
if (matrixDescriptor == null) {
// Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
matrixDescriptor = new cusparseMatDescr();
cusparseCreateMatDescr(matrixDescriptor);
cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
}
return matrixDescriptor;
}
/**
 * Estimate the size of a CSR matrix in GPU memory.
 * The size of the {@link Pointer} objects themselves is negligible and is not included.
*
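 * For the 4x5 example in the class javadoc (nnz = 9), assuming 8-byte double values:
 * 9*8 (val) + 5*4 (rowPtr) + 9*4 (colInd) + 4*4 (descr) = 144 bytes.
 *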
* @param nnz2 number of non zeroes
* @param rows number of rows
* @return size estimate
*/
public static long estimateSize(long nnz2, long rows) {
long sizeofValArray = getDataTypeSizeOf(nnz2);
long sizeofRowPtrArray = getIntSizeOf(rows + 1);
long sizeofColIndArray = getIntSizeOf(nnz2);
long sizeofDescr = getIntSizeOf(4);
// From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
// typedef struct {
// cusparseMatrixType_t MatrixType;
// cusparseFillMode_t FillMode;
// cusparseDiagType_t DiagType;
// cusparseIndexBase_t IndexBase;
// } cusparseMatDescr_t;
long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
return tot;
}
/**
* Static method to copy a CSR sparse matrix from Host to Device
*
* @param gCtx GPUContext
 * @param dest [output] destination location (on GPU); its device arrays must already be allocated, e.g. via {@link #allocateEmpty(GPUContext, long, long)}
* @param rows number of rows
* @param nnz number of non-zeroes
* @param rowPtr integer array of row pointers
* @param colInd integer array of column indices
* @param values double array of non zero values
*/
public static void copyToDevice(GPUContext gCtx, CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
CSRPointer r = dest;
long t0 = 0;
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
r.nnz = nnz;
if(rows < 0) throw new DMLRuntimeException("Incorrect input parameter: rows=" + rows);
if(nnz < 0) throw new DMLRuntimeException("Incorrect input parameter: nnz=" + nnz);
if(rowPtr.length < rows + 1) throw new DMLRuntimeException("The length of rowPtr needs to be greater than or equal to " + (rows + 1));
if(colInd.length < nnz) throw new DMLRuntimeException("The length of colInd needs to be greater than or equal to " + nnz);
if(values.length < nnz) throw new DMLRuntimeException("The length of values needs to be greater than or equal to " + nnz);
LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null);
cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
if (DMLScript.STATISTICS)
GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaToDevCount.add(3);
}
/**
* Static method to copy a CSR sparse matrix from Device to host
*
* @param src [input] source location (on GPU)
* @param rows [input] number of rows
* @param nnz [input] number of non-zeroes
* @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
* @param colInd [output] pre-allocated integer array of column indices of size nnz
*/
public static void copyPtrToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd) {
CSRPointer r = src;
cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
}
/**
 * Allocates a {@link CSRPointer} for the result of the sparse cusparseDgeam operation
 * C = a op(A) + b op(B), estimating the number of non-zero elements in C along the way.
*
* @param gCtx a valid {@link GPUContext}
* @param handle a valid {@link cusparseHandle}
* @param A Sparse Matrix A on GPU
* @param B Sparse Matrix B on GPU
* @param m Rows in A
 * @param n Columns in B
* @return CSR (compressed sparse row) pointer
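 *
 * A sketch of the intended two-step pattern, assuming double precision and host pointer
 * mode (the alpha/beta values of 1.0 are illustrative):
 * <pre>{@code
 * CSRPointer C = CSRPointer.allocateForDgeam(gCtx, handle, A, B, m, n);
 * Pointer alpha = Pointer.to(new double[] { 1.0 });
 * Pointer beta = Pointer.to(new double[] { 1.0 });
 * JCusparse.cusparseDcsrgeam(handle, m, n,
 *     alpha, A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
 *     beta, B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd,
 *     C.descr, C.val, C.rowPtr, C.colInd);
 * }</pre>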
*/
public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) {
if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
CSRPointer C = new CSRPointer(gCtx);
step1AllocateRowPointers(gCtx, handle, C, m);
step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
step3AllocateValNInd(gCtx, handle, C);
return C;
}
/**
* Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
* and returns the {@link CSRPointer} to C with the appropriate GPU memory.
*
* @param gCtx a valid {@link GPUContext}
* @param handle a valid {@link cusparseHandle}
* @param A Sparse Matrix A on GPU
 * @param transA op on A: CUSPARSE_OPERATION_TRANSPOSE if A is to be transposed, CUSPARSE_OPERATION_NON_TRANSPOSE otherwise
* @param B Sparse Matrix B on GPU
 * @param transB op on B: CUSPARSE_OPERATION_TRANSPOSE if B is to be transposed, CUSPARSE_OPERATION_NON_TRANSPOSE otherwise
* @param m Rows in A
* @param n Columns in B
* @param k Columns in A / Rows in B
* @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
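 *
 * A sketch of the follow-up call that fills C, assuming double precision (actual callers
 * go through LibMatrixCUDA's cudaSupportFunctions rather than calling JCusparse directly):
 * <pre>{@code
 * CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, handle, A, transA, B, transB, m, n, k);
 * JCusparse.cusparseDcsrgemm(handle, transA, transB, m, n, k,
 *     A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
 *     B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd,
 *     C.descr, C.val, C.rowPtr, C.colInd);
 * }</pre>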
*/
public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
CSRPointer B, int transB, int m, int n, int k) {
// Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
// https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
CSRPointer C = new CSRPointer(gCtx);
step1AllocateRowPointers(gCtx, handle, C, m);
step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
step3AllocateValNInd(gCtx, handle, C);
return C;
}
/**
* Factory method to allocate an empty CSR Sparse matrix on the GPU
*
* @param gCtx a valid {@link GPUContext}
* @param nnz2 number of non-zeroes
* @param rows number of rows
* @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
*/
public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU");
if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or equal to 0 when trying to allocate sparse data on GPU");
CSRPointer r = new CSRPointer(gCtx);
r.nnz = nnz2;
if (nnz2 == 0) {
// The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
// with no memory allocated on the GPU.
return r;
}
// increment the cudaCount by 1 for the allocation of all 3 arrays
r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
return r;
}
/**
 * Allocate the row pointer array of rowsC + 1 elements
*
* @param gCtx a valid {@link GPUContext}
* @param handle a valid {@link cusparseHandle}
* @param C Output matrix
* @param rowsC number of rows in C
*/
private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) {
LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
//cudaDeviceSynchronize;
// Do not increment the cudaCount of allocations on GPU
C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1));
}
/**
 * Determine the total number of nonzero elements for the cusparseDgeam operation.
* This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
*
* @param gCtx a valid {@link GPUContext}
* @param handle a valid {@link cusparseHandle}
* @param A Sparse Matrix A on GPU
* @param B Sparse Matrix B on GPU
* @param C Output Sparse Matrix C on GPU
* @param m Rows in C
* @param n Columns in C
*/
private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) {
LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
int[] CnnzArray = { -1 };
cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
//cudaDeviceSynchronize;
if (CnnzArray[0] != -1) {
C.nnz = CnnzArray[0];
} else {
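// nnzTotalDevHostPtr was not populated; per the cuSPARSE docs, fall back to
// nnzC = csrRowPtrC(m) - csrRowPtrC(0), read back from the device row pointers.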
int[] baseArray = { 0 };
cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
cudaMemcpyDeviceToHost);
cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
C.nnz = CnnzArray[0] - baseArray[0];
}
}
/**
 * Determine the total number of nonzero elements for the cusparseDgemm operation.
*
* @param gCtx a valid {@link GPUContext}
* @param handle a valid {@link cusparseHandle}
* @param A Sparse Matrix A on GPU
* @param transA op - whether A is transposed
* @param B Sparse Matrix B on GPU
* @param transB op - whether B is transposed
* @param C Output Sparse Matrix C on GPU
* @param m Number of rows of sparse matrix op ( A ) and C
* @param n Number of columns of sparse matrix op ( B ) and C
* @param k Number of columns/rows of sparse matrix op ( A ) / op ( B )
*/
private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
CSRPointer B, int transB, CSRPointer C, int m, int n, int k) {
LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
int[] CnnzArray = { -1 };
if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
}
cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr,
toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
//cudaDeviceSynchronize;
if (CnnzArray[0] != -1) {
C.nnz = CnnzArray[0];
} else {
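// As in step2GatherNNZGeam, fall back to nnzC = csrRowPtrC(m) - csrRowPtrC(0).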
int[] baseArray = { 0 };
cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
cudaMemcpyDeviceToHost);
cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
C.nnz = CnnzArray[0] - baseArray[0];
}
}
/**
* Allocate val and index pointers.
*
* @param gCtx a valid {@link GPUContext}
* @param handle a valid {@link cusparseHandle}
* @param C Output sparse matrix on GPU
*/
private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) {
LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
// Increment cudaCount by one when all three arrays of the CSR sparse matrix are allocated
C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz));
C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
}
// ==============================================================================================
// The following methods estimate the memory needed for sparse matrices that are
// results of operations on other sparse matrices using the cuSparse Library.
// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
// op = whether to transpose or not
// binaryOperation = for cuSparse, the operations +, - and * (matmul) are supported
// From CuSparse Manual,
// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
// and the total number of nonzero elements. In the second step, the user gathers nnzC
// (number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
// to complete matrix C.
/**
 * Deep-copies this CSR matrix on the GPU, including the val, rowPtr and colInd arrays.
 *
 * @param rows number of rows in this CSR matrix
 * @return a new {@link CSRPointer} holding a copy of this matrix
 */
public CSRPointer clone(int rows) {
CSRPointer me = this;
CSRPointer that = new CSRPointer(me.getGPUContext());
that.allocateMatDescrPointer();
that.nnz = me.nnz;
that.val = allocate(getDataTypeSizeOf(that.nnz));
// a CSR row pointer array has rows + 1 entries (cf. allocateEmpty)
that.rowPtr = allocate(getIntSizeOf(rows + 1));
that.colInd = allocate(getIntSizeOf(that.nnz));
cudaMemcpy(that.val, me.val, getDataTypeSizeOf(that.nnz), cudaMemcpyDeviceToDevice);
cudaMemcpy(that.rowPtr, me.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToDevice);
cudaMemcpy(that.colInd, me.colInd, getIntSizeOf(that.nnz), cudaMemcpyDeviceToDevice);
return that;
}
private Pointer allocate(long size) {
return getGPUContext().allocate(null, size);
}
private GPUContext getGPUContext() {
return gpuContext;
}
// ==============================================================================================
/**
* Check for ultra sparsity
*
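 * A matrix is considered ultra-sparse when nnz / (rows * cols) falls below
 * ULTRA_SPARSITY_TURN_POINT (0.00004); e.g. the 4x5 example in the class javadoc
 * has sparsity 9/20 = 0.45 and is not ultra-sparse.
 *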
* @param rows number of rows
* @param cols number of columns
* @return true if ultra sparse
*/
public boolean isUltraSparse(int rows, int cols) {
double sp = ((double) nnz / rows / cols);
return sp < ULTRA_SPARSITY_TURN_POINT;
}
/**
* Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
* the default that works for DGEMM.
*/
private void allocateMatDescrPointer() {
this.descr = getDefaultCuSparseMatrixDescriptor();
}
/**
* Copies this CSR matrix on the GPU to a dense column-major matrix
* on the GPU. This is a temporary matrix for operations such as
* cusparseDcsrmv.
* Since the allocated matrix is temporary, bookkeeping is not updated.
* The caller is responsible for calling "free" on the returned Pointer object
*
* @param cusparseHandle a valid {@link cusparseHandle}
* @param cublasHandle a valid {@link cublasHandle}
* @param rows number of rows in this CSR matrix
* @param cols number of columns in this CSR matrix
 * @param instName name of the invoking instruction to record {@link Statistics}.
* @return A {@link Pointer} to the allocated dense matrix (in column-major format)
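 *
 * A sketch of the expected call pattern (instName may be null; the free call below is
 * illustrative, mirroring what {@link #deallocate()} does internally):
 * <pre>{@code
 * Pointer dense = csr.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, rows, cols, instName);
 * // ... use the dense column-major matrix ...
 * gCtx.cudaFreeHelper(instName, dense, DMLScript.EAGER_CUDA_FREE);
 * }</pre>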
*/
public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows,
int cols, String instName) {
LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
+ getGPUContext());
long size = rows * getDataTypeSizeOf(cols);
Pointer A = allocate(size);
// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
if (val != null && rowPtr != null && colInd != null && nnz > 0) {
// Note: cusparseDcsr2dense method cannot handle empty blocks
LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
//cudaDeviceSynchronize;
} else {
LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
}
return A;
}
/**
 * Calls cudaFree on the allocated {@link Pointer} instances, lazily or eagerly
 * depending on DMLScript.EAGER_CUDA_FREE
 *
*/
public void deallocate() {
deallocate(DMLScript.EAGER_CUDA_FREE);
}
/**
* Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
*
* @param eager whether to do eager or lazy cudaFrees
*/
public void deallocate(boolean eager) {
if (nnz > 0) {
if (val != null)
getGPUContext().cudaFreeHelper(null, val, eager);
if (rowPtr != null)
getGPUContext().cudaFreeHelper(null, rowPtr, eager);
if (colInd != null)
getGPUContext().cudaFreeHelper(null, colInd, eager);
}
val = null;
rowPtr = null;
colInd = null;
}
@Override
public String toString() {
return "CSRPointer{" + "nnz=" + nnz + '}';
}
}