[SYSTEMDS-3905] Upgrade GPU backend to latest CUDA versions

This commit upgrades the GPU backend to the latest NVIDIA software stack:
CUDA 12.6, cuSPARSE 12, cuBLAS 12, cuDNN 9.
Additionally, this commit adds multiple GPU tests.

Closes #2271
Closes #2303
diff --git a/pom.xml b/pom.xml
index 5d24858..ea312ca 100644
--- a/pom.xml
+++ b/pom.xml
@@ -49,7 +49,7 @@
 		<project.build.outputTimestamp>1</project.build.outputTimestamp>
 		<enableGPU>false</enableGPU>
 		<jcuda.scope>provided</jcuda.scope>
-		<jcuda.version>10.2.0</jcuda.version>
+		<jcuda.version>12.6.0</jcuda.version>
 		<slf4j.version>2.0.11</slf4j.version>
 		<log4j.version>2.22.1</log4j.version>
 		<maven-clean-plugin.version>3.2.0</maven-clean-plugin.version>
@@ -1078,47 +1078,6 @@
 			<version>${jcuda.version}</version>
 			<scope>${jcuda.scope}</scope>
 		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcuda-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcublas-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusparse-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusolver-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcudnn-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-core_${scala.binary.version}</artifactId>
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index 3125d43..9473e93 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -19,19 +19,11 @@
 
 package org.apache.sysds.runtime.instructions.gpu.context;
 
-import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
-import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
-import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
-import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
-import static jcuda.runtime.JCuda.cudaMemcpy;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.cudaDataType;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusparse.*;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
@@ -39,12 +31,18 @@
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysds.utils.Statistics;
 
-import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.jcusparse.cusparseMatDescr;
-import jcuda.jcusparse.cusparsePointerMode;
+import static jcuda.jcusparse.JCusparse.*;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
+import static jcuda.runtime.JCuda.*;
+import static jcuda.runtime.cudaMemcpyKind.*;
+import static jcuda.jcusparse.cusparseIndexType.CUSPARSE_INDEX_32I;
+import static jcuda.cudaDataType.CUDA_R_64F;
+import static jcuda.cudaDataType.CUDA_R_32F;
+import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
+import static org.apache.sysds.runtime.matrix.data.LibMatrixCUDA.cudaSupportFunctions;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
 
 /**
  * Compressed Sparse Row (CSR) format for CUDA
@@ -98,6 +96,16 @@
 	public cusparseMatDescr descr;
 
 	/**
+	 * cuSPARSE generic sparse-matrix descriptor (cusparseSpMatDescr) registering this CSR data
+	 */
+	public cusparseSpMatDescr spMatDescr;
+
+	/**
+	 * descriptor for sparse GEMM; only used for the result matrix.
+	 */
+	public cusparseSpGEMMDescr spgemmDesc;
+
+	/**
 	 * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
 	 *
 	 * @param gCtx a valid {@link GPUContext}
@@ -107,6 +115,8 @@
 		val = new Pointer();
 		rowPtr = new Pointer();
 		colInd = new Pointer();
+		spMatDescr = new cusparseSpMatDescr();
+		spgemmDesc = null;
 		allocateMatDescrPointer();
 	}
 
@@ -183,7 +193,7 @@
 		if(rowPtr.length < rows + 1) throw new DMLRuntimeException("The length of rowPtr needs to be greater than or equal to " + (rows + 1));
 		if(colInd.length < nnz) throw new DMLRuntimeException("The length of colInd needs to be greater than or equal to " + nnz);
 		if(values.length < nnz) throw new DMLRuntimeException("The length of values needs to be greater than or equal to " + nnz);
-		LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null);
+		cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null);
 		cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
 		cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
 		//if (DMLScript.STATISTICS)
@@ -220,15 +230,18 @@
 	 * @return CSR (compressed sparse row) pointer
 	 */
 	public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) {
-		if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
+		if(A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
 			throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
 		CSRPointer C = new CSRPointer(gCtx);
 		step1AllocateRowPointers(gCtx, handle, C, m);
+		ensureSorted(handle, m, n, toIntExact(A.nnz), A.rowPtr, A.colInd, A.descr);
+		ensureSorted(handle, m, n, toIntExact(B.nnz), B.rowPtr, B.colInd, B.descr);
 		step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
 		step3AllocateValNInd(gCtx, handle, C);
 		return C;
 	}
 
+
 	/**
 	 * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
 	 * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
@@ -245,12 +258,10 @@
 	 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
 	 */
 	public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
-			CSRPointer B, int transB, int m, int n, int k) {
-		// Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
-		// https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
+		CSRPointer B, int transB, int m, int n, int k, int dataType) {
 		CSRPointer C = new CSRPointer(gCtx);
 		step1AllocateRowPointers(gCtx, handle, C, m);
-		step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
+		step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k, dataType);
 		step3AllocateValNInd(gCtx, handle, C);
 		return C;
 	}
@@ -316,21 +327,30 @@
 	 * @param m      Rows in C
 	 * @param n      Columns in C
 	 */
+
 	private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) {
 		LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
-		int[] CnnzArray = { -1 };
-		cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
-				B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
-		//cudaDeviceSynchronize;
-		if (CnnzArray[0] != -1) {
+		long[] pBufferSizeInBytes = {0};
+		cusparseDcsrgeam2_bufferSizeExt(handle, m, n, Pointer.to(new double[]{1.0}), A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
+			Pointer.to(new double[]{1.0}), B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd, pBufferSizeInBytes);
+		Pointer buffer = new Pointer();
+		cudaMalloc(buffer, pBufferSizeInBytes[0]);
+		int[] CnnzArray = {-1};
+		cusparseXcsrgeam2Nnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
+			C.descr, C.rowPtr, Pointer.to(CnnzArray) ,buffer);
+		if(CnnzArray[0] != -1) {
 			C.nnz = CnnzArray[0];
-		} else {
-			int baseArray[] = { 0 };
-			cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
-					cudaMemcpyDeviceToHost);
-			cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+		}
+		else {                            // fall-back (rare older devices)
+			int[] baseArray = {0};
+			cudaMemcpy(Pointer.to(CnnzArray),
+				C.rowPtr.withByteOffset((long)m * Sizeof.INT),
+				Sizeof.INT, cudaMemcpyDeviceToHost);
+			cudaMemcpy(Pointer.to(baseArray),
+				C.rowPtr, Sizeof.INT, cudaMemcpyDeviceToHost);
 			C.nnz = CnnzArray[0] - baseArray[0];
 		}
+		cudaFree(buffer);
 	}
 
 	/**
@@ -347,25 +367,68 @@
 	 * @param n      Number of columns of sparse matrix op ( B ) and C
 	 * @param k      Number of columns/rows of sparse matrix op ( A ) / op ( B )
 	 */
+
 	private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
-			CSRPointer B, int transB, CSRPointer C, int m, int n, int k) {
-		LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
-		int[] CnnzArray = { -1 };
-		if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
-			throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
-		}
-		cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr,
-				toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
-		//cudaDeviceSynchronize;
-		if (CnnzArray[0] != -1) {
-			C.nnz = CnnzArray[0];
-		} else {
-			int baseArray[] = { 0 };
-			cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
-					cudaMemcpyDeviceToHost);
-			cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
-			C.nnz = CnnzArray[0] - baseArray[0];
-		}
+		CSRPointer B, int transB, CSRPointer C, int m, int n, int k,
+		int dataType)            // C = op(A)·op(B)  (m×k)·(k×n)
+	{
+		LOG.trace("GPU : step2GatherNNZGemm (SpGEMM), GPUContext=" + gCtx);
+
+		// Ensure that NNZ does not exceed limit
+		if(A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
+			throw new DMLRuntimeException("Number of non-zeros exceeds cuSPARSE 32-bit limit");
+
+		// Get index base -> 0 or 1
+		int baseA = cusparseGetMatIndexBase(A.descr);
+		int baseB = cusparseGetMatIndexBase(B.descr);
+		int baseC = cusparseGetMatIndexBase(C.descr);
+
+		// cuSPARSE 12 requires the generic cusparseSpMatDescr API.
+		// We do not copy the existing matrices: the A/B/C spMatDescr descriptors merely
+		// register the existing CSR arrays so that cuSPARSE can reference them.
+		cusparseCreateCsr(A.spMatDescr, m, k, A.nnz, A.rowPtr, A.colInd, A.val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			baseA, dataType);
+		cusparseCreateCsr(B.spMatDescr, k, n, B.nnz, B.rowPtr, B.colInd, B.val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			baseB, dataType);
+		cusparseCreateCsr(C.spMatDescr, m, n, 0, C.rowPtr, null, null, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, baseC,
+			dataType);
+		// SpGEMM Computation
+		C.spgemmDesc = new cusparseSpGEMMDescr();
+		cusparseSpGEMM_createDescr(C.spgemmDesc);
+		double[] alpha = {1.0};
+		double[] beta = {0.0};
+		Pointer alphaPtr = Pointer.to(alpha);
+		Pointer betaPtr = Pointer.to(beta);
+		int alg = CUSPARSE_SPGEMM_DEFAULT;
+
+		// ask bufferSize1 bytes for external memory
+		long[] bufferSize1 = {0};
+		cusparseSpGEMM_workEstimation(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize1, null);
+		Pointer buffer1 = new Pointer();
+		if(bufferSize1[0] > 0)
+			cudaMalloc(buffer1, bufferSize1[0]);
+		// inspect the matrices A and B to understand the memory requirement for the next step
+		cusparseSpGEMM_workEstimation(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize1, buffer1);
+
+		// ask bufferSize2 bytes for external memory
+		long[] bufferSize2 = {0};
+		cusparseSpGEMM_compute(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize2, null);
+		Pointer buffer2 = new Pointer();
+		if(bufferSize2[0] > 0)
+			cudaMalloc(buffer2, bufferSize2[0]);
+		// compute the intermediate product of A * B
+		cusparseSpGEMM_compute(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize2, buffer2);
+
+		// obtain nnz of C
+		long[] rows = {0};
+		long[] cols = {0};
+		long[] nnz = {0};
+		cusparseSpMatGetSize(C.spMatDescr.asConst(), rows, cols, nnz);
+		C.nnz = nnz[0];
 	}
 
 	/**
@@ -381,6 +444,100 @@
 
 		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz), false);
 		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz), false);
+
+		// special case for sparse GEMM
+		// requires cusparseSpGEMMDescr since CUDA 12
+		if(C.spgemmDesc != null)
+			cusparseCsrSetPointers(C.spMatDescr, C.rowPtr, C.colInd, C.val);
+	}
+
+	/**
+	 * Sorts the column indices within each row of a CSR matrix in-place.
+	 *
+	 * @param handle cuSPARSE handle
+	 * @param m      number of rows of the matrix
+	 * @param n      number of columns of the matrix
+	 * @param nnz    number of non-zero entries
+	 * @param rowPtr device pointer to the CSR row-pointer array
+	 * @param colInd device pointer to the CSR column-index array (sorted in-place)
+	 * @param descr  matrix descriptor
+	 */
+	private static void ensureSorted(cusparseHandle handle, int m, int n, int nnz, Pointer rowPtr, Pointer colInd,
+		cusparseMatDescr descr) {
+
+		// 1. workspace size for sorting
+		long[] bufSize = {0};
+		JCusparse.cusparseXcsrsort_bufferSizeExt(handle, m, n, nnz, rowPtr, colInd, bufSize);
+
+		Pointer work = new Pointer();
+		if(bufSize[0] > 0)
+			cudaMalloc(work, bufSize[0]);
+
+		// 2. create an identity permutation array to track the reordering (required by cusparseXcsrsort)
+		Pointer P = new Pointer();
+		cudaMalloc(P, (long) nnz * Sizeof.INT);
+		JCusparse.cusparseCreateIdentityPermutation(handle, nnz, P);
+
+		// 3. sort in-place
+		JCusparse.cusparseXcsrsort(handle, m, n, nnz, descr, rowPtr, colInd, P, work);
+
+		cudaFree(P);
+		if(bufSize[0] > 0)
+			cudaFree(work);
+
+	}
+
+	/**
+	 * Physically transposes a CSR matrix (srcRows x srcCols -> srcCols x srcRows). The trick: the CSC
+	 * representation of the original matrix is bit-for-bit the CSR of the transposed matrix, so we
+	 * leverage cuSPARSE csr2cscEx2.
+	 * @param gCtx     a valid {@link GPUContext}
+	 * @param handle   cuSPARSE handle
+	 * @param src      source CSR matrix to transpose
+	 * @param srcRows  number of rows of the source matrix
+	 * @param srcCols  number of columns of the source matrix
+	 * @param dataType CUDA data type of the values (e.g. CUDA_R_64F)
+	 * @return a new {@link CSRPointer} holding the transposed matrix
+	 */
+	public static CSRPointer transposeCSR(GPUContext gCtx, cusparseHandle handle, CSRPointer src, int srcRows,
+		int srcCols, int dataType) {
+
+		CSRPointer dst = new CSRPointer(gCtx);
+		dst.nnz = src.nnz;
+
+		// allocate transposed arrays: rows = srcCols, cols = srcRows
+		dst.rowPtr = gCtx.allocate(null, getIntSizeOf((long) srcCols + 1), true);
+		dst.colInd = gCtx.allocate(null, getIntSizeOf(dst.nnz), false);
+		dst.val = gCtx.allocate(null, getDataTypeSizeOf(dst.nnz), false);
+
+		// CSR -> CSC (of src) == CSR (of t(src))
+		int copyValues = 1;
+		int idxBase = cusparseGetMatIndexBase(src.descr);
+		cudaSupportFunctions.cusparsecsr2csc(handle, srcRows, srcCols, toIntExact(dst.nnz), src.val, src.rowPtr,
+			src.colInd, dst.val, dst.colInd, dst.rowPtr, copyValues, idxBase);
+
+		// classical descriptor (needed by ensureSorted)
+		cusparseCreateMatDescr(dst.descr);
+		cusparseSetMatIndexBase(dst.descr, idxBase);
+
+		// cuSPARSE 12 SpMat descriptor
+		cusparseCreateCsr(dst.spMatDescr,
+			/*rows*/ srcCols,   // m
+			/*cols*/ srcRows,   // k
+			dst.nnz, dst.rowPtr, dst.colInd, dst.val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, idxBase, dataType);
+
+		return dst;
+	}
+
+	/**
+	 * Debug helper that prints the dimensions and nnz of a sparse-matrix descriptor to stderr.
+	 *
+	 * @param desc sparse-matrix descriptor to inspect
+	 */
+	public static void getCSRMatrixInfo(cusparseSpMatDescr desc) {
+		long[] r = {0}, c = {0}, z = {0};
+		cusparseSpMatGetSize(desc.asConst(), r, c, z);
+		System.err.println("DEBUG  A  rows=" + r[0] + " cols=" + c[0] + " nnz=" + z[0]);
 	}
 
 	// ==============================================================================================
@@ -468,7 +625,7 @@
 		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
 		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
 			// Note: cusparseDcsr2dense method cannot handle empty blocks
-			LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+			cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows, nnz);
 			//cudaDeviceSynchronize;
 		} else {
 			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
index 6879a20..3093b90 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
@@ -24,6 +24,8 @@
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
 
+import jcuda.jcusparse.cusparseSpGEMMDescr;
+import jcuda.jcusparse.cusparseSpMatDescr;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 
 import jcuda.Pointer;
@@ -51,24 +53,24 @@
  * 	matrix_atan(A, C, size);
  * } 
  * </code>
- * 
+ *
  * 2. The CUDA library calls (such as CuBLAS, CuSPARSE, etc) go through this interface.
  * The naming and parameters of the methods in this class are consistent with that of CUDA library to simplify development.
- * 
+ *
  * 3. During SystemDS initialization, the appropriate class implementing CudaKernels interface is set based on the configuration property sysds.dataType.
  */
 public interface CudaSupportFunctions {
 	public static boolean PERFORM_CONVERSION_ON_DEVICE = true;
-	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k, 
-			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, 
-			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, 
-			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC);
-	public int	cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, 
-			int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
-	public int	cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
-			jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
-	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
-			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
+		cusparseSpMatDescr spMatDescrB, cusparseSpMatDescr spMatDescrC, cusparseSpGEMMDescr spgemmDescr);
+	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha,
+		jcuda.Pointer A, int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
+	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha,
+		cusparseSpMatDescr spMatDescrA, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA,
+		jcuda.Pointer csrColIndA, jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
+	public int cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, cusparseSpMatDescr spMatDescrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA,
+		jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
 	public int cublasdot(cublasHandle handle, int n, jcuda.Pointer x, int incx, jcuda.Pointer y, int incy, jcuda.Pointer result);
 	public int cublasgemv(cublasHandle handle, int trans, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer x, int incx, jcuda.Pointer beta, jcuda.Pointer y, int incy);
 	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
@@ -80,7 +82,7 @@
 	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU, Pointer Workspace, int Lwork, Pointer devInfo);
 	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda, Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo);
 	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, jcuda.Pointer alpha, cusparseMatDescr descrA, int nnzA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer beta, cusparseMatDescr descrB, int nnzB, jcuda.Pointer csrValB, jcuda.Pointer csrRowPtrB, jcuda.Pointer csrColIndB, cusparseMatDescr descrC, jcuda.Pointer csrValC, jcuda.Pointer csrRowPtrC, jcuda.Pointer csrColIndC);
-	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer A, int lda) ;
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer A, int lda, long nnz) ;
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRow, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA);
 	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRowCol, jcuda.Pointer nnzTotalDevHostPtr);
 	public void deviceToHost(GPUContext gCtx, Pointer src, double [] dest, String instName, boolean isEviction);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index 3ffac4b..2a5e035 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -18,10 +18,14 @@
  */
 package org.apache.sysds.runtime.matrix.data;
 
-import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.jcusparse.JCusparse.*;
+import static jcuda.jcusparse.JCusparse.cusparseGetMatIndexBase;
+import static jcuda.runtime.JCuda.*;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
+import jcuda.jcusparse.cusparseSpMatDescr;
+import jcuda.jcusparse.cusparseSpGEMMDescr;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
@@ -36,73 +40,162 @@
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.jcusparse.cusparseDnVecDescr;
+import jcuda.jcusparse.cusparseDnMatDescr;
+
+import static jcuda.jcusparse.cusparseIndexType.CUSPARSE_INDEX_32I;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.cudaDataType.CUDA_R_64F;
+import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
+import static jcuda.jcusparse.cusparseStatus.CUSPARSE_STATUS_SUCCESS;
+import static jcuda.jcusparse.cusparseSpMVAlg.CUSPARSE_SPMV_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOrder.CUSPARSE_ORDER_COL;
+import static jcuda.jcusparse.cusparseSpMMAlg.CUSPARSE_SPMM_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseCsr2CscAlg.CUSPARSE_CSR2CSC_ALG1;
+import static jcuda.jcusparse.cusparseSparseToDenseAlg.CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ONE;
+import static jcuda.jcusparse.cusparseDenseToSparseAlg.CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
 
 public class DoublePrecisionCudaSupportFunctions implements CudaSupportFunctions {
 
 	private static final Log LOG = LogFactory.getLog(DoublePrecisionCudaSupportFunctions.class.getName());
-	
+
 	@Override
-	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
-			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
-			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
-			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseDcsrgemm(handle, transA,  transB,  m,  n,  k,
-				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
-				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
-				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
+		cusparseSpMatDescr spMatDescrB, cusparseSpMatDescr spMatDescrC, cusparseSpGEMMDescr spgemmDescr) {
+		double[] alpha = {1.0}, beta = {0.0};
+		Pointer alphaPtr = Pointer.to(alpha), betaPtr = Pointer.to(beta);
+		return cusparseSpGEMM_copy(handle, transA, transB, alphaPtr, spMatDescrA.asConst(), spMatDescrB.asConst(),
+			betaPtr, spMatDescrC, CUDA_R_64F, alg, spgemmDescr);
 	}
-	
+
 	@Override
 	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
 		return JCublas2.cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 	}
-	
+
 	@Override
 	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
-			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
-			Pointer y) {
-		return JCusparse.cusparseDcsrmv(handle, transA, m, n, nnz, alpha, 
-				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+		cusparseSpMatDescr spMatDescrA, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer x, Pointer beta, Pointer y) {
+		// Create sparse matrix A in CSR format
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int dataType = CUDA_R_64F;
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense vectors vecX and vecY
+		cusparseDnVecDescr vecX = new cusparseDnVecDescr();
+		cusparseDnVecDescr vecY = new cusparseDnVecDescr();
+		cusparseCreateDnVec(vecX, n, x, dataType);
+		cusparseCreateDnVec(vecY, m, y, dataType);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMV_ALG_DEFAULT;
+		cusparseSpMV_bufferSize(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+			bufferSize);
+		// execute SpMV
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
+		try {
+			return cusparseSpMV(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+				dBuffer);
+		}
+		finally {
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroyDnVec(vecX.asConst());
+			cusparseDestroyDnVec(vecY.asConst());
+		}
 	}
-	
+
 	@Override
-	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
-			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
-			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
-		return JCusparse.cusparseDcsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
-				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	public int cusparsecsrmm2(cusparseHandle handle, int transA, int transB, int m, int n, int k, int nnz,
+		Pointer alpha, cusparseMatDescr descrA, cusparseSpMatDescr spMatDescrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+
+		int dataType = CUDA_R_64F;
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		// Create sparse matrix A in CSR format
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense matrix B
+		cusparseDnMatDescr dnMatB = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatB, k, n, ldb, B, dataType, CUSPARSE_ORDER_COL);
+		// Create dense matrix C
+		cusparseDnMatDescr dnMatC = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatC, m, n, ldc, C, dataType, CUSPARSE_ORDER_COL);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMM_ALG_DEFAULT;
+		cusparseSpMM_bufferSize(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+			dataType, alg, bufferSize);
+		// execute SpMM
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
+		try {
+			return cusparseSpMM(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+				dataType, alg, dBuffer);
+		}
+		finally {
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroySpMat(spMatDescrA.asConst());
+			cusparseDestroyDnMat(dnMatB.asConst());
+			cusparseDestroyDnMat(dnMatC.asConst());
+		}
 	}
-	
+
 	@Override
 	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
 		return JCublas2.cublasDdot(handle, n, x, incx, y, incy, result);
 	}
-	
+
 	@Override
 	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
-			int incx, Pointer beta, Pointer y, int incy) {
+		int incx, Pointer beta, Pointer y, int incy) {
 		return JCublas2.cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
 	}
-	
+
 	@Override
 	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
-			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 	}
-	
+
 	@Override
 	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
-			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
-		return JCusparse.cusparseDcsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+		Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+
+		int valType = CUDA_R_64F;            // double precision
+		int alg = CUSPARSE_CSR2CSC_ALG1;     // always supported
+
+		long[] bufferSize = {0};
+		cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+			valType, copyValues, idxBase, alg, bufferSize);
+
+		Pointer buffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(buffer, bufferSize[0]);
+		try {
+			return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+				valType, copyValues, idxBase, alg, buffer);
+		}
+		finally {
+			if(bufferSize[0] > 0)
+				cudaFree(buffer);
+		}
 	}
-	
+
 	@Override
 	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer C, int ldc) {
+		Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 	}
-	
+
 	@Override
 	public int cublasaxpy(cublasHandle handle, int n, Pointer alpha, Pointer x, int incx, Pointer y, int incy) {
 		return JCublas2.cublasDaxpy(handle, n, alpha, x, incx, y, incy);
@@ -127,35 +220,136 @@
 
 	@Override
 	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
-			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
 		return JCusolverDn.cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 	}
-	
+
 	@Override
 	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
-			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
-			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
-			Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseDcsrgeam(handle, m, n, alpha, descrA, nnzA, 
-				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
-				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+		Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+		Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+		Pointer csrRowPtrC, Pointer csrColIndC) {
+
+		long[] pBufferSizeInBytes = {0};
+
+		int status = JCusparse.cusparseDcsrgeam2_bufferSizeExt(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA,
+			csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC,
+			pBufferSizeInBytes);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		Pointer buffer = new Pointer();
+		if(pBufferSizeInBytes[0] > 0)
+			cudaMalloc(buffer, pBufferSizeInBytes[0]);
+
+		try {
+			// C = α*A + β*B
+			return JCusparse.cusparseDcsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, beta,
+				descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
+		}
+		finally {
+			if(pBufferSizeInBytes[0] > 0)
+				cudaFree(buffer);
+		}
 	}
-	
+
 	@Override
 	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
-			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
-		return JCusparse.cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda, long nnz) {
+
+		// Get index base from legacy descriptor -> 0 or 1
+		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
+
+		// Create generic sparse-matrix descriptor required by CUDA 12
+		cusparseSpMatDescr spMatA = new cusparseSpMatDescr();
+
+		// Build CSR descriptor
+		cusparseCreateCsr(spMatA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, CUDA_R_64F);
+
+		// Build dense descriptor
+		cusparseDnMatDescr matB = new cusparseDnMatDescr();
+		cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_64F, CUSPARSE_ORDER_COL);
+
+		// Determine buffer size
+		int alg = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+		long[] bufSize = {0};
+		cusparseSparseToDense_bufferSize(handle, spMatA.asConst(), matB, alg,
+			bufSize);    // bufSize[0] now holds the exact byte count
+
+		// Allocate scratch space of the requested size
+		Pointer dBuffer = new Pointer();
+		if(bufSize[0] > 0) {
+			cudaMalloc(dBuffer, bufSize[0]);
+		}
+		try {
+			// Write dense matrix
+			int algSparseToDense = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+			return cusparseSparseToDense(handle, spMatA.asConst(), matB, algSparseToDense, dBuffer);
+		}
+		finally {
+			if(bufSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroyDnMat(matB.asConst());
+			cusparseDestroySpMat(spMatA.asConst());
+		}
 	}
 
 	@Override
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
-		return JCusparse.cusparseDdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+		Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		// setup
+		int dataType = CUDA_R_64F;
+		cusparseSpMatDescr csrDesc = new cusparseSpMatDescr();
+		cusparseDnMatDescr denseDesc = new cusparseDnMatDescr();
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+		long[] bufferSize = {0};
+		Pointer dBuffer = new Pointer();
+
+		// Create dense matrix A
+		cusparseCreateDnMat(denseDesc, m, n, lda, A, dataType, CUSPARSE_ORDER_COL);
+
+		// Create sparse matrix B in CSR format
+		cusparseCreateCsr(csrDesc, m, n, 0, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			idxBase, dataType);
+
+		// allocate an external buffer if needed
+		cusparseDenseToSparse_bufferSize(handle, denseDesc.asConst(), csrDesc, alg, bufferSize);
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
+
+		// prepare Dense to Sparse conversion
+		cusparseDenseToSparse_analysis(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
+
+		/* Keep this in case needed later for debugging */
+		/*long[] rowsTmp = {0}, colsTmp = {0}, nnz  = {0};
+		JCusparse.cusparseSpMatGetSize(csrDesc.asConst(), rowsTmp, colsTmp, nnz);
+
+		// only allocate if caller passed null pointers
+		if (csrColIndA == null)
+			cudaMalloc(csrColIndA, nnz[0] * Sizeof.INT);
+		if (csrValA == null)
+			cudaMalloc(csrValA, nnz[0] * Sizeof.DOUBLE);*/
+
+		// re-attach column & value pointers
+		JCusparse.cusparseCsrSetPointers(csrDesc, csrRowPtrA, csrColIndA, csrValA);
+
+		try {
+			// execute Dense to Sparse conversion
+			return cusparseDenseToSparse_convert(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
+		}
+		finally {
+			cusparseDestroyDnMat(denseDesc.asConst());
+			cusparseDestroySpMat(csrDesc.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+		}
 	}
 
 	@Override
 	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
 		return JCusparse.cusparseDnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
index 974d131..3a9cf83 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
@@ -1629,7 +1629,7 @@
 			if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
 				// Special case for transpose
 
-				int nnz = (int)A.nnz;
+				int nnz = toInt(A.nnz);
 				CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
 				out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
 				cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
index ea6d409..f32dbd0 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
@@ -34,16 +34,21 @@
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
+import static jcuda.jcudnn.cudnnRNNDataLayout.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED;
+import static jcuda.jcudnn.cudnnForwardMode.CUDNN_FWD_MODE_TRAINING;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
 import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
 import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
 import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.jcudnn.cudnnSoftmaxAlgorithm.CUDNN_SOFTMAX_ACCURATE;
+import static jcuda.jcudnn.cudnnSoftmaxMode.CUDNN_SOFTMAX_MODE_CHANNEL;
+
+import jcuda.jcudnn.cudnnRNNDataDescriptor;
 import jcuda.CudaException;
 import jcuda.Pointer;
 import jcuda.jcudnn.JCudnn;
 import jcuda.jcudnn.cudnnActivationDescriptor;
-import jcuda.jcudnn.cudnnConvolutionFwdPreference;
 import jcuda.jcudnn.cudnnHandle;
 import jcuda.jcudnn.cudnnStatus;
 import jcuda.jcudnn.cudnnTensorDescriptor;
@@ -62,8 +67,7 @@
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
 import org.apache.sysds.utils.Statistics;
 
-import static jcuda.jcudnn.cudnnSoftmaxAlgorithm.CUDNN_SOFTMAX_ACCURATE;
-import static jcuda.jcudnn.cudnnSoftmaxMode.CUDNN_SOFTMAX_MODE_CHANNEL;
+
 
 /**
  * This class contains method that invoke CuDNN operations.
@@ -73,14 +77,14 @@
 	// Currently we only use nnz information from the sparse matrix which is pre-computed
 	// TODO: experiment how often does dense matrix is empty where recomputing nnz before calling CuDNN will help
 	private static final boolean RECOMPUTE_DENSE_NNZ = false;
-	
-	protected static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+
+	//protected static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
 	private static final Log LOG = LogFactory.getLog(LibMatrixCuDNN.class.getName());
 
 	protected static cudnnHandle getCudnnHandle(GPUContext gCtx) {
 		return gCtx.getCudnnHandle();
 	}
-	
+
 	/**
 	 * Does a 2D convolution followed by a bias_add
 	 *
@@ -833,130 +837,227 @@
 	 * @param T sequence length
 	 * @throws DMLRuntimeException if error
 	 */
-	public static void lstm(ExecutionContext ec, GPUContext gCtx, String instName,
-			Pointer X,  Pointer wPointer, Pointer out0, Pointer c0, boolean return_sequences,
-			String outputName, String cyName, int N, int M, int D, int T) throws DMLRuntimeException {
-		singleLayerUnidirectionalRNNForward(ec, gCtx, instName, X, out0, c0, wPointer, outputName, cyName, "lstm", return_sequences, N, M, D, T);
+	public static void lstm(ExecutionContext ec, GPUContext gCtx, String instName, Pointer X, Pointer wPointer,
+		Pointer out0, Pointer c0, boolean return_sequences, String outputName, String cyName, int N, int M, int D,
+		int T) throws DMLRuntimeException {
+		singleLayerUnidirectionalRNNForward(ec, gCtx, instName, X, out0, c0, wPointer, outputName, cyName, "lstm",
+			return_sequences, N, M, D, T);
 	}
-	
+
+	/**
+	 * Run a single-layer, unidirectional RNN/LSTM/GRU forward pass.
+	 *
+	 * @param ec               Execution context
+	 * @param gCtx             GPU context
+	 * @param instName         Instruction name for memory tracking
+	 * @param x                Input  X  (device pointer, shape N×D packed by time)
+	 * @param hx               Initial hidden state H₀ (device pointer, N×M)
+	 * @param cx               Initial cell state   C₀ (only for LSTM, else dummy)
+	 * @param wPointer         Flat weight buffer, already on device
+	 * @param outputName       SystemDS name for Y / last-state     output
+	 * @param cyName           SystemDS name for final cell state   output
+	 * @param rnnMode          "lstm" / "gru" / "rnn_relu" / "rnn_tanh"
+	 * @param return_sequences true ⇒ return the whole Y; false ⇒ only last step
+	 * @param N                Batch size
+	 * @param M                Hidden size
+	 * @param D                Input size
+	 * @param T                Sequence length
+	 */
 	private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
-			Pointer x, Pointer hx, Pointer cx, Pointer wPointer,  // input
-			String outputName, String cyName,  					 // output
-			String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String outputName, String cyName, String rnnMode,
+		boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
 		boolean hasCarry = rnnMode.equalsIgnoreCase(Opcodes.LSTM.toString());
-		// Get output pointers
-		Pointer cudnnYPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
-		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName,
-			(long) N*M*sizeOfDataType, false);
+
+		/* ------------------------------------------------------------------ */
+		/* 0. Allocate output buffers                                         */
+		/* ------------------------------------------------------------------ */
+		Pointer yCudnn = gCtx.allocate(instName, (long) N * T * M * sizeOfDataType, false);          // Y from cuDNN
+
+		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N,
+			M) : gCtx.allocate(instName, (long) N * M * sizeOfDataType, false);
+
 		Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
-		// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
-		
-		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
-			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.xDesc, x, 
-					algo.hxDesc, hx, 
-					algo.cxDesc, cx, 
-					algo.wDesc, wPointer, 
-					algo.yDesc, cudnnYPointer, 
-					algo.hyDesc, hyPointer, 
-					algo.cyDesc, cyPointer, 
-					algo.workSpace, algo.sizeInBytes, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
+
+		/* ------------------------------------------------------------------ */
+		/* 1. Build helper with v8 RNN descriptor                             */
+		/* ------------------------------------------------------------------ */
+		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D,
+			/*training*/true, wPointer)) {
+			/* -------------------------------------------------------------- */
+			/* 1a. Single RNN-DATA descriptors for X and Y                    */
+			/* -------------------------------------------------------------- */
+			cudnnRNNDataDescriptor xDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(xDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(xDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, D, null,
+				null);                             // uniform length = T
+
+			cudnnRNNDataDescriptor yDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(yDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(yDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, M, null, null);
+
+			/* -------------------------------------------------------------- */
+			/* 1b. Obtain size cuDNN expects for packed weight-space          */
+			/*     and reuse existing wPointer buffer                         */
+			/* -------------------------------------------------------------- */
+			long[] wSpaceBytes = {0};
+			JCudnn.cudnnGetRNNWeightSpaceSize(gCtx.getCudnnHandle(), algo.rnnDesc, wSpaceBytes);
+			long weightSpaceSize = wSpaceBytes[0];
+			Pointer weightSpace = wPointer;  // assume caller already packed
+
+			/* -------------------------------------------------------------- */
+			/* 2. Forward pass (training mode)                                */
+			/* -------------------------------------------------------------- */
+			JCudnn.cudnnRNNForward(gCtx.getCudnnHandle(), algo.rnnDesc, CUDNN_FWD_MODE_TRAINING,
+				// unified API flag
+				null,                                // devSeqLengths (uniform)
+				xDesc, x, yDesc, yCudnn, algo.hxDesc, hx, hyPointer, algo.cxDesc, cx, cyPointer, weightSpaceSize,
+				weightSpace, algo.sizeInBytes, algo.workSpace, algo.reserveSpaceSizeInBytes, algo.reserveSpace);
+
+			/* ------------------------------------------------------------------ */
+			/* 3. Copy / reshape Y when user asked for full sequences              */
+			/* ------------------------------------------------------------------ */
+			if(return_sequences) {
+				gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
+
+				Pointer ySysds = getDenseOutputPointer(ec, gCtx, instName, outputName, N, (long) T * M);
+
+				LibMatrixCUDA.getCudaKernels(gCtx)
+					.launchKernel("prepare_lstm_output", ExecutionConfig.getConfigForSimpleVectorOperations(N * T * M),
+						ySysds, yCudnn, N, T, M, N * T * M);
+			}
+
+			/* ------------------------------------------------------------------ */
+			/* 4. Free temporaries                                                */
+			/* ------------------------------------------------------------------ */
+			gCtx.cudaFreeHelper(instName, yCudnn, DMLScript.EAGER_CUDA_FREE);
+			JCudnn.cudnnDestroyRNNDataDescriptor(xDesc);
+			JCudnn.cudnnDestroyRNNDataDescriptor(yDesc);
 		}
-		
-		if(return_sequences) {
-			gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
-			Pointer sysdsYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
-			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
-					ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
-					sysdsYPointer, cudnnYPointer, N, T, M, N*T*M);
-		}
-		gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
 	}
-	
-	public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
-			Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
-			String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
-			boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
-		// Transform the input dout and prepare them for cudnnRNNBackwardData
-		Pointer dy = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
-		int size = return_sequences ? N*T*M : N*M;
+
+	public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName, Pointer x, Pointer hx,
+		Pointer cx, Pointer wPointer,          // inputs
+		String doutName, String dcyName,                              // grad-in
+		String dxName, String dwName, String dbName,                  // grad-out
+		String dhxName, String dcxName, boolean return_sequences, int N, int M, int D, int T)
+		throws DMLRuntimeException {
+		/* ------------------------------------------------------------------ */
+		/* 0. Prepare dY from dout (SystemDS layout → cuDNN layout)           */
+		/* ------------------------------------------------------------------ */
+		long elemsY = (long) N * T * M;
+		Pointer dY = gCtx.allocate(instName, elemsY * sizeOfDataType, false);
+		Pointer yPointer = gCtx.allocate(instName, (long) N * T * M * sizeOfDataType, false);
+
+		long doutElems = return_sequences ? elemsY : (long) N * M;
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
-				ExecutionConfig.getConfigForSimpleVectorOperations(size),
-				getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? (long) T*M : M),
-				dy, N, T, M, size, return_sequences ? 1 : 0);
+			ExecutionConfig.getConfigForSimpleVectorOperations((int) doutElems),
+			getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? (long) T * M : M), dY, N, T, M,
+			doutElems, return_sequences ? 1 : 0);
+
 		ec.releaseMatrixInputForGPUInstruction(doutName);
-				
-		// Allocate intermediate pointers computed by forward
-		Pointer yPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
-		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
-			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.xDesc, x, 
-					algo.hxDesc, hx, 
-					algo.cxDesc, cx, 
-					algo.wDesc, wPointer, 
-					algo.yDesc, yPointer, 
-					algo.hyDesc, new Pointer(), 
-					algo.cyDesc, new Pointer(), 
-					algo.workSpace, algo.sizeInBytes, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
-			
-			Pointer cudnnDx = gCtx.allocate(instName, (long) N *T*D*LibMatrixCUDA.sizeOfDataType, false);
-			JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.yDesc, yPointer,
-					// ----------------------
-					// Additional inputs:
-					algo.dyDesc, dy, 
-					algo.dhyDesc, new Pointer(), 
-					algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
-					// ----------------------
-					algo.wDesc, wPointer, 
-					algo.hxDesc, hx,
-					algo.cxDesc, cx,
-					// ----------------------
-					// Output:
-					algo.dxDesc, cudnnDx, 
-					algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), 
-					algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
-					// ----------------------
-					algo.workSpace, algo.sizeInBytes, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
-			gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
+
+		/* ------------------------------------------------------------------ */
+		/* 1. Build helper → rnnDesc (v8) and workspace sizes                 */
+		/* ------------------------------------------------------------------ */
+		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D,
+			/*training*/true, wPointer)) {
+			/* -------------------------------------------------------------- */
+			/* 1a. Create single RNN-DATA descriptors for X and Y             */
+			/* -------------------------------------------------------------- */
+			cudnnRNNDataDescriptor xDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(xDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(xDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, D, null, null);
+
+			cudnnRNNDataDescriptor yDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(yDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(yDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, M, null, null);
+
+			/* -------------------------------------------------------------- */
+			/* 1b. Packed weight-space info                                   */
+			/* -------------------------------------------------------------- */
+			long[] wSpaceBytes = {0};
+			JCudnn.cudnnGetRNNWeightSpaceSize(gCtx.getCudnnHandle(), algo.rnnDesc, wSpaceBytes);
+			long weightSpaceSize = wSpaceBytes[0];
+			Pointer weightSpace = wPointer;   // flat weights already packed
+
+			/* -------------------------------------------------------------- */
+			/* 2. Forward pass (needed to fill reserve-space)                 */
+			/* -------------------------------------------------------------- */
+			JCudnn.cudnnRNNForward(gCtx.getCudnnHandle(), algo.rnnDesc, CUDNN_FWD_MODE_TRAINING, null, xDesc, x, yDesc,
+				yPointer,            // algo.yTemp sized N·T·M
+				algo.hxDesc, hx, new Pointer(),   // hy unused
+				algo.cxDesc, cx, new Pointer(),   // cy unused
+				weightSpaceSize, weightSpace, algo.sizeInBytes, algo.workSpace, algo.reserveSpaceSizeInBytes,
+				algo.reserveSpace);
+
+			/* -------------------------------------------------------------- */
+			/* 3. Back-prop through time: dX, dH₀, dC₀                        */
+			/* -------------------------------------------------------------- */
+			Pointer dX = gCtx.allocate(instName, (long) N * T * D * sizeOfDataType, false);
+
+			JCudnn.cudnnRNNBackwardData_v8(gCtx.getCudnnHandle(), algo.rnnDesc, null,
+				// devSeqLengths
+				yDesc, yPointer, dY,          // y, dy
+				xDesc, dX,                      // out: dx
+				algo.hxDesc, hx, new Pointer(),                  // dhy = 0
+				getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), algo.cxDesc, cx,
+				getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
+				getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M), weightSpaceSize, weightSpace,
+				algo.sizeInBytes, algo.workSpace, algo.reserveSpaceSizeInBytes, algo.reserveSpace);
+
 			ec.releaseMatrixInputForGPUInstruction(dcyName);
 			ec.releaseMatrixOutputForGPUInstruction(dhxName);
 			ec.releaseMatrixOutputForGPUInstruction(dcxName);
-			
-			Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
-			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
-					ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
-					smlDx, cudnnDx, N, D, T*D, N*T*D);
+
+			/* Copy dX back into SystemDS layout --------------------------- */
+			Pointer sysdsDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, (long) T * D);
+
+			LibMatrixCUDA.getCudaKernels(gCtx)
+				.launchKernel("prepare_lstm_dinput", ExecutionConfig.getConfigForSimpleVectorOperations(N * T * D),
+					sysdsDx, dX, N, D, T * D, N * T * D);
+
 			ec.releaseMatrixOutputForGPUInstruction(dxName);
-			gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
-			
-			// -------------------------------------------------------------------------------------------
-			Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4L *M)*LibMatrixCUDA.sizeOfDataType, false);
-			JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.xDesc, x, 
-					algo.hxDesc, hx, 
-					algo.yDesc, yPointer, 
-					algo.workSpace, algo.sizeInBytes, 
-					algo.dwDesc, cudnnDwPointer, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
+			gCtx.cudaFreeHelper(instName, dX, DMLScript.EAGER_CUDA_FREE);
+
+			/* -------------------------------------------------------------- */
+			/* 4. Weight & bias gradients                                     */
+			/* -------------------------------------------------------------- */
+			long dWeightBytes = weightSpaceSize;
+			Pointer dWeightSpace = gCtx.allocate(instName, dWeightBytes, false);
+
+			JCudnn.cudnnRNNBackwardWeights_v8(gCtx.getCudnnHandle(), algo.rnnDesc,
+				/*addGrad=*/0, null,                     // devSeqLengths
+				xDesc, x, algo.hxDesc, hx, yDesc, yPointer, dWeightBytes, dWeightSpace, algo.sizeInBytes,
+				algo.workSpace, algo.reserveSpaceSizeInBytes, algo.reserveSpace);
+
+			/* Split packed dWeightSpace into SystemDS tensors ------------- */
 			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
-					ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
-					getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
-					getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
-			gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
+				ExecutionConfig.getConfigForSimpleVectorOperations((D + M + 2) * (4 * M)),
+				getDenseOutputPointer(ec, gCtx, instName, dwName, D + M, 4L * M),
+				getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4L * M), dWeightSpace, D, M);
+
+			gCtx.cudaFreeHelper(instName, dWeightSpace, DMLScript.EAGER_CUDA_FREE);
 			ec.releaseMatrixOutputForGPUInstruction(dwName);
 			ec.releaseMatrixOutputForGPUInstruction(dbName);
-			// -------------------------------------------------------------------------------------------
-			
+
+			/* -------------------------------------------------------------- */
+			/* 5. Free temporaries                                            */
+			/* -------------------------------------------------------------- */
+			gCtx.cudaFreeHelper(instName, dY, DMLScript.EAGER_CUDA_FREE);
 			gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
+
+			JCudnn.cudnnDestroyRNNDataDescriptor(xDesc);
+			JCudnn.cudnnDestroyRNNDataDescriptor(yDesc);
 		}
 	}
-	
-	
-	
+
+
+
+
 	/**
 	 * Performs the forward BatchNormalization layer computation for training
 	 * @param gCtx   a valid {@link GPUContext}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index ac12b50..fab29fd 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -20,11 +20,15 @@
 package org.apache.sysds.runtime.matrix.data;
 
 import jcuda.Pointer;
-import jcuda.jcudnn.cudnnConvolutionBwdFilterPreference;
+import jcuda.jcudnn.cudnnConvolutionBwdDataAlgo;
+import jcuda.jcudnn.cudnnConvolutionBwdDataAlgoPerf;
+import jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo;
+import jcuda.jcudnn.cudnnConvolutionBwdFilterAlgoPerf;
 import jcuda.jcudnn.cudnnConvolutionDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdAlgo;
 import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
+
 import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
@@ -167,22 +171,38 @@
 	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilterAlgorithm(
 			GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, 
 			int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
-		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
-				pad_h, pad_w, stride_h, stride_w, P, Q);
-		
-		int[] algos = {-1};
-		long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
-		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
-				LibMatrixCuDNN.getCudnnHandle(gCtx), 
-				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, 
-				cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
-		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
-		if (sizeInBytesArray[0] != 0)
+		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K,
+			R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+
+		final int maxAlgos = cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
+		cudnnConvolutionBwdFilterAlgoPerf[] perf = new cudnnConvolutionBwdFilterAlgoPerf[maxAlgos];
+		for(int i = 0; i < maxAlgos; ++i)
+			perf[i] = new cudnnConvolutionBwdFilterAlgoPerf();
+		int[] returnedAlgoCount = {0};
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm_v7(LibMatrixCuDNN.getCudnnHandle(gCtx),
+			ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, maxAlgos, returnedAlgoCount, perf);
+
+		long workspaceCap = Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES);
+		int chosenAlgo = cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+		long chosenWs = 0;
+
+		for(int i = 0; i < returnedAlgoCount[0]; ++i) {
+			if(perf[i].memory <= workspaceCap) {
+				chosenAlgo = perf[i].algo;
+				chosenWs = perf[i].memory;
+				break;
+			}
+		}
+
+		long[] sizeInBytesArray = {chosenWs};
+
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
+			ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, chosenAlgo, sizeInBytesArray);
+		if(sizeInBytesArray[0] != 0)
 			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false);
 		ret.sizeInBytes = sizeInBytesArray[0];
-		ret.algo = algos[0];
-		
+		ret.algo = chosenAlgo;
+
 		return ret;
 	}
 	
@@ -220,18 +240,36 @@
 			ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
 		}
 		else {
-			int[] algos = {-1};
-			long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
-					LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
-					jcuda.jcudnn.cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
-			if (sizeInBytesArray[0] != 0)
+			final int max = cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
+			cudnnConvolutionBwdDataAlgoPerf[] perf = new cudnnConvolutionBwdDataAlgoPerf[max];
+			for(int i = 0; i < max; ++i)
+				perf[i] = new cudnnConvolutionBwdDataAlgoPerf();
+			int[] nReturned = {0};
+
+			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm_v7(LibMatrixCuDNN.getCudnnHandle(gCtx),
+				ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, max, nReturned, perf);
+
+			long cap = Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES);
+			int chosenAlgo = cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+			long chosenWs = 0;
+
+			for(int i = 0; i < nReturned[0]; ++i) {
+				if(perf[i].memory <= cap) {
+					chosenAlgo = perf[i].algo;
+					chosenWs = perf[i].memory;
+					break;
+				}
+			}
+
+			long[] sizeInBytesArray = {chosenWs};
+			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
+				ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, chosenAlgo, sizeInBytesArray);
+
+			if(sizeInBytesArray[0] != 0)
 				ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false);
+
 			ret.sizeInBytes = sizeInBytesArray[0];
-			ret.algo = algos[0];
+			ret.algo = chosenAlgo;
 		}
 		return ret;
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
index abcb4f0..55f47cb 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -26,7 +26,18 @@
 import static jcuda.jcudnn.JCudnn.cudnnSetTensorNdDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyDropoutDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyRNNDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnGetRNNWeightSpaceSize;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_HALF;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_BFLOAT16;
+import static jcuda.jcudnn.cudnnMathType.CUDNN_DEFAULT_MATH;
+import static jcuda.jcudnn.cudnnMathType.CUDNN_TENSOR_OP_MATH;
+import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
+import static jcuda.jcudnn.cudnnRNNBiasMode.CUDNN_RNN_DOUBLE_BIAS;
+import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
+import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
+import static jcuda.jcudnn.cudnnRNNDataLayout.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED;
+import static jcuda.jcudnn.cudnnForwardMode.CUDNN_FWD_MODE_TRAINING;
 
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.common.Opcodes;
@@ -35,9 +46,6 @@
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 
 import static jcuda.jcudnn.JCudnn.cudnnCreateRNNDescriptor;
-import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
-import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
-import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
 
 import jcuda.Pointer;
 import jcuda.jcudnn.JCudnn;
@@ -45,6 +53,7 @@
 import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnRNNDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
+import jcuda.jcudnn.cudnnRNNDataDescriptor;
 
 public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
 	GPUContext gCtx;
@@ -52,6 +61,7 @@
 	cudnnDropoutDescriptor dropoutDesc;
 	cudnnRNNDescriptor rnnDesc;
 	cudnnTensorDescriptor[] xDesc, dxDesc, yDesc, dyDesc; // of length T
+	cudnnRNNDataDescriptor xDataDesc;
 	cudnnTensorDescriptor hxDesc, cxDesc, hyDesc, cyDesc, dhxDesc, dcxDesc, dhyDesc, dcyDesc; 
 	cudnnFilterDescriptor wDesc;
 	cudnnFilterDescriptor dwDesc;
@@ -90,25 +100,37 @@
 		JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes);
 		dropOutSizeInBytes = _dropOutSizeInBytes[0];
 		dropOutStateSpace = new Pointer();
-		if (dropOutSizeInBytes != 0)
+		if(dropOutSizeInBytes != 0)
 			dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes, false);
-		JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
-		
+		JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes,
+			12345);
+
 		// Initialize RNN descriptor
 		rnnDesc = new cudnnRNNDescriptor();
 		cudnnCreateRNNDescriptor(rnnDesc);
-		JCudnn.cudnnSetRNNDescriptor_v6(gCtx.getCudnnHandle(), rnnDesc, M, 1, dropoutDesc, 
-				CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, 
-				getCuDNNRnnMode(rnnMode), CUDNN_RNN_ALGO_STANDARD, LibMatrixCUDA.CUDNN_DATA_TYPE);
-		
+		int mathType = (LibMatrixCUDA.CUDNN_DATA_TYPE == CUDNN_DATA_HALF ||
+			LibMatrixCUDA.CUDNN_DATA_TYPE == CUDNN_DATA_BFLOAT16) ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
+		int mathPrec = LibMatrixCUDA.CUDNN_DATA_TYPE;
+		JCudnn.cudnnSetRNNDescriptor_v8(rnnDesc, CUDNN_RNN_ALGO_STANDARD, getCuDNNRnnMode(rnnMode),
+			CUDNN_RNN_DOUBLE_BIAS, CUDNN_UNIDIRECTIONAL, CUDNN_LINEAR_INPUT, LibMatrixCUDA.CUDNN_DATA_TYPE, mathPrec,
+			mathType, D, M, 0, 1, dropoutDesc, 0);
+
+		// Create the RNN data descriptor (sequence-major packed layout) used by the v8/v9 size queries below
+		xDataDesc = new cudnnRNNDataDescriptor();
+		JCudnn.cudnnCreateRNNDataDescriptor(xDataDesc);
+		JCudnn.cudnnSetRNNDataDescriptor(xDataDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+			CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, D, null, null);
+
 		// Allocate filter descriptor
 		int expectedNumWeights = getExpectedNumWeights();
-		if(rnnMode.equalsIgnoreCase(Opcodes.LSTM.toString()) && (D+M+2)*4*M != expectedNumWeights) {
-			throw new DMLRuntimeException("Incorrect number of RNN parameters " +  (D+M+2)*4*M + " != " +  expectedNumWeights + ", where numFeatures=" + D + ", hiddenSize=" + M);
+		if(rnnMode.equalsIgnoreCase(Opcodes.LSTM.toString()) && (D + M + 2) * 4 * M != expectedNumWeights) {
+			throw new DMLRuntimeException(
+				"Incorrect number of RNN parameters " + (D + M + 2) * 4 * M + " != " + expectedNumWeights +
+					", where numFeatures=" + D + ", hiddenSize=" + M);
 		}
 		wDesc = allocateFilterDescriptor(expectedNumWeights);
 		dwDesc = allocateFilterDescriptor(expectedNumWeights);
-		
+
 		// Setup workspace
 		workSpace = new Pointer(); reserveSpace = new Pointer();
 		sizeInBytes = getWorkspaceSize(T);
@@ -142,15 +164,25 @@
 	}
 	
 	private long getWorkspaceSize(int seqLength) {
-		long [] sizeInBytesArray = new long[1];
-		JCudnn.cudnnGetRNNWorkspaceSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
-		return sizeInBytesArray[0];
+		long[] workSize = {0};
+		long[] reserveSize = {0};          // v9 returns both sizes in one call
+
+		JCudnn.cudnnGetRNNTempSpaceSizes(gCtx.getCudnnHandle(), rnnDesc, CUDNN_FWD_MODE_TRAINING, xDataDesc, workSize,
+			reserveSize);
+
+		// keep reserveSpaceSizeInBytes in sync with the new API
+		reserveSpaceSizeInBytes = reserveSize[0];
+		return workSize[0];
 	}
 	
 	private long getReservespaceSize(int seqLength) {
-		long [] sizeInBytesArray = new long[1];
-		JCudnn.cudnnGetRNNTrainingReserveSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
-		return sizeInBytesArray[0];
+		long[] workSize = {0};
+		long[] reserveSize = {0};
+
+		JCudnn.cudnnGetRNNTempSpaceSizes(gCtx.getCudnnHandle(), rnnDesc, CUDNN_FWD_MODE_TRAINING, xDataDesc, workSize,
+			reserveSize);
+
+		return reserveSize[0];
 	}
 	
 	private static int getCuDNNRnnMode(String rnnMode) throws DMLRuntimeException {
@@ -174,10 +206,17 @@
 	}
 	
 	private int getExpectedNumWeights() throws DMLRuntimeException {
-		long [] weightSizeInBytesArray = {-1}; // (D+M+2)*4*M
-		JCudnn.cudnnGetRNNParamsSize(gCtx.getCudnnHandle(), rnnDesc, xDesc[0], weightSizeInBytesArray, LibMatrixCUDA.CUDNN_DATA_TYPE);
-		// check if (D+M+2)*4M == weightsSize / sizeof(dataType) where weightsSize is given by 'cudnnGetRNNParamsSize'.
-		return LibMatrixCUDA.toInt(weightSizeInBytesArray[0]/LibMatrixCUDA.sizeOfDataType);
+		long[] weightSpaceBytes = {-1};
+
+		// v9 API: returns the size (in bytes) of the packed "weight space"
+		cudnnGetRNNWeightSpaceSize(gCtx.getCudnnHandle(), rnnDesc, weightSpaceBytes);
+
+		if(weightSpaceBytes[0] < 0)
+			throw new DMLRuntimeException("cuDNN returned a negative weight-space size");
+
+		// convert from bytes to number of scalars
+		long numScalars = weightSpaceBytes[0] / LibMatrixCUDA.sizeOfDataType;
+		return LibMatrixCUDA.toInt(numScalars);
 	}
 	
 	private static cudnnFilterDescriptor allocateFilterDescriptor(int numWeights) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
index 5753041..e0f3d12 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -18,10 +18,16 @@
  */
 package org.apache.sysds.runtime.matrix.data;
 
+import static jcuda.cudaDataType.CUDA_R_32F;
+import static jcuda.cudaDataType.CUDA_R_64F;
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import static jcuda.jcusparse.cusparseSpMMAlg.CUSPARSE_SPMM_ALG_DEFAULT;
+import static org.apache.sysds.runtime.instructions.gpu.context.CSRPointer.getCSRMatrixInfo;
+import static org.apache.sysds.runtime.instructions.gpu.context.CSRPointer.transposeCSR;
+
 import jcuda.Pointer;
 
 import org.apache.commons.logging.Log;
@@ -149,23 +155,31 @@
 			// -------------------------------------------------------------------------------------
 			// sparse-sparse matrix multiplication
 			params.validate();
-			int transa = cusparseOp(isLeftTransposed);
-			int transb = cusparseOp(isRightTransposed);
+			int transA = cusparseOp(isLeftTransposed);
+			int transB = cusparseOp(isRightTransposed);
+			int dataType = (sizeOfDataType == 4) ? CUDA_R_32F : CUDA_R_64F;
 
 			// Step 1: Allocate output => sparse format
 			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
-
 			// Step 2: Get the handles to sparse/dense pointers for left, right
 			// and output
 			CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 			CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-			CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb,
-					params.m, params.n, params.k);
-		
+			// transpose if required
+			// cusparseSpGEMM works only with CUSPARSE_OPERATION_NON_TRANSPOSE
+			if(transA == CUSPARSE_OPERATION_TRANSPOSE) {
+				A = transposeCSR(gCtx, getCusparseHandle(gCtx), A, params.k, params.m, dataType);
+			}
+			if(transB == CUSPARSE_OPERATION_TRANSPOSE) {
+				B = transposeCSR(gCtx, getCusparseHandle(gCtx), B, params.n, params.k, dataType);
+			}
+			transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
+			transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+			CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB,
+				params.m, params.n, params.k, dataType);
 			// Step 3: Invoke the kernel
-			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
-					(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
-					C.val, C.rowPtr, C.colInd);
+			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transA, transB, CUSPARSE_SPMM_ALG_DEFAULT,
+				A.spMatDescr, B.spMatDescr, C.spMatDescr, C.spgemmDesc);
 			output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
 			// -------------------------------------------------------------------------------------
 		} else if (!isM1Sparse && isM2Sparse) {
@@ -306,7 +320,7 @@
 			int m = toInt(param.rightNumRows);
 			int n = toInt(param.rightNumCols);
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
-			cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+			cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.spMatDescr, B.descr, B.val, B.rowPtr, B.colInd, A,
 					zero(), C);
 		} else {
 			int m = toInt(param.rightNumRows);
@@ -316,7 +330,7 @@
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
 			int transb = cusparseOp(param.isRightTransposed);
 			LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
-			cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+			cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.spMatDescr, B.val,
 					B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
 		}
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index e25a68a..ab9f6d8 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -18,6 +18,8 @@
  */
 package org.apache.sysds.runtime.matrix.data;
 
+import static jcuda.cudaDataType.CUDA_R_64F;
+import static jcuda.jcusparse.JCusparse.*;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
@@ -42,42 +44,118 @@
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.jcusparse.cusparseSpMatDescr;
+import jcuda.jcusparse.cusparseSpGEMMDescr;
+
+import static jcuda.jcusparse.cusparseIndexType.CUSPARSE_INDEX_32I;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.cudaDataType.CUDA_R_32F;
+import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
+import static jcuda.jcusparse.cusparseStatus.CUSPARSE_STATUS_SUCCESS;
+import static jcuda.jcusparse.cusparseSpMVAlg.CUSPARSE_SPMV_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOrder.CUSPARSE_ORDER_COL;
+import static jcuda.jcusparse.cusparseSpMMAlg.CUSPARSE_SPMM_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseCsr2CscAlg.CUSPARSE_CSR2CSC_ALG1;
+import static jcuda.jcusparse.cusparseSparseToDenseAlg.CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ONE;
+import static jcuda.jcusparse.cusparseDenseToSparseAlg.CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaFree;
+
+import jcuda.jcusparse.cusparseDnVecDescr;
+import jcuda.jcusparse.cusparseDnMatDescr;
 
 public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions {
-	
+
 	private static final Log LOG = LogFactory.getLog(SinglePrecisionCudaSupportFunctions.class.getName());
 
 	@Override
-	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
-			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
-			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
-			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseScsrgemm(handle, transA,  transB,  m,  n,  k,
-				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
-				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
-				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
+		cusparseSpMatDescr spMatDescrB, cusparseSpMatDescr spMatDescrC, cusparseSpGEMMDescr spgemmDescr) {
+		double[] alpha = {1.0}, beta = {0.0};
+		Pointer alphaPtr = Pointer.to(alpha), betaPtr = Pointer.to(beta);
+		return cusparseSpGEMM_copy(handle, transA, transB, alphaPtr, spMatDescrA.asConst(), spMatDescrB.asConst(),
+			betaPtr, spMatDescrC, CUDA_R_32F, alg, spgemmDescr);
 	}
 
 	@Override
 	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
 		return JCublas2.cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 	}
 
 	@Override
 	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
-			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
-			Pointer y) {
-		return JCusparse.cusparseScsrmv(handle, transA, m, n, nnz, alpha, 
-				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+		cusparseSpMatDescr spMatDescrA, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer x, Pointer beta, Pointer y) {
+		// Create sparse matrix A in CSR format
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int dataType = CUDA_R_32F;
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense vectors vecX and vecY
+		cusparseDnVecDescr vecX = new cusparseDnVecDescr();
+		cusparseDnVecDescr vecY = new cusparseDnVecDescr();
+		cusparseCreateDnVec(vecX, n, x, dataType);
+		cusparseCreateDnVec(vecY, m, y, dataType);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMV_ALG_DEFAULT;
+		cusparseSpMV_bufferSize(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+			bufferSize);
+		// execute SpMV
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
+		try {
+			return cusparseSpMV(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+				dBuffer);
+		}
+		finally {
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroyDnVec(vecX.asConst());
+			cusparseDestroyDnVec(vecY.asConst());
+		}
 	}
-	
+
 	@Override
-	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
-			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
-			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
-		return JCusparse.cusparseScsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
-				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	public int cusparsecsrmm2(cusparseHandle handle, int transA, int transB, int m, int n, int k, int nnz,
+		Pointer alpha, cusparseMatDescr descrA, cusparseSpMatDescr spMatDescrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+
+		int dataType = CUDA_R_32F;
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		// Create sparse matrix A in CSR format
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense matrix B
+		cusparseDnMatDescr dnMatB = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatB, k, n, ldb, B, dataType, CUSPARSE_ORDER_COL);
+		// Create dense matrix C
+		cusparseDnMatDescr dnMatC = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatC, m, n, ldc, C, dataType, CUSPARSE_ORDER_COL);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMM_ALG_DEFAULT;
+		cusparseSpMM_bufferSize(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+			dataType, alg, bufferSize);
+		// execute SpMM
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
+		try {
+			return cusparseSpMM(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+				dataType, alg, dBuffer);
+		}
+		finally {
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroySpMat(spMatDescrA.asConst());
+			cusparseDestroyDnMat(dnMatB.asConst());
+			cusparseDestroyDnMat(dnMatC.asConst());
+		}
 	}
 
 	@Override
@@ -87,25 +165,43 @@
 
 	@Override
 	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
-			int incx, Pointer beta, Pointer y, int incy) {
+		int incx, Pointer beta, Pointer y, int incy) {
 		return JCublas2.cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
 	}
 
 	@Override
 	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
-			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 	}
 
 	@Override
 	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
-			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
-		return JCusparse.cusparseScsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+		Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+
+		int valType = CUDA_R_32F;            // single precision
+		int alg = CUSPARSE_CSR2CSC_ALG1;     // always supported
+
+		long[] bufferSize = {0};
+		cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+			valType, copyValues, idxBase, alg, bufferSize);
+
+		Pointer buffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(buffer, bufferSize[0]);
+		try {
+			return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+				valType, copyValues, idxBase, alg, buffer);
+		}
+		finally {
+			if(bufferSize[0] > 0)
+				cudaFree(buffer);
+		}
 	}
 
 	@Override
 	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer C, int ldc) {
+		Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 	}
 
@@ -130,41 +226,143 @@
 			Pointer Workspace, int Lwork, Pointer devInfo) {
 		return JCusolverDn.cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 	}
-	
+
 	@Override
 	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
-			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
 		return JCusolverDn.cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 	}
 
 	@Override
 	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
-			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
-			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
-			Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseScsrgeam(handle, m, n, alpha, descrA, nnzA, 
-				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
-				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+		Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+		Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+		Pointer csrRowPtrC, Pointer csrColIndC) {
+
+		long[] pBufferSizeInBytes = {0};
+
+		int status = JCusparse.cusparseScsrgeam2_bufferSizeExt(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA,
+			csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC,
+			pBufferSizeInBytes);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		Pointer buffer = new Pointer();
+		if(pBufferSizeInBytes[0] > 0)
+			cudaMalloc(buffer, pBufferSizeInBytes[0]);
+
+		try {
+			// C = alpha*A + beta*B (single-precision variant to match this class)
+			return JCusparse.cusparseScsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, beta,
+				descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
+		}
+		finally {
+			if(pBufferSizeInBytes[0] > 0)
+				cudaFree(buffer);
+		}
 	}
 
 	@Override
 	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
-			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
-		return JCusparse.cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda, long nnz) {
+
+		// Get index base from legacy descriptor -> 0 or 1
+		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
+
+		// Create generic sparse-matrix descriptor required by CUDA 12
+		cusparseSpMatDescr spMatA = new cusparseSpMatDescr();
+
+		// Build CSR descriptor
+		cusparseCreateCsr(spMatA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, CUDA_R_32F);
+
+		// Build dense descriptor
+		cusparseDnMatDescr matB = new cusparseDnMatDescr();
+		cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_32F, CUSPARSE_ORDER_COL);
+
+		// Determine buffer size
+		int alg = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+		long[] bufSize = {0};
+		cusparseSparseToDense_bufferSize(handle, spMatA.asConst(), matB, alg,
+			bufSize);    //bufSize[0] now holds the exact byte count
+
+		// Allocate scratch space of the requested size
+		Pointer dBuffer = new Pointer();
+		if(bufSize[0] > 0) {
+			cudaMalloc(dBuffer, bufSize[0]);
+		}
+		try {
+			// Write dense matrix
+			int algSparseToDense = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+			return cusparseSparseToDense(handle, spMatA.asConst(), matB, algSparseToDense, dBuffer);
+		}
+		finally {
+			if(bufSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroyDnMat(matB.asConst());
+			cusparseDestroySpMat(spMatA.asConst());
+		}
+
 	}
-	
+
 	@Override
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
-		return JCusparse.cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+		Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		// setup
+		int dataType = CUDA_R_32F;
+		cusparseSpMatDescr csrDesc = new cusparseSpMatDescr();
+		cusparseDnMatDescr denseDesc = new cusparseDnMatDescr();
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+		long[] bufferSize = {0};
+		Pointer dBuffer = new Pointer();
+
+		// Create dense matrix A
+		cusparseCreateDnMat(denseDesc, m, n, lda, A, dataType, CUSPARSE_ORDER_COL);
+
+		// Create sparse matrix B in CSR format
+		cusparseCreateCsr(csrDesc, m, n, 0, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			idxBase, dataType);
+
+		// allocate an external buffer if needed
+		cusparseDenseToSparse_bufferSize(handle, denseDesc.asConst(), csrDesc, alg, bufferSize);
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
+
+		// prepare Sparse to Dense conversion
+		cusparseDenseToSparse_analysis(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
+
+		/* Keep this in case it is needed later for debugging */
+		/*long[] rowsTmp = {0}, colsTmp = {0}, nnz  = {0};
+		JCusparse.cusparseSpMatGetSize(csrDesc.asConst(), rowsTmp, colsTmp, nnz);
+
+		// only allocate if caller passed null pointers
+		if (csrColIndA == null)
+			cudaMalloc(csrColIndA, nnz[0] * Sizeof.INT);
+		if (csrValA == null)
+			cudaMalloc(csrValA, nnz[0] * Sizeof.DOUBLE);*/
+
+		// re-attach column & value pointers
+		JCusparse.cusparseCsrSetPointers(csrDesc, csrRowPtrA, csrColIndA, csrValA);
+
+		try {
+			// execute Sparse to Dense conversion
+			return cusparseDenseToSparse_convert(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
+		}
+		finally {
+			cusparseDestroyDnMat(denseDesc.asConst());
+			cusparseDestroySpMat(csrDesc.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+		}
 	}
-	
+
 	@Override
 	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
 		return JCusparse.cusparseSnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
 	}
-	
+
 	@Override
 	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) {
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java b/src/test/java/org/apache/sysds/test/TestUtils.java
index a1fae4f..107d33b 100644
--- a/src/test/java/org/apache/sysds/test/TestUtils.java
+++ b/src/test/java/org/apache/sysds/test/TestUtils.java
@@ -52,6 +52,9 @@
 import java.util.StringTokenizer;
 import java.util.concurrent.TimeUnit;
 
+import jcuda.CudaException;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaError;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.NotImplementedException;
@@ -95,6 +98,7 @@
 import org.apache.sysds.runtime.util.DataConverter;
 import org.apache.sysds.runtime.util.UtilFunctions;
 import org.junit.Assert;
+import org.junit.Assume;
 
 /**
  * <p>
@@ -3919,7 +3923,7 @@
 				return true;
 		return false;
 	}
-	
+
 	public static int isGPUAvailable() {
 		// returns cudaSuccess if at least one gpu is available
 		//final int[] deviceCount = new int[1];
@@ -3928,10 +3932,32 @@
 		return 1; //return false for now
 	}
 
-	public static MatrixBlock mockNonContiguousMatrix(MatrixBlock db){
+	public static void checkGPU() {
+		boolean gpuAvailable = false;
+		try {
+			// Ask JCuda to throw Java exceptions (much nicer than error codes)
+			JCuda.setExceptionsEnabled(true);
+
+			// How many devices does the runtime see?
+			int[] devCount = {0};
+			int status = JCuda.cudaGetDeviceCount(devCount);
+
+			gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
+		}
+		catch(UnsatisfiedLinkError | CudaException ex) {
+			// - native JCuda libs not on the class-path
+			// - or they were built for the wrong CUDA version
+			gpuAvailable = false;
+		}
+
+		Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
+			gpuAvailable);
+	}
+
+	public static MatrixBlock mockNonContiguousMatrix(MatrixBlock db) {
 		db.sparseToDense();
 		double[] vals = db.getDenseBlockValues();
-		int[] dims = new int[]{db.getNumRows(), db.getNumColumns()};
+		int[] dims = new int[] {db.getNumRows(), db.getNumColumns()};
 		MatrixBlock m = new MatrixBlock(db.getNumRows(), db.getNumColumns(), new DenseBlockFP64Mock(dims, vals));
 		m.setNonZeros(db.getNonZeros());
 		return m;
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
new file mode 100644
index 0000000..b202382
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasDotTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsDot";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasDotTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCublasDot() {
+		testCudaCublasDot();
+	}
+
+	private void testCudaCublasDot() {
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), expectedDir());
+
+		// A is a dense row vector (1 x cols), B is a dense column vector (rows x 1)
+		double[][] A = getRandomMatrix(1, cols, -1, 1, 0.70d, 5);
+		double[][] B = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
new file mode 100644
index 0000000..5b3c6ab
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasGeMVTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsMV";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGeMVTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCublasGeMV() {
+		testCudaCublasGeMVTest(1);
+	}
+
+	@Test
+	public void testCublasGeMVLeftTranspose() {
+		testCudaCublasGeMVTest(2);
+	}
+
+
+	private void testCudaCublasGeMVTest(int ID) {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("X"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(),  String.valueOf(ID), expectedDir());
+
+		// A is a dense matrix (rows x cols), X is a dense column vector (rows x 1)
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.70d, 5);
+		double[][] X = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("X", X, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
new file mode 100644
index 0000000..1809c5d
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasGeamTest extends AutomatedTestBase {
+
+    private static final String TEST_NAME = "CudaSupportFunctionsGeam";
+    private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+    private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGeamTest.class.getSimpleName() + "/";
+
+    private static final int rows = 200;
+    private static final int cols = 200;
+
+    private static final double eps = Math.pow(10, -10);
+
+    @BeforeClass
+    public static void ensureGPU() {
+        TestUtils.checkGPU();
+    }
+
+    @Override
+    public void setUp() {
+        TestUtils.clearAssertionInformation();
+        addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+    }
+
+    @Test
+    public void testCublasGeamNoTranspose() {
+        testCublasGeam(1);
+    }
+
+    @Test
+    public void testCublasGeamLeftTranspose() {
+        testCublasGeam(2);
+    }
+
+    @Test
+    public void testCublasGeamRightTranspose() {
+        testCublasGeam(3);
+    }
+
+    @Test
+    public void testCublasGeamBothTranspose() {
+        testCublasGeam(4);
+    }
+
+    private void testCublasGeam(int ID) {
+
+        TestConfiguration config = getTestConfiguration(TEST_NAME);
+        loadTestConfiguration(config);
+
+        String HOME = SCRIPT_DIR + TEST_DIR;
+        fullDMLScriptName = HOME + TEST_NAME + ".dml";
+        programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+        fullRScriptName = HOME + TEST_NAME + ".R";
+        rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+        // both matrices have to be dense
+        double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.70d, 5);
+        double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.60d, 3);
+        writeInputMatrixWithMTD("A", A, true);
+        writeInputMatrixWithMTD("B", B, true);
+
+        runTest(true, false, null, -1);
+        runRScript(true);
+
+        //compare matrices
+        HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+        HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+        TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+        Assert.assertTrue(heavyHittersContainsString("gpu_+"));
+    }
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGemmTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGemmTest.java
new file mode 100644
index 0000000..e23c11f
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGemmTest.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasGemmTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsMM";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGemmTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCudaCublasGemmNoTranspose(){
+		testCublasGemm(1);
+	}
+
+	@Test
+	public void testCudaCublasGemmLeftTranspose(){
+		testCublasGemm(2);
+	}
+
+	@Test
+	public void testCudaCublasGemmRightTranspose(){
+		testCublasGemm(3);
+	}
+
+	@Test
+	public void testCudaCublasGemmBothTranspose(){
+		testCublasGemm(4);
+	}
+
+	private void testCublasGemm(int ID){
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// A and B are both dense matrices (rows x cols)
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.70d, 5);
+		double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsr2CscTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsr2CscTest.java
new file mode 100644
index 0000000..63acbc7
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsr2CscTest.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsr2CscTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsTranspose";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsr2CscTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsr2CscTest() {
+		testCudaCusparseCsr2Csc();
+	}
+
+	private void testCudaCusparseCsr2Csc() {
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), expectedDir());
+
+		// A is a sparse matrix
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.30d, 5);
+		writeInputMatrixWithMTD("A", A, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_r'"));
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGeamTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGeamTest.java
new file mode 100644
index 0000000..7e9f708
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGeamTest.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrGeamTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsGeam";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrGeamTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsrGeamNoTranspose() {
+		testCusparseCsrGeam(1);
+	}
+
+	@Test
+	public void testCusparseCsrGeamLeftTranspose() {
+		testCusparseCsrGeam(2);
+	}
+
+	@Test
+	public void testCusparseCsrGeamRightTranspose() {
+		testCusparseCsrGeam(3);
+	}
+
+	@Test
+	public void testCusparseCsrGeamBothTranspose() {
+		testCusparseCsrGeam(4);
+	}
+
+	private void testCusparseCsrGeam(int ID) {
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// both matrices have to be sparse (0.20 sparsity) to exercise the cuSPARSE CSR geam path
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.20d, 5);
+		double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.20d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_+"));
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
new file mode 100644
index 0000000..4ed06fb
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrGemmTest extends AutomatedTestBase {
+
+    private static final String TEST_NAME = "CudaSupportFunctionsMM";
+    private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+    private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrGemmTest.class.getSimpleName() + "/";
+
+    private static final int rows = 200;
+    private static final int cols = 200;
+
+    private static final double eps = Math.pow(10, -10);
+
+    @BeforeClass
+    public static void ensureGPU() {
+        TestUtils.checkGPU();
+    }
+
+    @Override
+    public void setUp() {
+        TestUtils.clearAssertionInformation();
+        addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"R"}));
+    }
+
+    @Test
+    public void testCusparseCsrGemmNoTranspose() {
+        testCusparseCsrGemm(1);
+    }
+
+    @Test
+    public void testCusparseCsrGemmLeftTranspose() {
+        testCusparseCsrGemm(2);
+    }
+
+    @Test
+    public void testCusparseCsrGemmRightTranspose() {
+        testCusparseCsrGemm(3);
+    }
+
+    @Test
+    public void testCusparseCsrGemmBothTranspose() {
+        testCusparseCsrGemm(4);
+    }
+
+
+    private void testCusparseCsrGemm(int ID) {
+
+        TestConfiguration config = getTestConfiguration(TEST_NAME);
+        loadTestConfiguration(config);
+
+        String HOME = SCRIPT_DIR + TEST_DIR;
+        fullDMLScriptName = HOME + TEST_NAME + ".dml";
+        programArgs = new String[]{"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+        fullRScriptName = HOME + TEST_NAME + ".R";
+        rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+        // both matrices have to be sparse
+        double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.30d, 5);
+        double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.20d, 3);
+        writeInputMatrixWithMTD("A", A, true);
+        writeInputMatrixWithMTD("B", B, true);
+
+        runTest(true, false, null, -1);
+        runRScript(true);
+
+        //compare matrices
+        HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+        HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+        TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+        Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+    }
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
new file mode 100644
index 0000000..bdd8165
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrMMTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsMM";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrMMTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsrMMNoTranspose() {
+		testCusparseCsrMM(1);
+	}
+
+	@Test
+	public void testCusparseCsrMMLeftTranspose() {
+		testCusparseCsrMM(2);
+	}
+
+	@Test
+	public void testCusparseCsrMMRightTranspose() {
+		testCusparseCsrMM(3);
+	}
+
+	@Test
+	public void testCusparseCsrMMBothTranspose() {
+		testCusparseCsrMM(4);
+	}
+
+	private void testCusparseCsrMM(int ID) {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// A is sparse matrix, B a dense matrix
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.20d, 5);
+		double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
new file mode 100644
index 0000000..0566375
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrMVTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsMV";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrMVTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsrMVNoTranspose() {
+		testCusparseCsrMV(1);
+	}
+
+	@Test
+	public void testCusparseCsrMVLeftTranspose() {
+		testCusparseCsrMV(2);
+	}
+
+	private void testCusparseCsrMV(int ID) {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("X"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// A is a sparse matrix (0.20 sparsity), X is a dense column vector (rows x 1)
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.20d, 5);
+		double[][] X = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("X", X, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/nn/DNNOperationsGPUTest.java b/src/test/java/org/apache/sysds/test/gpu/nn/DNNOperationsGPUTest.java
new file mode 100644
index 0000000..2bd80c9
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/nn/DNNOperationsGPUTest.java
@@ -0,0 +1,276 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.nn;
+
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.functions.dnn.Conv1DTest;
+import org.apache.sysds.test.functions.dnn.Conv2DTest;
+import org.apache.sysds.test.functions.dnn.Conv2DBackwardTest;
+import org.apache.sysds.test.functions.dnn.Conv2DBackwardDataTest;
+import org.apache.sysds.test.functions.dnn.PoolTest;
+import org.apache.sysds.test.functions.dnn.PoolBackwardTest;
+import org.apache.sysds.test.functions.dnn.ReluBackwardTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class DNNOperationsGPUTest extends AutomatedTestBase {
+
+	@Override
+	public void setUp() {
+		TEST_GPU = true;
+		VERBOSE_STATS = true;
+	}
+
+	@Test
+	public void Conv1DGPUTest() {
+		Conv1DTest dmlTestCase = new Conv1DTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testSimpleConv1DDenseSingleBatchSingleChannelSingleFilter();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d", "gpu_append"));
+		dmlTestCase.testConv1DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense6();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense7();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DBackwardDataDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv1DBackwardFilterDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv1DBackwardFilterDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+	}
+
+	@Test
+	public void Conv2DGPUTest() {
+		Conv2DTest dmlTestCase = new Conv2DTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testConv2DDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense6();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense7();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DSparse1a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse2a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse3a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse4a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse5a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse6a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse7a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse1b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse2b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse3b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse4b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse5b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse6b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse7b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+	}
+
+	@Test
+	public void Conv2DBackwardGPUTest() {
+		Conv2DBackwardTest dmlTestCase = new Conv2DBackwardTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testConv2DBackwardFilterDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse2();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse6();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse7();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse8();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse9();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse10();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse11();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse12();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse13();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse14();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse15();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse16();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse17();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+	}
+
+	@Test
+	public void Conv2DBackwardDataGPUTest() {
+		Conv2DBackwardDataTest dmlTestCase = new Conv2DBackwardDataTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testConv2DBwdDataDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DBwdDataDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DBwdDataSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DBwdDataSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse6();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse7();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+	}
+
+	@Test
+	public void PoolGPUTest() {
+		PoolTest dmlTestCase = new PoolTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testMaxPool2DDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse2();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+	}
+
+	@Test
+	public void PoolBackwardGPUTest() {
+		PoolBackwardTest dmlTestCase = new PoolBackwardTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testMaxPool2DBackwardDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling_backward"));
+		dmlTestCase.testMaxPool2DBackwardDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling_backward"));
+		dmlTestCase.testMaxPool2DBackwardDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling_backward"));
+		dmlTestCase.testMaxPool2DBackwardSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse2();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse6();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse7();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse8();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse9();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse10();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse11();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse12();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+	}
+
+	@Test
+	public void ReluBackwardGPUTest() {
+		ReluBackwardTest dmlTestCase = new ReluBackwardTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testReluBackwardDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_relu_backward"));
+		dmlTestCase.testReluBackwardDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_relu_backward"));
+		dmlTestCase.testReluBackwardDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_relu_backward"));
+	}
+
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/nn/ResNet18GPUTest.java b/src/test/java/org/apache/sysds/test/gpu/nn/ResNet18GPUTest.java
new file mode 100644
index 0000000..d306271
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/nn/ResNet18GPUTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.nn;
+
+import jcuda.CudaException;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaError;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class ResNet18GPUTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "ResNet18GPU";
+	private static final String TEST_DIR = "gpu/nn/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + ResNet18GPUTest.class.getSimpleName() + "/";
+
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void checkGPU() {
+		boolean gpuAvailable = false;
+		try {
+			// Ask JCuda to throw Java exceptions (much nicer than error codes)
+			JCuda.setExceptionsEnabled(true);
+
+			// How many devices does the runtime see?
+			int[] devCount = {0};
+			int status = JCuda.cudaGetDeviceCount(devCount);
+
+			gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
+		}
+		catch(UnsatisfiedLinkError | CudaException ex) {
+			// - native JCuda libs not on the class-path
+			// - or they were built for the wrong CUDA version
+			gpuAvailable = false;
+		}
+
+		Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
+			gpuAvailable);
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testResnet18GPU() {
+		runResNet18GPU();
+	}
+
+	private void runResNet18GPU() {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", output("R")};
+
+		runTest(true, false, null, -1);
+		HashMap<MatrixValue.CellIndex, Double> out = readDMLMatrixFromOutputDir("R");
+
+		double v1 = out.get(new MatrixValue.CellIndex(1, 1));
+		double v2 = out.get(new MatrixValue.CellIndex(1, 2));
+		double v3 = out.get(new MatrixValue.CellIndex(1, 3));
+
+		Assert.assertTrue(v1 == 612 || v1 == 640);
+		Assert.assertEquals(192, v2, 0.0);
+		Assert.assertEquals(192, v3, 0.0);
+
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_batch_norm2d", "gpu_softmax"));
+
+	}
+}
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.R
new file mode 100644
index 0000000..872ec03
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.R
@@ -0,0 +1,39 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+
+# Perform operations
+R=A%*%B
+# Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.dml
new file mode 100644
index 0000000..528d3e2
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrices
+A = read($1)
+B = read($2)
+
+# Perform operations
+R = A%*%B
+
+# Write the result matrix R
+write(R, $3)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.R
new file mode 100644
index 0000000..39abbe4
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.R
@@ -0,0 +1,50 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+type = as.integer(args[2])
+
+
+# Perform operations
+if(type==1){
+    R = A + B
+} else if(type==2) {
+    R = t(A) + B
+} else if(type==3) {
+    R = A + t(B)
+} else if(type==4){
+    R = t(A) + t(B)
+}
+
+# Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.dml
new file mode 100644
index 0000000..bfed8c6
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.dml
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Read matrices A, B, and operation type
+A = read($1)
+B = read($2)
+type = $3
+
+# Perform operations
+if(type==1){
+    R = A + B
+}
+else if(type==2) {
+    R = t(A) + B
+}
+else if(type==3) {
+    R = A + t(B)
+}
+else if(type==4) {
+    R = t(A) + t(B)
+}
+
+# Write the result matrix R
+write(R, $4)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.R
new file mode 100644
index 0000000..eb37802
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.R
@@ -0,0 +1,50 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+type = as.integer(args[2])
+
+
+# Perform operations
+if(type==1){
+    R = A %*% B
+} else if(type==2) {
+    R = t(A) %*% B
+} else if(type==3) {
+    R = A %*% t(B)
+} else if(type==4){
+    R = t(A) %*% t(B)
+}
+
+# Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.dml
new file mode 100644
index 0000000..fbe8912
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.dml
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Read matrices A, B, and operation type
+A = read($1)
+B = read($2)
+type = $3
+
+# Perform operations
+if(type==1){
+    R = A %*% B
+}
+else if(type==2) {
+    R = t(A) %*% B
+}
+else if(type==3) {
+    R = A %*% t(B)
+}
+else if(type==4) {
+    R = t(A) %*% t(B)
+}
+
+# Write the result matrix R
+write(R, $4)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.R
new file mode 100644
index 0000000..321da58
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.R
@@ -0,0 +1,46 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+X = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
+type = as.integer(args[2])
+
+
+# Perform operations
+if(type==1){
+    R = A %*% X
+} else if(type==2) {
+    R = t(A) %*% X
+}
+
+# Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.dml
new file mode 100644
index 0000000..be444fb
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrix A, vector X, and operation type
+A = read($1)
+X = read($2)
+type = $3
+
+# Perform operations
+if(type==1){
+    R = A %*% X
+}
+else if(type==2) {
+    R = t(A) %*% X
+}
+
+# Write the result matrix R
+write(R, $4)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.R
new file mode 100644
index 0000000..4dc5894
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.R
@@ -0,0 +1,38 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Print numbers with high precision
+options(digits=22)
+
+# Load required libraries (matrixStats is not referenced in this script)
+library("Matrix")
+library("matrixStats")
+
+# Read input matrix A from MatrixMarket file <args[1]>A.mtx
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+
+# Transpose A
+R = t(A)
+# Write result matrix R as a sparse MatrixMarket file <args[2]>R
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.dml
new file mode 100644
index 0000000..27f7a0b
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read input matrix A (path given by $1)
+A = read($1)
+
+# Transpose A
+R = t(A)
+
+# Write the result matrix R to the path given by $2
+write(R, $2)
diff --git a/src/test/scripts/gpu/nn/ResNet18GPU.dml b/src/test/scripts/gpu/nn/ResNet18GPU.dml
new file mode 100644
index 0000000..a78a58c
--- /dev/null
+++ b/src/test/scripts/gpu/nn/ResNet18GPU.dml
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+conv2d_forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+  int C, int Hin, int Win, int Hf, int Wf, int strideh, int stridew,
+  int padh, int padw) return (matrix[double] out, int Hout, int Wout)
+{
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+  # 2D convolution via the built-in conv2d operator
+  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
+               stride=[strideh,stridew], padding=[padh,padw])
+  # Broadcast-add the per-filter bias b to every output position
+  out = bias_add(out, b)
+}
+
+conv2d_backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+  matrix[double] W, matrix[double] b, int C, int Hin, int Win, int Hf, int Wf,
+  int strideh, int stridew, int padh, int padw)
+  return (matrix[double] dX, matrix[double] dW, matrix[double] db)
+{
+  N = nrow(X)
+  F = nrow(W)
+  # Gradients w.r.t. filters (dW) and input (dX) via the built-in backward ops
+  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
+                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+  dX = conv2d_backward_data(W, dout, stride=[strideh,stridew], padding=[padh,padw],
+                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+  # Gradient w.r.t. the bias vector:
+  # sum each column of dout, reshape to (F, Hout*Wout), and sum each row
+  # to obtain one summed gradient per output channel.
+  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))  # shape (F, 1)
+}
+
+# He-style initialization of a conv filter bank: F filters over C*Hf*Wf fan-in, zero bias
+conv2d_init = function(int F, int C, int Hf, int Wf, int seed = -1) return (matrix[double] W, matrix[double] b) {
+  W = sqrt(2.0/(C*Hf*Wf)) * rand(rows=F, cols=C*Hf*Wf, pdf="normal", seed=seed)
+  b = matrix(0, rows=F, cols=1)
+}
+
+bn2d_forward = function(matrix[double] X, int C, int Hin, int Win,
+    double mu, double epsilon) return (matrix[double] out)
+{
+    # Fixed affine parameters (identity scale/shift) and per-channel EMA statistics
+    gamma = matrix(1, rows=C, cols=1)
+    beta = matrix(0, rows=C, cols=1)
+    ema_mean = matrix(0, rows=C, cols=1)
+    ema_var = matrix(1, rows=C, cols=1)
+    ema_mean_upd = ema_mean
+    ema_var_upd = ema_var
+    cache_mean = ema_mean
+    cache_inv_var = ema_var
+    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_inv_var] = batch_norm2d(X, gamma, beta, ema_mean, ema_var, 'train', epsilon, mu)
+}
+
+# Fully-connected layer: out = X %*% W + b (row vector b broadcast per row)
+affine_forward = function(matrix[double] X, matrix[double] W, matrix[double] b) return (matrix[double] out) {
+  out = X %*% W + b;
+}
+# He-style initialization of an affine layer with D inputs and M outputs
+affine_init = function(int D, int M, int seed = -1 ) return (matrix[double] W, matrix[double] b) {
+  W = sqrt(2.0/D) * rand(rows=D, cols=M, pdf="normal", seed=seed);
+  b = matrix(0, rows=1, cols=M);
+}
+relu_forward = function(matrix[double] X) return (matrix[double] out) {
+  out = max(0, X);  # elementwise rectified linear unit
+}
+
+max_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+  int strideh, int stridew, int padh, int padw) return(matrix[double] out, int Hout, int Wout)
+{
+  # 2D max pooling via the built-in max_pool; standard output-size formula
+  N = nrow(X)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf], stride=[strideh,stridew], padding=[padh,padw])
+}
+
+avg_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win)
+  return (matrix[double] out, int Hout, int Wout) {
+  # Global average pooling: the pool window spans the full Hin x Win plane
+  Hout = 1
+  Wout = 1
+  out = avg_pool(X, input_shape=[nrow(X),C,Hin,Win], pool_size=[Hin,Win], stride=[1,1], padding=[0, 0])
+}
+
+softmax_forward = function(matrix[double] scores) return (matrix[double] probs) {
+  scores = scores - rowMaxs(scores);  # shift by row max for numerical stability
+  unnorm_probs = exp(scores);  # elementwise exponentials (unnormalized probabilities)
+  probs = unnorm_probs / rowSums(unnorm_probs);  # normalize each row to sum to 1
+}
+
+basic_block = function(matrix[double] X, int C, int C_base, int Hin, int Win, int strideh,
+    int stridew, matrix[double] WC1, matrix[double] bC1, matrix[double] WC2, matrix[double] bC2)
+  return (matrix[double] out, int Hout, int Wout)
+{
+  mu_bn = 0.1;
+  ep_bn = 1e-05;
+  downsample = strideh > 1 | stridew > 1 | C != C_base;
+  if (downsample) {
+    [WC3, bC3] = conv2d_init(C_base, C, Hf=1, Wf=1, 42);
+  }
+  # Residual path: conv3x3(stride) -> bn -> relu -> conv3x3 -> bn
+  # conv1 -> bn1 -> relu1
+  [out, Hout, Wout] = conv2d_forward(X,WC1,bC1,C,Hin,Win,3,3,strideh,stridew,1,1);
+  out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);
+  out = relu_forward(out);
+  # conv2 -> bn2 (relu is applied after the residual addition below)
+  [out, Hout, Wout] = conv2d_forward(out,WC2,bC2,C_base,Hout,Wout,3,3,1,1,1,1);
+  out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);
+  # Identity Path
+  identity = X;
+  if (downsample) {
+    # Downsample identity with 1x1 conv + bn (fix: keep bn result in identity, not out)
+    [identity, Hout, Wout] = conv2d_forward(X,WC3,bC3,C,Hin,Win,1,1,strideh,stridew,0,0);
+    identity = bn2d_forward(identity,C_base,Hout,Wout,mu_bn,ep_bn);
+  }
+  out = relu_forward(out + identity);
+}
+
+getWeights = function(int fel, int lid,
+    matrix[double] W_pt, matrix[double] b_pt,
+    matrix[double] W_init, matrix[double] b_init)
+  return (matrix[double] Wl, matrix[double] bl)
+{
+  if (lid < fel) { #layer lies below the extraction cut-off: reuse pretrained weights
+    Wl = W_pt;
+    bl = b_pt;
+  }
+  else {  #layer at/above the cut-off: use freshly initialized weights
+    Wl = W_init;
+    bl = b_init;
+  }
+}
+
+# Rowwise argmax: per row, the largest column index attaining the row maximum
+# (ties resolve to the highest index). oneVec: 1xK ones; idxSeq: rows x K of 1..K.
+rwRowIndexMax = function(matrix[double] X, matrix[double] oneVec, matrix[double] idxSeq)
+    return (matrix[double] index) {
+  index = rowMaxs((X == (rowMaxs(X) %*% oneVec)) * idxSeq);
+}
+
+resnet18_forward = function(matrix[double] X, int C, int Hin, int Win, int K)
+  return (matrix[double] Y_pred)
+{
+  mu_bn = 0.1;
+  ep_bn = 1e-05;
+
+  # Stand-ins for the transferred (pretrained) layers. FIXME: use pretrained weights
+  [W1_pt, b1_pt] = conv2d_init(64, C, Hf=7, Wf=7, 42);
+  [W2_pt, b2_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W3_pt, b3_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W4_pt, b4_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W5_pt, b5_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W6_pt, b6_pt] = conv2d_init(128, 64, Hf=3, Wf=3, 42);
+  [W7_pt, b7_pt] = conv2d_init(128, 128, Hf=3, Wf=3, 42);
+  [W8_pt, b8_pt] = conv2d_init(128, 128, Hf=3, Wf=3, 42);
+  [W9_pt, b9_pt] = conv2d_init(128, 128, Hf=3, Wf=3, 42);
+  [W10_pt, b10_pt] = conv2d_init(256, 128, Hf=3, Wf=3, 42);
+  [W11_pt, b11_pt] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W12_pt, b12_pt] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W13_pt, b13_pt] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W14_pt, b14_pt] = conv2d_init(512, 256, Hf=3, Wf=3, 42);
+  [W15_pt, b15_pt] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W16_pt, b16_pt] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W17_pt, b17_pt] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W18_pt, b18_pt] = affine_init(512, K, 42);
+  W18_pt = W18_pt/sqrt(2);
+
+  # Initialize the weights for the non-transferred layers
+  [W1_init, b1_init] = conv2d_init(64, C, Hf=7, Wf=7, 43);
+  [W2_init, b2_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W3_init, b3_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W4_init, b4_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W5_init, b5_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W6_init, b6_init] = conv2d_init(128, 64, Hf=3, Wf=3, 43);
+  [W7_init, b7_init] = conv2d_init(128, 128, Hf=3, Wf=3, 43);
+  [W8_init, b8_init] = conv2d_init(128, 128, Hf=3, Wf=3, 43);
+  [W9_init, b9_init] = conv2d_init(128, 128, Hf=3, Wf=3, 43);
+  [W10_init, b10_init] = conv2d_init(256, 128, Hf=3, Wf=3, 42); # NOTE(review): seed 42 (not 43) from here on matches *_pt exactly -- confirm intended
+  [W11_init, b11_init] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W12_init, b12_init] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W13_init, b13_init] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W14_init, b14_init] = conv2d_init(512, 256, Hf=3, Wf=3, 42);
+  [W15_init, b15_init] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W16_init, b16_init] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W17_init, b17_init] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W18_init, b18_init] = affine_init(512, K, 42);
+  W18_init = W18_init/sqrt(2);
+
+  # Compute prediction over mini-batches
+  N = nrow(X);
+  Y_pred = matrix(0, rows=N, cols=3); # one prediction column per extraction pass (j = 1..3)
+  batch_size = 64;
+  oneVec = matrix(1, rows=1, cols=K);
+  idxSeq = matrix(1, rows=batch_size, cols=1) %*% t(seq(1, K)); # assumes full batches (N divisible by batch_size) -- TODO confirm
+  iters = ceil (N / batch_size);
+
+  for (i in 1:iters) {
+    # Get next batch
+    beg = ((i-1) * batch_size) %% N + 1;
+    end = min(N, beg+batch_size-1);
+    X_batch = X[beg:end,];
+
+    # Run 3 forward passes, lowering the pretrained/initialized cut-off each pass
+    j = 1;
+    fel = 10; #cut-off: layers with lid < fel use pretrained weights (fel = 10, 9, 8)
+    while (j < 4) {
+      # Compute forward pass
+      # Layer1: conv2d 7x7 -> bn -> relu -> maxpool 3x3
+      lid = 1;
+      [Wl1, bl1] = getWeights(fel, lid, W1_pt, b1_pt, W1_init, b1_init);
+      [outc1, Houtc1, Woutc1] = conv2d_forward(X_batch,Wl1,bl1,C,Hin,Win,7,7,2,2,3,3);
+      outb1 = bn2d_forward(outc1,64,Houtc1,Woutc1,mu_bn,ep_bn);
+      outr1 = relu_forward(outb1);
+      [outp1, Houtp1, Woutp1] = max_pool2d_forward(outr1,64,Houtc1, Woutc1,3,3,2,2,1,1);
+
+      # Layer2: residual block1
+      lid = 2;
+      [Wc1, bc1] = getWeights(fel, lid, W2_pt, b2_pt, W2_init, b2_init);
+      [Wc2, bc2] = getWeights(fel, lid, W3_pt, b3_pt, W3_init, b3_init);
+      [outrb1, Houtrb1, Woutrb1] = basic_block(outp1,64,64,Houtp1,Woutp1,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb1)+" "+ncol(outrb1));
+
+      # Layer3: residual block2
+      lid = 3;
+      [Wc1, bc1] = getWeights(fel, lid, W4_pt, b4_pt, W4_init, b4_init);
+      [Wc2, bc2] = getWeights(fel, lid, W5_pt, b5_pt, W5_init, b5_init);
+      [outrb2, Houtrb2, Woutrb2] = basic_block(outrb1,64,64,Houtrb1,Woutrb1,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb2)+" "+ncol(outrb2));
+
+      # Layer4: residual block3
+      lid = 4;
+      [Wc1, bc1] = getWeights(fel, lid, W6_pt, b6_pt, W6_init, b6_init);
+      [Wc2, bc2] = getWeights(fel, lid, W7_pt, b7_pt, W7_init, b7_init);
+      [outrb3, Houtrb3, Woutrb3] = basic_block(outrb2,64,128,Houtrb2,Woutrb2,2,2,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb3)+" "+ncol(outrb3));
+
+      # Layer5: residual block4
+      lid = 5;
+      [Wc1, bc1] = getWeights(fel, lid, W8_pt, b8_pt, W8_init, b8_init);
+      [Wc2, bc2] = getWeights(fel, lid, W9_pt, b9_pt, W9_init, b9_init);
+      [outrb4, Houtrb4, Woutrb4] = basic_block(outrb3,128,128,Houtrb3,Woutrb3,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb4)+" "+ncol(outrb4));
+
+      # Layer6: residual block5
+      lid = 6;
+      [Wc1, bc1] = getWeights(fel, lid, W10_pt, b10_pt, W10_init, b10_init);
+      [Wc2, bc2] = getWeights(fel, lid, W11_pt, b11_pt, W11_init, b11_init);
+      [outrb5, Houtrb5, Woutrb5] = basic_block(outrb4,128,256,Houtrb4,Woutrb4,2,2,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb5)+" "+ncol(outrb5));
+
+      # Layer7: residual block6
+      lid = 7;
+      [Wc1, bc1] = getWeights(fel, lid, W12_pt, b12_pt, W12_init, b12_init);
+      [Wc2, bc2] = getWeights(fel, lid, W13_pt, b13_pt, W13_init, b13_init);
+      [outrb6, Houtrb6, Woutrb6] = basic_block(outrb5,256,256,Houtrb5,Woutrb5,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb6)+" "+ncol(outrb6));
+
+      # Layer8: residual block7
+      lid = 8;
+      [Wc1, bc1] = getWeights(fel, lid, W14_pt, b14_pt, W14_init, b14_init);
+      [Wc2, bc2] = getWeights(fel, lid, W15_pt, b15_pt, W15_init, b15_init);
+      [outrb7, Houtrb7, Woutrb7] = basic_block(outrb6,256,512,Houtrb6,Woutrb6,2,2,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb7)+" "+ncol(outrb7));
+
+      # Layer9: residual block8
+      lid = 9;
+      [Wc1, bc1] = getWeights(fel, lid, W16_pt, b16_pt, W16_init, b16_init);
+      [Wc2, bc2] = getWeights(fel, lid, W17_pt, b17_pt, W17_init, b17_init);
+      [outrb8, Houtrb8, Woutrb8] = basic_block(outrb7,512,512,Houtrb7,Woutrb7,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb8)+" "+ncol(outrb8));
+
+      # Global average pooling
+      [outap1, Houtap1, Houtap2] = avg_pool2d_forward(outrb8, 512, Houtrb8, Woutrb8);
+
+      # layer10 : Fully connected layer
+      lid = 10;
+      [Wl10, bl10] = getWeights(fel, lid, W18_pt, b18_pt, W18_init, b18_init);
+      outa1 = affine_forward(outap1, Wl10, bl10);
+      probs_batch = softmax_forward(outa1);
+
+      # Store the predictions (rowwise argmax of the class probabilities)
+      Y_pred[beg:end,j] = rwRowIndexMax(probs_batch, oneVec, idxSeq);
+      j = j + 1;
+      fel = fel - 1;
+    }
+  }
+
+}
+
+generate_dummy_data = function(int N, int C, int Hin, int Win, int K)
+  return (matrix[double] X, matrix[double] Y) {
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal", seed=45) #linearized images, one row per image
+  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform", seed=46)) #random class labels in 1..K
+  Y = table(seq(1, N), classes, N, K)  #one-hot encoding of the labels
+}
+
+# Read training data and settings
+N = 64;    #num of images in the target dataset
+C = 3;       #num of color channels
+Hin = 224;   #input image height
+Win = 224;   #input image width
+K = 10;      #num of classes
+
+# Generate dummy data
+[X, Y] = generate_dummy_data(N, C, Hin, Win, K);
+
+# Load the CuDNN libraries by calling a conv2d
+print("Eagerly loading cuDNN library");
+[W1, b1] = conv2d_init(96, C, Hf=11, Wf=11, 42);
+[outc1, Houtc1, Woutc1] = conv2d_forward(X[1:8,], W1, b1, C, Hin, Win, 11, 11, 1, 1, 2, 2);
+print(sum(outc1));
+
+print("Starting exploratory feature transfers");
+t1 = time();
+Y_pred = resnet18_forward(X, C, Hin, Win, K);
+R = colSums(Y_pred)
+print(R);
+
+t2 = time();
+print("Elapsed time for feature transfers = "+floor((t2-t1)/1000000)+" millisec");
+
+write(R, $1)