/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.matrix.data;
import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
import jcuda.Pointer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.instructions.gpu.context.CSRPointer;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.utils.Statistics;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasOperation;
import jcuda.runtime.JCuda;
public class LibMatrixCuMatMult extends LibMatrixCUDA {
private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName());
private static class CuMatMultParameters {
/*
* For the operation C = op(A) %*% op(B), the below parameters are used
* to invoke the corresponding kernels in cuBLAS and cuSPARSE.
*
* All of the below values have to be valid; otherwise this class throws
* an exception. No special values such as -1 for unknowns are allowed.
*/
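/*
* Worked example (an illustrative sketch, not from the original source):
* for C = t(A) %*% B with A of shape 100x10 and B of shape 100x20,
* setDimensions() yields m = 10, n = 20, k = 100, lda = 100
* (op(A) = t(A), so lda = k), ldb = 100 and ldc = 10.
*/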
public int m; // number of rows of matrix op(A) and C.
public int n; // number of columns of matrix op(B) and C.
public int k; // number of columns of op(A) and rows of op(B).
public int lda; // leading dimension of two-dimensional array used to
// store the matrix A.
public int ldb; // leading dimension of two-dimensional array used to
// store matrix B.
public int ldc; // leading dimension of a two-dimensional array used to
// store the matrix C.
public long leftNumRows; // number of rows of A
public long leftNumCols; // number of cols of A
public long rightNumRows; // number of rows of B
public long rightNumCols; // number of cols of B
private boolean isLeftTransposed; // is op(A) = t(A)
private boolean isRightTransposed; // is op(B) = t(B)
public CuMatMultParameters(long leftNumRows1, long leftNumCols1, long rightNumRows1, long rightNumCols1,
boolean isLeftTransposed1, boolean isRightTransposed1) {
leftNumRows = leftNumRows1;
leftNumCols = leftNumCols1;
rightNumRows = rightNumRows1;
rightNumCols = rightNumCols1;
isLeftTransposed = isLeftTransposed1;
isRightTransposed = isRightTransposed1;
setDimensions();
}
public void rowToColumnMajor() {
// To compensate for the input matrices being in row-major format
// instead of column-major (the way cublas expects)
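// A row-major m-by-n buffer is bit-identical to the column-major
// buffer of its n-by-m transpose. Hence C = op(A) %*% op(B) over
// row-major data can be computed as t(C) = t(op(B)) %*% t(op(A)) in
// column-major order, which the swaps below set up (callers such as
// denseDenseMatMult additionally swap the A and B pointers).
// Illustrative example: for A 2x3 and B 3x4 with no transposes, this
// yields m = 4, n = 2, k = 3, so cuBLAS produces the 4x2 column-major
// t(C), whose buffer is exactly the 2x4 row-major C.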
isRightTransposed = swap(isLeftTransposed, isLeftTransposed = isRightTransposed);
rightNumCols = swap(leftNumRows, leftNumRows = rightNumCols);
rightNumRows = swap(leftNumCols, leftNumCols = rightNumRows);
setDimensions();
}
private void validate() {
int k1 = toInt(isRightTransposed ? rightNumCols : rightNumRows);
if (k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1 + " [" + leftNumRows + ","
+ leftNumCols + "," + rightNumRows + "," + rightNumCols + "], " + isLeftTransposed + " "
+ isRightTransposed);
}
private void setDimensions() {
// Compute the dimensions of op(A), op(B) and C, and validate them
m = toInt(isLeftTransposed ? leftNumCols : leftNumRows);
n = toInt(isRightTransposed ? rightNumRows : rightNumCols);
k = toInt(isLeftTransposed ? leftNumRows : leftNumCols);
lda = isLeftTransposed ? k : m;
ldb = isRightTransposed ? n : k;
ldc = m;
if (m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
}
}
/**
* Matrix multiply on GPU. Examines sparsity and shapes and routes the
* call to the appropriate cuBLAS or cuSPARSE method: C = op(A) %*% op(B)
*
* The user is expected to call
* ec.releaseMatrixOutputForGPUInstruction(outputName);
*
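* A minimal usage sketch (the variable names here are assumptions for
* illustration, not from this file):
*
* <pre>{@code
* // compute C = t(A) %*% B on the GPU, then release the output
* MatrixObject C = LibMatrixCuMatMult.matmult(ec, gCtx, instName, A, B, "C", true, false);
* ec.releaseMatrixOutputForGPUInstruction("C");
* }</pre>
*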
* @param ec
* Current {@link ExecutionContext} instance
* @param gCtx
* a valid {@link GPUContext}
* @param instName
* name of the invoking instruction to record {@link Statistics}.
* @param left
* Matrix A
* @param right
* Matrix B
* @param outputName
* Name of the output matrix C (in code generated after the
* LOP layer)
* @param isLeftTransposed
* op for A, transposed or not
* @param isRightTransposed
* op for B, transposed or not
* @return output of matrix multiply
*/
public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left,
MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed) {
boolean isM1Sparse = isInSparseFormat(gCtx, left);
boolean isM2Sparse = isInSparseFormat(gCtx, right);
MatrixObject output = ec.getMatrixObject(outputName);
long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
CuMatMultParameters params = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(),
right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed);
if (isM1Sparse && isM2Sparse) {
// -------------------------------------------------------------------------------------
// sparse-sparse matrix multiplication
params.validate();
int transa = cusparseOp(isLeftTransposed);
int transb = cusparseOp(isRightTransposed);
// Step 1: Allocate output => sparse format
ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
// Step 2: Get the handles to sparse/dense pointers for left, right
// and output
CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb,
params.m, params.n, params.k);
// Step 3: Invoke the kernel
cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
C.val, C.rowPtr, C.colInd);
output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
// -------------------------------------------------------------------------------------
} else if (!isM1Sparse && isM2Sparse) {
// -------------------------------------------------------------------------------------
// dense-sparse matrix multiplication
// Step 1: Allocate output => dense format
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
// Step 2: Get the handles to sparse/dense pointers for left, right
// and output
Pointer A = getDensePointer(gCtx, left, instName);
CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
Pointer C = getDensePointer(gCtx, output, instName);
// Step 3: Invoke the kernel
denseSparseMatMult(getCusparseHandle(gCtx), instName, C, A, B, params);
// -------------------------------------------------------------------------------------
} else if (isM1Sparse && !isM2Sparse) {
// -------------------------------------------------------------------------------------
// sparse-dense matrix multiplication
// Step 1: Allocate output => dense format
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
// Step 2: Get the handles to sparse/dense pointers for left, right
// and output
CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
Pointer B = getDensePointer(gCtx, right, instName);
Pointer C = getDensePointer(gCtx, output, instName);
// Step 3: Invoke the kernel
sparseDenseMatMult(gCtx, instName, C, A, B, left.getNumRows(), left.getNumColumns(), right.getNumRows(),
right.getNumColumns(), outRLen, outCLen, isLeftTransposed, isRightTransposed);
// -------------------------------------------------------------------------------------
} else {
// -------------------------------------------------------------------------------------
// dense-dense matrix multiplication
// Step 1: Allocate output => dense format
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
// Step 2: Get the handles to sparse/dense pointers for left, right
// and output
Pointer A = getDensePointer(gCtx, left, instName);
Pointer B = getDensePointer(gCtx, right, instName);
Pointer C = getDensePointer(gCtx, output, instName);
// Step 3: Invoke the kernel
denseDenseMatMult(getCublasHandle(gCtx), instName, C, A, B, params);
// -------------------------------------------------------------------------------------
}
return output;
}
/**
* Internal method to invoke the appropriate cuSPARSE kernel for the
* matrix multiplication C = op(A) %*% op(B). This assumes B and C are
* allocated in dense row-major format and A is sparse.
*
* Other than the input and output, this method requires additional
* temporary memory of outRLen * outCLen * sizeOfDataType bytes (except
* when the output is a vector).
*
* @param gCtx
* a valid {@link GPUContext}
* @param instName
* name of the invoking instruction to record {@link Statistics}.
* @param C
* output matrix pointer
* @param A
* left matrix pointer
* @param B
* right matrix pointer
* @param leftNumRows
* number of rows of A
* @param leftNumColumns
* number of cols of A
* @param rightNumRows
* number of rows of B
* @param rightNumColumns
* number of cols of B
* @param outRLen
* number of rows of C
* @param outCLen
* number of cols of C
* @param isLeftTransposed
* is op(A) = t(A)
* @param isRightTransposed
* is op(B) = t(B)
*/
static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
boolean isLeftTransposed, boolean isRightTransposed) {
// t(C) = t(B) %*% t(A)
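// Computing the transposed product lets this method reuse the
// dense-times-sparse routine below (dense operand on the left, sparse
// on the right); for non-vector outputs the intermediate result is
// then transposed back into C via cublasgeam.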
Pointer output = null;
if (outRLen != 1 && outCLen != 1) {
output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType);
} else {
// no transpose required for vector output
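// (a 1-by-n or n-by-1 result has the same memory layout in row- and
// column-major order, so the result can be written into C directly)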
output = C;
}
CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
leftNumColumns, !isRightTransposed, !isLeftTransposed);
denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params);
if (outRLen != 1 && outCLen != 1) {
// Transpose: C = t(output)
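// cublasgeam computes C = alpha * op(A) + beta * op(B); with beta = 0
// and a dummy B pointer, this is an out-of-place transpose of the
// intermediate result into C.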
cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
toInt(outRLen), C, toInt(outCLen));
if (!DMLScript.EAGER_CUDA_FREE)
JCuda.cudaDeviceSynchronize();
gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE);
}
}
/**
* Internal method to invoke the appropriate cuSPARSE kernel for the
* matrix multiplication C = op(A) %*% op(B). This assumes A and C are
* allocated in dense row-major format and B is sparse.
*
* @param handle
* cusparse handle
* @param instName
* name of the invoking instruction to record {@link Statistics}.
* @param C
* output matrix pointer
* @param A
* left matrix pointer
* @param B
* right matrix pointer
* @param param
* BLAS parameters
*/
private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B,
CuMatMultParameters param) {
// Note: sparse-vector times dense-matrix multiplication and dot
// products are not handled as special cases here
boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed)
|| (param.leftNumCols == 1 && param.isLeftTransposed);
if (isVector) {
LOG.debug(" GPU Sparse-Dense Matrix Vector ");
int m = toInt(param.rightNumRows);
int n = toInt(param.rightNumCols);
int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
zero(), C);
} else {
int m = toInt(param.rightNumRows);
int k = toInt(param.rightNumCols);
param.rowToColumnMajor();
param.validate();
int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
int transb = cusparseOp(param.isRightTransposed);
LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
}
}
/**
* Internal method to invoke the appropriate cuBLAS kernel for the
* matrix multiplication C = op(A) %*% op(B). This assumes A, B and C
* are allocated in dense format. The row-major to column-major
* conversion (params.rowToColumnMajor()) is performed inside this
* method.
*
* @param handle
* cublas handle
* @param instName
* name of the invoking instruction to record {@link Statistics}.
* @param C
* output matrix pointer
* @param A
* left matrix pointer
* @param B
* right matrix pointer
* @param param
* BLAS parameters
*/
private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B,
CuMatMultParameters param) {
param.rowToColumnMajor();
param.validate();
int transa = cublasOp(param.isLeftTransposed);
int transb = cublasOp(param.isRightTransposed);
B = swap(A, A = B);
if (param.m == 1 && param.n == 1) {
// Vector product
LOG.debug(" GPU Dense-dense Vector Product");
double[] result = { 0 };
cudaSupportFunctions.cublasdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
// By default in cuBLAS v2, the pointer mode is set to
// CUBLAS_POINTER_MODE_HOST, which means that scalar values passed are
// on the host (as opposed to on the device). The result is copied
// from the host back to the device so that the rest of the
// infrastructure can treat it uniformly.
cudaMemcpy(C, Pointer.to(result), 1 * sizeOfDataType, cudaMemcpyHostToDevice);
} else if (param.m == 1) {
// Vector-matrix multiply
LOG.debug(" GPU Dense Vector-Matrix Multiply");
transb = reverseCublasOp(transb);
int rightNumRows = (transb == cublasOperation.CUBLAS_OP_T) ? param.k : param.n;
int rightNumCols = (transb == cublasOperation.CUBLAS_OP_T) ? param.n : param.k;
cudaSupportFunctions.cublasgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
} else if (param.n == 1) {
// Matrix-vector multiply
LOG.debug(" GPU Dense Matrix-Vector Multiply");
int leftNumRows = (transa == cublasOperation.CUBLAS_OP_N) ? param.m : param.k;
int leftNumCols = (transa == cublasOperation.CUBLAS_OP_N) ? param.k : param.m;
cudaSupportFunctions.cublasgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
} else {
LOG.debug(" GPU Dense-Dense Matrix Multiply ");
cudaSupportFunctions.cublasgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
zero(), C, param.ldc);
}
}
// Convenience methods to swap two values
// Usage: y = swap(x, x=y);
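// This works because Java evaluates arguments left to right: the old
// value of x is read first, then the assignment x = y executes, and
// the method returns the old x for the caller to assign to y.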
private static long swap(long x, long y) {
return x;
}
private static boolean swap(boolean x, boolean y) {
return x;
}
private static Pointer swap(Pointer x, Pointer y) {
return x;
}
/**
* Convenience wrapper to return the appropriate cuSPARSE trans value
*
* @param isTransposed
* is op(input) = t(input)
* @return CUSPARSE_OPERATION_TRANSPOSE or CUSPARSE_OPERATION_NON_TRANSPOSE
*/
private static int cusparseOp(boolean isTransposed) {
return isTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
}
/**
* Convenience wrapper to return the appropriate cuBLAS trans value
*
* @param isTransposed
* is op(input) = t(input)
* @return CUBLAS_OP_T or CUBLAS_OP_N
*/
private static int cublasOp(boolean isTransposed) {
return isTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
}
/**
* Flips the cuBLAS trans value
*
* @param trans
* can be CUBLAS_OP_T or CUBLAS_OP_N
* @return CUBLAS_OP_N if trans is CUBLAS_OP_T else CUBLAS_OP_T
*/
private static int reverseCublasOp(int trans) {
return trans == cublasOperation.CUBLAS_OP_T ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
}
/**
* Flips the cuSPARSE trans value
*
* @param trans
* can be CUSPARSE_OPERATION_NON_TRANSPOSE or
* CUSPARSE_OPERATION_TRANSPOSE
* @return CUSPARSE_OPERATION_NON_TRANSPOSE if trans is
* CUSPARSE_OPERATION_TRANSPOSE else CUSPARSE_OPERATION_TRANSPOSE
*/
private static int reverseCusparseOp(int trans) {
return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
}
}