| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.sysds.runtime.matrix.data; |
| |
| import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE; |
| import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE; |
| import static jcuda.runtime.JCuda.cudaMemcpy; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; |
| import jcuda.Pointer; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.sysds.api.DMLScript; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.controlprogram.caching.MatrixObject; |
| import org.apache.sysds.runtime.controlprogram.context.ExecutionContext; |
| import org.apache.sysds.runtime.instructions.gpu.context.CSRPointer; |
| import org.apache.sysds.runtime.instructions.gpu.context.GPUContext; |
| import org.apache.sysds.utils.Statistics; |
| |
| import jcuda.jcusparse.cusparseHandle; |
| import jcuda.jcublas.cublasHandle; |
| import jcuda.jcublas.cublasOperation; |
| import jcuda.runtime.JCuda; |
| |
| public class LibMatrixCuMatMult extends LibMatrixCUDA { |
| |
| private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName()); |
| |
| private static class CuMatMultParameters { |
| /* |
| * For the operation, C = op(A) %*% op(B), the below parameters are used |
| * to invoke the corresponding kernels in CuBLAS and CuSPARSE. |
| * |
| * All the below values have to be valid or else this class has to throw |
| * an exception. No special values like -1 for unknowns allowed. |
| */ |
| public int m; // number of rows of matrix op(A) and C. |
| public int n; // number of columns of matrix op(B) and C. |
| public int k; // number of columns of op(A) and rows of op(B). |
| public int lda; // leading dimension of two-dimensional array used to |
| // store the matrix A. |
| public int ldb; // leading dimension of two-dimensional array used to |
| // store matrix B. |
| public int ldc; // leading dimension of a two-dimensional array used to |
| // store the matrix C. |
| public long leftNumRows; // number of rows of A |
| public long leftNumCols; // number of cols of A |
| public long rightNumRows; // number of rows of B |
| public long rightNumCols; // number of cols of B |
| private boolean isLeftTransposed; // is op(A) = t(A) |
| private boolean isRightTransposed; // is op(B) = t(B) |
| |
| public CuMatMultParameters(long leftNumRows1, long leftNumCols1, long rightNumRows1, long rightNumCols1, |
| boolean isLeftTransposed1, boolean isRightTransposed1) { |
| leftNumRows = leftNumRows1; |
| leftNumCols = leftNumCols1; |
| rightNumRows = rightNumRows1; |
| rightNumCols = rightNumCols1; |
| isLeftTransposed = isLeftTransposed1; |
| isRightTransposed = isRightTransposed1; |
| setDimensions(); |
| } |
| |
| public void rowToColumnMajor() { |
| // To compensate for the input matrices being in row-major format |
| // instead of column-major (the way cublas expects) |
| isRightTransposed = swap(isLeftTransposed, isLeftTransposed = isRightTransposed); |
| rightNumCols = swap(leftNumRows, leftNumRows = rightNumCols); |
| rightNumRows = swap(leftNumCols, leftNumCols = rightNumRows); |
| setDimensions(); |
| } |
| |
| private void validate() { |
| int k1 = toInt(isRightTransposed ? rightNumCols : rightNumRows); |
| if (k != k1) |
| throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1 + " [" + leftNumRows + "," |
| + leftNumCols + "," + rightNumRows + "," + rightNumCols + "], " + isLeftTransposed + " " |
| + isRightTransposed); |
| } |
| |
| private void setDimensions() { |
| // Validate the dimensions |
| m = toInt(isLeftTransposed ? leftNumCols : leftNumRows); |
| n = toInt(isRightTransposed ? rightNumRows : rightNumCols); |
| k = toInt(isLeftTransposed ? leftNumRows : leftNumCols); |
| lda = isLeftTransposed ? k : m; |
| ldb = isRightTransposed ? n : k; |
| ldc = m; |
| if (m == -1 || n == -1 || k == -1) |
| throw new DMLRuntimeException("Incorrect dimensions"); |
| } |
| } |
| |
| /** |
| * Matrix multiply on GPU Examines sparsity and shapes and routes call to |
| * appropriate method from cuBLAS or cuSparse C = op(A) x op(B) |
| * |
| * The user is expected to call |
| * ec.releaseMatrixOutputForGPUInstruction(outputName); |
| * |
| * @param ec |
| * Current {@link ExecutionContext} instance |
| * @param gCtx |
| * a valid {@link GPUContext} |
| * @param instName |
| * name of the invoking instruction to record{@link Statistics}. |
| * @param left |
| * Matrix A |
| * @param right |
| * Matrix B |
| * @param outputName |
| * Name of the output matrix C (in code generated after LOP |
| * layer) |
| * @param isLeftTransposed |
| * op for A, transposed or not |
| * @param isRightTransposed |
| * op for B, tranposed or not |
| * @return output of matrix multiply |
| */ |
| public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, |
| MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed) { |
| boolean isM1Sparse = isInSparseFormat(gCtx, left); |
| boolean isM2Sparse = isInSparseFormat(gCtx, right); |
| MatrixObject output = ec.getMatrixObject(outputName); |
| long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows(); |
| long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns(); |
| |
| CuMatMultParameters params = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(), |
| right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed); |
| |
| if (isM1Sparse && isM2Sparse) { |
| // ------------------------------------------------------------------------------------- |
| // sparse-sparse matrix multiplication |
| params.validate(); |
| int transa = cusparseOp(isLeftTransposed); |
| int transb = cusparseOp(isRightTransposed); |
| |
| // Step 1: Allocate output => sparse format |
| ec.allocateGPUMatrixObject(outputName, outRLen, outCLen); |
| |
| // Step 2: Get the handles to sparse/dense pointers for left, right |
| // and output |
| CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); |
| CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); |
| CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb, |
| params.m, params.n, params.k); |
| |
| // Step 3: Invoke the kernel |
| cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr, |
| (int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, |
| C.val, C.rowPtr, C.colInd); |
| output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); |
| // ------------------------------------------------------------------------------------- |
| } else if (!isM1Sparse && isM2Sparse) { |
| // ------------------------------------------------------------------------------------- |
| // dense-sparse matrix multiplication |
| // Step 1: Allocate output => dense format |
| getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); |
| |
| // Step 2: Get the handles to sparse/dense pointers for left, right |
| // and output |
| Pointer A = getDensePointer(gCtx, left, instName); |
| CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); |
| Pointer C = getDensePointer(gCtx, output, instName); |
| |
| // Step 3: Invoke the kernel |
| denseSparseMatMult(getCusparseHandle(gCtx), instName, C, A, B, params); |
| // ------------------------------------------------------------------------------------- |
| } else if (isM1Sparse && !isM2Sparse) { |
| // ------------------------------------------------------------------------------------- |
| // sparse-dense matrix multiplication |
| // Step 1: Allocate output => dense format |
| getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); |
| |
| // Step 2: Get the handles to sparse/dense pointers for left, right |
| // and output |
| CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); |
| Pointer B = getDensePointer(gCtx, right, instName); |
| Pointer C = getDensePointer(gCtx, output, instName); |
| |
| // Step 3: Invoke the kernel |
| sparseDenseMatMult(gCtx, instName, C, A, B, left.getNumRows(), left.getNumColumns(), right.getNumRows(), |
| right.getNumColumns(), outRLen, outCLen, isLeftTransposed, isRightTransposed); |
| // ------------------------------------------------------------------------------------- |
| } else { |
| // ------------------------------------------------------------------------------------- |
| // dense-dense matrix multiplication |
| // Step 1: Allocate output => dense format |
| getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); |
| |
| // Step 2: Get the handles to sparse/dense pointers for left, right |
| // and output |
| Pointer A = getDensePointer(gCtx, left, instName); |
| Pointer B = getDensePointer(gCtx, right, instName); |
| Pointer C = getDensePointer(gCtx, output, instName); |
| |
| // Step 3: Invoke the kernel |
| denseDenseMatMult(getCublasHandle(gCtx), instName, C, A, B, params); |
| // ------------------------------------------------------------------------------------- |
| } |
| return output; |
| } |
| |
| /** |
| * Internal method to invoke the appropriate CuSPARSE kernel for matrix |
| * multiplication for operation: C = op(A) * op(B) This assumes B and C are |
| * allocated in dense row-major format and A is sparse. |
| * |
| * Other than input and output, this method requires additional memory = |
| * outRLen * outCLen * sizeOfDataType |
| * |
| * @param gCtx |
| * a valid {@link GPUContext} |
| * @param instName |
| * name of the invoking instruction to record{@link Statistics}. |
| * @param C |
| * output matrix pointer |
| * @param A |
| * left matrix pointer |
| * @param B |
| * right matrix pointer |
| * @param leftNumRows |
| * number of rows of A |
| * @param leftNumColumns |
| * number of cols of A |
| * @param rightNumRows |
| * number of rows of B |
| * @param rightNumColumns |
| * number of cols of B |
| * @param outRLen |
| * number of rows of C |
| * @param outCLen |
| * number of cols of C |
| * @param isLeftTransposed |
| * is op(A) = t(A) |
| * @param isRightTransposed |
| * is op(B) = t(B) |
| */ |
| static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B, |
| long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen, |
| boolean isLeftTransposed, boolean isRightTransposed) { |
| // t(C) = t(B) %*% t(A) |
| Pointer output = null; |
| if (outRLen != 1 && outCLen != 1) { |
| output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType); |
| } else { |
| // no transpose required for vector output |
| output = C; |
| } |
| CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows, |
| leftNumColumns, !isRightTransposed, !isLeftTransposed); |
| denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params); |
| if (outRLen != 1 && outCLen != 1) { |
| // Transpose: C = t(output) |
| cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T, |
| toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(), |
| toInt(outRLen), C, toInt(outCLen)); |
| if (!DMLScript.EAGER_CUDA_FREE) |
| JCuda.cudaDeviceSynchronize(); |
| gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE); |
| } |
| } |
| |
| /** |
| * Internal method to invoke the appropriate CuSPARSE kernel for matrix |
| * multiplication for operation: C = op(A) * op(B) This assumes B and C are |
| * allocated in dense row-major format and A is sparse. |
| * |
| * @param handle |
| * cusparse handle |
| * @param instName |
| * name of the invoking instruction to record{@link Statistics}. |
| * @param C |
| * output matrix pointer |
| * @param A |
| * left matrix pointer |
| * @param B |
| * right matrix pointer |
| * @param param |
| * BLAS parameters |
| */ |
| private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B, |
| CuMatMultParameters param) { |
| // Ignoring sparse vector dense matrix multiplication and dot product |
| boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed) |
| || (param.leftNumCols == 1 && param.isLeftTransposed); |
| if (isVector) { |
| LOG.debug(" GPU Sparse-Dense Matrix Vector "); |
| int m = toInt(param.rightNumRows); |
| int n = toInt(param.rightNumCols); |
| int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed)); |
| cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A, |
| zero(), C); |
| } else { |
| int m = toInt(param.rightNumRows); |
| int k = toInt(param.rightNumCols); |
| param.rowToColumnMajor(); |
| param.validate(); |
| int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed)); |
| int transb = cusparseOp(param.isRightTransposed); |
| LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) "); |
| cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val, |
| B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc); |
| } |
| } |
| |
| /** |
| * Internal method to invoke the appropriate CuBLAS kernel for matrix |
| * multiplication for operation: C = op(A) * op(B) This assumes A, B and C |
| * are allocated in dense format. The caller is expected to invoke |
| * params.rowToColumnMajor(). |
| * |
| * @param handle |
| * cublas handle |
| * @param instName |
| * name of the invoking instruction to record{@link Statistics}. |
| * @param C |
| * output matrix pointer |
| * @param A |
| * left matrix pointer |
| * @param B |
| * right matrix pointer |
| * @param param |
| * BLAS parameters |
| */ |
| private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B, |
| CuMatMultParameters param) { |
| param.rowToColumnMajor(); |
| param.validate(); |
| int transa = cublasOp(param.isLeftTransposed); |
| int transb = cublasOp(param.isRightTransposed); |
| B = swap(A, A = B); |
| if (param.m == 1 && param.n == 1) { |
| // Vector product |
| LOG.debug(" GPU Dense-dense Vector Product"); |
| double[] result = { 0 }; |
| cudaSupportFunctions.cublasdot(handle, param.k, A, 1, B, 1, Pointer.to(result)); |
| // By default in CuBlas V2, cublas pointer mode is set to |
| // CUBLAS_POINTER_MODE_HOST. |
| // This means that scalar values passed are on host (as opposed to |
| // on device). |
| // The result is copied from the host back to the device so that the |
| // rest of |
| // infrastructure can treat it uniformly. |
| cudaMemcpy(C, Pointer.to(result), 1 * sizeOfDataType, cudaMemcpyHostToDevice); |
| } else if (param.m == 1) { |
| // Vector-matrix multiply |
| LOG.debug(" GPU Dense Vector-Matrix Multiply"); |
| transb = reverseCublasOp(transb); |
| int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n; |
| int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k; |
| cudaSupportFunctions.cublasgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1); |
| } else if (param.n == 1) { |
| // Matrix-vector multiply |
| LOG.debug(" GPU Dense Matrix-Vector Multiply"); |
| int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k; |
| int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m; |
| cudaSupportFunctions.cublasgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1); |
| } else { |
| LOG.debug(" GPU Dense-Dense Matrix Multiply "); |
| cudaSupportFunctions.cublasgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb, |
| zero(), C, param.ldc); |
| } |
| } |
| |
| // Convenient methods to swap two values |
| // Usage: y = swap(x, x=y); |
| private static long swap(long x, long y) { |
| return x; |
| } |
| |
| private static boolean swap(boolean x, boolean y) { |
| return x; |
| } |
| |
| private static Pointer swap(Pointer x, Pointer y) { |
| return x; |
| } |
| |
| /** |
| * Convenient wrapper to return appropriate cuSPARSE trans value |
| * |
| * @param isTransposed |
| * is op(input) = t(input) |
| * @return CUSPARSE_OPERATION_TRANSPOSE or CUSPARSE_OPERATION_NON_TRANSPOSE |
| */ |
| private static int cusparseOp(boolean isTransposed) { |
| return isTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; |
| } |
| |
| /** |
| * Convenient wrapper to return appropriate cuBLAS trans value |
| * |
| * @param isTransposed |
| * is op(input) = t(input) |
| * @return CUBLAS_OP_T or CUBLAS_OP_N |
| */ |
| private static int cublasOp(boolean isTransposed) { |
| return isTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N; |
| } |
| |
| /** |
| * Flips the cuBLAS trans value |
| * |
| * @param trans |
| * can be CUBLAS_OP_T or CUBLAS_OP_N |
| * @return CUBLAS_OP_N if trans is CUBLAS_OP_T else CUBLAS_OP_T |
| */ |
| private static int reverseCublasOp(int trans) { |
| return trans == cublasOperation.CUBLAS_OP_T ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T; |
| } |
| |
| /** |
| * Flips the cuSPARSE trans value |
| * |
| * @param trans |
| * can be CUSPARSE_OPERATION_NON_TRANSPOSE or |
| * CUSPARSE_OPERATION_TRANSPOSE |
| * @return CUSPARSE_OPERATION_NON_TRANSPOSE if trans is |
| * CUSPARSE_OPERATION_TRANSPOSE else CUSPARSE_OPERATION_TRANSPOSE |
| */ |
| private static int reverseCusparseOp(int trans) { |
| return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; |
| } |
| } |