| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.matrix.data; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.log4j.Level; |
| import org.apache.log4j.Logger; |
| import org.apache.sysds.api.DMLScript; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.controlprogram.caching.MatrixObject; |
| import org.apache.sysds.runtime.controlprogram.context.ExecutionContext; |
| import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing; |
| import org.apache.sysds.runtime.functionobjects.And; |
| import org.apache.sysds.runtime.functionobjects.Builtin; |
| import org.apache.sysds.runtime.functionobjects.CM; |
| import org.apache.sysds.runtime.functionobjects.Divide; |
| import org.apache.sysds.runtime.functionobjects.Equals; |
| import org.apache.sysds.runtime.functionobjects.GreaterThan; |
| import org.apache.sysds.runtime.functionobjects.GreaterThanEquals; |
| import org.apache.sysds.runtime.functionobjects.IndexFunction; |
| import org.apache.sysds.runtime.functionobjects.IntegerDivide; |
| import org.apache.sysds.runtime.functionobjects.KahanPlus; |
| import org.apache.sysds.runtime.functionobjects.KahanPlusSq; |
| import org.apache.sysds.runtime.functionobjects.LessThan; |
| import org.apache.sysds.runtime.functionobjects.LessThanEquals; |
| import org.apache.sysds.runtime.functionobjects.Mean; |
| import org.apache.sysds.runtime.functionobjects.Minus; |
| import org.apache.sysds.runtime.functionobjects.Minus1Multiply; |
| import org.apache.sysds.runtime.functionobjects.MinusNz; |
| import org.apache.sysds.runtime.functionobjects.Modulus; |
| import org.apache.sysds.runtime.functionobjects.Multiply; |
| import org.apache.sysds.runtime.functionobjects.Multiply2; |
| import org.apache.sysds.runtime.functionobjects.NotEquals; |
| import org.apache.sysds.runtime.functionobjects.Or; |
| import org.apache.sysds.runtime.functionobjects.Plus; |
| import org.apache.sysds.runtime.functionobjects.Power; |
| import org.apache.sysds.runtime.functionobjects.Power2; |
| import org.apache.sysds.runtime.functionobjects.ReduceAll; |
| import org.apache.sysds.runtime.functionobjects.ReduceCol; |
| import org.apache.sysds.runtime.functionobjects.ReduceDiag; |
| import org.apache.sysds.runtime.functionobjects.ReduceRow; |
| import org.apache.sysds.runtime.functionobjects.ValueFunction; |
| import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode; |
| import org.apache.sysds.runtime.instructions.cp.DoubleObject; |
| import org.apache.sysds.runtime.instructions.gpu.GPUInstruction; |
| import org.apache.sysds.runtime.instructions.gpu.context.CSRPointer; |
| import org.apache.sysds.runtime.instructions.gpu.context.ExecutionConfig; |
| import org.apache.sysds.runtime.instructions.gpu.context.GPUContext; |
| import org.apache.sysds.runtime.instructions.gpu.context.GPUObject; |
| import org.apache.sysds.runtime.instructions.gpu.context.JCudaKernels; |
| import org.apache.sysds.runtime.matrix.operators.AggregateOperator; |
| import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.BinaryOperator; |
| import org.apache.sysds.runtime.matrix.operators.CMOperator; |
| import org.apache.sysds.runtime.matrix.operators.LeftScalarOperator; |
| import org.apache.sysds.runtime.matrix.operators.RightScalarOperator; |
| import org.apache.sysds.runtime.matrix.operators.ScalarOperator; |
| import org.apache.sysds.runtime.util.IndexRange; |
| import org.apache.sysds.utils.Statistics; |
| |
| import jcuda.Pointer; |
| import jcuda.Sizeof; |
| import jcuda.jcublas.cublasDiagType; |
| import jcuda.jcublas.cublasFillMode; |
| import jcuda.jcublas.cublasHandle; |
| import jcuda.jcublas.cublasOperation; |
| import jcuda.jcublas.cublasSideMode; |
| import jcuda.jcusparse.cusparseAction; |
| import jcuda.jcusparse.cusparseHandle; |
| import jcuda.jcusparse.cusparseIndexBase; |
| import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N; |
| import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T; |
| import static jcuda.runtime.JCuda.cudaDeviceSynchronize; |
| import static jcuda.runtime.JCuda.cudaMemcpy; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; |
| import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; |
| |
| import java.lang.Math; |
| import java.util.ArrayList; |
| |
| /** |
| * All CUDA kernels and library calls are redirected through this class |
| * @see GPUContext |
| * @see GPUObject |
| */ |
| public class LibMatrixCUDA { |
| private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName()); |
| |
| protected static int CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE; |
| // The below variables are used in CSRPointer, GPUObjects, etc. |
| public static CudaSupportFunctions cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions(); |
| public static int sizeOfDataType = jcuda.Sizeof.DOUBLE; |
| public static String customKernelSuffix = "_d"; |
| |
| /** |
	 * Sets the internal floating-point state (CuDNN data type, support functions, element size, kernel suffix) based on DMLScript.FLOATING_POINT_PRECISION.
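	 * <p>
	 * Illustrative sketch (assuming {@code DMLScript.FLOATING_POINT_PRECISION} has been set by the configuration layer):
	 * <pre>{@code
	 * // after the configuration has been set to "single" ...
	 * LibMatrixCUDA.resetFloatingPointPrecision();
	 * // ... sizeOfDataType == jcuda.Sizeof.FLOAT and customKernelSuffix == "_f"
	 * }</pre>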
| */ |
| public static void resetFloatingPointPrecision() { |
| if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("double")) { |
| LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE; |
| LibMatrixCUDA.cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions(); |
| LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.DOUBLE; |
| LibMatrixCUDA.customKernelSuffix = "_d"; |
| } |
| else if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("single")) { |
| LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_FLOAT; |
| LibMatrixCUDA.cudaSupportFunctions = new SinglePrecisionCudaSupportFunctions(); |
| LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.FLOAT; |
| LibMatrixCUDA.customKernelSuffix = "_f"; |
| } |
| else { |
| throw new DMLRuntimeException("Unsupported floating point precision: " + DMLScript.FLOATING_POINT_PRECISION); |
| } |
| } |
| |
	// Assume compute capability >= 3.0:
	// MAX_BLOCKS is 2^31 - 1 for compute capability >= 3.0
	// MAX_THREADS is 1024 for compute capability >= 3.0
	// The warp size is usually 32
	// Shared memory is at most 48 KB on most current GPUs
| private static int _MAX_THREADS = -1; |
| private static int _MAX_BLOCKS = -1; |
| private static int _WARP_SIZE = -1; |
| private static long _SHMEM_SIZE = -1; |
| |
| // From CuDNN 5.1 documentation: |
| // The total size of a tensor including the potential padding between dimensions is limited to 2 Giga-elements of type datatype. |
| protected static long maxNumElementsOfCuDNNTensor = 2000000000; |
| |
| //********************************************************************/ |
| //***************************** UTILS ********************************/ |
| //********************************************************************/ |
| |
| /** |
	 * Utility function to get the maximum number of threads per block supported by the active CUDA device.
| * @param gCtx a valid {@link GPUContext} |
| * @return max threads |
| */ |
| static int getMaxThreads(GPUContext gCtx){ |
| if (_MAX_THREADS == -1){ |
| _MAX_THREADS = gCtx.getMaxThreadsPerBlock(); |
| } |
| return _MAX_THREADS; |
| } |
| |
| /** |
| * Utility function to get maximum number of blocks supported by the active CUDA device. |
| * @param gCtx a valid {@link GPUContext} |
| * @return max blocks |
| */ |
| static int getMaxBlocks(GPUContext gCtx) { |
| if (_MAX_BLOCKS == -1){ |
| _MAX_BLOCKS = gCtx.getMaxBlocks(); |
| } |
| return _MAX_BLOCKS; |
| } |
| |
| /** |
	 * Utility function to get the maximum amount of shared memory per block supported by the active CUDA device.
	 * @param gCtx a valid {@link GPUContext}
	 * @return maximum shared memory per block, in bytes
| */ |
| static long getMaxSharedMemory(GPUContext gCtx) { |
| if (_SHMEM_SIZE == -1) { |
| _SHMEM_SIZE = gCtx.getMaxSharedMemory(); |
| } |
| return _SHMEM_SIZE; |
| } |
| |
| /** |
| * Utility function to get the warp size supported by the active CUDA device. |
| * @param gCtx a valid {@link GPUContext} |
	 * @return warp size
| */ |
| static int getWarpSize(GPUContext gCtx) { |
| if (_WARP_SIZE == -1) { |
| _WARP_SIZE = gCtx.getWarpSize(); |
| } |
| return _WARP_SIZE; |
| } |
| |
| public static boolean isInSparseFormat(GPUContext gCtx, MatrixObject mo) { |
| if(mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated()) |
| return mo.getGPUObject(gCtx).isSparse(); |
| return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz()); |
| } |
| |
| /** |
	 * Gets the number of non-zero values of the given matrix object.
	 * Note: if the matrix is allocated on the GPU in dense format, the number of non-zeros is explicitly re-computed when recomputeDenseNNZ is set.
| * |
| * @param gCtx a valid GPU context |
| * @param instName instruction name |
| * @param mo matrix object |
| * @param recomputeDenseNNZ recompute NNZ if dense |
| * @return number of non-zeroes |
| */ |
| public static long getNnz(GPUContext gCtx, String instName, MatrixObject mo, boolean recomputeDenseNNZ) { |
| if(mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated()) { |
| return mo.getGPUObject(gCtx).getNnz(instName, recomputeDenseNNZ); |
| } |
| else { |
| return mo.getNnz(); |
| } |
| } |
| |
| |
| protected static cusparseHandle getCusparseHandle(GPUContext gCtx) { |
| return gCtx.getCusparseHandle(); |
| } |
| |
| protected static cublasHandle getCublasHandle(GPUContext gCtx) { |
| return gCtx.getCublasHandle(); |
| } |
| |
| public static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException { |
| return gCtx.getKernels(); |
| } |
| |
| public static Pointer double2float(GPUContext gCtx, Pointer A, Pointer ret, int numElems) { |
| getCudaKernels(gCtx).launchKernel("double2float", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), |
| A, ret, numElems); |
| return ret; |
| } |
| |
| public static Pointer float2double(GPUContext gCtx, Pointer A, Pointer ret, int numElems) { |
| getCudaKernels(gCtx).launchKernel("float2double", |
| ExecutionConfig.getConfigForSimpleVectorOperations(numElems), A, ret, numElems); |
| return ret; |
| } |
| |
| //********************************************************************/ |
| //************************ End of UTILS ******************************/ |
| //********************************************************************/ |
| |
| //********************************************************************/ |
| //***************** DEEP LEARNING Operators **************************/ |
| //********************************************************************/ |
| |
| private static Pointer _one; |
| private static Pointer _zero; |
| private static int oldDataTypeSize; |
| /** |
	 * Convenience method to get a pointer to the value '1.0' on the device, instead of allocating and deallocating it for every kernel invocation.
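	 * <p>
	 * Illustrative sketch: these cached constants are typically passed as the alpha/beta scalar arguments
	 * of CUDA library calls, e.g. (as in matmultTSMM below)
	 * <pre>{@code
	 * cudaSupportFunctions.cublassyrk(handle, fillMode, trans, m, k, one(), A, lda, zero(), C, ldc);
	 * }</pre>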
| * @return jcuda pointer |
| */ |
| public static Pointer one() { |
| if(_one == null || oldDataTypeSize != sizeOfDataType) { |
| _one = _dataTypePointerTo(1.0); |
| oldDataTypeSize = sizeOfDataType; |
| } |
| return _one; |
| } |
| /** |
	 * Convenience method to get a pointer to the value '0.0' on the device, instead of allocating and deallocating it for every kernel invocation.
| * @return jcuda pointer |
| */ |
| public static Pointer zero() { |
| if(_zero == null || oldDataTypeSize != sizeOfDataType) { |
| _zero = _dataTypePointerTo(0.0); |
| oldDataTypeSize = sizeOfDataType; |
| } |
| return _zero; |
| } |
| |
| /** |
	 * Convenience method to get the dense matrix pointer of a {@link MatrixObject}. This method explicitly converts sparse to dense format, so use it judiciously.
| * @param gCtx a valid {@link GPUContext} |
| * @param input input matrix object |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @return jcuda pointer |
| */ |
| public static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException { |
| if(isInSparseFormat(gCtx, input)) { |
| input.getGPUObject(gCtx).sparseToDense(instName); |
| } |
| return input.getGPUObject(gCtx).getDensePointer(); |
| } |
| |
| /** |
| * Convenience method to get the sparse matrix pointer from a {@link MatrixObject}. Converts dense to sparse if necessary. |
| * @param gCtx a valid {@link GPUContext} |
| * @param input input matrix |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @return a sparse matrix pointer |
| */ |
| protected static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) { |
| if(!isInSparseFormat(gCtx, input)) { |
| input.getGPUObject(gCtx).denseToSparse(); |
| } |
| return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); |
| } |
| |
| private static Pointer _dataTypePointerTo(double value) { |
| if(sizeOfDataType == Sizeof.DOUBLE) { |
| return Pointer.to(new double[] { value }); |
| } |
| else if(sizeOfDataType == Sizeof.FLOAT) { |
| return Pointer.to(new float[] { (float) value }); |
| } |
| else { |
| throw new RuntimeException("Unsupported datatype with size " + sizeOfDataType); |
| } |
| } |
| |
| protected static Pointer dataTypePointerTo(double value) { |
| if(value == 1) { |
| return one(); |
| } |
| else if(value == 0) { |
| return zero(); |
| } |
| else { |
| return _dataTypePointerTo(value); |
| } |
| } |
| |
| |
| /** |
	 * This method computes the backpropagation errors for the previous layer of a relu operation
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param input input image |
	 * @param dout errors propagated from the next layer
| * @param outputBlock output |
| */ |
| public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx); |
| } |
| long rows = input.getNumRows(); |
| long cols = input.getNumColumns(); |
| Pointer imagePointer = getDensePointer(gCtx, input, instName); |
| Pointer doutPointer = getDensePointer(gCtx, dout, instName); |
| Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName); |
| |
| getCudaKernels(gCtx).launchKernel("relu_backward", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), |
| imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols)); |
| } |
| |
| /** |
	 * Performs the channel_sums operation: out = rowSums(matrix(colSums(A), rows=C, cols=HW))
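	 * <p>
	 * For example (illustrative): a 2 x 6 input with C=2 and HW=3 is column-summed to a 1 x 6 row vector,
	 * reshaped row-wise to a 2 x 3 matrix, and row-summed to the 2 x 1 vector of per-channel sums.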
| * |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param input input image |
| * @param outputBlock output |
| * @param C number of channels |
| * @param HW height*width |
| */ |
| public static void channelSums(GPUContext gCtx, String instName, MatrixObject input, MatrixObject outputBlock, long C, long HW) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : channelSums" + ", GPUContext=" + gCtx); |
| } |
| int N = toInt(input.getNumRows()); |
| int cols = toInt(input.getNumColumns()); |
| if(cols != C*HW) { |
| throw new DMLRuntimeException("Incorrect parameters, number of columns " + cols + " != " + C + "*" + HW); |
| } |
| Pointer imagePointer = getDensePointer(gCtx, input, instName); |
| Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName); |
| |
| // We can replace this with CuDNN tensor reduce |
| Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType); |
| reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols); |
| reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW)); |
| gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE); |
| } |
| |
| /** |
| * Performs the operation corresponding to the DML script: |
| * ones = matrix(1, rows=1, cols=Hout*Wout) |
| * output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) |
	 * This operation often follows conv2d, hence the bias_multiply(input, bias) built-in function
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param input input image |
| * @param bias bias |
| * @param outputBlock output |
| */ |
| public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx); |
| } |
| if(isInSparseFormat(gCtx, input)) { |
| input.getGPUObject(gCtx).sparseToDense(instName); |
| } |
| if(isInSparseFormat(gCtx, bias)) { |
| bias.getGPUObject(gCtx).sparseToDense(instName); |
| } |
| long rows = input.getNumRows(); |
| long cols = input.getNumColumns(); |
| long K = bias.getNumRows(); |
| long PQ = cols / K; |
| if(bias.getNumColumns() != 1 || cols % K != 0) { |
| throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]"); |
| } |
| Pointer imagePointer = input.getGPUObject(gCtx).getDensePointer(); |
| Pointer biasPointer = bias.getGPUObject(gCtx).getDensePointer(); |
| Pointer outputPointer = outputBlock.getGPUObject(gCtx).getDensePointer(); |
| getCudaKernels(gCtx).launchKernel("bias_multiply", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), |
| imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ)); |
| } |
| |
| /** |
| * Performs the operation corresponding to the DML script: |
| * ones = matrix(1, rows=1, cols=Hout*Wout) |
| * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) |
	 * This operation often follows conv2d, hence the bias_add(input, bias) built-in function
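	 * <p>
	 * Concretely (illustrative reading of the script above): bias value k is added to columns k*PQ through
	 * (k+1)*PQ - 1 of every input row, where PQ = cols / K and K is the number of bias rows.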
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param input input image |
| * @param bias bias |
| * @param outputBlock output |
| */ |
| public static void biasAdd(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) { |
| Pointer imagePointer = getDensePointer(gCtx, input, instName); |
| Pointer biasPointer = getDensePointer(gCtx, bias, instName); |
| Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName); |
| int rows = toInt(input.getNumRows()); |
| int cols = toInt(input.getNumColumns()); |
| int K = toInt(bias.getNumRows()); |
| if(bias.getNumColumns() != 1 || cols % K != 0) { |
| throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]"); |
| } |
| biasAdd(gCtx, instName, imagePointer, biasPointer, outputPointer, rows, cols, K); |
| } |
| |
| /** |
| * Performs the operation corresponding to the DML script: |
| * ones = matrix(1, rows=1, cols=Hout*Wout) |
| * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) |
	 * This operation often follows conv2d, hence the bias_add(input, bias) built-in function
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param image input image |
| * @param bias bias |
| * @param output output |
| * @param rows rows in input image |
| * @param cols cols in input image |
| * @param k rows in bias |
| */ |
| private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx); |
| } |
| int PQ = cols / k; |
| getCudaKernels(gCtx).launchKernel("bias_add", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols), |
| image, bias, output, rows, cols, PQ); |
| } |
| |
| |
| //********************************************************************/ |
| //************* End of DEEP LEARNING Operators ***********************/ |
| //********************************************************************/ |
| |
| |
| |
| //********************************************************************/ |
| //********** TRANSPOSE SELF MATRIX MULTIPLY Functions ****************/ |
| //********************************************************************/ |
| |
| /** |
| * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...) |
| * <p> |
| * Memory Usage - If dense, input space - rows * cols, no intermediate memory, output - Max(rows*rows, cols*cols) |
| * If sparse, calls matmult |
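	 * <p>
	 * For example (illustrative): for a dense 10000 x 20 input A with isLeftTransposed=true, this computes
	 * the 20 x 20 result of A' %*% A via a single syrk call followed by a triangle-copy kernel.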
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
	 * @param left input matrix of the tsmm expression A %*% A' or A' %*% A; only whether this matrix is transposed matters, hence the name 'left'
| * @param outputName output matrix name |
| * @param isLeftTransposed if true, left transposed |
| */ |
| public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName, |
| boolean isLeftTransposed) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx); |
| } |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| |
| if(isInSparseFormat(gCtx, left)) { |
| // For sparse TSMM, invoke matmult (TODO: possible performance improvement) |
| LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed); |
| return; |
| } |
| |
| // Since CuBLAS expects inputs in column-major format, |
| // reverse the order of matrix-multiplication and take care of dimension mismatch. |
| int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T; |
| // Note: the dimensions are swapped |
| int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()); |
| int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns()); |
| |
| // For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix |
| MatrixObject output = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, m, m); // Allocated the dense output matrix |
| |
| if(m == -1) |
| throw new DMLRuntimeException("Incorrect dimensions"); |
| |
| int lda = toInt(isLeftTransposed ? m : k); |
| int ldc = m; |
| |
| if(!left.getGPUObject(gCtx).isAllocated()) |
| throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated()); |
| if(!output.getGPUObject(gCtx).isAllocated()) |
| throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject(gCtx).isAllocated()); |
| |
| Pointer A = getDensePointer(gCtx, left, instName); |
| Pointer C = getDensePointer(gCtx, output, instName); |
| |
| cudaSupportFunctions.cublassyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc); |
| copyUpperToLowerTriangle(gCtx, instName, output); |
| } |
| |
| /** |
	 * Used for all versions of TSMM where the result is known to be symmetric.
	 * Hence, we compute only the upper triangular part and copy this partial
	 * result down to the lower triangular part once.
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName instruction name |
| * @param ret upper triangular matrix |
| */ |
| private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx); |
| } |
| if(isInSparseFormat(gCtx, ret)) { |
| throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented"); |
| } |
| if(ret.getNumRows() != ret.getNumColumns()) { |
| throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle"); |
| } |
| int dim = toInt(ret.getNumRows()); |
| getCudaKernels(gCtx).launchKernel("copy_u2l_dense", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim), |
| getDensePointer(gCtx, ret, instName), dim, dim*dim); |
| } |
| |
| |
| |
| //********************************************************************/ |
| //******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/ |
| //********************************************************************/ |
| |
| |
| //********************************************************************/ |
| //**************** UNARY AGGREGATE Functions ************************/ |
| //********************************************************************/ |
| |
| /** |
| * Entry point to perform Unary aggregate operations on the GPU. |
| * The execution context object is used to allocate memory for the GPU. |
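	 * <p>
	 * For example (illustrative): a colSums(X) instruction maps to indexFn=ReduceRow with a (Kahan-)plus
	 * aggregate and produces a 1 x ncol(X) output, whereas sum(X) maps to ReduceAll and sets a scalar output.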
| * |
| * @param ec Instance of {@link ExecutionContext}, from which the output variable will be allocated |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName name of the invoking instruction for recording {@link Statistics}.
| * @param in1 input matrix |
| * @param output output matrix/scalar name |
| * @param op Instance of {@link AggregateUnaryOperator} which encapsulates the direction of reduction/aggregation and the reduction operation. |
| */ |
| public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx); |
| } |
| final int REDUCTION_ALL = 1; |
| final int REDUCTION_ROW = 2; |
| final int REDUCTION_COL = 3; |
| final int REDUCTION_DIAG = 4; |
| |
		// A Kahan sum implementation is not provided. If a "uak+" or other Kahan operator is encountered,
		// regular summation reduction is performed instead.
| final int OP_PLUS = 1; |
| final int OP_PLUS_SQ = 2; |
| final int OP_MEAN = 3; |
| final int OP_VARIANCE = 4; |
| final int OP_MULTIPLY = 5; |
| final int OP_MAX = 6; |
| final int OP_MIN = 7; |
| final int OP_MAXINDEX = 8; |
| final int OP_MININDEX = 9; |
| |
| |
| // Sanity Checks |
| if(!in1.getGPUObject(gCtx).isAllocated()) |
| throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated()); |
| |
| boolean isSparse = in1.getGPUObject(gCtx).isSparse(); |
| IndexFunction indexFn = op.indexFn; |
| AggregateOperator aggOp = op.aggOp; |
| |
| // Convert Reduction direction to a number |
| int reductionDirection = -1; |
| if (indexFn instanceof ReduceAll){ |
| reductionDirection = REDUCTION_ALL; |
| } else if (indexFn instanceof ReduceRow){ |
| reductionDirection = REDUCTION_ROW; |
| } else if (indexFn instanceof ReduceCol){ |
| reductionDirection = REDUCTION_COL; |
| } else if (indexFn instanceof ReduceDiag){ |
| reductionDirection = REDUCTION_DIAG; |
| } else { |
| throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations"); |
| } |
| if(reductionDirection == -1) |
| throw new DMLRuntimeException("Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction"); |
| |
| // Convert function type to a number |
| int opIndex = -1; |
| if (aggOp.increOp.fn instanceof KahanPlus) { |
| opIndex = OP_PLUS; |
| } else if (aggOp.increOp.fn instanceof KahanPlusSq) { |
| opIndex = OP_PLUS_SQ; |
| } else if (aggOp.increOp.fn instanceof Mean) { |
| opIndex = OP_MEAN; |
| } else if (aggOp.increOp.fn instanceof CM) { |
| if(((CM)aggOp.increOp.fn).getAggOpType() != CMOperator.AggregateOperationTypes.VARIANCE) |
| throw new DMLRuntimeException("Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU"); |
| opIndex = OP_VARIANCE; |
| } else if (aggOp.increOp.fn instanceof Plus) { |
| opIndex = OP_PLUS; |
| } else if (aggOp.increOp.fn instanceof Multiply) { |
| opIndex = OP_MULTIPLY; |
| } else if (aggOp.increOp.fn instanceof Builtin) { |
| Builtin b = (Builtin)aggOp.increOp.fn; |
| switch(b.bFunc) { |
| case MAX: opIndex = OP_MAX; break; |
| case MIN: opIndex = OP_MIN; break; |
| case MAXINDEX: opIndex = OP_MAXINDEX; break; |
| case MININDEX: opIndex = OP_MININDEX;break; |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU"); |
| } |
| } else { |
| throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function"); |
| } |
| if(opIndex == -1) |
| throw new DMLRuntimeException("Internal Error - Incorrect type of operation set for aggregate unary GPU instruction"); |
| |
| int rlen = (int)in1.getNumRows(); |
| int clen = (int)in1.getNumColumns(); |
| if (isSparse){ |
| // The strategy for the time being is to convert sparse to dense |
| // until a sparse specific kernel is written. |
| in1.getGPUObject(gCtx).sparseToDense(instName); |
| // long nnz = in1.getNnz(); |
| // assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU"; |
| // MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz); |
| // throw new DMLRuntimeException("Internal Error - Not implemented"); |
| |
| } |
| |
| long outRLen = -1; |
| long outCLen = -1; |
| if (indexFn instanceof ReduceRow) { // COL{SUM, MAX...} |
| outRLen = 1; |
| outCLen = clen; |
| } |
| else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...} |
| outRLen = rlen; |
| outCLen = 1; |
| } |
| |
| Pointer out = null; |
| if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) { |
| // Matrix output |
| MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen); |
| out = getDensePointer(gCtx, out1, instName); |
| } |
| |
| Pointer in = getDensePointer(gCtx, in1, instName); |
| int size = rlen * clen; |
| |
| // For scalars, set the scalar output in the Execution Context object |
| switch (opIndex){ |
| case OP_PLUS: { |
| switch(reductionDirection) { |
| case REDUCTION_ALL : { |
| double result = reduceAll(gCtx, instName, "reduce_sum", in, size); |
| ec.setScalarOutput(output, new DoubleObject(result)); |
| break; |
| } |
				case REDUCTION_COL : { // The names are a bit misleading: REDUCTION_COL (from ReduceCol) collapses the column dimension, i.e., each row is reduced to a single value (e.g., rowSums)
| reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen); |
| break; |
| } |
| case REDUCTION_ROW : { |
| reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen); |
| break; |
| } |
| case REDUCTION_DIAG : |
| throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet"); |
| } |
| break; |
| } |
| case OP_PLUS_SQ : { |
| // Calculate the squares in a temporary object tmp |
| Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType); |
| |
| squareMatrix(gCtx, instName, in, tmp, rlen, clen); |
| // Then do the sum on the temporary object and free it |
| switch(reductionDirection) { |
| case REDUCTION_ALL : { |
| double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size); |
| ec.setScalarOutput(output, new DoubleObject(result)); |
| break; |
| } |
				case REDUCTION_COL : { // The names are a bit misleading: REDUCTION_COL (from ReduceCol) collapses the column dimension, i.e., each row is reduced to a single value (e.g., rowSums)
| reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen); |
| break; |
| } |
| case REDUCTION_ROW : { |
| reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen); |
| break; |
| } |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared"); |
| } |
| gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE); |
| break; |
| } |
| case OP_MEAN:{ |
| switch(reductionDirection) { |
| case REDUCTION_ALL: { |
| double result = reduceAll(gCtx, instName, "reduce_sum", in, size); |
| double mean = result / size; |
| ec.setScalarOutput(output, new DoubleObject(mean)); |
| break; |
| } |
| case REDUCTION_COL: { |
| reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen); |
| break; |
| } |
| case REDUCTION_ROW: { |
| reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen); |
| break; |
| } |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean"); |
| } |
| break; |
| } |
| case OP_MULTIPLY : { |
| switch (reductionDirection) { |
| case REDUCTION_ALL: { |
| double result = reduceAll(gCtx, instName, "reduce_prod", in, size); |
| ec.setScalarOutput(output, new DoubleObject(result)); |
| break; |
| } |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication"); |
| } |
| break; |
| } |
| case OP_MAX :{ |
| switch(reductionDirection) { |
| case REDUCTION_ALL: { |
| double result = reduceAll(gCtx, instName, "reduce_max", in, size); |
| ec.setScalarOutput(output, new DoubleObject(result)); |
| break; |
| } |
| case REDUCTION_COL: { |
| reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen); |
| break; |
| } |
| case REDUCTION_ROW: { |
| reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen); |
| break; |
| } |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max"); |
| } |
| break; |
| } |
| case OP_MIN :{ |
| switch(reductionDirection) { |
| case REDUCTION_ALL: { |
| double result = reduceAll(gCtx, instName, "reduce_min", in, size); |
| ec.setScalarOutput(output, new DoubleObject(result)); |
| break; |
| } |
| case REDUCTION_COL: { |
| reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen); |
| break; |
| } |
| case REDUCTION_ROW: { |
| reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen); |
| break; |
| } |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min"); |
| } |
| break; |
| } |
| case OP_VARIANCE : { |
			// Temporary GPU arrays for the mean-subtracted values and their squares
| Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType); |
| Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType); |
| |
| switch(reductionDirection) { |
| |
| case REDUCTION_ALL: { |
| double result = reduceAll(gCtx, instName, "reduce_sum", in, size); |
| double mean = result / size; |
| |
| // Subtract mean from every element in the matrix |
| ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean); |
| matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp); |
| |
| squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); |
| |
| double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size); |
| double variance = result2 / (size - 1); |
| ec.setScalarOutput(output, new DoubleObject(variance)); |
| |
| break; |
| } |
| case REDUCTION_COL: { |
| reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen); |
| // Subtract the row-wise mean from every element in the matrix |
| BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); |
| matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp); |
| |
| squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); |
| |
| Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType); |
| reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen); |
| |
| ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1); |
| matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp); |
| |
| gCtx.cudaFreeHelper(instName, tmpRow, DMLScript.EAGER_CUDA_FREE); |
| |
| break; |
| } |
| case REDUCTION_ROW: { |
| reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen); |
| // Subtract the columns-wise mean from every element in the matrix |
| BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); |
| matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp); |
| |
| squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); |
| |
| Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType); |
| reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen); |
| |
| ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1); |
| matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp); |
| |
| gCtx.cudaFreeHelper(instName, tmpCol, DMLScript.EAGER_CUDA_FREE); |
| |
| break; |
| } |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance"); |
| } |
| gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE); |
| gCtx.cudaFreeHelper(instName, tmp2, DMLScript.EAGER_CUDA_FREE); |
| break; |
| } |
| case OP_MAXINDEX : { |
| switch(reductionDirection) { |
| case REDUCTION_COL: |
| throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU "); |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex"); |
| } |
| // break; |
| } |
| case OP_MININDEX : { |
| switch(reductionDirection) { |
| case REDUCTION_COL: |
| throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU "); |
| default: |
| throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex"); |
| } |
| // break; |
| } |
| default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!"); |
| } |
| } |
| |
| /** |
| * Helper method to square a matrix in GPU memory |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param in input matrix on GPU |
| * @param out output matrix on GPU |
| * @param rlen row length |
| * @param clen column length |
| */ |
| private static void squareMatrix(GPUContext gCtx, String instName, Pointer in, Pointer out, int rlen, int clen) { |
| ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2); |
| matrixScalarOp(gCtx, instName, in, 2, rlen, clen, out, power2op); |
| } |
| |
| /** |
| * Do a simple reduction, the output of which is a single value |
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param kernelFunction name of the kernel function to invoke |
| * @param in {@link Pointer} to matrix in device memory |
| * @param n size of array |
| * @return the reduced value |
| */ |
| private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx); |
| } |
| |
| int[] tmp = getKernelParamsForReduceAll(gCtx, n); |
| int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; |
| |
| Pointer tempOut = gCtx.allocate(instName, n*sizeOfDataType); |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n); |
| //cudaDeviceSynchronize; |
| |
| int s = blocks; |
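		// Each pass reduces the s partial results produced so far, using (threads * 2) elements per block,
		// until a single value remains; that value is then copied back to the host below.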
| while (s > 1) { |
| tmp = getKernelParamsForReduceAll(gCtx, s); |
| blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2]; |
| getCudaKernels(gCtx).launchKernel(kernelFunction, |
| new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s); |
| s = (s + (threads*2-1)) / (threads*2); |
| } |
| double[] result = {-1f}; |
| cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName, false); |
| gCtx.cudaFreeHelper(instName, tempOut, DMLScript.EAGER_CUDA_FREE); |
| return result[0]; |
| } |
| |
| /** |
| * Do a reduction by row. Data is reduced per row and the |
| * resulting vector is calculated. |
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param kernelFunction name of the kernel function to invoke |
| * @param in {@link Pointer} to input matrix in device memory (size - rows * columns) |
| * @param out {@link Pointer} to output matrix in device memory (size - rows * 1) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx); |
| } |
| |
| int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols); |
| int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; |
| |
| Timing time = new Timing(false); |
| double duration = 0; |
| if(LOG.isTraceEnabled()) { time.start(); } |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction, |
| new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); |
| |
| if(LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| duration = time.stop(); |
| LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms."); |
| } |
| } |
| |
| /** |
| * Do a reduction by column. Data is reduced per column and the |
| * resulting vector is calculated. |
	 * @param gCtx a valid {@link GPUContext}
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param kernelFunction name of the kernel function to invoke |
| * @param in {@link Pointer} to input matrix in device memory (size - rows * columns) |
| * @param out {@link Pointer} to output matrix in device memory (size - 1 * cols) |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| */ |
| private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx); |
| } |
| |
| int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols); |
| int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; |
| |
| Timing time = new Timing(false); |
| double duration = 0; |
| if(LOG.isTraceEnabled()) { time.start(); } |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction, |
| new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); |
| |
| if(LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| duration = time.stop(); |
| LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms."); |
| } |
| } |
| |
| /** |
| * Get threads, blocks and shared memory for a reduce all operation |
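	 * <p>
	 * Worked example (illustrative; assumes double precision and MAX_THREADS=1024): for n=2000,
	 * threads = nextPow2((2000+1)/2) = 1024, blocks = (2000 + 2*1024 - 1) / (2*1024) = 1, and
	 * shared memory = 1024 * 8 = 8192 bytes (no doubling, since threads > warp size).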
| * @param gCtx a valid {@link GPUContext} |
| * @param n size of input array |
| * @return integer array containing {blocks, threads, shared memory} |
| */ |
| private static int[] getKernelParamsForReduceAll(GPUContext gCtx, int n) { |
| final int MAX_THREADS = getMaxThreads(gCtx); |
| final int MAX_BLOCKS = getMaxBlocks(gCtx); |
| final int WARP_SIZE = getWarpSize(gCtx); |
| int threads = (n < MAX_THREADS *2) ? nextPow2((n + 1)/ 2) : MAX_THREADS; |
| |
| int blocks = (n + (threads * 2 - 1)) / (threads * 2); |
| blocks = Math.min(MAX_BLOCKS, blocks); |
| |
| int sharedMemSize = threads * sizeOfDataType; |
| if (threads <= WARP_SIZE) { |
| sharedMemSize *= 2; |
| } |
| return new int[] {blocks, threads, sharedMemSize}; |
| } |
| |
| /** |
| * Get threads, blocks and shared memory for a reduce by row operation |
| * @param gCtx a valid {@link GPUContext} |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| * @return integer array containing {blocks, threads, shared memory} |
| */ |
| private static int[] getKernelParamsForReduceByRow(GPUContext gCtx, int rows, int cols) { |
| final int WARP_SIZE = getWarpSize(gCtx); |
| final int MAX_THREADS = getMaxThreads(gCtx); |
| int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) : MAX_THREADS; |
| int blocks = rows; |
| int sharedMemSize = threads * sizeOfDataType; |
| if (threads <= WARP_SIZE){ |
| sharedMemSize *=2; |
| } |
| return new int[] {blocks, threads, sharedMemSize}; |
| } |
| |
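	/**
	 * Get threads, blocks and shared memory for a reduce by column operation
	 * @param gCtx a valid {@link GPUContext}
	 * @param rows number of rows in input matrix
	 * @param cols number of columns in input matrix
	 * @return integer array containing {blocks, threads, shared memory}
	 */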
| private static int[] getKernelParamsForReduceByCol(GPUContext gCtx, int rows, int cols) { |
| final int MAX_THREADS = getMaxThreads(gCtx); |
| final int MAX_BLOCKS = getMaxBlocks(gCtx); |
| final int WARP_SIZE = getWarpSize(gCtx); |
| int threads = Math.min(cols, MAX_THREADS); |
| int blocks = Math.min(cols/MAX_THREADS, MAX_BLOCKS); |
| if (cols % MAX_THREADS != 0) blocks++; |
| int sharedMemSize = threads * sizeOfDataType; |
| if (threads <= WARP_SIZE) { |
| sharedMemSize *=2; |
| } |
| return new int[] {blocks, threads, sharedMemSize}; |
| } |
| |
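	/**
	 * Rounds x up to the next power of two, e.g. nextPow2(1000) = 1024 and nextPow2(1024) = 1024.
	 * @param x positive integer
	 * @return smallest power of two greater than or equal to x
	 */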
| private static int nextPow2(int x) |
| { |
| --x; |
| x |= x >> 1; |
| x |= x >> 2; |
| x |= x >> 4; |
| x |= x >> 8; |
| x |= x >> 16; |
| return ++x; |
| } |
| |
| //********************************************************************/ |
| //**************** END OF UNARY AGGREGATE Functions *****************/ |
| //********************************************************************/ |
| |
| |
| //********************************************************************/ |
| //************ Matrix-Matrix & Matrix-Scalar Functions ***************/ |
| //********************************************************************/ |
| |
| /** |
| * Entry point to perform elementwise matrix-scalar relational operation specified by op |
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param in input matrix |
| * @param outputName output matrix name |
| * @param op scalar operator |
| */ |
| public static void matrixScalarRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, ScalarOperator op) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| double constant = op.getConstant(); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : matrixScalarRelational, scalar: " + constant + ", GPUContext=" + gCtx); |
| } |
| |
| Pointer A, C; |
| if (isSparseAndEmpty(gCtx, in)) { |
| setOutputToConstant(ec, gCtx, instName, op.executeScalar(0.0), outputName, in.getNumRows(), |
| in.getNumColumns()); |
| return; |
| } else { |
| A = getDensePointer(gCtx, in, instName); |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix |
| C = getDensePointer(gCtx, out, instName); |
| } |
| |
| int rlenA = toInt(in.getNumRows()); |
| int clenA = toInt(in.getNumColumns()); |
| |
| matrixScalarOp(gCtx, instName, A, constant, rlenA, clenA, C, op); |
| } |
| |
| /** |
| * Entry point to perform elementwise matrix-scalar arithmetic operation specified by op |
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param in input matrix |
| * @param outputName output matrix name |
| * @param isInputTransposed true if input transposed |
| * @param op scalar operator |
| */ |
| public static void matrixScalarArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| double constant = op.getConstant(); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx); |
| } |
| |
| int outRLen = isInputTransposed ? (int) in.getNumColumns() : (int) in.getNumRows(); |
| int outCLen = isInputTransposed ? (int) in.getNumRows() : (int) in.getNumColumns(); |
| |
| //boolean isCUDALibAvailable = (op.fn instanceof Multiply |
| // || (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0)) && !isSparseAndEmpty(gCtx, in); |
| //if(!isCUDALibAvailable) { |
| if(constant == 0) { |
| if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) { |
| deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed); |
| } |
| else if(op.fn instanceof Multiply || op.fn instanceof And) { |
| setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen); |
| } |
| else if(op.fn instanceof Power) { |
| setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen); |
| } |
| // TODO: |
| // x/0.0 is either +Infinity or -Infinity according to Java. |
| // In the context of a matrix, different elements of the matrix |
| // could have different values. |
| // If the IEEE 754 standard defines otherwise, this logic needs |
| // to be re-enabled and the Java computation logic for divide by zero |
| // needs to be revisited |
| //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) { |
| // setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName); |
| //} |
| //else if(op.fn instanceof Divide) { |
| // //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN. |
| // compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); |
| //} |
| else { |
| // TODO: Potential to optimize |
| matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op); |
| } |
| } |
| else if(constant == 1.0 && op.fn instanceof Or) { |
| setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen); |
| } |
| else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) { |
| deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed); |
| } |
| else { |
| matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op); |
| } |
| // } |
| //else { |
| // double alpha = 0; |
| // if(op.fn instanceof Multiply) { |
| // alpha = op.getConstant(); |
| // } |
| // else if(op.fn instanceof Divide && op instanceof RightScalarOperator) { |
| // alpha = Math.pow(op.getConstant(), -1.0); |
| // } |
| // else { |
| // throw new DMLRuntimeException("Unsupported op"); |
| // } |
| |
| // TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1) |
| // C = alpha* op( A ) + beta* op ( B ) |
| // dgeam(ec, gCtx, instName, in, in, outputName, isInputTransposed, isInputTransposed, alpha, 0.0); |
| //} |
| } |
| |
| |
| /** |
	 * Performs the elementwise relational operation specified by op on two input matrices in1 and in2
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param in1 input matrix 1 |
| * @param in2 input matrix 2 |
| * @param outputName output matrix name |
| * @param op binary operator |
| */ |
| public static void matrixMatrixRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, |
| String outputName, BinaryOperator op) { |
| |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| |
| boolean in1SparseAndEmpty = isSparseAndEmpty(gCtx, in1); |
| boolean in2SparseAndEmpty = isSparseAndEmpty(gCtx, in2); |
| if (in1SparseAndEmpty && in2SparseAndEmpty) { |
| if (op.fn instanceof LessThan || op.fn instanceof GreaterThan || op.fn instanceof NotEquals) { |
| setOutputToConstant(ec, gCtx, instName, 0.0, outputName, in1.getNumRows(), in1.getNumColumns()); |
| } else if (op.fn instanceof LessThanEquals || op.fn instanceof GreaterThanEquals || op.fn instanceof Equals) { |
| setOutputToConstant(ec, gCtx, instName, 1.0, outputName, in1.getNumRows(), in1.getNumColumns()); |
| } |
| } else if (in1SparseAndEmpty) { |
| matrixScalarRelational(ec, gCtx, instName, in2, outputName, new LeftScalarOperator(op.fn, 0.0)); |
| } else if (in2SparseAndEmpty) { |
| matrixScalarRelational(ec, gCtx, instName, in1, outputName, new RightScalarOperator(op.fn, 0.0)); |
| } else { |
| matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, false, false, op); |
| } |
| } |
| |
| /** |
	 * Performs the elementwise arithmetic operation specified by op on two input matrices in1 and in2
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param in1 input matrix 1 |
| * @param in2 input matrix 2 |
| * @param outputName output matrix name |
| * @param isLeftTransposed true if left-transposed |
| * @param isRightTransposed true if right-transposed |
| * @param op binary operator |
| */ |
| public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, |
| String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus) && !isSparseAndEmpty(gCtx, in1) && !isSparseAndEmpty(gCtx, in2) && !isVector(in1) && !isVector(in2); |
| if(!isCUDALibAvailable) { |
| matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op); |
| } |
| else { |
| double alpha; |
| double beta; |
| if(op.fn instanceof Plus) { |
| alpha = 1.0; |
| beta = 1.0; |
| } |
| else if(op.fn instanceof Minus) { |
| alpha = 1.0; |
| beta = -1.0; |
| } |
| else { |
| throw new DMLRuntimeException("Unsupported op"); |
| } |
| // C = alpha* op( A ) + beta* op ( B ) |
| dgeam(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, alpha, beta); |
| } |
| } |
| |
| /** |
	 * Utility to launch the matrix-scalar operation kernel
| * |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param ec execution context |
| * @param in input matrix |
| * @param outputName output variable name |
| * @param isInputTransposed true if input is transposed |
| * @param op operator |
| */ |
| public static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, |
| ScalarOperator op) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(isInputTransposed) |
| throw new DMLRuntimeException("Transposing the input is not supported"); |
| |
| int rlenA = toInt(in.getNumRows()); |
| int clenA = toInt(in.getNumColumns()); |
| Pointer A = getDensePointer(gCtx, in, instName); // TODO: FIXME: Implement sparse binCellSparseScalarOp kernel |
| double scalar = op.getConstant(); |
| // MatrixObject out = ec.getMatrixObject(outputName); |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rlenA, clenA); // Allocated the dense output matrix |
| Pointer C = getDensePointer(gCtx, out, instName); |
| matrixScalarOp(gCtx, instName, A, scalar, rlenA, clenA, C, op); |
| } |
| |
| /** |
	 * Helper method to launch the binary scalar-matrix arithmetic/relational operations CUDA kernel.
	 * This method is isolated so that other operations can reuse it,
	 * as it accepts JCuda {@link Pointer} instances instead of {@link MatrixObject} instances.
| * |
| * @param gCtx a valid {@link GPUContext} |
	 * @param instName the invoking instruction's name for recording {@link Statistics}.
| * @param a the dense input matrix (allocated on GPU) |
| * @param scalar the scalar value to do the op |
| * @param rlenA row length of matrix a |
| * @param clenA column length of matrix a |
| * @param c the dense output matrix |
| * @param op operation to perform |
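| * |
| * <p>A minimal illustration (assuming the usual SysDS scalar-operator semantics) of what the |
| * {@code isLeftScalar} flag passed to the kernel selects:</p> |
| * <pre>{@code |
| * // LeftScalarOperator  : c[i] = scalar op a[i] |
| * // RightScalarOperator : c[i] = a[i] op scalar |
| * }</pre> |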
| */ |
| private static void matrixScalarOp(GPUContext gCtx, String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : matrix_scalar_op" + ", GPUContext=" + gCtx); |
| } |
| int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0; |
| int size = rlenA * clenA; |
| getCudaKernels(gCtx).launchKernel("matrix_scalar_op", |
| ExecutionConfig.getConfigForSimpleVectorOperations(size), |
| a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar); |
| } |
| |
| /** |
| * Utility to launch binary cellwise matrix-matrix operations CUDA kernel |
| * |
| * @param gCtx a valid {@link GPUContext} |
| * @param ec execution context |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 left input matrix |
| * @param in2 right input matrix |
| * @param outputName output variable name |
| * @param isLeftTransposed true if left matrix is transposed |
| * @param isRightTransposed true if right matrix is transposed |
| * @param op operator |
| */ |
| private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, |
| String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| boolean isEmpty1 = isSparseAndEmpty(gCtx, in1); |
| boolean isEmpty2 = isSparseAndEmpty(gCtx, in2); |
| int rlenA = toInt(in1.getNumRows()); |
| int rlenB = toInt(in2.getNumRows()); |
| int clenA = toInt(in1.getNumColumns()); |
| int clenB = toInt(in2.getNumColumns()); |
| int vecStatusA = getVectorStatus(rlenA, clenA).code(); |
| int vecStatusB = getVectorStatus(rlenB, clenB).code(); |
| |
| if(isLeftTransposed || isRightTransposed) { |
| throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed); |
| } |
| long outRLen = Math.max(rlenA, rlenB); |
| long outCLen = Math.max(clenA, clenB); |
| |
| if (isEmpty1 && isEmpty2){ |
| MatrixObject out = ec.allocateGPUMatrixObject(outputName, outRLen, outCLen); |
| // When both inputs are empty, the output is also empty, except for division-like ops (filled with NaN) and minus1-multiply (filled with 1.0) |
| if (op.fn instanceof Divide || op.fn instanceof IntegerDivide || op.fn instanceof Modulus) { |
| out.getGPUObject(gCtx).allocateAndFillDense(Double.NaN); |
| } else if (op.fn instanceof Minus1Multiply) { |
| out.getGPUObject(gCtx).allocateAndFillDense(1.0); |
| } else { |
| out.getGPUObject(gCtx).allocateSparseAndEmpty(); |
| } |
| } |
| // Check for M1 * M2 when M1 is empty; if M2 is a vector then fall back to the general case |
| else if(isEmpty1 && clenB != 1 && rlenB != 1) { |
| // C = empty_in1 op in2 ==> becomes ==> C = 0.0 op in2 |
| matrixScalarArithmetic(ec, gCtx, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0)); |
| } |
| // Check for M1 * M2 when M2 is empty; if M1 is a vector then fall back to the general case |
| else if(isEmpty2 && clenA != 1 && rlenA != 1) { |
| // C = in1 op empty_in2 ==> becomes ==> C = in1 op 0.0 |
| matrixScalarArithmetic(ec, gCtx, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0)); |
| } |
| else { |
| Pointer A = getDensePointer(gCtx, in1, instName); // TODO: FIXME: Implement sparse binCellSparseOp kernel |
| Pointer B = getDensePointer(gCtx, in2, instName); // TODO: FIXME: Implement sparse binCellSparseOp kernel |
| |
| // Allocated the dense output matrix |
| MatrixObject out = null; |
| try { |
| out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); |
| } catch(DMLRuntimeException e) { |
| throw new DMLRuntimeException("Incorrect dimensions: dimA:[" + rlenA + "," + clenA + "]" |
| + " dimB:[" + rlenB + "," + clenB + "] out:[" + outRLen + "," + outCLen + "]", e); |
| } |
| Pointer C = getDensePointer(gCtx, out, instName); |
| |
| int maxRlen = Math.max(rlenA, rlenB); |
| int maxClen = Math.max(clenA, clenB); |
| |
| matrixMatrixOp(gCtx, instName, A, B, maxRlen, maxClen, vecStatusA, vecStatusB, C, op); |
| } |
| } |
| |
| /** |
| * Do an elementwise matrix-matrix arithmetic operation on the GPU |
| * c = a op b |
| * Either the dimensions of A and B match, or one of them (or both) is a vector. |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param a The input matrix a allocated on the GPU |
| * @param b The input matrix b allocated on the GPU |
| * @param maxRlen the maximum of the row lengths between a & b |
| * @param maxClen the maximum of the column lengths between a & b |
| * @param vecStatusA vector-shape code of matrix A (see {@link VectorShape}) |
| * @param vecStatusB vector-shape code of matrix B (see {@link VectorShape}) |
| * @param c output matrix of size (maxRlen, maxClen) allocated on GPU |
| * @param op the operation to perform |
| */ |
| private static void matrixMatrixOp(GPUContext gCtx, String instName, Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : matrix_matrix_cellwise_op" + ", GPUContext=" + gCtx); |
| } |
| getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen), |
| a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn)); |
| } |
| |
| /** |
| * This enum declares the different vector shapes |
| * as they are recognized by the invoked CUDA kernel(s). |
| */ |
| enum VectorShape { |
| COLUMN (1), |
| ROW (2), |
| NONE (0); |
| private final int code; |
| VectorShape(int code) { |
| this.code = code; |
| } |
| int code() { return code; } |
| } |
| |
| /** |
| * Given the number of rows and columns, returns |
| * whether this is a row vector, column vector or neither. |
| * @param rows number of rows |
| * @param cols number of columns |
| * @return {@link VectorShape#COLUMN} for a column vector, {@link VectorShape#ROW} for a row vector, {@link VectorShape#NONE} otherwise |
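| * |
| * <p>For example:</p> |
| * <pre>{@code |
| * getVectorStatus(100, 1);   // VectorShape.COLUMN (code 1) |
| * getVectorStatus(1, 100);   // VectorShape.ROW    (code 2) |
| * getVectorStatus(100, 100); // VectorShape.NONE   (code 0) |
| * }</pre> |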
| */ |
| private static VectorShape getVectorStatus(long rows, long cols) { |
| if(cols == 1) |
| return VectorShape.COLUMN; |
| else if(rows == 1) |
| return VectorShape.ROW; |
| else |
| return VectorShape.NONE; |
| } |
| |
| private static boolean isVector(MatrixObject in) { |
| return in.getNumRows() == 1 || in.getNumColumns() == 1; |
| } |
| |
| private static boolean isSparseAndEmpty(GPUContext gCtx, MatrixObject in1) { |
| boolean isSparse1 = isInSparseFormat(gCtx, in1); |
| boolean isEmpty1 = isSparse1 && in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr().nnz == 0; |
| return isEmpty1; |
| } |
| |
| private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName, boolean isInputTransposed) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(!isInputTransposed) |
| deviceCopy(ec, gCtx, instName, src, outputName); |
| else |
| transpose(ec, gCtx, instName, src, outputName); |
| } |
| |
| /** |
| * Performs a deep device copy of a matrix on the GPU |
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param src source matrix |
| * @param outputName destination variable name |
| */ |
| private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName) { |
| Pointer srcPtr = getDensePointer(gCtx, src, instName); // TODO: FIXME: Implement sparse kernel |
| MatrixObject out = ec.getMatrixObject(outputName); |
| getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, toInt(src.getNumRows()), toInt(src.getNumColumns())); // Allocated the dense output matrix |
| Pointer destPtr = getDensePointer(gCtx, out, instName); |
| deviceCopy(instName, srcPtr, destPtr, (int)src.getNumRows(), (int)src.getNumColumns()); |
| } |
| |
| /** |
| * Fills an array on the GPU with a given scalar value |
| * @param ec currently active instance of the {@link ExecutionContext} |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName name of the invoking instruction to record {@link Statistics}. |
| * @param constant scalar value with which to fill the matrix |
| * @param outputName (internal) name of the matrix that is to be filled |
| * @param numRows number of rows of output matrix object |
| * @param numCols number of columns of output matrix object |
| */ |
| private static void setOutputToConstant(ExecutionContext ec, GPUContext gCtx, String instName, double constant, String outputName, long numRows, long numCols) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(constant == 0) { |
| getSparseMatrixOutputForGPUInstruction(ec, numRows, numCols, 0, instName, outputName); |
| } else { |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix |
| Pointer A = getDensePointer(gCtx, out, instName); |
| int rlen = toInt(out.getNumRows()); |
| int clen = toInt(out.getNumColumns()); |
| int size = rlen * clen; |
| getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size); |
| } |
| } |
| |
| /** |
| * Performs a deep copy of the input device pointer corresponding to a matrix |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param src source matrix |
| * @param dest destination matrix |
| * @param rlen number of rows |
| * @param clen number of columns |
| */ |
| public static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) { |
| int size = rlen * clen * sizeOfDataType; |
| cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice); |
| } |
| |
| /** |
| * Helper function to get numeric value for binary op. |
| * This number is passed down to the CUDA kernel |
| * and the appropriate binary operation is performed on the GPU. |
| * op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power, |
| * 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal, |
| * 11=min, 12=max, 13=and, 14=or, 15=minus1multiply, 16=minusnz, |
| * 17=modulus, 18=integer division} |
| */ |
| private static int getBinaryOp(ValueFunction fn) { |
| if(fn instanceof Plus) return 0; |
| else if(fn instanceof Minus) return 1; |
| else if(fn instanceof Multiply) return 2; |
| else if(fn instanceof Divide) return 3; |
| else if(fn instanceof Power) return 4; |
| else if(fn instanceof LessThan) return 5; |
| else if(fn instanceof LessThanEquals) return 6; |
| else if(fn instanceof GreaterThan) return 7; |
| else if(fn instanceof GreaterThanEquals) return 8; |
| else if(fn instanceof Equals) return 9; |
| else if(fn instanceof NotEquals) return 10; |
| else if(fn instanceof And) return 13; |
| else if(fn instanceof Or) return 14; |
| else if(fn instanceof Multiply2) return 2; |
| else if(fn instanceof Power2) return 4; |
| else if(fn instanceof Minus1Multiply) return 15; |
| else if(fn instanceof MinusNz) return 16; |
| else if(fn instanceof Modulus) return 17; |
| else if(fn instanceof IntegerDivide) return 18; |
| else if(fn instanceof Builtin && ((Builtin)fn).getBuiltinCode()==BuiltinCode.MIN) return 11; |
| else if(fn instanceof Builtin && ((Builtin)fn).getBuiltinCode()==BuiltinCode.MAX) return 12; |
| |
| throw new DMLRuntimeException("The given value function is not supported:" + fn.getClass().getName()); |
| } |
| |
| /** |
| * Performs sparse and dense dgeam given two input matrices |
| * C = alpha* op( A ) + beta* op ( B ) |
| * where op = transpose or not (specified by isLeftTransposed and isRightTransposed). |
| * To indicate a transpose operation, make sure in1 == in2 and isLeftTransposed == isRightTransposed == true |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 left input matrix |
| * @param in2 right input matrix |
| * @param outputName output variable name |
| * @param isLeftTransposed true if left matrix is transposed |
| * @param isRightTransposed true if right matrix is transposed |
| * @param alpha alpha |
| * @param beta beta |
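| * |
| * <p>For example, {@link #transpose(ExecutionContext, GPUContext, String, MatrixObject, String)} |
| * uses the transpose special case as follows:</p> |
| * <pre>{@code |
| * // C = 1.0 * t(A) + 0.0 * t(A) |
| * dgeam(ec, gCtx, instName, in, in, outputName, true, true, 1.0, 0.0); |
| * }</pre> |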
| */ |
| private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, |
| boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx); |
| } |
| |
| Pointer alphaPtr = dataTypePointerTo(alpha); |
| Pointer betaPtr = dataTypePointerTo(beta); |
| int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N; |
| int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N; |
| |
| long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows(); |
| long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns(); |
| |
| MatrixObject out = ec.getMatrixObject(outputName); |
| boolean isSparse1 = isInSparseFormat(gCtx, in1); |
| boolean isSparse2 = isInSparseFormat(gCtx, in2); |
| |
| // TODO: Implement sparse-dense matrix cublasDgeam kernel |
| if(isSparse1 || isSparse2) { |
| int m = (int)in1.getNumRows(); |
| int n = (int)in1.getNumColumns(); |
| // Invoke cuSparse when either input is in sparse format |
| // Perform sparse-sparse dgeam |
| if (!isInSparseFormat(gCtx, in1)) { |
| in1.getGPUObject(gCtx).denseToSparse(); |
| } |
| CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); |
| if (!isInSparseFormat(gCtx, in2)) { |
| in2.getGPUObject(gCtx).denseToSparse(); |
| } |
| CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); |
| |
| ec.allocateGPUMatrixObject(outputName, outRLen, outCLen); |
| |
| if (in1 == in2 && isLeftTransposed && isRightTransposed) { |
| // Special case for transpose |
| |
| int nnz = (int)A.nnz; |
| CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n); |
| out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); |
| cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO); |
| } else { |
| // General case (cusparse does not accept the transpose operator for dgeam) |
| // TODO: to implement transposed dgeam for sparse matrices, they need to be converted to CSC, which is effectively a transpose |
| if (isLeftTransposed || isRightTransposed) { |
| throw new DMLRuntimeException( |
| "Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU"); |
| } |
| |
| CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n); |
| out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); |
| //long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows()); |
| cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr, |
| B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd); |
| } |
| } else { |
| // Dense-Dense dgeam |
| |
| int lda = toInt(in1.getNumColumns()); |
| int ldb = toInt(in2.getNumColumns()); |
| int m = toInt(in1.getNumColumns()); |
| int n = toInt(in2.getNumRows()); |
| if (isLeftTransposed && isRightTransposed) { |
| m = toInt(in1.getNumRows()); |
| n = toInt(in2.getNumColumns()); |
| } |
| else if (isLeftTransposed) { |
| m = toInt(in1.getNumRows()); |
| } else if (isRightTransposed) { |
| n = toInt(in2.getNumColumns()); |
| } |
| int ldc = m; |
| |
| Pointer A = getDensePointer(gCtx, in1, instName); |
| Pointer B = getDensePointer(gCtx, in2, instName); |
| getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocated the dense output matrix |
| Pointer C = getDensePointer(gCtx, out, instName); |
| |
| cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc); |
| } |
| } |
| |
| /** |
| * Computes C = t(A) |
| * @param ec execution context |
| * @param gCtx gpu context |
| * @param instName name of the instruction |
| * @param A pointer to the input matrix |
| * @param C pointer to the output matrix |
| * @param numRowsA number of rows of the input matrix |
| * @param numColsA number of columns of the input matrix |
| * @throws DMLRuntimeException if error |
| */ |
| public static void denseTranspose(ExecutionContext ec, GPUContext gCtx, String instName, |
| Pointer A, Pointer C, long numRowsA, long numColsA) throws DMLRuntimeException { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : dense transpose" + ", GPUContext=" + gCtx); |
| } |
| // Dense-Dense dgeam |
| int lda = toInt(numColsA); |
| int ldb = lda; |
| int m = toInt(numRowsA); |
| int n = lda; |
| int ldc = m; |
| cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), CUBLAS_OP_T, CUBLAS_OP_T, m, n, one(), A, lda, zero(), A, ldb, C, ldc); |
| } |
| |
| |
| //********************************************************************/ |
| //****** End of Matrix-Matrix & Matrix-Scalar Functions **************/ |
| //********************************************************************/ |
| |
| |
| |
| //********************************************************************/ |
| //************************ Re-org Functions **************************/ |
| //********************************************************************/ |
| |
| /** |
| * Transposes the input matrix using cublasDgeam |
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in input matrix |
| * @param outputName output matrix name |
| */ |
| public static void transpose(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) { |
| // C = alpha* op( A ) + beta* op ( B ) |
| // = 1.0 * A^T + 0.0 * A^T |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| dgeam(ec, gCtx, instName, in, in, outputName, true, true, 1.0, 0.0); |
| } |
| |
| //********************************************************************/ |
| //******************* End of Re-org Functions ************************/ |
| //********************************************************************/ |
| |
| public static int toInt(long num) { |
| if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) { |
| throw new DMLRuntimeException("GPU : Exceeded supported size " + num); |
| } |
| return (int)num; |
| } |
| |
| //********************************************************************/ |
| //**************** Matrix Manipulation Functions *********************/ |
| //********************************************************************/ |
| |
| /** |
| * Method to perform rightIndex operation for a given lower and upper bounds in row and column dimensions. |
| * |
| * @param ec current execution context |
| * @param gCtx current gpu context |
| * @param instName name of the instruction for maintaining statistics |
| * @param in1 input matrix object |
| * @param ixrange index range (0-based) |
| * @param outputName output matrix object |
| */ |
| public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, |
| IndexRange ixrange, String outputName) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException( |
| "GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx); |
| } |
| |
| int rl = (int) ixrange.rowStart; |
| int ru = (int) ixrange.rowEnd; |
| int cl = (int) ixrange.colStart; |
| int cu = (int) ixrange.colEnd; |
| if (rl < 0 || rl >= in1.getNumRows() || ru < rl || ru >= in1.getNumRows() || cl < 0 |
| || cu >= in1.getNumColumns() || cu < cl || cu >= in1.getNumColumns()) { |
| throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + "," |
| + (cl + 1) + ":" + (cu + 1) + "] " + "must be within matrix dimensions [" + in1.getNumRows() + "," |
| + in1.getNumColumns() + "]"); |
| } |
| int len1 = toInt(in1.getNumColumns()); |
| if(isInSparseFormat(gCtx, in1)) { |
| // Input in1 is in sparse format and output is in dense format |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1); |
| CSRPointer inPointer = getSparsePointer(gCtx, in1, instName); |
| Pointer outPointer = getDensePointer(gCtx, out, instName); |
| sliceSparseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1); |
| } |
| else { |
| // Input in1 is in dense format (see inPointer) |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1); |
| |
| Pointer inPointer = getDensePointer(gCtx, in1, instName); |
| Pointer outPointer = getDensePointer(gCtx, out, instName); |
| sliceDenseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1); |
| } |
| } |
| |
| /** |
| * Perform slice operation on dense input and output it in dense format |
| * |
| * @param gCtx gpu context |
| * @param instName instruction name |
| * @param inPointer dense input pointer |
| * @param outPointer dense output pointer (does not need to be zeroed out) |
| * @param rl row lower |
| * @param ru row upper |
| * @param cl column lower |
| * @param cu column upper |
| * @param inClen input number of columns |
| */ |
| protected static void sliceDenseDense(GPUContext gCtx, String instName, Pointer inPointer, Pointer outPointer, |
| int rl, int ru, int cl, int cu, int inClen) { |
| long retClen = cu - cl + 1; |
| if (inClen == retClen) { |
| cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * sizeOfDataType), (ru - rl + 1) * inClen |
| * sizeOfDataType, cudaMemcpyDeviceToDevice); |
| } else { |
| long retRlen = ru - rl + 1; |
| getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(retRlen*retClen)), |
| inPointer, outPointer, rl, ru, cl, cu, inClen, retRlen, retClen); |
| } |
| } |
| |
| /** |
| * Perform slice operation on sparse input and output it in dense format |
| * |
| * @param gCtx gpu context |
| * @param instName instruction name |
| * @param inPointer sparse CSR input pointer |
| * @param outPointer dense output pointer (expected to be zeroed out) |
| * @param rl row lower |
| * @param ru row upper |
| * @param cl column lower |
| * @param cu column upper |
| * @param inClen number of columns of input matrix |
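| * |
| * <p>Kernel selection (see the condition below): a wide, short result such as a |
| * {@code 1 x 1000} slice of a {@code 1000 x 1000} matrix satisfies |
| * {@code inClen > 10 && retClen > 2 * retRlen} and therefore uses the nnz-parallel kernel |
| * {@code slice_sparse_dense_nnz}; otherwise the row-parallel kernel |
| * {@code slice_sparse_dense_row} is used.</p> |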
| */ |
| protected static void sliceSparseDense(GPUContext gCtx, String instName, CSRPointer inPointer, Pointer outPointer, |
| int rl, int ru, int cl, int cu, int inClen) { |
| int size = getNnz(inPointer, rl, ru); |
| // Return since nnz of the output is 0 as outPointer is expected to be zeroed out. |
| if(size == 0) return; |
| |
| int retRlen = ru - rl + 1; |
| int retClen = cu - cl + 1; |
| |
| String kernel = null; |
| // Note: the row-wise parallelization scheme iterates over the input rows in a single thread, |
| // whereas the nnz parallelization scheme iterates over the number of output rows in a single thread. |
| if(inClen > 10 && retClen > 2*retRlen) { |
| // Perform nnz parallelization for wide and short matrices |
| kernel = "slice_sparse_dense_nnz"; |
| } |
| else { |
| size = retRlen; |
| kernel = "slice_sparse_dense_row"; |
| } |
| |
| // Performs a slice operation where the input matrix is sparse and the output matrix is dense. |
| // This function avoids unnecessary sparse to dense conversion of the input matrix. |
| // We can generalize this later to output a sparse matrix. |
| getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), |
| inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu, retClen); |
| } |
| |
| /** |
| * Returns the number of non-zeroes in the given range of rows |
| * |
| * @param inPointer input CSR pointer |
| * @param rl lower row index (inclusive and zero-based) |
| * @param ru upper row index (inclusive and zero-based) |
| * @return number of non-zeroes |
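| * |
| * <p>Worked example: for a 3-row CSR matrix with {@code rowPtr = [0, 2, 5, 7]}, the rows |
| * 1..2 (zero-based, inclusive) contain {@code rowPtr[3] - rowPtr[1] = 7 - 2 = 5} non-zeroes.</p> |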
| */ |
| private static int getNnz(CSRPointer inPointer, int rl, int ru) { |
| int[] rlPtr = { -1 }; int[] ruPtr = { -1 }; |
| cudaMemcpy(Pointer.to(rlPtr), inPointer.rowPtr.withByteOffset(rl*Sizeof.INT), Sizeof.INT, cudaMemcpyDeviceToHost); |
| cudaMemcpy(Pointer.to(ruPtr), inPointer.rowPtr.withByteOffset((ru+1)*Sizeof.INT), Sizeof.INT, cudaMemcpyDeviceToHost); |
| return ruPtr[0] - rlPtr[0]; |
| } |
| |
| public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx); |
| } |
| |
| long rowsA = toInt(in1.getNumRows()); |
| long colsA = toInt(in1.getNumColumns()); |
| long rowsB = toInt(in2.getNumRows()); |
| long colsB = toInt(in2.getNumColumns()); |
| |
| if (rowsA != rowsB) { |
| throw new DMLRuntimeException("GPU : Invalid internal state - the rows must match up for a cbind operation"); |
| } |
| |
| // only Dense supported |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsA, colsA + colsB); |
| Pointer C = getDensePointer(gCtx, out, instName); |
| Pointer A = getDensePointer(gCtx, in1, instName); |
| Pointer B = getDensePointer(gCtx, in2, instName); |
| |
| int maxRows = toInt(Math.max(rowsA, rowsB)); |
| int maxCols = toInt(Math.max(colsA, colsB)); |
| |
| getCudaKernels(gCtx).launchKernel("cbind", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), |
| A, B, C, rowsA, colsA, rowsB, colsB); |
| } |
| |
| public static void rbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx); |
| } |
| |
| int rowsA = toInt(in1.getNumRows()); |
| int colsA = toInt(in1.getNumColumns()); |
| int rowsB = toInt(in2.getNumRows()); |
| int colsB = toInt(in2.getNumColumns()); |
| |
| if (colsA != colsB){ |
| throw new DMLRuntimeException("GPU : Invalid internal state - the columns must match up for a rbind operation"); |
| } |
| |
| // only Dense supported |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsA + rowsB, colsA); |
| Pointer C = getDensePointer(gCtx, out, instName); |
| Pointer A = getDensePointer(gCtx, in1, instName); |
| Pointer B = getDensePointer(gCtx, in2, instName); |
| |
| int maxRows = Math.max(rowsA, rowsB); |
| int maxCols = Math.max(colsA, colsB); |
| |
| getCudaKernels(gCtx).launchKernel("rbind", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), |
| A, B, C, rowsA, colsA, rowsB, colsB); |
| } |
| |
| |
| //********************************************************************/ |
| //*********** End of Matrix Manipulation Functions *******************/ |
| //********************************************************************/ |
| |
| |
| //********************************************************************/ |
| //************************ Builtin Functions *************************/ |
| //********************************************************************/ |
| |
| /** |
| * Performs an "exp" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : exp" + ", GPUContext=" + gCtx); |
| } |
| // e^0 = 1, create a dense block full of 1s |
| unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, GPUInstruction.MISC_TIMER_EXP_KERNEL); |
| } |
| |
| /** |
| * Performs an "sqrt" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx); |
| } |
| // sqrt(0) = 0, create a dense block full of 0s |
| unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, GPUInstruction.MISC_TIMER_SQRT_KERNEL); |
| } |
| |
| /** |
| * Performs an "round" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : round" + ", GPUContext=" + gCtx); |
| } |
| // round(0) = 0, create a dense block full of 0s |
| unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, GPUInstruction.MISC_TIMER_ROUND_KERNEL); |
| } |
| |
| /** |
| * Performs an "abs" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : abs" + ", GPUContext=" + gCtx); |
| } |
| // abs(0) = 0, create a dense block full of 0s |
| unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, GPUInstruction.MISC_TIMER_ABS_KERNEL); |
| } |
| |
| /** |
| * Performs an "log" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : log" + ", GPUContext=" + gCtx); |
| } |
| // log(0) = -Inf |
| unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL); |
| } |
| |
| /** |
| * Performs an "floor" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : floor" + ", GPUContext=" + gCtx); |
| } |
| // floor(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, GPUInstruction.MISC_TIMER_FLOOR_KERNEL); |
| } |
| |
| /** |
| * Performs an "ceil" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx); |
| } |
| // ceil(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, GPUInstruction.MISC_TIMER_CEIL_KERNEL); |
| } |
| |
| /** |
| * Performs an "sin" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sin" + ", GPUContext=" + gCtx); |
| } |
| // sin(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIN_KERNEL); |
| } |
| |
| /** |
| * Performs an "cos" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : cos" + ", GPUContext=" + gCtx); |
| } |
| // cos(0) = 1 |
| unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, GPUInstruction.MISC_TIMER_COS_KERNEL); |
| } |
| |
| /** |
| * Performs an "tan" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : tan" + ", GPUContext=" + gCtx); |
| } |
| // tan(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, GPUInstruction.MISC_TIMER_TAN_KERNEL); |
| } |
| |
| /** |
| * Performs an "sinh" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void sinh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sinh" + ", GPUContext=" + gCtx); |
| } |
| // sin(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_sinh", 0, outputName, instName, GPUInstruction.MISC_TIMER_SINH_KERNEL); |
| } |
| |
| /** |
| * Performs an "cosh" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void cosh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : cosh" + ", GPUContext=" + gCtx); |
| } |
| // cos(0) = 1 |
| unaryOp(ec, gCtx, in1, "matrix_cosh", 1, outputName, instName, GPUInstruction.MISC_TIMER_COSH_KERNEL); |
| } |
| |
| /** |
| * Performs an "tanh" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void tanh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : tanh" + ", GPUContext=" + gCtx); |
| } |
| // tan(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_tanh", 0, outputName, instName, GPUInstruction.MISC_TIMER_TANH_KERNEL); |
| } |
| |
| /** |
| * Performs an "asin" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : asin" + ", GPUContext=" + gCtx); |
| } |
| // asin(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, GPUInstruction.MISC_TIMER_ASIN_KERNEL); |
| } |
| |
| /** |
| * Performs an "acos" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : acos" + ", GPUContext=" + gCtx); |
| } |
| // acos(0) = PI/2 |
| unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL); |
| } |
| |
| /** |
| * Performs an "atan" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : atan" + ", GPUContext=" + gCtx); |
| } |
| // atan(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, GPUInstruction.MISC_TIMER_ATAN_KERNEL); |
| } |
| |
| /** |
| * Performs an "sign" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sign" + ", GPUContext=" + gCtx); |
| } |
| // sign(0) = 0 |
| unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIGN_KERNEL); |
| } |
| |
| /** |
| * Performs an "sigmoid" operation on a matrix on the GPU |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix |
| * @param outputName output matrix name |
| */ |
| public static void sigmoid(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : sigmoid" + ", GPUContext=" + gCtx); |
| } |
| // sigmoid(0) = 0.5 |
| unaryOp(ec, gCtx, in1, "matrix_sigmoid", 0.5, outputName, instName, GPUInstruction.MISC_TIMER_SIGMOID_KERNEL); |
| } |
| |
| /** |
| * Calculate the greatest common divisor recursively |
| * @param a a number |
| * @param b another number |
| * @return greatest common divisor |
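| * |
| * <p>For example, {@code gcd(12, 18)} evaluates as |
| * {@code gcd(12, 18) -> gcd(18, 12) -> gcd(12, 6) -> gcd(6, 0) = 6}.</p> |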
| */ |
| private static int gcd(int a, int b) |
| { |
| return b == 0 ? a : gcd(b, a % b); |
| } |
| |
| /** |
| * Utility method to print kernel timings |
| * @param time a Timing object |
| * @param kernelFunction Name of the executed kernel function |
| * @param total_duration Sum of all execution times |
| * @param cascade_blocks Number of blocks in the timed iteration of the cascade |
| * @return accumulated time |
| */ |
| private static double printKernelTiming(Timing time, String kernelFunction, double total_duration, int cascade_blocks) { |
| if (LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| double duration = time.stop(); |
| total_duration += duration; |
| if(cascade_blocks > 0) |
| LOG.trace("uop kernel function " + kernelFunction + " (cascading_blocks=" + cascade_blocks + ") executed in " + duration + "ms."); |
| else |
| LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms."); |
| time.start(); |
| return total_duration; |
| } |
| else |
| return 0.0; |
| } |
| |
| /** |
| * Cumulative scan |
| * @param ec valid execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param kernelFunction The name of the cuda kernel to call |
| * @param in input matrix |
| * @param outputName output matrix name |
| */ |
| public static void cumulativeScan(ExecutionContext ec, GPUContext gCtx, String instName, String kernelFunction, MatrixObject in, String outputName) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : cumulative scan (" + GPUInstruction.MISC_TIMER_CUMULATIVE_SCAN_KERNEL + ") for instruction " + instName + |
| " , GPUContext=" + gCtx); |
| } |
| |
| int rows = toInt(in.getNumRows()); |
| int cols = toInt(in.getNumColumns()); |
| |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); |
| int[] tmp = getKernelParamsForCumScan(gCtx, rows, cols); |
| int blocks_x = tmp[0], blocks_y = tmp[1], threads_x = tmp[2], sharedMem = 0, block_height = tmp[3]; |
| |
| if (blocks_y > 1) { |
| Timing time = new Timing(false); |
| double duration = 0; |
| double alloc_duration = 0; |
| double total_duration = 0; |
| if(LOG.isTraceEnabled()) { time.start(); } |
| |
| Pointer input = getDensePointer(gCtx, in, instName); |
| Pointer output = getDensePointer(gCtx, out, instName); |
| // storage for last value of each block |
| Pointer blk_res = gCtx.allocate(instName, cols * blocks_y * sizeOfDataType); |
| |
| alloc_duration = printKernelTiming(time, "allocation of temporary buffer (" + |
| cols * blocks_y * sizeOfDataType + " bytes)", alloc_duration, 0); |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction + "_up_sweep", new ExecutionConfig(blocks_x, blocks_y, |
| threads_x, 1, 0), input, blk_res, rows, cols, block_height); |
| |
| total_duration = printKernelTiming(time, kernelFunction + "_up_sweep", total_duration, 0); |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction + "_down_sweep", new ExecutionConfig(blocks_x, 1, |
| threads_x, 1, 0), blk_res, blk_res, blk_res, blocks_y, cols, blocks_y); |
| |
| total_duration = printKernelTiming(time, kernelFunction + "_down_sweep", total_duration, 0); |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction + "_down_sweep", new ExecutionConfig(blocks_x, |
| blocks_y, threads_x, 1, 0), input, output, blk_res, rows, cols, block_height); |
| |
| total_duration = printKernelTiming(time, "final cumulative_scan_down_sweep", total_duration, 0); |
| if(LOG.isTraceEnabled()) { LOG.trace("total kernel execution time: " + total_duration + "ms."); } |
| |
| gCtx.cudaFreeHelper(instName, blk_res, DMLScript.EAGER_CUDA_FREE); |
| if(LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| duration = time.stop(); |
| alloc_duration += duration; |
| LOG.trace("freeing of temporary buffer " + " executed in " + duration + "ms."); |
| LOG.trace("total memory mgmt execution time: " + alloc_duration + "ms."); |
| LOG.trace("total execution time (kernel + mem): " + (total_duration + alloc_duration) + "ms."); |
| } |
| } |
| else { |
| Pointer input = getDensePointer(gCtx, in, instName); |
| Pointer output = getDensePointer(gCtx, out, instName); |
| |
| Timing time = new Timing(true); |
| double duration = 0; |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction + "_down_sweep", |
| new ExecutionConfig(blocks_x, 1, threads_x, 1, sharedMem), input, output, input, rows, cols, block_height); |
| |
| if(LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| duration = time.stop(); |
| LOG.trace("total kernel execution time: " + duration + "ms."); |
| } |
| } |
| } |
| |
| /** |
| * Cumulative sum-product kernel cascade invocation |
| * @param ec valid execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param kernelFunction The name of the cuda kernel to call |
| * @param in input matrix |
| * @param outputName output matrix name |
| */ |
| public static void cumulativeSumProduct(ExecutionContext ec, GPUContext gCtx, |
| String instName, String kernelFunction, MatrixObject in, String outputName) |
| { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : cumulative sum product for " + GPUInstruction.MISC_TIMER_CUMULATIVE_SCAN_KERNEL + ", GPUContext=" + gCtx); |
| } |
| |
| Timing time = new Timing(false); |
| |
| double total_duration = 0, alloc_duration = 0; |
| int rows = toInt(in.getNumRows()); |
| if(LOG.isTraceEnabled()) { time.start(); } |
| |
| if(rows > 128) { |
| final int MAX_BLOCKS = getMaxBlocks(gCtx); |
| ArrayList<Pointer> intermediate_buffers = new ArrayList<>(); |
| ArrayList<Integer> cb_list = new ArrayList<>(); |
| |
| int block_height = 64; |
| int blocks = (rows + block_height - 1) / block_height; |
| if(blocks > MAX_BLOCKS) { |
| blocks = MAX_BLOCKS; |
| block_height = nextPow2((rows + (blocks - 1)) / blocks); |
| blocks = (rows + block_height - 1) / block_height; |
| } |
| |
| int threads = 1; |
| int cascade_blocks = (blocks + block_height - 1) / block_height; |
| cb_list.add(cascade_blocks); |
| |
| long total_mem_size = 0; |
| while( cascade_blocks > 0) { |
| long buf_size = 2 * block_height * cascade_blocks * sizeOfDataType; |
| total_mem_size += buf_size; |
| intermediate_buffers.add(gCtx.allocate(instName, buf_size)); |
| cascade_blocks = (cascade_blocks + block_height - 2) / block_height; |
| if(cascade_blocks > 0) |
| cb_list.add(cascade_blocks); |
| } |
| alloc_duration = printKernelTiming(time, "allocation of temporary buffer (" |
| + total_mem_size + " bytes)", alloc_duration, 0); |
| cascade_blocks = blocks; |
| |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), 1); |
| Pointer input = getDensePointer(gCtx, in, instName); |
| Pointer output = getDensePointer(gCtx, out, instName); |
| |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("Launch configuration for cumulative aggregate: blocks=" + blocks + |
| " block_height=" + block_height + " threads=" + threads); |
| } |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads), input, |
| 0, 0, intermediate_buffers.get(0), rows, block_height, 0); |
| |
| int cascade_level = 0; |
| |
| while(cascade_level < intermediate_buffers.size() - 1) { |
| total_duration = printKernelTiming(time, kernelFunction, total_duration, cascade_blocks); |
| cascade_blocks = cb_list.get(cascade_level); |
| getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cascade_blocks, threads), |
| intermediate_buffers.get(cascade_level), intermediate_buffers.get(cascade_level), 0, |
| intermediate_buffers.get(cascade_level+1), rows / ((cascade_level+1) * block_height), block_height, 1); |
| cascade_level++; |
| } |
| |
| while (cascade_level > 0) { |
| total_duration = printKernelTiming(time, kernelFunction, total_duration, cascade_blocks); |
| cascade_level--; |
| cascade_blocks = cb_list.get(cascade_level); |
| |
| getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cascade_blocks, threads), |
| intermediate_buffers.get(cascade_level), intermediate_buffers.get(cascade_level), |
| intermediate_buffers.get(cascade_level+1), 0, rows / ((cascade_level+1) * block_height), block_height, 2); |
| } |
| |
| total_duration = printKernelTiming(time, kernelFunction, total_duration, cascade_blocks); |
| getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads), input, |
| output, intermediate_buffers.get(0), 0, rows, block_height, 3); |
| if(LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| double duration = time.stop(); |
| total_duration += duration; |
| LOG.trace("final cascade ("+ kernelFunction + ", cascading_blocks=" + blocks + ") executed in " + duration + "ms."); |
| LOG.trace("total kernel execution time: " + total_duration + "ms."); |
| time.start(); |
| } |
| |
| for(int j = 0; j < intermediate_buffers.size(); ++j) |
| gCtx.cudaFreeHelper(instName, intermediate_buffers.get(j), DMLScript.EAGER_CUDA_FREE); |
| |
| if(LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| double duration = time.stop(); |
| alloc_duration += duration; |
| LOG.trace("freeing of temporary buffer " + " executed in " + duration + "ms."); |
| LOG.trace("total memory mgmt execution time: " + alloc_duration + "ms."); |
| LOG.trace("total execution time (kernel + mem): " + (total_duration + alloc_duration) + "ms."); |
| } |
| } |
| else { |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), 1); |
| Pointer input = getDensePointer(gCtx, in, instName); |
| Pointer output = getDensePointer(gCtx, out, instName); |
| getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(1, 1), input, |
| output, 0, 0, rows, rows); |
| |
| if(LOG.isTraceEnabled()) { |
| cudaDeviceSynchronize(); |
| double duration = time.stop(); |
| total_duration += duration; |
| LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms."); |
| LOG.trace("total kernel execution time: " + total_duration + "ms."); |
| } |
| } |
| } |
| |
| /** |
| * Get the kernel launch parameters (blocks, threads and block height) for a cumulative scan along columns |
| * @param gCtx a valid {@link GPUContext} |
| * @param rows number of rows in input matrix |
| * @param cols number of columns in input matrix |
| * @return integer array containing {blocks_x, blocks_y, threads_x, block_height} |
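| * |
| * <p>Worked example (assuming a device with {@code MAX_THREADS = 1024}, {@code WARP_SIZE = 32} |
| * and a sufficiently large {@code maxGridSize[1]}), for {@code rows = 10000, cols = 300}:</p> |
| * <pre>{@code |
| * // t1 = 300 % 1024 = 300; t2 = ceil(300 / 32) = 10; t3 = 10 * 32 = 320 |
| * // threads_x = gcd(1024, 320) = 64; blocks_x = ceil(300 / 64) = 5 |
| * // block_height = max(8, 1024 / 64) = 16; blocks_y = ceil(10000 / 16) = 625 |
| * // result: {5, 625, 64, 16} |
| * }</pre> |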
| */ |
| private static int[] getKernelParamsForCumScan(GPUContext gCtx, int rows, int cols) { |
| final int MAX_THREADS = getMaxThreads(gCtx); |
| final int WARP_SIZE = getWarpSize(gCtx); |
| final int MAX_BLOCKS_Y = gCtx.getGPUProperties().maxGridSize[1]; |
| |
| int t1 = cols % MAX_THREADS; |
| int t2 = (t1 + WARP_SIZE - 1) / WARP_SIZE; |
| int t3 = t2 * WARP_SIZE; |
| int threads_x = gcd(MAX_THREADS, t3); |
| int blocks_x = Math.max(1, (cols + (threads_x - 1)) / (threads_x)); |
| |
| int block_height = Math.max(8, MAX_THREADS / threads_x); |
| int blocks_y = (rows + block_height - 1) / block_height; |
| int min_loop_length = 128; |
| if(rows <= min_loop_length) { |
| block_height = rows; |
| blocks_y = 1; |
| } |
| |
| if(blocks_y > MAX_BLOCKS_Y) { |
| block_height = Math.max(2, 2 * rows / MAX_BLOCKS_Y); |
| blocks_y = (rows + block_height - 1) / block_height; |
| } |
| |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("Launch configuration for cumulative aggregate: blocks_x=" + blocks_x + " blocks_y=" + |
| blocks_y + " block_height=" + block_height + " threads_x=" + threads_x); |
| } |
| |
| return new int[] {blocks_x, blocks_y, threads_x, block_height}; |
| } |
| |
| /** |
| * A helper function for all unary ops (sqrt, abs, sin, etc.) |
| * @param ec valid execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param in1 input matrix |
| * @param kernel name of CUDA kernel for the unary op to execute |
| * @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block |
| * @param outputName output matrix name |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param kernelTimer the name of the timer to measure the kernel invocation |
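| * |
| * <p>For instance, {@link #exp(ExecutionContext, GPUContext, String, MatrixObject, String)} |
| * calls this helper with fill value 1, because {@code e^0 = 1} and a sparse-and-empty input |
| * therefore yields a dense output filled with 1s:</p> |
| * <pre>{@code |
| * unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, GPUInstruction.MISC_TIMER_EXP_KERNEL); |
| * }</pre> |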
| */ |
| private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| GPUObject in = in1.getGPUObject(gCtx); |
| boolean isSparseAndEmpty = in.isSparseAndEmpty(); |
| if (isSparseAndEmpty) { |
| MatrixObject out = ec.getMatrixObject(outputName); |
| ec.allocateGPUMatrixObject(outputName, in1.getNumRows(), in1.getNumColumns()); |
| out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue); |
| } else { |
| // Dense |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns()); |
| Pointer output = getDensePointer(gCtx, out, instName); |
| Pointer input = getDensePointer(gCtx, in1, instName); |
| int size = toInt(in1.getNumColumns() * in1.getNumRows()); |
| getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size); |
| } |
| } |
| |
| /** |
| * Performs daxpy operation |
| * |
| * @param ec execution context |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for record {@link Statistics}. |
| * @param in1 input matrix 1 |
| * @param in2 input matrix 2 |
| * @param outputName output matrix name |
| * @param constant scalar constant (the alpha multiplier) |
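| * |
| * <p>For equally sized inputs this reduces to a plain cuBLAS axpy, sketched as:</p> |
| * <pre>{@code |
| * // C <- A                  (device-to-device copy) |
| * // C <- constant * B + C   (cublasaxpy) |
| * }</pre> |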
| */ |
| public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, |
| String outputName, double constant) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| Pointer A = getDensePointer(gCtx, in1, instName); |
| Pointer B = getDensePointer(gCtx, in2, instName); |
| MatrixObject out = ec.getMatrixObject(outputName); |
| getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns()); // Allocated the dense output matrix |
| Pointer C = getDensePointer(gCtx, out, instName); |
| |
| if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx); |
| } |
| |
| // Matrix-Matrix daxpy |
| long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix |
| Pointer alphaPtr = dataTypePointerTo(constant); |
| // C <- A + alpha*B |
| // becomes |
| // C <- A |
| // C <- alpha*B + C |
| cudaMemcpy(C, A, n*sizeOfDataType, cudaMemcpyDeviceToDevice); |
| cudaSupportFunctions.cublasaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1); |
| } |
| else { |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx); |
| } |
| |
| // Matrix-Vector daxpy |
| // Note: Vector-Matrix operation is not supported |
| // daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) |
| int rlenA = toInt(in1.getNumRows()); int clenA = toInt(in1.getNumColumns()); |
| int rlenB = toInt(in2.getNumRows()); int clenB = toInt(in2.getNumColumns()); |
| getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", |
| ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), |
| A, B, constant, C, rlenA, clenA, rlenB, clenB); |
| } |
| } |
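| // Illustrative sketch (hypothetical variable names): a DML expression such as C = A + 2.5 * B would be |
| // dispatched roughly as |
| //   axpy(ec, gCtx, instName, A, B, "C", 2.5); |
| // while C = A - 2.5 * B corresponds to constant = -2.5. |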
| |
| |
| /** |
| * Implements the "solve" function for systemds Ax = B (A is of size m*n, B is of size m*1, x is of size n*1) |
| * |
| * @param ec a valid {@link ExecutionContext} |
| * @param gCtx a valid {@link GPUContext} |
| * @param instName the invoking instruction's name for recording {@link Statistics}. |
| * @param in1 input matrix A |
| * @param in2 input matrix B |
| * @param outputName name of the output matrix |
| */ |
| public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) { |
| if (ec.getGPUContext(0) != gCtx) |
| throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); |
| |
| // x = solve(A, b) |
| if(LOG.isTraceEnabled()) { |
| LOG.trace("GPU : solve" + ", GPUContext=" + gCtx); |
| } |
| |
| GPUObject Aobj = in1.getGPUObject(gCtx); |
| if (isInSparseFormat(gCtx, in1)) |
| Aobj.sparseToDense(instName); |
| |
| GPUObject bobj = in2.getGPUObject(gCtx); |
| if (isInSparseFormat(gCtx, in2)) |
| bobj.sparseToDense(instName); |
| |
| int m = (int) in1.getNumRows(); |
| int n = (int) in1.getNumColumns(); |
| if ((int) in2.getNumRows() != m) |
| throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B"); |
| if ((int) in2.getNumColumns() != 1) |
| throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1"); |
| |
| |
| // Copy over the inputs and convert the dense matrices to column major: |
| // operations in cuSOLVER and cuBLAS expect column-major dense matrices |
| // and are destructive to the original input |
| GPUObject ATobj = (GPUObject) Aobj.clone(); |
| ATobj.denseRowMajorToColumnMajor(); |
| Pointer A = ATobj.getDensePointer(); |
| |
| GPUObject bTobj = (GPUObject) bobj.clone(); |
| bTobj.denseRowMajorToColumnMajor(); |
| |
| |
| Pointer b = bTobj.getDensePointer(); |
| |
| // The following set of operations is done following the example in the cusolver documentation |
| // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1 |
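| // Least-squares solve via QR: factor A = Q*R (geqrf), overwrite b with Q^T*b (ormqr), |
| // then back-substitute x = R \ (Q^T*b) on the upper-triangular R (trsm) |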
| |
| // step 3: query working space of geqrf and ormqr |
| int[] lwork = {0}; |
| cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork); |
| |
| // step 4: compute QR factorization |
| Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType); |
| Pointer tau = gCtx.allocate(instName, m * sizeOfDataType); |
| Pointer devInfo = gCtx.allocate(instName, Sizeof.INT); |
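| // geqrf overwrites A in place: R is stored in the upper triangle, while the Householder |
| // reflectors that define Q are stored below the diagonal with their scalar factors in tau |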
| cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo); |
| |
| int[] qrError = {-1}; |
| cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); |
| if (qrError[0] != 0) { |
| throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); |
| } |
| |
| // step 5: compute Q^T*B |
| cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo); |
| cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); |
| if (qrError[0] != 0) { |
| throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); |
| } |
| |
| // step 6: compute x = R \ Q^T*B |
| cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(), |
| cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, |
| n, 1, dataTypePointerTo(1.0), A, m, b, m); |
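| // The solution x now occupies the first n entries of b, which is why only n elements |
| // are copied into the n x 1 output below |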
| |
| bTobj.denseColumnMajorToRowMajor(); |
| |
| // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash |
| // There is an avoidable copy happening here |
| MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1); |
| cudaMemcpy(out.getGPUObject(gCtx).getDensePointer(), bTobj.getDensePointer(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice); |
| |
| gCtx.cudaFreeHelper(instName, work, DMLScript.EAGER_CUDA_FREE); |
| gCtx.cudaFreeHelper(instName, tau, DMLScript.EAGER_CUDA_FREE); |
| ATobj.clearData(instName, DMLScript.EAGER_CUDA_FREE); |
| bTobj.clearData(instName, DMLScript.EAGER_CUDA_FREE); |
| |
| //debugPrintMatrix(b, n, 1); |
| } |
| |
| //********************************************************************/ |
| //***************** END OF Builtin Functions ************************/ |
| //********************************************************************/ |
| |
| /** |
| * Helper method to get the output block (allocated on the GPU) |
| * Also records performance information into {@link Statistics} |
| * @param ec active {@link ExecutionContext} |
| * @param instName the invoking instruction's name for recording {@link Statistics}. |
| * @param name name of the output matrix (that the {@link ExecutionContext} is aware of) |
| * @param numRows number of rows of output matrix object |
| * @param numCols number of columns of output matrix object |
| * @return the matrix object |
| */ |
| public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) { |
| return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey(); |
| } |
| |
| /** |
| * Helper method to get the output block (allocated on the GPU) |
| * Also records performance information into {@link Statistics} |
| * @param ec active {@link ExecutionContext} |
| * @param numRows number of rows of the output matrix object |
| * @param numCols number of columns of the output matrix object |
| * @param nnz number of non-zeros in the output matrix |
| * @param instName the invoking instruction's name for recording {@link Statistics}. |
| * @param name name of the output matrix (that the {@link ExecutionContext} is aware of) |
| * @return the matrix object |
| */ |
| private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) { |
| return ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz).getKey(); |
| } |
| |
| /** |
| * Utility to compute number of non-zeroes on the GPU |
| * |
| * @param gCtx the associated GPUContext |
| * @param densePtr device pointer to the dense matrix |
| * @param length number of elements pointed to by the dense pointer |
| * @return the number of non-zeroes |
| */ |
| public static synchronized int computeNNZ(GPUContext gCtx, Pointer densePtr, int length) { |
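| // A single reduce-all pass over the dense data with the "compute_nnz" kernel counts the |
| // non-zero entries (each element contributes 1 if non-zero, 0 otherwise) |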
| return (int) reduceAll(gCtx, null, "compute_nnz", densePtr, length); |
| // Note: the commented-out cuSPARSE-based alternative below is extremely slow |
| // cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor(); |
| // cusparseHandle cusparseHandle = gCtx.getCusparseHandle(); |
| // if(_TMP_NNZ_ROW_PTR == null) { |
| // // As these are 4-byte pointers, using cudaMalloc directly so as not to include them in memory information. |
| // _TMP_NNZ_ROW_PTR = new Pointer(); |
| // cudaMalloc(_TMP_NNZ_ROW_PTR, jcuda.Sizeof.INT); |
| // _TMP_NNZ_PTR = new Pointer(); |
| // cudaMalloc(_TMP_NNZ_PTR, jcuda.Sizeof.INT); |
| // // _TMP_NNZ_ROW_PTR = gCtx.allocate(jcuda.Sizeof.INT); |
| // // _TMP_NNZ_PTR = gCtx.allocate(jcuda.Sizeof.INT); |
| // } |
| // // Output is in dense vector format, convert it to CSR |
| // LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, 1, length, matDescr, densePtr, 1, |
| // _TMP_NNZ_ROW_PTR, _TMP_NNZ_PTR); |
| // int[] nnzC = { -1 }; |
| // cudaMemcpy(Pointer.to(nnzC), _TMP_NNZ_PTR, jcuda.Sizeof.INT, cudaMemcpyDeviceToHost); |
| // return nnzC[0]; |
| } |
| } |