/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.matrix.data;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing;
import org.apache.sysds.runtime.functionobjects.And;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.CM;
import org.apache.sysds.runtime.functionobjects.Divide;
import org.apache.sysds.runtime.functionobjects.Equals;
import org.apache.sysds.runtime.functionobjects.GreaterThan;
import org.apache.sysds.runtime.functionobjects.GreaterThanEquals;
import org.apache.sysds.runtime.functionobjects.IndexFunction;
import org.apache.sysds.runtime.functionobjects.IntegerDivide;
import org.apache.sysds.runtime.functionobjects.KahanPlus;
import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
import org.apache.sysds.runtime.functionobjects.LessThan;
import org.apache.sysds.runtime.functionobjects.LessThanEquals;
import org.apache.sysds.runtime.functionobjects.Mean;
import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Minus1Multiply;
import org.apache.sysds.runtime.functionobjects.MinusNz;
import org.apache.sysds.runtime.functionobjects.Modulus;
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Multiply2;
import org.apache.sysds.runtime.functionobjects.NotEquals;
import org.apache.sysds.runtime.functionobjects.Or;
import org.apache.sysds.runtime.functionobjects.Plus;
import org.apache.sysds.runtime.functionobjects.Power;
import org.apache.sysds.runtime.functionobjects.Power2;
import org.apache.sysds.runtime.functionobjects.ReduceAll;
import org.apache.sysds.runtime.functionobjects.ReduceCol;
import org.apache.sysds.runtime.functionobjects.ReduceDiag;
import org.apache.sysds.runtime.functionobjects.ReduceRow;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
import org.apache.sysds.runtime.instructions.cp.DoubleObject;
import org.apache.sysds.runtime.instructions.gpu.GPUInstruction;
import org.apache.sysds.runtime.instructions.gpu.context.CSRPointer;
import org.apache.sysds.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysds.runtime.instructions.gpu.context.JCudaKernels;
import org.apache.sysds.runtime.matrix.operators.AggregateOperator;
import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
import org.apache.sysds.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
import org.apache.sysds.runtime.util.IndexRange;
import org.apache.sysds.utils.Statistics;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.cublasDiagType;
import jcuda.jcublas.cublasFillMode;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasOperation;
import jcuda.jcublas.cublasSideMode;
import jcuda.jcusparse.cusparseAction;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseIndexBase;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import java.lang.Math;
import java.util.ArrayList;
/**
* All CUDA kernels and library calls are redirected through this class
* @see GPUContext
* @see GPUObject
*/
public class LibMatrixCUDA {
private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
protected static int CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
// The below variables are used in CSRPointer, GPUObjects, etc.
public static CudaSupportFunctions cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
public static int sizeOfDataType = jcuda.Sizeof.DOUBLE;
public static String customKernelSuffix = "_d";
/**
* Sets the internal state based on DMLScript.FLOATING_POINT_PRECISION
*/
public static void resetFloatingPointPrecision() {
if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("double")) {
LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
LibMatrixCUDA.cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.DOUBLE;
LibMatrixCUDA.customKernelSuffix = "_d";
}
else if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("single")) {
LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_FLOAT;
LibMatrixCUDA.cudaSupportFunctions = new SinglePrecisionCudaSupportFunctions();
LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.FLOAT;
LibMatrixCUDA.customKernelSuffix = "_f";
}
else {
throw new DMLRuntimeException("Unsupported floating point precision: " + DMLScript.FLOATING_POINT_PRECISION);
}
}
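// Minimal usage sketch (illustrative only; assumes DMLScript.FLOATING_POINT_PRECISION is
// writable at this point and that no GPU data has been allocated yet):
//
//   DMLScript.FLOATING_POINT_PRECISION = "single";
//   LibMatrixCUDA.resetFloatingPointPrecision();
//   // sizeOfDataType is now jcuda.Sizeof.FLOAT and custom kernels resolve to their "_f" variants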
// Assume Compute Capability 3.0
// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
// MAX_THREADS is 1024 For compute capability > 3.0
// WarpSize is usually 32
// Shared mem is mostly at max 48 KB for current GPU
private static int _MAX_THREADS = -1;
private static int _MAX_BLOCKS = -1;
private static int _WARP_SIZE = -1;
private static long _SHMEM_SIZE = -1;
// From CuDNN 5.1 documentation:
// The total size of a tensor including the potential padding between dimensions is limited to 2 Giga-elements of type datatype.
protected static long maxNumElementsOfCuDNNTensor = 2000000000;
//********************************************************************/
//***************************** UTILS ********************************/
//********************************************************************/
/**
* Utility function to get maximum number of threads supported by the active CUDA device.
* @param gCtx a valid {@link GPUContext}
* @return max threads
*/
static int getMaxThreads(GPUContext gCtx){
if (_MAX_THREADS == -1){
_MAX_THREADS = gCtx.getMaxThreadsPerBlock();
}
return _MAX_THREADS;
}
/**
* Utility function to get maximum number of blocks supported by the active CUDA device.
* @param gCtx a valid {@link GPUContext}
* @return max blocks
*/
static int getMaxBlocks(GPUContext gCtx) {
if (_MAX_BLOCKS == -1){
_MAX_BLOCKS = gCtx.getMaxBlocks();
}
return _MAX_BLOCKS;
}
/**
* Utility function to get the maximum amount of shared memory per block supported by the active CUDA device.
* @param gCtx a valid {@link GPUContext}
* @return maximum shared memory per block (in bytes)
*/
static long getMaxSharedMemory(GPUContext gCtx) {
if (_SHMEM_SIZE == -1) {
_SHMEM_SIZE = gCtx.getMaxSharedMemory();
}
return _SHMEM_SIZE;
}
/**
* Utility function to get the warp size supported by the active CUDA device.
* @param gCtx a valid {@link GPUContext}
* @return warp size
*/
static int getWarpSize(GPUContext gCtx) {
if (_WARP_SIZE == -1) {
_WARP_SIZE = gCtx.getWarpSize();
}
return _WARP_SIZE;
}
public static boolean isInSparseFormat(GPUContext gCtx, MatrixObject mo) {
if(mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated())
return mo.getGPUObject(gCtx).isSparse();
return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
}
/**
* Note: if the matrix is in dense format and recomputeDenseNNZ is set, the number of non-zeros is explicitly re-computed.
*
* @param gCtx a valid GPU context
* @param instName instruction name
* @param mo matrix object
* @param recomputeDenseNNZ recompute NNZ if dense
* @return number of non-zeroes
*/
public static long getNnz(GPUContext gCtx, String instName, MatrixObject mo, boolean recomputeDenseNNZ) {
if(mo.getGPUObject(gCtx) != null && mo.getGPUObject(gCtx).isAllocated()) {
return mo.getGPUObject(gCtx).getNnz(instName, recomputeDenseNNZ);
}
else {
return mo.getNnz();
}
}
protected static cusparseHandle getCusparseHandle(GPUContext gCtx) {
return gCtx.getCusparseHandle();
}
protected static cublasHandle getCublasHandle(GPUContext gCtx) {
return gCtx.getCublasHandle();
}
public static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
return gCtx.getKernels();
}
public static Pointer double2float(GPUContext gCtx, Pointer A, Pointer ret, int numElems) {
getCudaKernels(gCtx).launchKernel("double2float", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
A, ret, numElems);
return ret;
}
public static Pointer float2double(GPUContext gCtx, Pointer A, Pointer ret, int numElems) {
getCudaKernels(gCtx).launchKernel("float2double",
ExecutionConfig.getConfigForSimpleVectorOperations(numElems), A, ret, numElems);
return ret;
}
//********************************************************************/
//************************ End of UTILS ******************************/
//********************************************************************/
//********************************************************************/
//***************** DEEP LEARNING Operators **************************/
//********************************************************************/
private static Pointer _one;
private static Pointer _zero;
private static int oldDataTypeSize;
/**
* Convenience method to get a pointer to the value '1.0' on the device, instead of allocating and deallocating it for every kernel invocation.
* @return jcuda pointer
*/
public static Pointer one() {
if(_one == null || oldDataTypeSize != sizeOfDataType) {
_one = _dataTypePointerTo(1.0);
oldDataTypeSize = sizeOfDataType;
}
return _one;
}
/**
* Convenience method to get a pointer to the value '0.0' on the device, instead of allocating and deallocating it for every kernel invocation.
* @return jcuda pointer
*/
public static Pointer zero() {
if(_zero == null || oldDataTypeSize != sizeOfDataType) {
_zero = _dataTypePointerTo(0.0);
oldDataTypeSize = sizeOfDataType;
}
return _zero;
}
/**
* Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
* @param gCtx a valid {@link GPUContext}
* @param input input matrix object
* @param instName the invoking instruction's name for record {@link Statistics}.
* @return jcuda pointer
*/
public static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
if(isInSparseFormat(gCtx, input)) {
input.getGPUObject(gCtx).sparseToDense(instName);
}
return input.getGPUObject(gCtx).getDensePointer();
}
/**
* Convenience method to get the sparse matrix pointer from a {@link MatrixObject}. Converts dense to sparse if necessary.
* @param gCtx a valid {@link GPUContext}
* @param input input matrix
* @param instName the invoking instruction's name for record {@link Statistics}.
* @return a sparse matrix pointer
*/
protected static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) {
if(!isInSparseFormat(gCtx, input)) {
input.getGPUObject(gCtx).denseToSparse();
}
return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
}
private static Pointer _dataTypePointerTo(double value) {
if(sizeOfDataType == Sizeof.DOUBLE) {
return Pointer.to(new double[] { value });
}
else if(sizeOfDataType == Sizeof.FLOAT) {
return Pointer.to(new float[] { (float) value });
}
else {
throw new RuntimeException("Unsupported datatype with size " + sizeOfDataType);
}
}
protected static Pointer dataTypePointerTo(double value) {
if(value == 1) {
return one();
}
else if(value == 0) {
return zero();
}
else {
return _dataTypePointerTo(value);
}
}
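// Note on one()/zero()/dataTypePointerTo(): these return host-side scalar pointers in the
// currently configured precision, intended as the alpha/beta arguments of cuBLAS/cuSPARSE
// calls (for example, the cublassyrk call in matmultTSMM below passes one() and zero()).
// The cached _one/_zero pointers are rebuilt whenever the data-type size changes, so a
// precision switch via resetFloatingPointPrecision() does not leave stale float/double values.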
/**
* This method computes the backpropagation errors for the previous layer of the relu operation
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param dout error propagated from the next layer
* @param outputBlock output
*/
public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
}
long rows = input.getNumRows();
long cols = input.getNumColumns();
Pointer imagePointer = getDensePointer(gCtx, input, instName);
Pointer doutPointer = getDensePointer(gCtx, dout, instName);
Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
getCudaKernels(gCtx).launchKernel("relu_backward",
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols));
}
/**
* Perform channel_sums operations: out = rowSums(matrix(colSums(A), rows=C, cols=HW))
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param outputBlock output
* @param C number of channels
* @param HW height*width
*/
public static void channelSums(GPUContext gCtx, String instName, MatrixObject input, MatrixObject outputBlock, long C, long HW) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : channelSums" + ", GPUContext=" + gCtx);
}
int N = toInt(input.getNumRows());
int cols = toInt(input.getNumColumns());
if(cols != C*HW) {
throw new DMLRuntimeException("Incorrect parameters, number of columns " + cols + " != " + C + "*" + HW);
}
Pointer imagePointer = getDensePointer(gCtx, input, instName);
Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
// We can replace this with CuDNN tensor reduce
Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType);
reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols);
reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW));
gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
}
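// Worked example (illustrative only): for an input of shape N x (C*HW) with N=2, C=3, HW=4,
// reduce_col_sum first collapses the N rows into a 1 x 12 vector 'tmp' (one sum per column);
// that vector is then treated as a C x HW (3 x 4) matrix and reduce_row_sum collapses HW,
// yielding the 3 x 1 per-channel sums - exactly rowSums(matrix(colSums(A), rows=C, cols=HW)).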
/**
* Performs the operation corresponding to the DML script:
* ones = matrix(1, rows=1, cols=Hout*Wout)
* output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
* This operation is often followed by conv2d and hence we have introduced the bias_multiply(input, bias) built-in function
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param bias bias
* @param outputBlock output
*/
public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
}
if(isInSparseFormat(gCtx, input)) {
input.getGPUObject(gCtx).sparseToDense(instName);
}
if(isInSparseFormat(gCtx, bias)) {
bias.getGPUObject(gCtx).sparseToDense(instName);
}
long rows = input.getNumRows();
long cols = input.getNumColumns();
long K = bias.getNumRows();
long PQ = cols / K;
if(bias.getNumColumns() != 1 || cols % K != 0) {
throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
}
Pointer imagePointer = input.getGPUObject(gCtx).getDensePointer();
Pointer biasPointer = bias.getGPUObject(gCtx).getDensePointer();
Pointer outputPointer = outputBlock.getGPUObject(gCtx).getDensePointer();
getCudaKernels(gCtx).launchKernel("bias_multiply",
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ));
}
/**
* Performs the operation corresponding to the DML script:
* ones = matrix(1, rows=1, cols=Hout*Wout)
* output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
* This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param bias bias
* @param outputBlock output
*/
public static void biasAdd(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) {
Pointer imagePointer = getDensePointer(gCtx, input, instName);
Pointer biasPointer = getDensePointer(gCtx, bias, instName);
Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
int rows = toInt(input.getNumRows());
int cols = toInt(input.getNumColumns());
int K = toInt(bias.getNumRows());
if(bias.getNumColumns() != 1 || cols % K != 0) {
throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
}
biasAdd(gCtx, instName, imagePointer, biasPointer, outputPointer, rows, cols, K);
}
/**
* Performs the operation corresponding to the DML script:
* ones = matrix(1, rows=1, cols=Hout*Wout)
* output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
* This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param image input image
* @param bias bias
* @param output output
* @param rows rows in input image
* @param cols cols in input image
* @param k rows in bias
*/
private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
}
int PQ = cols / k;
getCudaKernels(gCtx).launchKernel("bias_add",
ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
image, bias, output, rows, cols, PQ);
}
//********************************************************************/
//************* End of DEEP LEARNING Operators ***********************/
//********************************************************************/
//********************************************************************/
//********** TRANSPOSE SELF MATRIX MULTIPLY Functions ****************/
//********************************************************************/
/**
* Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...)
* <p>
* Memory Usage - If dense, input space - rows * cols, no intermediate memory, output - Max(rows*rows, cols*cols)
* If sparse, calls matmult
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param left input matrix; in a tsmm expression like A %*% A' or A' %*% A only one operand is needed, so it is passed as 'left' together with a flag indicating whether it is transposed
* @param outputName output matrix name
* @param isLeftTransposed if true, left transposed
*/
public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName,
boolean isLeftTransposed) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
}
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(isInSparseFormat(gCtx, left)) {
// For sparse TSMM, invoke matmult (TODO: possible performance improvement)
LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
return;
}
// Since CuBLAS expects inputs in column-major format,
// reverse the order of matrix-multiplication and take care of dimension mismatch.
int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
// Note: the dimensions are swapped
int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
// For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix
MatrixObject output = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, m, m); // Allocated the dense output matrix
if(m == -1)
throw new DMLRuntimeException("Incorrect dimensions");
int lda = toInt(isLeftTransposed ? m : k);
int ldc = m;
if(!left.getGPUObject(gCtx).isAllocated())
throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject(gCtx).isAllocated());
if(!output.getGPUObject(gCtx).isAllocated())
throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject(gCtx).isAllocated());
Pointer A = getDensePointer(gCtx, left, instName);
Pointer C = getDensePointer(gCtx, output, instName);
cudaSupportFunctions.cublassyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
copyUpperToLowerTriangle(gCtx, instName, output);
}
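// Worked example (illustrative only): for a dense A of shape 10000 x 50 with
// isLeftTransposed=true (i.e. t(A) %*% A), m=50 and k=10000, so a 50 x 50 output is
// allocated, cublassyrk fills one triangle of it, and copyUpperToLowerTriangle mirrors
// that triangle so the full symmetric result is materialized.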
/**
* Used for all versions of TSMM where the result is known to be symmetric.
* Hence, we compute only the upper triangular matrix and copy this partial
* result down to lower triangular matrix once.
*
* @param gCtx a valid {@link GPUContext}
* @param instName instruction name
* @param ret upper triangular matrix
*/
private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx);
}
if(isInSparseFormat(gCtx, ret)) {
throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
}
if(ret.getNumRows() != ret.getNumColumns()) {
throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle");
}
int dim = toInt(ret.getNumRows());
getCudaKernels(gCtx).launchKernel("copy_u2l_dense",
ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
getDensePointer(gCtx, ret, instName), dim, dim*dim);
}
//********************************************************************/
//******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
//********************************************************************/
//********************************************************************/
//**************** UNARY AGGREGATE Functions ************************/
//********************************************************************/
/**
* Entry point to perform Unary aggregate operations on the GPU.
* The execution context object is used to allocate memory for the GPU.
*
* @param ec Instance of {@link ExecutionContext}, from which the output variable will be allocated
* @param gCtx a valid {@link GPUContext}
* @param instName name of the invoking instruction to record {@link Statistics}.
* @param in1 input matrix
* @param output output matrix/scalar name
* @param op Instance of {@link AggregateUnaryOperator} which encapsulates the direction of reduction/aggregation and the reduction operation.
*/
public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
}
final int REDUCTION_ALL = 1;
final int REDUCTION_ROW = 2;
final int REDUCTION_COL = 3;
final int REDUCTION_DIAG = 4;
// A Kahan sum implementation is not provided. If a "uak+" or other Kahan operator is encountered,
// it falls back to a regular summation reduction.
final int OP_PLUS = 1;
final int OP_PLUS_SQ = 2;
final int OP_MEAN = 3;
final int OP_VARIANCE = 4;
final int OP_MULTIPLY = 5;
final int OP_MAX = 6;
final int OP_MIN = 7;
final int OP_MAXINDEX = 8;
final int OP_MININDEX = 9;
// Sanity Checks
if(!in1.getGPUObject(gCtx).isAllocated())
throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated());
boolean isSparse = in1.getGPUObject(gCtx).isSparse();
IndexFunction indexFn = op.indexFn;
AggregateOperator aggOp = op.aggOp;
// Convert Reduction direction to a number
int reductionDirection = -1;
if (indexFn instanceof ReduceAll){
reductionDirection = REDUCTION_ALL;
} else if (indexFn instanceof ReduceRow){
reductionDirection = REDUCTION_ROW;
} else if (indexFn instanceof ReduceCol){
reductionDirection = REDUCTION_COL;
} else if (indexFn instanceof ReduceDiag){
reductionDirection = REDUCTION_DIAG;
} else {
throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
}
if(reductionDirection == -1)
throw new DMLRuntimeException("Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction");
// Convert function type to a number
int opIndex = -1;
if (aggOp.increOp.fn instanceof KahanPlus) {
opIndex = OP_PLUS;
} else if (aggOp.increOp.fn instanceof KahanPlusSq) {
opIndex = OP_PLUS_SQ;
} else if (aggOp.increOp.fn instanceof Mean) {
opIndex = OP_MEAN;
} else if (aggOp.increOp.fn instanceof CM) {
if(((CM)aggOp.increOp.fn).getAggOpType() != CMOperator.AggregateOperationTypes.VARIANCE)
throw new DMLRuntimeException("Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU");
opIndex = OP_VARIANCE;
} else if (aggOp.increOp.fn instanceof Plus) {
opIndex = OP_PLUS;
} else if (aggOp.increOp.fn instanceof Multiply) {
opIndex = OP_MULTIPLY;
} else if (aggOp.increOp.fn instanceof Builtin) {
Builtin b = (Builtin)aggOp.increOp.fn;
switch(b.bFunc) {
case MAX: opIndex = OP_MAX; break;
case MIN: opIndex = OP_MIN; break;
case MAXINDEX: opIndex = OP_MAXINDEX; break;
case MININDEX: opIndex = OP_MININDEX;break;
default:
throw new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
}
} else {
throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
}
if(opIndex == -1)
throw new DMLRuntimeException("Internal Error - Incorrect type of operation set for aggregate unary GPU instruction");
int rlen = (int)in1.getNumRows();
int clen = (int)in1.getNumColumns();
if (isSparse){
// The strategy for the time being is to convert sparse to dense
// until a sparse specific kernel is written.
in1.getGPUObject(gCtx).sparseToDense(instName);
// long nnz = in1.getNnz();
// assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU";
// MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
// throw new DMLRuntimeException("Internal Error - Not implemented");
}
long outRLen = -1;
long outCLen = -1;
if (indexFn instanceof ReduceRow) { // COL{SUM, MAX...}
outRLen = 1;
outCLen = clen;
}
else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...}
outRLen = rlen;
outCLen = 1;
}
Pointer out = null;
if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
// Matrix output
MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen);
out = getDensePointer(gCtx, out1, instName);
}
Pointer in = getDensePointer(gCtx, in1, instName);
int size = rlen * clen;
// For scalars, set the scalar output in the Execution Context object
switch (opIndex){
case OP_PLUS: {
switch(reductionDirection) {
case REDUCTION_ALL : {
double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL : { // Note: the names are a bit misleading - REDUCTION_COL (ReduceCol) collapses the columns, i.e. each row is reduced to a single value, producing a rows x 1 output
reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
break;
}
case REDUCTION_ROW : {
reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
break;
}
case REDUCTION_DIAG :
throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
}
break;
}
case OP_PLUS_SQ : {
// Calculate the squares in a temporary object tmp
Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
squareMatrix(gCtx, instName, in, tmp, rlen, clen);
// Then do the sum on the temporary object and free it
switch(reductionDirection) {
case REDUCTION_ALL : {
double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL : { // Note: the names are a bit misleading - REDUCTION_COL (ReduceCol) collapses the columns, i.e. each row is reduced to a single value, producing a rows x 1 output
reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
break;
}
case REDUCTION_ROW : {
reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
}
gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
break;
}
case OP_MEAN:{
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
double mean = result / size;
ec.setScalarOutput(output, new DoubleObject(mean));
break;
}
case REDUCTION_COL: {
reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
break;
}
case REDUCTION_ROW: {
reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
}
break;
}
case OP_MULTIPLY : {
switch (reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
}
break;
}
case OP_MAX :{
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(gCtx, instName, "reduce_max", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL: {
reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
break;
}
case REDUCTION_ROW: {
reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
}
break;
}
case OP_MIN :{
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(gCtx, instName, "reduce_min", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL: {
reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
break;
}
case REDUCTION_ROW: {
reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
}
break;
}
case OP_VARIANCE : {
// Temporary GPU arrays for intermediate results (deviations from the mean and their squares)
Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType);
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
double mean = result / size;
// Subtract mean from every element in the matrix
ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
double variance = result2 / (size - 1);
ec.setScalarOutput(output, new DoubleObject(variance));
break;
}
case REDUCTION_COL: {
reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
// Subtract the row-wise mean from every element in the matrix
BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType);
reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
gCtx.cudaFreeHelper(instName, tmpRow, DMLScript.EAGER_CUDA_FREE);
break;
}
case REDUCTION_ROW: {
reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
// Subtract the columns-wise mean from every element in the matrix
BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType);
reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
gCtx.cudaFreeHelper(instName, tmpCol, DMLScript.EAGER_CUDA_FREE);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
}
gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
gCtx.cudaFreeHelper(instName, tmp2, DMLScript.EAGER_CUDA_FREE);
break;
}
case OP_MAXINDEX : {
switch(reductionDirection) {
case REDUCTION_COL:
throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
}
// break;
}
case OP_MININDEX : {
switch(reductionDirection) {
case REDUCTION_COL:
throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
}
// break;
}
default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
}
}
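// Dispatch example (illustrative only): for colSums(X) on a dense 1000 x 10 matrix, the
// index function is ReduceRow, so reductionDirection=REDUCTION_ROW and a 1 x 10 output is
// allocated; with a KahanPlus aggregate (opIndex=OP_PLUS) this ends up in
// reduceCol(gCtx, instName, "reduce_col_sum", in, out, 1000, 10).
// For sum(X) the index function is ReduceAll, so reduceAll(..., "reduce_sum", ...) is used
// and the scalar result is set on the ExecutionContext instead.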
/**
* Helper method to square a matrix in GPU memory
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix on GPU
* @param out output matrix on GPU
* @param rlen row length
* @param clen column length
*/
private static void squareMatrix(GPUContext gCtx, String instName, Pointer in, Pointer out, int rlen, int clen) {
ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
matrixScalarOp(gCtx, instName, in, 2, rlen, clen, out, power2op);
}
/**
* Do a simple reduction, the output of which is a single value
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param kernelFunction name of the kernel function to invoke
* @param in {@link Pointer} to matrix in device memory
* @param n size of array
* @return the reduced value
*/
private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
}
int[] tmp = getKernelParamsForReduceAll(gCtx, n);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
Pointer tempOut = gCtx.allocate(instName, n*sizeOfDataType);
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
//cudaDeviceSynchronize;
int s = blocks;
while (s > 1) {
tmp = getKernelParamsForReduceAll(gCtx, s);
blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
getCudaKernels(gCtx).launchKernel(kernelFunction,
new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s);
s = (s + (threads*2-1)) / (threads*2);
}
double[] result = {-1f};
cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName, false);
gCtx.cudaFreeHelper(instName, tempOut, DMLScript.EAGER_CUDA_FREE);
return result[0];
}
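// Numeric trace (illustrative only; assumes MAX_THREADS=1024 and MAX_BLOCKS large enough):
// for n = 1,000,000 the first launch uses threads=1024 and blocks=(1,000,000+2047)/2048=489,
// leaving 489 partial results in tempOut. The loop then re-derives the launch parameters for
// s=489 (threads=nextPow2(245)=256, blocks=1), reduces those partials in place, sets
// s=(489+511)/512=1, and the final value is copied back to the host from tempOut[0].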
/**
* Do a reduction by row. Data is reduced per row and the
* resulting vector is calculated.
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param kernelFunction name of the kernel function to invoke
* @param in {@link Pointer} to input matrix in device memory (size - rows * columns)
* @param out {@link Pointer} to output matrix in device memory (size - rows * 1)
* @param rows number of rows in input matrix
* @param cols number of columns in input matrix
*/
private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
}
int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
Timing time = new Timing(false);
double duration = 0;
if(LOG.isTraceEnabled()) { time.start(); }
getCudaKernels(gCtx).launchKernel(kernelFunction,
new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols);
if(LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
duration = time.stop();
LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms.");
}
}
/**
* Do a reduction by column. Data is reduced per column and the
* resulting vector is calculated.
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param kernelFunction name of the kernel function to invoke
* @param in {@link Pointer} to input matrix in device memory (size - rows * columns)
* @param out {@link Pointer} to output matrix in device memory (size - 1 * cols)
* @param rows number of rows in input matrix
* @param cols number of columns in input matrix
*/
private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
}
int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
Timing time = new Timing(false);
double duration = 0;
if(LOG.isTraceEnabled()) { time.start(); }
getCudaKernels(gCtx).launchKernel(kernelFunction,
new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols);
if(LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
duration = time.stop();
LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms.");
}
}
/**
* Get threads, blocks and shared memory for a reduce all operation
* @param gCtx a valid {@link GPUContext}
* @param n size of input array
* @return integer array containing {blocks, threads, shared memory}
*/
private static int[] getKernelParamsForReduceAll(GPUContext gCtx, int n) {
final int MAX_THREADS = getMaxThreads(gCtx);
final int MAX_BLOCKS = getMaxBlocks(gCtx);
final int WARP_SIZE = getWarpSize(gCtx);
int threads = (n < MAX_THREADS *2) ? nextPow2((n + 1)/ 2) : MAX_THREADS;
int blocks = (n + (threads * 2 - 1)) / (threads * 2);
blocks = Math.min(MAX_BLOCKS, blocks);
int sharedMemSize = threads * sizeOfDataType;
if (threads <= WARP_SIZE) {
sharedMemSize *= 2;
}
return new int[] {blocks, threads, sharedMemSize};
}
/**
* Get threads, blocks and shared memory for a reduce by row operation
* @param gCtx a valid {@link GPUContext}
* @param rows number of rows in input matrix
* @param cols number of columns in input matrix
* @return integer array containing {blocks, threads, shared memory}
*/
private static int[] getKernelParamsForReduceByRow(GPUContext gCtx, int rows, int cols) {
final int WARP_SIZE = getWarpSize(gCtx);
final int MAX_THREADS = getMaxThreads(gCtx);
int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) : MAX_THREADS;
int blocks = rows;
int sharedMemSize = threads * sizeOfDataType;
if (threads <= WARP_SIZE){
sharedMemSize *=2;
}
return new int[] {blocks, threads, sharedMemSize};
}
private static int[] getKernelParamsForReduceByCol(GPUContext gCtx, int rows, int cols) {
final int MAX_THREADS = getMaxThreads(gCtx);
final int MAX_BLOCKS = getMaxBlocks(gCtx);
final int WARP_SIZE = getWarpSize(gCtx);
int threads = Math.min(cols, MAX_THREADS);
int blocks = Math.min(cols/MAX_THREADS, MAX_BLOCKS);
if (cols % MAX_THREADS != 0) blocks++;
int sharedMemSize = threads * sizeOfDataType;
if (threads <= WARP_SIZE) {
sharedMemSize *=2;
}
return new int[] {blocks, threads, sharedMemSize};
}
private static int nextPow2(int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
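// Example values (illustrative only): nextPow2(1)=1, nextPow2(5)=8, nextPow2(245)=256,
// nextPow2(1024)=1024; the reduce kernels rely on this to pick a power-of-two thread count.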
//********************************************************************/
//**************** END OF UNARY AGGREGATE Functions *****************/
//********************************************************************/
//********************************************************************/
//************ Matrix-Matrix & Matrix-Scalar Functions ***************/
//********************************************************************/
/**
* Entry point to perform elementwise matrix-scalar relational operation specified by op
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix
* @param outputName output matrix name
* @param op scalar operator
*/
public static void matrixScalarRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, ScalarOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
double constant = op.getConstant();
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : matrixScalarRelational, scalar: " + constant + ", GPUContext=" + gCtx);
}
Pointer A, C;
if (isSparseAndEmpty(gCtx, in)) {
setOutputToConstant(ec, gCtx, instName, op.executeScalar(0.0), outputName, in.getNumRows(),
in.getNumColumns());
return;
} else {
A = getDensePointer(gCtx, in, instName);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
C = getDensePointer(gCtx, out, instName);
}
int rlenA = toInt(in.getNumRows());
int clenA = toInt(in.getNumColumns());
matrixScalarOp(gCtx, instName, A, constant, rlenA, clenA, C, op);
}
/**
* Entry point to perform elementwise matrix-scalar arithmetic operation specified by op
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix
* @param outputName output matrix name
* @param isInputTransposed true if input transposed
* @param op scalar operator
*/
public static void matrixScalarArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
double constant = op.getConstant();
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx);
}
int outRLen = isInputTransposed ? (int) in.getNumColumns() : (int) in.getNumRows();
int outCLen = isInputTransposed ? (int) in.getNumRows() : (int) in.getNumColumns();
//boolean isCUDALibAvailable = (op.fn instanceof Multiply
// || (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0)) && !isSparseAndEmpty(gCtx, in);
//if(!isCUDALibAvailable) {
if(constant == 0) {
if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) {
deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
}
else if(op.fn instanceof Multiply || op.fn instanceof And) {
setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen);
}
else if(op.fn instanceof Power) {
setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
}
// TODO:
// x/0.0 is either +Infinity or -Infinity according to Java.
// In the context of a matrix, different elements of the matrix
// could have different values.
// If the IEEE 754 standard defines otherwise, this logic needs
// to be re-enabled and the Java computation logic for divide by zero
// needs to be revisited
//else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
// setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
//}
//else if(op.fn instanceof Divide) {
// //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
// compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
//}
else {
// TODO: Potential to optimize
matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
}
}
else if(constant == 1.0 && op.fn instanceof Or) {
setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
}
else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
}
else {
matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
}
// }
//else {
// double alpha = 0;
// if(op.fn instanceof Multiply) {
// alpha = op.getConstant();
// }
// else if(op.fn instanceof Divide && op instanceof RightScalarOperator) {
// alpha = Math.pow(op.getConstant(), -1.0);
// }
// else {
// throw new DMLRuntimeException("Unsupported op");
// }
// TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1)
// C = alpha* op( A ) + beta* op ( B )
// dgeam(ec, gCtx, instName, in, in, outputName, isInputTransposed, isInputTransposed, alpha, 0.0);
//}
}
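// Summary of the constant shortcuts above (illustrative, mirroring the branches in
// matrixScalarArithmetic): X+0, X-0 (right scalar only) and X|0 become a device copy;
// X*0 and X&0 produce an empty (all-zero) output; X^0 and X|1 produce an all-ones output;
// X&1 and X^1 again become a device copy; every other case launches the generic
// matrix_scalar_op kernel.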
/**
* Performs the elementwise relational operation specified by op on two input matrices in1 and in2
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix 1
* @param in2 input matrix 2
* @param outputName output matrix name
* @param op binary operator
*/
public static void matrixMatrixRelational(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
String outputName, BinaryOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
boolean in1SparseAndEmpty = isSparseAndEmpty(gCtx, in1);
boolean in2SparseAndEmpty = isSparseAndEmpty(gCtx, in2);
if (in1SparseAndEmpty && in2SparseAndEmpty) {
if (op.fn instanceof LessThan || op.fn instanceof GreaterThan || op.fn instanceof NotEquals) {
setOutputToConstant(ec, gCtx, instName, 0.0, outputName, in1.getNumRows(), in1.getNumColumns());
} else if (op.fn instanceof LessThanEquals || op.fn instanceof GreaterThanEquals || op.fn instanceof Equals) {
setOutputToConstant(ec, gCtx, instName, 1.0, outputName, in1.getNumRows(), in1.getNumColumns());
}
} else if (in1SparseAndEmpty) {
matrixScalarRelational(ec, gCtx, instName, in2, outputName, new LeftScalarOperator(op.fn, 0.0));
} else if (in2SparseAndEmpty) {
matrixScalarRelational(ec, gCtx, instName, in1, outputName, new RightScalarOperator(op.fn, 0.0));
} else {
matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, false, false, op);
}
}
/**
* Performs the elementwise arithmetic operation specified by op on two input matrices in1 and in2
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix 1
* @param in2 input matrix 2
* @param outputName output matrix name
* @param isLeftTransposed true if left-transposed
* @param isRightTransposed true if right-transposed
* @param op binary operator
*/
public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus) && !isSparseAndEmpty(gCtx, in1) && !isSparseAndEmpty(gCtx, in2) && !isVector(in1) && !isVector(in2);
if(!isCUDALibAvailable) {
matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
}
else {
double alpha;
double beta;
if(op.fn instanceof Plus) {
alpha = 1.0;
beta = 1.0;
}
else if(op.fn instanceof Minus) {
alpha = 1.0;
beta = -1.0;
}
else {
throw new DMLRuntimeException("Unsupported op");
}
// C = alpha* op( A ) + beta* op ( B )
dgeam(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, alpha, beta);
}
}
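// Design note: for dense, non-vector inputs the +/- cases are routed to cublas geam via
// dgeam(...) with (alpha, beta) = (1, 1) for plus and (1, -1) for minus, i.e.
// C = alpha*op(A) + beta*op(B); this also handles transposed operands such as A - t(B).
// All other binary operators (and sparse-empty or vector inputs) fall back to the generic
// cellwise kernel in matrixMatrixOp.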
/**
* Utility to launch the matrix-scalar operation CUDA kernel
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param ec execution context
* @param in input matrix
* @param outputName output variable name
* @param isInputTransposed true if input is transposed
* @param op operator
*/
public static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed,
ScalarOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(isInputTransposed)
throw new DMLRuntimeException("Transposing the input is not supported");
int rlenA = toInt(in.getNumRows());
int clenA = toInt(in.getNumColumns());
Pointer A = getDensePointer(gCtx, in, instName); // TODO: FIXME: Implement sparse binCellSparseScalarOp kernel
double scalar = op.getConstant();
// MatrixObject out = ec.getMatrixObject(outputName);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rlenA, clenA); // Allocated the dense output matrix
Pointer C = getDensePointer(gCtx, out, instName);
matrixScalarOp(gCtx, instName, A, scalar, rlenA, clenA, C, op);
}
/**
* Helper method to launch binary scalar-matrix arithmetic/relational operations CUDA kernel.
* This method is isolated to be taken advantage of from other operations
* as it accepts JCuda {@link Pointer} instances instead of {@link MatrixObject} instances.
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param a the dense input matrix (allocated on GPU)
* @param scalar the scalar value to do the op
* @param rlenA row length of matrix a
* @param clenA column length of matrix a
* @param c the dense output matrix
* @param op operation to perform
*/
private static void matrixScalarOp(GPUContext gCtx, String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : matrix_scalar_op" + ", GPUContext=" + gCtx);
}
int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
int size = rlenA * clenA;
getCudaKernels(gCtx).launchKernel("matrix_scalar_op",
ExecutionConfig.getConfigForSimpleVectorOperations(size),
a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar);
}
/**
* Utility to launch binary cellwise matrix-matrix operations CUDA kernel
*
* @param gCtx a valid {@link GPUContext}
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 left input matrix
* @param in2 right input matrix
* @param outputName output variable name
* @param isLeftTransposed true if left matrix is transposed
* @param isRightTransposed true if right matrix is transposed
* @param op operator
*/
private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
boolean isEmpty1 = isSparseAndEmpty(gCtx, in1);
boolean isEmpty2 = isSparseAndEmpty(gCtx, in2);
int rlenA = toInt(in1.getNumRows());
int rlenB = toInt(in2.getNumRows());
int clenA = toInt(in1.getNumColumns());
int clenB = toInt(in2.getNumColumns());
int vecStatusA = getVectorStatus(rlenA, clenA).code();
int vecStatusB = getVectorStatus(rlenB, clenB).code();
if(isLeftTransposed || isRightTransposed) {
throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed);
}
long outRLen = Math.max(rlenA, rlenB);
long outCLen = Math.max(clenA, clenB);
if (isEmpty1 && isEmpty2){
MatrixObject out = ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
// When both inputs are empty, the output is empty too (except for division/modulus, which yield NaN, and minus-1-multiply, which yields 1)
if (op.fn instanceof Divide || op.fn instanceof IntegerDivide || op.fn instanceof Modulus) {
out.getGPUObject(gCtx).allocateAndFillDense(Double.NaN);
} else if (op.fn instanceof Minus1Multiply) {
out.getGPUObject(gCtx).allocateAndFillDense(1.0);
} else {
out.getGPUObject(gCtx).allocateSparseAndEmpty();
}
}
// Check for M1 * M2 when M1 is empty; if M2 is a vector then fallback to general case
else if(isEmpty1 && clenB != 1 && rlenB != 1) {
// C = empty_in1 op in2 ==> becomes ==> C = 0.0 op in2
matrixScalarArithmetic(ec, gCtx, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
}
// Check for M1 * M2 when M2 is empty; if M1 is a vector then fallback to general case
else if(isEmpty2 && clenA != 1 && rlenA != 1) {
// C = in1 op empty_in2 ==> becomes ==> C = in1 op 0.0
matrixScalarArithmetic(ec, gCtx, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
}
else {
Pointer A = getDensePointer(gCtx, in1, instName); // TODO: FIXME: Implement sparse binCellSparseOp kernel
Pointer B = getDensePointer(gCtx, in2, instName); // TODO: FIXME: Implement sparse binCellSparseOp kernel
// Allocated the dense output matrix
MatrixObject out = null;
try {
out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
} catch(DMLRuntimeException e) {
throw new DMLRuntimeException("Incorrect dimensions: dimA:[" + rlenA + "," + clenA + "]"
+ " dimB:[" + rlenB + "," + clenB + "] out:[" + outRLen + "," + outCLen + "]", e);
}
Pointer C = getDensePointer(gCtx, out, instName);
int maxRlen = Math.max(rlenA, rlenB);
int maxClen = Math.max(clenA, clenB);
matrixMatrixOp(gCtx, instName, A, B, maxRlen, maxClen, vecStatusA, vecStatusB, C, op);
}
}
/**
* Do an elementwise matrix-matrix arithmetic operation on the GPU
* c = a op b
* Either rows and cols in A are the same as in B or
* one of them is a vector or both are.
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param a The input matrix a allocated on the GPU
* @param b The input matrix b allocated on the GPU
* @param maxRlen the maximum of the row lengths between a & b
* @param maxClen the maximum of the column lengths between a & b
* @param vecStatusA vector-shape code of matrix A (see {@link VectorShape})
* @param vecStatusB vector-shape code of matrix B (see {@link VectorShape})
* @param c output matrix of size (maxRlen, maxClen) allocated on GPU
* @param op the operation to perform
*/
private static void matrixMatrixOp(GPUContext gCtx, String instName, Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : matrix_matrix_cellwise_op" + ", GPUContext=" + gCtx);
}
getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op",
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
}
/**
* This enum declares the different vector shapes
* as they are recognized by the invoked CUDA kernel(s).
*/
enum VectorShape {
COLUMN (1),
ROW (2),
NONE (0);
private final int code;
VectorShape(int code) {
this.code = code;
}
int code() { return code; }
}
/**
* Given the number of rows and columns, returns
* whether this is a row vector, column vector or neither.
* @param rows number of rows
* @param cols number of columns
* @return VectorShape.COLUMN for a column vector, VectorShape.ROW for a row vector, VectorShape.NONE otherwise
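* <p>For example, {@code getVectorStatus(5, 1)} yields COLUMN (code 1),
* {@code getVectorStatus(1, 7)} yields ROW (code 2), and {@code getVectorStatus(5, 7)} yields NONE (code 0).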
*/
private static VectorShape getVectorStatus(long rows, long cols) {
if(cols == 1)
return VectorShape.COLUMN;
else if(rows == 1)
return VectorShape.ROW;
else
return VectorShape.NONE;
}
private static boolean isVector(MatrixObject in) {
return in.getNumRows() == 1 || in.getNumColumns() == 1;
}
private static boolean isSparseAndEmpty(GPUContext gCtx, MatrixObject in1) {
boolean isSparse1 = isInSparseFormat(gCtx, in1);
boolean isEmpty1 = isSparse1 && in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr().nnz == 0;
return isEmpty1;
}
private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName, boolean isInputTransposed) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(!isInputTransposed)
deviceCopy(ec, gCtx, instName, src, outputName);
else
transpose(ec, gCtx, instName, src, outputName);
}
/**
* Performs a deep device copy of a matrix on the GPU
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param src source matrix
* @param outputName destination variable name
*/
private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject src, String outputName) {
Pointer srcPtr = getDensePointer(gCtx, src, instName); // TODO: FIXME: Implement sparse kernel
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, toInt(src.getNumRows()), toInt(src.getNumColumns())); // Allocate the dense output matrix
Pointer destPtr = getDensePointer(gCtx, out, instName);
deviceCopy(instName, srcPtr, destPtr, (int)src.getNumRows(), (int)src.getNumColumns());
}
/**
* Fills an array on the GPU with a given scalar value
* @param ec currently active instance of the {@link ExecutionContext}
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param constant scalar value with which to fill the matrix
* @param outputName (internal) name of the matrix that is to be filled
* @param numRows number of rows of output matrix object
* @param numCols number of columns of output matrix object
*/
private static void setOutputToConstant(ExecutionContext ec, GPUContext gCtx, String instName, double constant, String outputName, long numRows, long numCols) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(constant == 0) {
getSparseMatrixOutputForGPUInstruction(ec, numRows, numCols, 0, instName, outputName);
} else {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocate the dense output matrix
Pointer A = getDensePointer(gCtx, out, instName);
int rlen = toInt(out.getNumRows());
int clen = toInt(out.getNumColumns());
int size = rlen * clen;
getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size);
}
}
/**
* Performs a deep copy of input device double pointer corresponding to matrix
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param src source matrix
* @param dest destination matrix
* @param rlen number of rows
* @param clen number of columns
*/
public static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) {
int size = rlen * clen * sizeOfDataType;
cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice);
}
/**
* Helper function to get numeric value for binary op.
* This number is passed down to the CUDA kernel
* and the appropriate binary operation is performed on the GPU.
* op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
* 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
* 11=min, 12=max, 13=and, 14=or, 15=minus1multiply, 16=minusnz,
* 17=modulus, 18=integer division}
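* <p>For example, a {@link Plus} function object maps to 0 and {@link Modulus} to 17;
* the matrix_matrix_cellwise_op kernel dispatches on this code.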
*/
private static int getBinaryOp(ValueFunction fn) {
if(fn instanceof Plus) return 0;
else if(fn instanceof Minus) return 1;
else if(fn instanceof Multiply) return 2;
else if(fn instanceof Divide) return 3;
else if(fn instanceof Power) return 4;
else if(fn instanceof LessThan) return 5;
else if(fn instanceof LessThanEquals) return 6;
else if(fn instanceof GreaterThan) return 7;
else if(fn instanceof GreaterThanEquals) return 8;
else if(fn instanceof Equals) return 9;
else if(fn instanceof NotEquals) return 10;
else if(fn instanceof And) return 13;
else if(fn instanceof Or) return 14;
else if(fn instanceof Multiply2) return 2;
else if(fn instanceof Power2) return 4;
else if(fn instanceof Minus1Multiply) return 15;
else if(fn instanceof MinusNz) return 16;
else if(fn instanceof Modulus) return 17;
else if(fn instanceof IntegerDivide) return 18;
else if(fn instanceof Builtin && ((Builtin)fn).getBuiltinCode()==BuiltinCode.MIN) return 11;
else if(fn instanceof Builtin && ((Builtin)fn).getBuiltinCode()==BuiltinCode.MAX) return 12;
throw new DMLRuntimeException("The given value function is not supported:" + fn.getClass().getName());
}
/**
* Performs sparse and dense dgeam given two input matrices
* C = alpha* op( A ) + beta* op ( B )
* where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
* To indicate a transpose operation, make sure in1 == in2 and isLeftTransposed == isRightTransposed == true
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 left input matrix
* @param in2 right input matrix
* @param outputName output variable name
* @param isLeftTransposed true if left matrix is transposed
* @param isRightTransposed true if right matrix is transposed
* @param alpha alpha
* @param beta beta
*/
private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName,
boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
}
Pointer alphaPtr = dataTypePointerTo(alpha);
Pointer betaPtr = dataTypePointerTo(beta);
int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows();
long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns();
MatrixObject out = ec.getMatrixObject(outputName);
boolean isSparse1 = isInSparseFormat(gCtx, in1);
boolean isSparse2 = isInSparseFormat(gCtx, in2);
// TODO: Implement sparse-dense matrix cublasDgeam kernel
if(isSparse1 || isSparse2) {
int m = (int)in1.getNumRows();
int n = (int)in1.getNumColumns();
// Invoke cuSparse when either input is in sparse format
// Perform sparse-sparse dgeam
if (!isInSparseFormat(gCtx, in1)) {
in1.getGPUObject(gCtx).denseToSparse();
}
CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
if (!isInSparseFormat(gCtx, in2)) {
in2.getGPUObject(gCtx).denseToSparse();
}
CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
if (in1 == in2 && isLeftTransposed && isRightTransposed) {
// Special case for transpose
int nnz = (int)A.nnz;
CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
} else {
// General case (cusparse csrgeam does not accept the transpose operator)
// TODO: to implement transposed dgeam for sparse matrices, they need to be converted to CSC, which is effectively a transpose
if (isLeftTransposed || isRightTransposed) {
throw new DMLRuntimeException(
"Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU");
}
CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n);
out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
//long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr,
B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
}
} else {
// Dense-Dense dgeam
int lda = toInt(in1.getNumColumns());
int ldb = toInt(in2.getNumColumns());
int m = toInt(in1.getNumColumns());
int n = toInt(in2.getNumRows());
if (isLeftTransposed && isRightTransposed) {
m = toInt(in1.getNumRows());
n = toInt(in2.getNumColumns());
}
else if (isLeftTransposed) {
m = toInt(in1.getNumRows());
} else if (isRightTransposed) {
n = toInt(in2.getNumColumns());
}
int ldc = m;
Pointer A = getDensePointer(gCtx, in1, instName);
Pointer B = getDensePointer(gCtx, in2, instName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocate the dense output matrix
Pointer C = getDensePointer(gCtx, out, instName);
cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
}
}
/**
* Computes C = t(A)
* @param ec execution context
* @param gCtx gpu context
* @param instName name of the instruction
* @param A pointer to the input matrix
* @param C pointer to the output matrix
* @param numRowsA number of rows of the input matrix
* @param numColsA number of columns of the input matrix
* @throws DMLRuntimeException if error
*/
public static void denseTranspose(ExecutionContext ec, GPUContext gCtx, String instName,
Pointer A, Pointer C, long numRowsA, long numColsA) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : dense transpose" + ", GPUContext=" + gCtx);
}
// Dense-Dense dgeam
int lda = toInt(numColsA);
int ldb = lda;
int m = toInt(numRowsA);
int n = lda;
int ldc = m;
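// Layout note: SystemDS stores dense GPU matrices in row-major order while cuBLAS assumes
// column-major. A row-major (numRowsA x numColsA) buffer, read as column-major with leading
// dimension numColsA, already corresponds to the transposed matrix; applying CUBLAS_OP_T and
// reading the column-major result back as row-major therefore yields t(A).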
cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), CUBLAS_OP_T, CUBLAS_OP_T, m, n, one(), A, lda, zero(), A, ldb, C, ldc);
}
//********************************************************************/
//****** End of Matrix-Matrix & Matrix-Scalar Functions **************/
//********************************************************************/
//********************************************************************/
//************************ Re-org Functions **************************/
//********************************************************************/
/**
* Transposes the input matrix using cublasDgeam
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix
* @param outputName output matrix name
*/
public static void transpose(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) {
// C = alpha* op( A ) + beta* op ( B )
// = 1.0 * A^T + 0.0 * A^T
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
dgeam(ec, gCtx, instName, in, in, outputName, true, true, 1.0, 0.0);
}
//********************************************************************/
//******************* End of Re-org Functions ************************/
//********************************************************************/
public static int toInt(long num) {
if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
}
return (int)num;
}
//********************************************************************/
//**************** Matrix Manipulation Functions *********************/
//********************************************************************/
/**
* Method to perform rightIndex operation for a given lower and upper bounds in row and column dimensions.
*
* @param ec current execution context
* @param gCtx current gpu context
* @param instName name of the instruction for maintaining statistics
* @param in1 input matrix object
* @param ixrange index range (0-based)
* @param outputName output matrix object
*/
public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1,
IndexRange ixrange, String outputName) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException(
"GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx);
}
int rl = (int) ixrange.rowStart;
int ru = (int) ixrange.rowEnd;
int cl = (int) ixrange.colStart;
int cu = (int) ixrange.colEnd;
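// Note: ixrange is zero-based; e.g. the (1-based) DML expression X[2:4, 1:3] arrives here
// as rl=1, ru=3, cl=0, cu=2, hence the +1 when reporting errors below.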
if (rl < 0 || rl >= in1.getNumRows() || ru < rl || ru >= in1.getNumRows() || cl < 0
|| cu < cl || cu >= in1.getNumColumns()) {
throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + ","
+ (cl + 1) + ":" + (cu + 1) + "] " + "must be within matrix dimensions [" + in1.getNumRows() + ","
+ in1.getNumColumns() + "]");
}
int len1 = toInt(in1.getNumColumns());
if(isInSparseFormat(gCtx, in1)) {
// Input in1 is in sparse format and output is in dense format
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
CSRPointer inPointer = getSparsePointer(gCtx, in1, instName);
Pointer outPointer = getDensePointer(gCtx, out, instName);
sliceSparseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1);
}
else {
// Input in1 is in dense format (see inPointer)
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
Pointer inPointer = getDensePointer(gCtx, in1, instName);
Pointer outPointer = getDensePointer(gCtx, out, instName);
sliceDenseDense(gCtx, instName, inPointer, outPointer, rl, ru, cl, cu, len1);
}
}
/**
* Perform slice operation on dense input and output it in dense format
*
* @param gCtx gpu context
* @param instName instruction name
* @param inPointer dense input pointer
* @param outPointer dense output pointer (does not need to be zeroed out)
* @param rl row lower
* @param ru row upper
* @param cl column lower
* @param cu column upper
* @param inClen input number of columns
*/
protected static void sliceDenseDense(GPUContext gCtx, String instName, Pointer inPointer, Pointer outPointer,
int rl, int ru, int cl, int cu, int inClen) {
long retClen = cu - cl + 1;
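// If the slice spans all columns, the selected rows form one contiguous block in the
// row-major dense layout, so a single device-to-device memcpy suffices; otherwise a
// kernel gathers the strided sub-block.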
if (inClen == retClen) {
cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * sizeOfDataType), (ru - rl + 1) * inClen
* sizeOfDataType, cudaMemcpyDeviceToDevice);
} else {
long retRlen = ru - rl + 1;
getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(retRlen*retClen)),
inPointer, outPointer, rl, ru, cl, cu, inClen, retRlen, retClen);
}
}
/**
* Perform slice operation on sparse input and output it in dense format
*
* @param gCtx gpu context
* @param instName instruction name
* @param inPointer sparse CSR input pointer
* @param outPointer dense output pointer (expected to be zeroed out)
* @param rl row lower
* @param ru row upper
* @param cl column lower
* @param cu column upper
* @param inClen number of columns of input matrix
*/
protected static void sliceSparseDense(GPUContext gCtx, String instName, CSRPointer inPointer, Pointer outPointer,
int rl, int ru, int cl, int cu, int inClen) {
int size = getNnz(inPointer, rl, ru);
// Return early: the output has 0 non-zeros and outPointer is expected to be zeroed out already.
if(size == 0) return;
int retRlen = ru - rl + 1;
int retClen = cu - cl + 1;
String kernel = null;
// Note: the row-wise parallelization scheme launches one thread per output row (each thread
// scans that row's non-zeros), whereas the nnz scheme launches one thread per non-zero in the selected row range.
if(inClen > 10 && retClen > 2*retRlen) {
// Perform nnz parallelization for wide and short matrices
kernel = "slice_sparse_dense_nnz";
}
else {
size = retRlen;
kernel = "slice_sparse_dense_row";
}
// Performs a slice operation where the input matrix is sparse and the output matrix is dense.
// This function avoids unnecessary sparse to dense conversion of the input matrix.
// We can generalize this later to output sparse matrix.
getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size),
inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu, retClen);
}
/**
* Returns the number of non-zeroes in the given range of rows
*
* @param inPointer input CSR pointer
* @param rl lower row index (inclusive and zero-based)
* @param ru upper row index (inclusive and zero-based)
* @return number of non-zeroes
*/
private static int getNnz(CSRPointer inPointer, int rl, int ru) {
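// In CSR, rowPtr[i] is the offset of the first non-zero of row i, so the non-zero count
// of rows rl..ru (inclusive) is rowPtr[ru+1] - rowPtr[rl]; only two ints are copied from the device.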
int[] rlPtr = { -1 }; int[] ruPtr = { -1 };
cudaMemcpy(Pointer.to(rlPtr), inPointer.rowPtr.withByteOffset(rl*Sizeof.INT), Sizeof.INT, cudaMemcpyDeviceToHost);
cudaMemcpy(Pointer.to(ruPtr), inPointer.rowPtr.withByteOffset((ru+1)*Sizeof.INT), Sizeof.INT, cudaMemcpyDeviceToHost);
return ruPtr[0] - rlPtr[0];
}
public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx);
}
long rowsA = toInt(in1.getNumRows());
long colsA = toInt(in1.getNumColumns());
long rowsB = toInt(in2.getNumRows());
long colsB = toInt(in2.getNumColumns());
if (rowsA != rowsB) {
throw new DMLRuntimeException("GPU : Invalid internal state - the rows must match up for a cbind operation");
}
// only Dense supported
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsA, colsA + colsB);
Pointer C = getDensePointer(gCtx, out, instName);
Pointer A = getDensePointer(gCtx, in1, instName);
Pointer B = getDensePointer(gCtx, in2, instName);
int maxRows = toInt(Math.max(rowsA, rowsB));
int maxCols = toInt(Math.max(colsA, colsB));
getCudaKernels(gCtx).launchKernel("cbind",
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols),
A, B, C, rowsA, colsA, rowsB, colsB);
}
public static void rbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx);
}
int rowsA = toInt(in1.getNumRows());
int colsA = toInt(in1.getNumColumns());
int rowsB = toInt(in2.getNumRows());
int colsB = toInt(in2.getNumColumns());
if (colsA != colsB){
throw new DMLRuntimeException("GPU : Invalid internal state - the columns must match up for a rbind operation");
}
// only Dense supported
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rowsA + rowsB, colsA);
Pointer C = getDensePointer(gCtx, out, instName);
Pointer A = getDensePointer(gCtx, in1, instName);
Pointer B = getDensePointer(gCtx, in2, instName);
int maxRows = Math.max(rowsA, rowsB);
int maxCols = Math.max(colsA, colsB);
getCudaKernels(gCtx).launchKernel("rbind",
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols),
A, B, C, rowsA, colsA, rowsB, colsB);
}
//********************************************************************/
//*********** End of Matrix Manipulation Functions *******************/
//********************************************************************/
//********************************************************************/
//************************ Builtin Functions *************************/
//********************************************************************/
/**
* Performs an "exp" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
}
// e^0 = 1, create a dense block full of 1s
unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, GPUInstruction.MISC_TIMER_EXP_KERNEL);
}
/**
* Performs an "sqrt" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx);
}
// sqrt(0) = 0, create a dense block full of 0s
unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, GPUInstruction.MISC_TIMER_SQRT_KERNEL);
}
/**
* Performs an "round" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : round" + ", GPUContext=" + gCtx);
}
// round(0) = 0, create a dense block full of 0s
unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, GPUInstruction.MISC_TIMER_ROUND_KERNEL);
}
/**
* Performs an "abs" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : abs" + ", GPUContext=" + gCtx);
}
// abs(0) = 0, create a dense block full of 0s
unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, GPUInstruction.MISC_TIMER_ABS_KERNEL);
}
/**
* Performs an "log" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : log" + ", GPUContext=" + gCtx);
}
// log(0) = -Inf
unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL);
}
/**
* Performs an "floor" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : floor" + ", GPUContext=" + gCtx);
}
// floor(0) = 0
unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, GPUInstruction.MISC_TIMER_FLOOR_KERNEL);
}
/**
* Performs an "ceil" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx);
}
// ceil(0) = 0
unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, GPUInstruction.MISC_TIMER_CEIL_KERNEL);
}
/**
* Performs an "sin" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sin" + ", GPUContext=" + gCtx);
}
// sin(0) = 0
unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIN_KERNEL);
}
/**
* Performs an "cos" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : cos" + ", GPUContext=" + gCtx);
}
// cos(0) = 1
unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, GPUInstruction.MISC_TIMER_COS_KERNEL);
}
/**
* Performs an "tan" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : tan" + ", GPUContext=" + gCtx);
}
// tan(0) = 0
unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, GPUInstruction.MISC_TIMER_TAN_KERNEL);
}
/**
* Performs an "sinh" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void sinh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sinh" + ", GPUContext=" + gCtx);
}
// sinh(0) = 0
unaryOp(ec, gCtx, in1, "matrix_sinh", 0, outputName, instName, GPUInstruction.MISC_TIMER_SINH_KERNEL);
}
/**
* Performs an "cosh" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void cosh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : cosh" + ", GPUContext=" + gCtx);
}
// cosh(0) = 1
unaryOp(ec, gCtx, in1, "matrix_cosh", 1, outputName, instName, GPUInstruction.MISC_TIMER_COSH_KERNEL);
}
/**
* Performs an "tanh" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void tanh(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : tanh" + ", GPUContext=" + gCtx);
}
// tanh(0) = 0
unaryOp(ec, gCtx, in1, "matrix_tanh", 0, outputName, instName, GPUInstruction.MISC_TIMER_TANH_KERNEL);
}
/**
* Performs an "asin" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : asin" + ", GPUContext=" + gCtx);
}
// asin(0) = 0
unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, GPUInstruction.MISC_TIMER_ASIN_KERNEL);
}
/**
* Performs an "acos" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : acos" + ", GPUContext=" + gCtx);
}
// acos(0) = PI/2
unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL);
}
/**
* Performs an "atan" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : atan" + ", GPUContext=" + gCtx);
}
// atan(0) = 0
unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, GPUInstruction.MISC_TIMER_ATAN_KERNEL);
}
/**
* Performs an "sign" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sign" + ", GPUContext=" + gCtx);
}
// sign(0) = 0
unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIGN_KERNEL);
}
/**
* Performs an "sigmoid" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
*/
public static void sigmoid(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sigmoid" + ", GPUContext=" + gCtx);
}
// sigmoid(0) = 0.5
unaryOp(ec, gCtx, in1, "matrix_sigmoid", 0.5, outputName, instName, GPUInstruction.MISC_TIMER_SIGMOID_KERNEL);
}
/**
* Calculate the greatest common divisor recursively
* @param a a number
* @param b another number
* @return greatest common divisor
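* <p>For example, {@code gcd(1024, 96)} is 32; this is used below to pick a thread count
* that evenly divides the maximum thread count.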
*/
private static int gcd(int a, int b)
{
return b == 0 ? a : gcd(b, a % b);
}
/**
* Utility method to print kernel timings
* @param time a Timing object
* @param kernelFunction Name of the executed kernel function
* @param total_duration Sum of all execution times
* @param cascade_blocks Number of blocks in the timed iteration of the cascade
* @return accumulated time
*/
private static double printKernelTiming(Timing time, String kernelFunction, double total_duration, int cascade_blocks) {
if (LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
double duration = time.stop();
total_duration += duration;
if(cascade_blocks > 0)
LOG.trace("uop kernel function " + kernelFunction + " (cascading_blocks=" + cascade_blocks + ") executed in " + duration + "ms.");
else
LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms.");
time.start();
return total_duration;
}
else
return 0.0;
}
/**
* Cumulative scan
* @param ec valid execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param kernelFunction The name of the cuda kernel to call
* @param in input matrix
* @param outputName output matrix name
*/
public static void cumulativeScan(ExecutionContext ec, GPUContext gCtx, String instName, String kernelFunction, MatrixObject in, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : cumulative scan (" + GPUInstruction.MISC_TIMER_CUMULATIVE_SCAN_KERNEL + ") for instruction " + instName +
" , GPUContext=" + gCtx);
}
int rows = toInt(in.getNumRows());
int cols = toInt(in.getNumColumns());
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns());
int[] tmp = getKernelParamsForCumScan(gCtx, rows, cols);
int blocks_x = tmp[0], blocks_y = tmp[1], threads_x = tmp[2], sharedMem = 0, block_height = tmp[3];
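// Multi-block case: a two-pass scan. The up-sweep kernel writes one partial aggregate per
// (block, column) into blk_res, a scan over blk_res combines these block partials, and a
// final down-sweep applies the per-block offsets to produce the full column-wise result.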
if (blocks_y > 1) {
Timing time = new Timing(false);
double duration = 0;
double alloc_duration = 0;
double total_duration = 0;
if(LOG.isTraceEnabled()) { time.start(); }
Pointer input = getDensePointer(gCtx, in, instName);
Pointer output = getDensePointer(gCtx, out, instName);
// storage for last value of each block
Pointer blk_res = gCtx.allocate(instName, cols * blocks_y * sizeOfDataType);
alloc_duration = printKernelTiming(time, "allocation of temporary buffer (" +
cols * blocks_y * sizeOfDataType + " bytes)", alloc_duration, 0);
getCudaKernels(gCtx).launchKernel(kernelFunction + "_up_sweep", new ExecutionConfig(blocks_x, blocks_y,
threads_x, 1, 0), input, blk_res, rows, cols, block_height);
total_duration = printKernelTiming(time, kernelFunction + "_up_sweep", total_duration, 0);
getCudaKernels(gCtx).launchKernel(kernelFunction + "_down_sweep", new ExecutionConfig(blocks_x, 1,
threads_x, 1, 0), blk_res, blk_res, blk_res, blocks_y, cols, blocks_y);
total_duration = printKernelTiming(time, kernelFunction + "_down_sweep", total_duration, 0);
getCudaKernels(gCtx).launchKernel(kernelFunction + "_down_sweep", new ExecutionConfig(blocks_x,
blocks_y, threads_x, 1, 0), input, output, blk_res, rows, cols, block_height);
total_duration = printKernelTiming(time, "final cumulative_scan_down_sweep", total_duration, 0);
if(LOG.isTraceEnabled()) { LOG.trace("total kernel execution time: " + total_duration + "ms."); }
gCtx.cudaFreeHelper(instName, blk_res, DMLScript.EAGER_CUDA_FREE);
if(LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
duration = time.stop();
alloc_duration += duration;
LOG.trace("freeing of temporary buffer " + " executed in " + duration + "ms.");
LOG.trace("total memory mgmt execution time: " + alloc_duration + "ms.");
LOG.trace("total execution time (kernel + mem): " + (total_duration + alloc_duration) + "ms.");
}
}
else {
Pointer input = getDensePointer(gCtx, in, instName);
Pointer output = getDensePointer(gCtx, out, instName);
Timing time = new Timing(true);
double duration = 0;
getCudaKernels(gCtx).launchKernel(kernelFunction + "_down_sweep",
new ExecutionConfig(blocks_x, 1, threads_x, 1, sharedMem), input, output, input, rows, cols, block_height);
if(LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
duration = time.stop();
LOG.trace("total kernel execution time: " + duration + "ms.");
}
}
}
/**
* Cumulative sum-product kernel cascade invocation
* @param ec valid execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param kernelFunction The name of the cuda kernel to call
* @param in input matrix
* @param outputName output matrix name
*/
public static void cumulativeSumProduct(ExecutionContext ec, GPUContext gCtx,
String instName, String kernelFunction, MatrixObject in, String outputName)
{
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : cumulative sum product for " + GPUInstruction.MISC_TIMER_CUMULATIVE_SCAN_KERNEL + ", GPUContext=" + gCtx);
}
Timing time = new Timing(false);
double total_duration = 0, alloc_duration = 0;
int rows = toInt(in.getNumRows());
if(LOG.isTraceEnabled()) { time.start(); }
if(rows > 128) {
final int MAX_BLOCKS = getMaxBlocks(gCtx);
ArrayList<Pointer> intermediate_buffers = new ArrayList<>();
ArrayList<Integer> cb_list = new ArrayList<>();
int block_height = 64;
int blocks = (rows + block_height - 1) / block_height;
if(blocks > MAX_BLOCKS) {
blocks = MAX_BLOCKS;
block_height = nextPow2((rows + (blocks - 1)) / blocks);
blocks = (rows + block_height - 1) / block_height;
}
int threads = 1;
int cascade_blocks = (blocks + block_height - 1) / block_height;
cb_list.add(cascade_blocks);
long total_mem_size = 0;
while( cascade_blocks > 0) {
long buf_size = 2 * block_height * cascade_blocks * sizeOfDataType;
total_mem_size += buf_size;
intermediate_buffers.add(gCtx.allocate(instName, buf_size));
cascade_blocks = (cascade_blocks + block_height - 2) / block_height;
if(cascade_blocks > 0)
cb_list.add(cascade_blocks);
}
alloc_duration = printKernelTiming(time, "allocation of temporary buffer ("
+ total_mem_size + " bytes)", alloc_duration, 0);
cascade_blocks = blocks;
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), 1);
Pointer input = getDensePointer(gCtx, in, instName);
Pointer output = getDensePointer(gCtx, out, instName);
if(LOG.isTraceEnabled()) {
LOG.trace("Launch configuration for cumulative aggregate: blocks=" + blocks +
" block_height=" + block_height + " threads=" + threads);
}
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads), input,
0, 0, intermediate_buffers.get(0), rows, block_height, 0);
int cascade_level = 0;
while(cascade_level < intermediate_buffers.size() - 1) {
total_duration = printKernelTiming(time, kernelFunction, total_duration, cascade_blocks);
cascade_blocks = cb_list.get(cascade_level);
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cascade_blocks, threads),
intermediate_buffers.get(cascade_level), intermediate_buffers.get(cascade_level), 0,
intermediate_buffers.get(cascade_level+1), rows / ((cascade_level+1) * block_height), block_height, 1);
cascade_level++;
}
while (cascade_level > 0) {
total_duration = printKernelTiming(time, kernelFunction, total_duration, cascade_blocks);
cascade_level--;
cascade_blocks = cb_list.get(cascade_level);
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(cascade_blocks, threads),
intermediate_buffers.get(cascade_level), intermediate_buffers.get(cascade_level),
intermediate_buffers.get(cascade_level+1), 0, rows / ((cascade_level+1) * block_height), block_height, 2);
}
total_duration = printKernelTiming(time, kernelFunction, total_duration, cascade_blocks);
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads), input,
output, intermediate_buffers.get(0), 0, rows, block_height, 3);
if(LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
double duration = time.stop();
total_duration += duration;
LOG.trace("final cascade ("+ kernelFunction + ", cascading_blocks=" + blocks + ") executed in " + duration + "ms.");
LOG.trace("total kernel execution time: " + total_duration + "ms.");
time.start();
}
for(int j = 0; j < intermediate_buffers.size(); ++j)
gCtx.cudaFreeHelper(instName, intermediate_buffers.get(j), DMLScript.EAGER_CUDA_FREE);
if(LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
double duration = time.stop();
alloc_duration += duration;
LOG.trace("freeing of temporary buffer " + " executed in " + duration + "ms.");
LOG.trace("total memory mgmt execution time: " + alloc_duration + "ms.");
LOG.trace("total execution time (kernel + mem): " + (total_duration + alloc_duration) + "ms.");
}
}
else {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), 1);
Pointer input = getDensePointer(gCtx, in, instName);
Pointer output = getDensePointer(gCtx, out, instName);
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(1, 1), input,
output, 0, 0, rows, rows);
if(LOG.isTraceEnabled()) {
cudaDeviceSynchronize();
double duration = time.stop();
total_duration += duration;
LOG.trace("uop kernel function " + kernelFunction + " executed in " + duration + "ms.");
LOG.trace("total kernel execution time: " + total_duration + "ms.");
}
}
}
/**
* Get the kernel launch configuration (blocks, threads and block height) for a cumulative scan along columns
* @param gCtx a valid {@link GPUContext}
* @param rows number of rows in input matrix
* @param cols number of columns in input matrix
* @return integer array containing {blocks_x, blocks_y, threads_x, block_height}
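* <p>Worked example (assuming MAX_THREADS=1024 and WARP_SIZE=32): for a 10000 x 100 input,
* t3 = 128, so threads_x = gcd(1024, 128) = 128, blocks_x = 1, block_height = 8 and
* blocks_y = 1250, i.e. the returned configuration is {1, 1250, 128, 8}.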
*/
private static int[] getKernelParamsForCumScan(GPUContext gCtx, int rows, int cols) {
final int MAX_THREADS = getMaxThreads(gCtx);
final int WARP_SIZE = getWarpSize(gCtx);
final int MAX_BLOCKS_Y = gCtx.getGPUProperties().maxGridSize[1];
int t1 = cols % MAX_THREADS;
int t2 = (t1 + WARP_SIZE - 1) / WARP_SIZE;
int t3 = t2 * WARP_SIZE;
int threads_x = gcd(MAX_THREADS, t3);
int blocks_x = Math.max(1, (cols + (threads_x - 1)) / (threads_x));
int block_height = Math.max(8, MAX_THREADS / threads_x);
int blocks_y = (rows + block_height - 1) / block_height;
int min_loop_length = 128;
if(rows <= min_loop_length) {
block_height = rows;
blocks_y = 1;
}
if(blocks_y > MAX_BLOCKS_Y) {
block_height = Math.max(2 ,2 * rows / MAX_BLOCKS_Y);
blocks_y = (rows + block_height - 1) / block_height;
}
if(LOG.isTraceEnabled()) {
LOG.trace("Launch configuration for cumulative aggregate: blocks_x=" + blocks_x + " blocks_y=" +
blocks_y + " block_height=" + block_height + " threads_x=" + threads_x);
}
return new int[] {blocks_x, blocks_y, threads_x, block_height};
}
/**
* A helper function for all unary ops (sqrt, abs, sin, etc.)
* @param ec valid execution context
* @param gCtx a valid {@link GPUContext}
* @param in1 input matrix
* @param kernel name of CUDA kernel for the unary op to execute
* @param sparseAndEmptyFillValue the result of the unary op on a completely empty input matrix block
* @param outputName output matrix name
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param kernelTimer the name of the timer to measure the kernel invocation
*/
private static void unaryOp(ExecutionContext ec, GPUContext gCtx, MatrixObject in1, String kernel, double sparseAndEmptyFillValue, String outputName, String instName, String kernelTimer) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
GPUObject in = in1.getGPUObject(gCtx);
boolean isSparseAndEmpty = in.isSparseAndEmpty();
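// Shortcut for an all-zero sparse input: op(0) is a constant, so the output is simply
// filled with sparseAndEmptyFillValue (e.g. 1 for exp, 0.5 for sigmoid, as passed by the
// callers above) without launching the unary kernel.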
if (isSparseAndEmpty) {
MatrixObject out = ec.getMatrixObject(outputName);
ec.allocateGPUMatrixObject(outputName, in1.getNumRows(), in1.getNumColumns());
out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
} else {
// Dense
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns());
Pointer output = getDensePointer(gCtx, out, instName);
Pointer input = getDensePointer(gCtx, in1, instName);
int size = toInt(in1.getNumColumns() * in1.getNumRows());
getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size);
}
}
/**
* Performs daxpy operation
*
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix 1
* @param in2 input matrix 2
* @param outputName output matrix name
* @param constant the scalar constant (alpha) by which in2 is scaled before being added to in1
*/
public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
String outputName, double constant) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
Pointer A = getDensePointer(gCtx, in1, instName);
Pointer B = getDensePointer(gCtx, in2, instName);
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumRows(), in1.getNumColumns()); // Allocate the dense output matrix
Pointer C = getDensePointer(gCtx, out, instName);
if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
}
// Matrix-Matrix daxpy
long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix
Pointer alphaPtr = dataTypePointerTo(constant);
// C <- A + alpha*B
// becomes
// C <- A
// C <- alpha*B + C
cudaMemcpy(C, A, n*sizeOfDataType, cudaMemcpyDeviceToDevice);
cudaSupportFunctions.cublasaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1);
}
else {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx);
}
// Matrix-Vector daxpy
// Note: Vector-Matrix operation is not supported
// daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
int rlenA = toInt(in1.getNumRows()); int clenA = toInt(in1.getNumColumns());
int rlenB = toInt(in2.getNumRows()); int clenB = toInt(in2.getNumColumns());
getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector",
ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA),
A, B, constant, C, rlenA, clenA, rlenB, clenB);
}
}
/**
* Implements the "solve" function for systemds Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
*
* @param ec a valid {@link ExecutionContext}
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix A
* @param in2 input matrix B
* @param outputName name of the output matrix
*/
public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
// x = solve(A, b)
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
}
GPUObject Aobj = in1.getGPUObject(gCtx);
if (isInSparseFormat(gCtx, in1))
Aobj.sparseToDense(instName);
GPUObject bobj = in2.getGPUObject(gCtx);
if (isInSparseFormat(gCtx, in2))
bobj.sparseToDense(instName);
int m = (int) in1.getNumRows();
int n = (int) in1.getNumColumns();
if ((int) in2.getNumRows() != m)
throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
if ((int) in2.getNumColumns() != 1)
throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
// Copy over the matrices and convert the dense matrices to column-major layout.
// Operations in cuSolver and cuBLAS are on column-major dense matrices
// and are destructive to the original input.
GPUObject ATobj = (GPUObject) Aobj.clone();
ATobj.denseRowMajorToColumnMajor();
Pointer A = ATobj.getDensePointer();
GPUObject bTobj = (GPUObject) bobj.clone();
bTobj.denseRowMajorToColumnMajor();
Pointer b = bTobj.getDensePointer();
// The following set of operations is done following the example in the cusolver documentation
// http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
// step 3: query working space of geqrf and ormqr
int[] lwork = {0};
cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
// step 4: compute QR factorization
Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType);
Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
Pointer devInfo = gCtx.allocate(instName, Sizeof.INT);
cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
int[] qrError = {-1};
cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
if (qrError[0] != 0) {
throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
}
// step 5: compute Q^T*B
cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
if (qrError[0] != 0) {
throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
}
// step 6: compute x = R \ Q^T*B
cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(),
cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
n, 1, dataTypePointerTo(1.0), A, m, b, m);
bTobj.denseColumnMajorToRowMajor();
// TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
// There is an avoidable copy happening here
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
cudaMemcpy(out.getGPUObject(gCtx).getDensePointer(), bTobj.getDensePointer(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
gCtx.cudaFreeHelper(instName, work, DMLScript.EAGER_CUDA_FREE);
gCtx.cudaFreeHelper(instName, tau, DMLScript.EAGER_CUDA_FREE);
ATobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
bTobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
//debugPrintMatrix(b, n, 1);
}
//********************************************************************/
//***************** END OF Builtin Functions ************************/
//********************************************************************/
/**
* Helper method to get the output block (allocated on the GPU)
* Also records performance information into {@link Statistics}
* @param ec active {@link ExecutionContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param name name of the output matrix variable (that the {@link ExecutionContext} is aware of)
* @param numRows number of rows of output matrix object
* @param numCols number of columns of output matrix object
* @return the matrix object
*/
public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) {
return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey();
}
/**
* Helper method to get the output block (allocated on the GPU)
* Also records performance information into {@link Statistics}
* @param ec active {@link ExecutionContext}
* @param numRows number of rows of matrix object
* @param numCols number of columns of matrix object
* @param nnz number of non zeroes in output matrix
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param name name of the output matrix variable (that the {@link ExecutionContext} is aware of)
* @return the matrix object
*/
private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) {
return ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz).getKey();
}
/**
* Utility to compute number of non-zeroes on the GPU
*
* @param gCtx the associated GPUContext
* @param densePtr device pointer to the dense matrix
* @param length number of elements in the dense array
* @return the number of non-zeroes
*/
public static synchronized int computeNNZ(GPUContext gCtx, Pointer densePtr, int length) {
return (int) reduceAll(gCtx, null, "compute_nnz", densePtr, length);
// This is extremely slow
// cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
// cusparseHandle cusparseHandle = gCtx.getCusparseHandle();
// if(_TMP_NNZ_ROW_PTR == null) {
// // As these are 4-byte pointers, using cudaMalloc directly so as not to include them in memory information.
// _TMP_NNZ_ROW_PTR = new Pointer();
// cudaMalloc(_TMP_NNZ_ROW_PTR, jcuda.Sizeof.INT);
// _TMP_NNZ_PTR = new Pointer();
// cudaMalloc(_TMP_NNZ_PTR, jcuda.Sizeof.INT);
// // _TMP_NNZ_ROW_PTR = gCtx.allocate(jcuda.Sizeof.INT);
// // _TMP_NNZ_PTR = gCtx.allocate(jcuda.Sizeof.INT);
// }
// // Output is in dense vector format, convert it to CSR
// LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, 1, length, matDescr, densePtr, 1,
// _TMP_NNZ_ROW_PTR, _TMP_NNZ_PTR);
// int[] nnzC = { -1 };
// cudaMemcpy(Pointer.to(nnzC), _TMP_NNZ_PTR, jcuda.Sizeof.INT, cudaMemcpyDeviceToHost);
// return nnzC[0];
}
}