| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.instructions.gpu; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.sysds.api.DMLScript; |
| import org.apache.sysds.runtime.controlprogram.caching.MatrixObject; |
| import org.apache.sysds.runtime.controlprogram.context.ExecutionContext; |
| import org.apache.sysds.runtime.instructions.GPUInstructionParser; |
| import org.apache.sysds.runtime.instructions.Instruction; |
| import org.apache.sysds.runtime.instructions.cp.CPInstruction; |
| import org.apache.sysds.runtime.instructions.gpu.context.GPUContext; |
| import org.apache.sysds.runtime.matrix.operators.Operator; |
| import org.apache.sysds.utils.Statistics; |
| |
| public abstract class GPUInstruction extends Instruction { |
| private static final Log LOG = LogFactory.getLog(GPUInstruction.class.getName()); |
| |
| public enum GPUINSTRUCTION_TYPE { |
| AggregateUnary, |
| AggregateBinary, |
| RelationalBinary, |
| Dnn, |
| MMTSJ, |
| Reorg, |
| MatrixReshape, |
| Append, |
| ArithmeticBinary, |
| BuiltinUnary, |
| BuiltinBinary, |
| Builtin, |
| MatrixIndexing |
| } |
| |
| // Memory/conversions |
| public final static String MISC_TIMER_HOST_TO_DEVICE = "H2D"; // time spent in bringing data to gpu (from host) |
| public final static String MISC_TIMER_DEVICE_TO_HOST = "D2H"; // time spent in bringing data from gpu (to host) |
| public final static String MISC_TIMER_DEVICE_TO_DEVICE = "D2D"; // time spent in copying data from one region on the device to another |
| public final static String MISC_TIMER_SPARSE_TO_DENSE = "s2d"; // time spent in converting data from sparse to dense |
| public final static String MISC_TIMER_DENSE_TO_SPARSE = "d2s"; // time spent in converting data from dense to sparse |
| public final static String MISC_TIMER_ROW_TO_COLUMN_MAJOR = "r2c"; // time spent in converting data from row major to column major |
| public final static String MISC_TIMER_COLUMN_TO_ROW_MAJOR = "c2r"; // time spent in converting data from column major to row major |
| public final static String MISC_TIMER_OBJECT_CLONE = "clone";// time spent in cloning (deep copying) a GPUObject instance |
| public final static String MISC_TIMER_CUDA_SYNC = "sync"; // time spent in device sync |
| |
| public final static String MISC_TIMER_CUDA_FREE = "f"; // time spent in calling cudaFree |
| public final static String MISC_TIMER_ALLOCATE = "a"; // time spent to allocate memory on gpu |
| public final static String MISC_TIMER_EVICT = "evict"; // time spent in eviction on gpu |
| public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ad"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE) |
| public final static String MISC_TIMER_ALLOCATE_SPARSE_OUTPUT = "as"; // time spent to allocate sparse output (recorded differently than MISC_TIMER_ALLOCATE) |
| public final static String MISC_TIMER_SET_ZERO = "az"; // time spent to allocate |
| public final static String MISC_TIMER_REUSE = "r"; // time spent in reusing already allocated memory on GPU (mainly for the count) |
| |
| // Matmult instructions |
| public final static String MISC_TIMER_SPARSE_ALLOCATE_LIB = "Msao"; // time spend in allocating for sparse matrix output |
| public final static String MISC_TIMER_DENSE_DOT_LIB = "Mddot"; // time spent in dot product of 2 dense vectors |
| public final static String MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB = "Mdvdm"; // time spent in matrix mult of dense vector and dense matrix |
| public final static String MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB = "Mdmdv"; // time spent in matrix mult of dense matrix and dense vector |
| public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = "Mdmdm"; // time spent in matrix mult of dense matrices |
| public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = "Msmdv"; // time spent in matrix mult of sparse matrix and dense vector |
| public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm"; // time spent in matrix mult of sparse matrices |
| public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB = "Msmdm"; // time spent in matrix mult of sparse matrices |
| public final static String MISC_TIMER_SYRK_LIB = "Msyrk"; // time spent in symmetric rank-k update |
| |
| // Other BLAS instructions |
| public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy |
| public final static String MISC_TIMER_QR_BUFFER = "qr_buffer";// time spent in calculating buffer needed to perform QR |
| public final static String MISC_TIMER_QR = "qr"; // time spent in doing QR |
| public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr |
| public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm |
| |
| // Transpose |
| public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B)) |
| public final static String MISC_TIMER_DENSE_DGEAM_LIB = "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B)) |
| public final static String MISC_TIMER_TRANSPOSE_LIB = "dtl"; // time spent on dense transpose, this includes allocation of output |
| |
| // Custom kernels |
| public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck"; // time spent in matrix-matrix cellwise operations |
| public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = "cask"; // time spent in compareAndSet kernel |
| public final static String MISC_TIMER_EXP_KERNEL = "expk"; // time spent in the exp kernel |
| public final static String MISC_TIMER_SQRT_KERNEL = "sqrtk"; // time spent in the sqrt kernel |
| public final static String MISC_TIMER_ROUND_KERNEL = "roundk"; // time spent in the round kernel |
| public final static String MISC_TIMER_ABS_KERNEL = "absk"; // time spent in the abs kernel |
| public final static String MISC_TIMER_LOG_KERNEL = "logk"; // time spent in the log kernel |
| public final static String MISC_TIMER_FLOOR_KERNEL = "floork"; // time spent in the floor kernel |
| public final static String MISC_TIMER_CEIL_KERNEL = "ceilk"; // time spent in the ceil kernel |
| public final static String MISC_TIMER_SIN_KERNEL = "sink"; // time spent in the sin kernel |
| public final static String MISC_TIMER_COS_KERNEL = "cosk"; // time spent in the cos kernel |
| public final static String MISC_TIMER_TAN_KERNEL = "tank"; // time spent in the tan kernel |
| public final static String MISC_TIMER_SINH_KERNEL = "sinhk"; // time spent in the sinh kernel |
| public final static String MISC_TIMER_COSH_KERNEL = "coshk"; // time spent in the cosh kernel |
| public final static String MISC_TIMER_TANH_KERNEL = "tanhk"; // time spent in the tanh kernel |
| public final static String MISC_TIMER_ASIN_KERNEL = "asink"; // time spent in the asin kernel |
| public final static String MISC_TIMER_ACOS_KERNEL = "acosk"; // time spent in the acos kernel |
| public final static String MISC_TIMER_ATAN_KERNEL = "atank"; // time spent in the atan kernel |
| public final static String MISC_TIMER_SIGN_KERNEL = "signk"; // time spent in the sign kernel |
| public final static String MISC_TIMER_SIGMOID_KERNEL = "sigmk"; // time spent in the sigmoid kernel |
| public final static String MISC_TIMER_CBIND_KERNEL = "cbindk"; // time spent in the cbind kernel |
| public final static String MISC_TIMER_RBIND_KERNEL = "rbindk"; // time spent in the rbind kernel |
| |
| public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv";// time spent in the daxpy_matrix_vector kernel |
| public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel |
| public final static String MISC_TIMER_FILL_KERNEL = "fillk"; // time spent in the "fill" kernel |
| public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = "msk"; // time spent in the matrix scalar kernel |
| public final static String MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel |
| public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel |
| public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel |
| |
| public final static String MISC_TIMER_RIX_DENSE_OP = "drix"; // time spent in the right indexing dense kernel |
| public final static String MISC_TIMER_RIX_SPARSE_DENSE_OP_ROWWISE = "sdrixr"; // time spent in the right indexing sparse dense kernel (row-wise parallelism) |
| public final static String MISC_TIMER_RIX_SPARSE_DENSE_OP_NNZ = "sdrixn"; // time spent in the right indexing sparse dense kernel (nnz parallelism) |
| |
| // Deep learning operators |
| public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward |
| public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB = "nncf"; // time spent in cudnnConvolutionForward |
| public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB ="nncbf"; // time spent in cudnnConvolutionBackwardFilter |
| public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData |
| public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward |
| public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward |
| public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add, bias_multiply cuda kernel |
| public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= "nnrbk"; // time spent in relu_backward cuda kernel |
| public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel |
| public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call |
| public final static String MISC_TIMER_CUDNN_CLEANUP = "nnc"; // time spent in cleanup for cudnn call |
| public final static String MISC_TIMER_DENSE_IM2COL_KERNEL= "nndim2c"; // time spent in dense im2col cuda kernel |
| public final static String MISC_TIMER_SPARSE_IM2COL_KERNEL= "nnsim2c"; // time spent in sparse im2col cuda kernel |
| public final static String MISC_TIMER_DENSE_REORG_KNPQ_KERNEL= "nndrknpq"; // time spent in dense reorg_knpq cuda kernel |
| // cumulative operators |
| public final static String MISC_TIMER_CUMULATIVE_SCAN_KERNEL = "cumk"; // time spent in cumulative scan cuda kernel |
| public final static String MISC_TIMER_CUMULATIVE_SUMPROD_KERNEL = "cumSumProdk"; // time spent in cumulative sum-product cuda kernel |
| |
| protected GPUINSTRUCTION_TYPE _gputype; |
| protected Operator _optr; |
| |
| protected boolean _requiresLabelUpdate = false; |
| |
| private GPUInstruction(String opcode, String istr) { |
| instString = istr; |
| |
| // prepare opcode and update requirement for repeated usage |
| instOpcode = opcode; |
| _requiresLabelUpdate = super.requiresLabelUpdate(); |
| } |
| |
| protected GPUInstruction(Operator op, String opcode, String istr) { |
| this(opcode, istr); |
| _optr = op; |
| } |
| |
| @Override |
| public IType getType() { |
| return IType.GPU; |
| } |
| |
| public GPUINSTRUCTION_TYPE getGPUInstructionType() { |
| return _gputype; |
| } |
| |
| @Override |
| public boolean requiresLabelUpdate() { |
| return _requiresLabelUpdate; |
| } |
| |
| @Override |
| public String getGraphString() { |
| return getOpcode(); |
| } |
| |
| @Override |
| public Instruction preprocessInstruction(ExecutionContext ec) { |
| //default preprocess behavior (e.g., debug state) |
| Instruction tmp = super.preprocessInstruction(ec); |
| //instruction patching |
| if( tmp.requiresLabelUpdate() ) { //update labels only if required |
| //note: no exchange of updated instruction as labels might change in the general case |
| String updInst = CPInstruction.updateLabels(tmp.toString(), ec.getVariables()); |
| tmp = GPUInstructionParser.parseSingleInstruction(updInst); |
| } |
| return tmp; |
| } |
| |
| @Override |
| public abstract void processInstruction(ExecutionContext ec); |
| |
| @Override |
| public void postprocessInstruction(ExecutionContext ec) { |
| if(DMLScript.SYNCHRONIZE_GPU) { |
| jcuda.runtime.JCuda.cudaDeviceSynchronize(); |
| } |
| if(LOG.isDebugEnabled()) { |
| for(GPUContext gpuCtx : ec.getGPUContexts()) |
| if(gpuCtx != null) |
| gpuCtx.printMemoryInfo(getOpcode()); |
| } |
| } |
| |
| /** |
| * Helper method to get the input block (allocated on the GPU) |
| * Also records performance information into {@link Statistics} |
| * @param ec active {@link ExecutionContext} |
| * @param name name of input matrix (that the {@link ExecutionContext} is aware of) |
| * @return the matrix object |
| */ |
| protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, String name) { |
| return ec.getMatrixInputForGPUInstruction(name, getExtendedOpcode()); |
| } |
| |
| /** |
| * Helper method to get the output block (allocated on the GPU) |
| * Also records performance information into {@link Statistics} |
| * @param ec active {@link ExecutionContext} |
| * @param name name of input matrix (that the {@link ExecutionContext} is aware of) |
| * @param numRows number of rows of matrix object |
| * @param numCols number of columns of matrix object |
| * @return the matrix object |
| */ |
| protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols) { |
| return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey(); |
| } |
| } |