/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.matrix.data;
import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
import static jcuda.runtime.JCuda.cudaMemset;
import jcuda.CudaException;
import jcuda.Pointer;
import jcuda.jcudnn.JCudnn;
import jcuda.jcudnn.cudnnActivationDescriptor;
import jcuda.jcudnn.cudnnConvolutionFwdPreference;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnStatus;
import jcuda.jcudnn.cudnnTensorDescriptor;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.instructions.gpu.context.CSRPointer;
import org.apache.sysds.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
import org.apache.sysds.utils.Statistics;
import static jcuda.jcudnn.cudnnSoftmaxAlgorithm.CUDNN_SOFTMAX_ACCURATE;
import static jcuda.jcudnn.cudnnSoftmaxMode.CUDNN_SOFTMAX_MODE_CHANNEL;
/**
* This class contains methods that invoke CuDNN operations.
*/
public class LibMatrixCuDNN extends LibMatrixCUDA {
// Currently we only use the nnz information from the sparse matrix, which is pre-computed.
// TODO: experiment with how often a dense matrix is empty, where recomputing nnz before calling CuDNN would help
private static final boolean RECOMPUTE_DENSE_NNZ = false;
protected static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
private static final Log LOG = LogFactory.getLog(LibMatrixCuDNN.class.getName());
protected static cudnnHandle getCudnnHandle(GPUContext gCtx) {
return gCtx.getCudnnHandle();
}
/**
* Does a 2D convolution followed by a bias_add
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param image input image matrix object
* @param bias bias matrix object
* @param filter filter matrix object
* @param output output matrix object
* @param N number of input images
* @param C number of channels
* @param H height of each image
* @param W width of each image
* @param K number of output "channels"
* @param R height of filter
* @param S width of filter
* @param pad_h padding height
* @param pad_w padding width
* @param stride_h stride height
* @param stride_w stride width
* @param P output height
* @param Q output width
* @param intermediateMemoryBudget intermediate memory budget
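*
* A minimal usage sketch (all variable names and dimensions are hypothetical; assumes the inputs
* are already allocated on the GPU as 32 images of shape [3, 224, 224] and 64 filters of shape
* [3, 3, 3], with pad=1 and stride=1 so that P = Q = 224):
* <pre>{@code
* LibMatrixCuDNN.conv2dBiasAdd(gCtx, "conv2d_bias_add", image, bias, filter, output,
*     32, 3, 224, 224, 64, 3, 3, 1, 1, 1, 1, 224, 224, budget);
* }</pre>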
*/
public static void conv2dBiasAdd(GPUContext gCtx, String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject output, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, double intermediateMemoryBudget) {
conv2d(gCtx, instName, image, filter, output, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, intermediateMemoryBudget);
//cudaDeviceSynchronize;
biasAdd(gCtx, instName, output, bias, output);
}
/**
* Performs im2col operation on GPU
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param image input matrix object
* @param isSparseImage is input image sparse
* @param N number of input images
* @param C number of channels
* @param H height of each image
* @param W width of each image
* @param R height of filter
* @param S width of filter
* @param pad_h padding height
* @param pad_w padding width
* @param stride_h stride height
* @param stride_w stride width
* @param P output height
* @param Q output width
* @return output im2col pointer (the caller is expected to free this pointer) or null if image is an empty matrix
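* The returned matrix is laid out as [C*R*S, N*P*Q], which is the right operand of the
* sparse-filter matrix multiply in conv2d.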
*/
private static Pointer denseIm2col(GPUContext gCtx, String instName, MatrixObject image, boolean isSparseImage, long N, long C, long H, long W,
int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) {
Pointer im2colPointer = null;
if(isSparseImage) {
CSRPointer inPointer = getSparsePointer(gCtx, image, instName);
if(inPointer.nnz < 0) {
throw new DMLRuntimeException("Unknown number of nonzeroes in denseIm2col");
}
else if(inPointer.nnz > 0) {
im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType);
getCudaKernels(gCtx).launchKernel("sparse_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(inPointer.nnz)),
inPointer.val, inPointer.rowPtr, inPointer.colInd, im2colPointer, inPointer.nnz, N,
C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w);
}
else
return null;
}
else {
im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType);
Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
getCudaKernels(gCtx).launchKernel("dense_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*C*H*W)),
imagePointer, im2colPointer, N*C*H*W,
C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w);
}
return im2colPointer;
}
/**
* Performs a 2D convolution
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param image input matrix object
* @param filter filter matrix object
* @param outputBlock output matrix object
* @param N number of input images
* @param C number of channels
* @param H height of each image
* @param W width of each image
* @param K number of output "channels"
* @param R height of filter
* @param S width of filter
* @param pad_h padding height
* @param pad_w padding width
* @param stride_h stride height
* @param stride_w stride width
* @param P output height
* @param Q output width
* @param intermediateMemoryBudget intermediate memory budget
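*
* Internally, this method chooses between three execution paths: (1) if the filter is sparse and
* the dense im2col and matmul intermediates fit within the memory budget, conv2d is computed as a
* sparse-dense matrix multiply of the [K, C*R*S] filter with the [C*R*S, N*P*Q] im2col matrix,
* followed by a reorg to [N, K*P*Q]; (2) otherwise, if the densification overhead fits within the
* budget, a single cuDNN convolution is invoked over the whole minibatch; (3) else the inputs are
* fetched and convolved one image at a time.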
*/
public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, double intermediateMemoryBudget) {
long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
long NPQ = N*P*Q;
boolean isSparseFilter = isInSparseFormat(gCtx, filter);
long filterNnz = getNnz(gCtx, instName, filter, RECOMPUTE_DENSE_NNZ);
if(filterNnz == 0) {
return; // since filter is empty
}
boolean isSparseImage = isInSparseFormat(gCtx, image);
long imageNnz = getNnz(gCtx, instName, image, RECOMPUTE_DENSE_NNZ);
if(imageNnz == 0) {
return; // since image is empty
}
Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
if(isSparseFilter &&
(OptimizerUtils.estimateSizeExactSparsity(CRS, NPQ, 1.0) + OptimizerUtils.estimateSizeExactSparsity(K, NPQ, 1.0)) <
Math.min(LibMatrixCuDNNConvolutionAlgorithm.MAX_WORKSPACE_LIMIT_BYTES, intermediateMemoryBudget)) {
// Sparse filter conv2d
// Perform dense im2col
Pointer im2colPointer = denseIm2col(gCtx, instName, image, isSparseImage,
N, C, H, W, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
// Perform matrix multiplication
CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType);
LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false);
gCtx.cudaFreeHelper(instName, im2colPointer, DMLScript.EAGER_CUDA_FREE);
// Perform reorg_knpq, a reorg operation on the matmultOutputPointer matrix with dimensions [K, NPQ],
// writing into the dstPointer matrix with dimensions [N, KPQ]
getCudaKernels(gCtx).launchKernel("reorg_knpq", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(NKPQ)),
matmultOutputPointer, dstPointer, NKPQ, NPQ, KPQ, P*Q);
gCtx.cudaFreeHelper(instName, matmultOutputPointer, DMLScript.EAGER_CUDA_FREE);
}
else {
// Filter and output are accounted as dense in the memory estimation for conv2d
double overhead = isSparseFilter ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
overhead += isSparseImage ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
// Required for LibMatrixCuDNNConvolutionAlgorithm
long workspaceLimit = (long) (intermediateMemoryBudget-overhead);
int localN = overhead <= intermediateMemoryBudget ? N : 1;
try(LibMatrixCuDNNConvolutionAlgorithm algo =
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionForwardAlgorithm(gCtx, instName,
localN, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, workspaceLimit)) {
if(localN == N) {
// Perform all-input all-channel conv2d
Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, algo);
}
else {
try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image)) {
for(int n = 0; n < N; n++) {
// Perform one-input all-channel conv2d
cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*sizeOfDataType), algo);
}
}
}
}
}
}
else {
throwCuDNNDimensionError(N, CHW, K, CRS, N, KPQ);
}
}
/**
* Performs an "softmax" operation on a matrix on the GPU
* @param ec execution context
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
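*
* A minimal usage sketch (variable names hypothetical; assumes in1 is a matrix of scores):
* <pre>{@code
* LibMatrixCuDNN.softmax(ec, gCtx, "softmax", in1, "probs");
* }</pre>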
*/
public static void softmax(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : softmax" + ", GPUContext=" + gCtx);
}
cudnnTensorDescriptor tensorDesc = allocateTensorDescriptor(toInt(in1.getNumRows()), toInt(in1.getNumColumns()), 1, 1);
Pointer srcPointer = getDensePointerForCuDNN(gCtx, in1, instName);
MatrixObject out = ec.getMatrixObject(outputName);
ec.allocateGPUMatrixObject(outputName, in1.getNumRows(), in1.getNumColumns());
out.getGPUObject(gCtx).allocateAndFillDense(0);
Pointer dstPointer = getDensePointerForCuDNN(gCtx, out, instName);
JCudnn.cudnnSoftmaxForward(gCtx.getCudnnHandle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
one(), tensorDesc, srcPointer,
zero(), tensorDesc, dstPointer);
cudnnDestroyTensorDescriptor(tensorDesc);
}
/**
* Convenience method to get tensor descriptor
* @param N number of images
* @param C number of channels
* @param H height
* @param W width
* @return cudnn tensor descriptor
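* (the caller is responsible for releasing it via cudnnDestroyTensorDescriptor)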
*/
private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
cudnnCreateTensorDescriptor(tensorDescriptor);
cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, LibMatrixCUDA.CUDNN_DATA_TYPE, N, C, H, W);
return tensorDescriptor;
}
/**
* Throws a user-friendly error that shows the limitation of invoking a cuDNN kernel
*
* @param dim1 input number of rows
* @param dim2 input number of columns
* @param dim3 output number of rows
* @param dim4 output number of columns
*/
private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4) {
throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
+ "Max CuDNN matrix size:" + maxNumElementsOfCuDNNTensor + ". "
+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "]. Output dimension: [" + dim3 + "," + dim4 + "].");
}
/**
* Throws a user-friendly error that shows the limitation of invoking a cuDNN kernel
*
* @param dim1 input1 number of rows
* @param dim2 input1 number of columns
* @param dim3 input2 number of rows
* @param dim4 input2 number of columns
* @param dim5 output number of rows
* @param dim6 output number of columns
*/
private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4, long dim5, long dim6) {
throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
+ "Max CuDNN matrix size:" + maxNumElementsOfCuDNNTensor + ". "
+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "], [" + dim3 + "," + dim4 + "]. Output dimension: [" + dim5 + "," + dim6 + "]");
}
/**
* Performs 2D convolution
* Takes up an insignificant amount of intermediate space when CONVOLUTION_PREFERENCE is set to CUDNN_CONVOLUTION_FWD_NO_WORKSPACE.
* Intermediate space is required by the filter descriptor and convolution descriptor, which are metadata structures that don't scale with the size of the input.
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param image the input matrix (or image) allocated on the GPU
* @param filter the filter allocated on the GPU
* @param output the output matrix allocated on the GPU
* @param algo cudnn algorithm wrapper
*/
private static void cudnnConv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output,
LibMatrixCuDNNConvolutionAlgorithm algo) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
}
try {
int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
algo.nchwTensorDesc, image,
algo.filterDesc, filter,
algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(),
algo.nkpqTensorDesc, output);
if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
}
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
}
}
/**
* This method computes the backpropagation errors for the filter of the convolution operation
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param image input image
* @param dout errors from next layer
* @param outputBlock output errors
* @param N number of images
* @param C number of channels
* @param H height
* @param W width
* @param K number of filters
* @param R filter height
* @param S filter width
* @param pad_h pad height
* @param pad_w pad width
* @param stride_h stride height
* @param stride_w stride width
* @param P output activation height
* @param Q output activation width
* @param intermediateMemoryBudget intermediate memory budget
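*
* When the densification overhead exceeds the memory budget, the gradient is computed one image
* at a time and accumulated into the output via the inplace_add kernel (see the loop below).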
*/
public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q, double intermediateMemoryBudget) {
long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
boolean isSparseDout = isInSparseFormat(gCtx, dout);
long doutNnz = getNnz(gCtx, instName, dout, RECOMPUTE_DENSE_NNZ);
if(doutNnz == 0) {
return; // since dout is empty
}
boolean isSparseImage = isInSparseFormat(gCtx, image);
long imageNnz = getNnz(gCtx, instName, image, RECOMPUTE_DENSE_NNZ);
if(imageNnz == 0) {
return; // since image is empty
}
if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
double overhead = isSparseImage ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
overhead += isSparseDout ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
// Required for LibMatrixCuDNNConvolutionAlgorithm
long workspaceLimit = (long) (intermediateMemoryBudget-overhead);
int localN = overhead <= intermediateMemoryBudget ? N : 1;
try(LibMatrixCuDNNConvolutionAlgorithm algo =
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionBackwardFilterAlgorithm(gCtx, instName,
localN, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, workspaceLimit)) {
if(localN == N) {
// Perform all-input all-channel conv2dBackwardFilter
Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
cudnnConv2dBackwardFilter(gCtx, instName, imagePointer, doutPointer, dwPointer, algo);
}
else {
try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
// Perform one-input conv2dBackwardFilter
Pointer tempdwPointer = gCtx.allocate(instName, KCRS*sizeOfDataType);
for(int n = 0; n < N; n++) {
cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType);
// Perform one-input conv2dBackwardFilter
cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), tempdwPointer, algo);
getCudaKernels(gCtx).launchKernel("inplace_add",
ExecutionConfig.getConfigForSimpleMatrixOperations(K, toInt(CRS)),
tempdwPointer, dwPointer, K, toInt(CRS));
}
// Deallocate temporary array used to hold the filter gradient of a single image
gCtx.cudaFreeHelper(instName, tempdwPointer, true);
}
}
}
}
else {
throwCuDNNDimensionError(N, CHW, N, KPQ, K, CRS);
}
}
/**
* This method computes the backpropagation errors for the filter of the convolution operation
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param imagePointer pointer to input image
* @param doutPointer pointer to errors from next layer
* @param dwPointer output errors
* @param algo cudnn algorithm wrapper
*/
private static void cudnnConv2dBackwardFilter(GPUContext gCtx, String instName, Pointer imagePointer, Pointer doutPointer,
Pointer dwPointer, LibMatrixCuDNNConvolutionAlgorithm algo) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
}
try {
int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), algo.nchwTensorDesc, imagePointer,
algo.nkpqTensorDesc, doutPointer, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.filterDesc, dwPointer);
if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
}
}
/**
* This method computes the backpropagation errors for the previous layer of the convolution operation
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param filter filter used in conv2d
* @param dout errors from next layer
* @param output output errors
* @param N number of images
* @param C number of channels
* @param H height
* @param W width
* @param K number of filters
* @param R filter height
* @param S filter width
* @param pad_h pad height
* @param pad_w pad width
* @param stride_h stride height
* @param stride_w stride width
* @param P output activation height
* @param Q output activation width
* @param intermediateMemoryBudget intermediate memory budget
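*
* Shape summary, as used by this method: the filter is [K, C*R*S], dout is [N, K*P*Q], and the
* computed gradient with respect to the input is [N, C*H*W].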
*/
public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout,
MatrixObject output, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q, double intermediateMemoryBudget) {
long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
boolean isSparseFilter = isInSparseFormat(gCtx, filter);
long filterNnz = getNnz(gCtx, instName, filter, RECOMPUTE_DENSE_NNZ);
if(filterNnz == 0) {
return; // since filter is empty
}
boolean isSparseDout = isInSparseFormat(gCtx, dout);
long doutNnz = getNnz(gCtx, instName, dout, RECOMPUTE_DENSE_NNZ);
if(doutNnz == 0) {
return; // since dout is empty
}
if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
double overhead = isSparseFilter ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
overhead += isSparseDout ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
Pointer dstPointer = getDensePointerForCuDNN(gCtx, output, instName);
// Required for LibMatrixCuDNNConvolutionAlgorithm
long workspaceLimit = (long) (intermediateMemoryBudget-overhead);
int localN = overhead <= intermediateMemoryBudget ? N : 1;
try(LibMatrixCuDNNConvolutionAlgorithm algo =
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionBackwardDataAlgorithm(gCtx, instName,
localN, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, workspaceLimit)) {
if(localN == N) {
// Perform all-input all-channel conv2dBackwardData
Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
cudnnConv2dBackwardData(gCtx, instName, filterPointer, doutPointer, dstPointer, algo);
}
else {
try(LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
for(int n = 0; n < N; n++) {
cudnnConv2dBackwardData(gCtx, instName, filterPointer, doutFetcher.getNthRow(n), dstPointer.withByteOffset(n*CHW*sizeOfDataType), algo);
}
}
}
}
}
else {
throwCuDNNDimensionError(N, CHW, N, KPQ, K, CRS);
}
}
/**
* This method computes the backpropagation errors for the previous layer of the convolution operation
*
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param w pointer to filter used in conv2d
* @param dy pointer to errors from next layer
* @param dx pointer to output errors
* @param algo cudnn algorithm wrapper
*/
private static void cudnnConv2dBackwardData(GPUContext gCtx, String instName, Pointer w, Pointer dy,
Pointer dx, LibMatrixCuDNNConvolutionAlgorithm algo) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
}
try {
int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), algo.filterDesc, w,
algo.nkpqTensorDesc, dy, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.nchwTensorDesc, dx);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
}
}
/**
* Performs pooling on the GPU by exploiting cudnnPoolingForward(...)
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param image image as matrix object
* @param outputBlock output matrix
* @param N batch size
* @param C number of channels
* @param H height of image
* @param W width of image
* @param K number of filters
* @param R height of filter
* @param S width of filter
* @param pad_h vertical padding
* @param pad_w horizontal padding
* @param stride_h vertical stride
* @param stride_w horizontal stride
* @param P output height, i.e. (H + 2*pad_h - R)/stride_h + 1
* @param Q output width, i.e. (W + 2*pad_w - S)/stride_w + 1
* @param poolingType type of pooling
* @param intermediateMemoryBudget intermediate memory budget
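*
* A minimal usage sketch (variable names and dimensions hypothetical) for 2x2 max pooling with
* stride 2 and no padding on 32 images of shape [3, 224, 224], giving P = Q = 112:
* <pre>{@code
* LibMatrixCuDNN.pooling(gCtx, "maxpooling", image, out,
*     32, 3, 224, 224, 3, 2, 2, 0, 0, 2, 2, 112, 112, PoolingType.MAX, budget);
* }</pre>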
*/
public static void pooling(GPUContext gCtx, String instName, MatrixObject image,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q, PoolingType poolingType, double intermediateMemoryBudget) {
long CHW = C*H*W; long CPQ = C*P*Q;
long NCHW = N*CHW; long NCPQ = N*CPQ;
if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
// Image is accounted as dense in the memory estimation for pooling
long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
if(overhead <= intermediateMemoryBudget) {
Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
cudnnPoolingHelper(gCtx, instName, x, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
}
else {
try( LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image) ) {
for(int n = 0; n < N; n++) {
cudnnPoolingHelper(gCtx, instName, imgFetcher.getNthRow(n), y.withByteOffset(n*CPQ*sizeOfDataType), 1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
}
}
}
}
else {
throwCuDNNDimensionError(N, CHW, N, CPQ);
}
}
private static void cudnnPoolingHelper(GPUContext gCtx, String instName, Pointer x,
Pointer y, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q, PoolingType poolingType) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : perform pooling" + ", GPUContext=" + gCtx);
}
try(LibMatrixCuDNNPoolingDescriptors desc =
LibMatrixCuDNNPoolingDescriptors.cudnnPoolingDescriptors(gCtx, instName, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, P, Q, poolingType)) {
int status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
}
}
/**
* Performs poolingBackward on the GPU by exploiting cudnnPoolingBackward(...)
* This method computes the backpropagation errors for the previous layer of the pooling operation
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param image image as matrix object
* @param dout delta matrix, errors from the next layer
* @param maxpoolOutput (optional and can be null) output of maxpool forward function
* @param outputBlock output matrix
* @param N batch size
* @param C number of channels
* @param H height of image
* @param W width of image
* @param K number of filters
* @param R height of filter
* @param S width of filter
* @param pad_h vertical padding
* @param pad_w horizontal padding
* @param stride_h vertical stride
* @param stride_w horizontal stride
* @param P output height, i.e. (H + 2*pad_h - R)/stride_h + 1
* @param Q output width, i.e. (W + 2*pad_w - S)/stride_w + 1
* @param poolingType type of pooling
* @param intermediateMemoryBudget intermediate memory budget
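*
* Note: if maxpoolOutput is null, the forward pooling output is recomputed internally before
* invoking cudnnPoolingBackward, at the cost of an extra N*C*P*Q intermediate buffer.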
*/
@SuppressWarnings("resource")
public static void poolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
MatrixObject maxpoolOutput, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q, PoolingType poolingType, double intermediateMemoryBudget) {
long CHW = C*H*W; long CPQ = C*P*Q;
long NCHW = N*CHW; long NCPQ = N*CPQ;
final boolean isMaxPoolOutputProvided = maxpoolOutput != null;
if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
// Image and dout are accounted as dense in the memory estimation for poolingBackward
long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, instName);
if(overhead <= intermediateMemoryBudget) {
Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
Pointer y = isMaxPoolOutputProvided ? getDensePointerForCuDNN(gCtx, maxpoolOutput, instName) : null;
cudnnPoolingBackwardHelper(gCtx, instName, x, dy, y, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
}
else {
LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout);
LibMatrixCuDNNInputRowFetcher maxPoolOutFetcher = isMaxPoolOutputProvided ? new LibMatrixCuDNNInputRowFetcher(gCtx, instName, maxpoolOutput) : null;
for(int n = 0; n < N; n++) {
Pointer x = imgFetcher.getNthRow(n);
Pointer dy = doutFetcher.getNthRow(n);
Pointer y = isMaxPoolOutputProvided ? maxPoolOutFetcher.getNthRow(n) : null;
cudnnPoolingBackwardHelper(gCtx, instName, x, dy, y,
dx.withByteOffset(n*CHW*sizeOfDataType),
1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
}
// Close the row fetchers and deallocate their temporary row buffers
imgFetcher.close();
doutFetcher.close();
if(isMaxPoolOutputProvided)
maxPoolOutFetcher.close();
}
}
else {
throwCuDNNDimensionError(N, CHW, N, CPQ);
}
}
private static void cudnnPoolingBackwardHelper(GPUContext gCtx, String instName,
Pointer x, Pointer dy, Pointer y, Pointer dx,
int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q, PoolingType poolingType) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
}
boolean isMaxPoolOutputProvided = (y != null);
try(LibMatrixCuDNNPoolingDescriptors desc =
LibMatrixCuDNNPoolingDescriptors.cudnnPoolingBackwardDescriptors(gCtx, instName, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, P, Q, poolingType)) {
int status;
if(!isMaxPoolOutputProvided) {
long numBytes = ((long) N)*C*P*Q*sizeOfDataType; // cast to avoid int overflow for large minibatches
y = gCtx.allocate(instName, numBytes);
status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
}
status = cudnnPoolingBackward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), desc.dxDesc, dx);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
}
finally {
if(!isMaxPoolOutputProvided)
gCtx.cudaFreeHelper(instName, y, DMLScript.EAGER_CUDA_FREE);
}
}
private static void cudnnReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) {
try {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
}
cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName);
cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
cudnnCreateActivationDescriptor(activationDescriptor);
double dummy = -1;
cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor,
one(), srcTensorDesc, srcData,
zero(), dstTensorDesc, dstData);
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
}
}
/**
* Performs the relu operation on the GPU.
* @param ec currently active {@link ExecutionContext}
* @param gCtx a valid {@link GPUContext}
* @param instName the invoking instruction's name for recording {@link Statistics}.
* @param in input matrix
* @param outputName name of the output matrix
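*
* For inputs at or above the cuDNN tensor limit of 2 giga-elements, a custom relu kernel is
* launched instead of cudnnActivationForward (see below).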
*/
public static void relu(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName) {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
long N = in.getNumRows();
long CHW = in.getNumColumns();
Pointer dstData = getDenseOutputPointer(ec, gCtx, instName, outputName, in.getNumRows(), in.getNumColumns());
if(N*CHW >= maxNumElementsOfCuDNNTensor) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
}
// Invokes relu(double* A, double* ret, int rlen, int clen)
Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
getCudaKernels(gCtx).launchKernel("relu",
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
srcData, dstData, toInt(N), toInt(CHW));
}
else {
cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
cudnnCreateTensorDescriptor(tensorDescriptor);
cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
cudnnReLU(gCtx, instName, in, dstData, tensorDescriptor);
cudnnDestroyTensorDescriptor(tensorDescriptor);
}
}
static Pointer getDenseInputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String inputName,
long numRows, long numCols) throws DMLRuntimeException {
MatrixObject output = ec.getMatrixInputForGPUInstruction(inputName, instName);
return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
}
static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
long numRows, long numCols) throws DMLRuntimeException {
MatrixObject output = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocates the dense output matrix
return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
}
/**
* Computes the forward pass for an LSTM layer with M neurons.
* The input data has N sequences of T examples, each with D features.
*
* @param ec execution context
* @param gCtx gpu context
* @param instName name of the instruction
* @param X input matrix pointer
* @param wPointer weight matrix pointer
* @param out0 Outputs from previous timestep
* @param c0 Initial cell state
* @param return_sequences Whether to return `out` at all timesteps, or just for the final timestep.
* @param outputName name of the output variable. If `return_sequences` is true, it contains the outputs for all timesteps.
* @param cyName name of the output cell state. Cell state for final timestep.
* @param N minibatch size
* @param M hidden size
* @param D number of features
* @param T sequence length
* @throws DMLRuntimeException if error
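*
* A minimal usage sketch (all pointers hypothetical; assumed to be valid device allocations of
* the documented shapes, with the weights in cuDNN-canonical layout):
* <pre>{@code
* LibMatrixCuDNN.lstm(ec, gCtx, "lstm", X, wPointer, out0, c0, true, "out", "cy", N, M, D, T);
* }</pre>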
*/
public static void lstm(ExecutionContext ec, GPUContext gCtx, String instName,
Pointer X, Pointer wPointer, Pointer out0, Pointer c0, boolean return_sequences,
String outputName, String cyName, int N, int M, int D, int T) throws DMLRuntimeException {
singleLayerUnidirectionalRNNForward(ec, gCtx, instName, X, out0, c0, wPointer, outputName, cyName, "lstm", return_sequences, N, M, D, T);
}
private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
Pointer x, Pointer hx, Pointer cx, Pointer wPointer, // input
String outputName, String cyName, // output
String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
// Get output pointers
Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
algo.xDesc, x,
algo.hxDesc, hx,
algo.cxDesc, cx,
algo.wDesc, wPointer,
algo.yDesc, cudnnYPointer,
algo.hyDesc, hyPointer,
algo.cyDesc, cyPointer,
algo.workSpace, algo.sizeInBytes,
algo.reserveSpace, algo.reserveSpaceSizeInBytes);
}
if(return_sequences) {
gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
Pointer sysdsYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
sysdsYPointer, cudnnYPointer, N, T, M, N*T*M);
}
gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
}
public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName, // input
String dxName, String dwName, String dbName, String dhxName, String dcxName, // output
boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
// Transform the input dout and prepare them for cudnnRNNBackwardData
Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
int size = return_sequences ? N*T*M : N*M;
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
ExecutionConfig.getConfigForSimpleVectorOperations(size),
getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
dy, N, T, M, size, return_sequences ? 1 : 0);
ec.releaseMatrixInputForGPUInstruction(doutName);
// Allocate intermediate pointers computed by forward
Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
algo.xDesc, x,
algo.hxDesc, hx,
algo.cxDesc, cx,
algo.wDesc, wPointer,
algo.yDesc, yPointer,
algo.hyDesc, new Pointer(),
algo.cyDesc, new Pointer(),
algo.workSpace, algo.sizeInBytes,
algo.reserveSpace, algo.reserveSpaceSizeInBytes);
Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T,
algo.yDesc, yPointer,
// ----------------------
// Additional inputs:
algo.dyDesc, dy,
algo.dhyDesc, new Pointer(),
algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
// ----------------------
algo.wDesc, wPointer,
algo.hxDesc, hx,
algo.cxDesc, cx,
// ----------------------
// Output:
algo.dxDesc, cudnnDx,
algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M),
algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
// ----------------------
algo.workSpace, algo.sizeInBytes,
algo.reserveSpace, algo.reserveSpaceSizeInBytes);
gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
ec.releaseMatrixInputForGPUInstruction(dcyName);
ec.releaseMatrixOutputForGPUInstruction(dhxName);
ec.releaseMatrixOutputForGPUInstruction(dcxName);
Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
smlDx, cudnnDx, N, D, T*D, N*T*D);
ec.releaseMatrixOutputForGPUInstruction(dxName);
gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
// -------------------------------------------------------------------------------------------
Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T,
algo.xDesc, x,
algo.hxDesc, hx,
algo.yDesc, yPointer,
algo.workSpace, algo.sizeInBytes,
algo.dwDesc, cudnnDwPointer,
algo.reserveSpace, algo.reserveSpaceSizeInBytes);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M),
getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
ec.releaseMatrixOutputForGPUInstruction(dwName);
ec.releaseMatrixOutputForGPUInstruction(dbName);
// -------------------------------------------------------------------------------------------
gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
}
}
/**
* Performs the forward BatchNormalization layer computation for training
* @param gCtx a valid {@link GPUContext}
* @param instName name of the instruction
* @param image input image
* @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
* @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
* @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
* @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
* @param ret (output) normalized input
* @param retRunningMean (output) running mean accumulated during training phase: shape [1, C, 1, 1]
* @param retRunningVar (output) running variance accumulated during training phase: shape [1, C, 1, 1]
* @param epsilon epsilon value used in the batch normalization formula
* @param exponentialAverageFactor factor used in the moving average computation
* @param resultSaveMean (output) batch mean saved for reuse in the backward pass: shape [1, C, 1, 1]
* @param resultSaveInvVariance (output) batch inverse variance saved for reuse in the backward pass: shape [1, C, 1, 1]
* @throws DMLRuntimeException if error occurs
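*
* Per cuDNN semantics, the running statistics are updated as an exponential moving average, e.g.
* retRunningMean = (1 - exponentialAverageFactor) * runningMean + exponentialAverageFactor * batchMean.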
*/
public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar,
double epsilon, double exponentialAverageFactor,
MatrixObject resultSaveMean, MatrixObject resultSaveInvVariance) throws DMLRuntimeException {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
}
int N = toInt(image.getNumRows());
int C = toInt(scale.getNumRows());
long CHW = image.getNumColumns();
validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
new MatrixObject[] {image}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
// Get underlying dense pointer
Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
// To allow for copy-on-write
Pointer retRunningMeanPtr = getDensePointerForCuDNN(gCtx, retRunningMean, instName);
Pointer retRunningVarPtr = getDensePointerForCuDNN(gCtx, retRunningVar, instName);
cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * sizeOfDataType, cudaMemcpyDeviceToDevice);
cudaMemcpy(retRunningVarPtr, runningVarPtr, C * sizeOfDataType, cudaMemcpyDeviceToDevice);
Pointer resultSaveMeanPtr = getDensePointerForCuDNN(gCtx, resultSaveMean, instName);
Pointer resultSaveInvVariancePtr = getDensePointerForCuDNN(gCtx, resultSaveInvVariance, instName);
checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx),
jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
retRunningMeanPtr, retRunningVarPtr, epsilon, resultSaveMeanPtr, resultSaveInvVariancePtr));
}
/**
* Performs the forward BatchNormalization layer computation for inference
* @param gCtx a valid {@link GPUContext}
* @param instName name of the instruction
* @param image input image
* @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
* @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
* @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
* @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
* @param ret normalized input
* @param epsilon epsilon value used in the batch normalization formula
* @throws DMLRuntimeException if error occurs
*/
public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, double epsilon) throws DMLRuntimeException {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
}
int N = toInt(image.getNumRows());
int C = toInt(scale.getNumRows());
long CHW = image.getNumColumns();
validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
new MatrixObject[] {image}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
// Get underlying dense pointer
Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
Pointer retPtr = getDensePointerForCuDNN(gCtx, ret, instName);
Pointer biasPtr = getDensePointerForCuDNN(gCtx, bias, instName);
Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
Pointer runningMeanPtr = getDensePointerForCuDNN(gCtx, runningMean, instName);
Pointer runningVarPtr = getDensePointerForCuDNN(gCtx, runningVar, instName);
checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx),
jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
scaleTensorDesc, scalePtr, biasPtr,
runningMeanPtr, runningVarPtr, epsilon));
}
/**
* This method computes the backpropagation errors for image, scale and bias of batch normalization layer
* @param gCtx a valid {@link GPUContext}
* @param instName name of the instruction
* @param image input image
* @param dout input errors of shape C, H, W
* @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
* @param dX (output) backpropagation errors for previous layer
* @param dScale backpropagation error for scale
* @param dBias backpropagation error for bias
* @param epsilon epsilon value used in the batch normalization formula
* @param resultSaveMean (input) batch mean saved by the forward training pass: shape [1, C, 1, 1]
* @param resultSaveInvVariance (input) batch inverse variance saved by the forward training pass: shape [1, C, 1, 1]
* @throws DMLRuntimeException if error occurs
*/
public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
MatrixObject scale, MatrixObject dX, MatrixObject dScale, MatrixObject dBias,
double epsilon, MatrixObject resultSaveMean, MatrixObject resultSaveInvVariance) throws DMLRuntimeException {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
}
int N = toInt(image.getNumRows());
int C = toInt(scale.getNumRows());
long CHW = image.getNumColumns();
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
new MatrixObject[] {image, dout}, new MatrixObject[] {dX});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(1, C, 1, 1);
// Get underlying dense pointer
Pointer imagePtr = getDensePointerForCuDNN(gCtx, image, instName);
Pointer doutPtr = getDensePointerForCuDNN(gCtx, dout, instName);
Pointer scalePtr = getDensePointerForCuDNN(gCtx, scale, instName);
Pointer dXPtr = getDensePointerForCuDNN(gCtx, dX, instName);
Pointer dScalePtr = getDensePointerForCuDNN(gCtx, dScale, instName);
Pointer dBiasPtr = getDensePointerForCuDNN(gCtx, dBias, instName);
Pointer resultSaveMeanPtr = getDensePointerForCuDNN(gCtx, resultSaveMean, instName);
Pointer resultSaveInvVariancePtr = getDensePointerForCuDNN(gCtx, resultSaveInvVariance, instName);
// Pass the saved mean and inverse variance from the forward training pass to avoid recomputing the batch statistics
checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx),
jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL, one(), zero(), one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, dXPtr,
scaleTensorDesc, scalePtr, dScalePtr, dBiasPtr, epsilon, resultSaveMeanPtr, resultSaveInvVariancePtr));
}
private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
if(scale.getNumRows() != C || scale.getNumColumns() != 1) {
throw new DMLRuntimeException("Incorrect dimensions for scale. Expected a column vector of size " + C + ", but found [" + scale.getNumRows() + ", " + scale.getNumColumns() + "]");
}
if(bias.getNumRows() != C || bias.getNumColumns() != 1) {
throw new DMLRuntimeException("Incorrect dimensions for bias. Expected a column vector of size " + C + ", but found [" + bias.getNumRows() + ", " + bias.getNumColumns() + "]");
}
if(runningMean.getNumRows() != C || runningMean.getNumColumns() != 1) {
throw new DMLRuntimeException("Incorrect dimensions for running mean. Expected a column vector of size " + C + ", but found [" + runningMean.getNumRows() + ", " + runningMean.getNumColumns() + "]");
}
if(runningVar.getNumRows() != C || runningVar.getNumColumns() != 1) {
throw new DMLRuntimeException("Incorrect dimensions for running variance. Expected a column vector of size " + C + ", but found [" + runningVar.getNumRows() + ", " + runningVar.getNumColumns() + "]");
}
}
/**
* Convenience utility for batch normalization that returns an NCHW descriptor
* @param gCtx a valid {@link GPUContext}
* @param N number of images
* @param C number of channels
* @param CHW channels*height*width
* @param input input matrix objects
* @param output output matrix objects
* @return an NCHW tensor descriptor
* @throws DMLRuntimeException if error occurs
*/
private static cudnnTensorDescriptor allocateNCHWDescriptors(GPUContext gCtx, int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
cudnnTensorDescriptor ret = null; // A single descriptor suffices since all inputs and outputs share the same [N, C, H, W] shape
if(CHW > ((long)Integer.MAX_VALUE)*C) {
throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
}
// H and W are not tracked individually; fall back to [HW, 1]
int HW = (int) (CHW / C);
int H = HW; int W = 1;
double potentialH = Math.sqrt(HW);
if(potentialH == ((int) potentialH)) {
H = (int) potentialH;
W = H;
}
// The exact H and W are unknown; any factorization of HW works here since spatial batch normalization only distinguishes N and C.
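// Example (hypothetical): CHW = 3*224*224 with C = 3 gives HW = 50176, whose square root 224
// is integral, so the descriptor is created as [N, 3, 224, 224].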
ret = new cudnnTensorDescriptor();
cudnnCreateTensorDescriptor(ret);
cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, N, C, H, W);
return ret;
}
/**
* Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
*
* @param gCtx a valid {@link GPUContext}
* @param image input matrix object
* @param instName name of the instruction
* @return jcuda pointer
*/
protected static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) {
long numElems = image.getNumRows()*image.getNumColumns();
if(numElems > maxNumElementsOfCuDNNTensor) {
throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
}
return getDensePointer(gCtx, image, instName);
}
/**
* Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
*
* @param gCtx a valid {@link GPUContext}
* @param image input matrix object
* @param instName name of the instruction
* @param numRows expected number of rows
* @param numCols expected number of columns
* @return jcuda pointer
* @throws DMLRuntimeException if error occurs while sparse to dense conversion
*/
public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
long numElems = image.getNumRows()*image.getNumColumns();
if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
throw new DMLRuntimeException("Expected input of size:[" + numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "].");
}
else if(numElems > maxNumElementsOfCuDNNTensor) {
throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
}
Pointer ptr = getDensePointer(gCtx, image, instName);
long sizeOfPtr = gCtx.getMemoryManager().getSizeAllocatedGPUPointer(ptr);
if(sizeOfPtr != numElems*sizeOfDataType) {
throw new DMLRuntimeException("Incorrect pointer: expected size:" + (numElems*sizeOfDataType) + ", but found " + sizeOfPtr);
}
return ptr;
}
/**
* Convenience method for checking the status of CuDNN kernel.
*
* @param status status returned by CuDNN
*/
protected static void checkStatus(int status) {
if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
}