/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.instructions.gpu.context;
import jcuda.Pointer;
import jcuda.jcusparse.cusparseDirection;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockCOO;
import org.apache.sysds.runtime.data.SparseBlockCSR;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.utils.GPUStatistics;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.JCuda.cudaMemset;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
/**
* Handle to a matrix block on the GPU
*/
public class GPUObject {
private static final Log LOG = LogFactory.getLog(GPUObject.class.getName());
/**
* GPUContext that owns this GPUObject
*/
private final GPUContext gpuContext;
/**
* Pointer to the underlying dense matrix block on GPU
*/
Pointer jcudaDenseMatrixPtr = null;
/**
* Pointer to the underlying sparse matrix block on GPU
*/
private CSRPointer jcudaSparseMatrixPtr = null;
/**
* whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host
*/
protected boolean dirty = false;
/**
* number of read locks on this object (this GPUObject is being used in a current instruction)
*/
protected LongAdder readLocks = new LongAdder();
/**
* whether a write lock is held on this object (this GPUObject is being used in a current instruction)
*/
protected boolean writeLock = false;
/**
* Timestamp, needed by {@link GPUContext#evict(long)}
*/
AtomicLong timestamp = new AtomicLong();
/**
* Whether this block is in sparse format
*/
protected boolean isSparse = false;
/**
* Enclosing {@link MatrixObject} instance
*/
MatrixObject mat = null;
/**
* Shadow buffer instance
*/
final ShadowBuffer shadowBuffer;
// ----------------------------------------------------------------------
// Methods used to access, set and check jcudaDenseMatrixPtr
/**
* Pointer to dense matrix
*
* @return a pointer to the dense matrix
*/
public Pointer getDensePointer() {
if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) {
shadowBuffer.moveToDevice();
}
return jcudaDenseMatrixPtr;
}
/**
* Checks if the dense pointer is null
*
* @return true if the dense pointer is null
*/
public boolean isDensePointerNull() {
return jcudaDenseMatrixPtr == null;
}
/**
* Removes the dense pointer and potential soft reference
*/
public void clearDensePointer() {
jcudaDenseMatrixPtr = null;
shadowBuffer.clearShadowPointer();
}
public MatrixObject getMatrixObject() {
return mat;
}
/**
* Convenience method to directly set the dense matrix pointer on GPU
*
* @param densePtr dense pointer
*/
public void setDensePointer(Pointer densePtr) {
if (!this.isDensePointerNull()) {
throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
}
this.jcudaDenseMatrixPtr = densePtr;
this.isSparse = false;
if(LOG.isDebugEnabled()) {
LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
}
if (getJcudaSparseMatrixPtr() != null) {
getJcudaSparseMatrixPtr().deallocate();
jcudaSparseMatrixPtr = null;
}
}
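/*
 * Usage sketch (illustrative, not part of the class): setDensePointer enforces that a
 * GPUObject holds either a dense or a sparse device pointer, never both. Assuming "gobj"
 * is an unallocated GPUObject and "ptr" a device allocation of matching size:
 *
 *   gobj.setDensePointer(ptr);   // ok: marks the object dense
 *   gobj.setDensePointer(ptr);   // throws DMLRuntimeException (would leak device memory)
 *
 * Any previously held sparse pointer is deallocated as part of the call.
 */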
public void setDirty(boolean flag) {
dirty = flag;
}
// ----------------------------------------------------------------------
@Override
public Object clone() {
GPUObject me = this;
GPUObject that = new GPUObject(me.gpuContext, me.mat);
that.dirty = me.dirty;
// The only place clone() is called from: LibMatrixCUDA's solve
that.readLocks.reset();
that.writeLock = false;
that.timestamp = new AtomicLong(me.timestamp.get());
that.isSparse = me.isSparse;
try {
if (!me.isDensePointerNull()) {
long rows = me.mat.getNumRows();
long cols = me.mat.getNumColumns();
long size = rows * cols * LibMatrixCUDA.sizeOfDataType;
that.setDensePointer(allocate(size, false));
cudaMemcpy(that.getDensePointer(), me.getDensePointer(), size, cudaMemcpyDeviceToDevice);
}
if (me.getJcudaSparseMatrixPtr() != null) {
long rows = mat.getNumRows();
that.jcudaSparseMatrixPtr = me.jcudaSparseMatrixPtr.clone((int) rows);
}
} catch (DMLRuntimeException e) {
throw new RuntimeException(e);
}
return that;
}
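/*
 * Note (illustrative): clone() deep-copies the device data (cudaMemcpy for the dense
 * pointer, CSRPointer.clone for the sparse one) while resetting locks and timestamp.
 * Its only caller is LibMatrixCUDA's solve, where the input must not be overwritten:
 *
 *   GPUObject copy = (GPUObject) gobj.clone();   // "gobj" is an assumed, allocated object
 */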
private Pointer allocate(long size, boolean initialize) {
return getGPUContext().allocate(null, size, initialize);
}
private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
getGPUContext().cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE);
}
GPUContext getGPUContext() { return gpuContext; }
/**
* Transposes a dense matrix on the GPU by calling the cublasDgeam operation
*
* @param gCtx a valid {@link GPUContext}
* @param densePtr Pointer to dense matrix on the GPU
* @param m rows in output matrix
* @param n columns in output matrix
* @param lda leading dimension of the input matrix
* @param ldc leading dimension of the output matrix
* @return transposed matrix
*/
public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx);
}
Pointer alpha = LibMatrixCUDA.one();
Pointer beta = LibMatrixCUDA.zero();
Pointer A = densePtr;
Pointer C = gCtx.allocate(null, m * getDatatypeSizeOf(n), false);
// Transpose the matrix to get a dense matrix
LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(),
lda, C, ldc);
return C;
}
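/*
 * Usage sketch (illustrative): converting a row-major dense block of size rows x cols to
 * column-major is a transpose with m=rows, n=cols, lda=cols, ldc=rows, mirroring
 * denseRowMajorToColumnMajor() below. "gCtx" and "rowMajorPtr" are assumed to be valid:
 *
 *   Pointer colMajorPtr = GPUObject.transpose(gCtx, rowMajorPtr, rows, cols, cols, rows);
 *   // ... use colMajorPtr ...
 *   gCtx.cudaFreeHelper(null, colMajorPtr, DMLScript.EAGER_CUDA_FREE); // caller frees it
 */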
/**
* Convenience method to convert a column-major dense matrix on the GPU to a row-major sparse (CSR) matrix
* Since the allocated matrix is temporary, bookkeeping is not updated.
* Also note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT
* Caller is responsible for deallocating memory on GPU.
*
* @param gCtx a valid {@link GPUContext}
* @param cusparseHandle handle to cusparse library
* @param densePtr [in] dense matrix pointer on the GPU in column major format
* @param rows number of rows
* @param cols number of columns
* @return CSR (compressed sparse row) pointer
*/
public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cusparseHandle cusparseHandle,
Pointer densePtr, int rows, int cols) {
cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
Pointer nnzPerRowPtr = null;
Pointer nnzTotalDevHostPtr = null;
nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows), false);
nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1), false);
// Count the non-zeros per row as the first step of the dense-to-CSR conversion
LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
nnzPerRowPtr, nnzTotalDevHostPtr);
//cudaDeviceSynchronize();
int[] nnzC = { -1 };
cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
if (nnzC[0] == -1) {
throw new DMLRuntimeException(
"cusparsennz did not calculate the correct number of non-zeros during the dense-to-sparse conversion on the GPU");
}
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0]
+ ", GPUContext=" + gCtx);
}
CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
LibMatrixCUDA.cudaSupportFunctions.cusparsedense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr,
C.colInd);
//cudaDeviceSynchronize();
gCtx.cudaFreeHelper(null, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE);
gCtx.cudaFreeHelper(null, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE);
return C;
}
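/*
 * Usage sketch (illustrative): this is the second half of the dense-to-CSR pipeline used
 * by denseToSparse() below. Assuming "gobj" holds an allocated row-major dense block of
 * size rows x cols:
 *
 *   gobj.denseRowMajorToColumnMajor();   // cuSPARSE expects column-major input
 *   CSRPointer csr = columnMajorDenseToRowMajorSparse(gobj.getGPUContext(),
 *       gobj.getGPUContext().getCusparseHandle(), gobj.getDensePointer(), rows, cols);
 *
 * The caller owns "csr" and must either deallocate it or hand it to
 * setSparseMatrixCudaPointer(), which takes ownership.
 */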
/**
* Convenience method to directly examine the Sparse matrix on GPU
*
* @return CSR (compressed sparse row) pointer
*/
public CSRPointer getSparseMatrixCudaPointer() {
return getJcudaSparseMatrixPtr();
}
/**
* Convenience method to directly set the sparse matrix on GPU
* Needed for operations like cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)
*
* @param sparseMatrixPtr CSR (compressed sparse row) pointer
*/
public void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) {
if (this.jcudaSparseMatrixPtr != null) {
throw new DMLRuntimeException("jcudaSparseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
}
this.jcudaSparseMatrixPtr = sparseMatrixPtr;
this.isSparse = true;
if (!isDensePointerNull() && !shadowBuffer.isBuffered()) {
cudaFreeHelper(getDensePointer());
clearDensePointer();
}
}
/**
* Converts this GPUObject from dense to sparse format.
*/
public void denseToSparse() {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext());
}
long t0 = 0;
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle();
if (cusparseHandle == null)
throw new DMLRuntimeException("Expected cusparse to be initialized");
int rows = toIntExact(mat.getNumRows());
int cols = toIntExact(mat.getNumColumns());
if ((isDensePointerNull() && !shadowBuffer.isBuffered()) || !isAllocated())
throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");
denseRowMajorToColumnMajor();
setSparseMatrixCudaPointer(
columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getDensePointer(), rows,
cols));
// TODO: What if mat.getNnz() is -1 ?
if (DMLScript.STATISTICS)
GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaDenseToSparseCount.add(1);
}
/**
* Convenience method. Converts Row Major Dense Matrix to Column Major Dense Matrix
*/
public void denseRowMajorToColumnMajor() {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
}
int m = toIntExact(mat.getNumRows());
int n = toIntExact(mat.getNumColumns());
int lda = n;
int ldc = m;
if (!isAllocated()) {
throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
}
Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
cudaFreeHelper(getDensePointer());
clearDensePointer();
setDensePointer(tmp);
}
/**
* Convenience method. Converts Column Major Dense Matrix to Row Major Dense Matrix
*/
public void denseColumnMajorToRowMajor() {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
}
int n = toIntExact(mat.getNumRows());
int m = toIntExact(mat.getNumColumns());
int lda = n;
int ldc = m;
if (!isAllocated()) {
throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
}
Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
cudaFreeHelper(getDensePointer());
clearDensePointer();
setDensePointer(tmp);
}
/**
* Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
*/
public void sparseToDense() {
sparseToDense(null);
}
/**
* Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
* Also records each per-instruction invocation of sparseToDense.
*
* @param instructionName Name of the instruction for which statistics are recorded in {@link GPUStatistics}
*/
public void sparseToDense(String instructionName) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext());
}
long start = 0, end = 0;
if (DMLScript.STATISTICS)
start = System.nanoTime();
if (getJcudaSparseMatrixPtr() == null || !isAllocated())
throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
sparseToColumnMajorDense();
denseColumnMajorToRowMajor();
if (DMLScript.STATISTICS)
end = System.nanoTime();
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseToDenseTime.add(end - start);
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseToDenseCount.add(1);
}
/**
* More efficient method to convert sparse to dense but returns dense in column major format
*/
public void sparseToColumnMajorDense() {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext());
}
if (getJcudaSparseMatrixPtr() == null || !isAllocated())
throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
cusparseHandle cusparseHandle = getGPUContext().getCusparseHandle();
if (cusparseHandle == null)
throw new DMLRuntimeException("Expected cusparse to be initialized");
int rows = toIntExact(mat.getNumRows());
int cols = toIntExact(mat.getNumColumns());
setDensePointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null));
}
/**
* Initializes this GPUObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block
*
* @param mat2 the matrix object that owns this {@link GPUObject}
*/
GPUObject(GPUContext gCtx, MatrixObject mat2) {
gpuContext = gCtx;
this.mat = mat2;
this.shadowBuffer = new ShadowBuffer(this);
}
public GPUObject(GPUContext gCtx, MatrixObject mat, Pointer ptr) {
gpuContext = gCtx;
this.mat = mat;
setDensePointer(ptr);
isSparse = false;
this.shadowBuffer = new ShadowBuffer(this);
}
public GPUObject(GPUContext gCtx, GPUObject that, MatrixObject mat) {
dirty = that.dirty;
readLocks.reset();
writeLock = false;
timestamp = new AtomicLong(that.timestamp.get());
isSparse = that.isSparse;
if (!that.isDensePointerNull())
setDensePointer(that.getDensePointer());
if (that.getJcudaSparseMatrixPtr() != null)
setSparseMatrixCudaPointer(that.getSparseMatrixCudaPointer());
gpuContext = gCtx;
this.mat = mat;
shadowBuffer = new ShadowBuffer(this);
}
public boolean isSparse() {
return isSparse;
}
private static long getDatatypeSizeOf(long numElems) {
return numElems * LibMatrixCUDA.sizeOfDataType;
}
private static long getIntSizeOf(long numElems) {
return numElems * jcuda.Sizeof.INT;
}
public boolean isAllocated() {
boolean eitherAllocated = shadowBuffer.isBuffered() || !isDensePointerNull() || getJcudaSparseMatrixPtr() != null;
return eitherAllocated;
}
/**
* Allocates a sparse and empty {@link GPUObject}
* This is used for results of operations that are known to be empty (all-zero) sparse matrices.
*/
public void allocateSparseAndEmpty() {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext());
}
setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows()));
}
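/*
 * Usage sketch (illustrative): after this call the object reports itself as allocated,
 * sparse and empty, so downstream code can short-circuit on all-zero results:
 *
 *   gobj.allocateSparseAndEmpty();
 *   assert gobj.isAllocated() && gobj.isSparseAndEmpty();
 */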
/**
* Allocates a dense matrix of size obtained from the attached matrix metadata
* and fills it up with a single value
*
* @param v value to fill up the dense matrix
*/
public void allocateAndFillDense(double v) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext());
}
long rows = mat.getNumRows();
long cols = mat.getNumColumns();
int numElems = toIntExact(rows * cols);
long size = getDatatypeSizeOf(numElems);
setDensePointer(allocate(size,false));
getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations
(numElems), getDensePointer(), v, numElems);
}
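/*
 * Usage sketch (illustrative): materializing a scalar-filled matrix directly on the device
 * (e.g. for a DML expression like matrix(7, rows, cols)) avoids a host-side allocation and
 * a host-to-device copy:
 *
 *   gobj.allocateAndFillDense(7.0);   // dense block of mat's dimensions, every cell = 7.0
 */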
/**
* Checks whether this {@link GPUObject} is sparse and empty.
* Being allocated is a prerequisite to being sparse and empty.
*
* @return true if sparse and empty
*/
public boolean isSparseAndEmpty() {
boolean isSparseAndAllocated = isAllocated() && LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat);
boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0;
return isEmptyAndSparseAndAllocated;
}
/**
* Being allocated is a prerequisite for computing nnz.
* Note: if the matrix is in dense format and recomputeDenseNNZ is set, the number of non-zeros is explicitly recomputed on the device.
*
* @param instName instruction name
* @param recomputeDenseNNZ recompute NNZ if dense
* @return the number of nonzeroes
*/
public long getNnz(String instName, boolean recomputeDenseNNZ) {
if(isAllocated()) {
if(LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
return getJcudaSparseMatrixPtr().nnz;
}
else {
if(!recomputeDenseNNZ)
return -1;
GPUContext gCtx = getGPUContext();
cusparseHandle cusparseHandle = gCtx.getCusparseHandle();
cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
if (cusparseHandle == null)
throw new DMLRuntimeException("Expected cusparse to be initialized");
int rows = toIntExact(mat.getNumRows());
int cols = toIntExact(mat.getNumColumns());
Pointer nnzPerRowPtr = null;
Pointer nnzTotalDevHostPtr = null;
nnzPerRowPtr = gCtx.allocate(instName, getIntSizeOf(rows), false);
nnzTotalDevHostPtr = gCtx.allocate(instName, getIntSizeOf(1), false);
LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getDensePointer(), rows,
nnzPerRowPtr, nnzTotalDevHostPtr);
int[] nnzC = { -1 };
cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
if (nnzC[0] == -1) {
throw new DMLRuntimeException(
"cusparseDnnz did not calculate the correct number of nnz on the GPU");
}
gCtx.cudaFreeHelper(instName, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE);
gCtx.cudaFreeHelper(instName, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE);
return nnzC[0];
}
}
else
throw new DMLRuntimeException("Expected the GPU object to be allocated");
}
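/*
 * Usage sketch (illustrative): for a dense block the nnz is only known if explicitly
 * recomputed; pass recomputeDenseNNZ=false when an unknown count (-1) is acceptable:
 *
 *   long exact = gobj.getNnz(null, true);    // runs cusparse<t>nnz on the device if dense
 *   long cheap = gobj.getNnz(null, false);   // -1 for dense blocks, exact for sparse ones
 */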
public boolean acquireDeviceRead(String opcode) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : acquireDeviceRead on " + this);
}
boolean transferred = false;
if (!isAllocated()) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext="
+ getGPUContext());
}
copyFromHostToDevice(opcode);
transferred = true;
}
addReadLock();
if (!isAllocated())
throw new DMLRuntimeException("Expected device data to be allocated");
return transferred;
}
public boolean acquireDeviceModifyDense() {
return acquireDeviceModifyDense(true);
}
public boolean acquireDeviceModifyDense(boolean initialize) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext());
}
boolean allocated = false;
if (!isAllocated()) {
mat.setDirty(true);
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this);
}
// Dense block, size = numRows * numCols
allocateDenseMatrixOnDevice(initialize);
allocated = true;
}
dirty = true;
if (!isAllocated())
throw new DMLRuntimeException("Expected device data to be allocated");
return allocated;
}
public boolean acquireDeviceModifySparse() {
return acquireDeviceModifySparse(true);
}
public boolean acquireDeviceModifySparse(boolean initialize) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
}
boolean allocated = false;
isSparse = true;
if (!isAllocated()) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
}
mat.setDirty(true);
allocateSparseMatrixOnDevice(initialize);
allocated = true;
}
dirty = true;
if (!isAllocated())
throw new DMLRuntimeException("Expected device data to be allocated");
return allocated;
}
/**
* If the data is allocated on the GPU and is dirty, it is copied back to host memory.
*
* @param instName name of the instruction
* @return true if a copy to host happened, false otherwise
*/
public boolean acquireHostRead(String instName) {
boolean copied = false;
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
}
if (isAllocated() && dirty) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext="
+ getGPUContext());
}
if (isAllocated() && dirty) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext="
+ getGPUContext());
}
// TODO: Future optimization:
// For now, we are deleting the device data when copied from device to host.
// This can be optimized later by treating acquiredModify+release as a new state
copyFromDeviceToHost(instName, false, true);
copied = true;
}
}
return copied;
}
public boolean isLocked() {
return writeLock || readLocks.longValue() > 0;
}
public void addReadLock() {
if(writeLock)
throw new DMLRuntimeException("Attempting to add a read lock when writeLock="+ writeLock);
else
readLocks.increment();
}
public void addWriteLock() {
if(readLocks.longValue() > 0)
throw new DMLRuntimeException("Attempting to add a write lock when readLocks="+ readLocks.longValue());
else if(writeLock)
throw new DMLRuntimeException("Attempting to add a write lock when writeLock="+ writeLock);
else
writeLock = true;
}
public void releaseReadLock() {
readLocks.decrement();
if(readLocks.longValue() < 0)
throw new DMLRuntimeException("Attempting to release a read lock when readLocks="+ readLocks.longValue());
}
public void releaseWriteLock() {
if(writeLock)
writeLock = false;
else
throw new DMLRuntimeException("Internal state error : Attempting to release write lock on a GPUObject, which was already released");
}
public void resetReadWriteLock() {
readLocks.reset();
writeLock = false;
}
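/*
 * Locking protocol sketch (illustrative): addReadLock() is taken by acquireDeviceRead(),
 * while addWriteLock() is expected to be taken by the caller (e.g. when the output object
 * is allocated) before releaseOutput() is used. In effect, for a single instruction:
 *
 *   in.acquireDeviceRead(opcode);   // increments readLocks, copies host data if needed
 *   out.addWriteLock();             // exclusive: fails if any read lock is held
 *   // ... launch kernel, write to out ...
 *   in.releaseInput();              // decrements readLocks, updates the eviction timestamp
 *   out.releaseOutput();            // clears writeLock and marks the output dirty
 */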
/**
* Updates the locks depending on the eviction policy selected
*/
private void updateReleaseLocks() {
timestamp.set(System.nanoTime());
}
/**
* Releases input allocated on GPU
*/
public void releaseInput() {
releaseReadLock();
updateReleaseLocks();
if (!isAllocated())
throw new DMLRuntimeException("Attempting to release an input before allocating it");
}
/**
* releases output allocated on GPU
*/
public void releaseOutput() {
releaseWriteLock();
updateReleaseLocks();
// Currently, there is no convenient way to acquireDeviceModify independently of the dense/sparse format.
// Hence, we allow releaseOutput to be invoked repeatedly.
// Ideally, we would throw CacheException("Attempting to release an output that was not acquired via acquireDeviceModify") if !isDirty()
dirty = true;
if (!isAllocated())
throw new DMLRuntimeException("Attempting to release an output before allocating it");
}
void allocateDenseMatrixOnDevice(boolean initialize) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
}
if(isAllocated())
throw new DMLRuntimeException("Internal error - trying to allocated dense matrix to a GPUObject that is already allocated");
long rows = mat.getNumRows();
long cols = mat.getNumColumns();
if(rows <= 0)
throw new DMLRuntimeException("Internal error - invalid number of rows when allocating dense matrix:" + rows);
if(cols <= 0)
throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix:" + cols);
long size = getDatatypeSizeOf(rows * cols);
Pointer tmp = allocate(size, initialize);
setDensePointer(tmp);
}
void allocateSparseMatrixOnDevice(boolean initialize) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
}
if(isAllocated())
throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated");
long rows = mat.getNumRows();
long nnz = mat.getNnz();
if(rows <= 0)
throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix");
if(nnz < 0)
throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix");
CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows, initialize);
setSparseMatrixCudaPointer(tmp);
}
void allocateSparseMatrixOnDevice(long numVals, boolean initialize) {
// This method is called when #values > nnz
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
}
if(isAllocated())
throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated");
long rows = mat.getNumRows();
long nnz = mat.getNnz();
if(rows <= 0)
throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix");
if(nnz < 0)
throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix");
CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), numVals, rows, initialize);
setSparseMatrixCudaPointer(tmp);
}
// Method to find the estimated size of this GPUObject on the device
public long getSizeOnDevice() {
long GPUSize = 0;
long rlen = mat.getNumRows();
long clen = mat.getNumColumns();
long nnz = mat.getNnz();
if (LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
GPUSize = CSRPointer.estimateSize(nnz, rlen);
} else {
GPUSize = getDatatypeSizeOf(rlen * clen);
}
return GPUSize;
}
// Method to find the allocated size of this GPUObject on the device
public long getAllocatedSize() {
return gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(getDensePointer());
}
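/*
 * Note (illustrative): getSizeOnDevice() is an estimate derived from the matrix metadata
 * (dimensions and nnz), whereas getAllocatedSize() asks the memory manager for the actual
 * allocation backing the dense pointer:
 *
 *   long estimated = gobj.getSizeOnDevice();   // CSRPointer.estimateSize or rlen*clen*sizeof
 *   long actual = gobj.getAllocatedSize();     // only meaningful for dense allocations
 */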
void copyFromHostToDevice(String opcode) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext());
}
long start = 0;
if (DMLScript.STATISTICS)
start = System.nanoTime();
MatrixBlock tmp = mat.acquireRead();
if (tmp.isInSparseFormat()) {
int rowPtr[] = null;
int colInd[] = null;
double[] values = null;
// Only recompute non-zeros if unknown; otherwise this will incur a huge penalty!
if(tmp.getNonZeros() < 0) {
tmp.recomputeNonZeros();
}
long nnz = tmp.getNonZeros();
mat.getDataCharacteristics().setNonZeros(nnz);
SparseBlock block = tmp.getSparseBlock();
boolean copyToDevice = true;
if (block == null && tmp.getNonZeros() == 0) {
copyToDevice = false;
} else if (block == null && tmp.getNonZeros() != 0) {
throw new DMLRuntimeException("Expected CP sparse block to be not null.");
}
else {
// CSR is the preferred format for cuSparse GEMM
// Converts MCSR and COO to CSR
SparseBlockCSR csrBlock = null;
long t0 = 0;
if (block instanceof SparseBlockCSR) {
csrBlock = (SparseBlockCSR) block;
} else if (block instanceof SparseBlockCOO) {
// TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
SparseBlockCOO cooBlock = (SparseBlockCOO) block;
csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(),
cooBlock.indexes(), cooBlock.values());
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseConversionCount.increment();
} else if (block instanceof SparseBlockMCSR) {
if (DMLScript.STATISTICS)
t0 = System.nanoTime();
SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block;
csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
if (DMLScript.STATISTICS)
GPUStatistics.cudaSparseConversionCount.increment();
} else {
throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
}
rowPtr = csrBlock.rowPointers();
colInd = csrBlock.indexes();
values = csrBlock.values();
}
if (values != null) {
if (values.length > tmp.getNonZeros())
allocateSparseMatrixOnDevice(values.length, false);
else
allocateSparseMatrixOnDevice(false);
}
else {
allocateSparseMatrixOnDevice(false);
}
if (copyToDevice) {
CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(),
tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
if (DMLScript.STATISTICS)
GPUStatistics.cudaToDevCount.add(3);
}
} else {
double[] data = tmp.getDenseBlockValues();
if (data == null && tmp.getSparseBlock() != null)
throw new DMLRuntimeException("Incorrect sparsity calculation");
else if (data == null && tmp.getNonZeros() != 0)
throw new DMLRuntimeException("MatrixBlock is not allocated");
allocateDenseMatrixOnDevice(false);
if (tmp.getNonZeros() == 0) {
// Minor optimization: no need to allocate an empty array on the CPU
// data = new double[tmp.getNumRows() * tmp.getNumColumns()];
cudaMemset(getDensePointer(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
}
else {
// Copy dense block
// H2D statistics now only measure the time taken by the actual host-to-device copy
LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getDensePointer(), opcode);
if (DMLScript.STATISTICS)
GPUStatistics.cudaToDevCount.add(1);
}
}
mat.release();
if (DMLScript.STATISTICS)
GPUStatistics.cudaToDevTime.add(System.nanoTime() - start);
}
public static int toIntExact(long l) {
if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
throw new DMLRuntimeException("Cannot be cast to int:" + l);
}
return (int) l;
}
/**
* Copies the data from device to host.
* Currently eagerDelete and isEviction are both provided for better control in different scenarios.
* In future, we can force eagerDelete if isEviction is true, else false.
*
* @param instName opcode of the instruction for fine-grained statistics
* @param isEviction is called for eviction
* @param eagerDelete whether to perform eager deletion of the device data.
* @throws DMLRuntimeException if error occurs
*/
protected void copyFromDeviceToHost(String instName, boolean isEviction, boolean eagerDelete) throws DMLRuntimeException {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext());
}
if(shadowBuffer.isBuffered()) {
if(isEviction) {
// If already copied to shadow buffer as part of previous eviction, do nothing.
return;
}
else {
// If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction),
// then copy from shadow buffer to MatrixObject.
shadowBuffer.moveToHost();
return;
}
}
else if(shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) {
// Perform shadow buffering if (1) single precision, (2) during eviction, (3) eagerDelete is true
// (4) for dense matrices, and (5) if the given matrix can fit into the shadow buffer.
shadowBuffer.moveFromDevice(instName);
return;
}
else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) {
throw new DMLRuntimeException(
"Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
}
else if (!isDensePointerNull() && getJcudaSparseMatrixPtr() != null) {
throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
}
else if(getJcudaSparseMatrixPtr() != null && !LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
throw new DMLRuntimeException(
"Block not in sparse format on host yet the device sparse matrix pointer is not null");
}
else if(getJcudaSparseMatrixPtr() != null && isSparseAndEmpty()) {
mat.acquireModify(new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0L)); // empty block
mat.release();
return;
}
boolean sparse = isDensePointerNull();
MatrixBlock tmp = null;
long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
if (!isDensePointerNull()) {
tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
tmp.allocateDenseBlock();
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction);
if(eagerDelete)
clearData(instName, true);
tmp.recomputeNonZeros();
} else {
int rows = toIntExact(mat.getNumRows());
int cols = toIntExact(mat.getNumColumns());
int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
double[] values = new double[nnz];
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName, isEviction);
int[] rowPtr = new int[rows + 1];
int[] colInd = new int[nnz];
CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd);
if(eagerDelete)
clearData(instName, true);
SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
}
mat.acquireModify(tmp);
mat.release();
if (DMLScript.STATISTICS && !isEviction) {
// Eviction time is measured as part of malloc
long totalTime = System.nanoTime() - start;
int count = sparse ? 3 : 1;
GPUStatistics.cudaFromDevTime.add(totalTime);
GPUStatistics.cudaFromDevCount.add(count);
}
dirty = false;
}
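/*
 * Usage sketch (illustrative): copyFromDeviceToHost is normally reached through
 * acquireHostRead(), which only copies when the device copy is dirty:
 *
 *   if (gobj.acquireHostRead(opcode)) {
 *       // the host MatrixBlock now reflects the device data; gobj is no longer dirty
 *   }
 *
 * During eviction, the memory manager calls it directly with eagerDelete=true so device
 * memory is freed as soon as the data is safe on the host (or in the shadow buffer).
 */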
/**
* Clears the data associated with this {@link GPUObject} instance
*
* @param opcode opcode of the instruction
* @param eager whether to be done synchronously or asynchronously
* @throws DMLRuntimeException if error occurs
*/
synchronized public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
}
if (!isDensePointerNull()) {
getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
}
if (getJcudaSparseMatrixPtr() != null) {
getJcudaSparseMatrixPtr().deallocate(eager);
}
clearDensePointer();
shadowBuffer.clearShadowPointer();
jcudaSparseMatrixPtr = null;
resetReadWriteLock();
getGPUContext().getMemoryManager().removeGPUObject(this);
}
public void clearGPUObject() {
if(LOG.isTraceEnabled())
LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
clearDensePointer();
shadowBuffer.clearShadowPointer();
jcudaSparseMatrixPtr = null;
resetReadWriteLock();
getGPUContext().getMemoryManager().removeGPUObject(this);
}
/**
* Pointer to sparse matrix
*
* @return the CSR (compressed sparse row) pointer, or null if no sparse matrix is allocated
*/
public CSRPointer getJcudaSparseMatrixPtr() {
return jcudaSparseMatrixPtr;
}
/**
* Whether this block is dirty on the GPU
*
* @return true if the device copy has been modified and not yet copied back to the host
*/
public boolean isDirty() {
return dirty;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder("GPUObject{");
sb.append(", dirty=").append(dirty);
sb.append(", readLocks=").append(readLocks.longValue());
sb.append(", writeLock=").append(writeLock);
sb.append(", sparse? ").append(isSparse);
sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
if(!isDensePointerNull())
sb.append(", densePtr=").append(getDensePointer());
if(jcudaSparseMatrixPtr != null)
sb.append(", sparsePtr=").append(jcudaSparseMatrixPtr);
sb.append('}');
return sb.toString();
}
private static long getPointerAddressInternal(Pointer p) {
// WORKAROUND until a method like CUdeviceptr#getAddress exists in jCuda
class PointerWithAddress extends Pointer
{
PointerWithAddress(Pointer other)
{
super(other);
}
long getAddress()
{
return getNativePointer() + getByteOffset();
}
}
return new PointerWithAddress(p).getAddress();
}
public long getDensePointerAddress() {
return getPointerAddressInternal(getDensePointer());
}
public static long getPointerAddress(Pointer p) {
return (p == null) ? 0 : getPointerAddressInternal(p);
}
}