[SYSTEMDS-3058] Transpose to CSR amendments

This commit changes the transpose to CSR by adding the column count
threshold to the main algorithm. This removes the unnecessary allocation
of nnz counting tasks when the number of columns is larger than 4096,
and it fixes an edge case in the CSR transpose output where the
per-column nnz count array is null.
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
index 72d4ace..7321c8d 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
@@ -202,7 +202,8 @@
 		}
 		// set meta data and allocate output arrays (if required)
 		out.nonZeros = in.nonZeros;
-		allowCSR = allowCSR && out.nonZeros < (long) Integer.MAX_VALUE;
+		// CSR is only allowed in the transposed output if the number of non zeros is counted in the columns
+		allowCSR = allowCSR && out.nonZeros < (long) Integer.MAX_VALUE && in.clen <= 4096;
 		// Timing time = new Timing(true);
 
 		if(out.sparse && allowCSR) {
@@ -219,7 +220,10 @@
 			ExecutorService pool = CommonThreadPool.get(k);
 			// pre-processing (compute nnz per column once for sparse)
 			int[] cnt = null;
-			if(in.sparse && out.sparse) {
+			// filter matrices with many columns since the CountNnzTask would return
+			// null if the number of columns is larger than threshold
+			if(in.sparse && out.sparse && in.clen <= 4096) {
+				
 				ArrayList<CountNnzTask> tasks = new ArrayList<>();
 				int blklen = (int) (Math.ceil((double) in.rlen / k));
 				for(int i = 0; i < k & i * blklen < in.rlen; i++)
@@ -227,18 +231,18 @@
 				List<Future<int[]>> rtasks = pool.invokeAll(tasks);
 				for(Future<int[]> rtask : rtasks)
 					cnt = mergeNnzCounts(cnt, rtask.get());
-			}
 
-			if(out.sparse && allowCSR) {
-				int[] outPtr = ((SparseBlockCSR) out.sparseBlock).rowPointers();
-				for(int i = 0; i < cnt.length; i++) {
-					// set out pointers to correct start of rows.
-					outPtr[i + 1] = outPtr[i] + cnt[i];
-					// set the cnt value to the new pointer to start of row in CSR
-					cnt[i] = outPtr[i];
+				if(allowCSR) {
+					int[] outPtr = ((SparseBlockCSR) out.sparseBlock).rowPointers();
+					for(int i = 0; i < cnt.length; i++) {
+						// set out pointers to correct start of rows.
+						outPtr[i + 1] = outPtr[i] + cnt[i];
+						// set the cnt value to the new pointer to start of row in CSR
+						cnt[i] = outPtr[i];
+					}
 				}
-
 			}
+
 			// compute actual transpose and check for errors
 			ArrayList<TransposeTask> tasks = new ArrayList<>();
 			boolean row = (in.sparse || in.rlen >= in.clen) && !out.sparse;
diff --git a/src/test/java/org/apache/sysds/test/component/matrix/TransposeCSRTest.java b/src/test/java/org/apache/sysds/test/component/matrix/TransposeCSRTest.java
index 7dd16a6..875667b 100644
--- a/src/test/java/org/apache/sysds/test/component/matrix/TransposeCSRTest.java
+++ b/src/test/java/org/apache/sysds/test/component/matrix/TransposeCSRTest.java
@@ -65,6 +65,9 @@
 			DataConverter.convertToMatrixBlock(TestUtils.generateTestMatrix(1000, 15, 0.5, 1.5, 0.01, 6)), 3});
 		tests.add(new Object[] {
 			DataConverter.convertToMatrixBlock(TestUtils.generateTestMatrix(13, 1444, 0.5, 1.5, 0.01, 6)), 3});
+		// exceed the 4096-column threshold
+		tests.add(new Object[] {
+			DataConverter.convertToMatrixBlock(TestUtils.generateTestMatrix(4100, 4100, 0.5, 1.5, 0.01, 6)), 3});
 		return tests;
 	}