[SYSTEMDS-2760] Performance sparse-sparse transpose operations This patch makes two minor performance improvements to multi-threaded sparse-sparse transpose operations. The tasks get column groups of the input and append outputs to the sparse output rows. For tall&skinny sparse matrices there were two shortcomings. First, we aligned the minimum column group size to 8, which is only required for dense-dense transpose operations. This improved performance on above scenario. Second, if the number of cores is larger than the number of columns, cache blocking and related position maintenance only adds unnecessary overhead. Accordingly, we now use added a special case for these scenarios. On a scenario of a 2.5M x 50 matrix with sparsity = 0.1, the total execution time of 10 transpose operations improved from 2.7s to 2.5s (with 1) and 1.9s (with 1 and 2) respectively.

commit: 3ce32709366203eccb8da9063cedafa567f8d3bf [log] [tgz]
author: Matthias Boehm <mboehm7@gmail.com> Thu Dec 17 23:04:18 2020 +0100
committer: Matthias Boehm <mboehm7@gmail.com> Thu Dec 17 23:04:18 2020 +0100
tree: 95eff92f0118cb295ee1fce9f9351885668a7971
parent: ed8b5f5c725526f8414d2dbe9113d52260597352 [diff]
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
index edc38a7..ec1731f 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java

@@ -205,7 +205,7 @@
 			boolean row = (in.sparse || in.rlen >= in.clen) && !out.sparse;
 			int len = row ? in.rlen : in.clen;
 			int blklen = (int)(Math.ceil((double)len/k));
-			blklen += (blklen%8 != 0)?8-blklen%8:0;
+			blklen += (!out.sparse && (blklen%8)!=0) ? 8-blklen%8 : 0;
 			for( int i=0; i<k & i*blklen<len; i++ )
 				tasks.add(new TransposeTask(in, out, row, i*blklen, Math.min((i+1)*blklen, len), cnt));
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
@@ -861,50 +861,69 @@
 		SparseBlock a = in.getSparseBlock();
 		SparseBlock c = out.getSparseBlock();
 
-		//allocate output sparse rows
-		if( cnt != null ) {
-			for( int i=cl; i<cu; i++ )
-				if( cnt[i] > 0 )
-					c.allocate(i, cnt[i]);
+		if( cu-cl == 1 ) { // SINGLE TARGET ROW
+			//number of columns <= num cores, use sequential scan over input
+			//and avoid cache blocking and temporary position maintenance
+			if( cnt[cl] > 0 )
+				c.allocate(cl, cnt[cl]);
+			
+			for( int i=0; i<ru; i++ ) {
+				if( a.isEmpty(i) ) continue;
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				for( int j=apos; j<apos+alen && aix[j]<=cl; j++ )
+					if( aix[j] == cl )
+						c.append(cl, i, avals[j]);
+			}
 		}
-		
-		//blocking according to typical L2 cache sizes w/ awareness of sparsity
-		final long xsp = (long)in.rlen*in.clen/in.nonZeros;
-		final int blocksizeI = Math.max(128, (int) (8*xsp));
-		final int blocksizeJ = Math.max(128, (int) (8*xsp));
-	
-		//temporary array for block boundaries (for preventing binary search) 
-		int[] ix = new int[Math.min(blocksizeI, ru-rl)];
-		
-		//blocked execution
-		for( int bi=rl; bi<ru; bi+=blocksizeI )
-		{
-			Arrays.fill(ix, 0);
-			//find column starting positions
-			int bimin = Math.min(bi+blocksizeI, ru);
-			if( cl > 0 ) {
-				for( int i=bi; i<bimin; i++ ) {
-					if( a.isEmpty(i) ) continue;
-					int j = a.posFIndexGTE(i, cl);
-					ix[i-bi] = (j>=0) ? j : a.size(i);
-				}
+		else { //GENERAL CASE
+			//allocate output sparse rows
+			if( cnt != null ) {
+				for( int i=cl; i<cu; i++ )
+					if( cnt[i] > 0 )
+						c.allocate(i, cnt[i]);
 			}
 			
-			for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
-				int bjmin = Math.min(bj+blocksizeJ, cu);
-				//core block transpose operation
-				for( int i=bi; i<bimin; i++ ) {
-					if( a.isEmpty(i) ) continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					int j = ix[i-bi] + apos; //last block boundary
-					for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
-						c.allocate(aix[j], ennz2, n2);
-						c.append(aix[j], i, avals[j]);
+			//blocking according to typical L2 cache sizes w/ awareness of sparsity
+			final long xsp = (long)in.rlen*in.clen/in.nonZeros;
+			final int blocksizeI = Math.max(128, (int) (8*xsp));
+			final int blocksizeJ = Math.max(128, (int) (8*xsp));
+		
+			//temporary array for block boundaries (for preventing binary search) 
+			int[] ix = new int[Math.min(blocksizeI, ru-rl)];
+			
+			//blocked execution
+			for( int bi=rl; bi<ru; bi+=blocksizeI )
+			{
+				Arrays.fill(ix, 0);
+				//find column starting positions
+				int bimin = Math.min(bi+blocksizeI, ru);
+				if( cl > 0 ) {
+					for( int i=bi; i<bimin; i++ ) {
+						if( a.isEmpty(i) ) continue;
+						int j = a.posFIndexGTE(i, cl);
+						ix[i-bi] = (j>=0) ? j : a.size(i);
 					}
-					ix[i-bi] = j - apos; //keep block boundary
+				}
+				
+				for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
+					int bjmin = Math.min(bj+blocksizeJ, cu);
+					//core block transpose operation
+					for( int i=bi; i<bimin; i++ ) {
+						if( a.isEmpty(i) ) continue;
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						int j = ix[i-bi] + apos; //last block boundary
+						for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
+							c.allocate(aix[j], ennz2, n2);
+							c.append(aix[j], i, avals[j]);
+						}
+						ix[i-bi] = j - apos; //keep block boundary
+					}
 				}
 			}
 		}
commit	3ce32709366203eccb8da9063cedafa567f8d3bf	[log] [tgz]
author	Matthias Boehm <mboehm7@gmail.com>	Thu Dec 17 23:04:18 2020 +0100
committer	Matthias Boehm <mboehm7@gmail.com>	Thu Dec 17 23:04:18 2020 +0100
tree	95eff92f0118cb295ee1fce9f9351885668a7971
parent	ed8b5f5c725526f8414d2dbe9113d52260597352 [diff]