[SYSTEMDS-3896] Improved SIMD Vectorized Counting NNZ

This patch makes an additional performance improvement that further reduces the runtime on an 8GB matrix from 850ms to 770ms (non-vectorized: 1100ms) by avoiding unnecessary scalar ops. Furthermore, we replace the hard-coded AVX512 vector size (8 doubles) with the general vector length; the hard-coded size failed on non-Intel hardware in GitHub Actions.

commit: 2c737bc1b894b88eed684ced55c65be1c777aba5 [log] [tgz]
author: Matthias Boehm <mboehm7@gmail.com> Wed Jul 16 11:18:27 2025 +0200
committer: Matthias Boehm <mboehm7@gmail.com> Wed Jul 16 11:18:27 2025 +0200
tree: 8f20124a004216d485866397061c2c56dbc489fe
parent: 7b34a67f2cd220bb4451385e872ce6a1b0940421 [diff]
diff --git a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
index 4f0a15d..3fd1dfd 100644
--- a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java

@@ -880,15 +880,17 @@
 	}
 	
 	public static int computeNnz(final double[] a, final int ai, final int len) {
-		int lnnz = 0;
 		final int end = ai + len;
 		final int rest = (end - ai) % vLen;
+		int lnnz = len;
 
+		//start from len and subtract number of zeros because
+		//DoubleVector defines an eq but no neq operation
 		for(int i = ai; i < ai + rest; i++)
-			lnnz += (a[i] != 0.0) ? 1 : 0;
-		for(int i = ai + rest; i < end; i += 8) {
+			lnnz -= (a[i] == 0.0) ? 1 : 0;
+		for(int i = ai + rest; i < end; i += vLen) {
 			DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, i);
-			lnnz += vLen-aVec.eq(0).trueCount();
+			lnnz -= aVec.eq(0).trueCount();
 		}
 		return lnnz;
 	}
commit	2c737bc1b894b88eed684ced55c65be1c777aba5	[log] [tgz]
author	Matthias Boehm <mboehm7@gmail.com>	Wed Jul 16 11:18:27 2025 +0200
committer	Matthias Boehm <mboehm7@gmail.com>	Wed Jul 16 11:18:27 2025 +0200
tree	8f20124a004216d485866397061c2c56dbc489fe
parent	7b34a67f2cd220bb4451385e872ce6a1b0940421 [diff]