[SYSTEMDS-3896] Improved SIMD Vectorized Counting NNZ
This patch makes an additional performance improvement that further
reduces the runtime on an 8GB matrix from 850ms to 770ms
(non-vectorized: 1100ms) by avoiding unnecessary scalar ops: the count
now starts at len and subtracts the number of zeros. Furthermore, we
replace the hard-coded AVX512 vector size (a stride of 8 doubles) with
the general vector length; the hard-coded stride failed on non-Intel
hardware in GitHub Actions.
diff --git a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
index 4f0a15d..3fd1dfd 100644
--- a/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/util/UtilFunctions.java
@@ -880,15 +880,17 @@
}
public static int computeNnz(final double[] a, final int ai, final int len) {
- int lnnz = 0;
final int end = ai + len;
final int rest = (end - ai) % vLen;
+ int lnnz = len;
+ //start from len and subtract number of zeros because
+ //DoubleVector defines an eq but no neq operation
for(int i = ai; i < ai + rest; i++)
- lnnz += (a[i] != 0.0) ? 1 : 0;
- for(int i = ai + rest; i < end; i += 8) {
+ lnnz -= (a[i] == 0.0) ? 1 : 0;
+ for(int i = ai + rest; i < end; i += vLen) {
DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, i);
- lnnz += vLen-aVec.eq(0).trueCount();
+ lnnz -= aVec.eq(0).trueCount();
}
return lnnz;
}