[MINOR] Fix compilation issue in CUDA codegen's vectCountnnz() by switching its load operator from NotEqualOp to a new NotZero functor.
diff --git a/src/main/cuda/headers/operators.cuh b/src/main/cuda/headers/operators.cuh
index c88a19d..a48c990 100644
--- a/src/main/cuda/headers/operators.cuh
+++ b/src/main/cuda/headers/operators.cuh
@@ -107,6 +107,17 @@
 };
 
 template<typename T>
+struct NotZero {
+	__device__  __forceinline__ T operator()(T a, T b) const {
+		return (a != 0) ? 1.0 : 0.0;
+	}
+	
+	__device__  __forceinline__ static T exec(T a, T b) {
+		return (a != 0) ? 1.0 : 0.0;
+	}
+};
+
+template<typename T>
 struct XorOp {
 	__device__  __forceinline__ static T exec(T a, T b) {
 		return (a != 0.0) != (b != 0.0) ? 1.0 : 0.0;
diff --git a/src/main/cuda/headers/spoof_utils.cuh b/src/main/cuda/headers/spoof_utils.cuh
index 9bcaef5..5d9b101 100644
--- a/src/main/cuda/headers/spoof_utils.cuh
+++ b/src/main/cuda/headers/spoof_utils.cuh
@@ -693,9 +693,8 @@
 template<typename T>
 T vectCountnnz(T* a, uint32_t ai, uint32_t len) {
 	SumOp<T> agg_op;
-	NotEqualOp<T> load_op;
-	T result = BLOCK_ROW_AGG(&a[ai], &a[ai], len, agg_op, load_op);
-	return result;
+	NotZero<T> load_op;
+	return BLOCK_ROW_AGG(&a[ai], &a[ai], len, agg_op, load_op);
 }
 
 template<typename T>