[MINOR] Fix compilation issue in CUDA codegen's vectCountnnz() by adding a NotZero load operator
diff --git a/src/main/cuda/headers/operators.cuh b/src/main/cuda/headers/operators.cuh
index c88a19d..a48c990 100644
--- a/src/main/cuda/headers/operators.cuh
+++ b/src/main/cuda/headers/operators.cuh
@@ -107,6 +107,17 @@
};
template<typename T>
+struct NotZero {
+ __device__ __forceinline__ T operator()(T a, T b) const {
+ return (a != 0) ? 1.0 : 0.0;
+ }
+
+ __device__ __forceinline__ static T exec(T a, T b) {
+ return (a != 0) ? 1.0 : 0.0;
+ }
+};
+
+template<typename T>
struct XorOp {
__device__ __forceinline__ static T exec(T a, T b) {
return (a != 0.0) != (b != 0.0) ? 1.0 : 0.0;
diff --git a/src/main/cuda/headers/spoof_utils.cuh b/src/main/cuda/headers/spoof_utils.cuh
index 9bcaef5..5d9b101 100644
--- a/src/main/cuda/headers/spoof_utils.cuh
+++ b/src/main/cuda/headers/spoof_utils.cuh
@@ -693,9 +693,8 @@
template<typename T>
T vectCountnnz(T* a, uint32_t ai, uint32_t len) {
SumOp<T> agg_op;
- NotEqualOp<T> load_op;
- T result = BLOCK_ROW_AGG(&a[ai], &a[ai], len, agg_op, load_op);
- return result;
+ NotZero<T> load_op;
+ return BLOCK_ROW_AGG(&a[ai], &a[ai], len, agg_op, load_op);
}
template<typename T>