[SYSTEMDS-2663, 2671] Fix failing cumulative aggregate tests (indexing issue in the cumulative scan kernels)
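
The cumulative scan down-sweep kernels fetched their initial accumulator
from g_tdata[offset_idx - 1] with offset_idx = blockIdx.y * cols +
blockIdx.x * blockDim.x + threadIdx.x, i.e. from the neighbouring column
of the same block row instead of the same column of the previous block
row. The index is now computed as (blockIdx.y - 1) * cols + blockIdx.x *
blockDim.x + threadIdx.x. In addition, the up-sweep kernels write their
per-block results unconditionally (the buffer is simply never read when
there is only a single block row), and the redundant gridDim.y > 1 check
in the down-sweep is dropped, since blockIdx.y > 0 already implies it.
The PTX changes below are the regenerated code for these kernels.

Before/after excerpt from cumulative_scan_down_sweep in cum_scan.cuh
(illustration only, not a compilable unit):

    // before: reads the previous column's offset within the same block row
    int offset_idx = blockIdx.y * cols + blockIdx.x * blockDim.x + threadIdx.x;
    T acc = (gridDim.y > 1) ? ((blockIdx.y > 0) ? g_tdata[offset_idx - 1]
                                                : NeutralElement::get())
                            : NeutralElement::get();

    // after: reads the same column's offset from the previous block row
    T acc = (blockIdx.y > 0)
          ? g_tdata[(blockIdx.y - 1) * cols + blockIdx.x * blockDim.x + threadIdx.x]
          : NeutralElement::get();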
diff --git a/src/main/cpp/kernels/SystemDS.ptx b/src/main/cpp/kernels/SystemDS.ptx
index 93b3fdc..ee355bf 100644
--- a/src/main/cpp/kernels/SystemDS.ptx
+++ b/src/main/cpp/kernels/SystemDS.ptx
@@ -112,7 +112,7 @@
 	.param .u32 cumulative_sum_up_sweep_d_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .b32 	%r<20>;
 	.reg .f64 	%fd<8>;
 	.reg .b64 	%rd<11>;
@@ -130,7 +130,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB2_5;
+	@%p1 bra 	BB2_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -155,16 +155,13 @@
 	@%p3 bra 	BB2_2;
 
 BB2_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB2_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 8;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f64 	[%rd10], %fd7;
 
-BB2_5:
+BB2_4:
 	ret;
 }
 
@@ -177,7 +174,7 @@
 	.param .u32 cumulative_sum_up_sweep_f_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .f32 	%f<8>;
 	.reg .b32 	%r<20>;
 	.reg .b64 	%rd<11>;
@@ -195,7 +192,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB3_5;
+	@%p1 bra 	BB3_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -220,16 +217,13 @@
 	@%p3 bra 	BB3_2;
 
 BB3_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB3_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 4;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f32 	[%rd10], %f7;
 
-BB3_5:
+BB3_4:
 	ret;
 }
 
@@ -243,8 +237,8 @@
 	.param .u32 cumulative_sum_down_sweep_d_param_5
 )
 {
-	.reg .pred 	%p<7>;
-	.reg .b32 	%r<22>;
+	.reg .pred 	%p<5>;
+	.reg .b32 	%r<21>;
 	.reg .f64 	%fd<11>;
 	.reg .b64 	%rd<15>;
 
@@ -265,47 +259,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB4_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f64 	%fd9, 0d0000000000000000;
-	@%p4 bra 	BB4_3;
+	@%p2 bra 	BB4_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 8;
+	mul.wide.u32 	%rd7, %r15, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd9, [%rd8];
 
 BB4_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 8;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 8;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f64 	%fd7, [%rd10];
 	add.f64 	%fd10, %fd9, %fd7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f64 	[%rd11], %fd10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB4_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB4_5;
 
 BB4_4:
-	mul.wide.s32 	%rd12, %r21, 8;
+	mul.wide.s32 	%rd12, %r20, 8;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f64 	%fd8, [%rd13];
 	add.f64 	%fd10, %fd10, %fd8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f64 	[%rd14], %fd10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB4_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB4_4;
 
 BB4_5:
 	ret;
@@ -321,9 +312,9 @@
 	.param .u32 cumulative_sum_down_sweep_f_param_5
 )
 {
-	.reg .pred 	%p<7>;
+	.reg .pred 	%p<5>;
 	.reg .f32 	%f<11>;
-	.reg .b32 	%r<22>;
+	.reg .b32 	%r<21>;
 	.reg .b64 	%rd<15>;
 
 
@@ -343,47 +334,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB5_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f32 	%f9, 0f00000000;
-	@%p4 bra 	BB5_3;
+	@%p2 bra 	BB5_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 4;
+	mul.wide.u32 	%rd7, %r15, 4;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f32 	%f9, [%rd8];
 
 BB5_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 4;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 4;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f32 	%f7, [%rd10];
 	add.f32 	%f10, %f9, %f7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f32 	[%rd11], %f10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB5_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB5_5;
 
 BB5_4:
-	mul.wide.s32 	%rd12, %r21, 4;
+	mul.wide.s32 	%rd12, %r20, 4;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f32 	%f8, [%rd13];
 	add.f32 	%f10, %f10, %f8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f32 	[%rd14], %f10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB5_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB5_4;
 
 BB5_5:
 	ret;
@@ -398,7 +386,7 @@
 	.param .u32 cumulative_prod_up_sweep_d_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .b32 	%r<20>;
 	.reg .f64 	%fd<8>;
 	.reg .b64 	%rd<11>;
@@ -416,7 +404,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB6_5;
+	@%p1 bra 	BB6_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -441,16 +429,13 @@
 	@%p3 bra 	BB6_2;
 
 BB6_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB6_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 8;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f64 	[%rd10], %fd7;
 
-BB6_5:
+BB6_4:
 	ret;
 }
 
@@ -463,7 +448,7 @@
 	.param .u32 cumulative_prod_up_sweep_f_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .b32 	%r<20>;
 	.reg .f64 	%fd<8>;
 	.reg .b64 	%rd<11>;
@@ -481,7 +466,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB7_5;
+	@%p1 bra 	BB7_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -506,16 +491,13 @@
 	@%p3 bra 	BB7_2;
 
 BB7_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB7_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 8;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f64 	[%rd10], %fd7;
 
-BB7_5:
+BB7_4:
 	ret;
 }
 
@@ -529,8 +511,8 @@
 	.param .u32 cumulative_prod_down_sweep_d_param_5
 )
 {
-	.reg .pred 	%p<7>;
-	.reg .b32 	%r<22>;
+	.reg .pred 	%p<5>;
+	.reg .b32 	%r<21>;
 	.reg .f64 	%fd<11>;
 	.reg .b64 	%rd<15>;
 
@@ -551,47 +533,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB8_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f64 	%fd9, 0d3FF0000000000000;
-	@%p4 bra 	BB8_3;
+	@%p2 bra 	BB8_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 8;
+	mul.wide.u32 	%rd7, %r15, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd9, [%rd8];
 
 BB8_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 8;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 8;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f64 	%fd7, [%rd10];
 	mul.f64 	%fd10, %fd9, %fd7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f64 	[%rd11], %fd10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB8_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB8_5;
 
 BB8_4:
-	mul.wide.s32 	%rd12, %r21, 8;
+	mul.wide.s32 	%rd12, %r20, 8;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f64 	%fd8, [%rd13];
 	mul.f64 	%fd10, %fd10, %fd8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f64 	[%rd14], %fd10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB8_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB8_4;
 
 BB8_5:
 	ret;
@@ -607,9 +586,9 @@
 	.param .u32 cumulative_prod_down_sweep_f_param_5
 )
 {
-	.reg .pred 	%p<7>;
+	.reg .pred 	%p<5>;
 	.reg .f32 	%f<11>;
-	.reg .b32 	%r<22>;
+	.reg .b32 	%r<21>;
 	.reg .b64 	%rd<15>;
 
 
@@ -629,47 +608,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB9_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f32 	%f9, 0f3F800000;
-	@%p4 bra 	BB9_3;
+	@%p2 bra 	BB9_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 4;
+	mul.wide.u32 	%rd7, %r15, 4;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f32 	%f9, [%rd8];
 
 BB9_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 4;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 4;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f32 	%f7, [%rd10];
 	mul.f32 	%f10, %f9, %f7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f32 	[%rd11], %f10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB9_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB9_5;
 
 BB9_4:
-	mul.wide.s32 	%rd12, %r21, 4;
+	mul.wide.s32 	%rd12, %r20, 4;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f32 	%f8, [%rd13];
 	mul.f32 	%f10, %f10, %f8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f32 	[%rd14], %f10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB9_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB9_4;
 
 BB9_5:
 	ret;
@@ -684,7 +660,7 @@
 	.param .u32 cumulative_min_up_sweep_d_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .b32 	%r<20>;
 	.reg .f64 	%fd<8>;
 	.reg .b64 	%rd<11>;
@@ -702,7 +678,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB10_5;
+	@%p1 bra 	BB10_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -727,16 +703,13 @@
 	@%p3 bra 	BB10_2;
 
 BB10_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB10_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 8;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f64 	[%rd10], %fd7;
 
-BB10_5:
+BB10_4:
 	ret;
 }
 
@@ -749,7 +722,7 @@
 	.param .u32 cumulative_min_up_sweep_f_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .f32 	%f<8>;
 	.reg .b32 	%r<20>;
 	.reg .b64 	%rd<11>;
@@ -767,7 +740,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB11_5;
+	@%p1 bra 	BB11_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -792,16 +765,13 @@
 	@%p3 bra 	BB11_2;
 
 BB11_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB11_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 4;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f32 	[%rd10], %f7;
 
-BB11_5:
+BB11_4:
 	ret;
 }
 
@@ -815,8 +785,8 @@
 	.param .u32 cumulative_min_down_sweep_d_param_5
 )
 {
-	.reg .pred 	%p<7>;
-	.reg .b32 	%r<22>;
+	.reg .pred 	%p<5>;
+	.reg .b32 	%r<21>;
 	.reg .f64 	%fd<11>;
 	.reg .b64 	%rd<15>;
 
@@ -837,47 +807,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB12_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f64 	%fd9, 0d7FF0000000000000;
-	@%p4 bra 	BB12_3;
+	@%p2 bra 	BB12_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 8;
+	mul.wide.u32 	%rd7, %r15, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd9, [%rd8];
 
 BB12_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 8;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 8;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f64 	%fd7, [%rd10];
 	min.f64 	%fd10, %fd9, %fd7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f64 	[%rd11], %fd10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB12_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB12_5;
 
 BB12_4:
-	mul.wide.s32 	%rd12, %r21, 8;
+	mul.wide.s32 	%rd12, %r20, 8;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f64 	%fd8, [%rd13];
 	min.f64 	%fd10, %fd10, %fd8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f64 	[%rd14], %fd10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB12_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB12_4;
 
 BB12_5:
 	ret;
@@ -893,9 +860,9 @@
 	.param .u32 cumulative_min_down_sweep_f_param_5
 )
 {
-	.reg .pred 	%p<7>;
+	.reg .pred 	%p<5>;
 	.reg .f32 	%f<11>;
-	.reg .b32 	%r<22>;
+	.reg .b32 	%r<21>;
 	.reg .b64 	%rd<15>;
 
 
@@ -915,47 +882,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB13_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f32 	%f9, 0f7F800000;
-	@%p4 bra 	BB13_3;
+	@%p2 bra 	BB13_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 4;
+	mul.wide.u32 	%rd7, %r15, 4;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f32 	%f9, [%rd8];
 
 BB13_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 4;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 4;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f32 	%f7, [%rd10];
 	min.f32 	%f10, %f9, %f7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f32 	[%rd11], %f10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB13_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB13_5;
 
 BB13_4:
-	mul.wide.s32 	%rd12, %r21, 4;
+	mul.wide.s32 	%rd12, %r20, 4;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f32 	%f8, [%rd13];
 	min.f32 	%f10, %f10, %f8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f32 	[%rd14], %f10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB13_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB13_4;
 
 BB13_5:
 	ret;
@@ -970,7 +934,7 @@
 	.param .u32 cumulative_max_up_sweep_d_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .b32 	%r<20>;
 	.reg .f64 	%fd<8>;
 	.reg .b64 	%rd<11>;
@@ -988,7 +952,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB14_5;
+	@%p1 bra 	BB14_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -1013,16 +977,13 @@
 	@%p3 bra 	BB14_2;
 
 BB14_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB14_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 8;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f64 	[%rd10], %fd7;
 
-BB14_5:
+BB14_4:
 	ret;
 }
 
@@ -1035,7 +996,7 @@
 	.param .u32 cumulative_max_up_sweep_f_param_4
 )
 {
-	.reg .pred 	%p<5>;
+	.reg .pred 	%p<4>;
 	.reg .f32 	%f<8>;
 	.reg .b32 	%r<20>;
 	.reg .b64 	%rd<11>;
@@ -1053,7 +1014,7 @@
 	mad.lo.s32 	%r1, %r10, %r11, %r12;
 	add.s32 	%r13, %r8, -1;
 	setp.gt.u32	%p1, %r1, %r13;
-	@%p1 bra 	BB15_5;
+	@%p1 bra 	BB15_4;
 
 	mov.u32 	%r14, %ctaid.y;
 	mul.lo.s32 	%r2, %r14, %r8;
@@ -1078,16 +1039,13 @@
 	@%p3 bra 	BB15_2;
 
 BB15_3:
-	setp.ge.u32	%p4, %r9, %r7;
-	@%p4 bra 	BB15_5;
-
 	add.s32 	%r18, %r1, %r2;
 	cvta.to.global.u64 	%rd8, %rd2;
 	mul.wide.u32 	%rd9, %r18, 4;
 	add.s64 	%rd10, %rd8, %rd9;
 	st.global.f32 	[%rd10], %f7;
 
-BB15_5:
+BB15_4:
 	ret;
 }
 
@@ -1101,8 +1059,8 @@
 	.param .u32 cumulative_max_down_sweep_d_param_5
 )
 {
-	.reg .pred 	%p<7>;
-	.reg .b32 	%r<22>;
+	.reg .pred 	%p<5>;
+	.reg .b32 	%r<21>;
 	.reg .f64 	%fd<11>;
 	.reg .b64 	%rd<15>;
 
@@ -1123,47 +1081,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB16_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f64 	%fd9, 0dFFF0000000000000;
-	@%p4 bra 	BB16_3;
+	@%p2 bra 	BB16_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 8;
+	mul.wide.u32 	%rd7, %r15, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd9, [%rd8];
 
 BB16_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 8;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 8;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f64 	%fd7, [%rd10];
 	max.f64 	%fd10, %fd9, %fd7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f64 	[%rd11], %fd10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB16_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB16_5;
 
 BB16_4:
-	mul.wide.s32 	%rd12, %r21, 8;
+	mul.wide.s32 	%rd12, %r20, 8;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f64 	%fd8, [%rd13];
 	max.f64 	%fd10, %fd10, %fd8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f64 	[%rd14], %fd10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB16_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB16_4;
 
 BB16_5:
 	ret;
@@ -1179,9 +1134,9 @@
 	.param .u32 cumulative_max_down_sweep_f_param_5
 )
 {
-	.reg .pred 	%p<7>;
+	.reg .pred 	%p<5>;
 	.reg .f32 	%f<11>;
-	.reg .b32 	%r<22>;
+	.reg .b32 	%r<21>;
 	.reg .b64 	%rd<15>;
 
 
@@ -1201,47 +1156,44 @@
 	setp.gt.u32	%p1, %r1, %r13;
 	@%p1 bra 	BB17_5;
 
-	mov.u32 	%r14, %ctaid.y;
-	mul.lo.s32 	%r2, %r14, %r8;
-	mov.u32 	%r15, %nctaid.y;
-	setp.lt.u32	%p2, %r15, 2;
-	setp.eq.s32	%p3, %r14, 0;
-	or.pred  	%p4, %p2, %p3;
+	mov.u32 	%r2, %ctaid.y;
+	setp.eq.s32	%p2, %r2, 0;
 	mov.f32 	%f9, 0fFF800000;
-	@%p4 bra 	BB17_3;
+	@%p2 bra 	BB17_3;
 
-	add.s32 	%r16, %r1, %r2;
-	add.s32 	%r17, %r16, -1;
+	add.s32 	%r14, %r2, -1;
+	mad.lo.s32 	%r15, %r14, %r8, %r1;
 	cvta.to.global.u64 	%rd6, %rd3;
-	mul.wide.s32 	%rd7, %r17, 4;
+	mul.wide.u32 	%rd7, %r15, 4;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f32 	%f9, [%rd8];
 
 BB17_3:
-	mad.lo.s32 	%r18, %r2, %r9, %r1;
-	mul.wide.u32 	%rd9, %r18, 4;
+	mul.lo.s32 	%r16, %r9, %r8;
+	mad.lo.s32 	%r17, %r16, %r2, %r1;
+	mul.wide.u32 	%rd9, %r17, 4;
 	add.s64 	%rd10, %rd2, %rd9;
 	ld.global.f32 	%f7, [%rd10];
 	max.f32 	%f10, %f9, %f7;
 	add.s64 	%rd11, %rd1, %rd9;
 	st.global.f32 	[%rd11], %f10;
-	mad.lo.s32 	%r19, %r9, %r8, %r18;
-	mul.lo.s32 	%r20, %r8, %r7;
-	min.u32 	%r3, %r19, %r20;
-	add.s32 	%r21, %r18, %r8;
-	setp.ge.u32	%p5, %r21, %r3;
-	@%p5 bra 	BB17_5;
+	mul.lo.s32 	%r18, %r8, %r7;
+	add.s32 	%r19, %r17, %r16;
+	min.u32 	%r3, %r19, %r18;
+	add.s32 	%r20, %r17, %r8;
+	setp.ge.u32	%p3, %r20, %r3;
+	@%p3 bra 	BB17_5;
 
 BB17_4:
-	mul.wide.s32 	%rd12, %r21, 4;
+	mul.wide.s32 	%rd12, %r20, 4;
 	add.s64 	%rd13, %rd2, %rd12;
 	ld.global.f32 	%f8, [%rd13];
 	max.f32 	%f10, %f10, %f8;
 	add.s64 	%rd14, %rd1, %rd12;
 	st.global.f32 	[%rd14], %f10;
-	add.s32 	%r21, %r21, %r8;
-	setp.lt.u32	%p6, %r21, %r3;
-	@%p6 bra 	BB17_4;
+	add.s32 	%r20, %r20, %r8;
+	setp.lt.u32	%p4, %r20, %r3;
+	@%p4 bra 	BB17_4;
 
 BB17_5:
 	ret;
diff --git a/src/main/cpp/kernels/cum_scan.cuh b/src/main/cpp/kernels/cum_scan.cuh
index 90b5a1a..e73488d 100644
--- a/src/main/cpp/kernels/cum_scan.cuh
+++ b/src/main/cpp/kernels/cum_scan.cuh
@@ -23,7 +23,8 @@
 #pragma once
 
 /**
- * Cumulative Scan - Applies <scanOp> to accumulate values over columns of an input matrix
+ * Cumulative Scan - Applies <scanOp> to accumulate values over columns of an input matrix.
+ *                   The up-sweep writes each block's accumulated result to the offset buffer once.
  * @param scanOp - Type of the functor object that implements the scan operation
  */
 // --------------------------------------------------------
@@ -35,8 +36,8 @@
 	if (blockIdx.x * blockDim.x + threadIdx.x > cols - 1)
 		return;
 
-	uint offset = blockIdx.y * cols * block_height + blockIdx.x * blockDim.x;
-	uint idx = offset + threadIdx.x;
+	uint block_offset = blockIdx.y * cols * block_height + blockIdx.x * blockDim.x;
+	uint idx = block_offset + threadIdx.x;
 
 	// initial accumulator value
 	T acc = g_idata[idx];
@@ -49,15 +50,16 @@
 		acc = scan_op(acc, g_idata[i]);
 
 	// write out accumulated block offset
-	if (block_height < rows)
-	{
-		g_tdata[blockIdx.y * cols + blockIdx.x * blockDim.x + threadIdx.x] = acc;
-		// if(threadIdx.x == 0)
-		// 	printf("blockIdx.y=%d, acc=%f\n", blockIdx.y, acc);
-	}
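+	// (with a single block row this buffer is never read: the down sweep
+	//  starts from the neutral element whenever blockIdx.y == 0)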
+	g_tdata[blockIdx.y * cols + blockIdx.x * blockDim.x + threadIdx.x] = acc;
 }
 
 // --------------------------------------------------------
+/**
+ * Cumulative Scan - Applies <scanOp> to accumulate values over columns of an input matrix.
+ *                   The down-sweep applies the per-block offsets and writes the accumulated values to the result buffer.
+ * @param scanOp - Type of the functor object that implements the scan operation
+ */
+// --------------------------------------------------------
 template<typename scanOp, typename NeutralElement, typename T>
 __device__ void cumulative_scan_down_sweep(T *g_idata, T *g_odata, T *g_tdata, uint rows, uint cols, uint block_height, 
 	scanOp scan_op)
@@ -67,19 +69,13 @@
 		return;
 
 	uint idx = blockIdx.y * cols * block_height + blockIdx.x * blockDim.x + threadIdx.x;
-	int offset_idx = blockIdx.y * cols + blockIdx.x * blockDim.x + threadIdx.x;
-	
-	// initial accumulator value
-	T acc = (gridDim.y > 1) ? ((blockIdx.y > 0) ? g_tdata[offset_idx-1] : NeutralElement::get()) : NeutralElement::get();
 
-	// if(threadIdx.x == 0)
-	// {
-	// 	printf("gridDim.y=%d, blockIdx.y=%d, down sweep acc=%f\n", gridDim.y, blockIdx.y, acc);
-	// 	printf("gridDim.y=%d, blockIdx.y=%d, g_tdata[%d]=%f\n", gridDim.y, blockIdx.y, idx, g_tdata[offset_idx]);
-	// }
+	// initial accumulator value: all but the first block row fetch their offset
+	// from the same column of the previous block row
+	T acc = (blockIdx.y > 0) ? g_tdata[(blockIdx.y - 1) * cols + blockIdx.x * blockDim.x + threadIdx.x]
+						     : NeutralElement::get();
 
 	g_odata[idx] = acc = scan_op(acc, g_idata[idx]);
-	
+
 	// loop through <block_height> number of items colwise
 	uint last_idx = min(idx + block_height * cols, rows * cols);