// blob: 4a30447f7a85bf329a5e71c435ccf93a0a8dcb2a [file] [log] [blame]
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-27506705
// Cuda compilation tools, release 10.2, V10.2.89
// Based on LLVM 3.4svn
//
// PTX ISA version, target architecture and pointer width for the whole module.
.version 6.5
.target sm_30
.address_size 64
// .globl double2float_f
// Prototype of the externally defined vprintf used by the reduce_sum_* and
// reduce_max_* kernels in this module. The call sites pass the generic address
// of the format string as param_0 and the generic address of a packed
// argument buffer as param_1; a 32-bit value is returned.
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
// NUL-terminated ASCII bytes of the format string
// "blockIdx.x=%d reduction result: %3.1f\n" (38 characters + terminator).
.global .align 1 .b8 $str[39] = {98, 108, 111, 99, 107, 73, 100, 120, 46, 120, 61, 37, 100, 32, 114, 101, 100, 117, 99, 116, 105, 111, 110, 32, 114, 101, 115, 117, 108, 116, 58, 32, 37, 51, 46, 49, 102, 10, 0};
// Unsized, externally provided shared-memory byte array. The reduction kernels
// in this module use it as one scratch slot per thread, indexed by
// tid.x * element size (8 bytes in the _d kernels, 4 bytes in the _f kernels).
.extern .shared .align 1 .b8 memory[];
// double2float_f: element-wise f64 -> f32 conversion.
//   param_0: generic address of the f64 input (read at index * 8)
//   param_1: generic address of the f32 output (written at index * 4)
//   param_2: element count, compared as a signed 32-bit value
// Each thread handles the single index ntid.x * ctaid.x + tid.x and does
// nothing when that index is not below the element count.
.visible .entry double2float_f(
.param .u64 double2float_f_param_0,
.param .u64 double2float_f_param_1,
.param .u32 double2float_f_param_2
)
{
.reg .pred %in_range;
.reg .f32 %narrow;
.reg .b32 %count, %blk, %blk_dim, %thr, %gid;
.reg .f64 %wide;
.reg .b64 %src_gen, %dst_gen, %src, %dst, %src_off, %dst_off, %src_addr, %dst_addr;
// Global element index of this thread.
mov.u32 %blk_dim, %ntid.x;
mov.u32 %blk, %ctaid.x;
mov.u32 %thr, %tid.x;
mad.lo.s32 %gid, %blk_dim, %blk, %thr;
// Bounds guard (signed compare, as in the original code).
ld.param.u32 %count, [double2float_f_param_2];
setp.lt.s32 %in_range, %gid, %count;
@!%in_range bra D2F_EXIT;
// Load the f64 element.
ld.param.u64 %src_gen, [double2float_f_param_0];
cvta.to.global.u64 %src, %src_gen;
mul.wide.s32 %src_off, %gid, 8;
add.s64 %src_addr, %src, %src_off;
ld.global.f64 %wide, [%src_addr];
// Narrow with round-to-nearest-even.
cvt.rn.f32.f64 %narrow, %wide;
// Store the f32 element.
ld.param.u64 %dst_gen, [double2float_f_param_1];
cvta.to.global.u64 %dst, %dst_gen;
mul.wide.s32 %dst_off, %gid, 4;
add.s64 %dst_addr, %dst, %dst_off;
st.global.f32 [%dst_addr], %narrow;
D2F_EXIT:
ret;
}
// .globl float2double_f
// float2double_f: element-wise f32 -> f64 conversion.
//   param_0: generic address of the f32 input (read at index * 4)
//   param_1: generic address of the f64 output (written at index * 8)
//   param_2: element count, compared as a signed 32-bit value
// Each thread handles the single index ntid.x * ctaid.x + tid.x and does
// nothing when that index is not below the element count.
.visible .entry float2double_f(
.param .u64 float2double_f_param_0,
.param .u64 float2double_f_param_1,
.param .u32 float2double_f_param_2
)
{
.reg .pred %in_range;
.reg .f32 %narrow;
.reg .b32 %count, %blk, %blk_dim, %thr, %gid;
.reg .f64 %wide;
.reg .b64 %src_gen, %dst_gen, %src, %dst, %src_off, %dst_off, %src_addr, %dst_addr;
// Global element index of this thread.
mov.u32 %blk_dim, %ntid.x;
mov.u32 %blk, %ctaid.x;
mov.u32 %thr, %tid.x;
mad.lo.s32 %gid, %blk_dim, %blk, %thr;
// Bounds guard (signed compare, as in the original code).
ld.param.u32 %count, [float2double_f_param_2];
setp.lt.s32 %in_range, %gid, %count;
@!%in_range bra F2D_EXIT;
// Load the f32 element.
ld.param.u64 %src_gen, [float2double_f_param_0];
cvta.to.global.u64 %src, %src_gen;
mul.wide.s32 %src_off, %gid, 4;
add.s64 %src_addr, %src, %src_off;
ld.global.f32 %narrow, [%src_addr];
// Widen to f64.
cvt.f64.f32 %wide, %narrow;
// Store the f64 element.
ld.param.u64 %dst_gen, [float2double_f_param_1];
cvta.to.global.u64 %dst, %dst_gen;
mul.wide.s32 %dst_off, %gid, 8;
add.s64 %dst_addr, %dst, %dst_off;
st.global.f64 [%dst_addr], %wide;
F2D_EXIT:
ret;
}
// .globl reduce_sum_d
// reduce_sum_d: per-block sum of f64 values.
//   param_0: generic address of the f64 input
//   param_1: generic address of the f64 output; thread 0 of each block writes
//            the block's total to element ctaid.x
//   param_2: number of input elements (unsigned compares)
// Phases: (1) each thread accumulates a private sum over the input,
// (2) the private sums are combined through the shared array `memory`,
// (3) thread 0 optionally prints the result via vprintf and stores it.
.visible .entry reduce_sum_d(
.param .u64 reduce_sum_d_param_0,
.param .u64 reduce_sum_d_param_1,
.param .u32 reduce_sum_d_param_2
)
{
// 16-byte local buffer for the packed vprintf arguments (u32 at +0, f64 at +8).
.local .align 16 .b8 __local_depot2[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .b32 %r<39>;
.reg .f64 %fd<61>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the buffer, %SP = its generic address.
mov.u64 %SPL, __local_depot2;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_sum_d_param_0];
ld.param.u64 %rd2, [reduce_sum_d_param_1];
ld.param.u32 %r6, [reduce_sum_d_param_2];
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
// %r38 = i = 2 * ctaid.x * ntid.x + tid.x, the first index this thread reads.
mad.lo.s32 %r38, %r8, %r9, %r10;
// %fd45 = private running sum, starting at 0.0.
mov.f64 %fd45, 0d0000000000000000;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB2_4;
// Phase 1 loop: add in[i], then in[i + ntid.x] if that index is in range,
// then advance i by 2 * ntid.x * nctaid.x until i >= n.
BB2_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
add.f64 %fd45, %fd45, %fd30;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB2_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
add.f64 %fd45, %fd45, %fd31;
BB2_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB2_1;
// Phase 2: publish the private sum in this thread's slot (%r5 = memory + tid.x * 8)
// and wait for the whole block.
BB2_4:
shl.b32 %r16, %r10, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd45;
bar.sync 0;
// Halving steps with a barrier after each one. A step runs only when ntid.x is
// at least the stated size; byte offsets are (slot distance * 8).
// ntid.x >= 1024: threads 0..511 add slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB2_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB2_7;
ld.shared.f64 %fd32, [%r5+4096];
add.f64 %fd45, %fd45, %fd32;
st.shared.f64 [%r5], %fd45;
BB2_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 add slot tid.x + 256.
BB2_8:
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB2_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB2_11;
ld.shared.f64 %fd33, [%r5+2048];
add.f64 %fd45, %fd45, %fd33;
st.shared.f64 [%r5], %fd45;
BB2_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 add slot tid.x + 128.
BB2_12:
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB2_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB2_15;
ld.shared.f64 %fd34, [%r5+1024];
add.f64 %fd45, %fd45, %fd34;
st.shared.f64 [%r5], %fd45;
BB2_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 add slot tid.x + 64.
BB2_16:
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB2_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB2_19;
ld.shared.f64 %fd35, [%r5+512];
add.f64 %fd45, %fd45, %fd35;
st.shared.f64 [%r5], %fd45;
BB2_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB2_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB2_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB2_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
add.f64 %fd45, %fd45, %fd36;
st.volatile.shared.f64 [%r5], %fd45;
BB2_23:
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB2_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
add.f64 %fd45, %fd45, %fd37;
st.volatile.shared.f64 [%r5], %fd45;
BB2_25:
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB2_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
add.f64 %fd45, %fd45, %fd38;
st.volatile.shared.f64 [%r5], %fd45;
BB2_27:
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB2_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
add.f64 %fd45, %fd45, %fd39;
st.volatile.shared.f64 [%r5], %fd45;
BB2_29:
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB2_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
add.f64 %fd45, %fd45, %fd40;
st.volatile.shared.f64 [%r5], %fd45;
BB2_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB2_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
add.f64 %fd42, %fd45, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
// Phase 3: only thread 0 continues.
BB2_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB2_37;
// The vprintf call is skipped when nctaid.x > 9, i.e. it runs only for grids
// of at most 9 blocks.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB2_36;
// Pack the arguments: ctaid.x at +0, slot 0 of `memory` at +8, then call
// vprintf($str, buffer). The return value lands in %r36 and is not used.
ld.shared.f64 %fd43, [memory];
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd43;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 0
// Store slot 0 of `memory` to out[ctaid.x].
BB2_36:
ld.shared.f64 %fd44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 8;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd44;
BB2_37:
ret;
}
// .globl reduce_sum_f
// reduce_sum_f: per-block sum of f32 values.
//   param_0: generic address of the f32 input
//   param_1: generic address of the f32 output; thread 0 of each block writes
//            the block's total to element ctaid.x
//   param_2: number of input elements (unsigned compares)
// Phases: (1) each thread accumulates a private sum over the input,
// (2) the private sums are combined through the shared array `memory`,
// (3) thread 0 optionally prints the result via vprintf and stores it.
.visible .entry reduce_sum_f(
.param .u64 reduce_sum_f_param_0,
.param .u64 reduce_sum_f_param_1,
.param .u32 reduce_sum_f_param_2
)
{
// 16-byte local buffer for the packed vprintf arguments (u32 at +0, f64 at +8).
.local .align 16 .b8 __local_depot3[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .f32 %f<61>;
.reg .b32 %r<39>;
.reg .f64 %fd<2>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the buffer, %SP = its generic address.
mov.u64 %SPL, __local_depot3;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_sum_f_param_0];
ld.param.u64 %rd2, [reduce_sum_f_param_1];
ld.param.u32 %r6, [reduce_sum_f_param_2];
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
// %r38 = i = 2 * ctaid.x * ntid.x + tid.x, the first index this thread reads.
mad.lo.s32 %r38, %r8, %r9, %r10;
// %f45 = private running sum, starting at 0.0.
mov.f32 %f45, 0f00000000;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB3_4;
// Phase 1 loop: add in[i], then in[i + ntid.x] if that index is in range,
// then advance i by 2 * ntid.x * nctaid.x until i >= n.
BB3_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
add.f32 %f45, %f45, %f30;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB3_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
add.f32 %f45, %f45, %f31;
BB3_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB3_1;
// Phase 2: publish the private sum in this thread's slot (%r5 = memory + tid.x * 4)
// and wait for the whole block.
BB3_4:
shl.b32 %r16, %r10, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f45;
bar.sync 0;
// Halving steps with a barrier after each one. A step runs only when ntid.x is
// at least the stated size; byte offsets are (slot distance * 4).
// ntid.x >= 1024: threads 0..511 add slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB3_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB3_7;
ld.shared.f32 %f32, [%r5+2048];
add.f32 %f45, %f45, %f32;
st.shared.f32 [%r5], %f45;
BB3_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 add slot tid.x + 256.
BB3_8:
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB3_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB3_11;
ld.shared.f32 %f33, [%r5+1024];
add.f32 %f45, %f45, %f33;
st.shared.f32 [%r5], %f45;
BB3_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 add slot tid.x + 128.
BB3_12:
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB3_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB3_15;
ld.shared.f32 %f34, [%r5+512];
add.f32 %f45, %f45, %f34;
st.shared.f32 [%r5], %f45;
BB3_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 add slot tid.x + 64.
BB3_16:
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB3_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB3_19;
ld.shared.f32 %f35, [%r5+256];
add.f32 %f45, %f45, %f35;
st.shared.f32 [%r5], %f45;
BB3_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB3_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB3_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB3_23;
ld.volatile.shared.f32 %f36, [%r5+128];
add.f32 %f45, %f45, %f36;
st.volatile.shared.f32 [%r5], %f45;
BB3_23:
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB3_25;
ld.volatile.shared.f32 %f37, [%r5+64];
add.f32 %f45, %f45, %f37;
st.volatile.shared.f32 [%r5], %f45;
BB3_25:
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB3_27;
ld.volatile.shared.f32 %f38, [%r5+32];
add.f32 %f45, %f45, %f38;
st.volatile.shared.f32 [%r5], %f45;
BB3_27:
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB3_29;
ld.volatile.shared.f32 %f39, [%r5+16];
add.f32 %f45, %f45, %f39;
st.volatile.shared.f32 [%r5], %f45;
BB3_29:
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB3_31;
ld.volatile.shared.f32 %f40, [%r5+8];
add.f32 %f45, %f45, %f40;
st.volatile.shared.f32 [%r5], %f45;
BB3_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB3_33;
ld.volatile.shared.f32 %f41, [%r5+4];
add.f32 %f42, %f45, %f41;
st.volatile.shared.f32 [%r5], %f42;
// Phase 3: only thread 0 continues.
BB3_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB3_37;
// The vprintf call is skipped when nctaid.x > 9, i.e. it runs only for grids
// of at most 9 blocks.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB3_36;
// Pack the arguments: ctaid.x at +0, slot 0 of `memory` widened to f64 at +8,
// then call vprintf($str, buffer). The return value lands in %r36 and is not used.
ld.shared.f32 %f43, [memory];
cvt.f64.f32 %fd1, %f43;
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd1;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 1
// Store slot 0 of `memory` to out[ctaid.x].
BB3_36:
ld.shared.f32 %f44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 4;
add.s64 %rd15, %rd13, %rd14;
st.global.f32 [%rd15], %f44;
BB3_37:
ret;
}
// .globl reduce_row_sum_d
// reduce_row_sum_d: one thread block per row, f64 sum of that row.
//   param_0: generic address of the f64 input, read at index
//            ctaid.x * param_3 + j for j < param_3
//   param_1: generic address of the f64 output; thread 0 writes element ctaid.x
//   param_2: row count (blocks with ctaid.x >= param_2 return immediately)
//   param_3: number of elements per row
// Threads accumulate private sums over j = tid.x, tid.x + ntid.x, ..., then
// combine them through the shared array `memory`.
.visible .entry reduce_row_sum_d(
.param .u64 reduce_row_sum_d_param_0,
.param .u64 reduce_row_sum_d_param_1,
.param .u32 reduce_row_sum_d_param_2,
.param .u32 reduce_row_sum_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_sum_d_param_0];
ld.param.u64 %rd2, [reduce_row_sum_d_param_1];
ld.param.u32 %r5, [reduce_row_sum_d_param_2];
ld.param.u32 %r4, [reduce_row_sum_d_param_3];
// %r6 = row handled by this block; the whole block exits if it is out of range,
// so the barriers below are never reached by only part of a block.
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB4_35;
// %r71 = column cursor j, %fd6 = private running sum (0.0).
mov.u32 %r71, %tid.x;
mov.f64 %fd6, 0d0000000000000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB4_4;
cvta.to.global.u64 %rd3, %rd1;
// Loop: add in[row * param_3 + j]; j += ntid.x; repeat while j < param_3.
BB4_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
add.f64 %fd6, %fd6, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB4_3;
// Publish the private sum in this thread's slot (%r13 = memory + tid.x * 8)
// and wait for the whole block.
BB4_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
// Halving steps with a barrier after each one (%r14 = ntid.x). A step runs
// only when ntid.x is at least the stated size; byte offsets are distance * 8.
// ntid.x >= 1024: threads 0..511 add slot tid.x + 512.
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB4_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB4_7;
ld.shared.f64 %fd29, [%r13+4096];
add.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB4_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 add slot tid.x + 256.
BB4_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB4_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB4_11;
ld.shared.f64 %fd30, [%r13+2048];
add.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB4_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 add slot tid.x + 128.
BB4_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB4_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB4_15;
ld.shared.f64 %fd31, [%r13+1024];
add.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB4_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 add slot tid.x + 64.
BB4_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB4_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB4_19;
ld.shared.f64 %fd32, [%r13+512];
add.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB4_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB4_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB4_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB4_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
add.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB4_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB4_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
add.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB4_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB4_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
add.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB4_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB4_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
add.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB4_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB4_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
add.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB4_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB4_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
add.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
// Thread 0 stores slot 0 of `memory` to out[row].
BB4_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB4_35;
ld.shared.f64 %fd40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd40;
BB4_35:
ret;
}
// .globl reduce_row_sum_f
// reduce_row_sum_f: one thread block per row, f32 sum of that row.
//   param_0: generic address of the f32 input, read at index
//            ctaid.x * param_3 + j for j < param_3
//   param_1: generic address of the f32 output; thread 0 writes element ctaid.x
//   param_2: row count (blocks with ctaid.x >= param_2 return immediately)
//   param_3: number of elements per row
// Threads accumulate private sums over j = tid.x, tid.x + ntid.x, ..., then
// combine them through the shared array `memory`.
.visible .entry reduce_row_sum_f(
.param .u64 reduce_row_sum_f_param_0,
.param .u64 reduce_row_sum_f_param_1,
.param .u32 reduce_row_sum_f_param_2,
.param .u32 reduce_row_sum_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<56>;
.reg .b32 %r<72>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_sum_f_param_0];
ld.param.u64 %rd2, [reduce_row_sum_f_param_1];
ld.param.u32 %r5, [reduce_row_sum_f_param_2];
ld.param.u32 %r4, [reduce_row_sum_f_param_3];
// %r6 = row handled by this block; the whole block exits if it is out of range,
// so the barriers below are never reached by only part of a block.
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB5_35;
// %r71 = column cursor j, %f6 = private running sum (0.0).
mov.u32 %r71, %tid.x;
mov.f32 %f6, 0f00000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB5_4;
cvta.to.global.u64 %rd3, %rd1;
// Loop: add in[row * param_3 + j]; j += ntid.x; repeat while j < param_3.
BB5_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
add.f32 %f6, %f6, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB5_3;
// Publish the private sum in this thread's slot (%r13 = memory + tid.x * 4)
// and wait for the whole block.
BB5_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 2;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f32 [%r13], %f6;
bar.sync 0;
// Halving steps with a barrier after each one (%r14 = ntid.x). A step runs
// only when ntid.x is at least the stated size; byte offsets are distance * 4.
// ntid.x >= 1024: threads 0..511 add slot tid.x + 512.
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB5_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB5_7;
ld.shared.f32 %f29, [%r13+2048];
add.f32 %f6, %f6, %f29;
st.shared.f32 [%r13], %f6;
BB5_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 add slot tid.x + 256.
BB5_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB5_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB5_11;
ld.shared.f32 %f30, [%r13+1024];
add.f32 %f6, %f6, %f30;
st.shared.f32 [%r13], %f6;
BB5_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 add slot tid.x + 128.
BB5_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB5_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB5_15;
ld.shared.f32 %f31, [%r13+512];
add.f32 %f6, %f6, %f31;
st.shared.f32 [%r13], %f6;
BB5_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 add slot tid.x + 64.
BB5_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB5_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB5_19;
ld.shared.f32 %f32, [%r13+256];
add.f32 %f6, %f6, %f32;
st.shared.f32 [%r13], %f6;
BB5_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB5_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB5_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB5_23;
ld.volatile.shared.f32 %f33, [%r13+128];
add.f32 %f6, %f6, %f33;
st.volatile.shared.f32 [%r13], %f6;
BB5_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB5_25;
ld.volatile.shared.f32 %f34, [%r13+64];
add.f32 %f6, %f6, %f34;
st.volatile.shared.f32 [%r13], %f6;
BB5_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB5_27;
ld.volatile.shared.f32 %f35, [%r13+32];
add.f32 %f6, %f6, %f35;
st.volatile.shared.f32 [%r13], %f6;
BB5_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB5_29;
ld.volatile.shared.f32 %f36, [%r13+16];
add.f32 %f6, %f6, %f36;
st.volatile.shared.f32 [%r13], %f6;
BB5_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB5_31;
ld.volatile.shared.f32 %f37, [%r13+8];
add.f32 %f6, %f6, %f37;
st.volatile.shared.f32 [%r13], %f6;
BB5_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB5_33;
ld.volatile.shared.f32 %f38, [%r13+4];
add.f32 %f39, %f6, %f38;
st.volatile.shared.f32 [%r13], %f39;
// Thread 0 stores slot 0 of `memory` to out[row].
BB5_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB5_35;
ld.shared.f32 %f40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f40;
BB5_35:
ret;
}
// .globl reduce_col_sum_d
// reduce_col_sum_d: one thread per column, f64 sum down that column.
//   param_0: generic address of the f64 input
//   param_1: generic address of the f64 output, written at the column index
//   param_2, param_3: their 32-bit product bounds the input index, and
//            param_3 is both the column count and the index stride
// Thread c = ntid.x * ctaid.x + tid.x (c < param_3) adds in[c], in[c + param_3],
// ... while the index stays below param_2 * param_3, then stores the sum
// (0.0 if nothing was added) to out[c].
.visible .entry reduce_col_sum_d(
.param .u64 reduce_col_sum_d_param_0,
.param .u64 reduce_col_sum_d_param_1,
.param .u32 reduce_col_sum_d_param_2,
.param .u32 reduce_col_sum_d_param_3
)
{
.reg .pred %no_col, %no_elems, %more;
.reg .b32 %rows, %cols, %blk_dim, %blk, %thr, %col, %total, %idx;
.reg .f64 %acc, %elem;
.reg .b64 %in_gen, %out_gen, %in, %out, %in_off, %in_addr, %out_off, %out_addr;
// Column index of this thread; threads past the last column exit.
ld.param.u32 %cols, [reduce_col_sum_d_param_3];
mov.u32 %blk_dim, %ntid.x;
mov.u32 %blk, %ctaid.x;
mov.u32 %thr, %tid.x;
mad.lo.s32 %col, %blk_dim, %blk, %thr;
setp.ge.u32 %no_col, %col, %cols;
@%no_col bra COLSUM_D_EXIT;
// Exclusive upper bound for the element index (32-bit product).
ld.param.u32 %rows, [reduce_col_sum_d_param_2];
mul.lo.s32 %total, %cols, %rows;
ld.param.u64 %in_gen, [reduce_col_sum_d_param_0];
cvta.to.global.u64 %in, %in_gen;
mov.f64 %acc, 0d0000000000000000;
setp.ge.u32 %no_elems, %col, %total;
@%no_elems bra COLSUM_D_STORE;
mov.u32 %idx, %col;
// Walk the column with stride %cols.
COLSUM_D_LOOP:
mul.wide.u32 %in_off, %idx, 8;
add.s64 %in_addr, %in, %in_off;
ld.global.f64 %elem, [%in_addr];
add.f64 %acc, %acc, %elem;
add.s32 %idx, %idx, %cols;
setp.lt.u32 %more, %idx, %total;
@%more bra COLSUM_D_LOOP;
COLSUM_D_STORE:
ld.param.u64 %out_gen, [reduce_col_sum_d_param_1];
cvta.to.global.u64 %out, %out_gen;
mul.wide.u32 %out_off, %col, 8;
add.s64 %out_addr, %out, %out_off;
st.global.f64 [%out_addr], %acc;
COLSUM_D_EXIT:
ret;
}
// .globl reduce_col_sum_f
// reduce_col_sum_f: one thread per column, f32 sum down that column.
//   param_0: generic address of the f32 input
//   param_1: generic address of the f32 output, written at the column index
//   param_2, param_3: their 32-bit product bounds the input index, and
//            param_3 is both the column count and the index stride
// Thread c = ntid.x * ctaid.x + tid.x (c < param_3) adds in[c], in[c + param_3],
// ... while the index stays below param_2 * param_3, then stores the sum
// (0.0 if nothing was added) to out[c].
.visible .entry reduce_col_sum_f(
.param .u64 reduce_col_sum_f_param_0,
.param .u64 reduce_col_sum_f_param_1,
.param .u32 reduce_col_sum_f_param_2,
.param .u32 reduce_col_sum_f_param_3
)
{
.reg .pred %no_col, %no_elems, %more;
.reg .f32 %acc, %elem;
.reg .b32 %rows, %cols, %blk_dim, %blk, %thr, %col, %total, %idx;
.reg .b64 %in_gen, %out_gen, %in, %out, %in_off, %in_addr, %out_off, %out_addr;
// Column index of this thread; threads past the last column exit.
ld.param.u32 %cols, [reduce_col_sum_f_param_3];
mov.u32 %blk_dim, %ntid.x;
mov.u32 %blk, %ctaid.x;
mov.u32 %thr, %tid.x;
mad.lo.s32 %col, %blk_dim, %blk, %thr;
setp.ge.u32 %no_col, %col, %cols;
@%no_col bra COLSUM_F_EXIT;
// Exclusive upper bound for the element index (32-bit product).
ld.param.u32 %rows, [reduce_col_sum_f_param_2];
mul.lo.s32 %total, %cols, %rows;
ld.param.u64 %in_gen, [reduce_col_sum_f_param_0];
cvta.to.global.u64 %in, %in_gen;
mov.f32 %acc, 0f00000000;
setp.ge.u32 %no_elems, %col, %total;
@%no_elems bra COLSUM_F_STORE;
mov.u32 %idx, %col;
// Walk the column with stride %cols.
COLSUM_F_LOOP:
mul.wide.u32 %in_off, %idx, 4;
add.s64 %in_addr, %in, %in_off;
ld.global.f32 %elem, [%in_addr];
add.f32 %acc, %acc, %elem;
add.s32 %idx, %idx, %cols;
setp.lt.u32 %more, %idx, %total;
@%more bra COLSUM_F_LOOP;
COLSUM_F_STORE:
ld.param.u64 %out_gen, [reduce_col_sum_f_param_1];
cvta.to.global.u64 %out, %out_gen;
mul.wide.u32 %out_off, %col, 4;
add.s64 %out_addr, %out, %out_off;
st.global.f32 [%out_addr], %acc;
COLSUM_F_EXIT:
ret;
}
// .globl reduce_max_d
// reduce_max_d: per-block maximum of f64 values.
//   param_0: generic address of the f64 input
//   param_1: generic address of the f64 output; thread 0 of each block writes
//            the block's maximum to element ctaid.x
//   param_2: number of input elements (unsigned compares)
// Phases: (1) each thread accumulates a private maximum over the input,
// (2) the private maxima are combined through the shared array `memory`,
// (3) thread 0 optionally prints the result via vprintf and stores it.
//
// Change relative to the compiler output: the accumulator starts at -inf
// (0dFFF0000000000000) instead of -DBL_MAX (0dFFEFFFFFFFFFFFFF), so an input
// whose elements are all -inf reduces to -inf rather than -DBL_MAX. A thread
// that reads no element now contributes -inf.
// NOTE(review): this file is generated; apply the same identity in the CUDA
// source before regenerating.
.visible .entry reduce_max_d(
.param .u64 reduce_max_d_param_0,
.param .u64 reduce_max_d_param_1,
.param .u32 reduce_max_d_param_2
)
{
// 16-byte local buffer for the packed vprintf arguments (u32 at +0, f64 at +8).
.local .align 16 .b8 __local_depot8[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .b32 %r<39>;
.reg .f64 %fd<61>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the buffer, %SP = its generic address.
mov.u64 %SPL, __local_depot8;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_max_d_param_0];
ld.param.u64 %rd2, [reduce_max_d_param_1];
ld.param.u32 %r6, [reduce_max_d_param_2];
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
// %r38 = i = 2 * ctaid.x * ntid.x + tid.x, the first index this thread reads.
mad.lo.s32 %r38, %r8, %r9, %r10;
// %fd45 = private running maximum, starting at -inf (identity of max).
mov.f64 %fd45, 0dFFF0000000000000;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB8_4;
// Phase 1 loop: fold in[i], then in[i + ntid.x] if that index is in range,
// then advance i by 2 * ntid.x * nctaid.x until i >= n.
BB8_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
max.f64 %fd45, %fd45, %fd30;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB8_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
max.f64 %fd45, %fd45, %fd31;
BB8_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB8_1;
// Phase 2: publish the private maximum in this thread's slot
// (%r5 = memory + tid.x * 8) and wait for the whole block.
BB8_4:
shl.b32 %r16, %r10, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd45;
bar.sync 0;
// Halving steps with a barrier after each one. A step runs only when ntid.x is
// at least the stated size; byte offsets are (slot distance * 8).
// ntid.x >= 1024: threads 0..511 fold slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB8_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB8_7;
ld.shared.f64 %fd32, [%r5+4096];
max.f64 %fd45, %fd45, %fd32;
st.shared.f64 [%r5], %fd45;
BB8_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 fold slot tid.x + 256.
BB8_8:
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB8_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB8_11;
ld.shared.f64 %fd33, [%r5+2048];
max.f64 %fd45, %fd45, %fd33;
st.shared.f64 [%r5], %fd45;
BB8_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 fold slot tid.x + 128.
BB8_12:
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB8_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB8_15;
ld.shared.f64 %fd34, [%r5+1024];
max.f64 %fd45, %fd45, %fd34;
st.shared.f64 [%r5], %fd45;
BB8_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 fold slot tid.x + 64.
BB8_16:
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB8_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB8_19;
ld.shared.f64 %fd35, [%r5+512];
max.f64 %fd45, %fd45, %fd35;
st.shared.f64 [%r5], %fd45;
BB8_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB8_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB8_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB8_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
max.f64 %fd45, %fd45, %fd36;
st.volatile.shared.f64 [%r5], %fd45;
BB8_23:
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB8_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
max.f64 %fd45, %fd45, %fd37;
st.volatile.shared.f64 [%r5], %fd45;
BB8_25:
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB8_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
max.f64 %fd45, %fd45, %fd38;
st.volatile.shared.f64 [%r5], %fd45;
BB8_27:
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB8_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
max.f64 %fd45, %fd45, %fd39;
st.volatile.shared.f64 [%r5], %fd45;
BB8_29:
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB8_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
max.f64 %fd45, %fd45, %fd40;
st.volatile.shared.f64 [%r5], %fd45;
BB8_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB8_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
max.f64 %fd42, %fd45, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
// Phase 3: only thread 0 continues.
BB8_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB8_37;
// The vprintf call is skipped when nctaid.x > 9, i.e. it runs only for grids
// of at most 9 blocks.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB8_36;
// Pack the arguments: ctaid.x at +0, slot 0 of `memory` at +8, then call
// vprintf($str, buffer). The return value lands in %r36 and is not used.
ld.shared.f64 %fd43, [memory];
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd43;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// Callseq Start 2
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 2
// Store slot 0 of `memory` to out[ctaid.x].
BB8_36:
ld.shared.f64 %fd44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 8;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd44;
BB8_37:
ret;
}
// .globl reduce_max_f
// reduce_max_f: per-block maximum of f32 values.
//   param_0: generic address of the f32 input
//   param_1: generic address of the f32 output; thread 0 of each block writes
//            the block's maximum to element ctaid.x
//   param_2: number of input elements (unsigned compares)
// Phases: (1) each thread accumulates a private maximum over the input,
// (2) the private maxima are combined through the shared array `memory`,
// (3) thread 0 optionally prints the result via vprintf and stores it.
//
// Change relative to the compiler output: the accumulator starts at -inf
// (0fFF800000) instead of -FLT_MAX (0fFF7FFFFF), so an input whose elements
// are all -inf reduces to -inf rather than -FLT_MAX. A thread that reads no
// element now contributes -inf.
// NOTE(review): this file is generated; apply the same identity in the CUDA
// source before regenerating.
.visible .entry reduce_max_f(
.param .u64 reduce_max_f_param_0,
.param .u64 reduce_max_f_param_1,
.param .u32 reduce_max_f_param_2
)
{
// 16-byte local buffer for the packed vprintf arguments (u32 at +0, f64 at +8).
.local .align 16 .b8 __local_depot9[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .f32 %f<61>;
.reg .b32 %r<39>;
.reg .f64 %fd<2>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the buffer, %SP = its generic address.
mov.u64 %SPL, __local_depot9;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_max_f_param_0];
ld.param.u64 %rd2, [reduce_max_f_param_1];
ld.param.u32 %r6, [reduce_max_f_param_2];
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
// %r38 = i = 2 * ctaid.x * ntid.x + tid.x, the first index this thread reads.
mad.lo.s32 %r38, %r8, %r9, %r10;
// %f45 = private running maximum, starting at -inf (identity of max).
mov.f32 %f45, 0fFF800000;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB9_4;
// Phase 1 loop: fold in[i], then in[i + ntid.x] if that index is in range,
// then advance i by 2 * ntid.x * nctaid.x until i >= n.
BB9_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
max.f32 %f45, %f45, %f30;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB9_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
max.f32 %f45, %f45, %f31;
BB9_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB9_1;
// Phase 2: publish the private maximum in this thread's slot
// (%r5 = memory + tid.x * 4) and wait for the whole block.
BB9_4:
shl.b32 %r16, %r10, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f45;
bar.sync 0;
// Halving steps with a barrier after each one. A step runs only when ntid.x is
// at least the stated size; byte offsets are (slot distance * 4).
// ntid.x >= 1024: threads 0..511 fold slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB9_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB9_7;
ld.shared.f32 %f32, [%r5+2048];
max.f32 %f45, %f45, %f32;
st.shared.f32 [%r5], %f45;
BB9_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 fold slot tid.x + 256.
BB9_8:
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB9_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB9_11;
ld.shared.f32 %f33, [%r5+1024];
max.f32 %f45, %f45, %f33;
st.shared.f32 [%r5], %f45;
BB9_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 fold slot tid.x + 128.
BB9_12:
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB9_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB9_15;
ld.shared.f32 %f34, [%r5+512];
max.f32 %f45, %f45, %f34;
st.shared.f32 [%r5], %f45;
BB9_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 fold slot tid.x + 64.
BB9_16:
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB9_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB9_19;
ld.shared.f32 %f35, [%r5+256];
max.f32 %f45, %f45, %f35;
st.shared.f32 [%r5], %f45;
BB9_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB9_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB9_33;
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB9_23;
ld.volatile.shared.f32 %f36, [%r5+128];
max.f32 %f45, %f45, %f36;
st.volatile.shared.f32 [%r5], %f45;
BB9_23:
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB9_25;
ld.volatile.shared.f32 %f37, [%r5+64];
max.f32 %f45, %f45, %f37;
st.volatile.shared.f32 [%r5], %f45;
BB9_25:
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB9_27;
ld.volatile.shared.f32 %f38, [%r5+32];
max.f32 %f45, %f45, %f38;
st.volatile.shared.f32 [%r5], %f45;
BB9_27:
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB9_29;
ld.volatile.shared.f32 %f39, [%r5+16];
max.f32 %f45, %f45, %f39;
st.volatile.shared.f32 [%r5], %f45;
BB9_29:
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB9_31;
ld.volatile.shared.f32 %f40, [%r5+8];
max.f32 %f45, %f45, %f40;
st.volatile.shared.f32 [%r5], %f45;
BB9_31:
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB9_33;
ld.volatile.shared.f32 %f41, [%r5+4];
max.f32 %f42, %f45, %f41;
st.volatile.shared.f32 [%r5], %f42;
// Phase 3: only thread 0 continues.
BB9_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB9_37;
// The vprintf call is skipped when nctaid.x > 9, i.e. it runs only for grids
// of at most 9 blocks.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB9_36;
// Pack the arguments: ctaid.x at +0, slot 0 of `memory` widened to f64 at +8,
// then call vprintf($str, buffer). The return value lands in %r36 and is not used.
ld.shared.f32 %f43, [memory];
cvt.f64.f32 %fd1, %f43;
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd1;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// Callseq Start 3
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 3
// Store slot 0 of `memory` to out[ctaid.x].
BB9_36:
ld.shared.f32 %f44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 4;
add.s64 %rd15, %rd13, %rd14;
st.global.f32 [%rd15], %f44;
BB9_37:
ret;
}
// .globl reduce_row_max_d
// reduce_row_max_d: one thread block per row, f64 maximum of that row.
//   param_0: generic address of the f64 input, read at index
//            ctaid.x * param_3 + j for j < param_3
//   param_1: generic address of the f64 output; thread 0 writes element ctaid.x
//   param_2: row count (blocks with ctaid.x >= param_2 return immediately)
//   param_3: number of elements per row
// Threads accumulate private maxima over j = tid.x, tid.x + ntid.x, ..., then
// combine them through the shared array `memory`.
//
// Change relative to the compiler output: the accumulator starts at -inf
// (0dFFF0000000000000) instead of -DBL_MAX (0dFFEFFFFFFFFFFFFF), so a row
// whose elements are all -inf reduces to -inf rather than -DBL_MAX. A thread
// that reads no element now contributes -inf.
// NOTE(review): this file is generated; apply the same identity in the CUDA
// source before regenerating.
.visible .entry reduce_row_max_d(
.param .u64 reduce_row_max_d_param_0,
.param .u64 reduce_row_max_d_param_1,
.param .u32 reduce_row_max_d_param_2,
.param .u32 reduce_row_max_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_max_d_param_0];
ld.param.u64 %rd2, [reduce_row_max_d_param_1];
ld.param.u32 %r5, [reduce_row_max_d_param_2];
ld.param.u32 %r4, [reduce_row_max_d_param_3];
// %r6 = row handled by this block; the whole block exits if it is out of range,
// so the barriers below are never reached by only part of a block.
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB10_35;
// %r71 = column cursor j, %fd6 = private running maximum (-inf identity).
mov.u32 %r71, %tid.x;
mov.f64 %fd6, 0dFFF0000000000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB10_4;
cvta.to.global.u64 %rd3, %rd1;
// Loop: fold in[row * param_3 + j]; j += ntid.x; repeat while j < param_3.
BB10_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
max.f64 %fd6, %fd6, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB10_3;
// Publish the private maximum in this thread's slot (%r13 = memory + tid.x * 8)
// and wait for the whole block.
BB10_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
// Halving steps with a barrier after each one (%r14 = ntid.x). A step runs
// only when ntid.x is at least the stated size; byte offsets are distance * 8.
// ntid.x >= 1024: threads 0..511 fold slot tid.x + 512.
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB10_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB10_7;
ld.shared.f64 %fd29, [%r13+4096];
max.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB10_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 fold slot tid.x + 256.
BB10_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB10_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB10_11;
ld.shared.f64 %fd30, [%r13+2048];
max.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB10_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 fold slot tid.x + 128.
BB10_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB10_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB10_15;
ld.shared.f64 %fd31, [%r13+1024];
max.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB10_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 fold slot tid.x + 64.
BB10_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB10_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB10_19;
ld.shared.f64 %fd32, [%r13+512];
max.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB10_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB10_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB10_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB10_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
max.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB10_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB10_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
max.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB10_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB10_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
max.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB10_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB10_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
max.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB10_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB10_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
max.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB10_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB10_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
max.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
// Thread 0 stores slot 0 of `memory` to out[row].
BB10_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB10_35;
ld.shared.f64 %fd40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd40;
BB10_35:
ret;
}
// .globl reduce_row_max_f
// reduce_row_max_f: one thread block per row, f32 maximum of that row.
//   param_0: generic address of the f32 input, read at index
//            ctaid.x * param_3 + j for j < param_3
//   param_1: generic address of the f32 output; thread 0 writes element ctaid.x
//   param_2: row count (blocks with ctaid.x >= param_2 return immediately)
//   param_3: number of elements per row
// Threads accumulate private maxima over j = tid.x, tid.x + ntid.x, ..., then
// combine them through the shared array `memory`.
//
// Change relative to the compiler output: the accumulator starts at -inf
// (0fFF800000) instead of -FLT_MAX (0fFF7FFFFF), so a row whose elements are
// all -inf reduces to -inf rather than -FLT_MAX. A thread that reads no
// element now contributes -inf.
// NOTE(review): this file is generated; apply the same identity in the CUDA
// source before regenerating.
.visible .entry reduce_row_max_f(
.param .u64 reduce_row_max_f_param_0,
.param .u64 reduce_row_max_f_param_1,
.param .u32 reduce_row_max_f_param_2,
.param .u32 reduce_row_max_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<56>;
.reg .b32 %r<72>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_max_f_param_0];
ld.param.u64 %rd2, [reduce_row_max_f_param_1];
ld.param.u32 %r5, [reduce_row_max_f_param_2];
ld.param.u32 %r4, [reduce_row_max_f_param_3];
// %r6 = row handled by this block; the whole block exits if it is out of range,
// so the barriers below are never reached by only part of a block.
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB11_35;
// %r71 = column cursor j, %f6 = private running maximum (-inf identity).
mov.u32 %r71, %tid.x;
mov.f32 %f6, 0fFF800000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB11_4;
cvta.to.global.u64 %rd3, %rd1;
// Loop: fold in[row * param_3 + j]; j += ntid.x; repeat while j < param_3.
BB11_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
max.f32 %f6, %f6, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB11_3;
// Publish the private maximum in this thread's slot (%r13 = memory + tid.x * 4)
// and wait for the whole block.
BB11_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 2;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f32 [%r13], %f6;
bar.sync 0;
// Halving steps with a barrier after each one (%r14 = ntid.x). A step runs
// only when ntid.x is at least the stated size; byte offsets are distance * 4.
// ntid.x >= 1024: threads 0..511 fold slot tid.x + 512.
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB11_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB11_7;
ld.shared.f32 %f29, [%r13+2048];
max.f32 %f6, %f6, %f29;
st.shared.f32 [%r13], %f6;
BB11_7:
bar.sync 0;
// ntid.x >= 512: threads 0..255 fold slot tid.x + 256.
BB11_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB11_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB11_11;
ld.shared.f32 %f30, [%r13+1024];
max.f32 %f6, %f6, %f30;
st.shared.f32 [%r13], %f6;
BB11_11:
bar.sync 0;
// ntid.x >= 256: threads 0..127 fold slot tid.x + 128.
BB11_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB11_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB11_15;
ld.shared.f32 %f31, [%r13+512];
max.f32 %f6, %f6, %f31;
st.shared.f32 [%r13], %f6;
BB11_15:
bar.sync 0;
// ntid.x >= 128: threads 0..63 fold slot tid.x + 64.
BB11_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB11_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB11_19;
ld.shared.f32 %f32, [%r13+256];
max.f32 %f6, %f6, %f32;
st.shared.f32 [%r13], %f6;
BB11_19:
bar.sync 0;
// Tail: only threads 0..31 continue. The remaining steps (slot distances
// 32, 16, 8, 4, 2, 1) use volatile shared accesses and have no barrier between
// them, and the only guard on the slot read is the ntid.x test.
// NOTE(review): this appears to rely on these threads running in lockstep and
// on `memory` being large enough for slot tid.x + distance — confirm for the
// launch configurations and architectures in use.
BB11_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB11_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB11_23;
ld.volatile.shared.f32 %f33, [%r13+128];
max.f32 %f6, %f6, %f33;
st.volatile.shared.f32 [%r13], %f6;
BB11_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB11_25;
ld.volatile.shared.f32 %f34, [%r13+64];
max.f32 %f6, %f6, %f34;
st.volatile.shared.f32 [%r13], %f6;
BB11_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB11_27;
ld.volatile.shared.f32 %f35, [%r13+32];
max.f32 %f6, %f6, %f35;
st.volatile.shared.f32 [%r13], %f6;
BB11_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB11_29;
ld.volatile.shared.f32 %f36, [%r13+16];
max.f32 %f6, %f6, %f36;
st.volatile.shared.f32 [%r13], %f6;
BB11_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB11_31;
ld.volatile.shared.f32 %f37, [%r13+8];
max.f32 %f6, %f6, %f37;
st.volatile.shared.f32 [%r13], %f6;
BB11_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB11_33;
ld.volatile.shared.f32 %f38, [%r13+4];
max.f32 %f39, %f6, %f38;
st.volatile.shared.f32 [%r13], %f39;
// Thread 0 stores slot 0 of `memory` to out[row].
BB11_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB11_35;
ld.shared.f32 %f40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f40;
BB11_35:
ret;
}
// .globl reduce_col_max_d
.visible .entry reduce_col_max_d(
.param .u64 reduce_col_max_d_param_0,
.param .u64 reduce_col_max_d_param_1,
.param .u32 reduce_col_max_d_param_2,
.param .u32 reduce_col_max_d_param_3
)
{
	// Strided maximum over f64 data, one result per thread.
	//   param_0: input pointer  (generic address, accessed through .global)
	//   param_1: output pointer (generic address, accessed through .global)
	//   param_2: multiplied with param_3 to form the exclusive index limit
	//   param_3: stride between visited elements and number of active threads
	// Presumably param_2/param_3 are the row/column counts of a row-major
	// matrix -- confirm against the host launch code.
	.reg .pred %p_active, %p_more;
	.reg .b32 %r_tid, %r_ctaid, %r_ntid, %r_gid, %r_stride, %r_nseg, %r_limit, %r_cur;
	.reg .f64 %fd_acc, %fd_val;
	.reg .b64 %rd_in, %rd_out, %rd_in_g, %rd_out_g, %rd_off, %rd_src, %rd_dst;

	// gid = ntid.x * ctaid.x + tid.x
	mov.u32 %r_ntid, %ntid.x;
	mov.u32 %r_ctaid, %ctaid.x;
	mov.u32 %r_tid, %tid.x;
	mad.lo.s32 %r_gid, %r_ntid, %r_ctaid, %r_tid;

	// Threads with gid >= param_3 have nothing to do.
	ld.param.u32 %r_stride, [reduce_col_max_d_param_3];
	setp.lt.u32 %p_active, %r_gid, %r_stride;
	@!%p_active bra COL_MAX_D_EXIT;

	ld.param.u64 %rd_in, [reduce_col_max_d_param_0];
	ld.param.u64 %rd_out, [reduce_col_max_d_param_1];
	ld.param.u32 %r_nseg, [reduce_col_max_d_param_2];
	cvta.to.global.u64 %rd_in_g, %rd_in;
	cvta.to.global.u64 %rd_out_g, %rd_out;

	// limit = param_3 * param_2 (32-bit product)
	mul.lo.s32 %r_limit, %r_stride, %r_nseg;

	// Accumulator starts at the most negative finite double (-DBL_MAX).
	mov.f64 %fd_acc, 0dFFEFFFFFFFFFFFFF;
	mov.u32 %r_cur, %r_gid;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@!%p_more bra COL_MAX_D_STORE;

COL_MAX_D_LOOP:
	// acc = max(acc, in[cur]); cur += stride
	mul.wide.u32 %rd_off, %r_cur, 8;
	add.s64 %rd_src, %rd_in_g, %rd_off;
	ld.global.f64 %fd_val, [%rd_src];
	max.f64 %fd_acc, %fd_acc, %fd_val;
	add.s32 %r_cur, %r_cur, %r_stride;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@%p_more bra COL_MAX_D_LOOP;

COL_MAX_D_STORE:
	// out[gid] = acc
	mul.wide.u32 %rd_off, %r_gid, 8;
	add.s64 %rd_dst, %rd_out_g, %rd_off;
	st.global.f64 [%rd_dst], %fd_acc;

COL_MAX_D_EXIT:
	ret;
}
// .globl reduce_col_max_f
.visible .entry reduce_col_max_f(
.param .u64 reduce_col_max_f_param_0,
.param .u64 reduce_col_max_f_param_1,
.param .u32 reduce_col_max_f_param_2,
.param .u32 reduce_col_max_f_param_3
)
{
	// Strided maximum over f32 data, one result per thread.
	//   param_0: input pointer  (generic address, accessed through .global)
	//   param_1: output pointer (generic address, accessed through .global)
	//   param_2: multiplied with param_3 to form the exclusive index limit
	//   param_3: stride between visited elements and number of active threads
	// Presumably param_2/param_3 are the row/column counts of a row-major
	// matrix -- confirm against the host launch code.
	.reg .pred %p_active, %p_more;
	.reg .f32 %f_acc, %f_val;
	.reg .b32 %r_tid, %r_ctaid, %r_ntid, %r_gid, %r_stride, %r_nseg, %r_limit, %r_cur;
	.reg .b64 %rd_in, %rd_out, %rd_in_g, %rd_out_g, %rd_off, %rd_src, %rd_dst;

	// gid = ntid.x * ctaid.x + tid.x
	mov.u32 %r_ntid, %ntid.x;
	mov.u32 %r_ctaid, %ctaid.x;
	mov.u32 %r_tid, %tid.x;
	mad.lo.s32 %r_gid, %r_ntid, %r_ctaid, %r_tid;

	// Threads with gid >= param_3 have nothing to do.
	ld.param.u32 %r_stride, [reduce_col_max_f_param_3];
	setp.lt.u32 %p_active, %r_gid, %r_stride;
	@!%p_active bra COL_MAX_F_EXIT;

	ld.param.u64 %rd_in, [reduce_col_max_f_param_0];
	ld.param.u64 %rd_out, [reduce_col_max_f_param_1];
	ld.param.u32 %r_nseg, [reduce_col_max_f_param_2];
	cvta.to.global.u64 %rd_in_g, %rd_in;
	cvta.to.global.u64 %rd_out_g, %rd_out;

	// limit = param_3 * param_2 (32-bit product)
	mul.lo.s32 %r_limit, %r_stride, %r_nseg;

	// Accumulator starts at the most negative finite float (-FLT_MAX).
	mov.f32 %f_acc, 0fFF7FFFFF;
	mov.u32 %r_cur, %r_gid;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@!%p_more bra COL_MAX_F_STORE;

COL_MAX_F_LOOP:
	// acc = max(acc, in[cur]); cur += stride
	mul.wide.u32 %rd_off, %r_cur, 4;
	add.s64 %rd_src, %rd_in_g, %rd_off;
	ld.global.f32 %f_val, [%rd_src];
	max.f32 %f_acc, %f_acc, %f_val;
	add.s32 %r_cur, %r_cur, %r_stride;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@%p_more bra COL_MAX_F_LOOP;

COL_MAX_F_STORE:
	// out[gid] = acc
	mul.wide.u32 %rd_off, %r_gid, 4;
	add.s64 %rd_dst, %rd_out_g, %rd_off;
	st.global.f32 [%rd_dst], %f_acc;

COL_MAX_F_EXIT:
	ret;
}
// .globl reduce_min_d
// ---------------------------------------------------------------------
// reduce_min_d: per-block minimum over an array of f64 values.
//   param_0 (u64): input pointer; elements are read as f64 via .global.
//   param_1 (u64): output pointer; thread 0 of each block stores one f64
//                  at index %ctaid.x.
//   param_2 (u32): element count; indices >= this value are never read.
// Scratch: the extern shared array `memory`, one 8-byte slot per thread
// (slot offset = %tid.x * 8), so the launch must supply shared memory.
// NOTE(review): the tree phase only combines slots at the fixed distances
// 512, 256, ..., 1 selected by %ntid.x thresholds, so it presumably expects
// a power-of-two block size -- confirm against the launch code.
// ---------------------------------------------------------------------
.visible .entry reduce_min_d(
.param .u64 reduce_min_d_param_0,
.param .u64 reduce_min_d_param_1,
.param .u32 reduce_min_d_param_2
)
{
// 16-byte local buffer holding the vprintf argument block (see below).
.local .align 16 .b8 __local_depot14[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .b32 %r<39>;
.reg .f64 %fd<61>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the depot, %SP = its generic address.
mov.u64 %SPL, __local_depot14;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_min_d_param_0];
ld.param.u64 %rd2, [reduce_min_d_param_1];
ld.param.u32 %r6, [reduce_min_d_param_2];
// %r38 = i = ctaid.x * 2 * ntid.x + tid.x  (first element of this thread)
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r38, %r8, %r9, %r10;
// Accumulator %fd45 starts at the largest finite double (DBL_MAX).
mov.f64 %fd45, 0d7FEFFFFFFFFFFFFF;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB14_4;
// Load loop: each iteration folds in[i] and, when in range, in[i + ntid.x],
// then advances i by 2 * ntid.x * nctaid.x.
BB14_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
min.f64 %fd45, %fd45, %fd30;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB14_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
min.f64 %fd45, %fd45, %fd31;
BB14_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB14_1;
BB14_4:
// Publish the per-thread partial to shared slot tid.x (%r5 = &memory[tid.x*8]).
shl.b32 %r16, %r10, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd45;
bar.sync 0;
// Tree phase with barriers. Each step runs only when ntid.x is at least the
// tested size; the lower half of the threads folds in the slot that lies
// half that size further up (byte offset = distance * 8).
// Step: ntid.x >= 1024, threads 0..511 fold slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB14_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB14_7;
ld.shared.f64 %fd32, [%r5+4096];
min.f64 %fd45, %fd45, %fd32;
st.shared.f64 [%r5], %fd45;
BB14_7:
bar.sync 0;
BB14_8:
// Step: ntid.x >= 512, threads 0..255 fold slot tid.x + 256.
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB14_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB14_11;
ld.shared.f64 %fd33, [%r5+2048];
min.f64 %fd45, %fd45, %fd33;
st.shared.f64 [%r5], %fd45;
BB14_11:
bar.sync 0;
BB14_12:
// Step: ntid.x >= 256, threads 0..127 fold slot tid.x + 128.
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB14_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB14_15;
ld.shared.f64 %fd34, [%r5+1024];
min.f64 %fd45, %fd45, %fd34;
st.shared.f64 [%r5], %fd45;
BB14_15:
bar.sync 0;
BB14_16:
// Step: ntid.x >= 128, threads 0..63 fold slot tid.x + 64.
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB14_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB14_19;
ld.shared.f64 %fd35, [%r5+512];
min.f64 %fd45, %fd45, %fd35;
st.shared.f64 [%r5], %fd45;
BB14_19:
bar.sync 0;
BB14_20:
// Final phase: only threads 0..31 continue. The steps below use volatile
// shared accesses and have no barrier between them.
// NOTE(review): correctness presumably relies on these 32 threads running
// in lockstep -- verify on targets with independent thread scheduling.
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB14_33;
// ntid.x >= 64: fold slot tid.x + 32.
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB14_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
min.f64 %fd45, %fd45, %fd36;
st.volatile.shared.f64 [%r5], %fd45;
BB14_23:
// ntid.x >= 32: fold slot tid.x + 16.
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB14_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
min.f64 %fd45, %fd45, %fd37;
st.volatile.shared.f64 [%r5], %fd45;
BB14_25:
// ntid.x >= 16: fold slot tid.x + 8.
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB14_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
min.f64 %fd45, %fd45, %fd38;
st.volatile.shared.f64 [%r5], %fd45;
BB14_27:
// ntid.x >= 8: fold slot tid.x + 4.
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB14_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
min.f64 %fd45, %fd45, %fd39;
st.volatile.shared.f64 [%r5], %fd45;
BB14_29:
// ntid.x >= 4: fold slot tid.x + 2.
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB14_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
min.f64 %fd45, %fd45, %fd40;
st.volatile.shared.f64 [%r5], %fd45;
BB14_31:
// ntid.x >= 2: fold slot tid.x + 1.
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB14_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
min.f64 %fd42, %fd45, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
BB14_33:
// Only thread 0 of the block reports and stores the result.
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB14_37;
// With at most 9 blocks in the grid, print the block result through
// vprintf. The format is the global $str, whose bytes spell
// "blockIdx.x=%d reduction result: %3.1f\n".
// NOTE(review): this looks like leftover debug output -- confirm it is wanted.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB14_36;
ld.shared.f64 %fd43, [memory];
// Argument block in the local depot: u32 ctaid.x at +0, f64 result at +8.
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd43;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// vprintf(generic address of $str, generic address of the argument block)
// Callseq Start 4
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 4
BB14_36:
// out[ctaid.x] = memory[0] (the block's reduced value)
ld.shared.f64 %fd44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 8;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd44;
BB14_37:
ret;
}
// .globl reduce_min_f
// ---------------------------------------------------------------------
// reduce_min_f: per-block minimum over an array of f32 values.
//   param_0 (u64): input pointer; elements are read as f32 via .global.
//   param_1 (u64): output pointer; thread 0 of each block stores one f32
//                  at index %ctaid.x.
//   param_2 (u32): element count; indices >= this value are never read.
// Scratch: the extern shared array `memory`, one 4-byte slot per thread
// (slot offset = %tid.x * 4), so the launch must supply shared memory.
// NOTE(review): the tree phase only combines slots at the fixed distances
// 512, 256, ..., 1 selected by %ntid.x thresholds, so it presumably expects
// a power-of-two block size -- confirm against the launch code.
// ---------------------------------------------------------------------
.visible .entry reduce_min_f(
.param .u64 reduce_min_f_param_0,
.param .u64 reduce_min_f_param_1,
.param .u32 reduce_min_f_param_2
)
{
// 16-byte local buffer holding the vprintf argument block (see below).
.local .align 16 .b8 __local_depot15[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .f32 %f<61>;
.reg .b32 %r<39>;
.reg .f64 %fd<2>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the depot, %SP = its generic address.
mov.u64 %SPL, __local_depot15;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_min_f_param_0];
ld.param.u64 %rd2, [reduce_min_f_param_1];
ld.param.u32 %r6, [reduce_min_f_param_2];
// %r38 = i = ctaid.x * 2 * ntid.x + tid.x  (first element of this thread)
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r38, %r8, %r9, %r10;
// Accumulator %f45 starts at the largest finite float (FLT_MAX).
mov.f32 %f45, 0f7F7FFFFF;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB15_4;
// Load loop: each iteration folds in[i] and, when in range, in[i + ntid.x],
// then advances i by 2 * ntid.x * nctaid.x.
BB15_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
min.f32 %f45, %f45, %f30;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB15_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
min.f32 %f45, %f45, %f31;
BB15_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB15_1;
BB15_4:
// Publish the per-thread partial to shared slot tid.x (%r5 = &memory[tid.x*4]).
shl.b32 %r16, %r10, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f45;
bar.sync 0;
// Tree phase with barriers. Each step runs only when ntid.x is at least the
// tested size; the lower half of the threads folds in the slot that lies
// half that size further up (byte offset = distance * 4).
// Step: ntid.x >= 1024, threads 0..511 fold slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB15_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB15_7;
ld.shared.f32 %f32, [%r5+2048];
min.f32 %f45, %f45, %f32;
st.shared.f32 [%r5], %f45;
BB15_7:
bar.sync 0;
BB15_8:
// Step: ntid.x >= 512, threads 0..255 fold slot tid.x + 256.
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB15_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB15_11;
ld.shared.f32 %f33, [%r5+1024];
min.f32 %f45, %f45, %f33;
st.shared.f32 [%r5], %f45;
BB15_11:
bar.sync 0;
BB15_12:
// Step: ntid.x >= 256, threads 0..127 fold slot tid.x + 128.
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB15_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB15_15;
ld.shared.f32 %f34, [%r5+512];
min.f32 %f45, %f45, %f34;
st.shared.f32 [%r5], %f45;
BB15_15:
bar.sync 0;
BB15_16:
// Step: ntid.x >= 128, threads 0..63 fold slot tid.x + 64.
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB15_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB15_19;
ld.shared.f32 %f35, [%r5+256];
min.f32 %f45, %f45, %f35;
st.shared.f32 [%r5], %f45;
BB15_19:
bar.sync 0;
BB15_20:
// Final phase: only threads 0..31 continue. The steps below use volatile
// shared accesses and have no barrier between them.
// NOTE(review): correctness presumably relies on these 32 threads running
// in lockstep -- verify on targets with independent thread scheduling.
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB15_33;
// ntid.x >= 64: fold slot tid.x + 32.
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB15_23;
ld.volatile.shared.f32 %f36, [%r5+128];
min.f32 %f45, %f45, %f36;
st.volatile.shared.f32 [%r5], %f45;
BB15_23:
// ntid.x >= 32: fold slot tid.x + 16.
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB15_25;
ld.volatile.shared.f32 %f37, [%r5+64];
min.f32 %f45, %f45, %f37;
st.volatile.shared.f32 [%r5], %f45;
BB15_25:
// ntid.x >= 16: fold slot tid.x + 8.
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB15_27;
ld.volatile.shared.f32 %f38, [%r5+32];
min.f32 %f45, %f45, %f38;
st.volatile.shared.f32 [%r5], %f45;
BB15_27:
// ntid.x >= 8: fold slot tid.x + 4.
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB15_29;
ld.volatile.shared.f32 %f39, [%r5+16];
min.f32 %f45, %f45, %f39;
st.volatile.shared.f32 [%r5], %f45;
BB15_29:
// ntid.x >= 4: fold slot tid.x + 2.
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB15_31;
ld.volatile.shared.f32 %f40, [%r5+8];
min.f32 %f45, %f45, %f40;
st.volatile.shared.f32 [%r5], %f45;
BB15_31:
// ntid.x >= 2: fold slot tid.x + 1.
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB15_33;
ld.volatile.shared.f32 %f41, [%r5+4];
min.f32 %f42, %f45, %f41;
st.volatile.shared.f32 [%r5], %f42;
BB15_33:
// Only thread 0 of the block reports and stores the result.
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB15_37;
// With at most 9 blocks in the grid, print the block result through
// vprintf. The format is the global $str, whose bytes spell
// "blockIdx.x=%d reduction result: %3.1f\n".
// NOTE(review): this looks like leftover debug output -- confirm it is wanted.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB15_36;
ld.shared.f32 %f43, [memory];
// The f32 result is widened to f64 before it is placed in the argument block.
cvt.f64.f32 %fd1, %f43;
// Argument block in the local depot: u32 ctaid.x at +0, f64 result at +8.
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd1;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// vprintf(generic address of $str, generic address of the argument block)
// Callseq Start 5
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 5
BB15_36:
// out[ctaid.x] = memory[0] (the block's reduced value)
ld.shared.f32 %f44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 4;
add.s64 %rd15, %rd13, %rd14;
st.global.f32 [%rd15], %f44;
BB15_37:
ret;
}
// .globl reduce_row_min_d
// ---------------------------------------------------------------------
// reduce_row_min_d: one block reduces one contiguous run of f64 values
// to its minimum.
//   param_0 (u64): input pointer; elements are read as f64 via .global.
//   param_1 (u64): output pointer; thread 0 stores one f64 at index %ctaid.x.
//   param_2 (u32): number of runs; blocks with ctaid.x >= param_2 exit.
//   param_3 (u32): run length; block b reads indices b*param_3 .. b*param_3
//                  + param_3 - 1.
// Presumably param_2/param_3 are the row/column counts of a row-major
// matrix -- confirm against the host launch code.
// Scratch: the extern shared array `memory`, one 8-byte slot per thread.
// NOTE(review): the tree phase only combines slots at power-of-two
// distances, so it presumably expects a power-of-two block size.
// ---------------------------------------------------------------------
.visible .entry reduce_row_min_d(
.param .u64 reduce_row_min_d_param_0,
.param .u64 reduce_row_min_d_param_1,
.param .u32 reduce_row_min_d_param_2,
.param .u32 reduce_row_min_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_min_d_param_0];
ld.param.u64 %rd2, [reduce_row_min_d_param_1];
ld.param.u32 %r5, [reduce_row_min_d_param_2];
ld.param.u32 %r4, [reduce_row_min_d_param_3];
// The whole block exits when ctaid.x >= param_2 (before any barrier).
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB16_35;
// %r71 = position within the run, starting at tid.x.
mov.u32 %r71, %tid.x;
// Accumulator %fd6 starts at the largest finite double (DBL_MAX).
mov.f64 %fd6, 0d7FEFFFFFFFFFFFFF;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB16_4;
cvta.to.global.u64 %rd3, %rd1;
// Load loop: fold in[ctaid.x * param_3 + pos], then pos += ntid.x,
// while pos < param_3.
BB16_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
min.f64 %fd6, %fd6, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB16_3;
BB16_4:
// Publish the per-thread partial to shared slot tid.x (%r13 = &memory[tid.x*8]).
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
// Tree phase with barriers. Each step runs only when ntid.x (%r14) is at
// least the tested size; the lower half of the threads folds in the slot
// that lies half that size further up (byte offset = distance * 8).
mov.u32 %r14, %ntid.x;
// Step: ntid.x >= 1024, threads 0..511 fold slot tid.x + 512.
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB16_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB16_7;
ld.shared.f64 %fd29, [%r13+4096];
min.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB16_7:
bar.sync 0;
BB16_8:
// Step: ntid.x >= 512, threads 0..255 fold slot tid.x + 256.
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB16_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB16_11;
ld.shared.f64 %fd30, [%r13+2048];
min.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB16_11:
bar.sync 0;
BB16_12:
// Step: ntid.x >= 256, threads 0..127 fold slot tid.x + 128.
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB16_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB16_15;
ld.shared.f64 %fd31, [%r13+1024];
min.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB16_15:
bar.sync 0;
BB16_16:
// Step: ntid.x >= 128, threads 0..63 fold slot tid.x + 64.
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB16_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB16_19;
ld.shared.f64 %fd32, [%r13+512];
min.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB16_19:
bar.sync 0;
BB16_20:
// Final phase: only threads 0..31 continue. The steps below use volatile
// shared accesses and have no barrier between them.
// NOTE(review): correctness presumably relies on these 32 threads running
// in lockstep -- verify on targets with independent thread scheduling.
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB16_33;
// ntid.x >= 64: fold slot tid.x + 32.
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB16_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
min.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB16_23:
// ntid.x >= 32: fold slot tid.x + 16.
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB16_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
min.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB16_25:
// ntid.x >= 16: fold slot tid.x + 8.
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB16_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
min.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB16_27:
// ntid.x >= 8: fold slot tid.x + 4.
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB16_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
min.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB16_29:
// ntid.x >= 4: fold slot tid.x + 2.
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB16_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
min.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB16_31:
// ntid.x >= 2: fold slot tid.x + 1.
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB16_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
min.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
BB16_33:
// Thread 0 stores the block result: out[ctaid.x] = memory[0].
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB16_35;
ld.shared.f64 %fd40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd40;
BB16_35:
ret;
}
// .globl reduce_row_min_f
// ---------------------------------------------------------------------
// reduce_row_min_f: one block reduces one contiguous run of f32 values
// to its minimum.
//   param_0 (u64): input pointer; elements are read as f32 via .global.
//   param_1 (u64): output pointer; thread 0 stores one f32 at index %ctaid.x.
//   param_2 (u32): number of runs; blocks with ctaid.x >= param_2 exit.
//   param_3 (u32): run length; block b reads indices b*param_3 .. b*param_3
//                  + param_3 - 1.
// Presumably param_2/param_3 are the row/column counts of a row-major
// matrix -- confirm against the host launch code.
// Scratch: the extern shared array `memory`, one 4-byte slot per thread.
// NOTE(review): the tree phase only combines slots at power-of-two
// distances, so it presumably expects a power-of-two block size.
// ---------------------------------------------------------------------
.visible .entry reduce_row_min_f(
.param .u64 reduce_row_min_f_param_0,
.param .u64 reduce_row_min_f_param_1,
.param .u32 reduce_row_min_f_param_2,
.param .u32 reduce_row_min_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<56>;
.reg .b32 %r<72>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_min_f_param_0];
ld.param.u64 %rd2, [reduce_row_min_f_param_1];
ld.param.u32 %r5, [reduce_row_min_f_param_2];
ld.param.u32 %r4, [reduce_row_min_f_param_3];
// The whole block exits when ctaid.x >= param_2 (before any barrier).
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB17_35;
// %r71 = position within the run, starting at tid.x.
mov.u32 %r71, %tid.x;
// Accumulator %f6 starts at the largest finite float (FLT_MAX).
mov.f32 %f6, 0f7F7FFFFF;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB17_4;
cvta.to.global.u64 %rd3, %rd1;
// Load loop: fold in[ctaid.x * param_3 + pos], then pos += ntid.x,
// while pos < param_3.
BB17_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
min.f32 %f6, %f6, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB17_3;
BB17_4:
// Publish the per-thread partial to shared slot tid.x (%r13 = &memory[tid.x*4]).
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 2;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f32 [%r13], %f6;
bar.sync 0;
// Tree phase with barriers. Each step runs only when ntid.x (%r14) is at
// least the tested size; the lower half of the threads folds in the slot
// that lies half that size further up (byte offset = distance * 4).
mov.u32 %r14, %ntid.x;
// Step: ntid.x >= 1024, threads 0..511 fold slot tid.x + 512.
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB17_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB17_7;
ld.shared.f32 %f29, [%r13+2048];
min.f32 %f6, %f6, %f29;
st.shared.f32 [%r13], %f6;
BB17_7:
bar.sync 0;
BB17_8:
// Step: ntid.x >= 512, threads 0..255 fold slot tid.x + 256.
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB17_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB17_11;
ld.shared.f32 %f30, [%r13+1024];
min.f32 %f6, %f6, %f30;
st.shared.f32 [%r13], %f6;
BB17_11:
bar.sync 0;
BB17_12:
// Step: ntid.x >= 256, threads 0..127 fold slot tid.x + 128.
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB17_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB17_15;
ld.shared.f32 %f31, [%r13+512];
min.f32 %f6, %f6, %f31;
st.shared.f32 [%r13], %f6;
BB17_15:
bar.sync 0;
BB17_16:
// Step: ntid.x >= 128, threads 0..63 fold slot tid.x + 64.
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB17_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB17_19;
ld.shared.f32 %f32, [%r13+256];
min.f32 %f6, %f6, %f32;
st.shared.f32 [%r13], %f6;
BB17_19:
bar.sync 0;
BB17_20:
// Final phase: only threads 0..31 continue. The steps below use volatile
// shared accesses and have no barrier between them.
// NOTE(review): correctness presumably relies on these 32 threads running
// in lockstep -- verify on targets with independent thread scheduling.
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB17_33;
// ntid.x >= 64: fold slot tid.x + 32.
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB17_23;
ld.volatile.shared.f32 %f33, [%r13+128];
min.f32 %f6, %f6, %f33;
st.volatile.shared.f32 [%r13], %f6;
BB17_23:
// ntid.x >= 32: fold slot tid.x + 16.
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB17_25;
ld.volatile.shared.f32 %f34, [%r13+64];
min.f32 %f6, %f6, %f34;
st.volatile.shared.f32 [%r13], %f6;
BB17_25:
// ntid.x >= 16: fold slot tid.x + 8.
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB17_27;
ld.volatile.shared.f32 %f35, [%r13+32];
min.f32 %f6, %f6, %f35;
st.volatile.shared.f32 [%r13], %f6;
BB17_27:
// ntid.x >= 8: fold slot tid.x + 4.
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB17_29;
ld.volatile.shared.f32 %f36, [%r13+16];
min.f32 %f6, %f6, %f36;
st.volatile.shared.f32 [%r13], %f6;
BB17_29:
// ntid.x >= 4: fold slot tid.x + 2.
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB17_31;
ld.volatile.shared.f32 %f37, [%r13+8];
min.f32 %f6, %f6, %f37;
st.volatile.shared.f32 [%r13], %f6;
BB17_31:
// ntid.x >= 2: fold slot tid.x + 1.
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB17_33;
ld.volatile.shared.f32 %f38, [%r13+4];
min.f32 %f39, %f6, %f38;
st.volatile.shared.f32 [%r13], %f39;
BB17_33:
// Thread 0 stores the block result: out[ctaid.x] = memory[0].
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB17_35;
ld.shared.f32 %f40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f40;
BB17_35:
ret;
}
// .globl reduce_col_min_d
.visible .entry reduce_col_min_d(
.param .u64 reduce_col_min_d_param_0,
.param .u64 reduce_col_min_d_param_1,
.param .u32 reduce_col_min_d_param_2,
.param .u32 reduce_col_min_d_param_3
)
{
	// Strided minimum over f64 data, one result per thread.
	//   param_0: input pointer  (generic address, accessed through .global)
	//   param_1: output pointer (generic address, accessed through .global)
	//   param_2: multiplied with param_3 to form the exclusive index limit
	//   param_3: stride between visited elements and number of active threads
	// Presumably param_2/param_3 are the row/column counts of a row-major
	// matrix -- confirm against the host launch code.
	.reg .pred %p_active, %p_more;
	.reg .b32 %r_tid, %r_ctaid, %r_ntid, %r_gid, %r_stride, %r_nseg, %r_limit, %r_cur;
	.reg .f64 %fd_acc, %fd_val;
	.reg .b64 %rd_in, %rd_out, %rd_in_g, %rd_out_g, %rd_off, %rd_src, %rd_dst;

	// gid = ntid.x * ctaid.x + tid.x
	mov.u32 %r_ntid, %ntid.x;
	mov.u32 %r_ctaid, %ctaid.x;
	mov.u32 %r_tid, %tid.x;
	mad.lo.s32 %r_gid, %r_ntid, %r_ctaid, %r_tid;

	// Threads with gid >= param_3 have nothing to do.
	ld.param.u32 %r_stride, [reduce_col_min_d_param_3];
	setp.lt.u32 %p_active, %r_gid, %r_stride;
	@!%p_active bra COL_MIN_D_EXIT;

	ld.param.u64 %rd_in, [reduce_col_min_d_param_0];
	ld.param.u64 %rd_out, [reduce_col_min_d_param_1];
	ld.param.u32 %r_nseg, [reduce_col_min_d_param_2];
	cvta.to.global.u64 %rd_in_g, %rd_in;
	cvta.to.global.u64 %rd_out_g, %rd_out;

	// limit = param_3 * param_2 (32-bit product)
	mul.lo.s32 %r_limit, %r_stride, %r_nseg;

	// Accumulator starts at the largest finite double (DBL_MAX).
	mov.f64 %fd_acc, 0d7FEFFFFFFFFFFFFF;
	mov.u32 %r_cur, %r_gid;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@!%p_more bra COL_MIN_D_STORE;

COL_MIN_D_LOOP:
	// acc = min(acc, in[cur]); cur += stride
	mul.wide.u32 %rd_off, %r_cur, 8;
	add.s64 %rd_src, %rd_in_g, %rd_off;
	ld.global.f64 %fd_val, [%rd_src];
	min.f64 %fd_acc, %fd_acc, %fd_val;
	add.s32 %r_cur, %r_cur, %r_stride;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@%p_more bra COL_MIN_D_LOOP;

COL_MIN_D_STORE:
	// out[gid] = acc
	mul.wide.u32 %rd_off, %r_gid, 8;
	add.s64 %rd_dst, %rd_out_g, %rd_off;
	st.global.f64 [%rd_dst], %fd_acc;

COL_MIN_D_EXIT:
	ret;
}
// .globl reduce_col_min_f
.visible .entry reduce_col_min_f(
.param .u64 reduce_col_min_f_param_0,
.param .u64 reduce_col_min_f_param_1,
.param .u32 reduce_col_min_f_param_2,
.param .u32 reduce_col_min_f_param_3
)
{
	// Strided minimum over f32 data, one result per thread.
	//   param_0: input pointer  (generic address, accessed through .global)
	//   param_1: output pointer (generic address, accessed through .global)
	//   param_2: multiplied with param_3 to form the exclusive index limit
	//   param_3: stride between visited elements and number of active threads
	// Presumably param_2/param_3 are the row/column counts of a row-major
	// matrix -- confirm against the host launch code.
	.reg .pred %p_active, %p_more;
	.reg .f32 %f_acc, %f_val;
	.reg .b32 %r_tid, %r_ctaid, %r_ntid, %r_gid, %r_stride, %r_nseg, %r_limit, %r_cur;
	.reg .b64 %rd_in, %rd_out, %rd_in_g, %rd_out_g, %rd_off, %rd_src, %rd_dst;

	// gid = ntid.x * ctaid.x + tid.x
	mov.u32 %r_ntid, %ntid.x;
	mov.u32 %r_ctaid, %ctaid.x;
	mov.u32 %r_tid, %tid.x;
	mad.lo.s32 %r_gid, %r_ntid, %r_ctaid, %r_tid;

	// Threads with gid >= param_3 have nothing to do.
	ld.param.u32 %r_stride, [reduce_col_min_f_param_3];
	setp.lt.u32 %p_active, %r_gid, %r_stride;
	@!%p_active bra COL_MIN_F_EXIT;

	ld.param.u64 %rd_in, [reduce_col_min_f_param_0];
	ld.param.u64 %rd_out, [reduce_col_min_f_param_1];
	ld.param.u32 %r_nseg, [reduce_col_min_f_param_2];
	cvta.to.global.u64 %rd_in_g, %rd_in;
	cvta.to.global.u64 %rd_out_g, %rd_out;

	// limit = param_3 * param_2 (32-bit product)
	mul.lo.s32 %r_limit, %r_stride, %r_nseg;

	// Accumulator starts at the largest finite float (FLT_MAX).
	mov.f32 %f_acc, 0f7F7FFFFF;
	mov.u32 %r_cur, %r_gid;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@!%p_more bra COL_MIN_F_STORE;

COL_MIN_F_LOOP:
	// acc = min(acc, in[cur]); cur += stride
	mul.wide.u32 %rd_off, %r_cur, 4;
	add.s64 %rd_src, %rd_in_g, %rd_off;
	ld.global.f32 %f_val, [%rd_src];
	min.f32 %f_acc, %f_acc, %f_val;
	add.s32 %r_cur, %r_cur, %r_stride;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@%p_more bra COL_MIN_F_LOOP;

COL_MIN_F_STORE:
	// out[gid] = acc
	mul.wide.u32 %rd_off, %r_gid, 4;
	add.s64 %rd_dst, %rd_out_g, %rd_off;
	st.global.f32 [%rd_dst], %f_acc;

COL_MIN_F_EXIT:
	ret;
}
// .globl reduce_sum_sq_d
// ---------------------------------------------------------------------
// reduce_sum_sq_d: per-block sum of squares over an array of f64 values.
//   param_0 (u64): input pointer; elements are read as f64 via .global.
//   param_1 (u64): output pointer; thread 0 of each block stores one f64
//                  at index %ctaid.x.
//   param_2 (u32): element count; indices >= this value are never read.
// Scratch: the extern shared array `memory`, one 8-byte slot per thread
// (slot offset = %tid.x * 8), so the launch must supply shared memory.
//
// FIX: input values are squared exactly once, in the global-load loop.
// The shared-memory tree now combines the per-thread partial sums with a
// plain add. It previously used fma(x, x, acc) there as well, which
// squared the partial sums again and produced wrong totals (e.g. two
// partials of 2.0 gave 2 + 2*2 = 6 instead of 4).
// NOTE(review): this file is compiler output; the originating CUDA source
// presumably needs the same correction -- confirm.
// NOTE(review): with more than one block the outputs are partial sums of
// squares and must be combined by a plain sum, not by re-running this
// kernel on them -- verify against the host code.
// NOTE(review): the tree only combines slots at power-of-two distances, so
// it presumably expects a power-of-two block size.
// ---------------------------------------------------------------------
.visible .entry reduce_sum_sq_d(
.param .u64 reduce_sum_sq_d_param_0,
.param .u64 reduce_sum_sq_d_param_1,
.param .u32 reduce_sum_sq_d_param_2
)
{
// 16-byte local buffer holding the vprintf argument block (see below).
.local .align 16 .b8 __local_depot20[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .b32 %r<39>;
.reg .f64 %fd<61>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the depot, %SP = its generic address.
mov.u64 %SPL, __local_depot20;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_sum_sq_d_param_0];
ld.param.u64 %rd2, [reduce_sum_sq_d_param_1];
ld.param.u32 %r6, [reduce_sum_sq_d_param_2];
// %r38 = i = ctaid.x * 2 * ntid.x + tid.x  (first element of this thread)
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r38, %r8, %r9, %r10;
// Accumulator %fd45 starts at 0.0.
mov.f64 %fd45, 0d0000000000000000;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB20_4;
// Load loop: acc += in[i]^2 and, when in range, acc += in[i + ntid.x]^2,
// then i advances by 2 * ntid.x * nctaid.x. This is the only place where
// values are squared.
BB20_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
fma.rn.f64 %fd45, %fd30, %fd30, %fd45;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB20_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
fma.rn.f64 %fd45, %fd31, %fd31, %fd45;
BB20_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB20_1;
BB20_4:
// Publish the per-thread partial to shared slot tid.x (%r5 = &memory[tid.x*8]).
shl.b32 %r16, %r10, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd45;
bar.sync 0;
// Tree phase with barriers: partial sums are ADDED (not squared again).
// Step: ntid.x >= 1024, threads 0..511 add slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB20_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB20_7;
ld.shared.f64 %fd32, [%r5+4096];
add.f64 %fd45, %fd45, %fd32;
st.shared.f64 [%r5], %fd45;
BB20_7:
bar.sync 0;
BB20_8:
// Step: ntid.x >= 512, threads 0..255 add slot tid.x + 256.
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB20_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB20_11;
ld.shared.f64 %fd33, [%r5+2048];
add.f64 %fd45, %fd45, %fd33;
st.shared.f64 [%r5], %fd45;
BB20_11:
bar.sync 0;
BB20_12:
// Step: ntid.x >= 256, threads 0..127 add slot tid.x + 128.
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB20_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB20_15;
ld.shared.f64 %fd34, [%r5+1024];
add.f64 %fd45, %fd45, %fd34;
st.shared.f64 [%r5], %fd45;
BB20_15:
bar.sync 0;
BB20_16:
// Step: ntid.x >= 128, threads 0..63 add slot tid.x + 64.
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB20_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB20_19;
ld.shared.f64 %fd35, [%r5+512];
add.f64 %fd45, %fd45, %fd35;
st.shared.f64 [%r5], %fd45;
BB20_19:
bar.sync 0;
BB20_20:
// Final phase: only threads 0..31 continue. The steps below use volatile
// shared accesses and have no barrier between them.
// NOTE(review): correctness presumably relies on these 32 threads running
// in lockstep -- verify on targets with independent thread scheduling.
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB20_33;
// ntid.x >= 64: add slot tid.x + 32.
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB20_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
add.f64 %fd45, %fd45, %fd36;
st.volatile.shared.f64 [%r5], %fd45;
BB20_23:
// ntid.x >= 32: add slot tid.x + 16.
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB20_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
add.f64 %fd45, %fd45, %fd37;
st.volatile.shared.f64 [%r5], %fd45;
BB20_25:
// ntid.x >= 16: add slot tid.x + 8.
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB20_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
add.f64 %fd45, %fd45, %fd38;
st.volatile.shared.f64 [%r5], %fd45;
BB20_27:
// ntid.x >= 8: add slot tid.x + 4.
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB20_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
add.f64 %fd45, %fd45, %fd39;
st.volatile.shared.f64 [%r5], %fd45;
BB20_29:
// ntid.x >= 4: add slot tid.x + 2.
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB20_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
add.f64 %fd45, %fd45, %fd40;
st.volatile.shared.f64 [%r5], %fd45;
BB20_31:
// ntid.x >= 2: add slot tid.x + 1.
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB20_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
add.f64 %fd42, %fd45, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
BB20_33:
// Only thread 0 of the block reports and stores the result.
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB20_37;
// With at most 9 blocks in the grid, print the block result through
// vprintf. The format is the global $str, whose bytes spell
// "blockIdx.x=%d reduction result: %3.1f\n".
// NOTE(review): this looks like leftover debug output -- confirm it is wanted.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB20_36;
ld.shared.f64 %fd43, [memory];
// Argument block in the local depot: u32 ctaid.x at +0, f64 result at +8.
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd43;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// vprintf(generic address of $str, generic address of the argument block)
// Callseq Start 6
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 6
BB20_36:
// out[ctaid.x] = memory[0] (the block's sum of squares)
ld.shared.f64 %fd44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 8;
add.s64 %rd15, %rd13, %rd14;
st.global.f64 [%rd15], %fd44;
BB20_37:
ret;
}
// .globl reduce_sum_sq_f
// ---------------------------------------------------------------------
// reduce_sum_sq_f: per-block sum of squares over an array of f32 values.
//   param_0 (u64): input pointer; elements are read as f32 via .global.
//   param_1 (u64): output pointer; thread 0 of each block stores one f32
//                  at index %ctaid.x.
//   param_2 (u32): element count; indices >= this value are never read.
// Scratch: the extern shared array `memory`, one 4-byte slot per thread
// (slot offset = %tid.x * 4), so the launch must supply shared memory.
//
// FIX: input values are squared exactly once, in the global-load loop.
// The shared-memory tree now combines the per-thread partial sums with a
// plain add. It previously used fma(x, x, acc) there as well, which
// squared the partial sums again and produced wrong totals (e.g. two
// partials of 2.0 gave 2 + 2*2 = 6 instead of 4).
// NOTE(review): this file is compiler output; the originating CUDA source
// presumably needs the same correction -- confirm.
// NOTE(review): with more than one block the outputs are partial sums of
// squares and must be combined by a plain sum, not by re-running this
// kernel on them -- verify against the host code.
// NOTE(review): the tree only combines slots at power-of-two distances, so
// it presumably expects a power-of-two block size.
// ---------------------------------------------------------------------
.visible .entry reduce_sum_sq_f(
.param .u64 reduce_sum_sq_f_param_0,
.param .u64 reduce_sum_sq_f_param_1,
.param .u32 reduce_sum_sq_f_param_2
)
{
// 16-byte local buffer holding the vprintf argument block (see below).
.local .align 16 .b8 __local_depot21[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<21>;
.reg .f32 %f<61>;
.reg .b32 %r<39>;
.reg .f64 %fd<2>;
.reg .b64 %rd<16>;
// %SPL = local-space address of the depot, %SP = its generic address.
mov.u64 %SPL, __local_depot21;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [reduce_sum_sq_f_param_0];
ld.param.u64 %rd2, [reduce_sum_sq_f_param_1];
ld.param.u32 %r6, [reduce_sum_sq_f_param_2];
// %r38 = i = ctaid.x * 2 * ntid.x + tid.x  (first element of this thread)
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r38, %r8, %r9, %r10;
// Accumulator %f45 starts at 0.0.
mov.f32 %f45, 0f00000000;
setp.ge.u32 %p1, %r38, %r6;
@%p1 bra BB21_4;
// Load loop: acc += in[i]^2 and, when in range, acc += in[i + ntid.x]^2,
// then i advances by 2 * ntid.x * nctaid.x. This is the only place where
// values are squared.
BB21_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r38, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
fma.rn.f32 %f45, %f30, %f30, %f45;
add.s32 %r3, %r38, %r9;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB21_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
fma.rn.f32 %f45, %f31, %f31, %f45;
BB21_3:
shl.b32 %r13, %r9, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r38, %r13, %r14, %r38;
setp.lt.u32 %p3, %r38, %r6;
@%p3 bra BB21_1;
BB21_4:
// Publish the per-thread partial to shared slot tid.x (%r5 = &memory[tid.x*4]).
shl.b32 %r16, %r10, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f45;
bar.sync 0;
// Tree phase with barriers: partial sums are ADDED (not squared again).
// Step: ntid.x >= 1024, threads 0..511 add slot tid.x + 512.
setp.lt.u32 %p4, %r9, 1024;
@%p4 bra BB21_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB21_7;
ld.shared.f32 %f32, [%r5+2048];
add.f32 %f45, %f45, %f32;
st.shared.f32 [%r5], %f45;
BB21_7:
bar.sync 0;
BB21_8:
// Step: ntid.x >= 512, threads 0..255 add slot tid.x + 256.
setp.lt.u32 %p6, %r9, 512;
@%p6 bra BB21_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB21_11;
ld.shared.f32 %f33, [%r5+1024];
add.f32 %f45, %f45, %f33;
st.shared.f32 [%r5], %f45;
BB21_11:
bar.sync 0;
BB21_12:
// Step: ntid.x >= 256, threads 0..127 add slot tid.x + 128.
setp.lt.u32 %p8, %r9, 256;
@%p8 bra BB21_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB21_15;
ld.shared.f32 %f34, [%r5+512];
add.f32 %f45, %f45, %f34;
st.shared.f32 [%r5], %f45;
BB21_15:
bar.sync 0;
BB21_16:
// Step: ntid.x >= 128, threads 0..63 add slot tid.x + 64.
setp.lt.u32 %p10, %r9, 128;
@%p10 bra BB21_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB21_19;
ld.shared.f32 %f35, [%r5+256];
add.f32 %f45, %f45, %f35;
st.shared.f32 [%r5], %f45;
BB21_19:
bar.sync 0;
BB21_20:
// Final phase: only threads 0..31 continue. The steps below use volatile
// shared accesses and have no barrier between them.
// NOTE(review): correctness presumably relies on these 32 threads running
// in lockstep -- verify on targets with independent thread scheduling.
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB21_33;
// ntid.x >= 64: add slot tid.x + 32.
setp.lt.u32 %p13, %r9, 64;
@%p13 bra BB21_23;
ld.volatile.shared.f32 %f36, [%r5+128];
add.f32 %f45, %f45, %f36;
st.volatile.shared.f32 [%r5], %f45;
BB21_23:
// ntid.x >= 32: add slot tid.x + 16.
setp.lt.u32 %p14, %r9, 32;
@%p14 bra BB21_25;
ld.volatile.shared.f32 %f37, [%r5+64];
add.f32 %f45, %f45, %f37;
st.volatile.shared.f32 [%r5], %f45;
BB21_25:
// ntid.x >= 16: add slot tid.x + 8.
setp.lt.u32 %p15, %r9, 16;
@%p15 bra BB21_27;
ld.volatile.shared.f32 %f38, [%r5+32];
add.f32 %f45, %f45, %f38;
st.volatile.shared.f32 [%r5], %f45;
BB21_27:
// ntid.x >= 8: add slot tid.x + 4.
setp.lt.u32 %p16, %r9, 8;
@%p16 bra BB21_29;
ld.volatile.shared.f32 %f39, [%r5+16];
add.f32 %f45, %f45, %f39;
st.volatile.shared.f32 [%r5], %f45;
BB21_29:
// ntid.x >= 4: add slot tid.x + 2.
setp.lt.u32 %p17, %r9, 4;
@%p17 bra BB21_31;
ld.volatile.shared.f32 %f40, [%r5+8];
add.f32 %f45, %f45, %f40;
st.volatile.shared.f32 [%r5], %f45;
BB21_31:
// ntid.x >= 2: add slot tid.x + 1.
setp.lt.u32 %p18, %r9, 2;
@%p18 bra BB21_33;
ld.volatile.shared.f32 %f41, [%r5+4];
add.f32 %f42, %f45, %f41;
st.volatile.shared.f32 [%r5], %f42;
BB21_33:
// Only thread 0 of the block reports and stores the result.
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB21_37;
// With at most 9 blocks in the grid, print the block result through
// vprintf. The format is the global $str, whose bytes spell
// "blockIdx.x=%d reduction result: %3.1f\n".
// NOTE(review): this looks like leftover debug output -- confirm it is wanted.
mov.u32 %r34, %nctaid.x;
setp.gt.u32 %p20, %r34, 9;
@%p20 bra BB21_36;
ld.shared.f32 %f43, [memory];
// The f32 result is widened to f64 before it is placed in the argument block.
cvt.f64.f32 %fd1, %f43;
// Argument block in the local depot: u32 ctaid.x at +0, f64 result at +8.
add.u64 %rd9, %SP, 0;
add.u64 %rd10, %SPL, 0;
st.local.u32 [%rd10], %r7;
st.local.f64 [%rd10+8], %fd1;
mov.u64 %rd11, $str;
cvta.global.u64 %rd12, %rd11;
// vprintf(generic address of $str, generic address of the argument block)
// Callseq Start 7
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd12;
.param .b64 param1;
st.param.b64 [param1+0], %rd9;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r36, [retval0+0];
//{
}// Callseq End 7
BB21_36:
// out[ctaid.x] = memory[0] (the block's sum of squares)
ld.shared.f32 %f44, [memory];
cvta.to.global.u64 %rd13, %rd2;
mul.wide.u32 %rd14, %r7, 4;
add.s64 %rd15, %rd13, %rd14;
st.global.f32 [%rd15], %f44;
BB21_37:
ret;
}
// .globl reduce_col_sum_sq_d
.visible .entry reduce_col_sum_sq_d(
.param .u64 reduce_col_sum_sq_d_param_0,
.param .u64 reduce_col_sum_sq_d_param_1,
.param .u32 reduce_col_sum_sq_d_param_2,
.param .u32 reduce_col_sum_sq_d_param_3
)
{
	// Strided sum of squares over f64 data, one result per thread.
	//   param_0: input pointer  (generic address, accessed through .global)
	//   param_1: output pointer (generic address, accessed through .global)
	//   param_2: multiplied with param_3 to form the exclusive index limit
	//   param_3: stride between visited elements and number of active threads
	// Presumably param_2/param_3 are the row/column counts of a row-major
	// matrix -- confirm against the host launch code.
	.reg .pred %p_active, %p_more;
	.reg .b32 %r_tid, %r_ctaid, %r_ntid, %r_gid, %r_stride, %r_nseg, %r_limit, %r_cur;
	.reg .f64 %fd_acc, %fd_val;
	.reg .b64 %rd_in, %rd_out, %rd_in_g, %rd_out_g, %rd_off, %rd_src, %rd_dst;

	// gid = ntid.x * ctaid.x + tid.x
	mov.u32 %r_ntid, %ntid.x;
	mov.u32 %r_ctaid, %ctaid.x;
	mov.u32 %r_tid, %tid.x;
	mad.lo.s32 %r_gid, %r_ntid, %r_ctaid, %r_tid;

	// Threads with gid >= param_3 have nothing to do.
	ld.param.u32 %r_stride, [reduce_col_sum_sq_d_param_3];
	setp.lt.u32 %p_active, %r_gid, %r_stride;
	@!%p_active bra COL_SUM_SQ_D_EXIT;

	ld.param.u64 %rd_in, [reduce_col_sum_sq_d_param_0];
	ld.param.u64 %rd_out, [reduce_col_sum_sq_d_param_1];
	ld.param.u32 %r_nseg, [reduce_col_sum_sq_d_param_2];
	cvta.to.global.u64 %rd_in_g, %rd_in;
	cvta.to.global.u64 %rd_out_g, %rd_out;

	// limit = param_3 * param_2 (32-bit product)
	mul.lo.s32 %r_limit, %r_stride, %r_nseg;

	// Accumulator starts at 0.0.
	mov.f64 %fd_acc, 0d0000000000000000;
	mov.u32 %r_cur, %r_gid;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@!%p_more bra COL_SUM_SQ_D_STORE;

COL_SUM_SQ_D_LOOP:
	// acc = in[cur] * in[cur] + acc (fused); cur += stride
	mul.wide.u32 %rd_off, %r_cur, 8;
	add.s64 %rd_src, %rd_in_g, %rd_off;
	ld.global.f64 %fd_val, [%rd_src];
	fma.rn.f64 %fd_acc, %fd_val, %fd_val, %fd_acc;
	add.s32 %r_cur, %r_cur, %r_stride;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@%p_more bra COL_SUM_SQ_D_LOOP;

COL_SUM_SQ_D_STORE:
	// out[gid] = acc
	mul.wide.u32 %rd_off, %r_gid, 8;
	add.s64 %rd_dst, %rd_out_g, %rd_off;
	st.global.f64 [%rd_dst], %fd_acc;

COL_SUM_SQ_D_EXIT:
	ret;
}
// .globl reduce_col_sum_sq_f
.visible .entry reduce_col_sum_sq_f(
.param .u64 reduce_col_sum_sq_f_param_0,
.param .u64 reduce_col_sum_sq_f_param_1,
.param .u32 reduce_col_sum_sq_f_param_2,
.param .u32 reduce_col_sum_sq_f_param_3
)
{
	// Strided sum of squares over f32 data, one result per thread.
	//   param_0: input pointer  (generic address, accessed through .global)
	//   param_1: output pointer (generic address, accessed through .global)
	//   param_2: multiplied with param_3 to form the exclusive index limit
	//   param_3: stride between visited elements and number of active threads
	// Presumably param_2/param_3 are the row/column counts of a row-major
	// matrix -- confirm against the host launch code.
	.reg .pred %p_active, %p_more;
	.reg .f32 %f_acc, %f_val;
	.reg .b32 %r_tid, %r_ctaid, %r_ntid, %r_gid, %r_stride, %r_nseg, %r_limit, %r_cur;
	.reg .b64 %rd_in, %rd_out, %rd_in_g, %rd_out_g, %rd_off, %rd_src, %rd_dst;

	// gid = ntid.x * ctaid.x + tid.x
	mov.u32 %r_ntid, %ntid.x;
	mov.u32 %r_ctaid, %ctaid.x;
	mov.u32 %r_tid, %tid.x;
	mad.lo.s32 %r_gid, %r_ntid, %r_ctaid, %r_tid;

	// Threads with gid >= param_3 have nothing to do.
	ld.param.u32 %r_stride, [reduce_col_sum_sq_f_param_3];
	setp.lt.u32 %p_active, %r_gid, %r_stride;
	@!%p_active bra COL_SUM_SQ_F_EXIT;

	ld.param.u64 %rd_in, [reduce_col_sum_sq_f_param_0];
	ld.param.u64 %rd_out, [reduce_col_sum_sq_f_param_1];
	ld.param.u32 %r_nseg, [reduce_col_sum_sq_f_param_2];
	cvta.to.global.u64 %rd_in_g, %rd_in;
	cvta.to.global.u64 %rd_out_g, %rd_out;

	// limit = param_3 * param_2 (32-bit product)
	mul.lo.s32 %r_limit, %r_stride, %r_nseg;

	// Accumulator starts at 0.0.
	mov.f32 %f_acc, 0f00000000;
	mov.u32 %r_cur, %r_gid;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@!%p_more bra COL_SUM_SQ_F_STORE;

COL_SUM_SQ_F_LOOP:
	// acc = in[cur] * in[cur] + acc (fused); cur += stride
	mul.wide.u32 %rd_off, %r_cur, 4;
	add.s64 %rd_src, %rd_in_g, %rd_off;
	ld.global.f32 %f_val, [%rd_src];
	fma.rn.f32 %f_acc, %f_val, %f_val, %f_acc;
	add.s32 %r_cur, %r_cur, %r_stride;
	setp.lt.u32 %p_more, %r_cur, %r_limit;
	@%p_more bra COL_SUM_SQ_F_LOOP;

COL_SUM_SQ_F_STORE:
	// out[gid] = acc
	mul.wide.u32 %rd_off, %r_gid, 4;
	add.s64 %rd_dst, %rd_out_g, %rd_off;
	st.global.f32 [%rd_dst], %f_acc;

COL_SUM_SQ_F_EXIT:
	ret;
}
// .globl reduce_row_sum_sq_d
// ---------------------------------------------------------------------
// reduce_row_sum_sq_d: one block reduces one contiguous run of f64 values
// to the sum of their squares.
//   param_0 (u64): input pointer; elements are read as f64 via .global.
//   param_1 (u64): output pointer; thread 0 stores one f64 at index %ctaid.x.
//   param_2 (u32): number of runs; blocks with ctaid.x >= param_2 exit.
//   param_3 (u32): run length; block b reads indices b*param_3 .. b*param_3
//                  + param_3 - 1.
// Presumably param_2/param_3 are the row/column counts of a row-major
// matrix -- confirm against the host launch code.
// Scratch: the extern shared array `memory`, one 8-byte slot per thread.
//
// FIX: input values are squared exactly once, in the global-load loop.
// The shared-memory tree now combines the per-thread partial sums with a
// plain add. It previously used fma(x, x, acc) there as well, which
// squared the partial sums again and produced wrong totals (e.g. two
// partials of 2.0 gave 2 + 2*2 = 6 instead of 4).
// NOTE(review): this file is compiler output; the originating CUDA source
// presumably needs the same correction -- confirm.
// NOTE(review): the tree only combines slots at power-of-two distances, so
// it presumably expects a power-of-two block size.
// ---------------------------------------------------------------------
.visible .entry reduce_row_sum_sq_d(
.param .u64 reduce_row_sum_sq_d_param_0,
.param .u64 reduce_row_sum_sq_d_param_1,
.param .u32 reduce_row_sum_sq_d_param_2,
.param .u32 reduce_row_sum_sq_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_sum_sq_d_param_0];
ld.param.u64 %rd2, [reduce_row_sum_sq_d_param_1];
ld.param.u32 %r5, [reduce_row_sum_sq_d_param_2];
ld.param.u32 %r4, [reduce_row_sum_sq_d_param_3];
// The whole block exits when ctaid.x >= param_2 (before any barrier).
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB24_35;
// %r71 = position within the run, starting at tid.x.
mov.u32 %r71, %tid.x;
// Accumulator %fd6 starts at 0.0.
mov.f64 %fd6, 0d0000000000000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB24_4;
cvta.to.global.u64 %rd3, %rd1;
// Load loop: acc += in[ctaid.x * param_3 + pos]^2, then pos += ntid.x,
// while pos < param_3. This is the only place where values are squared.
BB24_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
fma.rn.f64 %fd6, %fd28, %fd28, %fd6;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB24_3;
BB24_4:
// Publish the per-thread partial to shared slot tid.x (%r13 = &memory[tid.x*8]).
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
// Tree phase with barriers: partial sums are ADDED (not squared again).
mov.u32 %r14, %ntid.x;
// Step: ntid.x >= 1024, threads 0..511 add slot tid.x + 512.
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB24_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB24_7;
ld.shared.f64 %fd29, [%r13+4096];
add.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB24_7:
bar.sync 0;
BB24_8:
// Step: ntid.x >= 512, threads 0..255 add slot tid.x + 256.
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB24_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB24_11;
ld.shared.f64 %fd30, [%r13+2048];
add.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB24_11:
bar.sync 0;
BB24_12:
// Step: ntid.x >= 256, threads 0..127 add slot tid.x + 128.
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB24_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB24_15;
ld.shared.f64 %fd31, [%r13+1024];
add.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB24_15:
bar.sync 0;
BB24_16:
// Step: ntid.x >= 128, threads 0..63 add slot tid.x + 64.
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB24_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB24_19;
ld.shared.f64 %fd32, [%r13+512];
add.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB24_19:
bar.sync 0;
BB24_20:
// Final phase: only threads 0..31 continue. The steps below use volatile
// shared accesses and have no barrier between them.
// NOTE(review): correctness presumably relies on these 32 threads running
// in lockstep -- verify on targets with independent thread scheduling.
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB24_33;
// ntid.x >= 64: add slot tid.x + 32.
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB24_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
add.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB24_23:
// ntid.x >= 32: add slot tid.x + 16.
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB24_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
add.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB24_25:
// ntid.x >= 16: add slot tid.x + 8.
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB24_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
add.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB24_27:
// ntid.x >= 8: add slot tid.x + 4.
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB24_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
add.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB24_29:
// ntid.x >= 4: add slot tid.x + 2.
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB24_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
add.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB24_31:
// ntid.x >= 2: add slot tid.x + 1.
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB24_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
add.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
BB24_33:
// Thread 0 stores the block result: out[ctaid.x] = memory[0].
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB24_35;
ld.shared.f64 %fd40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd40;
BB24_35:
ret;
}
// .globl reduce_row_sum_sq_f
//
// reduce_row_sum_sq_f -- per-row sum of squares of an f32 matrix.
//
//   param_0 (u64): address of the input; element (row, col) is read as f32
//                  at index row * param_3 + col.
//   param_1 (u64): address of the output; one f32 is written per row at
//                  index row.
//   param_2 (u32): number of rows; blocks with ctaid.x >= param_2 return
//                  without touching memory.
//   param_3 (u32): number of elements per row.
//
// Block ctaid.x processes row ctaid.x.  Each thread first accumulates
// x*x over the columns tid.x, tid.x + ntid.x, ... of that row, stores its
// partial sum in the dynamic shared array `memory` (4 bytes per thread),
// and the partials are then combined pairwise down to memory[0], which
// thread 0 writes to the output.
//
// Fix relative to the generated code: the combine steps used
// fma(v, v, acc), squaring partial sums that were already sums of squares
// (row [2, 2] with two threads gave 4 + 4*4 = 20 instead of 8).  They now
// use a plain add; squaring happens only in the load loop.
// NOTE(review): this file is compiler output -- apply the same fix to the
// originating CUDA source before regenerating.
// NOTE(review): the halving scheme only folds every partial into slot 0
// when ntid.x is a power of two -- confirm the launch configuration.
// NOTE(review): the element index row * param_3 + col is computed in
// 32 bits and wraps for matrices with 2^32 or more elements.
.visible .entry reduce_row_sum_sq_f(
    .param .u64 reduce_row_sum_sq_f_param_0,
    .param .u64 reduce_row_sum_sq_f_param_1,
    .param .u32 reduce_row_sum_sq_f_param_2,
    .param .u32 reduce_row_sum_sq_f_param_3
)
{
    .reg .pred %p<20>;
    .reg .f32 %f<56>;
    .reg .b32 %r<72>;
    .reg .b64 %rd<9>;

    ld.param.u64 %rd1, [reduce_row_sum_sq_f_param_0];
    ld.param.u64 %rd2, [reduce_row_sum_sq_f_param_1];
    ld.param.u32 %r5, [reduce_row_sum_sq_f_param_2];
    ld.param.u32 %r4, [reduce_row_sum_sq_f_param_3];

    // %r6 = row handled by this block; surplus blocks exit.
    mov.u32 %r6, %ctaid.x;
    setp.ge.u32 %p1, %r6, %r5;
    @%p1 bra BB25_35;

    // %r71 = current column, %f6 = this thread's running sum of squares.
    mov.u32 %r71, %tid.x;
    mov.f32 %f6, 0f00000000;
    setp.ge.u32 %p2, %r71, %r4;
    @%p2 bra BB25_4;
    cvta.to.global.u64 %rd3, %rd1;

BB25_3:
    // Load in[row * cols + col] and accumulate its square; advance the
    // column by the block size until the end of the row.
    mad.lo.s32 %r8, %r6, %r4, %r71;
    mul.wide.u32 %rd4, %r8, 4;
    add.s64 %rd5, %rd3, %rd4;
    ld.global.f32 %f28, [%rd5];
    fma.rn.f32 %f6, %f28, %f28, %f6;
    mov.u32 %r9, %ntid.x;
    add.s32 %r71, %r9, %r71;
    setp.lt.u32 %p3, %r71, %r4;
    @%p3 bra BB25_3;

BB25_4:
    // Publish the partial sum at memory[tid.x] (%r13 = this thread's slot).
    mov.u32 %r10, %tid.x;
    shl.b32 %r11, %r10, 2;
    mov.u32 %r12, memory;
    add.s32 %r13, %r12, %r11;
    st.shared.f32 [%r13], %f6;
    bar.sync 0;
    mov.u32 %r14, %ntid.x;

    // ntid.x >= 1024: threads 0..511 add the partial 512 slots above.
    setp.lt.u32 %p4, %r14, 1024;
    @%p4 bra BB25_8;
    setp.gt.u32 %p5, %r10, 511;
    @%p5 bra BB25_7;
    ld.shared.f32 %f29, [%r13+2048];
    add.f32 %f6, %f6, %f29;
    st.shared.f32 [%r13], %f6;
BB25_7:
    bar.sync 0;

BB25_8:
    // ntid.x >= 512: threads 0..255 add the partial 256 slots above.
    setp.lt.u32 %p6, %r14, 512;
    @%p6 bra BB25_12;
    setp.gt.u32 %p7, %r10, 255;
    @%p7 bra BB25_11;
    ld.shared.f32 %f30, [%r13+1024];
    add.f32 %f6, %f6, %f30;
    st.shared.f32 [%r13], %f6;
BB25_11:
    bar.sync 0;

BB25_12:
    // ntid.x >= 256: threads 0..127 add the partial 128 slots above.
    setp.lt.u32 %p8, %r14, 256;
    @%p8 bra BB25_16;
    setp.gt.u32 %p9, %r10, 127;
    @%p9 bra BB25_15;
    ld.shared.f32 %f31, [%r13+512];
    add.f32 %f6, %f6, %f31;
    st.shared.f32 [%r13], %f6;
BB25_15:
    bar.sync 0;

BB25_16:
    // ntid.x >= 128: threads 0..63 add the partial 64 slots above.
    setp.lt.u32 %p10, %r14, 128;
    @%p10 bra BB25_20;
    setp.gt.u32 %p11, %r10, 63;
    @%p11 bra BB25_19;
    ld.shared.f32 %f32, [%r13+256];
    add.f32 %f6, %f6, %f32;
    st.shared.f32 [%r13], %f6;
BB25_19:
    bar.sync 0;

BB25_20:
    // Remaining steps are done by threads 0..31 only, through volatile
    // shared accesses and without barriers.
    // NOTE(review): this presumably relies on those 32 threads executing
    // in lockstep as one warp -- confirm for the deployment target.
    setp.gt.u32 %p12, %r10, 31;
    @%p12 bra BB25_33;

    // ntid.x >= 64: add the partial 32 slots above.
    setp.lt.u32 %p13, %r14, 64;
    @%p13 bra BB25_23;
    ld.volatile.shared.f32 %f33, [%r13+128];
    add.f32 %f6, %f6, %f33;
    st.volatile.shared.f32 [%r13], %f6;

BB25_23:
    // ntid.x >= 32: add the partial 16 slots above.
    setp.lt.u32 %p14, %r14, 32;
    @%p14 bra BB25_25;
    ld.volatile.shared.f32 %f34, [%r13+64];
    add.f32 %f6, %f6, %f34;
    st.volatile.shared.f32 [%r13], %f6;

BB25_25:
    // ntid.x >= 16: add the partial 8 slots above.
    setp.lt.u32 %p15, %r14, 16;
    @%p15 bra BB25_27;
    ld.volatile.shared.f32 %f35, [%r13+32];
    add.f32 %f6, %f6, %f35;
    st.volatile.shared.f32 [%r13], %f6;

BB25_27:
    // ntid.x >= 8: add the partial 4 slots above.
    setp.lt.u32 %p16, %r14, 8;
    @%p16 bra BB25_29;
    ld.volatile.shared.f32 %f36, [%r13+16];
    add.f32 %f6, %f6, %f36;
    st.volatile.shared.f32 [%r13], %f6;

BB25_29:
    // ntid.x >= 4: add the partial 2 slots above.
    setp.lt.u32 %p17, %r14, 4;
    @%p17 bra BB25_31;
    ld.volatile.shared.f32 %f37, [%r13+8];
    add.f32 %f6, %f6, %f37;
    st.volatile.shared.f32 [%r13], %f6;

BB25_31:
    // ntid.x >= 2: add the neighbouring partial.
    setp.lt.u32 %p18, %r14, 2;
    @%p18 bra BB25_33;
    ld.volatile.shared.f32 %f38, [%r13+4];
    add.f32 %f39, %f6, %f38;
    st.volatile.shared.f32 [%r13], %f39;

BB25_33:
    // Thread 0 writes the row total from memory[0] to out[row].
    setp.ne.s32 %p19, %r10, 0;
    @%p19 bra BB25_35;
    ld.shared.f32 %f40, [memory];
    cvta.to.global.u64 %rd6, %rd2;
    mul.wide.u32 %rd7, %r6, 4;
    add.s64 %rd8, %rd6, %rd7;
    st.global.f32 [%rd8], %f40;

BB25_35:
    ret;
}