| // |
| // Generated by NVIDIA NVVM Compiler |
| // |
| // Compiler Build ID: CL-27506705 |
| // Cuda compilation tools, release 10.2, V10.2.89 |
| // Based on LLVM 3.4svn |
| // |
| |
| .version 6.5 |
| .target sm_30 |
| .address_size 64 |
| |
| // .globl double2float_f |
| // Externally defined vprintf. At the call sites visible in this file, param_0 |
| // receives the generic address of $str and param_1 the generic address of a |
| // packed argument buffer held in local memory; the 32-bit return value is |
| // read back but never used. |
| .extern .func (.param .b32 func_retval0) vprintf |
| ( |
| .param .b64 vprintf_param_0, |
| .param .b64 vprintf_param_1 |
| ) |
| ; |
| // ASCII bytes of "blockIdx.x=%d reduction result: %3.1f\n" followed by a NUL |
| // terminator (39 bytes in total). |
| .global .align 1 .b8 $str[39] = {98, 108, 111, 99, 107, 73, 100, 120, 46, 120, 61, 37, 100, 32, 114, 101, 100, 117, 99, 116, 105, 111, 110, 32, 114, 101, 115, 117, 108, 116, 58, 32, 37, 51, 46, 49, 102, 10, 0}; |
| // Unsized extern shared-memory byte array; its size is not fixed anywhere in |
| // this file. The reduction kernels address it as one f64 or f32 slot per |
| // thread, at byte offset tid.x*8 or tid.x*4 respectively. |
| .extern .shared .align 1 .b8 memory[]; |
| |
| // double2float_f: narrow an f64 array to f32, one element per thread. |
| //   param_0: address of the f64 source array |
| //   param_1: address of the f32 destination array |
| //   param_2: element count, compared against the thread index as signed 32-bit |
| .visible .entry double2float_f( |
| .param .u64 double2float_f_param_0, |
| .param .u64 double2float_f_param_1, |
| .param .u32 double2float_f_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .f32 %narrow; |
| .reg .f64 %wide; |
| .reg .b32 %count, %blk, %bdim, %thr, %gid; |
| .reg .b64 %src, %dst, %src_off, %dst_off; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thr, %tid.x; |
| mov.u32 %bdim, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bdim, %blk, %thr; |
| |
| // Threads whose index is at or beyond the count have nothing to convert. |
| ld.param.u32 %count, [double2float_f_param_2]; |
| setp.ge.s32 %past_end, %gid, %count; |
| @%past_end bra D2F_DONE; |
| |
| // wide = src[gid]  (8-byte elements) |
| ld.param.u64 %src, [double2float_f_param_0]; |
| cvta.to.global.u64 %src, %src; |
| mul.wide.s32 %src_off, %gid, 8; |
| add.s64 %src, %src, %src_off; |
| ld.global.f64 %wide, [%src]; |
| |
| // dst[gid] = wide narrowed to f32 with the .rn rounding modifier (4-byte elements) |
| cvt.rn.f32.f64 %narrow, %wide; |
| ld.param.u64 %dst, [double2float_f_param_1]; |
| cvta.to.global.u64 %dst, %dst; |
| mul.wide.s32 %dst_off, %gid, 4; |
| add.s64 %dst, %dst, %dst_off; |
| st.global.f32 [%dst], %narrow; |
| |
| D2F_DONE: |
| ret; |
| } |
| |
| // .globl float2double_f |
| // float2double_f: widen an f32 array to f64, one element per thread. |
| //   param_0: address of the f32 source array |
| //   param_1: address of the f64 destination array |
| //   param_2: element count, compared against the thread index as signed 32-bit |
| .visible .entry float2double_f( |
| .param .u64 float2double_f_param_0, |
| .param .u64 float2double_f_param_1, |
| .param .u32 float2double_f_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .f32 %narrow; |
| .reg .f64 %wide; |
| .reg .b32 %count, %blk, %bdim, %thr, %gid; |
| .reg .b64 %src, %dst, %src_off, %dst_off; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thr, %tid.x; |
| mov.u32 %bdim, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bdim, %blk, %thr; |
| |
| // Threads whose index is at or beyond the count have nothing to convert. |
| ld.param.u32 %count, [float2double_f_param_2]; |
| setp.ge.s32 %past_end, %gid, %count; |
| @%past_end bra F2D_DONE; |
| |
| // narrow = src[gid]  (4-byte elements) |
| ld.param.u64 %src, [float2double_f_param_0]; |
| cvta.to.global.u64 %src, %src; |
| mul.wide.s32 %src_off, %gid, 4; |
| add.s64 %src, %src, %src_off; |
| ld.global.f32 %narrow, [%src]; |
| |
| // dst[gid] = narrow widened to f64  (8-byte elements) |
| cvt.f64.f32 %wide, %narrow; |
| ld.param.u64 %dst, [float2double_f_param_1]; |
| cvta.to.global.u64 %dst, %dst; |
| mul.wide.s32 %dst_off, %gid, 8; |
| add.s64 %dst, %dst, %dst_off; |
| st.global.f64 [%dst], %wide; |
| |
| F2D_DONE: |
| ret; |
| } |
| |
| // .globl reduce_sum_d |
| // reduce_sum_d: each thread block sums a strided subset of an f64 array and |
| // writes one partial result per block. |
| //   param_0: address of the f64 input array |
| //   param_1: address of the f64 output array; thread 0 of each block stores the |
| //            block's result at index ctaid.x |
| //   param_2: u32 element count n; indices >= n are never loaded |
| // Per-thread partial sums are staged in the extern shared array `memory`, |
| // one 8-byte slot per thread. |
| .visible .entry reduce_sum_d( |
| .param .u64 reduce_sum_d_param_0, |
| .param .u64 reduce_sum_d_param_1, |
| .param .u32 reduce_sum_d_param_2 |
| ) |
| { |
| // 16-byte local buffer that holds the packed vprintf arguments. |
| .local .align 16 .b8 __local_depot2[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<61>; |
| .reg .b64 %rd<16>; |
| |
| |
| mov.u64 %SPL, __local_depot2; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_sum_d_param_0]; |
| ld.param.u64 %rd2, [reduce_sum_d_param_1]; |
| ld.param.u32 %r6, [reduce_sum_d_param_2]; |
| // %r38 = i = (2*ctaid.x)*ntid.x + tid.x ; %fd45 = running sum, starts at 0.0 |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f64 %fd45, 0d0000000000000000; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB2_4; |
| |
| // Accumulation loop: add in[i], and in[i + ntid.x] when that index is < n. |
| BB2_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| add.f64 %fd45, %fd45, %fd30; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB2_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| add.f64 %fd45, %fd45, %fd31; |
| |
| // i += 2*ntid.x*nctaid.x ; repeat while i < n |
| BB2_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB2_1; |
| |
| // Publish this thread's partial sum at memory[tid.x*8], then barrier. |
| BB2_4: |
| shl.b32 %r16, %r10, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd45; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 add the slot 512 elements (4096 bytes) above. |
| // The branch depends only on ntid.x, so all threads of the block take the |
| // same path to the barrier. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB2_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB2_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| add.f64 %fd45, %fd45, %fd32; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB2_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 add the slot 256 elements (2048 bytes) above. |
| BB2_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB2_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB2_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| add.f64 %fd45, %fd45, %fd33; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB2_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 add the slot 128 elements (1024 bytes) above. |
| BB2_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB2_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB2_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| add.f64 %fd45, %fd45, %fd34; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB2_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 add the slot 64 elements (512 bytes) above. |
| BB2_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB2_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB2_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| add.f64 %fd45, %fd45, %fd35; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB2_19: |
| bar.sync 0; |
| |
| // Final phase, threads 0..31 only. Steps use volatile shared accesses and no |
| // barrier between them. |
| // NOTE(review): this appears to rely on these 32 threads executing in |
| // lockstep -- confirm that holds on every target this PTX is compiled for. |
| BB2_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB2_33; |
| |
| // ntid.x >= 64: add the slot 32 elements (256 bytes) above. |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB2_23; |
| |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| add.f64 %fd45, %fd45, %fd36; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 32: add the slot 16 elements (128 bytes) above. |
| BB2_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB2_25; |
| |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| add.f64 %fd45, %fd45, %fd37; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 16: add the slot 8 elements (64 bytes) above. |
| BB2_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB2_27; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| add.f64 %fd45, %fd45, %fd38; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 8: add the slot 4 elements (32 bytes) above. |
| BB2_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB2_29; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| add.f64 %fd45, %fd45, %fd39; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 4: add the slot 2 elements (16 bytes) above. |
| BB2_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB2_31; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| add.f64 %fd45, %fd45, %fd40; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 2: add the neighbouring slot (8 bytes above). |
| BB2_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB2_33; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| add.f64 %fd42, %fd45, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| // Only thread 0 continues past this point. |
| BB2_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB2_37; |
| |
| // When nctaid.x <= 9, print the block index and memory[0] through vprintf. |
| // NOTE(review): this looks like diagnostic output -- confirm it is wanted. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB2_36; |
| |
| // Pack the arguments into the local buffer: u32 ctaid.x at offset 0, |
| // f64 memory[0] at offset 8. |
| ld.shared.f64 %fd43, [memory]; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd43; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 0 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 0 |
| |
| // out[ctaid.x] = memory[0] |
| BB2_36: |
| ld.shared.f64 %fd44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 8; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f64 [%rd15], %fd44; |
| |
| BB2_37: |
| ret; |
| } |
| |
| // .globl reduce_sum_f |
| // reduce_sum_f: each thread block sums a strided subset of an f32 array and |
| // writes one partial result per block. |
| //   param_0: address of the f32 input array |
| //   param_1: address of the f32 output array; thread 0 of each block stores the |
| //            block's result at index ctaid.x |
| //   param_2: u32 element count n; indices >= n are never loaded |
| // Per-thread partial sums are staged in the extern shared array `memory`, |
| // one 4-byte slot per thread. |
| .visible .entry reduce_sum_f( |
| .param .u64 reduce_sum_f_param_0, |
| .param .u64 reduce_sum_f_param_1, |
| .param .u32 reduce_sum_f_param_2 |
| ) |
| { |
| // 16-byte local buffer that holds the packed vprintf arguments. |
| .local .align 16 .b8 __local_depot3[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .f32 %f<61>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<16>; |
| |
| |
| mov.u64 %SPL, __local_depot3; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_sum_f_param_0]; |
| ld.param.u64 %rd2, [reduce_sum_f_param_1]; |
| ld.param.u32 %r6, [reduce_sum_f_param_2]; |
| // %r38 = i = (2*ctaid.x)*ntid.x + tid.x ; %f45 = running sum, starts at 0.0 |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f32 %f45, 0f00000000; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB3_4; |
| |
| // Accumulation loop: add in[i], and in[i + ntid.x] when that index is < n. |
| BB3_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| add.f32 %f45, %f45, %f30; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB3_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| add.f32 %f45, %f45, %f31; |
| |
| // i += 2*ntid.x*nctaid.x ; repeat while i < n |
| BB3_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB3_1; |
| |
| // Publish this thread's partial sum at memory[tid.x*4], then barrier. |
| BB3_4: |
| shl.b32 %r16, %r10, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f45; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 add the slot 512 elements (2048 bytes) above. |
| // The branch depends only on ntid.x, so all threads of the block take the |
| // same path to the barrier. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB3_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB3_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| add.f32 %f45, %f45, %f32; |
| st.shared.f32 [%r5], %f45; |
| |
| BB3_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 add the slot 256 elements (1024 bytes) above. |
| BB3_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB3_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB3_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| add.f32 %f45, %f45, %f33; |
| st.shared.f32 [%r5], %f45; |
| |
| BB3_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 add the slot 128 elements (512 bytes) above. |
| BB3_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB3_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB3_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| add.f32 %f45, %f45, %f34; |
| st.shared.f32 [%r5], %f45; |
| |
| BB3_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 add the slot 64 elements (256 bytes) above. |
| BB3_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB3_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB3_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| add.f32 %f45, %f45, %f35; |
| st.shared.f32 [%r5], %f45; |
| |
| BB3_19: |
| bar.sync 0; |
| |
| // Final phase, threads 0..31 only. Steps use volatile shared accesses and no |
| // barrier between them. |
| // NOTE(review): this appears to rely on these 32 threads executing in |
| // lockstep -- confirm that holds on every target this PTX is compiled for. |
| BB3_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB3_33; |
| |
| // ntid.x >= 64: add the slot 32 elements (128 bytes) above. |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB3_23; |
| |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| add.f32 %f45, %f45, %f36; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 32: add the slot 16 elements (64 bytes) above. |
| BB3_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB3_25; |
| |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| add.f32 %f45, %f45, %f37; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 16: add the slot 8 elements (32 bytes) above. |
| BB3_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB3_27; |
| |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| add.f32 %f45, %f45, %f38; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 8: add the slot 4 elements (16 bytes) above. |
| BB3_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB3_29; |
| |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| add.f32 %f45, %f45, %f39; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 4: add the slot 2 elements (8 bytes) above. |
| BB3_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB3_31; |
| |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| add.f32 %f45, %f45, %f40; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 2: add the neighbouring slot (4 bytes above). |
| BB3_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB3_33; |
| |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| add.f32 %f42, %f45, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| // Only thread 0 continues past this point. |
| BB3_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB3_37; |
| |
| // When nctaid.x <= 9, print the block index and memory[0] through vprintf. |
| // NOTE(review): this looks like diagnostic output -- confirm it is wanted. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB3_36; |
| |
| // Pack the arguments into the local buffer: u32 ctaid.x at offset 0, |
| // memory[0] widened to f64 at offset 8. |
| ld.shared.f32 %f43, [memory]; |
| cvt.f64.f32 %fd1, %f43; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd1; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 1 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 1 |
| |
| // out[ctaid.x] = memory[0] |
| BB3_36: |
| ld.shared.f32 %f44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 4; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f32 [%rd15], %f44; |
| |
| BB3_37: |
| ret; |
| } |
| |
| // .globl reduce_row_sum_d |
| // reduce_row_sum_d: block b sums the param_3 consecutive f64 elements that |
| // start at index b*param_3 and writes the total to out[b]. |
| //   param_0: address of the f64 input array |
| //   param_1: address of the f64 output array |
| //   param_2: u32 bound on ctaid.x; blocks with ctaid.x >= param_2 do nothing |
| //            (presumably the row count -- confirm against the caller) |
| //   param_3: u32 number of elements summed per block (presumably the row length) |
| // Per-thread partial sums are staged in the extern shared array `memory`, |
| // one 8-byte slot per thread. |
| .visible .entry reduce_row_sum_d( |
| .param .u64 reduce_row_sum_d_param_0, |
| .param .u64 reduce_row_sum_d_param_1, |
| .param .u32 reduce_row_sum_d_param_2, |
| .param .u32 reduce_row_sum_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_sum_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_sum_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_sum_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_sum_d_param_3]; |
| // Whole-block early exit: the test depends only on ctaid.x, so every thread |
| // of a block takes the same branch and none is left waiting at a barrier. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB4_35; |
| |
| // %r71 = j, starting at tid.x ; %fd6 = running sum, starts at 0.0 |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0d0000000000000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB4_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Loop: sum += in[ctaid.x*param_3 + j] ; j += ntid.x ; repeat while j < param_3 |
| BB4_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| add.f64 %fd6, %fd6, %fd28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB4_3; |
| |
| // Publish this thread's partial sum at memory[tid.x*8], then barrier. |
| BB4_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 add the slot 512 elements (4096 bytes) above. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB4_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB4_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| add.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB4_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 add the slot 256 elements (2048 bytes) above. |
| BB4_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB4_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB4_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| add.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB4_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 add the slot 128 elements (1024 bytes) above. |
| BB4_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB4_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB4_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| add.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB4_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 add the slot 64 elements (512 bytes) above. |
| BB4_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB4_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB4_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| add.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB4_19: |
| bar.sync 0; |
| |
| // Final phase, threads 0..31 only. Steps use volatile shared accesses and no |
| // barrier between them. |
| // NOTE(review): this appears to rely on these 32 threads executing in |
| // lockstep -- confirm that holds on every target this PTX is compiled for. |
| BB4_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB4_33; |
| |
| // ntid.x >= 64: add the slot 32 elements (256 bytes) above. |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB4_23; |
| |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| add.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 32: add the slot 16 elements (128 bytes) above. |
| BB4_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB4_25; |
| |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| add.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 16: add the slot 8 elements (64 bytes) above. |
| BB4_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB4_27; |
| |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| add.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 8: add the slot 4 elements (32 bytes) above. |
| BB4_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB4_29; |
| |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| add.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 4: add the slot 2 elements (16 bytes) above. |
| BB4_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB4_31; |
| |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| add.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 2: add the neighbouring slot (8 bytes above). |
| BB4_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB4_33; |
| |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| add.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| // Thread 0 stores the block result: out[ctaid.x] = memory[0] |
| BB4_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB4_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB4_35: |
| ret; |
| } |
| |
| // .globl reduce_row_sum_f |
| // reduce_row_sum_f: block b sums the param_3 consecutive f32 elements that |
| // start at index b*param_3 and writes the total to out[b]. |
| //   param_0: address of the f32 input array |
| //   param_1: address of the f32 output array |
| //   param_2: u32 bound on ctaid.x; blocks with ctaid.x >= param_2 do nothing |
| //            (presumably the row count -- confirm against the caller) |
| //   param_3: u32 number of elements summed per block (presumably the row length) |
| // Per-thread partial sums are staged in the extern shared array `memory`, |
| // one 4-byte slot per thread. |
| .visible .entry reduce_row_sum_f( |
| .param .u64 reduce_row_sum_f_param_0, |
| .param .u64 reduce_row_sum_f_param_1, |
| .param .u32 reduce_row_sum_f_param_2, |
| .param .u32 reduce_row_sum_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<56>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_sum_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_sum_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_sum_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_sum_f_param_3]; |
| // Whole-block early exit: the test depends only on ctaid.x, so every thread |
| // of a block takes the same branch and none is left waiting at a barrier. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB5_35; |
| |
| // %r71 = j, starting at tid.x ; %f6 = running sum, starts at 0.0 |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0f00000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB5_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Loop: sum += in[ctaid.x*param_3 + j] ; j += ntid.x ; repeat while j < param_3 |
| BB5_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| add.f32 %f6, %f6, %f28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB5_3; |
| |
| // Publish this thread's partial sum at memory[tid.x*4], then barrier. |
| BB5_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 add the slot 512 elements (2048 bytes) above. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB5_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB5_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| add.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB5_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 add the slot 256 elements (1024 bytes) above. |
| BB5_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB5_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB5_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| add.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB5_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 add the slot 128 elements (512 bytes) above. |
| BB5_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB5_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB5_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| add.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB5_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 add the slot 64 elements (256 bytes) above. |
| BB5_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB5_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB5_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| add.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB5_19: |
| bar.sync 0; |
| |
| // Final phase, threads 0..31 only. Steps use volatile shared accesses and no |
| // barrier between them. |
| // NOTE(review): this appears to rely on these 32 threads executing in |
| // lockstep -- confirm that holds on every target this PTX is compiled for. |
| BB5_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB5_33; |
| |
| // ntid.x >= 64: add the slot 32 elements (128 bytes) above. |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB5_23; |
| |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| add.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 32: add the slot 16 elements (64 bytes) above. |
| BB5_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB5_25; |
| |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| add.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 16: add the slot 8 elements (32 bytes) above. |
| BB5_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB5_27; |
| |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| add.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 8: add the slot 4 elements (16 bytes) above. |
| BB5_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB5_29; |
| |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| add.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 4: add the slot 2 elements (8 bytes) above. |
| BB5_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB5_31; |
| |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| add.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 2: add the neighbouring slot (4 bytes above). |
| BB5_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB5_33; |
| |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| add.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| // Thread 0 stores the block result: out[ctaid.x] = memory[0] |
| BB5_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB5_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f40; |
| |
| BB5_35: |
| ret; |
| } |
| |
| // .globl reduce_col_sum_d |
| // reduce_col_sum_d: one thread per index c = ntid.x*ctaid.x + tid.x. |
| // Thread c adds in[c], in[c + param_3], in[c + 2*param_3], ... for every index |
| // below param_2*param_3 and stores the total in out[c]. |
| //   param_0: address of the f64 input array |
| //   param_1: address of the f64 output array |
| //   param_2: u32 extent (presumably the row count -- confirm against the caller) |
| //   param_3: u32 extent used as both the thread bound and the element stride |
| //            (presumably the column count) |
| .visible .entry reduce_col_sum_d( |
| .param .u64 reduce_col_sum_d_param_0, |
| .param .u64 reduce_col_sum_d_param_1, |
| .param .u32 reduce_col_sum_d_param_2, |
| .param .u32 reduce_col_sum_d_param_3 |
| ) |
| { |
| .reg .pred %idle, %finished; |
| .reg .b32 %rows, %cols, %bdim, %blk, %thr; |
| .reg .b32 %col, %limit, %cursor; |
| .reg .f64 %total, %item; |
| .reg .b64 %in_base, %out_ptr, %byte_off, %item_ptr; |
| |
| // col = ntid.x * ctaid.x + tid.x |
| mov.u32 %thr, %tid.x; |
| mov.u32 %blk, %ctaid.x; |
| mov.u32 %bdim, %ntid.x; |
| mad.lo.s32 %col, %bdim, %blk, %thr; |
| |
| // Threads at or beyond param_3 have no output slot. |
| ld.param.u32 %cols, [reduce_col_sum_d_param_3]; |
| setp.ge.u32 %idle, %col, %cols; |
| @%idle bra COLSUM_D_EXIT; |
| |
| // limit = param_3 * param_2 (low 32 bits): one past the last index visited. |
| ld.param.u32 %rows, [reduce_col_sum_d_param_2]; |
| mul.lo.s32 %limit, %cols, %rows; |
| ld.param.u64 %in_base, [reduce_col_sum_d_param_0]; |
| cvta.to.global.u64 %in_base, %in_base; |
| |
| // Walk the input from index col in steps of param_3, accumulating into total. |
| mov.f64 %total, 0d0000000000000000; |
| mov.u32 %cursor, %col; |
| |
| COLSUM_D_STEP: |
| setp.ge.u32 %finished, %cursor, %limit; |
| @%finished bra COLSUM_D_STORE; |
| mul.wide.u32 %byte_off, %cursor, 8; |
| add.s64 %item_ptr, %in_base, %byte_off; |
| ld.global.f64 %item, [%item_ptr]; |
| add.f64 %total, %total, %item; |
| add.s32 %cursor, %cursor, %cols; |
| bra COLSUM_D_STEP; |
| |
| // out[col] = total |
| COLSUM_D_STORE: |
| ld.param.u64 %out_ptr, [reduce_col_sum_d_param_1]; |
| cvta.to.global.u64 %out_ptr, %out_ptr; |
| mul.wide.u32 %byte_off, %col, 8; |
| add.s64 %out_ptr, %out_ptr, %byte_off; |
| st.global.f64 [%out_ptr], %total; |
| |
| COLSUM_D_EXIT: |
| ret; |
| } |
| |
| // .globl reduce_col_sum_f |
| // reduce_col_sum_f: one thread per index c = ntid.x*ctaid.x + tid.x. |
| // Thread c adds in[c], in[c + param_3], in[c + 2*param_3], ... for every index |
| // below param_2*param_3 and stores the total in out[c]. |
| //   param_0: address of the f32 input array |
| //   param_1: address of the f32 output array |
| //   param_2: u32 extent (presumably the row count -- confirm against the caller) |
| //   param_3: u32 extent used as both the thread bound and the element stride |
| //            (presumably the column count) |
| .visible .entry reduce_col_sum_f( |
| .param .u64 reduce_col_sum_f_param_0, |
| .param .u64 reduce_col_sum_f_param_1, |
| .param .u32 reduce_col_sum_f_param_2, |
| .param .u32 reduce_col_sum_f_param_3 |
| ) |
| { |
| .reg .pred %idle, %finished; |
| .reg .b32 %rows, %cols, %bdim, %blk, %thr; |
| .reg .b32 %col, %limit, %cursor; |
| .reg .f32 %total, %item; |
| .reg .b64 %in_base, %out_ptr, %byte_off, %item_ptr; |
| |
| // col = ntid.x * ctaid.x + tid.x |
| mov.u32 %thr, %tid.x; |
| mov.u32 %blk, %ctaid.x; |
| mov.u32 %bdim, %ntid.x; |
| mad.lo.s32 %col, %bdim, %blk, %thr; |
| |
| // Threads at or beyond param_3 have no output slot. |
| ld.param.u32 %cols, [reduce_col_sum_f_param_3]; |
| setp.ge.u32 %idle, %col, %cols; |
| @%idle bra COLSUM_F_EXIT; |
| |
| // limit = param_3 * param_2 (low 32 bits): one past the last index visited. |
| ld.param.u32 %rows, [reduce_col_sum_f_param_2]; |
| mul.lo.s32 %limit, %cols, %rows; |
| ld.param.u64 %in_base, [reduce_col_sum_f_param_0]; |
| cvta.to.global.u64 %in_base, %in_base; |
| |
| // Walk the input from index col in steps of param_3, accumulating into total. |
| mov.f32 %total, 0f00000000; |
| mov.u32 %cursor, %col; |
| |
| COLSUM_F_STEP: |
| setp.ge.u32 %finished, %cursor, %limit; |
| @%finished bra COLSUM_F_STORE; |
| mul.wide.u32 %byte_off, %cursor, 4; |
| add.s64 %item_ptr, %in_base, %byte_off; |
| ld.global.f32 %item, [%item_ptr]; |
| add.f32 %total, %total, %item; |
| add.s32 %cursor, %cursor, %cols; |
| bra COLSUM_F_STEP; |
| |
| // out[col] = total |
| COLSUM_F_STORE: |
| ld.param.u64 %out_ptr, [reduce_col_sum_f_param_1]; |
| cvta.to.global.u64 %out_ptr, %out_ptr; |
| mul.wide.u32 %byte_off, %col, 4; |
| add.s64 %out_ptr, %out_ptr, %byte_off; |
| st.global.f32 [%out_ptr], %total; |
| |
| COLSUM_F_EXIT: |
| ret; |
| } |
| |
| // .globl reduce_max_d |
| // reduce_max_d: each thread block takes the maximum over a strided subset of |
| // an f64 array and writes one partial result per block. |
| //   param_0: address of the f64 input array |
| //   param_1: address of the f64 output array; thread 0 of each block stores the |
| //            block's result at index ctaid.x |
| //   param_2: u32 element count n; indices >= n are never loaded |
| // Per-thread partial maxima are staged in the extern shared array `memory`, |
| // one 8-byte slot per thread. |
| .visible .entry reduce_max_d( |
| .param .u64 reduce_max_d_param_0, |
| .param .u64 reduce_max_d_param_1, |
| .param .u32 reduce_max_d_param_2 |
| ) |
| { |
| // 16-byte local buffer that holds the packed vprintf arguments. |
| .local .align 16 .b8 __local_depot8[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<61>; |
| .reg .b64 %rd<16>; |
| |
| |
| mov.u64 %SPL, __local_depot8; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_max_d_param_0]; |
| ld.param.u64 %rd2, [reduce_max_d_param_1]; |
| ld.param.u32 %r6, [reduce_max_d_param_2]; |
| // %r38 = i = (2*ctaid.x)*ntid.x + tid.x |
| // %fd45 = running maximum, seeded with 0dFFEFFFFFFFFFFFFF, the most negative |
| // finite f64. |
| // NOTE(review): the seed is not -infinity, so a thread that loads nothing, or |
| // only -infinity values, contributes this finite value -- confirm intended. |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f64 %fd45, 0dFFEFFFFFFFFFFFFF; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB8_4; |
| |
| // Accumulation loop: fold in in[i], and in[i + ntid.x] when that index is < n. |
| BB8_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| max.f64 %fd45, %fd45, %fd30; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB8_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| max.f64 %fd45, %fd45, %fd31; |
| |
| // i += 2*ntid.x*nctaid.x ; repeat while i < n |
| BB8_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB8_1; |
| |
| // Publish this thread's partial maximum at memory[tid.x*8], then barrier. |
| BB8_4: |
| shl.b32 %r16, %r10, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd45; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in the slot 512 elements (4096 bytes) above. |
| // The branch depends only on ntid.x, so all threads of the block take the |
| // same path to the barrier. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB8_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB8_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| max.f64 %fd45, %fd45, %fd32; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB8_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in the slot 256 elements (2048 bytes) above. |
| BB8_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB8_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB8_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| max.f64 %fd45, %fd45, %fd33; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB8_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in the slot 128 elements (1024 bytes) above. |
| BB8_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB8_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB8_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| max.f64 %fd45, %fd45, %fd34; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB8_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in the slot 64 elements (512 bytes) above. |
| BB8_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB8_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB8_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| max.f64 %fd45, %fd45, %fd35; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB8_19: |
| bar.sync 0; |
| |
| // Final phase, threads 0..31 only. Steps use volatile shared accesses and no |
| // barrier between them. |
| // NOTE(review): this appears to rely on these 32 threads executing in |
| // lockstep -- confirm that holds on every target this PTX is compiled for. |
| BB8_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB8_33; |
| |
| // ntid.x >= 64: fold in the slot 32 elements (256 bytes) above. |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB8_23; |
| |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| max.f64 %fd45, %fd45, %fd36; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 32: fold in the slot 16 elements (128 bytes) above. |
| BB8_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB8_25; |
| |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| max.f64 %fd45, %fd45, %fd37; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 16: fold in the slot 8 elements (64 bytes) above. |
| BB8_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB8_27; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| max.f64 %fd45, %fd45, %fd38; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 8: fold in the slot 4 elements (32 bytes) above. |
| BB8_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB8_29; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| max.f64 %fd45, %fd45, %fd39; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 4: fold in the slot 2 elements (16 bytes) above. |
| BB8_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB8_31; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| max.f64 %fd45, %fd45, %fd40; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| // ntid.x >= 2: fold in the neighbouring slot (8 bytes above). |
| BB8_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB8_33; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| max.f64 %fd42, %fd45, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| // Only thread 0 continues past this point. |
| BB8_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB8_37; |
| |
| // When nctaid.x <= 9, print the block index and memory[0] through vprintf. |
| // NOTE(review): this looks like diagnostic output -- confirm it is wanted. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB8_36; |
| |
| // Pack the arguments into the local buffer: u32 ctaid.x at offset 0, |
| // f64 memory[0] at offset 8. |
| ld.shared.f64 %fd43, [memory]; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd43; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 2 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 2 |
| |
| // out[ctaid.x] = memory[0] |
| BB8_36: |
| ld.shared.f64 %fd44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 8; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f64 [%rd15], %fd44; |
| |
| BB8_37: |
| ret; |
| } |
| |
| // .globl reduce_max_f |
| // reduce_max_f: each thread block takes the maximum over a strided subset of |
| // an f32 array and writes one partial result per block. |
| //   param_0: address of the f32 input array |
| //   param_1: address of the f32 output array; thread 0 of each block stores the |
| //            block's result at index ctaid.x |
| //   param_2: u32 element count n; indices >= n are never loaded |
| // Per-thread partial maxima are staged in the extern shared array `memory`, |
| // one 4-byte slot per thread. |
| .visible .entry reduce_max_f( |
| .param .u64 reduce_max_f_param_0, |
| .param .u64 reduce_max_f_param_1, |
| .param .u32 reduce_max_f_param_2 |
| ) |
| { |
| // 16-byte local buffer that holds the packed vprintf arguments. |
| .local .align 16 .b8 __local_depot9[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .f32 %f<61>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<16>; |
| |
| |
| mov.u64 %SPL, __local_depot9; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_max_f_param_0]; |
| ld.param.u64 %rd2, [reduce_max_f_param_1]; |
| ld.param.u32 %r6, [reduce_max_f_param_2]; |
| // %r38 = i = (2*ctaid.x)*ntid.x + tid.x |
| // %f45 = running maximum, seeded with 0fFF7FFFFF, the most negative finite f32. |
| // NOTE(review): the seed is not -infinity, so a thread that loads nothing, or |
| // only -infinity values, contributes this finite value -- confirm intended. |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f32 %f45, 0fFF7FFFFF; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB9_4; |
| |
| // Accumulation loop: fold in in[i], and in[i + ntid.x] when that index is < n. |
| BB9_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| max.f32 %f45, %f45, %f30; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB9_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| max.f32 %f45, %f45, %f31; |
| |
| // i += 2*ntid.x*nctaid.x ; repeat while i < n |
| BB9_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB9_1; |
| |
| // Publish this thread's partial maximum at memory[tid.x*4], then barrier. |
| BB9_4: |
| shl.b32 %r16, %r10, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f45; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in the slot 512 elements (2048 bytes) above. |
| // The branch depends only on ntid.x, so all threads of the block take the |
| // same path to the barrier. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB9_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB9_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| max.f32 %f45, %f45, %f32; |
| st.shared.f32 [%r5], %f45; |
| |
| BB9_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in the slot 256 elements (1024 bytes) above. |
| BB9_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB9_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB9_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| max.f32 %f45, %f45, %f33; |
| st.shared.f32 [%r5], %f45; |
| |
| BB9_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in the slot 128 elements (512 bytes) above. |
| BB9_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB9_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB9_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| max.f32 %f45, %f45, %f34; |
| st.shared.f32 [%r5], %f45; |
| |
| BB9_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in the slot 64 elements (256 bytes) above. |
| BB9_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB9_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB9_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| max.f32 %f45, %f45, %f35; |
| st.shared.f32 [%r5], %f45; |
| |
| BB9_19: |
| bar.sync 0; |
| |
| // Final phase, threads 0..31 only. Steps use volatile shared accesses and no |
| // barrier between them. |
| // NOTE(review): this appears to rely on these 32 threads executing in |
| // lockstep -- confirm that holds on every target this PTX is compiled for. |
| BB9_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB9_33; |
| |
| // ntid.x >= 64: fold in the slot 32 elements (128 bytes) above. |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB9_23; |
| |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| max.f32 %f45, %f45, %f36; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 32: fold in the slot 16 elements (64 bytes) above. |
| BB9_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB9_25; |
| |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| max.f32 %f45, %f45, %f37; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 16: fold in the slot 8 elements (32 bytes) above. |
| BB9_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB9_27; |
| |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| max.f32 %f45, %f45, %f38; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 8: fold in the slot 4 elements (16 bytes) above. |
| BB9_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB9_29; |
| |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| max.f32 %f45, %f45, %f39; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 4: fold in the slot 2 elements (8 bytes) above. |
| BB9_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB9_31; |
| |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| max.f32 %f45, %f45, %f40; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| // ntid.x >= 2: fold in the neighbouring slot (4 bytes above). |
| BB9_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB9_33; |
| |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| max.f32 %f42, %f45, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| // Only thread 0 continues past this point. |
| BB9_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB9_37; |
| |
| // When nctaid.x <= 9, print the block index and memory[0] through vprintf. |
| // NOTE(review): this looks like diagnostic output -- confirm it is wanted. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB9_36; |
| |
| // Pack the arguments into the local buffer: u32 ctaid.x at offset 0, |
| // memory[0] widened to f64 at offset 8. |
| ld.shared.f32 %f43, [memory]; |
| cvt.f64.f32 %fd1, %f43; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd1; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 3 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 3 |
| |
| // out[ctaid.x] = memory[0] |
| BB9_36: |
| ld.shared.f32 %f44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 4; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f32 [%rd15], %f44; |
| |
| BB9_37: |
| ret; |
| } |
| |
| // .globl reduce_row_max_d |
| // reduce_row_max_d: row-wise maximum of f64 data; the CTA with index %ctaid.x reduces row %ctaid.x. |
| // param_0: generic address of the input; position i of row r is read at element index r * param_3 + i. |
| // param_1: generic address of the output; thread 0 stores one f64 at element index %ctaid.x. |
| // param_2: u32 bound on %ctaid.x; CTAs at or beyond it return without touching memory. |
| // param_3: u32 number of elements scanned per row (also used as the row stride). |
| // Scratch space is the extern shared array `memory`, one f64 slot per thread. |
| .visible .entry reduce_row_max_d( |
| .param .u64 reduce_row_max_d_param_0, |
| .param .u64 reduce_row_max_d_param_1, |
| .param .u32 reduce_row_max_d_param_2, |
| .param .u32 reduce_row_max_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_max_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_max_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_max_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_max_d_param_3]; |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB10_35; |
| |
| // %fd6 is the running maximum, seeded with 0dFFEFFFFFFFFFFFFF (the most negative finite f64). |
| // Threads whose index is already >= param_3 skip the scan and contribute only the seed. |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0dFFEFFFFFFFFFFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB10_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Scan positions tid, tid + ntid, tid + 2 * ntid, ... of this row while the position is below param_3. |
| BB10_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| max.f64 %fd6, %fd6, %fd28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB10_3; |
| |
| // Publish the per-thread result in shared slot tid (byte offset tid * 8) and wait for the whole CTA. |
| BB10_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // Shared-memory tree, barrier-separated part: for W in {1024, 512, 256, 128}, when ntid >= W the |
| // threads below W/2 fold in the slot W/2 entries above their own, then the CTA synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB10_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB10_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| max.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB10_7: |
| bar.sync 0; |
| |
| BB10_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB10_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB10_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| max.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB10_11: |
| bar.sync 0; |
| |
| BB10_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB10_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB10_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| max.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB10_15: |
| bar.sync 0; |
| |
| BB10_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB10_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB10_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| max.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB10_19: |
| bar.sync 0; |
| |
| // Final part: only threads 0..31 continue. Steps fold in slots tid + 32, 16, 8, 4, 2, 1 using |
| // volatile shared accesses and no bar.sync in between. |
| // NOTE(review): this presumably relies on those 32 threads executing in lockstep - confirm for the GPUs this PTX is JIT-compiled for. |
| // NOTE(review): for ntid <= 32 the steps read slots at or beyond index ntid - confirm the launch sizes `memory` accordingly. |
| BB10_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB10_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB10_23; |
| |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| max.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB10_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB10_25; |
| |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| max.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB10_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB10_27; |
| |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| max.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB10_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB10_29; |
| |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| max.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB10_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB10_31; |
| |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| max.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB10_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB10_33; |
| |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| max.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| // Thread 0 copies shared slot 0 to the output element with index %ctaid.x. |
| BB10_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB10_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB10_35: |
| ret; |
| } |
| |
| // .globl reduce_row_max_f |
| // reduce_row_max_f: row-wise maximum of f32 data; the CTA with index %ctaid.x reduces row %ctaid.x. |
| // param_0: generic address of the input; position i of row r is read at element index r * param_3 + i. |
| // param_1: generic address of the output; thread 0 stores one f32 at element index %ctaid.x. |
| // param_2: u32 bound on %ctaid.x; CTAs at or beyond it return without touching memory. |
| // param_3: u32 number of elements scanned per row (also used as the row stride). |
| // Scratch space is the extern shared array `memory`, one f32 slot per thread. |
| .visible .entry reduce_row_max_f( |
| .param .u64 reduce_row_max_f_param_0, |
| .param .u64 reduce_row_max_f_param_1, |
| .param .u32 reduce_row_max_f_param_2, |
| .param .u32 reduce_row_max_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<56>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_max_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_max_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_max_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_max_f_param_3]; |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB11_35; |
| |
| // %f6 is the running maximum, seeded with 0fFF7FFFFF (the most negative finite f32). |
| // Threads whose index is already >= param_3 skip the scan and contribute only the seed. |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0fFF7FFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB11_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Scan positions tid, tid + ntid, tid + 2 * ntid, ... of this row while the position is below param_3. |
| BB11_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| max.f32 %f6, %f6, %f28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB11_3; |
| |
| // Publish the per-thread result in shared slot tid (byte offset tid * 4) and wait for the whole CTA. |
| BB11_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // Shared-memory tree, barrier-separated part: for W in {1024, 512, 256, 128}, when ntid >= W the |
| // threads below W/2 fold in the slot W/2 entries above their own, then the CTA synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB11_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB11_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| max.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB11_7: |
| bar.sync 0; |
| |
| BB11_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB11_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB11_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| max.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB11_11: |
| bar.sync 0; |
| |
| BB11_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB11_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB11_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| max.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB11_15: |
| bar.sync 0; |
| |
| BB11_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB11_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB11_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| max.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB11_19: |
| bar.sync 0; |
| |
| // Final part: only threads 0..31 continue. Steps fold in slots tid + 32, 16, 8, 4, 2, 1 using |
| // volatile shared accesses and no bar.sync in between. |
| // NOTE(review): this presumably relies on those 32 threads executing in lockstep - confirm for the GPUs this PTX is JIT-compiled for. |
| // NOTE(review): for ntid <= 32 the steps read slots at or beyond index ntid - confirm the launch sizes `memory` accordingly. |
| BB11_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB11_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB11_23; |
| |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| max.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB11_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB11_25; |
| |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| max.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB11_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB11_27; |
| |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| max.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB11_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB11_29; |
| |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| max.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB11_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB11_31; |
| |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| max.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB11_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB11_33; |
| |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| max.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| // Thread 0 copies shared slot 0 to the output element with index %ctaid.x. |
| BB11_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB11_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f40; |
| |
| BB11_35: |
| ret; |
| } |
| |
| // .globl reduce_col_max_d |
| // reduce_col_max_d: column-wise f64 maximum, one thread per column. |
| // Thread c (c = %ntid.x * %ctaid.x + %tid.x, active only when c < param_3) folds the elements at |
| // indices c, c + param_3, c + 2 * param_3, ... below param_2 * param_3 and stores the result at out[c]. |
| // param_0 / param_1 are the generic addresses of the input and the output. |
| .visible .entry reduce_col_max_d( |
| .param .u64 reduce_col_max_d_param_0, |
| .param .u64 reduce_col_max_d_param_1, |
| .param .u32 reduce_col_max_d_param_2, |
| .param .u32 reduce_col_max_d_param_3 |
| ) |
| { |
| .reg .pred %no_column, %no_elements, %keep_going; |
| .reg .b32 %rows, %cols, %total, %blk_size, %blk_id, %thr, %col, %cursor; |
| .reg .f64 %best, %elem; |
| .reg .b64 %in_generic, %out_generic, %in_base, %out_base, %in_off, %in_addr, %out_off, %out_addr; |
| |
| |
| // Column handled by this thread. |
| mov.u32 %blk_size, %ntid.x; |
| mov.u32 %blk_id, %ctaid.x; |
| mov.u32 %thr, %tid.x; |
| mad.lo.s32 %col, %blk_size, %blk_id, %thr; |
| |
| // Threads at or beyond param_3 have nothing to do. |
| ld.param.u32 %cols, [reduce_col_max_d_param_3]; |
| setp.ge.u32 %no_column, %col, %cols; |
| @%no_column bra COL_MAX_D_DONE; |
| |
| ld.param.u64 %in_generic, [reduce_col_max_d_param_0]; |
| ld.param.u64 %out_generic, [reduce_col_max_d_param_1]; |
| ld.param.u32 %rows, [reduce_col_max_d_param_2]; |
| cvta.to.global.u64 %in_base, %in_generic; |
| mul.lo.s32 %total, %cols, %rows; |
| |
| // Seed with the most negative finite f64; this is what gets stored when the scan is skipped. |
| mov.f64 %best, 0dFFEFFFFFFFFFFFFF; |
| setp.ge.u32 %no_elements, %col, %total; |
| @%no_elements bra COL_MAX_D_STORE; |
| |
| mov.u32 %cursor, %col; |
| |
| // Walk down the column in steps of param_3 elements. |
| COL_MAX_D_SCAN: |
| mul.wide.u32 %in_off, %cursor, 8; |
| add.s64 %in_addr, %in_base, %in_off; |
| ld.global.f64 %elem, [%in_addr]; |
| max.f64 %best, %best, %elem; |
| add.s32 %cursor, %cursor, %cols; |
| setp.lt.u32 %keep_going, %cursor, %total; |
| @%keep_going bra COL_MAX_D_SCAN; |
| |
| COL_MAX_D_STORE: |
| cvta.to.global.u64 %out_base, %out_generic; |
| mul.wide.u32 %out_off, %col, 8; |
| add.s64 %out_addr, %out_base, %out_off; |
| st.global.f64 [%out_addr], %best; |
| |
| COL_MAX_D_DONE: |
| ret; |
| } |
| |
| // .globl reduce_col_max_f |
| // reduce_col_max_f: column-wise f32 maximum, one thread per column. |
| // Thread c (c = %ntid.x * %ctaid.x + %tid.x, active only when c < param_3) folds the elements at |
| // indices c, c + param_3, c + 2 * param_3, ... below param_2 * param_3 and stores the result at out[c]. |
| // param_0 / param_1 are the generic addresses of the input and the output. |
| .visible .entry reduce_col_max_f( |
| .param .u64 reduce_col_max_f_param_0, |
| .param .u64 reduce_col_max_f_param_1, |
| .param .u32 reduce_col_max_f_param_2, |
| .param .u32 reduce_col_max_f_param_3 |
| ) |
| { |
| .reg .pred %no_column, %no_elements, %keep_going; |
| .reg .f32 %best, %elem; |
| .reg .b32 %rows, %cols, %total, %blk_size, %blk_id, %thr, %col, %cursor; |
| .reg .b64 %in_generic, %out_generic, %in_base, %out_base, %in_off, %in_addr, %out_off, %out_addr; |
| |
| |
| // Column handled by this thread. |
| mov.u32 %blk_size, %ntid.x; |
| mov.u32 %blk_id, %ctaid.x; |
| mov.u32 %thr, %tid.x; |
| mad.lo.s32 %col, %blk_size, %blk_id, %thr; |
| |
| // Threads at or beyond param_3 have nothing to do. |
| ld.param.u32 %cols, [reduce_col_max_f_param_3]; |
| setp.ge.u32 %no_column, %col, %cols; |
| @%no_column bra COL_MAX_F_DONE; |
| |
| ld.param.u64 %in_generic, [reduce_col_max_f_param_0]; |
| ld.param.u64 %out_generic, [reduce_col_max_f_param_1]; |
| ld.param.u32 %rows, [reduce_col_max_f_param_2]; |
| cvta.to.global.u64 %in_base, %in_generic; |
| mul.lo.s32 %total, %cols, %rows; |
| |
| // Seed with the most negative finite f32; this is what gets stored when the scan is skipped. |
| mov.f32 %best, 0fFF7FFFFF; |
| setp.ge.u32 %no_elements, %col, %total; |
| @%no_elements bra COL_MAX_F_STORE; |
| |
| mov.u32 %cursor, %col; |
| |
| // Walk down the column in steps of param_3 elements. |
| COL_MAX_F_SCAN: |
| mul.wide.u32 %in_off, %cursor, 4; |
| add.s64 %in_addr, %in_base, %in_off; |
| ld.global.f32 %elem, [%in_addr]; |
| max.f32 %best, %best, %elem; |
| add.s32 %cursor, %cursor, %cols; |
| setp.lt.u32 %keep_going, %cursor, %total; |
| @%keep_going bra COL_MAX_F_SCAN; |
| |
| COL_MAX_F_STORE: |
| cvta.to.global.u64 %out_base, %out_generic; |
| mul.wide.u32 %out_off, %col, 4; |
| add.s64 %out_addr, %out_base, %out_off; |
| st.global.f32 [%out_addr], %best; |
| |
| COL_MAX_F_DONE: |
| ret; |
| } |
| |
| // .globl reduce_min_d |
| // reduce_min_d: every CTA computes the minimum of its share of an f64 array and stores it at out[%ctaid.x]. |
| // param_0: generic address of the input array. |
| // param_1: generic address of the output; thread 0 of each CTA stores one f64 at element index %ctaid.x. |
| // param_2: u32 element count n; only indices below n are read. |
| // Scratch: extern shared array `memory` (one f64 slot per thread) and a 16-byte local buffer |
| // that carries the vprintf arguments. |
| .visible .entry reduce_min_d( |
| .param .u64 reduce_min_d_param_0, |
| .param .u64 reduce_min_d_param_1, |
| .param .u32 reduce_min_d_param_2 |
| ) |
| { |
| .local .align 16 .b8 __local_depot14[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<61>; |
| .reg .b64 %rd<16>; |
| |
| |
| mov.u64 %SPL, __local_depot14; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_min_d_param_0]; |
| ld.param.u64 %rd2, [reduce_min_d_param_1]; |
| ld.param.u32 %r6, [reduce_min_d_param_2]; |
| // Start index i = 2 * %ctaid.x * %ntid.x + %tid.x. %fd45 is the running minimum, seeded with |
| // 0d7FEFFFFFFFFFFFFF (the largest finite f64). |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f64 %fd45, 0d7FEFFFFFFFFFFFFF; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB14_4; |
| |
| // Per iteration: fold in[i] and, when i + ntid < n, in[i + ntid]; then advance i by 2 * ntid * nctaid. |
| BB14_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| min.f64 %fd45, %fd45, %fd30; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB14_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| min.f64 %fd45, %fd45, %fd31; |
| |
| BB14_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB14_1; |
| |
| // Publish the per-thread result in shared slot tid (byte offset tid * 8) and wait for the whole CTA. |
| BB14_4: |
| shl.b32 %r16, %r10, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd45; |
| bar.sync 0; |
| // Shared-memory tree, barrier-separated part: for W in {1024, 512, 256, 128}, when ntid >= W the |
| // threads below W/2 fold in the slot W/2 entries above their own, then the CTA synchronizes. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB14_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB14_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| min.f64 %fd45, %fd45, %fd32; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB14_7: |
| bar.sync 0; |
| |
| BB14_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB14_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB14_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| min.f64 %fd45, %fd45, %fd33; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB14_11: |
| bar.sync 0; |
| |
| BB14_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB14_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB14_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| min.f64 %fd45, %fd45, %fd34; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB14_15: |
| bar.sync 0; |
| |
| BB14_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB14_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB14_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| min.f64 %fd45, %fd45, %fd35; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB14_19: |
| bar.sync 0; |
| |
| // Final part: only threads 0..31 continue. Steps fold in slots tid + 32, 16, 8, 4, 2, 1 using |
| // volatile shared accesses and no bar.sync in between. |
| // NOTE(review): this presumably relies on those 32 threads executing in lockstep - confirm for the GPUs this PTX is JIT-compiled for. |
| // NOTE(review): for ntid <= 32 the steps read slots at or beyond index ntid - confirm the launch sizes `memory` accordingly. |
| BB14_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB14_33; |
| |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB14_23; |
| |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| min.f64 %fd45, %fd45, %fd36; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB14_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB14_25; |
| |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| min.f64 %fd45, %fd45, %fd37; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB14_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB14_27; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| min.f64 %fd45, %fd45, %fd38; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB14_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB14_29; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| min.f64 %fd45, %fd45, %fd39; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB14_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB14_31; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| min.f64 %fd45, %fd45, %fd40; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB14_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB14_33; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| min.f64 %fd42, %fd45, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| // Only thread 0 continues past this point. |
| BB14_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB14_37; |
| |
| // When the grid has at most 9 CTAs, print through vprintf with the format in $str |
| // ("blockIdx.x=%d reduction result: %3.1f\n"). The argument buffer holds %ctaid.x at offset 0 |
| // and shared slot 0 at offset 8. The call's return value (%r36) is not used. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB14_36; |
| |
| ld.shared.f64 %fd43, [memory]; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd43; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 4 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 4 |
| |
| // Store shared slot 0 to the output element with index %ctaid.x. |
| BB14_36: |
| ld.shared.f64 %fd44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 8; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f64 [%rd15], %fd44; |
| |
| BB14_37: |
| ret; |
| } |
| |
| // .globl reduce_min_f |
| // reduce_min_f: every CTA computes the minimum of its share of an f32 array and stores it at out[%ctaid.x]. |
| // param_0: generic address of the input array. |
| // param_1: generic address of the output; thread 0 of each CTA stores one f32 at element index %ctaid.x. |
| // param_2: u32 element count n; only indices below n are read. |
| // Scratch: extern shared array `memory` (one f32 slot per thread) and a 16-byte local buffer |
| // that carries the vprintf arguments. |
| .visible .entry reduce_min_f( |
| .param .u64 reduce_min_f_param_0, |
| .param .u64 reduce_min_f_param_1, |
| .param .u32 reduce_min_f_param_2 |
| ) |
| { |
| .local .align 16 .b8 __local_depot15[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .f32 %f<61>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<16>; |
| |
| |
| mov.u64 %SPL, __local_depot15; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_min_f_param_0]; |
| ld.param.u64 %rd2, [reduce_min_f_param_1]; |
| ld.param.u32 %r6, [reduce_min_f_param_2]; |
| // Start index i = 2 * %ctaid.x * %ntid.x + %tid.x. %f45 is the running minimum, seeded with |
| // 0f7F7FFFFF (the largest finite f32). |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f32 %f45, 0f7F7FFFFF; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB15_4; |
| |
| // Per iteration: fold in[i] and, when i + ntid < n, in[i + ntid]; then advance i by 2 * ntid * nctaid. |
| BB15_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| min.f32 %f45, %f45, %f30; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB15_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| min.f32 %f45, %f45, %f31; |
| |
| BB15_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB15_1; |
| |
| // Publish the per-thread result in shared slot tid (byte offset tid * 4) and wait for the whole CTA. |
| BB15_4: |
| shl.b32 %r16, %r10, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f45; |
| bar.sync 0; |
| // Shared-memory tree, barrier-separated part: for W in {1024, 512, 256, 128}, when ntid >= W the |
| // threads below W/2 fold in the slot W/2 entries above their own, then the CTA synchronizes. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB15_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB15_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| min.f32 %f45, %f45, %f32; |
| st.shared.f32 [%r5], %f45; |
| |
| BB15_7: |
| bar.sync 0; |
| |
| BB15_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB15_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB15_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| min.f32 %f45, %f45, %f33; |
| st.shared.f32 [%r5], %f45; |
| |
| BB15_11: |
| bar.sync 0; |
| |
| BB15_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB15_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB15_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| min.f32 %f45, %f45, %f34; |
| st.shared.f32 [%r5], %f45; |
| |
| BB15_15: |
| bar.sync 0; |
| |
| BB15_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB15_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB15_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| min.f32 %f45, %f45, %f35; |
| st.shared.f32 [%r5], %f45; |
| |
| BB15_19: |
| bar.sync 0; |
| |
| // Final part: only threads 0..31 continue. Steps fold in slots tid + 32, 16, 8, 4, 2, 1 using |
| // volatile shared accesses and no bar.sync in between. |
| // NOTE(review): this presumably relies on those 32 threads executing in lockstep - confirm for the GPUs this PTX is JIT-compiled for. |
| // NOTE(review): for ntid <= 32 the steps read slots at or beyond index ntid - confirm the launch sizes `memory` accordingly. |
| BB15_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB15_33; |
| |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB15_23; |
| |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| min.f32 %f45, %f45, %f36; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB15_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB15_25; |
| |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| min.f32 %f45, %f45, %f37; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB15_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB15_27; |
| |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| min.f32 %f45, %f45, %f38; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB15_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB15_29; |
| |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| min.f32 %f45, %f45, %f39; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB15_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB15_31; |
| |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| min.f32 %f45, %f45, %f40; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB15_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB15_33; |
| |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| min.f32 %f42, %f45, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| // Only thread 0 continues past this point. |
| BB15_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB15_37; |
| |
| // When the grid has at most 9 CTAs, print through vprintf with the format in $str |
| // ("blockIdx.x=%d reduction result: %3.1f\n"). The argument buffer holds %ctaid.x at offset 0 |
| // and shared slot 0, widened to f64, at offset 8. The call's return value (%r36) is not used. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB15_36; |
| |
| ld.shared.f32 %f43, [memory]; |
| cvt.f64.f32 %fd1, %f43; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd1; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 5 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 5 |
| |
| // Store shared slot 0 to the output element with index %ctaid.x. |
| BB15_36: |
| ld.shared.f32 %f44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 4; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f32 [%rd15], %f44; |
| |
| BB15_37: |
| ret; |
| } |
| |
| // .globl reduce_row_min_d |
| // reduce_row_min_d: row-wise minimum of f64 data; the CTA with index %ctaid.x reduces row %ctaid.x. |
| // param_0: generic address of the input; position i of row r is read at element index r * param_3 + i. |
| // param_1: generic address of the output; thread 0 stores one f64 at element index %ctaid.x. |
| // param_2: u32 bound on %ctaid.x; CTAs at or beyond it return without touching memory. |
| // param_3: u32 number of elements scanned per row (also used as the row stride). |
| // Scratch space is the extern shared array `memory`, one f64 slot per thread. |
| .visible .entry reduce_row_min_d( |
| .param .u64 reduce_row_min_d_param_0, |
| .param .u64 reduce_row_min_d_param_1, |
| .param .u32 reduce_row_min_d_param_2, |
| .param .u32 reduce_row_min_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_min_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_min_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_min_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_min_d_param_3]; |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB16_35; |
| |
| // %fd6 is the running minimum, seeded with 0d7FEFFFFFFFFFFFFF (the largest finite f64). |
| // Threads whose index is already >= param_3 skip the scan and contribute only the seed. |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0d7FEFFFFFFFFFFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB16_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Scan positions tid, tid + ntid, tid + 2 * ntid, ... of this row while the position is below param_3. |
| BB16_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| min.f64 %fd6, %fd6, %fd28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB16_3; |
| |
| // Publish the per-thread result in shared slot tid (byte offset tid * 8) and wait for the whole CTA. |
| BB16_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // Shared-memory tree, barrier-separated part: for W in {1024, 512, 256, 128}, when ntid >= W the |
| // threads below W/2 fold in the slot W/2 entries above their own, then the CTA synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB16_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB16_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| min.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB16_7: |
| bar.sync 0; |
| |
| BB16_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB16_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB16_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| min.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB16_11: |
| bar.sync 0; |
| |
| BB16_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB16_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB16_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| min.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB16_15: |
| bar.sync 0; |
| |
| BB16_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB16_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB16_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| min.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB16_19: |
| bar.sync 0; |
| |
| // Final part: only threads 0..31 continue. Steps fold in slots tid + 32, 16, 8, 4, 2, 1 using |
| // volatile shared accesses and no bar.sync in between. |
| // NOTE(review): this presumably relies on those 32 threads executing in lockstep - confirm for the GPUs this PTX is JIT-compiled for. |
| // NOTE(review): for ntid <= 32 the steps read slots at or beyond index ntid - confirm the launch sizes `memory` accordingly. |
| BB16_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB16_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB16_23; |
| |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| min.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB16_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB16_25; |
| |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| min.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB16_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB16_27; |
| |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| min.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB16_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB16_29; |
| |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| min.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB16_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB16_31; |
| |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| min.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB16_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB16_33; |
| |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| min.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| // Thread 0 copies shared slot 0 to the output element with index %ctaid.x. |
| BB16_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB16_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB16_35: |
| ret; |
| } |
| |
| // .globl reduce_row_min_f |
| // reduce_row_min_f: row-wise minimum of f32 data; the CTA with index %ctaid.x reduces row %ctaid.x. |
| // param_0: generic address of the input; position i of row r is read at element index r * param_3 + i. |
| // param_1: generic address of the output; thread 0 stores one f32 at element index %ctaid.x. |
| // param_2: u32 bound on %ctaid.x; CTAs at or beyond it return without touching memory. |
| // param_3: u32 number of elements scanned per row (also used as the row stride). |
| // Scratch space is the extern shared array `memory`, one f32 slot per thread. |
| .visible .entry reduce_row_min_f( |
| .param .u64 reduce_row_min_f_param_0, |
| .param .u64 reduce_row_min_f_param_1, |
| .param .u32 reduce_row_min_f_param_2, |
| .param .u32 reduce_row_min_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<56>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_min_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_min_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_min_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_min_f_param_3]; |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB17_35; |
| |
| // %f6 is the running minimum, seeded with 0f7F7FFFFF (the largest finite f32). |
| // Threads whose index is already >= param_3 skip the scan and contribute only the seed. |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0f7F7FFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB17_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Scan positions tid, tid + ntid, tid + 2 * ntid, ... of this row while the position is below param_3. |
| BB17_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| min.f32 %f6, %f6, %f28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB17_3; |
| |
| // Publish the per-thread result in shared slot tid (byte offset tid * 4) and wait for the whole CTA. |
| BB17_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // Shared-memory tree, barrier-separated part: for W in {1024, 512, 256, 128}, when ntid >= W the |
| // threads below W/2 fold in the slot W/2 entries above their own, then the CTA synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB17_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB17_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| min.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB17_7: |
| bar.sync 0; |
| |
| BB17_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB17_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB17_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| min.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB17_11: |
| bar.sync 0; |
| |
| BB17_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB17_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB17_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| min.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB17_15: |
| bar.sync 0; |
| |
| BB17_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB17_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB17_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| min.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB17_19: |
| bar.sync 0; |
| |
| // Final part: only threads 0..31 continue. Steps fold in slots tid + 32, 16, 8, 4, 2, 1 using |
| // volatile shared accesses and no bar.sync in between. |
| // NOTE(review): this presumably relies on those 32 threads executing in lockstep - confirm for the GPUs this PTX is JIT-compiled for. |
| // NOTE(review): for ntid <= 32 the steps read slots at or beyond index ntid - confirm the launch sizes `memory` accordingly. |
| BB17_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB17_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB17_23; |
| |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| min.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB17_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB17_25; |
| |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| min.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB17_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB17_27; |
| |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| min.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB17_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB17_29; |
| |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| min.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB17_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB17_31; |
| |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| min.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB17_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB17_33; |
| |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| min.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| // Thread 0 copies shared slot 0 to the output element with index %ctaid.x. |
| BB17_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB17_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f40; |
| |
| BB17_35: |
| ret; |
| } |
| |
| // .globl reduce_col_min_d |
| // reduce_col_min_d: column-wise f64 minimum, one thread per column. |
| // Thread c (c = %ntid.x * %ctaid.x + %tid.x, active only when c < param_3) folds the elements at |
| // indices c, c + param_3, c + 2 * param_3, ... below param_2 * param_3 and stores the result at out[c]. |
| // param_0 / param_1 are the generic addresses of the input and the output. |
| .visible .entry reduce_col_min_d( |
| .param .u64 reduce_col_min_d_param_0, |
| .param .u64 reduce_col_min_d_param_1, |
| .param .u32 reduce_col_min_d_param_2, |
| .param .u32 reduce_col_min_d_param_3 |
| ) |
| { |
| .reg .pred %no_column, %no_elements, %keep_going; |
| .reg .b32 %rows, %cols, %total, %blk_size, %blk_id, %thr, %col, %cursor; |
| .reg .f64 %best, %elem; |
| .reg .b64 %in_generic, %out_generic, %in_base, %out_base, %in_off, %in_addr, %out_off, %out_addr; |
| |
| |
| // Column handled by this thread. |
| mov.u32 %blk_size, %ntid.x; |
| mov.u32 %blk_id, %ctaid.x; |
| mov.u32 %thr, %tid.x; |
| mad.lo.s32 %col, %blk_size, %blk_id, %thr; |
| |
| // Threads at or beyond param_3 have nothing to do. |
| ld.param.u32 %cols, [reduce_col_min_d_param_3]; |
| setp.ge.u32 %no_column, %col, %cols; |
| @%no_column bra COL_MIN_D_DONE; |
| |
| ld.param.u64 %in_generic, [reduce_col_min_d_param_0]; |
| ld.param.u64 %out_generic, [reduce_col_min_d_param_1]; |
| ld.param.u32 %rows, [reduce_col_min_d_param_2]; |
| cvta.to.global.u64 %in_base, %in_generic; |
| mul.lo.s32 %total, %cols, %rows; |
| |
| // Seed with the largest finite f64; this is what gets stored when the scan is skipped. |
| mov.f64 %best, 0d7FEFFFFFFFFFFFFF; |
| setp.ge.u32 %no_elements, %col, %total; |
| @%no_elements bra COL_MIN_D_STORE; |
| |
| mov.u32 %cursor, %col; |
| |
| // Walk down the column in steps of param_3 elements. |
| COL_MIN_D_SCAN: |
| mul.wide.u32 %in_off, %cursor, 8; |
| add.s64 %in_addr, %in_base, %in_off; |
| ld.global.f64 %elem, [%in_addr]; |
| min.f64 %best, %best, %elem; |
| add.s32 %cursor, %cursor, %cols; |
| setp.lt.u32 %keep_going, %cursor, %total; |
| @%keep_going bra COL_MIN_D_SCAN; |
| |
| COL_MIN_D_STORE: |
| cvta.to.global.u64 %out_base, %out_generic; |
| mul.wide.u32 %out_off, %col, 8; |
| add.s64 %out_addr, %out_base, %out_off; |
| st.global.f64 [%out_addr], %best; |
| |
| COL_MIN_D_DONE: |
| ret; |
| } |
| |
| // .globl reduce_col_min_f |
| // reduce_col_min_f: column-wise f32 minimum, one thread per column. |
| // Thread c (c = %ntid.x * %ctaid.x + %tid.x, active only when c < param_3) folds the elements at |
| // indices c, c + param_3, c + 2 * param_3, ... below param_2 * param_3 and stores the result at out[c]. |
| // param_0 / param_1 are the generic addresses of the input and the output. |
| .visible .entry reduce_col_min_f( |
| .param .u64 reduce_col_min_f_param_0, |
| .param .u64 reduce_col_min_f_param_1, |
| .param .u32 reduce_col_min_f_param_2, |
| .param .u32 reduce_col_min_f_param_3 |
| ) |
| { |
| .reg .pred %no_column, %no_elements, %keep_going; |
| .reg .f32 %best, %elem; |
| .reg .b32 %rows, %cols, %total, %blk_size, %blk_id, %thr, %col, %cursor; |
| .reg .b64 %in_generic, %out_generic, %in_base, %out_base, %in_off, %in_addr, %out_off, %out_addr; |
| |
| |
| // Column handled by this thread. |
| mov.u32 %blk_size, %ntid.x; |
| mov.u32 %blk_id, %ctaid.x; |
| mov.u32 %thr, %tid.x; |
| mad.lo.s32 %col, %blk_size, %blk_id, %thr; |
| |
| // Threads at or beyond param_3 have nothing to do. |
| ld.param.u32 %cols, [reduce_col_min_f_param_3]; |
| setp.ge.u32 %no_column, %col, %cols; |
| @%no_column bra COL_MIN_F_DONE; |
| |
| ld.param.u64 %in_generic, [reduce_col_min_f_param_0]; |
| ld.param.u64 %out_generic, [reduce_col_min_f_param_1]; |
| ld.param.u32 %rows, [reduce_col_min_f_param_2]; |
| cvta.to.global.u64 %in_base, %in_generic; |
| mul.lo.s32 %total, %cols, %rows; |
| |
| // Seed with the largest finite f32; this is what gets stored when the scan is skipped. |
| mov.f32 %best, 0f7F7FFFFF; |
| setp.ge.u32 %no_elements, %col, %total; |
| @%no_elements bra COL_MIN_F_STORE; |
| |
| mov.u32 %cursor, %col; |
| |
| // Walk down the column in steps of param_3 elements. |
| COL_MIN_F_SCAN: |
| mul.wide.u32 %in_off, %cursor, 4; |
| add.s64 %in_addr, %in_base, %in_off; |
| ld.global.f32 %elem, [%in_addr]; |
| min.f32 %best, %best, %elem; |
| add.s32 %cursor, %cursor, %cols; |
| setp.lt.u32 %keep_going, %cursor, %total; |
| @%keep_going bra COL_MIN_F_SCAN; |
| |
| COL_MIN_F_STORE: |
| cvta.to.global.u64 %out_base, %out_generic; |
| mul.wide.u32 %out_off, %col, 4; |
| add.s64 %out_addr, %out_base, %out_off; |
| st.global.f32 [%out_addr], %best; |
| |
| COL_MIN_F_DONE: |
| ret; |
| } |
| |
| // .globl reduce_sum_sq_d |
| // reduce_sum_sq_d: every CTA computes the sum of squares of its share of an f64 array and stores it at out[%ctaid.x]. |
| // param_0: generic address of the input array. |
| // param_1: generic address of the output; thread 0 of each CTA stores one f64 at element index %ctaid.x. |
| // param_2: u32 element count n; only indices below n are read. |
| // Scratch: extern shared array `memory` (one f64 slot per thread) and a 16-byte local buffer |
| // that carries the vprintf arguments. |
| // |
| // Fix: input elements are squared exactly once, in the global-load loop. The shared-memory tree |
| // now combines the per-thread partial sums with add.f64; it used fma(x, x, acc) there as well, |
| // which squared every partial sum a second time. |
| // NOTE(review): if the host runs this kernel again over the per-CTA outputs to finish the |
| // reduction, the load loop would square those partials - verify against the caller. |
| .visible .entry reduce_sum_sq_d( |
| .param .u64 reduce_sum_sq_d_param_0, |
| .param .u64 reduce_sum_sq_d_param_1, |
| .param .u32 reduce_sum_sq_d_param_2 |
| ) |
| { |
| .local .align 16 .b8 __local_depot20[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<61>; |
| .reg .b64 %rd<16>; |
| |
| |
| mov.u64 %SPL, __local_depot20; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_sum_sq_d_param_0]; |
| ld.param.u64 %rd2, [reduce_sum_sq_d_param_1]; |
| ld.param.u32 %r6, [reduce_sum_sq_d_param_2]; |
| // Start index i = 2 * %ctaid.x * %ntid.x + %tid.x. %fd45 is the running sum, seeded with 0.0. |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f64 %fd45, 0d0000000000000000; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB20_4; |
| |
| // Per iteration: add in[i]^2 and, when i + ntid < n, in[i + ntid]^2; then advance i by 2 * ntid * nctaid. |
| BB20_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| fma.rn.f64 %fd45, %fd30, %fd30, %fd45; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB20_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| fma.rn.f64 %fd45, %fd31, %fd31, %fd45; |
| |
| BB20_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB20_1; |
| |
| // Publish the per-thread partial sum in shared slot tid (byte offset tid * 8) and wait for the whole CTA. |
| BB20_4: |
| shl.b32 %r16, %r10, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd45; |
| bar.sync 0; |
| // Shared-memory tree, barrier-separated part: for W in {1024, 512, 256, 128}, when ntid >= W the |
| // threads below W/2 add the slot W/2 entries above their own, then the CTA synchronizes. |
| // Slots already hold sums of squares, so they are combined with a plain add. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB20_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB20_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| add.f64 %fd45, %fd45, %fd32; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB20_7: |
| bar.sync 0; |
| |
| BB20_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB20_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB20_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| add.f64 %fd45, %fd45, %fd33; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB20_11: |
| bar.sync 0; |
| |
| BB20_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB20_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB20_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| add.f64 %fd45, %fd45, %fd34; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB20_15: |
| bar.sync 0; |
| |
| BB20_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB20_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB20_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| add.f64 %fd45, %fd45, %fd35; |
| st.shared.f64 [%r5], %fd45; |
| |
| BB20_19: |
| bar.sync 0; |
| |
| // Final part: only threads 0..31 continue. Steps add slots tid + 32, 16, 8, 4, 2, 1 using |
| // volatile shared accesses and no bar.sync in between. |
| // NOTE(review): this presumably relies on those 32 threads executing in lockstep - confirm for the GPUs this PTX is JIT-compiled for. |
| // NOTE(review): for ntid <= 32 the steps read slots at or beyond index ntid - confirm the launch sizes `memory` accordingly. |
| BB20_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB20_33; |
| |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB20_23; |
| |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| add.f64 %fd45, %fd45, %fd36; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB20_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB20_25; |
| |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| add.f64 %fd45, %fd45, %fd37; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB20_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB20_27; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| add.f64 %fd45, %fd45, %fd38; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB20_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB20_29; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| add.f64 %fd45, %fd45, %fd39; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB20_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB20_31; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| add.f64 %fd45, %fd45, %fd40; |
| st.volatile.shared.f64 [%r5], %fd45; |
| |
| BB20_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB20_33; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| add.f64 %fd42, %fd45, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| // Only thread 0 continues past this point. |
| BB20_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB20_37; |
| |
| // When the grid has at most 9 CTAs, print through vprintf with the format in $str |
| // ("blockIdx.x=%d reduction result: %3.1f\n"). The argument buffer holds %ctaid.x at offset 0 |
| // and shared slot 0 at offset 8. The call's return value (%r36) is not used. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB20_36; |
| |
| ld.shared.f64 %fd43, [memory]; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd43; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 6 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 6 |
| |
| // Store shared slot 0 to the output element with index %ctaid.x. |
| BB20_36: |
| ld.shared.f64 %fd44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 8; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f64 [%rd15], %fd44; |
| |
| BB20_37: |
| ret; |
| } |
| |
| // .globl reduce_sum_sq_f |
| // |
| // reduce_sum_sq_f: per-CTA sum of squares over an f32 array. |
| // param_0 (u64): address of the f32 input (converted to a global address before loading). |
| // param_1 (u64): address of the f32 output; thread 0 of each CTA stores to out[%ctaid.x]. |
| // param_2 (u32): element count n; every index comparison below is unsigned. |
| // Scratch: one f32 slot per thread in the extern shared array `memory`. |
| // |
| // Invariant: only raw input elements are squared (fma in the load loop). Partial sums |
| // read back from shared memory are already sums of squares, so every combine step |
| // uses a plain add; squaring them again would corrupt the result. |
| .visible .entry reduce_sum_sq_f( |
| .param .u64 reduce_sum_sq_f_param_0, |
| .param .u64 reduce_sum_sq_f_param_1, |
| .param .u32 reduce_sum_sq_f_param_2 |
| ) |
| { |
| .local .align 16 .b8 __local_depot21[16]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<21>; |
| .reg .f32 %f<61>; |
| .reg .b32 %r<39>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<16>; |
| |
| |
| // Local depot holds the two vprintf arguments (u32 at +0, f64 at +8). |
| mov.u64 %SPL, __local_depot21; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd1, [reduce_sum_sq_f_param_0]; |
| ld.param.u64 %rd2, [reduce_sum_sq_f_param_1]; |
| ld.param.u32 %r6, [reduce_sum_sq_f_param_2]; |
| // %r38 = i = 2 * %ctaid.x * %ntid.x + %tid.x; %f45 = running sum, starts at 0.0f. |
| mov.u32 %r7, %ctaid.x; |
| shl.b32 %r8, %r7, 1; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r38, %r8, %r9, %r10; |
| mov.f32 %f45, 0f00000000; |
| setp.ge.u32 %p1, %r38, %r6; |
| @%p1 bra BB21_4; |
| |
| // Load loop: accumulate in[i]^2, and in[i + %ntid.x]^2 when that index is < n. |
| BB21_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r38, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| fma.rn.f32 %f45, %f30, %f30, %f45; |
| add.s32 %r3, %r38, %r9; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB21_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| fma.rn.f32 %f45, %f31, %f31, %f45; |
| |
| // Advance i by 2 * %ntid.x * %nctaid.x and repeat while i < n. |
| BB21_3: |
| shl.b32 %r13, %r9, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r38, %r13, %r14, %r38; |
| setp.lt.u32 %p3, %r38, %r6; |
| @%p3 bra BB21_1; |
| |
| // Publish this thread's partial sum at memory[%tid.x] (4 bytes per slot). |
| BB21_4: |
| shl.b32 %r16, %r10, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f45; |
| bar.sync 0; |
| // Tree step for %ntid.x >= 1024: threads 0..511 add the partial 512 slots above. |
| setp.lt.u32 %p4, %r9, 1024; |
| @%p4 bra BB21_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB21_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| add.f32 %f45, %f45, %f32; |
| st.shared.f32 [%r5], %f45; |
| |
| BB21_7: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 512: threads 0..255 add the partial 256 slots above. |
| BB21_8: |
| setp.lt.u32 %p6, %r9, 512; |
| @%p6 bra BB21_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB21_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| add.f32 %f45, %f45, %f33; |
| st.shared.f32 [%r5], %f45; |
| |
| BB21_11: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 256: threads 0..127 add the partial 128 slots above. |
| BB21_12: |
| setp.lt.u32 %p8, %r9, 256; |
| @%p8 bra BB21_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB21_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| add.f32 %f45, %f45, %f34; |
| st.shared.f32 [%r5], %f45; |
| |
| BB21_15: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 128: threads 0..63 add the partial 64 slots above. |
| BB21_16: |
| setp.lt.u32 %p10, %r9, 128; |
| @%p10 bra BB21_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB21_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| add.f32 %f45, %f45, %f35; |
| st.shared.f32 [%r5], %f45; |
| |
| BB21_19: |
| bar.sync 0; |
| |
| // Final steps: only threads 0..31 continue. They use volatile shared accesses and |
| // no bar.sync is issued between the remaining steps (offsets 32, 16, 8, 4, 2, 1 slots). |
| BB21_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB21_33; |
| |
| setp.lt.u32 %p13, %r9, 64; |
| @%p13 bra BB21_23; |
| |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| add.f32 %f45, %f45, %f36; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB21_23: |
| setp.lt.u32 %p14, %r9, 32; |
| @%p14 bra BB21_25; |
| |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| add.f32 %f45, %f45, %f37; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB21_25: |
| setp.lt.u32 %p15, %r9, 16; |
| @%p15 bra BB21_27; |
| |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| add.f32 %f45, %f45, %f38; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB21_27: |
| setp.lt.u32 %p16, %r9, 8; |
| @%p16 bra BB21_29; |
| |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| add.f32 %f45, %f45, %f39; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB21_29: |
| setp.lt.u32 %p17, %r9, 4; |
| @%p17 bra BB21_31; |
| |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| add.f32 %f45, %f45, %f40; |
| st.volatile.shared.f32 [%r5], %f45; |
| |
| BB21_31: |
| setp.lt.u32 %p18, %r9, 2; |
| @%p18 bra BB21_33; |
| |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| add.f32 %f42, %f45, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| // Only thread 0 reports and stores the CTA result. |
| BB21_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB21_37; |
| |
| // When the grid has at most 9 CTAs, print (%ctaid.x, result widened to f64) using the |
| // format string $str via vprintf; larger grids skip the print. |
| mov.u32 %r34, %nctaid.x; |
| setp.gt.u32 %p20, %r34, 9; |
| @%p20 bra BB21_36; |
| |
| ld.shared.f32 %f43, [memory]; |
| cvt.f64.f32 %fd1, %f43; |
| add.u64 %rd9, %SP, 0; |
| add.u64 %rd10, %SPL, 0; |
| st.local.u32 [%rd10], %r7; |
| st.local.f64 [%rd10+8], %fd1; |
| mov.u64 %rd11, $str; |
| cvta.global.u64 %rd12, %rd11; |
| // Callseq Start 7 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.b64 [param0+0], %rd12; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd9; |
| .param .b32 retval0; |
| call.uni (retval0), |
| vprintf, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.b32 %r36, [retval0+0]; |
| |
| //{ |
| }// Callseq End 7 |
| |
| // out[%ctaid.x] = memory[0] |
| BB21_36: |
| ld.shared.f32 %f44, [memory]; |
| cvta.to.global.u64 %rd13, %rd2; |
| mul.wide.u32 %rd14, %r7, 4; |
| add.s64 %rd15, %rd13, %rd14; |
| st.global.f32 [%rd15], %f44; |
| |
| BB21_37: |
| ret; |
| } |
| |
| // .globl reduce_col_sum_sq_d |
| // |
| // reduce_col_sum_sq_d: thread t = %ntid.x * %ctaid.x + %tid.x, for t < param_3, |
| // accumulates in[i]^2 (f64) over i = t, t + param_3, ... while i < param_2 * param_3 |
| // (unsigned 32-bit compare), then stores the sum to out[t]. |
| // Threads with t >= param_3 return without touching memory. |
| // param_0 = input address, param_1 = output address; both are converted to global. |
| .visible .entry reduce_col_sum_sq_d( |
| .param .u64 reduce_col_sum_sq_d_param_0, |
| .param .u64 reduce_col_sum_sq_d_param_1, |
| .param .u32 reduce_col_sum_sq_d_param_2, |
| .param .u32 reduce_col_sum_sq_d_param_3 |
| ) |
| { |
| .reg .pred %p_idle, %p_done; |
| .reg .b32 %n_outer, %n_stride, %blk_dim, %blk_idx, %thr_idx, %col, %limit, %cur; |
| .reg .f64 %acc, %val; |
| .reg .b64 %src_gen, %dst_gen, %src, %dst, %byte_off, %elem_ptr; |
| |
| |
| ld.param.u64 %src_gen, [reduce_col_sum_sq_d_param_0]; |
| ld.param.u64 %dst_gen, [reduce_col_sum_sq_d_param_1]; |
| ld.param.u32 %n_outer, [reduce_col_sum_sq_d_param_2]; |
| ld.param.u32 %n_stride, [reduce_col_sum_sq_d_param_3]; |
| mov.u32 %blk_dim, %ntid.x; |
| mov.u32 %blk_idx, %ctaid.x; |
| mov.u32 %thr_idx, %tid.x; |
| mad.lo.s32 %col, %blk_dim, %blk_idx, %thr_idx; |
| // Guard: nothing to do for threads past the last output slot. |
| setp.ge.u32 %p_idle, %col, %n_stride; |
| @%p_idle bra COL_SQ_D_EXIT; |
| |
| cvta.to.global.u64 %src, %src_gen; |
| cvta.to.global.u64 %dst, %dst_gen; |
| // %limit = param_3 * param_2 (low 32 bits): one past the last input index. |
| mul.lo.s32 %limit, %n_stride, %n_outer; |
| mov.f64 %acc, 0d0000000000000000; |
| mov.u32 %cur, %col; |
| |
| // while (cur < limit) { acc += in[cur]^2; cur += param_3; } |
| COL_SQ_D_TEST: |
| setp.ge.u32 %p_done, %cur, %limit; |
| @%p_done bra COL_SQ_D_STORE; |
| |
| mul.wide.u32 %byte_off, %cur, 8; |
| add.s64 %elem_ptr, %src, %byte_off; |
| ld.global.f64 %val, [%elem_ptr]; |
| fma.rn.f64 %acc, %val, %val, %acc; |
| add.s32 %cur, %cur, %n_stride; |
| bra COL_SQ_D_TEST; |
| |
| // out[col] = acc |
| COL_SQ_D_STORE: |
| mul.wide.u32 %byte_off, %col, 8; |
| add.s64 %elem_ptr, %dst, %byte_off; |
| st.global.f64 [%elem_ptr], %acc; |
| |
| COL_SQ_D_EXIT: |
| ret; |
| } |
| |
| // .globl reduce_col_sum_sq_f |
| // |
| // reduce_col_sum_sq_f: thread t = %ntid.x * %ctaid.x + %tid.x, for t < param_3, |
| // accumulates in[i]^2 (f32) over i = t, t + param_3, ... while i < param_2 * param_3 |
| // (unsigned 32-bit compare), then stores the sum to out[t]. |
| // Threads with t >= param_3 return without touching memory. |
| // param_0 = input address, param_1 = output address; both are converted to global. |
| .visible .entry reduce_col_sum_sq_f( |
| .param .u64 reduce_col_sum_sq_f_param_0, |
| .param .u64 reduce_col_sum_sq_f_param_1, |
| .param .u32 reduce_col_sum_sq_f_param_2, |
| .param .u32 reduce_col_sum_sq_f_param_3 |
| ) |
| { |
| .reg .pred %p_idle, %p_done; |
| .reg .f32 %acc, %val; |
| .reg .b32 %n_outer, %n_stride, %blk_dim, %blk_idx, %thr_idx, %col, %limit, %cur; |
| .reg .b64 %src_gen, %dst_gen, %src, %dst, %byte_off, %elem_ptr; |
| |
| |
| ld.param.u64 %src_gen, [reduce_col_sum_sq_f_param_0]; |
| ld.param.u64 %dst_gen, [reduce_col_sum_sq_f_param_1]; |
| ld.param.u32 %n_outer, [reduce_col_sum_sq_f_param_2]; |
| ld.param.u32 %n_stride, [reduce_col_sum_sq_f_param_3]; |
| mov.u32 %blk_dim, %ntid.x; |
| mov.u32 %blk_idx, %ctaid.x; |
| mov.u32 %thr_idx, %tid.x; |
| mad.lo.s32 %col, %blk_dim, %blk_idx, %thr_idx; |
| // Guard: nothing to do for threads past the last output slot. |
| setp.ge.u32 %p_idle, %col, %n_stride; |
| @%p_idle bra COL_SQ_F_EXIT; |
| |
| cvta.to.global.u64 %src, %src_gen; |
| cvta.to.global.u64 %dst, %dst_gen; |
| // %limit = param_3 * param_2 (low 32 bits): one past the last input index. |
| mul.lo.s32 %limit, %n_stride, %n_outer; |
| mov.f32 %acc, 0f00000000; |
| mov.u32 %cur, %col; |
| |
| // while (cur < limit) { acc += in[cur]^2; cur += param_3; } |
| COL_SQ_F_TEST: |
| setp.ge.u32 %p_done, %cur, %limit; |
| @%p_done bra COL_SQ_F_STORE; |
| |
| mul.wide.u32 %byte_off, %cur, 4; |
| add.s64 %elem_ptr, %src, %byte_off; |
| ld.global.f32 %val, [%elem_ptr]; |
| fma.rn.f32 %acc, %val, %val, %acc; |
| add.s32 %cur, %cur, %n_stride; |
| bra COL_SQ_F_TEST; |
| |
| // out[col] = acc |
| COL_SQ_F_STORE: |
| mul.wide.u32 %byte_off, %col, 4; |
| add.s64 %elem_ptr, %dst, %byte_off; |
| st.global.f32 [%elem_ptr], %acc; |
| |
| COL_SQ_F_EXIT: |
| ret; |
| } |
| |
| // .globl reduce_row_sum_sq_d |
| // |
| // reduce_row_sum_sq_d: one CTA per row; writes the f64 sum of squares of row %ctaid.x. |
| // param_0 (u64): address of the f64 input (converted to a global address before loading). |
| // param_1 (u64): address of the f64 output; thread 0 stores to out[%ctaid.x]. |
| // param_2 (u32): CTAs with %ctaid.x >= param_2 return immediately (row count). |
| // param_3 (u32): row length; element j of the row is in[%ctaid.x * param_3 + j]. |
| // Scratch: one f64 slot per thread in the extern shared array `memory`. |
| // |
| // Invariant: only raw input elements are squared (fma in the load loop). Partial sums |
| // read back from shared memory are already sums of squares, so every combine step |
| // uses a plain add; squaring them again would corrupt the row result. |
| .visible .entry reduce_row_sum_sq_d( |
| .param .u64 reduce_row_sum_sq_d_param_0, |
| .param .u64 reduce_row_sum_sq_d_param_1, |
| .param .u32 reduce_row_sum_sq_d_param_2, |
| .param .u32 reduce_row_sum_sq_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_sum_sq_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_sum_sq_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_sum_sq_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_sum_sq_d_param_3]; |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB24_35; |
| |
| // %r71 = j, starting at %tid.x; %fd6 = running sum, starts at 0.0. |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0d0000000000000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB24_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Load loop: accumulate in[%ctaid.x * param_3 + j]^2, j += %ntid.x while j < param_3. |
| BB24_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| fma.rn.f64 %fd6, %fd28, %fd28, %fd6; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB24_3; |
| |
| // Publish this thread's partial sum at memory[%tid.x] (8 bytes per slot). |
| BB24_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // Tree step for %ntid.x >= 1024: threads 0..511 add the partial 512 slots above. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB24_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB24_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| add.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB24_7: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 512: threads 0..255 add the partial 256 slots above. |
| BB24_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB24_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB24_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| add.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB24_11: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 256: threads 0..127 add the partial 128 slots above. |
| BB24_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB24_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB24_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| add.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB24_15: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 128: threads 0..63 add the partial 64 slots above. |
| BB24_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB24_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB24_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| add.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB24_19: |
| bar.sync 0; |
| |
| // Final steps: only threads 0..31 continue. They use volatile shared accesses and |
| // no bar.sync is issued between the remaining steps (offsets 32, 16, 8, 4, 2, 1 slots). |
| BB24_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB24_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB24_23; |
| |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| add.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB24_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB24_25; |
| |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| add.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB24_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB24_27; |
| |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| add.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB24_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB24_29; |
| |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| add.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB24_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB24_31; |
| |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| add.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB24_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB24_33; |
| |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| add.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| // Thread 0 writes the row result: out[%ctaid.x] = memory[0]. |
| BB24_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB24_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB24_35: |
| ret; |
| } |
| |
| // .globl reduce_row_sum_sq_f |
| // |
| // reduce_row_sum_sq_f: one CTA per row; writes the f32 sum of squares of row %ctaid.x. |
| // param_0 (u64): address of the f32 input (converted to a global address before loading). |
| // param_1 (u64): address of the f32 output; thread 0 stores to out[%ctaid.x]. |
| // param_2 (u32): CTAs with %ctaid.x >= param_2 return immediately (row count). |
| // param_3 (u32): row length; element j of the row is in[%ctaid.x * param_3 + j]. |
| // Scratch: one f32 slot per thread in the extern shared array `memory`. |
| // |
| // Invariant: only raw input elements are squared (fma in the load loop). Partial sums |
| // read back from shared memory are already sums of squares, so every combine step |
| // uses a plain add; squaring them again would corrupt the row result. |
| .visible .entry reduce_row_sum_sq_f( |
| .param .u64 reduce_row_sum_sq_f_param_0, |
| .param .u64 reduce_row_sum_sq_f_param_1, |
| .param .u32 reduce_row_sum_sq_f_param_2, |
| .param .u32 reduce_row_sum_sq_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<56>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_sum_sq_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_sum_sq_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_sum_sq_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_sum_sq_f_param_3]; |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB25_35; |
| |
| // %r71 = j, starting at %tid.x; %f6 = running sum, starts at 0.0f. |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0f00000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB25_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Load loop: accumulate in[%ctaid.x * param_3 + j]^2, j += %ntid.x while j < param_3. |
| BB25_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| fma.rn.f32 %f6, %f28, %f28, %f6; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB25_3; |
| |
| // Publish this thread's partial sum at memory[%tid.x] (4 bytes per slot). |
| BB25_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // Tree step for %ntid.x >= 1024: threads 0..511 add the partial 512 slots above. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB25_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB25_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| add.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB25_7: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 512: threads 0..255 add the partial 256 slots above. |
| BB25_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB25_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB25_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| add.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB25_11: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 256: threads 0..127 add the partial 128 slots above. |
| BB25_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB25_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB25_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| add.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB25_15: |
| bar.sync 0; |
| |
| // Tree step for %ntid.x >= 128: threads 0..63 add the partial 64 slots above. |
| BB25_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB25_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB25_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| add.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB25_19: |
| bar.sync 0; |
| |
| // Final steps: only threads 0..31 continue. They use volatile shared accesses and |
| // no bar.sync is issued between the remaining steps (offsets 32, 16, 8, 4, 2, 1 slots). |
| BB25_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB25_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB25_23; |
| |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| add.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB25_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB25_25; |
| |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| add.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB25_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB25_27; |
| |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| add.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB25_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB25_29; |
| |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| add.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB25_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB25_31; |
| |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| add.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB25_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB25_33; |
| |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| add.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| // Thread 0 writes the row result: out[%ctaid.x] = memory[0]. |
| BB25_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB25_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f40; |
| |
| BB25_35: |
| ret; |
| } |
| |
| |