| // |
| // Generated by NVIDIA NVVM Compiler |
| // |
| // Compiler Build ID: CL-27506705 |
| // Cuda compilation tools, release 10.2, V10.2.89 |
| // Based on LLVM 3.4svn |
| // |
| |
| .version 6.5 |
| .target sm_30 |
| .address_size 64 |
| |
| // .globl double2float_f |
| // Forward declaration only: takes two .b64 parameters and returns one .b64 |
| // value. The body is not in the visible part of this file, and none of the |
| // kernels visible here call it. |
| .func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd |
| ( |
| .param .b64 __internal_trig_reduction_slowpathd_param_0, |
| .param .b64 __internal_trig_reduction_slowpathd_param_1 |
| ) |
| ; |
| // Forward declaration only: takes two .b64 parameters and returns one .b64 |
| // value. The body is not in the visible part of this file, and none of the |
| // kernels visible here call it. |
| .func (.param .b64 func_retval0) __internal_accurate_pow |
| ( |
| .param .b64 __internal_accurate_pow_param_0, |
| .param .b64 __internal_accurate_pow_param_1 |
| ) |
| ; |
| // Unsized, externally defined shared-memory byte array. None of the kernels |
| // in the visible part of this file reference it. |
| .extern .shared .align 1 .b8 memory[]; |
| // Constant-space byte tables (24, 144 and 128 bytes). None of the kernels in |
| // the visible part of this file reference them. |
| // NOTE(review): judging by the names only, these look like 2/pi bits and |
| // sin/cos polynomial coefficients for the __internal_* helpers -- confirm. |
| .const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; |
| .const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; |
| .const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63}; |
| |
| // double2float_f: element-wise f64 -> f32 conversion. |
| // For i = ntid.x * ctaid.x + tid.x, when i < param_2 (signed compare), the |
| // f64 at param_0 + 8*i is converted with cvt.rn and stored as an f32 at |
| // param_1 + 4*i. Both pointers are converted to global addresses first. |
| .visible .entry double2float_f( |
| .param .u64 double2float_f_param_0, |
| .param .u64 double2float_f_param_1, |
| .param .u32 double2float_f_param_2 |
| ) |
| { |
| .reg .pred %p_in; |
| .reg .f32 %f_out; |
| .reg .f64 %fd_in; |
| .reg .b32 %r_count, %r_ntid, %r_ctaid, %r_tid, %r_idx; |
| .reg .b64 %rd_src, %rd_dst, %rd_soff, %rd_doff; |
| |
| // i = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_ctaid, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_idx, %r_ntid, %r_ctaid, %r_tid; |
| |
| // Nothing to do unless i < param_2. |
| ld.param.u32 %r_count, [double2float_f_param_2]; |
| setp.lt.s32 %p_in, %r_idx, %r_count; |
| @!%p_in bra D2F_DONE; |
| |
| // Load the f64 source element. |
| ld.param.u64 %rd_src, [double2float_f_param_0]; |
| cvta.to.global.u64 %rd_src, %rd_src; |
| mul.wide.s32 %rd_soff, %r_idx, 8; |
| add.s64 %rd_src, %rd_src, %rd_soff; |
| ld.global.f64 %fd_in, [%rd_src]; |
| |
| // Narrow it and store the f32 result. |
| cvt.rn.f32.f64 %f_out, %fd_in; |
| ld.param.u64 %rd_dst, [double2float_f_param_1]; |
| cvta.to.global.u64 %rd_dst, %rd_dst; |
| mul.wide.s32 %rd_doff, %r_idx, 4; |
| add.s64 %rd_dst, %rd_dst, %rd_doff; |
| st.global.f32 [%rd_dst], %f_out; |
| |
| D2F_DONE: |
| ret; |
| } |
| |
| // .globl float2double_f |
| // float2double_f: element-wise f32 -> f64 conversion. |
| // For i = ntid.x * ctaid.x + tid.x, when i < param_2 (signed compare), the |
| // f32 at param_0 + 4*i is widened with cvt.f64.f32 and stored as an f64 at |
| // param_1 + 8*i. Both pointers are converted to global addresses first. |
| .visible .entry float2double_f( |
| .param .u64 float2double_f_param_0, |
| .param .u64 float2double_f_param_1, |
| .param .u32 float2double_f_param_2 |
| ) |
| { |
| .reg .pred %p_in; |
| .reg .f32 %f_in; |
| .reg .f64 %fd_out; |
| .reg .b32 %r_count, %r_ntid, %r_ctaid, %r_tid, %r_idx; |
| .reg .b64 %rd_src, %rd_dst, %rd_soff, %rd_doff; |
| |
| // i = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_ctaid, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_idx, %r_ntid, %r_ctaid, %r_tid; |
| |
| // Nothing to do unless i < param_2. |
| ld.param.u32 %r_count, [float2double_f_param_2]; |
| setp.lt.s32 %p_in, %r_idx, %r_count; |
| @!%p_in bra F2D_DONE; |
| |
| // Load the f32 source element. |
| ld.param.u64 %rd_src, [float2double_f_param_0]; |
| cvta.to.global.u64 %rd_src, %rd_src; |
| mul.wide.s32 %rd_soff, %r_idx, 4; |
| add.s64 %rd_src, %rd_src, %rd_soff; |
| ld.global.f32 %f_in, [%rd_src]; |
| |
| // Widen it and store the f64 result. |
| cvt.f64.f32 %fd_out, %f_in; |
| ld.param.u64 %rd_dst, [float2double_f_param_1]; |
| cvta.to.global.u64 %rd_dst, %rd_dst; |
| mul.wide.s32 %rd_doff, %r_idx, 8; |
| add.s64 %rd_dst, %rd_dst, %rd_doff; |
| st.global.f64 [%rd_dst], %fd_out; |
| |
| F2D_DONE: |
| ret; |
| } |
| |
| // .globl cumulative_sum_up_sweep_d |
| // Up-sweep of a blocked cumulative sum over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with add.f64, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_sum_up_sweep_d( |
| .param .u64 cumulative_sum_up_sweep_d_param_0, |
| .param .u64 cumulative_sum_up_sweep_d_param_1, |
| .param .u32 cumulative_sum_up_sweep_d_param_2, |
| .param .u32 cumulative_sum_up_sweep_d_param_3, |
| .param .u32 cumulative_sum_up_sweep_d_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<20>; |
| .reg .f64 %fd<8>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_sum_up_sweep_d_param_0]; |
| ld.param.u64 %rd2, [cumulative_sum_up_sweep_d_param_1]; |
| ld.param.u32 %r7, [cumulative_sum_up_sweep_d_param_2]; |
| ld.param.u32 %r8, [cumulative_sum_up_sweep_d_param_3]; |
| ld.param.u32 %r9, [cumulative_sum_up_sweep_d_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB2_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 8; |
| add.s64 %rd5, %rd1, %rd4; |
| // %fd7 is the accumulator, seeded with the first element. |
| ld.global.f64 %fd7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB2_3; |
| |
| BB2_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 8; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f64 %fd5, [%rd7]; |
| add.f64 %fd7, %fd7, %fd5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB2_2; |
| |
| BB2_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 8; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f64 [%rd10], %fd7; |
| |
| BB2_4: |
| ret; |
| } |
| |
| // .globl cumulative_sum_up_sweep_f |
| // Up-sweep of a blocked cumulative sum over f32 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with add.f32, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_sum_up_sweep_f( |
| .param .u64 cumulative_sum_up_sweep_f_param_0, |
| .param .u64 cumulative_sum_up_sweep_f_param_1, |
| .param .u32 cumulative_sum_up_sweep_f_param_2, |
| .param .u32 cumulative_sum_up_sweep_f_param_3, |
| .param .u32 cumulative_sum_up_sweep_f_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<8>; |
| .reg .b32 %r<20>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_sum_up_sweep_f_param_0]; |
| ld.param.u64 %rd2, [cumulative_sum_up_sweep_f_param_1]; |
| ld.param.u32 %r7, [cumulative_sum_up_sweep_f_param_2]; |
| ld.param.u32 %r8, [cumulative_sum_up_sweep_f_param_3]; |
| ld.param.u32 %r9, [cumulative_sum_up_sweep_f_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB3_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 4; |
| add.s64 %rd5, %rd1, %rd4; |
| // %f7 is the accumulator, seeded with the first element. |
| ld.global.f32 %f7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB3_3; |
| |
| BB3_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 4; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f32 %f5, [%rd7]; |
| add.f32 %f7, %f7, %f5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB3_2; |
| |
| BB3_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 4; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f32 [%rd10], %f7; |
| |
| BB3_4: |
| ret; |
| } |
| |
| // .globl cumulative_sum_down_sweep_d |
| // Down-sweep of a blocked cumulative sum over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y starts from a |
| // carry (0.0 when y == 0, otherwise param_2[(y - 1) * param_4 + x]) and walks |
| // the indices i = y * param_4 * param_5 + x + k * param_4, k = 0, 1, ... |
| // At each i it combines param_0[i] into the running value with add.f64 and |
| // stores the running value to param_1[i]. The first index is always |
| // processed; later ones only while i < min(start + param_4 * param_5, |
| // param_3 * param_4). Threads with x > param_4 - 1 (unsigned) exit at once. |
| // NOTE(review): param_2 presumably holds already-scanned up-sweep totals -- |
| // confirm against the host code. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_sum_down_sweep_d( |
| .param .u64 cumulative_sum_down_sweep_d_param_0, |
| .param .u64 cumulative_sum_down_sweep_d_param_1, |
| .param .u64 cumulative_sum_down_sweep_d_param_2, |
| .param .u32 cumulative_sum_down_sweep_d_param_3, |
| .param .u32 cumulative_sum_down_sweep_d_param_4, |
| .param .u32 cumulative_sum_down_sweep_d_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .b32 %r<21>; |
| .reg .f64 %fd<11>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_sum_down_sweep_d_param_0]; |
| ld.param.u64 %rd5, [cumulative_sum_down_sweep_d_param_1]; |
| ld.param.u64 %rd3, [cumulative_sum_down_sweep_d_param_2]; |
| ld.param.u32 %r7, [cumulative_sum_down_sweep_d_param_3]; |
| ld.param.u32 %r8, [cumulative_sum_down_sweep_d_param_4]; |
| ld.param.u32 %r9, [cumulative_sum_down_sweep_d_param_5]; |
| // %rd1 = output base (param_1), %rd2 = input base (param_0). |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_4 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB4_5; |
| |
| // Carry %fd9 defaults to 0.0 and is kept for ctaid.y == 0. |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f64 %fd9, 0d0000000000000000; |
| @%p2 bra BB4_3; |
| |
| // Otherwise the carry is param_2[(ctaid.y - 1) * param_4 + x]. |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f64 %fd9, [%rd8]; |
| |
| BB4_3: |
| // %r16 = param_5 * param_4; %r17 = start index = %r16 * ctaid.y + x. |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 8; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f64 %fd7, [%rd10]; |
| // First output element = carry combined with the first input element. |
| add.f64 %fd10, %fd9, %fd7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f64 [%rd11], %fd10; |
| // %r3 = loop bound = min(start + %r16, param_4 * param_3). |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| // %r20 = next index; done if it is already at or past the bound. |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB4_5; |
| |
| BB4_4: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd12, %r20, 8; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f64 %fd8, [%rd13]; |
| add.f64 %fd10, %fd10, %fd8; |
| // The same byte offset addresses the output element. |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f64 [%rd14], %fd10; |
| // Advance by the stride param_4 and loop while below the bound. |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB4_4; |
| |
| BB4_5: |
| ret; |
| } |
| |
| // .globl cumulative_sum_down_sweep_f |
| // Down-sweep of a blocked cumulative sum over f32 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y starts from a |
| // carry (0.0 when y == 0, otherwise param_2[(y - 1) * param_4 + x]) and walks |
| // the indices i = y * param_4 * param_5 + x + k * param_4, k = 0, 1, ... |
| // At each i it combines param_0[i] into the running value with add.f32 and |
| // stores the running value to param_1[i]. The first index is always |
| // processed; later ones only while i < min(start + param_4 * param_5, |
| // param_3 * param_4). Threads with x > param_4 - 1 (unsigned) exit at once. |
| // NOTE(review): param_2 presumably holds already-scanned up-sweep totals -- |
| // confirm against the host code. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_sum_down_sweep_f( |
| .param .u64 cumulative_sum_down_sweep_f_param_0, |
| .param .u64 cumulative_sum_down_sweep_f_param_1, |
| .param .u64 cumulative_sum_down_sweep_f_param_2, |
| .param .u32 cumulative_sum_down_sweep_f_param_3, |
| .param .u32 cumulative_sum_down_sweep_f_param_4, |
| .param .u32 cumulative_sum_down_sweep_f_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<11>; |
| .reg .b32 %r<21>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_sum_down_sweep_f_param_0]; |
| ld.param.u64 %rd5, [cumulative_sum_down_sweep_f_param_1]; |
| ld.param.u64 %rd3, [cumulative_sum_down_sweep_f_param_2]; |
| ld.param.u32 %r7, [cumulative_sum_down_sweep_f_param_3]; |
| ld.param.u32 %r8, [cumulative_sum_down_sweep_f_param_4]; |
| ld.param.u32 %r9, [cumulative_sum_down_sweep_f_param_5]; |
| // %rd1 = output base (param_1), %rd2 = input base (param_0). |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_4 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB5_5; |
| |
| // Carry %f9 defaults to 0.0 and is kept for ctaid.y == 0. |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f32 %f9, 0f00000000; |
| @%p2 bra BB5_3; |
| |
| // Otherwise the carry is param_2[(ctaid.y - 1) * param_4 + x]. |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f32 %f9, [%rd8]; |
| |
| BB5_3: |
| // %r16 = param_5 * param_4; %r17 = start index = %r16 * ctaid.y + x. |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 4; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f32 %f7, [%rd10]; |
| // First output element = carry combined with the first input element. |
| add.f32 %f10, %f9, %f7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f32 [%rd11], %f10; |
| // %r3 = loop bound = min(start + %r16, param_4 * param_3). |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| // %r20 = next index; done if it is already at or past the bound. |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB5_5; |
| |
| BB5_4: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd12, %r20, 4; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f32 %f8, [%rd13]; |
| add.f32 %f10, %f10, %f8; |
| // The same byte offset addresses the output element. |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f32 [%rd14], %f10; |
| // Advance by the stride param_4 and loop while below the bound. |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB5_4; |
| |
| BB5_5: |
| ret; |
| } |
| |
| // .globl cumulative_prod_up_sweep_d |
| // Up-sweep of a blocked cumulative product over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with mul.f64, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_prod_up_sweep_d( |
| .param .u64 cumulative_prod_up_sweep_d_param_0, |
| .param .u64 cumulative_prod_up_sweep_d_param_1, |
| .param .u32 cumulative_prod_up_sweep_d_param_2, |
| .param .u32 cumulative_prod_up_sweep_d_param_3, |
| .param .u32 cumulative_prod_up_sweep_d_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<20>; |
| .reg .f64 %fd<8>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_prod_up_sweep_d_param_0]; |
| ld.param.u64 %rd2, [cumulative_prod_up_sweep_d_param_1]; |
| ld.param.u32 %r7, [cumulative_prod_up_sweep_d_param_2]; |
| ld.param.u32 %r8, [cumulative_prod_up_sweep_d_param_3]; |
| ld.param.u32 %r9, [cumulative_prod_up_sweep_d_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB6_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 8; |
| add.s64 %rd5, %rd1, %rd4; |
| // %fd7 is the accumulator, seeded with the first element. |
| ld.global.f64 %fd7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB6_3; |
| |
| BB6_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 8; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f64 %fd5, [%rd7]; |
| mul.f64 %fd7, %fd7, %fd5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB6_2; |
| |
| BB6_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 8; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f64 [%rd10], %fd7; |
| |
| BB6_4: |
| ret; |
| } |
| |
| // .globl cumulative_prod_up_sweep_f |
| // Up-sweep of a blocked cumulative product over f32 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with mul.f32, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix 1: this kernel previously loaded, multiplied and stored f64 |
| // values with 8-byte element scaling, i.e. it was a copy of the _d kernel. |
| // It now uses f32 values and 4-byte scaling like the other _f kernels in |
| // this file. The same correction presumably belongs in the CUDA source this |
| // PTX was generated from -- confirm. |
| // Review fix 2: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_prod_up_sweep_f( |
| .param .u64 cumulative_prod_up_sweep_f_param_0, |
| .param .u64 cumulative_prod_up_sweep_f_param_1, |
| .param .u32 cumulative_prod_up_sweep_f_param_2, |
| .param .u32 cumulative_prod_up_sweep_f_param_3, |
| .param .u32 cumulative_prod_up_sweep_f_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<8>; |
| .reg .b32 %r<20>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_prod_up_sweep_f_param_0]; |
| ld.param.u64 %rd2, [cumulative_prod_up_sweep_f_param_1]; |
| ld.param.u32 %r7, [cumulative_prod_up_sweep_f_param_2]; |
| ld.param.u32 %r8, [cumulative_prod_up_sweep_f_param_3]; |
| ld.param.u32 %r9, [cumulative_prod_up_sweep_f_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB7_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 4; |
| add.s64 %rd5, %rd1, %rd4; |
| // %f7 is the accumulator, seeded with the first element. |
| ld.global.f32 %f7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB7_3; |
| |
| BB7_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 4; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f32 %f5, [%rd7]; |
| mul.f32 %f7, %f7, %f5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB7_2; |
| |
| BB7_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 4; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f32 [%rd10], %f7; |
| |
| BB7_4: |
| ret; |
| } |
| |
| // .globl cumulative_prod_down_sweep_d |
| // Down-sweep of a blocked cumulative product over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y starts from a |
| // carry (1.0 when y == 0, otherwise param_2[(y - 1) * param_4 + x]) and walks |
| // the indices i = y * param_4 * param_5 + x + k * param_4, k = 0, 1, ... |
| // At each i it combines param_0[i] into the running value with mul.f64 and |
| // stores the running value to param_1[i]. The first index is always |
| // processed; later ones only while i < min(start + param_4 * param_5, |
| // param_3 * param_4). Threads with x > param_4 - 1 (unsigned) exit at once. |
| // NOTE(review): param_2 presumably holds already-scanned up-sweep totals -- |
| // confirm against the host code. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_prod_down_sweep_d( |
| .param .u64 cumulative_prod_down_sweep_d_param_0, |
| .param .u64 cumulative_prod_down_sweep_d_param_1, |
| .param .u64 cumulative_prod_down_sweep_d_param_2, |
| .param .u32 cumulative_prod_down_sweep_d_param_3, |
| .param .u32 cumulative_prod_down_sweep_d_param_4, |
| .param .u32 cumulative_prod_down_sweep_d_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .b32 %r<21>; |
| .reg .f64 %fd<11>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_prod_down_sweep_d_param_0]; |
| ld.param.u64 %rd5, [cumulative_prod_down_sweep_d_param_1]; |
| ld.param.u64 %rd3, [cumulative_prod_down_sweep_d_param_2]; |
| ld.param.u32 %r7, [cumulative_prod_down_sweep_d_param_3]; |
| ld.param.u32 %r8, [cumulative_prod_down_sweep_d_param_4]; |
| ld.param.u32 %r9, [cumulative_prod_down_sweep_d_param_5]; |
| // %rd1 = output base (param_1), %rd2 = input base (param_0). |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_4 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB8_5; |
| |
| // Carry %fd9 defaults to 1.0 and is kept for ctaid.y == 0. |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f64 %fd9, 0d3FF0000000000000; |
| @%p2 bra BB8_3; |
| |
| // Otherwise the carry is param_2[(ctaid.y - 1) * param_4 + x]. |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f64 %fd9, [%rd8]; |
| |
| BB8_3: |
| // %r16 = param_5 * param_4; %r17 = start index = %r16 * ctaid.y + x. |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 8; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f64 %fd7, [%rd10]; |
| // First output element = carry combined with the first input element. |
| mul.f64 %fd10, %fd9, %fd7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f64 [%rd11], %fd10; |
| // %r3 = loop bound = min(start + %r16, param_4 * param_3). |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| // %r20 = next index; done if it is already at or past the bound. |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB8_5; |
| |
| BB8_4: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd12, %r20, 8; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f64 %fd8, [%rd13]; |
| mul.f64 %fd10, %fd10, %fd8; |
| // The same byte offset addresses the output element. |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f64 [%rd14], %fd10; |
| // Advance by the stride param_4 and loop while below the bound. |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB8_4; |
| |
| BB8_5: |
| ret; |
| } |
| |
| // .globl cumulative_prod_down_sweep_f |
| // Down-sweep of a blocked cumulative product over f32 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y starts from a |
| // carry (1.0 when y == 0, otherwise param_2[(y - 1) * param_4 + x]) and walks |
| // the indices i = y * param_4 * param_5 + x + k * param_4, k = 0, 1, ... |
| // At each i it combines param_0[i] into the running value with mul.f32 and |
| // stores the running value to param_1[i]. The first index is always |
| // processed; later ones only while i < min(start + param_4 * param_5, |
| // param_3 * param_4). Threads with x > param_4 - 1 (unsigned) exit at once. |
| // NOTE(review): param_2 presumably holds already-scanned up-sweep totals -- |
| // confirm against the host code. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_prod_down_sweep_f( |
| .param .u64 cumulative_prod_down_sweep_f_param_0, |
| .param .u64 cumulative_prod_down_sweep_f_param_1, |
| .param .u64 cumulative_prod_down_sweep_f_param_2, |
| .param .u32 cumulative_prod_down_sweep_f_param_3, |
| .param .u32 cumulative_prod_down_sweep_f_param_4, |
| .param .u32 cumulative_prod_down_sweep_f_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<11>; |
| .reg .b32 %r<21>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_prod_down_sweep_f_param_0]; |
| ld.param.u64 %rd5, [cumulative_prod_down_sweep_f_param_1]; |
| ld.param.u64 %rd3, [cumulative_prod_down_sweep_f_param_2]; |
| ld.param.u32 %r7, [cumulative_prod_down_sweep_f_param_3]; |
| ld.param.u32 %r8, [cumulative_prod_down_sweep_f_param_4]; |
| ld.param.u32 %r9, [cumulative_prod_down_sweep_f_param_5]; |
| // %rd1 = output base (param_1), %rd2 = input base (param_0). |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_4 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB9_5; |
| |
| // Carry %f9 defaults to 1.0 and is kept for ctaid.y == 0. |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f32 %f9, 0f3F800000; |
| @%p2 bra BB9_3; |
| |
| // Otherwise the carry is param_2[(ctaid.y - 1) * param_4 + x]. |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f32 %f9, [%rd8]; |
| |
| BB9_3: |
| // %r16 = param_5 * param_4; %r17 = start index = %r16 * ctaid.y + x. |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 4; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f32 %f7, [%rd10]; |
| // First output element = carry combined with the first input element. |
| mul.f32 %f10, %f9, %f7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f32 [%rd11], %f10; |
| // %r3 = loop bound = min(start + %r16, param_4 * param_3). |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| // %r20 = next index; done if it is already at or past the bound. |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB9_5; |
| |
| BB9_4: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd12, %r20, 4; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f32 %f8, [%rd13]; |
| mul.f32 %f10, %f10, %f8; |
| // The same byte offset addresses the output element. |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f32 [%rd14], %f10; |
| // Advance by the stride param_4 and loop while below the bound. |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB9_4; |
| |
| BB9_5: |
| ret; |
| } |
| |
| // .globl cumulative_min_up_sweep_d |
| // Up-sweep of a blocked cumulative minimum over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with min.f64, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_min_up_sweep_d( |
| .param .u64 cumulative_min_up_sweep_d_param_0, |
| .param .u64 cumulative_min_up_sweep_d_param_1, |
| .param .u32 cumulative_min_up_sweep_d_param_2, |
| .param .u32 cumulative_min_up_sweep_d_param_3, |
| .param .u32 cumulative_min_up_sweep_d_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<20>; |
| .reg .f64 %fd<8>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_min_up_sweep_d_param_0]; |
| ld.param.u64 %rd2, [cumulative_min_up_sweep_d_param_1]; |
| ld.param.u32 %r7, [cumulative_min_up_sweep_d_param_2]; |
| ld.param.u32 %r8, [cumulative_min_up_sweep_d_param_3]; |
| ld.param.u32 %r9, [cumulative_min_up_sweep_d_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB10_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 8; |
| add.s64 %rd5, %rd1, %rd4; |
| // %fd7 is the accumulator, seeded with the first element. |
| ld.global.f64 %fd7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB10_3; |
| |
| BB10_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 8; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f64 %fd5, [%rd7]; |
| min.f64 %fd7, %fd7, %fd5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB10_2; |
| |
| BB10_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 8; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f64 [%rd10], %fd7; |
| |
| BB10_4: |
| ret; |
| } |
| |
| // .globl cumulative_min_up_sweep_f |
| // Up-sweep of a blocked cumulative minimum over f32 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with min.f32, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_min_up_sweep_f( |
| .param .u64 cumulative_min_up_sweep_f_param_0, |
| .param .u64 cumulative_min_up_sweep_f_param_1, |
| .param .u32 cumulative_min_up_sweep_f_param_2, |
| .param .u32 cumulative_min_up_sweep_f_param_3, |
| .param .u32 cumulative_min_up_sweep_f_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<8>; |
| .reg .b32 %r<20>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_min_up_sweep_f_param_0]; |
| ld.param.u64 %rd2, [cumulative_min_up_sweep_f_param_1]; |
| ld.param.u32 %r7, [cumulative_min_up_sweep_f_param_2]; |
| ld.param.u32 %r8, [cumulative_min_up_sweep_f_param_3]; |
| ld.param.u32 %r9, [cumulative_min_up_sweep_f_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB11_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 4; |
| add.s64 %rd5, %rd1, %rd4; |
| // %f7 is the accumulator, seeded with the first element. |
| ld.global.f32 %f7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB11_3; |
| |
| BB11_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 4; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f32 %f5, [%rd7]; |
| min.f32 %f7, %f7, %f5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB11_2; |
| |
| BB11_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 4; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f32 [%rd10], %f7; |
| |
| BB11_4: |
| ret; |
| } |
| |
| // .globl cumulative_min_down_sweep_d |
| // Down-sweep of a blocked cumulative minimum over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y starts from a |
| // carry (+infinity when y == 0, otherwise param_2[(y - 1) * param_4 + x]) and |
| // walks the indices i = y * param_4 * param_5 + x + k * param_4, k = 0, 1, ... |
| // At each i it combines param_0[i] into the running value with min.f64 and |
| // stores the running value to param_1[i]. The first index is always |
| // processed; later ones only while i < min(start + param_4 * param_5, |
| // param_3 * param_4). Threads with x > param_4 - 1 (unsigned) exit at once. |
| // NOTE(review): param_2 presumably holds already-scanned up-sweep totals -- |
| // confirm against the host code. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_min_down_sweep_d( |
| .param .u64 cumulative_min_down_sweep_d_param_0, |
| .param .u64 cumulative_min_down_sweep_d_param_1, |
| .param .u64 cumulative_min_down_sweep_d_param_2, |
| .param .u32 cumulative_min_down_sweep_d_param_3, |
| .param .u32 cumulative_min_down_sweep_d_param_4, |
| .param .u32 cumulative_min_down_sweep_d_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .b32 %r<21>; |
| .reg .f64 %fd<11>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_min_down_sweep_d_param_0]; |
| ld.param.u64 %rd5, [cumulative_min_down_sweep_d_param_1]; |
| ld.param.u64 %rd3, [cumulative_min_down_sweep_d_param_2]; |
| ld.param.u32 %r7, [cumulative_min_down_sweep_d_param_3]; |
| ld.param.u32 %r8, [cumulative_min_down_sweep_d_param_4]; |
| ld.param.u32 %r9, [cumulative_min_down_sweep_d_param_5]; |
| // %rd1 = output base (param_1), %rd2 = input base (param_0). |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_4 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB12_5; |
| |
| // Carry %fd9 defaults to +infinity and is kept for ctaid.y == 0. |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f64 %fd9, 0d7FF0000000000000; |
| @%p2 bra BB12_3; |
| |
| // Otherwise the carry is param_2[(ctaid.y - 1) * param_4 + x]. |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f64 %fd9, [%rd8]; |
| |
| BB12_3: |
| // %r16 = param_5 * param_4; %r17 = start index = %r16 * ctaid.y + x. |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 8; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f64 %fd7, [%rd10]; |
| // First output element = carry combined with the first input element. |
| min.f64 %fd10, %fd9, %fd7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f64 [%rd11], %fd10; |
| // %r3 = loop bound = min(start + %r16, param_4 * param_3). |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| // %r20 = next index; done if it is already at or past the bound. |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB12_5; |
| |
| BB12_4: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd12, %r20, 8; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f64 %fd8, [%rd13]; |
| min.f64 %fd10, %fd10, %fd8; |
| // The same byte offset addresses the output element. |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f64 [%rd14], %fd10; |
| // Advance by the stride param_4 and loop while below the bound. |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB12_4; |
| |
| BB12_5: |
| ret; |
| } |
| |
| // .globl cumulative_min_down_sweep_f |
| // Down-sweep of a blocked cumulative minimum over f32 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y starts from a |
| // carry (+infinity when y == 0, otherwise param_2[(y - 1) * param_4 + x]) and |
| // walks the indices i = y * param_4 * param_5 + x + k * param_4, k = 0, 1, ... |
| // At each i it combines param_0[i] into the running value with min.f32 and |
| // stores the running value to param_1[i]. The first index is always |
| // processed; later ones only while i < min(start + param_4 * param_5, |
| // param_3 * param_4). Threads with x > param_4 - 1 (unsigned) exit at once. |
| // NOTE(review): param_2 presumably holds already-scanned up-sweep totals -- |
| // confirm against the host code. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_min_down_sweep_f( |
| .param .u64 cumulative_min_down_sweep_f_param_0, |
| .param .u64 cumulative_min_down_sweep_f_param_1, |
| .param .u64 cumulative_min_down_sweep_f_param_2, |
| .param .u32 cumulative_min_down_sweep_f_param_3, |
| .param .u32 cumulative_min_down_sweep_f_param_4, |
| .param .u32 cumulative_min_down_sweep_f_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<11>; |
| .reg .b32 %r<21>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_min_down_sweep_f_param_0]; |
| ld.param.u64 %rd5, [cumulative_min_down_sweep_f_param_1]; |
| ld.param.u64 %rd3, [cumulative_min_down_sweep_f_param_2]; |
| ld.param.u32 %r7, [cumulative_min_down_sweep_f_param_3]; |
| ld.param.u32 %r8, [cumulative_min_down_sweep_f_param_4]; |
| ld.param.u32 %r9, [cumulative_min_down_sweep_f_param_5]; |
| // %rd1 = output base (param_1), %rd2 = input base (param_0). |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_4 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB13_5; |
| |
| // Carry %f9 defaults to +infinity and is kept for ctaid.y == 0. |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f32 %f9, 0f7F800000; |
| @%p2 bra BB13_3; |
| |
| // Otherwise the carry is param_2[(ctaid.y - 1) * param_4 + x]. |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f32 %f9, [%rd8]; |
| |
| BB13_3: |
| // %r16 = param_5 * param_4; %r17 = start index = %r16 * ctaid.y + x. |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 4; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f32 %f7, [%rd10]; |
| // First output element = carry combined with the first input element. |
| min.f32 %f10, %f9, %f7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f32 [%rd11], %f10; |
| // %r3 = loop bound = min(start + %r16, param_4 * param_3). |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| // %r20 = next index; done if it is already at or past the bound. |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB13_5; |
| |
| BB13_4: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd12, %r20, 4; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f32 %f8, [%rd13]; |
| min.f32 %f10, %f10, %f8; |
| // The same byte offset addresses the output element. |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f32 [%rd14], %f10; |
| // Advance by the stride param_4 and loop while below the bound. |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB13_4; |
| |
| BB13_5: |
| ret; |
| } |
| |
| // .globl cumulative_max_up_sweep_d |
| // Up-sweep of a blocked cumulative maximum over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with max.f64, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_max_up_sweep_d( |
| .param .u64 cumulative_max_up_sweep_d_param_0, |
| .param .u64 cumulative_max_up_sweep_d_param_1, |
| .param .u32 cumulative_max_up_sweep_d_param_2, |
| .param .u32 cumulative_max_up_sweep_d_param_3, |
| .param .u32 cumulative_max_up_sweep_d_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<20>; |
| .reg .f64 %fd<8>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_max_up_sweep_d_param_0]; |
| ld.param.u64 %rd2, [cumulative_max_up_sweep_d_param_1]; |
| ld.param.u32 %r7, [cumulative_max_up_sweep_d_param_2]; |
| ld.param.u32 %r8, [cumulative_max_up_sweep_d_param_3]; |
| ld.param.u32 %r9, [cumulative_max_up_sweep_d_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB14_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 8; |
| add.s64 %rd5, %rd1, %rd4; |
| // %fd7 is the accumulator, seeded with the first element. |
| ld.global.f64 %fd7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB14_3; |
| |
| BB14_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 8; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f64 %fd5, [%rd7]; |
| max.f64 %fd7, %fd7, %fd5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB14_2; |
| |
| BB14_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 8; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f64 [%rd10], %fd7; |
| |
| BB14_4: |
| ret; |
| } |
| |
| // .globl cumulative_max_up_sweep_f |
| // Up-sweep of a blocked cumulative maximum over f32 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y folds the |
| // elements param_0[y * param_3 * param_4 + x + k * param_3], k = 0, 1, ..., |
| // with max.f32, stopping below min(start + param_3 * param_4, |
| // param_2 * param_3), and stores the result to param_1[y * param_3 + x]. |
| // Threads with x > param_3 - 1 (unsigned) exit without touching memory. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_max_up_sweep_f( |
| .param .u64 cumulative_max_up_sweep_f_param_0, |
| .param .u64 cumulative_max_up_sweep_f_param_1, |
| .param .u32 cumulative_max_up_sweep_f_param_2, |
| .param .u32 cumulative_max_up_sweep_f_param_3, |
| .param .u32 cumulative_max_up_sweep_f_param_4 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<8>; |
| .reg .b32 %r<20>; |
| .reg .b64 %rd<11>; |
| |
| |
| ld.param.u64 %rd3, [cumulative_max_up_sweep_f_param_0]; |
| ld.param.u64 %rd2, [cumulative_max_up_sweep_f_param_1]; |
| ld.param.u32 %r7, [cumulative_max_up_sweep_f_param_2]; |
| ld.param.u32 %r8, [cumulative_max_up_sweep_f_param_3]; |
| ld.param.u32 %r9, [cumulative_max_up_sweep_f_param_4]; |
| cvta.to.global.u64 %rd1, %rd3; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_3 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB15_4; |
| |
| // %r2 = ctaid.y * param_3; %r15 = start index = %r2 * param_4 + x. |
| mov.u32 %r14, %ctaid.y; |
| mul.lo.s32 %r2, %r14, %r8; |
| mad.lo.s32 %r15, %r2, %r9, %r1; |
| mul.wide.u32 %rd4, %r15, 4; |
| add.s64 %rd5, %rd1, %rd4; |
| // %f7 is the accumulator, seeded with the first element. |
| ld.global.f32 %f7, [%rd5]; |
| // %r3 = loop bound = min(start + param_4 * param_3, param_3 * param_2). |
| mad.lo.s32 %r16, %r9, %r8, %r15; |
| mul.lo.s32 %r17, %r8, %r7; |
| min.u32 %r3, %r16, %r17; |
| // %r19 = next index; skip the loop if it is already at or past the bound. |
| add.s32 %r19, %r15, %r8; |
| setp.ge.u32 %p2, %r19, %r3; |
| @%p2 bra BB15_3; |
| |
| BB15_2: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd6, %r19, 4; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f32 %f5, [%rd7]; |
| max.f32 %f7, %f7, %f5; |
| // Advance by the stride param_3 and loop while below the bound. |
| add.s32 %r19, %r19, %r8; |
| setp.lt.u32 %p3, %r19, %r3; |
| @%p3 bra BB15_2; |
| |
| BB15_3: |
| // Store the folded value to param_1[x + ctaid.y * param_3]. |
| add.s32 %r18, %r1, %r2; |
| cvta.to.global.u64 %rd8, %rd2; |
| mul.wide.u32 %rd9, %r18, 4; |
| add.s64 %rd10, %rd8, %rd9; |
| st.global.f32 [%rd10], %f7; |
| |
| BB15_4: |
| ret; |
| } |
| |
| // .globl cumulative_max_down_sweep_d |
| // Down-sweep of a blocked cumulative maximum over f64 data. |
| // Thread x = ntid.x * ctaid.x + tid.x of block row y = ctaid.y starts from a |
| // carry (-infinity when y == 0, otherwise param_2[(y - 1) * param_4 + x]) and |
| // walks the indices i = y * param_4 * param_5 + x + k * param_4, k = 0, 1, ... |
| // At each i it combines param_0[i] into the running value with max.f64 and |
| // stores the running value to param_1[i]. The first index is always |
| // processed; later ones only while i < min(start + param_4 * param_5, |
| // param_3 * param_4). Threads with x > param_4 - 1 (unsigned) exit at once. |
| // NOTE(review): param_2 presumably holds already-scanned up-sweep totals -- |
| // confirm against the host code. |
| // Review fix: the loop index is compared as unsigned, so it is now widened |
| // with mul.wide.u32 instead of mul.wide.s32, which turned indices >= 2^31 |
| // into negative byte offsets. |
| .visible .entry cumulative_max_down_sweep_d( |
| .param .u64 cumulative_max_down_sweep_d_param_0, |
| .param .u64 cumulative_max_down_sweep_d_param_1, |
| .param .u64 cumulative_max_down_sweep_d_param_2, |
| .param .u32 cumulative_max_down_sweep_d_param_3, |
| .param .u32 cumulative_max_down_sweep_d_param_4, |
| .param .u32 cumulative_max_down_sweep_d_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .b32 %r<21>; |
| .reg .f64 %fd<11>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_max_down_sweep_d_param_0]; |
| ld.param.u64 %rd5, [cumulative_max_down_sweep_d_param_1]; |
| ld.param.u64 %rd3, [cumulative_max_down_sweep_d_param_2]; |
| ld.param.u32 %r7, [cumulative_max_down_sweep_d_param_3]; |
| ld.param.u32 %r8, [cumulative_max_down_sweep_d_param_4]; |
| ld.param.u32 %r9, [cumulative_max_down_sweep_d_param_5]; |
| // %rd1 = output base (param_1), %rd2 = input base (param_0). |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // %r1 = x = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Exit when x > param_4 - 1 (unsigned compare). |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB16_5; |
| |
| // Carry %fd9 defaults to -infinity and is kept for ctaid.y == 0. |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f64 %fd9, 0dFFF0000000000000; |
| @%p2 bra BB16_3; |
| |
| // Otherwise the carry is param_2[(ctaid.y - 1) * param_4 + x]. |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f64 %fd9, [%rd8]; |
| |
| BB16_3: |
| // %r16 = param_5 * param_4; %r17 = start index = %r16 * ctaid.y + x. |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 8; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f64 %fd7, [%rd10]; |
| // First output element = carry combined with the first input element. |
| max.f64 %fd10, %fd9, %fd7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f64 [%rd11], %fd10; |
| // %r3 = loop bound = min(start + %r16, param_4 * param_3). |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| // %r20 = next index; done if it is already at or past the bound. |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB16_5; |
| |
| BB16_4: |
| // Zero-extend the index so it agrees with the unsigned loop compare. |
| mul.wide.u32 %rd12, %r20, 8; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f64 %fd8, [%rd13]; |
| max.f64 %fd10, %fd10, %fd8; |
| // The same byte offset addresses the output element. |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f64 [%rd14], %fd10; |
| // Advance by the stride param_4 and loop while below the bound. |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB16_4; |
| |
| BB16_5: |
| ret; |
| } |
| |
| // .globl cumulative_max_down_sweep_f |
| // Kernel cumulative_max_down_sweep_f (f32): each thread writes a running maximum over one |
| // strided run of the input, seeded with a carry value taken from param_2. |
| //   param_0: input pointer, read as f32 through the global address space |
| //   param_1: output pointer, written as f32 at the same indices that are read from param_0 |
| //   param_2: carry-in pointer, read as f32 only when ctaid.y is not 0 |
| //   param_3: u32, multiplied by param_4 to form the upper bound on indices |
| //   param_4: u32 used as thread bound, element stride and carry row width |
| //   param_5: u32 number of strided steps covered per ctaid.y |
| //   NOTE(review): param_3/param_4/param_5 look like rows/cols/block height and param_2 like the |
| //   per-block maxima produced by an earlier pass - confirm in the CUDA source. |
| .visible .entry cumulative_max_down_sweep_f( |
| .param .u64 cumulative_max_down_sweep_f_param_0, |
| .param .u64 cumulative_max_down_sweep_f_param_1, |
| .param .u64 cumulative_max_down_sweep_f_param_2, |
| .param .u32 cumulative_max_down_sweep_f_param_3, |
| .param .u32 cumulative_max_down_sweep_f_param_4, |
| .param .u32 cumulative_max_down_sweep_f_param_5 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<11>; |
| .reg .b32 %r<21>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd4, [cumulative_max_down_sweep_f_param_0]; |
| ld.param.u64 %rd5, [cumulative_max_down_sweep_f_param_1]; |
| ld.param.u64 %rd3, [cumulative_max_down_sweep_f_param_2]; |
| ld.param.u32 %r7, [cumulative_max_down_sweep_f_param_3]; |
| ld.param.u32 %r8, [cumulative_max_down_sweep_f_param_4]; |
| ld.param.u32 %r9, [cumulative_max_down_sweep_f_param_5]; |
| // rd1 = output base, rd2 = input base |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| // r1 = ntid.x * ctaid.x + tid.x |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r1, %r10, %r11, %r12; |
| // Leave when r1 > param_4 - 1 (unsigned compare) |
| add.s32 %r13, %r8, -1; |
| setp.gt.u32 %p1, %r1, %r13; |
| @%p1 bra BB17_5; |
| |
| // Carry-in f9 defaults to negative infinity (0fFF800000) and stays that way for ctaid.y == 0 |
| mov.u32 %r2, %ctaid.y; |
| setp.eq.s32 %p2, %r2, 0; |
| mov.f32 %f9, 0fFF800000; |
| @%p2 bra BB17_3; |
| |
| // Otherwise f9 = param_2[(ctaid.y - 1) * param_4 + r1] |
| add.s32 %r14, %r2, -1; |
| mad.lo.s32 %r15, %r14, %r8, %r1; |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r15, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f32 %f9, [%rd8]; |
| |
| BB17_3: |
| // First element: index r17 = param_5 * param_4 * ctaid.y + r1, output[r17] = max(carry, input[r17]) |
| mul.lo.s32 %r16, %r9, %r8; |
| mad.lo.s32 %r17, %r16, %r2, %r1; |
| mul.wide.u32 %rd9, %r17, 4; |
| add.s64 %rd10, %rd2, %rd9; |
| ld.global.f32 %f7, [%rd10]; |
| max.f32 %f10, %f9, %f7; |
| add.s64 %rd11, %rd1, %rd9; |
| st.global.f32 [%rd11], %f10; |
| // r3 = min(r17 + param_5 * param_4, param_4 * param_3) is the exclusive end index, r20 is the next index |
| mul.lo.s32 %r18, %r8, %r7; |
| add.s32 %r19, %r17, %r16; |
| min.u32 %r3, %r19, %r18; |
| add.s32 %r20, %r17, %r8; |
| setp.ge.u32 %p3, %r20, %r3; |
| @%p3 bra BB17_5; |
| |
| BB17_4: |
| // Loop: f10 = max(f10, input[r20]), output[r20] = f10, then advance r20 by param_4 while r20 < r3 |
| mul.wide.s32 %rd12, %r20, 4; |
| add.s64 %rd13, %rd2, %rd12; |
| ld.global.f32 %f8, [%rd13]; |
| max.f32 %f10, %f10, %f8; |
| add.s64 %rd14, %rd1, %rd12; |
| st.global.f32 [%rd14], %f10; |
| add.s32 %r20, %r20, %r8; |
| setp.lt.u32 %p4, %r20, %r3; |
| @%p4 bra BB17_4; |
| |
| BB17_5: |
| ret; |
| } |
| |
| // .globl cumulative_sum_prod_d |
| // Kernel cumulative_sum_prod_d (f64): sequentially folds a range of input pairs {a, b} with |
| //   acc  = acc * b + a   (kept in fd105) |
| //   prod = prod * b      (kept in fd104) |
| // The range is chosen from ctaid.x only. No tid register is read, so every thread of a CTA |
| // executes the same loads and stores. |
| //   param_0: input pointer, read as v2.f64 pairs at pair index i (element index 2 * i) |
| //   param_1: output pointer, written as pairs (param_6 == 2) or as scalars (param_6 == 3) |
| //   param_2: carry-in pointer, the f64 at index 2 * ctaid.x - 2 is read when param_6 > 1 and ctaid.x > 0 |
| //   param_3: optional pointer (may be 0), receives the final {acc, prod} pair at pair index ctaid.x |
| //   param_4: u32 exclusive upper bound on pair indices |
| //   param_5: u32 number of pairs per ctaid.x |
| //   param_6: u32 mode selector: values above 1 apply the carry-in, 2 stores pairs, 3 stores acc only, |
| //            any other value stores nothing to param_1 |
| //   NOTE(review): the mode values presumably map to a phase enum in the CUDA source - confirm. |
| .visible .entry cumulative_sum_prod_d( |
| .param .u64 cumulative_sum_prod_d_param_0, |
| .param .u64 cumulative_sum_prod_d_param_1, |
| .param .u64 cumulative_sum_prod_d_param_2, |
| .param .u64 cumulative_sum_prod_d_param_3, |
| .param .u32 cumulative_sum_prod_d_param_4, |
| .param .u32 cumulative_sum_prod_d_param_5, |
| .param .u32 cumulative_sum_prod_d_param_6 |
| ) |
| { |
| .reg .pred %p<18>; |
| .reg .b32 %r<62>; |
| .reg .f64 %fd<106>; |
| .reg .b64 %rd<46>; |
| |
| |
| ld.param.u64 %rd6, [cumulative_sum_prod_d_param_0]; |
| ld.param.u64 %rd7, [cumulative_sum_prod_d_param_1]; |
| ld.param.u64 %rd4, [cumulative_sum_prod_d_param_2]; |
| ld.param.u64 %rd5, [cumulative_sum_prod_d_param_3]; |
| ld.param.u32 %r28, [cumulative_sum_prod_d_param_4]; |
| ld.param.u32 %r29, [cumulative_sum_prod_d_param_5]; |
| ld.param.u32 %r30, [cumulative_sum_prod_d_param_6]; |
| // rd1 = output base, rd2 = input base |
| cvta.to.global.u64 %rd1, %rd7; |
| cvta.to.global.u64 %rd2, %rd6; |
| // r2 = ctaid.x * param_5 is the first pair index, leave when r2 > param_4 - 1 (unsigned compare) |
| mov.u32 %r1, %ctaid.x; |
| mul.lo.s32 %r2, %r1, %r29; |
| add.s32 %r31, %r28, -1; |
| setp.gt.u32 %p1, %r2, %r31; |
| @%p1 bra BB18_30; |
| |
| // r3 = min(r2 + param_5, param_4) is the exclusive end, rd3 points at input pair r2 |
| add.s32 %r32, %r2, %r29; |
| min.u32 %r3, %r32, %r28; |
| shl.b32 %r33, %r2, 1; |
| mul.wide.u32 %rd8, %r33, 8; |
| add.s64 %rd3, %rd2, %rd8; |
| // param_6 > 1 takes the carry-in path (BB18_3), otherwise the first pair is used as is (BB18_2) |
| setp.gt.u32 %p2, %r30, 1; |
| @%p2 bra BB18_3; |
| bra.uni BB18_2; |
| |
| BB18_3: |
| // Carry fd87 is 0.0 when 2 * ctaid.x - 2 is negative, else it is loaded from param_2 at that index |
| shl.b32 %r34, %r1, 1; |
| add.s32 %r4, %r34, -2; |
| mov.f64 %fd87, 0d0000000000000000; |
| setp.lt.s32 %p3, %r4, 0; |
| @%p3 bra BB18_5; |
| |
| cvta.to.global.u64 %rd9, %rd4; |
| mul.wide.s32 %rd10, %r4, 8; |
| add.s64 %rd11, %rd9, %rd10; |
| ld.global.f64 %fd87, [%rd11]; |
| |
| BB18_5: |
| // First pair with carry: acc = carry * b + a, prod = b |
| ld.global.v2.f64 {%fd40, %fd104}, [%rd3]; |
| fma.rn.f64 %fd105, %fd87, %fd104, %fd40; |
| bra.uni BB18_6; |
| |
| BB18_2: |
| // First pair without carry: acc = a, prod = b |
| ld.global.v2.f64 {%fd105, %fd104}, [%rd3]; |
| |
| BB18_6: |
| // p4 = (param_6 == 2) and is reused below to pick the loop variant |
| setp.eq.s32 %p4, %r30, 2; |
| @%p4 bra BB18_9; |
| bra.uni BB18_7; |
| |
| BB18_9: |
| // Mode 2: store {acc, prod} at output pair r2 |
| add.s64 %rd15, %rd1, %rd8; |
| st.global.v2.f64 [%rd15], {%fd105, %fd104}; |
| bra.uni BB18_10; |
| |
| BB18_7: |
| // Mode 3: store acc alone at output scalar index r2, other modes store nothing here |
| setp.ne.s32 %p5, %r30, 3; |
| @%p5 bra BB18_10; |
| |
| mul.wide.u32 %rd12, %r2, 8; |
| add.s64 %rd13, %rd1, %rd12; |
| st.global.f64 [%rd13], %fd105; |
| |
| BB18_10: |
| // r5 = next pair index, go straight to the epilogue when the range holds a single pair |
| add.s32 %r5, %r2, 1; |
| setp.ge.u32 %p6, %r5, %r3; |
| @%p6 bra BB18_27; |
| |
| @%p4 bra BB18_15; |
| bra.uni BB18_12; |
| |
| BB18_15: |
| // Mode 2 loop, unrolled by 4. r11 = r3 - 1 - r2 pairs remain and r11 & 3 of them are peeled first |
| add.s32 %r39, %r3, -1; |
| sub.s32 %r11, %r39, %r2; |
| and.b32 %r38, %r11, 3; |
| mov.f64 %fd48, 0d0000000000000000; |
| setp.eq.s32 %p10, %r38, 0; |
| @%p10 bra BB18_16; |
| |
| setp.eq.s32 %p11, %r38, 1; |
| @%p11 bra BB18_21; |
| |
| setp.eq.s32 %p12, %r38, 2; |
| @%p12 bra BB18_20; |
| |
| // Remainder 3: first peeled pair |
| shl.b32 %r40, %r5, 1; |
| mul.wide.u32 %rd20, %r40, 8; |
| add.s64 %rd21, %rd2, %rd20; |
| ld.global.v2.f64 {%fd49, %fd50}, [%rd21]; |
| mul.f64 %fd104, %fd104, %fd50; |
| fma.rn.f64 %fd105, %fd105, %fd50, %fd49; |
| add.s64 %rd22, %rd1, %rd20; |
| st.global.v2.f64 [%rd22], {%fd105, %fd104}; |
| add.s32 %r5, %r2, 2; |
| |
| BB18_20: |
| // Remainder 2 or more: next peeled pair |
| shl.b32 %r41, %r5, 1; |
| mul.wide.u32 %rd23, %r41, 8; |
| add.s64 %rd24, %rd2, %rd23; |
| ld.global.v2.f64 {%fd53, %fd54}, [%rd24]; |
| mul.f64 %fd104, %fd104, %fd54; |
| fma.rn.f64 %fd105, %fd105, %fd54, %fd53; |
| add.s64 %rd25, %rd1, %rd23; |
| st.global.v2.f64 [%rd25], {%fd105, %fd104}; |
| add.s32 %r5, %r5, 1; |
| |
| BB18_21: |
| // Remainder 1 or more: last peeled pair, the state is also kept in fd97 (acc) and fd96 (prod) |
| shl.b32 %r42, %r5, 1; |
| mul.wide.u32 %rd26, %r42, 8; |
| add.s64 %rd27, %rd2, %rd26; |
| ld.global.v2.f64 {%fd57, %fd58}, [%rd27]; |
| mul.f64 %fd96, %fd104, %fd58; |
| fma.rn.f64 %fd97, %fd105, %fd58, %fd57; |
| add.s64 %rd28, %rd1, %rd26; |
| st.global.v2.f64 [%rd28], {%fd97, %fd96}; |
| add.s32 %r5, %r5, 1; |
| mov.f64 %fd105, %fd97; |
| mov.f64 %fd104, %fd96; |
| bra.uni BB18_22; |
| |
| BB18_12: |
| // Modes other than 2: mode 3 uses the storing loop BB18_14, everything else the store-free loop BB18_26 |
| setp.ne.s32 %p8, %r30, 3; |
| @%p8 bra BB18_25; |
| |
| // r53 = 2 * r2 + 2 is the element index of pair r2 + 1 |
| mad.lo.s32 %r53, %r2, 2, 2; |
| |
| BB18_14: |
| // Mode 3 loop: fold the pair at element index r53, store acc at output scalar index r5 |
| mul.wide.u32 %rd16, %r53, 8; |
| add.s64 %rd17, %rd2, %rd16; |
| ld.global.v2.f64 {%fd43, %fd44}, [%rd17]; |
| mul.f64 %fd104, %fd104, %fd44; |
| fma.rn.f64 %fd105, %fd105, %fd44, %fd43; |
| mul.wide.s32 %rd18, %r5, 8; |
| add.s64 %rd19, %rd1, %rd18; |
| st.global.f64 [%rd19], %fd105; |
| add.s32 %r53, %r53, 2; |
| add.s32 %r5, %r5, 1; |
| setp.lt.u32 %p9, %r5, %r3; |
| @%p9 bra BB18_14; |
| bra.uni BB18_27; |
| |
| BB18_25: |
| // r60 = 2 * r2 + 2 is the element index of pair r2 + 1 |
| mad.lo.s32 %r60, %r2, 2, 2; |
| |
| BB18_26: |
| // Store-free loop: only the running acc and prod are updated |
| mul.wide.u32 %rd41, %r60, 8; |
| add.s64 %rd42, %rd2, %rd41; |
| ld.global.v2.f64 {%fd83, %fd84}, [%rd42]; |
| mul.f64 %fd104, %fd104, %fd84; |
| fma.rn.f64 %fd105, %fd105, %fd84, %fd83; |
| add.s32 %r60, %r60, 2; |
| add.s32 %r5, %r5, 1; |
| setp.lt.u32 %p15, %r5, %r3; |
| @%p15 bra BB18_26; |
| bra.uni BB18_27; |
| |
| BB18_16: |
| // Remainder 0: park the state in fd97 and fd96, the unrolled-loop preamble below copies it back |
| mov.f64 %fd96, %fd104; |
| mov.f64 %fd97, %fd105; |
| mov.f64 %fd105, %fd48; |
| mov.f64 %fd104, %fd48; |
| |
| BB18_22: |
| // Fewer than 4 pairs in total means the peeled part already covered everything |
| setp.lt.u32 %p13, %r11, 4; |
| @%p13 bra BB18_27; |
| |
| // r58 = 2 * r5 is the element index of the next pair |
| shl.b32 %r58, %r5, 1; |
| mov.f64 %fd104, %fd96; |
| mov.f64 %fd105, %fd97; |
| |
| BB18_24: |
| // Unrolled body: four consecutive pairs are folded and each updated {acc, prod} is stored |
| mul.wide.u32 %rd29, %r58, 8; |
| add.s64 %rd30, %rd2, %rd29; |
| ld.global.v2.f64 {%fd61, %fd62}, [%rd30]; |
| add.s64 %rd31, %rd1, %rd29; |
| fma.rn.f64 %fd65, %fd105, %fd62, %fd61; |
| mul.f64 %fd66, %fd104, %fd62; |
| st.global.v2.f64 [%rd31], {%fd65, %fd66}; |
| add.s32 %r43, %r58, 2; |
| mul.wide.u32 %rd32, %r43, 8; |
| add.s64 %rd33, %rd2, %rd32; |
| ld.global.v2.f64 {%fd67, %fd68}, [%rd33]; |
| add.s64 %rd34, %rd1, %rd32; |
| fma.rn.f64 %fd71, %fd65, %fd68, %fd67; |
| mul.f64 %fd72, %fd66, %fd68; |
| st.global.v2.f64 [%rd34], {%fd71, %fd72}; |
| add.s32 %r44, %r58, 4; |
| mul.wide.u32 %rd35, %r44, 8; |
| add.s64 %rd36, %rd2, %rd35; |
| ld.global.v2.f64 {%fd73, %fd74}, [%rd36]; |
| add.s64 %rd37, %rd1, %rd35; |
| fma.rn.f64 %fd77, %fd71, %fd74, %fd73; |
| mul.f64 %fd78, %fd72, %fd74; |
| st.global.v2.f64 [%rd37], {%fd77, %fd78}; |
| add.s32 %r45, %r58, 6; |
| mul.wide.u32 %rd38, %r45, 8; |
| add.s64 %rd39, %rd2, %rd38; |
| ld.global.v2.f64 {%fd79, %fd80}, [%rd39]; |
| mul.f64 %fd104, %fd78, %fd80; |
| fma.rn.f64 %fd105, %fd77, %fd80, %fd79; |
| add.s64 %rd40, %rd1, %rd38; |
| st.global.v2.f64 [%rd40], {%fd105, %fd104}; |
| add.s32 %r58, %r58, 8; |
| add.s32 %r5, %r5, 4; |
| setp.lt.u32 %p14, %r5, %r3; |
| @%p14 bra BB18_24; |
| |
| BB18_27: |
| // Epilogue: nothing more to do when param_3 is a null pointer |
| setp.eq.s64 %p16, %rd5, 0; |
| @%p16 bra BB18_30; |
| |
| // The CTA with ctaid.x >= nctaid.x - 1 (the last one) does not publish its totals |
| mov.u32 %r48, %nctaid.x; |
| add.s32 %r49, %r48, -1; |
| setp.ge.u32 %p17, %r1, %r49; |
| @%p17 bra BB18_30; |
| |
| // Store the final {acc, prod} at pair index ctaid.x of param_3 |
| shl.b32 %r52, %r1, 1; |
| cvta.to.global.u64 %rd43, %rd5; |
| mul.wide.u32 %rd44, %r52, 8; |
| add.s64 %rd45, %rd43, %rd44; |
| st.global.v2.f64 [%rd45], {%fd105, %fd104}; |
| |
| BB18_30: |
| ret; |
| } |
| |
| // .globl cumulative_sum_prod_f |
| // Kernel cumulative_sum_prod_f (f32): sequentially folds a range of input pairs {a, b} with |
| //   acc  = acc * b + a   (kept in f105) |
| //   prod = prod * b      (kept in f104) |
| // The range is chosen from ctaid.x only. No tid register is read, so every thread of a CTA |
| // executes the same loads and stores. |
| //   param_0: input pointer, read as v2.f32 pairs at pair index i (element index 2 * i) |
| //   param_1: output pointer, written as pairs (param_6 == 2) or as scalars (param_6 == 3) |
| //   param_2: carry-in pointer, the f32 at index 2 * ctaid.x - 2 is read when param_6 > 1 and ctaid.x > 0 |
| //   param_3: optional pointer (may be 0), receives the final {acc, prod} pair at pair index ctaid.x |
| //   param_4: u32 exclusive upper bound on pair indices |
| //   param_5: u32 number of pairs per ctaid.x |
| //   param_6: u32 mode selector: values above 1 apply the carry-in, 2 stores pairs, 3 stores acc only, |
| //            any other value stores nothing to param_1 |
| //   NOTE(review): the mode values presumably map to a phase enum in the CUDA source - confirm. |
| .visible .entry cumulative_sum_prod_f( |
| .param .u64 cumulative_sum_prod_f_param_0, |
| .param .u64 cumulative_sum_prod_f_param_1, |
| .param .u64 cumulative_sum_prod_f_param_2, |
| .param .u64 cumulative_sum_prod_f_param_3, |
| .param .u32 cumulative_sum_prod_f_param_4, |
| .param .u32 cumulative_sum_prod_f_param_5, |
| .param .u32 cumulative_sum_prod_f_param_6 |
| ) |
| { |
| .reg .pred %p<18>; |
| .reg .f32 %f<106>; |
| .reg .b32 %r<62>; |
| .reg .b64 %rd<46>; |
| |
| |
| ld.param.u64 %rd6, [cumulative_sum_prod_f_param_0]; |
| ld.param.u64 %rd7, [cumulative_sum_prod_f_param_1]; |
| ld.param.u64 %rd4, [cumulative_sum_prod_f_param_2]; |
| ld.param.u64 %rd5, [cumulative_sum_prod_f_param_3]; |
| ld.param.u32 %r28, [cumulative_sum_prod_f_param_4]; |
| ld.param.u32 %r29, [cumulative_sum_prod_f_param_5]; |
| ld.param.u32 %r30, [cumulative_sum_prod_f_param_6]; |
| // rd1 = output base, rd2 = input base |
| cvta.to.global.u64 %rd1, %rd7; |
| cvta.to.global.u64 %rd2, %rd6; |
| // r2 = ctaid.x * param_5 is the first pair index, leave when r2 > param_4 - 1 (unsigned compare) |
| mov.u32 %r1, %ctaid.x; |
| mul.lo.s32 %r2, %r1, %r29; |
| add.s32 %r31, %r28, -1; |
| setp.gt.u32 %p1, %r2, %r31; |
| @%p1 bra BB19_30; |
| |
| // r3 = min(r2 + param_5, param_4) is the exclusive end, rd3 points at input pair r2 |
| add.s32 %r32, %r2, %r29; |
| min.u32 %r3, %r32, %r28; |
| shl.b32 %r33, %r2, 1; |
| mul.wide.u32 %rd8, %r33, 4; |
| add.s64 %rd3, %rd2, %rd8; |
| // param_6 > 1 takes the carry-in path (BB19_3), otherwise the first pair is used as is (BB19_2) |
| setp.gt.u32 %p2, %r30, 1; |
| @%p2 bra BB19_3; |
| bra.uni BB19_2; |
| |
| BB19_3: |
| // Carry f87 is 0.0 when 2 * ctaid.x - 2 is negative, else it is loaded from param_2 at that index |
| shl.b32 %r34, %r1, 1; |
| add.s32 %r4, %r34, -2; |
| mov.f32 %f87, 0f00000000; |
| setp.lt.s32 %p3, %r4, 0; |
| @%p3 bra BB19_5; |
| |
| cvta.to.global.u64 %rd9, %rd4; |
| mul.wide.s32 %rd10, %r4, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| ld.global.f32 %f87, [%rd11]; |
| |
| BB19_5: |
| // First pair with carry: acc = carry * b + a, prod = b |
| ld.global.v2.f32 {%f40, %f104}, [%rd3]; |
| fma.rn.f32 %f105, %f87, %f104, %f40; |
| bra.uni BB19_6; |
| |
| BB19_2: |
| // First pair without carry: acc = a, prod = b |
| ld.global.v2.f32 {%f105, %f104}, [%rd3]; |
| |
| BB19_6: |
| // p4 = (param_6 == 2) and is reused below to pick the loop variant |
| setp.eq.s32 %p4, %r30, 2; |
| @%p4 bra BB19_9; |
| bra.uni BB19_7; |
| |
| BB19_9: |
| // Mode 2: store {acc, prod} at output pair r2 |
| add.s64 %rd15, %rd1, %rd8; |
| st.global.v2.f32 [%rd15], {%f105, %f104}; |
| bra.uni BB19_10; |
| |
| BB19_7: |
| // Mode 3: store acc alone at output scalar index r2, other modes store nothing here |
| setp.ne.s32 %p5, %r30, 3; |
| @%p5 bra BB19_10; |
| |
| mul.wide.u32 %rd12, %r2, 4; |
| add.s64 %rd13, %rd1, %rd12; |
| st.global.f32 [%rd13], %f105; |
| |
| BB19_10: |
| // r5 = next pair index, go straight to the epilogue when the range holds a single pair |
| add.s32 %r5, %r2, 1; |
| setp.ge.u32 %p6, %r5, %r3; |
| @%p6 bra BB19_27; |
| |
| @%p4 bra BB19_15; |
| bra.uni BB19_12; |
| |
| BB19_15: |
| // Mode 2 loop, unrolled by 4. r11 = r3 - 1 - r2 pairs remain and r11 & 3 of them are peeled first |
| add.s32 %r39, %r3, -1; |
| sub.s32 %r11, %r39, %r2; |
| and.b32 %r38, %r11, 3; |
| mov.f32 %f48, 0f00000000; |
| setp.eq.s32 %p10, %r38, 0; |
| @%p10 bra BB19_16; |
| |
| setp.eq.s32 %p11, %r38, 1; |
| @%p11 bra BB19_21; |
| |
| setp.eq.s32 %p12, %r38, 2; |
| @%p12 bra BB19_20; |
| |
| // Remainder 3: first peeled pair |
| shl.b32 %r40, %r5, 1; |
| mul.wide.u32 %rd20, %r40, 4; |
| add.s64 %rd21, %rd2, %rd20; |
| ld.global.v2.f32 {%f49, %f50}, [%rd21]; |
| mul.f32 %f104, %f104, %f50; |
| fma.rn.f32 %f105, %f105, %f50, %f49; |
| add.s64 %rd22, %rd1, %rd20; |
| st.global.v2.f32 [%rd22], {%f105, %f104}; |
| add.s32 %r5, %r2, 2; |
| |
| BB19_20: |
| // Remainder 2 or more: next peeled pair |
| shl.b32 %r41, %r5, 1; |
| mul.wide.u32 %rd23, %r41, 4; |
| add.s64 %rd24, %rd2, %rd23; |
| ld.global.v2.f32 {%f53, %f54}, [%rd24]; |
| mul.f32 %f104, %f104, %f54; |
| fma.rn.f32 %f105, %f105, %f54, %f53; |
| add.s64 %rd25, %rd1, %rd23; |
| st.global.v2.f32 [%rd25], {%f105, %f104}; |
| add.s32 %r5, %r5, 1; |
| |
| BB19_21: |
| // Remainder 1 or more: last peeled pair, the state is also kept in f97 (acc) and f96 (prod) |
| shl.b32 %r42, %r5, 1; |
| mul.wide.u32 %rd26, %r42, 4; |
| add.s64 %rd27, %rd2, %rd26; |
| ld.global.v2.f32 {%f57, %f58}, [%rd27]; |
| mul.f32 %f96, %f104, %f58; |
| fma.rn.f32 %f97, %f105, %f58, %f57; |
| add.s64 %rd28, %rd1, %rd26; |
| st.global.v2.f32 [%rd28], {%f97, %f96}; |
| add.s32 %r5, %r5, 1; |
| mov.f32 %f105, %f97; |
| mov.f32 %f104, %f96; |
| bra.uni BB19_22; |
| |
| BB19_12: |
| // Modes other than 2: mode 3 uses the storing loop BB19_14, everything else the store-free loop BB19_26 |
| setp.ne.s32 %p8, %r30, 3; |
| @%p8 bra BB19_25; |
| |
| // r53 = 2 * r2 + 2 is the element index of pair r2 + 1 |
| mad.lo.s32 %r53, %r2, 2, 2; |
| |
| BB19_14: |
| // Mode 3 loop: fold the pair at element index r53, store acc at output scalar index r5 |
| mul.wide.u32 %rd16, %r53, 4; |
| add.s64 %rd17, %rd2, %rd16; |
| ld.global.v2.f32 {%f43, %f44}, [%rd17]; |
| mul.f32 %f104, %f104, %f44; |
| fma.rn.f32 %f105, %f105, %f44, %f43; |
| mul.wide.s32 %rd18, %r5, 4; |
| add.s64 %rd19, %rd1, %rd18; |
| st.global.f32 [%rd19], %f105; |
| add.s32 %r53, %r53, 2; |
| add.s32 %r5, %r5, 1; |
| setp.lt.u32 %p9, %r5, %r3; |
| @%p9 bra BB19_14; |
| bra.uni BB19_27; |
| |
| BB19_25: |
| // r60 = 2 * r2 + 2 is the element index of pair r2 + 1 |
| mad.lo.s32 %r60, %r2, 2, 2; |
| |
| BB19_26: |
| // Store-free loop: only the running acc and prod are updated |
| mul.wide.u32 %rd41, %r60, 4; |
| add.s64 %rd42, %rd2, %rd41; |
| ld.global.v2.f32 {%f83, %f84}, [%rd42]; |
| mul.f32 %f104, %f104, %f84; |
| fma.rn.f32 %f105, %f105, %f84, %f83; |
| add.s32 %r60, %r60, 2; |
| add.s32 %r5, %r5, 1; |
| setp.lt.u32 %p15, %r5, %r3; |
| @%p15 bra BB19_26; |
| bra.uni BB19_27; |
| |
| BB19_16: |
| // Remainder 0: park the state in f97 and f96, the unrolled-loop preamble below copies it back |
| mov.f32 %f96, %f104; |
| mov.f32 %f97, %f105; |
| mov.f32 %f105, %f48; |
| mov.f32 %f104, %f48; |
| |
| BB19_22: |
| // Fewer than 4 pairs in total means the peeled part already covered everything |
| setp.lt.u32 %p13, %r11, 4; |
| @%p13 bra BB19_27; |
| |
| // r58 = 2 * r5 is the element index of the next pair |
| shl.b32 %r58, %r5, 1; |
| mov.f32 %f104, %f96; |
| mov.f32 %f105, %f97; |
| |
| BB19_24: |
| // Unrolled body: four consecutive pairs are folded and each updated {acc, prod} is stored |
| mul.wide.u32 %rd29, %r58, 4; |
| add.s64 %rd30, %rd2, %rd29; |
| ld.global.v2.f32 {%f61, %f62}, [%rd30]; |
| add.s64 %rd31, %rd1, %rd29; |
| fma.rn.f32 %f65, %f105, %f62, %f61; |
| mul.f32 %f66, %f104, %f62; |
| st.global.v2.f32 [%rd31], {%f65, %f66}; |
| add.s32 %r43, %r58, 2; |
| mul.wide.u32 %rd32, %r43, 4; |
| add.s64 %rd33, %rd2, %rd32; |
| ld.global.v2.f32 {%f67, %f68}, [%rd33]; |
| add.s64 %rd34, %rd1, %rd32; |
| fma.rn.f32 %f71, %f65, %f68, %f67; |
| mul.f32 %f72, %f66, %f68; |
| st.global.v2.f32 [%rd34], {%f71, %f72}; |
| add.s32 %r44, %r58, 4; |
| mul.wide.u32 %rd35, %r44, 4; |
| add.s64 %rd36, %rd2, %rd35; |
| ld.global.v2.f32 {%f73, %f74}, [%rd36]; |
| add.s64 %rd37, %rd1, %rd35; |
| fma.rn.f32 %f77, %f71, %f74, %f73; |
| mul.f32 %f78, %f72, %f74; |
| st.global.v2.f32 [%rd37], {%f77, %f78}; |
| add.s32 %r45, %r58, 6; |
| mul.wide.u32 %rd38, %r45, 4; |
| add.s64 %rd39, %rd2, %rd38; |
| ld.global.v2.f32 {%f79, %f80}, [%rd39]; |
| mul.f32 %f104, %f78, %f80; |
| fma.rn.f32 %f105, %f77, %f80, %f79; |
| add.s64 %rd40, %rd1, %rd38; |
| st.global.v2.f32 [%rd40], {%f105, %f104}; |
| add.s32 %r58, %r58, 8; |
| add.s32 %r5, %r5, 4; |
| setp.lt.u32 %p14, %r5, %r3; |
| @%p14 bra BB19_24; |
| |
| BB19_27: |
| // Epilogue: nothing more to do when param_3 is a null pointer |
| setp.eq.s64 %p16, %rd5, 0; |
| @%p16 bra BB19_30; |
| |
| // The CTA with ctaid.x >= nctaid.x - 1 (the last one) does not publish its totals |
| mov.u32 %r48, %nctaid.x; |
| add.s32 %r49, %r48, -1; |
| setp.ge.u32 %p17, %r1, %r49; |
| @%p17 bra BB19_30; |
| |
| // Store the final {acc, prod} at pair index ctaid.x of param_3 |
| shl.b32 %r52, %r1, 1; |
| cvta.to.global.u64 %rd43, %rd5; |
| mul.wide.u32 %rd44, %r52, 4; |
| add.s64 %rd45, %rd43, %rd44; |
| st.global.v2.f32 [%rd45], {%f105, %f104}; |
| |
| BB19_30: |
| ret; |
| } |
| |
| // .globl sparse_dense_im2col_d |
| // Kernel sparse_dense_im2col_d (f64): one thread per input entry. The thread reads one value and |
| // copies it to every output position produced by two nested strided loops. |
| //   param_0: value pointer, f64 read at index tid_global |
| //   param_1: 32-bit integer array scanned linearly to find r2 with param_1[r2 + 1] > tid_global |
| //   param_2: 32-bit integer array, read at index tid_global and decomposed with param_7 and param_8 |
| //   param_3: output pointer, written as f64 |
| //   param_4: number of entries (threads with tid_global >= param_4 exit, signed compare) |
| //   param_5, param_6: declared but never loaded by this kernel |
| //   param_7, param_8: divisors that split param_2[tid_global] into r4, r43 and r44 |
| //   param_9, param_10: the loop upper bounds are clamped to param_9 - 1 and param_10 - 1 |
| //   param_11, param_12: multiplied by param_16 and param_17 to form the loop lower bounds |
| //   param_13, param_14, param_15: multipliers used when forming the output index |
| //   param_16, param_17: loop steps, also the divisors applied to the loop offsets |
| //   param_18, param_19: offsets added to r43 and r44 |
| //   NOTE(review): param_1/param_2 look like CSR row pointers and column indices, and params 11-19 |
| //   like kernel size, strides and padding of a convolution - confirm in the CUDA source. |
| .visible .entry sparse_dense_im2col_d( |
| .param .u64 sparse_dense_im2col_d_param_0, |
| .param .u64 sparse_dense_im2col_d_param_1, |
| .param .u64 sparse_dense_im2col_d_param_2, |
| .param .u64 sparse_dense_im2col_d_param_3, |
| .param .u32 sparse_dense_im2col_d_param_4, |
| .param .u32 sparse_dense_im2col_d_param_5, |
| .param .u32 sparse_dense_im2col_d_param_6, |
| .param .u32 sparse_dense_im2col_d_param_7, |
| .param .u32 sparse_dense_im2col_d_param_8, |
| .param .u32 sparse_dense_im2col_d_param_9, |
| .param .u32 sparse_dense_im2col_d_param_10, |
| .param .u32 sparse_dense_im2col_d_param_11, |
| .param .u32 sparse_dense_im2col_d_param_12, |
| .param .u32 sparse_dense_im2col_d_param_13, |
| .param .u32 sparse_dense_im2col_d_param_14, |
| .param .u32 sparse_dense_im2col_d_param_15, |
| .param .u32 sparse_dense_im2col_d_param_16, |
| .param .u32 sparse_dense_im2col_d_param_17, |
| .param .u32 sparse_dense_im2col_d_param_18, |
| .param .u32 sparse_dense_im2col_d_param_19 |
| ) |
| { |
| .reg .pred %p<13>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<17>; |
| |
| |
| ld.param.u64 %rd3, [sparse_dense_im2col_d_param_0]; |
| ld.param.u64 %rd4, [sparse_dense_im2col_d_param_1]; |
| ld.param.u64 %rd5, [sparse_dense_im2col_d_param_2]; |
| ld.param.u64 %rd6, [sparse_dense_im2col_d_param_3]; |
| ld.param.u32 %r35, [sparse_dense_im2col_d_param_4]; |
| ld.param.u32 %r22, [sparse_dense_im2col_d_param_7]; |
| ld.param.u32 %r23, [sparse_dense_im2col_d_param_8]; |
| ld.param.u32 %r24, [sparse_dense_im2col_d_param_9]; |
| ld.param.u32 %r25, [sparse_dense_im2col_d_param_10]; |
| ld.param.u32 %r26, [sparse_dense_im2col_d_param_11]; |
| ld.param.u32 %r27, [sparse_dense_im2col_d_param_12]; |
| ld.param.u32 %r28, [sparse_dense_im2col_d_param_13]; |
| ld.param.u32 %r29, [sparse_dense_im2col_d_param_14]; |
| ld.param.u32 %r30, [sparse_dense_im2col_d_param_15]; |
| ld.param.u32 %r31, [sparse_dense_im2col_d_param_16]; |
| ld.param.u32 %r32, [sparse_dense_im2col_d_param_17]; |
| ld.param.u32 %r33, [sparse_dense_im2col_d_param_18]; |
| ld.param.u32 %r34, [sparse_dense_im2col_d_param_19]; |
| // r1 = ntid.x * ctaid.x + tid.x (tid_global), leave when r1 >= param_4 |
| mov.u32 %r36, %ntid.x; |
| mov.u32 %r37, %ctaid.x; |
| mov.u32 %r38, %tid.x; |
| mad.lo.s32 %r1, %r36, %r37, %r38; |
| setp.ge.s32 %p1, %r1, %r35; |
| @%p1 bra BB20_11; |
| |
| // fd1 = param_0[r1] is the value this thread scatters |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| cvta.to.global.u64 %rd7, %rd3; |
| mul.wide.s32 %rd8, %r1, 8; |
| add.s64 %rd9, %rd7, %rd8; |
| ld.global.f64 %fd1, [%rd9]; |
| mov.u32 %r67, 0; |
| |
| BB20_2: |
| // Linear scan: r2 ends as the smallest index with param_1[r2 + 1] > r1 |
| // NOTE(review): the scan has no explicit upper bound here, it relies on the contents of param_1 |
| mov.u32 %r2, %r67; |
| add.s32 %r67, %r2, 1; |
| mul.wide.s32 %rd10, %r67, 4; |
| add.s64 %rd11, %rd2, %rd10; |
| ld.global.u32 %r40, [%rd11]; |
| setp.le.s32 %p2, %r40, %r1; |
| @%p2 bra BB20_2; |
| |
| // r41 = param_2[r1], r4 = r41 / param_7, r43 = (r41 % param_7) / param_8, r44 = (r41 % param_7) % param_8 |
| mul.wide.s32 %rd12, %r1, 4; |
| add.s64 %rd13, %rd1, %rd12; |
| ld.global.u32 %r41, [%rd13]; |
| div.s32 %r4, %r41, %r22; |
| rem.s32 %r42, %r41, %r22; |
| div.s32 %r43, %r42, %r23; |
| rem.s32 %r44, %r42, %r23; |
| // First axis: r5 = r43 + param_18, start r68 = max(0, r5 - param_16 * param_11 + 1), end r7 = min(param_9 - 1, r5) |
| add.s32 %r5, %r43, %r33; |
| mul.lo.s32 %r45, %r31, %r26; |
| mov.u32 %r46, 1; |
| sub.s32 %r47, %r46, %r45; |
| add.s32 %r48, %r47, %r5; |
| mov.u32 %r49, 0; |
| max.s32 %r68, %r49, %r48; |
| add.s32 %r50, %r24, -1; |
| min.s32 %r7, %r50, %r5; |
| // Second axis: r8 = r44 + param_19, start r69 = max(0, r8 - param_17 * param_12 + 1), end r10 = min(param_10 - 1, r8) |
| add.s32 %r8, %r44, %r34; |
| mul.lo.s32 %r51, %r32, %r27; |
| sub.s32 %r52, %r46, %r51; |
| add.s32 %r53, %r52, %r8; |
| max.s32 %r69, %r49, %r53; |
| add.s32 %r54, %r25, -1; |
| min.s32 %r10, %r54, %r8; |
| |
| BB20_4: |
| // Advance r70 one by one until (r5 - r70) is divisible by param_16 or r70 passes r7 |
| mov.u32 %r70, %r68; |
| sub.s32 %r55, %r5, %r70; |
| rem.s32 %r56, %r55, %r31; |
| setp.ne.s32 %p3, %r56, 0; |
| setp.le.s32 %p4, %r70, %r7; |
| and.pred %p5, %p4, %p3; |
| add.s32 %r68, %r70, 1; |
| @%p5 bra BB20_4; |
| |
| BB20_5: |
| // Advance r13 one by one until (r8 - r13) is divisible by param_17 or r13 passes r10 |
| mov.u32 %r13, %r69; |
| sub.s32 %r57, %r8, %r13; |
| rem.s32 %r58, %r57, %r32; |
| setp.ne.s32 %p6, %r58, 0; |
| setp.le.s32 %p7, %r13, %r10; |
| and.pred %p8, %p7, %p6; |
| add.s32 %r69, %r13, 1; |
| @%p8 bra BB20_5; |
| |
| // Nothing to write when the first-axis range is empty |
| setp.gt.s32 %p9, %r70, %r7; |
| @%p9 bra BB20_11; |
| |
| // Loop-invariant index terms: r15 = r2 * param_13, r16 = r4 * param_14 |
| mul.lo.s32 %r15, %r2, %r28; |
| mul.lo.s32 %r16, %r4, %r29; |
| cvta.to.global.u64 %rd14, %rd6; |
| |
| BB20_8: |
| // Outer loop over r70 with step param_16: r18 = ((r5 - r70) / param_16) * param_12 + r15 |
| sub.s32 %r59, %r5, %r70; |
| div.s32 %r60, %r59, %r31; |
| mad.lo.s32 %r18, %r60, %r27, %r15; |
| setp.gt.s32 %p10, %r13, %r10; |
| mov.u32 %r71, %r13; |
| @%p10 bra BB20_10; |
| |
| BB20_9: |
| // Inner loop over r71 with step param_17: |
| // output[(r70 * param_10 + r16 + r71) * param_15 + r18 + (r8 - r71) / param_17] = fd1 |
| sub.s32 %r61, %r8, %r71; |
| div.s32 %r62, %r61, %r32; |
| mad.lo.s32 %r63, %r70, %r25, %r16; |
| add.s32 %r64, %r63, %r71; |
| mad.lo.s32 %r65, %r64, %r30, %r18; |
| add.s32 %r66, %r65, %r62; |
| mul.wide.s32 %rd15, %r66, 8; |
| add.s64 %rd16, %rd14, %rd15; |
| st.global.f64 [%rd16], %fd1; |
| add.s32 %r71, %r71, %r32; |
| setp.le.s32 %p11, %r71, %r10; |
| @%p11 bra BB20_9; |
| |
| BB20_10: |
| add.s32 %r70, %r70, %r31; |
| setp.le.s32 %p12, %r70, %r7; |
| @%p12 bra BB20_8; |
| |
| BB20_11: |
| ret; |
| } |
| |
| // .globl sparse_dense_im2col_f |
| // Kernel sparse_dense_im2col_f (f32): one thread per input entry. The thread reads one value and |
| // copies it to every output position produced by two nested strided loops. |
| //   param_0: value pointer, f32 read at index tid_global |
| //   param_1: 32-bit integer array scanned linearly to find r2 with param_1[r2 + 1] > tid_global |
| //   param_2: 32-bit integer array, read at index tid_global and decomposed with param_7 and param_8 |
| //   param_3: output pointer, written as f32 |
| //   param_4: number of entries (threads with tid_global >= param_4 exit, signed compare) |
| //   param_5, param_6: declared but never loaded by this kernel |
| //   param_7, param_8: divisors that split param_2[tid_global] into r4, r43 and r44 |
| //   param_9, param_10: the loop upper bounds are clamped to param_9 - 1 and param_10 - 1 |
| //   param_11, param_12: multiplied by param_16 and param_17 to form the loop lower bounds |
| //   param_13, param_14, param_15: multipliers used when forming the output index |
| //   param_16, param_17: loop steps, also the divisors applied to the loop offsets |
| //   param_18, param_19: offsets added to r43 and r44 |
| //   NOTE(review): param_1/param_2 look like CSR row pointers and column indices, and params 11-19 |
| //   like kernel size, strides and padding of a convolution - confirm in the CUDA source. |
| .visible .entry sparse_dense_im2col_f( |
| .param .u64 sparse_dense_im2col_f_param_0, |
| .param .u64 sparse_dense_im2col_f_param_1, |
| .param .u64 sparse_dense_im2col_f_param_2, |
| .param .u64 sparse_dense_im2col_f_param_3, |
| .param .u32 sparse_dense_im2col_f_param_4, |
| .param .u32 sparse_dense_im2col_f_param_5, |
| .param .u32 sparse_dense_im2col_f_param_6, |
| .param .u32 sparse_dense_im2col_f_param_7, |
| .param .u32 sparse_dense_im2col_f_param_8, |
| .param .u32 sparse_dense_im2col_f_param_9, |
| .param .u32 sparse_dense_im2col_f_param_10, |
| .param .u32 sparse_dense_im2col_f_param_11, |
| .param .u32 sparse_dense_im2col_f_param_12, |
| .param .u32 sparse_dense_im2col_f_param_13, |
| .param .u32 sparse_dense_im2col_f_param_14, |
| .param .u32 sparse_dense_im2col_f_param_15, |
| .param .u32 sparse_dense_im2col_f_param_16, |
| .param .u32 sparse_dense_im2col_f_param_17, |
| .param .u32 sparse_dense_im2col_f_param_18, |
| .param .u32 sparse_dense_im2col_f_param_19 |
| ) |
| { |
| .reg .pred %p<13>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<17>; |
| |
| |
| ld.param.u64 %rd3, [sparse_dense_im2col_f_param_0]; |
| ld.param.u64 %rd4, [sparse_dense_im2col_f_param_1]; |
| ld.param.u64 %rd5, [sparse_dense_im2col_f_param_2]; |
| ld.param.u64 %rd6, [sparse_dense_im2col_f_param_3]; |
| ld.param.u32 %r35, [sparse_dense_im2col_f_param_4]; |
| ld.param.u32 %r22, [sparse_dense_im2col_f_param_7]; |
| ld.param.u32 %r23, [sparse_dense_im2col_f_param_8]; |
| ld.param.u32 %r24, [sparse_dense_im2col_f_param_9]; |
| ld.param.u32 %r25, [sparse_dense_im2col_f_param_10]; |
| ld.param.u32 %r26, [sparse_dense_im2col_f_param_11]; |
| ld.param.u32 %r27, [sparse_dense_im2col_f_param_12]; |
| ld.param.u32 %r28, [sparse_dense_im2col_f_param_13]; |
| ld.param.u32 %r29, [sparse_dense_im2col_f_param_14]; |
| ld.param.u32 %r30, [sparse_dense_im2col_f_param_15]; |
| ld.param.u32 %r31, [sparse_dense_im2col_f_param_16]; |
| ld.param.u32 %r32, [sparse_dense_im2col_f_param_17]; |
| ld.param.u32 %r33, [sparse_dense_im2col_f_param_18]; |
| ld.param.u32 %r34, [sparse_dense_im2col_f_param_19]; |
| // r1 = ntid.x * ctaid.x + tid.x (tid_global), leave when r1 >= param_4 |
| mov.u32 %r36, %ntid.x; |
| mov.u32 %r37, %ctaid.x; |
| mov.u32 %r38, %tid.x; |
| mad.lo.s32 %r1, %r36, %r37, %r38; |
| setp.ge.s32 %p1, %r1, %r35; |
| @%p1 bra BB21_11; |
| |
| // f1 = param_0[r1] is the value this thread scatters, rd8 = r1 * 4 is reused for param_2 below |
| cvta.to.global.u64 %rd1, %rd5; |
| cvta.to.global.u64 %rd2, %rd4; |
| cvta.to.global.u64 %rd7, %rd3; |
| mul.wide.s32 %rd8, %r1, 4; |
| add.s64 %rd9, %rd7, %rd8; |
| ld.global.f32 %f1, [%rd9]; |
| mov.u32 %r67, 0; |
| |
| BB21_2: |
| // Linear scan: r2 ends as the smallest index with param_1[r2 + 1] > r1 |
| // NOTE(review): the scan has no explicit upper bound here, it relies on the contents of param_1 |
| mov.u32 %r2, %r67; |
| add.s32 %r67, %r2, 1; |
| mul.wide.s32 %rd10, %r67, 4; |
| add.s64 %rd11, %rd2, %rd10; |
| ld.global.u32 %r40, [%rd11]; |
| setp.le.s32 %p2, %r40, %r1; |
| @%p2 bra BB21_2; |
| |
| // r41 = param_2[r1], r4 = r41 / param_7, r43 = (r41 % param_7) / param_8, r44 = (r41 % param_7) % param_8 |
| add.s64 %rd13, %rd1, %rd8; |
| ld.global.u32 %r41, [%rd13]; |
| div.s32 %r4, %r41, %r22; |
| rem.s32 %r42, %r41, %r22; |
| div.s32 %r43, %r42, %r23; |
| rem.s32 %r44, %r42, %r23; |
| // First axis: r5 = r43 + param_18, start r68 = max(0, r5 - param_16 * param_11 + 1), end r7 = min(param_9 - 1, r5) |
| add.s32 %r5, %r43, %r33; |
| mul.lo.s32 %r45, %r31, %r26; |
| mov.u32 %r46, 1; |
| sub.s32 %r47, %r46, %r45; |
| add.s32 %r48, %r47, %r5; |
| mov.u32 %r49, 0; |
| max.s32 %r68, %r49, %r48; |
| add.s32 %r50, %r24, -1; |
| min.s32 %r7, %r50, %r5; |
| // Second axis: r8 = r44 + param_19, start r69 = max(0, r8 - param_17 * param_12 + 1), end r10 = min(param_10 - 1, r8) |
| add.s32 %r8, %r44, %r34; |
| mul.lo.s32 %r51, %r32, %r27; |
| sub.s32 %r52, %r46, %r51; |
| add.s32 %r53, %r52, %r8; |
| max.s32 %r69, %r49, %r53; |
| add.s32 %r54, %r25, -1; |
| min.s32 %r10, %r54, %r8; |
| |
| BB21_4: |
| // Advance r70 one by one until (r5 - r70) is divisible by param_16 or r70 passes r7 |
| mov.u32 %r70, %r68; |
| sub.s32 %r55, %r5, %r70; |
| rem.s32 %r56, %r55, %r31; |
| setp.ne.s32 %p3, %r56, 0; |
| setp.le.s32 %p4, %r70, %r7; |
| and.pred %p5, %p4, %p3; |
| add.s32 %r68, %r70, 1; |
| @%p5 bra BB21_4; |
| |
| BB21_5: |
| // Advance r13 one by one until (r8 - r13) is divisible by param_17 or r13 passes r10 |
| mov.u32 %r13, %r69; |
| sub.s32 %r57, %r8, %r13; |
| rem.s32 %r58, %r57, %r32; |
| setp.ne.s32 %p6, %r58, 0; |
| setp.le.s32 %p7, %r13, %r10; |
| and.pred %p8, %p7, %p6; |
| add.s32 %r69, %r13, 1; |
| @%p8 bra BB21_5; |
| |
| // Nothing to write when the first-axis range is empty |
| setp.gt.s32 %p9, %r70, %r7; |
| @%p9 bra BB21_11; |
| |
| // Loop-invariant index terms: r15 = r2 * param_13, r16 = r4 * param_14 |
| mul.lo.s32 %r15, %r2, %r28; |
| mul.lo.s32 %r16, %r4, %r29; |
| cvta.to.global.u64 %rd14, %rd6; |
| |
| BB21_8: |
| // Outer loop over r70 with step param_16: r18 = ((r5 - r70) / param_16) * param_12 + r15 |
| sub.s32 %r59, %r5, %r70; |
| div.s32 %r60, %r59, %r31; |
| mad.lo.s32 %r18, %r60, %r27, %r15; |
| setp.gt.s32 %p10, %r13, %r10; |
| mov.u32 %r71, %r13; |
| @%p10 bra BB21_10; |
| |
| BB21_9: |
| // Inner loop over r71 with step param_17: |
| // output[(r70 * param_10 + r16 + r71) * param_15 + r18 + (r8 - r71) / param_17] = f1 |
| sub.s32 %r61, %r8, %r71; |
| div.s32 %r62, %r61, %r32; |
| mad.lo.s32 %r63, %r70, %r25, %r16; |
| add.s32 %r64, %r63, %r71; |
| mad.lo.s32 %r65, %r64, %r30, %r18; |
| add.s32 %r66, %r65, %r62; |
| mul.wide.s32 %rd15, %r66, 4; |
| add.s64 %rd16, %rd14, %rd15; |
| st.global.f32 [%rd16], %f1; |
| add.s32 %r71, %r71, %r32; |
| setp.le.s32 %p11, %r71, %r10; |
| @%p11 bra BB21_9; |
| |
| BB21_10: |
| add.s32 %r70, %r70, %r31; |
| setp.le.s32 %p12, %r70, %r7; |
| @%p12 bra BB21_8; |
| |
| BB21_11: |
| ret; |
| } |
| |
| // .globl dense_dense_im2col_d |
| // Kernel dense_dense_im2col_d (f64): one thread per input element. The thread reads input[tid_global], |
| // decomposes tid_global into four coordinates and copies the value to every output position |
| // produced by two nested strided loops. |
| //   param_0: input pointer, f64 read at index tid_global |
| //   param_1: output pointer, written as f64 |
| //   param_2: number of input elements (threads with tid_global >= param_2 exit, signed compare) |
| //   param_3, param_4, param_5: successive divisors that split tid_global into r2, r3, r41 and r42 |
| //   param_6, param_7: the loop upper bounds are clamped to param_6 - 1 and param_7 - 1 |
| //   param_8, param_9: multiplied by param_13 and param_14 to form the loop lower bounds |
| //   param_10, param_11, param_12: multipliers used when forming the output index |
| //   param_13, param_14: loop steps, also the divisors applied to the loop offsets |
| //   param_15, param_16: offsets added to r41 and r42 |
| //   NOTE(review): params 8-16 look like kernel size, strides and padding of a convolution - |
| //   confirm in the CUDA source. |
| .visible .entry dense_dense_im2col_d( |
| .param .u64 dense_dense_im2col_d_param_0, |
| .param .u64 dense_dense_im2col_d_param_1, |
| .param .u32 dense_dense_im2col_d_param_2, |
| .param .u32 dense_dense_im2col_d_param_3, |
| .param .u32 dense_dense_im2col_d_param_4, |
| .param .u32 dense_dense_im2col_d_param_5, |
| .param .u32 dense_dense_im2col_d_param_6, |
| .param .u32 dense_dense_im2col_d_param_7, |
| .param .u32 dense_dense_im2col_d_param_8, |
| .param .u32 dense_dense_im2col_d_param_9, |
| .param .u32 dense_dense_im2col_d_param_10, |
| .param .u32 dense_dense_im2col_d_param_11, |
| .param .u32 dense_dense_im2col_d_param_12, |
| .param .u32 dense_dense_im2col_d_param_13, |
| .param .u32 dense_dense_im2col_d_param_14, |
| .param .u32 dense_dense_im2col_d_param_15, |
| .param .u32 dense_dense_im2col_d_param_16 |
| ) |
| { |
| .reg .pred %p<12>; |
| .reg .b32 %r<69>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [dense_dense_im2col_d_param_0]; |
| ld.param.u64 %rd2, [dense_dense_im2col_d_param_1]; |
| ld.param.u32 %r35, [dense_dense_im2col_d_param_2]; |
| ld.param.u32 %r21, [dense_dense_im2col_d_param_3]; |
| ld.param.u32 %r22, [dense_dense_im2col_d_param_4]; |
| ld.param.u32 %r23, [dense_dense_im2col_d_param_5]; |
| ld.param.u32 %r24, [dense_dense_im2col_d_param_6]; |
| ld.param.u32 %r25, [dense_dense_im2col_d_param_7]; |
| ld.param.u32 %r26, [dense_dense_im2col_d_param_8]; |
| ld.param.u32 %r27, [dense_dense_im2col_d_param_9]; |
| ld.param.u32 %r28, [dense_dense_im2col_d_param_10]; |
| ld.param.u32 %r29, [dense_dense_im2col_d_param_11]; |
| ld.param.u32 %r30, [dense_dense_im2col_d_param_12]; |
| ld.param.u32 %r31, [dense_dense_im2col_d_param_13]; |
| ld.param.u32 %r32, [dense_dense_im2col_d_param_14]; |
| ld.param.u32 %r33, [dense_dense_im2col_d_param_15]; |
| ld.param.u32 %r34, [dense_dense_im2col_d_param_16]; |
| // r1 = ntid.x * ctaid.x + tid.x (tid_global), leave when r1 >= param_2 |
| mov.u32 %r36, %ctaid.x; |
| mov.u32 %r37, %ntid.x; |
| mov.u32 %r38, %tid.x; |
| mad.lo.s32 %r1, %r37, %r36, %r38; |
| setp.ge.s32 %p1, %r1, %r35; |
| @%p1 bra BB22_9; |
| |
| // fd1 = input[r1] is the value this thread scatters |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| // Decompose r1: r2 = r1 / param_3, r3 = (r1 % param_3) / param_4, then r41 and r42 are the |
| // quotient and remainder of the leftover by param_5 |
| div.s32 %r2, %r1, %r21; |
| rem.s32 %r39, %r1, %r21; |
| div.s32 %r3, %r39, %r22; |
| rem.s32 %r40, %r39, %r22; |
| div.s32 %r41, %r40, %r23; |
| rem.s32 %r42, %r40, %r23; |
| // First axis: r4 = r41 + param_15, start r65 = max(0, r4 - param_13 * param_8 + 1), end r6 = min(param_6 - 1, r4) |
| add.s32 %r4, %r41, %r33; |
| mul.lo.s32 %r43, %r31, %r26; |
| mov.u32 %r44, 1; |
| sub.s32 %r45, %r44, %r43; |
| add.s32 %r46, %r45, %r4; |
| mov.u32 %r47, 0; |
| max.s32 %r65, %r47, %r46; |
| add.s32 %r48, %r24, -1; |
| min.s32 %r6, %r48, %r4; |
| // Second axis: r7 = r42 + param_16, start r66 = max(0, r7 - param_14 * param_9 + 1), end r9 = min(param_7 - 1, r7) |
| add.s32 %r7, %r42, %r34; |
| mul.lo.s32 %r49, %r32, %r27; |
| sub.s32 %r50, %r44, %r49; |
| add.s32 %r51, %r50, %r7; |
| max.s32 %r66, %r47, %r51; |
| add.s32 %r52, %r25, -1; |
| min.s32 %r9, %r52, %r7; |
| |
| BB22_2: |
| // Advance r67 one by one until (r4 - r67) is divisible by param_13 or r67 passes r6 |
| mov.u32 %r67, %r65; |
| sub.s32 %r53, %r4, %r67; |
| rem.s32 %r54, %r53, %r31; |
| setp.ne.s32 %p2, %r54, 0; |
| setp.le.s32 %p3, %r67, %r6; |
| and.pred %p4, %p3, %p2; |
| add.s32 %r65, %r67, 1; |
| @%p4 bra BB22_2; |
| |
| BB22_3: |
| // Advance r12 one by one until (r7 - r12) is divisible by param_14 or r12 passes r9 |
| mov.u32 %r12, %r66; |
| sub.s32 %r55, %r7, %r12; |
| rem.s32 %r56, %r55, %r32; |
| setp.ne.s32 %p5, %r56, 0; |
| setp.le.s32 %p6, %r12, %r9; |
| and.pred %p7, %p6, %p5; |
| add.s32 %r66, %r12, 1; |
| @%p7 bra BB22_3; |
| |
| // Nothing to write when the first-axis range is empty |
| setp.gt.s32 %p8, %r67, %r6; |
| @%p8 bra BB22_9; |
| |
| // Loop-invariant index terms: r14 = r2 * param_10, r15 = r3 * param_11 |
| mul.lo.s32 %r14, %r2, %r28; |
| mul.lo.s32 %r15, %r3, %r29; |
| cvta.to.global.u64 %rd6, %rd2; |
| |
| BB22_6: |
| // Outer loop over r67 with step param_13: r17 = ((r4 - r67) / param_13) * param_9 + r14 |
| sub.s32 %r57, %r4, %r67; |
| div.s32 %r58, %r57, %r31; |
| mad.lo.s32 %r17, %r58, %r27, %r14; |
| setp.gt.s32 %p9, %r12, %r9; |
| mov.u32 %r68, %r12; |
| @%p9 bra BB22_8; |
| |
| BB22_7: |
| // Inner loop over r68 with step param_14: |
| // output[(r67 * param_7 + r15 + r68) * param_12 + r17 + (r7 - r68) / param_14] = fd1 |
| sub.s32 %r59, %r7, %r68; |
| div.s32 %r60, %r59, %r32; |
| mad.lo.s32 %r61, %r67, %r25, %r15; |
| add.s32 %r62, %r61, %r68; |
| mad.lo.s32 %r63, %r62, %r30, %r17; |
| add.s32 %r64, %r63, %r60; |
| mul.wide.s32 %rd7, %r64, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd1; |
| add.s32 %r68, %r68, %r32; |
| setp.le.s32 %p10, %r68, %r9; |
| @%p10 bra BB22_7; |
| |
| BB22_8: |
| add.s32 %r67, %r67, %r31; |
| setp.le.s32 %p11, %r67, %r6; |
| @%p11 bra BB22_6; |
| |
| BB22_9: |
| ret; |
| } |
| |
| // .globl dense_dense_im2col_f |
| // dense_dense_im2col_f: each thread loads one f32 from the input at its global |
| // thread index and stores that same value to every output slot visited by the |
| // two nested strided loops below (BB23_6 outer, BB23_7 inner). |
| // param_0 : input pointer (converted to a global address, read as f32) |
| // param_1 : output pointer (converted to a global address, written as f32) |
| // param_2 : thread bound; threads with index >= param_2 return immediately |
| // param_3..param_5 : divisors that split the thread index into four indices |
| // param_6, param_7 : (value - 1) is the upper clamp of the outer / inner loop range; |
| // param_7 is also a multiplier in the output index |
| // param_8, param_9 : multiplied by param_13 / param_14 to form the lower loop bounds; |
| // param_9 is also a multiplier in the output index |
| // param_10..param_12: multipliers used when building the output index |
| // param_13, param_14: loop steps of the outer / inner loop, also used as divisors |
| // param_15, param_16: offsets added to the third / fourth split index |
| // NOTE(review): the names suggest an im2col layout transform; the tensor meaning of |
| // each integer parameter is not visible here - confirm against the CUDA source. |
| // NOTE(review): none of the divisors (param_3, 4, 5, 13, 14) is checked for zero. |
| .visible .entry dense_dense_im2col_f( |
| .param .u64 dense_dense_im2col_f_param_0, |
| .param .u64 dense_dense_im2col_f_param_1, |
| .param .u32 dense_dense_im2col_f_param_2, |
| .param .u32 dense_dense_im2col_f_param_3, |
| .param .u32 dense_dense_im2col_f_param_4, |
| .param .u32 dense_dense_im2col_f_param_5, |
| .param .u32 dense_dense_im2col_f_param_6, |
| .param .u32 dense_dense_im2col_f_param_7, |
| .param .u32 dense_dense_im2col_f_param_8, |
| .param .u32 dense_dense_im2col_f_param_9, |
| .param .u32 dense_dense_im2col_f_param_10, |
| .param .u32 dense_dense_im2col_f_param_11, |
| .param .u32 dense_dense_im2col_f_param_12, |
| .param .u32 dense_dense_im2col_f_param_13, |
| .param .u32 dense_dense_im2col_f_param_14, |
| .param .u32 dense_dense_im2col_f_param_15, |
| .param .u32 dense_dense_im2col_f_param_16 |
| ) |
| { |
| .reg .pred %p<12>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<69>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [dense_dense_im2col_f_param_0]; |
| ld.param.u64 %rd2, [dense_dense_im2col_f_param_1]; |
| ld.param.u32 %r35, [dense_dense_im2col_f_param_2]; |
| ld.param.u32 %r21, [dense_dense_im2col_f_param_3]; |
| ld.param.u32 %r22, [dense_dense_im2col_f_param_4]; |
| ld.param.u32 %r23, [dense_dense_im2col_f_param_5]; |
| ld.param.u32 %r24, [dense_dense_im2col_f_param_6]; |
| ld.param.u32 %r25, [dense_dense_im2col_f_param_7]; |
| ld.param.u32 %r26, [dense_dense_im2col_f_param_8]; |
| ld.param.u32 %r27, [dense_dense_im2col_f_param_9]; |
| ld.param.u32 %r28, [dense_dense_im2col_f_param_10]; |
| ld.param.u32 %r29, [dense_dense_im2col_f_param_11]; |
| ld.param.u32 %r30, [dense_dense_im2col_f_param_12]; |
| ld.param.u32 %r31, [dense_dense_im2col_f_param_13]; |
| ld.param.u32 %r32, [dense_dense_im2col_f_param_14]; |
| ld.param.u32 %r33, [dense_dense_im2col_f_param_15]; |
| ld.param.u32 %r34, [dense_dense_im2col_f_param_16]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); leave if %r1 >= param_2. |
| mov.u32 %r36, %ctaid.x; |
| mov.u32 %r37, %ntid.x; |
| mov.u32 %r38, %tid.x; |
| mad.lo.s32 %r1, %r37, %r36, %r38; |
| setp.ge.s32 %p1, %r1, %r35; |
| @%p1 bra BB23_9; |
| |
| // %f1 = input[%r1]; this single value is what every store below writes. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| // Split %r1 into four indices by successive div/rem with param_3, param_4, param_5: |
| // %r2 = first, %r3 = second, %r41 = third, %r42 = fourth. |
| div.s32 %r2, %r1, %r21; |
| rem.s32 %r39, %r1, %r21; |
| div.s32 %r3, %r39, %r22; |
| rem.s32 %r40, %r39, %r22; |
| div.s32 %r41, %r40, %r23; |
| rem.s32 %r42, %r40, %r23; |
| // Outer range: %r4 = third + param_15; |
| // start %r65 = max(0, %r4 + 1 - param_13 * param_8); end %r6 = min(param_6 - 1, %r4). |
| add.s32 %r4, %r41, %r33; |
| mul.lo.s32 %r43, %r31, %r26; |
| mov.u32 %r44, 1; |
| sub.s32 %r45, %r44, %r43; |
| add.s32 %r46, %r45, %r4; |
| mov.u32 %r47, 0; |
| max.s32 %r65, %r47, %r46; |
| add.s32 %r48, %r24, -1; |
| min.s32 %r6, %r48, %r4; |
| // Inner range: %r7 = fourth + param_16; |
| // start %r66 = max(0, %r7 + 1 - param_14 * param_9); end %r9 = min(param_7 - 1, %r7). |
| add.s32 %r7, %r42, %r34; |
| mul.lo.s32 %r49, %r32, %r27; |
| sub.s32 %r50, %r44, %r49; |
| add.s32 %r51, %r50, %r7; |
| max.s32 %r66, %r47, %r51; |
| add.s32 %r52, %r25, -1; |
| min.s32 %r9, %r52, %r7; |
| |
| // Advance the outer start one step at a time until (%r4 - %r67) is a multiple of |
| // param_13 or %r67 has passed %r6. %r67 holds the candidate, %r65 the next one. |
| BB23_2: |
| mov.u32 %r67, %r65; |
| sub.s32 %r53, %r4, %r67; |
| rem.s32 %r54, %r53, %r31; |
| setp.ne.s32 %p2, %r54, 0; |
| setp.le.s32 %p3, %r67, %r6; |
| and.pred %p4, %p3, %p2; |
| add.s32 %r65, %r67, 1; |
| @%p4 bra BB23_2; |
| |
| // Same search for the inner start: %r12 ends up as the first value with |
| // (%r7 - %r12) divisible by param_14, or a value past %r9. |
| BB23_3: |
| mov.u32 %r12, %r66; |
| sub.s32 %r55, %r7, %r12; |
| rem.s32 %r56, %r55, %r32; |
| setp.ne.s32 %p5, %r56, 0; |
| setp.le.s32 %p6, %r12, %r9; |
| and.pred %p7, %p6, %p5; |
| add.s32 %r66, %r12, 1; |
| @%p7 bra BB23_3; |
| |
| // Nothing to write when the outer range is empty. |
| setp.gt.s32 %p8, %r67, %r6; |
| @%p8 bra BB23_9; |
| |
| // Loop-invariant index terms: %r14 = first * param_10, %r15 = second * param_11. |
| mul.lo.s32 %r14, %r2, %r28; |
| mul.lo.s32 %r15, %r3, %r29; |
| cvta.to.global.u64 %rd6, %rd2; |
| |
| // Outer loop over %r67 in steps of param_13 while %r67 <= %r6. |
| // %r17 = ((%r4 - %r67) / param_13) * param_9 + %r14. |
| BB23_6: |
| sub.s32 %r57, %r4, %r67; |
| div.s32 %r58, %r57, %r31; |
| mad.lo.s32 %r17, %r58, %r27, %r14; |
| setp.gt.s32 %p9, %r12, %r9; |
| mov.u32 %r68, %r12; |
| @%p9 bra BB23_8; |
| |
| // Inner loop over %r68 in steps of param_14 while %r68 <= %r9. |
| // index = ((%r67 * param_7 + %r15 + %r68) * param_12) + %r17 + (%r7 - %r68) / param_14 |
| // output[index] = %f1 |
| BB23_7: |
| sub.s32 %r59, %r7, %r68; |
| div.s32 %r60, %r59, %r32; |
| mad.lo.s32 %r61, %r67, %r25, %r15; |
| add.s32 %r62, %r61, %r68; |
| mad.lo.s32 %r63, %r62, %r30, %r17; |
| add.s32 %r64, %r63, %r60; |
| mul.wide.s32 %rd7, %r64, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f1; |
| add.s32 %r68, %r68, %r32; |
| setp.le.s32 %p10, %r68, %r9; |
| @%p10 bra BB23_7; |
| |
| BB23_8: |
| add.s32 %r67, %r67, %r31; |
| setp.le.s32 %p11, %r67, %r6; |
| @%p11 bra BB23_6; |
| |
| BB23_9: |
| ret; |
| } |
| |
| // .globl reorg_knpq_d |
| // reorg_knpq_d: each thread copies one f64 from input[tid] to a permuted |
| // position in the output. |
| // param_0 : input pointer (read as f64) |
| // param_1 : output pointer (written as f64) |
| // param_2 : thread bound; threads with index >= param_2 return immediately |
| // param_3 : divisor splitting tid into (outer = tid / param_3, rest = tid % param_3) |
| // param_4 : multiplier applied to rest / param_5 in the output index |
| // param_5 : divisor of rest and multiplier of outer in the output index |
| // Output index = (rest / param_5) * param_4 + outer * param_5 + rest % param_5. |
| // NOTE(review): param_3 and param_5 are used as divisors without a zero check. |
| .visible .entry reorg_knpq_d( |
| .param .u64 reorg_knpq_d_param_0, |
| .param .u64 reorg_knpq_d_param_1, |
| .param .u32 reorg_knpq_d_param_2, |
| .param .u32 reorg_knpq_d_param_3, |
| .param .u32 reorg_knpq_d_param_4, |
| .param .u32 reorg_knpq_d_param_5 |
| ) |
| { |
| .reg .pred %p<2>; |
| .reg .b32 %r<16>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reorg_knpq_d_param_0]; |
| ld.param.u64 %rd2, [reorg_knpq_d_param_1]; |
| ld.param.u32 %r5, [reorg_knpq_d_param_2]; |
| ld.param.u32 %r2, [reorg_knpq_d_param_3]; |
| ld.param.u32 %r3, [reorg_knpq_d_param_4]; |
| ld.param.u32 %r4, [reorg_knpq_d_param_5]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); leave if %r1 >= param_2. |
| mov.u32 %r6, %ctaid.x; |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %tid.x; |
| mad.lo.s32 %r1, %r7, %r6, %r8; |
| setp.ge.s32 %p1, %r1, %r5; |
| @%p1 bra BB24_2; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // %r9 = tid % param_3, then split again by param_5 into %r10 (quotient) and %r11 (remainder). |
| rem.s32 %r9, %r1, %r2; |
| div.s32 %r10, %r9, %r4; |
| rem.s32 %r11, %r9, %r4; |
| // %fd1 = input[tid] (8-byte elements). |
| mul.wide.s32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| // %r15 = %r10 * param_4 + (tid / param_3) * param_5 + %r11 is the destination index. |
| div.s32 %r12, %r1, %r2; |
| mul.lo.s32 %r13, %r12, %r4; |
| mad.lo.s32 %r14, %r10, %r3, %r13; |
| add.s32 %r15, %r14, %r11; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.s32 %rd7, %r15, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd1; |
| |
| BB24_2: |
| ret; |
| } |
| |
| // .globl reorg_knpq_f |
| // reorg_knpq_f: each thread copies one f32 from input[tid] to a permuted |
| // position in the output. |
| // param_0 : input pointer (read as f32) |
| // param_1 : output pointer (written as f32) |
| // param_2 : thread bound; threads with index >= param_2 return immediately |
| // param_3 : divisor splitting tid into (outer = tid / param_3, rest = tid % param_3) |
| // param_4 : multiplier applied to rest / param_5 in the output index |
| // param_5 : divisor of rest and multiplier of outer in the output index |
| // Output index = (rest / param_5) * param_4 + outer * param_5 + rest % param_5. |
| // NOTE(review): param_3 and param_5 are used as divisors without a zero check. |
| .visible .entry reorg_knpq_f( |
| .param .u64 reorg_knpq_f_param_0, |
| .param .u64 reorg_knpq_f_param_1, |
| .param .u32 reorg_knpq_f_param_2, |
| .param .u32 reorg_knpq_f_param_3, |
| .param .u32 reorg_knpq_f_param_4, |
| .param .u32 reorg_knpq_f_param_5 |
| ) |
| { |
| .reg .pred %p<2>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<16>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reorg_knpq_f_param_0]; |
| ld.param.u64 %rd2, [reorg_knpq_f_param_1]; |
| ld.param.u32 %r5, [reorg_knpq_f_param_2]; |
| ld.param.u32 %r2, [reorg_knpq_f_param_3]; |
| ld.param.u32 %r3, [reorg_knpq_f_param_4]; |
| ld.param.u32 %r4, [reorg_knpq_f_param_5]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); leave if %r1 >= param_2. |
| mov.u32 %r6, %ctaid.x; |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %tid.x; |
| mad.lo.s32 %r1, %r7, %r6, %r8; |
| setp.ge.s32 %p1, %r1, %r5; |
| @%p1 bra BB25_2; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // %r9 = tid % param_3, then split again by param_5 into %r10 (quotient) and %r11 (remainder). |
| rem.s32 %r9, %r1, %r2; |
| div.s32 %r10, %r9, %r4; |
| rem.s32 %r11, %r9, %r4; |
| // %f1 = input[tid] (4-byte elements). |
| mul.wide.s32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| // %r15 = %r10 * param_4 + (tid / param_3) * param_5 + %r11 is the destination index. |
| div.s32 %r12, %r1, %r2; |
| mul.lo.s32 %r13, %r12, %r4; |
| mad.lo.s32 %r14, %r10, %r3, %r13; |
| add.s32 %r15, %r14, %r11; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.s32 %rd7, %r15, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f1; |
| |
| BB25_2: |
| ret; |
| } |
| |
| // .globl slice_sparse_dense_row_d |
| // slice_sparse_dense_row_d: one thread per row. The thread walks the entries |
| // i in [param_1[row], param_1[row + 1]) and, for every entry whose index |
| // param_2[i] lies in [param_6, param_7], copies the f64 value param_0[i] into |
| // the output. |
| // param_0 : f64 value array |
| // param_1 : u32 array of per-row start offsets (presumably CSR row pointers - confirm) |
| // param_2 : u32 array of per-entry indices (presumably CSR column indices - confirm) |
| // param_3 : f64 output array |
| // param_4 : offset added to the thread index to obtain the row |
| // param_5 : last row handled (threads with row > param_5 return) |
| // param_6 : lowest accepted index, also subtracted in the output index |
| // param_7 : highest accepted index (inclusive) |
| // param_8 : output row stride (multiplied by the thread index) |
| // Output index = tid * param_8 - param_6 + param_2[i]. |
| .visible .entry slice_sparse_dense_row_d( |
| .param .u64 slice_sparse_dense_row_d_param_0, |
| .param .u64 slice_sparse_dense_row_d_param_1, |
| .param .u64 slice_sparse_dense_row_d_param_2, |
| .param .u64 slice_sparse_dense_row_d_param_3, |
| .param .u32 slice_sparse_dense_row_d_param_4, |
| .param .u32 slice_sparse_dense_row_d_param_5, |
| .param .u32 slice_sparse_dense_row_d_param_6, |
| .param .u32 slice_sparse_dense_row_d_param_7, |
| .param .u32 slice_sparse_dense_row_d_param_8 |
| ) |
| { |
| .reg .pred %p<7>; |
| .reg .b32 %r<25>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<23>; |
| |
| |
| ld.param.u64 %rd9, [slice_sparse_dense_row_d_param_0]; |
| ld.param.u64 %rd10, [slice_sparse_dense_row_d_param_1]; |
| ld.param.u64 %rd11, [slice_sparse_dense_row_d_param_2]; |
| ld.param.u64 %rd12, [slice_sparse_dense_row_d_param_3]; |
| ld.param.u32 %r15, [slice_sparse_dense_row_d_param_4]; |
| ld.param.u32 %r16, [slice_sparse_dense_row_d_param_5]; |
| ld.param.u32 %r12, [slice_sparse_dense_row_d_param_6]; |
| ld.param.u32 %r13, [slice_sparse_dense_row_d_param_7]; |
| ld.param.u32 %r14, [slice_sparse_dense_row_d_param_8]; |
| // %r1 = global thread index, %r2 = %r1 + param_4 = row; leave if row > param_5. |
| mov.u32 %r17, %ntid.x; |
| mov.u32 %r18, %ctaid.x; |
| mov.u32 %r19, %tid.x; |
| mad.lo.s32 %r1, %r17, %r18, %r19; |
| add.s32 %r2, %r1, %r15; |
| setp.gt.s32 %p1, %r2, %r16; |
| @%p1 bra BB26_6; |
| |
| // %r23 = param_1[row] (first entry), %r24 = param_1[row + 1] (one past last); |
| // leave when the row has no entries. |
| cvta.to.global.u64 %rd13, %rd10; |
| mul.wide.s32 %rd14, %r2, 4; |
| add.s64 %rd1, %rd13, %rd14; |
| ld.global.u32 %r23, [%rd1]; |
| ld.global.u32 %r24, [%rd1+4]; |
| setp.ge.s32 %p2, %r23, %r24; |
| @%p2 bra BB26_6; |
| |
| // %r5 = tid * param_8 - param_6 is the loop-invariant part of the output index. |
| // %rd22 / %rd21 are running pointers into the value (8-byte) and index (4-byte) arrays. |
| cvta.to.global.u64 %rd2, %rd12; |
| cvta.to.global.u64 %rd15, %rd9; |
| cvta.to.global.u64 %rd16, %rd11; |
| mul.lo.s32 %r20, %r1, %r14; |
| sub.s32 %r5, %r20, %r12; |
| mul.wide.s32 %rd17, %r23, 8; |
| add.s64 %rd22, %rd15, %rd17; |
| mul.wide.s32 %rd18, %r23, 4; |
| add.s64 %rd21, %rd16, %rd18; |
| |
| // Per-entry loop: skip entries whose index is outside [param_6, param_7]. |
| BB26_3: |
| ld.global.u32 %r8, [%rd21]; |
| setp.lt.s32 %p3, %r8, %r12; |
| setp.gt.s32 %p4, %r8, %r13; |
| or.pred %p5, %p3, %p4; |
| @%p5 bra BB26_5; |
| |
| // output[%r5 + index] = value. The end bound %r24 is reloaded from memory after |
| // the store, so the loop condition always uses the value currently in param_1. |
| ld.global.f64 %fd1, [%rd22]; |
| add.s32 %r21, %r5, %r8; |
| mul.wide.s32 %rd19, %r21, 8; |
| add.s64 %rd20, %rd2, %rd19; |
| st.global.f64 [%rd20], %fd1; |
| ld.global.u32 %r24, [%rd1+4]; |
| |
| // Step both running pointers and the entry counter; continue while %r23 < %r24. |
| BB26_5: |
| add.s64 %rd22, %rd22, 8; |
| add.s64 %rd21, %rd21, 4; |
| add.s32 %r23, %r23, 1; |
| setp.lt.s32 %p6, %r23, %r24; |
| @%p6 bra BB26_3; |
| |
| BB26_6: |
| ret; |
| } |
| |
| // .globl slice_sparse_dense_row_f |
| // slice_sparse_dense_row_f: one thread per row. The thread walks the entries |
| // i in [param_1[row], param_1[row + 1]) and, for every entry whose index |
| // param_2[i] lies in [param_6, param_7], copies the f32 value param_0[i] into |
| // the output. |
| // param_0 : f32 value array |
| // param_1 : u32 array of per-row start offsets (presumably CSR row pointers - confirm) |
| // param_2 : u32 array of per-entry indices (presumably CSR column indices - confirm) |
| // param_3 : f32 output array |
| // param_4 : offset added to the thread index to obtain the row |
| // param_5 : last row handled (threads with row > param_5 return) |
| // param_6 : lowest accepted index, also subtracted in the output index |
| // param_7 : highest accepted index (inclusive) |
| // param_8 : output row stride (multiplied by the thread index) |
| // Output index = tid * param_8 - param_6 + param_2[i]. |
| .visible .entry slice_sparse_dense_row_f( |
| .param .u64 slice_sparse_dense_row_f_param_0, |
| .param .u64 slice_sparse_dense_row_f_param_1, |
| .param .u64 slice_sparse_dense_row_f_param_2, |
| .param .u64 slice_sparse_dense_row_f_param_3, |
| .param .u32 slice_sparse_dense_row_f_param_4, |
| .param .u32 slice_sparse_dense_row_f_param_5, |
| .param .u32 slice_sparse_dense_row_f_param_6, |
| .param .u32 slice_sparse_dense_row_f_param_7, |
| .param .u32 slice_sparse_dense_row_f_param_8 |
| ) |
| { |
| .reg .pred %p<7>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<25>; |
| .reg .b64 %rd<22>; |
| |
| |
| ld.param.u64 %rd9, [slice_sparse_dense_row_f_param_0]; |
| ld.param.u64 %rd10, [slice_sparse_dense_row_f_param_1]; |
| ld.param.u64 %rd11, [slice_sparse_dense_row_f_param_2]; |
| ld.param.u64 %rd12, [slice_sparse_dense_row_f_param_3]; |
| ld.param.u32 %r15, [slice_sparse_dense_row_f_param_4]; |
| ld.param.u32 %r16, [slice_sparse_dense_row_f_param_5]; |
| ld.param.u32 %r12, [slice_sparse_dense_row_f_param_6]; |
| ld.param.u32 %r13, [slice_sparse_dense_row_f_param_7]; |
| ld.param.u32 %r14, [slice_sparse_dense_row_f_param_8]; |
| // %r1 = global thread index, %r2 = %r1 + param_4 = row; leave if row > param_5. |
| mov.u32 %r17, %ntid.x; |
| mov.u32 %r18, %ctaid.x; |
| mov.u32 %r19, %tid.x; |
| mad.lo.s32 %r1, %r17, %r18, %r19; |
| add.s32 %r2, %r1, %r15; |
| setp.gt.s32 %p1, %r2, %r16; |
| @%p1 bra BB27_6; |
| |
| // %r23 = param_1[row] (first entry), %r24 = param_1[row + 1] (one past last); |
| // leave when the row has no entries. |
| cvta.to.global.u64 %rd13, %rd10; |
| mul.wide.s32 %rd14, %r2, 4; |
| add.s64 %rd1, %rd13, %rd14; |
| ld.global.u32 %r23, [%rd1]; |
| ld.global.u32 %r24, [%rd1+4]; |
| setp.ge.s32 %p2, %r23, %r24; |
| @%p2 bra BB27_6; |
| |
| // %r5 = tid * param_8 - param_6 is the loop-invariant part of the output index. |
| // Values and indices are both 4 bytes wide, so one byte offset (%rd17) seeds both |
| // running pointers: %rd21 (values) and %rd20 (indices). |
| cvta.to.global.u64 %rd2, %rd12; |
| cvta.to.global.u64 %rd15, %rd9; |
| cvta.to.global.u64 %rd16, %rd11; |
| mul.lo.s32 %r20, %r1, %r14; |
| sub.s32 %r5, %r20, %r12; |
| mul.wide.s32 %rd17, %r23, 4; |
| add.s64 %rd21, %rd15, %rd17; |
| add.s64 %rd20, %rd16, %rd17; |
| |
| // Per-entry loop: skip entries whose index is outside [param_6, param_7]. |
| BB27_3: |
| ld.global.u32 %r8, [%rd20]; |
| setp.lt.s32 %p3, %r8, %r12; |
| setp.gt.s32 %p4, %r8, %r13; |
| or.pred %p5, %p3, %p4; |
| @%p5 bra BB27_5; |
| |
| // output[%r5 + index] = value. The end bound %r24 is reloaded from memory after |
| // the store, so the loop condition always uses the value currently in param_1. |
| ld.global.f32 %f1, [%rd21]; |
| add.s32 %r21, %r5, %r8; |
| mul.wide.s32 %rd18, %r21, 4; |
| add.s64 %rd19, %rd2, %rd18; |
| st.global.f32 [%rd19], %f1; |
| ld.global.u32 %r24, [%rd1+4]; |
| |
| // Step both running pointers and the entry counter; continue while %r23 < %r24. |
| BB27_5: |
| add.s64 %rd21, %rd21, 4; |
| add.s64 %rd20, %rd20, 4; |
| add.s32 %r23, %r23, 1; |
| setp.lt.s32 %p6, %r23, %r24; |
| @%p6 bra BB27_3; |
| |
| BB27_6: |
| ret; |
| } |
| |
| // .globl slice_sparse_dense_nnz_d |
| // slice_sparse_dense_nnz_d: one thread per stored entry. Entry i is |
| // tid + param_1[param_4]; the thread returns unless i < param_1[param_5 + 1] |
| // and param_2[i] lies in [param_6, param_7]. It then searches param_1 for the |
| // row containing i and copies the f64 value param_0[i] into the output. |
| // param_0 : f64 value array |
| // param_1 : u32 array of per-row start offsets (presumably CSR row pointers - confirm) |
| // param_2 : u32 array of per-entry indices (presumably CSR column indices - confirm) |
| // param_3 : f64 output array |
| // param_4 : first row of the slice |
| // param_5 : last row of the slice (param_1[param_5 + 1] bounds the entries) |
| // param_6 : lowest accepted index, also subtracted in the output index |
| // param_7 : highest accepted index (inclusive) |
| // param_8 : output row stride |
| // Output index = (row - param_4) * param_8 - param_6 + param_2[i]. |
| .visible .entry slice_sparse_dense_nnz_d( |
| .param .u64 slice_sparse_dense_nnz_d_param_0, |
| .param .u64 slice_sparse_dense_nnz_d_param_1, |
| .param .u64 slice_sparse_dense_nnz_d_param_2, |
| .param .u64 slice_sparse_dense_nnz_d_param_3, |
| .param .u32 slice_sparse_dense_nnz_d_param_4, |
| .param .u32 slice_sparse_dense_nnz_d_param_5, |
| .param .u32 slice_sparse_dense_nnz_d_param_6, |
| .param .u32 slice_sparse_dense_nnz_d_param_7, |
| .param .u32 slice_sparse_dense_nnz_d_param_8 |
| ) |
| { |
| .reg .pred %p<6>; |
| .reg .b32 %r<22>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<21>; |
| |
| |
| ld.param.u64 %rd4, [slice_sparse_dense_nnz_d_param_0]; |
| ld.param.u64 %rd7, [slice_sparse_dense_nnz_d_param_1]; |
| ld.param.u64 %rd5, [slice_sparse_dense_nnz_d_param_2]; |
| ld.param.u64 %rd6, [slice_sparse_dense_nnz_d_param_3]; |
| ld.param.u32 %r5, [slice_sparse_dense_nnz_d_param_4]; |
| ld.param.u32 %r9, [slice_sparse_dense_nnz_d_param_5]; |
| ld.param.u32 %r6, [slice_sparse_dense_nnz_d_param_6]; |
| ld.param.u32 %r7, [slice_sparse_dense_nnz_d_param_7]; |
| ld.param.u32 %r8, [slice_sparse_dense_nnz_d_param_8]; |
| // %r13 = global thread index; %r1 = %r13 + param_1[param_4] is the entry handled here. |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r13, %r10, %r11, %r12; |
| cvta.to.global.u64 %rd1, %rd7; |
| mul.wide.s32 %rd8, %r5, 4; |
| add.s64 %rd9, %rd1, %rd8; |
| ld.global.u32 %r14, [%rd9]; |
| add.s32 %r1, %r13, %r14; |
| // Leave if the entry is at or beyond param_1[param_5 + 1]. |
| mul.wide.s32 %rd10, %r9, 4; |
| add.s64 %rd11, %rd1, %rd10; |
| ld.global.u32 %r15, [%rd11+4]; |
| setp.ge.s32 %p1, %r1, %r15; |
| @%p1 bra BB28_5; |
| |
| // %r2 = param_2[entry]; leave if it is outside [param_6, param_7]. |
| cvta.to.global.u64 %rd2, %rd6; |
| cvta.to.global.u64 %rd3, %rd4; |
| cvta.to.global.u64 %rd12, %rd5; |
| mul.wide.s32 %rd13, %r1, 4; |
| add.s64 %rd14, %rd12, %rd13; |
| ld.global.u32 %r2, [%rd14]; |
| setp.lt.s32 %p2, %r2, %r6; |
| setp.gt.s32 %p3, %r2, %r7; |
| or.pred %p4, %p2, %p3; |
| @%p4 bra BB28_5; |
| |
| mov.u32 %r21, %r5; |
| |
| // Linear search starting at row param_4: advance while param_1[row + 1] <= entry. |
| // On exit %r3 is the row whose range contains the entry. |
| BB28_3: |
| mov.u32 %r3, %r21; |
| add.s32 %r21, %r3, 1; |
| mul.wide.s32 %rd15, %r21, 4; |
| add.s64 %rd16, %rd1, %rd15; |
| ld.global.u32 %r16, [%rd16]; |
| setp.le.s32 %p5, %r16, %r1; |
| @%p5 bra BB28_3; |
| |
| // output[(%r3 - param_4) * param_8 - param_6 + %r2] = param_0[entry]. |
| mul.wide.s32 %rd17, %r1, 8; |
| add.s64 %rd18, %rd3, %rd17; |
| ld.global.f64 %fd1, [%rd18]; |
| sub.s32 %r17, %r3, %r5; |
| mul.lo.s32 %r18, %r17, %r8; |
| sub.s32 %r19, %r18, %r6; |
| add.s32 %r20, %r19, %r2; |
| mul.wide.s32 %rd19, %r20, 8; |
| add.s64 %rd20, %rd2, %rd19; |
| st.global.f64 [%rd20], %fd1; |
| |
| BB28_5: |
| ret; |
| } |
| |
| // .globl slice_sparse_dense_nnz_f |
| // slice_sparse_dense_nnz_f: one thread per stored entry. Entry i is |
| // tid + param_1[param_4]; the thread returns unless i < param_1[param_5 + 1] |
| // and param_2[i] lies in [param_6, param_7]. It then searches param_1 for the |
| // row containing i and copies the f32 value param_0[i] into the output. |
| // param_0 : f32 value array |
| // param_1 : u32 array of per-row start offsets (presumably CSR row pointers - confirm) |
| // param_2 : u32 array of per-entry indices (presumably CSR column indices - confirm) |
| // param_3 : f32 output array |
| // param_4 : first row of the slice |
| // param_5 : last row of the slice (param_1[param_5 + 1] bounds the entries) |
| // param_6 : lowest accepted index, also subtracted in the output index |
| // param_7 : highest accepted index (inclusive) |
| // param_8 : output row stride |
| // Output index = (row - param_4) * param_8 - param_6 + param_2[i]. |
| .visible .entry slice_sparse_dense_nnz_f( |
| .param .u64 slice_sparse_dense_nnz_f_param_0, |
| .param .u64 slice_sparse_dense_nnz_f_param_1, |
| .param .u64 slice_sparse_dense_nnz_f_param_2, |
| .param .u64 slice_sparse_dense_nnz_f_param_3, |
| .param .u32 slice_sparse_dense_nnz_f_param_4, |
| .param .u32 slice_sparse_dense_nnz_f_param_5, |
| .param .u32 slice_sparse_dense_nnz_f_param_6, |
| .param .u32 slice_sparse_dense_nnz_f_param_7, |
| .param .u32 slice_sparse_dense_nnz_f_param_8 |
| ) |
| { |
| .reg .pred %p<6>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<22>; |
| .reg .b64 %rd<21>; |
| |
| |
| ld.param.u64 %rd4, [slice_sparse_dense_nnz_f_param_0]; |
| ld.param.u64 %rd7, [slice_sparse_dense_nnz_f_param_1]; |
| ld.param.u64 %rd5, [slice_sparse_dense_nnz_f_param_2]; |
| ld.param.u64 %rd6, [slice_sparse_dense_nnz_f_param_3]; |
| ld.param.u32 %r5, [slice_sparse_dense_nnz_f_param_4]; |
| ld.param.u32 %r9, [slice_sparse_dense_nnz_f_param_5]; |
| ld.param.u32 %r6, [slice_sparse_dense_nnz_f_param_6]; |
| ld.param.u32 %r7, [slice_sparse_dense_nnz_f_param_7]; |
| ld.param.u32 %r8, [slice_sparse_dense_nnz_f_param_8]; |
| // %r13 = global thread index; %r1 = %r13 + param_1[param_4] is the entry handled here. |
| mov.u32 %r10, %ntid.x; |
| mov.u32 %r11, %ctaid.x; |
| mov.u32 %r12, %tid.x; |
| mad.lo.s32 %r13, %r10, %r11, %r12; |
| cvta.to.global.u64 %rd1, %rd7; |
| mul.wide.s32 %rd8, %r5, 4; |
| add.s64 %rd9, %rd1, %rd8; |
| ld.global.u32 %r14, [%rd9]; |
| add.s32 %r1, %r13, %r14; |
| // Leave if the entry is at or beyond param_1[param_5 + 1]. |
| mul.wide.s32 %rd10, %r9, 4; |
| add.s64 %rd11, %rd1, %rd10; |
| ld.global.u32 %r15, [%rd11+4]; |
| setp.ge.s32 %p1, %r1, %r15; |
| @%p1 bra BB29_5; |
| |
| // %r2 = param_2[entry]; leave if it is outside [param_6, param_7]. |
| // %rd13 (entry * 4) is reused further down as the byte offset into the value array. |
| cvta.to.global.u64 %rd2, %rd6; |
| cvta.to.global.u64 %rd3, %rd4; |
| cvta.to.global.u64 %rd12, %rd5; |
| mul.wide.s32 %rd13, %r1, 4; |
| add.s64 %rd14, %rd12, %rd13; |
| ld.global.u32 %r2, [%rd14]; |
| setp.lt.s32 %p2, %r2, %r6; |
| setp.gt.s32 %p3, %r2, %r7; |
| or.pred %p4, %p2, %p3; |
| @%p4 bra BB29_5; |
| |
| mov.u32 %r21, %r5; |
| |
| // Linear search starting at row param_4: advance while param_1[row + 1] <= entry. |
| // On exit %r3 is the row whose range contains the entry. |
| BB29_3: |
| mov.u32 %r3, %r21; |
| add.s32 %r21, %r3, 1; |
| mul.wide.s32 %rd15, %r21, 4; |
| add.s64 %rd16, %rd1, %rd15; |
| ld.global.u32 %r16, [%rd16]; |
| setp.le.s32 %p5, %r16, %r1; |
| @%p5 bra BB29_3; |
| |
| // output[(%r3 - param_4) * param_8 - param_6 + %r2] = param_0[entry]. |
| add.s64 %rd18, %rd3, %rd13; |
| ld.global.f32 %f1, [%rd18]; |
| sub.s32 %r17, %r3, %r5; |
| mul.lo.s32 %r18, %r17, %r8; |
| sub.s32 %r19, %r18, %r6; |
| add.s32 %r20, %r19, %r2; |
| mul.wide.s32 %rd19, %r20, 4; |
| add.s64 %rd20, %rd2, %rd19; |
| st.global.f32 [%rd20], %f1; |
| |
| BB29_5: |
| ret; |
| } |
| |
| // .globl slice_dense_dense_d |
| // slice_dense_dense_d: each thread copies one f64 from a rectangular window of |
| // the input into a densely packed output. |
| // param_0 : input pointer (read as f64) |
| // param_1 : output pointer (written as f64) |
| // param_2 : row offset added to the output row to get the input row |
| // param_4 : column offset added to the output column to get the input column |
| // param_6 : input row stride (multiplied by the input row) |
| // param_7 : number of output rows; threads with row >= param_7 return |
| // param_8 : number of output columns; divisor that splits tid into (row, column) |
| // param_3 and param_5 are part of the signature but are never loaded. |
| // output[tid] = input[(tid / param_8 + param_2) * param_6 + tid % param_8 + param_4]. |
| .visible .entry slice_dense_dense_d( |
| .param .u64 slice_dense_dense_d_param_0, |
| .param .u64 slice_dense_dense_d_param_1, |
| .param .u32 slice_dense_dense_d_param_2, |
| .param .u32 slice_dense_dense_d_param_3, |
| .param .u32 slice_dense_dense_d_param_4, |
| .param .u32 slice_dense_dense_d_param_5, |
| .param .u32 slice_dense_dense_d_param_6, |
| .param .u32 slice_dense_dense_d_param_7, |
| .param .u32 slice_dense_dense_d_param_8 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<15>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [slice_dense_dense_d_param_0]; |
| ld.param.u64 %rd2, [slice_dense_dense_d_param_1]; |
| ld.param.u32 %r3, [slice_dense_dense_d_param_2]; |
| ld.param.u32 %r4, [slice_dense_dense_d_param_4]; |
| ld.param.u32 %r5, [slice_dense_dense_d_param_6]; |
| ld.param.u32 %r7, [slice_dense_dense_d_param_7]; |
| ld.param.u32 %r6, [slice_dense_dense_d_param_8]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r2 = %r1 / param_8 = row. |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r1, %r9, %r8, %r10; |
| div.s32 %r2, %r1, %r6; |
| setp.ge.s32 %p1, %r2, %r7; |
| // Reject param_8 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_8 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r6, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB30_2; |
| |
| // %r14 = (row + param_2) * param_6 + (tid % param_8 + param_4) is the source index. |
| cvta.to.global.u64 %rd3, %rd1; |
| add.s32 %r11, %r2, %r3; |
| rem.s32 %r12, %r1, %r6; |
| add.s32 %r13, %r12, %r4; |
| mad.lo.s32 %r14, %r11, %r5, %r13; |
| mul.wide.s32 %rd4, %r14, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| // output[tid] = loaded value. |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.s32 %rd7, %r1, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd1; |
| |
| BB30_2: |
| ret; |
| } |
| |
| // .globl slice_dense_dense_f |
| // slice_dense_dense_f: each thread copies one f32 from a rectangular window of |
| // the input into a densely packed output. |
| // param_0 : input pointer (read as f32) |
| // param_1 : output pointer (written as f32) |
| // param_2 : row offset added to the output row to get the input row |
| // param_4 : column offset added to the output column to get the input column |
| // param_6 : input row stride (multiplied by the input row) |
| // param_7 : number of output rows; threads with row >= param_7 return |
| // param_8 : number of output columns; divisor that splits tid into (row, column) |
| // param_3 and param_5 are part of the signature but are never loaded. |
| // output[tid] = input[(tid / param_8 + param_2) * param_6 + tid % param_8 + param_4]. |
| .visible .entry slice_dense_dense_f( |
| .param .u64 slice_dense_dense_f_param_0, |
| .param .u64 slice_dense_dense_f_param_1, |
| .param .u32 slice_dense_dense_f_param_2, |
| .param .u32 slice_dense_dense_f_param_3, |
| .param .u32 slice_dense_dense_f_param_4, |
| .param .u32 slice_dense_dense_f_param_5, |
| .param .u32 slice_dense_dense_f_param_6, |
| .param .u32 slice_dense_dense_f_param_7, |
| .param .u32 slice_dense_dense_f_param_8 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<15>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [slice_dense_dense_f_param_0]; |
| ld.param.u64 %rd2, [slice_dense_dense_f_param_1]; |
| ld.param.u32 %r3, [slice_dense_dense_f_param_2]; |
| ld.param.u32 %r4, [slice_dense_dense_f_param_4]; |
| ld.param.u32 %r5, [slice_dense_dense_f_param_6]; |
| ld.param.u32 %r7, [slice_dense_dense_f_param_7]; |
| ld.param.u32 %r6, [slice_dense_dense_f_param_8]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r2 = %r1 / param_8 = row. |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %tid.x; |
| mad.lo.s32 %r1, %r9, %r8, %r10; |
| div.s32 %r2, %r1, %r6; |
| setp.ge.s32 %p1, %r2, %r7; |
| // Reject param_8 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_8 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r6, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB31_2; |
| |
| // %r14 = (row + param_2) * param_6 + (tid % param_8 + param_4) is the source index. |
| cvta.to.global.u64 %rd3, %rd1; |
| add.s32 %r11, %r2, %r3; |
| rem.s32 %r12, %r1, %r6; |
| add.s32 %r13, %r12, %r4; |
| mad.lo.s32 %r14, %r11, %r5, %r13; |
| mul.wide.s32 %rd4, %r14, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| // output[tid] = loaded value. |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.s32 %rd7, %r1, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f1; |
| |
| BB31_2: |
| ret; |
| } |
| |
| // .globl copy_u2l_dense_d |
| // copy_u2l_dense_d: in-place copy inside one f64 array. With q = tid / param_1 |
| // and r = tid % param_1, a thread with r > q copies element tid (q * param_1 + r) |
| // to the transposed position r * param_1 + q, provided that position is < param_2. |
| // param_0 : array pointer (read and written as f64) |
| // param_1 : divisor and stride used to split / recombine the index |
| // param_2 : exclusive upper bound on the destination index |
| // NOTE(review): param_1 is used as a divisor without a zero check. |
| .visible .entry copy_u2l_dense_d( |
| .param .u64 copy_u2l_dense_d_param_0, |
| .param .u32 copy_u2l_dense_d_param_1, |
| .param .u32 copy_u2l_dense_d_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<10>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<7>; |
| |
| |
| ld.param.u64 %rd1, [copy_u2l_dense_d_param_0]; |
| ld.param.u32 %r3, [copy_u2l_dense_d_param_1]; |
| ld.param.u32 %r4, [copy_u2l_dense_d_param_2]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index). |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %ctaid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r5, %r6, %r7; |
| // %r8 = q, %r9 = r, %r2 = r * param_1 + q (destination index). |
| div.s32 %r8, %r1, %r3; |
| rem.s32 %r9, %r1, %r3; |
| mad.lo.s32 %r2, %r9, %r3, %r8; |
| // Leave when r <= q or when the destination index is >= param_2. |
| setp.le.s32 %p1, %r9, %r8; |
| setp.ge.s32 %p2, %r2, %r4; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB32_2; |
| |
| // array[%r2] = array[%r1]. |
| cvta.to.global.u64 %rd2, %rd1; |
| mul.wide.s32 %rd3, %r1, 8; |
| add.s64 %rd4, %rd2, %rd3; |
| ld.global.f64 %fd1, [%rd4]; |
| mul.wide.s32 %rd5, %r2, 8; |
| add.s64 %rd6, %rd2, %rd5; |
| st.global.f64 [%rd6], %fd1; |
| |
| BB32_2: |
| ret; |
| } |
| |
| // .globl copy_u2l_dense_f |
| // copy_u2l_dense_f: in-place copy inside one f32 array. With q = tid / param_1 |
| // and r = tid % param_1, a thread with r > q copies element tid (q * param_1 + r) |
| // to the transposed position r * param_1 + q, provided that position is < param_2. |
| // param_0 : array pointer (read and written as f32) |
| // param_1 : divisor and stride used to split / recombine the index |
| // param_2 : exclusive upper bound on the destination index |
| // NOTE(review): param_1 is used as a divisor without a zero check. |
| .visible .entry copy_u2l_dense_f( |
| .param .u64 copy_u2l_dense_f_param_0, |
| .param .u32 copy_u2l_dense_f_param_1, |
| .param .u32 copy_u2l_dense_f_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<10>; |
| .reg .b64 %rd<7>; |
| |
| |
| ld.param.u64 %rd1, [copy_u2l_dense_f_param_0]; |
| ld.param.u32 %r3, [copy_u2l_dense_f_param_1]; |
| ld.param.u32 %r4, [copy_u2l_dense_f_param_2]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index). |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %ctaid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r5, %r6, %r7; |
| // %r8 = q, %r9 = r, %r2 = r * param_1 + q (destination index). |
| div.s32 %r8, %r1, %r3; |
| rem.s32 %r9, %r1, %r3; |
| mad.lo.s32 %r2, %r9, %r3, %r8; |
| // Leave when r <= q or when the destination index is >= param_2. |
| setp.le.s32 %p1, %r9, %r8; |
| setp.ge.s32 %p2, %r2, %r4; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB33_2; |
| |
| // array[%r2] = array[%r1]. |
| cvta.to.global.u64 %rd2, %rd1; |
| mul.wide.s32 %rd3, %r1, 4; |
| add.s64 %rd4, %rd2, %rd3; |
| ld.global.f32 %f1, [%rd4]; |
| mul.wide.s32 %rd5, %r2, 4; |
| add.s64 %rd6, %rd2, %rd5; |
| st.global.f32 [%rd6], %f1; |
| |
| BB33_2: |
| ret; |
| } |
| |
| // .globl relu_d |
| // relu_d: element-wise output[tid] = max(0.0, input[tid]) on f64 data. |
| // param_0 : input pointer (read as f64) |
| // param_1 : output pointer (written as f64) |
| // param_2 : number of rows; threads with tid / param_3 >= param_2 return |
| // param_3 : number of columns (divisor that turns tid into a row index) |
| .visible .entry relu_d( |
| .param .u64 relu_d_param_0, |
| .param .u64 relu_d_param_1, |
| .param .u32 relu_d_param_2, |
| .param .u32 relu_d_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<8>; |
| .reg .f64 %fd<4>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [relu_d_param_0]; |
| ld.param.u64 %rd2, [relu_d_param_1]; |
| ld.param.u32 %r2, [relu_d_param_2]; |
| ld.param.u32 %r3, [relu_d_param_3]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r7 = %r1 / param_3 = row. |
| mov.u32 %r4, %ctaid.x; |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r5, %r4, %r6; |
| div.s32 %r7, %r1, %r3; |
| setp.ge.s32 %p1, %r7, %r2; |
| // Reject param_3 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_3 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r3, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB34_2; |
| |
| // %rd4 = tid * 8 is the shared byte offset for the load and the store. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| mov.f64 %fd2, 0d0000000000000000; |
| max.f64 %fd3, %fd2, %fd1; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f64 [%rd7], %fd3; |
| |
| BB34_2: |
| ret; |
| } |
| |
| // .globl relu_f |
| // relu_f: element-wise output[tid] = max(0.0f, input[tid]) on f32 data. |
| // param_0 : input pointer (read as f32) |
| // param_1 : output pointer (written as f32) |
| // param_2 : number of rows; threads with tid / param_3 >= param_2 return |
| // param_3 : number of columns (divisor that turns tid into a row index) |
| // The maximum is taken directly in f32. Widening to f64, taking max.f64 and |
| // rounding back gives the same f32 result for every input (f32 -> f64 is exact |
| // and the maximum of two f32-representable values is f32-representable), so the |
| // double-precision round trip is not needed. |
| .visible .entry relu_f( |
| .param .u64 relu_f_param_0, |
| .param .u64 relu_f_param_1, |
| .param .u32 relu_f_param_2, |
| .param .u32 relu_f_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<4>; |
| .reg .b32 %r<8>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [relu_f_param_0]; |
| ld.param.u64 %rd2, [relu_f_param_1]; |
| ld.param.u32 %r2, [relu_f_param_2]; |
| ld.param.u32 %r3, [relu_f_param_3]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r7 = %r1 / param_3 = row. |
| mov.u32 %r4, %ctaid.x; |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r5, %r4, %r6; |
| div.s32 %r7, %r1, %r3; |
| setp.ge.s32 %p1, %r7, %r2; |
| // Reject param_3 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_3 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r3, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB35_2; |
| |
| // %rd4 = tid * 4 is the shared byte offset for the load and the store. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| // No .ftz modifier: subnormal inputs pass through unchanged. |
| mov.f32 %f2, 0f00000000; |
| max.f32 %f3, %f2, %f1; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f32 [%rd7], %f3; |
| |
| BB35_2: |
| ret; |
| } |
| |
| // .globl relu_backward_d |
| // relu_backward_d: element-wise on f64 data, |
| // param_2[tid] = (param_0[tid] > 0.0) ? param_1[tid] : 0.0 |
| // A NaN in param_0 also produces 0.0, because the comparison is |
| // "less-or-equal or unordered". |
| // param_0 : pointer to the values tested against zero |
| // param_1 : pointer to the values passed through when the test succeeds |
| // param_2 : output pointer |
| // param_3 : number of rows; threads with tid / param_4 >= param_3 return |
| // param_4 : number of columns (divisor that turns tid into a row index) |
| .visible .entry relu_backward_d( |
| .param .u64 relu_backward_d_param_0, |
| .param .u64 relu_backward_d_param_1, |
| .param .u64 relu_backward_d_param_2, |
| .param .u32 relu_backward_d_param_3, |
| .param .u32 relu_backward_d_param_4 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .b32 %r<8>; |
| .reg .f64 %fd<6>; |
| .reg .b64 %rd<13>; |
| |
| |
| ld.param.u64 %rd1, [relu_backward_d_param_0]; |
| ld.param.u64 %rd2, [relu_backward_d_param_1]; |
| ld.param.u64 %rd3, [relu_backward_d_param_2]; |
| ld.param.u32 %r2, [relu_backward_d_param_3]; |
| ld.param.u32 %r3, [relu_backward_d_param_4]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r7 = %r1 / param_4 = row. |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r4, %r5, %r6; |
| div.s32 %r7, %r1, %r3; |
| setp.ge.s32 %p1, %r7, %r2; |
| // Reject param_4 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_4 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r3, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB36_4; |
| |
| // %fd4 = param_0[tid]; %fd5 defaults to 0.0 and is only overwritten when %fd4 > 0. |
| cvta.to.global.u64 %rd4, %rd1; |
| mul.wide.s32 %rd5, %r1, 8; |
| add.s64 %rd6, %rd4, %rd5; |
| ld.global.f64 %fd4, [%rd6]; |
| mov.f64 %fd5, 0d0000000000000000; |
| setp.leu.f64 %p4, %fd4, 0d0000000000000000; |
| @%p4 bra BB36_3; |
| |
| // param_1 is only read on this path. |
| cvta.to.global.u64 %rd7, %rd2; |
| add.s64 %rd9, %rd7, %rd5; |
| ld.global.f64 %fd5, [%rd9]; |
| |
| BB36_3: |
| cvta.to.global.u64 %rd10, %rd3; |
| add.s64 %rd12, %rd10, %rd5; |
| st.global.f64 [%rd12], %fd5; |
| |
| BB36_4: |
| ret; |
| } |
| |
| // .globl relu_backward_f |
| // relu_backward_f: element-wise on f32 data, |
| // param_2[tid] = (param_0[tid] > 0.0f) ? param_1[tid] : 0.0f |
| // A NaN in param_0 also produces 0.0f, because the comparison is |
| // "less-or-equal or unordered". |
| // param_0 : pointer to the values tested against zero |
| // param_1 : pointer to the values passed through when the test succeeds |
| // param_2 : output pointer |
| // param_3 : number of rows; threads with tid / param_4 >= param_3 return |
| // param_4 : number of columns (divisor that turns tid into a row index) |
| .visible .entry relu_backward_f( |
| .param .u64 relu_backward_f_param_0, |
| .param .u64 relu_backward_f_param_1, |
| .param .u64 relu_backward_f_param_2, |
| .param .u32 relu_backward_f_param_3, |
| .param .u32 relu_backward_f_param_4 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<6>; |
| .reg .b32 %r<8>; |
| .reg .b64 %rd<13>; |
| |
| |
| ld.param.u64 %rd1, [relu_backward_f_param_0]; |
| ld.param.u64 %rd2, [relu_backward_f_param_1]; |
| ld.param.u64 %rd3, [relu_backward_f_param_2]; |
| ld.param.u32 %r2, [relu_backward_f_param_3]; |
| ld.param.u32 %r3, [relu_backward_f_param_4]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r7 = %r1 / param_4 = row. |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r4, %r5, %r6; |
| div.s32 %r7, %r1, %r3; |
| setp.ge.s32 %p1, %r7, %r2; |
| // Reject param_4 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_4 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r3, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB37_4; |
| |
| // %f4 = param_0[tid]; %f5 defaults to 0.0f and is only overwritten when %f4 > 0. |
| cvta.to.global.u64 %rd4, %rd1; |
| mul.wide.s32 %rd5, %r1, 4; |
| add.s64 %rd6, %rd4, %rd5; |
| ld.global.f32 %f4, [%rd6]; |
| mov.f32 %f5, 0f00000000; |
| setp.leu.f32 %p4, %f4, 0f00000000; |
| @%p4 bra BB37_3; |
| |
| // param_1 is only read on this path. |
| cvta.to.global.u64 %rd7, %rd2; |
| add.s64 %rd9, %rd7, %rd5; |
| ld.global.f32 %f5, [%rd9]; |
| |
| BB37_3: |
| cvta.to.global.u64 %rd10, %rd3; |
| add.s64 %rd12, %rd10, %rd5; |
| st.global.f32 [%rd12], %f5; |
| |
| BB37_4: |
| ret; |
| } |
| |
| // .globl inplace_add_d |
| // inplace_add_d: element-wise param_1[tid] = param_0[tid] + param_1[tid] on f64 data. |
| // param_0 : pointer to the addend (read only) |
| // param_1 : pointer to the accumulator (read, then overwritten with the sum) |
| // param_2 : number of rows; threads with tid / param_3 >= param_2 return |
| // param_3 : number of columns (divisor that turns tid into a row index) |
| .visible .entry inplace_add_d( |
| .param .u64 inplace_add_d_param_0, |
| .param .u64 inplace_add_d_param_1, |
| .param .u32 inplace_add_d_param_2, |
| .param .u32 inplace_add_d_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<8>; |
| .reg .f64 %fd<4>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [inplace_add_d_param_0]; |
| ld.param.u64 %rd2, [inplace_add_d_param_1]; |
| ld.param.u32 %r2, [inplace_add_d_param_2]; |
| ld.param.u32 %r3, [inplace_add_d_param_3]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r7 = %r1 / param_3 = row. |
| mov.u32 %r4, %ctaid.x; |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r5, %r4, %r6; |
| div.s32 %r7, %r1, %r3; |
| setp.ge.s32 %p1, %r7, %r2; |
| // Reject param_3 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_3 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r3, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB38_2; |
| |
| // %rd5 = &param_0[tid], %rd7 = &param_1[tid]; the sum is stored back through %rd7. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| ld.global.f64 %fd1, [%rd7]; |
| ld.global.f64 %fd2, [%rd5]; |
| add.f64 %fd3, %fd2, %fd1; |
| st.global.f64 [%rd7], %fd3; |
| |
| BB38_2: |
| ret; |
| } |
| |
| // .globl inplace_add_f |
| // inplace_add_f: element-wise param_1[tid] = param_0[tid] + param_1[tid] on f32 data. |
| // param_0 : pointer to the addend (read only) |
| // param_1 : pointer to the accumulator (read, then overwritten with the sum) |
| // param_2 : number of rows; threads with tid / param_3 >= param_2 return |
| // param_3 : number of columns (divisor that turns tid into a row index) |
| .visible .entry inplace_add_f( |
| .param .u64 inplace_add_f_param_0, |
| .param .u64 inplace_add_f_param_1, |
| .param .u32 inplace_add_f_param_2, |
| .param .u32 inplace_add_f_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<4>; |
| .reg .b32 %r<8>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [inplace_add_f_param_0]; |
| ld.param.u64 %rd2, [inplace_add_f_param_1]; |
| ld.param.u32 %r2, [inplace_add_f_param_2]; |
| ld.param.u32 %r3, [inplace_add_f_param_3]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r7 = %r1 / param_3 = row. |
| mov.u32 %r4, %ctaid.x; |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r5, %r4, %r6; |
| div.s32 %r7, %r1, %r3; |
| setp.ge.s32 %p1, %r7, %r2; |
| // Reject param_3 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_3 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r3, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB39_2; |
| |
| // %rd5 = &param_0[tid], %rd7 = &param_1[tid]; the sum is stored back through %rd7. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| ld.global.f32 %f1, [%rd7]; |
| ld.global.f32 %f2, [%rd5]; |
| add.f32 %f3, %f2, %f1; |
| st.global.f32 [%rd7], %f3; |
| |
| BB39_2: |
| ret; |
| } |
| |
| // .globl bias_add_d |
| // bias_add_d: element-wise on f64 data, |
| // param_2[tid] = param_0[tid] + param_1[(tid % param_4) / param_5] |
| // param_0 : input pointer |
| // param_1 : bias pointer, indexed by (column / param_5) |
| // param_2 : output pointer |
| // param_3 : number of rows; threads with tid / param_4 >= param_3 return |
| // param_4 : number of columns (divisor that splits tid into row and column) |
| // param_5 : number of consecutive columns that share one bias element |
| // NOTE(review): param_5 is also a divisor and is not checked for zero here. |
| .visible .entry bias_add_d( |
| .param .u64 bias_add_d_param_0, |
| .param .u64 bias_add_d_param_1, |
| .param .u64 bias_add_d_param_2, |
| .param .u32 bias_add_d_param_3, |
| .param .u32 bias_add_d_param_4, |
| .param .u32 bias_add_d_param_5 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<11>; |
| .reg .f64 %fd<4>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [bias_add_d_param_0]; |
| ld.param.u64 %rd2, [bias_add_d_param_1]; |
| ld.param.u64 %rd3, [bias_add_d_param_2]; |
| ld.param.u32 %r4, [bias_add_d_param_3]; |
| ld.param.u32 %r2, [bias_add_d_param_4]; |
| ld.param.u32 %r3, [bias_add_d_param_5]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r8 = %r1 / param_4 = row. |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r6, %r5, %r7; |
| div.s32 %r8, %r1, %r2; |
| setp.ge.s32 %p1, %r8, %r4; |
| // Reject param_4 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_4 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r2, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB40_2; |
| |
| // %rd5 = tid * 8 is the shared byte offset into input and output. |
| cvta.to.global.u64 %rd4, %rd1; |
| mul.wide.s32 %rd5, %r1, 8; |
| add.s64 %rd6, %rd4, %rd5; |
| // %r10 = (tid % param_4) / param_5 selects the bias element. |
| rem.s32 %r9, %r1, %r2; |
| div.s32 %r10, %r9, %r3; |
| cvta.to.global.u64 %rd7, %rd2; |
| mul.wide.s32 %rd8, %r10, 8; |
| add.s64 %rd9, %rd7, %rd8; |
| ld.global.f64 %fd1, [%rd9]; |
| ld.global.f64 %fd2, [%rd6]; |
| add.f64 %fd3, %fd2, %fd1; |
| cvta.to.global.u64 %rd10, %rd3; |
| add.s64 %rd11, %rd10, %rd5; |
| st.global.f64 [%rd11], %fd3; |
| |
| BB40_2: |
| ret; |
| } |
| |
| // .globl bias_add_f |
| // bias_add_f: element-wise on f32 data, |
| // param_2[tid] = param_0[tid] + param_1[(tid % param_4) / param_5] |
| // param_0 : input pointer |
| // param_1 : bias pointer, indexed by (column / param_5) |
| // param_2 : output pointer |
| // param_3 : number of rows; threads with tid / param_4 >= param_3 return |
| // param_4 : number of columns (divisor that splits tid into row and column) |
| // param_5 : number of consecutive columns that share one bias element |
| // NOTE(review): param_5 is also a divisor and is not checked for zero here. |
| .visible .entry bias_add_f( |
| .param .u64 bias_add_f_param_0, |
| .param .u64 bias_add_f_param_1, |
| .param .u64 bias_add_f_param_2, |
| .param .u32 bias_add_f_param_3, |
| .param .u32 bias_add_f_param_4, |
| .param .u32 bias_add_f_param_5 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<4>; |
| .reg .b32 %r<11>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [bias_add_f_param_0]; |
| ld.param.u64 %rd2, [bias_add_f_param_1]; |
| ld.param.u64 %rd3, [bias_add_f_param_2]; |
| ld.param.u32 %r4, [bias_add_f_param_3]; |
| ld.param.u32 %r2, [bias_add_f_param_4]; |
| ld.param.u32 %r3, [bias_add_f_param_5]; |
| // %r1 = ntid.x * ctaid.x + tid.x (global thread index); %r8 = %r1 / param_4 = row. |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r6, %r5, %r7; |
| div.s32 %r8, %r1, %r2; |
| setp.ge.s32 %p1, %r8, %r4; |
| // Reject param_4 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_4 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r2, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB41_2; |
| |
| // %rd5 = tid * 4 is the shared byte offset into input and output. |
| cvta.to.global.u64 %rd4, %rd1; |
| mul.wide.s32 %rd5, %r1, 4; |
| add.s64 %rd6, %rd4, %rd5; |
| // %r10 = (tid % param_4) / param_5 selects the bias element. |
| rem.s32 %r9, %r1, %r2; |
| div.s32 %r10, %r9, %r3; |
| cvta.to.global.u64 %rd7, %rd2; |
| mul.wide.s32 %rd8, %r10, 4; |
| add.s64 %rd9, %rd7, %rd8; |
| ld.global.f32 %f1, [%rd9]; |
| ld.global.f32 %f2, [%rd6]; |
| add.f32 %f3, %f2, %f1; |
| cvta.to.global.u64 %rd10, %rd3; |
| add.s64 %rd11, %rd10, %rd5; |
| st.global.f32 [%rd11], %f3; |
| |
| BB41_2: |
| ret; |
| } |
| |
| // .globl daxpy_matrix_vector_d |
| // daxpy_matrix_vector_d: element-wise on f64 data, with row = tid / param_5 and |
| // col = tid % param_5, |
| // param_3[row * param_5 + col] = param_1[k] * param_2 + param_0[row * param_5 + col] |
| // where k = col when param_6 == 1 and k = row otherwise. The multiply-add is a |
| // single fused fma.rn.f64. |
| // param_0 : matrix input pointer |
| // param_1 : vector pointer |
| // param_2 : f64 scale factor applied to the vector element |
| // param_3 : output pointer |
| // param_4 : number of rows; threads with row >= param_4 return |
| // param_5 : number of columns (divisor that splits tid into row and column) |
| // param_6 : selects the vector index (1 -> column, anything else -> row) |
| // param_7 : part of the signature but never loaded |
| .visible .entry daxpy_matrix_vector_d( |
| .param .u64 daxpy_matrix_vector_d_param_0, |
| .param .u64 daxpy_matrix_vector_d_param_1, |
| .param .f64 daxpy_matrix_vector_d_param_2, |
| .param .u64 daxpy_matrix_vector_d_param_3, |
| .param .u32 daxpy_matrix_vector_d_param_4, |
| .param .u32 daxpy_matrix_vector_d_param_5, |
| .param .u32 daxpy_matrix_vector_d_param_6, |
| .param .u32 daxpy_matrix_vector_d_param_7 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .b32 %r<11>; |
| .reg .f64 %fd<7>; |
| .reg .b64 %rd<14>; |
| |
| |
| ld.param.u64 %rd3, [daxpy_matrix_vector_d_param_0]; |
| ld.param.u64 %rd5, [daxpy_matrix_vector_d_param_1]; |
| ld.param.f64 %fd2, [daxpy_matrix_vector_d_param_2]; |
| ld.param.u64 %rd4, [daxpy_matrix_vector_d_param_3]; |
| ld.param.u32 %r5, [daxpy_matrix_vector_d_param_4]; |
| ld.param.u32 %r3, [daxpy_matrix_vector_d_param_5]; |
| ld.param.u32 %r4, [daxpy_matrix_vector_d_param_6]; |
| cvta.to.global.u64 %rd1, %rd5; |
| // %r9 = ntid.x * ctaid.x + tid.x (global thread index); %r1 = row, %r2 = column. |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %ctaid.x; |
| mov.u32 %r8, %tid.x; |
| mad.lo.s32 %r9, %r6, %r7, %r8; |
| div.s32 %r1, %r9, %r3; |
| rem.s32 %r2, %r9, %r3; |
| setp.ge.s32 %p1, %r1, %r5; |
| // Reject param_5 < 1, not just negatives: div.s32 by zero yields an unspecified |
| // quotient, so with param_5 == 0 the row test above could pass by accident. |
| setp.lt.s32 %p2, %r3, 1; |
| or.pred %p3, %p1, %p2; |
| @%p3 bra BB42_4; |
| |
| // %r10 = row * param_5 + col; %fd1 = param_0[%r10]; %rd2 = &param_3[%r10]. |
| cvta.to.global.u64 %rd6, %rd4; |
| mad.lo.s32 %r10, %r1, %r3, %r2; |
| cvta.to.global.u64 %rd7, %rd3; |
| mul.wide.s32 %rd8, %r10, 8; |
| add.s64 %rd9, %rd7, %rd8; |
| ld.global.f64 %fd1, [%rd9]; |
| add.s64 %rd2, %rd6, %rd8; |
| setp.eq.s32 %p4, %r4, 1; |
| @%p4 bra BB42_3; |
| bra.uni BB42_2; |
| |
| // param_6 == 1: the vector is indexed by column. |
| BB42_3: |
| mul.wide.s32 %rd12, %r2, 8; |
| add.s64 %rd13, %rd1, %rd12; |
| ld.global.f64 %fd5, [%rd13]; |
| fma.rn.f64 %fd6, %fd5, %fd2, %fd1; |
| st.global.f64 [%rd2], %fd6; |
| bra.uni BB42_4; |
| |
| // param_6 != 1: the vector is indexed by row. |
| BB42_2: |
| mul.wide.s32 %rd10, %r1, 8; |
| add.s64 %rd11, %rd1, %rd10; |
| ld.global.f64 %fd3, [%rd11]; |
| fma.rn.f64 %fd4, %fd3, %fd2, %fd1; |
| st.global.f64 [%rd2], %fd4; |
| |
| BB42_4: |
| ret; |
| } |
| |
| // .globl daxpy_matrix_vector_f |
| // |
| // daxpy_matrix_vector_f (f32 data, f64 arithmetic): out[cell] = in[cell] + scale * vec[k] |
| // gid = ntid.x * ctaid.x + tid.x, q = gid / param_5, r = gid % param_5, |
| // cell = q * param_5 + r, k = r when param_6 == 1, otherwise k = q. |
| // Both f32 operands are widened to f64, combined with one f64 fma against the |
| // f64 scale, and the result is rounded back to f32 before the store. |
| // param_0 : input buffer, param_1 : vector buffer, param_2 : scale (f64), |
| // param_3 : output buffer, param_4 : exclusive upper bound on q, |
| // param_5 : divisor (threads exit when it is negative), param_6 : vector index mode. |
| // param_7 is declared but never read. |
| .visible .entry daxpy_matrix_vector_f( |
| .param .u64 daxpy_matrix_vector_f_param_0, |
| .param .u64 daxpy_matrix_vector_f_param_1, |
| .param .f64 daxpy_matrix_vector_f_param_2, |
| .param .u64 daxpy_matrix_vector_f_param_3, |
| .param .u32 daxpy_matrix_vector_f_param_4, |
| .param .u32 daxpy_matrix_vector_f_param_5, |
| .param .u32 daxpy_matrix_vector_f_param_6, |
| .param .u32 daxpy_matrix_vector_f_param_7 |
| ) |
| { |
| .reg .pred %p_past_end, %p_neg_div, %p_skip, %p_by_rem; |
| .reg .f32 %f_in, %f_vec, %f_out; |
| .reg .b32 %r_lim, %r_div, %r_mode, %r_blk, %r_cta, %r_thr, %r_gid, %r_quot, %r_rem, %r_cell, %r_vidx; |
| .reg .f64 %fd_scale, %fd_in, %fd_vec, %fd_out; |
| .reg .b64 %rd_in, %rd_vec, %rd_out, %rd_in_g, %rd_vec_g, %rd_out_g, %rd_off, %rd_voff, %rd_src, %rd_vsrc, %rd_dst; |
| |
| |
| ld.param.u64 %rd_in, [daxpy_matrix_vector_f_param_0]; |
| ld.param.u64 %rd_vec, [daxpy_matrix_vector_f_param_1]; |
| ld.param.f64 %fd_scale, [daxpy_matrix_vector_f_param_2]; |
| ld.param.u64 %rd_out, [daxpy_matrix_vector_f_param_3]; |
| ld.param.u32 %r_lim, [daxpy_matrix_vector_f_param_4]; |
| ld.param.u32 %r_div, [daxpy_matrix_vector_f_param_5]; |
| ld.param.u32 %r_mode, [daxpy_matrix_vector_f_param_6]; |
| |
| // Linear thread index and its quotient / remainder with respect to param_5. |
| mov.u32 %r_thr, %tid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_blk, %ntid.x; |
| mad.lo.s32 %r_gid, %r_blk, %r_cta, %r_thr; |
| div.s32 %r_quot, %r_gid, %r_div; |
| rem.s32 %r_rem, %r_gid, %r_div; |
| |
| // Leave when the quotient is out of range or the divisor is negative. |
| setp.lt.s32 %p_neg_div, %r_div, 0; |
| setp.ge.s32 %p_past_end, %r_quot, %r_lim; |
| or.pred %p_skip, %p_past_end, %p_neg_div; |
| @%p_skip bra DAXPY_MV_F_EXIT; |
| |
| // Load the input element at cell = q * param_5 + r and widen it to f64. |
| mad.lo.s32 %r_cell, %r_quot, %r_div, %r_rem; |
| mul.wide.s32 %rd_off, %r_cell, 4; |
| cvta.to.global.u64 %rd_in_g, %rd_in; |
| add.s64 %rd_src, %rd_in_g, %rd_off; |
| ld.global.f32 %f_in, [%rd_src]; |
| cvt.f64.f32 %fd_in, %f_in; |
| |
| // Vector element: indexed by r when param_6 == 1, by q otherwise. |
| setp.eq.s32 %p_by_rem, %r_mode, 1; |
| selp.b32 %r_vidx, %r_rem, %r_quot, %p_by_rem; |
| cvta.to.global.u64 %rd_vec_g, %rd_vec; |
| mul.wide.s32 %rd_voff, %r_vidx, 4; |
| add.s64 %rd_vsrc, %rd_vec_g, %rd_voff; |
| ld.global.f32 %f_vec, [%rd_vsrc]; |
| cvt.f64.f32 %fd_vec, %f_vec; |
| |
| // out[cell] = (f32)(vec * scale + in), with the fma carried out in f64. |
| fma.rn.f64 %fd_out, %fd_vec, %fd_scale, %fd_in; |
| cvt.rn.f32.f64 %f_out, %fd_out; |
| cvta.to.global.u64 %rd_out_g, %rd_out; |
| add.s64 %rd_dst, %rd_out_g, %rd_off; |
| st.global.f32 [%rd_dst], %f_out; |
| |
| DAXPY_MV_F_EXIT: |
| ret; |
| } |
| |
| // .globl bias_multiply_d |
| // |
| // bias_multiply_d (f64): out[gid] = in[gid] * bias[(gid % param_4) / param_5] |
| // with gid = ntid.x * ctaid.x + tid.x. |
| // param_0 : input buffer, param_1 : bias buffer, param_2 : output buffer, |
| // param_3 : exclusive upper bound on gid / param_4, |
| // param_4 : divisor splitting gid (threads exit when it is negative), |
| // param_5 : divisor mapping the remainder to a bias index. |
| .visible .entry bias_multiply_d( |
| .param .u64 bias_multiply_d_param_0, |
| .param .u64 bias_multiply_d_param_1, |
| .param .u64 bias_multiply_d_param_2, |
| .param .u32 bias_multiply_d_param_3, |
| .param .u32 bias_multiply_d_param_4, |
| .param .u32 bias_multiply_d_param_5 |
| ) |
| { |
| .reg .pred %p_past_end, %p_neg_div, %p_skip; |
| .reg .b32 %r_lim, %r_div, %r_grp, %r_blk, %r_cta, %r_thr, %r_gid, %r_quot, %r_rem, %r_bidx; |
| .reg .f64 %fd_bias, %fd_in, %fd_prod; |
| .reg .b64 %rd_in, %rd_bias, %rd_out, %rd_in_g, %rd_bias_g, %rd_out_g, %rd_off, %rd_boff, %rd_src, %rd_bsrc, %rd_dst; |
| |
| |
| ld.param.u64 %rd_in, [bias_multiply_d_param_0]; |
| ld.param.u64 %rd_bias, [bias_multiply_d_param_1]; |
| ld.param.u64 %rd_out, [bias_multiply_d_param_2]; |
| ld.param.u32 %r_lim, [bias_multiply_d_param_3]; |
| ld.param.u32 %r_div, [bias_multiply_d_param_4]; |
| ld.param.u32 %r_grp, [bias_multiply_d_param_5]; |
| |
| // Linear thread index. |
| mov.u32 %r_thr, %tid.x; |
| mov.u32 %r_blk, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mad.lo.s32 %r_gid, %r_blk, %r_cta, %r_thr; |
| |
| // Leave when gid / param_4 is out of range or param_4 is negative. |
| div.s32 %r_quot, %r_gid, %r_div; |
| setp.lt.s32 %p_neg_div, %r_div, 0; |
| setp.ge.s32 %p_past_end, %r_quot, %r_lim; |
| or.pred %p_skip, %p_past_end, %p_neg_div; |
| @%p_skip bra BIAS_MUL_D_EXIT; |
| |
| // Bias element selected by (gid % param_4) / param_5. |
| rem.s32 %r_rem, %r_gid, %r_div; |
| div.s32 %r_bidx, %r_rem, %r_grp; |
| cvta.to.global.u64 %rd_bias_g, %rd_bias; |
| mul.wide.s32 %rd_boff, %r_bidx, 8; |
| add.s64 %rd_bsrc, %rd_bias_g, %rd_boff; |
| ld.global.f64 %fd_bias, [%rd_bsrc]; |
| |
| // Input element at gid; the same byte offset is reused for the output. |
| cvta.to.global.u64 %rd_in_g, %rd_in; |
| mul.wide.s32 %rd_off, %r_gid, 8; |
| add.s64 %rd_src, %rd_in_g, %rd_off; |
| ld.global.f64 %fd_in, [%rd_src]; |
| |
| mul.f64 %fd_prod, %fd_in, %fd_bias; |
| cvta.to.global.u64 %rd_out_g, %rd_out; |
| add.s64 %rd_dst, %rd_out_g, %rd_off; |
| st.global.f64 [%rd_dst], %fd_prod; |
| |
| BIAS_MUL_D_EXIT: |
| ret; |
| } |
| |
| // .globl bias_multiply_f |
| // |
| // bias_multiply_f (f32): out[gid] = in[gid] * bias[(gid % param_4) / param_5] |
| // with gid = ntid.x * ctaid.x + tid.x. |
| // param_0 : input buffer, param_1 : bias buffer, param_2 : output buffer, |
| // param_3 : exclusive upper bound on gid / param_4, |
| // param_4 : divisor splitting gid (threads exit when it is negative), |
| // param_5 : divisor mapping the remainder to a bias index. |
| .visible .entry bias_multiply_f( |
| .param .u64 bias_multiply_f_param_0, |
| .param .u64 bias_multiply_f_param_1, |
| .param .u64 bias_multiply_f_param_2, |
| .param .u32 bias_multiply_f_param_3, |
| .param .u32 bias_multiply_f_param_4, |
| .param .u32 bias_multiply_f_param_5 |
| ) |
| { |
| .reg .pred %p_past_end, %p_neg_div, %p_skip; |
| .reg .f32 %f_bias, %f_in, %f_prod; |
| .reg .b32 %r_lim, %r_div, %r_grp, %r_blk, %r_cta, %r_thr, %r_gid, %r_quot, %r_rem, %r_bidx; |
| .reg .b64 %rd_in, %rd_bias, %rd_out, %rd_in_g, %rd_bias_g, %rd_out_g, %rd_off, %rd_boff, %rd_src, %rd_bsrc, %rd_dst; |
| |
| |
| ld.param.u64 %rd_in, [bias_multiply_f_param_0]; |
| ld.param.u64 %rd_bias, [bias_multiply_f_param_1]; |
| ld.param.u64 %rd_out, [bias_multiply_f_param_2]; |
| ld.param.u32 %r_lim, [bias_multiply_f_param_3]; |
| ld.param.u32 %r_div, [bias_multiply_f_param_4]; |
| ld.param.u32 %r_grp, [bias_multiply_f_param_5]; |
| |
| // Linear thread index. |
| mov.u32 %r_thr, %tid.x; |
| mov.u32 %r_blk, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mad.lo.s32 %r_gid, %r_blk, %r_cta, %r_thr; |
| |
| // Leave when gid / param_4 is out of range or param_4 is negative. |
| div.s32 %r_quot, %r_gid, %r_div; |
| setp.lt.s32 %p_neg_div, %r_div, 0; |
| setp.ge.s32 %p_past_end, %r_quot, %r_lim; |
| or.pred %p_skip, %p_past_end, %p_neg_div; |
| @%p_skip bra BIAS_MUL_F_EXIT; |
| |
| // Bias element selected by (gid % param_4) / param_5. |
| rem.s32 %r_rem, %r_gid, %r_div; |
| div.s32 %r_bidx, %r_rem, %r_grp; |
| cvta.to.global.u64 %rd_bias_g, %rd_bias; |
| mul.wide.s32 %rd_boff, %r_bidx, 4; |
| add.s64 %rd_bsrc, %rd_bias_g, %rd_boff; |
| ld.global.f32 %f_bias, [%rd_bsrc]; |
| |
| // Input element at gid; the same byte offset is reused for the output. |
| cvta.to.global.u64 %rd_in_g, %rd_in; |
| mul.wide.s32 %rd_off, %r_gid, 4; |
| add.s64 %rd_src, %rd_in_g, %rd_off; |
| ld.global.f32 %f_in, [%rd_src]; |
| |
| mul.f32 %f_prod, %f_in, %f_bias; |
| cvta.to.global.u64 %rd_out_g, %rd_out; |
| add.s64 %rd_dst, %rd_out_g, %rd_off; |
| st.global.f32 [%rd_dst], %f_prod; |
| |
| BIAS_MUL_F_EXIT: |
| ret; |
| } |
| |
| // .globl matrix_matrix_cellwise_op_d |
| // |
| // matrix_matrix_cellwise_op_d: element-wise binary operation on two f64 buffers. |
| // Each thread handles gid = ntid.x * ctaid.x + tid.x, split into q = gid / param_4 |
| // and r = gid % param_4. Threads with q >= param_3 or param_4 < 0 exit immediately. |
| // param_0, param_1 : operand buffers a and b; param_2 : output buffer |
| // param_5, param_6 : index mode for a and b: 1 -> element q, 2 -> element r, |
| // anything else -> element q * param_4 + r |
| // param_7 : operation selector |
| // 0 a+b 1 a-b 2 a*b 3 a/b 4 pow(a,b) |
| // 5 a<b 6 a<=b 7 a>b 8 a>=b 9 a==b 10 a!=b (result is 1.0 or 0.0) |
| // 11 min(a,b) 12 max(a,b) |
| // 13 / 14 : a and b are rounded to integers and truncated to 32 bits; the result |
| // is 1.0 when their bitwise AND / OR is non-zero, otherwise 0.0 |
| // 15 1 - a*b 16 (a != 0) ? a - b : 0 |
| // 17 a - b*floor(a/b); NaN when b == 0; a NaN or infinite quotient is stored as is |
| // 18 floor(a/b); a NaN or infinite quotient is stored as is |
| // any other selector stores 0d7FEFFFFFFFFFFFFF (largest finite f64) |
| // The result is written to output element q * param_4 + r. |
| .visible .entry matrix_matrix_cellwise_op_d( |
| .param .u64 matrix_matrix_cellwise_op_d_param_0, |
| .param .u64 matrix_matrix_cellwise_op_d_param_1, |
| .param .u64 matrix_matrix_cellwise_op_d_param_2, |
| .param .u32 matrix_matrix_cellwise_op_d_param_3, |
| .param .u32 matrix_matrix_cellwise_op_d_param_4, |
| .param .u32 matrix_matrix_cellwise_op_d_param_5, |
| .param .u32 matrix_matrix_cellwise_op_d_param_6, |
| .param .u32 matrix_matrix_cellwise_op_d_param_7 |
| ) |
| { |
| .reg .pred %p<76>; |
| .reg .b32 %r<61>; |
| .reg .f64 %fd<51>; |
| .reg .b64 %rd<19>; |
| |
| |
| ld.param.u64 %rd2, [matrix_matrix_cellwise_op_d_param_0]; |
| ld.param.u64 %rd3, [matrix_matrix_cellwise_op_d_param_1]; |
| ld.param.u64 %rd4, [matrix_matrix_cellwise_op_d_param_2]; |
| ld.param.u32 %r14, [matrix_matrix_cellwise_op_d_param_3]; |
| ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_4]; |
| ld.param.u32 %r11, [matrix_matrix_cellwise_op_d_param_5]; |
| ld.param.u32 %r12, [matrix_matrix_cellwise_op_d_param_6]; |
| ld.param.u32 %r13, [matrix_matrix_cellwise_op_d_param_7]; |
| mov.u32 %r15, %ntid.x; |
| mov.u32 %r16, %ctaid.x; |
| mov.u32 %r17, %tid.x; |
| mad.lo.s32 %r18, %r15, %r16, %r17; |
| div.s32 %r60, %r18, %r10; |
| rem.s32 %r2, %r18, %r10; |
| setp.ge.s32 %p2, %r60, %r14; |
| setp.lt.s32 %p3, %r10, 0; |
| or.pred %p4, %p2, %p3; |
| @%p4 bra BB46_77; |
| |
| // %r3 = output index; %r58 / %r60 = index into a / b chosen by param_5 / param_6. |
| mad.lo.s32 %r3, %r60, %r10, %r2; |
| setp.eq.s32 %p5, %r11, 1; |
| mov.u32 %r58, %r60; |
| @%p5 bra BB46_4; |
| |
| setp.ne.s32 %p6, %r11, 2; |
| mov.u32 %r58, %r3; |
| @%p6 bra BB46_4; |
| |
| mov.u32 %r58, %r2; |
| |
| BB46_4: |
| setp.eq.s32 %p7, %r12, 1; |
| @%p7 bra BB46_7; |
| |
| setp.ne.s32 %p8, %r12, 2; |
| mov.u32 %r60, %r3; |
| @%p8 bra BB46_7; |
| |
| mov.u32 %r60, %r2; |
| |
| BB46_7: |
| // %fd1 = a, %fd2 = b; %fd50 holds the result and defaults to the largest finite f64. |
| cvta.to.global.u64 %rd5, %rd3; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.s32 %rd7, %r58, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| ld.global.f64 %fd1, [%rd8]; |
| mul.wide.s32 %rd9, %r60, 8; |
| add.s64 %rd10, %rd5, %rd9; |
| ld.global.f64 %fd2, [%rd10]; |
| mov.f64 %fd50, 0d7FEFFFFFFFFFFFFF; |
| // Branch tree on the operation selector in %r13. |
| setp.gt.s32 %p9, %r13, 8; |
| @%p9 bra BB46_24; |
| |
| setp.gt.s32 %p23, %r13, 3; |
| @%p23 bra BB46_16; |
| |
| setp.gt.s32 %p30, %r13, 1; |
| @%p30 bra BB46_13; |
| |
| setp.eq.s32 %p33, %r13, 0; |
| @%p33 bra BB46_75; |
| bra.uni BB46_11; |
| |
| // op 0: a + b |
| BB46_75: |
| add.f64 %fd50, %fd1, %fd2; |
| bra.uni BB46_76; |
| |
| BB46_24: |
| setp.gt.s32 %p10, %r13, 13; |
| @%p10 bra BB46_33; |
| |
| setp.gt.s32 %p17, %r13, 10; |
| @%p17 bra BB46_29; |
| |
| setp.eq.s32 %p21, %r13, 9; |
| @%p21 bra BB46_53; |
| bra.uni BB46_27; |
| |
| // op 9: a == b |
| BB46_53: |
| setp.eq.f64 %p48, %fd1, %fd2; |
| selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p48; |
| bra.uni BB46_76; |
| |
| BB46_16: |
| setp.gt.s32 %p24, %r13, 5; |
| @%p24 bra BB46_20; |
| |
| setp.eq.s32 %p28, %r13, 4; |
| @%p28 bra BB46_56; |
| bra.uni BB46_18; |
| |
| // op 4: pow(a, b) -- __internal_accurate_pow(|a|, b) followed by sign and special-value fix-ups |
| BB46_56: |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r8}, %fd1; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r9}, %fd2; |
| } |
| bfe.u32 %r31, %r9, 20, 11; |
| add.s32 %r32, %r31, -1012; |
| mov.b64 %rd15, %fd2; |
| shl.b64 %rd1, %rd15, %r32; |
| setp.ne.s64 %p53, %rd1, -9223372036854775808; |
| setp.eq.s64 %p54, %rd1, -9223372036854775808; |
| abs.f64 %fd19, %fd1; |
| // Callseq Start 0 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.f64 [param0+0], %fd19; |
| .param .b64 param1; |
| st.param.f64 [param1+0], %fd2; |
| .param .b64 retval0; |
| call.uni (retval0), |
| __internal_accurate_pow, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.f64 %fd25, [retval0+0]; |
| |
| //{ |
| }// Callseq End 0 |
| setp.gt.s32 %p55, %r8, -1; |
| setp.lt.s32 %p56, %r8, 0; |
| and.pred %p1, %p56, %p54; |
| or.pred %p57, %p55, %p53; |
| @%p57 bra BB46_58; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r33}, %fd25; |
| } |
| xor.b32 %r34, %r33, -2147483648; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r35, %temp}, %fd25; |
| } |
| mov.b64 %fd25, {%r35, %r34}; |
| |
| BB46_58: |
| setp.eq.f64 %p58, %fd1, 0d0000000000000000; |
| @%p58 bra BB46_61; |
| bra.uni BB46_59; |
| |
| BB46_61: |
| selp.b32 %r36, %r8, 0, %p54; |
| mov.u32 %r37, 0; |
| or.b32 %r38, %r36, 2146435072; |
| setp.lt.s32 %p62, %r9, 0; |
| selp.b32 %r39, %r38, %r36, %p62; |
| mov.b64 %fd25, {%r37, %r39}; |
| bra.uni BB46_62; |
| |
| BB46_33: |
| setp.gt.s32 %p11, %r13, 15; |
| @%p11 bra BB46_37; |
| |
| setp.eq.s32 %p15, %r13, 14; |
| @%p15 bra BB46_50; |
| bra.uni BB46_35; |
| |
| // op 14: bitwise OR of the rounded operands is non-zero |
| BB46_50: |
| cvt.rni.s64.f64 %rd11, %fd1; |
| cvt.u32.u64 %r25, %rd11; |
| cvt.rni.s64.f64 %rd12, %fd2; |
| cvt.u32.u64 %r26, %rd12; |
| or.b32 %r27, %r26, %r25; |
| setp.eq.s32 %p45, %r27, 0; |
| selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p45; |
| bra.uni BB46_76; |
| |
| BB46_13: |
| setp.eq.s32 %p31, %r13, 2; |
| @%p31 bra BB46_74; |
| bra.uni BB46_14; |
| |
| // op 2: a * b |
| BB46_74: |
| mul.f64 %fd50, %fd1, %fd2; |
| bra.uni BB46_76; |
| |
| BB46_29: |
| setp.eq.s32 %p18, %r13, 11; |
| @%p18 bra BB46_52; |
| |
| setp.eq.s32 %p19, %r13, 12; |
| @%p19 bra BB46_51; |
| bra.uni BB46_31; |
| |
| // op 12: max(a, b) |
| BB46_51: |
| max.f64 %fd50, %fd1, %fd2; |
| bra.uni BB46_76; |
| |
| BB46_20: |
| setp.eq.s32 %p25, %r13, 6; |
| @%p25 bra BB46_55; |
| |
| setp.eq.s32 %p26, %r13, 7; |
| @%p26 bra BB46_54; |
| bra.uni BB46_22; |
| |
| // op 7: a > b |
| BB46_54: |
| setp.gt.f64 %p50, %fd1, %fd2; |
| selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p50; |
| bra.uni BB46_76; |
| |
| BB46_37: |
| setp.eq.s32 %p12, %r13, 16; |
| @%p12 bra BB46_49; |
| |
| setp.eq.s32 %p13, %r13, 17; |
| @%p13 bra BB46_44; |
| bra.uni BB46_39; |
| |
| // op 17: a - b * floor(a / b); NaN when b is +0 or -0 |
| BB46_44: |
| setp.eq.f64 %p38, %fd2, 0d0000000000000000; |
| setp.eq.f64 %p39, %fd2, 0d8000000000000000; |
| or.pred %p40, %p38, %p39; |
| mov.f64 %fd50, 0d7FF8000000000000; |
| @%p40 bra BB46_76; |
| |
| div.rn.f64 %fd50, %fd1, %fd2; |
| abs.f64 %fd39, %fd50; |
| setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000; |
| @%p41 bra BB46_76; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r22}, %fd50; |
| } |
| and.b32 %r23, %r22, 2147483647; |
| setp.ne.s32 %p42, %r23, 2146435072; |
| @%p42 bra BB46_48; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r24, %temp}, %fd50; |
| } |
| setp.eq.s32 %p43, %r24, 0; |
| @%p43 bra BB46_76; |
| |
| BB46_48: |
| cvt.rmi.f64.f64 %fd40, %fd50; |
| mul.f64 %fd41, %fd2, %fd40; |
| sub.f64 %fd50, %fd1, %fd41; |
| bra.uni BB46_76; |
| |
| BB46_11: |
| setp.eq.s32 %p34, %r13, 1; |
| @%p34 bra BB46_12; |
| bra.uni BB46_76; |
| |
| // op 1: a - b |
| BB46_12: |
| sub.f64 %fd50, %fd1, %fd2; |
| bra.uni BB46_76; |
| |
| BB46_27: |
| setp.eq.s32 %p22, %r13, 10; |
| @%p22 bra BB46_28; |
| bra.uni BB46_76; |
| |
| // op 10: a != b (also true when either operand is NaN) |
| BB46_28: |
| setp.neu.f64 %p47, %fd1, %fd2; |
| selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p47; |
| bra.uni BB46_76; |
| |
| BB46_18: |
| setp.eq.s32 %p29, %r13, 5; |
| @%p29 bra BB46_19; |
| bra.uni BB46_76; |
| |
| // op 5: a < b |
| BB46_19: |
| setp.lt.f64 %p52, %fd1, %fd2; |
| selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p52; |
| bra.uni BB46_76; |
| |
| BB46_35: |
| setp.eq.s32 %p16, %r13, 15; |
| @%p16 bra BB46_36; |
| bra.uni BB46_76; |
| |
| // op 15: 1 - a * b |
| BB46_36: |
| mul.f64 %fd43, %fd1, %fd2; |
| mov.f64 %fd44, 0d3FF0000000000000; |
| sub.f64 %fd50, %fd44, %fd43; |
| bra.uni BB46_76; |
| |
| BB46_14: |
| setp.eq.s32 %p32, %r13, 3; |
| @%p32 bra BB46_15; |
| bra.uni BB46_76; |
| |
| // op 3: a / b |
| BB46_15: |
| div.rn.f64 %fd50, %fd1, %fd2; |
| bra.uni BB46_76; |
| |
| // op 11: min(a, b) |
| BB46_52: |
| min.f64 %fd50, %fd1, %fd2; |
| bra.uni BB46_76; |
| |
| BB46_31: |
| setp.eq.s32 %p20, %r13, 13; |
| @%p20 bra BB46_32; |
| bra.uni BB46_76; |
| |
| // op 13: bitwise AND of the rounded operands is non-zero |
| BB46_32: |
| cvt.rni.s64.f64 %rd13, %fd1; |
| cvt.u32.u64 %r28, %rd13; |
| cvt.rni.s64.f64 %rd14, %fd2; |
| cvt.u32.u64 %r29, %rd14; |
| and.b32 %r30, %r29, %r28; |
| setp.eq.s32 %p46, %r30, 0; |
| selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p46; |
| bra.uni BB46_76; |
| |
| // op 6: a <= b (0.0 when a > b or either operand is NaN) |
| BB46_55: |
| setp.gtu.f64 %p51, %fd1, %fd2; |
| selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p51; |
| bra.uni BB46_76; |
| |
| BB46_22: |
| setp.eq.s32 %p27, %r13, 8; |
| @%p27 bra BB46_23; |
| bra.uni BB46_76; |
| |
| // op 8: a >= b (0.0 when a < b or either operand is NaN) |
| BB46_23: |
| setp.ltu.f64 %p49, %fd1, %fd2; |
| selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p49; |
| bra.uni BB46_76; |
| |
| // op 16: (a != 0) ? a - b : 0 |
| BB46_49: |
| setp.neu.f64 %p44, %fd1, 0d0000000000000000; |
| sub.f64 %fd42, %fd1, %fd2; |
| selp.f64 %fd50, %fd42, 0d0000000000000000, %p44; |
| bra.uni BB46_76; |
| |
| // op 18: floor(a / b); every other selector keeps the default result |
| BB46_39: |
| setp.ne.s32 %p14, %r13, 18; |
| @%p14 bra BB46_76; |
| |
| div.rn.f64 %fd50, %fd1, %fd2; |
| abs.f64 %fd37, %fd50; |
| setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000; |
| @%p35 bra BB46_76; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r19}, %fd50; |
| } |
| and.b32 %r20, %r19, 2147483647; |
| setp.ne.s32 %p36, %r20, 2146435072; |
| @%p36 bra BB46_43; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r21, %temp}, %fd50; |
| } |
| setp.eq.s32 %p37, %r21, 0; |
| @%p37 bra BB46_76; |
| |
| BB46_43: |
| cvt.rmi.f64.f64 %fd50, %fd50; |
| bra.uni BB46_76; |
| |
| // pow continuation: negative base with a non-integer exponent yields NaN |
| BB46_59: |
| @%p55 bra BB46_62; |
| |
| cvt.rzi.f64.f64 %fd45, %fd2; |
| setp.neu.f64 %p60, %fd45, %fd2; |
| selp.f64 %fd25, 0dFFF8000000000000, %fd25, %p60; |
| |
| // pow continuation: NaN / infinity operands |
| BB46_62: |
| add.f64 %fd49, %fd1, %fd2; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r40}, %fd49; |
| } |
| and.b32 %r41, %r40, 2146435072; |
| setp.ne.s32 %p63, %r41, 2146435072; |
| @%p63 bra BB46_63; |
| |
| setp.gtu.f64 %p64, %fd19, 0d7FF0000000000000; |
| @%p64 bra BB46_73; |
| |
| abs.f64 %fd46, %fd2; |
| setp.gtu.f64 %p65, %fd46, 0d7FF0000000000000; |
| @%p65 bra BB46_73; |
| |
| and.b32 %r42, %r9, 2147483647; |
| setp.ne.s32 %p66, %r42, 2146435072; |
| @%p66 bra BB46_68; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r43, %temp}, %fd2; |
| } |
| setp.eq.s32 %p67, %r43, 0; |
| @%p67 bra BB46_72; |
| |
| BB46_68: |
| and.b32 %r44, %r8, 2147483647; |
| setp.ne.s32 %p68, %r44, 2146435072; |
| @%p68 bra BB46_69; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r45, %temp}, %fd1; |
| } |
| setp.ne.s32 %p69, %r45, 0; |
| mov.f64 %fd49, %fd25; |
| @%p69 bra BB46_73; |
| |
| shr.s32 %r46, %r9, 31; |
| and.b32 %r47, %r46, -2146435072; |
| add.s32 %r48, %r47, 2146435072; |
| or.b32 %r49, %r48, -2147483648; |
| selp.b32 %r50, %r49, %r48, %p1; |
| mov.u32 %r51, 0; |
| mov.b64 %fd49, {%r51, %r50}; |
| bra.uni BB46_73; |
| |
| BB46_63: |
| mov.f64 %fd49, %fd25; |
| |
| // pow result is forced to 1.0 when b == 0 or a == 1 |
| BB46_73: |
| setp.eq.f64 %p73, %fd2, 0d0000000000000000; |
| setp.eq.f64 %p74, %fd1, 0d3FF0000000000000; |
| or.pred %p75, %p74, %p73; |
| selp.f64 %fd50, 0d3FF0000000000000, %fd49, %p75; |
| |
| // Common exit for all operations: store the result at output index %r3. |
| BB46_76: |
| cvta.to.global.u64 %rd16, %rd4; |
| mul.wide.s32 %rd17, %r3, 8; |
| add.s64 %rd18, %rd16, %rd17; |
| st.global.f64 [%rd18], %fd50; |
| // A "bar.sync 0" used to follow this store. It sat behind the per-thread bounds |
| // check above, so a warp straddling the bound reached it diverged, which is |
| // undefined for this target. No shared memory is used and the kernel returns |
| // right away, so the barrier was dropped. |
| // NOTE(review): this file is compiler output; the barrier presumably comes from a |
| // __syncthreads() in the CUDA source, which needs the same change or it will |
| // reappear on regeneration. |
| |
| BB46_77: |
| ret; |
| |
| BB46_69: |
| mov.f64 %fd49, %fd25; |
| bra.uni BB46_73; |
| |
| // pow with an infinite exponent |
| BB46_72: |
| setp.gt.f64 %p70, %fd19, 0d3FF0000000000000; |
| selp.b32 %r52, 2146435072, 0, %p70; |
| mov.u32 %r53, 0; |
| xor.b32 %r54, %r52, 2146435072; |
| setp.lt.s32 %p71, %r9, 0; |
| selp.b32 %r55, %r54, %r52, %p71; |
| setp.eq.f64 %p72, %fd1, 0dBFF0000000000000; |
| selp.b32 %r56, 1072693248, %r55, %p72; |
| mov.b64 %fd49, {%r53, %r56}; |
| bra.uni BB46_73; |
| } |
| |
| // .globl matrix_matrix_cellwise_op_f |
| // |
| // matrix_matrix_cellwise_op_f: element-wise binary operation on two f32 buffers. |
| // Each thread handles gid = ntid.x * ctaid.x + tid.x, split into q = gid / param_4 |
| // and r = gid % param_4. Threads with q >= param_3 or param_4 < 0 exit immediately. |
| // param_0, param_1 : operand buffers a and b; param_2 : output buffer |
| // param_5, param_6 : index mode for a and b: 1 -> element q, 2 -> element r, |
| // anything else -> element q * param_4 + r |
| // param_7 : operation selector |
| // 0 a+b 1 a-b 2 a*b 3 a/b 4 pow(a,b) (computed inline) |
| // 5 a<b 6 a<=b 7 a>b 8 a>=b 9 a==b 10 a!=b (result is 1.0 or 0.0) |
| // 11 min(a,b) 12 max(a,b) |
| // 13 / 14 : a and b are rounded to integers and truncated to 32 bits; the result |
| // is 1.0 when their bitwise AND / OR is non-zero, otherwise 0.0 |
| // 15 1 - a*b 16 (a != 0) ? a - b : 0 |
| // 17 a - b*floor(a/b); NaN when b == 0; a NaN or infinite quotient is stored as is |
| // 18 floor(a/b); a NaN or infinite quotient is stored as is |
| // any other selector stores 0f7F7FFFFF (largest finite f32) |
| // The result is written to output element q * param_4 + r. |
| .visible .entry matrix_matrix_cellwise_op_f( |
| .param .u64 matrix_matrix_cellwise_op_f_param_0, |
| .param .u64 matrix_matrix_cellwise_op_f_param_1, |
| .param .u64 matrix_matrix_cellwise_op_f_param_2, |
| .param .u32 matrix_matrix_cellwise_op_f_param_3, |
| .param .u32 matrix_matrix_cellwise_op_f_param_4, |
| .param .u32 matrix_matrix_cellwise_op_f_param_5, |
| .param .u32 matrix_matrix_cellwise_op_f_param_6, |
| .param .u32 matrix_matrix_cellwise_op_f_param_7 |
| ) |
| { |
| .reg .pred %p<76>; |
| .reg .f32 %f<135>; |
| .reg .b32 %r<46>; |
| .reg .b64 %rd<17>; |
| |
| |
| ld.param.u64 %rd1, [matrix_matrix_cellwise_op_f_param_0]; |
| ld.param.u64 %rd2, [matrix_matrix_cellwise_op_f_param_1]; |
| ld.param.u64 %rd3, [matrix_matrix_cellwise_op_f_param_2]; |
| ld.param.u32 %r12, [matrix_matrix_cellwise_op_f_param_3]; |
| ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_4]; |
| ld.param.u32 %r9, [matrix_matrix_cellwise_op_f_param_5]; |
| ld.param.u32 %r10, [matrix_matrix_cellwise_op_f_param_6]; |
| ld.param.u32 %r11, [matrix_matrix_cellwise_op_f_param_7]; |
| mov.u32 %r13, %ntid.x; |
| mov.u32 %r14, %ctaid.x; |
| mov.u32 %r15, %tid.x; |
| mad.lo.s32 %r16, %r13, %r14, %r15; |
| div.s32 %r45, %r16, %r8; |
| rem.s32 %r2, %r16, %r8; |
| setp.ge.s32 %p2, %r45, %r12; |
| setp.lt.s32 %p3, %r8, 0; |
| or.pred %p4, %p2, %p3; |
| @%p4 bra BB47_69; |
| |
| // %r3 = output index; %r43 / %r45 = index into a / b chosen by param_5 / param_6. |
| mad.lo.s32 %r3, %r45, %r8, %r2; |
| setp.eq.s32 %p5, %r9, 1; |
| mov.u32 %r43, %r45; |
| @%p5 bra BB47_4; |
| |
| setp.ne.s32 %p6, %r9, 2; |
| mov.u32 %r43, %r3; |
| @%p6 bra BB47_4; |
| |
| mov.u32 %r43, %r2; |
| |
| BB47_4: |
| setp.eq.s32 %p7, %r10, 1; |
| @%p7 bra BB47_7; |
| |
| setp.ne.s32 %p8, %r10, 2; |
| mov.u32 %r45, %r3; |
| @%p8 bra BB47_7; |
| |
| mov.u32 %r45, %r2; |
| |
| BB47_7: |
| // %f1 = a, %f2 = b; %f134 holds the result and defaults to the largest finite f32. |
| cvta.to.global.u64 %rd4, %rd2; |
| cvta.to.global.u64 %rd5, %rd1; |
| mul.wide.s32 %rd6, %r43, 4; |
| add.s64 %rd7, %rd5, %rd6; |
| ld.global.f32 %f1, [%rd7]; |
| mul.wide.s32 %rd8, %r45, 4; |
| add.s64 %rd9, %rd4, %rd8; |
| ld.global.f32 %f2, [%rd9]; |
| mov.f32 %f134, 0f7F7FFFFF; |
| // Branch tree on the operation selector in %r11. |
| setp.gt.s32 %p9, %r11, 8; |
| @%p9 bra BB47_24; |
| |
| setp.gt.s32 %p23, %r11, 3; |
| @%p23 bra BB47_16; |
| |
| setp.gt.s32 %p30, %r11, 1; |
| @%p30 bra BB47_13; |
| |
| setp.eq.s32 %p33, %r11, 0; |
| @%p33 bra BB47_67; |
| bra.uni BB47_11; |
| |
| // op 0: a + b |
| BB47_67: |
| add.f32 %f134, %f1, %f2; |
| bra.uni BB47_68; |
| |
| BB47_24: |
| setp.gt.s32 %p10, %r11, 13; |
| @%p10 bra BB47_33; |
| |
| setp.gt.s32 %p17, %r11, 10; |
| @%p17 bra BB47_29; |
| |
| setp.eq.s32 %p21, %r11, 9; |
| @%p21 bra BB47_49; |
| bra.uni BB47_27; |
| |
| // op 9: a == b |
| BB47_49: |
| setp.eq.f32 %p44, %f1, %f2; |
| selp.f32 %f134, 0f3F800000, 0f00000000, %p44; |
| bra.uni BB47_68; |
| |
| BB47_16: |
| setp.gt.s32 %p24, %r11, 5; |
| @%p24 bra BB47_20; |
| |
| setp.eq.s32 %p28, %r11, 4; |
| @%p28 bra BB47_52; |
| bra.uni BB47_18; |
| |
| // op 4: pow(a, b), expanded inline, followed by sign and special-value fix-ups |
| BB47_52: |
| mul.f32 %f51, %f2, 0f3F000000; |
| cvt.rzi.f32.f32 %f52, %f51; |
| fma.rn.f32 %f53, %f52, 0fC0000000, %f2; |
| abs.f32 %f19, %f53; |
| abs.f32 %f20, %f1; |
| setp.lt.f32 %p49, %f20, 0f00800000; |
| mul.f32 %f54, %f20, 0f4B800000; |
| selp.f32 %f55, 0fC3170000, 0fC2FE0000, %p49; |
| selp.f32 %f56, %f54, %f20, %p49; |
| mov.b32 %r23, %f56; |
| and.b32 %r24, %r23, 8388607; |
| or.b32 %r25, %r24, 1065353216; |
| mov.b32 %f57, %r25; |
| shr.u32 %r26, %r23, 23; |
| cvt.rn.f32.u32 %f58, %r26; |
| add.f32 %f59, %f55, %f58; |
| setp.gt.f32 %p50, %f57, 0f3FB504F3; |
| mul.f32 %f60, %f57, 0f3F000000; |
| add.f32 %f61, %f59, 0f3F800000; |
| selp.f32 %f62, %f60, %f57, %p50; |
| selp.f32 %f63, %f61, %f59, %p50; |
| add.f32 %f64, %f62, 0fBF800000; |
| add.f32 %f50, %f62, 0f3F800000; |
| // inline asm |
| rcp.approx.ftz.f32 %f49,%f50; |
| // inline asm |
| add.f32 %f65, %f64, %f64; |
| mul.f32 %f66, %f49, %f65; |
| mul.f32 %f67, %f66, %f66; |
| mov.f32 %f68, 0f3C4CAF63; |
| mov.f32 %f69, 0f3B18F0FE; |
| fma.rn.f32 %f70, %f69, %f67, %f68; |
| mov.f32 %f71, 0f3DAAAABD; |
| fma.rn.f32 %f72, %f70, %f67, %f71; |
| mul.rn.f32 %f73, %f72, %f67; |
| mul.rn.f32 %f74, %f73, %f66; |
| sub.f32 %f75, %f64, %f66; |
| neg.f32 %f76, %f66; |
| add.f32 %f77, %f75, %f75; |
| fma.rn.f32 %f78, %f76, %f64, %f77; |
| mul.rn.f32 %f79, %f49, %f78; |
| add.f32 %f80, %f74, %f66; |
| sub.f32 %f81, %f66, %f80; |
| add.f32 %f82, %f74, %f81; |
| add.f32 %f83, %f79, %f82; |
| add.f32 %f84, %f80, %f83; |
| sub.f32 %f85, %f80, %f84; |
| add.f32 %f86, %f83, %f85; |
| mov.f32 %f87, 0f3F317200; |
| mul.rn.f32 %f88, %f63, %f87; |
| mov.f32 %f89, 0f35BFBE8E; |
| mul.rn.f32 %f90, %f63, %f89; |
| add.f32 %f91, %f88, %f84; |
| sub.f32 %f92, %f88, %f91; |
| add.f32 %f93, %f84, %f92; |
| add.f32 %f94, %f86, %f93; |
| add.f32 %f95, %f90, %f94; |
| add.f32 %f96, %f91, %f95; |
| sub.f32 %f97, %f91, %f96; |
| add.f32 %f98, %f95, %f97; |
| abs.f32 %f21, %f2; |
| setp.gt.f32 %p51, %f21, 0f77F684DF; |
| mul.f32 %f99, %f2, 0f39000000; |
| selp.f32 %f100, %f99, %f2, %p51; |
| mul.rn.f32 %f101, %f100, %f96; |
| neg.f32 %f102, %f101; |
| fma.rn.f32 %f103, %f100, %f96, %f102; |
| fma.rn.f32 %f104, %f100, %f98, %f103; |
| mov.f32 %f105, 0f00000000; |
| fma.rn.f32 %f106, %f105, %f96, %f104; |
| add.rn.f32 %f107, %f101, %f106; |
| neg.f32 %f108, %f107; |
| add.rn.f32 %f109, %f101, %f108; |
| add.rn.f32 %f110, %f109, %f106; |
| mov.b32 %r27, %f107; |
| setp.eq.s32 %p52, %r27, 1118925336; |
| add.s32 %r28, %r27, -1; |
| mov.b32 %f111, %r28; |
| add.f32 %f112, %f110, 0f37000000; |
| selp.f32 %f113, %f111, %f107, %p52; |
| selp.f32 %f22, %f112, %f110, %p52; |
| mul.f32 %f114, %f113, 0f3FB8AA3B; |
| cvt.rzi.f32.f32 %f115, %f114; |
| mov.f32 %f116, 0fBF317200; |
| fma.rn.f32 %f117, %f115, %f116, %f113; |
| mov.f32 %f118, 0fB5BFBE8E; |
| fma.rn.f32 %f119, %f115, %f118, %f117; |
| mul.f32 %f120, %f119, 0f3FB8AA3B; |
| ex2.approx.ftz.f32 %f121, %f120; |
| add.f32 %f122, %f115, 0f00000000; |
| ex2.approx.f32 %f123, %f122; |
| mul.f32 %f124, %f121, %f123; |
| setp.lt.f32 %p53, %f113, 0fC2D20000; |
| selp.f32 %f125, 0f00000000, %f124, %p53; |
| setp.gt.f32 %p54, %f113, 0f42D20000; |
| selp.f32 %f131, 0f7F800000, %f125, %p54; |
| setp.eq.f32 %p55, %f131, 0f7F800000; |
| @%p55 bra BB47_54; |
| |
| fma.rn.f32 %f131, %f131, %f22, %f131; |
| |
| BB47_54: |
| setp.lt.f32 %p56, %f1, 0f00000000; |
| setp.eq.f32 %p57, %f19, 0f3F800000; |
| and.pred %p1, %p56, %p57; |
| mov.b32 %r29, %f131; |
| xor.b32 %r30, %r29, -2147483648; |
| mov.b32 %f126, %r30; |
| selp.f32 %f133, %f126, %f131, %p1; |
| setp.eq.f32 %p58, %f1, 0f00000000; |
| @%p58 bra BB47_57; |
| bra.uni BB47_55; |
| |
| BB47_57: |
| add.f32 %f128, %f1, %f1; |
| mov.b32 %r31, %f128; |
| selp.b32 %r32, %r31, 0, %p57; |
| or.b32 %r33, %r32, 2139095040; |
| setp.lt.f32 %p62, %f2, 0f00000000; |
| selp.b32 %r34, %r33, %r32, %p62; |
| mov.b32 %f133, %r34; |
| bra.uni BB47_58; |
| |
| BB47_33: |
| setp.gt.s32 %p11, %r11, 15; |
| @%p11 bra BB47_37; |
| |
| setp.eq.s32 %p15, %r11, 14; |
| @%p15 bra BB47_46; |
| bra.uni BB47_35; |
| |
| // op 14: bitwise OR of the rounded operands is non-zero |
| BB47_46: |
| cvt.rni.s64.f32 %rd10, %f1; |
| cvt.u32.u64 %r17, %rd10; |
| cvt.rni.s64.f32 %rd11, %f2; |
| cvt.u32.u64 %r18, %rd11; |
| or.b32 %r19, %r18, %r17; |
| setp.eq.s32 %p41, %r19, 0; |
| selp.f32 %f134, 0f00000000, 0f3F800000, %p41; |
| bra.uni BB47_68; |
| |
| BB47_13: |
| setp.eq.s32 %p31, %r11, 2; |
| @%p31 bra BB47_66; |
| bra.uni BB47_14; |
| |
| // op 2: a * b |
| BB47_66: |
| mul.f32 %f134, %f1, %f2; |
| bra.uni BB47_68; |
| |
| BB47_29: |
| setp.eq.s32 %p18, %r11, 11; |
| @%p18 bra BB47_48; |
| |
| setp.eq.s32 %p19, %r11, 12; |
| @%p19 bra BB47_47; |
| bra.uni BB47_31; |
| |
| // op 12: max(a, b) |
| BB47_47: |
| max.f32 %f134, %f1, %f2; |
| bra.uni BB47_68; |
| |
| BB47_20: |
| setp.eq.s32 %p25, %r11, 6; |
| @%p25 bra BB47_51; |
| |
| setp.eq.s32 %p26, %r11, 7; |
| @%p26 bra BB47_50; |
| bra.uni BB47_22; |
| |
| // op 7: a > b |
| BB47_50: |
| setp.gt.f32 %p46, %f1, %f2; |
| selp.f32 %f134, 0f3F800000, 0f00000000, %p46; |
| bra.uni BB47_68; |
| |
| BB47_37: |
| setp.eq.s32 %p12, %r11, 16; |
| @%p12 bra BB47_45; |
| |
| setp.eq.s32 %p13, %r11, 17; |
| @%p13 bra BB47_42; |
| bra.uni BB47_39; |
| |
| // op 17: a - b * floor(a / b); NaN when b is +0 or -0 |
| BB47_42: |
| setp.eq.f32 %p36, %f2, 0f00000000; |
| setp.eq.f32 %p37, %f2, 0f80000000; |
| or.pred %p38, %p36, %p37; |
| mov.f32 %f134, 0f7FC00000; |
| @%p38 bra BB47_68; |
| |
| div.rn.f32 %f134, %f1, %f2; |
| abs.f32 %f43, %f134; |
| setp.geu.f32 %p39, %f43, 0f7F800000; |
| @%p39 bra BB47_68; |
| |
| cvt.rmi.f32.f32 %f44, %f134; |
| mul.f32 %f45, %f2, %f44; |
| sub.f32 %f134, %f1, %f45; |
| bra.uni BB47_68; |
| |
| BB47_11: |
| setp.eq.s32 %p34, %r11, 1; |
| @%p34 bra BB47_12; |
| bra.uni BB47_68; |
| |
| // op 1: a - b |
| BB47_12: |
| sub.f32 %f134, %f1, %f2; |
| bra.uni BB47_68; |
| |
| BB47_27: |
| setp.eq.s32 %p22, %r11, 10; |
| @%p22 bra BB47_28; |
| bra.uni BB47_68; |
| |
| // op 10: a != b (also true when either operand is NaN) |
| BB47_28: |
| setp.neu.f32 %p43, %f1, %f2; |
| selp.f32 %f134, 0f3F800000, 0f00000000, %p43; |
| bra.uni BB47_68; |
| |
| BB47_18: |
| setp.eq.s32 %p29, %r11, 5; |
| @%p29 bra BB47_19; |
| bra.uni BB47_68; |
| |
| // op 5: a < b |
| BB47_19: |
| setp.lt.f32 %p48, %f1, %f2; |
| selp.f32 %f134, 0f3F800000, 0f00000000, %p48; |
| bra.uni BB47_68; |
| |
| BB47_35: |
| setp.eq.s32 %p16, %r11, 15; |
| @%p16 bra BB47_36; |
| bra.uni BB47_68; |
| |
| // op 15: 1 - a * b |
| BB47_36: |
| mul.f32 %f47, %f1, %f2; |
| mov.f32 %f48, 0f3F800000; |
| sub.f32 %f134, %f48, %f47; |
| bra.uni BB47_68; |
| |
| BB47_14: |
| setp.eq.s32 %p32, %r11, 3; |
| @%p32 bra BB47_15; |
| bra.uni BB47_68; |
| |
| // op 3: a / b |
| BB47_15: |
| div.rn.f32 %f134, %f1, %f2; |
| bra.uni BB47_68; |
| |
| // op 11: min(a, b) |
| BB47_48: |
| min.f32 %f134, %f1, %f2; |
| bra.uni BB47_68; |
| |
| BB47_31: |
| setp.eq.s32 %p20, %r11, 13; |
| @%p20 bra BB47_32; |
| bra.uni BB47_68; |
| |
| // op 13: bitwise AND of the rounded operands is non-zero |
| BB47_32: |
| cvt.rni.s64.f32 %rd12, %f1; |
| cvt.u32.u64 %r20, %rd12; |
| cvt.rni.s64.f32 %rd13, %f2; |
| cvt.u32.u64 %r21, %rd13; |
| and.b32 %r22, %r21, %r20; |
| setp.eq.s32 %p42, %r22, 0; |
| selp.f32 %f134, 0f00000000, 0f3F800000, %p42; |
| bra.uni BB47_68; |
| |
| // op 6: a <= b (0.0 when a > b or either operand is NaN) |
| BB47_51: |
| setp.gtu.f32 %p47, %f1, %f2; |
| selp.f32 %f134, 0f00000000, 0f3F800000, %p47; |
| bra.uni BB47_68; |
| |
| BB47_22: |
| setp.eq.s32 %p27, %r11, 8; |
| @%p27 bra BB47_23; |
| bra.uni BB47_68; |
| |
| // op 8: a >= b (0.0 when a < b or either operand is NaN) |
| BB47_23: |
| setp.ltu.f32 %p45, %f1, %f2; |
| selp.f32 %f134, 0f00000000, 0f3F800000, %p45; |
| bra.uni BB47_68; |
| |
| // op 16: (a != 0) ? a - b : 0 |
| BB47_45: |
| setp.neu.f32 %p40, %f1, 0f00000000; |
| sub.f32 %f46, %f1, %f2; |
| selp.f32 %f134, %f46, 0f00000000, %p40; |
| bra.uni BB47_68; |
| |
| // op 18: floor(a / b); every other selector keeps the default result |
| BB47_39: |
| setp.ne.s32 %p14, %r11, 18; |
| @%p14 bra BB47_68; |
| |
| div.rn.f32 %f134, %f1, %f2; |
| abs.f32 %f41, %f134; |
| setp.geu.f32 %p35, %f41, 0f7F800000; |
| @%p35 bra BB47_68; |
| |
| cvt.rmi.f32.f32 %f134, %f134; |
| bra.uni BB47_68; |
| |
| // pow continuation: negative base with a non-integer exponent yields NaN |
| BB47_55: |
| setp.geu.f32 %p59, %f1, 0f00000000; |
| @%p59 bra BB47_58; |
| |
| cvt.rzi.f32.f32 %f127, %f2; |
| setp.neu.f32 %p60, %f127, %f2; |
| selp.f32 %f133, 0f7FFFFFFF, %f133, %p60; |
| |
| // pow continuation: NaN / infinity operands |
| BB47_58: |
| add.f32 %f129, %f20, %f21; |
| mov.b32 %r35, %f129; |
| setp.lt.s32 %p63, %r35, 2139095040; |
| @%p63 bra BB47_65; |
| |
| setp.gtu.f32 %p64, %f20, 0f7F800000; |
| setp.gtu.f32 %p65, %f21, 0f7F800000; |
| or.pred %p66, %p64, %p65; |
| @%p66 bra BB47_64; |
| bra.uni BB47_60; |
| |
| BB47_64: |
| add.f32 %f133, %f1, %f2; |
| bra.uni BB47_65; |
| |
| BB47_60: |
| setp.eq.f32 %p67, %f21, 0f7F800000; |
| @%p67 bra BB47_63; |
| bra.uni BB47_61; |
| |
| // pow with an infinite exponent |
| BB47_63: |
| setp.gt.f32 %p70, %f20, 0f3F800000; |
| selp.b32 %r39, 2139095040, 0, %p70; |
| xor.b32 %r40, %r39, 2139095040; |
| setp.lt.f32 %p71, %f2, 0f00000000; |
| selp.b32 %r41, %r40, %r39, %p71; |
| mov.b32 %f130, %r41; |
| setp.eq.f32 %p72, %f1, 0fBF800000; |
| selp.f32 %f133, 0f3F800000, %f130, %p72; |
| bra.uni BB47_65; |
| |
| BB47_61: |
| setp.neu.f32 %p68, %f20, 0f7F800000; |
| @%p68 bra BB47_65; |
| |
| setp.ltu.f32 %p69, %f2, 0f00000000; |
| selp.b32 %r36, 0, 2139095040, %p69; |
| or.b32 %r37, %r36, -2147483648; |
| selp.b32 %r38, %r37, %r36, %p1; |
| mov.b32 %f133, %r38; |
| |
| // pow result is forced to 1.0 when b == 0 or a == 1 |
| BB47_65: |
| setp.eq.f32 %p73, %f2, 0f00000000; |
| setp.eq.f32 %p74, %f1, 0f3F800000; |
| or.pred %p75, %p74, %p73; |
| selp.f32 %f134, 0f3F800000, %f133, %p75; |
| |
| // Common exit for all operations: store the result at output index %r3. |
| BB47_68: |
| cvta.to.global.u64 %rd14, %rd3; |
| mul.wide.s32 %rd15, %r3, 4; |
| add.s64 %rd16, %rd14, %rd15; |
| st.global.f32 [%rd16], %f134; |
| // A "bar.sync 0" used to follow this store. It sat behind the per-thread bounds |
| // check above, so a warp straddling the bound reached it diverged, which is |
| // undefined for this target. No shared memory is used and the kernel returns |
| // right away, so the barrier was dropped. |
| // NOTE(review): this file is compiler output; the barrier presumably comes from a |
| // __syncthreads() in the CUDA source, which needs the same change or it will |
| // reappear on regeneration. |
| |
| BB47_69: |
| ret; |
| } |
| |
| // .globl matrix_scalar_op_d |
| .visible .entry matrix_scalar_op_d( |
| .param .u64 matrix_scalar_op_d_param_0, |
| .param .f64 matrix_scalar_op_d_param_1, |
| .param .u64 matrix_scalar_op_d_param_2, |
| .param .u32 matrix_scalar_op_d_param_3, |
| .param .u32 matrix_scalar_op_d_param_4, |
| .param .u32 matrix_scalar_op_d_param_5 |
| ) |
| { |
| .reg .pred %p<139>; |
| .reg .b32 %r<88>; |
| .reg .f64 %fd<99>; |
| .reg .b64 %rd<20>; |
| |
| |
| ld.param.u64 %rd4, [matrix_scalar_op_d_param_0]; |
| ld.param.f64 %fd68, [matrix_scalar_op_d_param_1]; |
| ld.param.u64 %rd5, [matrix_scalar_op_d_param_2]; |
| ld.param.u32 %r8, [matrix_scalar_op_d_param_3]; |
| ld.param.u32 %r6, [matrix_scalar_op_d_param_4]; |
| ld.param.u32 %r7, [matrix_scalar_op_d_param_5]; |
| mov.u32 %r9, %ntid.x; |
| mov.u32 %r10, %ctaid.x; |
| mov.u32 %r11, %tid.x; |
| mad.lo.s32 %r1, %r9, %r10, %r11; |
| setp.ge.s32 %p3, %r1, %r8; |
| @%p3 bra BB48_142; |
| |
| cvta.to.global.u64 %rd6, %rd5; |
| cvta.to.global.u64 %rd7, %rd4; |
| mul.wide.s32 %rd8, %r1, 8; |
| add.s64 %rd9, %rd7, %rd8; |
| ld.global.f64 %fd1, [%rd9]; |
| add.s64 %rd1, %rd6, %rd8; |
| setp.eq.s32 %p4, %r7, 0; |
| @%p4 bra BB48_72; |
| |
| mov.f64 %fd94, 0d7FEFFFFFFFFFFFFF; |
| setp.gt.s32 %p5, %r6, 8; |
| @%p5 bra BB48_19; |
| |
| setp.gt.s32 %p19, %r6, 3; |
| @%p19 bra BB48_11; |
| |
| setp.gt.s32 %p26, %r6, 1; |
| @%p26 bra BB48_8; |
| |
| setp.eq.s32 %p29, %r6, 0; |
| @%p29 bra BB48_70; |
| bra.uni BB48_6; |
| |
| BB48_70: |
| add.f64 %fd94, %fd1, %fd68; |
| bra.uni BB48_71; |
| |
| BB48_72: |
| mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; |
| setp.gt.s32 %p72, %r6, 8; |
| @%p72 bra BB48_89; |
| |
| setp.gt.s32 %p86, %r6, 3; |
| @%p86 bra BB48_81; |
| |
| setp.gt.s32 %p93, %r6, 1; |
| @%p93 bra BB48_78; |
| |
| setp.eq.s32 %p96, %r6, 0; |
| @%p96 bra BB48_140; |
| bra.uni BB48_76; |
| |
| BB48_140: |
| add.f64 %fd98, %fd1, %fd68; |
| bra.uni BB48_141; |
| |
| BB48_19: |
| setp.gt.s32 %p6, %r6, 13; |
| @%p6 bra BB48_28; |
| |
| setp.gt.s32 %p13, %r6, 10; |
| @%p13 bra BB48_24; |
| |
| setp.eq.s32 %p17, %r6, 9; |
| @%p17 bra BB48_48; |
| bra.uni BB48_22; |
| |
| BB48_48: |
| setp.eq.f64 %p44, %fd1, %fd68; |
| selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p44; |
| bra.uni BB48_71; |
| |
| BB48_89: |
| setp.gt.s32 %p73, %r6, 13; |
| @%p73 bra BB48_98; |
| |
| setp.gt.s32 %p80, %r6, 10; |
| @%p80 bra BB48_94; |
| |
| setp.eq.s32 %p84, %r6, 9; |
| @%p84 bra BB48_118; |
| bra.uni BB48_92; |
| |
| BB48_118: |
| setp.eq.f64 %p111, %fd1, %fd68; |
| selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p111; |
| bra.uni BB48_141; |
| |
| BB48_11: |
| setp.gt.s32 %p20, %r6, 5; |
| @%p20 bra BB48_15; |
| |
| setp.eq.s32 %p24, %r6, 4; |
| @%p24 bra BB48_51; |
| bra.uni BB48_13; |
| |
| BB48_51: |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r2}, %fd68; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r3}, %fd1; |
| } |
| bfe.u32 %r24, %r3, 20, 11; |
| add.s32 %r25, %r24, -1012; |
| mov.b64 %rd14, %fd1; |
| shl.b64 %rd2, %rd14, %r25; |
| setp.ne.s64 %p49, %rd2, -9223372036854775808; |
| setp.eq.s64 %p50, %rd2, -9223372036854775808; |
| abs.f64 %fd18, %fd68; |
| // Callseq Start 1 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.f64 [param0+0], %fd18; |
| .param .b64 param1; |
| st.param.f64 [param1+0], %fd1; |
| .param .b64 retval0; |
| call.uni (retval0), |
| __internal_accurate_pow, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.f64 %fd24, [retval0+0]; |
| |
| //{ |
| }// Callseq End 1 |
| setp.gt.s32 %p51, %r2, -1; |
| setp.lt.s32 %p52, %r2, 0; |
| and.pred %p1, %p52, %p50; |
| or.pred %p53, %p51, %p49; |
| @%p53 bra BB48_53; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r26}, %fd24; |
| } |
| xor.b32 %r27, %r26, -2147483648; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r28, %temp}, %fd24; |
| } |
| mov.b64 %fd24, {%r28, %r27}; |
| |
| BB48_53: |
| setp.eq.f64 %p54, %fd68, 0d0000000000000000; |
| @%p54 bra BB48_56; |
| bra.uni BB48_54; |
| |
| BB48_56: |
| selp.b32 %r29, %r2, 0, %p50; |
| mov.u32 %r30, 0; |
| or.b32 %r31, %r29, 2146435072; |
| setp.lt.s32 %p58, %r3, 0; |
| selp.b32 %r32, %r31, %r29, %p58; |
| mov.b64 %fd24, {%r30, %r32}; |
| bra.uni BB48_57; |
| |
| BB48_28: |
| setp.gt.s32 %p7, %r6, 15; |
| @%p7 bra BB48_32; |
| |
| setp.eq.s32 %p11, %r6, 14; |
| @%p11 bra BB48_45; |
| bra.uni BB48_30; |
| |
| BB48_45: |
| cvt.rni.s64.f64 %rd10, %fd68; |
| cvt.u32.u64 %r18, %rd10; |
| cvt.rni.s64.f64 %rd11, %fd1; |
| cvt.u32.u64 %r19, %rd11; |
| or.b32 %r20, %r19, %r18; |
| setp.eq.s32 %p41, %r20, 0; |
| selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p41; |
| bra.uni BB48_71; |
| |
| BB48_81: |
| setp.gt.s32 %p87, %r6, 5; |
| @%p87 bra BB48_85; |
| |
| setp.eq.s32 %p91, %r6, 4; |
| @%p91 bra BB48_121; |
| bra.uni BB48_83; |
| |
| BB48_121: |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r4}, %fd1; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r5}, %fd68; |
| } |
| bfe.u32 %r62, %r5, 20, 11; |
| add.s32 %r63, %r62, -1012; |
| mov.b64 %rd19, %fd68; |
| shl.b64 %rd3, %rd19, %r63; |
| setp.ne.s64 %p116, %rd3, -9223372036854775808; |
| setp.eq.s64 %p117, %rd3, -9223372036854775808; |
| abs.f64 %fd51, %fd1; |
| // Callseq Start 2 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.f64 [param0+0], %fd51; |
| .param .b64 param1; |
| st.param.f64 [param1+0], %fd68; |
| .param .b64 retval0; |
| call.uni (retval0), |
| __internal_accurate_pow, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.f64 %fd57, [retval0+0]; |
| |
| //{ |
| }// Callseq End 2 |
| setp.gt.s32 %p118, %r4, -1; |
| setp.lt.s32 %p119, %r4, 0; |
| and.pred %p2, %p119, %p117; |
| or.pred %p120, %p118, %p116; |
| @%p120 bra BB48_123; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r64}, %fd57; |
| } |
| xor.b32 %r65, %r64, -2147483648; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r66, %temp}, %fd57; |
| } |
| mov.b64 %fd57, {%r66, %r65}; |
| |
| BB48_123: |
| setp.eq.f64 %p121, %fd1, 0d0000000000000000; |
| @%p121 bra BB48_126; |
| bra.uni BB48_124; |
| |
| BB48_126: |
| selp.b32 %r67, %r4, 0, %p117; |
| mov.u32 %r68, 0; |
| or.b32 %r69, %r67, 2146435072; |
| setp.lt.s32 %p125, %r5, 0; |
| selp.b32 %r70, %r69, %r67, %p125; |
| mov.b64 %fd57, {%r68, %r70}; |
| bra.uni BB48_127; |
| |
| BB48_98: |
| setp.gt.s32 %p74, %r6, 15; |
| @%p74 bra BB48_102; |
| |
| setp.eq.s32 %p78, %r6, 14; |
| @%p78 bra BB48_115; |
| bra.uni BB48_100; |
| |
| BB48_115: |
| cvt.rni.s64.f64 %rd15, %fd1; |
| cvt.u32.u64 %r56, %rd15; |
| cvt.rni.s64.f64 %rd16, %fd68; |
| cvt.u32.u64 %r57, %rd16; |
| or.b32 %r58, %r57, %r56; |
| setp.eq.s32 %p108, %r58, 0; |
| selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p108; |
| bra.uni BB48_141; |
| |
| BB48_8: |
| setp.eq.s32 %p27, %r6, 2; |
| @%p27 bra BB48_69; |
| bra.uni BB48_9; |
| |
| BB48_69: |
| mul.f64 %fd94, %fd1, %fd68; |
| bra.uni BB48_71; |
| |
| BB48_24: |
| setp.eq.s32 %p14, %r6, 11; |
| @%p14 bra BB48_47; |
| |
| setp.eq.s32 %p15, %r6, 12; |
| @%p15 bra BB48_46; |
| bra.uni BB48_26; |
| |
| BB48_46: |
| max.f64 %fd94, %fd68, %fd1; |
| bra.uni BB48_71; |
| |
| BB48_15: |
| setp.eq.s32 %p21, %r6, 6; |
| @%p21 bra BB48_50; |
| |
| setp.eq.s32 %p22, %r6, 7; |
| @%p22 bra BB48_49; |
| bra.uni BB48_17; |
| |
| BB48_49: |
| setp.lt.f64 %p46, %fd1, %fd68; |
| selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p46; |
| bra.uni BB48_71; |
| |
| BB48_32: |
| setp.eq.s32 %p8, %r6, 16; |
| @%p8 bra BB48_44; |
| |
| setp.eq.s32 %p9, %r6, 17; |
| @%p9 bra BB48_39; |
| bra.uni BB48_34; |
| |
| BB48_39: |
| setp.eq.f64 %p34, %fd1, 0d0000000000000000; |
| setp.eq.f64 %p35, %fd1, 0d8000000000000000; |
| or.pred %p36, %p34, %p35; |
| mov.f64 %fd94, 0d7FF8000000000000; |
| @%p36 bra BB48_71; |
| |
| div.rn.f64 %fd94, %fd68, %fd1; |
| abs.f64 %fd72, %fd94; |
| setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000; |
| @%p37 bra BB48_71; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r15}, %fd94; |
| } |
| and.b32 %r16, %r15, 2147483647; |
| setp.ne.s32 %p38, %r16, 2146435072; |
| @%p38 bra BB48_43; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r17, %temp}, %fd94; |
| } |
| setp.eq.s32 %p39, %r17, 0; |
| @%p39 bra BB48_71; |
| |
| BB48_43: |
| cvt.rmi.f64.f64 %fd73, %fd94; |
| mul.f64 %fd74, %fd1, %fd73; |
| sub.f64 %fd94, %fd68, %fd74; |
| bra.uni BB48_71; |
| |
| BB48_78: |
| setp.eq.s32 %p94, %r6, 2; |
| @%p94 bra BB48_139; |
| bra.uni BB48_79; |
| |
| BB48_139: |
| mul.f64 %fd98, %fd1, %fd68; |
| bra.uni BB48_141; |
| |
| BB48_94: |
| setp.eq.s32 %p81, %r6, 11; |
| @%p81 bra BB48_117; |
| |
| setp.eq.s32 %p82, %r6, 12; |
| @%p82 bra BB48_116; |
| bra.uni BB48_96; |
| |
| BB48_116: |
| max.f64 %fd98, %fd1, %fd68; |
| bra.uni BB48_141; |
| |
| BB48_85: |
| setp.eq.s32 %p88, %r6, 6; |
| @%p88 bra BB48_120; |
| |
| setp.eq.s32 %p89, %r6, 7; |
| @%p89 bra BB48_119; |
| bra.uni BB48_87; |
| |
| BB48_119: |
| setp.gt.f64 %p113, %fd1, %fd68; |
| selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p113; |
| bra.uni BB48_141; |
| |
| BB48_102: |
| setp.eq.s32 %p75, %r6, 16; |
| @%p75 bra BB48_114; |
| |
| setp.eq.s32 %p76, %r6, 17; |
| @%p76 bra BB48_109; |
| bra.uni BB48_104; |
| |
| BB48_109: |
| setp.eq.f64 %p101, %fd68, 0d0000000000000000; |
| setp.eq.f64 %p102, %fd68, 0d8000000000000000; |
| or.pred %p103, %p101, %p102; |
| mov.f64 %fd98, 0d7FF8000000000000; |
| @%p103 bra BB48_141; |
| |
| div.rn.f64 %fd98, %fd1, %fd68; |
| abs.f64 %fd83, %fd98; |
| setp.gtu.f64 %p104, %fd83, 0d7FF0000000000000; |
| @%p104 bra BB48_141; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r53}, %fd98; |
| } |
| and.b32 %r54, %r53, 2147483647; |
| setp.ne.s32 %p105, %r54, 2146435072; |
| @%p105 bra BB48_113; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r55, %temp}, %fd98; |
| } |
| setp.eq.s32 %p106, %r55, 0; |
| @%p106 bra BB48_141; |
| |
| BB48_113: |
| cvt.rmi.f64.f64 %fd84, %fd98; |
| mul.f64 %fd85, %fd84, %fd68; |
| sub.f64 %fd98, %fd1, %fd85; |
| bra.uni BB48_141; |
| |
| BB48_6: |
| setp.eq.s32 %p30, %r6, 1; |
| @%p30 bra BB48_7; |
| bra.uni BB48_71; |
| |
| BB48_7: |
| sub.f64 %fd94, %fd68, %fd1; |
| bra.uni BB48_71; |
| |
| BB48_22: |
| setp.eq.s32 %p18, %r6, 10; |
| @%p18 bra BB48_23; |
| bra.uni BB48_71; |
| |
| BB48_23: |
| setp.neu.f64 %p43, %fd1, %fd68; |
| selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p43; |
| bra.uni BB48_71; |
| |
| BB48_13: |
| setp.eq.s32 %p25, %r6, 5; |
| @%p25 bra BB48_14; |
| bra.uni BB48_71; |
| |
| BB48_14: |
| setp.gt.f64 %p48, %fd1, %fd68; |
| selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p48; |
| bra.uni BB48_71; |
| |
| BB48_30: |
| setp.eq.s32 %p12, %r6, 15; |
| @%p12 bra BB48_31; |
| bra.uni BB48_71; |
| |
| BB48_31: |
| mul.f64 %fd76, %fd1, %fd68; |
| mov.f64 %fd77, 0d3FF0000000000000; |
| sub.f64 %fd94, %fd77, %fd76; |
| bra.uni BB48_71; |
| |
| BB48_9: |
| setp.eq.s32 %p28, %r6, 3; |
| @%p28 bra BB48_10; |
| bra.uni BB48_71; |
| |
| BB48_10: |
| div.rn.f64 %fd94, %fd68, %fd1; |
| bra.uni BB48_71; |
| |
| BB48_47: |
| min.f64 %fd94, %fd68, %fd1; |
| bra.uni BB48_71; |
| |
| BB48_26: |
| setp.eq.s32 %p16, %r6, 13; |
| @%p16 bra BB48_27; |
| bra.uni BB48_71; |
| |
| BB48_27: |
| cvt.rni.s64.f64 %rd12, %fd68; |
| cvt.u32.u64 %r21, %rd12; |
| cvt.rni.s64.f64 %rd13, %fd1; |
| cvt.u32.u64 %r22, %rd13; |
| and.b32 %r23, %r22, %r21; |
| setp.eq.s32 %p42, %r23, 0; |
| selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p42; |
| bra.uni BB48_71; |
| |
| BB48_50: |
| setp.ltu.f64 %p47, %fd1, %fd68; |
| selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p47; |
| bra.uni BB48_71; |
| |
| BB48_17: |
| setp.eq.s32 %p23, %r6, 8; |
| @%p23 bra BB48_18; |
| bra.uni BB48_71; |
| |
| BB48_18: |
| setp.gtu.f64 %p45, %fd1, %fd68; |
| selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p45; |
| bra.uni BB48_71; |
| |
| BB48_44: |
| setp.neu.f64 %p40, %fd68, 0d0000000000000000; |
| sub.f64 %fd75, %fd68, %fd1; |
| selp.f64 %fd94, %fd75, 0d0000000000000000, %p40; |
| bra.uni BB48_71; |
| |
| BB48_34: |
| setp.ne.s32 %p10, %r6, 18; |
| @%p10 bra BB48_71; |
| |
| div.rn.f64 %fd94, %fd68, %fd1; |
| abs.f64 %fd70, %fd94; |
| setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000; |
| @%p31 bra BB48_71; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r12}, %fd94; |
| } |
| and.b32 %r13, %r12, 2147483647; |
| setp.ne.s32 %p32, %r13, 2146435072; |
| @%p32 bra BB48_38; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r14, %temp}, %fd94; |
| } |
| setp.eq.s32 %p33, %r14, 0; |
| @%p33 bra BB48_71; |
| |
| BB48_38: |
| cvt.rmi.f64.f64 %fd94, %fd94; |
| bra.uni BB48_71; |
| |
| BB48_76: |
| setp.eq.s32 %p97, %r6, 1; |
| @%p97 bra BB48_77; |
| bra.uni BB48_141; |
| |
| BB48_77: |
| sub.f64 %fd98, %fd1, %fd68; |
| bra.uni BB48_141; |
| |
| BB48_92: |
| setp.eq.s32 %p85, %r6, 10; |
| @%p85 bra BB48_93; |
| bra.uni BB48_141; |
| |
| BB48_93: |
| setp.neu.f64 %p110, %fd1, %fd68; |
| selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p110; |
| bra.uni BB48_141; |
| |
| BB48_83: |
| setp.eq.s32 %p92, %r6, 5; |
| @%p92 bra BB48_84; |
| bra.uni BB48_141; |
| |
| BB48_84: |
| setp.lt.f64 %p115, %fd1, %fd68; |
| selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p115; |
| bra.uni BB48_141; |
| |
| BB48_100: |
| setp.eq.s32 %p79, %r6, 15; |
| @%p79 bra BB48_101; |
| bra.uni BB48_141; |
| |
| BB48_101: |
| mul.f64 %fd87, %fd1, %fd68; |
| mov.f64 %fd88, 0d3FF0000000000000; |
| sub.f64 %fd98, %fd88, %fd87; |
| bra.uni BB48_141; |
| |
| BB48_79: |
| setp.eq.s32 %p95, %r6, 3; |
| @%p95 bra BB48_80; |
| bra.uni BB48_141; |
| |
| BB48_80: |
| div.rn.f64 %fd98, %fd1, %fd68; |
| bra.uni BB48_141; |
| |
| BB48_117: |
| min.f64 %fd98, %fd1, %fd68; |
| bra.uni BB48_141; |
| |
| BB48_96: |
| setp.eq.s32 %p83, %r6, 13; |
| @%p83 bra BB48_97; |
| bra.uni BB48_141; |
| |
| BB48_97: |
| cvt.rni.s64.f64 %rd17, %fd1; |
| cvt.u32.u64 %r59, %rd17; |
| cvt.rni.s64.f64 %rd18, %fd68; |
| cvt.u32.u64 %r60, %rd18; |
| and.b32 %r61, %r60, %r59; |
| setp.eq.s32 %p109, %r61, 0; |
| selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p109; |
| bra.uni BB48_141; |
| |
| BB48_120: |
| setp.gtu.f64 %p114, %fd1, %fd68; |
| selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p114; |
| bra.uni BB48_141; |
| |
| BB48_87: |
| setp.eq.s32 %p90, %r6, 8; |
| @%p90 bra BB48_88; |
| bra.uni BB48_141; |
| |
| BB48_88: |
| setp.ltu.f64 %p112, %fd1, %fd68; |
| selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p112; |
| bra.uni BB48_141; |
| |
| BB48_114: |
| setp.neu.f64 %p107, %fd1, 0d0000000000000000; |
| sub.f64 %fd86, %fd1, %fd68; |
| selp.f64 %fd98, %fd86, 0d0000000000000000, %p107; |
| bra.uni BB48_141; |
| |
| BB48_104: |
| setp.ne.s32 %p77, %r6, 18; |
| @%p77 bra BB48_141; |
| |
| div.rn.f64 %fd98, %fd1, %fd68; |
| abs.f64 %fd81, %fd98; |
| setp.gtu.f64 %p98, %fd81, 0d7FF0000000000000; |
| @%p98 bra BB48_141; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r50}, %fd98; |
| } |
| and.b32 %r51, %r50, 2147483647; |
| setp.ne.s32 %p99, %r51, 2146435072; |
| @%p99 bra BB48_108; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r52, %temp}, %fd98; |
| } |
| setp.eq.s32 %p100, %r52, 0; |
| @%p100 bra BB48_141; |
| |
| BB48_108: |
| cvt.rmi.f64.f64 %fd98, %fd98; |
| bra.uni BB48_141; |
| |
| BB48_54: |
| @%p51 bra BB48_57; |
| |
| cvt.rzi.f64.f64 %fd78, %fd1; |
| setp.neu.f64 %p56, %fd78, %fd1; |
| selp.f64 %fd24, 0dFFF8000000000000, %fd24, %p56; |
| |
| BB48_57: |
| add.f64 %fd93, %fd1, %fd68; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r33}, %fd93; |
| } |
| and.b32 %r34, %r33, 2146435072; |
| setp.ne.s32 %p59, %r34, 2146435072; |
| @%p59 bra BB48_58; |
| |
| setp.gtu.f64 %p60, %fd18, 0d7FF0000000000000; |
| @%p60 bra BB48_68; |
| |
| abs.f64 %fd79, %fd1; |
| setp.gtu.f64 %p61, %fd79, 0d7FF0000000000000; |
| @%p61 bra BB48_68; |
| |
| and.b32 %r35, %r3, 2147483647; |
| setp.ne.s32 %p62, %r35, 2146435072; |
| @%p62 bra BB48_63; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r36, %temp}, %fd1; |
| } |
| setp.eq.s32 %p63, %r36, 0; |
| @%p63 bra BB48_67; |
| |
| BB48_63: |
| and.b32 %r37, %r2, 2147483647; |
| setp.ne.s32 %p64, %r37, 2146435072; |
| @%p64 bra BB48_64; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r38, %temp}, %fd68; |
| } |
| setp.ne.s32 %p65, %r38, 0; |
| mov.f64 %fd93, %fd24; |
| @%p65 bra BB48_68; |
| |
| shr.s32 %r39, %r3, 31; |
| and.b32 %r40, %r39, -2146435072; |
| add.s32 %r41, %r40, 2146435072; |
| or.b32 %r42, %r41, -2147483648; |
| selp.b32 %r43, %r42, %r41, %p1; |
| mov.u32 %r44, 0; |
| mov.b64 %fd93, {%r44, %r43}; |
| bra.uni BB48_68; |
| |
| BB48_58: |
| mov.f64 %fd93, %fd24; |
| |
| BB48_68: |
| setp.eq.f64 %p69, %fd1, 0d0000000000000000; |
| setp.eq.f64 %p70, %fd68, 0d3FF0000000000000; |
| or.pred %p71, %p70, %p69; |
| selp.f64 %fd94, 0d3FF0000000000000, %fd93, %p71; |
| |
| BB48_71: |
| st.global.f64 [%rd1], %fd94; |
| bra.uni BB48_142; |
| |
| BB48_124: |
| @%p118 bra BB48_127; |
| |
| cvt.rzi.f64.f64 %fd89, %fd68; |
| setp.neu.f64 %p123, %fd89, %fd68; |
| selp.f64 %fd57, 0dFFF8000000000000, %fd57, %p123; |
| |
| BB48_127: |
| add.f64 %fd97, %fd1, %fd68; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r71}, %fd97; |
| } |
| and.b32 %r72, %r71, 2146435072; |
| setp.ne.s32 %p126, %r72, 2146435072; |
| @%p126 bra BB48_128; |
| |
| setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000; |
| @%p127 bra BB48_138; |
| |
| abs.f64 %fd90, %fd68; |
| setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000; |
| @%p128 bra BB48_138; |
| |
| and.b32 %r73, %r5, 2147483647; |
| setp.ne.s32 %p129, %r73, 2146435072; |
| @%p129 bra BB48_133; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r74, %temp}, %fd68; |
| } |
| setp.eq.s32 %p130, %r74, 0; |
| @%p130 bra BB48_137; |
| |
| BB48_133: |
| and.b32 %r75, %r4, 2147483647; |
| setp.ne.s32 %p131, %r75, 2146435072; |
| @%p131 bra BB48_134; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r76, %temp}, %fd1; |
| } |
| setp.ne.s32 %p132, %r76, 0; |
| mov.f64 %fd97, %fd57; |
| @%p132 bra BB48_138; |
| |
| shr.s32 %r77, %r5, 31; |
| and.b32 %r78, %r77, -2146435072; |
| add.s32 %r79, %r78, 2146435072; |
| or.b32 %r80, %r79, -2147483648; |
| selp.b32 %r81, %r80, %r79, %p2; |
| mov.u32 %r82, 0; |
| mov.b64 %fd97, {%r82, %r81}; |
| bra.uni BB48_138; |
| |
| BB48_128: |
| mov.f64 %fd97, %fd57; |
| |
| BB48_138: |
| setp.eq.f64 %p136, %fd68, 0d0000000000000000; |
| setp.eq.f64 %p137, %fd1, 0d3FF0000000000000; |
| or.pred %p138, %p137, %p136; |
| selp.f64 %fd98, 0d3FF0000000000000, %fd97, %p138; |
| |
| BB48_141: |
| st.global.f64 [%rd1], %fd98; |
| |
| BB48_142: |
| bar.sync 0; |
| ret; |
| |
| BB48_64: |
| mov.f64 %fd93, %fd24; |
| bra.uni BB48_68; |
| |
| BB48_134: |
| mov.f64 %fd97, %fd57; |
| bra.uni BB48_138; |
| |
| BB48_67: |
| setp.gt.f64 %p66, %fd18, 0d3FF0000000000000; |
| selp.b32 %r45, 2146435072, 0, %p66; |
| mov.u32 %r46, 0; |
| xor.b32 %r47, %r45, 2146435072; |
| setp.lt.s32 %p67, %r3, 0; |
| selp.b32 %r48, %r47, %r45, %p67; |
| setp.eq.f64 %p68, %fd68, 0dBFF0000000000000; |
| selp.b32 %r49, 1072693248, %r48, %p68; |
| mov.b64 %fd93, {%r46, %r49}; |
| bra.uni BB48_68; |
| |
| BB48_137: |
| setp.gt.f64 %p133, %fd51, 0d3FF0000000000000; |
| selp.b32 %r83, 2146435072, 0, %p133; |
| mov.u32 %r84, 0; |
| xor.b32 %r85, %r83, 2146435072; |
| setp.lt.s32 %p134, %r5, 0; |
| selp.b32 %r86, %r85, %r83, %p134; |
| setp.eq.f64 %p135, %fd1, 0dBFF0000000000000; |
| selp.b32 %r87, 1072693248, %r86, %p135; |
| mov.b64 %fd97, {%r84, %r87}; |
| bra.uni BB48_138; |
| } |
| |
| // .globl matrix_scalar_op_f |
| .visible .entry matrix_scalar_op_f( |
| .param .u64 matrix_scalar_op_f_param_0, |
| .param .f64 matrix_scalar_op_f_param_1, |
| .param .u64 matrix_scalar_op_f_param_2, |
| .param .u32 matrix_scalar_op_f_param_3, |
| .param .u32 matrix_scalar_op_f_param_4, |
| .param .u32 matrix_scalar_op_f_param_5 |
| ) |
| { |
| .reg .pred %p<139>; |
| .reg .f32 %f<267>; |
| .reg .b32 %r<58>; |
| .reg .f64 %fd<2>; |
| .reg .b64 %rd<16>; |
| |
| |
| ld.param.u64 %rd2, [matrix_scalar_op_f_param_0]; |
| ld.param.f64 %fd1, [matrix_scalar_op_f_param_1]; |
| ld.param.u64 %rd3, [matrix_scalar_op_f_param_2]; |
| ld.param.u32 %r4, [matrix_scalar_op_f_param_3]; |
| ld.param.u32 %r2, [matrix_scalar_op_f_param_4]; |
| ld.param.u32 %r3, [matrix_scalar_op_f_param_5]; |
| cvt.rn.f32.f64 %f1, %fd1; |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %ctaid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r5, %r6, %r7; |
| setp.ge.s32 %p3, %r1, %r4; |
| @%p3 bra BB49_126; |
| |
| cvta.to.global.u64 %rd4, %rd3; |
| cvta.to.global.u64 %rd5, %rd2; |
| mul.wide.s32 %rd6, %r1, 4; |
| add.s64 %rd7, %rd5, %rd6; |
| ld.global.f32 %f2, [%rd7]; |
| add.s64 %rd1, %rd4, %rd6; |
| setp.eq.s32 %p4, %r3, 0; |
| @%p4 bra BB49_64; |
| |
| mov.f32 %f262, 0f7F7FFFFF; |
| setp.gt.s32 %p5, %r2, 8; |
| @%p5 bra BB49_19; |
| |
| setp.gt.s32 %p19, %r2, 3; |
| @%p19 bra BB49_11; |
| |
| setp.gt.s32 %p26, %r2, 1; |
| @%p26 bra BB49_8; |
| |
| setp.eq.s32 %p29, %r2, 0; |
| @%p29 bra BB49_62; |
| bra.uni BB49_6; |
| |
| BB49_62: |
| add.f32 %f262, %f1, %f2; |
| bra.uni BB49_63; |
| |
| BB49_64: |
| mov.f32 %f266, 0f7F7FFFFF; |
| setp.gt.s32 %p72, %r2, 8; |
| @%p72 bra BB49_81; |
| |
| setp.gt.s32 %p86, %r2, 3; |
| @%p86 bra BB49_73; |
| |
| setp.gt.s32 %p93, %r2, 1; |
| @%p93 bra BB49_70; |
| |
| setp.eq.s32 %p96, %r2, 0; |
| @%p96 bra BB49_124; |
| bra.uni BB49_68; |
| |
| BB49_124: |
| add.f32 %f266, %f1, %f2; |
| bra.uni BB49_125; |
| |
| BB49_19: |
| setp.gt.s32 %p6, %r2, 13; |
| @%p6 bra BB49_28; |
| |
| setp.gt.s32 %p13, %r2, 10; |
| @%p13 bra BB49_24; |
| |
| setp.eq.s32 %p17, %r2, 9; |
| @%p17 bra BB49_44; |
| bra.uni BB49_22; |
| |
| BB49_44: |
| setp.eq.f32 %p40, %f1, %f2; |
| selp.f32 %f262, 0f3F800000, 0f00000000, %p40; |
| bra.uni BB49_63; |
| |
| BB49_81: |
| setp.gt.s32 %p73, %r2, 13; |
| @%p73 bra BB49_90; |
| |
| setp.gt.s32 %p80, %r2, 10; |
| @%p80 bra BB49_86; |
| |
| setp.eq.s32 %p84, %r2, 9; |
| @%p84 bra BB49_106; |
| bra.uni BB49_84; |
| |
| BB49_106: |
| setp.eq.f32 %p107, %f2, %f1; |
| selp.f32 %f266, 0f3F800000, 0f00000000, %p107; |
| bra.uni BB49_125; |
| |
| BB49_11: |
| setp.gt.s32 %p20, %r2, 5; |
| @%p20 bra BB49_15; |
| |
| setp.eq.s32 %p24, %r2, 4; |
| @%p24 bra BB49_47; |
| bra.uni BB49_13; |
| |
| BB49_47: |
| mul.f32 %f88, %f2, 0f3F000000; |
| cvt.rzi.f32.f32 %f89, %f88; |
| fma.rn.f32 %f90, %f89, 0fC0000000, %f2; |
| abs.f32 %f19, %f90; |
| abs.f32 %f20, %f1; |
| setp.lt.f32 %p45, %f20, 0f00800000; |
| mul.f32 %f91, %f20, 0f4B800000; |
| selp.f32 %f92, 0fC3170000, 0fC2FE0000, %p45; |
| selp.f32 %f93, %f91, %f20, %p45; |
| mov.b32 %r14, %f93; |
| and.b32 %r15, %r14, 8388607; |
| or.b32 %r16, %r15, 1065353216; |
| mov.b32 %f94, %r16; |
| shr.u32 %r17, %r14, 23; |
| cvt.rn.f32.u32 %f95, %r17; |
| add.f32 %f96, %f92, %f95; |
| setp.gt.f32 %p46, %f94, 0f3FB504F3; |
| mul.f32 %f97, %f94, 0f3F000000; |
| add.f32 %f98, %f96, 0f3F800000; |
| selp.f32 %f99, %f97, %f94, %p46; |
| selp.f32 %f100, %f98, %f96, %p46; |
| add.f32 %f101, %f99, 0fBF800000; |
| add.f32 %f87, %f99, 0f3F800000; |
| // inline asm |
| rcp.approx.ftz.f32 %f86,%f87; |
| // inline asm |
| add.f32 %f102, %f101, %f101; |
| mul.f32 %f103, %f86, %f102; |
| mul.f32 %f104, %f103, %f103; |
| mov.f32 %f105, 0f3C4CAF63; |
| mov.f32 %f106, 0f3B18F0FE; |
| fma.rn.f32 %f107, %f106, %f104, %f105; |
| mov.f32 %f108, 0f3DAAAABD; |
| fma.rn.f32 %f109, %f107, %f104, %f108; |
| mul.rn.f32 %f110, %f109, %f104; |
| mul.rn.f32 %f111, %f110, %f103; |
| sub.f32 %f112, %f101, %f103; |
| neg.f32 %f113, %f103; |
| add.f32 %f114, %f112, %f112; |
| fma.rn.f32 %f115, %f113, %f101, %f114; |
| mul.rn.f32 %f116, %f86, %f115; |
| add.f32 %f117, %f111, %f103; |
| sub.f32 %f118, %f103, %f117; |
| add.f32 %f119, %f111, %f118; |
| add.f32 %f120, %f116, %f119; |
| add.f32 %f121, %f117, %f120; |
| sub.f32 %f122, %f117, %f121; |
| add.f32 %f123, %f120, %f122; |
| mov.f32 %f124, 0f3F317200; |
| mul.rn.f32 %f125, %f100, %f124; |
| mov.f32 %f126, 0f35BFBE8E; |
| mul.rn.f32 %f127, %f100, %f126; |
| add.f32 %f128, %f125, %f121; |
| sub.f32 %f129, %f125, %f128; |
| add.f32 %f130, %f121, %f129; |
| add.f32 %f131, %f123, %f130; |
| add.f32 %f132, %f127, %f131; |
| add.f32 %f133, %f128, %f132; |
| sub.f32 %f134, %f128, %f133; |
| add.f32 %f135, %f132, %f134; |
| abs.f32 %f21, %f2; |
| setp.gt.f32 %p47, %f21, 0f77F684DF; |
| mul.f32 %f136, %f2, 0f39000000; |
| selp.f32 %f137, %f136, %f2, %p47; |
| mul.rn.f32 %f138, %f137, %f133; |
| neg.f32 %f139, %f138; |
| fma.rn.f32 %f140, %f137, %f133, %f139; |
| fma.rn.f32 %f141, %f137, %f135, %f140; |
| mov.f32 %f142, 0f00000000; |
| fma.rn.f32 %f143, %f142, %f133, %f141; |
| add.rn.f32 %f144, %f138, %f143; |
| neg.f32 %f145, %f144; |
| add.rn.f32 %f146, %f138, %f145; |
| add.rn.f32 %f147, %f146, %f143; |
| mov.b32 %r18, %f144; |
| setp.eq.s32 %p48, %r18, 1118925336; |
| add.s32 %r19, %r18, -1; |
| mov.b32 %f148, %r19; |
| add.f32 %f149, %f147, 0f37000000; |
| selp.f32 %f150, %f148, %f144, %p48; |
| selp.f32 %f22, %f149, %f147, %p48; |
| mul.f32 %f151, %f150, 0f3FB8AA3B; |
| cvt.rzi.f32.f32 %f152, %f151; |
| mov.f32 %f153, 0fBF317200; |
| fma.rn.f32 %f154, %f152, %f153, %f150; |
| mov.f32 %f155, 0fB5BFBE8E; |
| fma.rn.f32 %f156, %f152, %f155, %f154; |
| mul.f32 %f157, %f156, 0f3FB8AA3B; |
| ex2.approx.ftz.f32 %f158, %f157; |
| add.f32 %f159, %f152, 0f00000000; |
| ex2.approx.f32 %f160, %f159; |
| mul.f32 %f161, %f158, %f160; |
| setp.lt.f32 %p49, %f150, 0fC2D20000; |
| selp.f32 %f162, 0f00000000, %f161, %p49; |
| setp.gt.f32 %p50, %f150, 0f42D20000; |
| selp.f32 %f259, 0f7F800000, %f162, %p50; |
| setp.eq.f32 %p51, %f259, 0f7F800000; |
| @%p51 bra BB49_49; |
| |
| fma.rn.f32 %f259, %f259, %f22, %f259; |
| |
| BB49_49: |
| setp.lt.f32 %p52, %f1, 0f00000000; |
| setp.eq.f32 %p53, %f19, 0f3F800000; |
| and.pred %p1, %p52, %p53; |
| mov.b32 %r20, %f259; |
| xor.b32 %r21, %r20, -2147483648; |
| mov.b32 %f163, %r21; |
| selp.f32 %f261, %f163, %f259, %p1; |
| setp.eq.f32 %p54, %f1, 0f00000000; |
| @%p54 bra BB49_52; |
| bra.uni BB49_50; |
| |
| BB49_52: |
| add.f32 %f165, %f1, %f1; |
| mov.b32 %r22, %f165; |
| selp.b32 %r23, %r22, 0, %p53; |
| or.b32 %r24, %r23, 2139095040; |
| setp.lt.f32 %p58, %f2, 0f00000000; |
| selp.b32 %r25, %r24, %r23, %p58; |
| mov.b32 %f261, %r25; |
| bra.uni BB49_53; |
| |
| BB49_28: |
| setp.gt.s32 %p7, %r2, 15; |
| @%p7 bra BB49_32; |
| |
| setp.eq.s32 %p11, %r2, 14; |
| @%p11 bra BB49_41; |
| bra.uni BB49_30; |
| |
| BB49_41: |
| cvt.rni.s64.f32 %rd8, %f1; |
| cvt.u32.u64 %r8, %rd8; |
| cvt.rni.s64.f32 %rd9, %f2; |
| cvt.u32.u64 %r9, %rd9; |
| or.b32 %r10, %r9, %r8; |
| setp.eq.s32 %p37, %r10, 0; |
| selp.f32 %f262, 0f00000000, 0f3F800000, %p37; |
| bra.uni BB49_63; |
| |
| BB49_73: |
| setp.gt.s32 %p87, %r2, 5; |
| @%p87 bra BB49_77; |
| |
| setp.eq.s32 %p91, %r2, 4; |
| @%p91 bra BB49_109; |
| bra.uni BB49_75; |
| |
| BB49_109: |
| mul.f32 %f179, %f1, 0f3F000000; |
| cvt.rzi.f32.f32 %f180, %f179; |
| fma.rn.f32 %f181, %f180, 0fC0000000, %f1; |
| abs.f32 %f56, %f181; |
| abs.f32 %f57, %f2; |
| setp.lt.f32 %p112, %f57, 0f00800000; |
| mul.f32 %f182, %f57, 0f4B800000; |
| selp.f32 %f183, 0fC3170000, 0fC2FE0000, %p112; |
| selp.f32 %f184, %f182, %f57, %p112; |
| mov.b32 %r39, %f184; |
| and.b32 %r40, %r39, 8388607; |
| or.b32 %r41, %r40, 1065353216; |
| mov.b32 %f185, %r41; |
| shr.u32 %r42, %r39, 23; |
| cvt.rn.f32.u32 %f186, %r42; |
| add.f32 %f187, %f183, %f186; |
| setp.gt.f32 %p113, %f185, 0f3FB504F3; |
| mul.f32 %f188, %f185, 0f3F000000; |
| add.f32 %f189, %f187, 0f3F800000; |
| selp.f32 %f190, %f188, %f185, %p113; |
| selp.f32 %f191, %f189, %f187, %p113; |
| add.f32 %f192, %f190, 0fBF800000; |
| add.f32 %f178, %f190, 0f3F800000; |
| // inline asm |
| rcp.approx.ftz.f32 %f177,%f178; |
| // inline asm |
| add.f32 %f193, %f192, %f192; |
| mul.f32 %f194, %f177, %f193; |
| mul.f32 %f195, %f194, %f194; |
| mov.f32 %f196, 0f3C4CAF63; |
| mov.f32 %f197, 0f3B18F0FE; |
| fma.rn.f32 %f198, %f197, %f195, %f196; |
| mov.f32 %f199, 0f3DAAAABD; |
| fma.rn.f32 %f200, %f198, %f195, %f199; |
| mul.rn.f32 %f201, %f200, %f195; |
| mul.rn.f32 %f202, %f201, %f194; |
| sub.f32 %f203, %f192, %f194; |
| neg.f32 %f204, %f194; |
| add.f32 %f205, %f203, %f203; |
| fma.rn.f32 %f206, %f204, %f192, %f205; |
| mul.rn.f32 %f207, %f177, %f206; |
| add.f32 %f208, %f202, %f194; |
| sub.f32 %f209, %f194, %f208; |
| add.f32 %f210, %f202, %f209; |
| add.f32 %f211, %f207, %f210; |
| add.f32 %f212, %f208, %f211; |
| sub.f32 %f213, %f208, %f212; |
| add.f32 %f214, %f211, %f213; |
| mov.f32 %f215, 0f3F317200; |
| mul.rn.f32 %f216, %f191, %f215; |
| mov.f32 %f217, 0f35BFBE8E; |
| mul.rn.f32 %f218, %f191, %f217; |
| add.f32 %f219, %f216, %f212; |
| sub.f32 %f220, %f216, %f219; |
| add.f32 %f221, %f212, %f220; |
| add.f32 %f222, %f214, %f221; |
| add.f32 %f223, %f218, %f222; |
| add.f32 %f224, %f219, %f223; |
| sub.f32 %f225, %f219, %f224; |
| add.f32 %f226, %f223, %f225; |
| abs.f32 %f58, %f1; |
| setp.gt.f32 %p114, %f58, 0f77F684DF; |
| mul.f32 %f227, %f1, 0f39000000; |
| selp.f32 %f228, %f227, %f1, %p114; |
| mul.rn.f32 %f229, %f228, %f224; |
| neg.f32 %f230, %f229; |
| fma.rn.f32 %f231, %f228, %f224, %f230; |
| fma.rn.f32 %f232, %f228, %f226, %f231; |
| mov.f32 %f233, 0f00000000; |
| fma.rn.f32 %f234, %f233, %f224, %f232; |
| add.rn.f32 %f235, %f229, %f234; |
| neg.f32 %f236, %f235; |
| add.rn.f32 %f237, %f229, %f236; |
| add.rn.f32 %f238, %f237, %f234; |
| mov.b32 %r43, %f235; |
| setp.eq.s32 %p115, %r43, 1118925336; |
| add.s32 %r44, %r43, -1; |
| mov.b32 %f239, %r44; |
| add.f32 %f240, %f238, 0f37000000; |
| selp.f32 %f241, %f239, %f235, %p115; |
| selp.f32 %f59, %f240, %f238, %p115; |
| mul.f32 %f242, %f241, 0f3FB8AA3B; |
| cvt.rzi.f32.f32 %f243, %f242; |
| mov.f32 %f244, 0fBF317200; |
| fma.rn.f32 %f245, %f243, %f244, %f241; |
| mov.f32 %f246, 0fB5BFBE8E; |
| fma.rn.f32 %f247, %f243, %f246, %f245; |
| mul.f32 %f248, %f247, 0f3FB8AA3B; |
| ex2.approx.ftz.f32 %f249, %f248; |
| add.f32 %f250, %f243, 0f00000000; |
| ex2.approx.f32 %f251, %f250; |
| mul.f32 %f252, %f249, %f251; |
| setp.lt.f32 %p116, %f241, 0fC2D20000; |
| selp.f32 %f253, 0f00000000, %f252, %p116; |
| setp.gt.f32 %p117, %f241, 0f42D20000; |
| selp.f32 %f263, 0f7F800000, %f253, %p117; |
| setp.eq.f32 %p118, %f263, 0f7F800000; |
| @%p118 bra BB49_111; |
| |
| fma.rn.f32 %f263, %f263, %f59, %f263; |
| |
| BB49_111: |
| setp.lt.f32 %p119, %f2, 0f00000000; |
| setp.eq.f32 %p120, %f56, 0f3F800000; |
| and.pred %p2, %p119, %p120; |
| mov.b32 %r45, %f263; |
| xor.b32 %r46, %r45, -2147483648; |
| mov.b32 %f254, %r46; |
| selp.f32 %f265, %f254, %f263, %p2; |
| setp.eq.f32 %p121, %f2, 0f00000000; |
| @%p121 bra BB49_114; |
| bra.uni BB49_112; |
| |
| BB49_114: |
| add.f32 %f256, %f2, %f2; |
| mov.b32 %r47, %f256; |
| selp.b32 %r48, %r47, 0, %p120; |
| or.b32 %r49, %r48, 2139095040; |
| setp.lt.f32 %p125, %f1, 0f00000000; |
| selp.b32 %r50, %r49, %r48, %p125; |
| mov.b32 %f265, %r50; |
| bra.uni BB49_115; |
| |
| BB49_90: |
| setp.gt.s32 %p74, %r2, 15; |
| @%p74 bra BB49_94; |
| |
| setp.eq.s32 %p78, %r2, 14; |
| @%p78 bra BB49_103; |
| bra.uni BB49_92; |
| |
| BB49_103: |
| cvt.rni.s64.f32 %rd12, %f2; |
| cvt.u32.u64 %r33, %rd12; |
| cvt.rni.s64.f32 %rd13, %f1; |
| cvt.u32.u64 %r34, %rd13; |
| or.b32 %r35, %r34, %r33; |
| setp.eq.s32 %p104, %r35, 0; |
| selp.f32 %f266, 0f00000000, 0f3F800000, %p104; |
| bra.uni BB49_125; |
| |
| BB49_8: |
| setp.eq.s32 %p27, %r2, 2; |
| @%p27 bra BB49_61; |
| bra.uni BB49_9; |
| |
| BB49_61: |
| mul.f32 %f262, %f1, %f2; |
| bra.uni BB49_63; |
| |
| BB49_24: |
| setp.eq.s32 %p14, %r2, 11; |
| @%p14 bra BB49_43; |
| |
| setp.eq.s32 %p15, %r2, 12; |
| @%p15 bra BB49_42; |
| bra.uni BB49_26; |
| |
| BB49_42: |
| max.f32 %f262, %f1, %f2; |
| bra.uni BB49_63; |
| |
| BB49_15: |
| setp.eq.s32 %p21, %r2, 6; |
| @%p21 bra BB49_46; |
| |
| setp.eq.s32 %p22, %r2, 7; |
| @%p22 bra BB49_45; |
| bra.uni BB49_17; |
| |
| BB49_45: |
| setp.gt.f32 %p42, %f1, %f2; |
| selp.f32 %f262, 0f3F800000, 0f00000000, %p42; |
| bra.uni BB49_63; |
| |
| BB49_32: |
| setp.eq.s32 %p8, %r2, 16; |
| @%p8 bra BB49_40; |
| |
| setp.eq.s32 %p9, %r2, 17; |
| @%p9 bra BB49_37; |
| bra.uni BB49_34; |
| |
| BB49_37: |
| setp.eq.f32 %p32, %f2, 0f00000000; |
| setp.eq.f32 %p33, %f2, 0f80000000; |
| or.pred %p34, %p32, %p33; |
| mov.f32 %f262, 0f7FC00000; |
| @%p34 bra BB49_63; |
| |
| div.rn.f32 %f262, %f1, %f2; |
| abs.f32 %f80, %f262; |
| setp.geu.f32 %p35, %f80, 0f7F800000; |
| @%p35 bra BB49_63; |
| |
| cvt.rmi.f32.f32 %f81, %f262; |
| mul.f32 %f82, %f2, %f81; |
| sub.f32 %f262, %f1, %f82; |
| bra.uni BB49_63; |
| |
| BB49_70: |
| setp.eq.s32 %p94, %r2, 2; |
| @%p94 bra BB49_123; |
| bra.uni BB49_71; |
| |
| BB49_123: |
| mul.f32 %f266, %f1, %f2; |
| bra.uni BB49_125; |
| |
| BB49_86: |
| setp.eq.s32 %p81, %r2, 11; |
| @%p81 bra BB49_105; |
| |
| setp.eq.s32 %p82, %r2, 12; |
| @%p82 bra BB49_104; |
| bra.uni BB49_88; |
| |
| BB49_104: |
| max.f32 %f266, %f2, %f1; |
| bra.uni BB49_125; |
| |
| BB49_77: |
| setp.eq.s32 %p88, %r2, 6; |
| @%p88 bra BB49_108; |
| |
| setp.eq.s32 %p89, %r2, 7; |
| @%p89 bra BB49_107; |
| bra.uni BB49_79; |
| |
| BB49_107: |
| setp.gt.f32 %p109, %f2, %f1; |
| selp.f32 %f266, 0f3F800000, 0f00000000, %p109; |
| bra.uni BB49_125; |
| |
| BB49_94: |
| setp.eq.s32 %p75, %r2, 16; |
| @%p75 bra BB49_102; |
| |
| setp.eq.s32 %p76, %r2, 17; |
| @%p76 bra BB49_99; |
| bra.uni BB49_96; |
| |
| BB49_99: |
| setp.eq.f32 %p99, %f1, 0f00000000; |
| setp.eq.f32 %p100, %f1, 0f80000000; |
| or.pred %p101, %p99, %p100; |
| mov.f32 %f266, 0f7FC00000; |
| @%p101 bra BB49_125; |
| |
| div.rn.f32 %f266, %f2, %f1; |
| abs.f32 %f171, %f266; |
| setp.geu.f32 %p102, %f171, 0f7F800000; |
| @%p102 bra BB49_125; |
| |
| cvt.rmi.f32.f32 %f172, %f266; |
| mul.f32 %f173, %f1, %f172; |
| sub.f32 %f266, %f2, %f173; |
| bra.uni BB49_125; |
| |
| BB49_6: |
| setp.eq.s32 %p30, %r2, 1; |
| @%p30 bra BB49_7; |
| bra.uni BB49_63; |
| |
| BB49_7: |
| sub.f32 %f262, %f1, %f2; |
| bra.uni BB49_63; |
| |
| BB49_22: |
| setp.eq.s32 %p18, %r2, 10; |
| @%p18 bra BB49_23; |
| bra.uni BB49_63; |
| |
| BB49_23: |
| setp.neu.f32 %p39, %f1, %f2; |
| selp.f32 %f262, 0f3F800000, 0f00000000, %p39; |
| bra.uni BB49_63; |
| |
| BB49_13: |
| setp.eq.s32 %p25, %r2, 5; |
| @%p25 bra BB49_14; |
| bra.uni BB49_63; |
| |
| BB49_14: |
| setp.lt.f32 %p44, %f1, %f2; |
| selp.f32 %f262, 0f3F800000, 0f00000000, %p44; |
| bra.uni BB49_63; |
| |
| BB49_30: |
| setp.eq.s32 %p12, %r2, 15; |
| @%p12 bra BB49_31; |
| bra.uni BB49_63; |
| |
| BB49_31: |
| mul.f32 %f84, %f1, %f2; |
| mov.f32 %f85, 0f3F800000; |
| sub.f32 %f262, %f85, %f84; |
| bra.uni BB49_63; |
| |
| BB49_9: |
| setp.eq.s32 %p28, %r2, 3; |
| @%p28 bra BB49_10; |
| bra.uni BB49_63; |
| |
| BB49_10: |
| div.rn.f32 %f262, %f1, %f2; |
| bra.uni BB49_63; |
| |
| BB49_43: |
| min.f32 %f262, %f1, %f2; |
| bra.uni BB49_63; |
| |
| BB49_26: |
| setp.eq.s32 %p16, %r2, 13; |
| @%p16 bra BB49_27; |
| bra.uni BB49_63; |
| |
| BB49_27: |
| cvt.rni.s64.f32 %rd10, %f1; |
| cvt.u32.u64 %r11, %rd10; |
| cvt.rni.s64.f32 %rd11, %f2; |
| cvt.u32.u64 %r12, %rd11; |
| and.b32 %r13, %r12, %r11; |
| setp.eq.s32 %p38, %r13, 0; |
| selp.f32 %f262, 0f00000000, 0f3F800000, %p38; |
| bra.uni BB49_63; |
| |
| BB49_46: |
| setp.gtu.f32 %p43, %f1, %f2; |
| selp.f32 %f262, 0f00000000, 0f3F800000, %p43; |
| bra.uni BB49_63; |
| |
| BB49_17: |
| setp.eq.s32 %p23, %r2, 8; |
| @%p23 bra BB49_18; |
| bra.uni BB49_63; |
| |
| BB49_18: |
| setp.ltu.f32 %p41, %f1, %f2; |
| selp.f32 %f262, 0f00000000, 0f3F800000, %p41; |
| bra.uni BB49_63; |
| |
| BB49_40: |
| setp.neu.f32 %p36, %f1, 0f00000000; |
| sub.f32 %f83, %f1, %f2; |
| selp.f32 %f262, %f83, 0f00000000, %p36; |
| bra.uni BB49_63; |
| |
| BB49_34: |
| setp.ne.s32 %p10, %r2, 18; |
| @%p10 bra BB49_63; |
| |
| div.rn.f32 %f262, %f1, %f2; |
| abs.f32 %f78, %f262; |
| setp.geu.f32 %p31, %f78, 0f7F800000; |
| @%p31 bra BB49_63; |
| |
| cvt.rmi.f32.f32 %f262, %f262; |
| bra.uni BB49_63; |
| |
| BB49_68: |
| setp.eq.s32 %p97, %r2, 1; |
| @%p97 bra BB49_69; |
| bra.uni BB49_125; |
| |
| BB49_69: |
| sub.f32 %f266, %f2, %f1; |
| bra.uni BB49_125; |
| |
| BB49_84: |
| setp.eq.s32 %p85, %r2, 10; |
| @%p85 bra BB49_85; |
| bra.uni BB49_125; |
| |
| BB49_85: |
| setp.neu.f32 %p106, %f2, %f1; |
| selp.f32 %f266, 0f3F800000, 0f00000000, %p106; |
| bra.uni BB49_125; |
| |
| BB49_75: |
| setp.eq.s32 %p92, %r2, 5; |
| @%p92 bra BB49_76; |
| bra.uni BB49_125; |
| |
| BB49_76: |
| setp.lt.f32 %p111, %f2, %f1; |
| selp.f32 %f266, 0f3F800000, 0f00000000, %p111; |
| bra.uni BB49_125; |
| |
| BB49_92: |
| setp.eq.s32 %p79, %r2, 15; |
| @%p79 bra BB49_93; |
| bra.uni BB49_125; |
| |
| BB49_93: |
| mul.f32 %f175, %f1, %f2; |
| mov.f32 %f176, 0f3F800000; |
| sub.f32 %f266, %f176, %f175; |
| bra.uni BB49_125; |
| |
| BB49_71: |
| setp.eq.s32 %p95, %r2, 3; |
| @%p95 bra BB49_72; |
| bra.uni BB49_125; |
| |
| BB49_72: |
| div.rn.f32 %f266, %f2, %f1; |
| bra.uni BB49_125; |
| |
| BB49_105: |
| min.f32 %f266, %f2, %f1; |
| bra.uni BB49_125; |
| |
| BB49_88: |
| setp.eq.s32 %p83, %r2, 13; |
| @%p83 bra BB49_89; |
| bra.uni BB49_125; |
| |
| BB49_89: |
| cvt.rni.s64.f32 %rd14, %f2; |
| cvt.u32.u64 %r36, %rd14; |
| cvt.rni.s64.f32 %rd15, %f1; |
| cvt.u32.u64 %r37, %rd15; |
| and.b32 %r38, %r37, %r36; |
| setp.eq.s32 %p105, %r38, 0; |
| selp.f32 %f266, 0f00000000, 0f3F800000, %p105; |
| bra.uni BB49_125; |
| |
| BB49_108: |
| setp.gtu.f32 %p110, %f2, %f1; |
| selp.f32 %f266, 0f00000000, 0f3F800000, %p110; |
| bra.uni BB49_125; |
| |
| BB49_79: |
| setp.eq.s32 %p90, %r2, 8; |
| @%p90 bra BB49_80; |
| bra.uni BB49_125; |
| |
| BB49_80: |
| setp.ltu.f32 %p108, %f2, %f1; |
| selp.f32 %f266, 0f00000000, 0f3F800000, %p108; |
| bra.uni BB49_125; |
| |
| BB49_102: |
| setp.neu.f32 %p103, %f2, 0f00000000; |
| sub.f32 %f174, %f2, %f1; |
| selp.f32 %f266, %f174, 0f00000000, %p103; |
| bra.uni BB49_125; |
| |
| BB49_96: |
| setp.ne.s32 %p77, %r2, 18; |
| @%p77 bra BB49_125; |
| |
| div.rn.f32 %f266, %f2, %f1; |
| abs.f32 %f169, %f266; |
| setp.geu.f32 %p98, %f169, 0f7F800000; |
| @%p98 bra BB49_125; |
| |
| cvt.rmi.f32.f32 %f266, %f266; |
| bra.uni BB49_125; |
| |
| BB49_50: |
| setp.geu.f32 %p55, %f1, 0f00000000; |
| @%p55 bra BB49_53; |
| |
| cvt.rzi.f32.f32 %f164, %f2; |
| setp.neu.f32 %p56, %f164, %f2; |
| selp.f32 %f261, 0f7FFFFFFF, %f261, %p56; |
| |
| BB49_53: |
| add.f32 %f166, %f20, %f21; |
| mov.b32 %r26, %f166; |
| setp.lt.s32 %p59, %r26, 2139095040; |
| @%p59 bra BB49_60; |
| |
| setp.gtu.f32 %p60, %f20, 0f7F800000; |
| setp.gtu.f32 %p61, %f21, 0f7F800000; |
| or.pred %p62, %p60, %p61; |
| @%p62 bra BB49_59; |
| bra.uni BB49_55; |
| |
| BB49_59: |
| add.f32 %f261, %f1, %f2; |
| bra.uni BB49_60; |
| |
| BB49_55: |
| setp.eq.f32 %p63, %f21, 0f7F800000; |
| @%p63 bra BB49_58; |
| bra.uni BB49_56; |
| |
| BB49_58: |
| setp.gt.f32 %p66, %f20, 0f3F800000; |
| selp.b32 %r30, 2139095040, 0, %p66; |
| xor.b32 %r31, %r30, 2139095040; |
| setp.lt.f32 %p67, %f2, 0f00000000; |
| selp.b32 %r32, %r31, %r30, %p67; |
| mov.b32 %f167, %r32; |
| setp.eq.f32 %p68, %f1, 0fBF800000; |
| selp.f32 %f261, 0f3F800000, %f167, %p68; |
| bra.uni BB49_60; |
| |
| BB49_112: |
| setp.geu.f32 %p122, %f2, 0f00000000; |
| @%p122 bra BB49_115; |
| |
| cvt.rzi.f32.f32 %f255, %f1; |
| setp.neu.f32 %p123, %f255, %f1; |
| selp.f32 %f265, 0f7FFFFFFF, %f265, %p123; |
| |
| BB49_115: |
| add.f32 %f257, %f57, %f58; |
| mov.b32 %r51, %f257; |
| setp.lt.s32 %p126, %r51, 2139095040; |
| @%p126 bra BB49_122; |
| |
| setp.gtu.f32 %p127, %f57, 0f7F800000; |
| setp.gtu.f32 %p128, %f58, 0f7F800000; |
| or.pred %p129, %p127, %p128; |
| @%p129 bra BB49_121; |
| bra.uni BB49_117; |
| |
| BB49_121: |
| add.f32 %f265, %f1, %f2; |
| bra.uni BB49_122; |
| |
| BB49_117: |
| setp.eq.f32 %p130, %f58, 0f7F800000; |
| @%p130 bra BB49_120; |
| bra.uni BB49_118; |
| |
| BB49_120: |
| setp.gt.f32 %p133, %f57, 0f3F800000; |
| selp.b32 %r55, 2139095040, 0, %p133; |
| xor.b32 %r56, %r55, 2139095040; |
| setp.lt.f32 %p134, %f1, 0f00000000; |
| selp.b32 %r57, %r56, %r55, %p134; |
| mov.b32 %f258, %r57; |
| setp.eq.f32 %p135, %f2, 0fBF800000; |
| selp.f32 %f265, 0f3F800000, %f258, %p135; |
| bra.uni BB49_122; |
| |
| BB49_56: |
| setp.neu.f32 %p64, %f20, 0f7F800000; |
| @%p64 bra BB49_60; |
| |
| setp.ltu.f32 %p65, %f2, 0f00000000; |
| selp.b32 %r27, 0, 2139095040, %p65; |
| or.b32 %r28, %r27, -2147483648; |
| selp.b32 %r29, %r28, %r27, %p1; |
| mov.b32 %f261, %r29; |
| |
| BB49_60: |
| setp.eq.f32 %p69, %f2, 0f00000000; |
| setp.eq.f32 %p70, %f1, 0f3F800000; |
| or.pred %p71, %p70, %p69; |
| selp.f32 %f262, 0f3F800000, %f261, %p71; |
| |
| BB49_63: |
| st.global.f32 [%rd1], %f262; |
| bra.uni BB49_126; |
| |
| BB49_118: |
| setp.neu.f32 %p131, %f57, 0f7F800000; |
| @%p131 bra BB49_122; |
| |
| setp.ltu.f32 %p132, %f1, 0f00000000; |
| selp.b32 %r52, 0, 2139095040, %p132; |
| or.b32 %r53, %r52, -2147483648; |
| selp.b32 %r54, %r53, %r52, %p2; |
| mov.b32 %f265, %r54; |
| |
| BB49_122: |
| setp.eq.f32 %p136, %f1, 0f00000000; |
| setp.eq.f32 %p137, %f2, 0f3F800000; |
| or.pred %p138, %p137, %p136; |
| selp.f32 %f266, 0f3F800000, %f265, %p138; |
| |
| BB49_125: |
| st.global.f32 [%rd1], %f266; |
| |
| BB49_126: |
| bar.sync 0; |
| ret; |
| } |
| |
| // .globl fill_d |
| // fill_d: each thread stores the f64 value param_1 into element idx of the |
| // buffer at param_0, where idx = ntid.x * ctaid.x + tid.x, provided |
| // idx < param_2 (signed compare). |
| .visible .entry fill_d( |
| .param .u64 fill_d_param_0, |
| .param .f64 fill_d_param_1, |
| .param .u32 fill_d_param_2 |
| ) |
| { |
| .reg .pred %p_store; |
| .reg .b32 %r_idx, %r_len, %r_cta, %r_ntid, %r_tid; |
| .reg .f64 %fd_val; |
| .reg .b64 %rd_arg, %rd_base, %rd_off, %rd_dst; |
| |
| // idx = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_idx, %r_ntid, %r_cta, %r_tid; |
| |
| ld.param.u32 %r_len, [fill_d_param_2]; |
| ld.param.f64 %fd_val, [fill_d_param_1]; |
| ld.param.u64 %rd_arg, [fill_d_param_0]; |
| |
| // dst = global(param_0) + idx * 8; pure address arithmetic, safe to compute unguarded |
| cvta.to.global.u64 %rd_base, %rd_arg; |
| mul.wide.s32 %rd_off, %r_idx, 8; |
| add.s64 %rd_dst, %rd_base, %rd_off; |
| |
| // Only the store is guarded: it executes when idx < param_2. |
| setp.lt.s32 %p_store, %r_idx, %r_len; |
| @%p_store st.global.f64 [%rd_dst], %fd_val; |
| ret; |
| } |
| |
| // .globl fill_f |
| // fill_f: each thread stores param_1 (passed as f64, rounded to f32 with |
| // round-to-nearest) into element idx of the buffer at param_0, where |
| // idx = ntid.x * ctaid.x + tid.x, provided idx < param_2 (signed compare). |
| .visible .entry fill_f( |
| .param .u64 fill_f_param_0, |
| .param .f64 fill_f_param_1, |
| .param .u32 fill_f_param_2 |
| ) |
| { |
| .reg .pred %p_store; |
| .reg .f32 %f_val; |
| .reg .b32 %r_idx, %r_len, %r_cta, %r_ntid, %r_tid; |
| .reg .f64 %fd_arg; |
| .reg .b64 %rd_arg, %rd_base, %rd_off, %rd_dst; |
| |
| // idx = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_idx, %r_ntid, %r_cta, %r_tid; |
| |
| ld.param.u32 %r_len, [fill_f_param_2]; |
| ld.param.f64 %fd_arg, [fill_f_param_1]; |
| ld.param.u64 %rd_arg, [fill_f_param_0]; |
| |
| // Narrow the fill value once; the result is only consumed by the guarded store. |
| cvt.rn.f32.f64 %f_val, %fd_arg; |
| |
| // dst = global(param_0) + idx * 4 |
| cvta.to.global.u64 %rd_base, %rd_arg; |
| mul.wide.s32 %rd_off, %r_idx, 4; |
| add.s64 %rd_dst, %rd_base, %rd_off; |
| |
| // Only the store is guarded: it executes when idx < param_2. |
| setp.lt.s32 %p_store, %r_idx, %r_len; |
| @%p_store st.global.f32 [%rd_dst], %f_val; |
| ret; |
| } |
| |
| // .globl cbind_d |
| // cbind_d: copies f64 elements from two inputs (param_0, param_1) into one |
| // output (param_2). The flat thread index is split by max(param_4, param_6) |
| // into a quotient q and remainder r. The output is addressed as |
| // q * (param_4 + param_6) + r for the first input and with r shifted by |
| // param_4 for the second. |
| // NOTE(review): presumably param_3/param_4 are rows/cols of the first input |
| // and param_5/param_6 those of the second -- confirm against the host launcher. |
| .visible .entry cbind_d( |
| .param .u64 cbind_d_param_0, |
| .param .u64 cbind_d_param_1, |
| .param .u64 cbind_d_param_2, |
| .param .u32 cbind_d_param_3, |
| .param .u32 cbind_d_param_4, |
| .param .u32 cbind_d_param_5, |
| .param .u32 cbind_d_param_6 |
| ) |
| { |
| .reg .pred %p_q, %p_r, %p_copy; |
| .reg .b32 %r_tid, %r_ntid, %r_cta, %r_gid; |
| .reg .b32 %r_p3, %r_p4, %r_p5, %r_p6; |
| .reg .b32 %r_div, %r_quot, %r_rem, %r_width; |
| .reg .b32 %r_src, %r_dst, %r_shift; |
| .reg .f64 %fd_val; |
| .reg .b64 %rd_in, %rd_out, %rd_off, %rd_src, %rd_dst; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_gid, %r_ntid, %r_cta, %r_tid; |
| |
| // q = gid / max(param_4, param_6), r = gid % max(param_4, param_6) (signed) |
| ld.param.u32 %r_p4, [cbind_d_param_4]; |
| ld.param.u32 %r_p6, [cbind_d_param_6]; |
| max.s32 %r_div, %r_p4, %r_p6; |
| div.s32 %r_quot, %r_gid, %r_div; |
| rem.s32 %r_rem, %r_gid, %r_div; |
| add.s32 %r_width, %r_p6, %r_p4; |
| |
| ld.param.u64 %rd_out, [cbind_d_param_2]; |
| cvta.to.global.u64 %rd_out, %rd_out; |
| |
| // First input: copy only when q < param_3 and r < param_4. |
| ld.param.u32 %r_p3, [cbind_d_param_3]; |
| setp.lt.s32 %p_q, %r_quot, %r_p3; |
| setp.lt.s32 %p_r, %r_rem, %r_p4; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra CBIND_D_SECOND; |
| |
| // out[q * width + r] = in0[q * param_4 + r] |
| ld.param.u64 %rd_in, [cbind_d_param_0]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p4, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 8; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f64 %fd_val, [%rd_src]; |
| mad.lo.s32 %r_dst, %r_quot, %r_width, %r_rem; |
| mul.wide.s32 %rd_off, %r_dst, 8; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f64 [%rd_dst], %fd_val; |
| |
| CBIND_D_SECOND: |
| // Second input: copy only when q < param_5 and r < param_6. |
| ld.param.u32 %r_p5, [cbind_d_param_5]; |
| setp.lt.s32 %p_q, %r_quot, %r_p5; |
| setp.lt.s32 %p_r, %r_rem, %r_p6; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra CBIND_D_DONE; |
| |
| // out[q * width + (r + param_4)] = in1[q * param_6 + r] |
| ld.param.u64 %rd_in, [cbind_d_param_1]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p6, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 8; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f64 %fd_val, [%rd_src]; |
| add.s32 %r_shift, %r_rem, %r_p4; |
| mad.lo.s32 %r_dst, %r_quot, %r_width, %r_shift; |
| mul.wide.s32 %rd_off, %r_dst, 8; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f64 [%rd_dst], %fd_val; |
| |
| CBIND_D_DONE: |
| ret; |
| } |
| |
| // .globl cbind_f |
| // cbind_f: copies f32 elements from two inputs (param_0, param_1) into one |
| // output (param_2). The flat thread index is split by max(param_4, param_6) |
| // into a quotient q and remainder r. The output is addressed as |
| // q * (param_4 + param_6) + r for the first input and with r shifted by |
| // param_4 for the second. |
| // NOTE(review): presumably param_3/param_4 are rows/cols of the first input |
| // and param_5/param_6 those of the second -- confirm against the host launcher. |
| .visible .entry cbind_f( |
| .param .u64 cbind_f_param_0, |
| .param .u64 cbind_f_param_1, |
| .param .u64 cbind_f_param_2, |
| .param .u32 cbind_f_param_3, |
| .param .u32 cbind_f_param_4, |
| .param .u32 cbind_f_param_5, |
| .param .u32 cbind_f_param_6 |
| ) |
| { |
| .reg .pred %p_q, %p_r, %p_copy; |
| .reg .f32 %f_val; |
| .reg .b32 %r_tid, %r_ntid, %r_cta, %r_gid; |
| .reg .b32 %r_p3, %r_p4, %r_p5, %r_p6; |
| .reg .b32 %r_div, %r_quot, %r_rem, %r_width; |
| .reg .b32 %r_src, %r_dst, %r_shift; |
| .reg .b64 %rd_in, %rd_out, %rd_off, %rd_src, %rd_dst; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_gid, %r_ntid, %r_cta, %r_tid; |
| |
| // q = gid / max(param_4, param_6), r = gid % max(param_4, param_6) (signed) |
| ld.param.u32 %r_p4, [cbind_f_param_4]; |
| ld.param.u32 %r_p6, [cbind_f_param_6]; |
| max.s32 %r_div, %r_p4, %r_p6; |
| div.s32 %r_quot, %r_gid, %r_div; |
| rem.s32 %r_rem, %r_gid, %r_div; |
| add.s32 %r_width, %r_p6, %r_p4; |
| |
| ld.param.u64 %rd_out, [cbind_f_param_2]; |
| cvta.to.global.u64 %rd_out, %rd_out; |
| |
| // First input: copy only when q < param_3 and r < param_4. |
| ld.param.u32 %r_p3, [cbind_f_param_3]; |
| setp.lt.s32 %p_q, %r_quot, %r_p3; |
| setp.lt.s32 %p_r, %r_rem, %r_p4; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra CBIND_F_SECOND; |
| |
| // out[q * width + r] = in0[q * param_4 + r] |
| ld.param.u64 %rd_in, [cbind_f_param_0]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p4, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 4; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f32 %f_val, [%rd_src]; |
| mad.lo.s32 %r_dst, %r_quot, %r_width, %r_rem; |
| mul.wide.s32 %rd_off, %r_dst, 4; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f32 [%rd_dst], %f_val; |
| |
| CBIND_F_SECOND: |
| // Second input: copy only when q < param_5 and r < param_6. |
| ld.param.u32 %r_p5, [cbind_f_param_5]; |
| setp.lt.s32 %p_q, %r_quot, %r_p5; |
| setp.lt.s32 %p_r, %r_rem, %r_p6; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra CBIND_F_DONE; |
| |
| // out[q * width + (r + param_4)] = in1[q * param_6 + r] |
| ld.param.u64 %rd_in, [cbind_f_param_1]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p6, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 4; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f32 %f_val, [%rd_src]; |
| add.s32 %r_shift, %r_rem, %r_p4; |
| mad.lo.s32 %r_dst, %r_quot, %r_width, %r_shift; |
| mul.wide.s32 %rd_off, %r_dst, 4; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f32 [%rd_dst], %f_val; |
| |
| CBIND_F_DONE: |
| ret; |
| } |
| |
| // .globl rbind_d |
| // rbind_d: copies f64 elements from two inputs (param_0, param_1) into one |
| // output (param_2). The flat thread index is split by max(param_4, param_6) |
| // into a quotient q and remainder r. The first input is copied to the same |
| // flat offset q * param_4 + r; the second is written at |
| // (q + param_3) * param_4 + r. |
| // NOTE(review): presumably param_3/param_4 are rows/cols of the first input |
| // and param_5/param_6 those of the second -- confirm against the host launcher. |
| .visible .entry rbind_d( |
| .param .u64 rbind_d_param_0, |
| .param .u64 rbind_d_param_1, |
| .param .u64 rbind_d_param_2, |
| .param .u32 rbind_d_param_3, |
| .param .u32 rbind_d_param_4, |
| .param .u32 rbind_d_param_5, |
| .param .u32 rbind_d_param_6 |
| ) |
| { |
| .reg .pred %p_q, %p_r, %p_copy; |
| .reg .b32 %r_tid, %r_ntid, %r_cta, %r_gid; |
| .reg .b32 %r_p3, %r_p4, %r_p5, %r_p6; |
| .reg .b32 %r_div, %r_quot, %r_rem; |
| .reg .b32 %r_src, %r_dst, %r_shift; |
| .reg .f64 %fd_val; |
| .reg .b64 %rd_in, %rd_out, %rd_off, %rd_src, %rd_dst; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_gid, %r_ntid, %r_cta, %r_tid; |
| |
| // q = gid / max(param_4, param_6), r = gid % max(param_4, param_6) (signed) |
| ld.param.u32 %r_p4, [rbind_d_param_4]; |
| ld.param.u32 %r_p6, [rbind_d_param_6]; |
| max.s32 %r_div, %r_p4, %r_p6; |
| div.s32 %r_quot, %r_gid, %r_div; |
| rem.s32 %r_rem, %r_gid, %r_div; |
| |
| ld.param.u64 %rd_out, [rbind_d_param_2]; |
| cvta.to.global.u64 %rd_out, %rd_out; |
| ld.param.u32 %r_p3, [rbind_d_param_3]; |
| |
| // First input: copy only when q < param_3 and r < param_4. |
| setp.lt.s32 %p_q, %r_quot, %r_p3; |
| setp.lt.s32 %p_r, %r_rem, %r_p4; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra RBIND_D_SECOND; |
| |
| // Source and destination share the byte offset (q * param_4 + r) * 8. |
| ld.param.u64 %rd_in, [rbind_d_param_0]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p4, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 8; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f64 %fd_val, [%rd_src]; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f64 [%rd_dst], %fd_val; |
| |
| RBIND_D_SECOND: |
| // Second input: copy only when q < param_5 and r < param_6. |
| ld.param.u32 %r_p5, [rbind_d_param_5]; |
| setp.lt.s32 %p_q, %r_quot, %r_p5; |
| setp.lt.s32 %p_r, %r_rem, %r_p6; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra RBIND_D_DONE; |
| |
| // out[(q + param_3) * param_4 + r] = in1[q * param_6 + r] |
| ld.param.u64 %rd_in, [rbind_d_param_1]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p6, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 8; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f64 %fd_val, [%rd_src]; |
| add.s32 %r_shift, %r_quot, %r_p3; |
| mad.lo.s32 %r_dst, %r_shift, %r_p4, %r_rem; |
| mul.wide.s32 %rd_off, %r_dst, 8; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f64 [%rd_dst], %fd_val; |
| |
| RBIND_D_DONE: |
| ret; |
| } |
| |
| // .globl rbind_f |
| // rbind_f: copies f32 elements from two inputs (param_0, param_1) into one |
| // output (param_2). The flat thread index is split by max(param_4, param_6) |
| // into a quotient q and remainder r. The first input is copied to the same |
| // flat offset q * param_4 + r; the second is written at |
| // (q + param_3) * param_4 + r. |
| // NOTE(review): presumably param_3/param_4 are rows/cols of the first input |
| // and param_5/param_6 those of the second -- confirm against the host launcher. |
| .visible .entry rbind_f( |
| .param .u64 rbind_f_param_0, |
| .param .u64 rbind_f_param_1, |
| .param .u64 rbind_f_param_2, |
| .param .u32 rbind_f_param_3, |
| .param .u32 rbind_f_param_4, |
| .param .u32 rbind_f_param_5, |
| .param .u32 rbind_f_param_6 |
| ) |
| { |
| .reg .pred %p_q, %p_r, %p_copy; |
| .reg .f32 %f_val; |
| .reg .b32 %r_tid, %r_ntid, %r_cta, %r_gid; |
| .reg .b32 %r_p3, %r_p4, %r_p5, %r_p6; |
| .reg .b32 %r_div, %r_quot, %r_rem; |
| .reg .b32 %r_src, %r_dst, %r_shift; |
| .reg .b64 %rd_in, %rd_out, %rd_off, %rd_src, %rd_dst; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_gid, %r_ntid, %r_cta, %r_tid; |
| |
| // q = gid / max(param_4, param_6), r = gid % max(param_4, param_6) (signed) |
| ld.param.u32 %r_p4, [rbind_f_param_4]; |
| ld.param.u32 %r_p6, [rbind_f_param_6]; |
| max.s32 %r_div, %r_p4, %r_p6; |
| div.s32 %r_quot, %r_gid, %r_div; |
| rem.s32 %r_rem, %r_gid, %r_div; |
| |
| ld.param.u64 %rd_out, [rbind_f_param_2]; |
| cvta.to.global.u64 %rd_out, %rd_out; |
| ld.param.u32 %r_p3, [rbind_f_param_3]; |
| |
| // First input: copy only when q < param_3 and r < param_4. |
| setp.lt.s32 %p_q, %r_quot, %r_p3; |
| setp.lt.s32 %p_r, %r_rem, %r_p4; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra RBIND_F_SECOND; |
| |
| // Source and destination share the byte offset (q * param_4 + r) * 4. |
| ld.param.u64 %rd_in, [rbind_f_param_0]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p4, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 4; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f32 %f_val, [%rd_src]; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f32 [%rd_dst], %f_val; |
| |
| RBIND_F_SECOND: |
| // Second input: copy only when q < param_5 and r < param_6. |
| ld.param.u32 %r_p5, [rbind_f_param_5]; |
| setp.lt.s32 %p_q, %r_quot, %r_p5; |
| setp.lt.s32 %p_r, %r_rem, %r_p6; |
| and.pred %p_copy, %p_q, %p_r; |
| @!%p_copy bra RBIND_F_DONE; |
| |
| // out[(q + param_3) * param_4 + r] = in1[q * param_6 + r] |
| ld.param.u64 %rd_in, [rbind_f_param_1]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mad.lo.s32 %r_src, %r_quot, %r_p6, %r_rem; |
| mul.wide.s32 %rd_off, %r_src, 4; |
| add.s64 %rd_src, %rd_in, %rd_off; |
| ld.global.f32 %f_val, [%rd_src]; |
| add.s32 %r_shift, %r_quot, %r_p3; |
| mad.lo.s32 %r_dst, %r_shift, %r_p4, %r_rem; |
| mul.wide.s32 %rd_off, %r_dst, 4; |
| add.s64 %rd_dst, %rd_out, %rd_off; |
| st.global.f32 [%rd_dst], %f_val; |
| |
| RBIND_F_DONE: |
| ret; |
| } |
| |
| // .globl reduce_sum_d |
| // reduce_sum_d: sums f64 input values; each block stores one partial sum. |
| // param_0: address of the f64 input (converted to a global address before loads) |
| // param_1: address of the f64 output; thread 0 stores to element ctaid.x |
| // param_2: u32 bound on the input index (all comparisons against it are unsigned) |
| // Uses the extern shared array `memory` as scratch, 8 bytes per thread. |
| .visible .entry reduce_sum_d( |
| .param .u64 reduce_sum_d_param_0, |
| .param .u64 reduce_sum_d_param_1, |
| .param .u32 reduce_sum_d_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<36>; |
| .reg .f64 %fd<60>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_sum_d_param_0]; |
| ld.param.u64 %rd2, [reduce_sum_d_param_1]; |
| ld.param.u32 %r6, [reduce_sum_d_param_2]; |
| // %r7 = tid.x, %r8 = ctaid.x, %r10 = ntid.x |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| // Starting index %r35 = 2 * ctaid.x * ntid.x + tid.x |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| // %fd44 is the per-thread accumulator, initialised to 0.0 |
| mov.f64 %fd44, 0d0000000000000000; |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB56_4; |
| |
| // Accumulation loop: each pass adds in[i] and, if in range, in[i + ntid.x]. |
| BB56_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| add.f64 %fd44, %fd44, %fd30; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB56_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| add.f64 %fd44, %fd44, %fd31; |
| |
| // Advance i by 2 * ntid.x * nctaid.x and repeat while i < param_2. |
| BB56_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB56_1; |
| |
| // Publish the per-thread sum to shared slot tid.x (%r5 = memory + tid.x * 8). |
| BB56_4: |
| shl.b32 %r16, %r7, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd44; |
| bar.sync 0; |
| // Tree reduction in shared memory. Each stage runs only when ntid.x is at |
| // least the stated size and ends with a block-wide barrier. |
| // Stage ntid.x >= 1024: threads 0..511 add the slot 512 elements (4096 bytes) above. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB56_8; |
| |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB56_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| add.f64 %fd44, %fd44, %fd32; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB56_7: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 512: threads 0..255 add the slot 256 elements above. |
| BB56_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB56_12; |
| |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB56_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| add.f64 %fd44, %fd44, %fd33; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB56_11: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 256: threads 0..127 add the slot 128 elements above. |
| BB56_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB56_16; |
| |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB56_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| add.f64 %fd44, %fd44, %fd34; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB56_15: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 128: threads 0..63 add the slot 64 elements above. |
| BB56_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB56_20; |
| |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB56_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| add.f64 %fd44, %fd44, %fd35; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB56_19: |
| bar.sync 0; |
| |
| // Final steps: only threads 0..31 continue, using volatile shared accesses |
| // at offsets of 32, 16, 8, 4, 2 and 1 elements. |
| // NOTE(review): no barrier separates these steps, so correctness appears to |
| // rely on these 32 threads executing in lockstep -- confirm this holds on |
| // every architecture this PTX is JIT-compiled for. |
| BB56_20: |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB56_33; |
| |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB56_23; |
| |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| add.f64 %fd44, %fd44, %fd36; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB56_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB56_25; |
| |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| add.f64 %fd44, %fd44, %fd37; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB56_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB56_27; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| add.f64 %fd44, %fd44, %fd38; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB56_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB56_29; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| add.f64 %fd44, %fd44, %fd39; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB56_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB56_31; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| add.f64 %fd44, %fd44, %fd40; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB56_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB56_33; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| add.f64 %fd42, %fd44, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| // Thread 0 writes shared slot 0 to out[ctaid.x]. |
| BB56_33: |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB56_35; |
| |
| ld.shared.f64 %fd43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 8; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f64 [%rd11], %fd43; |
| |
| BB56_35: |
| ret; |
| } |
| |
| // .globl reduce_sum_f |
| // reduce_sum_f: sums f32 input values; each block stores one partial sum. |
| // param_0: address of the f32 input (converted to a global address before loads) |
| // param_1: address of the f32 output; thread 0 stores to element ctaid.x |
| // param_2: u32 bound on the input index (all comparisons against it are unsigned) |
| // Uses the extern shared array `memory` as scratch, 4 bytes per thread. |
| .visible .entry reduce_sum_f( |
| .param .u64 reduce_sum_f_param_0, |
| .param .u64 reduce_sum_f_param_1, |
| .param .u32 reduce_sum_f_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<60>; |
| .reg .b32 %r<36>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_sum_f_param_0]; |
| ld.param.u64 %rd2, [reduce_sum_f_param_1]; |
| ld.param.u32 %r6, [reduce_sum_f_param_2]; |
| // %r7 = tid.x, %r8 = ctaid.x, %r10 = ntid.x |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| // Starting index %r35 = 2 * ctaid.x * ntid.x + tid.x |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| // %f44 is the per-thread accumulator, initialised to 0.0 |
| mov.f32 %f44, 0f00000000; |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB57_4; |
| |
| // Accumulation loop: each pass adds in[i] and, if in range, in[i + ntid.x]. |
| BB57_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| add.f32 %f44, %f44, %f30; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB57_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| add.f32 %f44, %f44, %f31; |
| |
| // Advance i by 2 * ntid.x * nctaid.x and repeat while i < param_2. |
| BB57_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB57_1; |
| |
| // Publish the per-thread sum to shared slot tid.x (%r5 = memory + tid.x * 4). |
| BB57_4: |
| shl.b32 %r16, %r7, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f44; |
| bar.sync 0; |
| // Tree reduction in shared memory. Each stage runs only when ntid.x is at |
| // least the stated size and ends with a block-wide barrier. |
| // Stage ntid.x >= 1024: threads 0..511 add the slot 512 elements (2048 bytes) above. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB57_8; |
| |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB57_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| add.f32 %f44, %f44, %f32; |
| st.shared.f32 [%r5], %f44; |
| |
| BB57_7: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 512: threads 0..255 add the slot 256 elements above. |
| BB57_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB57_12; |
| |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB57_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| add.f32 %f44, %f44, %f33; |
| st.shared.f32 [%r5], %f44; |
| |
| BB57_11: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 256: threads 0..127 add the slot 128 elements above. |
| BB57_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB57_16; |
| |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB57_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| add.f32 %f44, %f44, %f34; |
| st.shared.f32 [%r5], %f44; |
| |
| BB57_15: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 128: threads 0..63 add the slot 64 elements above. |
| BB57_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB57_20; |
| |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB57_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| add.f32 %f44, %f44, %f35; |
| st.shared.f32 [%r5], %f44; |
| |
| BB57_19: |
| bar.sync 0; |
| |
| // Final steps: only threads 0..31 continue, using volatile shared accesses |
| // at offsets of 32, 16, 8, 4, 2 and 1 elements. |
| // NOTE(review): no barrier separates these steps, so correctness appears to |
| // rely on these 32 threads executing in lockstep -- confirm this holds on |
| // every architecture this PTX is JIT-compiled for. |
| BB57_20: |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB57_33; |
| |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB57_23; |
| |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| add.f32 %f44, %f44, %f36; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB57_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB57_25; |
| |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| add.f32 %f44, %f44, %f37; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB57_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB57_27; |
| |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| add.f32 %f44, %f44, %f38; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB57_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB57_29; |
| |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| add.f32 %f44, %f44, %f39; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB57_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB57_31; |
| |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| add.f32 %f44, %f44, %f40; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB57_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB57_33; |
| |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| add.f32 %f42, %f44, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| // Thread 0 writes shared slot 0 to out[ctaid.x]. |
| BB57_33: |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB57_35; |
| |
| ld.shared.f32 %f43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f32 [%rd11], %f43; |
| |
| BB57_35: |
| ret; |
| } |
| |
| // .globl reduce_row_sum_d |
| // reduce_row_sum_d: block ctaid.x sums the param_3 consecutive f64 values |
| // starting at input index ctaid.x * param_3 and stores the sum to out[ctaid.x]. |
| // param_0: address of the f64 input (converted to a global address before loads) |
| // param_1: address of the f64 output |
| // param_2: u32 bound on ctaid.x; blocks at or beyond it return immediately |
| // param_3: u32 number of elements summed per block |
| // NOTE(review): presumably param_2 = rows and param_3 = cols of a row-major |
| // matrix -- confirm against the host launcher. |
| // Uses the extern shared array `memory` as scratch, 8 bytes per thread. |
| .visible .entry reduce_row_sum_d( |
| .param .u64 reduce_row_sum_d_param_0, |
| .param .u64 reduce_row_sum_d_param_1, |
| .param .u32 reduce_row_sum_d_param_2, |
| .param .u32 reduce_row_sum_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_sum_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_sum_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_sum_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_sum_d_param_3]; |
| // Whole block exits (before any barrier) when ctaid.x >= param_2. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB58_35; |
| |
| // %r71 walks tid.x, tid.x + ntid.x, ... ; %fd6 is the per-thread accumulator. |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0d0000000000000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB58_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Accumulate in[ctaid.x * param_3 + %r71] while %r71 < param_3. |
| BB58_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| add.f64 %fd6, %fd6, %fd28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB58_3; |
| |
| // Publish the per-thread sum to shared slot tid.x (%r13 = memory + tid.x * 8). |
| BB58_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // Tree reduction in shared memory (%r14 = ntid.x). Each stage runs only when |
| // ntid.x is at least the stated size and ends with a block-wide barrier. |
| // Stage ntid.x >= 1024: threads 0..511 add the slot 512 elements (4096 bytes) above. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB58_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB58_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| add.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB58_7: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 512: threads 0..255 add the slot 256 elements above. |
| BB58_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB58_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB58_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| add.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB58_11: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 256: threads 0..127 add the slot 128 elements above. |
| BB58_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB58_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB58_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| add.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB58_15: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 128: threads 0..63 add the slot 64 elements above. |
| BB58_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB58_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB58_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| add.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB58_19: |
| bar.sync 0; |
| |
| // Final steps: only threads 0..31 continue, using volatile shared accesses |
| // at offsets of 32, 16, 8, 4, 2 and 1 elements. |
| // NOTE(review): no barrier separates these steps, so correctness appears to |
| // rely on these 32 threads executing in lockstep -- confirm this holds on |
| // every architecture this PTX is JIT-compiled for. |
| BB58_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB58_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB58_23; |
| |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| add.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB58_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB58_25; |
| |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| add.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB58_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB58_27; |
| |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| add.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB58_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB58_29; |
| |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| add.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB58_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB58_31; |
| |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| add.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB58_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB58_33; |
| |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| add.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| // Thread 0 writes shared slot 0 to out[ctaid.x]. |
| BB58_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB58_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB58_35: |
| ret; |
| } |
| |
| // .globl reduce_row_sum_f |
| // reduce_row_sum_f: block ctaid.x sums the param_3 consecutive f32 values |
| // starting at input index ctaid.x * param_3 and stores the sum to out[ctaid.x]. |
| // param_0: address of the f32 input (converted to a global address before loads) |
| // param_1: address of the f32 output |
| // param_2: u32 bound on ctaid.x; blocks at or beyond it return immediately |
| // param_3: u32 number of elements summed per block |
| // NOTE(review): presumably param_2 = rows and param_3 = cols of a row-major |
| // matrix -- confirm against the host launcher. |
| // Uses the extern shared array `memory` as scratch, 4 bytes per thread. |
| .visible .entry reduce_row_sum_f( |
| .param .u64 reduce_row_sum_f_param_0, |
| .param .u64 reduce_row_sum_f_param_1, |
| .param .u32 reduce_row_sum_f_param_2, |
| .param .u32 reduce_row_sum_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<56>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_sum_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_sum_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_sum_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_sum_f_param_3]; |
| // Whole block exits (before any barrier) when ctaid.x >= param_2. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB59_35; |
| |
| // %r71 walks tid.x, tid.x + ntid.x, ... ; %f6 is the per-thread accumulator. |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0f00000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB59_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Accumulate in[ctaid.x * param_3 + %r71] while %r71 < param_3. |
| BB59_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| add.f32 %f6, %f6, %f28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB59_3; |
| |
| // Publish the per-thread sum to shared slot tid.x (%r13 = memory + tid.x * 4). |
| BB59_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // Tree reduction in shared memory (%r14 = ntid.x). Each stage runs only when |
| // ntid.x is at least the stated size and ends with a block-wide barrier. |
| // Stage ntid.x >= 1024: threads 0..511 add the slot 512 elements (2048 bytes) above. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB59_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB59_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| add.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB59_7: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 512: threads 0..255 add the slot 256 elements above. |
| BB59_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB59_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB59_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| add.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB59_11: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 256: threads 0..127 add the slot 128 elements above. |
| BB59_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB59_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB59_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| add.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB59_15: |
| bar.sync 0; |
| |
| // Stage ntid.x >= 128: threads 0..63 add the slot 64 elements above. |
| BB59_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB59_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB59_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| add.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB59_19: |
| bar.sync 0; |
| |
| // Final steps: only threads 0..31 continue, using volatile shared accesses |
| // at offsets of 32, 16, 8, 4, 2 and 1 elements. |
| // NOTE(review): no barrier separates these steps, so correctness appears to |
| // rely on these 32 threads executing in lockstep -- confirm this holds on |
| // every architecture this PTX is JIT-compiled for. |
| BB59_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB59_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB59_23; |
| |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| add.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB59_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB59_25; |
| |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| add.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB59_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB59_27; |
| |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| add.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB59_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB59_29; |
| |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| add.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB59_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB59_31; |
| |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| add.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB59_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB59_33; |
| |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| add.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| // Thread 0 writes shared slot 0 to out[ctaid.x]. |
| BB59_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB59_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f40; |
| |
| BB59_35: |
| ret; |
| } |
| |
| // .globl reduce_col_sum_d |
| // reduce_col_sum_d: thread c = ntid.x * ctaid.x + tid.x (for c < param_3) |
| // sums the f64 input elements at c, c + param_3, c + 2 * param_3, ... |
| // below param_3 * param_2, and stores the total to out[c]. |
| // param_0: address of the f64 input; param_1: address of the f64 output. |
| // NOTE(review): presumably param_2 = rows and param_3 = cols of a row-major |
| // matrix -- confirm against the host launcher. |
| .visible .entry reduce_col_sum_d( |
| .param .u64 reduce_col_sum_d_param_0, |
| .param .u64 reduce_col_sum_d_param_1, |
| .param .u32 reduce_col_sum_d_param_2, |
| .param .u32 reduce_col_sum_d_param_3 |
| ) |
| { |
| .reg .pred %p_exit, %p_done; |
| .reg .b32 %r_tid, %r_ntid, %r_cta, %r_col; |
| .reg .b32 %r_p2, %r_p3, %r_total, %r_pos; |
| .reg .f64 %fd_acc, %fd_elem; |
| .reg .b64 %rd_in, %rd_out, %rd_off, %rd_addr; |
| |
| // c = ntid.x * ctaid.x + tid.x; threads with c >= param_3 do nothing. |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_col, %r_ntid, %r_cta, %r_tid; |
| ld.param.u32 %r_p3, [reduce_col_sum_d_param_3]; |
| setp.ge.u32 %p_exit, %r_col, %r_p3; |
| @%p_exit bra COL_SUM_D_EXIT; |
| |
| // total = param_3 * param_2 (32-bit product) bounds the walk. |
| ld.param.u32 %r_p2, [reduce_col_sum_d_param_2]; |
| mul.lo.s32 %r_total, %r_p3, %r_p2; |
| ld.param.u64 %rd_in, [reduce_col_sum_d_param_0]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mov.f64 %fd_acc, 0d0000000000000000; |
| mov.u32 %r_pos, %r_col; |
| |
| // Top-tested loop: while (pos < total) { acc += in[pos]; pos += param_3; } |
| COL_SUM_D_TEST: |
| setp.ge.u32 %p_done, %r_pos, %r_total; |
| @%p_done bra COL_SUM_D_STORE; |
| |
| mul.wide.u32 %rd_off, %r_pos, 8; |
| add.s64 %rd_addr, %rd_in, %rd_off; |
| ld.global.f64 %fd_elem, [%rd_addr]; |
| add.f64 %fd_acc, %fd_acc, %fd_elem; |
| add.s32 %r_pos, %r_pos, %r_p3; |
| bra COL_SUM_D_TEST; |
| |
| // out[c] = acc |
| COL_SUM_D_STORE: |
| ld.param.u64 %rd_out, [reduce_col_sum_d_param_1]; |
| cvta.to.global.u64 %rd_out, %rd_out; |
| mul.wide.u32 %rd_off, %r_col, 8; |
| add.s64 %rd_addr, %rd_out, %rd_off; |
| st.global.f64 [%rd_addr], %fd_acc; |
| |
| COL_SUM_D_EXIT: |
| ret; |
| } |
| |
| // .globl reduce_col_sum_f |
| // reduce_col_sum_f: thread c = ntid.x * ctaid.x + tid.x (for c < param_3) |
| // sums the f32 input elements at c, c + param_3, c + 2 * param_3, ... |
| // below param_3 * param_2, and stores the total to out[c]. |
| // param_0: address of the f32 input; param_1: address of the f32 output. |
| // NOTE(review): presumably param_2 = rows and param_3 = cols of a row-major |
| // matrix -- confirm against the host launcher. |
| .visible .entry reduce_col_sum_f( |
| .param .u64 reduce_col_sum_f_param_0, |
| .param .u64 reduce_col_sum_f_param_1, |
| .param .u32 reduce_col_sum_f_param_2, |
| .param .u32 reduce_col_sum_f_param_3 |
| ) |
| { |
| .reg .pred %p_exit, %p_done; |
| .reg .f32 %f_acc, %f_elem; |
| .reg .b32 %r_tid, %r_ntid, %r_cta, %r_col; |
| .reg .b32 %r_p2, %r_p3, %r_total, %r_pos; |
| .reg .b64 %rd_in, %rd_out, %rd_off, %rd_addr; |
| |
| // c = ntid.x * ctaid.x + tid.x; threads with c >= param_3 do nothing. |
| mov.u32 %r_ntid, %ntid.x; |
| mov.u32 %r_cta, %ctaid.x; |
| mov.u32 %r_tid, %tid.x; |
| mad.lo.s32 %r_col, %r_ntid, %r_cta, %r_tid; |
| ld.param.u32 %r_p3, [reduce_col_sum_f_param_3]; |
| setp.ge.u32 %p_exit, %r_col, %r_p3; |
| @%p_exit bra COL_SUM_F_EXIT; |
| |
| // total = param_3 * param_2 (32-bit product) bounds the walk. |
| ld.param.u32 %r_p2, [reduce_col_sum_f_param_2]; |
| mul.lo.s32 %r_total, %r_p3, %r_p2; |
| ld.param.u64 %rd_in, [reduce_col_sum_f_param_0]; |
| cvta.to.global.u64 %rd_in, %rd_in; |
| mov.f32 %f_acc, 0f00000000; |
| mov.u32 %r_pos, %r_col; |
| |
| // Top-tested loop: while (pos < total) { acc += in[pos]; pos += param_3; } |
| COL_SUM_F_TEST: |
| setp.ge.u32 %p_done, %r_pos, %r_total; |
| @%p_done bra COL_SUM_F_STORE; |
| |
| mul.wide.u32 %rd_off, %r_pos, 4; |
| add.s64 %rd_addr, %rd_in, %rd_off; |
| ld.global.f32 %f_elem, [%rd_addr]; |
| add.f32 %f_acc, %f_acc, %f_elem; |
| add.s32 %r_pos, %r_pos, %r_p3; |
| bra COL_SUM_F_TEST; |
| |
| // out[c] = acc |
| COL_SUM_F_STORE: |
| ld.param.u64 %rd_out, [reduce_col_sum_f_param_1]; |
| cvta.to.global.u64 %rd_out, %rd_out; |
| mul.wide.u32 %rd_off, %r_col, 4; |
| add.s64 %rd_addr, %rd_out, %rd_off; |
| st.global.f32 [%rd_addr], %f_acc; |
| |
| COL_SUM_F_EXIT: |
| ret; |
| } |
| |
| // .globl reduce_max_d |
| // reduce_max_d: block-wide maximum of f64 values; one result is stored per thread block. |
| // param_0 (u64): input pointer, read as f64 through the global address space. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): exclusive upper bound on the input indices that are read. |
| // Scratch: extern shared array "memory", one 8-byte slot per thread at byte offset tid.x * 8. |
| // The accumulator starts at 0dFFEFFFFFFFFFFFFF, the most negative finite f64 (not -infinity). |
| .visible .entry reduce_max_d( |
| .param .u64 reduce_max_d_param_0, |
| .param .u64 reduce_max_d_param_1, |
| .param .u32 reduce_max_d_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<36>; |
| .reg .f64 %fd<60>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_max_d_param_0]; |
| ld.param.u64 %rd2, [reduce_max_d_param_1]; |
| ld.param.u32 %r6, [reduce_max_d_param_2]; |
| // %r35 = 2 * ctaid.x * ntid.x + tid.x: first input index for this thread. |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| mov.f64 %fd44, 0dFFEFFFFFFFFFFFFF; |
| // Threads that start at or beyond param_2 skip the load loop and contribute the initial value. |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB62_4; |
| |
| // Load loop: fold input[%r35] and, if %r35 + ntid.x is below param_2, that element too. |
| BB62_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| max.f64 %fd44, %fd44, %fd30; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB62_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| max.f64 %fd44, %fd44, %fd31; |
| |
| // Step %r35 by 2 * ntid.x * nctaid.x and loop while it stays below param_2. |
| BB62_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB62_1; |
| |
| // Write this thread's partial result to its shared slot (address %r5) and synchronize the block. |
| BB62_4: |
| shl.b32 %r16, %r7, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd44; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB62_8; |
| |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB62_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| max.f64 %fd44, %fd44, %fd32; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB62_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB62_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB62_12; |
| |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB62_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| max.f64 %fd44, %fd44, %fd33; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB62_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB62_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB62_16; |
| |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB62_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| max.f64 %fd44, %fd44, %fd34; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB62_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB62_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB62_20; |
| |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB62_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| max.f64 %fd44, %fd44, %fd35; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB62_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB62_20: |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB62_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB62_23; |
| |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| max.f64 %fd44, %fd44, %fd36; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB62_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB62_25; |
| |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| max.f64 %fd44, %fd44, %fd37; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB62_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB62_27; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| max.f64 %fd44, %fd44, %fd38; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB62_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB62_29; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| max.f64 %fd44, %fd44, %fd39; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB62_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB62_31; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| max.f64 %fd44, %fd44, %fd40; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB62_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB62_33; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| max.f64 %fd42, %fd44, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB62_33: |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB62_35; |
| |
| ld.shared.f64 %fd43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 8; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f64 [%rd11], %fd43; |
| |
| BB62_35: |
| ret; |
| } |
| |
| // .globl reduce_max_f |
| // reduce_max_f: block-wide maximum of f32 values; one result is stored per thread block. |
| // param_0 (u64): input pointer, read as f32 through the global address space. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): exclusive upper bound on the input indices that are read. |
| // Scratch: extern shared array "memory", one 4-byte slot per thread at byte offset tid.x * 4. |
| // The accumulator starts at 0fFF7FFFFF, the most negative finite f32 (not -infinity). |
| .visible .entry reduce_max_f( |
| .param .u64 reduce_max_f_param_0, |
| .param .u64 reduce_max_f_param_1, |
| .param .u32 reduce_max_f_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<60>; |
| .reg .b32 %r<36>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_max_f_param_0]; |
| ld.param.u64 %rd2, [reduce_max_f_param_1]; |
| ld.param.u32 %r6, [reduce_max_f_param_2]; |
| // %r35 = 2 * ctaid.x * ntid.x + tid.x: first input index for this thread. |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| mov.f32 %f44, 0fFF7FFFFF; |
| // Threads that start at or beyond param_2 skip the load loop and contribute the initial value. |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB63_4; |
| |
| // Load loop: fold input[%r35] and, if %r35 + ntid.x is below param_2, that element too. |
| BB63_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| max.f32 %f44, %f44, %f30; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB63_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| max.f32 %f44, %f44, %f31; |
| |
| // Step %r35 by 2 * ntid.x * nctaid.x and loop while it stays below param_2. |
| BB63_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB63_1; |
| |
| // Write this thread's partial result to its shared slot (address %r5) and synchronize the block. |
| BB63_4: |
| shl.b32 %r16, %r7, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f44; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB63_8; |
| |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB63_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| max.f32 %f44, %f44, %f32; |
| st.shared.f32 [%r5], %f44; |
| |
| BB63_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB63_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB63_12; |
| |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB63_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| max.f32 %f44, %f44, %f33; |
| st.shared.f32 [%r5], %f44; |
| |
| BB63_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB63_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB63_16; |
| |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB63_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| max.f32 %f44, %f44, %f34; |
| st.shared.f32 [%r5], %f44; |
| |
| BB63_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB63_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB63_20; |
| |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB63_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| max.f32 %f44, %f44, %f35; |
| st.shared.f32 [%r5], %f44; |
| |
| BB63_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB63_20: |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB63_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB63_23; |
| |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| max.f32 %f44, %f44, %f36; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB63_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB63_25; |
| |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| max.f32 %f44, %f44, %f37; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB63_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB63_27; |
| |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| max.f32 %f44, %f44, %f38; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB63_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB63_29; |
| |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| max.f32 %f44, %f44, %f39; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB63_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB63_31; |
| |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| max.f32 %f44, %f44, %f40; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB63_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB63_33; |
| |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| max.f32 %f42, %f44, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB63_33: |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB63_35; |
| |
| ld.shared.f32 %f43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f32 [%rd11], %f43; |
| |
| BB63_35: |
| ret; |
| } |
| |
| // .globl reduce_row_max_d |
| // reduce_row_max_d: block ctaid.x computes the f64 maximum over one run of param_3 consecutive elements. |
| // param_0 (u64): input pointer; the element index read is ctaid.x * param_3 + offset. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): blocks with ctaid.x >= param_2 return immediately (presumably the row count -- confirm). |
| // param_3 (u32): number of elements scanned per block (presumably the column count -- confirm). |
| // Scratch: extern shared array "memory", one 8-byte slot per thread at byte offset tid.x * 8. |
| // The accumulator starts at 0dFFEFFFFFFFFFFFFF, the most negative finite f64 (not -infinity). |
| // NOTE(review): the element index ctaid.x * param_3 + offset is computed in 32 bits. |
| .visible .entry reduce_row_max_d( |
| .param .u64 reduce_row_max_d_param_0, |
| .param .u64 reduce_row_max_d_param_1, |
| .param .u32 reduce_row_max_d_param_2, |
| .param .u32 reduce_row_max_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_max_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_max_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_max_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_max_d_param_3]; |
| // Blocks whose ctaid.x is >= param_2 have nothing to do and return. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB64_35; |
| |
| // %r71 starts at tid.x and is the offset within the run; %fd6 is the accumulator. |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0dFFEFFFFFFFFFFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB64_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Load loop: fold input[ctaid.x * param_3 + %r71], stepping %r71 by ntid.x while it is below param_3. |
| BB64_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| max.f64 %fd6, %fd6, %fd28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB64_3; |
| |
| // Write this thread's partial result to its shared slot (address %r13) and synchronize the block. |
| BB64_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB64_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB64_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| max.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB64_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB64_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB64_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB64_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| max.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB64_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB64_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB64_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB64_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| max.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB64_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB64_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB64_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB64_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| max.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB64_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB64_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB64_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB64_23; |
| |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| max.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB64_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB64_25; |
| |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| max.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB64_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB64_27; |
| |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| max.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB64_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB64_29; |
| |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| max.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB64_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB64_31; |
| |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| max.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB64_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB64_33; |
| |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| max.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB64_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB64_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB64_35: |
| ret; |
| } |
| |
| // .globl reduce_row_max_f |
| // reduce_row_max_f: block ctaid.x computes the f32 maximum over one run of param_3 consecutive elements. |
| // param_0 (u64): input pointer; the element index read is ctaid.x * param_3 + offset. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): blocks with ctaid.x >= param_2 return immediately (presumably the row count -- confirm). |
| // param_3 (u32): number of elements scanned per block (presumably the column count -- confirm). |
| // Scratch: extern shared array "memory", one 4-byte slot per thread at byte offset tid.x * 4. |
| // The accumulator starts at 0fFF7FFFFF, the most negative finite f32 (not -infinity). |
| // NOTE(review): the element index ctaid.x * param_3 + offset is computed in 32 bits. |
| .visible .entry reduce_row_max_f( |
| .param .u64 reduce_row_max_f_param_0, |
| .param .u64 reduce_row_max_f_param_1, |
| .param .u32 reduce_row_max_f_param_2, |
| .param .u32 reduce_row_max_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<56>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_max_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_max_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_max_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_max_f_param_3]; |
| // Blocks whose ctaid.x is >= param_2 have nothing to do and return. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB65_35; |
| |
| // %r71 starts at tid.x and is the offset within the run; %f6 is the accumulator. |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0fFF7FFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB65_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Load loop: fold input[ctaid.x * param_3 + %r71], stepping %r71 by ntid.x while it is below param_3. |
| BB65_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| max.f32 %f6, %f6, %f28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB65_3; |
| |
| // Write this thread's partial result to its shared slot (address %r13) and synchronize the block. |
| BB65_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB65_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB65_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| max.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB65_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB65_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB65_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB65_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| max.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB65_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB65_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB65_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB65_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| max.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB65_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB65_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB65_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB65_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| max.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB65_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB65_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB65_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB65_23; |
| |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| max.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB65_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB65_25; |
| |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| max.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB65_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB65_27; |
| |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| max.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB65_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB65_29; |
| |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| max.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB65_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB65_31; |
| |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| max.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB65_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB65_33; |
| |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| max.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB65_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB65_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f40; |
| |
| BB65_35: |
| ret; |
| } |
| |
| // .globl reduce_col_max_d |
| // reduce_col_max_d: each thread computes the maximum of a strided set of f64 values and stores one result. |
| // param_0 (u64): input pointer, read as f64 through the global address space. |
| // param_1 (u64): output pointer; the thread with global index g stores element g. |
| // param_2 (u32): multiplied by param_3 to form the exclusive bound on the input index |
| // (presumably the row count of a row-major matrix -- confirm with the caller). |
| // param_3 (u32): number of results and the stride between the elements one thread scans |
| // (presumably the column count -- confirm with the caller). |
| // The accumulator starts at 0dFFEFFFFFFFFFFFFF, the most negative finite f64 (not -infinity). |
| // NOTE(review): all index arithmetic is 32-bit; the product param_3 * param_2 keeps only its low 32 bits. |
| .visible .entry reduce_col_max_d( |
| .param .u64 reduce_col_max_d_param_0, |
| .param .u64 reduce_col_max_d_param_1, |
| .param .u32 reduce_col_max_d_param_2, |
| .param .u32 reduce_col_max_d_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<11>; |
| .reg .f64 %fd<9>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd2, [reduce_col_max_d_param_0]; |
| ld.param.u64 %rd3, [reduce_col_max_d_param_1]; |
| ld.param.u32 %r5, [reduce_col_max_d_param_2]; |
| ld.param.u32 %r6, [reduce_col_max_d_param_3]; |
| // %r1 = ntid.x * ctaid.x + tid.x: this thread's global index. |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %tid.x; |
| mad.lo.s32 %r1, %r7, %r8, %r9; |
| // Threads whose index is >= param_3 have no output element and return. |
| setp.ge.u32 %p1, %r1, %r6; |
| @%p1 bra BB66_5; |
| |
| // %r2 = param_3 * param_2: exclusive bound on the input index. %fd8 is the accumulator. |
| mul.lo.s32 %r2, %r6, %r5; |
| cvta.to.global.u64 %rd1, %rd2; |
| mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF; |
| // If the start index is already at or past the bound, the loop is skipped and the initial value is stored. |
| setp.ge.u32 %p2, %r1, %r2; |
| @%p2 bra BB66_4; |
| |
| mov.u32 %r10, %r1; |
| |
| // Loop: fold input[%r10] (8-byte elements) into %fd8, then step %r10 by param_3 while it is below %r2. |
| BB66_3: |
| mul.wide.u32 %rd4, %r10, 8; |
| add.s64 %rd5, %rd1, %rd4; |
| ld.global.f64 %fd6, [%rd5]; |
| max.f64 %fd8, %fd8, %fd6; |
| add.s32 %r10, %r10, %r6; |
| setp.lt.u32 %p3, %r10, %r2; |
| @%p3 bra BB66_3; |
| |
| // Store the result at output[%r1]. |
| BB66_4: |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r1, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd8; |
| |
| BB66_5: |
| ret; |
| } |
| |
| // .globl reduce_col_max_f |
| // reduce_col_max_f: each thread computes the maximum of a strided set of f32 values and stores one result. |
| // param_0 (u64): input pointer, read as f32 through the global address space. |
| // param_1 (u64): output pointer; the thread with global index g stores element g. |
| // param_2 (u32): multiplied by param_3 to form the exclusive bound on the input index |
| // (presumably the row count of a row-major matrix -- confirm with the caller). |
| // param_3 (u32): number of results and the stride between the elements one thread scans |
| // (presumably the column count -- confirm with the caller). |
| // The accumulator starts at 0fFF7FFFFF, the most negative finite f32 (not -infinity). |
| // NOTE(review): all index arithmetic is 32-bit; the product param_3 * param_2 keeps only its low 32 bits. |
| .visible .entry reduce_col_max_f( |
| .param .u64 reduce_col_max_f_param_0, |
| .param .u64 reduce_col_max_f_param_1, |
| .param .u32 reduce_col_max_f_param_2, |
| .param .u32 reduce_col_max_f_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<9>; |
| .reg .b32 %r<11>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd2, [reduce_col_max_f_param_0]; |
| ld.param.u64 %rd3, [reduce_col_max_f_param_1]; |
| ld.param.u32 %r5, [reduce_col_max_f_param_2]; |
| ld.param.u32 %r6, [reduce_col_max_f_param_3]; |
| // %r1 = ntid.x * ctaid.x + tid.x: this thread's global index. |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %tid.x; |
| mad.lo.s32 %r1, %r7, %r8, %r9; |
| // Threads whose index is >= param_3 have no output element and return. |
| setp.ge.u32 %p1, %r1, %r6; |
| @%p1 bra BB67_5; |
| |
| // %r2 = param_3 * param_2: exclusive bound on the input index. %f8 is the accumulator. |
| mul.lo.s32 %r2, %r6, %r5; |
| cvta.to.global.u64 %rd1, %rd2; |
| mov.f32 %f8, 0fFF7FFFFF; |
| // If the start index is already at or past the bound, the loop is skipped and the initial value is stored. |
| setp.ge.u32 %p2, %r1, %r2; |
| @%p2 bra BB67_4; |
| |
| mov.u32 %r10, %r1; |
| |
| // Loop: fold input[%r10] (4-byte elements) into %f8, then step %r10 by param_3 while it is below %r2. |
| BB67_3: |
| mul.wide.u32 %rd4, %r10, 4; |
| add.s64 %rd5, %rd1, %rd4; |
| ld.global.f32 %f6, [%rd5]; |
| max.f32 %f8, %f8, %f6; |
| add.s32 %r10, %r10, %r6; |
| setp.lt.u32 %p3, %r10, %r2; |
| @%p3 bra BB67_3; |
| |
| // Store the result at output[%r1]. |
| BB67_4: |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r1, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f8; |
| |
| BB67_5: |
| ret; |
| } |
| |
| // .globl reduce_min_d |
| // reduce_min_d: block-wide minimum of f64 values; one result is stored per thread block. |
| // param_0 (u64): input pointer, read as f64 through the global address space. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): exclusive upper bound on the input indices that are read. |
| // Scratch: extern shared array "memory", one 8-byte slot per thread at byte offset tid.x * 8. |
| // The accumulator starts at 0d7FEFFFFFFFFFFFFF, the largest finite f64 (not +infinity). |
| .visible .entry reduce_min_d( |
| .param .u64 reduce_min_d_param_0, |
| .param .u64 reduce_min_d_param_1, |
| .param .u32 reduce_min_d_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<36>; |
| .reg .f64 %fd<60>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_min_d_param_0]; |
| ld.param.u64 %rd2, [reduce_min_d_param_1]; |
| ld.param.u32 %r6, [reduce_min_d_param_2]; |
| // %r35 = 2 * ctaid.x * ntid.x + tid.x: first input index for this thread. |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| mov.f64 %fd44, 0d7FEFFFFFFFFFFFFF; |
| // Threads that start at or beyond param_2 skip the load loop and contribute the initial value. |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB68_4; |
| |
| // Load loop: fold input[%r35] and, if %r35 + ntid.x is below param_2, that element too. |
| BB68_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| min.f64 %fd44, %fd44, %fd30; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB68_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| min.f64 %fd44, %fd44, %fd31; |
| |
| // Step %r35 by 2 * ntid.x * nctaid.x and loop while it stays below param_2. |
| BB68_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB68_1; |
| |
| // Write this thread's partial result to its shared slot (address %r5) and synchronize the block. |
| BB68_4: |
| shl.b32 %r16, %r7, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd44; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB68_8; |
| |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB68_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| min.f64 %fd44, %fd44, %fd32; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB68_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB68_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB68_12; |
| |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB68_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| min.f64 %fd44, %fd44, %fd33; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB68_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB68_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB68_16; |
| |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB68_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| min.f64 %fd44, %fd44, %fd34; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB68_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB68_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB68_20; |
| |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB68_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| min.f64 %fd44, %fd44, %fd35; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB68_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB68_20: |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB68_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB68_23; |
| |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| min.f64 %fd44, %fd44, %fd36; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB68_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB68_25; |
| |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| min.f64 %fd44, %fd44, %fd37; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB68_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB68_27; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| min.f64 %fd44, %fd44, %fd38; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB68_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB68_29; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| min.f64 %fd44, %fd44, %fd39; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB68_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB68_31; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| min.f64 %fd44, %fd44, %fd40; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB68_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB68_33; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| min.f64 %fd42, %fd44, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB68_33: |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB68_35; |
| |
| ld.shared.f64 %fd43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 8; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f64 [%rd11], %fd43; |
| |
| BB68_35: |
| ret; |
| } |
| |
| // .globl reduce_min_f |
| // reduce_min_f: block-wide minimum of f32 values; one result is stored per thread block. |
| // param_0 (u64): input pointer, read as f32 through the global address space. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): exclusive upper bound on the input indices that are read. |
| // Scratch: extern shared array "memory", one 4-byte slot per thread at byte offset tid.x * 4. |
| // The accumulator starts at 0f7F7FFFFF, the largest finite f32 (not +infinity). |
| .visible .entry reduce_min_f( |
| .param .u64 reduce_min_f_param_0, |
| .param .u64 reduce_min_f_param_1, |
| .param .u32 reduce_min_f_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<60>; |
| .reg .b32 %r<36>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_min_f_param_0]; |
| ld.param.u64 %rd2, [reduce_min_f_param_1]; |
| ld.param.u32 %r6, [reduce_min_f_param_2]; |
| // %r35 = 2 * ctaid.x * ntid.x + tid.x: first input index for this thread. |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| mov.f32 %f44, 0f7F7FFFFF; |
| // Threads that start at or beyond param_2 skip the load loop and contribute the initial value. |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB69_4; |
| |
| // Load loop: fold input[%r35] and, if %r35 + ntid.x is below param_2, that element too. |
| BB69_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| min.f32 %f44, %f44, %f30; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB69_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| min.f32 %f44, %f44, %f31; |
| |
| // Step %r35 by 2 * ntid.x * nctaid.x and loop while it stays below param_2. |
| BB69_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB69_1; |
| |
| // Write this thread's partial result to its shared slot (address %r5) and synchronize the block. |
| BB69_4: |
| shl.b32 %r16, %r7, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f44; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB69_8; |
| |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB69_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| min.f32 %f44, %f44, %f32; |
| st.shared.f32 [%r5], %f44; |
| |
| BB69_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB69_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB69_12; |
| |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB69_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| min.f32 %f44, %f44, %f33; |
| st.shared.f32 [%r5], %f44; |
| |
| BB69_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB69_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB69_16; |
| |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB69_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| min.f32 %f44, %f44, %f34; |
| st.shared.f32 [%r5], %f44; |
| |
| BB69_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB69_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB69_20; |
| |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB69_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| min.f32 %f44, %f44, %f35; |
| st.shared.f32 [%r5], %f44; |
| |
| BB69_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB69_20: |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB69_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB69_23; |
| |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| min.f32 %f44, %f44, %f36; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB69_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB69_25; |
| |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| min.f32 %f44, %f44, %f37; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB69_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB69_27; |
| |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| min.f32 %f44, %f44, %f38; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB69_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB69_29; |
| |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| min.f32 %f44, %f44, %f39; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB69_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB69_31; |
| |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| min.f32 %f44, %f44, %f40; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB69_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB69_33; |
| |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| min.f32 %f42, %f44, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB69_33: |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB69_35; |
| |
| ld.shared.f32 %f43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f32 [%rd11], %f43; |
| |
| BB69_35: |
| ret; |
| } |
| |
| // .globl reduce_row_min_d |
| // reduce_row_min_d: block ctaid.x computes the f64 minimum over one run of param_3 consecutive elements. |
| // param_0 (u64): input pointer; the element index read is ctaid.x * param_3 + offset. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): blocks with ctaid.x >= param_2 return immediately (presumably the row count -- confirm). |
| // param_3 (u32): number of elements scanned per block (presumably the column count -- confirm). |
| // Scratch: extern shared array "memory", one 8-byte slot per thread at byte offset tid.x * 8. |
| // The accumulator starts at 0d7FEFFFFFFFFFFFFF, the largest finite f64 (not +infinity). |
| // NOTE(review): the element index ctaid.x * param_3 + offset is computed in 32 bits. |
| .visible .entry reduce_row_min_d( |
| .param .u64 reduce_row_min_d_param_0, |
| .param .u64 reduce_row_min_d_param_1, |
| .param .u32 reduce_row_min_d_param_2, |
| .param .u32 reduce_row_min_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_min_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_min_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_min_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_min_d_param_3]; |
| // Blocks whose ctaid.x is >= param_2 have nothing to do and return. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB70_35; |
| |
| // %r71 starts at tid.x and is the offset within the run; %fd6 is the accumulator. |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0d7FEFFFFFFFFFFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB70_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Load loop: fold input[ctaid.x * param_3 + %r71], stepping %r71 by ntid.x while it is below param_3. |
| BB70_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| min.f64 %fd6, %fd6, %fd28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB70_3; |
| |
| // Write this thread's partial result to its shared slot (address %r13) and synchronize the block. |
| BB70_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB70_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB70_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| min.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB70_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB70_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB70_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB70_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| min.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB70_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB70_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB70_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB70_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| min.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB70_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB70_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB70_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB70_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| min.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB70_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB70_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB70_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB70_23; |
| |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| min.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB70_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB70_25; |
| |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| min.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB70_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB70_27; |
| |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| min.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB70_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB70_29; |
| |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| min.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB70_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB70_31; |
| |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| min.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB70_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB70_33; |
| |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| min.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB70_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB70_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB70_35: |
| ret; |
| } |
| |
| // .globl reduce_row_min_f |
| // reduce_row_min_f: block ctaid.x computes the f32 minimum over one run of param_3 consecutive elements. |
| // param_0 (u64): input pointer; the element index read is ctaid.x * param_3 + offset. |
| // param_1 (u64): output pointer; thread 0 stores the block result at element ctaid.x. |
| // param_2 (u32): blocks with ctaid.x >= param_2 return immediately (presumably the row count -- confirm). |
| // param_3 (u32): number of elements scanned per block (presumably the column count -- confirm). |
| // Scratch: extern shared array "memory", one 4-byte slot per thread at byte offset tid.x * 4. |
| // The accumulator starts at 0f7F7FFFFF, the largest finite f32 (not +infinity). |
| // NOTE(review): the element index ctaid.x * param_3 + offset is computed in 32 bits. |
| .visible .entry reduce_row_min_f( |
| .param .u64 reduce_row_min_f_param_0, |
| .param .u64 reduce_row_min_f_param_1, |
| .param .u32 reduce_row_min_f_param_2, |
| .param .u32 reduce_row_min_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<56>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_min_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_min_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_min_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_min_f_param_3]; |
| // Blocks whose ctaid.x is >= param_2 have nothing to do and return. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB71_35; |
| |
| // %r71 starts at tid.x and is the offset within the run; %f6 is the accumulator. |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0f7F7FFFFF; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB71_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| |
| // Load loop: fold input[ctaid.x * param_3 + %r71], stepping %r71 by ntid.x while it is below param_3. |
| BB71_3: |
| mad.lo.s32 %r8, %r6, %r4, %r71; |
| mul.wide.u32 %rd4, %r8, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| min.f32 %f6, %f6, %f28; |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB71_3; |
| |
| // Write this thread's partial result to its shared slot (address %r13) and synchronize the block. |
| BB71_4: |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // ntid.x >= 1024: threads 0..511 fold in slot tid.x + 512, then the block synchronizes. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB71_8; |
| |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB71_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| min.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB71_7: |
| bar.sync 0; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid.x + 256, then the block synchronizes. |
| BB71_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB71_12; |
| |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB71_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| min.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB71_11: |
| bar.sync 0; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid.x + 128, then the block synchronizes. |
| BB71_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB71_16; |
| |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB71_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| min.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB71_15: |
| bar.sync 0; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid.x + 64, then the block synchronizes. |
| BB71_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB71_20; |
| |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB71_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| min.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB71_19: |
| bar.sync 0; |
| |
| // Threads with tid.x > 31 skip to the end; threads 0..31 continue with volatile shared |
| // accesses and no further bar.sync. |
| // NOTE(review): these steps rely on threads 0..31 observing each other's stores without a |
| // barrier -- confirm this holds on every target this PTX is compiled for. |
| BB71_20: |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB71_33; |
| |
| // ntid.x >= 64: fold in slot tid.x + 32. |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB71_23; |
| |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| min.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 32: fold in slot tid.x + 16. |
| BB71_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB71_25; |
| |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| min.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 16: fold in slot tid.x + 8. |
| BB71_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB71_27; |
| |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| min.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 8: fold in slot tid.x + 4. |
| BB71_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB71_29; |
| |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| min.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 4: fold in slot tid.x + 2. |
| BB71_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB71_31; |
| |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| min.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| // ntid.x >= 2: fold in slot tid.x + 1. |
| BB71_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB71_33; |
| |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| min.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| // Only tid.x == 0 stores: the value at the start of "memory" is copied to output[ctaid.x]. |
| BB71_33: |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB71_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.wide.u32 %rd7, %r6, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f40; |
| |
| BB71_35: |
| ret; |
| } |
| |
| // .globl reduce_col_min_d |
| // reduce_col_min_d: per-column minimum of an f64 matrix stored row after row. |
| // Thread g = ntid.x*ctaid.x + tid.x handles column g: it visits elements |
| // g, g+cols, g+2*cols, ... below rows*cols and stores the minimum to output[g]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f64 per column |
| //   param_2: row count (u32) |
| //   param_3: column count (u32); threads with g >= param_3 exit at once |
| // The bound rows*cols and the running element index are kept in 64 bits so |
| // neither can wrap for matrices with 2^32 or more elements. |
| .visible .entry reduce_col_min_d( |
| .param .u64 reduce_col_min_d_param_0, |
| .param .u64 reduce_col_min_d_param_1, |
| .param .u32 reduce_col_min_d_param_2, |
| .param .u32 reduce_col_min_d_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<11>; |
| .reg .f64 %fd<9>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd2, [reduce_col_min_d_param_0]; |
| ld.param.u64 %rd3, [reduce_col_min_d_param_1]; |
| ld.param.u32 %r5, [reduce_col_min_d_param_2]; |
| ld.param.u32 %r6, [reduce_col_min_d_param_3]; |
| // %r1 = global thread index = column handled by this thread. |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %tid.x; |
| mad.lo.s32 %r1, %r7, %r8, %r9; |
| setp.ge.u32 %p1, %r1, %r6; |
| @%p1 bra BB72_5; |
| |
| // %rd9 = rows*cols (64-bit bound), %rd10 = element cursor, %rd11 = stride (cols). |
| mul.wide.u32 %rd9, %r6, %r5; |
| cvt.u64.u32 %rd10, %r1; |
| cvt.u64.u32 %rd11, %r6; |
| cvta.to.global.u64 %rd1, %rd2; |
| // Running minimum starts at the largest finite f64 (not +inf). |
| mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF; |
| setp.ge.u64 %p2, %rd10, %rd9; |
| @%p2 bra BB72_4; |
| |
| BB72_3: |
| // Load element at the cursor (8 bytes each) and fold it into the minimum. |
| shl.b64 %rd4, %rd10, 3; |
| add.s64 %rd5, %rd1, %rd4; |
| ld.global.f64 %fd6, [%rd5]; |
| min.f64 %fd8, %fd8, %fd6; |
| // Step down one row. |
| add.s64 %rd10, %rd10, %rd11; |
| setp.lt.u64 %p3, %rd10, %rd9; |
| @%p3 bra BB72_3; |
| |
| BB72_4: |
| // output[column] = minimum |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r1, 8; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f64 [%rd8], %fd8; |
| |
| BB72_5: |
| ret; |
| } |
| |
| // .globl reduce_col_min_f |
| // reduce_col_min_f: per-column minimum of an f32 matrix stored row after row. |
| // Thread g = ntid.x*ctaid.x + tid.x handles column g: it visits elements |
| // g, g+cols, g+2*cols, ... below rows*cols and stores the minimum to output[g]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f32 per column |
| //   param_2: row count (u32) |
| //   param_3: column count (u32); threads with g >= param_3 exit at once |
| // The bound rows*cols and the running element index are kept in 64 bits so |
| // neither can wrap for matrices with 2^32 or more elements. |
| .visible .entry reduce_col_min_f( |
| .param .u64 reduce_col_min_f_param_0, |
| .param .u64 reduce_col_min_f_param_1, |
| .param .u32 reduce_col_min_f_param_2, |
| .param .u32 reduce_col_min_f_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<9>; |
| .reg .b32 %r<11>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd2, [reduce_col_min_f_param_0]; |
| ld.param.u64 %rd3, [reduce_col_min_f_param_1]; |
| ld.param.u32 %r5, [reduce_col_min_f_param_2]; |
| ld.param.u32 %r6, [reduce_col_min_f_param_3]; |
| // %r1 = global thread index = column handled by this thread. |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %tid.x; |
| mad.lo.s32 %r1, %r7, %r8, %r9; |
| setp.ge.u32 %p1, %r1, %r6; |
| @%p1 bra BB73_5; |
| |
| // %rd9 = rows*cols (64-bit bound), %rd10 = element cursor, %rd11 = stride (cols). |
| mul.wide.u32 %rd9, %r6, %r5; |
| cvt.u64.u32 %rd10, %r1; |
| cvt.u64.u32 %rd11, %r6; |
| cvta.to.global.u64 %rd1, %rd2; |
| // Running minimum starts at the largest finite f32 (not +inf). |
| mov.f32 %f8, 0f7F7FFFFF; |
| setp.ge.u64 %p2, %rd10, %rd9; |
| @%p2 bra BB73_4; |
| |
| BB73_3: |
| // Load element at the cursor (4 bytes each) and fold it into the minimum. |
| shl.b64 %rd4, %rd10, 2; |
| add.s64 %rd5, %rd1, %rd4; |
| ld.global.f32 %f6, [%rd5]; |
| min.f32 %f8, %f8, %f6; |
| // Step down one row. |
| add.s64 %rd10, %rd10, %rd11; |
| setp.lt.u64 %p3, %rd10, %rd9; |
| @%p3 bra BB73_3; |
| |
| BB73_4: |
| // output[column] = minimum |
| cvta.to.global.u64 %rd6, %rd3; |
| mul.wide.u32 %rd7, %r1, 4; |
| add.s64 %rd8, %rd6, %rd7; |
| st.global.f32 [%rd8], %f8; |
| |
| BB73_5: |
| ret; |
| } |
| |
| // .globl reduce_prod_d |
| // reduce_prod_d: block-wise product of an f64 array. |
| // Each thread multiplies together elements i and i+ntid.x, starting at |
| // i = 2*ctaid.x*ntid.x + tid.x and advancing by 2*ntid.x*nctaid.x while i < n. |
| // The per-thread products are combined through the extern shared array |
| // `memory` (one f64 slot per thread) and thread 0 stores the block's product |
| // to output[ctaid.x]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f64 per block |
| //   param_2: element count n (u32) |
| // NOTE(review): all index arithmetic is 32-bit, so i+ntid.x and the stride |
| // update can wrap when n is close to 2^32 - confirm callers stay below that. |
| .visible .entry reduce_prod_d( |
| .param .u64 reduce_prod_d_param_0, |
| .param .u64 reduce_prod_d_param_1, |
| .param .u32 reduce_prod_d_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<36>; |
| .reg .f64 %fd<60>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_prod_d_param_0]; |
| ld.param.u64 %rd2, [reduce_prod_d_param_1]; |
| ld.param.u32 %r6, [reduce_prod_d_param_2]; |
| // %r7 = tid.x, %r8 = ctaid.x, %r10 = ntid.x, %r35 = i (element cursor). |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| // %fd44 = running product, starting at 1.0. |
| mov.f64 %fd44, 0d3FF0000000000000; |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB74_4; |
| |
| BB74_1: |
| // Multiply in element i. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| mul.f64 %fd44, %fd44, %fd30; |
| // Multiply in element i+ntid.x when it is still inside the array. |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB74_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd31, [%rd8]; |
| mul.f64 %fd44, %fd44, %fd31; |
| |
| BB74_3: |
| // i += 2 * ntid.x * nctaid.x |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB74_1; |
| |
| BB74_4: |
| // Publish this thread's product in its shared slot (%r5 = &memory[tid]). |
| shl.b32 %r16, %r7, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd44; |
| bar.sync 0; |
| // Unrolled tree reduction. Each stage runs only if ntid.x is large enough; |
| // the lower half of the active range folds in the upper half. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB74_8; |
| |
| // ntid.x >= 1024: threads 0..511 fold in slot tid+512 (4096 bytes ahead). |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB74_7; |
| |
| ld.shared.f64 %fd32, [%r5+4096]; |
| mul.f64 %fd44, %fd44, %fd32; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB74_7: |
| bar.sync 0; |
| |
| BB74_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB74_12; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid+256. |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB74_11; |
| |
| ld.shared.f64 %fd33, [%r5+2048]; |
| mul.f64 %fd44, %fd44, %fd33; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB74_11: |
| bar.sync 0; |
| |
| BB74_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB74_16; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid+128. |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB74_15; |
| |
| ld.shared.f64 %fd34, [%r5+1024]; |
| mul.f64 %fd44, %fd44, %fd34; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB74_15: |
| bar.sync 0; |
| |
| BB74_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB74_20; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid+64. |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB74_19; |
| |
| ld.shared.f64 %fd35, [%r5+512]; |
| mul.f64 %fd44, %fd44, %fd35; |
| st.shared.f64 [%r5], %fd44; |
| |
| BB74_19: |
| bar.sync 0; |
| |
| BB74_20: |
| // Final stages: only threads 0..31 continue. No barrier is issued between |
| // these steps; the accesses are volatile so every step goes through shared |
| // memory. Statement order must be preserved here. |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB74_33; |
| |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB74_23; |
| |
| // fold in slot tid+32 |
| ld.volatile.shared.f64 %fd36, [%r5+256]; |
| mul.f64 %fd44, %fd44, %fd36; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB74_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB74_25; |
| |
| // fold in slot tid+16 |
| ld.volatile.shared.f64 %fd37, [%r5+128]; |
| mul.f64 %fd44, %fd44, %fd37; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB74_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB74_27; |
| |
| // fold in slot tid+8 |
| ld.volatile.shared.f64 %fd38, [%r5+64]; |
| mul.f64 %fd44, %fd44, %fd38; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB74_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB74_29; |
| |
| // fold in slot tid+4 |
| ld.volatile.shared.f64 %fd39, [%r5+32]; |
| mul.f64 %fd44, %fd44, %fd39; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB74_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB74_31; |
| |
| // fold in slot tid+2 |
| ld.volatile.shared.f64 %fd40, [%r5+16]; |
| mul.f64 %fd44, %fd44, %fd40; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| BB74_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB74_33; |
| |
| // fold in slot tid+1 |
| ld.volatile.shared.f64 %fd41, [%r5+8]; |
| mul.f64 %fd42, %fd44, %fd41; |
| st.volatile.shared.f64 [%r5], %fd42; |
| |
| BB74_33: |
| // Thread 0 writes memory[0] to output[ctaid.x]. |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB74_35; |
| |
| ld.shared.f64 %fd43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 8; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f64 [%rd11], %fd43; |
| |
| BB74_35: |
| ret; |
| } |
| |
| // .globl reduce_prod_f |
| // reduce_prod_f: block-wise product of an f32 array. |
| // Each thread multiplies together elements i and i+ntid.x, starting at |
| // i = 2*ctaid.x*ntid.x + tid.x and advancing by 2*ntid.x*nctaid.x while i < n. |
| // The per-thread products are combined through the extern shared array |
| // `memory` (one f32 slot per thread) and thread 0 stores the block's product |
| // to output[ctaid.x]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f32 per block |
| //   param_2: element count n (u32) |
| // NOTE(review): all index arithmetic is 32-bit, so i+ntid.x and the stride |
| // update can wrap when n is close to 2^32 - confirm callers stay below that. |
| .visible .entry reduce_prod_f( |
| .param .u64 reduce_prod_f_param_0, |
| .param .u64 reduce_prod_f_param_1, |
| .param .u32 reduce_prod_f_param_2 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<60>; |
| .reg .b32 %r<36>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [reduce_prod_f_param_0]; |
| ld.param.u64 %rd2, [reduce_prod_f_param_1]; |
| ld.param.u32 %r6, [reduce_prod_f_param_2]; |
| // %r7 = tid.x, %r8 = ctaid.x, %r10 = ntid.x, %r35 = i (element cursor). |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| // %f44 = running product, starting at 1.0. |
| mov.f32 %f44, 0f3F800000; |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB75_4; |
| |
| BB75_1: |
| // Multiply in element i. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| mul.f32 %f44, %f44, %f30; |
| // Multiply in element i+ntid.x when it is still inside the array. |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p2, %r3, %r6; |
| @%p2 bra BB75_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f31, [%rd8]; |
| mul.f32 %f44, %f44, %f31; |
| |
| BB75_3: |
| // i += 2 * ntid.x * nctaid.x |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p3, %r35, %r6; |
| @%p3 bra BB75_1; |
| |
| BB75_4: |
| // Publish this thread's product in its shared slot (%r5 = &memory[tid]). |
| shl.b32 %r16, %r7, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f44; |
| bar.sync 0; |
| // Unrolled tree reduction. Each stage runs only if ntid.x is large enough; |
| // the lower half of the active range folds in the upper half. |
| setp.lt.u32 %p4, %r10, 1024; |
| @%p4 bra BB75_8; |
| |
| // ntid.x >= 1024: threads 0..511 fold in slot tid+512 (2048 bytes ahead). |
| setp.gt.u32 %p5, %r7, 511; |
| @%p5 bra BB75_7; |
| |
| ld.shared.f32 %f32, [%r5+2048]; |
| mul.f32 %f44, %f44, %f32; |
| st.shared.f32 [%r5], %f44; |
| |
| BB75_7: |
| bar.sync 0; |
| |
| BB75_8: |
| setp.lt.u32 %p6, %r10, 512; |
| @%p6 bra BB75_12; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid+256. |
| setp.gt.u32 %p7, %r7, 255; |
| @%p7 bra BB75_11; |
| |
| ld.shared.f32 %f33, [%r5+1024]; |
| mul.f32 %f44, %f44, %f33; |
| st.shared.f32 [%r5], %f44; |
| |
| BB75_11: |
| bar.sync 0; |
| |
| BB75_12: |
| setp.lt.u32 %p8, %r10, 256; |
| @%p8 bra BB75_16; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid+128. |
| setp.gt.u32 %p9, %r7, 127; |
| @%p9 bra BB75_15; |
| |
| ld.shared.f32 %f34, [%r5+512]; |
| mul.f32 %f44, %f44, %f34; |
| st.shared.f32 [%r5], %f44; |
| |
| BB75_15: |
| bar.sync 0; |
| |
| BB75_16: |
| setp.lt.u32 %p10, %r10, 128; |
| @%p10 bra BB75_20; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid+64. |
| setp.gt.u32 %p11, %r7, 63; |
| @%p11 bra BB75_19; |
| |
| ld.shared.f32 %f35, [%r5+256]; |
| mul.f32 %f44, %f44, %f35; |
| st.shared.f32 [%r5], %f44; |
| |
| BB75_19: |
| bar.sync 0; |
| |
| BB75_20: |
| // Final stages: only threads 0..31 continue. No barrier is issued between |
| // these steps; the accesses are volatile so every step goes through shared |
| // memory. Statement order must be preserved here. |
| setp.gt.u32 %p12, %r7, 31; |
| @%p12 bra BB75_33; |
| |
| setp.lt.u32 %p13, %r10, 64; |
| @%p13 bra BB75_23; |
| |
| // fold in slot tid+32 |
| ld.volatile.shared.f32 %f36, [%r5+128]; |
| mul.f32 %f44, %f44, %f36; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB75_23: |
| setp.lt.u32 %p14, %r10, 32; |
| @%p14 bra BB75_25; |
| |
| // fold in slot tid+16 |
| ld.volatile.shared.f32 %f37, [%r5+64]; |
| mul.f32 %f44, %f44, %f37; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB75_25: |
| setp.lt.u32 %p15, %r10, 16; |
| @%p15 bra BB75_27; |
| |
| // fold in slot tid+8 |
| ld.volatile.shared.f32 %f38, [%r5+32]; |
| mul.f32 %f44, %f44, %f38; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB75_27: |
| setp.lt.u32 %p16, %r10, 8; |
| @%p16 bra BB75_29; |
| |
| // fold in slot tid+4 |
| ld.volatile.shared.f32 %f39, [%r5+16]; |
| mul.f32 %f44, %f44, %f39; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB75_29: |
| setp.lt.u32 %p17, %r10, 4; |
| @%p17 bra BB75_31; |
| |
| // fold in slot tid+2 |
| ld.volatile.shared.f32 %f40, [%r5+8]; |
| mul.f32 %f44, %f44, %f40; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| BB75_31: |
| setp.lt.u32 %p18, %r10, 2; |
| @%p18 bra BB75_33; |
| |
| // fold in slot tid+1 |
| ld.volatile.shared.f32 %f41, [%r5+4]; |
| mul.f32 %f42, %f44, %f41; |
| st.volatile.shared.f32 [%r5], %f42; |
| |
| BB75_33: |
| // Thread 0 writes memory[0] to output[ctaid.x]. |
| setp.ne.s32 %p19, %r7, 0; |
| @%p19 bra BB75_35; |
| |
| ld.shared.f32 %f43, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f32 [%rd11], %f43; |
| |
| BB75_35: |
| ret; |
| } |
| |
| // .globl reduce_row_mean_d |
| // reduce_row_mean_d: per-row mean of an f64 matrix stored row after row. |
| // Block ctaid.x handles row ctaid.x: its threads stride across the row, |
| // combine their partial sums through the extern shared array `memory` |
| // (one f64 slot per thread) and thread 0 stores sum / cols to output[row]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f64 per row |
| //   param_2: row count (u32); blocks with ctaid.x >= param_2 exit at once |
| //   param_3: column count (u32), also the divisor of the mean |
| // The element index row*cols+col is formed in 64 bits so that matrices with |
| // 2^32 or more elements are addressed without wrap-around. |
| .visible .entry reduce_row_mean_d( |
| .param .u64 reduce_row_mean_d_param_0, |
| .param .u64 reduce_row_mean_d_param_1, |
| .param .u32 reduce_row_mean_d_param_2, |
| .param .u32 reduce_row_mean_d_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .b32 %r<72>; |
| .reg .f64 %fd<58>; |
| .reg .b64 %rd<13>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_mean_d_param_0]; |
| ld.param.u64 %rd2, [reduce_row_mean_d_param_1]; |
| ld.param.u32 %r5, [reduce_row_mean_d_param_2]; |
| ld.param.u32 %r4, [reduce_row_mean_d_param_3]; |
| // %r6 = row handled by this block; leave if it is past the last row. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB76_35; |
| |
| // %r71 = column cursor, %fd6 = running sum (starts at 0.0). |
| mov.u32 %r71, %tid.x; |
| mov.f64 %fd6, 0d0000000000000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB76_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // %rd10 = 64-bit element index of the first element of this row (row * cols). |
| mul.wide.u32 %rd10, %r6, %r4; |
| |
| BB76_3: |
| // Byte offset = (row * cols + col) * 8, computed without 32-bit wrap. |
| cvt.u64.u32 %rd11, %r71; |
| add.s64 %rd12, %rd10, %rd11; |
| shl.b64 %rd4, %rd12, 3; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd28, [%rd5]; |
| add.f64 %fd6, %fd6, %fd28; |
| // Advance the column cursor by the block size. |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB76_3; |
| |
| BB76_4: |
| // Publish this thread's partial sum in its shared slot (%r13 = &memory[tid]). |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 3; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f64 [%r13], %fd6; |
| bar.sync 0; |
| // Unrolled tree reduction. Each stage runs only if the block is large enough |
| // (%r14 = ntid.x); the lower half of the active range folds in the upper half. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB76_8; |
| |
| // ntid.x >= 1024: threads 0..511 fold in slot tid+512 (4096 bytes ahead). |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB76_7; |
| |
| ld.shared.f64 %fd29, [%r13+4096]; |
| add.f64 %fd6, %fd6, %fd29; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB76_7: |
| bar.sync 0; |
| |
| BB76_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB76_12; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid+256. |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB76_11; |
| |
| ld.shared.f64 %fd30, [%r13+2048]; |
| add.f64 %fd6, %fd6, %fd30; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB76_11: |
| bar.sync 0; |
| |
| BB76_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB76_16; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid+128. |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB76_15; |
| |
| ld.shared.f64 %fd31, [%r13+1024]; |
| add.f64 %fd6, %fd6, %fd31; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB76_15: |
| bar.sync 0; |
| |
| BB76_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB76_20; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid+64. |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB76_19; |
| |
| ld.shared.f64 %fd32, [%r13+512]; |
| add.f64 %fd6, %fd6, %fd32; |
| st.shared.f64 [%r13], %fd6; |
| |
| BB76_19: |
| bar.sync 0; |
| |
| BB76_20: |
| // Final stages: only threads 0..31 continue. No barrier is issued between |
| // these steps; the accesses are volatile so every step goes through shared |
| // memory. Statement order must be preserved here. |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB76_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB76_23; |
| |
| // fold in slot tid+32 |
| ld.volatile.shared.f64 %fd33, [%r13+256]; |
| add.f64 %fd6, %fd6, %fd33; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB76_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB76_25; |
| |
| // fold in slot tid+16 |
| ld.volatile.shared.f64 %fd34, [%r13+128]; |
| add.f64 %fd6, %fd6, %fd34; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB76_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB76_27; |
| |
| // fold in slot tid+8 |
| ld.volatile.shared.f64 %fd35, [%r13+64]; |
| add.f64 %fd6, %fd6, %fd35; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB76_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB76_29; |
| |
| // fold in slot tid+4 |
| ld.volatile.shared.f64 %fd36, [%r13+32]; |
| add.f64 %fd6, %fd6, %fd36; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB76_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB76_31; |
| |
| // fold in slot tid+2 |
| ld.volatile.shared.f64 %fd37, [%r13+16]; |
| add.f64 %fd6, %fd6, %fd37; |
| st.volatile.shared.f64 [%r13], %fd6; |
| |
| BB76_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB76_33; |
| |
| // fold in slot tid+1 |
| ld.volatile.shared.f64 %fd38, [%r13+8]; |
| add.f64 %fd39, %fd6, %fd38; |
| st.volatile.shared.f64 [%r13], %fd39; |
| |
| BB76_33: |
| // Thread 0 divides the row sum by the column count and stores the mean. |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB76_35; |
| |
| ld.shared.f64 %fd40, [memory]; |
| // cols is zero-extended to 64 bits first, so the signed conversion is exact. |
| cvt.u64.u32 %rd6, %r4; |
| cvt.rn.f64.s64 %fd41, %rd6; |
| div.rn.f64 %fd42, %fd40, %fd41; |
| cvta.to.global.u64 %rd7, %rd2; |
| mul.wide.u32 %rd8, %r6, 8; |
| add.s64 %rd9, %rd7, %rd8; |
| st.global.f64 [%rd9], %fd42; |
| |
| BB76_35: |
| ret; |
| } |
| |
| // .globl reduce_row_mean_f |
| // reduce_row_mean_f: per-row mean of an f32 matrix stored row after row. |
| // Block ctaid.x handles row ctaid.x: its threads stride across the row, |
| // combine their partial sums through the extern shared array `memory` |
| // (one f32 slot per thread) and thread 0 stores sum / cols to output[row]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f32 per row |
| //   param_2: row count (u32); blocks with ctaid.x >= param_2 exit at once |
| //   param_3: column count (u32), also the divisor of the mean |
| // The element index row*cols+col is formed in 64 bits so that matrices with |
| // 2^32 or more elements are addressed without wrap-around. |
| .visible .entry reduce_row_mean_f( |
| .param .u64 reduce_row_mean_f_param_0, |
| .param .u64 reduce_row_mean_f_param_1, |
| .param .u32 reduce_row_mean_f_param_2, |
| .param .u32 reduce_row_mean_f_param_3 |
| ) |
| { |
| .reg .pred %p<20>; |
| .reg .f32 %f<58>; |
| .reg .b32 %r<72>; |
| .reg .b64 %rd<13>; |
| |
| |
| ld.param.u64 %rd1, [reduce_row_mean_f_param_0]; |
| ld.param.u64 %rd2, [reduce_row_mean_f_param_1]; |
| ld.param.u32 %r5, [reduce_row_mean_f_param_2]; |
| ld.param.u32 %r4, [reduce_row_mean_f_param_3]; |
| // %r6 = row handled by this block; leave if it is past the last row. |
| mov.u32 %r6, %ctaid.x; |
| setp.ge.u32 %p1, %r6, %r5; |
| @%p1 bra BB77_35; |
| |
| // %r71 = column cursor, %f6 = running sum (starts at 0.0). |
| mov.u32 %r71, %tid.x; |
| mov.f32 %f6, 0f00000000; |
| setp.ge.u32 %p2, %r71, %r4; |
| @%p2 bra BB77_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // %rd10 = 64-bit element index of the first element of this row (row * cols). |
| mul.wide.u32 %rd10, %r6, %r4; |
| |
| BB77_3: |
| // Byte offset = (row * cols + col) * 4, computed without 32-bit wrap. |
| cvt.u64.u32 %rd11, %r71; |
| add.s64 %rd12, %rd10, %rd11; |
| shl.b64 %rd4, %rd12, 2; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f28, [%rd5]; |
| add.f32 %f6, %f6, %f28; |
| // Advance the column cursor by the block size. |
| mov.u32 %r9, %ntid.x; |
| add.s32 %r71, %r9, %r71; |
| setp.lt.u32 %p3, %r71, %r4; |
| @%p3 bra BB77_3; |
| |
| BB77_4: |
| // Publish this thread's partial sum in its shared slot (%r13 = &memory[tid]). |
| mov.u32 %r10, %tid.x; |
| shl.b32 %r11, %r10, 2; |
| mov.u32 %r12, memory; |
| add.s32 %r13, %r12, %r11; |
| st.shared.f32 [%r13], %f6; |
| bar.sync 0; |
| // Unrolled tree reduction. Each stage runs only if the block is large enough |
| // (%r14 = ntid.x); the lower half of the active range folds in the upper half. |
| mov.u32 %r14, %ntid.x; |
| setp.lt.u32 %p4, %r14, 1024; |
| @%p4 bra BB77_8; |
| |
| // ntid.x >= 1024: threads 0..511 fold in slot tid+512 (2048 bytes ahead). |
| setp.gt.u32 %p5, %r10, 511; |
| @%p5 bra BB77_7; |
| |
| ld.shared.f32 %f29, [%r13+2048]; |
| add.f32 %f6, %f6, %f29; |
| st.shared.f32 [%r13], %f6; |
| |
| BB77_7: |
| bar.sync 0; |
| |
| BB77_8: |
| setp.lt.u32 %p6, %r14, 512; |
| @%p6 bra BB77_12; |
| |
| // ntid.x >= 512: threads 0..255 fold in slot tid+256. |
| setp.gt.u32 %p7, %r10, 255; |
| @%p7 bra BB77_11; |
| |
| ld.shared.f32 %f30, [%r13+1024]; |
| add.f32 %f6, %f6, %f30; |
| st.shared.f32 [%r13], %f6; |
| |
| BB77_11: |
| bar.sync 0; |
| |
| BB77_12: |
| setp.lt.u32 %p8, %r14, 256; |
| @%p8 bra BB77_16; |
| |
| // ntid.x >= 256: threads 0..127 fold in slot tid+128. |
| setp.gt.u32 %p9, %r10, 127; |
| @%p9 bra BB77_15; |
| |
| ld.shared.f32 %f31, [%r13+512]; |
| add.f32 %f6, %f6, %f31; |
| st.shared.f32 [%r13], %f6; |
| |
| BB77_15: |
| bar.sync 0; |
| |
| BB77_16: |
| setp.lt.u32 %p10, %r14, 128; |
| @%p10 bra BB77_20; |
| |
| // ntid.x >= 128: threads 0..63 fold in slot tid+64. |
| setp.gt.u32 %p11, %r10, 63; |
| @%p11 bra BB77_19; |
| |
| ld.shared.f32 %f32, [%r13+256]; |
| add.f32 %f6, %f6, %f32; |
| st.shared.f32 [%r13], %f6; |
| |
| BB77_19: |
| bar.sync 0; |
| |
| BB77_20: |
| // Final stages: only threads 0..31 continue. No barrier is issued between |
| // these steps; the accesses are volatile so every step goes through shared |
| // memory. Statement order must be preserved here. |
| setp.gt.u32 %p12, %r10, 31; |
| @%p12 bra BB77_33; |
| |
| setp.lt.u32 %p13, %r14, 64; |
| @%p13 bra BB77_23; |
| |
| // fold in slot tid+32 |
| ld.volatile.shared.f32 %f33, [%r13+128]; |
| add.f32 %f6, %f6, %f33; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB77_23: |
| setp.lt.u32 %p14, %r14, 32; |
| @%p14 bra BB77_25; |
| |
| // fold in slot tid+16 |
| ld.volatile.shared.f32 %f34, [%r13+64]; |
| add.f32 %f6, %f6, %f34; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB77_25: |
| setp.lt.u32 %p15, %r14, 16; |
| @%p15 bra BB77_27; |
| |
| // fold in slot tid+8 |
| ld.volatile.shared.f32 %f35, [%r13+32]; |
| add.f32 %f6, %f6, %f35; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB77_27: |
| setp.lt.u32 %p16, %r14, 8; |
| @%p16 bra BB77_29; |
| |
| // fold in slot tid+4 |
| ld.volatile.shared.f32 %f36, [%r13+16]; |
| add.f32 %f6, %f6, %f36; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB77_29: |
| setp.lt.u32 %p17, %r14, 4; |
| @%p17 bra BB77_31; |
| |
| // fold in slot tid+2 |
| ld.volatile.shared.f32 %f37, [%r13+8]; |
| add.f32 %f6, %f6, %f37; |
| st.volatile.shared.f32 [%r13], %f6; |
| |
| BB77_31: |
| setp.lt.u32 %p18, %r14, 2; |
| @%p18 bra BB77_33; |
| |
| // fold in slot tid+1 |
| ld.volatile.shared.f32 %f38, [%r13+4]; |
| add.f32 %f39, %f6, %f38; |
| st.volatile.shared.f32 [%r13], %f39; |
| |
| BB77_33: |
| // Thread 0 divides the row sum by the column count and stores the mean. |
| setp.ne.s32 %p19, %r10, 0; |
| @%p19 bra BB77_35; |
| |
| ld.shared.f32 %f40, [memory]; |
| // cols is zero-extended to 64 bits first, so the signed conversion sees a |
| // non-negative value. |
| cvt.u64.u32 %rd6, %r4; |
| cvt.rn.f32.s64 %f41, %rd6; |
| div.rn.f32 %f42, %f40, %f41; |
| cvta.to.global.u64 %rd7, %rd2; |
| mul.wide.u32 %rd8, %r6, 4; |
| add.s64 %rd9, %rd7, %rd8; |
| st.global.f32 [%rd9], %f42; |
| |
| BB77_35: |
| ret; |
| } |
| |
| // .globl reduce_col_mean_d |
| // reduce_col_mean_d: per-column mean of an f64 matrix stored row after row. |
| // Thread g = ntid.x*ctaid.x + tid.x handles column g: it sums elements |
| // g, g+cols, g+2*cols, ... below rows*cols and stores sum / rows to output[g]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f64 per column |
| //   param_2: row count (u32), also the divisor of the mean |
| //   param_3: column count (u32); threads with g >= param_3 exit at once |
| // The bound rows*cols and the running element index are kept in 64 bits so |
| // neither can wrap for matrices with 2^32 or more elements. |
| .visible .entry reduce_col_mean_d( |
| .param .u64 reduce_col_mean_d_param_0, |
| .param .u64 reduce_col_mean_d_param_1, |
| .param .u32 reduce_col_mean_d_param_2, |
| .param .u32 reduce_col_mean_d_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<11>; |
| .reg .f64 %fd<11>; |
| .reg .b64 %rd<13>; |
| |
| |
| ld.param.u64 %rd2, [reduce_col_mean_d_param_0]; |
| ld.param.u64 %rd3, [reduce_col_mean_d_param_1]; |
| ld.param.u32 %r5, [reduce_col_mean_d_param_2]; |
| ld.param.u32 %r6, [reduce_col_mean_d_param_3]; |
| // %r1 = global thread index = column handled by this thread. |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %tid.x; |
| mad.lo.s32 %r1, %r7, %r8, %r9; |
| setp.ge.u32 %p1, %r1, %r6; |
| @%p1 bra BB78_5; |
| |
| // %rd10 = rows*cols (64-bit bound), %rd11 = element cursor, %rd12 = stride (cols). |
| mul.wide.u32 %rd10, %r6, %r5; |
| cvt.u64.u32 %rd11, %r1; |
| cvt.u64.u32 %rd12, %r6; |
| cvta.to.global.u64 %rd1, %rd2; |
| // Running sum starts at 0.0. |
| mov.f64 %fd10, 0d0000000000000000; |
| setp.ge.u64 %p2, %rd11, %rd10; |
| @%p2 bra BB78_4; |
| |
| BB78_3: |
| // Load element at the cursor (8 bytes each) and add it to the sum. |
| shl.b64 %rd4, %rd11, 3; |
| add.s64 %rd5, %rd1, %rd4; |
| ld.global.f64 %fd6, [%rd5]; |
| add.f64 %fd10, %fd10, %fd6; |
| // Step down one row. |
| add.s64 %rd11, %rd11, %rd12; |
| setp.lt.u64 %p3, %rd11, %rd10; |
| @%p3 bra BB78_3; |
| |
| BB78_4: |
| // output[column] = sum / rows. rows is zero-extended to 64 bits first, so |
| // the signed conversion is exact. |
| cvt.u64.u32 %rd6, %r5; |
| cvt.rn.f64.s64 %fd7, %rd6; |
| div.rn.f64 %fd8, %fd10, %fd7; |
| cvta.to.global.u64 %rd7, %rd3; |
| mul.wide.u32 %rd8, %r1, 8; |
| add.s64 %rd9, %rd7, %rd8; |
| st.global.f64 [%rd9], %fd8; |
| |
| BB78_5: |
| ret; |
| } |
| |
| // .globl reduce_col_mean_f |
| // reduce_col_mean_f: per-column mean of an f32 matrix stored row after row. |
| // Thread g = ntid.x*ctaid.x + tid.x handles column g: it sums elements |
| // g, g+cols, g+2*cols, ... below rows*cols and stores sum / rows to output[g]. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer, one f32 per column |
| //   param_2: row count (u32), also the divisor of the mean |
| //   param_3: column count (u32); threads with g >= param_3 exit at once |
| // The bound rows*cols and the running element index are kept in 64 bits so |
| // neither can wrap for matrices with 2^32 or more elements. |
| .visible .entry reduce_col_mean_f( |
| .param .u64 reduce_col_mean_f_param_0, |
| .param .u64 reduce_col_mean_f_param_1, |
| .param .u32 reduce_col_mean_f_param_2, |
| .param .u32 reduce_col_mean_f_param_3 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<11>; |
| .reg .b32 %r<11>; |
| .reg .b64 %rd<13>; |
| |
| |
| ld.param.u64 %rd2, [reduce_col_mean_f_param_0]; |
| ld.param.u64 %rd3, [reduce_col_mean_f_param_1]; |
| ld.param.u32 %r5, [reduce_col_mean_f_param_2]; |
| ld.param.u32 %r6, [reduce_col_mean_f_param_3]; |
| // %r1 = global thread index = column handled by this thread. |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %tid.x; |
| mad.lo.s32 %r1, %r7, %r8, %r9; |
| setp.ge.u32 %p1, %r1, %r6; |
| @%p1 bra BB79_5; |
| |
| // %rd10 = rows*cols (64-bit bound), %rd11 = element cursor, %rd12 = stride (cols). |
| mul.wide.u32 %rd10, %r6, %r5; |
| cvt.u64.u32 %rd11, %r1; |
| cvt.u64.u32 %rd12, %r6; |
| cvta.to.global.u64 %rd1, %rd2; |
| // Running sum starts at 0.0. |
| mov.f32 %f10, 0f00000000; |
| setp.ge.u64 %p2, %rd11, %rd10; |
| @%p2 bra BB79_4; |
| |
| BB79_3: |
| // Load element at the cursor (4 bytes each) and add it to the sum. |
| shl.b64 %rd4, %rd11, 2; |
| add.s64 %rd5, %rd1, %rd4; |
| ld.global.f32 %f6, [%rd5]; |
| add.f32 %f10, %f10, %f6; |
| // Step down one row. |
| add.s64 %rd11, %rd11, %rd12; |
| setp.lt.u64 %p3, %rd11, %rd10; |
| @%p3 bra BB79_3; |
| |
| BB79_4: |
| // output[column] = sum / rows. rows is zero-extended to 64 bits first, so |
| // the signed conversion sees a non-negative value. |
| cvt.u64.u32 %rd6, %r5; |
| cvt.rn.f32.s64 %f7, %rd6; |
| div.rn.f32 %f8, %f10, %f7; |
| cvta.to.global.u64 %rd7, %rd3; |
| mul.wide.u32 %rd8, %r1, 4; |
| add.s64 %rd9, %rd7, %rd8; |
| st.global.f32 [%rd9], %f8; |
| |
| BB79_5: |
| ret; |
| } |
| |
| // .globl matrix_exp_d |
| // matrix_exp_d: element-wise exponential of an f64 array. |
| // Thread g = ntid.x*ctaid.x + tid.x computes output[g] from input[g]; threads |
| // with g >= param_2 (unsigned compare) do nothing. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer |
| //   param_2: element count (u32) |
| // The index is widened as unsigned to match the unsigned bounds check, so |
| // indices >= 2^31 no longer produce a negative byte offset. |
| // The arithmetic below looks like an inlined double-precision exp(): |
| // n = round(x*log2(e)), r = x - n*ln(2), polynomial in r, then scaling by 2^n. |
| .visible .entry matrix_exp_d( |
| .param .u64 matrix_exp_d_param_0, |
| .param .u64 matrix_exp_d_param_1, |
| .param .u32 matrix_exp_d_param_2 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<3>; |
| .reg .b32 %r<21>; |
| .reg .f64 %fd<41>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_exp_d_param_0]; |
| ld.param.u64 %rd2, [matrix_exp_d_param_1]; |
| ld.param.u32 %r5, [matrix_exp_d_param_2]; |
| // %r1 = global element index; exit when past the end. |
| mov.u32 %r6, %ctaid.x; |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %tid.x; |
| mad.lo.s32 %r1, %r7, %r6, %r8; |
| setp.ge.u32 %p1, %r1, %r5; |
| @%p1 bra BB80_5; |
| |
| // %rd4 = byte offset of element g (reused for the store); %fd1 = x. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| // %fd8 = x*log2(e) + 1.5*2^52: adding the large constant leaves the rounded |
| // integer n in the low 32 bits of the result (%r2). |
| mov.f64 %fd6, 0d4338000000000000; |
| mov.f64 %fd7, 0d3FF71547652B82FE; |
| fma.rn.f64 %fd8, %fd1, %fd7, %fd6; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r2, %temp}, %fd8; |
| } |
| // %fd10 = n as a double. |
| mov.f64 %fd9, 0dC338000000000000; |
| add.rn.f64 %fd10, %fd8, %fd9; |
| // %fd14 = r = x - n*ln(2), with ln(2) applied as a main term plus a tiny |
| // correction term. |
| mov.f64 %fd11, 0dBFE62E42FEFA39EF; |
| fma.rn.f64 %fd12, %fd10, %fd11, %fd1; |
| mov.f64 %fd13, 0dBC7ABC9E3B39803F; |
| fma.rn.f64 %fd14, %fd10, %fd13, %fd12; |
| // Degree-11 polynomial in r, evaluated in Horner form (11 fused multiply-adds); |
| // the last three coefficients are ~0.5, 1.0 and 1.0. |
| mov.f64 %fd15, 0d3E928AF3FCA213EA; |
| mov.f64 %fd16, 0d3E5ADE1569CE2BDF; |
| fma.rn.f64 %fd17, %fd16, %fd14, %fd15; |
| mov.f64 %fd18, 0d3EC71DEE62401315; |
| fma.rn.f64 %fd19, %fd17, %fd14, %fd18; |
| mov.f64 %fd20, 0d3EFA01997C89EB71; |
| fma.rn.f64 %fd21, %fd19, %fd14, %fd20; |
| mov.f64 %fd22, 0d3F2A01A014761F65; |
| fma.rn.f64 %fd23, %fd21, %fd14, %fd22; |
| mov.f64 %fd24, 0d3F56C16C1852B7AF; |
| fma.rn.f64 %fd25, %fd23, %fd14, %fd24; |
| mov.f64 %fd26, 0d3F81111111122322; |
| fma.rn.f64 %fd27, %fd25, %fd14, %fd26; |
| mov.f64 %fd28, 0d3FA55555555502A1; |
| fma.rn.f64 %fd29, %fd27, %fd14, %fd28; |
| mov.f64 %fd30, 0d3FC5555555555511; |
| fma.rn.f64 %fd31, %fd29, %fd14, %fd30; |
| mov.f64 %fd32, 0d3FE000000000000B; |
| fma.rn.f64 %fd33, %fd31, %fd14, %fd32; |
| mov.f64 %fd34, 0d3FF0000000000000; |
| fma.rn.f64 %fd35, %fd33, %fd14, %fd34; |
| fma.rn.f64 %fd36, %fd35, %fd14, %fd34; |
| // Split the polynomial value into low word %r3 and high word %r4. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r3, %temp}, %fd36; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r4}, %fd36; |
| } |
| // Fast path result: add n to the exponent field (n << 20 into the high word). |
| shl.b32 %r9, %r2, 20; |
| add.s32 %r10, %r4, %r9; |
| mov.b64 %fd40, {%r3, %r10}; |
| // Range check on |x| using only the high word of x reinterpreted as f32. |
| // 0f4086232B is the high word of a double of about 708.4. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r11}, %fd1; |
| } |
| mov.b32 %f2, %r11; |
| abs.f32 %f1, %f2; |
| setp.lt.f32 %p2, %f1, 0f4086232B; |
| @%p2 bra BB80_4; |
| |
| // Large |x|: default to 0 for negative x, x + inf otherwise. |
| setp.lt.f64 %p3, %fd1, 0d0000000000000000; |
| add.f64 %fd37, %fd1, 0d7FF0000000000000; |
| selp.f64 %fd40, 0d0000000000000000, %fd37, %p3; |
| // Keep that default unless the high word is below 0f40874800 (745.0 as a |
| // double high word); setp.geu is also true for an unordered (NaN) compare. |
| setp.geu.f32 %p4, %f1, 0f40874800; |
| @%p4 bra BB80_4; |
| |
| // Borderline range: apply the scaling in two steps. k = n/2 (rounded toward |
| // zero) goes into the polynomial value; 2^(n-k) is built directly from its |
| // exponent bits (1072693248 = 0x3FF00000) and multiplied in. |
| shr.u32 %r12, %r2, 31; |
| add.s32 %r13, %r2, %r12; |
| shr.s32 %r14, %r13, 1; |
| shl.b32 %r15, %r14, 20; |
| add.s32 %r16, %r15, %r4; |
| mov.b64 %fd38, {%r3, %r16}; |
| sub.s32 %r17, %r2, %r14; |
| shl.b32 %r18, %r17, 20; |
| add.s32 %r19, %r18, 1072693248; |
| mov.u32 %r20, 0; |
| mov.b64 %fd39, {%r20, %r19}; |
| mul.f64 %fd40, %fd38, %fd39; |
| |
| BB80_4: |
| // output[g] = result |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd40; |
| |
| BB80_5: |
| ret; |
| } |
| |
| // .globl matrix_exp_f |
| // matrix_exp_f: element-wise exponential of an f32 array. |
| // Thread g = ntid.x*ctaid.x + tid.x computes output[g] from input[g]; threads |
| // with g >= param_2 (unsigned compare) do nothing. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer |
| //   param_2: element count (u32) |
| // The index is widened as unsigned to match the unsigned bounds check, so |
| // indices >= 2^31 no longer produce a negative byte offset. |
| .visible .entry matrix_exp_f( |
| .param .u64 matrix_exp_f_param_0, |
| .param .u64 matrix_exp_f_param_1, |
| .param .u32 matrix_exp_f_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<15>; |
| .reg .b32 %r<6>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [matrix_exp_f_param_0]; |
| ld.param.u64 %rd2, [matrix_exp_f_param_1]; |
| ld.param.u32 %r2, [matrix_exp_f_param_2]; |
| // %r1 = global element index; exit when past the end. |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB81_2; |
| |
| // %rd4 = byte offset of element g (reused for the store); %f1 = x. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| // %f3 = trunc(x * log2(e)) |
| mul.f32 %f2, %f1, 0f3FB8AA3B; |
| cvt.rzi.f32.f32 %f3, %f2; |
| // %f7 = x - %f3*ln(2), with ln(2) applied as a main term plus a small |
| // correction term. |
| mov.f32 %f4, 0fBF317200; |
| fma.rn.f32 %f5, %f3, %f4, %f1; |
| mov.f32 %f6, 0fB5BFBE8E; |
| fma.rn.f32 %f7, %f3, %f6, %f5; |
| // result = 2^(%f7*log2(e)) * 2^%f3, both via the approximate ex2 instruction. |
| mul.f32 %f8, %f7, 0f3FB8AA3B; |
| ex2.approx.ftz.f32 %f9, %f8; |
| add.f32 %f10, %f3, 0f00000000; |
| ex2.approx.f32 %f11, %f10; |
| mul.f32 %f12, %f9, %f11; |
| // Clamp: x < -105 gives 0, x > 105 gives +inf. |
| setp.lt.f32 %p2, %f1, 0fC2D20000; |
| selp.f32 %f13, 0f00000000, %f12, %p2; |
| setp.gt.f32 %p3, %f1, 0f42D20000; |
| selp.f32 %f14, 0f7F800000, %f13, %p3; |
| // output[g] = result |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f32 [%rd7], %f14; |
| |
| BB81_2: |
| ret; |
| } |
| |
| // .globl matrix_sqrt_d |
| // matrix_sqrt_d: element-wise square root (sqrt.rn) of an f64 array. |
| // Thread g = ntid.x*ctaid.x + tid.x computes output[g] from input[g]; threads |
| // with g >= param_2 (unsigned compare) do nothing. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer |
| //   param_2: element count (u32) |
| // The index is widened as unsigned to match the unsigned bounds check, so |
| // indices >= 2^31 no longer produce a negative byte offset. |
| .visible .entry matrix_sqrt_d( |
| .param .u64 matrix_sqrt_d_param_0, |
| .param .u64 matrix_sqrt_d_param_1, |
| .param .u32 matrix_sqrt_d_param_2 |
| ) |
| { |
| .reg .pred %p<2>; |
| .reg .b32 %r<6>; |
| .reg .f64 %fd<3>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [matrix_sqrt_d_param_0]; |
| ld.param.u64 %rd2, [matrix_sqrt_d_param_1]; |
| ld.param.u32 %r2, [matrix_sqrt_d_param_2]; |
| // %r1 = global element index; exit when past the end. |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB82_2; |
| |
| // %rd4 = byte offset of element g, shared by the load and the store. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| sqrt.rn.f64 %fd2, %fd1; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f64 [%rd7], %fd2; |
| |
| BB82_2: |
| ret; |
| } |
| |
| // .globl matrix_sqrt_f |
| // matrix_sqrt_f: element-wise square root (sqrt.rn) of an f32 array. |
| // Thread g = ntid.x*ctaid.x + tid.x computes output[g] from input[g]; threads |
| // with g >= param_2 (unsigned compare) do nothing. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer |
| //   param_2: element count (u32) |
| // The index is widened as unsigned to match the unsigned bounds check, so |
| // indices >= 2^31 no longer produce a negative byte offset. |
| .visible .entry matrix_sqrt_f( |
| .param .u64 matrix_sqrt_f_param_0, |
| .param .u64 matrix_sqrt_f_param_1, |
| .param .u32 matrix_sqrt_f_param_2 |
| ) |
| { |
| .reg .pred %p<2>; |
| .reg .f32 %f<3>; |
| .reg .b32 %r<6>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [matrix_sqrt_f_param_0]; |
| ld.param.u64 %rd2, [matrix_sqrt_f_param_1]; |
| ld.param.u32 %r2, [matrix_sqrt_f_param_2]; |
| // %r1 = global element index; exit when past the end. |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB83_2; |
| |
| // %rd4 = byte offset of element g, shared by the load and the store. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| sqrt.rn.f32 %f2, %f1; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f32 [%rd7], %f2; |
| |
| BB83_2: |
| ret; |
| } |
| |
| // .globl matrix_round_d |
| // matrix_round_d: element-wise rounding of an f64 array to the nearest |
| // integer, halves away from zero. The value passes through a signed 64-bit |
| // integer before being stored back as f64. |
| // Thread g = ntid.x*ctaid.x + tid.x computes output[g] from input[g]; threads |
| // with g >= param_2 (unsigned compare) do nothing. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer |
| //   param_2: element count (u32) |
| // The index is widened as unsigned to match the unsigned bounds check, so |
| // indices >= 2^31 no longer produce a negative byte offset. |
| .visible .entry matrix_round_d( |
| .param .u64 matrix_round_d_param_0, |
| .param .u64 matrix_round_d_param_1, |
| .param .u32 matrix_round_d_param_2 |
| ) |
| { |
| .reg .pred %p<2>; |
| .reg .b32 %r<11>; |
| .reg .f64 %fd<7>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_round_d_param_0]; |
| ld.param.u64 %rd2, [matrix_round_d_param_1]; |
| ld.param.u32 %r2, [matrix_round_d_param_2]; |
| // %r1 = global element index; exit when past the end. |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB84_2; |
| |
| // %rd4 = byte offset of element g (reused for the store); %fd1 = x. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| // %r7 = sign bit of x (taken from the high word). |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r6}, %fd1; |
| } |
| and.b32 %r7, %r6, -2147483648; |
| // %fd3 = 0.5 carrying the sign of x: the sign bit is OR-ed into the high word |
| // of the constant 0.5. |
| mov.f64 %fd2, 0d3FE0000000000000; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r8}, %fd2; |
| } |
| or.b32 %r9, %r8, %r7; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r10, %temp}, %fd2; |
| } |
| mov.b64 %fd3, {%r10, %r9}; |
| // Add the signed half with round-toward-zero, truncate to an integer value, |
| // then convert f64 -> s64 -> f64. |
| add.rz.f64 %fd4, %fd1, %fd3; |
| cvt.rzi.f64.f64 %fd5, %fd4; |
| cvt.rzi.s64.f64 %rd6, %fd5; |
| cvt.rn.f64.s64 %fd6, %rd6; |
| // output[g] = result |
| cvta.to.global.u64 %rd7, %rd2; |
| add.s64 %rd8, %rd7, %rd4; |
| st.global.f64 [%rd8], %fd6; |
| |
| BB84_2: |
| ret; |
| } |
| |
| // .globl matrix_round_f |
| // matrix_round_f: element-wise rounding of an f32 array to the nearest |
| // integer, halves away from zero, done with integer bit manipulation on the |
| // raw f32 encoding; the result passes through a signed 64-bit integer. |
| // Thread g = ntid.x*ctaid.x + tid.x computes output[g] from input[g]; threads |
| // with g >= param_2 (unsigned compare) do nothing. |
| //   param_0: input pointer (converted with cvta.to.global) |
| //   param_1: output pointer |
| //   param_2: element count (u32) |
| // The index is widened as unsigned to match the unsigned bounds check, so |
| // indices >= 2^31 no longer produce a negative byte offset. |
| .visible .entry matrix_round_f( |
| .param .u64 matrix_round_f_param_0, |
| .param .u64 matrix_round_f_param_1, |
| .param .u32 matrix_round_f_param_2 |
| ) |
| { |
| .reg .pred %p<8>; |
| .reg .f32 %f<7>; |
| .reg .b32 %r<17>; |
| .reg .b64 %rd<25>; |
| |
| |
| ld.param.u64 %rd6, [matrix_round_f_param_0]; |
| ld.param.u64 %rd7, [matrix_round_f_param_1]; |
| ld.param.u32 %r5, [matrix_round_f_param_2]; |
| // %r1 = global element index; exit when past the end. |
| mov.u32 %r6, %ctaid.x; |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %tid.x; |
| mad.lo.s32 %r1, %r7, %r6, %r8; |
| setp.ge.u32 %p1, %r1, %r5; |
| @%p1 bra BB85_9; |
| |
| // %rd9 = byte offset of element g (reused for the store); %r2 = raw bits of x. |
| cvta.to.global.u64 %rd8, %rd6; |
| mul.wide.u32 %rd9, %r1, 4; |
| add.s64 %rd10, %rd8, %rd9; |
| ld.global.u32 %r2, [%rd10]; |
| // Drop the sign bit; a value above 0xFF000000 means exponent all ones with a |
| // non-zero mantissa (NaN). That case yields 0fDF000000 (-2^63). |
| shl.b32 %r9, %r2, 1; |
| setp.gt.u32 %p2, %r9, -16777216; |
| mov.f32 %f3, 0fDF000000; |
| @%p2 bra BB85_2; |
| bra.uni BB85_3; |
| |
| BB85_2: |
| mov.f32 %f6, %f3; |
| bra.uni BB85_8; |
| |
| BB85_3: |
| // Positive x with bits above 0x5EFFFFFF (x >= 2^63): result is 2^63. |
| setp.gt.s32 %p3, %r2, 1593835519; |
| mov.f32 %f6, 0f5F000000; |
| @%p3 bra BB85_8; |
| |
| // Negative x with bits above 0xDEFFFFFF (x <= -2^63): result is -2^63. |
| setp.gt.u32 %p4, %r2, -553648129; |
| mov.f32 %f6, %f3; |
| @%p4 bra BB85_8; |
| |
| // In-range value. %r3 = biased exponent, %r4 = 189 - exponent = right shift |
| // needed to bring the significand to integer position. %rd24 holds the |
| // significand (implicit one restored) with its leading one at bit 62. |
| bfe.u32 %r3, %r2, 23, 8; |
| mov.u32 %r10, 189; |
| sub.s32 %r4, %r10, %r3; |
| shl.b32 %r11, %r2, 8; |
| shr.u32 %r12, %r11, 1; |
| or.b32 %r13, %r12, 1073741824; |
| cvt.u64.u32 %rd12, %r13; |
| shl.b64 %rd24, %rd12, 32; |
| // Shift larger than 63: the integer part (%rd23) stays 0. |
| setp.gt.s32 %p5, %r4, 63; |
| mov.u64 %rd23, 0; |
| @%p5 bra BB85_7; |
| |
| // %rd23 = integer part, %rd24 = fraction bits moved to the top of the word. |
| // When the exponent is exactly 189 the shift is 0 and the fraction is 0. |
| setp.eq.s32 %p6, %r3, 189; |
| mov.u32 %r14, 64; |
| sub.s32 %r15, %r14, %r4; |
| shl.b64 %rd13, %rd24, %r15; |
| cvt.u64.u32 %rd14, %r4; |
| selp.b64 %rd15, 0, %rd14, %p6; |
| cvt.u32.u64 %r16, %rd15; |
| shr.u64 %rd23, %rd24, %r16; |
| selp.b64 %rd24, 0, %rd13, %p6; |
| |
| BB85_7: |
| // Add the top fraction bit (the 0.5 position) to round half away from zero, |
| // negate when the sign bit of x is set, and convert the s64 back to f32. |
| shr.u64 %rd16, %rd24, 63; |
| add.s64 %rd17, %rd16, %rd23; |
| neg.s64 %rd18, %rd17; |
| setp.lt.s32 %p7, %r2, 0; |
| selp.b64 %rd19, %rd18, %rd17, %p7; |
| cvt.rn.f32.s64 %f6, %rd19; |
| |
| BB85_8: |
| // output[g] = result |
| cvta.to.global.u64 %rd20, %rd7; |
| add.s64 %rd22, %rd20, %rd9; |
| st.global.f32 [%rd22], %f6; |
| |
| BB85_9: |
| ret; |
| } |
| |
| // .globl matrix_abs_d |
| // matrix_abs_d: one thread per element, dst[i] = abs(src[i]) on f64 values. |
| //   param_0: source address, param_1: destination address, param_2: element count. |
| // NOTE(review): bounds test is unsigned but the index is sign-extended for addressing. |
| .visible .entry matrix_abs_d( |
| .param .u64 matrix_abs_d_param_0, |
| .param .u64 matrix_abs_d_param_1, |
| .param .u32 matrix_abs_d_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .b32 %count, %block_id, %block_dim, %thread_x, %gid; |
| .reg .f64 %value, %result; |
| .reg .b64 %src_arg, %dst_arg, %src_base, %dst_base, %byte_off, %src_addr, %dst_addr; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thread_x, %tid.x; |
| mov.u32 %block_dim, %ntid.x; |
| mov.u32 %block_id, %ctaid.x; |
| mad.lo.s32 %gid, %block_dim, %block_id, %thread_x; |
| |
| // Threads at or past the element count have nothing to do. |
| ld.param.u32 %count, [matrix_abs_d_param_2]; |
| setp.ge.u32 %past_end, %gid, %count; |
| @%past_end ret; |
| |
| // Source and destination share one byte offset: gid * 8. |
| mul.wide.s32 %byte_off, %gid, 8; |
| ld.param.u64 %src_arg, [matrix_abs_d_param_0]; |
| cvta.to.global.u64 %src_base, %src_arg; |
| add.s64 %src_addr, %src_base, %byte_off; |
| ld.param.u64 %dst_arg, [matrix_abs_d_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_arg; |
| add.s64 %dst_addr, %dst_base, %byte_off; |
| |
| ld.global.f64 %value, [%src_addr]; |
| abs.f64 %result, %value; |
| st.global.f64 [%dst_addr], %result; |
| ret; |
| } |
| |
| // .globl matrix_abs_f |
| // matrix_abs_f: one thread per element, dst[i] = abs(src[i]) on f32 values. |
| //   param_0: source address, param_1: destination address, param_2: element count. |
| // NOTE(review): bounds test is unsigned but the index is sign-extended for addressing. |
| .visible .entry matrix_abs_f( |
| .param .u64 matrix_abs_f_param_0, |
| .param .u64 matrix_abs_f_param_1, |
| .param .u32 matrix_abs_f_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .f32 %value, %result; |
| .reg .b32 %count, %block_id, %block_dim, %thread_x, %gid; |
| .reg .b64 %src_arg, %dst_arg, %src_base, %dst_base, %byte_off, %src_addr, %dst_addr; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thread_x, %tid.x; |
| mov.u32 %block_dim, %ntid.x; |
| mov.u32 %block_id, %ctaid.x; |
| mad.lo.s32 %gid, %block_dim, %block_id, %thread_x; |
| |
| // Threads at or past the element count have nothing to do. |
| ld.param.u32 %count, [matrix_abs_f_param_2]; |
| setp.ge.u32 %past_end, %gid, %count; |
| @%past_end ret; |
| |
| // Source and destination share one byte offset: gid * 4. |
| mul.wide.s32 %byte_off, %gid, 4; |
| ld.param.u64 %src_arg, [matrix_abs_f_param_0]; |
| cvta.to.global.u64 %src_base, %src_arg; |
| add.s64 %src_addr, %src_base, %byte_off; |
| ld.param.u64 %dst_arg, [matrix_abs_f_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_arg; |
| add.s64 %dst_addr, %dst_base, %byte_off; |
| |
| ld.global.f32 %value, [%src_addr]; |
| abs.f32 %result, %value; |
| st.global.f32 [%dst_addr], %result; |
| ret; |
| } |
| |
| // .globl matrix_log_d |
| // matrix_log_d: one thread per element; stores the natural logarithm of each f64 input. |
| //   param_0: source address, param_1: destination address (both converted to global), |
| //   param_2: element count; threads with index >= count return immediately. |
| // The input is split into exponent e and mantissa m (m kept below roughly sqrt(2)), |
| // log(m) is evaluated from u = 2*(m-1)/(m+1) with a polynomial in u*u, and e*ln(2) |
| // is added using a two-part ln(2) constant. Inputs that are zero, negative, infinite |
| // or NaN take the special path at BB88_4. |
| // NOTE(review): the bounds test is unsigned (setp.ge.u32) while the index is widened |
| // as signed (mul.wide.s32); an index >= 2^31 would give a negative byte offset. |
| .visible .entry matrix_log_d( |
| .param .u64 matrix_log_d_param_0, |
| .param .u64 matrix_log_d_param_1, |
| .param .u32 matrix_log_d_param_2 |
| ) |
| { |
| .reg .pred %p<6>; |
| .reg .f32 %f<2>; |
| .reg .b32 %r<33>; |
| .reg .f64 %fd<59>; |
| .reg .b64 %rd<9>; |
| |
| |
| // %r1 = ntid.x * ctaid.x + tid.x (element index); exit when index >= count. |
| ld.param.u64 %rd1, [matrix_log_d_param_0]; |
| ld.param.u64 %rd2, [matrix_log_d_param_1]; |
| ld.param.u32 %r12, [matrix_log_d_param_2]; |
| mov.u32 %r13, %ctaid.x; |
| mov.u32 %r14, %ntid.x; |
| mov.u32 %r15, %tid.x; |
| mad.lo.s32 %r1, %r14, %r13, %r15; |
| setp.ge.u32 %p1, %r1, %r12; |
| @%p1 bra BB88_9; |
| |
| // Load x; %r29 = high 32 bits, %r30 = low 32 bits, %r31 = exponent bias (-1023). |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd56, [%rd5]; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r29}, %fd56; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r30, %temp}, %fd56; |
| } |
| mov.u32 %r31, -1023; |
| // High word > 0x000FFFFF (signed): exponent field is non-zero and sign is clear, |
| // so no pre-scaling is needed. |
| setp.gt.s32 %p2, %r29, 1048575; |
| @%p2 bra BB88_3; |
| |
| // Otherwise scale by 2^54 (0d4350000000000000), re-split, and fold the 54 into |
| // the bias (-1023 - 54 = -1077). |
| mul.f64 %fd56, %fd56, 0d4350000000000000; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r29}, %fd56; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r30, %temp}, %fd56; |
| } |
| mov.u32 %r31, -1077; |
| |
| BB88_3: |
| // Main path only when 1 <= high word <= 0x7FEFFFFF (unsigned test on hi - 1); |
| // everything else goes to the special-case block BB88_4. |
| add.s32 %r18, %r29, -1; |
| setp.lt.u32 %p3, %r18, 2146435071; |
| @%p3 bra BB88_5; |
| bra.uni BB88_4; |
| |
| BB88_5: |
| // %r32 = unbiased exponent; %fd57 = mantissa with the exponent field replaced |
| // by 0x3FF (mask 0x800FFFFF, then OR 0x3FF00000), i.e. a value in [1, 2). |
| shr.u32 %r20, %r29, 20; |
| add.s32 %r32, %r31, %r20; |
| and.b32 %r21, %r29, -2146435073; |
| or.b32 %r22, %r21, 1072693248; |
| mov.b64 %fd57, {%r30, %r22}; |
| // If the high word is >= 0x3FF6A09F, halve the mantissa (subtract 0x00100000 |
| // from the high word) and bump the exponent by one. |
| setp.lt.s32 %p5, %r22, 1073127583; |
| @%p5 bra BB88_7; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r23, %temp}, %fd57; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r24}, %fd57; |
| } |
| add.s32 %r25, %r24, -1048576; |
| mov.b64 %fd57, {%r23, %r25}; |
| add.s32 %r32, %r32, 1; |
| |
| BB88_7: |
| // %fd18 = refined reciprocal of (m + 1): approximate rcp followed by FMA-based |
| // correction steps. |
| add.f64 %fd12, %fd57, 0d3FF0000000000000; |
| rcp.approx.ftz.f64 %fd13, %fd12; |
| neg.f64 %fd14, %fd12; |
| mov.f64 %fd15, 0d3FF0000000000000; |
| fma.rn.f64 %fd16, %fd14, %fd13, %fd15; |
| fma.rn.f64 %fd17, %fd16, %fd16, %fd16; |
| fma.rn.f64 %fd18, %fd17, %fd13, %fd13; |
| // %fd19 = m - 1; %fd21 = u = 2 * (m - 1) / (m + 1); %fd22 = u * u. |
| add.f64 %fd19, %fd57, 0dBFF0000000000000; |
| mul.f64 %fd20, %fd19, %fd18; |
| fma.rn.f64 %fd21, %fd19, %fd18, %fd20; |
| mul.f64 %fd22, %fd21, %fd21; |
| // Horner evaluation of the polynomial in u*u; result in %fd37. |
| mov.f64 %fd23, 0d3ED0EE258B7A8B04; |
| mov.f64 %fd24, 0d3EB1380B3AE80F1E; |
| fma.rn.f64 %fd25, %fd24, %fd22, %fd23; |
| mov.f64 %fd26, 0d3EF3B2669F02676F; |
| fma.rn.f64 %fd27, %fd25, %fd22, %fd26; |
| mov.f64 %fd28, 0d3F1745CBA9AB0956; |
| fma.rn.f64 %fd29, %fd27, %fd22, %fd28; |
| mov.f64 %fd30, 0d3F3C71C72D1B5154; |
| fma.rn.f64 %fd31, %fd29, %fd22, %fd30; |
| mov.f64 %fd32, 0d3F624924923BE72D; |
| fma.rn.f64 %fd33, %fd31, %fd22, %fd32; |
| mov.f64 %fd34, 0d3F8999999999A3C4; |
| fma.rn.f64 %fd35, %fd33, %fd22, %fd34; |
| mov.f64 %fd36, 0d3FB5555555555554; |
| fma.rn.f64 %fd37, %fd35, %fd22, %fd36; |
| // %fd42 = (2*((m-1) - u) - u*(m-1)) / (m+1): low-order correction for the |
| // rounding of u. %fd44 = u^3 * poly + correction. |
| sub.f64 %fd38, %fd19, %fd21; |
| add.f64 %fd39, %fd38, %fd38; |
| neg.f64 %fd40, %fd21; |
| fma.rn.f64 %fd41, %fd40, %fd19, %fd39; |
| mul.f64 %fd42, %fd18, %fd41; |
| mul.f64 %fd43, %fd22, %fd37; |
| fma.rn.f64 %fd44, %fd43, %fd21, %fd42; |
| // Convert the integer exponent to f64 without cvt: build two doubles with high |
| // word 0x43300000 and low words (e ^ 0x80000000) and 0x80000000, then subtract. |
| xor.b32 %r26, %r32, -2147483648; |
| mov.u32 %r27, -2147483648; |
| mov.u32 %r28, 1127219200; |
| mov.b64 %fd45, {%r26, %r28}; |
| mov.b64 %fd46, {%r27, %r28}; |
| sub.f64 %fd47, %fd45, %fd46; |
| // %fd49 = e * ln2_hi + u; the following steps recover what that FMA rounded |
| // away and add e * ln2_lo (0d3C7ABC9E3B39803F) before the final sum. |
| mov.f64 %fd48, 0d3FE62E42FEFA39EF; |
| fma.rn.f64 %fd49, %fd47, %fd48, %fd21; |
| neg.f64 %fd50, %fd47; |
| fma.rn.f64 %fd51, %fd50, %fd48, %fd49; |
| sub.f64 %fd52, %fd51, %fd21; |
| sub.f64 %fd53, %fd44, %fd52; |
| mov.f64 %fd54, 0d3C7ABC9E3B39803F; |
| fma.rn.f64 %fd55, %fd47, %fd54, %fd53; |
| add.f64 %fd58, %fd49, %fd55; |
| bra.uni BB88_8; |
| |
| BB88_4: |
| // Special inputs: %fd11 = x * (+inf) + (+inf). If the high word of x, viewed as |
| // an f32, compares equal to zero, the result is -inf (0dFFF0000000000000); |
| // otherwise the result is %fd11. |
| mov.f64 %fd10, 0d7FF0000000000000; |
| fma.rn.f64 %fd11, %fd56, %fd10, %fd10; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r19}, %fd56; |
| } |
| mov.b32 %f1, %r19; |
| setp.eq.f32 %p4, %f1, 0f00000000; |
| selp.f64 %fd58, 0dFFF0000000000000, %fd11, %p4; |
| |
| BB88_8: |
| // Store the result at the same byte offset in the destination array. |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd58; |
| |
| BB88_9: |
| ret; |
| } |
| |
| // .globl matrix_log_f |
| // matrix_log_f: one thread per element; stores the natural logarithm of each f32 input. |
| //   param_0: source address, param_1: destination address (both converted to global), |
| //   param_2: element count; threads with index >= count return immediately. |
| // The input is split into an exponent e and a mantissa near 1, a polynomial in |
| // (mantissa - 1) is evaluated, and e * ln(2) is added. Inputs whose bit pattern is |
| // >= 0x7F800000 (unsigned) are replaced by x * inf + inf, and a zero input gives -inf. |
| // NOTE(review): the bounds test is unsigned (setp.ge.u32) while the index is widened |
| // as signed (mul.wide.s32); an index >= 2^31 would give a negative byte offset. |
| .visible .entry matrix_log_f( |
| .param .u64 matrix_log_f_param_0, |
| .param .u64 matrix_log_f_param_1, |
| .param .u32 matrix_log_f_param_2 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<36>; |
| .reg .b32 %r<10>; |
| .reg .b64 %rd<9>; |
| |
| |
| // %r1 = ntid.x * ctaid.x + tid.x (element index); exit when index >= count. |
| ld.param.u64 %rd1, [matrix_log_f_param_0]; |
| ld.param.u64 %rd2, [matrix_log_f_param_1]; |
| ld.param.u32 %r2, [matrix_log_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB89_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f5, [%rd5]; |
| // Inputs below 0f00800000 (2^-126) are scaled by 0f4B000000 (2^23) and the |
| // exponent accumulator starts at 0fC1B80000 (-23.0) to compensate. |
| setp.lt.f32 %p2, %f5, 0f00800000; |
| mul.f32 %f6, %f5, 0f4B000000; |
| selp.f32 %f1, %f6, %f5, %p2; |
| selp.f32 %f7, 0fC1B80000, 0f00000000, %p2; |
| // Integer split: %r8 = (bits - 0x3F2AAAAB) with the low 23 bits cleared is the |
| // exponent part (still shifted left by 23); %r9 = bits - %r8 is the mantissa. |
| mov.b32 %r6, %f1; |
| add.s32 %r7, %r6, -1059760811; |
| and.b32 %r8, %r7, -8388608; |
| sub.s32 %r9, %r6, %r8; |
| mov.b32 %f8, %r9; |
| // %f11 = e as a float: (float)%r8 * 2^-23 (0f34000000) plus the scaling offset. |
| cvt.rn.f32.s32 %f9, %r8; |
| mov.f32 %f10, 0f34000000; |
| fma.rn.f32 %f11, %f9, %f10, %f7; |
| // %f12 = mantissa - 1; Horner evaluation of the polynomial in %f12. |
| add.f32 %f12, %f8, 0fBF800000; |
| mov.f32 %f13, 0f3E1039F6; |
| mov.f32 %f14, 0fBE055027; |
| fma.rn.f32 %f15, %f14, %f12, %f13; |
| mov.f32 %f16, 0fBDF8CDCC; |
| fma.rn.f32 %f17, %f15, %f12, %f16; |
| mov.f32 %f18, 0f3E0F2955; |
| fma.rn.f32 %f19, %f17, %f12, %f18; |
| mov.f32 %f20, 0fBE2AD8B9; |
| fma.rn.f32 %f21, %f19, %f12, %f20; |
| mov.f32 %f22, 0f3E4CED0B; |
| fma.rn.f32 %f23, %f21, %f12, %f22; |
| mov.f32 %f24, 0fBE7FFF22; |
| fma.rn.f32 %f25, %f23, %f12, %f24; |
| mov.f32 %f26, 0f3EAAAA78; |
| fma.rn.f32 %f27, %f25, %f12, %f26; |
| mov.f32 %f28, 0fBF000000; |
| fma.rn.f32 %f29, %f27, %f12, %f28; |
| mul.f32 %f30, %f12, %f29; |
| fma.rn.f32 %f31, %f30, %f12, %f12; |
| // %f35 = e * ln(2) (0f3F317218) + polynomial result. |
| mov.f32 %f32, 0f3F317218; |
| fma.rn.f32 %f35, %f11, %f32, %f31; |
| // Keep %f35 only when the bit pattern is below 0x7F800000 (unsigned). |
| setp.lt.u32 %p3, %r6, 2139095040; |
| @%p3 bra BB89_3; |
| |
| // Otherwise the result becomes x * (+inf) + (+inf). |
| mov.f32 %f33, 0f7F800000; |
| fma.rn.f32 %f35, %f1, %f33, %f33; |
| |
| BB89_3: |
| // A (possibly scaled) input equal to zero overrides the result with -inf. |
| cvta.to.global.u64 %rd6, %rd2; |
| setp.eq.f32 %p4, %f1, 0f00000000; |
| selp.f32 %f34, 0fFF800000, %f35, %p4; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f32 [%rd8], %f34; |
| |
| BB89_4: |
| ret; |
| } |
| |
| // .globl matrix_floor_d |
| // matrix_floor_d: one thread per element; dst[i] = src[i] rounded to an integral |
| // f64 value toward negative infinity (cvt.rmi). |
| //   param_0: source address, param_1: destination address, param_2: element count. |
| // NOTE(review): bounds test is unsigned but the index is sign-extended for addressing. |
| .visible .entry matrix_floor_d( |
| .param .u64 matrix_floor_d_param_0, |
| .param .u64 matrix_floor_d_param_1, |
| .param .u32 matrix_floor_d_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .b32 %count, %block_id, %block_dim, %thread_x, %gid; |
| .reg .f64 %value, %result; |
| .reg .b64 %src_arg, %dst_arg, %src_base, %dst_base, %byte_off, %src_addr, %dst_addr; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thread_x, %tid.x; |
| mov.u32 %block_dim, %ntid.x; |
| mov.u32 %block_id, %ctaid.x; |
| mad.lo.s32 %gid, %block_dim, %block_id, %thread_x; |
| |
| // Threads at or past the element count have nothing to do. |
| ld.param.u32 %count, [matrix_floor_d_param_2]; |
| setp.ge.u32 %past_end, %gid, %count; |
| @%past_end ret; |
| |
| // Source and destination share one byte offset: gid * 8. |
| mul.wide.s32 %byte_off, %gid, 8; |
| ld.param.u64 %src_arg, [matrix_floor_d_param_0]; |
| cvta.to.global.u64 %src_base, %src_arg; |
| add.s64 %src_addr, %src_base, %byte_off; |
| ld.param.u64 %dst_arg, [matrix_floor_d_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_arg; |
| add.s64 %dst_addr, %dst_base, %byte_off; |
| |
| ld.global.f64 %value, [%src_addr]; |
| cvt.rmi.f64.f64 %result, %value; |
| st.global.f64 [%dst_addr], %result; |
| ret; |
| } |
| |
| // .globl matrix_floor_f |
| // matrix_floor_f: one thread per element; dst[i] = src[i] rounded to an integral |
| // f32 value toward negative infinity (cvt.rmi). |
| //   param_0: source address, param_1: destination address, param_2: element count. |
| // NOTE(review): bounds test is unsigned but the index is sign-extended for addressing. |
| .visible .entry matrix_floor_f( |
| .param .u64 matrix_floor_f_param_0, |
| .param .u64 matrix_floor_f_param_1, |
| .param .u32 matrix_floor_f_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .f32 %value, %result; |
| .reg .b32 %count, %block_id, %block_dim, %thread_x, %gid; |
| .reg .b64 %src_arg, %dst_arg, %src_base, %dst_base, %byte_off, %src_addr, %dst_addr; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thread_x, %tid.x; |
| mov.u32 %block_dim, %ntid.x; |
| mov.u32 %block_id, %ctaid.x; |
| mad.lo.s32 %gid, %block_dim, %block_id, %thread_x; |
| |
| // Threads at or past the element count have nothing to do. |
| ld.param.u32 %count, [matrix_floor_f_param_2]; |
| setp.ge.u32 %past_end, %gid, %count; |
| @%past_end ret; |
| |
| // Source and destination share one byte offset: gid * 4. |
| mul.wide.s32 %byte_off, %gid, 4; |
| ld.param.u64 %src_arg, [matrix_floor_f_param_0]; |
| cvta.to.global.u64 %src_base, %src_arg; |
| add.s64 %src_addr, %src_base, %byte_off; |
| ld.param.u64 %dst_arg, [matrix_floor_f_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_arg; |
| add.s64 %dst_addr, %dst_base, %byte_off; |
| |
| ld.global.f32 %value, [%src_addr]; |
| cvt.rmi.f32.f32 %result, %value; |
| st.global.f32 [%dst_addr], %result; |
| ret; |
| } |
| |
| // .globl matrix_ceil_d |
| // matrix_ceil_d: one thread per element; dst[i] = src[i] rounded to an integral |
| // f64 value toward positive infinity (cvt.rpi). |
| //   param_0: source address, param_1: destination address, param_2: element count. |
| // NOTE(review): bounds test is unsigned but the index is sign-extended for addressing. |
| .visible .entry matrix_ceil_d( |
| .param .u64 matrix_ceil_d_param_0, |
| .param .u64 matrix_ceil_d_param_1, |
| .param .u32 matrix_ceil_d_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .b32 %count, %block_id, %block_dim, %thread_x, %gid; |
| .reg .f64 %value, %result; |
| .reg .b64 %src_arg, %dst_arg, %src_base, %dst_base, %byte_off, %src_addr, %dst_addr; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thread_x, %tid.x; |
| mov.u32 %block_dim, %ntid.x; |
| mov.u32 %block_id, %ctaid.x; |
| mad.lo.s32 %gid, %block_dim, %block_id, %thread_x; |
| |
| // Threads at or past the element count have nothing to do. |
| ld.param.u32 %count, [matrix_ceil_d_param_2]; |
| setp.ge.u32 %past_end, %gid, %count; |
| @%past_end ret; |
| |
| // Source and destination share one byte offset: gid * 8. |
| mul.wide.s32 %byte_off, %gid, 8; |
| ld.param.u64 %src_arg, [matrix_ceil_d_param_0]; |
| cvta.to.global.u64 %src_base, %src_arg; |
| add.s64 %src_addr, %src_base, %byte_off; |
| ld.param.u64 %dst_arg, [matrix_ceil_d_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_arg; |
| add.s64 %dst_addr, %dst_base, %byte_off; |
| |
| ld.global.f64 %value, [%src_addr]; |
| cvt.rpi.f64.f64 %result, %value; |
| st.global.f64 [%dst_addr], %result; |
| ret; |
| } |
| |
| // .globl matrix_ceil_f |
| // matrix_ceil_f: one thread per element; dst[i] = src[i] rounded to an integral |
| // f32 value toward positive infinity (cvt.rpi). |
| //   param_0: source address, param_1: destination address, param_2: element count. |
| // NOTE(review): bounds test is unsigned but the index is sign-extended for addressing. |
| .visible .entry matrix_ceil_f( |
| .param .u64 matrix_ceil_f_param_0, |
| .param .u64 matrix_ceil_f_param_1, |
| .param .u32 matrix_ceil_f_param_2 |
| ) |
| { |
| .reg .pred %past_end; |
| .reg .f32 %value, %result; |
| .reg .b32 %count, %block_id, %block_dim, %thread_x, %gid; |
| .reg .b64 %src_arg, %dst_arg, %src_base, %dst_base, %byte_off, %src_addr, %dst_addr; |
| |
| // gid = ntid.x * ctaid.x + tid.x |
| mov.u32 %thread_x, %tid.x; |
| mov.u32 %block_dim, %ntid.x; |
| mov.u32 %block_id, %ctaid.x; |
| mad.lo.s32 %gid, %block_dim, %block_id, %thread_x; |
| |
| // Threads at or past the element count have nothing to do. |
| ld.param.u32 %count, [matrix_ceil_f_param_2]; |
| setp.ge.u32 %past_end, %gid, %count; |
| @%past_end ret; |
| |
| // Source and destination share one byte offset: gid * 4. |
| mul.wide.s32 %byte_off, %gid, 4; |
| ld.param.u64 %src_arg, [matrix_ceil_f_param_0]; |
| cvta.to.global.u64 %src_base, %src_arg; |
| add.s64 %src_addr, %src_base, %byte_off; |
| ld.param.u64 %dst_arg, [matrix_ceil_f_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_arg; |
| add.s64 %dst_addr, %dst_base, %byte_off; |
| |
| ld.global.f32 %value, [%src_addr]; |
| cvt.rpi.f32.f32 %result, %value; |
| st.global.f32 [%dst_addr], %result; |
| ret; |
| } |
| |
| // .globl matrix_sin_d |
| // matrix_sin_d: one thread per element; stores the sine of each f64 input. |
| //   param_0: source address, param_1: destination address (both converted to global), |
| //   param_2: element count; threads with index >= count return immediately. |
| // Steps: reduce x by multiples of pi/2 (fast three-constant path, or a call to |
| // __internal_trig_reduction_slowpathd when the masked exponent is large), then use |
| // the low two bits of the quadrant to pick the polynomial form and the sign. |
| // A 4-byte local slot holds the quadrant so the slow-path helper can overwrite it. |
| // NOTE(review): the bounds test is unsigned (setp.ge.u32) while the index is widened |
| // as signed (mul.wide.s32); an index >= 2^31 would give a negative byte offset. |
| .visible .entry matrix_sin_d( |
| .param .u64 matrix_sin_d_param_0, |
| .param .u64 matrix_sin_d_param_1, |
| .param .u32 matrix_sin_d_param_2 |
| ) |
| { |
| .local .align 4 .b8 __local_depot94[4]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<7>; |
| .reg .b32 %r<19>; |
| .reg .f64 %fd<42>; |
| .reg .b64 %rd<15>; |
| |
| |
| // %rd4 = generic address of the local slot (passed to the helper), |
| // %rd1 = local-space address of the same slot (used by ld/st.local). |
| mov.u64 %SPL, __local_depot94; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd2, [matrix_sin_d_param_0]; |
| ld.param.u64 %rd3, [matrix_sin_d_param_1]; |
| ld.param.u32 %r5, [matrix_sin_d_param_2]; |
| add.u64 %rd4, %SP, 0; |
| add.u64 %rd1, %SPL, 0; |
| // %r1 = ntid.x * ctaid.x + tid.x (element index); exit when index >= count. |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %ctaid.x; |
| mov.u32 %r8, %tid.x; |
| mad.lo.s32 %r1, %r6, %r7, %r8; |
| setp.ge.u32 %p1, %r1, %r5; |
| @%p1 bra BB94_11; |
| |
| cvta.to.global.u64 %rd5, %rd2; |
| mul.wide.s32 %rd6, %r1, 8; |
| add.s64 %rd7, %rd5, %rd6; |
| ld.global.f64 %fd38, [%rd7]; |
| // Infinity test: high word with the sign cleared equals 0x7FF00000 ... |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r9}, %fd38; |
| } |
| and.b32 %r10, %r9, 2147483647; |
| setp.ne.s32 %p2, %r10, 2146435072; |
| @%p2 bra BB94_4; |
| |
| // ... and the low word is zero. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r11, %temp}, %fd38; |
| } |
| setp.ne.s32 %p3, %r11, 0; |
| @%p3 bra BB94_4; |
| |
| // x is +/-infinity: replace it by x * 0.0. |
| mov.f64 %fd14, 0d0000000000000000; |
| mul.rn.f64 %fd38, %fd38, %fd14; |
| |
| BB94_4: |
| // Fast reduction: q = round-to-nearest-int(x * 2/pi), saved to the local slot; |
| // r = x - q * pi/2 with pi/2 split across three constants. |
| mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; |
| cvt.rni.s32.f64 %r18, %fd15; |
| st.local.u32 [%rd1], %r18; |
| cvt.rn.f64.s32 %fd16, %r18; |
| neg.f64 %fd17, %fd16; |
| mov.f64 %fd18, 0d3FF921FB54442D18; |
| fma.rn.f64 %fd19, %fd17, %fd18, %fd38; |
| mov.f64 %fd20, 0d3C91A62633145C00; |
| fma.rn.f64 %fd21, %fd17, %fd20, %fd19; |
| mov.f64 %fd22, 0d397B839A252049C0; |
| fma.rn.f64 %fd39, %fd17, %fd22, %fd21; |
| // Keep the fast result when (high word & 0x7FE00000) < 0x41E00000. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r12}, %fd38; |
| } |
| and.b32 %r13, %r12, 2145386496; |
| setp.lt.u32 %p4, %r13, 1105199104; |
| @%p4 bra BB94_6; |
| |
| // Slow path: helper(x, &slot) returns the reduced argument; the quadrant is |
| // then re-read from the local slot. |
| // Callseq Start 3 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.f64 [param0+0], %fd38; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd4; |
| .param .b64 retval0; |
| call.uni (retval0), |
| __internal_trig_reduction_slowpathd, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.f64 %fd39, [retval0+0]; |
| |
| //{ |
| }// Callseq End 3 |
| ld.local.u32 %r18, [%rd1]; |
| |
| BB94_6: |
| // Bit 0 of the quadrant selects the coefficient set: the leading coefficient |
| // comes from an immediate, the remaining six from __cudart_sin_cos_coeffs |
| // starting at entry 1 (bit clear) or entry 9 (bit set). |
| and.b32 %r14, %r18, 1; |
| shl.b32 %r15, %r14, 3; |
| setp.eq.s32 %p5, %r14, 0; |
| selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; |
| add.s32 %r16, %r15, 1; |
| mul.wide.s32 %rd9, %r16, 8; |
| mov.u64 %rd10, __cudart_sin_cos_coeffs; |
| add.s64 %rd11, %rd10, %rd9; |
| // Horner evaluation in r*r (%fd7); polynomial value ends up in %fd8. |
| ld.const.f64 %fd24, [%rd11]; |
| mul.rn.f64 %fd7, %fd39, %fd39; |
| fma.rn.f64 %fd25, %fd23, %fd7, %fd24; |
| ld.const.f64 %fd26, [%rd11+8]; |
| fma.rn.f64 %fd27, %fd25, %fd7, %fd26; |
| ld.const.f64 %fd28, [%rd11+16]; |
| fma.rn.f64 %fd29, %fd27, %fd7, %fd28; |
| ld.const.f64 %fd30, [%rd11+24]; |
| fma.rn.f64 %fd31, %fd29, %fd7, %fd30; |
| ld.const.f64 %fd32, [%rd11+32]; |
| fma.rn.f64 %fd33, %fd31, %fd7, %fd32; |
| ld.const.f64 %fd34, [%rd11+40]; |
| fma.rn.f64 %fd8, %fd33, %fd7, %fd34; |
| // Bit 0 clear: result = r + r * poly. |
| fma.rn.f64 %fd40, %fd8, %fd39, %fd39; |
| @%p5 bra BB94_8; |
| |
| // Bit 0 set: result = 1 + r*r * poly. |
| mov.f64 %fd35, 0d3FF0000000000000; |
| fma.rn.f64 %fd40, %fd8, %fd7, %fd35; |
| |
| BB94_8: |
| // Bit 1 of the quadrant set: negate the result (computed as result * -1 + 0). |
| and.b32 %r17, %r18, 2; |
| setp.eq.s32 %p6, %r17, 0; |
| @%p6 bra BB94_10; |
| |
| mov.f64 %fd36, 0d0000000000000000; |
| mov.f64 %fd37, 0dBFF0000000000000; |
| fma.rn.f64 %fd40, %fd40, %fd37, %fd36; |
| |
| BB94_10: |
| // Store the result at the same byte offset in the destination array. |
| cvta.to.global.u64 %rd12, %rd3; |
| add.s64 %rd14, %rd12, %rd6; |
| st.global.f64 [%rd14], %fd40; |
| |
| BB94_11: |
| ret; |
| } |
| |
| // .globl matrix_sin_f |
| // matrix_sin_f: one thread per element; stores the sine of each f32 input. |
| //   param_0: source address, param_1: destination address (both converted to global), |
| //   param_2: element count; threads with index >= count return immediately. |
| // Steps: fast reduction by multiples of pi/2 for |x| <= 0f47CE4780 (105615.0); |
| // otherwise an inline multi-word reduction that multiplies the mantissa by the six |
| // 32-bit words of __cudart_i2opi_f, using a 28-byte local scratch array (7 words). |
| // The low two bits of the resulting quadrant select the polynomial and the sign. |
| // NOTE(review): the bounds test is unsigned (setp.ge.u32) while the index is widened |
| // as signed (mul.wide.s32); an index >= 2^31 would give a negative byte offset. |
| .visible .entry matrix_sin_f( |
| .param .u64 matrix_sin_f_param_0, |
| .param .u64 matrix_sin_f_param_1, |
| .param .u32 matrix_sin_f_param_2 |
| ) |
| { |
| .local .align 4 .b8 __local_depot95[28]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<13>; |
| .reg .f32 %f<38>; |
| .reg .b32 %r<69>; |
| .reg .f64 %fd<3>; |
| .reg .b64 %rd<24>; |
| |
| |
| // %r1 = ntid.x * ctaid.x + tid.x (element index); exit when index >= count. |
| mov.u64 %SPL, __local_depot95; |
| ld.param.u64 %rd7, [matrix_sin_f_param_0]; |
| ld.param.u64 %rd8, [matrix_sin_f_param_1]; |
| ld.param.u32 %r29, [matrix_sin_f_param_2]; |
| mov.u32 %r30, %ntid.x; |
| mov.u32 %r31, %ctaid.x; |
| mov.u32 %r32, %tid.x; |
| mad.lo.s32 %r1, %r30, %r31, %r32; |
| setp.ge.u32 %p1, %r1, %r29; |
| @%p1 bra BB95_17; |
| |
| // %rd1 = local-space base of the scratch array. |
| cvta.to.global.u64 %rd9, %rd7; |
| mul.wide.s32 %rd10, %r1, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| add.u64 %rd1, %SPL, 0; |
| ld.global.f32 %f1, [%rd11]; |
| // Fast reduction: q = round-to-nearest-int(x * 2/pi); r = x - q * pi/2 with |
| // pi/2 split across three constants (stored negated). |
| mul.f32 %f15, %f1, 0f3F22F983; |
| cvt.rni.s32.f32 %r68, %f15; |
| cvt.rn.f32.s32 %f16, %r68; |
| mov.f32 %f17, 0fBFC90FDA; |
| fma.rn.f32 %f18, %f16, %f17, %f1; |
| mov.f32 %f19, 0fB3A22168; |
| fma.rn.f32 %f20, %f16, %f19, %f18; |
| mov.f32 %f21, 0fA7C234C5; |
| fma.rn.f32 %f35, %f16, %f21, %f20; |
| // Use the fast result when |x| <= 0f47CE4780 or the compare is unordered (NaN). |
| abs.f32 %f3, %f1; |
| setp.leu.f32 %p2, %f3, 0f47CE4780; |
| @%p2 bra BB95_12; |
| |
| setp.eq.f32 %p3, %f3, 0f7F800000; |
| @%p3 bra BB95_11; |
| bra.uni BB95_3; |
| |
| // |x| is infinite: the reduced argument becomes x * 0.0. |
| BB95_11: |
| mov.f32 %f24, 0f00000000; |
| mul.rn.f32 %f35, %f1, %f24; |
| bra.uni BB95_12; |
| |
| BB95_3: |
| // Large finite |x|: %r4 = mantissa shifted left by 8 with bit 31 forced on. |
| // %r62 = running carry, %r61 = loop counter from -6 up to 0, |
| // %rd22 walks __cudart_i2opi_f, %rd23 walks the local scratch array. |
| mov.b32 %r3, %f1; |
| shl.b32 %r35, %r3, 8; |
| or.b32 %r4, %r35, -2147483648; |
| mov.u32 %r62, 0; |
| mov.u64 %rd22, __cudart_i2opi_f; |
| mov.u32 %r61, -6; |
| mov.u64 %rd23, %rd1; |
| |
| BB95_4: |
| // Six iterations: low word of (table word * mantissa + carry) is stored, |
| // the high word plus carry-out becomes the next carry. |
| .pragma "nounroll"; |
| ld.const.u32 %r38, [%rd22]; |
| // inline asm |
| { |
| mad.lo.cc.u32 %r36, %r38, %r4, %r62; |
| madc.hi.u32 %r62, %r38, %r4, 0; |
| } |
| // inline asm |
| st.local.u32 [%rd23], %r36; |
| add.s64 %rd23, %rd23, 4; |
| add.s64 %rd22, %rd22, 4; |
| add.s32 %r61, %r61, 1; |
| setp.ne.s32 %p4, %r61, 0; |
| @%p4 bra BB95_4; |
| |
| // Final carry goes into word 6. The exponent picks which product words to use: |
| // word index = 6 - ((exponent - 128) >> 5), bit shift %r10 = exponent & 31. |
| // %r9 keeps the sign bit of x. |
| bfe.u32 %r41, %r3, 23, 8; |
| add.s32 %r42, %r41, -128; |
| shr.u32 %r43, %r42, 5; |
| and.b32 %r9, %r3, -2147483648; |
| st.local.u32 [%rd1+24], %r62; |
| bfe.u32 %r10, %r3, 23, 5; |
| mov.u32 %r44, 6; |
| sub.s32 %r45, %r44, %r43; |
| mul.wide.s32 %rd14, %r45, 4; |
| add.s64 %rd6, %rd1, %rd14; |
| ld.local.u32 %r64, [%rd6]; |
| ld.local.u32 %r63, [%rd6+-4]; |
| setp.eq.s32 %p5, %r10, 0; |
| @%p5 bra BB95_7; |
| |
| // Non-zero bit shift: funnel-shift the selected words left by %r10, pulling in |
| // bits from the next lower word. |
| mov.u32 %r46, 32; |
| sub.s32 %r47, %r46, %r10; |
| shr.u32 %r48, %r63, %r47; |
| shl.b32 %r49, %r64, %r10; |
| add.s32 %r64, %r48, %r49; |
| ld.local.u32 %r50, [%rd6+-8]; |
| shr.u32 %r51, %r50, %r47; |
| shl.b32 %r52, %r63, %r10; |
| add.s32 %r63, %r51, %r52; |
| |
| BB95_7: |
| // Top two bits of %r64 are the integer quadrant; the rest forms a 64-bit |
| // fraction {%r66, %r18}. %r19 = quadrant + top fraction bit (round to nearest). |
| shr.u32 %r53, %r63, 30; |
| shl.b32 %r54, %r64, 2; |
| add.s32 %r66, %r54, %r53; |
| shl.b32 %r18, %r63, 2; |
| shr.u32 %r55, %r66, 31; |
| shr.u32 %r56, %r64, 30; |
| add.s32 %r19, %r55, %r56; |
| setp.eq.s32 %p6, %r55, 0; |
| @%p6 bra BB95_8; |
| |
| // Fraction >= 1/2: two's-complement negate the 64-bit fraction and flip the |
| // sign used for the reduced argument. |
| not.b32 %r57, %r66; |
| neg.s32 %r65, %r18; |
| setp.eq.s32 %p7, %r18, 0; |
| selp.u32 %r58, 1, 0, %p7; |
| add.s32 %r66, %r58, %r57; |
| xor.b32 %r67, %r9, -2147483648; |
| bra.uni BB95_10; |
| |
| BB95_8: |
| mov.u32 %r65, %r18; |
| mov.u32 %r67, %r9; |
| |
| BB95_10: |
| // Reduced argument = (s64 fraction) * 0d3BF921FB54442D19 (pi/2 * 2^-64), |
| // rounded to f32 and given the sign in %r67. The quadrant is negated when the |
| // original x was negative. |
| cvt.u64.u32 %rd15, %r66; |
| shl.b64 %rd16, %rd15, 32; |
| cvt.u64.u32 %rd17, %r65; |
| or.b64 %rd18, %rd16, %rd17; |
| cvt.rn.f64.s64 %fd1, %rd18; |
| mul.f64 %fd2, %fd1, 0d3BF921FB54442D19; |
| cvt.rn.f32.f64 %f22, %fd2; |
| neg.f32 %f23, %f22; |
| setp.eq.s32 %p8, %r67, 0; |
| selp.f32 %f35, %f22, %f23, %p8; |
| setp.eq.s32 %p9, %r9, 0; |
| neg.s32 %r59, %r19; |
| selp.b32 %r68, %r19, %r59, %p9; |
| |
| BB95_12: |
| // Bit 0 of the quadrant selects the polynomial. %f7 = r (bit clear) or 1.0 |
| // (bit set); %f8 = r*r; %f9 = r*r * %f7. |
| and.b32 %r28, %r68, 1; |
| setp.eq.s32 %p10, %r28, 0; |
| selp.f32 %f7, %f35, 0f3F800000, %p10; |
| mul.rn.f32 %f8, %f35, %f35; |
| mov.f32 %f26, 0f00000000; |
| fma.rn.f32 %f9, %f8, %f7, %f26; |
| mov.f32 %f36, 0fB94D4153; |
| @%p10 bra BB95_14; |
| |
| // Bit set: the polynomial has one extra leading term. |
| mov.f32 %f27, 0fBAB607ED; |
| mov.f32 %f28, 0f37CBAC00; |
| fma.rn.f32 %f36, %f28, %f8, %f27; |
| |
| BB95_14: |
| // Remaining coefficients are selected per quadrant parity; the result is |
| // %f7 + %f9 * poly, i.e. r + r^3 * poly or 1 + r^2 * poly. |
| selp.f32 %f29, 0f3C0885E4, 0f3D2AAABB, %p10; |
| fma.rn.f32 %f30, %f36, %f8, %f29; |
| selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10; |
| fma.rn.f32 %f32, %f30, %f8, %f31; |
| fma.rn.f32 %f37, %f32, %f9, %f7; |
| // Bit 1 of the quadrant set: negate the result (result * -1 + 0). |
| and.b32 %r60, %r68, 2; |
| setp.eq.s32 %p12, %r60, 0; |
| @%p12 bra BB95_16; |
| |
| mov.f32 %f34, 0fBF800000; |
| fma.rn.f32 %f37, %f37, %f34, %f26; |
| |
| BB95_16: |
| // Store the result at the same byte offset in the destination array. |
| cvta.to.global.u64 %rd19, %rd8; |
| add.s64 %rd21, %rd19, %rd10; |
| st.global.f32 [%rd21], %f37; |
| |
| BB95_17: |
| ret; |
| } |
| |
| // .globl matrix_sinh_d |
| // matrix_sinh_d: one thread per element; stores the hyperbolic sine of each f64 input. |
| //   param_0: source address, param_1: destination address (both converted to global), |
| //   param_2: element count; threads with index >= count return immediately. |
| // Works on a = |x| and re-applies the sign of x at the end: |
| //   high word of a < 0x3FF00000 (a < 1.0): odd polynomial a + a^3 * P(a^2); |
| //   otherwise: t = (exp(a) - 1) / 2 from an inlined exponential-style evaluation, |
| //   result = t + t / (2t + 1), replaced by +inf unless a < 0d408633CE8FB9F87E (~710.48). |
| // NOTE(review): the bounds test is unsigned (setp.ge.u32) while the index is widened |
| // as signed (mul.wide.s32); an index >= 2^31 would give a negative byte offset. |
| .visible .entry matrix_sinh_d( |
| .param .u64 matrix_sinh_d_param_0, |
| .param .u64 matrix_sinh_d_param_1, |
| .param .u32 matrix_sinh_d_param_2 |
| ) |
| { |
| .reg .pred %p<7>; |
| .reg .b32 %r<24>; |
| .reg .f64 %fd<68>; |
| .reg .b64 %rd<9>; |
| |
| |
| // %r1 = ntid.x * ctaid.x + tid.x (element index); exit when index >= count. |
| ld.param.u64 %rd1, [matrix_sinh_d_param_0]; |
| ld.param.u64 %rd2, [matrix_sinh_d_param_1]; |
| ld.param.u32 %r3, [matrix_sinh_d_param_2]; |
| mov.u32 %r4, %ctaid.x; |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r5, %r4, %r6; |
| setp.ge.u32 %p1, %r1, %r3; |
| @%p1 bra BB96_5; |
| |
| // Load x; %r2 = high word (kept for the sign), %fd1 = |x| built by clearing |
| // the sign bit of the high word. |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd5, [%rd5]; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r2}, %fd5; |
| } |
| and.b32 %r7, %r2, 2147483647; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r8, %temp}, %fd5; |
| } |
| mov.b64 %fd1, {%r8, %r7}; |
| // Integer compare of the high word against 0x3FF00000 (1.0). |
| setp.lt.u32 %p2, %r7, 1072693248; |
| @%p2 bra BB96_3; |
| bra.uni BB96_2; |
| |
| BB96_3: |
| // Small-argument path: Horner polynomial in a*a, then a + a * (a*a * poly). |
| mul.f64 %fd51, %fd1, %fd1; |
| mov.f64 %fd52, 0d3DE611A561D87DEF; |
| mov.f64 %fd53, 0d3D6B4C75AB274C53; |
| fma.rn.f64 %fd54, %fd53, %fd51, %fd52; |
| mov.f64 %fd55, 0d3E5AE64671B18F5C; |
| fma.rn.f64 %fd56, %fd54, %fd51, %fd55; |
| mov.f64 %fd57, 0d3EC71DE3A465B1E4; |
| fma.rn.f64 %fd58, %fd56, %fd51, %fd57; |
| mov.f64 %fd59, 0d3F2A01A01A02899D; |
| fma.rn.f64 %fd60, %fd58, %fd51, %fd59; |
| mov.f64 %fd61, 0d3F811111111110A6; |
| fma.rn.f64 %fd62, %fd60, %fd51, %fd61; |
| mov.f64 %fd63, 0d3FC5555555555556; |
| fma.rn.f64 %fd64, %fd62, %fd51, %fd63; |
| mul.f64 %fd65, %fd51, %fd64; |
| fma.rn.f64 %fd67, %fd65, %fd1, %fd1; |
| bra.uni BB96_4; |
| |
| BB96_2: |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r9}, %fd1; |
| } |
| // n = round(a * log2(e)) via the add-magic-constant trick (0d4338000000000000); |
| // the integer is read from the low word of the sum. %r11 = n - 1. |
| mov.f64 %fd6, 0d4338000000000000; |
| mov.f64 %fd7, 0d3FF71547652B82FE; |
| fma.rn.f64 %fd8, %fd1, %fd7, %fd6; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r10, %temp}, %fd8; |
| } |
| add.s32 %r11, %r10, -1; |
| // %fd14 = a - n * ln(2), with ln(2) split into high and low parts. |
| mov.f64 %fd9, 0dC338000000000000; |
| add.rn.f64 %fd10, %fd8, %fd9; |
| mov.f64 %fd11, 0dBFE62E42FEFA39EF; |
| fma.rn.f64 %fd12, %fd10, %fd11, %fd1; |
| mov.f64 %fd13, 0dBC7ABC9E3B39803F; |
| fma.rn.f64 %fd14, %fd10, %fd13, %fd12; |
| // Small-argument shortcut of the inlined routine: skip the reduction when |
| // 2 * high word < 0x7FB3E647. |
| // NOTE(review): on this path the high word is >= 0x3FF00000, so this test (and |
| // the %r12 == 0 test further down) looks like it can never be true here. |
| add.s32 %r12, %r9, %r9; |
| setp.lt.u32 %p3, %r12, 2142496327; |
| selp.b32 %r13, 0, %r11, %p3; |
| mov.u32 %r14, 0; |
| selp.f64 %fd15, %fd1, %fd14, %p3; |
| // Horner polynomial in the reduced argument r (%fd15); |
| // %fd38 = r + r * (r * poly). |
| mov.f64 %fd16, 0d3E5AF86D8EBD13CD; |
| mov.f64 %fd17, 0d3E21F4076ACD15B6; |
| fma.rn.f64 %fd18, %fd17, %fd15, %fd16; |
| mov.f64 %fd19, 0d3E927E5092BA033D; |
| fma.rn.f64 %fd20, %fd18, %fd15, %fd19; |
| mov.f64 %fd21, 0d3EC71DDE6C5F9DA1; |
| fma.rn.f64 %fd22, %fd20, %fd15, %fd21; |
| mov.f64 %fd23, 0d3EFA01A018D034E6; |
| fma.rn.f64 %fd24, %fd22, %fd15, %fd23; |
| mov.f64 %fd25, 0d3F2A01A01B3B6940; |
| fma.rn.f64 %fd26, %fd24, %fd15, %fd25; |
| mov.f64 %fd27, 0d3F56C16C16C1B5DD; |
| fma.rn.f64 %fd28, %fd26, %fd15, %fd27; |
| mov.f64 %fd29, 0d3F8111111110F74D; |
| fma.rn.f64 %fd30, %fd28, %fd15, %fd29; |
| mov.f64 %fd31, 0d3FA555555555554D; |
| fma.rn.f64 %fd32, %fd30, %fd15, %fd31; |
| mov.f64 %fd33, 0d3FC5555555555557; |
| fma.rn.f64 %fd34, %fd32, %fd15, %fd33; |
| mov.f64 %fd35, 0d3FE0000000000000; |
| fma.rn.f64 %fd36, %fd34, %fd15, %fd35; |
| mul.f64 %fd37, %fd15, %fd36; |
| fma.rn.f64 %fd38, %fd37, %fd15, %fd15; |
| // Build s = 2^(n-1) directly in the exponent field. When n-1 == 1024 the scale |
| // is built as 2^1023 instead and the result is doubled afterwards. |
| setp.eq.s32 %p4, %r13, 1024; |
| selp.b32 %r15, -1, 0, %p4; |
| add.s32 %r16, %r15, %r13; |
| shl.b32 %r17, %r16, 20; |
| add.s32 %r18, %r17, 1072693248; |
| mov.b64 %fd39, {%r14, %r18}; |
| // %fd42 = poly_result * s + (s - 0.5); 0x3FE00000 is the high word of 0.5. |
| mov.u32 %r19, 1071644672; |
| mov.b64 %fd40, {%r14, %r19}; |
| sub.f64 %fd41, %fd39, %fd40; |
| fma.rn.f64 %fd42, %fd38, %fd39, %fd41; |
| add.f64 %fd43, %fd42, %fd42; |
| selp.f64 %fd44, %fd43, %fd42, %p4; |
| setp.eq.s32 %p5, %r12, 0; |
| selp.f64 %fd45, %fd15, %fd44, %p5; |
| // With t = %fd45: result = t + t / (2*t + 1). |
| mov.f64 %fd46, 0d3FF0000000000000; |
| mov.f64 %fd47, 0d4000000000000000; |
| fma.rn.f64 %fd48, %fd47, %fd45, %fd46; |
| div.rn.f64 %fd49, %fd45, %fd48; |
| add.f64 %fd50, %fd49, %fd45; |
| // Keep the result when a < threshold or the compare is unordered; else +inf. |
| setp.ltu.f64 %p6, %fd1, 0d408633CE8FB9F87E; |
| selp.f64 %fd67, %fd50, 0d7FF0000000000000, %p6; |
| |
| BB96_4: |
| // OR the sign bit of the original x into the high word of the result and store. |
| cvta.to.global.u64 %rd6, %rd2; |
| and.b32 %r20, %r2, -2147483648; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r21}, %fd67; |
| } |
| or.b32 %r22, %r21, %r20; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r23, %temp}, %fd67; |
| } |
| mov.b64 %fd66, {%r23, %r22}; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd66; |
| |
| BB96_5: |
| ret; |
| } |
| |
| // .globl matrix_sinh_f |
| // matrix_sinh_f: one thread per element; stores the hyperbolic sine of each f32 input. |
| //   param_0: source address, param_1: destination address (both converted to global), |
| //   param_2: element count; threads with index >= count return immediately. |
| // |x| < 1.0 (or unordered): odd polynomial x + x^3 * P(x^2) on the signed input. |
| // Otherwise: with a = |x|, form E = exp(a) / 4 from two ex2.approx results and |
| // return 2*E - 0.125/E with the sign of x, or infinity unless a < 0f42B40000 (90.0). |
| // NOTE(review): the bounds test is unsigned (setp.ge.u32) while the index is widened |
| // as signed (mul.wide.s32); an index >= 2^31 would give a negative byte offset. |
| .visible .entry matrix_sinh_f( |
| .param .u64 matrix_sinh_f_param_0, |
| .param .u64 matrix_sinh_f_param_1, |
| .param .u32 matrix_sinh_f_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<32>; |
| .reg .b32 %r<11>; |
| .reg .b64 %rd<9>; |
| |
| |
| // %r1 = ntid.x * ctaid.x + tid.x (element index); exit when index >= count. |
| ld.param.u64 %rd1, [matrix_sinh_f_param_0]; |
| ld.param.u64 %rd2, [matrix_sinh_f_param_1]; |
| ld.param.u32 %r2, [matrix_sinh_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB97_5; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| // %f2 = |x|; take the polynomial path when |x| < 1.0 or the compare is unordered. |
| abs.f32 %f2, %f1; |
| setp.ltu.f32 %p2, %f2, 0f3F800000; |
| @%p2 bra BB97_3; |
| bra.uni BB97_2; |
| |
| BB97_3: |
| // Horner polynomial in x*x, then x + x * (x*x * poly). |
| mul.f32 %f22, %f1, %f1; |
| mov.f32 %f23, 0f394FFF49; |
| mov.f32 %f24, 0f363D0ADA; |
| fma.rn.f32 %f25, %f24, %f22, %f23; |
| mov.f32 %f26, 0f3C08889A; |
| fma.rn.f32 %f27, %f25, %f22, %f26; |
| mov.f32 %f28, 0f3E2AAAAB; |
| fma.rn.f32 %f29, %f27, %f22, %f28; |
| mul.f32 %f30, %f22, %f29; |
| fma.rn.f32 %f31, %f30, %f1, %f1; |
| bra.uni BB97_4; |
| |
| BB97_2: |
| // n = trunc(a * log2(e)); r = a - n * ln(2) using a two-part ln(2). |
| mul.f32 %f6, %f2, 0f3FB8AA3B; |
| cvt.rzi.f32.f32 %f7, %f6; |
| mov.f32 %f8, 0fBF317200; |
| fma.rn.f32 %f9, %f7, %f8, %f2; |
| mov.f32 %f10, 0fB5BFBE8E; |
| fma.rn.f32 %f11, %f7, %f10, %f9; |
| // %f13 = 2^(r * log2(e)); %f15 = 2^(n - 2); %f16 = their product. |
| mul.f32 %f12, %f11, 0f3FB8AA3B; |
| ex2.approx.ftz.f32 %f13, %f12; |
| add.f32 %f14, %f7, 0fC0000000; |
| ex2.approx.f32 %f15, %f14; |
| mul.f32 %f16, %f13, %f15; |
| // %f21 = 2 * %f16 - 0.125 / %f16 (0f3E000000 is 0.125). |
| mov.f32 %f17, 0f3E000000; |
| div.approx.f32 %f18, %f17, %f16; |
| neg.f32 %f19, %f18; |
| mov.f32 %f20, 0f40000000; |
| fma.rn.f32 %f21, %f20, %f16, %f19; |
| // Keep %f21 when a < 90.0 or unordered, else use the infinity bit pattern; |
| // then OR in the sign bit of the original x. |
| mov.b32 %r6, %f21; |
| setp.ltu.f32 %p3, %f2, 0f42B40000; |
| selp.b32 %r7, %r6, 2139095040, %p3; |
| mov.b32 %r8, %f1; |
| and.b32 %r9, %r8, -2147483648; |
| or.b32 %r10, %r7, %r9; |
| mov.b32 %f31, %r10; |
| |
| BB97_4: |
| // Store the result at the same byte offset in the destination array. |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f32 [%rd8], %f31; |
| |
| BB97_5: |
| ret; |
| } |
| |
| // .globl matrix_cos_d |
| // matrix_cos_d: one thread per element; stores the cosine of each f64 input. |
| //   param_0: source address, param_1: destination address (both converted to global), |
| //   param_2: element count; threads with index >= count return immediately. |
| // Steps: reduce x by multiples of pi/2 (fast three-constant path, or a call to |
| // __internal_trig_reduction_slowpathd when the masked exponent is large), add one to |
| // the quadrant, then use its low two bits to pick the polynomial form and the sign. |
| // A 4-byte local slot holds the quadrant so the slow-path helper can overwrite it. |
| // NOTE(review): the bounds test is unsigned (setp.ge.u32) while the index is widened |
| // as signed (mul.wide.s32); an index >= 2^31 would give a negative byte offset. |
| .visible .entry matrix_cos_d( |
| .param .u64 matrix_cos_d_param_0, |
| .param .u64 matrix_cos_d_param_1, |
| .param .u32 matrix_cos_d_param_2 |
| ) |
| { |
| .local .align 4 .b8 __local_depot98[4]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<7>; |
| .reg .b32 %r<20>; |
| .reg .f64 %fd<42>; |
| .reg .b64 %rd<15>; |
| |
| |
| // %rd4 = generic address of the local slot (passed to the helper), |
| // %rd1 = local-space address of the same slot (used by ld/st.local). |
| mov.u64 %SPL, __local_depot98; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd2, [matrix_cos_d_param_0]; |
| ld.param.u64 %rd3, [matrix_cos_d_param_1]; |
| ld.param.u32 %r6, [matrix_cos_d_param_2]; |
| add.u64 %rd4, %SP, 0; |
| add.u64 %rd1, %SPL, 0; |
| // %r1 = ntid.x * ctaid.x + tid.x (element index); exit when index >= count. |
| mov.u32 %r7, %ntid.x; |
| mov.u32 %r8, %ctaid.x; |
| mov.u32 %r9, %tid.x; |
| mad.lo.s32 %r1, %r7, %r8, %r9; |
| setp.ge.u32 %p1, %r1, %r6; |
| @%p1 bra BB98_11; |
| |
| cvta.to.global.u64 %rd5, %rd2; |
| mul.wide.s32 %rd6, %r1, 8; |
| add.s64 %rd7, %rd5, %rd6; |
| ld.global.f64 %fd38, [%rd7]; |
| // Infinity test: high word with the sign cleared equals 0x7FF00000 ... |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r10}, %fd38; |
| } |
| and.b32 %r11, %r10, 2147483647; |
| setp.ne.s32 %p2, %r11, 2146435072; |
| @%p2 bra BB98_4; |
| |
| // ... and the low word is zero. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r12, %temp}, %fd38; |
| } |
| setp.ne.s32 %p3, %r12, 0; |
| @%p3 bra BB98_4; |
| |
| // x is +/-infinity: replace it by x * 0.0. |
| mov.f64 %fd14, 0d0000000000000000; |
| mul.rn.f64 %fd38, %fd38, %fd14; |
| |
| BB98_4: |
| // Fast reduction: q = round-to-nearest-int(x * 2/pi), saved to the local slot; |
| // r = x - q * pi/2 with pi/2 split across three constants. |
| mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; |
| cvt.rni.s32.f64 %r19, %fd15; |
| st.local.u32 [%rd1], %r19; |
| cvt.rn.f64.s32 %fd16, %r19; |
| neg.f64 %fd17, %fd16; |
| mov.f64 %fd18, 0d3FF921FB54442D18; |
| fma.rn.f64 %fd19, %fd17, %fd18, %fd38; |
| mov.f64 %fd20, 0d3C91A62633145C00; |
| fma.rn.f64 %fd21, %fd17, %fd20, %fd19; |
| mov.f64 %fd22, 0d397B839A252049C0; |
| fma.rn.f64 %fd39, %fd17, %fd22, %fd21; |
| // Keep the fast result when (high word & 0x7FE00000) < 0x41E00000. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r13}, %fd38; |
| } |
| and.b32 %r14, %r13, 2145386496; |
| setp.lt.u32 %p4, %r14, 1105199104; |
| @%p4 bra BB98_6; |
| |
| // Slow path: helper(x, &slot) returns the reduced argument; the quadrant is |
| // then re-read from the local slot. |
| // Callseq Start 4 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.f64 [param0+0], %fd38; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd4; |
| .param .b64 retval0; |
| call.uni (retval0), |
| __internal_trig_reduction_slowpathd, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.f64 %fd39, [retval0+0]; |
| |
| //{ |
| }// Callseq End 4 |
| ld.local.u32 %r19, [%rd1]; |
| |
| BB98_6: |
| // %r5 = quadrant + 1. Bit 0 of %r5 selects the coefficient set: the leading |
| // coefficient comes from an immediate, the remaining six from |
| // __cudart_sin_cos_coeffs starting at entry 1 (bit clear) or entry 9 (bit set). |
| add.s32 %r5, %r19, 1; |
| and.b32 %r15, %r5, 1; |
| shl.b32 %r16, %r15, 3; |
| setp.eq.s32 %p5, %r15, 0; |
| selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5; |
| add.s32 %r17, %r16, 1; |
| mul.wide.s32 %rd9, %r17, 8; |
| mov.u64 %rd10, __cudart_sin_cos_coeffs; |
| add.s64 %rd11, %rd10, %rd9; |
| // Horner evaluation in r*r (%fd7); polynomial value ends up in %fd8. |
| ld.const.f64 %fd24, [%rd11]; |
| mul.rn.f64 %fd7, %fd39, %fd39; |
| fma.rn.f64 %fd25, %fd23, %fd7, %fd24; |
| ld.const.f64 %fd26, [%rd11+8]; |
| fma.rn.f64 %fd27, %fd25, %fd7, %fd26; |
| ld.const.f64 %fd28, [%rd11+16]; |
| fma.rn.f64 %fd29, %fd27, %fd7, %fd28; |
| ld.const.f64 %fd30, [%rd11+24]; |
| fma.rn.f64 %fd31, %fd29, %fd7, %fd30; |
| ld.const.f64 %fd32, [%rd11+32]; |
| fma.rn.f64 %fd33, %fd31, %fd7, %fd32; |
| ld.const.f64 %fd34, [%rd11+40]; |
| fma.rn.f64 %fd8, %fd33, %fd7, %fd34; |
| // Bit 0 clear: result = r + r * poly. |
| fma.rn.f64 %fd40, %fd8, %fd39, %fd39; |
| @%p5 bra BB98_8; |
| |
| // Bit 0 set: result = 1 + r*r * poly. |
| mov.f64 %fd35, 0d3FF0000000000000; |
| fma.rn.f64 %fd40, %fd8, %fd7, %fd35; |
| |
| BB98_8: |
| // Bit 1 of %r5 set: negate the result (computed as result * -1 + 0). |
| and.b32 %r18, %r5, 2; |
| setp.eq.s32 %p6, %r18, 0; |
| @%p6 bra BB98_10; |
| |
| mov.f64 %fd36, 0d0000000000000000; |
| mov.f64 %fd37, 0dBFF0000000000000; |
| fma.rn.f64 %fd40, %fd40, %fd37, %fd36; |
| |
| BB98_10: |
| // Store the result at the same byte offset in the destination array. |
| cvta.to.global.u64 %rd12, %rd3; |
| add.s64 %rd14, %rd12, %rd6; |
| st.global.f64 [%rd14], %fd40; |
| |
| BB98_11: |
| ret; |
| } |
| |
| // .globl matrix_cos_f |
| // matrix_cos_f: element-wise f32 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f32 input array |
| // param_1 : u64 generic address of the f32 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). Otherwise in[i] is reduced to a quadrant number q and |
| // a remainder r, a polynomial chosen by bit 0 of q + 1 is evaluated on r, |
| // bit 1 of q + 1 negates the value, and the result is stored to out[i]. |
| // NOTE(review): this has the shape of the CUDA cosf() expansion -- confirm |
| // against the CUDA source this PTX was generated from. |
| .visible .entry matrix_cos_f( |
| .param .u64 matrix_cos_f_param_0, |
| .param .u64 matrix_cos_f_param_1, |
| .param .u32 matrix_cos_f_param_2 |
| ) |
| { |
| // 28 bytes of local scratch: the slow path below writes seven 32-bit words. |
| .local .align 4 .b8 __local_depot99[28]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<13>; |
| .reg .f32 %f<38>; |
| .reg .b32 %r<70>; |
| .reg .f64 %fd<3>; |
| .reg .b64 %rd<24>; |
| |
| |
| mov.u64 %SPL, __local_depot99; |
| ld.param.u64 %rd7, [matrix_cos_f_param_0]; |
| ld.param.u64 %rd8, [matrix_cos_f_param_1]; |
| ld.param.u32 %r30, [matrix_cos_f_param_2]; |
| mov.u32 %r31, %ntid.x; |
| mov.u32 %r32, %ctaid.x; |
| mov.u32 %r33, %tid.x; |
| mad.lo.s32 %r1, %r31, %r32, %r33; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r30; |
| @%p1 bra BB99_17; |
| |
| cvta.to.global.u64 %rd9, %rd7; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd10 is reused for the store at the end. |
| mul.wide.u32 %rd10, %r1, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| add.u64 %rd1, %SPL, 0; |
| ld.global.f32 %f1, [%rd11]; |
| // Fast path: q = round-to-nearest(x * 0f3F22F983) (about 2/pi), then |
| // r = x + q * c0 + q * c1 + q * c2 with three negative constants, the first |
| // being about -pi/2 (presumably pi/2 split into high/mid/low parts). |
| mul.f32 %f15, %f1, 0f3F22F983; |
| cvt.rni.s32.f32 %r69, %f15; |
| cvt.rn.f32.s32 %f16, %r69; |
| mov.f32 %f17, 0fBFC90FDA; |
| fma.rn.f32 %f18, %f16, %f17, %f1; |
| mov.f32 %f19, 0fB3A22168; |
| fma.rn.f32 %f20, %f16, %f19, %f18; |
| mov.f32 %f21, 0fA7C234C5; |
| fma.rn.f32 %f35, %f16, %f21, %f20; |
| abs.f32 %f3, %f1; |
| // Keep the fast-path result when abs(x) <= 0f47CE4780 (105615.0). leu is |
| // also true for unordered operands, so a NaN input takes this branch too. |
| setp.leu.f32 %p2, %f3, 0f47CE4780; |
| @%p2 bra BB99_12; |
| |
| setp.eq.f32 %p3, %f3, 0f7F800000; |
| @%p3 bra BB99_11; |
| bra.uni BB99_3; |
| |
| BB99_11: |
| // Infinite input: the remainder becomes x * 0.0. |
| mov.f32 %f24, 0f00000000; |
| mul.rn.f32 %f35, %f1, %f24; |
| bra.uni BB99_12; |
| |
| BB99_3: |
| // Slow path for large finite abs(x): %r4 holds the mantissa bits of x |
| // shifted to the top of the word with bit 31 forced to one. |
| mov.b32 %r3, %f1; |
| shl.b32 %r36, %r3, 8; |
| or.b32 %r4, %r36, -2147483648; |
| mov.u32 %r63, 0; |
| mov.u64 %rd22, __cudart_i2opi_f; |
| mov.u32 %r62, -6; |
| mov.u64 %rd23, %rd1; |
| |
| BB99_4: |
| // Six iterations (%r62 counts -6..0): multiply %r4 by successive 32-bit |
| // words of __cudart_i2opi_f, store the low word of each product-plus-carry |
| // to local scratch and keep the high word in %r63 as the next carry. |
| .pragma "nounroll"; |
| ld.const.u32 %r39, [%rd22]; |
| // inline asm |
| { |
| mad.lo.cc.u32 %r37, %r39, %r4, %r63; |
| madc.hi.u32 %r63, %r39, %r4, 0; |
| } |
| // inline asm |
| st.local.u32 [%rd23], %r37; |
| add.s64 %rd23, %rd23, 4; |
| add.s64 %rd22, %rd22, 4; |
| add.s32 %r62, %r62, 1; |
| setp.ne.s32 %p4, %r62, 0; |
| @%p4 bra BB99_4; |
| |
| // The exponent field of x selects which scratch words to read: |
| // (exponent - 128) >> 5 is a word offset, exponent & 31 a bit shift. |
| // %r9 keeps the sign bit of x; the final carry becomes scratch word 6. |
| bfe.u32 %r42, %r3, 23, 8; |
| add.s32 %r43, %r42, -128; |
| shr.u32 %r44, %r43, 5; |
| and.b32 %r9, %r3, -2147483648; |
| st.local.u32 [%rd1+24], %r63; |
| bfe.u32 %r10, %r3, 23, 5; |
| mov.u32 %r45, 6; |
| sub.s32 %r46, %r45, %r44; |
| mul.wide.s32 %rd14, %r46, 4; |
| add.s64 %rd6, %rd1, %rd14; |
| ld.local.u32 %r65, [%rd6]; |
| ld.local.u32 %r64, [%rd6+-4]; |
| setp.eq.s32 %p5, %r10, 0; |
| @%p5 bra BB99_7; |
| |
| // Non-zero bit shift: funnel-shift the two words left by %r10, pulling the |
| // missing low bits from the next lower scratch word. |
| mov.u32 %r47, 32; |
| sub.s32 %r48, %r47, %r10; |
| shr.u32 %r49, %r64, %r48; |
| shl.b32 %r50, %r65, %r10; |
| add.s32 %r65, %r49, %r50; |
| ld.local.u32 %r51, [%rd6+-8]; |
| shr.u32 %r52, %r51, %r48; |
| shl.b32 %r53, %r64, %r10; |
| add.s32 %r64, %r52, %r53; |
| |
| BB99_7: |
| // The top two bits of %r65 are the quadrant; %r67:%r18 is the remaining |
| // 64-bit fraction. %r19 = quadrant + top fraction bit (round to nearest). |
| shr.u32 %r54, %r64, 30; |
| shl.b32 %r55, %r65, 2; |
| add.s32 %r67, %r55, %r54; |
| shl.b32 %r18, %r64, 2; |
| shr.u32 %r56, %r67, 31; |
| shr.u32 %r57, %r65, 30; |
| add.s32 %r19, %r56, %r57; |
| setp.eq.s32 %p6, %r56, 0; |
| @%p6 bra BB99_8; |
| |
| // Top fraction bit set: two's-complement negate the 64-bit fraction and |
| // flip the sign that will be applied to the remainder. |
| not.b32 %r58, %r67; |
| neg.s32 %r66, %r18; |
| setp.eq.s32 %p7, %r18, 0; |
| selp.u32 %r59, 1, 0, %p7; |
| add.s32 %r67, %r59, %r58; |
| xor.b32 %r68, %r9, -2147483648; |
| bra.uni BB99_10; |
| |
| BB99_8: |
| mov.u32 %r66, %r18; |
| mov.u32 %r68, %r9; |
| |
| BB99_10: |
| // Fraction -> f64, scaled by 0d3BF921FB54442D19 (about pi/2 * 2^-64), then |
| // rounded to f32 and given the sign in %r68. The quadrant is negated when |
| // the input was negative. |
| cvt.u64.u32 %rd15, %r67; |
| shl.b64 %rd16, %rd15, 32; |
| cvt.u64.u32 %rd17, %r66; |
| or.b64 %rd18, %rd16, %rd17; |
| cvt.rn.f64.s64 %fd1, %rd18; |
| mul.f64 %fd2, %fd1, 0d3BF921FB54442D19; |
| cvt.rn.f32.f64 %f22, %fd2; |
| neg.f32 %f23, %f22; |
| setp.eq.s32 %p8, %r68, 0; |
| selp.f32 %f35, %f22, %f23, %p8; |
| setp.eq.s32 %p9, %r9, 0; |
| neg.s32 %r60, %r19; |
| selp.b32 %r69, %r19, %r60, %p9; |
| |
| BB99_12: |
| // %r28 = q + 1. Bit 0 clear: odd polynomial seeded with r (%f7 = r). |
| // Bit 0 set: even polynomial seeded with 1.0 (%f7 = 1.0). |
| add.s32 %r28, %r69, 1; |
| and.b32 %r29, %r28, 1; |
| setp.eq.s32 %p10, %r29, 0; |
| selp.f32 %f7, %f35, 0f3F800000, %p10; |
| mul.rn.f32 %f8, %f35, %f35; |
| mov.f32 %f26, 0f00000000; |
| fma.rn.f32 %f9, %f8, %f7, %f26; |
| mov.f32 %f36, 0fB94D4153; |
| @%p10 bra BB99_14; |
| |
| mov.f32 %f27, 0fBAB607ED; |
| mov.f32 %f28, 0f37CBAC00; |
| fma.rn.f32 %f36, %f28, %f8, %f27; |
| |
| BB99_14: |
| // Remaining Horner steps; the coefficients are picked by the same parity. |
| selp.f32 %f29, 0f3C0885E4, 0f3D2AAABB, %p10; |
| fma.rn.f32 %f30, %f36, %f8, %f29; |
| selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10; |
| fma.rn.f32 %f32, %f30, %f8, %f31; |
| fma.rn.f32 %f37, %f32, %f9, %f7; |
| and.b32 %r61, %r28, 2; |
| setp.eq.s32 %p12, %r61, 0; |
| @%p12 bra BB99_16; |
| |
| // Bit 1 of q + 1 set: negate the result (value * -1.0 + 0.0). |
| mov.f32 %f34, 0fBF800000; |
| fma.rn.f32 %f37, %f37, %f34, %f26; |
| |
| BB99_16: |
| cvta.to.global.u64 %rd19, %rd8; |
| add.s64 %rd21, %rd19, %rd10; |
| st.global.f32 [%rd21], %f37; |
| |
| BB99_17: |
| ret; |
| } |
| |
| // .globl matrix_cosh_d |
| // matrix_cosh_d: element-wise f64 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f64 input array |
| // param_1 : u64 generic address of the f64 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). Otherwise it computes t = e/4 + 1/(4e) where e is a |
| // polynomial approximation of exp(abs(in[i])), and stores t + t to out[i]. |
| // When the high word of abs(x) is >= 1082536911 the value t is +inf, or x |
| // itself when x is NaN. |
| .visible .entry matrix_cosh_d( |
| .param .u64 matrix_cosh_d_param_0, |
| .param .u64 matrix_cosh_d_param_1, |
| .param .u32 matrix_cosh_d_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .b32 %r<16>; |
| .reg .f64 %fd<46>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_cosh_d_param_0]; |
| ld.param.u64 %rd2, [matrix_cosh_d_param_1]; |
| ld.param.u32 %r2, [matrix_cosh_d_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB100_5; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd4 is reused for the store at the end. |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| // %fd2 = abs(x): clear the sign bit in the high 32-bit word. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r6}, %fd1; |
| } |
| and.b32 %r7, %r6, 2147483647; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r8, %temp}, %fd1; |
| } |
| mov.b64 %fd2, {%r8, %r7}; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r9}, %fd2; |
| } |
| // Magnitude test on the high word only; large, infinite and NaN inputs go |
| // to BB100_2. |
| setp.lt.u32 %p2, %r9, 1082536911; |
| @%p2 bra BB100_3; |
| bra.uni BB100_2; |
| |
| BB100_3: |
| // n = round(abs(x) * 0d3FF71547652B82FE) (about log2(e)), obtained by adding |
| // 0d4338000000000000 so that the integer lands in the low word (%r10). |
| mov.f64 %fd6, 0d4338000000000000; |
| mov.f64 %fd7, 0d3FF71547652B82FE; |
| fma.rn.f64 %fd8, %fd2, %fd7, %fd6; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r10, %temp}, %fd8; |
| } |
| mov.f64 %fd9, 0dC338000000000000; |
| add.rn.f64 %fd10, %fd8, %fd9; |
| // %fd14 = abs(x) - n * ln2, with ln2 applied as a high and a low part. |
| mov.f64 %fd11, 0dBFE62E42FEFA39EF; |
| fma.rn.f64 %fd12, %fd10, %fd11, %fd2; |
| mov.f64 %fd13, 0dBC7ABC9E3B39803F; |
| fma.rn.f64 %fd14, %fd10, %fd13, %fd12; |
| // Horner evaluation of a polynomial in %fd14 ending in the terms 1 + r. |
| mov.f64 %fd15, 0d3E928AF3FCA213EA; |
| mov.f64 %fd16, 0d3E5ADE1569CE2BDF; |
| fma.rn.f64 %fd17, %fd16, %fd14, %fd15; |
| mov.f64 %fd18, 0d3EC71DEE62401315; |
| fma.rn.f64 %fd19, %fd17, %fd14, %fd18; |
| mov.f64 %fd20, 0d3EFA01997C89EB71; |
| fma.rn.f64 %fd21, %fd19, %fd14, %fd20; |
| mov.f64 %fd22, 0d3F2A01A014761F65; |
| fma.rn.f64 %fd23, %fd21, %fd14, %fd22; |
| mov.f64 %fd24, 0d3F56C16C1852B7AF; |
| fma.rn.f64 %fd25, %fd23, %fd14, %fd24; |
| mov.f64 %fd26, 0d3F81111111122322; |
| fma.rn.f64 %fd27, %fd25, %fd14, %fd26; |
| mov.f64 %fd28, 0d3FA55555555502A1; |
| fma.rn.f64 %fd29, %fd27, %fd14, %fd28; |
| mov.f64 %fd30, 0d3FC5555555555511; |
| fma.rn.f64 %fd31, %fd29, %fd14, %fd30; |
| mov.f64 %fd32, 0d3FE000000000000B; |
| fma.rn.f64 %fd33, %fd31, %fd14, %fd32; |
| mov.f64 %fd34, 0d3FF0000000000000; |
| fma.rn.f64 %fd35, %fd33, %fd14, %fd34; |
| fma.rn.f64 %fd36, %fd35, %fd14, %fd34; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r11, %temp}, %fd36; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r12}, %fd36; |
| } |
| // Scale by 2^(n - 2): add (n << 20) - 2097152 to the high word, i.e. adjust |
| // the exponent field directly. %fd37 is therefore about exp(abs(x)) / 4. |
| shl.b32 %r13, %r10, 20; |
| add.s32 %r14, %r13, %r12; |
| add.s32 %r15, %r14, -2097152; |
| mov.b64 %fd37, {%r11, %r15}; |
| // Approximate reciprocal of %fd37 refined with FMA correction steps, then |
| // %fd45 = reciprocal * 0.0625 + %fd37. |
| rcp.approx.ftz.f64 %fd38, %fd37; |
| neg.f64 %fd39, %fd37; |
| fma.rn.f64 %fd40, %fd39, %fd38, %fd34; |
| fma.rn.f64 %fd41, %fd40, %fd40, %fd40; |
| fma.rn.f64 %fd42, %fd41, %fd38, %fd38; |
| mov.f64 %fd43, 0d3FB0000000000000; |
| fma.rn.f64 %fd45, %fd42, %fd43, %fd37; |
| bra.uni BB100_4; |
| |
| BB100_2: |
| // gtu against +inf is true only for an unordered compare, so NaN passes |
| // through unchanged and every other input here yields +inf. |
| setp.gtu.f64 %p3, %fd1, 0d7FF0000000000000; |
| selp.f64 %fd45, %fd1, 0d7FF0000000000000, %p3; |
| |
| BB100_4: |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| // Both paths produce half of the final value; double it before storing. |
| add.f64 %fd44, %fd45, %fd45; |
| st.global.f64 [%rd8], %fd44; |
| |
| BB100_5: |
| ret; |
| } |
| |
| // .globl matrix_cosh_f |
| // matrix_cosh_f: element-wise f32 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f32 input array |
| // param_1 : u64 generic address of the f32 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). Otherwise it computes e = approx exp(abs(in[i])) / 4 |
| // and stores 2 * e + 0.125 / e, replaced by +inf when abs(in[i]) >= 90.0. |
| .visible .entry matrix_cosh_f( |
| .param .u64 matrix_cosh_f_param_0, |
| .param .u64 matrix_cosh_f_param_1, |
| .param .u32 matrix_cosh_f_param_2 |
| ) |
| { |
| .reg .pred %p<3>; |
| .reg .f32 %f<19>; |
| .reg .b32 %r<6>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [matrix_cosh_f_param_0]; |
| ld.param.u64 %rd2, [matrix_cosh_f_param_1]; |
| ld.param.u32 %r2, [matrix_cosh_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB101_2; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd4 is reused for the store below. |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| abs.f32 %f2, %f1; |
| // n = trunc(abs(x) * 0f3FB8AA3B) (about log2(e)), kept as a float in %f4. |
| mul.f32 %f3, %f2, 0f3FB8AA3B; |
| cvt.rzi.f32.f32 %f4, %f3; |
| // %f8 = abs(x) - n * ln2, with ln2 applied as a high and a low part. |
| mov.f32 %f5, 0fBF317200; |
| fma.rn.f32 %f6, %f4, %f5, %f2; |
| mov.f32 %f7, 0fB5BFBE8E; |
| fma.rn.f32 %f8, %f4, %f7, %f6; |
| // %f13 = 2^(%f8 * log2(e)) * 2^(n - 2), i.e. about exp(abs(x)) / 4. |
| mul.f32 %f9, %f8, 0f3FB8AA3B; |
| ex2.approx.ftz.f32 %f10, %f9; |
| add.f32 %f11, %f4, 0fC0000000; |
| ex2.approx.f32 %f12, %f11; |
| mul.f32 %f13, %f10, %f12; |
| // %f17 = 2.0 * %f13 + 0.125 / %f13. |
| mov.f32 %f14, 0f3E000000; |
| div.approx.f32 %f15, %f14, %f13; |
| mov.f32 %f16, 0f40000000; |
| fma.rn.f32 %f17, %f16, %f13, %f15; |
| // ltu is true when abs(x) < 90.0 or the compare is unordered, so NaN keeps |
| // the computed value and abs(x) >= 90.0 yields +inf. |
| setp.ltu.f32 %p2, %f2, 0f42B40000; |
| selp.f32 %f18, %f17, 0f7F800000, %p2; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f32 [%rd7], %f18; |
| |
| BB101_2: |
| ret; |
| } |
| |
| // .globl matrix_tan_d |
| // matrix_tan_d: element-wise f64 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f64 input array |
| // param_1 : u64 generic address of the f64 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). Otherwise in[i] is reduced to a quadrant number q and |
| // a remainder r (via __internal_trig_reduction_slowpathd for large inputs), |
| // an odd polynomial t(r) is evaluated, and out[i] receives t for even q or |
| // a refined -1/t for odd q. |
| // NOTE(review): this has the shape of the CUDA tan() expansion -- confirm |
| // against the CUDA source this PTX was generated from. |
| .visible .entry matrix_tan_d( |
| .param .u64 matrix_tan_d_param_0, |
| .param .u64 matrix_tan_d_param_1, |
| .param .u32 matrix_tan_d_param_2 |
| ) |
| { |
| // 4 bytes of local scratch holding the quadrant; its generic address is |
| // passed to the slow-path helper as the second argument. |
| .local .align 4 .b8 __local_depot102[4]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<6>; |
| .reg .b32 %r<16>; |
| .reg .f64 %fd<65>; |
| .reg .b64 %rd<12>; |
| |
| |
| mov.u64 %SPL, __local_depot102; |
| cvta.local.u64 %SP, %SPL; |
| ld.param.u64 %rd2, [matrix_tan_d_param_0]; |
| ld.param.u64 %rd3, [matrix_tan_d_param_1]; |
| ld.param.u32 %r5, [matrix_tan_d_param_2]; |
| add.u64 %rd4, %SP, 0; |
| add.u64 %rd1, %SPL, 0; |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %ctaid.x; |
| mov.u32 %r8, %tid.x; |
| mad.lo.s32 %r1, %r6, %r7, %r8; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r5; |
| @%p1 bra BB102_9; |
| |
| cvta.to.global.u64 %rd5, %rd2; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd6 is reused for the store at the end. |
| mul.wide.u32 %rd6, %r1, 8; |
| add.s64 %rd7, %rd5, %rd6; |
| ld.global.f64 %fd62, [%rd7]; |
| // Infinity test on the raw bits: high word without sign == 2146435072 |
| // (0x7FF00000) and low word == 0. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r9}, %fd62; |
| } |
| and.b32 %r10, %r9, 2147483647; |
| setp.ne.s32 %p2, %r10, 2146435072; |
| @%p2 bra BB102_4; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r11, %temp}, %fd62; |
| } |
| setp.ne.s32 %p3, %r11, 0; |
| @%p3 bra BB102_4; |
| |
| // Infinite input: replace x by x * 0.0 before the reduction. |
| mov.f64 %fd11, 0d0000000000000000; |
| mul.rn.f64 %fd62, %fd62, %fd11; |
| |
| BB102_4: |
| // Fast path: q = round-to-nearest(x * 0d3FE45F306DC9C883) (about 2/pi); |
| // q is stored to local scratch, and r = x - q * c0 - q * c1 - q * c2 with |
| // c0 about pi/2 (presumably pi/2 split into high/mid/low parts). |
| mul.f64 %fd12, %fd62, 0d3FE45F306DC9C883; |
| cvt.rni.s32.f64 %r15, %fd12; |
| st.local.u32 [%rd1], %r15; |
| cvt.rn.f64.s32 %fd13, %r15; |
| neg.f64 %fd14, %fd13; |
| mov.f64 %fd15, 0d3FF921FB54442D18; |
| fma.rn.f64 %fd16, %fd14, %fd15, %fd62; |
| mov.f64 %fd17, 0d3C91A62633145C00; |
| fma.rn.f64 %fd18, %fd14, %fd17, %fd16; |
| mov.f64 %fd19, 0d397B839A252049C0; |
| fma.rn.f64 %fd63, %fd14, %fd19, %fd18; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r12}, %fd62; |
| } |
| // Keep the fast-path result when the masked high word is below 1105199104; |
| // otherwise call the slow-path helper. |
| and.b32 %r13, %r12, 2145386496; |
| setp.lt.u32 %p4, %r13, 1105199104; |
| @%p4 bra BB102_6; |
| |
| // Slow path: the helper receives x and the generic address of the quadrant |
| // slot, returns the remainder, and the quadrant is reloaded from local |
| // scratch afterwards. |
| // Callseq Start 5 |
| { |
| .reg .b32 temp_param_reg; |
| // <end>} |
| .param .b64 param0; |
| st.param.f64 [param0+0], %fd62; |
| .param .b64 param1; |
| st.param.b64 [param1+0], %rd4; |
| .param .b64 retval0; |
| call.uni (retval0), |
| __internal_trig_reduction_slowpathd, |
| ( |
| param0, |
| param1 |
| ); |
| ld.param.f64 %fd63, [retval0+0]; |
| |
| //{ |
| }// Callseq End 5 |
| ld.local.u32 %r15, [%rd1]; |
| |
| BB102_6: |
| // Horner evaluation in r^2 (%fd20); %fd64 = r + r * (r^2 * poly(r^2)). |
| mul.f64 %fd20, %fd63, %fd63; |
| mov.f64 %fd21, 0dBEF9757C5B27EBB1; |
| mov.f64 %fd22, 0d3EE48DAC2799BCB9; |
| fma.rn.f64 %fd23, %fd22, %fd20, %fd21; |
| mov.f64 %fd24, 0d3F0980E90FD91E04; |
| fma.rn.f64 %fd25, %fd23, %fd20, %fd24; |
| mov.f64 %fd26, 0dBEFAE2B0417D7E1D; |
| fma.rn.f64 %fd27, %fd25, %fd20, %fd26; |
| mov.f64 %fd28, 0d3F119F5341BFBA57; |
| fma.rn.f64 %fd29, %fd27, %fd20, %fd28; |
| mov.f64 %fd30, 0d3F15E791A00F6919; |
| fma.rn.f64 %fd31, %fd29, %fd20, %fd30; |
| mov.f64 %fd32, 0d3F2FF2E7FADEC73A; |
| fma.rn.f64 %fd33, %fd31, %fd20, %fd32; |
| mov.f64 %fd34, 0d3F434BC1B206DA62; |
| fma.rn.f64 %fd35, %fd33, %fd20, %fd34; |
| mov.f64 %fd36, 0d3F57DB18EF2F83F9; |
| fma.rn.f64 %fd37, %fd35, %fd20, %fd36; |
| mov.f64 %fd38, 0d3F6D6D2E7AE49FBC; |
| fma.rn.f64 %fd39, %fd37, %fd20, %fd38; |
| mov.f64 %fd40, 0d3F8226E3A816A776; |
| fma.rn.f64 %fd41, %fd39, %fd20, %fd40; |
| mov.f64 %fd42, 0d3F9664F485D25660; |
| fma.rn.f64 %fd43, %fd41, %fd20, %fd42; |
| mov.f64 %fd44, 0d3FABA1BA1BABF31D; |
| fma.rn.f64 %fd45, %fd43, %fd20, %fd44; |
| mov.f64 %fd46, 0d3FC11111111105D2; |
| fma.rn.f64 %fd47, %fd45, %fd20, %fd46; |
| mov.f64 %fd48, 0d3FD555555555555E; |
| fma.rn.f64 %fd49, %fd47, %fd20, %fd48; |
| mul.f64 %fd7, %fd20, %fd49; |
| fma.rn.f64 %fd64, %fd7, %fd63, %fd63; |
| // Even quadrant: store %fd64 as is. Odd quadrant: fall into BB102_7. |
| and.b32 %r14, %r15, 1; |
| setp.eq.b32 %p5, %r14, 1; |
| @!%p5 bra BB102_8; |
| bra.uni BB102_7; |
| |
| BB102_7: |
| // %fd52 is the rounding error of %fd64 (fma result minus the rounded sum). |
| // The approximate reciprocal of %fd64 is refined, negated, and corrected |
| // with that error term, giving about -1 / t. |
| sub.f64 %fd50, %fd64, %fd63; |
| neg.f64 %fd51, %fd50; |
| fma.rn.f64 %fd52, %fd7, %fd63, %fd51; |
| neg.f64 %fd53, %fd64; |
| rcp.approx.ftz.f64 %fd54, %fd64; |
| mov.f64 %fd55, 0d3FF0000000000000; |
| fma.rn.f64 %fd56, %fd53, %fd54, %fd55; |
| fma.rn.f64 %fd57, %fd56, %fd56, %fd56; |
| fma.rn.f64 %fd58, %fd57, %fd54, %fd54; |
| neg.f64 %fd59, %fd58; |
| fma.rn.f64 %fd60, %fd64, %fd59, %fd55; |
| fma.rn.f64 %fd61, %fd59, %fd52, %fd60; |
| fma.rn.f64 %fd64, %fd61, %fd59, %fd59; |
| |
| BB102_8: |
| cvta.to.global.u64 %rd9, %rd3; |
| add.s64 %rd11, %rd9, %rd6; |
| st.global.f64 [%rd11], %fd64; |
| |
| BB102_9: |
| ret; |
| } |
| |
| // .globl matrix_tan_f |
| // matrix_tan_f: element-wise f32 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f32 input array |
| // param_1 : u64 generic address of the f32 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). Otherwise in[i] is reduced to a quadrant number q and |
| // a remainder r, an odd polynomial t(r) is evaluated, and out[i] receives |
| // t for even q or -(approximate 1/t) for odd q. |
| // NOTE(review): this has the shape of the CUDA tanf() expansion -- confirm |
| // against the CUDA source this PTX was generated from. |
| .visible .entry matrix_tan_f( |
| .param .u64 matrix_tan_f_param_0, |
| .param .u64 matrix_tan_f_param_1, |
| .param .u32 matrix_tan_f_param_2 |
| ) |
| { |
| // 28 bytes of local scratch: the slow path below writes seven 32-bit words. |
| .local .align 4 .b8 __local_depot103[28]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<12>; |
| .reg .f32 %f<39>; |
| .reg .b32 %r<68>; |
| .reg .f64 %fd<3>; |
| .reg .b64 %rd<24>; |
| |
| |
| mov.u64 %SPL, __local_depot103; |
| ld.param.u64 %rd7, [matrix_tan_f_param_0]; |
| ld.param.u64 %rd8, [matrix_tan_f_param_1]; |
| ld.param.u32 %r28, [matrix_tan_f_param_2]; |
| mov.u32 %r29, %ntid.x; |
| mov.u32 %r30, %ctaid.x; |
| mov.u32 %r31, %tid.x; |
| mad.lo.s32 %r1, %r29, %r30, %r31; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r28; |
| @%p1 bra BB103_15; |
| |
| cvta.to.global.u64 %rd9, %rd7; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd10 is reused for the store at the end. |
| mul.wide.u32 %rd10, %r1, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| add.u64 %rd1, %SPL, 0; |
| ld.global.f32 %f1, [%rd11]; |
| // Fast path: q = round-to-nearest(x * 0f3F22F983) (about 2/pi), then |
| // r = x + q * c0 + q * c1 + q * c2 with three negative constants, the first |
| // being about -pi/2 (presumably pi/2 split into high/mid/low parts). |
| mul.f32 %f10, %f1, 0f3F22F983; |
| cvt.rni.s32.f32 %r67, %f10; |
| cvt.rn.f32.s32 %f11, %r67; |
| mov.f32 %f12, 0fBFC90FDA; |
| fma.rn.f32 %f13, %f11, %f12, %f1; |
| mov.f32 %f14, 0fB3A22168; |
| fma.rn.f32 %f15, %f11, %f14, %f13; |
| mov.f32 %f16, 0fA7C234C5; |
| fma.rn.f32 %f37, %f11, %f16, %f15; |
| abs.f32 %f3, %f1; |
| // Keep the fast-path result when abs(x) <= 0f47CE4780 (105615.0). leu is |
| // also true for unordered operands, so a NaN input takes this branch too. |
| setp.leu.f32 %p2, %f3, 0f47CE4780; |
| @%p2 bra BB103_12; |
| |
| setp.eq.f32 %p3, %f3, 0f7F800000; |
| @%p3 bra BB103_11; |
| bra.uni BB103_3; |
| |
| BB103_11: |
| // Infinite input: the remainder becomes x * 0.0. |
| mov.f32 %f19, 0f00000000; |
| mul.rn.f32 %f37, %f1, %f19; |
| bra.uni BB103_12; |
| |
| BB103_3: |
| // Slow path for large finite abs(x): %r4 holds the mantissa bits of x |
| // shifted to the top of the word with bit 31 forced to one. |
| mov.b32 %r3, %f1; |
| shl.b32 %r34, %r3, 8; |
| or.b32 %r4, %r34, -2147483648; |
| mov.u32 %r61, 0; |
| mov.u64 %rd22, __cudart_i2opi_f; |
| mov.u32 %r60, -6; |
| mov.u64 %rd23, %rd1; |
| |
| BB103_4: |
| // Six iterations (%r60 counts -6..0): multiply %r4 by successive 32-bit |
| // words of __cudart_i2opi_f, store the low word of each product-plus-carry |
| // to local scratch and keep the high word in %r61 as the next carry. |
| .pragma "nounroll"; |
| ld.const.u32 %r37, [%rd22]; |
| // inline asm |
| { |
| mad.lo.cc.u32 %r35, %r37, %r4, %r61; |
| madc.hi.u32 %r61, %r37, %r4, 0; |
| } |
| // inline asm |
| st.local.u32 [%rd23], %r35; |
| add.s64 %rd23, %rd23, 4; |
| add.s64 %rd22, %rd22, 4; |
| add.s32 %r60, %r60, 1; |
| setp.ne.s32 %p4, %r60, 0; |
| @%p4 bra BB103_4; |
| |
| // The exponent field of x selects which scratch words to read: |
| // (exponent - 128) >> 5 is a word offset, exponent & 31 a bit shift. |
| // %r9 keeps the sign bit of x; the final carry becomes scratch word 6. |
| bfe.u32 %r40, %r3, 23, 8; |
| add.s32 %r41, %r40, -128; |
| shr.u32 %r42, %r41, 5; |
| and.b32 %r9, %r3, -2147483648; |
| st.local.u32 [%rd1+24], %r61; |
| bfe.u32 %r10, %r3, 23, 5; |
| mov.u32 %r43, 6; |
| sub.s32 %r44, %r43, %r42; |
| mul.wide.s32 %rd14, %r44, 4; |
| add.s64 %rd6, %rd1, %rd14; |
| ld.local.u32 %r63, [%rd6]; |
| ld.local.u32 %r62, [%rd6+-4]; |
| setp.eq.s32 %p5, %r10, 0; |
| @%p5 bra BB103_7; |
| |
| // Non-zero bit shift: funnel-shift the two words left by %r10, pulling the |
| // missing low bits from the next lower scratch word. |
| mov.u32 %r45, 32; |
| sub.s32 %r46, %r45, %r10; |
| shr.u32 %r47, %r62, %r46; |
| shl.b32 %r48, %r63, %r10; |
| add.s32 %r63, %r47, %r48; |
| ld.local.u32 %r49, [%rd6+-8]; |
| shr.u32 %r50, %r49, %r46; |
| shl.b32 %r51, %r62, %r10; |
| add.s32 %r62, %r50, %r51; |
| |
| BB103_7: |
| // The top two bits of %r63 are the quadrant; %r65:%r18 is the remaining |
| // 64-bit fraction. %r19 = quadrant + top fraction bit (round to nearest). |
| shr.u32 %r52, %r62, 30; |
| shl.b32 %r53, %r63, 2; |
| add.s32 %r65, %r53, %r52; |
| shl.b32 %r18, %r62, 2; |
| shr.u32 %r54, %r65, 31; |
| shr.u32 %r55, %r63, 30; |
| add.s32 %r19, %r54, %r55; |
| setp.eq.s32 %p6, %r54, 0; |
| @%p6 bra BB103_8; |
| |
| // Top fraction bit set: two's-complement negate the 64-bit fraction and |
| // flip the sign that will be applied to the remainder. |
| not.b32 %r56, %r65; |
| neg.s32 %r64, %r18; |
| setp.eq.s32 %p7, %r18, 0; |
| selp.u32 %r57, 1, 0, %p7; |
| add.s32 %r65, %r57, %r56; |
| xor.b32 %r66, %r9, -2147483648; |
| bra.uni BB103_10; |
| |
| BB103_8: |
| mov.u32 %r64, %r18; |
| mov.u32 %r66, %r9; |
| |
| BB103_10: |
| // Fraction -> f64, scaled by 0d3BF921FB54442D19 (about pi/2 * 2^-64), then |
| // rounded to f32 and given the sign in %r66. The quadrant is negated when |
| // the input was negative. |
| cvt.u64.u32 %rd15, %r65; |
| shl.b64 %rd16, %rd15, 32; |
| cvt.u64.u32 %rd17, %r64; |
| or.b64 %rd18, %rd16, %rd17; |
| cvt.rn.f64.s64 %fd1, %rd18; |
| mul.f64 %fd2, %fd1, 0d3BF921FB54442D19; |
| cvt.rn.f32.f64 %f17, %fd2; |
| neg.f32 %f18, %f17; |
| setp.eq.s32 %p8, %r66, 0; |
| selp.f32 %f37, %f17, %f18, %p8; |
| setp.eq.s32 %p9, %r9, 0; |
| neg.s32 %r58, %r19; |
| selp.b32 %r67, %r19, %r58, %p9; |
| |
| BB103_12: |
| // Horner evaluation in r^2 (%f20); %f33 = r + (r^2 * r) * poly(r^2). |
| mul.f32 %f20, %f37, %f37; |
| mov.f32 %f21, 0f3B560000; |
| mov.f32 %f22, 0f3C190000; |
| fma.rn.f32 %f23, %f22, %f20, %f21; |
| mov.f32 %f24, 0f3CC70000; |
| fma.rn.f32 %f25, %f23, %f20, %f24; |
| mov.f32 %f26, 0f3D5B0000; |
| fma.rn.f32 %f27, %f25, %f20, %f26; |
| mov.f32 %f28, 0f3E089438; |
| fma.rn.f32 %f29, %f27, %f20, %f28; |
| mov.f32 %f30, 0f3EAAAA88; |
| fma.rn.f32 %f31, %f29, %f20, %f30; |
| mul.rn.f32 %f32, %f20, %f37; |
| fma.rn.f32 %f33, %f31, %f32, %f37; |
| // When abs(r) equals exactly 0f3A00B43C the remainder itself is used in |
| // place of the polynomial value. |
| abs.f32 %f34, %f37; |
| setp.eq.f32 %p10, %f34, 0f3A00B43C; |
| selp.f32 %f38, %f37, %f33, %p10; |
| // Even quadrant: store %f38 as is. Odd quadrant: fall into BB103_13. |
| and.b32 %r59, %r67, 1; |
| setp.eq.b32 %p11, %r59, 1; |
| @!%p11 bra BB103_14; |
| bra.uni BB103_13; |
| |
| BB103_13: |
| // Odd quadrant: result = -(approximate reciprocal of the value). |
| // inline asm |
| rcp.approx.ftz.f32 %f35,%f38; |
| // inline asm |
| neg.f32 %f38, %f35; |
| |
| BB103_14: |
| cvta.to.global.u64 %rd19, %rd8; |
| add.s64 %rd21, %rd19, %rd10; |
| st.global.f32 [%rd21], %f38; |
| |
| BB103_15: |
| ret; |
| } |
| |
| // .globl matrix_tanh_d |
| // matrix_tanh_d: element-wise f64 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f64 input array |
| // param_1 : u64 generic address of the f64 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). For abs(x) below 0d3FE4F92224DD2F1A (roughly 0.655) |
| // or NaN the result is x + x * (x^2 * poly(x^2)); otherwise it is |
| // 1 - 2 / (E + 1) with E approximating exp(2 * abs(x)), forced to 1.0 for |
| // very large magnitudes, with the sign of x copied onto it. |
| .visible .entry matrix_tanh_d( |
| .param .u64 matrix_tanh_d_param_0, |
| .param .u64 matrix_tanh_d_param_1, |
| .param .u32 matrix_tanh_d_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<5>; |
| .reg .b32 %r<13>; |
| .reg .f64 %fd<72>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_tanh_d_param_0]; |
| ld.param.u64 %rd2, [matrix_tanh_d_param_1]; |
| ld.param.u32 %r4, [matrix_tanh_d_param_2]; |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r6, %r5, %r7; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r4; |
| @%p1 bra BB104_5; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd4 is reused for the store at the end. |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| // %r2 = high word of x (kept for the sign), %fd2 = abs(x). |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r2}, %fd1; |
| } |
| and.b32 %r3, %r2, 2147483647; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r8, %temp}, %fd1; |
| } |
| mov.b64 %fd2, {%r8, %r3}; |
| // ltu is true for small magnitudes and for unordered operands, so NaN |
| // inputs also take the polynomial path. |
| setp.ltu.f64 %p2, %fd2, 0d3FE4F92224DD2F1A; |
| @%p2 bra BB104_3; |
| bra.uni BB104_2; |
| |
| BB104_3: |
| // Small-argument path: Horner evaluation in x^2 (%fd47), then |
| // result = x + x * (x^2 * poly(x^2)). |
| mul.f64 %fd47, %fd1, %fd1; |
| mov.f64 %fd48, 0d3F14359F420AFC3D; |
| mov.f64 %fd49, 0dBEF0BC46E2F5E964; |
| fma.rn.f64 %fd50, %fd49, %fd47, %fd48; |
| mov.f64 %fd51, 0dBF2DF9F0728C5D84; |
| fma.rn.f64 %fd52, %fd50, %fd47, %fd51; |
| mov.f64 %fd53, 0d3F4337D1CEC4F033; |
| fma.rn.f64 %fd54, %fd52, %fd47, %fd53; |
| mov.f64 %fd55, 0dBF57D6E9674335B3; |
| fma.rn.f64 %fd56, %fd54, %fd47, %fd55; |
| mov.f64 %fd57, 0d3F6D6D000D7AAD3D; |
| fma.rn.f64 %fd58, %fd56, %fd47, %fd57; |
| mov.f64 %fd59, 0dBF8226E1F3CF1EF5; |
| fma.rn.f64 %fd60, %fd58, %fd47, %fd59; |
| mov.f64 %fd61, 0d3F9664F47EC0C8CF; |
| fma.rn.f64 %fd62, %fd60, %fd47, %fd61; |
| mov.f64 %fd63, 0dBFABA1BA1B80AB40; |
| fma.rn.f64 %fd64, %fd62, %fd47, %fd63; |
| mov.f64 %fd65, 0d3FC111111110FA4A; |
| fma.rn.f64 %fd66, %fd64, %fd47, %fd65; |
| mov.f64 %fd67, 0dBFD5555555555550; |
| fma.rn.f64 %fd68, %fd66, %fd47, %fd67; |
| mov.f64 %fd69, 0d0000000000000000; |
| fma.rn.f64 %fd70, %fd68, %fd47, %fd69; |
| fma.rn.f64 %fd71, %fd70, %fd1, %fd1; |
| bra.uni BB104_4; |
| |
| BB104_2: |
| // %fd6 = 2 * abs(x). n = round-to-nearest(float(%fd6) * 0f3FB8AA3B) (about |
| // log2(e)), computed in f32; %fd10 = %fd6 - n * 0d3FE62E42FEFA39EF (ln2). |
| add.f64 %fd6, %fd2, %fd2; |
| cvt.rn.f32.f64 %f1, %fd6; |
| mul.f32 %f2, %f1, 0f3FB8AA3B; |
| cvt.rni.f32.f32 %f3, %f2; |
| cvt.f64.f32 %fd7, %f3; |
| neg.f64 %fd8, %fd7; |
| mov.f64 %fd9, 0d3FE62E42FEFA39EF; |
| fma.rn.f64 %fd10, %fd8, %fd9, %fd6; |
| // Horner evaluation in %fd10; %fd31 = r + r * (r * poly(r)). |
| mov.f64 %fd11, 0d3E928A27F89B6999; |
| mov.f64 %fd12, 0d3E5AE904A4741B81; |
| fma.rn.f64 %fd13, %fd12, %fd10, %fd11; |
| mov.f64 %fd14, 0d3EC71DE715FF7E07; |
| fma.rn.f64 %fd15, %fd13, %fd10, %fd14; |
| mov.f64 %fd16, 0d3EFA019A6B0AC45A; |
| fma.rn.f64 %fd17, %fd15, %fd10, %fd16; |
| mov.f64 %fd18, 0d3F2A01A017EED94F; |
| fma.rn.f64 %fd19, %fd17, %fd10, %fd18; |
| mov.f64 %fd20, 0d3F56C16C17F2A71B; |
| fma.rn.f64 %fd21, %fd19, %fd10, %fd20; |
| mov.f64 %fd22, 0d3F811111111173C4; |
| fma.rn.f64 %fd23, %fd21, %fd10, %fd22; |
| mov.f64 %fd24, 0d3FA555555555211A; |
| fma.rn.f64 %fd25, %fd23, %fd10, %fd24; |
| mov.f64 %fd26, 0d3FC5555555555540; |
| fma.rn.f64 %fd27, %fd25, %fd10, %fd26; |
| mov.f64 %fd28, 0d3FE0000000000005; |
| fma.rn.f64 %fd29, %fd27, %fd10, %fd28; |
| mul.f64 %fd30, %fd10, %fd29; |
| fma.rn.f64 %fd31, %fd30, %fd10, %fd10; |
| // %fd32 = 2^n. %fd36 = (1 - 2^n) - %fd31 * 2^n and %fd38 = 2 - %fd36, so |
| // %fd38 is 1 + 2^n * (1 + %fd31). |
| ex2.approx.ftz.f32 %f4, %f3; |
| cvt.f64.f32 %fd32, %f4; |
| mov.f64 %fd33, 0d3FF0000000000000; |
| sub.f64 %fd34, %fd33, %fd32; |
| neg.f64 %fd35, %fd31; |
| fma.rn.f64 %fd36, %fd35, %fd32, %fd34; |
| mov.f64 %fd37, 0d4000000000000000; |
| sub.f64 %fd38, %fd37, %fd36; |
| // Approximate reciprocal of %fd38 refined with FMA correction steps; |
| // %fd45 = 1 - 2 * reciprocal. |
| rcp.approx.ftz.f64 %fd39, %fd38; |
| neg.f64 %fd40, %fd38; |
| fma.rn.f64 %fd41, %fd40, %fd39, %fd33; |
| fma.rn.f64 %fd42, %fd41, %fd41, %fd41; |
| fma.rn.f64 %fd43, %fd42, %fd39, %fd39; |
| neg.f64 %fd44, %fd43; |
| fma.rn.f64 %fd45, %fd37, %fd44, %fd33; |
| // High word of abs(x) above 1077088193: the result is exactly 1.0. |
| setp.gt.u32 %p3, %r3, 1077088193; |
| selp.f64 %fd46, 0d3FF0000000000000, %fd45, %p3; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r9, %temp}, %fd46; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r10}, %fd46; |
| } |
| // Copy the sign bit of x onto the result. |
| and.b32 %r11, %r2, -2147483648; |
| or.b32 %r12, %r10, %r11; |
| mov.b64 %fd71, {%r9, %r12}; |
| |
| BB104_4: |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd71; |
| |
| BB104_5: |
| ret; |
| } |
| |
| // .globl matrix_tanh_f |
| // matrix_tanh_f: element-wise f32 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f32 input array |
| // param_1 : u64 generic address of the f32 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). For abs(x) below 0f3F19999A (0.6) or NaN the result |
| // is x + x * (x^2 * poly(x^2)); otherwise it is 1 - 2 / (E + 1) with E |
| // approximating exp(2 * abs(x)), forced to 1.0 once abs(x) reaches |
| // 0f41102CB4 (about 9.01), with the sign of x copied onto it. |
| .visible .entry matrix_tanh_f( |
| .param .u64 matrix_tanh_f_param_0, |
| .param .u64 matrix_tanh_f_param_1, |
| .param .u32 matrix_tanh_f_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<24>; |
| .reg .b32 %r<11>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_tanh_f_param_0]; |
| ld.param.u64 %rd2, [matrix_tanh_f_param_1]; |
| ld.param.u32 %r2, [matrix_tanh_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB105_5; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd4 is reused for the store at the end. |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| abs.f32 %f2, %f1; |
| // ltu is true for small magnitudes and for unordered operands, so NaN |
| // inputs also take the polynomial path. |
| setp.ltu.f32 %p2, %f2, 0f3F19999A; |
| @%p2 bra BB105_3; |
| bra.uni BB105_2; |
| |
| BB105_3: |
| // Small-argument path: Horner evaluation in x^2 (%f13), then |
| // result = x + x * (x^2 * poly(x^2)). |
| mul.f32 %f13, %f1, %f1; |
| mov.f32 %f14, 0fBD563CAE; |
| mov.f32 %f15, 0f3C80F082; |
| fma.rn.f32 %f16, %f15, %f13, %f14; |
| mov.f32 %f17, 0f3E085941; |
| fma.rn.f32 %f18, %f16, %f13, %f17; |
| mov.f32 %f19, 0fBEAAA9ED; |
| fma.rn.f32 %f20, %f18, %f13, %f19; |
| mov.f32 %f21, 0f00000000; |
| fma.rn.f32 %f22, %f20, %f13, %f21; |
| fma.rn.f32 %f23, %f22, %f1, %f1; |
| bra.uni BB105_4; |
| |
| BB105_2: |
| // %f9 = 2^(abs(x) * 0f4038AA3B) (the constant is about 2 * log2(e)); |
| // %f12 = 1 - 2 / (%f9 + 1) using an approximate reciprocal. |
| mul.f32 %f8, %f2, 0f4038AA3B; |
| ex2.approx.ftz.f32 %f9, %f8; |
| add.f32 %f7, %f9, 0f3F800000; |
| // inline asm |
| rcp.approx.ftz.f32 %f6,%f7; |
| // inline asm |
| mov.f32 %f10, 0f3F800000; |
| mov.f32 %f11, 0fC0000000; |
| fma.rn.f32 %f12, %f6, %f11, %f10; |
| mov.b32 %r6, %f12; |
| // At or above the threshold the bit pattern of 1.0 (1065353216) is used; |
| // the sign bit of x is then OR-ed in. |
| setp.ltu.f32 %p3, %f2, 0f41102CB4; |
| selp.b32 %r7, %r6, 1065353216, %p3; |
| mov.b32 %r8, %f1; |
| and.b32 %r9, %r8, -2147483648; |
| or.b32 %r10, %r7, %r9; |
| mov.b32 %f23, %r10; |
| |
| BB105_4: |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f32 [%rd8], %f23; |
| |
| BB105_5: |
| ret; |
| } |
| |
| // .globl matrix_asin_d |
| // matrix_asin_d: element-wise f64 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f64 input array |
| // param_1 : u64 generic address of the f64 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). Small magnitudes use x + x * (x^2 * poly(x^2)). |
| // Larger ones evaluate the same polynomial on z = 0.5 - 0.5 * abs(x) |
| // together with sqrt(z), combine the pieces with two additions of |
| // 0d3FE921FB54442D18 (about pi/4), and copy the sign of x onto the result. |
| // NOTE(review): this has the shape of the CUDA asin() expansion -- confirm |
| // against the CUDA source this PTX was generated from. |
| .visible .entry matrix_asin_d( |
| .param .u64 matrix_asin_d_param_0, |
| .param .u64 matrix_asin_d_param_1, |
| .param .u32 matrix_asin_d_param_2 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<3>; |
| .reg .b32 %r<15>; |
| .reg .f64 %fd<83>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_asin_d_param_0]; |
| ld.param.u64 %rd2, [matrix_asin_d_param_1]; |
| ld.param.u32 %r3, [matrix_asin_d_param_2]; |
| mov.u32 %r4, %ctaid.x; |
| mov.u32 %r5, %ntid.x; |
| mov.u32 %r6, %tid.x; |
| mad.lo.s32 %r1, %r5, %r4, %r6; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r3; |
| @%p1 bra BB106_5; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd4 is reused for the store at the end. |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r2}, %fd1; |
| } |
| // The high word of x is reinterpreted as an f32 purely to compare its |
| // magnitude against the bit pattern 0f3FE26666; this picks the path. |
| mov.b32 %f1, %r2; |
| abs.f32 %f2, %f1; |
| setp.lt.f32 %p2, %f2, 0f3FE26666; |
| @%p2 bra BB106_3; |
| bra.uni BB106_2; |
| |
| BB106_3: |
| // Small-argument path: Horner evaluation in x^2 (%fd55), then |
| // result = x + x * (x^2 * poly(x^2)). |
| mul.f64 %fd55, %fd1, %fd1; |
| mov.f64 %fd56, 0dBFB3823B180754AF; |
| mov.f64 %fd57, 0d3FB0066BDC1895E9; |
| fma.rn.f64 %fd58, %fd57, %fd55, %fd56; |
| mov.f64 %fd59, 0d3FB11E52CC2F79AE; |
| fma.rn.f64 %fd60, %fd58, %fd55, %fd59; |
| mov.f64 %fd61, 0dBF924EAF3526861B; |
| fma.rn.f64 %fd62, %fd60, %fd55, %fd61; |
| mov.f64 %fd63, 0d3F91DF02A31E6CB7; |
| fma.rn.f64 %fd64, %fd62, %fd55, %fd63; |
| mov.f64 %fd65, 0d3F847D18B0EEC6CC; |
| fma.rn.f64 %fd66, %fd64, %fd55, %fd65; |
| mov.f64 %fd67, 0d3F8D0AF961BA53B0; |
| fma.rn.f64 %fd68, %fd66, %fd55, %fd67; |
| mov.f64 %fd69, 0d3F91BF7734CF1C48; |
| fma.rn.f64 %fd70, %fd68, %fd55, %fd69; |
| mov.f64 %fd71, 0d3F96E91483144EF7; |
| fma.rn.f64 %fd72, %fd70, %fd55, %fd71; |
| mov.f64 %fd73, 0d3F9F1C6E0A4F9F81; |
| fma.rn.f64 %fd74, %fd72, %fd55, %fd73; |
| mov.f64 %fd75, 0d3FA6DB6DC27FA92B; |
| fma.rn.f64 %fd76, %fd74, %fd55, %fd75; |
| mov.f64 %fd77, 0d3FB333333320F91B; |
| fma.rn.f64 %fd78, %fd76, %fd55, %fd77; |
| mov.f64 %fd79, 0d3FC5555555555F4D; |
| fma.rn.f64 %fd80, %fd78, %fd55, %fd79; |
| mul.f64 %fd81, %fd55, %fd80; |
| fma.rn.f64 %fd82, %fd81, %fd1, %fd1; |
| bra.uni BB106_4; |
| |
| BB106_2: |
| // z = 0.5 - 0.5 * abs(x) in %fd6. |
| abs.f64 %fd7, %fd1; |
| mov.f64 %fd8, 0d3FE0000000000000; |
| mov.f64 %fd9, 0dBFE0000000000000; |
| fma.rn.f64 %fd6, %fd9, %fd7, %fd8; |
| // sqrt(z) from an approximate reciprocal square root: %fd10 is that |
| // estimate with its exponent field lowered by one (halved), and the FMA |
| // chain below refines z * rsqrt(z) into %fd20. |
| // inline asm |
| rsqrt.approx.ftz.f64 %fd5, %fd6; |
| // inline asm |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r7, %temp}, %fd5; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r8}, %fd5; |
| } |
| add.s32 %r9, %r8, -1048576; |
| mov.b64 %fd10, {%r7, %r9}; |
| mul.f64 %fd11, %fd6, %fd5; |
| neg.f64 %fd12, %fd11; |
| fma.rn.f64 %fd13, %fd11, %fd12, %fd6; |
| fma.rn.f64 %fd14, %fd13, %fd10, %fd11; |
| neg.f64 %fd15, %fd14; |
| mov.f64 %fd16, 0d3FF0000000000000; |
| fma.rn.f64 %fd17, %fd5, %fd15, %fd16; |
| fma.rn.f64 %fd18, %fd17, %fd10, %fd10; |
| fma.rn.f64 %fd19, %fd14, %fd15, %fd6; |
| fma.rn.f64 %fd20, %fd19, %fd18, %fd14; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r10}, %fd6; |
| } |
| // Special cases: a z with its sign bit set gives the NaN pattern |
| // 0dFFF8000000000000; z equal to zero or unordered (equ) passes z through. |
| setp.lt.s32 %p3, %r10, 0; |
| selp.f64 %fd21, 0dFFF8000000000000, %fd20, %p3; |
| setp.equ.f64 %p4, %fd6, 0d0000000000000000; |
| selp.f64 %fd22, %fd6, %fd21, %p4; |
| // Same coefficient set as the small-argument path, evaluated in z. |
| mov.f64 %fd23, 0dBFB3823B180754AF; |
| mov.f64 %fd24, 0d3FB0066BDC1895E9; |
| fma.rn.f64 %fd25, %fd24, %fd6, %fd23; |
| mov.f64 %fd26, 0d3FB11E52CC2F79AE; |
| fma.rn.f64 %fd27, %fd25, %fd6, %fd26; |
| mov.f64 %fd28, 0dBF924EAF3526861B; |
| fma.rn.f64 %fd29, %fd27, %fd6, %fd28; |
| mov.f64 %fd30, 0d3F91DF02A31E6CB7; |
| fma.rn.f64 %fd31, %fd29, %fd6, %fd30; |
| mov.f64 %fd32, 0d3F847D18B0EEC6CC; |
| fma.rn.f64 %fd33, %fd31, %fd6, %fd32; |
| mov.f64 %fd34, 0d3F8D0AF961BA53B0; |
| fma.rn.f64 %fd35, %fd33, %fd6, %fd34; |
| mov.f64 %fd36, 0d3F91BF7734CF1C48; |
| fma.rn.f64 %fd37, %fd35, %fd6, %fd36; |
| mov.f64 %fd38, 0d3F96E91483144EF7; |
| fma.rn.f64 %fd39, %fd37, %fd6, %fd38; |
| mov.f64 %fd40, 0d3F9F1C6E0A4F9F81; |
| fma.rn.f64 %fd41, %fd39, %fd6, %fd40; |
| mov.f64 %fd42, 0d3FA6DB6DC27FA92B; |
| fma.rn.f64 %fd43, %fd41, %fd6, %fd42; |
| mov.f64 %fd44, 0d3FB333333320F91B; |
| fma.rn.f64 %fd45, %fd43, %fd6, %fd44; |
| mov.f64 %fd46, 0d3FC5555555555F4D; |
| fma.rn.f64 %fd47, %fd45, %fd6, %fd46; |
| mul.f64 %fd48, %fd6, %fd47; |
| // s = -2 * sqrt(z). The result is assembled as |
| // ((s + c) + (s * (z * poly) + 0d3C91A62633145C07)) + c |
| // with c = 0d3FE921FB54442D18. |
| mul.f64 %fd49, %fd22, 0dC000000000000000; |
| mov.f64 %fd50, 0d3C91A62633145C07; |
| fma.rn.f64 %fd51, %fd49, %fd48, %fd50; |
| add.f64 %fd52, %fd49, 0d3FE921FB54442D18; |
| add.f64 %fd53, %fd52, %fd51; |
| add.f64 %fd54, %fd53, 0d3FE921FB54442D18; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r11, %temp}, %fd54; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r12}, %fd54; |
| } |
| // Copy the sign bit of x onto the result. |
| and.b32 %r13, %r2, -2147483648; |
| or.b32 %r14, %r12, %r13; |
| mov.b64 %fd82, {%r11, %r14}; |
| |
| BB106_4: |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd82; |
| |
| BB106_5: |
| ret; |
| } |
| |
| // .globl matrix_asin_f |
| // matrix_asin_f: element-wise f32 kernel, one array element per thread. |
| // param_0 : u64 generic address of the f32 input array |
| // param_1 : u64 generic address of the f32 output array |
| // param_2 : u32 element count |
| // Thread i = ntid.x * ctaid.x + tid.x returns at once when i >= count |
| // (unsigned compare). The kernel is branch-free after that: it picks |
| // t = abs(x), or t = sqrt((1 - abs(x)) * 0.5) when abs(x) > 0f3F11EB85 |
| // (about 0.57), evaluates p = t + t * (t^2 * poly(t^2)), maps the second |
| // case to 0f3FC90FDB - 2 * p (the constant is about pi/2), and copies the |
| // sign of x onto the result unless the result is NaN. |
| .visible .entry matrix_asin_f( |
| .param .u64 matrix_asin_f_param_0, |
| .param .u64 matrix_asin_f_param_1, |
| .param .u32 matrix_asin_f_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<26>; |
| .reg .b32 %r<10>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [matrix_asin_f_param_0]; |
| ld.param.u64 %rd2, [matrix_asin_f_param_1]; |
| ld.param.u32 %r2, [matrix_asin_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads past the element count do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB107_2; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Byte offset of element i. The index is zero-extended (mul.wide.u32) so it |
| // agrees with the unsigned bounds check above; sign-extending it would give |
| // a negative offset for i >= 2^31. %rd4 is reused for the store below. |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| abs.f32 %f2, %f1; |
| // %f6 = sqrt((1 - abs(x)) * 0.5); %f7 selects it for large magnitudes. |
| mov.f32 %f3, 0f3F800000; |
| sub.f32 %f4, %f3, %f2; |
| mul.f32 %f5, %f4, 0f3F000000; |
| sqrt.rn.f32 %f6, %f5; |
| setp.gt.f32 %p2, %f2, 0f3F11EB85; |
| selp.f32 %f7, %f6, %f2, %p2; |
| // Horner evaluation in t^2 (%f8); %f19 = t + t * (t^2 * poly(t^2)). |
| mul.f32 %f8, %f7, %f7; |
| mov.f32 %f9, 0f3C94D2E9; |
| mov.f32 %f10, 0f3D53F941; |
| fma.rn.f32 %f11, %f10, %f8, %f9; |
| mov.f32 %f12, 0f3D3F841F; |
| fma.rn.f32 %f13, %f11, %f8, %f12; |
| mov.f32 %f14, 0f3D994929; |
| fma.rn.f32 %f15, %f13, %f8, %f14; |
| mov.f32 %f16, 0f3E2AAB94; |
| fma.rn.f32 %f17, %f15, %f8, %f16; |
| mul.f32 %f18, %f8, %f17; |
| fma.rn.f32 %f19, %f18, %f7, %f7; |
| // Large-magnitude case: result = 0f3FC90FDB - 2 * p. |
| mov.f32 %f20, 0f3FC90FDB; |
| mov.f32 %f21, 0fC0000000; |
| fma.rn.f32 %f22, %f21, %f19, %f20; |
| selp.f32 %f23, %f22, %f19, %p2; |
| // gtu against +inf is true only for an unordered compare, i.e. when the |
| // result is NaN; in that case it is stored without the sign bit merge. |
| setp.gtu.f32 %p3, %f23, 0f7F800000; |
| mov.b32 %r6, %f23; |
| mov.b32 %r7, %f1; |
| and.b32 %r8, %r7, -2147483648; |
| or.b32 %r9, %r6, %r8; |
| mov.b32 %f24, %r9; |
| selp.f32 %f25, %f23, %f24, %p3; |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f32 [%rd7], %f25; |
| |
| BB107_2: |
| ret; |
| } |
| |
| // .globl matrix_acos_d |
| .visible .entry matrix_acos_d( |
| .param .u64 matrix_acos_d_param_0, |
| .param .u64 matrix_acos_d_param_1, |
| .param .u32 matrix_acos_d_param_2 |
| ) |
| { |
| .reg .pred %p<7>; |
| .reg .b32 %r<17>; |
| .reg .f64 %fd<97>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_acos_d_param_0]; |
| ld.param.u64 %rd2, [matrix_acos_d_param_1]; |
| ld.param.u32 %r4, [matrix_acos_d_param_2]; |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r6, %r5, %r7; |
| setp.ge.u32 %p1, %r1, %r4; |
| @%p1 bra BB108_14; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.s32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd16, [%rd5]; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r2}, %fd16; |
| } |
| abs.f64 %fd1, %fd16; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r8}, %fd1; |
| } |
| setp.lt.s32 %p2, %r8, 1071801958; |
| @%p2 bra BB108_9; |
| bra.uni BB108_2; |
| |
| BB108_9: |
| mul.f64 %fd62, %fd1, %fd1; |
| mov.f64 %fd63, 0dBFB3823B180754AF; |
| mov.f64 %fd64, 0d3FB0066BDC1895E9; |
| fma.rn.f64 %fd65, %fd64, %fd62, %fd63; |
| mov.f64 %fd66, 0d3FB11E52CC2F79AE; |
| fma.rn.f64 %fd67, %fd65, %fd62, %fd66; |
| mov.f64 %fd68, 0dBF924EAF3526861B; |
| fma.rn.f64 %fd69, %fd67, %fd62, %fd68; |
| mov.f64 %fd70, 0d3F91DF02A31E6CB7; |
| fma.rn.f64 %fd71, %fd69, %fd62, %fd70; |
| mov.f64 %fd72, 0d3F847D18B0EEC6CC; |
| fma.rn.f64 %fd73, %fd71, %fd62, %fd72; |
| mov.f64 %fd74, 0d3F8D0AF961BA53B0; |
| fma.rn.f64 %fd75, %fd73, %fd62, %fd74; |
| mov.f64 %fd76, 0d3F91BF7734CF1C48; |
| fma.rn.f64 %fd77, %fd75, %fd62, %fd76; |
| mov.f64 %fd78, 0d3F96E91483144EF7; |
| fma.rn.f64 %fd79, %fd77, %fd62, %fd78; |
| mov.f64 %fd80, 0d3F9F1C6E0A4F9F81; |
| fma.rn.f64 %fd81, %fd79, %fd62, %fd80; |
| mov.f64 %fd82, 0d3FA6DB6DC27FA92B; |
| fma.rn.f64 %fd83, %fd81, %fd62, %fd82; |
| mov.f64 %fd84, 0d3FB333333320F91B; |
| fma.rn.f64 %fd85, %fd83, %fd62, %fd84; |
| mov.f64 %fd86, 0d3FC5555555555F4D; |
| fma.rn.f64 %fd87, %fd85, %fd62, %fd86; |
| mul.f64 %fd88, %fd62, %fd87; |
| fma.rn.f64 %fd10, %fd88, %fd1, %fd1; |
| setp.lt.s32 %p6, %r2, 0; |
| @%p6 bra BB108_11; |
| |
| mov.f64 %fd89, 0dBC91A62633145C07; |
| add.rn.f64 %fd90, %fd10, %fd89; |
| neg.f64 %fd95, %fd90; |
| bra.uni BB108_12; |
| |
| BB108_2: |
| mov.f64 %fd19, 0d3FF0000000000000; |
| sub.f64 %fd2, %fd19, %fd1; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r9, %temp}, %fd2; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r3}, %fd2; |
| } |
| add.s32 %r10, %r3, -1048576; |
| mov.b64 %fd18, {%r9, %r10}; |
| // inline asm |
| rsqrt.approx.ftz.f64 %fd17, %fd18; |
| // inline asm |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r11, %temp}, %fd17; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r12}, %fd17; |
| } |
| add.s32 %r13, %r12, -1048576; |
| mov.b64 %fd20, {%r11, %r13}; |
| mul.f64 %fd21, %fd18, %fd17; |
| neg.f64 %fd22, %fd21; |
| fma.rn.f64 %fd23, %fd21, %fd22, %fd18; |
| fma.rn.f64 %fd24, %fd23, %fd20, %fd21; |
| neg.f64 %fd25, %fd24; |
| fma.rn.f64 %fd26, %fd17, %fd25, %fd19; |
| fma.rn.f64 %fd27, %fd26, %fd20, %fd20; |
| fma.rn.f64 %fd28, %fd24, %fd25, %fd18; |
| fma.rn.f64 %fd3, %fd28, %fd27, %fd24; |
| setp.lt.s32 %p3, %r3, 1; |
| @%p3 bra BB108_4; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r14}, %fd3; |
| } |
| add.s32 %r15, %r14, 1048576; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r16, %temp}, %fd3; |
| } |
| mov.b64 %fd29, {%r16, %r15}; |
| mov.f64 %fd30, 0dBEBAC2FE66FAAC4B; |
| mov.f64 %fd31, 0d3EC715B371155F70; |
| fma.rn.f64 %fd32, %fd31, %fd2, %fd30; |
| mov.f64 %fd33, 0d3ED9A9B88EFCD9B8; |
| fma.rn.f64 %fd34, %fd32, %fd2, %fd33; |
| mov.f64 %fd35, 0d3EDD0F40A8A0C4C3; |
| fma.rn.f64 %fd36, %fd34, %fd2, %fd35; |
| mov.f64 %fd37, 0d3EF46D4CFA9E0E1F; |
| fma.rn.f64 %fd38, %fd36, %fd2, %fd37; |
| mov.f64 %fd39, 0d3F079C168D1E2422; |
| fma.rn.f64 %fd40, %fd38, %fd2, %fd39; |
| mov.f64 %fd41, 0d3F1C9A88C3BCA540; |
| fma.rn.f64 %fd42, %fd40, %fd2, %fd41; |
| mov.f64 %fd43, 0d3F31C4E64BD476DF; |
| fma.rn.f64 %fd44, %fd42, %fd2, %fd43; |
| mov.f64 %fd45, 0d3F46E8BA60009C8F; |
| fma.rn.f64 %fd46, %fd44, %fd2, %fd45; |
| mov.f64 %fd47, 0d3F5F1C71C62B05A2; |
| fma.rn.f64 %fd48, %fd46, %fd2, %fd47; |
| mov.f64 %fd49, 0d3F76DB6DB6DC9F2C; |
| fma.rn.f64 %fd50, %fd48, %fd2, %fd49; |
| mov.f64 %fd51, 0d3F9333333333329C; |
| fma.rn.f64 %fd52, %fd50, %fd2, %fd51; |
| mov.f64 %fd53, 0d3FB5555555555555; |
| fma.rn.f64 %fd54, %fd52, %fd2, %fd53; |
| mul.f64 %fd55, %fd2, %fd54; |
| fma.rn.f64 %fd94, %fd55, %fd29, %fd29; |
| bra.uni BB108_5; |
| |
| BB108_11: |
| mov.f64 %fd91, 0d3C91A62633145C07; |
| add.rn.f64 %fd95, %fd10, %fd91; |
| |
| BB108_12: |
| mov.f64 %fd92, 0d3FF921FB54442D18; |
| add.rn.f64 %fd94, %fd92, %fd95; |
| bra.uni BB108_13; |
| |
| BB108_4: |
| mov.f64 %fd56, 0d0000000000000000; |
| mul.rn.f64 %fd94, %fd1, %fd56; |
| |
| BB108_5: |
| setp.gt.s32 %p4, %r3, -1; |
| @%p4 bra BB108_7; |
| |
| mov.f64 %fd57, 0d7FF0000000000000; |
| mul.rn.f64 %fd94, %fd94, %fd57; |
| |
| BB108_7: |
| setp.gt.s32 %p5, %r2, -1; |
| @%p5 bra BB108_13; |
| |
| mov.f64 %fd58, 0dBCA1A62633145C07; |
| add.rn.f64 %fd59, %fd94, %fd58; |
| neg.f64 %fd60, %fd59; |
| mov.f64 %fd61, 0d400921FB54442D18; |
| add.rn.f64 %fd94, %fd61, %fd60; |
| |
| BB108_13: |
| cvta.to.global.u64 %rd6, %rd2; |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd94; |
| |
| BB108_14: |
| ret; |
| } |
| |
| // .globl matrix_acos_f |
| // matrix_acos_f: element-wise f32 kernel, one element per thread. |
| // param_0 : generic address of the f32 input array |
| // param_1 : generic address of the f32 output array |
| // param_2 : element count n (compared as unsigned) |
| // Thread i = ntid.x * ctaid.x + tid.x exits when i >= n. |
| // With a = |x| and P(s) = s + s^3 * Q(s^2) (presumably an arcsine approximation): |
| // a > 0.57 : r = 2 * P(sqrt((1 - a) / 2)) |
| // a <= 0.57 : r = pi/2 - P(a) |
| // The stored value is (x < 0) ? pi - r : r. |
| .visible .entry matrix_acos_f( |
| .param .u64 matrix_acos_f_param_0, |
| .param .u64 matrix_acos_f_param_1, |
| .param .u32 matrix_acos_f_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<27>; |
| .reg .b32 %r<6>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd1, [matrix_acos_f_param_0]; |
| ld.param.u64 %rd2, [matrix_acos_f_param_1]; |
| ld.param.u32 %r2, [matrix_acos_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads at or past n do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB109_2; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Widen the index as unsigned so it matches the unsigned bounds check above; |
| // a signed widen would turn indices >= 2^31 into negative byte offsets. |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| abs.f32 %f2, %f1; |
| // %f6 = sqrt((1 - a) * 0.5) |
| mov.f32 %f3, 0f3F800000; |
| sub.f32 %f4, %f3, %f2; |
| mul.f32 %f5, %f4, 0f3F000000; |
| sqrt.rn.f32 %f6, %f5; |
| // %p2 selects the large-argument path (a > 0.57); %f7 is the polynomial argument s. |
| setp.gt.f32 %p2, %f2, 0f3F11EB85; |
| selp.f32 %f7, %f6, %f2, %p2; |
| // Horner evaluation of Q(s^2), then P(s) = s + s * s^2 * Q(s^2) in %f19. |
| mul.f32 %f8, %f7, %f7; |
| mov.f32 %f9, 0f3C94D2E9; |
| mov.f32 %f10, 0f3D53F941; |
| fma.rn.f32 %f11, %f10, %f8, %f9; |
| mov.f32 %f12, 0f3D3F841F; |
| fma.rn.f32 %f13, %f11, %f8, %f12; |
| mov.f32 %f14, 0f3D994929; |
| fma.rn.f32 %f15, %f13, %f8, %f14; |
| mov.f32 %f16, 0f3E2AAB94; |
| fma.rn.f32 %f17, %f15, %f8, %f16; |
| mul.f32 %f18, %f8, %f17; |
| fma.rn.f32 %f19, %f18, %f7, %f7; |
| // Large path: 2 * P(s). Small path: pi/2 - P(a). |
| add.f32 %f20, %f19, %f19; |
| mov.f32 %f21, 0f3FC90FDB; |
| sub.f32 %f22, %f21, %f19; |
| selp.f32 %f23, %f20, %f22, %p2; |
| // Negative inputs are reflected: pi - r. |
| setp.lt.f32 %p3, %f1, 0f00000000; |
| mov.f32 %f24, 0f40490FDB; |
| sub.f32 %f25, %f24, %f23; |
| selp.f32 %f26, %f25, %f23, %p3; |
| cvta.to.global.u64 %rd6, %rd2; |
| // The output uses the same byte offset as the input. |
| add.s64 %rd7, %rd6, %rd4; |
| st.global.f32 [%rd7], %f26; |
| |
| BB109_2: |
| ret; |
| } |
| |
| // .globl matrix_atan_d |
| // matrix_atan_d: element-wise f64 kernel, one element per thread. |
| // param_0 : generic address of the f64 input array |
| // param_1 : generic address of the f64 output array |
| // param_2 : element count n (compared as unsigned) |
| // Thread i = ntid.x * ctaid.x + tid.x exits when i >= n. |
| // With a = |x|: t = a when a <= 1 (or a is NaN), otherwise t = 1/a (0 when a is +inf). |
| // An odd polynomial P(t) = t + t^3 * Q(t^2) is evaluated (presumably an arctangent |
| // approximation); the result is pi/2 - P(t) when a > 1, else P(t), and the sign bit |
| // of x is OR-ed into the result before it is stored. |
| .visible .entry matrix_atan_d( |
| .param .u64 matrix_atan_d_param_0, |
| .param .u64 matrix_atan_d_param_1, |
| .param .u32 matrix_atan_d_param_2 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .b32 %r<11>; |
| .reg .f64 %fd<56>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_atan_d_param_0]; |
| ld.param.u64 %rd2, [matrix_atan_d_param_1]; |
| ld.param.u32 %r2, [matrix_atan_d_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads at or past n do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB110_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Widen the index as unsigned so it matches the unsigned bounds check above; |
| // a signed widen would turn indices >= 2^31 into negative byte offsets. |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd1, [%rd5]; |
| abs.f64 %fd2, %fd1; |
| // a <= 1 or unordered (NaN): use a directly as the polynomial argument. |
| setp.leu.f64 %p2, %fd2, 0d3FF0000000000000; |
| mov.f64 %fd55, %fd2; |
| @%p2 bra BB110_3; |
| |
| // a > 1: approximate reciprocal refined with FMA steps; +inf maps to 0. |
| rcp.approx.ftz.f64 %fd5, %fd2; |
| neg.f64 %fd6, %fd2; |
| mov.f64 %fd7, 0d3FF0000000000000; |
| fma.rn.f64 %fd8, %fd6, %fd5, %fd7; |
| fma.rn.f64 %fd9, %fd8, %fd8, %fd8; |
| fma.rn.f64 %fd10, %fd9, %fd5, %fd5; |
| setp.eq.f64 %p3, %fd2, 0d7FF0000000000000; |
| selp.f64 %fd55, 0d0000000000000000, %fd10, %p3; |
| |
| BB110_3: |
| cvta.to.global.u64 %rd6, %rd2; |
| // Horner evaluation of Q(t^2) with t = %fd55. |
| mul.f64 %fd11, %fd55, %fd55; |
| mov.f64 %fd12, 0d3F2D3B63DBB65B49; |
| mov.f64 %fd13, 0dBEF53E1D2A25FF7E; |
| fma.rn.f64 %fd14, %fd13, %fd11, %fd12; |
| mov.f64 %fd15, 0dBF5312788DDE082E; |
| fma.rn.f64 %fd16, %fd14, %fd11, %fd15; |
| mov.f64 %fd17, 0d3F6F9690C8249315; |
| fma.rn.f64 %fd18, %fd16, %fd11, %fd17; |
| mov.f64 %fd19, 0dBF82CF5AABC7CF0D; |
| fma.rn.f64 %fd20, %fd18, %fd11, %fd19; |
| mov.f64 %fd21, 0d3F9162B0B2A3BFDE; |
| fma.rn.f64 %fd22, %fd20, %fd11, %fd21; |
| mov.f64 %fd23, 0dBF9A7256FEB6FC6B; |
| fma.rn.f64 %fd24, %fd22, %fd11, %fd23; |
| mov.f64 %fd25, 0d3FA171560CE4A489; |
| fma.rn.f64 %fd26, %fd24, %fd11, %fd25; |
| mov.f64 %fd27, 0dBFA4F44D841450E4; |
| fma.rn.f64 %fd28, %fd26, %fd11, %fd27; |
| mov.f64 %fd29, 0d3FA7EE3D3F36BB95; |
| fma.rn.f64 %fd30, %fd28, %fd11, %fd29; |
| mov.f64 %fd31, 0dBFAAD32AE04A9FD1; |
| fma.rn.f64 %fd32, %fd30, %fd11, %fd31; |
| mov.f64 %fd33, 0d3FAE17813D66954F; |
| fma.rn.f64 %fd34, %fd32, %fd11, %fd33; |
| mov.f64 %fd35, 0dBFB11089CA9A5BCD; |
| fma.rn.f64 %fd36, %fd34, %fd11, %fd35; |
| mov.f64 %fd37, 0d3FB3B12B2DB51738; |
| fma.rn.f64 %fd38, %fd36, %fd11, %fd37; |
| mov.f64 %fd39, 0dBFB745D022F8DC5C; |
| fma.rn.f64 %fd40, %fd38, %fd11, %fd39; |
| mov.f64 %fd41, 0d3FBC71C709DFE927; |
| fma.rn.f64 %fd42, %fd40, %fd11, %fd41; |
| mov.f64 %fd43, 0dBFC2492491FA1744; |
| fma.rn.f64 %fd44, %fd42, %fd11, %fd43; |
| mov.f64 %fd45, 0d3FC99999999840D2; |
| fma.rn.f64 %fd46, %fd44, %fd11, %fd45; |
| mov.f64 %fd47, 0dBFD555555555544C; |
| fma.rn.f64 %fd48, %fd46, %fd11, %fd47; |
| // P(t) = t + t * t^2 * Q(t^2) |
| mul.f64 %fd49, %fd11, %fd48; |
| fma.rn.f64 %fd50, %fd49, %fd55, %fd55; |
| // a > 1 used the reciprocal, so the result becomes pi/2 - P(t). |
| mov.f64 %fd51, 0d3FF921FB54442D18; |
| sub.f64 %fd52, %fd51, %fd50; |
| setp.gt.f64 %p4, %fd2, 0d3FF0000000000000; |
| selp.f64 %fd53, %fd52, %fd50, %p4; |
| // OR the sign bit of x (high word bit 31) into the result's high word. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r6, %temp}, %fd53; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r7}, %fd53; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r8}, %fd1; |
| } |
| and.b32 %r9, %r8, -2147483648; |
| or.b32 %r10, %r7, %r9; |
| mov.b64 %fd54, {%r6, %r10}; |
| // The output uses the same byte offset as the input. |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd54; |
| |
| BB110_4: |
| ret; |
| } |
| |
| // .globl matrix_atan_f |
| // matrix_atan_f: element-wise f32 kernel, one element per thread. |
| // param_0 : generic address of the f32 input array |
| // param_1 : generic address of the f32 output array |
| // param_2 : element count n (compared as unsigned) |
| // Thread i = ntid.x * ctaid.x + tid.x exits when i >= n. |
| // With a = |x|: t = a when a <= 1 (or a is NaN), otherwise t = 1/a. |
| // A rational form R(t) = t + t^3 * N(t^2) / D(t^2) is evaluated (presumably an |
| // arctangent approximation); the result is pi/2 - R(t) when a > 1, else R(t). |
| // The sign bit of x is OR-ed into the result, except when a is NaN. |
| .visible .entry matrix_atan_f( |
| .param .u64 matrix_atan_f_param_0, |
| .param .u64 matrix_atan_f_param_1, |
| .param .u32 matrix_atan_f_param_2 |
| ) |
| { |
| .reg .pred %p<5>; |
| .reg .f32 %f<26>; |
| .reg .b32 %r<10>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_atan_f_param_0]; |
| ld.param.u64 %rd2, [matrix_atan_f_param_1]; |
| ld.param.u32 %r2, [matrix_atan_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads at or past n do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB111_4; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Widen the index as unsigned so it matches the unsigned bounds check above; |
| // a signed widen would turn indices >= 2^31 into negative byte offsets. |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| abs.f32 %f2, %f1; |
| // a <= 1 or unordered (NaN): use a directly as the argument t. |
| setp.leu.f32 %p2, %f2, 0f3F800000; |
| mov.f32 %f25, %f2; |
| @%p2 bra BB111_3; |
| |
| // a > 1: argument becomes the reciprocal. |
| rcp.rn.f32 %f25, %f2; |
| |
| BB111_3: |
| cvta.to.global.u64 %rd6, %rd2; |
| mul.rn.f32 %f5, %f25, %f25; |
| // Numerator: %f12 = t * t^2 * N(t^2) |
| mov.f32 %f6, 0fC0B59883; |
| mov.f32 %f7, 0fBF52C7EA; |
| fma.rn.f32 %f8, %f5, %f7, %f6; |
| mov.f32 %f9, 0fC0D21907; |
| fma.rn.f32 %f10, %f8, %f5, %f9; |
| mul.f32 %f11, %f5, %f10; |
| mul.f32 %f12, %f25, %f11; |
| // Denominator D(t^2) in %f17 (leading coefficient 1), then its reciprocal. |
| add.f32 %f13, %f5, 0f41355DC0; |
| mov.f32 %f14, 0f41E6BD60; |
| fma.rn.f32 %f15, %f13, %f5, %f14; |
| mov.f32 %f16, 0f419D92C8; |
| fma.rn.f32 %f17, %f15, %f5, %f16; |
| rcp.rn.f32 %f18, %f17; |
| // R(t) = t + numerator / denominator |
| fma.rn.f32 %f19, %f12, %f18, %f25; |
| // a > 1 used the reciprocal, so the result becomes pi/2 - R(t). |
| mov.f32 %f20, 0f3FC90FDB; |
| sub.f32 %f21, %f20, %f19; |
| setp.gt.f32 %p3, %f2, 0f3F800000; |
| selp.f32 %f22, %f21, %f19, %p3; |
| // OR the sign bit of x into the result bits. |
| mov.b32 %r6, %f22; |
| mov.b32 %r7, %f1; |
| and.b32 %r8, %r7, -2147483648; |
| or.b32 %r9, %r6, %r8; |
| mov.b32 %f23, %r9; |
| // gtu against +inf is true only for NaN: keep the unsigned-patched value then. |
| setp.gtu.f32 %p4, %f2, 0f7F800000; |
| selp.f32 %f24, %f22, %f23, %p4; |
| // The output uses the same byte offset as the input. |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f32 [%rd8], %f24; |
| |
| BB111_4: |
| ret; |
| } |
| |
| // .globl matrix_sign_d |
| // matrix_sign_d: element-wise f64 kernel, one element per thread. |
| // param_0 : generic address of the f64 input array |
| // param_1 : generic address of the f64 output array |
| // param_2 : element count n (compared as unsigned) |
| // Thread i = ntid.x * ctaid.x + tid.x exits when i >= n. |
| // Stores an all-zero 64-bit word (+0.0) when x == 0.0; otherwise stores 1.0 carrying |
| // the sign bit of x. setp.eq is an ordered compare, so a NaN input takes the |
| // non-zero path and yields +/-1.0 according to its sign bit. |
| .visible .entry matrix_sign_d( |
| .param .u64 matrix_sign_d_param_0, |
| .param .u64 matrix_sign_d_param_1, |
| .param .u32 matrix_sign_d_param_2 |
| ) |
| { |
| .reg .pred %p<3>; |
| .reg .b32 %r<12>; |
| .reg .f64 %fd<4>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd2, [matrix_sign_d_param_0]; |
| ld.param.u64 %rd3, [matrix_sign_d_param_1]; |
| ld.param.u32 %r2, [matrix_sign_d_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads at or past n do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB112_4; |
| |
| cvta.to.global.u64 %rd4, %rd2; |
| // Widen the index as unsigned so it matches the unsigned bounds check above; |
| // a signed widen would turn indices >= 2^31 into negative byte offsets. |
| mul.wide.u32 %rd5, %r1, 8; |
| add.s64 %rd6, %rd4, %rd5; |
| ld.global.f64 %fd1, [%rd6]; |
| setp.eq.f64 %p2, %fd1, 0d0000000000000000; |
| cvta.to.global.u64 %rd7, %rd3; |
| // %rd1 = address of the output element (same byte offset as the input). |
| add.s64 %rd1, %rd7, %rd5; |
| @%p2 bra BB112_3; |
| bra.uni BB112_2; |
| |
| BB112_3: |
| // x == 0.0 (either sign): write 64 zero bits. |
| mov.u64 %rd8, 0; |
| st.global.u64 [%rd1], %rd8; |
| bra.uni BB112_4; |
| |
| BB112_2: |
| // %r7 = sign bit of x taken from its high word. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r6}, %fd1; |
| } |
| and.b32 %r7, %r6, -2147483648; |
| // Combine the magnitude bits of 1.0 with that sign bit. |
| mov.f64 %fd2, 0d3FF0000000000000; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r8}, %fd2; |
| } |
| and.b32 %r9, %r8, 2147483647; |
| or.b32 %r10, %r9, %r7; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r11, %temp}, %fd2; |
| } |
| mov.b64 %fd3, {%r11, %r10}; |
| st.global.f64 [%rd1], %fd3; |
| |
| BB112_4: |
| ret; |
| } |
| |
| // .globl matrix_sign_f |
| // matrix_sign_f: element-wise f32 kernel, one element per thread. |
| // param_0 : generic address of the f32 input array |
| // param_1 : generic address of the f32 output array |
| // param_2 : element count n (compared as unsigned) |
| // Thread i = ntid.x * ctaid.x + tid.x exits when i >= n. |
| // Stores an all-zero 32-bit word (+0.0f) when x == 0.0f; otherwise widens x to f64, |
| // builds 1.0 carrying the sign bit of x, and stores that value rounded back to f32. |
| // setp.eq is an ordered compare, so a NaN input takes the non-zero path. |
| .visible .entry matrix_sign_f( |
| .param .u64 matrix_sign_f_param_0, |
| .param .u64 matrix_sign_f_param_1, |
| .param .u32 matrix_sign_f_param_2 |
| ) |
| { |
| .reg .pred %p<3>; |
| .reg .f32 %f<3>; |
| .reg .b32 %r<13>; |
| .reg .f64 %fd<4>; |
| .reg .b64 %rd<8>; |
| |
| |
| ld.param.u64 %rd2, [matrix_sign_f_param_0]; |
| ld.param.u64 %rd3, [matrix_sign_f_param_1]; |
| ld.param.u32 %r2, [matrix_sign_f_param_2]; |
| mov.u32 %r3, %ctaid.x; |
| mov.u32 %r4, %ntid.x; |
| mov.u32 %r5, %tid.x; |
| mad.lo.s32 %r1, %r4, %r3, %r5; |
| // Unsigned bounds check: threads at or past n do nothing. |
| setp.ge.u32 %p1, %r1, %r2; |
| @%p1 bra BB113_4; |
| |
| cvta.to.global.u64 %rd4, %rd2; |
| // Widen the index as unsigned so it matches the unsigned bounds check above; |
| // a signed widen would turn indices >= 2^31 into negative byte offsets. |
| mul.wide.u32 %rd5, %r1, 4; |
| add.s64 %rd6, %rd4, %rd5; |
| ld.global.f32 %f1, [%rd6]; |
| setp.eq.f32 %p2, %f1, 0f00000000; |
| cvta.to.global.u64 %rd7, %rd3; |
| // %rd1 = address of the output element (same byte offset as the input). |
| add.s64 %rd1, %rd7, %rd5; |
| @%p2 bra BB113_3; |
| bra.uni BB113_2; |
| |
| BB113_3: |
| // x == 0.0f (either sign): write 32 zero bits. |
| mov.u32 %r12, 0; |
| st.global.u32 [%rd1], %r12; |
| bra.uni BB113_4; |
| |
| BB113_2: |
| // %r7 = sign bit of x, read from the high word of x widened to f64. |
| cvt.f64.f32 %fd1, %f1; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r6}, %fd1; |
| } |
| and.b32 %r7, %r6, -2147483648; |
| // Combine the magnitude bits of 1.0 with that sign bit. |
| mov.f64 %fd2, 0d3FF0000000000000; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r8}, %fd2; |
| } |
| and.b32 %r9, %r8, 2147483647; |
| or.b32 %r10, %r9, %r7; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r11, %temp}, %fd2; |
| } |
| mov.b64 %fd3, {%r11, %r10}; |
| cvt.rn.f32.f64 %f2, %fd3; |
| st.global.f32 [%rd1], %f2; |
| |
| BB113_4: |
| ret; |
| } |
| |
| // .globl matrix_sigmoid_d |
| // matrix_sigmoid_d: element-wise f64 kernel, one element per thread. |
| // param_0 : generic address of the f64 input array |
| // param_1 : generic address of the f64 output array |
| // param_2 : element count n (compared as unsigned) |
| // Thread i = ntid.x * ctaid.x + tid.x exits when i >= n. |
| // With h = 0.5 * x, a value T(h) is computed on one of two paths and the stored |
| // result is 0.5 * T(h) + 0.5. T looks like a hyperbolic tangent (polynomial for |
| // small |h|, exponential-based otherwise) - NOTE(review): confirm against the .cu source. |
| .visible .entry matrix_sigmoid_d( |
| .param .u64 matrix_sigmoid_d_param_0, |
| .param .u64 matrix_sigmoid_d_param_1, |
| .param .u32 matrix_sigmoid_d_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<5>; |
| .reg .b32 %r<13>; |
| .reg .f64 %fd<74>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_sigmoid_d_param_0]; |
| ld.param.u64 %rd2, [matrix_sigmoid_d_param_1]; |
| ld.param.u32 %r4, [matrix_sigmoid_d_param_2]; |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r6, %r5, %r7; |
| // Unsigned bounds check: threads at or past n do nothing. |
| setp.ge.u32 %p1, %r1, %r4; |
| @%p1 bra BB114_5; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Widen the index as unsigned so it matches the unsigned bounds check above; |
| // a signed widen would turn indices >= 2^31 into negative byte offsets. |
| mul.wide.u32 %rd4, %r1, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd6, [%rd5]; |
| // h = 0.5 * x; %r2 = high word of h, %r3 = high word with the sign cleared, %fd2 = |h|. |
| mul.f64 %fd1, %fd6, 0d3FE0000000000000; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r2}, %fd1; |
| } |
| and.b32 %r3, %r2, 2147483647; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r8, %temp}, %fd1; |
| } |
| mov.b64 %fd2, {%r8, %r3}; |
| // |h| below the threshold (about 0.655) or NaN: polynomial path. |
| setp.ltu.f64 %p2, %fd2, 0d3FE4F92224DD2F1A; |
| @%p2 bra BB114_3; |
| bra.uni BB114_2; |
| |
| BB114_3: |
| // Small-argument path: T = h + h * (h^2 * Q(h^2)), Q evaluated by Horner's rule. |
| mul.f64 %fd48, %fd1, %fd1; |
| mov.f64 %fd49, 0d3F14359F420AFC3D; |
| mov.f64 %fd50, 0dBEF0BC46E2F5E964; |
| fma.rn.f64 %fd51, %fd50, %fd48, %fd49; |
| mov.f64 %fd52, 0dBF2DF9F0728C5D84; |
| fma.rn.f64 %fd53, %fd51, %fd48, %fd52; |
| mov.f64 %fd54, 0d3F4337D1CEC4F033; |
| fma.rn.f64 %fd55, %fd53, %fd48, %fd54; |
| mov.f64 %fd56, 0dBF57D6E9674335B3; |
| fma.rn.f64 %fd57, %fd55, %fd48, %fd56; |
| mov.f64 %fd58, 0d3F6D6D000D7AAD3D; |
| fma.rn.f64 %fd59, %fd57, %fd48, %fd58; |
| mov.f64 %fd60, 0dBF8226E1F3CF1EF5; |
| fma.rn.f64 %fd61, %fd59, %fd48, %fd60; |
| mov.f64 %fd62, 0d3F9664F47EC0C8CF; |
| fma.rn.f64 %fd63, %fd61, %fd48, %fd62; |
| mov.f64 %fd64, 0dBFABA1BA1B80AB40; |
| fma.rn.f64 %fd65, %fd63, %fd48, %fd64; |
| mov.f64 %fd66, 0d3FC111111110FA4A; |
| fma.rn.f64 %fd67, %fd65, %fd48, %fd66; |
| mov.f64 %fd68, 0dBFD5555555555550; |
| fma.rn.f64 %fd69, %fd67, %fd48, %fd68; |
| mov.f64 %fd70, 0d0000000000000000; |
| fma.rn.f64 %fd71, %fd69, %fd48, %fd70; |
| fma.rn.f64 %fd73, %fd71, %fd1, %fd1; |
| bra.uni BB114_4; |
| |
| BB114_2: |
| // Large-argument path. z = 2 * |h|; k = round(z * log2(e)) computed in f32; |
| // r = z - k * ln(2) is the reduced argument. |
| add.f64 %fd7, %fd2, %fd2; |
| cvt.rn.f32.f64 %f1, %fd7; |
| mul.f32 %f2, %f1, 0f3FB8AA3B; |
| cvt.rni.f32.f32 %f3, %f2; |
| cvt.f64.f32 %fd8, %f3; |
| neg.f64 %fd9, %fd8; |
| mov.f64 %fd10, 0d3FE62E42FEFA39EF; |
| fma.rn.f64 %fd11, %fd9, %fd10, %fd7; |
| // Polynomial in r giving %fd32 = r + r^2 * S(r) (presumably exp(r) - 1). |
| mov.f64 %fd12, 0d3E928A27F89B6999; |
| mov.f64 %fd13, 0d3E5AE904A4741B81; |
| fma.rn.f64 %fd14, %fd13, %fd11, %fd12; |
| mov.f64 %fd15, 0d3EC71DE715FF7E07; |
| fma.rn.f64 %fd16, %fd14, %fd11, %fd15; |
| mov.f64 %fd17, 0d3EFA019A6B0AC45A; |
| fma.rn.f64 %fd18, %fd16, %fd11, %fd17; |
| mov.f64 %fd19, 0d3F2A01A017EED94F; |
| fma.rn.f64 %fd20, %fd18, %fd11, %fd19; |
| mov.f64 %fd21, 0d3F56C16C17F2A71B; |
| fma.rn.f64 %fd22, %fd20, %fd11, %fd21; |
| mov.f64 %fd23, 0d3F811111111173C4; |
| fma.rn.f64 %fd24, %fd22, %fd11, %fd23; |
| mov.f64 %fd25, 0d3FA555555555211A; |
| fma.rn.f64 %fd26, %fd24, %fd11, %fd25; |
| mov.f64 %fd27, 0d3FC5555555555540; |
| fma.rn.f64 %fd28, %fd26, %fd11, %fd27; |
| mov.f64 %fd29, 0d3FE0000000000005; |
| fma.rn.f64 %fd30, %fd28, %fd11, %fd29; |
| mul.f64 %fd31, %fd11, %fd30; |
| fma.rn.f64 %fd32, %fd31, %fd11, %fd11; |
| // s = 2^k; %fd37 = (1 - s) - %fd32 * s; %fd39 = 2 - %fd37. |
| ex2.approx.ftz.f32 %f4, %f3; |
| cvt.f64.f32 %fd33, %f4; |
| mov.f64 %fd34, 0d3FF0000000000000; |
| sub.f64 %fd35, %fd34, %fd33; |
| neg.f64 %fd36, %fd32; |
| fma.rn.f64 %fd37, %fd36, %fd33, %fd35; |
| mov.f64 %fd38, 0d4000000000000000; |
| sub.f64 %fd39, %fd38, %fd37; |
| // Refined reciprocal of %fd39, then %fd46 = 1 - 2 / %fd39. |
| rcp.approx.ftz.f64 %fd40, %fd39; |
| neg.f64 %fd41, %fd39; |
| fma.rn.f64 %fd42, %fd41, %fd40, %fd34; |
| fma.rn.f64 %fd43, %fd42, %fd42, %fd42; |
| fma.rn.f64 %fd44, %fd43, %fd40, %fd40; |
| neg.f64 %fd45, %fd44; |
| fma.rn.f64 %fd46, %fd38, %fd45, %fd34; |
| // High word of |h| above 0x40330FC1: the magnitude is forced to exactly 1.0. |
| setp.gt.u32 %p3, %r3, 1077088193; |
| selp.f64 %fd47, 0d3FF0000000000000, %fd46, %p3; |
| // OR the sign bit of h into the result's high word. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r9, %temp}, %fd47; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r10}, %fd47; |
| } |
| and.b32 %r11, %r2, -2147483648; |
| or.b32 %r12, %r10, %r11; |
| mov.b64 %fd73, {%r9, %r12}; |
| |
| BB114_4: |
| cvta.to.global.u64 %rd6, %rd2; |
| // result = 0.5 * T + 0.5 |
| fma.rn.f64 %fd72, %fd73, 0d3FE0000000000000, 0d3FE0000000000000; |
| // The output uses the same byte offset as the input. |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f64 [%rd8], %fd72; |
| |
| BB114_5: |
| ret; |
| } |
| |
| // .globl matrix_sigmoid_f |
| // matrix_sigmoid_f: element-wise f32 kernel, one element per thread. |
| // param_0 : generic address of the f32 input array |
| // param_1 : generic address of the f32 output array |
| // param_2 : element count n (compared as unsigned) |
| // Thread i = ntid.x * ctaid.x + tid.x exits when i >= n. |
| // The input is widened to f64 and all arithmetic is done in f64. With h = 0.5 * x, |
| // a value T(h) is computed on one of two paths and 0.5 * T(h) + 0.5 is rounded back |
| // to f32 and stored. T looks like a hyperbolic tangent (polynomial for small |h|, |
| // exponential-based otherwise) - NOTE(review): confirm against the .cu source. |
| .visible .entry matrix_sigmoid_f( |
| .param .u64 matrix_sigmoid_f_param_0, |
| .param .u64 matrix_sigmoid_f_param_1, |
| .param .u32 matrix_sigmoid_f_param_2 |
| ) |
| { |
| .reg .pred %p<4>; |
| .reg .f32 %f<7>; |
| .reg .b32 %r<13>; |
| .reg .f64 %fd<74>; |
| .reg .b64 %rd<9>; |
| |
| |
| ld.param.u64 %rd1, [matrix_sigmoid_f_param_0]; |
| ld.param.u64 %rd2, [matrix_sigmoid_f_param_1]; |
| ld.param.u32 %r4, [matrix_sigmoid_f_param_2]; |
| mov.u32 %r5, %ctaid.x; |
| mov.u32 %r6, %ntid.x; |
| mov.u32 %r7, %tid.x; |
| mad.lo.s32 %r1, %r6, %r5, %r7; |
| // Unsigned bounds check: threads at or past n do nothing. |
| setp.ge.u32 %p1, %r1, %r4; |
| @%p1 bra BB115_5; |
| |
| cvta.to.global.u64 %rd3, %rd1; |
| // Widen the index as unsigned so it matches the unsigned bounds check above; |
| // a signed widen would turn indices >= 2^31 into negative byte offsets. |
| mul.wide.u32 %rd4, %r1, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f1, [%rd5]; |
| cvt.f64.f32 %fd6, %f1; |
| // h = 0.5 * x; %r2 = high word of h, %r3 = high word with the sign cleared, %fd2 = |h|. |
| mul.f64 %fd1, %fd6, 0d3FE0000000000000; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r2}, %fd1; |
| } |
| and.b32 %r3, %r2, 2147483647; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r8, %temp}, %fd1; |
| } |
| mov.b64 %fd2, {%r8, %r3}; |
| // |h| below the threshold (about 0.655) or NaN: polynomial path. |
| setp.ltu.f64 %p2, %fd2, 0d3FE4F92224DD2F1A; |
| @%p2 bra BB115_3; |
| bra.uni BB115_2; |
| |
| BB115_3: |
| // Small-argument path: T = h + h * (h^2 * Q(h^2)), Q evaluated by Horner's rule. |
| mul.f64 %fd48, %fd1, %fd1; |
| mov.f64 %fd49, 0d3F14359F420AFC3D; |
| mov.f64 %fd50, 0dBEF0BC46E2F5E964; |
| fma.rn.f64 %fd51, %fd50, %fd48, %fd49; |
| mov.f64 %fd52, 0dBF2DF9F0728C5D84; |
| fma.rn.f64 %fd53, %fd51, %fd48, %fd52; |
| mov.f64 %fd54, 0d3F4337D1CEC4F033; |
| fma.rn.f64 %fd55, %fd53, %fd48, %fd54; |
| mov.f64 %fd56, 0dBF57D6E9674335B3; |
| fma.rn.f64 %fd57, %fd55, %fd48, %fd56; |
| mov.f64 %fd58, 0d3F6D6D000D7AAD3D; |
| fma.rn.f64 %fd59, %fd57, %fd48, %fd58; |
| mov.f64 %fd60, 0dBF8226E1F3CF1EF5; |
| fma.rn.f64 %fd61, %fd59, %fd48, %fd60; |
| mov.f64 %fd62, 0d3F9664F47EC0C8CF; |
| fma.rn.f64 %fd63, %fd61, %fd48, %fd62; |
| mov.f64 %fd64, 0dBFABA1BA1B80AB40; |
| fma.rn.f64 %fd65, %fd63, %fd48, %fd64; |
| mov.f64 %fd66, 0d3FC111111110FA4A; |
| fma.rn.f64 %fd67, %fd65, %fd48, %fd66; |
| mov.f64 %fd68, 0dBFD5555555555550; |
| fma.rn.f64 %fd69, %fd67, %fd48, %fd68; |
| mov.f64 %fd70, 0d0000000000000000; |
| fma.rn.f64 %fd71, %fd69, %fd48, %fd70; |
| fma.rn.f64 %fd73, %fd71, %fd1, %fd1; |
| bra.uni BB115_4; |
| |
| BB115_2: |
| // Large-argument path. z = 2 * |h|; k = round(z * log2(e)) computed in f32; |
| // r = z - k * ln(2) is the reduced argument. |
| add.f64 %fd7, %fd2, %fd2; |
| cvt.rn.f32.f64 %f2, %fd7; |
| mul.f32 %f3, %f2, 0f3FB8AA3B; |
| cvt.rni.f32.f32 %f4, %f3; |
| cvt.f64.f32 %fd8, %f4; |
| neg.f64 %fd9, %fd8; |
| mov.f64 %fd10, 0d3FE62E42FEFA39EF; |
| fma.rn.f64 %fd11, %fd9, %fd10, %fd7; |
| // Polynomial in r giving %fd32 = r + r^2 * S(r) (presumably exp(r) - 1). |
| mov.f64 %fd12, 0d3E928A27F89B6999; |
| mov.f64 %fd13, 0d3E5AE904A4741B81; |
| fma.rn.f64 %fd14, %fd13, %fd11, %fd12; |
| mov.f64 %fd15, 0d3EC71DE715FF7E07; |
| fma.rn.f64 %fd16, %fd14, %fd11, %fd15; |
| mov.f64 %fd17, 0d3EFA019A6B0AC45A; |
| fma.rn.f64 %fd18, %fd16, %fd11, %fd17; |
| mov.f64 %fd19, 0d3F2A01A017EED94F; |
| fma.rn.f64 %fd20, %fd18, %fd11, %fd19; |
| mov.f64 %fd21, 0d3F56C16C17F2A71B; |
| fma.rn.f64 %fd22, %fd20, %fd11, %fd21; |
| mov.f64 %fd23, 0d3F811111111173C4; |
| fma.rn.f64 %fd24, %fd22, %fd11, %fd23; |
| mov.f64 %fd25, 0d3FA555555555211A; |
| fma.rn.f64 %fd26, %fd24, %fd11, %fd25; |
| mov.f64 %fd27, 0d3FC5555555555540; |
| fma.rn.f64 %fd28, %fd26, %fd11, %fd27; |
| mov.f64 %fd29, 0d3FE0000000000005; |
| fma.rn.f64 %fd30, %fd28, %fd11, %fd29; |
| mul.f64 %fd31, %fd11, %fd30; |
| fma.rn.f64 %fd32, %fd31, %fd11, %fd11; |
| // s = 2^k; %fd37 = (1 - s) - %fd32 * s; %fd39 = 2 - %fd37. |
| ex2.approx.ftz.f32 %f5, %f4; |
| cvt.f64.f32 %fd33, %f5; |
| mov.f64 %fd34, 0d3FF0000000000000; |
| sub.f64 %fd35, %fd34, %fd33; |
| neg.f64 %fd36, %fd32; |
| fma.rn.f64 %fd37, %fd36, %fd33, %fd35; |
| mov.f64 %fd38, 0d4000000000000000; |
| sub.f64 %fd39, %fd38, %fd37; |
| // Refined reciprocal of %fd39, then %fd46 = 1 - 2 / %fd39. |
| rcp.approx.ftz.f64 %fd40, %fd39; |
| neg.f64 %fd41, %fd39; |
| fma.rn.f64 %fd42, %fd41, %fd40, %fd34; |
| fma.rn.f64 %fd43, %fd42, %fd42, %fd42; |
| fma.rn.f64 %fd44, %fd43, %fd40, %fd40; |
| neg.f64 %fd45, %fd44; |
| fma.rn.f64 %fd46, %fd38, %fd45, %fd34; |
| // High word of |h| above 0x40330FC1: the magnitude is forced to exactly 1.0. |
| setp.gt.u32 %p3, %r3, 1077088193; |
| selp.f64 %fd47, 0d3FF0000000000000, %fd46, %p3; |
| // OR the sign bit of h into the result's high word. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r9, %temp}, %fd47; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r10}, %fd47; |
| } |
| and.b32 %r11, %r2, -2147483648; |
| or.b32 %r12, %r10, %r11; |
| mov.b64 %fd73, {%r9, %r12}; |
| |
| BB115_4: |
| cvta.to.global.u64 %rd6, %rd2; |
| // result = 0.5 * T + 0.5, rounded to f32 for the store. |
| fma.rn.f64 %fd72, %fd73, 0d3FE0000000000000, 0d3FE0000000000000; |
| cvt.rn.f32.f64 %f6, %fd72; |
| // The output uses the same byte offset as the input. |
| add.s64 %rd8, %rd6, %rd4; |
| st.global.f32 [%rd8], %f6; |
| |
| BB115_5: |
| ret; |
| } |
| |
| // .globl prepare_lstm_input_d |
| // prepare_lstm_input_d: copies one f64 element per thread to a permuted position. |
| // param_0 : generic address of the f64 source array |
| // param_1 : generic address of the f64 destination array |
| // param_2 : "outer" multiplier used in the destination index |
| // param_3 : "inner" extent (divisor and final multiplier) |
| // param_4 : "span" extent (first divisor) |
| // param_5 : element count (signed compare) |
| // For i = ntid.x * ctaid.x + tid.x with i < count: |
| // dst[((i % span) / inner * outer + i / span) * inner + (i % span) % inner] = src[i] |
| // NOTE(review): the roles of param_2..param_4 are named from how they are used here; |
| // confirm their meaning against the caller. |
| .visible .entry prepare_lstm_input_d( |
| .param .u64 prepare_lstm_input_d_param_0, |
| .param .u64 prepare_lstm_input_d_param_1, |
| .param .u32 prepare_lstm_input_d_param_2, |
| .param .u32 prepare_lstm_input_d_param_3, |
| .param .u32 prepare_lstm_input_d_param_4, |
| .param .u32 prepare_lstm_input_d_param_5 |
| ) |
| { |
| .reg .pred %p_live; |
| .reg .b32 %gidx, %count, %outer, %inner, %span; |
| .reg .b32 %cta, %ntx, %tix; |
| .reg .b32 %q_span, %r_span, %q_inner, %r_inner, %row, %dst_idx; |
| .reg .f64 %val; |
| .reg .b64 %src_gen, %dst_gen, %src_base, %dst_base; |
| .reg .b64 %src_off, %dst_off, %src_addr, %dst_addr; |
| |
| // Global thread index. |
| mov.u32 %ntx, %ntid.x; |
| mov.u32 %cta, %ctaid.x; |
| mov.u32 %tix, %tid.x; |
| mad.lo.s32 %gidx, %ntx, %cta, %tix; |
| |
| // Guard: only threads with index < count do any work. |
| ld.param.u32 %count, [prepare_lstm_input_d_param_5]; |
| setp.lt.s32 %p_live, %gidx, %count; |
| @!%p_live bra PREP_LSTM_INPUT_D_EXIT; |
| |
| // Fetch the source element at the linear index. |
| ld.param.u64 %src_gen, [prepare_lstm_input_d_param_0]; |
| cvta.to.global.u64 %src_base, %src_gen; |
| mul.wide.s32 %src_off, %gidx, 8; |
| add.s64 %src_addr, %src_base, %src_off; |
| ld.global.f64 %val, [%src_addr]; |
| |
| // Split the linear index by span, then split the remainder by inner. |
| ld.param.u32 %span, [prepare_lstm_input_d_param_4]; |
| ld.param.u32 %inner, [prepare_lstm_input_d_param_3]; |
| ld.param.u32 %outer, [prepare_lstm_input_d_param_2]; |
| div.s32 %q_span, %gidx, %span; |
| rem.s32 %r_span, %gidx, %span; |
| div.s32 %q_inner, %r_span, %inner; |
| rem.s32 %r_inner, %r_span, %inner; |
| |
| // Destination index = (q_inner * outer + q_span) * inner + r_inner. |
| mad.lo.s32 %row, %q_inner, %outer, %q_span; |
| mad.lo.s32 %dst_idx, %row, %inner, %r_inner; |
| |
| ld.param.u64 %dst_gen, [prepare_lstm_input_d_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_gen; |
| mul.wide.s32 %dst_off, %dst_idx, 8; |
| add.s64 %dst_addr, %dst_base, %dst_off; |
| st.global.f64 [%dst_addr], %val; |
| |
| PREP_LSTM_INPUT_D_EXIT: |
| ret; |
| } |
| |
| // .globl prepare_lstm_input_f |
| // prepare_lstm_input_f: copies one f32 element per thread to a permuted position. |
| // param_0 : generic address of the f32 source array |
| // param_1 : generic address of the f32 destination array |
| // param_2 : "outer" multiplier used in the destination index |
| // param_3 : "inner" extent (divisor and final multiplier) |
| // param_4 : "span" extent (first divisor) |
| // param_5 : element count (signed compare) |
| // For i = ntid.x * ctaid.x + tid.x with i < count: |
| // dst[((i % span) / inner * outer + i / span) * inner + (i % span) % inner] = src[i] |
| // NOTE(review): the roles of param_2..param_4 are named from how they are used here; |
| // confirm their meaning against the caller. |
| .visible .entry prepare_lstm_input_f( |
| .param .u64 prepare_lstm_input_f_param_0, |
| .param .u64 prepare_lstm_input_f_param_1, |
| .param .u32 prepare_lstm_input_f_param_2, |
| .param .u32 prepare_lstm_input_f_param_3, |
| .param .u32 prepare_lstm_input_f_param_4, |
| .param .u32 prepare_lstm_input_f_param_5 |
| ) |
| { |
| .reg .pred %p_live; |
| .reg .f32 %val; |
| .reg .b32 %gidx, %count, %outer, %inner, %span; |
| .reg .b32 %cta, %ntx, %tix; |
| .reg .b32 %q_span, %r_span, %q_inner, %r_inner, %row, %dst_idx; |
| .reg .b64 %src_gen, %dst_gen, %src_base, %dst_base; |
| .reg .b64 %src_off, %dst_off, %src_addr, %dst_addr; |
| |
| // Global thread index. |
| mov.u32 %ntx, %ntid.x; |
| mov.u32 %cta, %ctaid.x; |
| mov.u32 %tix, %tid.x; |
| mad.lo.s32 %gidx, %ntx, %cta, %tix; |
| |
| // Guard: only threads with index < count do any work. |
| ld.param.u32 %count, [prepare_lstm_input_f_param_5]; |
| setp.lt.s32 %p_live, %gidx, %count; |
| @!%p_live bra PREP_LSTM_INPUT_F_EXIT; |
| |
| // Fetch the source element at the linear index. |
| ld.param.u64 %src_gen, [prepare_lstm_input_f_param_0]; |
| cvta.to.global.u64 %src_base, %src_gen; |
| mul.wide.s32 %src_off, %gidx, 4; |
| add.s64 %src_addr, %src_base, %src_off; |
| ld.global.f32 %val, [%src_addr]; |
| |
| // Split the linear index by span, then split the remainder by inner. |
| ld.param.u32 %span, [prepare_lstm_input_f_param_4]; |
| ld.param.u32 %inner, [prepare_lstm_input_f_param_3]; |
| ld.param.u32 %outer, [prepare_lstm_input_f_param_2]; |
| div.s32 %q_span, %gidx, %span; |
| rem.s32 %r_span, %gidx, %span; |
| div.s32 %q_inner, %r_span, %inner; |
| rem.s32 %r_inner, %r_span, %inner; |
| |
| // Destination index = (q_inner * outer + q_span) * inner + r_inner. |
| mad.lo.s32 %row, %q_inner, %outer, %q_span; |
| mad.lo.s32 %dst_idx, %row, %inner, %r_inner; |
| |
| ld.param.u64 %dst_gen, [prepare_lstm_input_f_param_1]; |
| cvta.to.global.u64 %dst_base, %dst_gen; |
| mul.wide.s32 %dst_off, %dst_idx, 4; |
| add.s64 %dst_addr, %dst_base, %dst_off; |
| st.global.f32 [%dst_addr], %val; |
| |
| PREP_LSTM_INPUT_F_EXIT: |
| ret; |
| } |
| |
| // .globl prepare_lstm_weight_d |
| // prepare_lstm_weight_d: one f64 element per thread is copied into the array at param_2. |
| // param_0 : generic address of the source read in the first two regions |
| // param_1 : generic address of the source read in the third region |
| // param_2 : generic address of the destination array |
| // param_3 : 32-bit extent, called D below |
| // param_4 : 32-bit extent, called M below |
| // With i = ntid.x * ctaid.x + tid.x (all compares signed): |
| // i < 4*M*D : first region (BB118_5) |
| // 4*M*D <= i < 4*M*(D+M) : second region (BB118_4) |
| // 4*M*(D+M) <= i < 4*M*(D+M+1) : third region (BB118_1), dst[i] = param_1[...] |
| // otherwise : nothing is written |
| // In every region a block number g is remapped 0->0, 1->1, 2->3, anything else->2. |
| // NOTE(review): presumably weights, recurrent weights and biases with a gate reorder; |
| // confirm against the caller. |
| .visible .entry prepare_lstm_weight_d( |
| .param .u64 prepare_lstm_weight_d_param_0, |
| .param .u64 prepare_lstm_weight_d_param_1, |
| .param .u64 prepare_lstm_weight_d_param_2, |
| .param .u32 prepare_lstm_weight_d_param_3, |
| .param .u32 prepare_lstm_weight_d_param_4 |
| ) |
| { |
| .reg .pred %p<8>; |
| .reg .b32 %r<48>; |
| .reg .f64 %fd<3>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd2, [prepare_lstm_weight_d_param_0]; |
| ld.param.u64 %rd3, [prepare_lstm_weight_d_param_1]; |
| ld.param.u64 %rd4, [prepare_lstm_weight_d_param_2]; |
| ld.param.u32 %r45, [prepare_lstm_weight_d_param_3]; |
| ld.param.u32 %r21, [prepare_lstm_weight_d_param_4]; |
| cvta.to.global.u64 %rd1, %rd4; |
| mov.u32 %r22, %ntid.x; |
| mov.u32 %r23, %ctaid.x; |
| mov.u32 %r24, %tid.x; |
| mad.lo.s32 %r1, %r22, %r23, %r24; |
| // %r2 = M + D, %r3 = 4*M, %r4 = 4*M*(M + D) |
| add.s32 %r2, %r21, %r45; |
| shl.b32 %r3, %r21, 2; |
| mul.lo.s32 %r4, %r2, %r3; |
| setp.lt.s32 %p1, %r1, %r4; |
| @%p1 bra BB118_3; |
| bra.uni BB118_1; |
| |
| BB118_3: |
| // %r5 = M*D, %r47 = M*M, %r7 = 4*M*D splits the first region from the second. |
| mul.lo.s32 %r5, %r21, %r45; |
| mul.lo.s32 %r47, %r21, %r21; |
| shl.b32 %r7, %r5, 2; |
| setp.lt.s32 %p5, %r1, %r7; |
| @%p5 bra BB118_5; |
| bra.uni BB118_4; |
| |
| BB118_5: |
| // First region: local index %r46 = i, block size %r47 = M*D, %r44 = i % (M*D), |
| // %r42 = %r43 = %r44 / M; %r45 keeps the value D. |
| rem.s32 %r44, %r1, %r5; |
| div.s32 %r42, %r44, %r21; |
| mov.u32 %r43, %r42; |
| mov.u32 %r46, %r1; |
| mov.u32 %r47, %r5; |
| bra.uni BB118_6; |
| |
| BB118_1: |
| // Third region: exit when i >= 4*M*(M + D + 1). |
| add.s32 %r25, %r2, 1; |
| mul.lo.s32 %r26, %r25, %r3; |
| setp.ge.s32 %p2, %r1, %r26; |
| @%p2 bra BB118_7; |
| |
| cvta.to.global.u64 %rd5, %rd3; |
| // j = i - 4*M*(M + D); g = j / M is remapped (2 -> 3, values above 2 -> 2). |
| sub.s32 %r27, %r1, %r4; |
| div.s32 %r28, %r27, %r21; |
| setp.lt.s32 %p3, %r28, 2; |
| setp.eq.s32 %p4, %r28, 2; |
| selp.b32 %r29, 3, 2, %p4; |
| selp.b32 %r30, %r28, %r29, %p3; |
| // dst[i] = param_1[g' * M + j % M] |
| rem.s32 %r31, %r27, %r21; |
| mad.lo.s32 %r32, %r30, %r21, %r31; |
| mul.wide.s32 %rd6, %r32, 8; |
| add.s64 %rd7, %rd5, %rd6; |
| ld.global.f64 %fd1, [%rd7]; |
| mul.wide.s32 %rd8, %r1, 8; |
| add.s64 %rd9, %rd1, %rd8; |
| st.global.f64 [%rd9], %fd1; |
| bra.uni BB118_7; |
| |
| BB118_4: |
| // Second region: local index %r46 = i - 4*M*D, block size %r47 = M*M, |
| // %r44 = %r46 % (M*M), %r43 = %r44 / M, %r42 = %r43 + D; %r45 is overwritten with M. |
| sub.s32 %r46, %r1, %r7; |
| rem.s32 %r44, %r46, %r47; |
| div.s32 %r43, %r44, %r21; |
| add.s32 %r42, %r43, %r45; |
| mov.u32 %r45, %r21; |
| |
| BB118_6: |
| // Shared tail for the first two regions. |
| cvta.to.global.u64 %rd10, %rd2; |
| // g = %r46 / %r47 is remapped (2 -> 3, values above 2 -> 2) into %r35. |
| div.s32 %r33, %r46, %r47; |
| setp.eq.s32 %p6, %r33, 2; |
| selp.b32 %r34, 3, 2, %p6; |
| setp.lt.s32 %p7, %r33, 2; |
| selp.b32 %r35, %r33, %r34, %p7; |
| // c = %r44 % M |
| rem.s32 %r36, %r44, %r21; |
| // Destination index %r39 = c * %r45 + (i - %r44) + %r43 |
| sub.s32 %r37, %r1, %r44; |
| add.s32 %r38, %r37, %r43; |
| mad.lo.s32 %r39, %r36, %r45, %r38; |
| // Source index %r41 = %r42 * 4*M + g' * M + c |
| mad.lo.s32 %r40, %r42, %r3, %r36; |
| mad.lo.s32 %r41, %r35, %r21, %r40; |
| mul.wide.s32 %rd11, %r41, 8; |
| add.s64 %rd12, %rd10, %rd11; |
| ld.global.f64 %fd2, [%rd12]; |
| mul.wide.s32 %rd13, %r39, 8; |
| add.s64 %rd14, %rd1, %rd13; |
| st.global.f64 [%rd14], %fd2; |
| |
| BB118_7: |
| ret; |
| } |
| |
| // .globl prepare_lstm_weight_f |
| // prepare_lstm_weight_f: one f32 element per thread is copied into the array at param_2. |
| // param_0 : generic address of the source read in the first two regions |
| // param_1 : generic address of the source read in the third region |
| // param_2 : generic address of the destination array |
| // param_3 : 32-bit extent, called D below |
| // param_4 : 32-bit extent, called M below |
| // With i = ntid.x * ctaid.x + tid.x (all compares signed): |
| // i < 4*M*D : first region (BB119_5) |
| // 4*M*D <= i < 4*M*(D+M) : second region (BB119_4) |
| // 4*M*(D+M) <= i < 4*M*(D+M+1) : third region (BB119_1), dst[i] = param_1[...] |
| // otherwise : nothing is written |
| // In every region a block number g is remapped 0->0, 1->1, 2->3, anything else->2. |
| // NOTE(review): presumably weights, recurrent weights and biases with a gate reorder; |
| // confirm against the caller. |
| .visible .entry prepare_lstm_weight_f( |
| .param .u64 prepare_lstm_weight_f_param_0, |
| .param .u64 prepare_lstm_weight_f_param_1, |
| .param .u64 prepare_lstm_weight_f_param_2, |
| .param .u32 prepare_lstm_weight_f_param_3, |
| .param .u32 prepare_lstm_weight_f_param_4 |
| ) |
| { |
| .reg .pred %p<8>; |
| .reg .f32 %f<3>; |
| .reg .b32 %r<48>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd2, [prepare_lstm_weight_f_param_0]; |
| ld.param.u64 %rd3, [prepare_lstm_weight_f_param_1]; |
| ld.param.u64 %rd4, [prepare_lstm_weight_f_param_2]; |
| ld.param.u32 %r45, [prepare_lstm_weight_f_param_3]; |
| ld.param.u32 %r21, [prepare_lstm_weight_f_param_4]; |
| cvta.to.global.u64 %rd1, %rd4; |
| mov.u32 %r22, %ntid.x; |
| mov.u32 %r23, %ctaid.x; |
| mov.u32 %r24, %tid.x; |
| mad.lo.s32 %r1, %r22, %r23, %r24; |
| // %r2 = M + D, %r3 = 4*M, %r4 = 4*M*(M + D) |
| add.s32 %r2, %r21, %r45; |
| shl.b32 %r3, %r21, 2; |
| mul.lo.s32 %r4, %r2, %r3; |
| setp.lt.s32 %p1, %r1, %r4; |
| @%p1 bra BB119_3; |
| bra.uni BB119_1; |
| |
| BB119_3: |
| // %r5 = M*D, %r47 = M*M, %r7 = 4*M*D splits the first region from the second. |
| mul.lo.s32 %r5, %r21, %r45; |
| mul.lo.s32 %r47, %r21, %r21; |
| shl.b32 %r7, %r5, 2; |
| setp.lt.s32 %p5, %r1, %r7; |
| @%p5 bra BB119_5; |
| bra.uni BB119_4; |
| |
| BB119_5: |
| // First region: local index %r46 = i, block size %r47 = M*D, %r44 = i % (M*D), |
| // %r42 = %r43 = %r44 / M; %r45 keeps the value D. |
| rem.s32 %r44, %r1, %r5; |
| div.s32 %r42, %r44, %r21; |
| mov.u32 %r43, %r42; |
| mov.u32 %r46, %r1; |
| mov.u32 %r47, %r5; |
| bra.uni BB119_6; |
| |
| BB119_1: |
| // Third region: exit when i >= 4*M*(M + D + 1). |
| add.s32 %r25, %r2, 1; |
| mul.lo.s32 %r26, %r25, %r3; |
| setp.ge.s32 %p2, %r1, %r26; |
| @%p2 bra BB119_7; |
| |
| cvta.to.global.u64 %rd5, %rd3; |
| // j = i - 4*M*(M + D); g = j / M is remapped (2 -> 3, values above 2 -> 2). |
| sub.s32 %r27, %r1, %r4; |
| div.s32 %r28, %r27, %r21; |
| setp.lt.s32 %p3, %r28, 2; |
| setp.eq.s32 %p4, %r28, 2; |
| selp.b32 %r29, 3, 2, %p4; |
| selp.b32 %r30, %r28, %r29, %p3; |
| // dst[i] = param_1[g' * M + j % M] |
| rem.s32 %r31, %r27, %r21; |
| mad.lo.s32 %r32, %r30, %r21, %r31; |
| mul.wide.s32 %rd6, %r32, 4; |
| add.s64 %rd7, %rd5, %rd6; |
| ld.global.f32 %f1, [%rd7]; |
| mul.wide.s32 %rd8, %r1, 4; |
| add.s64 %rd9, %rd1, %rd8; |
| st.global.f32 [%rd9], %f1; |
| bra.uni BB119_7; |
| |
| BB119_4: |
| // Second region: local index %r46 = i - 4*M*D, block size %r47 = M*M, |
| // %r44 = %r46 % (M*M), %r43 = %r44 / M, %r42 = %r43 + D; %r45 is overwritten with M. |
| sub.s32 %r46, %r1, %r7; |
| rem.s32 %r44, %r46, %r47; |
| div.s32 %r43, %r44, %r21; |
| add.s32 %r42, %r43, %r45; |
| mov.u32 %r45, %r21; |
| |
| BB119_6: |
| // Shared tail for the first two regions. |
| cvta.to.global.u64 %rd10, %rd2; |
| // g = %r46 / %r47 is remapped (2 -> 3, values above 2 -> 2) into %r35. |
| div.s32 %r33, %r46, %r47; |
| setp.eq.s32 %p6, %r33, 2; |
| selp.b32 %r34, 3, 2, %p6; |
| setp.lt.s32 %p7, %r33, 2; |
| selp.b32 %r35, %r33, %r34, %p7; |
| // c = %r44 % M |
| rem.s32 %r36, %r44, %r21; |
| // Destination index %r39 = c * %r45 + (i - %r44) + %r43 |
| sub.s32 %r37, %r1, %r44; |
| add.s32 %r38, %r37, %r43; |
| mad.lo.s32 %r39, %r36, %r45, %r38; |
| // Source index %r41 = %r42 * 4*M + g' * M + c |
| mad.lo.s32 %r40, %r42, %r3, %r36; |
| mad.lo.s32 %r41, %r35, %r21, %r40; |
| mul.wide.s32 %rd11, %r41, 4; |
| add.s64 %rd12, %rd10, %rd11; |
| ld.global.f32 %f2, [%rd12]; |
| mul.wide.s32 %rd13, %r39, 4; |
| add.s64 %rd14, %rd1, %rd13; |
| st.global.f32 [%rd14], %f2; |
| |
| BB119_7: |
| ret; |
| } |
| |
| // .globl compute_nnz_d |
| // compute_nnz_d: per-block count of the f64 elements that are not equal to zero. |
| //   param_0 (u64): pointer to the f64 input array (read with ld.global.f64). |
| //   param_1 (u64): pointer to the f64 output array; thread 0 of each block stores |
| //                  the block's count at index %ctaid.x. |
| //   param_2 (u32): exclusive upper bound on the element indices that are read. |
| // The count is kept as an f64 in %fd46: 1.0 is added for every element that compares |
| // "not equal or unordered" with 0.0 (setp.neu), so a NaN element is counted as well. |
| // The per-thread counts are then combined through the extern shared array `memory`. |
| // NOTE(review): the stores below need at least 8 * %ntid.x bytes of shared memory behind |
| // `memory`; the launch site is not visible here - confirm. |
| .visible .entry compute_nnz_d( |
| .param .u64 compute_nnz_d_param_0, |
| .param .u64 compute_nnz_d_param_1, |
| .param .u32 compute_nnz_d_param_2 |
| ) |
| { |
| .reg .pred %p<22>; |
| .reg .b32 %r<36>; |
| .reg .f64 %fd<62>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [compute_nnz_d_param_0]; |
| ld.param.u64 %rd2, [compute_nnz_d_param_1]; |
| ld.param.u32 %r6, [compute_nnz_d_param_2]; |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| // %r35 = 2 * %ctaid.x * %ntid.x + %tid.x: first element index handled by this thread. |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| // %fd46 = running count, starts at 0.0. |
| mov.f64 %fd46, 0d0000000000000000; |
| // Skip the accumulation loop entirely when the first index is already out of range. |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB120_4; |
| |
| // Loop body: count element %r35 and, if it is in range, element %r35 + %ntid.x. |
| BB120_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 8; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f64 %fd30, [%rd5]; |
| setp.neu.f64 %p2, %fd30, 0d0000000000000000; |
| // 0d3FF0000000000000 is 1.0: add 1.0 when the element is non-zero, else 0.0. |
| selp.f64 %fd31, 0d3FF0000000000000, 0d0000000000000000, %p2; |
| add.f64 %fd46, %fd46, %fd31; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p3, %r3, %r6; |
| @%p3 bra BB120_3; |
| |
| mul.wide.u32 %rd7, %r3, 8; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f64 %fd32, [%rd8]; |
| setp.neu.f64 %p4, %fd32, 0d0000000000000000; |
| selp.f64 %fd33, 0d3FF0000000000000, 0d0000000000000000, %p4; |
| add.f64 %fd46, %fd46, %fd33; |
| |
| // Advance the index by 2 * %ntid.x * %nctaid.x and repeat while it is below param_2. |
| BB120_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p5, %r35, %r6; |
| @%p5 bra BB120_1; |
| |
| // Publish the per-thread count at memory + 8 * %tid.x (%r5) and wait for the whole block. |
| BB120_4: |
| shl.b32 %r16, %r7, 3; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f64 [%r5], %fd46; |
| bar.sync 0; |
| // Step for %ntid.x >= 1024: threads 0..511 add the value 4096 bytes (512 doubles) above their slot. |
| setp.lt.u32 %p6, %r10, 1024; |
| @%p6 bra BB120_8; |
| |
| setp.gt.u32 %p7, %r7, 511; |
| @%p7 bra BB120_7; |
| |
| ld.shared.f64 %fd34, [%r5+4096]; |
| add.f64 %fd46, %fd46, %fd34; |
| st.shared.f64 [%r5], %fd46; |
| |
| BB120_7: |
| bar.sync 0; |
| |
| // Step for %ntid.x >= 512: threads 0..255 add the value 2048 bytes (256 doubles) above their slot. |
| BB120_8: |
| setp.lt.u32 %p8, %r10, 512; |
| @%p8 bra BB120_12; |
| |
| setp.gt.u32 %p9, %r7, 255; |
| @%p9 bra BB120_11; |
| |
| ld.shared.f64 %fd35, [%r5+2048]; |
| add.f64 %fd46, %fd46, %fd35; |
| st.shared.f64 [%r5], %fd46; |
| |
| BB120_11: |
| bar.sync 0; |
| |
| // Step for %ntid.x >= 256: threads 0..127 add the value 1024 bytes (128 doubles) above their slot. |
| BB120_12: |
| setp.lt.u32 %p10, %r10, 256; |
| @%p10 bra BB120_16; |
| |
| setp.gt.u32 %p11, %r7, 127; |
| @%p11 bra BB120_15; |
| |
| ld.shared.f64 %fd36, [%r5+1024]; |
| add.f64 %fd46, %fd46, %fd36; |
| st.shared.f64 [%r5], %fd46; |
| |
| BB120_15: |
| bar.sync 0; |
| |
| // Step for %ntid.x >= 128: threads 0..63 add the value 512 bytes (64 doubles) above their slot. |
| BB120_16: |
| setp.lt.u32 %p12, %r10, 128; |
| @%p12 bra BB120_20; |
| |
| setp.gt.u32 %p13, %r7, 63; |
| @%p13 bra BB120_19; |
| |
| ld.shared.f64 %fd37, [%r5+512]; |
| add.f64 %fd46, %fd46, %fd37; |
| st.shared.f64 [%r5], %fd46; |
| |
| BB120_19: |
| bar.sync 0; |
| |
| // Only threads 0..31 continue. The remaining steps (offsets 256, 128, 64, 32, 16, 8 bytes) |
| // use volatile shared accesses and no bar.sync between them. |
| // NOTE(review): this looks like it relies on these 32 threads running in lockstep - confirm |
| // that assumption holds on every architecture this PTX is run on. |
| BB120_20: |
| setp.gt.u32 %p14, %r7, 31; |
| @%p14 bra BB120_33; |
| |
| setp.lt.u32 %p15, %r10, 64; |
| @%p15 bra BB120_23; |
| |
| ld.volatile.shared.f64 %fd38, [%r5+256]; |
| add.f64 %fd46, %fd46, %fd38; |
| st.volatile.shared.f64 [%r5], %fd46; |
| |
| BB120_23: |
| setp.lt.u32 %p16, %r10, 32; |
| @%p16 bra BB120_25; |
| |
| ld.volatile.shared.f64 %fd39, [%r5+128]; |
| add.f64 %fd46, %fd46, %fd39; |
| st.volatile.shared.f64 [%r5], %fd46; |
| |
| BB120_25: |
| setp.lt.u32 %p17, %r10, 16; |
| @%p17 bra BB120_27; |
| |
| ld.volatile.shared.f64 %fd40, [%r5+64]; |
| add.f64 %fd46, %fd46, %fd40; |
| st.volatile.shared.f64 [%r5], %fd46; |
| |
| BB120_27: |
| setp.lt.u32 %p18, %r10, 8; |
| @%p18 bra BB120_29; |
| |
| ld.volatile.shared.f64 %fd41, [%r5+32]; |
| add.f64 %fd46, %fd46, %fd41; |
| st.volatile.shared.f64 [%r5], %fd46; |
| |
| BB120_29: |
| setp.lt.u32 %p19, %r10, 4; |
| @%p19 bra BB120_31; |
| |
| ld.volatile.shared.f64 %fd42, [%r5+16]; |
| add.f64 %fd46, %fd46, %fd42; |
| st.volatile.shared.f64 [%r5], %fd46; |
| |
| BB120_31: |
| setp.lt.u32 %p20, %r10, 2; |
| @%p20 bra BB120_33; |
| |
| ld.volatile.shared.f64 %fd43, [%r5+8]; |
| add.f64 %fd44, %fd46, %fd43; |
| st.volatile.shared.f64 [%r5], %fd44; |
| |
| // Thread 0 copies memory[0] (the block total) to output[%ctaid.x]. |
| BB120_33: |
| setp.ne.s32 %p21, %r7, 0; |
| @%p21 bra BB120_35; |
| |
| ld.shared.f64 %fd45, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 8; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f64 [%rd11], %fd45; |
| |
| BB120_35: |
| ret; |
| } |
| |
| // .globl compute_nnz_f |
| // compute_nnz_f: per-block count of the f32 elements that are not equal to zero. |
| //   param_0 (u64): pointer to the f32 input array (read with ld.global.f32). |
| //   param_1 (u64): pointer to the f32 output array; thread 0 of each block stores |
| //                  the block's count at index %ctaid.x. |
| //   param_2 (u32): exclusive upper bound on the element indices that are read. |
| // The count is kept as an f32 in %f46: 1.0 is added for every element that compares |
| // "not equal or unordered" with 0.0 (setp.neu), so a NaN element is counted as well. |
| // The per-thread counts are then combined through the extern shared array `memory`. |
| // NOTE(review): the stores below need at least 4 * %ntid.x bytes of shared memory behind |
| // `memory`; the launch site is not visible here - confirm. |
| .visible .entry compute_nnz_f( |
| .param .u64 compute_nnz_f_param_0, |
| .param .u64 compute_nnz_f_param_1, |
| .param .u32 compute_nnz_f_param_2 |
| ) |
| { |
| .reg .pred %p<22>; |
| .reg .f32 %f<62>; |
| .reg .b32 %r<36>; |
| .reg .b64 %rd<12>; |
| |
| |
| ld.param.u64 %rd1, [compute_nnz_f_param_0]; |
| ld.param.u64 %rd2, [compute_nnz_f_param_1]; |
| ld.param.u32 %r6, [compute_nnz_f_param_2]; |
| mov.u32 %r7, %tid.x; |
| mov.u32 %r8, %ctaid.x; |
| shl.b32 %r9, %r8, 1; |
| mov.u32 %r10, %ntid.x; |
| // %r35 = 2 * %ctaid.x * %ntid.x + %tid.x: first element index handled by this thread. |
| mad.lo.s32 %r35, %r9, %r10, %r7; |
| // %f46 = running count, starts at 0.0. |
| mov.f32 %f46, 0f00000000; |
| // Skip the accumulation loop entirely when the first index is already out of range. |
| setp.ge.u32 %p1, %r35, %r6; |
| @%p1 bra BB121_4; |
| |
| // Loop body: count element %r35 and, if it is in range, element %r35 + %ntid.x. |
| BB121_1: |
| cvta.to.global.u64 %rd3, %rd1; |
| mul.wide.u32 %rd4, %r35, 4; |
| add.s64 %rd5, %rd3, %rd4; |
| ld.global.f32 %f30, [%rd5]; |
| setp.neu.f32 %p2, %f30, 0f00000000; |
| // 0f3F800000 is 1.0: add 1.0 when the element is non-zero, else 0.0. |
| selp.f32 %f31, 0f3F800000, 0f00000000, %p2; |
| add.f32 %f46, %f46, %f31; |
| add.s32 %r3, %r35, %r10; |
| setp.ge.u32 %p3, %r3, %r6; |
| @%p3 bra BB121_3; |
| |
| mul.wide.u32 %rd7, %r3, 4; |
| add.s64 %rd8, %rd3, %rd7; |
| ld.global.f32 %f32, [%rd8]; |
| setp.neu.f32 %p4, %f32, 0f00000000; |
| selp.f32 %f33, 0f3F800000, 0f00000000, %p4; |
| add.f32 %f46, %f46, %f33; |
| |
| // Advance the index by 2 * %ntid.x * %nctaid.x and repeat while it is below param_2. |
| BB121_3: |
| shl.b32 %r13, %r10, 1; |
| mov.u32 %r14, %nctaid.x; |
| mad.lo.s32 %r35, %r13, %r14, %r35; |
| setp.lt.u32 %p5, %r35, %r6; |
| @%p5 bra BB121_1; |
| |
| // Publish the per-thread count at memory + 4 * %tid.x (%r5) and wait for the whole block. |
| BB121_4: |
| shl.b32 %r16, %r7, 2; |
| mov.u32 %r17, memory; |
| add.s32 %r5, %r17, %r16; |
| st.shared.f32 [%r5], %f46; |
| bar.sync 0; |
| // Step for %ntid.x >= 1024: threads 0..511 add the value 2048 bytes (512 floats) above their slot. |
| setp.lt.u32 %p6, %r10, 1024; |
| @%p6 bra BB121_8; |
| |
| setp.gt.u32 %p7, %r7, 511; |
| @%p7 bra BB121_7; |
| |
| ld.shared.f32 %f34, [%r5+2048]; |
| add.f32 %f46, %f46, %f34; |
| st.shared.f32 [%r5], %f46; |
| |
| BB121_7: |
| bar.sync 0; |
| |
| // Step for %ntid.x >= 512: threads 0..255 add the value 1024 bytes (256 floats) above their slot. |
| BB121_8: |
| setp.lt.u32 %p8, %r10, 512; |
| @%p8 bra BB121_12; |
| |
| setp.gt.u32 %p9, %r7, 255; |
| @%p9 bra BB121_11; |
| |
| ld.shared.f32 %f35, [%r5+1024]; |
| add.f32 %f46, %f46, %f35; |
| st.shared.f32 [%r5], %f46; |
| |
| BB121_11: |
| bar.sync 0; |
| |
| // Step for %ntid.x >= 256: threads 0..127 add the value 512 bytes (128 floats) above their slot. |
| BB121_12: |
| setp.lt.u32 %p10, %r10, 256; |
| @%p10 bra BB121_16; |
| |
| setp.gt.u32 %p11, %r7, 127; |
| @%p11 bra BB121_15; |
| |
| ld.shared.f32 %f36, [%r5+512]; |
| add.f32 %f46, %f46, %f36; |
| st.shared.f32 [%r5], %f46; |
| |
| BB121_15: |
| bar.sync 0; |
| |
| // Step for %ntid.x >= 128: threads 0..63 add the value 256 bytes (64 floats) above their slot. |
| BB121_16: |
| setp.lt.u32 %p12, %r10, 128; |
| @%p12 bra BB121_20; |
| |
| setp.gt.u32 %p13, %r7, 63; |
| @%p13 bra BB121_19; |
| |
| ld.shared.f32 %f37, [%r5+256]; |
| add.f32 %f46, %f46, %f37; |
| st.shared.f32 [%r5], %f46; |
| |
| BB121_19: |
| bar.sync 0; |
| |
| // Only threads 0..31 continue. The remaining steps (offsets 128, 64, 32, 16, 8, 4 bytes) |
| // use volatile shared accesses and no bar.sync between them. |
| // NOTE(review): this looks like it relies on these 32 threads running in lockstep - confirm |
| // that assumption holds on every architecture this PTX is run on. |
| BB121_20: |
| setp.gt.u32 %p14, %r7, 31; |
| @%p14 bra BB121_33; |
| |
| setp.lt.u32 %p15, %r10, 64; |
| @%p15 bra BB121_23; |
| |
| ld.volatile.shared.f32 %f38, [%r5+128]; |
| add.f32 %f46, %f46, %f38; |
| st.volatile.shared.f32 [%r5], %f46; |
| |
| BB121_23: |
| setp.lt.u32 %p16, %r10, 32; |
| @%p16 bra BB121_25; |
| |
| ld.volatile.shared.f32 %f39, [%r5+64]; |
| add.f32 %f46, %f46, %f39; |
| st.volatile.shared.f32 [%r5], %f46; |
| |
| BB121_25: |
| setp.lt.u32 %p17, %r10, 16; |
| @%p17 bra BB121_27; |
| |
| ld.volatile.shared.f32 %f40, [%r5+32]; |
| add.f32 %f46, %f46, %f40; |
| st.volatile.shared.f32 [%r5], %f46; |
| |
| BB121_27: |
| setp.lt.u32 %p18, %r10, 8; |
| @%p18 bra BB121_29; |
| |
| ld.volatile.shared.f32 %f41, [%r5+16]; |
| add.f32 %f46, %f46, %f41; |
| st.volatile.shared.f32 [%r5], %f46; |
| |
| BB121_29: |
| setp.lt.u32 %p19, %r10, 4; |
| @%p19 bra BB121_31; |
| |
| ld.volatile.shared.f32 %f42, [%r5+8]; |
| add.f32 %f46, %f46, %f42; |
| st.volatile.shared.f32 [%r5], %f46; |
| |
| BB121_31: |
| setp.lt.u32 %p20, %r10, 2; |
| @%p20 bra BB121_33; |
| |
| ld.volatile.shared.f32 %f43, [%r5+4]; |
| add.f32 %f44, %f46, %f43; |
| st.volatile.shared.f32 [%r5], %f44; |
| |
| // Thread 0 copies memory[0] (the block total) to output[%ctaid.x]. |
| BB121_33: |
| setp.ne.s32 %p21, %r7, 0; |
| @%p21 bra BB121_35; |
| |
| ld.shared.f32 %f45, [memory]; |
| cvta.to.global.u64 %rd9, %rd2; |
| mul.wide.u32 %rd10, %r8, 4; |
| add.s64 %rd11, %rd9, %rd10; |
| st.global.f32 [%rd11], %f45; |
| |
| BB121_35: |
| ret; |
| } |
| |
| // .globl prepare_lstm_output_d |
| .visible .entry prepare_lstm_output_d( |
| .param .u64 prepare_lstm_output_d_param_0, |
| .param .u64 prepare_lstm_output_d_param_1, |
| .param .u32 prepare_lstm_output_d_param_2, |
| .param .u32 prepare_lstm_output_d_param_3, |
| .param .u32 prepare_lstm_output_d_param_4, |
| .param .u32 prepare_lstm_output_d_param_5 |
| ) |
| { |
| // One thread per destination element (f64). With gid = %ntid.x * %ctaid.x + %tid.x |
| // and gid < param_5 (signed compare): |
| //   outer = gid / (param_4 * param_3),  rest  = gid % (param_4 * param_3) |
| //   mid   = rest / param_4,             inner = rest % param_4 |
| //   param_0[gid] = param_1[(mid * param_2 + outer) * param_4 + inner] |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %dim2, %dim3, %dim4, %count; |
| .reg .b32 %plane, %outer, %rest, %mid, %inner, %row, %srcidx; |
| .reg .f64 %elem; |
| .reg .b64 %dstarg, %srcarg, %dstbase, %srcbase, %dstoff, %srcoff, %dstptr, %srcptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [prepare_lstm_output_d_param_5]; |
| setp.lt.s32 %inside, %gid, %count; |
| @!%inside bra PREPARE_LSTM_OUTPUT_D_EXIT; |
| |
| // Decompose gid and build the source index. |
| ld.param.u32 %dim2, [prepare_lstm_output_d_param_2]; |
| ld.param.u32 %dim3, [prepare_lstm_output_d_param_3]; |
| ld.param.u32 %dim4, [prepare_lstm_output_d_param_4]; |
| mul.lo.s32 %plane, %dim4, %dim3; |
| div.s32 %outer, %gid, %plane; |
| rem.s32 %rest, %gid, %plane; |
| div.s32 %mid, %rest, %dim4; |
| rem.s32 %inner, %rest, %dim4; |
| mad.lo.s32 %row, %mid, %dim2, %outer; |
| mad.lo.s32 %srcidx, %row, %dim4, %inner; |
| |
| // Load the source element (8-byte stride, signed index). |
| ld.param.u64 %srcarg, [prepare_lstm_output_d_param_1]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.s32 %srcoff, %srcidx, 8; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f64 %elem, [%srcptr]; |
| |
| // Store it at position gid of the destination. |
| ld.param.u64 %dstarg, [prepare_lstm_output_d_param_0]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %gid, 8; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f64 [%dstptr], %elem; |
| |
| PREPARE_LSTM_OUTPUT_D_EXIT: |
| ret; |
| } |
| |
| // .globl prepare_lstm_output_f |
| .visible .entry prepare_lstm_output_f( |
| .param .u64 prepare_lstm_output_f_param_0, |
| .param .u64 prepare_lstm_output_f_param_1, |
| .param .u32 prepare_lstm_output_f_param_2, |
| .param .u32 prepare_lstm_output_f_param_3, |
| .param .u32 prepare_lstm_output_f_param_4, |
| .param .u32 prepare_lstm_output_f_param_5 |
| ) |
| { |
| // One thread per destination element (f32). With gid = %ntid.x * %ctaid.x + %tid.x |
| // and gid < param_5 (signed compare): |
| //   outer = gid / (param_4 * param_3),  rest  = gid % (param_4 * param_3) |
| //   mid   = rest / param_4,             inner = rest % param_4 |
| //   param_0[gid] = param_1[(mid * param_2 + outer) * param_4 + inner] |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %dim2, %dim3, %dim4, %count; |
| .reg .b32 %plane, %outer, %rest, %mid, %inner, %row, %srcidx; |
| .reg .f32 %elem; |
| .reg .b64 %dstarg, %srcarg, %dstbase, %srcbase, %dstoff, %srcoff, %dstptr, %srcptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [prepare_lstm_output_f_param_5]; |
| setp.lt.s32 %inside, %gid, %count; |
| @!%inside bra PREPARE_LSTM_OUTPUT_F_EXIT; |
| |
| // Decompose gid and build the source index. |
| ld.param.u32 %dim2, [prepare_lstm_output_f_param_2]; |
| ld.param.u32 %dim3, [prepare_lstm_output_f_param_3]; |
| ld.param.u32 %dim4, [prepare_lstm_output_f_param_4]; |
| mul.lo.s32 %plane, %dim4, %dim3; |
| div.s32 %outer, %gid, %plane; |
| rem.s32 %rest, %gid, %plane; |
| div.s32 %mid, %rest, %dim4; |
| rem.s32 %inner, %rest, %dim4; |
| mad.lo.s32 %row, %mid, %dim2, %outer; |
| mad.lo.s32 %srcidx, %row, %dim4, %inner; |
| |
| // Load the source element (4-byte stride, signed index). |
| ld.param.u64 %srcarg, [prepare_lstm_output_f_param_1]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.s32 %srcoff, %srcidx, 4; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f32 %elem, [%srcptr]; |
| |
| // Store it at position gid of the destination. |
| ld.param.u64 %dstarg, [prepare_lstm_output_f_param_0]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %gid, 4; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f32 [%dstptr], %elem; |
| |
| PREPARE_LSTM_OUTPUT_F_EXIT: |
| ret; |
| } |
| |
| // .globl prepare_lstm_backward_gradients_d |
| .visible .entry prepare_lstm_backward_gradients_d( |
| .param .u64 prepare_lstm_backward_gradients_d_param_0, |
| .param .u64 prepare_lstm_backward_gradients_d_param_1, |
| .param .u32 prepare_lstm_backward_gradients_d_param_2, |
| .param .u32 prepare_lstm_backward_gradients_d_param_3, |
| .param .u32 prepare_lstm_backward_gradients_d_param_4, |
| .param .u32 prepare_lstm_backward_gradients_d_param_5, |
| .param .u32 prepare_lstm_backward_gradients_d_param_6 |
| ) |
| { |
| // One thread per source element (f64). With gid = %ntid.x * %ctaid.x + %tid.x and |
| // gid < param_5 (signed compare), the element param_0[gid] is written to param_1 at: |
| //   param_6 != 0:  ((rest / param_4) * param_2 + gid / (param_4 * param_3)) * param_4 + rest % param_4 |
| //                  where rest = gid % (param_4 * param_3) |
| //   param_6 == 0:  (param_3 - 1) * param_2 * param_4 + gid |
| .reg .pred %outside, %flagzero; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %dim2, %dim3, %dim4, %count, %flag; |
| .reg .b32 %plane, %outer, %rest, %mid, %inner, %row, %dstidx, %lastrow, %lastbase; |
| .reg .f64 %elem; |
| .reg .b64 %srcarg, %dstarg, %srcbase, %dstbase, %srcoff, %dstoff, %srcptr, %dstptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mov.u32 %tix, %tid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [prepare_lstm_backward_gradients_d_param_5]; |
| setp.ge.s32 %outside, %gid, %count; |
| @%outside bra PREPARE_LSTM_BWD_GRAD_D_EXIT; |
| |
| // Both layouts read the same source element param_0[gid]. |
| ld.param.u64 %srcarg, [prepare_lstm_backward_gradients_d_param_0]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.s32 %srcoff, %gid, 8; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f64 %elem, [%srcptr]; |
| |
| ld.param.u32 %dim2, [prepare_lstm_backward_gradients_d_param_2]; |
| ld.param.u32 %dim3, [prepare_lstm_backward_gradients_d_param_3]; |
| ld.param.u32 %dim4, [prepare_lstm_backward_gradients_d_param_4]; |
| ld.param.u32 %flag, [prepare_lstm_backward_gradients_d_param_6]; |
| setp.eq.s32 %flagzero, %flag, 0; |
| @%flagzero bra PREPARE_LSTM_BWD_GRAD_D_LAST; |
| |
| // param_6 != 0: decompose gid and rebuild the destination index. |
| mul.lo.s32 %plane, %dim4, %dim3; |
| div.s32 %outer, %gid, %plane; |
| rem.s32 %rest, %gid, %plane; |
| div.s32 %mid, %rest, %dim4; |
| rem.s32 %inner, %rest, %dim4; |
| mad.lo.s32 %row, %mid, %dim2, %outer; |
| mad.lo.s32 %dstidx, %row, %dim4, %inner; |
| bra.uni PREPARE_LSTM_BWD_GRAD_D_STORE; |
| |
| // param_6 == 0: destination index is gid shifted by (param_3 - 1) * param_2 * param_4. |
| PREPARE_LSTM_BWD_GRAD_D_LAST: |
| add.s32 %lastrow, %dim3, -1; |
| mul.lo.s32 %lastbase, %lastrow, %dim2; |
| mad.lo.s32 %dstidx, %lastbase, %dim4, %gid; |
| |
| PREPARE_LSTM_BWD_GRAD_D_STORE: |
| ld.param.u64 %dstarg, [prepare_lstm_backward_gradients_d_param_1]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %dstidx, 8; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f64 [%dstptr], %elem; |
| |
| PREPARE_LSTM_BWD_GRAD_D_EXIT: |
| ret; |
| } |
| |
| // .globl prepare_lstm_backward_gradients_f |
| .visible .entry prepare_lstm_backward_gradients_f( |
| .param .u64 prepare_lstm_backward_gradients_f_param_0, |
| .param .u64 prepare_lstm_backward_gradients_f_param_1, |
| .param .u32 prepare_lstm_backward_gradients_f_param_2, |
| .param .u32 prepare_lstm_backward_gradients_f_param_3, |
| .param .u32 prepare_lstm_backward_gradients_f_param_4, |
| .param .u32 prepare_lstm_backward_gradients_f_param_5, |
| .param .u32 prepare_lstm_backward_gradients_f_param_6 |
| ) |
| { |
| // One thread per source element (f32). With gid = %ntid.x * %ctaid.x + %tid.x and |
| // gid < param_5 (signed compare), the element param_0[gid] is written to param_1 at: |
| //   param_6 != 0:  ((rest / param_4) * param_2 + gid / (param_4 * param_3)) * param_4 + rest % param_4 |
| //                  where rest = gid % (param_4 * param_3) |
| //   param_6 == 0:  (param_3 - 1) * param_2 * param_4 + gid |
| .reg .pred %outside, %flagzero; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %dim2, %dim3, %dim4, %count, %flag; |
| .reg .b32 %plane, %outer, %rest, %mid, %inner, %row, %dstidx, %lastrow, %lastbase; |
| .reg .f32 %elem; |
| .reg .b64 %srcarg, %dstarg, %srcbase, %dstbase, %srcoff, %dstoff, %srcptr, %dstptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mov.u32 %tix, %tid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [prepare_lstm_backward_gradients_f_param_5]; |
| setp.ge.s32 %outside, %gid, %count; |
| @%outside bra PREPARE_LSTM_BWD_GRAD_F_EXIT; |
| |
| // Both layouts read the same source element param_0[gid]. |
| ld.param.u64 %srcarg, [prepare_lstm_backward_gradients_f_param_0]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.s32 %srcoff, %gid, 4; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f32 %elem, [%srcptr]; |
| |
| ld.param.u32 %dim2, [prepare_lstm_backward_gradients_f_param_2]; |
| ld.param.u32 %dim3, [prepare_lstm_backward_gradients_f_param_3]; |
| ld.param.u32 %dim4, [prepare_lstm_backward_gradients_f_param_4]; |
| ld.param.u32 %flag, [prepare_lstm_backward_gradients_f_param_6]; |
| setp.eq.s32 %flagzero, %flag, 0; |
| @%flagzero bra PREPARE_LSTM_BWD_GRAD_F_LAST; |
| |
| // param_6 != 0: decompose gid and rebuild the destination index. |
| mul.lo.s32 %plane, %dim4, %dim3; |
| div.s32 %outer, %gid, %plane; |
| rem.s32 %rest, %gid, %plane; |
| div.s32 %mid, %rest, %dim4; |
| rem.s32 %inner, %rest, %dim4; |
| mad.lo.s32 %row, %mid, %dim2, %outer; |
| mad.lo.s32 %dstidx, %row, %dim4, %inner; |
| bra.uni PREPARE_LSTM_BWD_GRAD_F_STORE; |
| |
| // param_6 == 0: destination index is gid shifted by (param_3 - 1) * param_2 * param_4. |
| PREPARE_LSTM_BWD_GRAD_F_LAST: |
| add.s32 %lastrow, %dim3, -1; |
| mul.lo.s32 %lastbase, %lastrow, %dim2; |
| mad.lo.s32 %dstidx, %lastbase, %dim4, %gid; |
| |
| PREPARE_LSTM_BWD_GRAD_F_STORE: |
| ld.param.u64 %dstarg, [prepare_lstm_backward_gradients_f_param_1]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %dstidx, 4; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f32 [%dstptr], %elem; |
| |
| PREPARE_LSTM_BWD_GRAD_F_EXIT: |
| ret; |
| } |
| |
| // .globl prepare_lstm_dweight_d |
| // prepare_lstm_dweight_d: copies f64 elements from the buffer at param_2 into the buffers at |
| // param_0 and param_1, one element per thread, using a remapped index. |
| // In the comments below D = param_3, M = param_4 and idx = %ntid.x * %ctaid.x + %tid.x |
| // (D and M are names chosen for these comments only). |
| //   idx < 4*M*(D+M)                : param_0[remapped] = param_2[remapped source] |
| //   4*M*(D+M) <= idx < 4*M*(D+M+1) : param_1[remapped] = param_2[idx] |
| //   otherwise                      : nothing is written |
| // NOTE(review): presumably D is the LSTM input size, M the hidden size, and the 2<->3 swap |
| // reorders gates between two weight layouts - confirm against the caller. |
| .visible .entry prepare_lstm_dweight_d( |
| .param .u64 prepare_lstm_dweight_d_param_0, |
| .param .u64 prepare_lstm_dweight_d_param_1, |
| .param .u64 prepare_lstm_dweight_d_param_2, |
| .param .u32 prepare_lstm_dweight_d_param_3, |
| .param .u32 prepare_lstm_dweight_d_param_4 |
| ) |
| { |
| .reg .pred %p<8>; |
| .reg .b32 %r<48>; |
| .reg .f64 %fd<3>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd2, [prepare_lstm_dweight_d_param_0]; |
| ld.param.u64 %rd3, [prepare_lstm_dweight_d_param_1]; |
| ld.param.u64 %rd4, [prepare_lstm_dweight_d_param_2]; |
| ld.param.u32 %r45, [prepare_lstm_dweight_d_param_3]; |
| ld.param.u32 %r21, [prepare_lstm_dweight_d_param_4]; |
| cvta.to.global.u64 %rd1, %rd4; |
| mov.u32 %r22, %ntid.x; |
| mov.u32 %r23, %ctaid.x; |
| mov.u32 %r24, %tid.x; |
| // %r1 = idx. |
| mad.lo.s32 %r1, %r22, %r23, %r24; |
| // %r2 = M + D, %r3 = 4 * M, %r4 = 4 * M * (M + D). |
| add.s32 %r2, %r21, %r45; |
| shl.b32 %r3, %r21, 2; |
| mul.lo.s32 %r4, %r2, %r3; |
| setp.lt.s32 %p1, %r1, %r4; |
| @%p1 bra BB126_3; |
| bra.uni BB126_1; |
| |
| // idx < 4*M*(D+M): choose between the first 4*M*D indices (BB126_5) and the rest (BB126_4). |
| BB126_3: |
| mul.lo.s32 %r5, %r21, %r45; |
| // %r47 = M * M here; BB126_5 overwrites it with M * D. |
| mul.lo.s32 %r47, %r21, %r21; |
| shl.b32 %r7, %r5, 2; |
| setp.lt.s32 %p5, %r1, %r7; |
| @%p5 bra BB126_5; |
| bra.uni BB126_4; |
| |
| // idx < 4*M*D: block size %r47 = M*D, %r46 = idx, %r44 = idx % (M*D), %r42 = %r43 = %r44 / M. |
| // %r45 keeps the value D on this path. |
| BB126_5: |
| rem.s32 %r44, %r1, %r5; |
| div.s32 %r42, %r44, %r21; |
| mov.u32 %r43, %r42; |
| mov.u32 %r46, %r1; |
| mov.u32 %r47, %r5; |
| bra.uni BB126_6; |
| |
| // idx >= 4*M*(D+M): only the next 4*M indices do any work. |
| BB126_1: |
| add.s32 %r25, %r2, 1; |
| mul.lo.s32 %r26, %r25, %r3; |
| setp.ge.s32 %p2, %r1, %r26; |
| @%p2 bra BB126_7; |
| |
| cvta.to.global.u64 %rd5, %rd3; |
| // %r27 = idx - 4*M*(D+M); %r28 = %r27 / M selects one of four M-sized chunks. |
| sub.s32 %r27, %r1, %r4; |
| div.s32 %r28, %r27, %r21; |
| // %r30 = chunk number with 2 and 3 exchanged (values below 2 are unchanged). |
| setp.lt.s32 %p3, %r28, 2; |
| setp.eq.s32 %p4, %r28, 2; |
| selp.b32 %r29, 3, 2, %p4; |
| selp.b32 %r30, %r28, %r29, %p3; |
| rem.s32 %r31, %r27, %r21; |
| // Destination index in param_1: %r30 * M + (%r27 % M); the source is param_2[idx]. |
| mad.lo.s32 %r32, %r30, %r21, %r31; |
| mul.wide.s32 %rd6, %r1, 8; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f64 %fd1, [%rd7]; |
| mul.wide.s32 %rd8, %r32, 8; |
| add.s64 %rd9, %rd5, %rd8; |
| st.global.f64 [%rd9], %fd1; |
| bra.uni BB126_7; |
| |
| // 4*M*D <= idx < 4*M*(D+M): block size %r47 = M*M, %r46 = idx - 4*M*D, %r44 = %r46 % (M*M), |
| // %r43 = %r44 / M, %r42 = %r43 + D, and %r45 is replaced by M for the shared tail. |
| BB126_4: |
| sub.s32 %r46, %r1, %r7; |
| rem.s32 %r44, %r46, %r47; |
| div.s32 %r43, %r44, %r21; |
| add.s32 %r42, %r43, %r45; |
| mov.u32 %r45, %r21; |
| |
| // Shared tail for both sub-cases above. |
| BB126_6: |
| cvta.to.global.u64 %rd10, %rd2; |
| // %r33 = block number (%r46 / %r47); %r35 = the same number with 2 and 3 exchanged. |
| div.s32 %r33, %r46, %r47; |
| setp.eq.s32 %p6, %r33, 2; |
| selp.b32 %r34, 3, 2, %p6; |
| setp.lt.s32 %p7, %r33, 2; |
| selp.b32 %r35, %r33, %r34, %p7; |
| // %r36 = %r44 % M. |
| rem.s32 %r36, %r44, %r21; |
| // Source index %r39 = (idx - %r44) + %r43 + %r36 * %r45. |
| sub.s32 %r37, %r1, %r44; |
| add.s32 %r38, %r37, %r43; |
| mad.lo.s32 %r39, %r36, %r45, %r38; |
| // Destination index %r41 = %r42 * (4*M) + %r35 * M + %r36. |
| mad.lo.s32 %r40, %r42, %r3, %r36; |
| mad.lo.s32 %r41, %r35, %r21, %r40; |
| mul.wide.s32 %rd11, %r39, 8; |
| add.s64 %rd12, %rd1, %rd11; |
| ld.global.f64 %fd2, [%rd12]; |
| mul.wide.s32 %rd13, %r41, 8; |
| add.s64 %rd14, %rd10, %rd13; |
| st.global.f64 [%rd14], %fd2; |
| |
| BB126_7: |
| ret; |
| } |
| |
| // .globl prepare_lstm_dweight_f |
| // prepare_lstm_dweight_f: copies f32 elements from the buffer at param_2 into the buffers at |
| // param_0 and param_1, one element per thread, using a remapped index. |
| // In the comments below D = param_3, M = param_4 and idx = %ntid.x * %ctaid.x + %tid.x |
| // (D and M are names chosen for these comments only). |
| //   idx < 4*M*(D+M)                : param_0[remapped] = param_2[remapped source] |
| //   4*M*(D+M) <= idx < 4*M*(D+M+1) : param_1[remapped] = param_2[idx] |
| //   otherwise                      : nothing is written |
| // NOTE(review): presumably D is the LSTM input size, M the hidden size, and the 2<->3 swap |
| // reorders gates between two weight layouts - confirm against the caller. |
| .visible .entry prepare_lstm_dweight_f( |
| .param .u64 prepare_lstm_dweight_f_param_0, |
| .param .u64 prepare_lstm_dweight_f_param_1, |
| .param .u64 prepare_lstm_dweight_f_param_2, |
| .param .u32 prepare_lstm_dweight_f_param_3, |
| .param .u32 prepare_lstm_dweight_f_param_4 |
| ) |
| { |
| .reg .pred %p<8>; |
| .reg .f32 %f<3>; |
| .reg .b32 %r<48>; |
| .reg .b64 %rd<15>; |
| |
| |
| ld.param.u64 %rd2, [prepare_lstm_dweight_f_param_0]; |
| ld.param.u64 %rd3, [prepare_lstm_dweight_f_param_1]; |
| ld.param.u64 %rd4, [prepare_lstm_dweight_f_param_2]; |
| ld.param.u32 %r45, [prepare_lstm_dweight_f_param_3]; |
| ld.param.u32 %r21, [prepare_lstm_dweight_f_param_4]; |
| cvta.to.global.u64 %rd1, %rd4; |
| mov.u32 %r22, %ntid.x; |
| mov.u32 %r23, %ctaid.x; |
| mov.u32 %r24, %tid.x; |
| // %r1 = idx. |
| mad.lo.s32 %r1, %r22, %r23, %r24; |
| // %r2 = M + D, %r3 = 4 * M, %r4 = 4 * M * (M + D). |
| add.s32 %r2, %r21, %r45; |
| shl.b32 %r3, %r21, 2; |
| mul.lo.s32 %r4, %r2, %r3; |
| setp.lt.s32 %p1, %r1, %r4; |
| @%p1 bra BB127_3; |
| bra.uni BB127_1; |
| |
| // idx < 4*M*(D+M): choose between the first 4*M*D indices (BB127_5) and the rest (BB127_4). |
| BB127_3: |
| mul.lo.s32 %r5, %r21, %r45; |
| // %r47 = M * M here; BB127_5 overwrites it with M * D. |
| mul.lo.s32 %r47, %r21, %r21; |
| shl.b32 %r7, %r5, 2; |
| setp.lt.s32 %p5, %r1, %r7; |
| @%p5 bra BB127_5; |
| bra.uni BB127_4; |
| |
| // idx < 4*M*D: block size %r47 = M*D, %r46 = idx, %r44 = idx % (M*D), %r42 = %r43 = %r44 / M. |
| // %r45 keeps the value D on this path. |
| BB127_5: |
| rem.s32 %r44, %r1, %r5; |
| div.s32 %r42, %r44, %r21; |
| mov.u32 %r43, %r42; |
| mov.u32 %r46, %r1; |
| mov.u32 %r47, %r5; |
| bra.uni BB127_6; |
| |
| // idx >= 4*M*(D+M): only the next 4*M indices do any work. |
| BB127_1: |
| add.s32 %r25, %r2, 1; |
| mul.lo.s32 %r26, %r25, %r3; |
| setp.ge.s32 %p2, %r1, %r26; |
| @%p2 bra BB127_7; |
| |
| cvta.to.global.u64 %rd5, %rd3; |
| // %r27 = idx - 4*M*(D+M); %r28 = %r27 / M selects one of four M-sized chunks. |
| sub.s32 %r27, %r1, %r4; |
| div.s32 %r28, %r27, %r21; |
| // %r30 = chunk number with 2 and 3 exchanged (values below 2 are unchanged). |
| setp.lt.s32 %p3, %r28, 2; |
| setp.eq.s32 %p4, %r28, 2; |
| selp.b32 %r29, 3, 2, %p4; |
| selp.b32 %r30, %r28, %r29, %p3; |
| rem.s32 %r31, %r27, %r21; |
| // Destination index in param_1: %r30 * M + (%r27 % M); the source is param_2[idx]. |
| mad.lo.s32 %r32, %r30, %r21, %r31; |
| mul.wide.s32 %rd6, %r1, 4; |
| add.s64 %rd7, %rd1, %rd6; |
| ld.global.f32 %f1, [%rd7]; |
| mul.wide.s32 %rd8, %r32, 4; |
| add.s64 %rd9, %rd5, %rd8; |
| st.global.f32 [%rd9], %f1; |
| bra.uni BB127_7; |
| |
| // 4*M*D <= idx < 4*M*(D+M): block size %r47 = M*M, %r46 = idx - 4*M*D, %r44 = %r46 % (M*M), |
| // %r43 = %r44 / M, %r42 = %r43 + D, and %r45 is replaced by M for the shared tail. |
| BB127_4: |
| sub.s32 %r46, %r1, %r7; |
| rem.s32 %r44, %r46, %r47; |
| div.s32 %r43, %r44, %r21; |
| add.s32 %r42, %r43, %r45; |
| mov.u32 %r45, %r21; |
| |
| // Shared tail for both sub-cases above. |
| BB127_6: |
| cvta.to.global.u64 %rd10, %rd2; |
| // %r33 = block number (%r46 / %r47); %r35 = the same number with 2 and 3 exchanged. |
| div.s32 %r33, %r46, %r47; |
| setp.eq.s32 %p6, %r33, 2; |
| selp.b32 %r34, 3, 2, %p6; |
| setp.lt.s32 %p7, %r33, 2; |
| selp.b32 %r35, %r33, %r34, %p7; |
| // %r36 = %r44 % M. |
| rem.s32 %r36, %r44, %r21; |
| // Source index %r39 = (idx - %r44) + %r43 + %r36 * %r45. |
| sub.s32 %r37, %r1, %r44; |
| add.s32 %r38, %r37, %r43; |
| mad.lo.s32 %r39, %r36, %r45, %r38; |
| // Destination index %r41 = %r42 * (4*M) + %r35 * M + %r36. |
| mad.lo.s32 %r40, %r42, %r3, %r36; |
| mad.lo.s32 %r41, %r35, %r21, %r40; |
| mul.wide.s32 %rd11, %r39, 4; |
| add.s64 %rd12, %rd1, %rd11; |
| ld.global.f32 %f2, [%rd12]; |
| mul.wide.s32 %rd13, %r41, 4; |
| add.s64 %rd14, %rd10, %rd13; |
| st.global.f32 [%rd14], %f2; |
| |
| BB127_7: |
| ret; |
| } |
| |
| // .globl prepare_lstm_dinput_d |
| .visible .entry prepare_lstm_dinput_d( |
| .param .u64 prepare_lstm_dinput_d_param_0, |
| .param .u64 prepare_lstm_dinput_d_param_1, |
| .param .u32 prepare_lstm_dinput_d_param_2, |
| .param .u32 prepare_lstm_dinput_d_param_3, |
| .param .u32 prepare_lstm_dinput_d_param_4, |
| .param .u32 prepare_lstm_dinput_d_param_5 |
| ) |
| { |
| // One thread per destination element (f64). With gid = %ntid.x * %ctaid.x + %tid.x |
| // and gid < param_5 (signed compare): |
| //   outer = gid / param_4,   rest  = gid % param_4 |
| //   mid   = rest / param_3,  inner = rest % param_3 |
| //   param_0[gid] = param_1[(mid * param_2 + outer) * param_3 + inner] |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %dim2, %dim3, %dim4, %count; |
| .reg .b32 %outer, %rest, %mid, %inner, %row, %srcidx; |
| .reg .f64 %elem; |
| .reg .b64 %dstarg, %srcarg, %dstbase, %srcbase, %dstoff, %srcoff, %dstptr, %srcptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [prepare_lstm_dinput_d_param_5]; |
| setp.lt.s32 %inside, %gid, %count; |
| @!%inside bra PREPARE_LSTM_DINPUT_D_EXIT; |
| |
| // Decompose gid and build the source index. |
| ld.param.u32 %dim2, [prepare_lstm_dinput_d_param_2]; |
| ld.param.u32 %dim3, [prepare_lstm_dinput_d_param_3]; |
| ld.param.u32 %dim4, [prepare_lstm_dinput_d_param_4]; |
| div.s32 %outer, %gid, %dim4; |
| rem.s32 %rest, %gid, %dim4; |
| div.s32 %mid, %rest, %dim3; |
| rem.s32 %inner, %rest, %dim3; |
| mad.lo.s32 %row, %mid, %dim2, %outer; |
| mad.lo.s32 %srcidx, %row, %dim3, %inner; |
| |
| // Load the source element (8-byte stride, signed index). |
| ld.param.u64 %srcarg, [prepare_lstm_dinput_d_param_1]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.s32 %srcoff, %srcidx, 8; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f64 %elem, [%srcptr]; |
| |
| // Store it at position gid of the destination. |
| ld.param.u64 %dstarg, [prepare_lstm_dinput_d_param_0]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %gid, 8; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f64 [%dstptr], %elem; |
| |
| PREPARE_LSTM_DINPUT_D_EXIT: |
| ret; |
| } |
| |
| // .globl prepare_lstm_dinput_f |
| .visible .entry prepare_lstm_dinput_f( |
| .param .u64 prepare_lstm_dinput_f_param_0, |
| .param .u64 prepare_lstm_dinput_f_param_1, |
| .param .u32 prepare_lstm_dinput_f_param_2, |
| .param .u32 prepare_lstm_dinput_f_param_3, |
| .param .u32 prepare_lstm_dinput_f_param_4, |
| .param .u32 prepare_lstm_dinput_f_param_5 |
| ) |
| { |
| // One thread per destination element (f32). With gid = %ntid.x * %ctaid.x + %tid.x |
| // and gid < param_5 (signed compare): |
| //   outer = gid / param_4,   rest  = gid % param_4 |
| //   mid   = rest / param_3,  inner = rest % param_3 |
| //   param_0[gid] = param_1[(mid * param_2 + outer) * param_3 + inner] |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %dim2, %dim3, %dim4, %count; |
| .reg .b32 %outer, %rest, %mid, %inner, %row, %srcidx; |
| .reg .f32 %elem; |
| .reg .b64 %dstarg, %srcarg, %dstbase, %srcbase, %dstoff, %srcoff, %dstptr, %srcptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [prepare_lstm_dinput_f_param_5]; |
| setp.lt.s32 %inside, %gid, %count; |
| @!%inside bra PREPARE_LSTM_DINPUT_F_EXIT; |
| |
| // Decompose gid and build the source index. |
| ld.param.u32 %dim2, [prepare_lstm_dinput_f_param_2]; |
| ld.param.u32 %dim3, [prepare_lstm_dinput_f_param_3]; |
| ld.param.u32 %dim4, [prepare_lstm_dinput_f_param_4]; |
| div.s32 %outer, %gid, %dim4; |
| rem.s32 %rest, %gid, %dim4; |
| div.s32 %mid, %rest, %dim3; |
| rem.s32 %inner, %rest, %dim3; |
| mad.lo.s32 %row, %mid, %dim2, %outer; |
| mad.lo.s32 %srcidx, %row, %dim3, %inner; |
| |
| // Load the source element (4-byte stride, signed index). |
| ld.param.u64 %srcarg, [prepare_lstm_dinput_f_param_1]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.s32 %srcoff, %srcidx, 4; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f32 %elem, [%srcptr]; |
| |
| // Store it at position gid of the destination. |
| ld.param.u64 %dstarg, [prepare_lstm_dinput_f_param_0]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %gid, 4; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f32 [%dstptr], %elem; |
| |
| PREPARE_LSTM_DINPUT_F_EXIT: |
| ret; |
| } |
| |
| // .globl colwise_reshape_d |
| .visible .entry colwise_reshape_d( |
| .param .u64 colwise_reshape_d_param_0, |
| .param .u64 colwise_reshape_d_param_1, |
| .param .u32 colwise_reshape_d_param_2, |
| .param .u32 colwise_reshape_d_param_3, |
| .param .u32 colwise_reshape_d_param_4, |
| .param .u32 colwise_reshape_d_param_5, |
| .param .u32 colwise_reshape_d_param_6 |
| ) |
| { |
| // One thread per destination element (f64). With gid = %ntid.x * %ctaid.x + %tid.x |
| // and gid < param_2 (unsigned compare): |
| //   lin = (gid % param_6) * param_5 + gid / param_6        (unsigned div/rem) |
| //   src = (lin % param_3) * param_4 + lin / param_3        (unsigned div/rem) |
| //   param_1[gid] = param_0[src] |
| // The source offset is widened as unsigned, the destination offset as signed. |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %count, %dim3, %dim4, %dim5, %dim6; |
| .reg .b32 %gmod, %gdiv, %lin, %lmod, %ldiv, %srcidx; |
| .reg .f64 %elem; |
| .reg .b64 %srcarg, %dstarg, %srcbase, %dstbase, %srcoff, %dstoff, %srcptr, %dstptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [colwise_reshape_d_param_2]; |
| setp.lt.u32 %inside, %gid, %count; |
| @!%inside bra COLWISE_RESHAPE_D_EXIT; |
| |
| // First remap: gid -> lin. |
| ld.param.u32 %dim5, [colwise_reshape_d_param_5]; |
| ld.param.u32 %dim6, [colwise_reshape_d_param_6]; |
| div.u32 %gdiv, %gid, %dim6; |
| rem.u32 %gmod, %gid, %dim6; |
| mad.lo.s32 %lin, %gmod, %dim5, %gdiv; |
| |
| // Second remap: lin -> source index. |
| ld.param.u32 %dim3, [colwise_reshape_d_param_3]; |
| ld.param.u32 %dim4, [colwise_reshape_d_param_4]; |
| div.u32 %ldiv, %lin, %dim3; |
| rem.u32 %lmod, %lin, %dim3; |
| mad.lo.s32 %srcidx, %lmod, %dim4, %ldiv; |
| |
| // Load from param_0 (8-byte stride, unsigned index). |
| ld.param.u64 %srcarg, [colwise_reshape_d_param_0]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.u32 %srcoff, %srcidx, 8; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f64 %elem, [%srcptr]; |
| |
| // Store to param_1 at position gid (signed index). |
| ld.param.u64 %dstarg, [colwise_reshape_d_param_1]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %gid, 8; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f64 [%dstptr], %elem; |
| |
| COLWISE_RESHAPE_D_EXIT: |
| ret; |
| } |
| |
| // .globl colwise_reshape_f |
| .visible .entry colwise_reshape_f( |
| .param .u64 colwise_reshape_f_param_0, |
| .param .u64 colwise_reshape_f_param_1, |
| .param .u32 colwise_reshape_f_param_2, |
| .param .u32 colwise_reshape_f_param_3, |
| .param .u32 colwise_reshape_f_param_4, |
| .param .u32 colwise_reshape_f_param_5, |
| .param .u32 colwise_reshape_f_param_6 |
| ) |
| { |
| // One thread per destination element (f32). With gid = %ntid.x * %ctaid.x + %tid.x |
| // and gid < param_2 (unsigned compare): |
| //   lin = (gid % param_6) * param_5 + gid / param_6        (unsigned div/rem) |
| //   src = (lin % param_3) * param_4 + lin / param_3        (unsigned div/rem) |
| //   param_1[gid] = param_0[src] |
| // The source offset is widened as unsigned, the destination offset as signed. |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix; |
| .reg .b32 %count, %dim3, %dim4, %dim5, %dim6; |
| .reg .b32 %gmod, %gdiv, %lin, %lmod, %ldiv, %srcidx; |
| .reg .f32 %elem; |
| .reg .b64 %srcarg, %dstarg, %srcbase, %dstbase, %srcoff, %dstoff, %srcptr, %dstptr; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [colwise_reshape_f_param_2]; |
| setp.lt.u32 %inside, %gid, %count; |
| @!%inside bra COLWISE_RESHAPE_F_EXIT; |
| |
| // First remap: gid -> lin. |
| ld.param.u32 %dim5, [colwise_reshape_f_param_5]; |
| ld.param.u32 %dim6, [colwise_reshape_f_param_6]; |
| div.u32 %gdiv, %gid, %dim6; |
| rem.u32 %gmod, %gid, %dim6; |
| mad.lo.s32 %lin, %gmod, %dim5, %gdiv; |
| |
| // Second remap: lin -> source index. |
| ld.param.u32 %dim3, [colwise_reshape_f_param_3]; |
| ld.param.u32 %dim4, [colwise_reshape_f_param_4]; |
| div.u32 %ldiv, %lin, %dim3; |
| rem.u32 %lmod, %lin, %dim3; |
| mad.lo.s32 %srcidx, %lmod, %dim4, %ldiv; |
| |
| // Load from param_0 (4-byte stride, unsigned index). |
| ld.param.u64 %srcarg, [colwise_reshape_f_param_0]; |
| cvta.to.global.u64 %srcbase, %srcarg; |
| mul.wide.u32 %srcoff, %srcidx, 4; |
| add.s64 %srcptr, %srcbase, %srcoff; |
| ld.global.f32 %elem, [%srcptr]; |
| |
| // Store to param_1 at position gid (signed index). |
| ld.param.u64 %dstarg, [colwise_reshape_f_param_1]; |
| cvta.to.global.u64 %dstbase, %dstarg; |
| mul.wide.s32 %dstoff, %gid, 4; |
| add.s64 %dstptr, %dstbase, %dstoff; |
| st.global.f32 [%dstptr], %elem; |
| |
| COLWISE_RESHAPE_F_EXIT: |
| ret; |
| } |
| |
| // .globl update_nesterov_x_d |
| .visible .entry update_nesterov_x_d( |
| .param .u64 update_nesterov_x_d_param_0, |
| .param .u64 update_nesterov_x_d_param_1, |
| .param .u64 update_nesterov_x_d_param_2, |
| .param .f64 update_nesterov_x_d_param_3, |
| .param .u64 update_nesterov_x_d_param_4, |
| .param .u32 update_nesterov_x_d_param_5 |
| ) |
| { |
| // Element-wise f64 update. With gid = %ntid.x * %ctaid.x + %tid.x, gid < param_5 |
| // (unsigned compare) and s = param_3: |
| //   param_4[gid] = fma(s + 1.0, param_1[gid], param_0[gid] - param_2[gid] * s) |
| // All four arrays are indexed with the same byte offset (signed gid * 8). |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix, %count; |
| .reg .f64 %scale, %val0, %val1, %val2, %scaled2, %diff, %scale1, %res; |
| .reg .b64 %arg0, %arg1, %arg2, %arg4, %offs; |
| .reg .b64 %base0, %base1, %base2, %base4, %ptr0, %ptr1, %ptr2, %ptr4; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [update_nesterov_x_d_param_5]; |
| setp.lt.u32 %inside, %gid, %count; |
| @!%inside bra UPDATE_NESTEROV_X_D_EXIT; |
| |
| // Shared byte offset for every array. |
| mul.wide.s32 %offs, %gid, 8; |
| ld.param.f64 %scale, [update_nesterov_x_d_param_3]; |
| |
| // diff = param_0[gid] - param_2[gid] * s |
| ld.param.u64 %arg2, [update_nesterov_x_d_param_2]; |
| cvta.to.global.u64 %base2, %arg2; |
| add.s64 %ptr2, %base2, %offs; |
| ld.global.f64 %val2, [%ptr2]; |
| mul.f64 %scaled2, %val2, %scale; |
| ld.param.u64 %arg0, [update_nesterov_x_d_param_0]; |
| cvta.to.global.u64 %base0, %arg0; |
| add.s64 %ptr0, %base0, %offs; |
| ld.global.f64 %val0, [%ptr0]; |
| sub.f64 %diff, %val0, %scaled2; |
| |
| // res = (s + 1.0) * param_1[gid] + diff, as one fused multiply-add. |
| ld.param.u64 %arg1, [update_nesterov_x_d_param_1]; |
| cvta.to.global.u64 %base1, %arg1; |
| add.s64 %ptr1, %base1, %offs; |
| ld.global.f64 %val1, [%ptr1]; |
| add.f64 %scale1, %scale, 0d3FF0000000000000; |
| fma.rn.f64 %res, %scale1, %val1, %diff; |
| |
| ld.param.u64 %arg4, [update_nesterov_x_d_param_4]; |
| cvta.to.global.u64 %base4, %arg4; |
| add.s64 %ptr4, %base4, %offs; |
| st.global.f64 [%ptr4], %res; |
| |
| UPDATE_NESTEROV_X_D_EXIT: |
| ret; |
| } |
| |
| // .globl update_nesterov_x_f |
| .visible .entry update_nesterov_x_f( |
| .param .u64 update_nesterov_x_f_param_0, |
| .param .u64 update_nesterov_x_f_param_1, |
| .param .u64 update_nesterov_x_f_param_2, |
| .param .f64 update_nesterov_x_f_param_3, |
| .param .u64 update_nesterov_x_f_param_4, |
| .param .u32 update_nesterov_x_f_param_5 |
| ) |
| { |
| // Element-wise f32 update evaluated in f64. With gid = %ntid.x * %ctaid.x + %tid.x, |
| // gid < param_5 (unsigned compare) and s = param_3 (f64): |
| //   param_4[gid] = (f32) fma(s + 1.0, param_1[gid], param_0[gid] - param_2[gid] * s) |
| // Inputs are widened to f64 before the arithmetic and the result is rounded to |
| // nearest-even on the way back to f32. All arrays share the byte offset gid * 4. |
| .reg .pred %inside; |
| .reg .b32 %gid, %blk, %bsz, %tix, %count; |
| .reg .f32 %raw0, %raw1, %raw2, %out; |
| .reg .f64 %scale, %val0, %val1, %val2, %scaled2, %diff, %scale1, %res; |
| .reg .b64 %arg0, %arg1, %arg2, %arg4, %offs; |
| .reg .b64 %base0, %base1, %base2, %base4, %ptr0, %ptr1, %ptr2, %ptr4; |
| |
| // Global thread index and bounds check. |
| mov.u32 %tix, %tid.x; |
| mov.u32 %bsz, %ntid.x; |
| mov.u32 %blk, %ctaid.x; |
| mad.lo.s32 %gid, %bsz, %blk, %tix; |
| ld.param.u32 %count, [update_nesterov_x_f_param_5]; |
| setp.lt.u32 %inside, %gid, %count; |
| @!%inside bra UPDATE_NESTEROV_X_F_EXIT; |
| |
| // Shared byte offset for every array. |
| mul.wide.s32 %offs, %gid, 4; |
| ld.param.f64 %scale, [update_nesterov_x_f_param_3]; |
| |
| // diff = param_0[gid] - param_2[gid] * s |
| ld.param.u64 %arg0, [update_nesterov_x_f_param_0]; |
| cvta.to.global.u64 %base0, %arg0; |
| add.s64 %ptr0, %base0, %offs; |
| ld.global.f32 %raw0, [%ptr0]; |
| cvt.f64.f32 %val0, %raw0; |
| ld.param.u64 %arg2, [update_nesterov_x_f_param_2]; |
| cvta.to.global.u64 %base2, %arg2; |
| add.s64 %ptr2, %base2, %offs; |
| ld.global.f32 %raw2, [%ptr2]; |
| cvt.f64.f32 %val2, %raw2; |
| mul.f64 %scaled2, %val2, %scale; |
| sub.f64 %diff, %val0, %scaled2; |
| |
| // res = (s + 1.0) * param_1[gid] + diff, as one fused multiply-add. |
| ld.param.u64 %arg1, [update_nesterov_x_f_param_1]; |
| cvta.to.global.u64 %base1, %arg1; |
| add.s64 %ptr1, %base1, %offs; |
| ld.global.f32 %raw1, [%ptr1]; |
| cvt.f64.f32 %val1, %raw1; |
| add.f64 %scale1, %scale, 0d3FF0000000000000; |
| fma.rn.f64 %res, %scale1, %val1, %diff; |
| |
| // Narrow to f32 and store. |
| cvt.rn.f32.f64 %out, %res; |
| ld.param.u64 %arg4, [update_nesterov_x_f_param_4]; |
| cvta.to.global.u64 %base4, %arg4; |
| add.s64 %ptr4, %base4, %offs; |
| st.global.f32 [%ptr4], %out; |
| |
| UPDATE_NESTEROV_X_F_EXIT: |
| ret; |
| } |
| |
| // __internal_trig_reduction_slowpathd: device helper that takes an f64 (param_0) and a |
| // pointer (param_1), returns an f64 and stores a signed 32-bit integer through the pointer. |
| // What the code does, in order: |
| //   1. If the exponent field of the input is 2047 (Inf/NaN), return the input unchanged |
| //      and store nothing through param_1. |
| //   2. Multiply the 64-bit left-aligned mantissa by consecutive 64-bit words of the constant |
| //      table __cudart_i2opi_d, keeping the multi-word product in a 40-byte local buffer. |
| //   3. Take the top two bits of the aligned product (plus a rounding bit) as the integer |
| //      result, negate it for negative inputs, and store it through param_1. |
| //   4. Convert the remaining fraction bits back to an f64 after multiplying by a 64-bit constant. |
| // NOTE(review): this looks like a Payne-Hanek style argument reduction by pi/2 (table = bits |
| // of 2/pi, integer = quadrant); that is inferred from names and constants - confirm. |
| .func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd( |
| .param .b64 __internal_trig_reduction_slowpathd_param_0, |
| .param .b64 __internal_trig_reduction_slowpathd_param_1 |
| ) |
| { |
| .local .align 8 .b8 __local_depot134[40]; |
| .reg .b64 %SP; |
| .reg .b64 %SPL; |
| .reg .pred %p<9>; |
| .reg .b32 %r<42>; |
| .reg .f64 %fd<5>; |
| .reg .b64 %rd<101>; |
| |
| |
| mov.u64 %SPL, __local_depot134; |
| ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0]; |
| ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1]; |
| // %rd1 = base address of the 5-word (40-byte) local buffer. |
| add.u64 %rd1, %SPL, 0; |
| // %r1 = upper 32 bits of the input's bit pattern. |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r1}, %fd4; |
| } |
| // %r40 = sign bit, %r3 = upper word >> 20, %r4 = 11-bit exponent field. |
| and.b32 %r40, %r1, -2147483648; |
| shr.u32 %r3, %r1, 20; |
| bfe.u32 %r4, %r1, 20, 11; |
| // Exponent field 2047: skip everything and return %fd4 as it was passed in. |
| setp.eq.s32 %p1, %r4, 2047; |
| @%p1 bra BB134_13; |
| |
| // %r16 = (exponent - 1024) >> 6; the loop runs over table words %r5 = 15 - %r16 up to |
| // (not including) %r6 = min(18, 19 - %r16). |
| add.s32 %r15, %r4, -1024; |
| shr.u32 %r16, %r15, 6; |
| mov.u32 %r17, 15; |
| sub.s32 %r5, %r17, %r16; |
| mov.u32 %r18, 19; |
| sub.s32 %r19, %r18, %r16; |
| mov.u32 %r20, 18; |
| min.s32 %r6, %r20, %r19; |
| // %rd94 = running carry (high half of the previous product), %rd93 = slot for the final carry. |
| mov.u64 %rd94, 0; |
| setp.ge.s32 %p2, %r5, %r6; |
| mov.u64 %rd93, %rd1; |
| @%p2 bra BB134_4; |
| |
| // Recomputes the same starting word index as %r5 and points %rd89 at that table word. |
| bfe.u32 %r21, %r1, 20, 11; |
| add.s32 %r22, %r21, -1024; |
| shr.u32 %r23, %r22, 6; |
| sub.s32 %r25, %r17, %r23; |
| mul.wide.s32 %rd41, %r25, 8; |
| mov.u64 %rd42, __cudart_i2opi_d; |
| add.s64 %rd89, %rd42, %rd41; |
| // %rd5 = input bits shifted left by 11 with bit 63 forced to 1 (left-aligned mantissa). |
| mov.b64 %rd43, %fd4; |
| shl.b64 %rd44, %rd43, 11; |
| or.b64 %rd5, %rd44, -9223372036854775808; |
| mov.u64 %rd94, 0; |
| mov.u64 %rd93, %rd1; |
| mov.u64 %rd91, %rd1; |
| mov.u32 %r39, %r5; |
| |
| // Loop: 64x64 -> 128-bit multiply of table word %rd47 by %rd5, plus the carry %rd94. |
| // The low 64 bits (%rd45) are stored to the local buffer; the high 64 bits become the new carry. |
| BB134_3: |
| .pragma "nounroll"; |
| ld.const.u64 %rd47, [%rd89]; |
| // inline asm |
| { |
| .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi; |
| mov.b64 {alo,ahi}, %rd47; |
| mov.b64 {blo,bhi}, %rd5; |
| mov.b64 {clo,chi}, %rd94; |
| mad.lo.cc.u32 r0, alo, blo, clo; |
| madc.hi.cc.u32 r1, alo, blo, chi; |
| madc.hi.u32 r2, alo, bhi, 0; |
| mad.lo.cc.u32 r1, alo, bhi, r1; |
| madc.hi.cc.u32 r2, ahi, blo, r2; |
| madc.hi.u32 r3, ahi, bhi, 0; |
| mad.lo.cc.u32 r1, ahi, blo, r1; |
| madc.lo.cc.u32 r2, ahi, bhi, r2; |
| addc.u32 r3, r3, 0; |
| mov.b64 %rd45, {r0,r1}; |
| mov.b64 %rd94, {r2,r3}; |
| } |
| // inline asm |
| st.local.u64 [%rd91], %rd45; |
| // Advance: next store slot = buffer + 8 * (iterations done); next table word; %rd93 trails one slot ahead. |
| add.s32 %r39, %r39, 1; |
| sub.s32 %r26, %r39, %r5; |
| mul.wide.s32 %rd50, %r26, 8; |
| add.s64 %rd91, %rd1, %rd50; |
| add.s64 %rd93, %rd93, 8; |
| add.s64 %rd89, %rd89, 8; |
| setp.lt.s32 %p3, %r39, %r6; |
| @%p3 bra BB134_3; |
| |
| // Store the final carry after the last product word, then pick up buffer words 2 and 3. |
| BB134_4: |
| st.local.u64 [%rd93], %rd94; |
| ld.local.u64 %rd95, [%rd1+16]; |
| ld.local.u64 %rd96, [%rd1+24]; |
| // %r9 = low 6 bits of (upper word >> 20): residual bit shift; nothing to do when it is 0. |
| and.b32 %r9, %r3, 63; |
| setp.eq.s32 %p4, %r9, 0; |
| @%p4 bra BB134_6; |
| |
| // Shift the 128-bit pair %rd96:%rd95 left by %r9, filling the low bits from buffer word 1. |
| mov.u32 %r27, 64; |
| sub.s32 %r28, %r27, %r9; |
| shl.b64 %rd51, %rd96, %r9; |
| shr.u64 %rd52, %rd95, %r28; |
| or.b64 %rd96, %rd51, %rd52; |
| shl.b64 %rd53, %rd95, %r9; |
| ld.local.u64 %rd54, [%rd1+8]; |
| shr.u64 %rd55, %rd54, %r28; |
| or.b64 %rd95, %rd55, %rd53; |
| |
| BB134_6: |
| // %r29 = top two bits of %rd96; %rd98:%rd97 = (%rd96:%rd95) << 2 is the remaining fraction. |
| shr.u64 %rd56, %rd96, 62; |
| cvt.u32.u64 %r29, %rd56; |
| shr.u64 %rd57, %rd95, 62; |
| shl.b64 %rd58, %rd96, 2; |
| or.b64 %rd98, %rd58, %rd57; |
| shl.b64 %rd97, %rd95, 2; |
| // %r31 = bit 61 of %rd96 (the bit just below the two integer bits); it is added to the |
| // integer part, i.e. the integer is rounded to nearest. |
| shr.u64 %rd59, %rd96, 61; |
| cvt.u32.u64 %r30, %rd59; |
| and.b32 %r31, %r30, 1; |
| add.s32 %r32, %r31, %r29; |
| // Negate the integer result when the input's sign bit is set, then store it through param_1. |
| neg.s32 %r33, %r32; |
| setp.eq.s32 %p5, %r40, 0; |
| selp.b32 %r34, %r32, %r33, %p5; |
| cvta.to.local.u64 %rd60, %rd37; |
| st.local.u32 [%rd60], %r34; |
| setp.eq.s32 %p6, %r31, 0; |
| @%p6 bra BB134_8; |
| |
| // Rounded up: replace the 128-bit fraction by (0 - fraction) and flip the result's sign. |
| mov.u64 %rd64, 0; |
| // inline asm |
| { |
| .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3; |
| mov.b64 {a0,a1}, %rd64; |
| mov.b64 {a2,a3}, %rd64; |
| mov.b64 {b0,b1}, %rd97; |
| mov.b64 {b2,b3}, %rd98; |
| sub.cc.u32 r0, a0, b0; |
| subc.cc.u32 r1, a1, b1; |
| subc.cc.u32 r2, a2, b2; |
| subc.u32 r3, a3, b3; |
| mov.b64 %rd97, {r0,r1}; |
| mov.b64 %rd98, {r2,r3}; |
| } |
| // inline asm |
| xor.b32 %r40, %r40, -2147483648; |
| |
| // Normalise: %r41 = number of leading zero bits in the high fraction word. |
| BB134_8: |
| clz.b64 %r41, %rd98; |
| setp.eq.s32 %p7, %r41, 0; |
| @%p7 bra BB134_10; |
| |
| // Shift the high word left by %r41, pulling the vacated low bits in from %rd97. |
| shl.b64 %rd67, %rd98, %r41; |
| mov.u32 %r35, 64; |
| sub.s32 %r36, %r35, %r41; |
| shr.u64 %rd68, %rd97, %r36; |
| or.b64 %rd98, %rd68, %rd67; |
| |
| BB134_10: |
| // 64x64 -> 128-bit multiply of the normalised fraction by the constant in %rd72; |
| // %rd100 receives the high 64 bits, %rd69 the low 64 bits. |
| // NOTE(review): read as unsigned, this constant appears to be the leading 64 bits of pi - confirm. |
| mov.u64 %rd72, -3958705157555305931; |
| // inline asm |
| { |
| .reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi; |
| mov.b64 {alo,ahi}, %rd98; |
| mov.b64 {blo,bhi}, %rd72; |
| mul.lo.u32 r0, alo, blo; |
| mul.hi.u32 r1, alo, blo; |
| mad.lo.cc.u32 r1, alo, bhi, r1; |
| madc.hi.u32 r2, alo, bhi, 0; |
| mad.lo.cc.u32 r1, ahi, blo, r1; |
| madc.hi.cc.u32 r2, ahi, blo, r2; |
| madc.hi.u32 r3, ahi, bhi, 0; |
| mad.lo.cc.u32 r2, ahi, bhi, r2; |
| addc.u32 r3, r3, 0; |
| mov.b64 %rd69, {r0,r1}; |
| mov.b64 %rd100, {r2,r3}; |
| } |
| // inline asm |
| // Skip the extra doubling when %rd100, read as signed, is <= 0 (top bit set, or zero). |
| setp.lt.s64 %p8, %rd100, 1; |
| @%p8 bra BB134_12; |
| |
| // Otherwise double the 128-bit product (add it to itself) and count one more shift in %r41. |
| // inline asm |
| { |
| .reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3; |
| mov.b64 {a0,a1}, %rd69; |
| mov.b64 {a2,a3}, %rd100; |
| mov.b64 {b0,b1}, %rd69; |
| mov.b64 {b2,b3}, %rd100; |
| add.cc.u32 r0, a0, b0; |
| addc.cc.u32 r1, a1, b1; |
| addc.cc.u32 r2, a2, b2; |
| addc.u32 r3, a3, b3; |
| mov.b64 %rd73, {r0,r1}; |
| mov.b64 %rd100, {r2,r3}; |
| } |
| // inline asm |
| add.s32 %r41, %r41, 1; |
| |
| // Assemble the f64 bit pattern: sign (%r40 << 32), exponent field (1022 - %r41) << 52, |
| // and a mantissa term (((%rd100 + 1) >> 10) + 1) >> 1 that is added to the exponent term. |
| BB134_12: |
| cvt.u64.u32 %rd79, %r40; |
| shl.b64 %rd80, %rd79, 32; |
| mov.u32 %r37, 1022; |
| sub.s32 %r38, %r37, %r41; |
| cvt.u64.u32 %rd81, %r38; |
| shl.b64 %rd82, %rd81, 52; |
| add.s64 %rd83, %rd100, 1; |
| shr.u64 %rd84, %rd83, 10; |
| add.s64 %rd85, %rd84, 1; |
| shr.u64 %rd86, %rd85, 1; |
| add.s64 %rd87, %rd86, %rd82; |
| or.b64 %rd88, %rd87, %rd80; |
| mov.b64 %fd4, %rd88; |
| |
| // Return %fd4: the reduced value, or the unchanged input on the Inf/NaN path. |
| BB134_13: |
| st.param.f64 [func_retval0+0], %fd4; |
| ret; |
| } |
| |
// __internal_accurate_pow(param_0, param_1) -> f64 in func_retval0
//
// Both parameters are read as f64 (%fd12 = param_0, %fd13 = param_1).
// The arithmetic has the shape exp(param_1 * log(param_0)):
//   1. split param_0 into an integer exponent and a mantissa near 1,
//   2. form log(param_0) as a high/low pair of doubles (%fd90, %fd92),
//   3. multiply that pair by param_1 giving the pair (%fd4, %fd5),
//   4. evaluate exp(%fd4) and finally fold in %fd5 with one fma.
// NOTE(review): no explicit special-casing of negative, zero, infinite or
// NaN param_0 is visible in this function; presumably the caller handles
// those cases (and passes a non-negative base) - confirm against callers.
| .func (.param .b64 func_retval0) __internal_accurate_pow( |
| .param .b64 __internal_accurate_pow_param_0, |
| .param .b64 __internal_accurate_pow_param_1 |
| ) |
| { |
| .reg .pred %p<9>; |
| .reg .f32 %f<3>; |
| .reg .b32 %r<53>; |
| .reg .f64 %fd<138>; |
| |
| |
| ld.param.f64 %fd12, [__internal_accurate_pow_param_0]; |
| ld.param.f64 %fd13, [__internal_accurate_pow_param_1]; |
// Unpack param_0: %r50 = high 32 bits, %r49 = low 32 bits.
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r50}, %fd12; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r49, %temp}, %fd12; |
| } |
// %r51 = high word >> 20, i.e. the sign and biased-exponent fields.
// A non-zero value skips the rescaling step below.
| shr.u32 %r51, %r50, 20; |
| setp.ne.s32 %p1, %r51, 0; |
| @%p1 bra BB135_2; |
| |
// Exponent field (and sign) are zero: multiply by 0d4350000000000000 (2^54),
// re-read both words, and subtract 54 from the exponent to compensate.
| mul.f64 %fd14, %fd12, 0d4350000000000000; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r50}, %fd14; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r49, %temp}, %fd14; |
| } |
| shr.u32 %r16, %r50, 20; |
| add.s32 %r51, %r16, -54; |
| |
| BB135_2: |
// %r52 = unbiased exponent (bias 1023).
// -2146435073 is 0x800FFFFF: keep the sign bit and the mantissa bits of the
// high word; OR-ing 1072693248 (0x3FF00000) installs the exponent of 1.0,
// so %fd135 is the mantissa rebuilt as a double.
| add.s32 %r52, %r51, -1023; |
| and.b32 %r17, %r50, -2146435073; |
| or.b32 %r18, %r17, 1072693248; |
| mov.b64 %fd135, {%r49, %r18}; |
// 1073127583 is 0x3FF6A09F. If the high word is below it (unsigned) the
// mantissa is used as is; otherwise it is halved and the exponent bumped.
// NOTE(review): 0x3FF6A09F looks like the high word of sqrt(2), which would
// centre the mantissa around 1 - confirm.
| setp.lt.u32 %p2, %r18, 1073127583; |
| @%p2 bra BB135_4; |
| |
// Subtract 1048576 (0x00100000, one exponent step) from the high word to
// halve %fd135, and use %r51 - 1022 as the exponent instead of %r51 - 1023.
| { |
| .reg .b32 %temp; |
| mov.b64 {%r19, %temp}, %fd135; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r20}, %fd135; |
| } |
| add.s32 %r21, %r20, -1048576; |
| mov.b64 %fd135, {%r19, %r21}; |
| add.s32 %r52, %r51, -1022; |
| |
| BB135_4: |
// With m = %fd135: approximate 1/(m + 1) with rcp.approx and refine it with
// the fma sequence below (%fd19 is the residual 1 - (m + 1) * approx).
| add.f64 %fd15, %fd135, 0d3FF0000000000000; |
| rcp.approx.ftz.f64 %fd16, %fd15; |
| neg.f64 %fd17, %fd15; |
| mov.f64 %fd18, 0d3FF0000000000000; |
| fma.rn.f64 %fd19, %fd17, %fd16, %fd18; |
| fma.rn.f64 %fd20, %fd19, %fd19, %fd19; |
| fma.rn.f64 %fd21, %fd20, %fd16, %fd16; |
// %fd24 = 2 * (m - 1) * %fd21, i.e. u ~= 2 (m - 1) / (m + 1); %fd25 = u * u.
| add.f64 %fd22, %fd135, 0dBFF0000000000000; |
| mul.f64 %fd23, %fd22, %fd21; |
| fma.rn.f64 %fd24, %fd22, %fd21, %fd23; |
| mul.f64 %fd25, %fd24, %fd24; |
// Horner evaluation of a polynomial in u^2 (%fd25); result in %fd38.
| mov.f64 %fd26, 0d3ED0F5D241AD3B5A; |
| mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2; |
| fma.rn.f64 %fd28, %fd27, %fd25, %fd26; |
| mov.f64 %fd29, 0d3EF3B20A75488A3F; |
| fma.rn.f64 %fd30, %fd28, %fd25, %fd29; |
| mov.f64 %fd31, 0d3F1745CDE4FAECD5; |
| fma.rn.f64 %fd32, %fd30, %fd25, %fd31; |
| mov.f64 %fd33, 0d3F3C71C7258A578B; |
| fma.rn.f64 %fd34, %fd32, %fd25, %fd33; |
| mov.f64 %fd35, 0d3F6249249242B910; |
| fma.rn.f64 %fd36, %fd34, %fd25, %fd35; |
| mov.f64 %fd37, 0d3F89999999999DFB; |
| fma.rn.f64 %fd38, %fd36, %fd25, %fd37; |
// %fd43 = %fd21 * (2 * (m - 1 - u) - u * (m - 1)): the part of the quotient
// that did not fit in %fd24, so (%fd24, %fd43) is u as a high/low pair.
| sub.f64 %fd39, %fd22, %fd24; |
| add.f64 %fd40, %fd39, %fd39; |
| neg.f64 %fd41, %fd24; |
| fma.rn.f64 %fd42, %fd41, %fd22, %fd40; |
| mul.f64 %fd43, %fd21, %fd42; |
// Last Horner step with constant 0d3FB5555555555555 (about 1/12), then
// recover the rounding error of that step and add the small constant
// 0dBC46A4CB00B9E7B0; (%fd50, %fd52) is the polynomial value as a pair.
| fma.rn.f64 %fd44, %fd25, %fd38, 0d3FB5555555555555; |
| mov.f64 %fd45, 0d3FB5555555555555; |
| sub.f64 %fd46, %fd45, %fd44; |
| fma.rn.f64 %fd47, %fd25, %fd38, %fd46; |
| add.f64 %fd48, %fd47, 0d0000000000000000; |
| add.f64 %fd49, %fd48, 0dBC46A4CB00B9E7B0; |
| add.f64 %fd50, %fd44, %fd49; |
| sub.f64 %fd51, %fd44, %fd50; |
| add.f64 %fd52, %fd49, %fd51; |
// u^2 as a pair: %fd53 is the rounded square, %fd55 its rounding error.
| mul.rn.f64 %fd53, %fd24, %fd24; |
| neg.f64 %fd54, %fd53; |
| fma.rn.f64 %fd55, %fd24, %fd24, %fd54; |
// %fd56 = 2 * %fd43, formed by adding 1048576 (one exponent step) to the
// high word; %fd57 accumulates the low-order part of u^2.
| { |
| .reg .b32 %temp; |
| mov.b64 {%r22, %temp}, %fd43; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r23}, %fd43; |
| } |
| add.s32 %r24, %r23, 1048576; |
| mov.b64 %fd56, {%r22, %r24}; |
| fma.rn.f64 %fd57, %fd24, %fd56, %fd55; |
// u^3 as a pair: %fd58 (high) and %fd62 (low).
| mul.rn.f64 %fd58, %fd53, %fd24; |
| neg.f64 %fd59, %fd58; |
| fma.rn.f64 %fd60, %fd53, %fd24, %fd59; |
| fma.rn.f64 %fd61, %fd53, %fd43, %fd60; |
| fma.rn.f64 %fd62, %fd57, %fd24, %fd61; |
// polynomial * u^3 as a pair: %fd68 (high) and %fd70 (low).
| mul.rn.f64 %fd63, %fd50, %fd58; |
| neg.f64 %fd64, %fd63; |
| fma.rn.f64 %fd65, %fd50, %fd58, %fd64; |
| fma.rn.f64 %fd66, %fd50, %fd62, %fd65; |
| fma.rn.f64 %fd67, %fd52, %fd58, %fd66; |
| add.f64 %fd68, %fd63, %fd67; |
| sub.f64 %fd69, %fd63, %fd68; |
| add.f64 %fd70, %fd67, %fd69; |
// Add u (pair %fd24, %fd43) to the product above; (%fd76, %fd78) is the
// mantissa contribution to the logarithm as a high/low pair.
| add.f64 %fd71, %fd24, %fd68; |
| sub.f64 %fd72, %fd24, %fd71; |
| add.f64 %fd73, %fd68, %fd72; |
| add.f64 %fd74, %fd70, %fd73; |
| add.f64 %fd75, %fd43, %fd74; |
| add.f64 %fd76, %fd71, %fd75; |
| sub.f64 %fd77, %fd71, %fd76; |
| add.f64 %fd78, %fd75, %fd77; |
// Convert the signed exponent %r52 to f64 without a cvt: both doubles have
// high word 1127219200 (0x43300000, i.e. 2^52); the low words are
// %r52 ^ 0x80000000 and 0x80000000, so their difference %fd81 equals %r52.
| xor.b32 %r25, %r52, -2147483648; |
| mov.u32 %r26, -2147483648; |
| mov.u32 %r27, 1127219200; |
| mov.b64 %fd79, {%r25, %r27}; |
| mov.b64 %fd80, {%r26, %r27}; |
| sub.f64 %fd81, %fd79, %fd80; |
// Add exponent * ln(2): 0d3FE62E42FEFA39EF is ln(2) rounded to f64 and
// 0d3C7ABC9E3B39803F appears to be its low-order remainder. The subtract
// steps recover what the first fma rounded away. (%fd90, %fd92) is the full
// logarithm as a high/low pair.
| mov.f64 %fd82, 0d3FE62E42FEFA39EF; |
| fma.rn.f64 %fd83, %fd81, %fd82, %fd76; |
| neg.f64 %fd84, %fd81; |
| fma.rn.f64 %fd85, %fd84, %fd82, %fd83; |
| sub.f64 %fd86, %fd85, %fd76; |
| sub.f64 %fd87, %fd78, %fd86; |
| mov.f64 %fd88, 0d3C7ABC9E3B39803F; |
| fma.rn.f64 %fd89, %fd81, %fd88, %fd87; |
| add.f64 %fd90, %fd83, %fd89; |
| sub.f64 %fd91, %fd83, %fd90; |
| add.f64 %fd92, %fd89, %fd91; |
// Condition param_1 before multiplying. %r29 = high word doubled, which
// drops the sign bit. When it exceeds 0xFDFFFFFF (unsigned), the mask
// -15728641 (0xFF0FFFFF) clears bits 20..23 of the high word, the four
// lowest exponent-field bits; otherwise the high word is kept unchanged.
// NOTE(review): presumably this shrinks huge exponents so the product below
// cannot overflow to infinity/NaN prematurely - confirm.
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r28}, %fd13; |
| } |
| add.s32 %r29, %r28, %r28; |
| setp.gt.u32 %p3, %r29, -33554433; |
| and.b32 %r30, %r28, -15728641; |
| selp.b32 %r31, %r30, %r28, %p3; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r32, %temp}, %fd13; |
| } |
| mov.b64 %fd93, {%r32, %r31}; |
// (%fd4, %fd5) = (%fd90, %fd92) * %fd93 as a high/low pair: %fd96 is the
// rounding error of the leading product, %fd97 adds the low-part product.
| mul.rn.f64 %fd94, %fd90, %fd93; |
| neg.f64 %fd95, %fd94; |
| fma.rn.f64 %fd96, %fd90, %fd93, %fd95; |
| fma.rn.f64 %fd97, %fd92, %fd93, %fd96; |
| add.f64 %fd4, %fd94, %fd97; |
| sub.f64 %fd98, %fd94, %fd4; |
| add.f64 %fd5, %fd97, %fd98; |
// exp(%fd4): adding 0d4338000000000000 (1.5 * 2^52) after scaling by
// 0d3FF71547652B82FE (about log2(e)) leaves the rounded integer n in the
// low word (%r13); subtracting the same constant yields n as a double.
| mov.f64 %fd99, 0d4338000000000000; |
| mov.f64 %fd100, 0d3FF71547652B82FE; |
| fma.rn.f64 %fd101, %fd4, %fd100, %fd99; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r13, %temp}, %fd101; |
| } |
| mov.f64 %fd102, 0dC338000000000000; |
| add.rn.f64 %fd103, %fd101, %fd102; |
// Reduced argument %fd107 = %fd4 - n * ln(2), using the negated high and
// low ln(2) constants in two fma steps.
| mov.f64 %fd104, 0dBFE62E42FEFA39EF; |
| fma.rn.f64 %fd105, %fd103, %fd104, %fd4; |
| mov.f64 %fd106, 0dBC7ABC9E3B39803F; |
| fma.rn.f64 %fd107, %fd103, %fd106, %fd105; |
// Horner evaluation of a polynomial in %fd107; the last two steps add
// %fd18 (1.0), so %fd128 approximates exp of the reduced argument.
| mov.f64 %fd108, 0d3E928AF3FCA213EA; |
| mov.f64 %fd109, 0d3E5ADE1569CE2BDF; |
| fma.rn.f64 %fd110, %fd109, %fd107, %fd108; |
| mov.f64 %fd111, 0d3EC71DEE62401315; |
| fma.rn.f64 %fd112, %fd110, %fd107, %fd111; |
| mov.f64 %fd113, 0d3EFA01997C89EB71; |
| fma.rn.f64 %fd114, %fd112, %fd107, %fd113; |
| mov.f64 %fd115, 0d3F2A01A014761F65; |
| fma.rn.f64 %fd116, %fd114, %fd107, %fd115; |
| mov.f64 %fd117, 0d3F56C16C1852B7AF; |
| fma.rn.f64 %fd118, %fd116, %fd107, %fd117; |
| mov.f64 %fd119, 0d3F81111111122322; |
| fma.rn.f64 %fd120, %fd118, %fd107, %fd119; |
| mov.f64 %fd121, 0d3FA55555555502A1; |
| fma.rn.f64 %fd122, %fd120, %fd107, %fd121; |
| mov.f64 %fd123, 0d3FC5555555555511; |
| fma.rn.f64 %fd124, %fd122, %fd107, %fd123; |
| mov.f64 %fd125, 0d3FE000000000000B; |
| fma.rn.f64 %fd126, %fd124, %fd107, %fd125; |
| fma.rn.f64 %fd127, %fd126, %fd107, %fd18; |
| fma.rn.f64 %fd128, %fd127, %fd107, %fd18; |
// Scale by 2^n by adding n << 20 straight into the exponent field of the
// high word (%r15); %r14 is the untouched low word. Result in %fd136.
| { |
| .reg .b32 %temp; |
| mov.b64 {%r14, %temp}, %fd128; |
| } |
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r15}, %fd128; |
| } |
| shl.b32 %r33, %r13, 20; |
| add.s32 %r34, %r15, %r33; |
| mov.b64 %fd136, {%r14, %r34}; |
// Range check on |%fd4| using only its high word reinterpreted as an f32.
// 0x4086232B as the top half of a double is about 708.4; below that the
// fast result above is kept.
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r35}, %fd4; |
| } |
| mov.b32 %f2, %r35; |
| abs.f32 %f1, %f2; |
| setp.lt.f32 %p4, %f1, 0f4086232B; |
| @%p4 bra BB135_7; |
| |
// Large-magnitude argument: default to 0 for negative %fd4, otherwise to
// %fd4 + infinity. That default is final when the high word is at or above
// 0x40874800 (745.0 as the top half of a double) or the compare is
// unordered (setp.geu).
| setp.lt.f64 %p5, %fd4, 0d0000000000000000; |
| add.f64 %fd129, %fd4, 0d7FF0000000000000; |
| selp.f64 %fd136, 0d0000000000000000, %fd129, %p5; |
| setp.geu.f32 %p6, %f1, 0f40874800; |
| @%p6 bra BB135_7; |
| |
// In-between range: recompute n (%r48) and apply the scaling in two halves
// so neither step leaves the exponent range on its own.
//   %r38 = n / 2 rounded toward zero (add the sign bit, then shift right)
//   %fd130 = %fd128 * 2^(%r38)        (exponent-field add)
//   %fd131 = 2^(n - %r38)             (built from 0x3FF00000, low word 0)
| mov.f64 %fd134, 0d4338000000000000; |
| mov.f64 %fd133, 0d3FF71547652B82FE; |
| fma.rn.f64 %fd132, %fd4, %fd133, %fd134; |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r48, %temp}, %fd132; |
| } |
| shr.u32 %r36, %r48, 31; |
| add.s32 %r37, %r48, %r36; |
| shr.s32 %r38, %r37, 1; |
| shl.b32 %r39, %r38, 20; |
| add.s32 %r40, %r39, %r15; |
| mov.b64 %fd130, {%r14, %r40}; |
| sub.s32 %r41, %r48, %r38; |
| shl.b32 %r42, %r41, 20; |
| add.s32 %r43, %r42, 1072693248; |
| mov.u32 %r44, 0; |
| mov.b64 %fd131, {%r44, %r43}; |
| mul.f64 %fd136, %fd130, %fd131; |
| |
| BB135_7: |
// Skip the final correction only when %fd136 is exactly +/-infinity:
// high word with the sign masked off equals 2146435072 (0x7FF00000) and
// the low word is zero.
| { |
| .reg .b32 %temp; |
| mov.b64 {%temp, %r45}, %fd136; |
| } |
| and.b32 %r46, %r45, 2147483647; |
| setp.ne.s32 %p7, %r46, 2146435072; |
| @%p7 bra BB135_9; |
| |
| { |
| .reg .b32 %temp; |
| mov.b64 {%r47, %temp}, %fd136; |
| } |
| setp.eq.s32 %p8, %r47, 0; |
| @%p8 bra BB135_10; |
| |
| BB135_9: |
// Fold in the low part of the product: result = %fd136 + %fd136 * %fd5.
| fma.rn.f64 %fd136, %fd136, %fd5, %fd136; |
| |
| BB135_10: |
// Return %fd136 through func_retval0.
| st.param.f64 [func_retval0+0], %fd136; |
| ret; |
| } |
| |
| |