//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-27506705
// Cuda compilation tools, release 10.2, V10.2.89
// Based on LLVM 3.4svn
//
.version 6.5
.target sm_30
.address_size 64
// .globl double2float_f
.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
(
.param .b64 __internal_trig_reduction_slowpathd_param_0,
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
;
.func (.param .b64 func_retval0) __internal_accurate_pow
(
.param .b64 __internal_accurate_pow_param_0,
.param .b64 __internal_accurate_pow_param_1
)
;
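// Forward declarations of CUDA device-library internals (Payne-Hanek trig
// argument reduction slow path and the accurate pow helper); their
// definitions and call sites are not among the kernels shown in this listing.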
.extern .shared .align 1 .b8 memory[];
.const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
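// The constant tables above are standard CUDA math-library data: the
// __cudart_i2opi_* arrays hold bits of 2/pi used for trigonometric argument
// reduction, and __cudart_sin_cos_coeffs holds sin/cos polynomial
// coefficients.
//
// double2float_f: elementwise narrowing conversion. Each thread computes a
// 1-D global index, bounds-checks it against param_2, loads one f64 from
// param_0, converts round-to-nearest (cvt.rn.f32.f64), and stores the f32 to
// param_1. A rough CUDA equivalent (a sketch; parameter names are
// assumptions, not taken from the source):
//
//   __global__ void double2float_f(const double *in, float *out, int n) {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;   // mad.lo.s32
//       if (i < n) out[i] = (float)in[i];                // cvt.rn.f32.f64
//   }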
.visible .entry double2float_f(
.param .u64 double2float_f_param_0,
.param .u64 double2float_f_param_1,
.param .u32 double2float_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [double2float_f_param_0];
ld.param.u64 %rd2, [double2float_f_param_1];
ld.param.u32 %r2, [double2float_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB0_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvt.rn.f32.f64 %f1, %fd1;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB0_2:
ret;
}
// .globl float2double_f
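// float2double_f: the inverse widening conversion (f32 -> f64, exact); same
// one-thread-per-element structure as double2float_f.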
.visible .entry float2double_f(
.param .u64 float2double_f_param_0,
.param .u64 float2double_f_param_1,
.param .u32 float2double_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [float2double_f_param_0];
ld.param.u64 %rd2, [float2double_f_param_1];
ld.param.u32 %r2, [float2double_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB1_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.f64.f32 %fd1, %f1;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB1_2:
ret;
}
// .globl cumulative_sum_up_sweep_d
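// cumulative_sum_up_sweep_d: reduction phase of a two-pass column-wise
// cumulative sum over a row-major matrix. Parameter roles inferred from the
// address arithmetic (param_2 = rows, param_3 = cols, param_4 = rows per
// y-block): each thread owns one column, strides down a vertical stripe of
// up to param_4 rows accumulating with add.f64, and writes one partial sum
// per stripe to param_1 at [ctaid.y * cols + col]. A CUDA-shaped sketch
// (names are assumptions):
//
//   __global__ void up_sweep(const double *in, double *agg,
//                            int rows, int cols, int rowsPerBlock) {
//       unsigned c = blockIdx.x * blockDim.x + threadIdx.x;
//       if (c >= cols) return;
//       unsigned i   = blockIdx.y * rowsPerBlock * cols + c;
//       unsigned end = min(i + rowsPerBlock * cols, (unsigned)(rows * cols));
//       double acc = in[i];
//       for (i += cols; i < end; i += cols) acc += in[i];   // BB2_2 loop
//       agg[blockIdx.y * cols + c] = acc;
//   }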
.visible .entry cumulative_sum_up_sweep_d(
.param .u64 cumulative_sum_up_sweep_d_param_0,
.param .u64 cumulative_sum_up_sweep_d_param_1,
.param .u32 cumulative_sum_up_sweep_d_param_2,
.param .u32 cumulative_sum_up_sweep_d_param_3,
.param .u32 cumulative_sum_up_sweep_d_param_4
)
{
.reg .pred %p<4>;
.reg .b32 %r<20>;
.reg .f64 %fd<8>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_sum_up_sweep_d_param_0];
ld.param.u64 %rd2, [cumulative_sum_up_sweep_d_param_1];
ld.param.u32 %r7, [cumulative_sum_up_sweep_d_param_2];
ld.param.u32 %r8, [cumulative_sum_up_sweep_d_param_3];
ld.param.u32 %r9, [cumulative_sum_up_sweep_d_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB2_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB2_3;
BB2_2:
mul.wide.s32 %rd6, %r19, 8;
add.s64 %rd7, %rd1, %rd6;
ld.global.f64 %fd5, [%rd7];
add.f64 %fd7, %fd7, %fd5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB2_2;
BB2_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 8;
add.s64 %rd10, %rd8, %rd9;
st.global.f64 [%rd10], %fd7;
BB2_4:
ret;
}
// .globl cumulative_sum_up_sweep_f
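// cumulative_sum_up_sweep_f: single-precision counterpart of the kernel
// above; identical control flow with f32 arithmetic and 4-byte strides.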
.visible .entry cumulative_sum_up_sweep_f(
.param .u64 cumulative_sum_up_sweep_f_param_0,
.param .u64 cumulative_sum_up_sweep_f_param_1,
.param .u32 cumulative_sum_up_sweep_f_param_2,
.param .u32 cumulative_sum_up_sweep_f_param_3,
.param .u32 cumulative_sum_up_sweep_f_param_4
)
{
.reg .pred %p<4>;
.reg .f32 %f<8>;
.reg .b32 %r<20>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_sum_up_sweep_f_param_0];
ld.param.u64 %rd2, [cumulative_sum_up_sweep_f_param_1];
ld.param.u32 %r7, [cumulative_sum_up_sweep_f_param_2];
ld.param.u32 %r8, [cumulative_sum_up_sweep_f_param_3];
ld.param.u32 %r9, [cumulative_sum_up_sweep_f_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB3_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB3_3;
BB3_2:
mul.wide.s32 %rd6, %r19, 4;
add.s64 %rd7, %rd1, %rd6;
ld.global.f32 %f5, [%rd7];
add.f32 %f7, %f7, %f5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB3_2;
BB3_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 4;
add.s64 %rd10, %rd8, %rd9;
st.global.f32 [%rd10], %f7;
BB3_4:
ret;
}
// .globl cumulative_sum_down_sweep_d
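// cumulative_sum_down_sweep_d: completion phase of the column-wise cumulative
// sum. Each thread re-walks its column stripe, seeds its running total with
// the previous y-block's aggregate read from param_2 (0.0 for block 0, so
// param_2 presumably holds already-scanned block totals), and stores the
// inclusive prefix sum of every element to param_1. Sketch under the same
// assumed parameter roles as the up-sweep:
//
//   __global__ void down_sweep(const double *in, double *out,
//                              const double *agg,
//                              int rows, int cols, int rowsPerBlock) {
//       unsigned c = blockIdx.x * blockDim.x + threadIdx.x;
//       if (c >= cols) return;
//       double acc = (blockIdx.y == 0) ? 0.0
//                                      : agg[(blockIdx.y - 1) * cols + c];
//       unsigned i   = blockIdx.y * rowsPerBlock * cols + c;
//       unsigned end = min(i + rowsPerBlock * cols, (unsigned)(rows * cols));
//       acc += in[i]; out[i] = acc;
//       for (i += cols; i < end; i += cols) {    // BB4_4 loop
//           acc += in[i]; out[i] = acc;
//       }
//   }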
.visible .entry cumulative_sum_down_sweep_d(
.param .u64 cumulative_sum_down_sweep_d_param_0,
.param .u64 cumulative_sum_down_sweep_d_param_1,
.param .u64 cumulative_sum_down_sweep_d_param_2,
.param .u32 cumulative_sum_down_sweep_d_param_3,
.param .u32 cumulative_sum_down_sweep_d_param_4,
.param .u32 cumulative_sum_down_sweep_d_param_5
)
{
.reg .pred %p<5>;
.reg .b32 %r<21>;
.reg .f64 %fd<11>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_sum_down_sweep_d_param_0];
ld.param.u64 %rd5, [cumulative_sum_down_sweep_d_param_1];
ld.param.u64 %rd3, [cumulative_sum_down_sweep_d_param_2];
ld.param.u32 %r7, [cumulative_sum_down_sweep_d_param_3];
ld.param.u32 %r8, [cumulative_sum_down_sweep_d_param_4];
ld.param.u32 %r9, [cumulative_sum_down_sweep_d_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB4_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f64 %fd9, 0d0000000000000000;
@%p2 bra BB4_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd9, [%rd8];
BB4_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 8;
add.s64 %rd10, %rd2, %rd9;
ld.global.f64 %fd7, [%rd10];
add.f64 %fd10, %fd9, %fd7;
add.s64 %rd11, %rd1, %rd9;
st.global.f64 [%rd11], %fd10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB4_5;
BB4_4:
mul.wide.s32 %rd12, %r20, 8;
add.s64 %rd13, %rd2, %rd12;
ld.global.f64 %fd8, [%rd13];
add.f64 %fd10, %fd10, %fd8;
add.s64 %rd14, %rd1, %rd12;
st.global.f64 [%rd14], %fd10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB4_4;
BB4_5:
ret;
}
// .globl cumulative_sum_down_sweep_f
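// cumulative_sum_down_sweep_f: single-precision counterpart; identical
// control flow with f32 arithmetic, 4-byte strides, and 0f00000000 (0.0f) as
// the block-0 seed.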
.visible .entry cumulative_sum_down_sweep_f(
.param .u64 cumulative_sum_down_sweep_f_param_0,
.param .u64 cumulative_sum_down_sweep_f_param_1,
.param .u64 cumulative_sum_down_sweep_f_param_2,
.param .u32 cumulative_sum_down_sweep_f_param_3,
.param .u32 cumulative_sum_down_sweep_f_param_4,
.param .u32 cumulative_sum_down_sweep_f_param_5
)
{
.reg .pred %p<5>;
.reg .f32 %f<11>;
.reg .b32 %r<21>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_sum_down_sweep_f_param_0];
ld.param.u64 %rd5, [cumulative_sum_down_sweep_f_param_1];
ld.param.u64 %rd3, [cumulative_sum_down_sweep_f_param_2];
ld.param.u32 %r7, [cumulative_sum_down_sweep_f_param_3];
ld.param.u32 %r8, [cumulative_sum_down_sweep_f_param_4];
ld.param.u32 %r9, [cumulative_sum_down_sweep_f_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB5_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f32 %f9, 0f00000000;
@%p2 bra BB5_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 4;
add.s64 %rd8, %rd6, %rd7;
ld.global.f32 %f9, [%rd8];
BB5_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f7, [%rd10];
add.f32 %f10, %f9, %f7;
add.s64 %rd11, %rd1, %rd9;
st.global.f32 [%rd11], %f10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB5_5;
BB5_4:
mul.wide.s32 %rd12, %r20, 4;
add.s64 %rd13, %rd2, %rd12;
ld.global.f32 %f8, [%rd13];
add.f32 %f10, %f10, %f8;
add.s64 %rd14, %rd1, %rd12;
st.global.f32 [%rd14], %f10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB5_4;
BB5_5:
ret;
}
// .globl cumulative_prod_up_sweep_d
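// cumulative_prod_up_sweep_d: same stripe-reduction structure as
// cumulative_sum_up_sweep_d with mul.f64 as the combining operator, producing
// one partial product per column stripe.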
.visible .entry cumulative_prod_up_sweep_d(
.param .u64 cumulative_prod_up_sweep_d_param_0,
.param .u64 cumulative_prod_up_sweep_d_param_1,
.param .u32 cumulative_prod_up_sweep_d_param_2,
.param .u32 cumulative_prod_up_sweep_d_param_3,
.param .u32 cumulative_prod_up_sweep_d_param_4
)
{
.reg .pred %p<4>;
.reg .b32 %r<20>;
.reg .f64 %fd<8>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_prod_up_sweep_d_param_0];
ld.param.u64 %rd2, [cumulative_prod_up_sweep_d_param_1];
ld.param.u32 %r7, [cumulative_prod_up_sweep_d_param_2];
ld.param.u32 %r8, [cumulative_prod_up_sweep_d_param_3];
ld.param.u32 %r9, [cumulative_prod_up_sweep_d_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB6_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB6_3;
BB6_2:
mul.wide.s32 %rd6, %r19, 8;
add.s64 %rd7, %rd1, %rd6;
ld.global.f64 %fd5, [%rd7];
mul.f64 %fd7, %fd7, %fd5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB6_2;
BB6_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 8;
add.s64 %rd10, %rd8, %rd9;
st.global.f64 [%rd10], %fd7;
BB6_4:
ret;
}
// .globl cumulative_prod_up_sweep_f
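// cumulative_prod_up_sweep_f: note that despite the _f suffix this kernel is
// emitted with f64 registers, f64 loads/multiplies, and 8-byte strides,
// matching the _d variant above instruction for instruction; whether that is
// intentional in the originating source cannot be determined from the PTX
// alone.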
.visible .entry cumulative_prod_up_sweep_f(
.param .u64 cumulative_prod_up_sweep_f_param_0,
.param .u64 cumulative_prod_up_sweep_f_param_1,
.param .u32 cumulative_prod_up_sweep_f_param_2,
.param .u32 cumulative_prod_up_sweep_f_param_3,
.param .u32 cumulative_prod_up_sweep_f_param_4
)
{
.reg .pred %p<4>;
.reg .b32 %r<20>;
.reg .f64 %fd<8>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_prod_up_sweep_f_param_0];
ld.param.u64 %rd2, [cumulative_prod_up_sweep_f_param_1];
ld.param.u32 %r7, [cumulative_prod_up_sweep_f_param_2];
ld.param.u32 %r8, [cumulative_prod_up_sweep_f_param_3];
ld.param.u32 %r9, [cumulative_prod_up_sweep_f_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB7_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB7_3;
BB7_2:
mul.wide.s32 %rd6, %r19, 8;
add.s64 %rd7, %rd1, %rd6;
ld.global.f64 %fd5, [%rd7];
mul.f64 %fd7, %fd7, %fd5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB7_2;
BB7_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 8;
add.s64 %rd10, %rd8, %rd9;
st.global.f64 [%rd10], %fd7;
BB7_4:
ret;
}
// .globl cumulative_prod_down_sweep_d
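// cumulative_prod_down_sweep_d: completion phase for the cumulative product;
// same structure as cumulative_sum_down_sweep_d with mul.f64 and the
// multiplicative identity 1.0 (0d3FF0000000000000) as the block-0 seed.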
.visible .entry cumulative_prod_down_sweep_d(
.param .u64 cumulative_prod_down_sweep_d_param_0,
.param .u64 cumulative_prod_down_sweep_d_param_1,
.param .u64 cumulative_prod_down_sweep_d_param_2,
.param .u32 cumulative_prod_down_sweep_d_param_3,
.param .u32 cumulative_prod_down_sweep_d_param_4,
.param .u32 cumulative_prod_down_sweep_d_param_5
)
{
.reg .pred %p<5>;
.reg .b32 %r<21>;
.reg .f64 %fd<11>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_prod_down_sweep_d_param_0];
ld.param.u64 %rd5, [cumulative_prod_down_sweep_d_param_1];
ld.param.u64 %rd3, [cumulative_prod_down_sweep_d_param_2];
ld.param.u32 %r7, [cumulative_prod_down_sweep_d_param_3];
ld.param.u32 %r8, [cumulative_prod_down_sweep_d_param_4];
ld.param.u32 %r9, [cumulative_prod_down_sweep_d_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB8_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f64 %fd9, 0d3FF0000000000000;
@%p2 bra BB8_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd9, [%rd8];
BB8_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 8;
add.s64 %rd10, %rd2, %rd9;
ld.global.f64 %fd7, [%rd10];
mul.f64 %fd10, %fd9, %fd7;
add.s64 %rd11, %rd1, %rd9;
st.global.f64 [%rd11], %fd10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB8_5;
BB8_4:
mul.wide.s32 %rd12, %r20, 8;
add.s64 %rd13, %rd2, %rd12;
ld.global.f64 %fd8, [%rd13];
mul.f64 %fd10, %fd10, %fd8;
add.s64 %rd14, %rd1, %rd12;
st.global.f64 [%rd14], %fd10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB8_4;
BB8_5:
ret;
}
// .globl cumulative_prod_down_sweep_f
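// cumulative_prod_down_sweep_f: single-precision counterpart (f32 arithmetic,
// 4-byte strides, seed 0f3F800000 = 1.0f).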
.visible .entry cumulative_prod_down_sweep_f(
.param .u64 cumulative_prod_down_sweep_f_param_0,
.param .u64 cumulative_prod_down_sweep_f_param_1,
.param .u64 cumulative_prod_down_sweep_f_param_2,
.param .u32 cumulative_prod_down_sweep_f_param_3,
.param .u32 cumulative_prod_down_sweep_f_param_4,
.param .u32 cumulative_prod_down_sweep_f_param_5
)
{
.reg .pred %p<5>;
.reg .f32 %f<11>;
.reg .b32 %r<21>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_prod_down_sweep_f_param_0];
ld.param.u64 %rd5, [cumulative_prod_down_sweep_f_param_1];
ld.param.u64 %rd3, [cumulative_prod_down_sweep_f_param_2];
ld.param.u32 %r7, [cumulative_prod_down_sweep_f_param_3];
ld.param.u32 %r8, [cumulative_prod_down_sweep_f_param_4];
ld.param.u32 %r9, [cumulative_prod_down_sweep_f_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB9_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f32 %f9, 0f3F800000;
@%p2 bra BB9_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 4;
add.s64 %rd8, %rd6, %rd7;
ld.global.f32 %f9, [%rd8];
BB9_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f7, [%rd10];
mul.f32 %f10, %f9, %f7;
add.s64 %rd11, %rd1, %rd9;
st.global.f32 [%rd11], %f10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB9_5;
BB9_4:
mul.wide.s32 %rd12, %r20, 4;
add.s64 %rd13, %rd2, %rd12;
ld.global.f32 %f8, [%rd13];
mul.f32 %f10, %f10, %f8;
add.s64 %rd14, %rd1, %rd12;
st.global.f32 [%rd14], %f10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB9_4;
BB9_5:
ret;
}
// .globl cumulative_min_up_sweep_d
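// cumulative_min_up_sweep_d: stripe reduction as above with min.f64,
// producing one partial minimum per column stripe.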
.visible .entry cumulative_min_up_sweep_d(
.param .u64 cumulative_min_up_sweep_d_param_0,
.param .u64 cumulative_min_up_sweep_d_param_1,
.param .u32 cumulative_min_up_sweep_d_param_2,
.param .u32 cumulative_min_up_sweep_d_param_3,
.param .u32 cumulative_min_up_sweep_d_param_4
)
{
.reg .pred %p<4>;
.reg .b32 %r<20>;
.reg .f64 %fd<8>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_min_up_sweep_d_param_0];
ld.param.u64 %rd2, [cumulative_min_up_sweep_d_param_1];
ld.param.u32 %r7, [cumulative_min_up_sweep_d_param_2];
ld.param.u32 %r8, [cumulative_min_up_sweep_d_param_3];
ld.param.u32 %r9, [cumulative_min_up_sweep_d_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB10_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB10_3;
BB10_2:
mul.wide.s32 %rd6, %r19, 8;
add.s64 %rd7, %rd1, %rd6;
ld.global.f64 %fd5, [%rd7];
min.f64 %fd7, %fd7, %fd5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB10_2;
BB10_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 8;
add.s64 %rd10, %rd8, %rd9;
st.global.f64 [%rd10], %fd7;
BB10_4:
ret;
}
// .globl cumulative_min_up_sweep_f
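// cumulative_min_up_sweep_f: single-precision counterpart (min.f32, 4-byte
// strides).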
.visible .entry cumulative_min_up_sweep_f(
.param .u64 cumulative_min_up_sweep_f_param_0,
.param .u64 cumulative_min_up_sweep_f_param_1,
.param .u32 cumulative_min_up_sweep_f_param_2,
.param .u32 cumulative_min_up_sweep_f_param_3,
.param .u32 cumulative_min_up_sweep_f_param_4
)
{
.reg .pred %p<4>;
.reg .f32 %f<8>;
.reg .b32 %r<20>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_min_up_sweep_f_param_0];
ld.param.u64 %rd2, [cumulative_min_up_sweep_f_param_1];
ld.param.u32 %r7, [cumulative_min_up_sweep_f_param_2];
ld.param.u32 %r8, [cumulative_min_up_sweep_f_param_3];
ld.param.u32 %r9, [cumulative_min_up_sweep_f_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB11_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB11_3;
BB11_2:
mul.wide.s32 %rd6, %r19, 4;
add.s64 %rd7, %rd1, %rd6;
ld.global.f32 %f5, [%rd7];
min.f32 %f7, %f7, %f5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB11_2;
BB11_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 4;
add.s64 %rd10, %rd8, %rd9;
st.global.f32 [%rd10], %f7;
BB11_4:
ret;
}
// .globl cumulative_min_down_sweep_d
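// cumulative_min_down_sweep_d: completion phase for the running minimum; the
// block-0 seed is +infinity (0d7FF0000000000000), the identity of min.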
.visible .entry cumulative_min_down_sweep_d(
.param .u64 cumulative_min_down_sweep_d_param_0,
.param .u64 cumulative_min_down_sweep_d_param_1,
.param .u64 cumulative_min_down_sweep_d_param_2,
.param .u32 cumulative_min_down_sweep_d_param_3,
.param .u32 cumulative_min_down_sweep_d_param_4,
.param .u32 cumulative_min_down_sweep_d_param_5
)
{
.reg .pred %p<5>;
.reg .b32 %r<21>;
.reg .f64 %fd<11>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_min_down_sweep_d_param_0];
ld.param.u64 %rd5, [cumulative_min_down_sweep_d_param_1];
ld.param.u64 %rd3, [cumulative_min_down_sweep_d_param_2];
ld.param.u32 %r7, [cumulative_min_down_sweep_d_param_3];
ld.param.u32 %r8, [cumulative_min_down_sweep_d_param_4];
ld.param.u32 %r9, [cumulative_min_down_sweep_d_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB12_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f64 %fd9, 0d7FF0000000000000;
@%p2 bra BB12_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd9, [%rd8];
BB12_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 8;
add.s64 %rd10, %rd2, %rd9;
ld.global.f64 %fd7, [%rd10];
min.f64 %fd10, %fd9, %fd7;
add.s64 %rd11, %rd1, %rd9;
st.global.f64 [%rd11], %fd10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB12_5;
BB12_4:
mul.wide.s32 %rd12, %r20, 8;
add.s64 %rd13, %rd2, %rd12;
ld.global.f64 %fd8, [%rd13];
min.f64 %fd10, %fd10, %fd8;
add.s64 %rd14, %rd1, %rd12;
st.global.f64 [%rd14], %fd10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB12_4;
BB12_5:
ret;
}
// .globl cumulative_min_down_sweep_f
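// cumulative_min_down_sweep_f: single-precision counterpart (seed
// 0f7F800000 = +infinity).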
.visible .entry cumulative_min_down_sweep_f(
.param .u64 cumulative_min_down_sweep_f_param_0,
.param .u64 cumulative_min_down_sweep_f_param_1,
.param .u64 cumulative_min_down_sweep_f_param_2,
.param .u32 cumulative_min_down_sweep_f_param_3,
.param .u32 cumulative_min_down_sweep_f_param_4,
.param .u32 cumulative_min_down_sweep_f_param_5
)
{
.reg .pred %p<5>;
.reg .f32 %f<11>;
.reg .b32 %r<21>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_min_down_sweep_f_param_0];
ld.param.u64 %rd5, [cumulative_min_down_sweep_f_param_1];
ld.param.u64 %rd3, [cumulative_min_down_sweep_f_param_2];
ld.param.u32 %r7, [cumulative_min_down_sweep_f_param_3];
ld.param.u32 %r8, [cumulative_min_down_sweep_f_param_4];
ld.param.u32 %r9, [cumulative_min_down_sweep_f_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB13_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f32 %f9, 0f7F800000;
@%p2 bra BB13_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 4;
add.s64 %rd8, %rd6, %rd7;
ld.global.f32 %f9, [%rd8];
BB13_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f7, [%rd10];
min.f32 %f10, %f9, %f7;
add.s64 %rd11, %rd1, %rd9;
st.global.f32 [%rd11], %f10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB13_5;
BB13_4:
mul.wide.s32 %rd12, %r20, 4;
add.s64 %rd13, %rd2, %rd12;
ld.global.f32 %f8, [%rd13];
min.f32 %f10, %f10, %f8;
add.s64 %rd14, %rd1, %rd12;
st.global.f32 [%rd14], %f10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB13_4;
BB13_5:
ret;
}
// .globl cumulative_max_up_sweep_d
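// cumulative_max_up_sweep_d: stripe reduction as above with max.f64.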
.visible .entry cumulative_max_up_sweep_d(
.param .u64 cumulative_max_up_sweep_d_param_0,
.param .u64 cumulative_max_up_sweep_d_param_1,
.param .u32 cumulative_max_up_sweep_d_param_2,
.param .u32 cumulative_max_up_sweep_d_param_3,
.param .u32 cumulative_max_up_sweep_d_param_4
)
{
.reg .pred %p<4>;
.reg .b32 %r<20>;
.reg .f64 %fd<8>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_max_up_sweep_d_param_0];
ld.param.u64 %rd2, [cumulative_max_up_sweep_d_param_1];
ld.param.u32 %r7, [cumulative_max_up_sweep_d_param_2];
ld.param.u32 %r8, [cumulative_max_up_sweep_d_param_3];
ld.param.u32 %r9, [cumulative_max_up_sweep_d_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB14_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB14_3;
BB14_2:
mul.wide.s32 %rd6, %r19, 8;
add.s64 %rd7, %rd1, %rd6;
ld.global.f64 %fd5, [%rd7];
max.f64 %fd7, %fd7, %fd5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB14_2;
BB14_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 8;
add.s64 %rd10, %rd8, %rd9;
st.global.f64 [%rd10], %fd7;
BB14_4:
ret;
}
// .globl cumulative_max_up_sweep_f
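// cumulative_max_up_sweep_f: single-precision counterpart (max.f32, 4-byte
// strides).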
.visible .entry cumulative_max_up_sweep_f(
.param .u64 cumulative_max_up_sweep_f_param_0,
.param .u64 cumulative_max_up_sweep_f_param_1,
.param .u32 cumulative_max_up_sweep_f_param_2,
.param .u32 cumulative_max_up_sweep_f_param_3,
.param .u32 cumulative_max_up_sweep_f_param_4
)
{
.reg .pred %p<4>;
.reg .f32 %f<8>;
.reg .b32 %r<20>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [cumulative_max_up_sweep_f_param_0];
ld.param.u64 %rd2, [cumulative_max_up_sweep_f_param_1];
ld.param.u32 %r7, [cumulative_max_up_sweep_f_param_2];
ld.param.u32 %r8, [cumulative_max_up_sweep_f_param_3];
ld.param.u32 %r9, [cumulative_max_up_sweep_f_param_4];
cvta.to.global.u64 %rd1, %rd3;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB15_4;
mov.u32 %r14, %ctaid.y;
mul.lo.s32 %r2, %r14, %r8;
mad.lo.s32 %r15, %r2, %r9, %r1;
mul.wide.u32 %rd4, %r15, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f7, [%rd5];
mad.lo.s32 %r16, %r9, %r8, %r15;
mul.lo.s32 %r17, %r8, %r7;
min.u32 %r3, %r16, %r17;
add.s32 %r19, %r15, %r8;
setp.ge.u32 %p2, %r19, %r3;
@%p2 bra BB15_3;
BB15_2:
mul.wide.s32 %rd6, %r19, 4;
add.s64 %rd7, %rd1, %rd6;
ld.global.f32 %f5, [%rd7];
max.f32 %f7, %f7, %f5;
add.s32 %r19, %r19, %r8;
setp.lt.u32 %p3, %r19, %r3;
@%p3 bra BB15_2;
BB15_3:
add.s32 %r18, %r1, %r2;
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r18, 4;
add.s64 %rd10, %rd8, %rd9;
st.global.f32 [%rd10], %f7;
BB15_4:
ret;
}
// .globl cumulative_max_down_sweep_d
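// cumulative_max_down_sweep_d: completion phase for the running maximum; the
// block-0 seed is -infinity (0dFFF0000000000000), the identity of max.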
.visible .entry cumulative_max_down_sweep_d(
.param .u64 cumulative_max_down_sweep_d_param_0,
.param .u64 cumulative_max_down_sweep_d_param_1,
.param .u64 cumulative_max_down_sweep_d_param_2,
.param .u32 cumulative_max_down_sweep_d_param_3,
.param .u32 cumulative_max_down_sweep_d_param_4,
.param .u32 cumulative_max_down_sweep_d_param_5
)
{
.reg .pred %p<5>;
.reg .b32 %r<21>;
.reg .f64 %fd<11>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_max_down_sweep_d_param_0];
ld.param.u64 %rd5, [cumulative_max_down_sweep_d_param_1];
ld.param.u64 %rd3, [cumulative_max_down_sweep_d_param_2];
ld.param.u32 %r7, [cumulative_max_down_sweep_d_param_3];
ld.param.u32 %r8, [cumulative_max_down_sweep_d_param_4];
ld.param.u32 %r9, [cumulative_max_down_sweep_d_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB16_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f64 %fd9, 0dFFF0000000000000;
@%p2 bra BB16_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd9, [%rd8];
BB16_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 8;
add.s64 %rd10, %rd2, %rd9;
ld.global.f64 %fd7, [%rd10];
max.f64 %fd10, %fd9, %fd7;
add.s64 %rd11, %rd1, %rd9;
st.global.f64 [%rd11], %fd10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB16_5;
BB16_4:
mul.wide.s32 %rd12, %r20, 8;
add.s64 %rd13, %rd2, %rd12;
ld.global.f64 %fd8, [%rd13];
max.f64 %fd10, %fd10, %fd8;
add.s64 %rd14, %rd1, %rd12;
st.global.f64 [%rd14], %fd10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB16_4;
BB16_5:
ret;
}
// .globl cumulative_max_down_sweep_f
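// cumulative_max_down_sweep_f: single-precision counterpart (seed
// 0fFF800000 = -infinity).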
.visible .entry cumulative_max_down_sweep_f(
.param .u64 cumulative_max_down_sweep_f_param_0,
.param .u64 cumulative_max_down_sweep_f_param_1,
.param .u64 cumulative_max_down_sweep_f_param_2,
.param .u32 cumulative_max_down_sweep_f_param_3,
.param .u32 cumulative_max_down_sweep_f_param_4,
.param .u32 cumulative_max_down_sweep_f_param_5
)
{
.reg .pred %p<5>;
.reg .f32 %f<11>;
.reg .b32 %r<21>;
.reg .b64 %rd<15>;
ld.param.u64 %rd4, [cumulative_max_down_sweep_f_param_0];
ld.param.u64 %rd5, [cumulative_max_down_sweep_f_param_1];
ld.param.u64 %rd3, [cumulative_max_down_sweep_f_param_2];
ld.param.u32 %r7, [cumulative_max_down_sweep_f_param_3];
ld.param.u32 %r8, [cumulative_max_down_sweep_f_param_4];
ld.param.u32 %r9, [cumulative_max_down_sweep_f_param_5];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
add.s32 %r13, %r8, -1;
setp.gt.u32 %p1, %r1, %r13;
@%p1 bra BB17_5;
mov.u32 %r2, %ctaid.y;
setp.eq.s32 %p2, %r2, 0;
mov.f32 %f9, 0fFF800000;
@%p2 bra BB17_3;
add.s32 %r14, %r2, -1;
mad.lo.s32 %r15, %r14, %r8, %r1;
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r15, 4;
add.s64 %rd8, %rd6, %rd7;
ld.global.f32 %f9, [%rd8];
BB17_3:
mul.lo.s32 %r16, %r9, %r8;
mad.lo.s32 %r17, %r16, %r2, %r1;
mul.wide.u32 %rd9, %r17, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f7, [%rd10];
max.f32 %f10, %f9, %f7;
add.s64 %rd11, %rd1, %rd9;
st.global.f32 [%rd11], %f10;
mul.lo.s32 %r18, %r8, %r7;
add.s32 %r19, %r17, %r16;
min.u32 %r3, %r19, %r18;
add.s32 %r20, %r17, %r8;
setp.ge.u32 %p3, %r20, %r3;
@%p3 bra BB17_5;
BB17_4:
mul.wide.s32 %rd12, %r20, 4;
add.s64 %rd13, %rd2, %rd12;
ld.global.f32 %f8, [%rd13];
max.f32 %f10, %f10, %f8;
add.s64 %rd14, %rd1, %rd12;
st.global.f32 [%rd14], %f10;
add.s32 %r20, %r20, %r8;
setp.lt.u32 %p4, %r20, %r3;
@%p4 bra BB17_4;
BB17_5:
ret;
}
// .globl cumulative_sum_prod_d
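// cumulative_sum_prod_d: block-parallel evaluation of the linear recurrence
//
//   c_i = a_i + b_i * c_{i-1}
//
// over interleaved input pairs (a_i, b_i) read as v2.f64. Each block handles
// param_5 consecutive pairs of the param_4 total; alongside c it carries the
// running product of the b_i, so a block's whole effect on the recurrence is
// the affine pair (c, prod). The mode flag param_6 appears to select the
// pass (inferred from the branches, not documented here): values > 1 consume
// a carry pair from param_2, == 2 writes (c, prod) pairs to param_1, == 3
// writes only the scalar c values, and other values compute aggregates only.
// The main loop is unrolled 4x (BB18_24); all blocks but the last write
// their final (c, prod) pair to param_3 when it is non-null, for a later
// carry-resolution pass.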
.visible .entry cumulative_sum_prod_d(
.param .u64 cumulative_sum_prod_d_param_0,
.param .u64 cumulative_sum_prod_d_param_1,
.param .u64 cumulative_sum_prod_d_param_2,
.param .u64 cumulative_sum_prod_d_param_3,
.param .u32 cumulative_sum_prod_d_param_4,
.param .u32 cumulative_sum_prod_d_param_5,
.param .u32 cumulative_sum_prod_d_param_6
)
{
.reg .pred %p<18>;
.reg .b32 %r<62>;
.reg .f64 %fd<106>;
.reg .b64 %rd<46>;
ld.param.u64 %rd6, [cumulative_sum_prod_d_param_0];
ld.param.u64 %rd7, [cumulative_sum_prod_d_param_1];
ld.param.u64 %rd4, [cumulative_sum_prod_d_param_2];
ld.param.u64 %rd5, [cumulative_sum_prod_d_param_3];
ld.param.u32 %r28, [cumulative_sum_prod_d_param_4];
ld.param.u32 %r29, [cumulative_sum_prod_d_param_5];
ld.param.u32 %r30, [cumulative_sum_prod_d_param_6];
cvta.to.global.u64 %rd1, %rd7;
cvta.to.global.u64 %rd2, %rd6;
mov.u32 %r1, %ctaid.x;
mul.lo.s32 %r2, %r1, %r29;
add.s32 %r31, %r28, -1;
setp.gt.u32 %p1, %r2, %r31;
@%p1 bra BB18_30;
add.s32 %r32, %r2, %r29;
min.u32 %r3, %r32, %r28;
shl.b32 %r33, %r2, 1;
mul.wide.u32 %rd8, %r33, 8;
add.s64 %rd3, %rd2, %rd8;
setp.gt.u32 %p2, %r30, 1;
@%p2 bra BB18_3;
bra.uni BB18_2;
BB18_3:
shl.b32 %r34, %r1, 1;
add.s32 %r4, %r34, -2;
mov.f64 %fd87, 0d0000000000000000;
setp.lt.s32 %p3, %r4, 0;
@%p3 bra BB18_5;
cvta.to.global.u64 %rd9, %rd4;
mul.wide.s32 %rd10, %r4, 8;
add.s64 %rd11, %rd9, %rd10;
ld.global.f64 %fd87, [%rd11];
BB18_5:
ld.global.v2.f64 {%fd40, %fd104}, [%rd3];
fma.rn.f64 %fd105, %fd87, %fd104, %fd40;
bra.uni BB18_6;
BB18_2:
ld.global.v2.f64 {%fd105, %fd104}, [%rd3];
BB18_6:
setp.eq.s32 %p4, %r30, 2;
@%p4 bra BB18_9;
bra.uni BB18_7;
BB18_9:
add.s64 %rd15, %rd1, %rd8;
st.global.v2.f64 [%rd15], {%fd105, %fd104};
bra.uni BB18_10;
BB18_7:
setp.ne.s32 %p5, %r30, 3;
@%p5 bra BB18_10;
mul.wide.u32 %rd12, %r2, 8;
add.s64 %rd13, %rd1, %rd12;
st.global.f64 [%rd13], %fd105;
BB18_10:
add.s32 %r5, %r2, 1;
setp.ge.u32 %p6, %r5, %r3;
@%p6 bra BB18_27;
@%p4 bra BB18_15;
bra.uni BB18_12;
BB18_15:
add.s32 %r39, %r3, -1;
sub.s32 %r11, %r39, %r2;
and.b32 %r38, %r11, 3;
mov.f64 %fd48, 0d0000000000000000;
setp.eq.s32 %p10, %r38, 0;
@%p10 bra BB18_16;
setp.eq.s32 %p11, %r38, 1;
@%p11 bra BB18_21;
setp.eq.s32 %p12, %r38, 2;
@%p12 bra BB18_20;
shl.b32 %r40, %r5, 1;
mul.wide.u32 %rd20, %r40, 8;
add.s64 %rd21, %rd2, %rd20;
ld.global.v2.f64 {%fd49, %fd50}, [%rd21];
mul.f64 %fd104, %fd104, %fd50;
fma.rn.f64 %fd105, %fd105, %fd50, %fd49;
add.s64 %rd22, %rd1, %rd20;
st.global.v2.f64 [%rd22], {%fd105, %fd104};
add.s32 %r5, %r2, 2;
BB18_20:
shl.b32 %r41, %r5, 1;
mul.wide.u32 %rd23, %r41, 8;
add.s64 %rd24, %rd2, %rd23;
ld.global.v2.f64 {%fd53, %fd54}, [%rd24];
mul.f64 %fd104, %fd104, %fd54;
fma.rn.f64 %fd105, %fd105, %fd54, %fd53;
add.s64 %rd25, %rd1, %rd23;
st.global.v2.f64 [%rd25], {%fd105, %fd104};
add.s32 %r5, %r5, 1;
BB18_21:
shl.b32 %r42, %r5, 1;
mul.wide.u32 %rd26, %r42, 8;
add.s64 %rd27, %rd2, %rd26;
ld.global.v2.f64 {%fd57, %fd58}, [%rd27];
mul.f64 %fd96, %fd104, %fd58;
fma.rn.f64 %fd97, %fd105, %fd58, %fd57;
add.s64 %rd28, %rd1, %rd26;
st.global.v2.f64 [%rd28], {%fd97, %fd96};
add.s32 %r5, %r5, 1;
mov.f64 %fd105, %fd97;
mov.f64 %fd104, %fd96;
bra.uni BB18_22;
BB18_12:
setp.ne.s32 %p8, %r30, 3;
@%p8 bra BB18_25;
mad.lo.s32 %r53, %r2, 2, 2;
BB18_14:
mul.wide.u32 %rd16, %r53, 8;
add.s64 %rd17, %rd2, %rd16;
ld.global.v2.f64 {%fd43, %fd44}, [%rd17];
mul.f64 %fd104, %fd104, %fd44;
fma.rn.f64 %fd105, %fd105, %fd44, %fd43;
mul.wide.s32 %rd18, %r5, 8;
add.s64 %rd19, %rd1, %rd18;
st.global.f64 [%rd19], %fd105;
add.s32 %r53, %r53, 2;
add.s32 %r5, %r5, 1;
setp.lt.u32 %p9, %r5, %r3;
@%p9 bra BB18_14;
bra.uni BB18_27;
BB18_25:
mad.lo.s32 %r60, %r2, 2, 2;
BB18_26:
mul.wide.u32 %rd41, %r60, 8;
add.s64 %rd42, %rd2, %rd41;
ld.global.v2.f64 {%fd83, %fd84}, [%rd42];
mul.f64 %fd104, %fd104, %fd84;
fma.rn.f64 %fd105, %fd105, %fd84, %fd83;
add.s32 %r60, %r60, 2;
add.s32 %r5, %r5, 1;
setp.lt.u32 %p15, %r5, %r3;
@%p15 bra BB18_26;
bra.uni BB18_27;
BB18_16:
mov.f64 %fd96, %fd104;
mov.f64 %fd97, %fd105;
mov.f64 %fd105, %fd48;
mov.f64 %fd104, %fd48;
BB18_22:
setp.lt.u32 %p13, %r11, 4;
@%p13 bra BB18_27;
shl.b32 %r58, %r5, 1;
mov.f64 %fd104, %fd96;
mov.f64 %fd105, %fd97;
BB18_24:
mul.wide.u32 %rd29, %r58, 8;
add.s64 %rd30, %rd2, %rd29;
ld.global.v2.f64 {%fd61, %fd62}, [%rd30];
add.s64 %rd31, %rd1, %rd29;
fma.rn.f64 %fd65, %fd105, %fd62, %fd61;
mul.f64 %fd66, %fd104, %fd62;
st.global.v2.f64 [%rd31], {%fd65, %fd66};
add.s32 %r43, %r58, 2;
mul.wide.u32 %rd32, %r43, 8;
add.s64 %rd33, %rd2, %rd32;
ld.global.v2.f64 {%fd67, %fd68}, [%rd33];
add.s64 %rd34, %rd1, %rd32;
fma.rn.f64 %fd71, %fd65, %fd68, %fd67;
mul.f64 %fd72, %fd66, %fd68;
st.global.v2.f64 [%rd34], {%fd71, %fd72};
add.s32 %r44, %r58, 4;
mul.wide.u32 %rd35, %r44, 8;
add.s64 %rd36, %rd2, %rd35;
ld.global.v2.f64 {%fd73, %fd74}, [%rd36];
add.s64 %rd37, %rd1, %rd35;
fma.rn.f64 %fd77, %fd71, %fd74, %fd73;
mul.f64 %fd78, %fd72, %fd74;
st.global.v2.f64 [%rd37], {%fd77, %fd78};
add.s32 %r45, %r58, 6;
mul.wide.u32 %rd38, %r45, 8;
add.s64 %rd39, %rd2, %rd38;
ld.global.v2.f64 {%fd79, %fd80}, [%rd39];
mul.f64 %fd104, %fd78, %fd80;
fma.rn.f64 %fd105, %fd77, %fd80, %fd79;
add.s64 %rd40, %rd1, %rd38;
st.global.v2.f64 [%rd40], {%fd105, %fd104};
add.s32 %r58, %r58, 8;
add.s32 %r5, %r5, 4;
setp.lt.u32 %p14, %r5, %r3;
@%p14 bra BB18_24;
BB18_27:
setp.eq.s64 %p16, %rd5, 0;
@%p16 bra BB18_30;
mov.u32 %r48, %nctaid.x;
add.s32 %r49, %r48, -1;
setp.ge.u32 %p17, %r1, %r49;
@%p17 bra BB18_30;
shl.b32 %r52, %r1, 1;
cvta.to.global.u64 %rd43, %rd5;
mul.wide.u32 %rd44, %r52, 8;
add.s64 %rd45, %rd43, %rd44;
st.global.v2.f64 [%rd45], {%fd105, %fd104};
BB18_30:
ret;
}
// .globl cumulative_sum_prod_f
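// cumulative_sum_prod_f: single-precision counterpart of the kernel above
// (v2.f32 pairs, 4-byte element strides), with the same mode-flag behavior.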
.visible .entry cumulative_sum_prod_f(
.param .u64 cumulative_sum_prod_f_param_0,
.param .u64 cumulative_sum_prod_f_param_1,
.param .u64 cumulative_sum_prod_f_param_2,
.param .u64 cumulative_sum_prod_f_param_3,
.param .u32 cumulative_sum_prod_f_param_4,
.param .u32 cumulative_sum_prod_f_param_5,
.param .u32 cumulative_sum_prod_f_param_6
)
{
.reg .pred %p<18>;
.reg .f32 %f<106>;
.reg .b32 %r<62>;
.reg .b64 %rd<46>;
ld.param.u64 %rd6, [cumulative_sum_prod_f_param_0];
ld.param.u64 %rd7, [cumulative_sum_prod_f_param_1];
ld.param.u64 %rd4, [cumulative_sum_prod_f_param_2];
ld.param.u64 %rd5, [cumulative_sum_prod_f_param_3];
ld.param.u32 %r28, [cumulative_sum_prod_f_param_4];
ld.param.u32 %r29, [cumulative_sum_prod_f_param_5];
ld.param.u32 %r30, [cumulative_sum_prod_f_param_6];
cvta.to.global.u64 %rd1, %rd7;
cvta.to.global.u64 %rd2, %rd6;
mov.u32 %r1, %ctaid.x;
mul.lo.s32 %r2, %r1, %r29;
add.s32 %r31, %r28, -1;
setp.gt.u32 %p1, %r2, %r31;
@%p1 bra BB19_30;
add.s32 %r32, %r2, %r29;
min.u32 %r3, %r32, %r28;
shl.b32 %r33, %r2, 1;
mul.wide.u32 %rd8, %r33, 4;
add.s64 %rd3, %rd2, %rd8;
setp.gt.u32 %p2, %r30, 1;
@%p2 bra BB19_3;
bra.uni BB19_2;
BB19_3:
shl.b32 %r34, %r1, 1;
add.s32 %r4, %r34, -2;
mov.f32 %f87, 0f00000000;
setp.lt.s32 %p3, %r4, 0;
@%p3 bra BB19_5;
cvta.to.global.u64 %rd9, %rd4;
mul.wide.s32 %rd10, %r4, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f87, [%rd11];
BB19_5:
ld.global.v2.f32 {%f40, %f104}, [%rd3];
fma.rn.f32 %f105, %f87, %f104, %f40;
bra.uni BB19_6;
BB19_2:
ld.global.v2.f32 {%f105, %f104}, [%rd3];
BB19_6:
setp.eq.s32 %p4, %r30, 2;
@%p4 bra BB19_9;
bra.uni BB19_7;
BB19_9:
add.s64 %rd15, %rd1, %rd8;
st.global.v2.f32 [%rd15], {%f105, %f104};
bra.uni BB19_10;
BB19_7:
setp.ne.s32 %p5, %r30, 3;
@%p5 bra BB19_10;
mul.wide.u32 %rd12, %r2, 4;
add.s64 %rd13, %rd1, %rd12;
st.global.f32 [%rd13], %f105;
BB19_10:
add.s32 %r5, %r2, 1;
setp.ge.u32 %p6, %r5, %r3;
@%p6 bra BB19_27;
@%p4 bra BB19_15;
bra.uni BB19_12;
BB19_15:
add.s32 %r39, %r3, -1;
sub.s32 %r11, %r39, %r2;
and.b32 %r38, %r11, 3;
mov.f32 %f48, 0f00000000;
setp.eq.s32 %p10, %r38, 0;
@%p10 bra BB19_16;
setp.eq.s32 %p11, %r38, 1;
@%p11 bra BB19_21;
setp.eq.s32 %p12, %r38, 2;
@%p12 bra BB19_20;
shl.b32 %r40, %r5, 1;
mul.wide.u32 %rd20, %r40, 4;
add.s64 %rd21, %rd2, %rd20;
ld.global.v2.f32 {%f49, %f50}, [%rd21];
mul.f32 %f104, %f104, %f50;
fma.rn.f32 %f105, %f105, %f50, %f49;
add.s64 %rd22, %rd1, %rd20;
st.global.v2.f32 [%rd22], {%f105, %f104};
add.s32 %r5, %r2, 2;
BB19_20:
shl.b32 %r41, %r5, 1;
mul.wide.u32 %rd23, %r41, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.v2.f32 {%f53, %f54}, [%rd24];
mul.f32 %f104, %f104, %f54;
fma.rn.f32 %f105, %f105, %f54, %f53;
add.s64 %rd25, %rd1, %rd23;
st.global.v2.f32 [%rd25], {%f105, %f104};
add.s32 %r5, %r5, 1;
BB19_21:
shl.b32 %r42, %r5, 1;
mul.wide.u32 %rd26, %r42, 4;
add.s64 %rd27, %rd2, %rd26;
ld.global.v2.f32 {%f57, %f58}, [%rd27];
mul.f32 %f96, %f104, %f58;
fma.rn.f32 %f97, %f105, %f58, %f57;
add.s64 %rd28, %rd1, %rd26;
st.global.v2.f32 [%rd28], {%f97, %f96};
add.s32 %r5, %r5, 1;
mov.f32 %f105, %f97;
mov.f32 %f104, %f96;
bra.uni BB19_22;
BB19_12:
setp.ne.s32 %p8, %r30, 3;
@%p8 bra BB19_25;
mad.lo.s32 %r53, %r2, 2, 2;
BB19_14:
mul.wide.u32 %rd16, %r53, 4;
add.s64 %rd17, %rd2, %rd16;
ld.global.v2.f32 {%f43, %f44}, [%rd17];
mul.f32 %f104, %f104, %f44;
fma.rn.f32 %f105, %f105, %f44, %f43;
mul.wide.s32 %rd18, %r5, 4;
add.s64 %rd19, %rd1, %rd18;
st.global.f32 [%rd19], %f105;
add.s32 %r53, %r53, 2;
add.s32 %r5, %r5, 1;
setp.lt.u32 %p9, %r5, %r3;
@%p9 bra BB19_14;
bra.uni BB19_27;
BB19_25:
mad.lo.s32 %r60, %r2, 2, 2;
BB19_26:
mul.wide.u32 %rd41, %r60, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.v2.f32 {%f83, %f84}, [%rd42];
mul.f32 %f104, %f104, %f84;
fma.rn.f32 %f105, %f105, %f84, %f83;
add.s32 %r60, %r60, 2;
add.s32 %r5, %r5, 1;
setp.lt.u32 %p15, %r5, %r3;
@%p15 bra BB19_26;
bra.uni BB19_27;
BB19_16:
mov.f32 %f96, %f104;
mov.f32 %f97, %f105;
mov.f32 %f105, %f48;
mov.f32 %f104, %f48;
BB19_22:
setp.lt.u32 %p13, %r11, 4;
@%p13 bra BB19_27;
shl.b32 %r58, %r5, 1;
mov.f32 %f104, %f96;
mov.f32 %f105, %f97;
BB19_24:
mul.wide.u32 %rd29, %r58, 4;
add.s64 %rd30, %rd2, %rd29;
ld.global.v2.f32 {%f61, %f62}, [%rd30];
add.s64 %rd31, %rd1, %rd29;
fma.rn.f32 %f65, %f105, %f62, %f61;
mul.f32 %f66, %f104, %f62;
st.global.v2.f32 [%rd31], {%f65, %f66};
add.s32 %r43, %r58, 2;
mul.wide.u32 %rd32, %r43, 4;
add.s64 %rd33, %rd2, %rd32;
ld.global.v2.f32 {%f67, %f68}, [%rd33];
add.s64 %rd34, %rd1, %rd32;
fma.rn.f32 %f71, %f65, %f68, %f67;
mul.f32 %f72, %f66, %f68;
st.global.v2.f32 [%rd34], {%f71, %f72};
add.s32 %r44, %r58, 4;
mul.wide.u32 %rd35, %r44, 4;
add.s64 %rd36, %rd2, %rd35;
ld.global.v2.f32 {%f73, %f74}, [%rd36];
add.s64 %rd37, %rd1, %rd35;
fma.rn.f32 %f77, %f71, %f74, %f73;
mul.f32 %f78, %f72, %f74;
st.global.v2.f32 [%rd37], {%f77, %f78};
add.s32 %r45, %r58, 6;
mul.wide.u32 %rd38, %r45, 4;
add.s64 %rd39, %rd2, %rd38;
ld.global.v2.f32 {%f79, %f80}, [%rd39];
mul.f32 %f104, %f78, %f80;
fma.rn.f32 %f105, %f77, %f80, %f79;
add.s64 %rd40, %rd1, %rd38;
st.global.v2.f32 [%rd40], {%f105, %f104};
add.s32 %r58, %r58, 8;
add.s32 %r5, %r5, 4;
setp.lt.u32 %p14, %r5, %r3;
@%p14 bra BB19_24;
BB19_27:
setp.eq.s64 %p16, %rd5, 0;
@%p16 bra BB19_30;
mov.u32 %r48, %nctaid.x;
add.s32 %r49, %r48, -1;
setp.ge.u32 %p17, %r1, %r49;
@%p17 bra BB19_30;
shl.b32 %r52, %r1, 1;
cvta.to.global.u64 %rd43, %rd5;
mul.wide.u32 %rd44, %r52, 4;
add.s64 %rd45, %rd43, %rd44;
st.global.v2.f32 [%rd45], {%f105, %f104};
BB19_30:
ret;
}
// .globl sparse_dense_im2col_d
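// sparse_dense_im2col_d: im2col expansion of a sparse (CSR) input into a
// dense output, one thread per non-zero (param_4 = nnz). The thread finds
// its row by linearly scanning the row-pointer array param_1 until the
// pointer exceeds the non-zero's index (BB20_2), reads its column index from
// param_2, decodes it into (channel, h, w) with the div/rem chain, advances
// to the first filter offset compatible with the stride in each dimension
// (the BB20_4/BB20_5 alignment loops), and scatters its value into every
// im2col output cell the input pixel contributes to. Params 5..19 carry the
// convolution geometry (input/filter/output extents, strides, padding);
// their individual roles are inferred from the arithmetic rather than
// documented here.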
.visible .entry sparse_dense_im2col_d(
.param .u64 sparse_dense_im2col_d_param_0,
.param .u64 sparse_dense_im2col_d_param_1,
.param .u64 sparse_dense_im2col_d_param_2,
.param .u64 sparse_dense_im2col_d_param_3,
.param .u32 sparse_dense_im2col_d_param_4,
.param .u32 sparse_dense_im2col_d_param_5,
.param .u32 sparse_dense_im2col_d_param_6,
.param .u32 sparse_dense_im2col_d_param_7,
.param .u32 sparse_dense_im2col_d_param_8,
.param .u32 sparse_dense_im2col_d_param_9,
.param .u32 sparse_dense_im2col_d_param_10,
.param .u32 sparse_dense_im2col_d_param_11,
.param .u32 sparse_dense_im2col_d_param_12,
.param .u32 sparse_dense_im2col_d_param_13,
.param .u32 sparse_dense_im2col_d_param_14,
.param .u32 sparse_dense_im2col_d_param_15,
.param .u32 sparse_dense_im2col_d_param_16,
.param .u32 sparse_dense_im2col_d_param_17,
.param .u32 sparse_dense_im2col_d_param_18,
.param .u32 sparse_dense_im2col_d_param_19
)
{
.reg .pred %p<13>;
.reg .b32 %r<72>;
.reg .f64 %fd<2>;
.reg .b64 %rd<17>;
ld.param.u64 %rd3, [sparse_dense_im2col_d_param_0];
ld.param.u64 %rd4, [sparse_dense_im2col_d_param_1];
ld.param.u64 %rd5, [sparse_dense_im2col_d_param_2];
ld.param.u64 %rd6, [sparse_dense_im2col_d_param_3];
ld.param.u32 %r35, [sparse_dense_im2col_d_param_4];
ld.param.u32 %r22, [sparse_dense_im2col_d_param_7];
ld.param.u32 %r23, [sparse_dense_im2col_d_param_8];
ld.param.u32 %r24, [sparse_dense_im2col_d_param_9];
ld.param.u32 %r25, [sparse_dense_im2col_d_param_10];
ld.param.u32 %r26, [sparse_dense_im2col_d_param_11];
ld.param.u32 %r27, [sparse_dense_im2col_d_param_12];
ld.param.u32 %r28, [sparse_dense_im2col_d_param_13];
ld.param.u32 %r29, [sparse_dense_im2col_d_param_14];
ld.param.u32 %r30, [sparse_dense_im2col_d_param_15];
ld.param.u32 %r31, [sparse_dense_im2col_d_param_16];
ld.param.u32 %r32, [sparse_dense_im2col_d_param_17];
ld.param.u32 %r33, [sparse_dense_im2col_d_param_18];
ld.param.u32 %r34, [sparse_dense_im2col_d_param_19];
mov.u32 %r36, %ntid.x;
mov.u32 %r37, %ctaid.x;
mov.u32 %r38, %tid.x;
mad.lo.s32 %r1, %r36, %r37, %r38;
setp.ge.s32 %p1, %r1, %r35;
@%p1 bra BB20_11;
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.s32 %rd8, %r1, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
mov.u32 %r67, 0;
BB20_2:
mov.u32 %r2, %r67;
add.s32 %r67, %r2, 1;
mul.wide.s32 %rd10, %r67, 4;
add.s64 %rd11, %rd2, %rd10;
ld.global.u32 %r40, [%rd11];
setp.le.s32 %p2, %r40, %r1;
@%p2 bra BB20_2;
mul.wide.s32 %rd12, %r1, 4;
add.s64 %rd13, %rd1, %rd12;
ld.global.u32 %r41, [%rd13];
div.s32 %r4, %r41, %r22;
rem.s32 %r42, %r41, %r22;
div.s32 %r43, %r42, %r23;
rem.s32 %r44, %r42, %r23;
add.s32 %r5, %r43, %r33;
mul.lo.s32 %r45, %r31, %r26;
mov.u32 %r46, 1;
sub.s32 %r47, %r46, %r45;
add.s32 %r48, %r47, %r5;
mov.u32 %r49, 0;
max.s32 %r68, %r49, %r48;
add.s32 %r50, %r24, -1;
min.s32 %r7, %r50, %r5;
add.s32 %r8, %r44, %r34;
mul.lo.s32 %r51, %r32, %r27;
sub.s32 %r52, %r46, %r51;
add.s32 %r53, %r52, %r8;
max.s32 %r69, %r49, %r53;
add.s32 %r54, %r25, -1;
min.s32 %r10, %r54, %r8;
BB20_4:
mov.u32 %r70, %r68;
sub.s32 %r55, %r5, %r70;
rem.s32 %r56, %r55, %r31;
setp.ne.s32 %p3, %r56, 0;
setp.le.s32 %p4, %r70, %r7;
and.pred %p5, %p4, %p3;
add.s32 %r68, %r70, 1;
@%p5 bra BB20_4;
BB20_5:
mov.u32 %r13, %r69;
sub.s32 %r57, %r8, %r13;
rem.s32 %r58, %r57, %r32;
setp.ne.s32 %p6, %r58, 0;
setp.le.s32 %p7, %r13, %r10;
and.pred %p8, %p7, %p6;
add.s32 %r69, %r13, 1;
@%p8 bra BB20_5;
setp.gt.s32 %p9, %r70, %r7;
@%p9 bra BB20_11;
mul.lo.s32 %r15, %r2, %r28;
mul.lo.s32 %r16, %r4, %r29;
cvta.to.global.u64 %rd14, %rd6;
BB20_8:
sub.s32 %r59, %r5, %r70;
div.s32 %r60, %r59, %r31;
mad.lo.s32 %r18, %r60, %r27, %r15;
setp.gt.s32 %p10, %r13, %r10;
mov.u32 %r71, %r13;
@%p10 bra BB20_10;
BB20_9:
sub.s32 %r61, %r8, %r71;
div.s32 %r62, %r61, %r32;
mad.lo.s32 %r63, %r70, %r25, %r16;
add.s32 %r64, %r63, %r71;
mad.lo.s32 %r65, %r64, %r30, %r18;
add.s32 %r66, %r65, %r62;
mul.wide.s32 %rd15, %r66, 8;
add.s64 %rd16, %rd14, %rd15;
st.global.f64 [%rd16], %fd1;
add.s32 %r71, %r71, %r32;
setp.le.s32 %p11, %r71, %r10;
@%p11 bra BB20_9;
BB20_10:
add.s32 %r70, %r70, %r31;
setp.le.s32 %p12, %r70, %r7;
@%p12 bra BB20_8;
BB20_11:
ret;
}
// .globl sparse_dense_im2col_f
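// sparse_dense_im2col_f: single-precision counterpart of the kernel above.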
.visible .entry sparse_dense_im2col_f(
.param .u64 sparse_dense_im2col_f_param_0,
.param .u64 sparse_dense_im2col_f_param_1,
.param .u64 sparse_dense_im2col_f_param_2,
.param .u64 sparse_dense_im2col_f_param_3,
.param .u32 sparse_dense_im2col_f_param_4,
.param .u32 sparse_dense_im2col_f_param_5,
.param .u32 sparse_dense_im2col_f_param_6,
.param .u32 sparse_dense_im2col_f_param_7,
.param .u32 sparse_dense_im2col_f_param_8,
.param .u32 sparse_dense_im2col_f_param_9,
.param .u32 sparse_dense_im2col_f_param_10,
.param .u32 sparse_dense_im2col_f_param_11,
.param .u32 sparse_dense_im2col_f_param_12,
.param .u32 sparse_dense_im2col_f_param_13,
.param .u32 sparse_dense_im2col_f_param_14,
.param .u32 sparse_dense_im2col_f_param_15,
.param .u32 sparse_dense_im2col_f_param_16,
.param .u32 sparse_dense_im2col_f_param_17,
.param .u32 sparse_dense_im2col_f_param_18,
.param .u32 sparse_dense_im2col_f_param_19
)
{
.reg .pred %p<13>;
.reg .f32 %f<2>;
.reg .b32 %r<72>;
.reg .b64 %rd<17>;
ld.param.u64 %rd3, [sparse_dense_im2col_f_param_0];
ld.param.u64 %rd4, [sparse_dense_im2col_f_param_1];
ld.param.u64 %rd5, [sparse_dense_im2col_f_param_2];
ld.param.u64 %rd6, [sparse_dense_im2col_f_param_3];
ld.param.u32 %r35, [sparse_dense_im2col_f_param_4];
ld.param.u32 %r22, [sparse_dense_im2col_f_param_7];
ld.param.u32 %r23, [sparse_dense_im2col_f_param_8];
ld.param.u32 %r24, [sparse_dense_im2col_f_param_9];
ld.param.u32 %r25, [sparse_dense_im2col_f_param_10];
ld.param.u32 %r26, [sparse_dense_im2col_f_param_11];
ld.param.u32 %r27, [sparse_dense_im2col_f_param_12];
ld.param.u32 %r28, [sparse_dense_im2col_f_param_13];
ld.param.u32 %r29, [sparse_dense_im2col_f_param_14];
ld.param.u32 %r30, [sparse_dense_im2col_f_param_15];
ld.param.u32 %r31, [sparse_dense_im2col_f_param_16];
ld.param.u32 %r32, [sparse_dense_im2col_f_param_17];
ld.param.u32 %r33, [sparse_dense_im2col_f_param_18];
ld.param.u32 %r34, [sparse_dense_im2col_f_param_19];
mov.u32 %r36, %ntid.x;
mov.u32 %r37, %ctaid.x;
mov.u32 %r38, %tid.x;
mad.lo.s32 %r1, %r36, %r37, %r38;
setp.ge.s32 %p1, %r1, %r35;
@%p1 bra BB21_11;
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd4;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.s32 %rd8, %r1, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f1, [%rd9];
mov.u32 %r67, 0;
BB21_2:
mov.u32 %r2, %r67;
add.s32 %r67, %r2, 1;
mul.wide.s32 %rd10, %r67, 4;
add.s64 %rd11, %rd2, %rd10;
ld.global.u32 %r40, [%rd11];
setp.le.s32 %p2, %r40, %r1;
@%p2 bra BB21_2;
add.s64 %rd13, %rd1, %rd8;
ld.global.u32 %r41, [%rd13];
div.s32 %r4, %r41, %r22;
rem.s32 %r42, %r41, %r22;
div.s32 %r43, %r42, %r23;
rem.s32 %r44, %r42, %r23;
add.s32 %r5, %r43, %r33;
mul.lo.s32 %r45, %r31, %r26;
mov.u32 %r46, 1;
sub.s32 %r47, %r46, %r45;
add.s32 %r48, %r47, %r5;
mov.u32 %r49, 0;
max.s32 %r68, %r49, %r48;
add.s32 %r50, %r24, -1;
min.s32 %r7, %r50, %r5;
add.s32 %r8, %r44, %r34;
mul.lo.s32 %r51, %r32, %r27;
sub.s32 %r52, %r46, %r51;
add.s32 %r53, %r52, %r8;
max.s32 %r69, %r49, %r53;
add.s32 %r54, %r25, -1;
min.s32 %r10, %r54, %r8;
BB21_4:
mov.u32 %r70, %r68;
sub.s32 %r55, %r5, %r70;
rem.s32 %r56, %r55, %r31;
setp.ne.s32 %p3, %r56, 0;
setp.le.s32 %p4, %r70, %r7;
and.pred %p5, %p4, %p3;
add.s32 %r68, %r70, 1;
@%p5 bra BB21_4;
BB21_5:
mov.u32 %r13, %r69;
sub.s32 %r57, %r8, %r13;
rem.s32 %r58, %r57, %r32;
setp.ne.s32 %p6, %r58, 0;
setp.le.s32 %p7, %r13, %r10;
and.pred %p8, %p7, %p6;
add.s32 %r69, %r13, 1;
@%p8 bra BB21_5;
setp.gt.s32 %p9, %r70, %r7;
@%p9 bra BB21_11;
mul.lo.s32 %r15, %r2, %r28;
mul.lo.s32 %r16, %r4, %r29;
cvta.to.global.u64 %rd14, %rd6;
BB21_8:
sub.s32 %r59, %r5, %r70;
div.s32 %r60, %r59, %r31;
mad.lo.s32 %r18, %r60, %r27, %r15;
setp.gt.s32 %p10, %r13, %r10;
mov.u32 %r71, %r13;
@%p10 bra BB21_10;
BB21_9:
sub.s32 %r61, %r8, %r71;
div.s32 %r62, %r61, %r32;
mad.lo.s32 %r63, %r70, %r25, %r16;
add.s32 %r64, %r63, %r71;
mad.lo.s32 %r65, %r64, %r30, %r18;
add.s32 %r66, %r65, %r62;
mul.wide.s32 %rd15, %r66, 4;
add.s64 %rd16, %rd14, %rd15;
st.global.f32 [%rd16], %f1;
add.s32 %r71, %r71, %r32;
setp.le.s32 %p11, %r71, %r10;
@%p11 bra BB21_9;
BB21_10:
add.s32 %r70, %r70, %r31;
setp.le.s32 %p12, %r70, %r7;
@%p12 bra BB21_8;
BB21_11:
ret;
}
// .globl dense_dense_im2col_d
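// dense_dense_im2col_d: dense variant of the im2col scatter, one thread per
// input element (param_2 = element count). The element's coordinates are
// recovered directly from its linear index via the div/rem chain over
// param_3..param_5 (flattened extents), after which the same stride/padding
// footprint walk as in sparse_dense_im2col_d writes the value into each
// participating output cell.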
.visible .entry dense_dense_im2col_d(
.param .u64 dense_dense_im2col_d_param_0,
.param .u64 dense_dense_im2col_d_param_1,
.param .u32 dense_dense_im2col_d_param_2,
.param .u32 dense_dense_im2col_d_param_3,
.param .u32 dense_dense_im2col_d_param_4,
.param .u32 dense_dense_im2col_d_param_5,
.param .u32 dense_dense_im2col_d_param_6,
.param .u32 dense_dense_im2col_d_param_7,
.param .u32 dense_dense_im2col_d_param_8,
.param .u32 dense_dense_im2col_d_param_9,
.param .u32 dense_dense_im2col_d_param_10,
.param .u32 dense_dense_im2col_d_param_11,
.param .u32 dense_dense_im2col_d_param_12,
.param .u32 dense_dense_im2col_d_param_13,
.param .u32 dense_dense_im2col_d_param_14,
.param .u32 dense_dense_im2col_d_param_15,
.param .u32 dense_dense_im2col_d_param_16
)
{
.reg .pred %p<12>;
.reg .b32 %r<69>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [dense_dense_im2col_d_param_0];
ld.param.u64 %rd2, [dense_dense_im2col_d_param_1];
ld.param.u32 %r35, [dense_dense_im2col_d_param_2];
ld.param.u32 %r21, [dense_dense_im2col_d_param_3];
ld.param.u32 %r22, [dense_dense_im2col_d_param_4];
ld.param.u32 %r23, [dense_dense_im2col_d_param_5];
ld.param.u32 %r24, [dense_dense_im2col_d_param_6];
ld.param.u32 %r25, [dense_dense_im2col_d_param_7];
ld.param.u32 %r26, [dense_dense_im2col_d_param_8];
ld.param.u32 %r27, [dense_dense_im2col_d_param_9];
ld.param.u32 %r28, [dense_dense_im2col_d_param_10];
ld.param.u32 %r29, [dense_dense_im2col_d_param_11];
ld.param.u32 %r30, [dense_dense_im2col_d_param_12];
ld.param.u32 %r31, [dense_dense_im2col_d_param_13];
ld.param.u32 %r32, [dense_dense_im2col_d_param_14];
ld.param.u32 %r33, [dense_dense_im2col_d_param_15];
ld.param.u32 %r34, [dense_dense_im2col_d_param_16];
mov.u32 %r36, %ctaid.x;
mov.u32 %r37, %ntid.x;
mov.u32 %r38, %tid.x;
mad.lo.s32 %r1, %r37, %r36, %r38;
setp.ge.s32 %p1, %r1, %r35;
@%p1 bra BB22_9;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
div.s32 %r2, %r1, %r21;
rem.s32 %r39, %r1, %r21;
div.s32 %r3, %r39, %r22;
rem.s32 %r40, %r39, %r22;
div.s32 %r41, %r40, %r23;
rem.s32 %r42, %r40, %r23;
add.s32 %r4, %r41, %r33;
mul.lo.s32 %r43, %r31, %r26;
mov.u32 %r44, 1;
sub.s32 %r45, %r44, %r43;
add.s32 %r46, %r45, %r4;
mov.u32 %r47, 0;
max.s32 %r65, %r47, %r46;
add.s32 %r48, %r24, -1;
min.s32 %r6, %r48, %r4;
add.s32 %r7, %r42, %r34;
mul.lo.s32 %r49, %r32, %r27;
sub.s32 %r50, %r44, %r49;
add.s32 %r51, %r50, %r7;
max.s32 %r66, %r47, %r51;
add.s32 %r52, %r25, -1;
min.s32 %r9, %r52, %r7;
BB22_2:
mov.u32 %r67, %r65;
sub.s32 %r53, %r4, %r67;
rem.s32 %r54, %r53, %r31;
setp.ne.s32 %p2, %r54, 0;
setp.le.s32 %p3, %r67, %r6;
and.pred %p4, %p3, %p2;
add.s32 %r65, %r67, 1;
@%p4 bra BB22_2;
BB22_3:
mov.u32 %r12, %r66;
sub.s32 %r55, %r7, %r12;
rem.s32 %r56, %r55, %r32;
setp.ne.s32 %p5, %r56, 0;
setp.le.s32 %p6, %r12, %r9;
and.pred %p7, %p6, %p5;
add.s32 %r66, %r12, 1;
@%p7 bra BB22_3;
setp.gt.s32 %p8, %r67, %r6;
@%p8 bra BB22_9;
mul.lo.s32 %r14, %r2, %r28;
mul.lo.s32 %r15, %r3, %r29;
cvta.to.global.u64 %rd6, %rd2;
BB22_6:
sub.s32 %r57, %r4, %r67;
div.s32 %r58, %r57, %r31;
mad.lo.s32 %r17, %r58, %r27, %r14;
setp.gt.s32 %p9, %r12, %r9;
mov.u32 %r68, %r12;
@%p9 bra BB22_8;
BB22_7:
sub.s32 %r59, %r7, %r68;
div.s32 %r60, %r59, %r32;
mad.lo.s32 %r61, %r67, %r25, %r15;
add.s32 %r62, %r61, %r68;
mad.lo.s32 %r63, %r62, %r30, %r17;
add.s32 %r64, %r63, %r60;
mul.wide.s32 %rd7, %r64, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
add.s32 %r68, %r68, %r32;
setp.le.s32 %p10, %r68, %r9;
@%p10 bra BB22_7;
BB22_8:
add.s32 %r67, %r67, %r31;
setp.le.s32 %p11, %r67, %r6;
@%p11 bra BB22_6;
BB22_9:
ret;
}
// .globl dense_dense_im2col_f
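// dense_dense_im2col_f: single-precision counterpart of the kernel above.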
.visible .entry dense_dense_im2col_f(
.param .u64 dense_dense_im2col_f_param_0,
.param .u64 dense_dense_im2col_f_param_1,
.param .u32 dense_dense_im2col_f_param_2,
.param .u32 dense_dense_im2col_f_param_3,
.param .u32 dense_dense_im2col_f_param_4,
.param .u32 dense_dense_im2col_f_param_5,
.param .u32 dense_dense_im2col_f_param_6,
.param .u32 dense_dense_im2col_f_param_7,
.param .u32 dense_dense_im2col_f_param_8,
.param .u32 dense_dense_im2col_f_param_9,
.param .u32 dense_dense_im2col_f_param_10,
.param .u32 dense_dense_im2col_f_param_11,
.param .u32 dense_dense_im2col_f_param_12,
.param .u32 dense_dense_im2col_f_param_13,
.param .u32 dense_dense_im2col_f_param_14,
.param .u32 dense_dense_im2col_f_param_15,
.param .u32 dense_dense_im2col_f_param_16
)
{
.reg .pred %p<12>;
.reg .f32 %f<2>;
.reg .b32 %r<69>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [dense_dense_im2col_f_param_0];
ld.param.u64 %rd2, [dense_dense_im2col_f_param_1];
ld.param.u32 %r35, [dense_dense_im2col_f_param_2];
ld.param.u32 %r21, [dense_dense_im2col_f_param_3];
ld.param.u32 %r22, [dense_dense_im2col_f_param_4];
ld.param.u32 %r23, [dense_dense_im2col_f_param_5];
ld.param.u32 %r24, [dense_dense_im2col_f_param_6];
ld.param.u32 %r25, [dense_dense_im2col_f_param_7];
ld.param.u32 %r26, [dense_dense_im2col_f_param_8];
ld.param.u32 %r27, [dense_dense_im2col_f_param_9];
ld.param.u32 %r28, [dense_dense_im2col_f_param_10];
ld.param.u32 %r29, [dense_dense_im2col_f_param_11];
ld.param.u32 %r30, [dense_dense_im2col_f_param_12];
ld.param.u32 %r31, [dense_dense_im2col_f_param_13];
ld.param.u32 %r32, [dense_dense_im2col_f_param_14];
ld.param.u32 %r33, [dense_dense_im2col_f_param_15];
ld.param.u32 %r34, [dense_dense_im2col_f_param_16];
mov.u32 %r36, %ctaid.x;
mov.u32 %r37, %ntid.x;
mov.u32 %r38, %tid.x;
mad.lo.s32 %r1, %r37, %r36, %r38;
setp.ge.s32 %p1, %r1, %r35;
@%p1 bra BB23_9;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
div.s32 %r2, %r1, %r21;
rem.s32 %r39, %r1, %r21;
div.s32 %r3, %r39, %r22;
rem.s32 %r40, %r39, %r22;
div.s32 %r41, %r40, %r23;
rem.s32 %r42, %r40, %r23;
add.s32 %r4, %r41, %r33;
mul.lo.s32 %r43, %r31, %r26;
mov.u32 %r44, 1;
sub.s32 %r45, %r44, %r43;
add.s32 %r46, %r45, %r4;
mov.u32 %r47, 0;
max.s32 %r65, %r47, %r46;
add.s32 %r48, %r24, -1;
min.s32 %r6, %r48, %r4;
add.s32 %r7, %r42, %r34;
mul.lo.s32 %r49, %r32, %r27;
sub.s32 %r50, %r44, %r49;
add.s32 %r51, %r50, %r7;
max.s32 %r66, %r47, %r51;
add.s32 %r52, %r25, -1;
min.s32 %r9, %r52, %r7;
BB23_2:
mov.u32 %r67, %r65;
sub.s32 %r53, %r4, %r67;
rem.s32 %r54, %r53, %r31;
setp.ne.s32 %p2, %r54, 0;
setp.le.s32 %p3, %r67, %r6;
and.pred %p4, %p3, %p2;
add.s32 %r65, %r67, 1;
@%p4 bra BB23_2;
BB23_3:
mov.u32 %r12, %r66;
sub.s32 %r55, %r7, %r12;
rem.s32 %r56, %r55, %r32;
setp.ne.s32 %p5, %r56, 0;
setp.le.s32 %p6, %r12, %r9;
and.pred %p7, %p6, %p5;
add.s32 %r66, %r12, 1;
@%p7 bra BB23_3;
setp.gt.s32 %p8, %r67, %r6;
@%p8 bra BB23_9;
mul.lo.s32 %r14, %r2, %r28;
mul.lo.s32 %r15, %r3, %r29;
cvta.to.global.u64 %rd6, %rd2;
BB23_6:
sub.s32 %r57, %r4, %r67;
div.s32 %r58, %r57, %r31;
mad.lo.s32 %r17, %r58, %r27, %r14;
setp.gt.s32 %p9, %r12, %r9;
mov.u32 %r68, %r12;
@%p9 bra BB23_8;
BB23_7:
sub.s32 %r59, %r7, %r68;
div.s32 %r60, %r59, %r32;
mad.lo.s32 %r61, %r67, %r25, %r15;
add.s32 %r62, %r61, %r68;
mad.lo.s32 %r63, %r62, %r30, %r17;
add.s32 %r64, %r63, %r60;
mul.wide.s32 %rd7, %r64, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
add.s32 %r68, %r68, %r32;
setp.le.s32 %p10, %r68, %r9;
@%p10 bra BB23_7;
BB23_8:
add.s32 %r67, %r67, %r31;
setp.le.s32 %p11, %r67, %r6;
@%p11 bra BB23_6;
BB23_9:
ret;
}
// .globl reorg_knpq_d
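// reorg_knpq_{d,f}: judging by the name and the index arithmetic below,
// these transpose the two leading axes of an N x K x P x Q tensor, i.e.
// out[k][n][p][q] = in[n][k][p][q]. A minimal CUDA sketch consistent with
// this PTX (a reconstruction, not the original source; the names and the
// K*PQ / N*PQ / PQ parameter roles are assumptions):
//
// __global__ void reorg_knpq_d(double *in, double *out, int size,
//                              int KPQ, int NPQ, int PQ) {
//   int i = blockIdx.x * blockDim.x + threadIdx.x;
//   if (i < size) {
//     int n  = i / KPQ;           // image index
//     int k  = (i % KPQ) / PQ;    // channel/filter index
//     int pq = (i % KPQ) % PQ;    // flattened spatial offset
//     out[k * NPQ + n * PQ + pq] = in[i];
//   }
// }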
.visible .entry reorg_knpq_d(
.param .u64 reorg_knpq_d_param_0,
.param .u64 reorg_knpq_d_param_1,
.param .u32 reorg_knpq_d_param_2,
.param .u32 reorg_knpq_d_param_3,
.param .u32 reorg_knpq_d_param_4,
.param .u32 reorg_knpq_d_param_5
)
{
.reg .pred %p<2>;
.reg .b32 %r<16>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reorg_knpq_d_param_0];
ld.param.u64 %rd2, [reorg_knpq_d_param_1];
ld.param.u32 %r5, [reorg_knpq_d_param_2];
ld.param.u32 %r2, [reorg_knpq_d_param_3];
ld.param.u32 %r3, [reorg_knpq_d_param_4];
ld.param.u32 %r4, [reorg_knpq_d_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB24_2;
cvta.to.global.u64 %rd3, %rd1;
rem.s32 %r9, %r1, %r2;
div.s32 %r10, %r9, %r4;
rem.s32 %r11, %r9, %r4;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
div.s32 %r12, %r1, %r2;
mul.lo.s32 %r13, %r12, %r4;
mad.lo.s32 %r14, %r10, %r3, %r13;
add.s32 %r15, %r14, %r11;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB24_2:
ret;
}
// .globl reorg_knpq_f
.visible .entry reorg_knpq_f(
.param .u64 reorg_knpq_f_param_0,
.param .u64 reorg_knpq_f_param_1,
.param .u32 reorg_knpq_f_param_2,
.param .u32 reorg_knpq_f_param_3,
.param .u32 reorg_knpq_f_param_4,
.param .u32 reorg_knpq_f_param_5
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<16>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reorg_knpq_f_param_0];
ld.param.u64 %rd2, [reorg_knpq_f_param_1];
ld.param.u32 %r5, [reorg_knpq_f_param_2];
ld.param.u32 %r2, [reorg_knpq_f_param_3];
ld.param.u32 %r3, [reorg_knpq_f_param_4];
ld.param.u32 %r4, [reorg_knpq_f_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB25_2;
cvta.to.global.u64 %rd3, %rd1;
rem.s32 %r9, %r1, %r2;
div.s32 %r10, %r9, %r4;
rem.s32 %r11, %r9, %r4;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
div.s32 %r12, %r1, %r2;
mul.lo.s32 %r13, %r12, %r4;
mad.lo.s32 %r14, %r10, %r3, %r13;
add.s32 %r15, %r14, %r11;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r15, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB25_2:
ret;
}
// .globl slice_sparse_dense_row_d
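// slice_sparse_dense_row_{d,f}: copy the block [rl..ru] x [cl..cu] of a CSR
// matrix into a dense output, one thread per source row. A minimal CUDA
// sketch consistent with this PTX (a reconstruction, not the original
// source; parameter names are assumptions):
//
// __global__ void slice_sparse_dense_row_d(double *vals, int *rowPtr,
//     int *colInd, double *ret, int rl, int ru, int cl, int cu, int retClen) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   int row = tid + rl;
//   if (row > ru) return;
//   for (int j = rowPtr[row]; j < rowPtr[row + 1]; j++) { // scan row's nnz
//     int col = colInd[j];
//     if (col >= cl && col <= cu)                         // keep sliced cols
//       ret[tid * retClen + (col - cl)] = vals[j];
//   }
// }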
.visible .entry slice_sparse_dense_row_d(
.param .u64 slice_sparse_dense_row_d_param_0,
.param .u64 slice_sparse_dense_row_d_param_1,
.param .u64 slice_sparse_dense_row_d_param_2,
.param .u64 slice_sparse_dense_row_d_param_3,
.param .u32 slice_sparse_dense_row_d_param_4,
.param .u32 slice_sparse_dense_row_d_param_5,
.param .u32 slice_sparse_dense_row_d_param_6,
.param .u32 slice_sparse_dense_row_d_param_7,
.param .u32 slice_sparse_dense_row_d_param_8
)
{
.reg .pred %p<7>;
.reg .b32 %r<25>;
.reg .f64 %fd<2>;
.reg .b64 %rd<23>;
ld.param.u64 %rd9, [slice_sparse_dense_row_d_param_0];
ld.param.u64 %rd10, [slice_sparse_dense_row_d_param_1];
ld.param.u64 %rd11, [slice_sparse_dense_row_d_param_2];
ld.param.u64 %rd12, [slice_sparse_dense_row_d_param_3];
ld.param.u32 %r15, [slice_sparse_dense_row_d_param_4];
ld.param.u32 %r16, [slice_sparse_dense_row_d_param_5];
ld.param.u32 %r12, [slice_sparse_dense_row_d_param_6];
ld.param.u32 %r13, [slice_sparse_dense_row_d_param_7];
ld.param.u32 %r14, [slice_sparse_dense_row_d_param_8];
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %ctaid.x;
mov.u32 %r19, %tid.x;
mad.lo.s32 %r1, %r17, %r18, %r19;
add.s32 %r2, %r1, %r15;
setp.gt.s32 %p1, %r2, %r16;
@%p1 bra BB26_6;
cvta.to.global.u64 %rd13, %rd10;
mul.wide.s32 %rd14, %r2, 4;
add.s64 %rd1, %rd13, %rd14;
ld.global.u32 %r23, [%rd1];
ld.global.u32 %r24, [%rd1+4];
setp.ge.s32 %p2, %r23, %r24;
@%p2 bra BB26_6;
cvta.to.global.u64 %rd2, %rd12;
cvta.to.global.u64 %rd15, %rd9;
cvta.to.global.u64 %rd16, %rd11;
mul.lo.s32 %r20, %r1, %r14;
sub.s32 %r5, %r20, %r12;
mul.wide.s32 %rd17, %r23, 8;
add.s64 %rd22, %rd15, %rd17;
mul.wide.s32 %rd18, %r23, 4;
add.s64 %rd21, %rd16, %rd18;
BB26_3:
ld.global.u32 %r8, [%rd21];
setp.lt.s32 %p3, %r8, %r12;
setp.gt.s32 %p4, %r8, %r13;
or.pred %p5, %p3, %p4;
@%p5 bra BB26_5;
ld.global.f64 %fd1, [%rd22];
add.s32 %r21, %r5, %r8;
mul.wide.s32 %rd19, %r21, 8;
add.s64 %rd20, %rd2, %rd19;
st.global.f64 [%rd20], %fd1;
ld.global.u32 %r24, [%rd1+4];
BB26_5:
add.s64 %rd22, %rd22, 8;
add.s64 %rd21, %rd21, 4;
add.s32 %r23, %r23, 1;
setp.lt.s32 %p6, %r23, %r24;
@%p6 bra BB26_3;
BB26_6:
ret;
}
// .globl slice_sparse_dense_row_f
.visible .entry slice_sparse_dense_row_f(
.param .u64 slice_sparse_dense_row_f_param_0,
.param .u64 slice_sparse_dense_row_f_param_1,
.param .u64 slice_sparse_dense_row_f_param_2,
.param .u64 slice_sparse_dense_row_f_param_3,
.param .u32 slice_sparse_dense_row_f_param_4,
.param .u32 slice_sparse_dense_row_f_param_5,
.param .u32 slice_sparse_dense_row_f_param_6,
.param .u32 slice_sparse_dense_row_f_param_7,
.param .u32 slice_sparse_dense_row_f_param_8
)
{
.reg .pred %p<7>;
.reg .f32 %f<2>;
.reg .b32 %r<25>;
.reg .b64 %rd<22>;
ld.param.u64 %rd9, [slice_sparse_dense_row_f_param_0];
ld.param.u64 %rd10, [slice_sparse_dense_row_f_param_1];
ld.param.u64 %rd11, [slice_sparse_dense_row_f_param_2];
ld.param.u64 %rd12, [slice_sparse_dense_row_f_param_3];
ld.param.u32 %r15, [slice_sparse_dense_row_f_param_4];
ld.param.u32 %r16, [slice_sparse_dense_row_f_param_5];
ld.param.u32 %r12, [slice_sparse_dense_row_f_param_6];
ld.param.u32 %r13, [slice_sparse_dense_row_f_param_7];
ld.param.u32 %r14, [slice_sparse_dense_row_f_param_8];
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %ctaid.x;
mov.u32 %r19, %tid.x;
mad.lo.s32 %r1, %r17, %r18, %r19;
add.s32 %r2, %r1, %r15;
setp.gt.s32 %p1, %r2, %r16;
@%p1 bra BB27_6;
cvta.to.global.u64 %rd13, %rd10;
mul.wide.s32 %rd14, %r2, 4;
add.s64 %rd1, %rd13, %rd14;
ld.global.u32 %r23, [%rd1];
ld.global.u32 %r24, [%rd1+4];
setp.ge.s32 %p2, %r23, %r24;
@%p2 bra BB27_6;
cvta.to.global.u64 %rd2, %rd12;
cvta.to.global.u64 %rd15, %rd9;
cvta.to.global.u64 %rd16, %rd11;
mul.lo.s32 %r20, %r1, %r14;
sub.s32 %r5, %r20, %r12;
mul.wide.s32 %rd17, %r23, 4;
add.s64 %rd21, %rd15, %rd17;
add.s64 %rd20, %rd16, %rd17;
BB27_3:
ld.global.u32 %r8, [%rd20];
setp.lt.s32 %p3, %r8, %r12;
setp.gt.s32 %p4, %r8, %r13;
or.pred %p5, %p3, %p4;
@%p5 bra BB27_5;
ld.global.f32 %f1, [%rd21];
add.s32 %r21, %r5, %r8;
mul.wide.s32 %rd18, %r21, 4;
add.s64 %rd19, %rd2, %rd18;
st.global.f32 [%rd19], %f1;
ld.global.u32 %r24, [%rd1+4];
BB27_5:
add.s64 %rd21, %rd21, 4;
add.s64 %rd20, %rd20, 4;
add.s32 %r23, %r23, 1;
setp.lt.s32 %p6, %r23, %r24;
@%p6 bra BB27_3;
BB27_6:
ret;
}
// .globl slice_sparse_dense_nnz_d
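// slice_sparse_dense_nnz_{d,f}: the same CSR-block slice as the _row
// kernels above, but parallelized over nonzeros instead of rows: thread t
// handles nonzero i = t + rowPtr[rl], exits if i >= rowPtr[ru+1] or its
// column falls outside [cl, cu], recovers the owning row by a linear scan
// of rowPtr starting at rl (loop BB28_3), and stores vals[i] at
// (row - rl, col - cl) of the dense output. Inferred from the address
// arithmetic; parameter roles are assumptions.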
.visible .entry slice_sparse_dense_nnz_d(
.param .u64 slice_sparse_dense_nnz_d_param_0,
.param .u64 slice_sparse_dense_nnz_d_param_1,
.param .u64 slice_sparse_dense_nnz_d_param_2,
.param .u64 slice_sparse_dense_nnz_d_param_3,
.param .u32 slice_sparse_dense_nnz_d_param_4,
.param .u32 slice_sparse_dense_nnz_d_param_5,
.param .u32 slice_sparse_dense_nnz_d_param_6,
.param .u32 slice_sparse_dense_nnz_d_param_7,
.param .u32 slice_sparse_dense_nnz_d_param_8
)
{
.reg .pred %p<6>;
.reg .b32 %r<22>;
.reg .f64 %fd<2>;
.reg .b64 %rd<21>;
ld.param.u64 %rd4, [slice_sparse_dense_nnz_d_param_0];
ld.param.u64 %rd7, [slice_sparse_dense_nnz_d_param_1];
ld.param.u64 %rd5, [slice_sparse_dense_nnz_d_param_2];
ld.param.u64 %rd6, [slice_sparse_dense_nnz_d_param_3];
ld.param.u32 %r5, [slice_sparse_dense_nnz_d_param_4];
ld.param.u32 %r9, [slice_sparse_dense_nnz_d_param_5];
ld.param.u32 %r6, [slice_sparse_dense_nnz_d_param_6];
ld.param.u32 %r7, [slice_sparse_dense_nnz_d_param_7];
ld.param.u32 %r8, [slice_sparse_dense_nnz_d_param_8];
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r13, %r10, %r11, %r12;
cvta.to.global.u64 %rd1, %rd7;
mul.wide.s32 %rd8, %r5, 4;
add.s64 %rd9, %rd1, %rd8;
ld.global.u32 %r14, [%rd9];
add.s32 %r1, %r13, %r14;
mul.wide.s32 %rd10, %r9, 4;
add.s64 %rd11, %rd1, %rd10;
ld.global.u32 %r15, [%rd11+4];
setp.ge.s32 %p1, %r1, %r15;
@%p1 bra BB28_5;
cvta.to.global.u64 %rd2, %rd6;
cvta.to.global.u64 %rd3, %rd4;
cvta.to.global.u64 %rd12, %rd5;
mul.wide.s32 %rd13, %r1, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.u32 %r2, [%rd14];
setp.lt.s32 %p2, %r2, %r6;
setp.gt.s32 %p3, %r2, %r7;
or.pred %p4, %p2, %p3;
@%p4 bra BB28_5;
mov.u32 %r21, %r5;
BB28_3:
mov.u32 %r3, %r21;
add.s32 %r21, %r3, 1;
mul.wide.s32 %rd15, %r21, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.u32 %r16, [%rd16];
setp.le.s32 %p5, %r16, %r1;
@%p5 bra BB28_3;
mul.wide.s32 %rd17, %r1, 8;
add.s64 %rd18, %rd3, %rd17;
ld.global.f64 %fd1, [%rd18];
sub.s32 %r17, %r3, %r5;
mul.lo.s32 %r18, %r17, %r8;
sub.s32 %r19, %r18, %r6;
add.s32 %r20, %r19, %r2;
mul.wide.s32 %rd19, %r20, 8;
add.s64 %rd20, %rd2, %rd19;
st.global.f64 [%rd20], %fd1;
BB28_5:
ret;
}
// .globl slice_sparse_dense_nnz_f
.visible .entry slice_sparse_dense_nnz_f(
.param .u64 slice_sparse_dense_nnz_f_param_0,
.param .u64 slice_sparse_dense_nnz_f_param_1,
.param .u64 slice_sparse_dense_nnz_f_param_2,
.param .u64 slice_sparse_dense_nnz_f_param_3,
.param .u32 slice_sparse_dense_nnz_f_param_4,
.param .u32 slice_sparse_dense_nnz_f_param_5,
.param .u32 slice_sparse_dense_nnz_f_param_6,
.param .u32 slice_sparse_dense_nnz_f_param_7,
.param .u32 slice_sparse_dense_nnz_f_param_8
)
{
.reg .pred %p<6>;
.reg .f32 %f<2>;
.reg .b32 %r<22>;
.reg .b64 %rd<21>;
ld.param.u64 %rd4, [slice_sparse_dense_nnz_f_param_0];
ld.param.u64 %rd7, [slice_sparse_dense_nnz_f_param_1];
ld.param.u64 %rd5, [slice_sparse_dense_nnz_f_param_2];
ld.param.u64 %rd6, [slice_sparse_dense_nnz_f_param_3];
ld.param.u32 %r5, [slice_sparse_dense_nnz_f_param_4];
ld.param.u32 %r9, [slice_sparse_dense_nnz_f_param_5];
ld.param.u32 %r6, [slice_sparse_dense_nnz_f_param_6];
ld.param.u32 %r7, [slice_sparse_dense_nnz_f_param_7];
ld.param.u32 %r8, [slice_sparse_dense_nnz_f_param_8];
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r13, %r10, %r11, %r12;
cvta.to.global.u64 %rd1, %rd7;
mul.wide.s32 %rd8, %r5, 4;
add.s64 %rd9, %rd1, %rd8;
ld.global.u32 %r14, [%rd9];
add.s32 %r1, %r13, %r14;
mul.wide.s32 %rd10, %r9, 4;
add.s64 %rd11, %rd1, %rd10;
ld.global.u32 %r15, [%rd11+4];
setp.ge.s32 %p1, %r1, %r15;
@%p1 bra BB29_5;
cvta.to.global.u64 %rd2, %rd6;
cvta.to.global.u64 %rd3, %rd4;
cvta.to.global.u64 %rd12, %rd5;
mul.wide.s32 %rd13, %r1, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.u32 %r2, [%rd14];
setp.lt.s32 %p2, %r2, %r6;
setp.gt.s32 %p3, %r2, %r7;
or.pred %p4, %p2, %p3;
@%p4 bra BB29_5;
mov.u32 %r21, %r5;
BB29_3:
mov.u32 %r3, %r21;
add.s32 %r21, %r3, 1;
mul.wide.s32 %rd15, %r21, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.u32 %r16, [%rd16];
setp.le.s32 %p5, %r16, %r1;
@%p5 bra BB29_3;
add.s64 %rd18, %rd3, %rd13;
ld.global.f32 %f1, [%rd18];
sub.s32 %r17, %r3, %r5;
mul.lo.s32 %r18, %r17, %r8;
sub.s32 %r19, %r18, %r6;
add.s32 %r20, %r19, %r2;
mul.wide.s32 %rd19, %r20, 4;
add.s64 %rd20, %rd2, %rd19;
st.global.f32 [%rd20], %f1;
BB29_5:
ret;
}
// .globl slice_dense_dense_d
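// slice_dense_dense_{d,f}: copy the dense block [rl..ru] x [cl..cu] into a
// dense output, one thread per output element. A minimal CUDA sketch
// consistent with this PTX (a reconstruction, not the original source;
// names are assumptions -- note params 3 and 5, presumably ru and cu, are
// unused in the body and elided by the compiler):
//
// __global__ void slice_dense_dense_d(double *in, double *ret, int rl,
//     int ru, int cl, int cu, int inClen, int retRlen, int retClen) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   int ix = tid / retClen, iy = tid % retClen;
//   if (ix < retRlen)
//     ret[tid] = in[(ix + rl) * inClen + (iy + cl)];
// }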
.visible .entry slice_dense_dense_d(
.param .u64 slice_dense_dense_d_param_0,
.param .u64 slice_dense_dense_d_param_1,
.param .u32 slice_dense_dense_d_param_2,
.param .u32 slice_dense_dense_d_param_3,
.param .u32 slice_dense_dense_d_param_4,
.param .u32 slice_dense_dense_d_param_5,
.param .u32 slice_dense_dense_d_param_6,
.param .u32 slice_dense_dense_d_param_7,
.param .u32 slice_dense_dense_d_param_8
)
{
.reg .pred %p<4>;
.reg .b32 %r<15>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [slice_dense_dense_d_param_0];
ld.param.u64 %rd2, [slice_dense_dense_d_param_1];
ld.param.u32 %r3, [slice_dense_dense_d_param_2];
ld.param.u32 %r4, [slice_dense_dense_d_param_4];
ld.param.u32 %r5, [slice_dense_dense_d_param_6];
ld.param.u32 %r7, [slice_dense_dense_d_param_7];
ld.param.u32 %r6, [slice_dense_dense_d_param_8];
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r9, %r8, %r10;
div.s32 %r2, %r1, %r6;
setp.ge.s32 %p1, %r2, %r7;
setp.lt.s32 %p2, %r6, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB30_2;
cvta.to.global.u64 %rd3, %rd1;
add.s32 %r11, %r2, %r3;
rem.s32 %r12, %r1, %r6;
add.s32 %r13, %r12, %r4;
mad.lo.s32 %r14, %r11, %r5, %r13;
mul.wide.s32 %rd4, %r14, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB30_2:
ret;
}
// .globl slice_dense_dense_f
.visible .entry slice_dense_dense_f(
.param .u64 slice_dense_dense_f_param_0,
.param .u64 slice_dense_dense_f_param_1,
.param .u32 slice_dense_dense_f_param_2,
.param .u32 slice_dense_dense_f_param_3,
.param .u32 slice_dense_dense_f_param_4,
.param .u32 slice_dense_dense_f_param_5,
.param .u32 slice_dense_dense_f_param_6,
.param .u32 slice_dense_dense_f_param_7,
.param .u32 slice_dense_dense_f_param_8
)
{
.reg .pred %p<4>;
.reg .f32 %f<2>;
.reg .b32 %r<15>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [slice_dense_dense_f_param_0];
ld.param.u64 %rd2, [slice_dense_dense_f_param_1];
ld.param.u32 %r3, [slice_dense_dense_f_param_2];
ld.param.u32 %r4, [slice_dense_dense_f_param_4];
ld.param.u32 %r5, [slice_dense_dense_f_param_6];
ld.param.u32 %r7, [slice_dense_dense_f_param_7];
ld.param.u32 %r6, [slice_dense_dense_f_param_8];
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r9, %r8, %r10;
div.s32 %r2, %r1, %r6;
setp.ge.s32 %p1, %r2, %r7;
setp.lt.s32 %p2, %r6, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB31_2;
cvta.to.global.u64 %rd3, %rd1;
add.s32 %r11, %r2, %r3;
rem.s32 %r12, %r1, %r6;
add.s32 %r13, %r12, %r4;
mad.lo.s32 %r14, %r11, %r5, %r13;
mul.wide.s32 %rd4, %r14, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB31_2:
ret;
}
// .globl copy_u2l_dense_d
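// copy_u2l_dense_{d,f}: mirror the strict upper triangle of a dense
// dim x dim matrix into its lower triangle in place. A minimal CUDA sketch
// consistent with this PTX (a reconstruction, not the original source;
// names are assumptions):
//
// __global__ void copy_u2l_dense_d(double *A, int dim, int N) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   int ix = tid / dim, iy = tid % dim;  // source row, column
//   int dest = iy * dim + ix;            // transposed position
//   if (iy > ix && dest < N)             // strictly above the diagonal
//     A[dest] = A[tid];
// }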
.visible .entry copy_u2l_dense_d(
.param .u64 copy_u2l_dense_d_param_0,
.param .u32 copy_u2l_dense_d_param_1,
.param .u32 copy_u2l_dense_d_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<10>;
.reg .f64 %fd<2>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [copy_u2l_dense_d_param_0];
ld.param.u32 %r3, [copy_u2l_dense_d_param_1];
ld.param.u32 %r4, [copy_u2l_dense_d_param_2];
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
div.s32 %r8, %r1, %r3;
rem.s32 %r9, %r1, %r3;
mad.lo.s32 %r2, %r9, %r3, %r8;
setp.le.s32 %p1, %r9, %r8;
setp.ge.s32 %p2, %r2, %r4;
or.pred %p3, %p1, %p2;
@%p3 bra BB32_2;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
ld.global.f64 %fd1, [%rd4];
mul.wide.s32 %rd5, %r2, 8;
add.s64 %rd6, %rd2, %rd5;
st.global.f64 [%rd6], %fd1;
BB32_2:
ret;
}
// .globl copy_u2l_dense_f
.visible .entry copy_u2l_dense_f(
.param .u64 copy_u2l_dense_f_param_0,
.param .u32 copy_u2l_dense_f_param_1,
.param .u32 copy_u2l_dense_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<2>;
.reg .b32 %r<10>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [copy_u2l_dense_f_param_0];
ld.param.u32 %r3, [copy_u2l_dense_f_param_1];
ld.param.u32 %r4, [copy_u2l_dense_f_param_2];
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
div.s32 %r8, %r1, %r3;
rem.s32 %r9, %r1, %r3;
mad.lo.s32 %r2, %r9, %r3, %r8;
setp.le.s32 %p1, %r9, %r8;
setp.ge.s32 %p2, %r2, %r4;
or.pred %p3, %p1, %p2;
@%p3 bra BB33_2;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 4;
add.s64 %rd4, %rd2, %rd3;
ld.global.f32 %f1, [%rd4];
mul.wide.s32 %rd5, %r2, 4;
add.s64 %rd6, %rd2, %rd5;
st.global.f32 [%rd6], %f1;
BB33_2:
ret;
}
// .globl relu_d
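// relu_{d,f}: clamp one element per thread. A minimal CUDA sketch
// consistent with this PTX (a reconstruction, not the original source;
// names are assumptions):
//
// __global__ void relu_d(double *A, double *ret, int rlen, int clen) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   if (tid / clen < rlen)
//     ret[tid] = fmax(0.0, A[tid]);
// }
//
// The _f variant below widens each float to double, takes the max against
// 0.0 in f64, and narrows the result (see cvt.f64.f32 / cvt.rn.f32.f64),
// consistent with a double literal in the source expression.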
.visible .entry relu_d(
.param .u64 relu_d_param_0,
.param .u64 relu_d_param_1,
.param .u32 relu_d_param_2,
.param .u32 relu_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<8>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [relu_d_param_0];
ld.param.u64 %rd2, [relu_d_param_1];
ld.param.u32 %r2, [relu_d_param_2];
ld.param.u32 %r3, [relu_d_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
div.s32 %r7, %r1, %r3;
setp.ge.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB34_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
mov.f64 %fd2, 0d0000000000000000;
max.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd3;
BB34_2:
ret;
}
// .globl relu_f
.visible .entry relu_f(
.param .u64 relu_f_param_0,
.param .u64 relu_f_param_1,
.param .u32 relu_f_param_2,
.param .u32 relu_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<3>;
.reg .b32 %r<8>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [relu_f_param_0];
ld.param.u64 %rd2, [relu_f_param_1];
ld.param.u32 %r2, [relu_f_param_2];
ld.param.u32 %r3, [relu_f_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
div.s32 %r7, %r1, %r3;
setp.ge.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB35_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.f64.f32 %fd1, %f1;
mov.f64 %fd2, 0d0000000000000000;
max.f64 %fd3, %fd2, %fd1;
cvt.rn.f32.f64 %f2, %fd3;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB35_2:
ret;
}
// .globl relu_backward_d
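// relu_backward_{d,f}: gate the incoming gradient by the sign of the
// forward input; the predicated branch below skips loading dout entirely
// when X[i] <= 0 (or is NaN, via setp.leu). A minimal CUDA sketch
// consistent with this PTX (a reconstruction, not the original source;
// names are assumptions):
//
// __global__ void relu_backward_d(double *X, double *dout, double *ret,
//                                 int rlen, int clen) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   if (tid / clen < rlen)
//     ret[tid] = (X[tid] > 0.0) ? dout[tid] : 0.0;
// }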
.visible .entry relu_backward_d(
.param .u64 relu_backward_d_param_0,
.param .u64 relu_backward_d_param_1,
.param .u64 relu_backward_d_param_2,
.param .u32 relu_backward_d_param_3,
.param .u32 relu_backward_d_param_4
)
{
.reg .pred %p<5>;
.reg .b32 %r<8>;
.reg .f64 %fd<6>;
.reg .b64 %rd<13>;
ld.param.u64 %rd1, [relu_backward_d_param_0];
ld.param.u64 %rd2, [relu_backward_d_param_1];
ld.param.u64 %rd3, [relu_backward_d_param_2];
ld.param.u32 %r2, [relu_backward_d_param_3];
ld.param.u32 %r3, [relu_backward_d_param_4];
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r4, %r5, %r6;
div.s32 %r7, %r1, %r3;
setp.ge.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB36_4;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd4, [%rd6];
mov.f64 %fd5, 0d0000000000000000;
setp.leu.f64 %p4, %fd4, 0d0000000000000000;
@%p4 bra BB36_3;
cvta.to.global.u64 %rd7, %rd2;
add.s64 %rd9, %rd7, %rd5;
ld.global.f64 %fd5, [%rd9];
BB36_3:
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd12, %rd10, %rd5;
st.global.f64 [%rd12], %fd5;
BB36_4:
ret;
}
// .globl relu_backward_f
.visible .entry relu_backward_f(
.param .u64 relu_backward_f_param_0,
.param .u64 relu_backward_f_param_1,
.param .u64 relu_backward_f_param_2,
.param .u32 relu_backward_f_param_3,
.param .u32 relu_backward_f_param_4
)
{
.reg .pred %p<5>;
.reg .f32 %f<6>;
.reg .b32 %r<8>;
.reg .b64 %rd<13>;
ld.param.u64 %rd1, [relu_backward_f_param_0];
ld.param.u64 %rd2, [relu_backward_f_param_1];
ld.param.u64 %rd3, [relu_backward_f_param_2];
ld.param.u32 %r2, [relu_backward_f_param_3];
ld.param.u32 %r3, [relu_backward_f_param_4];
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r4, %r5, %r6;
div.s32 %r7, %r1, %r3;
setp.ge.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB37_4;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f4, [%rd6];
mov.f32 %f5, 0f00000000;
setp.leu.f32 %p4, %f4, 0f00000000;
@%p4 bra BB37_3;
cvta.to.global.u64 %rd7, %rd2;
add.s64 %rd9, %rd7, %rd5;
ld.global.f32 %f5, [%rd9];
BB37_3:
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd12, %rd10, %rd5;
st.global.f32 [%rd12], %f5;
BB37_4:
ret;
}
// .globl inplace_add_d
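// inplace_add_{d,f}: accumulate one matrix into another, one element per
// thread; the second pointer is both read and written. A minimal CUDA
// sketch consistent with this PTX (a reconstruction, not the original
// source; names are assumptions):
//
// __global__ void inplace_add_d(double *in, double *out, int rlen, int clen) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   if (tid / clen < rlen)
//     out[tid] += in[tid];
// }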
.visible .entry inplace_add_d(
.param .u64 inplace_add_d_param_0,
.param .u64 inplace_add_d_param_1,
.param .u32 inplace_add_d_param_2,
.param .u32 inplace_add_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<8>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [inplace_add_d_param_0];
ld.param.u64 %rd2, [inplace_add_d_param_1];
ld.param.u32 %r2, [inplace_add_d_param_2];
ld.param.u32 %r3, [inplace_add_d_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
div.s32 %r7, %r1, %r3;
setp.ge.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB38_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
ld.global.f64 %fd1, [%rd7];
ld.global.f64 %fd2, [%rd5];
add.f64 %fd3, %fd2, %fd1;
st.global.f64 [%rd7], %fd3;
BB38_2:
ret;
}
// .globl inplace_add_f
.visible .entry inplace_add_f(
.param .u64 inplace_add_f_param_0,
.param .u64 inplace_add_f_param_1,
.param .u32 inplace_add_f_param_2,
.param .u32 inplace_add_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<4>;
.reg .b32 %r<8>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [inplace_add_f_param_0];
ld.param.u64 %rd2, [inplace_add_f_param_1];
ld.param.u32 %r2, [inplace_add_f_param_2];
ld.param.u32 %r3, [inplace_add_f_param_3];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
div.s32 %r7, %r1, %r3;
setp.ge.s32 %p1, %r7, %r2;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB39_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
ld.global.f32 %f1, [%rd7];
ld.global.f32 %f2, [%rd5];
add.f32 %f3, %f2, %f1;
st.global.f32 [%rd7], %f3;
BB39_2:
ret;
}
// .globl bias_add_d
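// bias_add_{d,f}: add a per-channel bias to a row-major (rlen x clen)
// matrix; the bias index iy / PQ suggests each row packs channels of PQ
// spatial positions, as in a conv2d output. A minimal CUDA sketch
// consistent with this PTX (a reconstruction, not the original source;
// names, and the PQ interpretation, are assumptions):
//
// __global__ void bias_add_d(double *input, double *bias, double *ret,
//                            int rlen, int clen, int PQ) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   int ix = tid / clen, iy = tid % clen;
//   if (ix < rlen)
//     ret[tid] = input[tid] + bias[iy / PQ];
// }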
.visible .entry bias_add_d(
.param .u64 bias_add_d_param_0,
.param .u64 bias_add_d_param_1,
.param .u64 bias_add_d_param_2,
.param .u32 bias_add_d_param_3,
.param .u32 bias_add_d_param_4,
.param .u32 bias_add_d_param_5
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_add_d_param_0];
ld.param.u64 %rd2, [bias_add_d_param_1];
ld.param.u64 %rd3, [bias_add_d_param_2];
ld.param.u32 %r4, [bias_add_d_param_3];
ld.param.u32 %r2, [bias_add_d_param_4];
ld.param.u32 %r3, [bias_add_d_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
div.s32 %r8, %r1, %r2;
setp.ge.s32 %p1, %r8, %r4;
setp.lt.s32 %p2, %r2, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB40_2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
rem.s32 %r9, %r1, %r2;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
add.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
BB40_2:
ret;
}
// .globl bias_add_f
.visible .entry bias_add_f(
.param .u64 bias_add_f_param_0,
.param .u64 bias_add_f_param_1,
.param .u64 bias_add_f_param_2,
.param .u32 bias_add_f_param_3,
.param .u32 bias_add_f_param_4,
.param .u32 bias_add_f_param_5
)
{
.reg .pred %p<4>;
.reg .f32 %f<4>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_add_f_param_0];
ld.param.u64 %rd2, [bias_add_f_param_1];
ld.param.u64 %rd3, [bias_add_f_param_2];
ld.param.u32 %r4, [bias_add_f_param_3];
ld.param.u32 %r2, [bias_add_f_param_4];
ld.param.u32 %r3, [bias_add_f_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
div.s32 %r8, %r1, %r2;
setp.ge.s32 %p1, %r8, %r4;
setp.lt.s32 %p2, %r2, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB41_2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
rem.s32 %r9, %r1, %r2;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f1, [%rd9];
ld.global.f32 %f2, [%rd6];
add.f32 %f3, %f2, %f1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f32 [%rd11], %f3;
BB41_2:
ret;
}
// .globl daxpy_matrix_vector_d
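// daxpy_matrix_vector_{d,f}: compute ret = A + alpha * x with the vector x
// broadcast across the matrix; when param 6 == 1 (x appears to be a
// 1 x clen row vector) each element uses x[col], otherwise x[row]. Note
// that the _f variant still takes alpha as f64 and performs the fma in
// double before narrowing. A minimal CUDA sketch consistent with this PTX
// (a reconstruction, not the original source; names are assumptions):
//
// __global__ void daxpy_matrix_vector_d(double *A, double *x, double alpha,
//     double *ret, int rlenA, int clenA, int rlenx, int clenx) {
//   int tid = blockIdx.x * blockDim.x + threadIdx.x;
//   int ix = tid / clenA, iy = tid % clenA;
//   if (ix < rlenA)
//     ret[tid] = A[tid] + alpha * ((rlenx == 1) ? x[iy] : x[ix]);
// }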
.visible .entry daxpy_matrix_vector_d(
.param .u64 daxpy_matrix_vector_d_param_0,
.param .u64 daxpy_matrix_vector_d_param_1,
.param .f64 daxpy_matrix_vector_d_param_2,
.param .u64 daxpy_matrix_vector_d_param_3,
.param .u32 daxpy_matrix_vector_d_param_4,
.param .u32 daxpy_matrix_vector_d_param_5,
.param .u32 daxpy_matrix_vector_d_param_6,
.param .u32 daxpy_matrix_vector_d_param_7
)
{
.reg .pred %p<5>;
.reg .b32 %r<11>;
.reg .f64 %fd<7>;
.reg .b64 %rd<14>;
ld.param.u64 %rd3, [daxpy_matrix_vector_d_param_0];
ld.param.u64 %rd5, [daxpy_matrix_vector_d_param_1];
ld.param.f64 %fd2, [daxpy_matrix_vector_d_param_2];
ld.param.u64 %rd4, [daxpy_matrix_vector_d_param_3];
ld.param.u32 %r5, [daxpy_matrix_vector_d_param_4];
ld.param.u32 %r3, [daxpy_matrix_vector_d_param_5];
ld.param.u32 %r4, [daxpy_matrix_vector_d_param_6];
cvta.to.global.u64 %rd1, %rd5;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r9, %r6, %r7, %r8;
div.s32 %r1, %r9, %r3;
rem.s32 %r2, %r9, %r3;
setp.ge.s32 %p1, %r1, %r5;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB42_4;
cvta.to.global.u64 %rd6, %rd4;
mad.lo.s32 %r10, %r1, %r3, %r2;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.s32 %rd8, %r10, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
add.s64 %rd2, %rd6, %rd8;
setp.eq.s32 %p4, %r4, 1;
@%p4 bra BB42_3;
bra.uni BB42_2;
BB42_3:
mul.wide.s32 %rd12, %r2, 8;
add.s64 %rd13, %rd1, %rd12;
ld.global.f64 %fd5, [%rd13];
fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
st.global.f64 [%rd2], %fd6;
bra.uni BB42_4;
BB42_2:
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd1, %rd10;
ld.global.f64 %fd3, [%rd11];
fma.rn.f64 %fd4, %fd3, %fd2, %fd1;
st.global.f64 [%rd2], %fd4;
BB42_4:
ret;
}
// .globl daxpy_matrix_vector_f
.visible .entry daxpy_matrix_vector_f(
.param .u64 daxpy_matrix_vector_f_param_0,
.param .u64 daxpy_matrix_vector_f_param_1,
.param .f64 daxpy_matrix_vector_f_param_2,
.param .u64 daxpy_matrix_vector_f_param_3,
.param .u32 daxpy_matrix_vector_f_param_4,
.param .u32 daxpy_matrix_vector_f_param_5,
.param .u32 daxpy_matrix_vector_f_param_6,
.param .u32 daxpy_matrix_vector_f_param_7
)
{
.reg .pred %p<5>;
.reg .f32 %f<6>;
.reg .b32 %r<11>;
.reg .f64 %fd<7>;
.reg .b64 %rd<14>;
ld.param.u64 %rd3, [daxpy_matrix_vector_f_param_0];
ld.param.u64 %rd5, [daxpy_matrix_vector_f_param_1];
ld.param.f64 %fd2, [daxpy_matrix_vector_f_param_2];
ld.param.u64 %rd4, [daxpy_matrix_vector_f_param_3];
ld.param.u32 %r5, [daxpy_matrix_vector_f_param_4];
ld.param.u32 %r3, [daxpy_matrix_vector_f_param_5];
ld.param.u32 %r4, [daxpy_matrix_vector_f_param_6];
cvta.to.global.u64 %rd1, %rd5;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r9, %r6, %r7, %r8;
div.s32 %r1, %r9, %r3;
rem.s32 %r2, %r9, %r3;
setp.ge.s32 %p1, %r1, %r5;
setp.lt.s32 %p2, %r3, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB43_4;
cvta.to.global.u64 %rd6, %rd4;
mad.lo.s32 %r10, %r1, %r3, %r2;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.s32 %rd8, %r10, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f1, [%rd9];
cvt.f64.f32 %fd1, %f1;
add.s64 %rd2, %rd6, %rd8;
setp.eq.s32 %p4, %r4, 1;
@%p4 bra BB43_3;
bra.uni BB43_2;
BB43_3:
mul.wide.s32 %rd12, %r2, 4;
add.s64 %rd13, %rd1, %rd12;
ld.global.f32 %f4, [%rd13];
cvt.f64.f32 %fd5, %f4;
fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
cvt.rn.f32.f64 %f5, %fd6;
st.global.f32 [%rd2], %f5;
bra.uni BB43_4;
BB43_2:
mul.wide.s32 %rd10, %r1, 4;
add.s64 %rd11, %rd1, %rd10;
ld.global.f32 %f2, [%rd11];
cvt.f64.f32 %fd3, %f2;
fma.rn.f64 %fd4, %fd3, %fd2, %fd1;
cvt.rn.f32.f64 %f3, %fd4;
st.global.f32 [%rd2], %f3;
BB43_4:
ret;
}
// .globl bias_multiply_d
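// bias_multiply_{d,f}: identical indexing to bias_add_{d,f} above, with
// the bias multiplied instead of added:
// ret[tid] = input[tid] * bias[(tid % clen) / PQ]
// (same caveats on parameter names as the bias_add sketch).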
.visible .entry bias_multiply_d(
.param .u64 bias_multiply_d_param_0,
.param .u64 bias_multiply_d_param_1,
.param .u64 bias_multiply_d_param_2,
.param .u32 bias_multiply_d_param_3,
.param .u32 bias_multiply_d_param_4,
.param .u32 bias_multiply_d_param_5
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_multiply_d_param_0];
ld.param.u64 %rd2, [bias_multiply_d_param_1];
ld.param.u64 %rd3, [bias_multiply_d_param_2];
ld.param.u32 %r4, [bias_multiply_d_param_3];
ld.param.u32 %r2, [bias_multiply_d_param_4];
ld.param.u32 %r3, [bias_multiply_d_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
div.s32 %r8, %r1, %r2;
setp.ge.s32 %p1, %r8, %r4;
setp.lt.s32 %p2, %r2, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB44_2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
rem.s32 %r9, %r1, %r2;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
mul.f64 %fd3, %fd2, %fd1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f64 [%rd11], %fd3;
BB44_2:
ret;
}
// .globl bias_multiply_f
.visible .entry bias_multiply_f(
.param .u64 bias_multiply_f_param_0,
.param .u64 bias_multiply_f_param_1,
.param .u64 bias_multiply_f_param_2,
.param .u32 bias_multiply_f_param_3,
.param .u32 bias_multiply_f_param_4,
.param .u32 bias_multiply_f_param_5
)
{
.reg .pred %p<4>;
.reg .f32 %f<4>;
.reg .b32 %r<11>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [bias_multiply_f_param_0];
ld.param.u64 %rd2, [bias_multiply_f_param_1];
ld.param.u64 %rd3, [bias_multiply_f_param_2];
ld.param.u32 %r4, [bias_multiply_f_param_3];
ld.param.u32 %r2, [bias_multiply_f_param_4];
ld.param.u32 %r3, [bias_multiply_f_param_5];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
div.s32 %r8, %r1, %r2;
setp.ge.s32 %p1, %r8, %r4;
setp.lt.s32 %p2, %r2, 0;
or.pred %p3, %p1, %p2;
@%p3 bra BB45_2;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
rem.s32 %r9, %r1, %r2;
div.s32 %r10, %r9, %r3;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r10, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f1, [%rd9];
ld.global.f32 %f2, [%rd6];
mul.f32 %f3, %f2, %f1;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd5;
st.global.f32 [%rd11], %f3;
BB45_2:
ret;
}
// .globl matrix_matrix_cellwise_op_d
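// matrix_matrix_cellwise_op_{d,f}: apply one of ~19 binary operators
// elementwise between two operands with optional vector broadcasting.
// Params 5/6 appear to mark each operand as a full matrix (any other
// value: index by tid), a column vector (value 1: index by row), or a row
// vector (value 2: index by column). Param 7 is the opcode, dispatched by
// the branch tree below; from the selected instructions the mapping
// appears to be: 0 add, 1 sub, 2 mul, 3 div, 4 pow, 5 <, 6 <=, 7 >, 8 >=,
// 9 ==, 10 !=, 11 min, 12 max, 13 and, 14 or, 15 1-a*b, 16 a!=0?a-b:0,
// 17 mod, 18 floored integer division. Comparison results are encoded as
// 1.0/0.0, and unrecognized opcodes fall through with the DBL_MAX default
// loaded before the dispatch. The trailing bar.sync 0 is consistent with a
// __syncthreads() in the source.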
.visible .entry matrix_matrix_cellwise_op_d(
.param .u64 matrix_matrix_cellwise_op_d_param_0,
.param .u64 matrix_matrix_cellwise_op_d_param_1,
.param .u64 matrix_matrix_cellwise_op_d_param_2,
.param .u32 matrix_matrix_cellwise_op_d_param_3,
.param .u32 matrix_matrix_cellwise_op_d_param_4,
.param .u32 matrix_matrix_cellwise_op_d_param_5,
.param .u32 matrix_matrix_cellwise_op_d_param_6,
.param .u32 matrix_matrix_cellwise_op_d_param_7
)
{
.reg .pred %p<76>;
.reg .b32 %r<61>;
.reg .f64 %fd<51>;
.reg .b64 %rd<19>;
ld.param.u64 %rd2, [matrix_matrix_cellwise_op_d_param_0];
ld.param.u64 %rd3, [matrix_matrix_cellwise_op_d_param_1];
ld.param.u64 %rd4, [matrix_matrix_cellwise_op_d_param_2];
ld.param.u32 %r14, [matrix_matrix_cellwise_op_d_param_3];
ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_4];
ld.param.u32 %r11, [matrix_matrix_cellwise_op_d_param_5];
ld.param.u32 %r12, [matrix_matrix_cellwise_op_d_param_6];
ld.param.u32 %r13, [matrix_matrix_cellwise_op_d_param_7];
mov.u32 %r15, %ntid.x;
mov.u32 %r16, %ctaid.x;
mov.u32 %r17, %tid.x;
mad.lo.s32 %r18, %r15, %r16, %r17;
div.s32 %r60, %r18, %r10;
rem.s32 %r2, %r18, %r10;
setp.ge.s32 %p2, %r60, %r14;
setp.lt.s32 %p3, %r10, 0;
or.pred %p4, %p2, %p3;
@%p4 bra BB46_77;
mad.lo.s32 %r3, %r60, %r10, %r2;
setp.eq.s32 %p5, %r11, 1;
mov.u32 %r58, %r60;
@%p5 bra BB46_4;
setp.ne.s32 %p6, %r11, 2;
mov.u32 %r58, %r3;
@%p6 bra BB46_4;
mov.u32 %r58, %r2;
BB46_4:
setp.eq.s32 %p7, %r12, 1;
@%p7 bra BB46_7;
setp.ne.s32 %p8, %r12, 2;
mov.u32 %r60, %r3;
@%p8 bra BB46_7;
mov.u32 %r60, %r2;
BB46_7:
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r58, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd1, [%rd8];
mul.wide.s32 %rd9, %r60, 8;
add.s64 %rd10, %rd5, %rd9;
ld.global.f64 %fd2, [%rd10];
mov.f64 %fd50, 0d7FEFFFFFFFFFFFFF;
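// default result (DBL_MAX) if no opcode case below overwrites %fd50 (inferred)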
setp.gt.s32 %p9, %r13, 8;
@%p9 bra BB46_24;
setp.gt.s32 %p23, %r13, 3;
@%p23 bra BB46_16;
setp.gt.s32 %p30, %r13, 1;
@%p30 bra BB46_13;
setp.eq.s32 %p33, %r13, 0;
@%p33 bra BB46_75;
bra.uni BB46_11;
BB46_75:
add.f64 %fd50, %fd1, %fd2;
bra.uni BB46_76;
BB46_24:
setp.gt.s32 %p10, %r13, 13;
@%p10 bra BB46_33;
setp.gt.s32 %p17, %r13, 10;
@%p17 bra BB46_29;
setp.eq.s32 %p21, %r13, 9;
@%p21 bra BB46_53;
bra.uni BB46_27;
BB46_53:
setp.eq.f64 %p48, %fd1, %fd2;
selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p48;
bra.uni BB46_76;
BB46_16:
setp.gt.s32 %p24, %r13, 5;
@%p24 bra BB46_20;
setp.eq.s32 %p28, %r13, 4;
@%p28 bra BB46_56;
bra.uni BB46_18;
BB46_56:
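// opcode 4 (power): extracts the exponent bits, calls
// __internal_accurate_pow on |a| and b, then patches sign and special
// cases (zero, +/-inf, negative base) in the blocks below (inferred)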
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd2;
}
bfe.u32 %r31, %r9, 20, 11;
add.s32 %r32, %r31, -1012;
mov.b64 %rd15, %fd2;
shl.b64 %rd1, %rd15, %r32;
setp.ne.s64 %p53, %rd1, -9223372036854775808;
setp.eq.s64 %p54, %rd1, -9223372036854775808;
abs.f64 %fd19, %fd1;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.f64 [param0+0], %fd19;
.param .b64 param1;
st.param.f64 [param1+0], %fd2;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd25, [retval0+0];
//{
}// Callseq End 0
setp.gt.s32 %p55, %r8, -1;
setp.lt.s32 %p56, %r8, 0;
and.pred %p1, %p56, %p54;
or.pred %p57, %p55, %p53;
@%p57 bra BB46_58;
{
.reg .b32 %temp;
mov.b64 {%temp, %r33}, %fd25;
}
xor.b32 %r34, %r33, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r35, %temp}, %fd25;
}
mov.b64 %fd25, {%r35, %r34};
BB46_58:
setp.eq.f64 %p58, %fd1, 0d0000000000000000;
@%p58 bra BB46_61;
bra.uni BB46_59;
BB46_61:
selp.b32 %r36, %r8, 0, %p54;
mov.u32 %r37, 0;
or.b32 %r38, %r36, 2146435072;
setp.lt.s32 %p62, %r9, 0;
selp.b32 %r39, %r38, %r36, %p62;
mov.b64 %fd25, {%r37, %r39};
bra.uni BB46_62;
BB46_33:
setp.gt.s32 %p11, %r13, 15;
@%p11 bra BB46_37;
setp.eq.s32 %p15, %r13, 14;
@%p15 bra BB46_50;
bra.uni BB46_35;
BB46_50:
cvt.rni.s64.f64 %rd11, %fd1;
cvt.u32.u64 %r25, %rd11;
cvt.rni.s64.f64 %rd12, %fd2;
cvt.u32.u64 %r26, %rd12;
or.b32 %r27, %r26, %r25;
setp.eq.s32 %p45, %r27, 0;
selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p45;
bra.uni BB46_76;
BB46_13:
setp.eq.s32 %p31, %r13, 2;
@%p31 bra BB46_74;
bra.uni BB46_14;
BB46_74:
mul.f64 %fd50, %fd1, %fd2;
bra.uni BB46_76;
BB46_29:
setp.eq.s32 %p18, %r13, 11;
@%p18 bra BB46_52;
setp.eq.s32 %p19, %r13, 12;
@%p19 bra BB46_51;
bra.uni BB46_31;
BB46_51:
max.f64 %fd50, %fd1, %fd2;
bra.uni BB46_76;
BB46_20:
setp.eq.s32 %p25, %r13, 6;
@%p25 bra BB46_55;
setp.eq.s32 %p26, %r13, 7;
@%p26 bra BB46_54;
bra.uni BB46_22;
BB46_54:
setp.gt.f64 %p50, %fd1, %fd2;
selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p50;
bra.uni BB46_76;
BB46_37:
setp.eq.s32 %p12, %r13, 16;
@%p12 bra BB46_49;
setp.eq.s32 %p13, %r13, 17;
@%p13 bra BB46_44;
bra.uni BB46_39;
BB46_44:
setp.eq.f64 %p38, %fd2, 0d0000000000000000;
setp.eq.f64 %p39, %fd2, 0d8000000000000000;
or.pred %p40, %p38, %p39;
mov.f64 %fd50, 0d7FF8000000000000;
@%p40 bra BB46_76;
div.rn.f64 %fd50, %fd1, %fd2;
abs.f64 %fd39, %fd50;
setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000;
@%p41 bra BB46_76;
{
.reg .b32 %temp;
mov.b64 {%temp, %r22}, %fd50;
}
and.b32 %r23, %r22, 2147483647;
setp.ne.s32 %p42, %r23, 2146435072;
@%p42 bra BB46_48;
{
.reg .b32 %temp;
mov.b64 {%r24, %temp}, %fd50;
}
setp.eq.s32 %p43, %r24, 0;
@%p43 bra BB46_76;
BB46_48:
cvt.rmi.f64.f64 %fd40, %fd50;
mul.f64 %fd41, %fd2, %fd40;
sub.f64 %fd50, %fd1, %fd41;
bra.uni BB46_76;
BB46_11:
setp.eq.s32 %p34, %r13, 1;
@%p34 bra BB46_12;
bra.uni BB46_76;
BB46_12:
sub.f64 %fd50, %fd1, %fd2;
bra.uni BB46_76;
BB46_27:
setp.eq.s32 %p22, %r13, 10;
@%p22 bra BB46_28;
bra.uni BB46_76;
BB46_28:
setp.neu.f64 %p47, %fd1, %fd2;
selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p47;
bra.uni BB46_76;
BB46_18:
setp.eq.s32 %p29, %r13, 5;
@%p29 bra BB46_19;
bra.uni BB46_76;
BB46_19:
setp.lt.f64 %p52, %fd1, %fd2;
selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p52;
bra.uni BB46_76;
BB46_35:
setp.eq.s32 %p16, %r13, 15;
@%p16 bra BB46_36;
bra.uni BB46_76;
BB46_36:
mul.f64 %fd43, %fd1, %fd2;
mov.f64 %fd44, 0d3FF0000000000000;
sub.f64 %fd50, %fd44, %fd43;
bra.uni BB46_76;
BB46_14:
setp.eq.s32 %p32, %r13, 3;
@%p32 bra BB46_15;
bra.uni BB46_76;
BB46_15:
div.rn.f64 %fd50, %fd1, %fd2;
bra.uni BB46_76;
BB46_52:
min.f64 %fd50, %fd1, %fd2;
bra.uni BB46_76;
BB46_31:
setp.eq.s32 %p20, %r13, 13;
@%p20 bra BB46_32;
bra.uni BB46_76;
BB46_32:
cvt.rni.s64.f64 %rd13, %fd1;
cvt.u32.u64 %r28, %rd13;
cvt.rni.s64.f64 %rd14, %fd2;
cvt.u32.u64 %r29, %rd14;
and.b32 %r30, %r29, %r28;
setp.eq.s32 %p46, %r30, 0;
selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p46;
bra.uni BB46_76;
BB46_55:
setp.gtu.f64 %p51, %fd1, %fd2;
selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p51;
bra.uni BB46_76;
BB46_22:
setp.eq.s32 %p27, %r13, 8;
@%p27 bra BB46_23;
bra.uni BB46_76;
BB46_23:
setp.ltu.f64 %p49, %fd1, %fd2;
selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p49;
bra.uni BB46_76;
BB46_49:
setp.neu.f64 %p44, %fd1, 0d0000000000000000;
sub.f64 %fd42, %fd1, %fd2;
selp.f64 %fd50, %fd42, 0d0000000000000000, %p44;
bra.uni BB46_76;
BB46_39:
setp.ne.s32 %p14, %r13, 18;
@%p14 bra BB46_76;
div.rn.f64 %fd50, %fd1, %fd2;
abs.f64 %fd37, %fd50;
setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000;
@%p35 bra BB46_76;
{
.reg .b32 %temp;
mov.b64 {%temp, %r19}, %fd50;
}
and.b32 %r20, %r19, 2147483647;
setp.ne.s32 %p36, %r20, 2146435072;
@%p36 bra BB46_43;
{
.reg .b32 %temp;
mov.b64 {%r21, %temp}, %fd50;
}
setp.eq.s32 %p37, %r21, 0;
@%p37 bra BB46_76;
BB46_43:
cvt.rmi.f64.f64 %fd50, %fd50;
bra.uni BB46_76;
BB46_59:
@%p55 bra BB46_62;
cvt.rzi.f64.f64 %fd45, %fd2;
setp.neu.f64 %p60, %fd45, %fd2;
selp.f64 %fd25, 0dFFF8000000000000, %fd25, %p60;
BB46_62:
add.f64 %fd49, %fd1, %fd2;
{
.reg .b32 %temp;
mov.b64 {%temp, %r40}, %fd49;
}
and.b32 %r41, %r40, 2146435072;
setp.ne.s32 %p63, %r41, 2146435072;
@%p63 bra BB46_63;
setp.gtu.f64 %p64, %fd19, 0d7FF0000000000000;
@%p64 bra BB46_73;
abs.f64 %fd46, %fd2;
setp.gtu.f64 %p65, %fd46, 0d7FF0000000000000;
@%p65 bra BB46_73;
and.b32 %r42, %r9, 2147483647;
setp.ne.s32 %p66, %r42, 2146435072;
@%p66 bra BB46_68;
{
.reg .b32 %temp;
mov.b64 {%r43, %temp}, %fd2;
}
setp.eq.s32 %p67, %r43, 0;
@%p67 bra BB46_72;
BB46_68:
and.b32 %r44, %r8, 2147483647;
setp.ne.s32 %p68, %r44, 2146435072;
@%p68 bra BB46_69;
{
.reg .b32 %temp;
mov.b64 {%r45, %temp}, %fd1;
}
setp.ne.s32 %p69, %r45, 0;
mov.f64 %fd49, %fd25;
@%p69 bra BB46_73;
shr.s32 %r46, %r9, 31;
and.b32 %r47, %r46, -2146435072;
add.s32 %r48, %r47, 2146435072;
or.b32 %r49, %r48, -2147483648;
selp.b32 %r50, %r49, %r48, %p1;
mov.u32 %r51, 0;
mov.b64 %fd49, {%r51, %r50};
bra.uni BB46_73;
BB46_63:
mov.f64 %fd49, %fd25;
BB46_73:
setp.eq.f64 %p73, %fd2, 0d0000000000000000;
setp.eq.f64 %p74, %fd1, 0d3FF0000000000000;
or.pred %p75, %p74, %p73;
selp.f64 %fd50, 0d3FF0000000000000, %fd49, %p75;
BB46_76:
cvta.to.global.u64 %rd16, %rd4;
mul.wide.s32 %rd17, %r3, 8;
add.s64 %rd18, %rd16, %rd17;
st.global.f64 [%rd18], %fd50;
bar.sync 0;
BB46_77:
ret;
BB46_69:
mov.f64 %fd49, %fd25;
bra.uni BB46_73;
BB46_72:
setp.gt.f64 %p70, %fd19, 0d3FF0000000000000;
selp.b32 %r52, 2146435072, 0, %p70;
mov.u32 %r53, 0;
xor.b32 %r54, %r52, 2146435072;
setp.lt.s32 %p71, %r9, 0;
selp.b32 %r55, %r54, %r52, %p71;
setp.eq.f64 %p72, %fd1, 0dBFF0000000000000;
selp.b32 %r56, 1072693248, %r55, %p72;
mov.b64 %fd49, {%r53, %r56};
bra.uni BB46_73;
}
// .globl matrix_matrix_cellwise_op_f
.visible .entry matrix_matrix_cellwise_op_f(
.param .u64 matrix_matrix_cellwise_op_f_param_0,
.param .u64 matrix_matrix_cellwise_op_f_param_1,
.param .u64 matrix_matrix_cellwise_op_f_param_2,
.param .u32 matrix_matrix_cellwise_op_f_param_3,
.param .u32 matrix_matrix_cellwise_op_f_param_4,
.param .u32 matrix_matrix_cellwise_op_f_param_5,
.param .u32 matrix_matrix_cellwise_op_f_param_6,
.param .u32 matrix_matrix_cellwise_op_f_param_7
)
{
.reg .pred %p<76>;
.reg .f32 %f<135>;
.reg .b32 %r<46>;
.reg .b64 %rd<17>;
ld.param.u64 %rd1, [matrix_matrix_cellwise_op_f_param_0];
ld.param.u64 %rd2, [matrix_matrix_cellwise_op_f_param_1];
ld.param.u64 %rd3, [matrix_matrix_cellwise_op_f_param_2];
ld.param.u32 %r12, [matrix_matrix_cellwise_op_f_param_3];
ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_4];
ld.param.u32 %r9, [matrix_matrix_cellwise_op_f_param_5];
ld.param.u32 %r10, [matrix_matrix_cellwise_op_f_param_6];
ld.param.u32 %r11, [matrix_matrix_cellwise_op_f_param_7];
mov.u32 %r13, %ntid.x;
mov.u32 %r14, %ctaid.x;
mov.u32 %r15, %tid.x;
mad.lo.s32 %r16, %r13, %r14, %r15;
div.s32 %r45, %r16, %r8;
rem.s32 %r2, %r16, %r8;
setp.ge.s32 %p2, %r45, %r12;
setp.lt.s32 %p3, %r8, 0;
or.pred %p4, %p2, %p3;
@%p4 bra BB47_69;
mad.lo.s32 %r3, %r45, %r8, %r2;
setp.eq.s32 %p5, %r9, 1;
mov.u32 %r43, %r45;
@%p5 bra BB47_4;
setp.ne.s32 %p6, %r9, 2;
mov.u32 %r43, %r3;
@%p6 bra BB47_4;
mov.u32 %r43, %r2;
BB47_4:
setp.eq.s32 %p7, %r10, 1;
@%p7 bra BB47_7;
setp.ne.s32 %p8, %r10, 2;
mov.u32 %r45, %r3;
@%p8 bra BB47_7;
mov.u32 %r45, %r2;
BB47_7:
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.wide.s32 %rd6, %r43, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
mul.wide.s32 %rd8, %r45, 4;
add.s64 %rd9, %rd4, %rd8;
ld.global.f32 %f2, [%rd9];
mov.f32 %f134, 0f7F7FFFFF;
setp.gt.s32 %p9, %r11, 8;
@%p9 bra BB47_24;
setp.gt.s32 %p23, %r11, 3;
@%p23 bra BB47_16;
setp.gt.s32 %p30, %r11, 1;
@%p30 bra BB47_13;
setp.eq.s32 %p33, %r11, 0;
@%p33 bra BB47_67;
bra.uni BB47_11;
BB47_67:
add.f32 %f134, %f1, %f2;
bra.uni BB47_68;
BB47_24:
setp.gt.s32 %p10, %r11, 13;
@%p10 bra BB47_33;
setp.gt.s32 %p17, %r11, 10;
@%p17 bra BB47_29;
setp.eq.s32 %p21, %r11, 9;
@%p21 bra BB47_49;
bra.uni BB47_27;
BB47_49:
setp.eq.f32 %p44, %f1, %f2;
selp.f32 %f134, 0f3F800000, 0f00000000, %p44;
bra.uni BB47_68;
BB47_16:
setp.gt.s32 %p24, %r11, 5;
@%p24 bra BB47_20;
setp.eq.s32 %p28, %r11, 4;
@%p28 bra BB47_52;
bra.uni BB47_18;
BB47_52:
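// opcode 4 (power) for floats: fully inlined powf-style expansion
// (mantissa/exponent split and polynomial for the log, then
// ex2.approx for the exp), with sign and special-case fixups in the
// blocks below (inferred)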
mul.f32 %f51, %f2, 0f3F000000;
cvt.rzi.f32.f32 %f52, %f51;
fma.rn.f32 %f53, %f52, 0fC0000000, %f2;
abs.f32 %f19, %f53;
abs.f32 %f20, %f1;
setp.lt.f32 %p49, %f20, 0f00800000;
mul.f32 %f54, %f20, 0f4B800000;
selp.f32 %f55, 0fC3170000, 0fC2FE0000, %p49;
selp.f32 %f56, %f54, %f20, %p49;
mov.b32 %r23, %f56;
and.b32 %r24, %r23, 8388607;
or.b32 %r25, %r24, 1065353216;
mov.b32 %f57, %r25;
shr.u32 %r26, %r23, 23;
cvt.rn.f32.u32 %f58, %r26;
add.f32 %f59, %f55, %f58;
setp.gt.f32 %p50, %f57, 0f3FB504F3;
mul.f32 %f60, %f57, 0f3F000000;
add.f32 %f61, %f59, 0f3F800000;
selp.f32 %f62, %f60, %f57, %p50;
selp.f32 %f63, %f61, %f59, %p50;
add.f32 %f64, %f62, 0fBF800000;
add.f32 %f50, %f62, 0f3F800000;
// inline asm
rcp.approx.ftz.f32 %f49,%f50;
// inline asm
add.f32 %f65, %f64, %f64;
mul.f32 %f66, %f49, %f65;
mul.f32 %f67, %f66, %f66;
mov.f32 %f68, 0f3C4CAF63;
mov.f32 %f69, 0f3B18F0FE;
fma.rn.f32 %f70, %f69, %f67, %f68;
mov.f32 %f71, 0f3DAAAABD;
fma.rn.f32 %f72, %f70, %f67, %f71;
mul.rn.f32 %f73, %f72, %f67;
mul.rn.f32 %f74, %f73, %f66;
sub.f32 %f75, %f64, %f66;
neg.f32 %f76, %f66;
add.f32 %f77, %f75, %f75;
fma.rn.f32 %f78, %f76, %f64, %f77;
mul.rn.f32 %f79, %f49, %f78;
add.f32 %f80, %f74, %f66;
sub.f32 %f81, %f66, %f80;
add.f32 %f82, %f74, %f81;
add.f32 %f83, %f79, %f82;
add.f32 %f84, %f80, %f83;
sub.f32 %f85, %f80, %f84;
add.f32 %f86, %f83, %f85;
mov.f32 %f87, 0f3F317200;
mul.rn.f32 %f88, %f63, %f87;
mov.f32 %f89, 0f35BFBE8E;
mul.rn.f32 %f90, %f63, %f89;
add.f32 %f91, %f88, %f84;
sub.f32 %f92, %f88, %f91;
add.f32 %f93, %f84, %f92;
add.f32 %f94, %f86, %f93;
add.f32 %f95, %f90, %f94;
add.f32 %f96, %f91, %f95;
sub.f32 %f97, %f91, %f96;
add.f32 %f98, %f95, %f97;
abs.f32 %f21, %f2;
setp.gt.f32 %p51, %f21, 0f77F684DF;
mul.f32 %f99, %f2, 0f39000000;
selp.f32 %f100, %f99, %f2, %p51;
mul.rn.f32 %f101, %f100, %f96;
neg.f32 %f102, %f101;
fma.rn.f32 %f103, %f100, %f96, %f102;
fma.rn.f32 %f104, %f100, %f98, %f103;
mov.f32 %f105, 0f00000000;
fma.rn.f32 %f106, %f105, %f96, %f104;
add.rn.f32 %f107, %f101, %f106;
neg.f32 %f108, %f107;
add.rn.f32 %f109, %f101, %f108;
add.rn.f32 %f110, %f109, %f106;
mov.b32 %r27, %f107;
setp.eq.s32 %p52, %r27, 1118925336;
add.s32 %r28, %r27, -1;
mov.b32 %f111, %r28;
add.f32 %f112, %f110, 0f37000000;
selp.f32 %f113, %f111, %f107, %p52;
selp.f32 %f22, %f112, %f110, %p52;
mul.f32 %f114, %f113, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f115, %f114;
mov.f32 %f116, 0fBF317200;
fma.rn.f32 %f117, %f115, %f116, %f113;
mov.f32 %f118, 0fB5BFBE8E;
fma.rn.f32 %f119, %f115, %f118, %f117;
mul.f32 %f120, %f119, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f121, %f120;
add.f32 %f122, %f115, 0f00000000;
ex2.approx.f32 %f123, %f122;
mul.f32 %f124, %f121, %f123;
setp.lt.f32 %p53, %f113, 0fC2D20000;
selp.f32 %f125, 0f00000000, %f124, %p53;
setp.gt.f32 %p54, %f113, 0f42D20000;
selp.f32 %f131, 0f7F800000, %f125, %p54;
setp.eq.f32 %p55, %f131, 0f7F800000;
@%p55 bra BB47_54;
fma.rn.f32 %f131, %f131, %f22, %f131;
BB47_54:
setp.lt.f32 %p56, %f1, 0f00000000;
setp.eq.f32 %p57, %f19, 0f3F800000;
and.pred %p1, %p56, %p57;
mov.b32 %r29, %f131;
xor.b32 %r30, %r29, -2147483648;
mov.b32 %f126, %r30;
selp.f32 %f133, %f126, %f131, %p1;
setp.eq.f32 %p58, %f1, 0f00000000;
@%p58 bra BB47_57;
bra.uni BB47_55;
BB47_57:
add.f32 %f128, %f1, %f1;
mov.b32 %r31, %f128;
selp.b32 %r32, %r31, 0, %p57;
or.b32 %r33, %r32, 2139095040;
setp.lt.f32 %p62, %f2, 0f00000000;
selp.b32 %r34, %r33, %r32, %p62;
mov.b32 %f133, %r34;
bra.uni BB47_58;
BB47_33:
setp.gt.s32 %p11, %r11, 15;
@%p11 bra BB47_37;
setp.eq.s32 %p15, %r11, 14;
@%p15 bra BB47_46;
bra.uni BB47_35;
BB47_46:
cvt.rni.s64.f32 %rd10, %f1;
cvt.u32.u64 %r17, %rd10;
cvt.rni.s64.f32 %rd11, %f2;
cvt.u32.u64 %r18, %rd11;
or.b32 %r19, %r18, %r17;
setp.eq.s32 %p41, %r19, 0;
selp.f32 %f134, 0f00000000, 0f3F800000, %p41;
bra.uni BB47_68;
BB47_13:
setp.eq.s32 %p31, %r11, 2;
@%p31 bra BB47_66;
bra.uni BB47_14;
BB47_66:
mul.f32 %f134, %f1, %f2;
bra.uni BB47_68;
BB47_29:
setp.eq.s32 %p18, %r11, 11;
@%p18 bra BB47_48;
setp.eq.s32 %p19, %r11, 12;
@%p19 bra BB47_47;
bra.uni BB47_31;
BB47_47:
max.f32 %f134, %f1, %f2;
bra.uni BB47_68;
BB47_20:
setp.eq.s32 %p25, %r11, 6;
@%p25 bra BB47_51;
setp.eq.s32 %p26, %r11, 7;
@%p26 bra BB47_50;
bra.uni BB47_22;
BB47_50:
setp.gt.f32 %p46, %f1, %f2;
selp.f32 %f134, 0f3F800000, 0f00000000, %p46;
bra.uni BB47_68;
BB47_37:
setp.eq.s32 %p12, %r11, 16;
@%p12 bra BB47_45;
setp.eq.s32 %p13, %r11, 17;
@%p13 bra BB47_42;
bra.uni BB47_39;
BB47_42:
setp.eq.f32 %p36, %f2, 0f00000000;
setp.eq.f32 %p37, %f2, 0f80000000;
or.pred %p38, %p36, %p37;
mov.f32 %f134, 0f7FC00000;
@%p38 bra BB47_68;
div.rn.f32 %f134, %f1, %f2;
abs.f32 %f43, %f134;
setp.geu.f32 %p39, %f43, 0f7F800000;
@%p39 bra BB47_68;
cvt.rmi.f32.f32 %f44, %f134;
mul.f32 %f45, %f2, %f44;
sub.f32 %f134, %f1, %f45;
bra.uni BB47_68;
BB47_11:
setp.eq.s32 %p34, %r11, 1;
@%p34 bra BB47_12;
bra.uni BB47_68;
BB47_12:
sub.f32 %f134, %f1, %f2;
bra.uni BB47_68;
BB47_27:
setp.eq.s32 %p22, %r11, 10;
@%p22 bra BB47_28;
bra.uni BB47_68;
BB47_28:
setp.neu.f32 %p43, %f1, %f2;
selp.f32 %f134, 0f3F800000, 0f00000000, %p43;
bra.uni BB47_68;
BB47_18:
setp.eq.s32 %p29, %r11, 5;
@%p29 bra BB47_19;
bra.uni BB47_68;
BB47_19:
setp.lt.f32 %p48, %f1, %f2;
selp.f32 %f134, 0f3F800000, 0f00000000, %p48;
bra.uni BB47_68;
BB47_35:
setp.eq.s32 %p16, %r11, 15;
@%p16 bra BB47_36;
bra.uni BB47_68;
BB47_36:
mul.f32 %f47, %f1, %f2;
mov.f32 %f48, 0f3F800000;
sub.f32 %f134, %f48, %f47;
bra.uni BB47_68;
BB47_14:
setp.eq.s32 %p32, %r11, 3;
@%p32 bra BB47_15;
bra.uni BB47_68;
BB47_15:
div.rn.f32 %f134, %f1, %f2;
bra.uni BB47_68;
BB47_48:
min.f32 %f134, %f1, %f2;
bra.uni BB47_68;
BB47_31:
setp.eq.s32 %p20, %r11, 13;
@%p20 bra BB47_32;
bra.uni BB47_68;
BB47_32:
cvt.rni.s64.f32 %rd12, %f1;
cvt.u32.u64 %r20, %rd12;
cvt.rni.s64.f32 %rd13, %f2;
cvt.u32.u64 %r21, %rd13;
and.b32 %r22, %r21, %r20;
setp.eq.s32 %p42, %r22, 0;
selp.f32 %f134, 0f00000000, 0f3F800000, %p42;
bra.uni BB47_68;
BB47_51:
setp.gtu.f32 %p47, %f1, %f2;
selp.f32 %f134, 0f00000000, 0f3F800000, %p47;
bra.uni BB47_68;
BB47_22:
setp.eq.s32 %p27, %r11, 8;
@%p27 bra BB47_23;
bra.uni BB47_68;
BB47_23:
setp.ltu.f32 %p45, %f1, %f2;
selp.f32 %f134, 0f00000000, 0f3F800000, %p45;
bra.uni BB47_68;
BB47_45:
setp.neu.f32 %p40, %f1, 0f00000000;
sub.f32 %f46, %f1, %f2;
selp.f32 %f134, %f46, 0f00000000, %p40;
bra.uni BB47_68;
BB47_39:
setp.ne.s32 %p14, %r11, 18;
@%p14 bra BB47_68;
div.rn.f32 %f134, %f1, %f2;
abs.f32 %f41, %f134;
setp.geu.f32 %p35, %f41, 0f7F800000;
@%p35 bra BB47_68;
cvt.rmi.f32.f32 %f134, %f134;
bra.uni BB47_68;
BB47_55:
setp.geu.f32 %p59, %f1, 0f00000000;
@%p59 bra BB47_58;
cvt.rzi.f32.f32 %f127, %f2;
setp.neu.f32 %p60, %f127, %f2;
selp.f32 %f133, 0f7FFFFFFF, %f133, %p60;
BB47_58:
add.f32 %f129, %f20, %f21;
mov.b32 %r35, %f129;
setp.lt.s32 %p63, %r35, 2139095040;
@%p63 bra BB47_65;
setp.gtu.f32 %p64, %f20, 0f7F800000;
setp.gtu.f32 %p65, %f21, 0f7F800000;
or.pred %p66, %p64, %p65;
@%p66 bra BB47_64;
bra.uni BB47_60;
BB47_64:
add.f32 %f133, %f1, %f2;
bra.uni BB47_65;
BB47_60:
setp.eq.f32 %p67, %f21, 0f7F800000;
@%p67 bra BB47_63;
bra.uni BB47_61;
BB47_63:
setp.gt.f32 %p70, %f20, 0f3F800000;
selp.b32 %r39, 2139095040, 0, %p70;
xor.b32 %r40, %r39, 2139095040;
setp.lt.f32 %p71, %f2, 0f00000000;
selp.b32 %r41, %r40, %r39, %p71;
mov.b32 %f130, %r41;
setp.eq.f32 %p72, %f1, 0fBF800000;
selp.f32 %f133, 0f3F800000, %f130, %p72;
bra.uni BB47_65;
BB47_61:
setp.neu.f32 %p68, %f20, 0f7F800000;
@%p68 bra BB47_65;
setp.ltu.f32 %p69, %f2, 0f00000000;
selp.b32 %r36, 0, 2139095040, %p69;
or.b32 %r37, %r36, -2147483648;
selp.b32 %r38, %r37, %r36, %p1;
mov.b32 %f133, %r38;
BB47_65:
setp.eq.f32 %p73, %f2, 0f00000000;
setp.eq.f32 %p74, %f1, 0f3F800000;
or.pred %p75, %p74, %p73;
selp.f32 %f134, 0f3F800000, %f133, %p75;
BB47_68:
cvta.to.global.u64 %rd14, %rd3;
mul.wide.s32 %rd15, %r3, 4;
add.s64 %rd16, %rd14, %rd15;
st.global.f32 [%rd16], %f134;
bar.sync 0;
BB47_69:
ret;
}
// .globl matrix_scalar_op_d
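// matrix_scalar_op_d: applies the same opcode table as
// matrix_matrix_cellwise_op_d above, but between each element and the
// scalar in param 1. Param 5 appears to select the operand order, which
// is why the entire opcode switch is duplicated: the path taken when
// param 5 != 0 puts the scalar on the left (e.g. BB48_7 computes
// scalar - x), and the path from BB48_72 onward, taken when param 5 == 0,
// mirrors it with the scalar on the right (inferred).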
.visible .entry matrix_scalar_op_d(
.param .u64 matrix_scalar_op_d_param_0,
.param .f64 matrix_scalar_op_d_param_1,
.param .u64 matrix_scalar_op_d_param_2,
.param .u32 matrix_scalar_op_d_param_3,
.param .u32 matrix_scalar_op_d_param_4,
.param .u32 matrix_scalar_op_d_param_5
)
{
.reg .pred %p<139>;
.reg .b32 %r<88>;
.reg .f64 %fd<99>;
.reg .b64 %rd<20>;
ld.param.u64 %rd4, [matrix_scalar_op_d_param_0];
ld.param.f64 %fd68, [matrix_scalar_op_d_param_1];
ld.param.u64 %rd5, [matrix_scalar_op_d_param_2];
ld.param.u32 %r8, [matrix_scalar_op_d_param_3];
ld.param.u32 %r6, [matrix_scalar_op_d_param_4];
ld.param.u32 %r7, [matrix_scalar_op_d_param_5];
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %ctaid.x;
mov.u32 %r11, %tid.x;
mad.lo.s32 %r1, %r9, %r10, %r11;
setp.ge.s32 %p3, %r1, %r8;
@%p3 bra BB48_142;
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd7, %rd4;
mul.wide.s32 %rd8, %r1, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
add.s64 %rd1, %rd6, %rd8;
setp.eq.s32 %p4, %r7, 0;
@%p4 bra BB48_72;
mov.f64 %fd94, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p5, %r6, 8;
@%p5 bra BB48_19;
setp.gt.s32 %p19, %r6, 3;
@%p19 bra BB48_11;
setp.gt.s32 %p26, %r6, 1;
@%p26 bra BB48_8;
setp.eq.s32 %p29, %r6, 0;
@%p29 bra BB48_70;
bra.uni BB48_6;
BB48_70:
add.f64 %fd94, %fd1, %fd68;
bra.uni BB48_71;
BB48_72:
mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF;
setp.gt.s32 %p72, %r6, 8;
@%p72 bra BB48_89;
setp.gt.s32 %p86, %r6, 3;
@%p86 bra BB48_81;
setp.gt.s32 %p93, %r6, 1;
@%p93 bra BB48_78;
setp.eq.s32 %p96, %r6, 0;
@%p96 bra BB48_140;
bra.uni BB48_76;
BB48_140:
add.f64 %fd98, %fd1, %fd68;
bra.uni BB48_141;
BB48_19:
setp.gt.s32 %p6, %r6, 13;
@%p6 bra BB48_28;
setp.gt.s32 %p13, %r6, 10;
@%p13 bra BB48_24;
setp.eq.s32 %p17, %r6, 9;
@%p17 bra BB48_48;
bra.uni BB48_22;
BB48_48:
setp.eq.f64 %p44, %fd1, %fd68;
selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p44;
bra.uni BB48_71;
BB48_89:
setp.gt.s32 %p73, %r6, 13;
@%p73 bra BB48_98;
setp.gt.s32 %p80, %r6, 10;
@%p80 bra BB48_94;
setp.eq.s32 %p84, %r6, 9;
@%p84 bra BB48_118;
bra.uni BB48_92;
BB48_118:
setp.eq.f64 %p111, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p111;
bra.uni BB48_141;
BB48_11:
setp.gt.s32 %p20, %r6, 5;
@%p20 bra BB48_15;
setp.eq.s32 %p24, %r6, 4;
@%p24 bra BB48_51;
bra.uni BB48_13;
BB48_51:
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd68;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r3}, %fd1;
}
bfe.u32 %r24, %r3, 20, 11;
add.s32 %r25, %r24, -1012;
mov.b64 %rd14, %fd1;
shl.b64 %rd2, %rd14, %r25;
setp.ne.s64 %p49, %rd2, -9223372036854775808;
setp.eq.s64 %p50, %rd2, -9223372036854775808;
abs.f64 %fd18, %fd68;
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.f64 [param0+0], %fd18;
.param .b64 param1;
st.param.f64 [param1+0], %fd1;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd24, [retval0+0];
//{
}// Callseq End 1
setp.gt.s32 %p51, %r2, -1;
setp.lt.s32 %p52, %r2, 0;
and.pred %p1, %p52, %p50;
or.pred %p53, %p51, %p49;
@%p53 bra BB48_53;
{
.reg .b32 %temp;
mov.b64 {%temp, %r26}, %fd24;
}
xor.b32 %r27, %r26, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r28, %temp}, %fd24;
}
mov.b64 %fd24, {%r28, %r27};
BB48_53:
setp.eq.f64 %p54, %fd68, 0d0000000000000000;
@%p54 bra BB48_56;
bra.uni BB48_54;
BB48_56:
selp.b32 %r29, %r2, 0, %p50;
mov.u32 %r30, 0;
or.b32 %r31, %r29, 2146435072;
setp.lt.s32 %p58, %r3, 0;
selp.b32 %r32, %r31, %r29, %p58;
mov.b64 %fd24, {%r30, %r32};
bra.uni BB48_57;
BB48_28:
setp.gt.s32 %p7, %r6, 15;
@%p7 bra BB48_32;
setp.eq.s32 %p11, %r6, 14;
@%p11 bra BB48_45;
bra.uni BB48_30;
BB48_45:
cvt.rni.s64.f64 %rd10, %fd68;
cvt.u32.u64 %r18, %rd10;
cvt.rni.s64.f64 %rd11, %fd1;
cvt.u32.u64 %r19, %rd11;
or.b32 %r20, %r19, %r18;
setp.eq.s32 %p41, %r20, 0;
selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p41;
bra.uni BB48_71;
BB48_81:
setp.gt.s32 %p87, %r6, 5;
@%p87 bra BB48_85;
setp.eq.s32 %p91, %r6, 4;
@%p91 bra BB48_121;
bra.uni BB48_83;
BB48_121:
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd1;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r5}, %fd68;
}
bfe.u32 %r62, %r5, 20, 11;
add.s32 %r63, %r62, -1012;
mov.b64 %rd19, %fd68;
shl.b64 %rd3, %rd19, %r63;
setp.ne.s64 %p116, %rd3, -9223372036854775808;
setp.eq.s64 %p117, %rd3, -9223372036854775808;
abs.f64 %fd51, %fd1;
// Callseq Start 2
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.f64 [param0+0], %fd51;
.param .b64 param1;
st.param.f64 [param1+0], %fd68;
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
(
param0,
param1
);
ld.param.f64 %fd57, [retval0+0];
//{
}// Callseq End 2
setp.gt.s32 %p118, %r4, -1;
setp.lt.s32 %p119, %r4, 0;
and.pred %p2, %p119, %p117;
or.pred %p120, %p118, %p116;
@%p120 bra BB48_123;
{
.reg .b32 %temp;
mov.b64 {%temp, %r64}, %fd57;
}
xor.b32 %r65, %r64, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%r66, %temp}, %fd57;
}
mov.b64 %fd57, {%r66, %r65};
BB48_123:
setp.eq.f64 %p121, %fd1, 0d0000000000000000;
@%p121 bra BB48_126;
bra.uni BB48_124;
BB48_126:
selp.b32 %r67, %r4, 0, %p117;
mov.u32 %r68, 0;
or.b32 %r69, %r67, 2146435072;
setp.lt.s32 %p125, %r5, 0;
selp.b32 %r70, %r69, %r67, %p125;
mov.b64 %fd57, {%r68, %r70};
bra.uni BB48_127;
BB48_98:
setp.gt.s32 %p74, %r6, 15;
@%p74 bra BB48_102;
setp.eq.s32 %p78, %r6, 14;
@%p78 bra BB48_115;
bra.uni BB48_100;
BB48_115:
cvt.rni.s64.f64 %rd15, %fd1;
cvt.u32.u64 %r56, %rd15;
cvt.rni.s64.f64 %rd16, %fd68;
cvt.u32.u64 %r57, %rd16;
or.b32 %r58, %r57, %r56;
setp.eq.s32 %p108, %r58, 0;
selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p108;
bra.uni BB48_141;
BB48_8:
setp.eq.s32 %p27, %r6, 2;
@%p27 bra BB48_69;
bra.uni BB48_9;
BB48_69:
mul.f64 %fd94, %fd1, %fd68;
bra.uni BB48_71;
BB48_24:
setp.eq.s32 %p14, %r6, 11;
@%p14 bra BB48_47;
setp.eq.s32 %p15, %r6, 12;
@%p15 bra BB48_46;
bra.uni BB48_26;
BB48_46:
max.f64 %fd94, %fd68, %fd1;
bra.uni BB48_71;
BB48_15:
setp.eq.s32 %p21, %r6, 6;
@%p21 bra BB48_50;
setp.eq.s32 %p22, %r6, 7;
@%p22 bra BB48_49;
bra.uni BB48_17;
BB48_49:
setp.lt.f64 %p46, %fd1, %fd68;
selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p46;
bra.uni BB48_71;
BB48_32:
setp.eq.s32 %p8, %r6, 16;
@%p8 bra BB48_44;
setp.eq.s32 %p9, %r6, 17;
@%p9 bra BB48_39;
bra.uni BB48_34;
BB48_39:
setp.eq.f64 %p34, %fd1, 0d0000000000000000;
setp.eq.f64 %p35, %fd1, 0d8000000000000000;
or.pred %p36, %p34, %p35;
mov.f64 %fd94, 0d7FF8000000000000;
@%p36 bra BB48_71;
div.rn.f64 %fd94, %fd68, %fd1;
abs.f64 %fd72, %fd94;
setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000;
@%p37 bra BB48_71;
{
.reg .b32 %temp;
mov.b64 {%temp, %r15}, %fd94;
}
and.b32 %r16, %r15, 2147483647;
setp.ne.s32 %p38, %r16, 2146435072;
@%p38 bra BB48_43;
{
.reg .b32 %temp;
mov.b64 {%r17, %temp}, %fd94;
}
setp.eq.s32 %p39, %r17, 0;
@%p39 bra BB48_71;
BB48_43:
cvt.rmi.f64.f64 %fd73, %fd94;
mul.f64 %fd74, %fd1, %fd73;
sub.f64 %fd94, %fd68, %fd74;
bra.uni BB48_71;
BB48_78:
setp.eq.s32 %p94, %r6, 2;
@%p94 bra BB48_139;
bra.uni BB48_79;
BB48_139:
mul.f64 %fd98, %fd1, %fd68;
bra.uni BB48_141;
BB48_94:
setp.eq.s32 %p81, %r6, 11;
@%p81 bra BB48_117;
setp.eq.s32 %p82, %r6, 12;
@%p82 bra BB48_116;
bra.uni BB48_96;
BB48_116:
max.f64 %fd98, %fd1, %fd68;
bra.uni BB48_141;
BB48_85:
setp.eq.s32 %p88, %r6, 6;
@%p88 bra BB48_120;
setp.eq.s32 %p89, %r6, 7;
@%p89 bra BB48_119;
bra.uni BB48_87;
BB48_119:
setp.gt.f64 %p113, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p113;
bra.uni BB48_141;
BB48_102:
setp.eq.s32 %p75, %r6, 16;
@%p75 bra BB48_114;
setp.eq.s32 %p76, %r6, 17;
@%p76 bra BB48_109;
bra.uni BB48_104;
BB48_109:
setp.eq.f64 %p101, %fd68, 0d0000000000000000;
setp.eq.f64 %p102, %fd68, 0d8000000000000000;
or.pred %p103, %p101, %p102;
mov.f64 %fd98, 0d7FF8000000000000;
@%p103 bra BB48_141;
div.rn.f64 %fd98, %fd1, %fd68;
abs.f64 %fd83, %fd98;
setp.gtu.f64 %p104, %fd83, 0d7FF0000000000000;
@%p104 bra BB48_141;
{
.reg .b32 %temp;
mov.b64 {%temp, %r53}, %fd98;
}
and.b32 %r54, %r53, 2147483647;
setp.ne.s32 %p105, %r54, 2146435072;
@%p105 bra BB48_113;
{
.reg .b32 %temp;
mov.b64 {%r55, %temp}, %fd98;
}
setp.eq.s32 %p106, %r55, 0;
@%p106 bra BB48_141;
BB48_113:
cvt.rmi.f64.f64 %fd84, %fd98;
mul.f64 %fd85, %fd84, %fd68;
sub.f64 %fd98, %fd1, %fd85;
bra.uni BB48_141;
BB48_6:
setp.eq.s32 %p30, %r6, 1;
@%p30 bra BB48_7;
bra.uni BB48_71;
BB48_7:
sub.f64 %fd94, %fd68, %fd1;
bra.uni BB48_71;
BB48_22:
setp.eq.s32 %p18, %r6, 10;
@%p18 bra BB48_23;
bra.uni BB48_71;
BB48_23:
setp.neu.f64 %p43, %fd1, %fd68;
selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p43;
bra.uni BB48_71;
BB48_13:
setp.eq.s32 %p25, %r6, 5;
@%p25 bra BB48_14;
bra.uni BB48_71;
BB48_14:
setp.gt.f64 %p48, %fd1, %fd68;
selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p48;
bra.uni BB48_71;
BB48_30:
setp.eq.s32 %p12, %r6, 15;
@%p12 bra BB48_31;
bra.uni BB48_71;
BB48_31:
mul.f64 %fd76, %fd1, %fd68;
mov.f64 %fd77, 0d3FF0000000000000;
sub.f64 %fd94, %fd77, %fd76;
bra.uni BB48_71;
BB48_9:
setp.eq.s32 %p28, %r6, 3;
@%p28 bra BB48_10;
bra.uni BB48_71;
BB48_10:
div.rn.f64 %fd94, %fd68, %fd1;
bra.uni BB48_71;
BB48_47:
min.f64 %fd94, %fd68, %fd1;
bra.uni BB48_71;
BB48_26:
setp.eq.s32 %p16, %r6, 13;
@%p16 bra BB48_27;
bra.uni BB48_71;
BB48_27:
cvt.rni.s64.f64 %rd12, %fd68;
cvt.u32.u64 %r21, %rd12;
cvt.rni.s64.f64 %rd13, %fd1;
cvt.u32.u64 %r22, %rd13;
and.b32 %r23, %r22, %r21;
setp.eq.s32 %p42, %r23, 0;
selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p42;
bra.uni BB48_71;
BB48_50:
setp.ltu.f64 %p47, %fd1, %fd68;
selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p47;
bra.uni BB48_71;
BB48_17:
setp.eq.s32 %p23, %r6, 8;
@%p23 bra BB48_18;
bra.uni BB48_71;
BB48_18:
setp.gtu.f64 %p45, %fd1, %fd68;
selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p45;
bra.uni BB48_71;
BB48_44:
setp.neu.f64 %p40, %fd68, 0d0000000000000000;
sub.f64 %fd75, %fd68, %fd1;
selp.f64 %fd94, %fd75, 0d0000000000000000, %p40;
bra.uni BB48_71;
BB48_34:
setp.ne.s32 %p10, %r6, 18;
@%p10 bra BB48_71;
div.rn.f64 %fd94, %fd68, %fd1;
abs.f64 %fd70, %fd94;
setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000;
@%p31 bra BB48_71;
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd94;
}
and.b32 %r13, %r12, 2147483647;
setp.ne.s32 %p32, %r13, 2146435072;
@%p32 bra BB48_38;
{
.reg .b32 %temp;
mov.b64 {%r14, %temp}, %fd94;
}
setp.eq.s32 %p33, %r14, 0;
@%p33 bra BB48_71;
BB48_38:
cvt.rmi.f64.f64 %fd94, %fd94;
bra.uni BB48_71;
BB48_76:
setp.eq.s32 %p97, %r6, 1;
@%p97 bra BB48_77;
bra.uni BB48_141;
BB48_77:
sub.f64 %fd98, %fd1, %fd68;
bra.uni BB48_141;
BB48_92:
setp.eq.s32 %p85, %r6, 10;
@%p85 bra BB48_93;
bra.uni BB48_141;
BB48_93:
setp.neu.f64 %p110, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p110;
bra.uni BB48_141;
BB48_83:
setp.eq.s32 %p92, %r6, 5;
@%p92 bra BB48_84;
bra.uni BB48_141;
BB48_84:
setp.lt.f64 %p115, %fd1, %fd68;
selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p115;
bra.uni BB48_141;
BB48_100:
setp.eq.s32 %p79, %r6, 15;
@%p79 bra BB48_101;
bra.uni BB48_141;
BB48_101:
mul.f64 %fd87, %fd1, %fd68;
mov.f64 %fd88, 0d3FF0000000000000;
sub.f64 %fd98, %fd88, %fd87;
bra.uni BB48_141;
BB48_79:
setp.eq.s32 %p95, %r6, 3;
@%p95 bra BB48_80;
bra.uni BB48_141;
BB48_80:
div.rn.f64 %fd98, %fd1, %fd68;
bra.uni BB48_141;
BB48_117:
min.f64 %fd98, %fd1, %fd68;
bra.uni BB48_141;
BB48_96:
setp.eq.s32 %p83, %r6, 13;
@%p83 bra BB48_97;
bra.uni BB48_141;
BB48_97:
cvt.rni.s64.f64 %rd17, %fd1;
cvt.u32.u64 %r59, %rd17;
cvt.rni.s64.f64 %rd18, %fd68;
cvt.u32.u64 %r60, %rd18;
and.b32 %r61, %r60, %r59;
setp.eq.s32 %p109, %r61, 0;
selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p109;
bra.uni BB48_141;
BB48_120:
setp.gtu.f64 %p114, %fd1, %fd68;
selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p114;
bra.uni BB48_141;
BB48_87:
setp.eq.s32 %p90, %r6, 8;
@%p90 bra BB48_88;
bra.uni BB48_141;
BB48_88:
setp.ltu.f64 %p112, %fd1, %fd68;
selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p112;
bra.uni BB48_141;
BB48_114:
setp.neu.f64 %p107, %fd1, 0d0000000000000000;
sub.f64 %fd86, %fd1, %fd68;
selp.f64 %fd98, %fd86, 0d0000000000000000, %p107;
bra.uni BB48_141;
BB48_104:
setp.ne.s32 %p77, %r6, 18;
@%p77 bra BB48_141;
div.rn.f64 %fd98, %fd1, %fd68;
abs.f64 %fd81, %fd98;
setp.gtu.f64 %p98, %fd81, 0d7FF0000000000000;
@%p98 bra BB48_141;
{
.reg .b32 %temp;
mov.b64 {%temp, %r50}, %fd98;
}
and.b32 %r51, %r50, 2147483647;
setp.ne.s32 %p99, %r51, 2146435072;
@%p99 bra BB48_108;
{
.reg .b32 %temp;
mov.b64 {%r52, %temp}, %fd98;
}
setp.eq.s32 %p100, %r52, 0;
@%p100 bra BB48_141;
BB48_108:
cvt.rmi.f64.f64 %fd98, %fd98;
bra.uni BB48_141;
BB48_54:
@%p51 bra BB48_57;
cvt.rzi.f64.f64 %fd78, %fd1;
setp.neu.f64 %p56, %fd78, %fd1;
selp.f64 %fd24, 0dFFF8000000000000, %fd24, %p56;
BB48_57:
add.f64 %fd93, %fd1, %fd68;
{
.reg .b32 %temp;
mov.b64 {%temp, %r33}, %fd93;
}
and.b32 %r34, %r33, 2146435072;
setp.ne.s32 %p59, %r34, 2146435072;
@%p59 bra BB48_58;
setp.gtu.f64 %p60, %fd18, 0d7FF0000000000000;
@%p60 bra BB48_68;
abs.f64 %fd79, %fd1;
setp.gtu.f64 %p61, %fd79, 0d7FF0000000000000;
@%p61 bra BB48_68;
and.b32 %r35, %r3, 2147483647;
setp.ne.s32 %p62, %r35, 2146435072;
@%p62 bra BB48_63;
{
.reg .b32 %temp;
mov.b64 {%r36, %temp}, %fd1;
}
setp.eq.s32 %p63, %r36, 0;
@%p63 bra BB48_67;
BB48_63:
and.b32 %r37, %r2, 2147483647;
setp.ne.s32 %p64, %r37, 2146435072;
@%p64 bra BB48_64;
{
.reg .b32 %temp;
mov.b64 {%r38, %temp}, %fd68;
}
setp.ne.s32 %p65, %r38, 0;
mov.f64 %fd93, %fd24;
@%p65 bra BB48_68;
shr.s32 %r39, %r3, 31;
and.b32 %r40, %r39, -2146435072;
add.s32 %r41, %r40, 2146435072;
or.b32 %r42, %r41, -2147483648;
selp.b32 %r43, %r42, %r41, %p1;
mov.u32 %r44, 0;
mov.b64 %fd93, {%r44, %r43};
bra.uni BB48_68;
BB48_58:
mov.f64 %fd93, %fd24;
BB48_68:
setp.eq.f64 %p69, %fd1, 0d0000000000000000;
setp.eq.f64 %p70, %fd68, 0d3FF0000000000000;
or.pred %p71, %p70, %p69;
selp.f64 %fd94, 0d3FF0000000000000, %fd93, %p71;
BB48_71:
st.global.f64 [%rd1], %fd94;
bra.uni BB48_142;
BB48_124:
@%p118 bra BB48_127;
cvt.rzi.f64.f64 %fd89, %fd68;
setp.neu.f64 %p123, %fd89, %fd68;
selp.f64 %fd57, 0dFFF8000000000000, %fd57, %p123;
BB48_127:
add.f64 %fd97, %fd1, %fd68;
{
.reg .b32 %temp;
mov.b64 {%temp, %r71}, %fd97;
}
and.b32 %r72, %r71, 2146435072;
setp.ne.s32 %p126, %r72, 2146435072;
@%p126 bra BB48_128;
setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000;
@%p127 bra BB48_138;
abs.f64 %fd90, %fd68;
setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000;
@%p128 bra BB48_138;
and.b32 %r73, %r5, 2147483647;
setp.ne.s32 %p129, %r73, 2146435072;
@%p129 bra BB48_133;
{
.reg .b32 %temp;
mov.b64 {%r74, %temp}, %fd68;
}
setp.eq.s32 %p130, %r74, 0;
@%p130 bra BB48_137;
BB48_133:
and.b32 %r75, %r4, 2147483647;
setp.ne.s32 %p131, %r75, 2146435072;
@%p131 bra BB48_134;
{
.reg .b32 %temp;
mov.b64 {%r76, %temp}, %fd1;
}
setp.ne.s32 %p132, %r76, 0;
mov.f64 %fd97, %fd57;
@%p132 bra BB48_138;
shr.s32 %r77, %r5, 31;
and.b32 %r78, %r77, -2146435072;
add.s32 %r79, %r78, 2146435072;
or.b32 %r80, %r79, -2147483648;
selp.b32 %r81, %r80, %r79, %p2;
mov.u32 %r82, 0;
mov.b64 %fd97, {%r82, %r81};
bra.uni BB48_138;
BB48_128:
mov.f64 %fd97, %fd57;
BB48_138:
setp.eq.f64 %p136, %fd68, 0d0000000000000000;
setp.eq.f64 %p137, %fd1, 0d3FF0000000000000;
or.pred %p138, %p137, %p136;
selp.f64 %fd98, 0d3FF0000000000000, %fd97, %p138;
BB48_141:
st.global.f64 [%rd1], %fd98;
BB48_142:
bar.sync 0;
ret;
BB48_64:
mov.f64 %fd93, %fd24;
bra.uni BB48_68;
BB48_134:
mov.f64 %fd97, %fd57;
bra.uni BB48_138;
BB48_67:
setp.gt.f64 %p66, %fd18, 0d3FF0000000000000;
selp.b32 %r45, 2146435072, 0, %p66;
mov.u32 %r46, 0;
xor.b32 %r47, %r45, 2146435072;
setp.lt.s32 %p67, %r3, 0;
selp.b32 %r48, %r47, %r45, %p67;
setp.eq.f64 %p68, %fd68, 0dBFF0000000000000;
selp.b32 %r49, 1072693248, %r48, %p68;
mov.b64 %fd93, {%r46, %r49};
bra.uni BB48_68;
BB48_137:
setp.gt.f64 %p133, %fd51, 0d3FF0000000000000;
selp.b32 %r83, 2146435072, 0, %p133;
mov.u32 %r84, 0;
xor.b32 %r85, %r83, 2146435072;
setp.lt.s32 %p134, %r5, 0;
selp.b32 %r86, %r85, %r83, %p134;
setp.eq.f64 %p135, %fd1, 0dBFF0000000000000;
selp.b32 %r87, 1072693248, %r86, %p135;
mov.b64 %fd97, {%r84, %r87};
bra.uni BB48_138;
}
// .globl matrix_scalar_op_f
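// matrix_scalar_op_f: applies a binary operator elementwise between a float
// matrix and a (double) scalar. param_3 is the element count, param_4 the
// operator selector (same apparent mapping as the dispatch above), and
// param_5 selects the operand order: nonzero puts the scalar on the left.
// A plausible CUDA-level shape of the source, with binary_op standing in as
// a hypothetical helper for the selector dispatch:
//
//   __global__ void matrix_scalar_op_f(float *in, double scalar, float *out,
//                                      int n, int op, int scalarOnLeft) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i >= n) return;
//     float a = (float)scalar, b = in[i];
//     out[i] = scalarOnLeft ? binary_op(a, b, op) : binary_op(b, a, op);
//   }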
.visible .entry matrix_scalar_op_f(
.param .u64 matrix_scalar_op_f_param_0,
.param .f64 matrix_scalar_op_f_param_1,
.param .u64 matrix_scalar_op_f_param_2,
.param .u32 matrix_scalar_op_f_param_3,
.param .u32 matrix_scalar_op_f_param_4,
.param .u32 matrix_scalar_op_f_param_5
)
{
.reg .pred %p<139>;
.reg .f32 %f<267>;
.reg .b32 %r<58>;
.reg .f64 %fd<2>;
.reg .b64 %rd<16>;
ld.param.u64 %rd2, [matrix_scalar_op_f_param_0];
ld.param.f64 %fd1, [matrix_scalar_op_f_param_1];
ld.param.u64 %rd3, [matrix_scalar_op_f_param_2];
ld.param.u32 %r4, [matrix_scalar_op_f_param_3];
ld.param.u32 %r2, [matrix_scalar_op_f_param_4];
ld.param.u32 %r3, [matrix_scalar_op_f_param_5];
cvt.rn.f32.f64 %f1, %fd1;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
setp.ge.s32 %p3, %r1, %r4;
@%p3 bra BB49_126;
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f2, [%rd7];
add.s64 %rd1, %rd4, %rd6;
setp.eq.s32 %p4, %r3, 0;
@%p4 bra BB49_64;
mov.f32 %f262, 0f7F7FFFFF;
setp.gt.s32 %p5, %r2, 8;
@%p5 bra BB49_19;
setp.gt.s32 %p19, %r2, 3;
@%p19 bra BB49_11;
setp.gt.s32 %p26, %r2, 1;
@%p26 bra BB49_8;
setp.eq.s32 %p29, %r2, 0;
@%p29 bra BB49_62;
bra.uni BB49_6;
BB49_62:
add.f32 %f262, %f1, %f2;
bra.uni BB49_63;
BB49_64:
mov.f32 %f266, 0f7F7FFFFF;
setp.gt.s32 %p72, %r2, 8;
@%p72 bra BB49_81;
setp.gt.s32 %p86, %r2, 3;
@%p86 bra BB49_73;
setp.gt.s32 %p93, %r2, 1;
@%p93 bra BB49_70;
setp.eq.s32 %p96, %r2, 0;
@%p96 bra BB49_124;
bra.uni BB49_68;
BB49_124:
add.f32 %f266, %f1, %f2;
bra.uni BB49_125;
BB49_19:
setp.gt.s32 %p6, %r2, 13;
@%p6 bra BB49_28;
setp.gt.s32 %p13, %r2, 10;
@%p13 bra BB49_24;
setp.eq.s32 %p17, %r2, 9;
@%p17 bra BB49_44;
bra.uni BB49_22;
BB49_44:
setp.eq.f32 %p40, %f1, %f2;
selp.f32 %f262, 0f3F800000, 0f00000000, %p40;
bra.uni BB49_63;
BB49_81:
setp.gt.s32 %p73, %r2, 13;
@%p73 bra BB49_90;
setp.gt.s32 %p80, %r2, 10;
@%p80 bra BB49_86;
setp.eq.s32 %p84, %r2, 9;
@%p84 bra BB49_106;
bra.uni BB49_84;
BB49_106:
setp.eq.f32 %p107, %f2, %f1;
selp.f32 %f266, 0f3F800000, 0f00000000, %p107;
bra.uni BB49_125;
BB49_11:
setp.gt.s32 %p20, %r2, 5;
@%p20 bra BB49_15;
setp.eq.s32 %p24, %r2, 4;
@%p24 bra BB49_47;
bra.uni BB49_13;
BB49_47:
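// BB49_47: inlined single-precision pow: the result is exp(f2 * ln|f1|),
// evaluated with hi/lo extended-precision arithmetic and ex2.approx; the
// fix-ups that follow handle signs, zeros, infinities and NaNs. An
// identical copy with the operand roles swapped starts at BB49_109.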
mul.f32 %f88, %f2, 0f3F000000;
cvt.rzi.f32.f32 %f89, %f88;
fma.rn.f32 %f90, %f89, 0fC0000000, %f2;
abs.f32 %f19, %f90;
abs.f32 %f20, %f1;
setp.lt.f32 %p45, %f20, 0f00800000;
mul.f32 %f91, %f20, 0f4B800000;
selp.f32 %f92, 0fC3170000, 0fC2FE0000, %p45;
selp.f32 %f93, %f91, %f20, %p45;
mov.b32 %r14, %f93;
and.b32 %r15, %r14, 8388607;
or.b32 %r16, %r15, 1065353216;
mov.b32 %f94, %r16;
shr.u32 %r17, %r14, 23;
cvt.rn.f32.u32 %f95, %r17;
add.f32 %f96, %f92, %f95;
setp.gt.f32 %p46, %f94, 0f3FB504F3;
mul.f32 %f97, %f94, 0f3F000000;
add.f32 %f98, %f96, 0f3F800000;
selp.f32 %f99, %f97, %f94, %p46;
selp.f32 %f100, %f98, %f96, %p46;
add.f32 %f101, %f99, 0fBF800000;
add.f32 %f87, %f99, 0f3F800000;
// inline asm
rcp.approx.ftz.f32 %f86,%f87;
// inline asm
add.f32 %f102, %f101, %f101;
mul.f32 %f103, %f86, %f102;
mul.f32 %f104, %f103, %f103;
mov.f32 %f105, 0f3C4CAF63;
mov.f32 %f106, 0f3B18F0FE;
fma.rn.f32 %f107, %f106, %f104, %f105;
mov.f32 %f108, 0f3DAAAABD;
fma.rn.f32 %f109, %f107, %f104, %f108;
mul.rn.f32 %f110, %f109, %f104;
mul.rn.f32 %f111, %f110, %f103;
sub.f32 %f112, %f101, %f103;
neg.f32 %f113, %f103;
add.f32 %f114, %f112, %f112;
fma.rn.f32 %f115, %f113, %f101, %f114;
mul.rn.f32 %f116, %f86, %f115;
add.f32 %f117, %f111, %f103;
sub.f32 %f118, %f103, %f117;
add.f32 %f119, %f111, %f118;
add.f32 %f120, %f116, %f119;
add.f32 %f121, %f117, %f120;
sub.f32 %f122, %f117, %f121;
add.f32 %f123, %f120, %f122;
mov.f32 %f124, 0f3F317200;
mul.rn.f32 %f125, %f100, %f124;
mov.f32 %f126, 0f35BFBE8E;
mul.rn.f32 %f127, %f100, %f126;
add.f32 %f128, %f125, %f121;
sub.f32 %f129, %f125, %f128;
add.f32 %f130, %f121, %f129;
add.f32 %f131, %f123, %f130;
add.f32 %f132, %f127, %f131;
add.f32 %f133, %f128, %f132;
sub.f32 %f134, %f128, %f133;
add.f32 %f135, %f132, %f134;
abs.f32 %f21, %f2;
setp.gt.f32 %p47, %f21, 0f77F684DF;
mul.f32 %f136, %f2, 0f39000000;
selp.f32 %f137, %f136, %f2, %p47;
mul.rn.f32 %f138, %f137, %f133;
neg.f32 %f139, %f138;
fma.rn.f32 %f140, %f137, %f133, %f139;
fma.rn.f32 %f141, %f137, %f135, %f140;
mov.f32 %f142, 0f00000000;
fma.rn.f32 %f143, %f142, %f133, %f141;
add.rn.f32 %f144, %f138, %f143;
neg.f32 %f145, %f144;
add.rn.f32 %f146, %f138, %f145;
add.rn.f32 %f147, %f146, %f143;
mov.b32 %r18, %f144;
setp.eq.s32 %p48, %r18, 1118925336;
add.s32 %r19, %r18, -1;
mov.b32 %f148, %r19;
add.f32 %f149, %f147, 0f37000000;
selp.f32 %f150, %f148, %f144, %p48;
selp.f32 %f22, %f149, %f147, %p48;
mul.f32 %f151, %f150, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f152, %f151;
mov.f32 %f153, 0fBF317200;
fma.rn.f32 %f154, %f152, %f153, %f150;
mov.f32 %f155, 0fB5BFBE8E;
fma.rn.f32 %f156, %f152, %f155, %f154;
mul.f32 %f157, %f156, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f158, %f157;
add.f32 %f159, %f152, 0f00000000;
ex2.approx.f32 %f160, %f159;
mul.f32 %f161, %f158, %f160;
setp.lt.f32 %p49, %f150, 0fC2D20000;
selp.f32 %f162, 0f00000000, %f161, %p49;
setp.gt.f32 %p50, %f150, 0f42D20000;
selp.f32 %f259, 0f7F800000, %f162, %p50;
setp.eq.f32 %p51, %f259, 0f7F800000;
@%p51 bra BB49_49;
fma.rn.f32 %f259, %f259, %f22, %f259;
BB49_49:
setp.lt.f32 %p52, %f1, 0f00000000;
setp.eq.f32 %p53, %f19, 0f3F800000;
and.pred %p1, %p52, %p53;
mov.b32 %r20, %f259;
xor.b32 %r21, %r20, -2147483648;
mov.b32 %f163, %r21;
selp.f32 %f261, %f163, %f259, %p1;
setp.eq.f32 %p54, %f1, 0f00000000;
@%p54 bra BB49_52;
bra.uni BB49_50;
BB49_52:
add.f32 %f165, %f1, %f1;
mov.b32 %r22, %f165;
selp.b32 %r23, %r22, 0, %p53;
or.b32 %r24, %r23, 2139095040;
setp.lt.f32 %p58, %f2, 0f00000000;
selp.b32 %r25, %r24, %r23, %p58;
mov.b32 %f261, %r25;
bra.uni BB49_53;
BB49_28:
setp.gt.s32 %p7, %r2, 15;
@%p7 bra BB49_32;
setp.eq.s32 %p11, %r2, 14;
@%p11 bra BB49_41;
bra.uni BB49_30;
BB49_41:
cvt.rni.s64.f32 %rd8, %f1;
cvt.u32.u64 %r8, %rd8;
cvt.rni.s64.f32 %rd9, %f2;
cvt.u32.u64 %r9, %rd9;
or.b32 %r10, %r9, %r8;
setp.eq.s32 %p37, %r10, 0;
selp.f32 %f262, 0f00000000, 0f3F800000, %p37;
bra.uni BB49_63;
BB49_73:
setp.gt.s32 %p87, %r2, 5;
@%p87 bra BB49_77;
setp.eq.s32 %p91, %r2, 4;
@%p91 bra BB49_109;
bra.uni BB49_75;
BB49_109:
mul.f32 %f179, %f1, 0f3F000000;
cvt.rzi.f32.f32 %f180, %f179;
fma.rn.f32 %f181, %f180, 0fC0000000, %f1;
abs.f32 %f56, %f181;
abs.f32 %f57, %f2;
setp.lt.f32 %p112, %f57, 0f00800000;
mul.f32 %f182, %f57, 0f4B800000;
selp.f32 %f183, 0fC3170000, 0fC2FE0000, %p112;
selp.f32 %f184, %f182, %f57, %p112;
mov.b32 %r39, %f184;
and.b32 %r40, %r39, 8388607;
or.b32 %r41, %r40, 1065353216;
mov.b32 %f185, %r41;
shr.u32 %r42, %r39, 23;
cvt.rn.f32.u32 %f186, %r42;
add.f32 %f187, %f183, %f186;
setp.gt.f32 %p113, %f185, 0f3FB504F3;
mul.f32 %f188, %f185, 0f3F000000;
add.f32 %f189, %f187, 0f3F800000;
selp.f32 %f190, %f188, %f185, %p113;
selp.f32 %f191, %f189, %f187, %p113;
add.f32 %f192, %f190, 0fBF800000;
add.f32 %f178, %f190, 0f3F800000;
// inline asm
rcp.approx.ftz.f32 %f177,%f178;
// inline asm
add.f32 %f193, %f192, %f192;
mul.f32 %f194, %f177, %f193;
mul.f32 %f195, %f194, %f194;
mov.f32 %f196, 0f3C4CAF63;
mov.f32 %f197, 0f3B18F0FE;
fma.rn.f32 %f198, %f197, %f195, %f196;
mov.f32 %f199, 0f3DAAAABD;
fma.rn.f32 %f200, %f198, %f195, %f199;
mul.rn.f32 %f201, %f200, %f195;
mul.rn.f32 %f202, %f201, %f194;
sub.f32 %f203, %f192, %f194;
neg.f32 %f204, %f194;
add.f32 %f205, %f203, %f203;
fma.rn.f32 %f206, %f204, %f192, %f205;
mul.rn.f32 %f207, %f177, %f206;
add.f32 %f208, %f202, %f194;
sub.f32 %f209, %f194, %f208;
add.f32 %f210, %f202, %f209;
add.f32 %f211, %f207, %f210;
add.f32 %f212, %f208, %f211;
sub.f32 %f213, %f208, %f212;
add.f32 %f214, %f211, %f213;
mov.f32 %f215, 0f3F317200;
mul.rn.f32 %f216, %f191, %f215;
mov.f32 %f217, 0f35BFBE8E;
mul.rn.f32 %f218, %f191, %f217;
add.f32 %f219, %f216, %f212;
sub.f32 %f220, %f216, %f219;
add.f32 %f221, %f212, %f220;
add.f32 %f222, %f214, %f221;
add.f32 %f223, %f218, %f222;
add.f32 %f224, %f219, %f223;
sub.f32 %f225, %f219, %f224;
add.f32 %f226, %f223, %f225;
abs.f32 %f58, %f1;
setp.gt.f32 %p114, %f58, 0f77F684DF;
mul.f32 %f227, %f1, 0f39000000;
selp.f32 %f228, %f227, %f1, %p114;
mul.rn.f32 %f229, %f228, %f224;
neg.f32 %f230, %f229;
fma.rn.f32 %f231, %f228, %f224, %f230;
fma.rn.f32 %f232, %f228, %f226, %f231;
mov.f32 %f233, 0f00000000;
fma.rn.f32 %f234, %f233, %f224, %f232;
add.rn.f32 %f235, %f229, %f234;
neg.f32 %f236, %f235;
add.rn.f32 %f237, %f229, %f236;
add.rn.f32 %f238, %f237, %f234;
mov.b32 %r43, %f235;
setp.eq.s32 %p115, %r43, 1118925336;
add.s32 %r44, %r43, -1;
mov.b32 %f239, %r44;
add.f32 %f240, %f238, 0f37000000;
selp.f32 %f241, %f239, %f235, %p115;
selp.f32 %f59, %f240, %f238, %p115;
mul.f32 %f242, %f241, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f243, %f242;
mov.f32 %f244, 0fBF317200;
fma.rn.f32 %f245, %f243, %f244, %f241;
mov.f32 %f246, 0fB5BFBE8E;
fma.rn.f32 %f247, %f243, %f246, %f245;
mul.f32 %f248, %f247, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f249, %f248;
add.f32 %f250, %f243, 0f00000000;
ex2.approx.f32 %f251, %f250;
mul.f32 %f252, %f249, %f251;
setp.lt.f32 %p116, %f241, 0fC2D20000;
selp.f32 %f253, 0f00000000, %f252, %p116;
setp.gt.f32 %p117, %f241, 0f42D20000;
selp.f32 %f263, 0f7F800000, %f253, %p117;
setp.eq.f32 %p118, %f263, 0f7F800000;
@%p118 bra BB49_111;
fma.rn.f32 %f263, %f263, %f59, %f263;
BB49_111:
setp.lt.f32 %p119, %f2, 0f00000000;
setp.eq.f32 %p120, %f56, 0f3F800000;
and.pred %p2, %p119, %p120;
mov.b32 %r45, %f263;
xor.b32 %r46, %r45, -2147483648;
mov.b32 %f254, %r46;
selp.f32 %f265, %f254, %f263, %p2;
setp.eq.f32 %p121, %f2, 0f00000000;
@%p121 bra BB49_114;
bra.uni BB49_112;
BB49_114:
add.f32 %f256, %f2, %f2;
mov.b32 %r47, %f256;
selp.b32 %r48, %r47, 0, %p120;
or.b32 %r49, %r48, 2139095040;
setp.lt.f32 %p125, %f1, 0f00000000;
selp.b32 %r50, %r49, %r48, %p125;
mov.b32 %f265, %r50;
bra.uni BB49_115;
BB49_90:
setp.gt.s32 %p74, %r2, 15;
@%p74 bra BB49_94;
setp.eq.s32 %p78, %r2, 14;
@%p78 bra BB49_103;
bra.uni BB49_92;
BB49_103:
cvt.rni.s64.f32 %rd12, %f2;
cvt.u32.u64 %r33, %rd12;
cvt.rni.s64.f32 %rd13, %f1;
cvt.u32.u64 %r34, %rd13;
or.b32 %r35, %r34, %r33;
setp.eq.s32 %p104, %r35, 0;
selp.f32 %f266, 0f00000000, 0f3F800000, %p104;
bra.uni BB49_125;
BB49_8:
setp.eq.s32 %p27, %r2, 2;
@%p27 bra BB49_61;
bra.uni BB49_9;
BB49_61:
mul.f32 %f262, %f1, %f2;
bra.uni BB49_63;
BB49_24:
setp.eq.s32 %p14, %r2, 11;
@%p14 bra BB49_43;
setp.eq.s32 %p15, %r2, 12;
@%p15 bra BB49_42;
bra.uni BB49_26;
BB49_42:
max.f32 %f262, %f1, %f2;
bra.uni BB49_63;
BB49_15:
setp.eq.s32 %p21, %r2, 6;
@%p21 bra BB49_46;
setp.eq.s32 %p22, %r2, 7;
@%p22 bra BB49_45;
bra.uni BB49_17;
BB49_45:
setp.gt.f32 %p42, %f1, %f2;
selp.f32 %f262, 0f3F800000, 0f00000000, %p42;
bra.uni BB49_63;
BB49_32:
setp.eq.s32 %p8, %r2, 16;
@%p8 bra BB49_40;
setp.eq.s32 %p9, %r2, 17;
@%p9 bra BB49_37;
bra.uni BB49_34;
BB49_37:
setp.eq.f32 %p32, %f2, 0f00000000;
setp.eq.f32 %p33, %f2, 0f80000000;
or.pred %p34, %p32, %p33;
mov.f32 %f262, 0f7FC00000;
@%p34 bra BB49_63;
div.rn.f32 %f262, %f1, %f2;
abs.f32 %f80, %f262;
setp.geu.f32 %p35, %f80, 0f7F800000;
@%p35 bra BB49_63;
cvt.rmi.f32.f32 %f81, %f262;
mul.f32 %f82, %f2, %f81;
sub.f32 %f262, %f1, %f82;
bra.uni BB49_63;
BB49_70:
setp.eq.s32 %p94, %r2, 2;
@%p94 bra BB49_123;
bra.uni BB49_71;
BB49_123:
mul.f32 %f266, %f1, %f2;
bra.uni BB49_125;
BB49_86:
setp.eq.s32 %p81, %r2, 11;
@%p81 bra BB49_105;
setp.eq.s32 %p82, %r2, 12;
@%p82 bra BB49_104;
bra.uni BB49_88;
BB49_104:
max.f32 %f266, %f2, %f1;
bra.uni BB49_125;
BB49_77:
setp.eq.s32 %p88, %r2, 6;
@%p88 bra BB49_108;
setp.eq.s32 %p89, %r2, 7;
@%p89 bra BB49_107;
bra.uni BB49_79;
BB49_107:
setp.gt.f32 %p109, %f2, %f1;
selp.f32 %f266, 0f3F800000, 0f00000000, %p109;
bra.uni BB49_125;
BB49_94:
setp.eq.s32 %p75, %r2, 16;
@%p75 bra BB49_102;
setp.eq.s32 %p76, %r2, 17;
@%p76 bra BB49_99;
bra.uni BB49_96;
BB49_99:
setp.eq.f32 %p99, %f1, 0f00000000;
setp.eq.f32 %p100, %f1, 0f80000000;
or.pred %p101, %p99, %p100;
mov.f32 %f266, 0f7FC00000;
@%p101 bra BB49_125;
div.rn.f32 %f266, %f2, %f1;
abs.f32 %f171, %f266;
setp.geu.f32 %p102, %f171, 0f7F800000;
@%p102 bra BB49_125;
cvt.rmi.f32.f32 %f172, %f266;
mul.f32 %f173, %f1, %f172;
sub.f32 %f266, %f2, %f173;
bra.uni BB49_125;
BB49_6:
setp.eq.s32 %p30, %r2, 1;
@%p30 bra BB49_7;
bra.uni BB49_63;
BB49_7:
sub.f32 %f262, %f1, %f2;
bra.uni BB49_63;
BB49_22:
setp.eq.s32 %p18, %r2, 10;
@%p18 bra BB49_23;
bra.uni BB49_63;
BB49_23:
setp.neu.f32 %p39, %f1, %f2;
selp.f32 %f262, 0f3F800000, 0f00000000, %p39;
bra.uni BB49_63;
BB49_13:
setp.eq.s32 %p25, %r2, 5;
@%p25 bra BB49_14;
bra.uni BB49_63;
BB49_14:
setp.lt.f32 %p44, %f1, %f2;
selp.f32 %f262, 0f3F800000, 0f00000000, %p44;
bra.uni BB49_63;
BB49_30:
setp.eq.s32 %p12, %r2, 15;
@%p12 bra BB49_31;
bra.uni BB49_63;
BB49_31:
mul.f32 %f84, %f1, %f2;
mov.f32 %f85, 0f3F800000;
sub.f32 %f262, %f85, %f84;
bra.uni BB49_63;
BB49_9:
setp.eq.s32 %p28, %r2, 3;
@%p28 bra BB49_10;
bra.uni BB49_63;
BB49_10:
div.rn.f32 %f262, %f1, %f2;
bra.uni BB49_63;
BB49_43:
min.f32 %f262, %f1, %f2;
bra.uni BB49_63;
BB49_26:
setp.eq.s32 %p16, %r2, 13;
@%p16 bra BB49_27;
bra.uni BB49_63;
BB49_27:
cvt.rni.s64.f32 %rd10, %f1;
cvt.u32.u64 %r11, %rd10;
cvt.rni.s64.f32 %rd11, %f2;
cvt.u32.u64 %r12, %rd11;
and.b32 %r13, %r12, %r11;
setp.eq.s32 %p38, %r13, 0;
selp.f32 %f262, 0f00000000, 0f3F800000, %p38;
bra.uni BB49_63;
BB49_46:
setp.gtu.f32 %p43, %f1, %f2;
selp.f32 %f262, 0f00000000, 0f3F800000, %p43;
bra.uni BB49_63;
BB49_17:
setp.eq.s32 %p23, %r2, 8;
@%p23 bra BB49_18;
bra.uni BB49_63;
BB49_18:
setp.ltu.f32 %p41, %f1, %f2;
selp.f32 %f262, 0f00000000, 0f3F800000, %p41;
bra.uni BB49_63;
BB49_40:
setp.neu.f32 %p36, %f1, 0f00000000;
sub.f32 %f83, %f1, %f2;
selp.f32 %f262, %f83, 0f00000000, %p36;
bra.uni BB49_63;
BB49_34:
setp.ne.s32 %p10, %r2, 18;
@%p10 bra BB49_63;
div.rn.f32 %f262, %f1, %f2;
abs.f32 %f78, %f262;
setp.geu.f32 %p31, %f78, 0f7F800000;
@%p31 bra BB49_63;
cvt.rmi.f32.f32 %f262, %f262;
bra.uni BB49_63;
BB49_68:
setp.eq.s32 %p97, %r2, 1;
@%p97 bra BB49_69;
bra.uni BB49_125;
BB49_69:
sub.f32 %f266, %f2, %f1;
bra.uni BB49_125;
BB49_84:
setp.eq.s32 %p85, %r2, 10;
@%p85 bra BB49_85;
bra.uni BB49_125;
BB49_85:
setp.neu.f32 %p106, %f2, %f1;
selp.f32 %f266, 0f3F800000, 0f00000000, %p106;
bra.uni BB49_125;
BB49_75:
setp.eq.s32 %p92, %r2, 5;
@%p92 bra BB49_76;
bra.uni BB49_125;
BB49_76:
setp.lt.f32 %p111, %f2, %f1;
selp.f32 %f266, 0f3F800000, 0f00000000, %p111;
bra.uni BB49_125;
BB49_92:
setp.eq.s32 %p79, %r2, 15;
@%p79 bra BB49_93;
bra.uni BB49_125;
BB49_93:
mul.f32 %f175, %f1, %f2;
mov.f32 %f176, 0f3F800000;
sub.f32 %f266, %f176, %f175;
bra.uni BB49_125;
BB49_71:
setp.eq.s32 %p95, %r2, 3;
@%p95 bra BB49_72;
bra.uni BB49_125;
BB49_72:
div.rn.f32 %f266, %f2, %f1;
bra.uni BB49_125;
BB49_105:
min.f32 %f266, %f2, %f1;
bra.uni BB49_125;
BB49_88:
setp.eq.s32 %p83, %r2, 13;
@%p83 bra BB49_89;
bra.uni BB49_125;
BB49_89:
cvt.rni.s64.f32 %rd14, %f2;
cvt.u32.u64 %r36, %rd14;
cvt.rni.s64.f32 %rd15, %f1;
cvt.u32.u64 %r37, %rd15;
and.b32 %r38, %r37, %r36;
setp.eq.s32 %p105, %r38, 0;
selp.f32 %f266, 0f00000000, 0f3F800000, %p105;
bra.uni BB49_125;
BB49_108:
setp.gtu.f32 %p110, %f2, %f1;
selp.f32 %f266, 0f00000000, 0f3F800000, %p110;
bra.uni BB49_125;
BB49_79:
setp.eq.s32 %p90, %r2, 8;
@%p90 bra BB49_80;
bra.uni BB49_125;
BB49_80:
setp.ltu.f32 %p108, %f2, %f1;
selp.f32 %f266, 0f00000000, 0f3F800000, %p108;
bra.uni BB49_125;
BB49_102:
setp.neu.f32 %p103, %f2, 0f00000000;
sub.f32 %f174, %f2, %f1;
selp.f32 %f266, %f174, 0f00000000, %p103;
bra.uni BB49_125;
BB49_96:
setp.ne.s32 %p77, %r2, 18;
@%p77 bra BB49_125;
div.rn.f32 %f266, %f2, %f1;
abs.f32 %f169, %f266;
setp.geu.f32 %p98, %f169, 0f7F800000;
@%p98 bra BB49_125;
cvt.rmi.f32.f32 %f266, %f266;
bra.uni BB49_125;
BB49_50:
setp.geu.f32 %p55, %f1, 0f00000000;
@%p55 bra BB49_53;
cvt.rzi.f32.f32 %f164, %f2;
setp.neu.f32 %p56, %f164, %f2;
selp.f32 %f261, 0f7FFFFFFF, %f261, %p56;
BB49_53:
add.f32 %f166, %f20, %f21;
mov.b32 %r26, %f166;
setp.lt.s32 %p59, %r26, 2139095040;
@%p59 bra BB49_60;
setp.gtu.f32 %p60, %f20, 0f7F800000;
setp.gtu.f32 %p61, %f21, 0f7F800000;
or.pred %p62, %p60, %p61;
@%p62 bra BB49_59;
bra.uni BB49_55;
BB49_59:
add.f32 %f261, %f1, %f2;
bra.uni BB49_60;
BB49_55:
setp.eq.f32 %p63, %f21, 0f7F800000;
@%p63 bra BB49_58;
bra.uni BB49_56;
BB49_58:
setp.gt.f32 %p66, %f20, 0f3F800000;
selp.b32 %r30, 2139095040, 0, %p66;
xor.b32 %r31, %r30, 2139095040;
setp.lt.f32 %p67, %f2, 0f00000000;
selp.b32 %r32, %r31, %r30, %p67;
mov.b32 %f167, %r32;
setp.eq.f32 %p68, %f1, 0fBF800000;
selp.f32 %f261, 0f3F800000, %f167, %p68;
bra.uni BB49_60;
BB49_112:
setp.geu.f32 %p122, %f2, 0f00000000;
@%p122 bra BB49_115;
cvt.rzi.f32.f32 %f255, %f1;
setp.neu.f32 %p123, %f255, %f1;
selp.f32 %f265, 0f7FFFFFFF, %f265, %p123;
BB49_115:
add.f32 %f257, %f57, %f58;
mov.b32 %r51, %f257;
setp.lt.s32 %p126, %r51, 2139095040;
@%p126 bra BB49_122;
setp.gtu.f32 %p127, %f57, 0f7F800000;
setp.gtu.f32 %p128, %f58, 0f7F800000;
or.pred %p129, %p127, %p128;
@%p129 bra BB49_121;
bra.uni BB49_117;
BB49_121:
add.f32 %f265, %f1, %f2;
bra.uni BB49_122;
BB49_117:
setp.eq.f32 %p130, %f58, 0f7F800000;
@%p130 bra BB49_120;
bra.uni BB49_118;
BB49_120:
setp.gt.f32 %p133, %f57, 0f3F800000;
selp.b32 %r55, 2139095040, 0, %p133;
xor.b32 %r56, %r55, 2139095040;
setp.lt.f32 %p134, %f1, 0f00000000;
selp.b32 %r57, %r56, %r55, %p134;
mov.b32 %f258, %r57;
setp.eq.f32 %p135, %f2, 0fBF800000;
selp.f32 %f265, 0f3F800000, %f258, %p135;
bra.uni BB49_122;
BB49_56:
setp.neu.f32 %p64, %f20, 0f7F800000;
@%p64 bra BB49_60;
setp.ltu.f32 %p65, %f2, 0f00000000;
selp.b32 %r27, 0, 2139095040, %p65;
or.b32 %r28, %r27, -2147483648;
selp.b32 %r29, %r28, %r27, %p1;
mov.b32 %f261, %r29;
BB49_60:
setp.eq.f32 %p69, %f2, 0f00000000;
setp.eq.f32 %p70, %f1, 0f3F800000;
or.pred %p71, %p70, %p69;
selp.f32 %f262, 0f3F800000, %f261, %p71;
BB49_63:
st.global.f32 [%rd1], %f262;
bra.uni BB49_126;
BB49_118:
setp.neu.f32 %p131, %f57, 0f7F800000;
@%p131 bra BB49_122;
setp.ltu.f32 %p132, %f1, 0f00000000;
selp.b32 %r52, 0, 2139095040, %p132;
or.b32 %r53, %r52, -2147483648;
selp.b32 %r54, %r53, %r52, %p2;
mov.b32 %f265, %r54;
BB49_122:
setp.eq.f32 %p136, %f1, 0f00000000;
setp.eq.f32 %p137, %f2, 0f3F800000;
or.pred %p138, %p137, %p136;
selp.f32 %f266, 0f3F800000, %f265, %p138;
BB49_125:
st.global.f32 [%rd1], %f266;
BB49_126:
bar.sync 0;
ret;
}
// .globl fill_d
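// fill_d: writes the scalar param_1 into every element of the
// length-param_2 double array at param_0 (one element per thread, guarded
// by the bound check).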
.visible .entry fill_d(
.param .u64 fill_d_param_0,
.param .f64 fill_d_param_1,
.param .u32 fill_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<5>;
ld.param.u64 %rd1, [fill_d_param_0];
ld.param.f64 %fd1, [fill_d_param_1];
ld.param.u32 %r2, [fill_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB50_2;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
st.global.f64 [%rd4], %fd1;
BB50_2:
ret;
}
// .globl fill_f
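// fill_f: single-precision variant of fill_d; the scalar still arrives as
// a double and is rounded to float with cvt.rn.f32.f64 before the store.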
.visible .entry fill_f(
.param .u64 fill_f_param_0,
.param .f64 fill_f_param_1,
.param .u32 fill_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<2>;
.reg .b64 %rd<5>;
ld.param.u64 %rd1, [fill_f_param_0];
ld.param.f64 %fd1, [fill_f_param_1];
ld.param.u32 %r2, [fill_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra BB51_2;
cvt.rn.f32.f64 %f1, %fd1;
cvta.to.global.u64 %rd2, %rd1;
mul.wide.s32 %rd3, %r1, 4;
add.s64 %rd4, %rd2, %rd3;
st.global.f32 [%rd4], %f1;
BB51_2:
ret;
}
// .globl cbind_d
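// cbind_d: column-binds two row-major double matrices A (param_3 x param_4)
// and B (param_5 x param_6) into C = [A | B] at param_2. Each thread
// handles one (row, col) pair derived from its global id:
//
//   row = tid / max(colsA, colsB);  col = tid % max(colsA, colsB);
//   if (row < rowsA && col < colsA)
//     C[row*(colsA+colsB) + col]         = A[row*colsA + col];
//   if (row < rowsB && col < colsB)
//     C[row*(colsA+colsB) + colsA + col] = B[row*colsB + col];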
.visible .entry cbind_d(
.param .u64 cbind_d_param_0,
.param .u64 cbind_d_param_1,
.param .u64 cbind_d_param_2,
.param .u32 cbind_d_param_3,
.param .u32 cbind_d_param_4,
.param .u32 cbind_d_param_5,
.param .u32 cbind_d_param_6
)
{
.reg .pred %p<7>;
.reg .b32 %r<18>;
.reg .f64 %fd<3>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [cbind_d_param_0];
ld.param.u64 %rd3, [cbind_d_param_1];
ld.param.u64 %rd4, [cbind_d_param_2];
ld.param.u32 %r7, [cbind_d_param_3];
ld.param.u32 %r4, [cbind_d_param_4];
ld.param.u32 %r5, [cbind_d_param_5];
ld.param.u32 %r6, [cbind_d_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %ctaid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r11, %r8, %r9, %r10;
max.s32 %r12, %r4, %r6;
div.s32 %r1, %r11, %r12;
rem.s32 %r2, %r11, %r12;
add.s32 %r3, %r6, %r4;
setp.ge.s32 %p1, %r1, %r7;
setp.ge.s32 %p2, %r2, %r4;
or.pred %p3, %p1, %p2;
@%p3 bra BB52_2;
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r13, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r13, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
mad.lo.s32 %r14, %r1, %r3, %r2;
mul.wide.s32 %rd8, %r14, 8;
add.s64 %rd9, %rd1, %rd8;
st.global.f64 [%rd9], %fd1;
BB52_2:
setp.ge.s32 %p4, %r1, %r5;
setp.ge.s32 %p5, %r2, %r6;
or.pred %p6, %p4, %p5;
@%p6 bra BB52_4;
cvta.to.global.u64 %rd10, %rd3;
mad.lo.s32 %r15, %r1, %r6, %r2;
mul.wide.s32 %rd11, %r15, 8;
add.s64 %rd12, %rd10, %rd11;
ld.global.f64 %fd2, [%rd12];
add.s32 %r16, %r2, %r4;
mad.lo.s32 %r17, %r1, %r3, %r16;
mul.wide.s32 %rd13, %r17, 8;
add.s64 %rd14, %rd1, %rd13;
st.global.f64 [%rd14], %fd2;
BB52_4:
ret;
}
// .globl cbind_f
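// cbind_f: single-precision variant of cbind_d (4-byte element stride).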
.visible .entry cbind_f(
.param .u64 cbind_f_param_0,
.param .u64 cbind_f_param_1,
.param .u64 cbind_f_param_2,
.param .u32 cbind_f_param_3,
.param .u32 cbind_f_param_4,
.param .u32 cbind_f_param_5,
.param .u32 cbind_f_param_6
)
{
.reg .pred %p<7>;
.reg .f32 %f<3>;
.reg .b32 %r<18>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [cbind_f_param_0];
ld.param.u64 %rd3, [cbind_f_param_1];
ld.param.u64 %rd4, [cbind_f_param_2];
ld.param.u32 %r7, [cbind_f_param_3];
ld.param.u32 %r4, [cbind_f_param_4];
ld.param.u32 %r5, [cbind_f_param_5];
ld.param.u32 %r6, [cbind_f_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %ctaid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r11, %r8, %r9, %r10;
max.s32 %r12, %r4, %r6;
div.s32 %r1, %r11, %r12;
rem.s32 %r2, %r11, %r12;
add.s32 %r3, %r6, %r4;
setp.ge.s32 %p1, %r1, %r7;
setp.ge.s32 %p2, %r2, %r4;
or.pred %p3, %p1, %p2;
@%p3 bra BB53_2;
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r13, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r13, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
mad.lo.s32 %r14, %r1, %r3, %r2;
mul.wide.s32 %rd8, %r14, 4;
add.s64 %rd9, %rd1, %rd8;
st.global.f32 [%rd9], %f1;
BB53_2:
setp.ge.s32 %p4, %r1, %r5;
setp.ge.s32 %p5, %r2, %r6;
or.pred %p6, %p4, %p5;
@%p6 bra BB53_4;
cvta.to.global.u64 %rd10, %rd3;
mad.lo.s32 %r15, %r1, %r6, %r2;
mul.wide.s32 %rd11, %r15, 4;
add.s64 %rd12, %rd10, %rd11;
ld.global.f32 %f2, [%rd12];
add.s32 %r16, %r2, %r4;
mad.lo.s32 %r17, %r1, %r3, %r16;
mul.wide.s32 %rd13, %r17, 4;
add.s64 %rd14, %rd1, %rd13;
st.global.f32 [%rd14], %f2;
BB53_4:
ret;
}
// .globl rbind_d
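// rbind_d: row-binds two row-major double matrices into C = [A ; B]: A's
// rows are copied through unchanged and B's rows land at row offset
// param_3 (rowsA). Both stores use A's column count (param_4) as the
// output row stride, so the kernel evidently expects both inputs to have
// the same width.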
.visible .entry rbind_d(
.param .u64 rbind_d_param_0,
.param .u64 rbind_d_param_1,
.param .u64 rbind_d_param_2,
.param .u32 rbind_d_param_3,
.param .u32 rbind_d_param_4,
.param .u32 rbind_d_param_5,
.param .u32 rbind_d_param_6
)
{
.reg .pred %p<7>;
.reg .b32 %r<16>;
.reg .f64 %fd<3>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [rbind_d_param_0];
ld.param.u64 %rd3, [rbind_d_param_1];
ld.param.u64 %rd4, [rbind_d_param_2];
ld.param.u32 %r3, [rbind_d_param_3];
ld.param.u32 %r4, [rbind_d_param_4];
ld.param.u32 %r5, [rbind_d_param_5];
ld.param.u32 %r6, [rbind_d_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r10, %r7, %r8, %r9;
max.s32 %r11, %r4, %r6;
div.s32 %r1, %r10, %r11;
rem.s32 %r2, %r10, %r11;
setp.ge.s32 %p1, %r1, %r3;
setp.ge.s32 %p2, %r2, %r4;
or.pred %p3, %p1, %p2;
@%p3 bra BB54_2;
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r12, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r12, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
add.s64 %rd8, %rd1, %rd6;
st.global.f64 [%rd8], %fd1;
BB54_2:
setp.ge.s32 %p4, %r1, %r5;
setp.ge.s32 %p5, %r2, %r6;
or.pred %p6, %p4, %p5;
@%p6 bra BB54_4;
cvta.to.global.u64 %rd9, %rd3;
mad.lo.s32 %r13, %r1, %r6, %r2;
mul.wide.s32 %rd10, %r13, 8;
add.s64 %rd11, %rd9, %rd10;
ld.global.f64 %fd2, [%rd11];
add.s32 %r14, %r1, %r3;
mad.lo.s32 %r15, %r14, %r4, %r2;
mul.wide.s32 %rd12, %r15, 8;
add.s64 %rd13, %rd1, %rd12;
st.global.f64 [%rd13], %fd2;
BB54_4:
ret;
}
// .globl rbind_f
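// rbind_f: single-precision variant of rbind_d.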
.visible .entry rbind_f(
.param .u64 rbind_f_param_0,
.param .u64 rbind_f_param_1,
.param .u64 rbind_f_param_2,
.param .u32 rbind_f_param_3,
.param .u32 rbind_f_param_4,
.param .u32 rbind_f_param_5,
.param .u32 rbind_f_param_6
)
{
.reg .pred %p<7>;
.reg .f32 %f<3>;
.reg .b32 %r<16>;
.reg .b64 %rd<14>;
ld.param.u64 %rd2, [rbind_f_param_0];
ld.param.u64 %rd3, [rbind_f_param_1];
ld.param.u64 %rd4, [rbind_f_param_2];
ld.param.u32 %r3, [rbind_f_param_3];
ld.param.u32 %r4, [rbind_f_param_4];
ld.param.u32 %r5, [rbind_f_param_5];
ld.param.u32 %r6, [rbind_f_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r10, %r7, %r8, %r9;
max.s32 %r11, %r4, %r6;
div.s32 %r1, %r10, %r11;
rem.s32 %r2, %r10, %r11;
setp.ge.s32 %p1, %r1, %r3;
setp.ge.s32 %p2, %r2, %r4;
or.pred %p3, %p1, %p2;
@%p3 bra BB55_2;
cvta.to.global.u64 %rd5, %rd2;
mad.lo.s32 %r12, %r1, %r4, %r2;
mul.wide.s32 %rd6, %r12, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
add.s64 %rd8, %rd1, %rd6;
st.global.f32 [%rd8], %f1;
BB55_2:
setp.ge.s32 %p4, %r1, %r5;
setp.ge.s32 %p5, %r2, %r6;
or.pred %p6, %p4, %p5;
@%p6 bra BB55_4;
cvta.to.global.u64 %rd9, %rd3;
mad.lo.s32 %r13, %r1, %r6, %r2;
mul.wide.s32 %rd10, %r13, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f2, [%rd11];
add.s32 %r14, %r1, %r3;
mad.lo.s32 %r15, %r14, %r4, %r2;
mul.wide.s32 %rd12, %r15, 4;
add.s64 %rd13, %rd1, %rd12;
st.global.f32 [%rd13], %f2;
BB55_4:
ret;
}
// .globl reduce_sum_d
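// reduce_sum_d: block-level sum reduction over a double array. Each block
// accumulates a grid-strided slice (two loads per iteration), reduces it
// through the dynamic shared buffer "memory" with a halving tree, finishes
// the last 64 elements warp-synchronously through volatile accesses, and
// thread 0 writes one partial sum per block; the host presumably
// re-launches on the partials until one value remains. A plausible
// CUDA-level reconstruction (identifier names are assumptions; the PTX
// below is the ground truth):
//
//   extern __shared__ double sdata[];
//   __global__ void reduce_sum_d(double *g_in, double *g_out, unsigned n) {
//     unsigned tid = threadIdx.x;
//     unsigned i = blockIdx.x * blockDim.x * 2 + tid;
//     double v = 0.0;
//     while (i < n) {                      // grid-stride loop, 2 adds/iter
//       v += g_in[i];
//       if (i + blockDim.x < n) v += g_in[i + blockDim.x];
//       i += blockDim.x * 2 * gridDim.x;
//     }
//     sdata[tid] = v; __syncthreads();
//     if (blockDim.x >= 1024) { if (tid < 512) sdata[tid] = v += sdata[tid + 512]; __syncthreads(); }
//     if (blockDim.x >=  512) { if (tid < 256) sdata[tid] = v += sdata[tid + 256]; __syncthreads(); }
//     if (blockDim.x >=  256) { if (tid < 128) sdata[tid] = v += sdata[tid + 128]; __syncthreads(); }
//     if (blockDim.x >=  128) { if (tid <  64) sdata[tid] = v += sdata[tid +  64]; __syncthreads(); }
//     if (tid < 32) {                      // warp-synchronous tail
//       volatile double *vs = sdata;
//       if (blockDim.x >= 64) vs[tid] = v += vs[tid + 32];
//       if (blockDim.x >= 32) vs[tid] = v += vs[tid + 16];
//       if (blockDim.x >= 16) vs[tid] = v += vs[tid +  8];
//       if (blockDim.x >=  8) vs[tid] = v += vs[tid +  4];
//       if (blockDim.x >=  4) vs[tid] = v += vs[tid +  2];
//       if (blockDim.x >=  2) vs[tid] = v += vs[tid +  1];
//     }
//     if (tid == 0) g_out[blockIdx.x] = sdata[0];
//   }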
.visible .entry reduce_sum_d(
.param .u64 reduce_sum_d_param_0,
.param .u64 reduce_sum_d_param_1,
.param .u32 reduce_sum_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<36>;
.reg .f64 %fd<60>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_sum_d_param_0];
ld.param.u64 %rd2, [reduce_sum_d_param_1];
ld.param.u32 %r6, [reduce_sum_d_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f64 %fd44, 0d0000000000000000;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB56_4;
BB56_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
add.f64 %fd44, %fd44, %fd30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB56_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
add.f64 %fd44, %fd44, %fd31;
BB56_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB56_1;
BB56_4:
shl.b32 %r16, %r7, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB56_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB56_7;
ld.shared.f64 %fd32, [%r5+4096];
add.f64 %fd44, %fd44, %fd32;
st.shared.f64 [%r5], %fd44;
BB56_7:
bar.sync 0;
BB56_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB56_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB56_11;
ld.shared.f64 %fd33, [%r5+2048];
add.f64 %fd44, %fd44, %fd33;
st.shared.f64 [%r5], %fd44;
BB56_11:
bar.sync 0;
BB56_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB56_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB56_15;
ld.shared.f64 %fd34, [%r5+1024];
add.f64 %fd44, %fd44, %fd34;
st.shared.f64 [%r5], %fd44;
BB56_15:
bar.sync 0;
BB56_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB56_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB56_19;
ld.shared.f64 %fd35, [%r5+512];
add.f64 %fd44, %fd44, %fd35;
st.shared.f64 [%r5], %fd44;
BB56_19:
bar.sync 0;
BB56_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB56_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB56_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
add.f64 %fd44, %fd44, %fd36;
st.volatile.shared.f64 [%r5], %fd44;
BB56_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB56_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
add.f64 %fd44, %fd44, %fd37;
st.volatile.shared.f64 [%r5], %fd44;
BB56_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB56_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
add.f64 %fd44, %fd44, %fd38;
st.volatile.shared.f64 [%r5], %fd44;
BB56_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB56_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
add.f64 %fd44, %fd44, %fd39;
st.volatile.shared.f64 [%r5], %fd44;
BB56_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB56_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
add.f64 %fd44, %fd44, %fd40;
st.volatile.shared.f64 [%r5], %fd44;
BB56_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB56_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
add.f64 %fd42, %fd44, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
BB56_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB56_35;
ld.shared.f64 %fd43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 8;
add.s64 %rd11, %rd9, %rd10;
st.global.f64 [%rd11], %fd43;
BB56_35:
ret;
}
// .globl reduce_sum_f
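// reduce_sum_f: single-precision variant of reduce_sum_d (same tree and
// warp-tail shape; float element size, so all shared-memory offsets are
// halved).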
.visible .entry reduce_sum_f(
.param .u64 reduce_sum_f_param_0,
.param .u64 reduce_sum_f_param_1,
.param .u32 reduce_sum_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<60>;
.reg .b32 %r<36>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_sum_f_param_0];
ld.param.u64 %rd2, [reduce_sum_f_param_1];
ld.param.u32 %r6, [reduce_sum_f_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f32 %f44, 0f00000000;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB57_4;
BB57_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
add.f32 %f44, %f44, %f30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB57_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
add.f32 %f44, %f44, %f31;
BB57_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB57_1;
BB57_4:
shl.b32 %r16, %r7, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB57_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB57_7;
ld.shared.f32 %f32, [%r5+2048];
add.f32 %f44, %f44, %f32;
st.shared.f32 [%r5], %f44;
BB57_7:
bar.sync 0;
BB57_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB57_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB57_11;
ld.shared.f32 %f33, [%r5+1024];
add.f32 %f44, %f44, %f33;
st.shared.f32 [%r5], %f44;
BB57_11:
bar.sync 0;
BB57_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB57_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB57_15;
ld.shared.f32 %f34, [%r5+512];
add.f32 %f44, %f44, %f34;
st.shared.f32 [%r5], %f44;
BB57_15:
bar.sync 0;
BB57_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB57_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB57_19;
ld.shared.f32 %f35, [%r5+256];
add.f32 %f44, %f44, %f35;
st.shared.f32 [%r5], %f44;
BB57_19:
bar.sync 0;
BB57_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB57_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB57_23;
ld.volatile.shared.f32 %f36, [%r5+128];
add.f32 %f44, %f44, %f36;
st.volatile.shared.f32 [%r5], %f44;
BB57_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB57_25;
ld.volatile.shared.f32 %f37, [%r5+64];
add.f32 %f44, %f44, %f37;
st.volatile.shared.f32 [%r5], %f44;
BB57_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB57_27;
ld.volatile.shared.f32 %f38, [%r5+32];
add.f32 %f44, %f44, %f38;
st.volatile.shared.f32 [%r5], %f44;
BB57_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB57_29;
ld.volatile.shared.f32 %f39, [%r5+16];
add.f32 %f44, %f44, %f39;
st.volatile.shared.f32 [%r5], %f44;
BB57_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB57_31;
ld.volatile.shared.f32 %f40, [%r5+8];
add.f32 %f44, %f44, %f40;
st.volatile.shared.f32 [%r5], %f44;
BB57_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB57_33;
ld.volatile.shared.f32 %f41, [%r5+4];
add.f32 %f42, %f44, %f41;
st.volatile.shared.f32 [%r5], %f42;
BB57_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB57_35;
ld.shared.f32 %f43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 4;
add.s64 %rd11, %rd9, %rd10;
st.global.f32 [%rd11], %f43;
BB57_35:
ret;
}
// .globl reduce_row_sum_d
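// reduce_row_sum_d: one thread block per row of a row-major rows x cols
// double matrix (param_2 x param_3). Threads stride across the row's
// columns accumulating privately, then combine through the same
// shared-memory tree and warp-synchronous tail as reduce_sum_d; thread 0
// writes the row total to out[row].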
.visible .entry reduce_row_sum_d(
.param .u64 reduce_row_sum_d_param_0,
.param .u64 reduce_row_sum_d_param_1,
.param .u32 reduce_row_sum_d_param_2,
.param .u32 reduce_row_sum_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_sum_d_param_0];
ld.param.u64 %rd2, [reduce_row_sum_d_param_1];
ld.param.u32 %r5, [reduce_row_sum_d_param_2];
ld.param.u32 %r4, [reduce_row_sum_d_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB58_35;
mov.u32 %r71, %tid.x;
mov.f64 %fd6, 0d0000000000000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB58_4;
cvta.to.global.u64 %rd3, %rd1;
BB58_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
add.f64 %fd6, %fd6, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB58_3;
BB58_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB58_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB58_7;
ld.shared.f64 %fd29, [%r13+4096];
add.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB58_7:
bar.sync 0;
BB58_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB58_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB58_11;
ld.shared.f64 %fd30, [%r13+2048];
add.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB58_11:
bar.sync 0;
BB58_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB58_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB58_15;
ld.shared.f64 %fd31, [%r13+1024];
add.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB58_15:
bar.sync 0;
BB58_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB58_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB58_19;
ld.shared.f64 %fd32, [%r13+512];
add.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB58_19:
bar.sync 0;
BB58_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB58_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB58_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
add.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB58_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB58_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
add.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB58_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB58_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
add.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB58_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB58_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
add.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB58_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB58_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
add.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB58_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB58_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
add.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
BB58_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB58_35;
ld.shared.f64 %fd40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd40;
BB58_35:
ret;
}
// .globl reduce_row_sum_f
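// reduce_row_sum_f: single-precision variant of reduce_row_sum_d.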
.visible .entry reduce_row_sum_f(
.param .u64 reduce_row_sum_f_param_0,
.param .u64 reduce_row_sum_f_param_1,
.param .u32 reduce_row_sum_f_param_2,
.param .u32 reduce_row_sum_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<56>;
.reg .b32 %r<72>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_sum_f_param_0];
ld.param.u64 %rd2, [reduce_row_sum_f_param_1];
ld.param.u32 %r5, [reduce_row_sum_f_param_2];
ld.param.u32 %r4, [reduce_row_sum_f_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB59_35;
mov.u32 %r71, %tid.x;
mov.f32 %f6, 0f00000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB59_4;
cvta.to.global.u64 %rd3, %rd1;
BB59_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
add.f32 %f6, %f6, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB59_3;
BB59_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 2;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f32 [%r13], %f6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB59_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB59_7;
ld.shared.f32 %f29, [%r13+2048];
add.f32 %f6, %f6, %f29;
st.shared.f32 [%r13], %f6;
BB59_7:
bar.sync 0;
BB59_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB59_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB59_11;
ld.shared.f32 %f30, [%r13+1024];
add.f32 %f6, %f6, %f30;
st.shared.f32 [%r13], %f6;
BB59_11:
bar.sync 0;
BB59_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB59_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB59_15;
ld.shared.f32 %f31, [%r13+512];
add.f32 %f6, %f6, %f31;
st.shared.f32 [%r13], %f6;
BB59_15:
bar.sync 0;
BB59_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB59_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB59_19;
ld.shared.f32 %f32, [%r13+256];
add.f32 %f6, %f6, %f32;
st.shared.f32 [%r13], %f6;
BB59_19:
bar.sync 0;
BB59_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB59_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB59_23;
ld.volatile.shared.f32 %f33, [%r13+128];
add.f32 %f6, %f6, %f33;
st.volatile.shared.f32 [%r13], %f6;
BB59_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB59_25;
ld.volatile.shared.f32 %f34, [%r13+64];
add.f32 %f6, %f6, %f34;
st.volatile.shared.f32 [%r13], %f6;
BB59_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB59_27;
ld.volatile.shared.f32 %f35, [%r13+32];
add.f32 %f6, %f6, %f35;
st.volatile.shared.f32 [%r13], %f6;
BB59_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB59_29;
ld.volatile.shared.f32 %f36, [%r13+16];
add.f32 %f6, %f6, %f36;
st.volatile.shared.f32 [%r13], %f6;
BB59_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB59_31;
ld.volatile.shared.f32 %f37, [%r13+8];
add.f32 %f6, %f6, %f37;
st.volatile.shared.f32 [%r13], %f6;
BB59_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB59_33;
ld.volatile.shared.f32 %f38, [%r13+4];
add.f32 %f39, %f6, %f38;
st.volatile.shared.f32 [%r13], %f39;
BB59_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB59_35;
ld.shared.f32 %f40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f40;
BB59_35:
ret;
}
// .globl reduce_col_sum_d
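// reduce_col_sum_d: one thread per column. Starting at element index col,
// each thread walks down its column of the rows x cols matrix with stride
// cols (param_3), summing sequentially with no shared memory, then writes
// out[col]. Columns of a row-major matrix are not contiguous, but
// neighbouring threads touch adjacent elements of the same row on each
// iteration, so the accesses still coalesce.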
.visible .entry reduce_col_sum_d(
.param .u64 reduce_col_sum_d_param_0,
.param .u64 reduce_col_sum_d_param_1,
.param .u32 reduce_col_sum_d_param_2,
.param .u32 reduce_col_sum_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<9>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_sum_d_param_0];
ld.param.u64 %rd3, [reduce_col_sum_d_param_1];
ld.param.u32 %r5, [reduce_col_sum_d_param_2];
ld.param.u32 %r6, [reduce_col_sum_d_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB60_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f64 %fd8, 0d0000000000000000;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB60_4;
mov.u32 %r10, %r1;
BB60_3:
mul.wide.u32 %rd4, %r10, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
add.f64 %fd8, %fd8, %fd6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB60_3;
BB60_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB60_5:
ret;
}
// .globl reduce_col_sum_f
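// reduce_col_sum_f: single-precision variant of reduce_col_sum_d.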
.visible .entry reduce_col_sum_f(
.param .u64 reduce_col_sum_f_param_0,
.param .u64 reduce_col_sum_f_param_1,
.param .u32 reduce_col_sum_f_param_2,
.param .u32 reduce_col_sum_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<9>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_sum_f_param_0];
ld.param.u64 %rd3, [reduce_col_sum_f_param_1];
ld.param.u32 %r5, [reduce_col_sum_f_param_2];
ld.param.u32 %r6, [reduce_col_sum_f_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB61_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f32 %f8, 0f00000000;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB61_4;
mov.u32 %r10, %r1;
BB61_3:
mul.wide.u32 %rd4, %r10, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
add.f32 %f8, %f8, %f6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB61_3;
BB61_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f8;
BB61_5:
ret;
}
// .globl reduce_max_d
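// reduce_max_d: identical skeleton to reduce_sum_d with add.f64 replaced
// by max.f64 and the accumulator seeded with 0dFFEFFFFFFFFFFFFF (-DBL_MAX)
// as the identity.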
.visible .entry reduce_max_d(
.param .u64 reduce_max_d_param_0,
.param .u64 reduce_max_d_param_1,
.param .u32 reduce_max_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<36>;
.reg .f64 %fd<60>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_max_d_param_0];
ld.param.u64 %rd2, [reduce_max_d_param_1];
ld.param.u32 %r6, [reduce_max_d_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f64 %fd44, 0dFFEFFFFFFFFFFFFF;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB62_4;
BB62_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
max.f64 %fd44, %fd44, %fd30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB62_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
max.f64 %fd44, %fd44, %fd31;
BB62_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB62_1;
BB62_4:
shl.b32 %r16, %r7, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB62_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB62_7;
ld.shared.f64 %fd32, [%r5+4096];
max.f64 %fd44, %fd44, %fd32;
st.shared.f64 [%r5], %fd44;
BB62_7:
bar.sync 0;
BB62_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB62_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB62_11;
ld.shared.f64 %fd33, [%r5+2048];
max.f64 %fd44, %fd44, %fd33;
st.shared.f64 [%r5], %fd44;
BB62_11:
bar.sync 0;
BB62_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB62_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB62_15;
ld.shared.f64 %fd34, [%r5+1024];
max.f64 %fd44, %fd44, %fd34;
st.shared.f64 [%r5], %fd44;
BB62_15:
bar.sync 0;
BB62_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB62_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB62_19;
ld.shared.f64 %fd35, [%r5+512];
max.f64 %fd44, %fd44, %fd35;
st.shared.f64 [%r5], %fd44;
BB62_19:
bar.sync 0;
BB62_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB62_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB62_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
max.f64 %fd44, %fd44, %fd36;
st.volatile.shared.f64 [%r5], %fd44;
BB62_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB62_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
max.f64 %fd44, %fd44, %fd37;
st.volatile.shared.f64 [%r5], %fd44;
BB62_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB62_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
max.f64 %fd44, %fd44, %fd38;
st.volatile.shared.f64 [%r5], %fd44;
BB62_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB62_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
max.f64 %fd44, %fd44, %fd39;
st.volatile.shared.f64 [%r5], %fd44;
BB62_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB62_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
max.f64 %fd44, %fd44, %fd40;
st.volatile.shared.f64 [%r5], %fd44;
BB62_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB62_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
max.f64 %fd42, %fd44, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
BB62_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB62_35;
ld.shared.f64 %fd43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 8;
add.s64 %rd11, %rd9, %rd10;
st.global.f64 [%rd11], %fd43;
BB62_35:
ret;
}
// .globl reduce_max_f
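// reduce_max_f: single-precision variant of reduce_max_d; the identity is
// 0fFF7FFFFF (-FLT_MAX).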
.visible .entry reduce_max_f(
.param .u64 reduce_max_f_param_0,
.param .u64 reduce_max_f_param_1,
.param .u32 reduce_max_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<60>;
.reg .b32 %r<36>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_max_f_param_0];
ld.param.u64 %rd2, [reduce_max_f_param_1];
ld.param.u32 %r6, [reduce_max_f_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f32 %f44, 0fFF7FFFFF;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB63_4;
BB63_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
max.f32 %f44, %f44, %f30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB63_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
max.f32 %f44, %f44, %f31;
BB63_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB63_1;
BB63_4:
shl.b32 %r16, %r7, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB63_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB63_7;
ld.shared.f32 %f32, [%r5+2048];
max.f32 %f44, %f44, %f32;
st.shared.f32 [%r5], %f44;
BB63_7:
bar.sync 0;
BB63_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB63_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB63_11;
ld.shared.f32 %f33, [%r5+1024];
max.f32 %f44, %f44, %f33;
st.shared.f32 [%r5], %f44;
BB63_11:
bar.sync 0;
BB63_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB63_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB63_15;
ld.shared.f32 %f34, [%r5+512];
max.f32 %f44, %f44, %f34;
st.shared.f32 [%r5], %f44;
BB63_15:
bar.sync 0;
BB63_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB63_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB63_19;
ld.shared.f32 %f35, [%r5+256];
max.f32 %f44, %f44, %f35;
st.shared.f32 [%r5], %f44;
BB63_19:
bar.sync 0;
BB63_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB63_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB63_23;
ld.volatile.shared.f32 %f36, [%r5+128];
max.f32 %f44, %f44, %f36;
st.volatile.shared.f32 [%r5], %f44;
BB63_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB63_25;
ld.volatile.shared.f32 %f37, [%r5+64];
max.f32 %f44, %f44, %f37;
st.volatile.shared.f32 [%r5], %f44;
BB63_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB63_27;
ld.volatile.shared.f32 %f38, [%r5+32];
max.f32 %f44, %f44, %f38;
st.volatile.shared.f32 [%r5], %f44;
BB63_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB63_29;
ld.volatile.shared.f32 %f39, [%r5+16];
max.f32 %f44, %f44, %f39;
st.volatile.shared.f32 [%r5], %f44;
BB63_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB63_31;
ld.volatile.shared.f32 %f40, [%r5+8];
max.f32 %f44, %f44, %f40;
st.volatile.shared.f32 [%r5], %f44;
BB63_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB63_33;
ld.volatile.shared.f32 %f41, [%r5+4];
max.f32 %f42, %f44, %f41;
st.volatile.shared.f32 [%r5], %f42;
BB63_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB63_35;
ld.shared.f32 %f43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 4;
add.s64 %rd11, %rd9, %rd10;
st.global.f32 [%rd11], %f43;
BB63_35:
ret;
}
// .globl reduce_row_max_d
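// reduce_row_max_d: per-row max reduction; same block-per-row shape as
// reduce_row_sum_d with the accumulator seeded to -DBL_MAX and add.f64
// replaced by max.f64.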
.visible .entry reduce_row_max_d(
.param .u64 reduce_row_max_d_param_0,
.param .u64 reduce_row_max_d_param_1,
.param .u32 reduce_row_max_d_param_2,
.param .u32 reduce_row_max_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_max_d_param_0];
ld.param.u64 %rd2, [reduce_row_max_d_param_1];
ld.param.u32 %r5, [reduce_row_max_d_param_2];
ld.param.u32 %r4, [reduce_row_max_d_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB64_35;
mov.u32 %r71, %tid.x;
mov.f64 %fd6, 0dFFEFFFFFFFFFFFFF;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB64_4;
cvta.to.global.u64 %rd3, %rd1;
BB64_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
max.f64 %fd6, %fd6, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB64_3;
BB64_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB64_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB64_7;
ld.shared.f64 %fd29, [%r13+4096];
max.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB64_7:
bar.sync 0;
BB64_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB64_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB64_11;
ld.shared.f64 %fd30, [%r13+2048];
max.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB64_11:
bar.sync 0;
BB64_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB64_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB64_15;
ld.shared.f64 %fd31, [%r13+1024];
max.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB64_15:
bar.sync 0;
BB64_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB64_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB64_19;
ld.shared.f64 %fd32, [%r13+512];
max.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB64_19:
bar.sync 0;
BB64_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB64_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB64_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
max.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB64_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB64_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
max.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB64_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB64_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
max.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB64_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB64_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
max.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB64_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB64_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
max.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB64_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB64_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
max.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
BB64_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB64_35;
ld.shared.f64 %fd40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd40;
BB64_35:
ret;
}
// .globl reduce_row_max_f
.visible .entry reduce_row_max_f(
.param .u64 reduce_row_max_f_param_0,
.param .u64 reduce_row_max_f_param_1,
.param .u32 reduce_row_max_f_param_2,
.param .u32 reduce_row_max_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<56>;
.reg .b32 %r<72>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_max_f_param_0];
ld.param.u64 %rd2, [reduce_row_max_f_param_1];
ld.param.u32 %r5, [reduce_row_max_f_param_2];
ld.param.u32 %r4, [reduce_row_max_f_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB65_35;
mov.u32 %r71, %tid.x;
mov.f32 %f6, 0fFF7FFFFF;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB65_4;
cvta.to.global.u64 %rd3, %rd1;
BB65_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
max.f32 %f6, %f6, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB65_3;
BB65_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 2;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f32 [%r13], %f6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB65_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB65_7;
ld.shared.f32 %f29, [%r13+2048];
max.f32 %f6, %f6, %f29;
st.shared.f32 [%r13], %f6;
BB65_7:
bar.sync 0;
BB65_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB65_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB65_11;
ld.shared.f32 %f30, [%r13+1024];
max.f32 %f6, %f6, %f30;
st.shared.f32 [%r13], %f6;
BB65_11:
bar.sync 0;
BB65_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB65_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB65_15;
ld.shared.f32 %f31, [%r13+512];
max.f32 %f6, %f6, %f31;
st.shared.f32 [%r13], %f6;
BB65_15:
bar.sync 0;
BB65_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB65_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB65_19;
ld.shared.f32 %f32, [%r13+256];
max.f32 %f6, %f6, %f32;
st.shared.f32 [%r13], %f6;
BB65_19:
bar.sync 0;
BB65_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB65_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB65_23;
ld.volatile.shared.f32 %f33, [%r13+128];
max.f32 %f6, %f6, %f33;
st.volatile.shared.f32 [%r13], %f6;
BB65_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB65_25;
ld.volatile.shared.f32 %f34, [%r13+64];
max.f32 %f6, %f6, %f34;
st.volatile.shared.f32 [%r13], %f6;
BB65_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB65_27;
ld.volatile.shared.f32 %f35, [%r13+32];
max.f32 %f6, %f6, %f35;
st.volatile.shared.f32 [%r13], %f6;
BB65_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB65_29;
ld.volatile.shared.f32 %f36, [%r13+16];
max.f32 %f6, %f6, %f36;
st.volatile.shared.f32 [%r13], %f6;
BB65_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB65_31;
ld.volatile.shared.f32 %f37, [%r13+8];
max.f32 %f6, %f6, %f37;
st.volatile.shared.f32 [%r13], %f6;
BB65_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB65_33;
ld.volatile.shared.f32 %f38, [%r13+4];
max.f32 %f39, %f6, %f38;
st.volatile.shared.f32 [%r13], %f39;
BB65_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB65_35;
ld.shared.f32 %f40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f40;
BB65_35:
ret;
}
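//
// Note: reduce_row_max_d/_f above assign one thread block per matrix row
// (blockIdx.x = row, exiting early when row >= rows). Threads stride across
// the row's columns, then fold partials with the same shared-memory tree and
// volatile warp tail as reduce_max_f. A hedged CUDA sketch, assuming a
// row-major dense input and illustrative identifier names:
//
//   extern __shared__ float srow[];            // hypothetical name
//   __global__ void reduce_row_max_f(float *in, float *out,
//                                    unsigned int rows, unsigned int cols) {
//     unsigned int row = blockIdx.x;
//     if (row >= rows) return;
//     float v = -FLT_MAX;
//     for (unsigned int j = threadIdx.x; j < cols; j += blockDim.x)
//       v = fmaxf(v, in[row * cols + j]);      // mad.lo %r6*%r4+%r71 above
//     srow[threadIdx.x] = v;
//     __syncthreads();
//     // ... identical tree + volatile warp reduction as in reduce_max_f ...
//     if (threadIdx.x == 0) out[row] = srow[0];
//   }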
// .globl reduce_col_max_d
.visible .entry reduce_col_max_d(
.param .u64 reduce_col_max_d_param_0,
.param .u64 reduce_col_max_d_param_1,
.param .u32 reduce_col_max_d_param_2,
.param .u32 reduce_col_max_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<9>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_max_d_param_0];
ld.param.u64 %rd3, [reduce_col_max_d_param_1];
ld.param.u32 %r5, [reduce_col_max_d_param_2];
ld.param.u32 %r6, [reduce_col_max_d_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB66_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB66_4;
mov.u32 %r10, %r1;
BB66_3:
mul.wide.u32 %rd4, %r10, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
max.f64 %fd8, %fd8, %fd6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB66_3;
BB66_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB66_5:
ret;
}
// .globl reduce_col_max_f
.visible .entry reduce_col_max_f(
.param .u64 reduce_col_max_f_param_0,
.param .u64 reduce_col_max_f_param_1,
.param .u32 reduce_col_max_f_param_2,
.param .u32 reduce_col_max_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<9>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_max_f_param_0];
ld.param.u64 %rd3, [reduce_col_max_f_param_1];
ld.param.u32 %r5, [reduce_col_max_f_param_2];
ld.param.u32 %r6, [reduce_col_max_f_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB67_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f32 %f8, 0fFF7FFFFF;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB67_4;
mov.u32 %r10, %r1;
BB67_3:
mul.wide.u32 %rd4, %r10, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
max.f32 %f8, %f8, %f6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB67_3;
BB67_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f8;
BB67_5:
ret;
}
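//
// Note: reduce_col_max_d/_f need no shared memory at all: each thread owns
// one output column and walks down it with stride `cols`, so the body is a
// single strided loop. A minimal sketch under the same row-major assumption
// (identifier names are illustrative):
//
//   __global__ void reduce_col_max_f(float *in, float *out,
//                                    unsigned int rows, unsigned int cols) {
//     unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
//     if (j >= cols) return;
//     float v = -FLT_MAX;
//     for (unsigned int i = j; i < rows * cols; i += cols)  // BB67_3 loop
//       v = fmaxf(v, in[i]);
//     out[j] = v;
//   }
//
// Consecutive threads read consecutive addresses on every trip through the
// loop, so the global loads stay coalesced even though each thread scans a
// whole column.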
// .globl reduce_min_d
.visible .entry reduce_min_d(
.param .u64 reduce_min_d_param_0,
.param .u64 reduce_min_d_param_1,
.param .u32 reduce_min_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<36>;
.reg .f64 %fd<60>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_min_d_param_0];
ld.param.u64 %rd2, [reduce_min_d_param_1];
ld.param.u32 %r6, [reduce_min_d_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f64 %fd44, 0d7FEFFFFFFFFFFFFF;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB68_4;
BB68_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
min.f64 %fd44, %fd44, %fd30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB68_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
min.f64 %fd44, %fd44, %fd31;
BB68_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB68_1;
BB68_4:
shl.b32 %r16, %r7, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB68_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB68_7;
ld.shared.f64 %fd32, [%r5+4096];
min.f64 %fd44, %fd44, %fd32;
st.shared.f64 [%r5], %fd44;
BB68_7:
bar.sync 0;
BB68_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB68_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB68_11;
ld.shared.f64 %fd33, [%r5+2048];
min.f64 %fd44, %fd44, %fd33;
st.shared.f64 [%r5], %fd44;
BB68_11:
bar.sync 0;
BB68_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB68_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB68_15;
ld.shared.f64 %fd34, [%r5+1024];
min.f64 %fd44, %fd44, %fd34;
st.shared.f64 [%r5], %fd44;
BB68_15:
bar.sync 0;
BB68_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB68_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB68_19;
ld.shared.f64 %fd35, [%r5+512];
min.f64 %fd44, %fd44, %fd35;
st.shared.f64 [%r5], %fd44;
BB68_19:
bar.sync 0;
BB68_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB68_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB68_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
min.f64 %fd44, %fd44, %fd36;
st.volatile.shared.f64 [%r5], %fd44;
BB68_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB68_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
min.f64 %fd44, %fd44, %fd37;
st.volatile.shared.f64 [%r5], %fd44;
BB68_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB68_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
min.f64 %fd44, %fd44, %fd38;
st.volatile.shared.f64 [%r5], %fd44;
BB68_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB68_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
min.f64 %fd44, %fd44, %fd39;
st.volatile.shared.f64 [%r5], %fd44;
BB68_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB68_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
min.f64 %fd44, %fd44, %fd40;
st.volatile.shared.f64 [%r5], %fd44;
BB68_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB68_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
min.f64 %fd42, %fd44, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
BB68_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB68_35;
ld.shared.f64 %fd43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 8;
add.s64 %rd11, %rd9, %rd10;
st.global.f64 [%rd11], %fd43;
BB68_35:
ret;
}
// .globl reduce_min_f
.visible .entry reduce_min_f(
.param .u64 reduce_min_f_param_0,
.param .u64 reduce_min_f_param_1,
.param .u32 reduce_min_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<60>;
.reg .b32 %r<36>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_min_f_param_0];
ld.param.u64 %rd2, [reduce_min_f_param_1];
ld.param.u32 %r6, [reduce_min_f_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f32 %f44, 0f7F7FFFFF;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB69_4;
BB69_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
min.f32 %f44, %f44, %f30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB69_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
min.f32 %f44, %f44, %f31;
BB69_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB69_1;
BB69_4:
shl.b32 %r16, %r7, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB69_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB69_7;
ld.shared.f32 %f32, [%r5+2048];
min.f32 %f44, %f44, %f32;
st.shared.f32 [%r5], %f44;
BB69_7:
bar.sync 0;
BB69_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB69_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB69_11;
ld.shared.f32 %f33, [%r5+1024];
min.f32 %f44, %f44, %f33;
st.shared.f32 [%r5], %f44;
BB69_11:
bar.sync 0;
BB69_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB69_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB69_15;
ld.shared.f32 %f34, [%r5+512];
min.f32 %f44, %f44, %f34;
st.shared.f32 [%r5], %f44;
BB69_15:
bar.sync 0;
BB69_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB69_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB69_19;
ld.shared.f32 %f35, [%r5+256];
min.f32 %f44, %f44, %f35;
st.shared.f32 [%r5], %f44;
BB69_19:
bar.sync 0;
BB69_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB69_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB69_23;
ld.volatile.shared.f32 %f36, [%r5+128];
min.f32 %f44, %f44, %f36;
st.volatile.shared.f32 [%r5], %f44;
BB69_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB69_25;
ld.volatile.shared.f32 %f37, [%r5+64];
min.f32 %f44, %f44, %f37;
st.volatile.shared.f32 [%r5], %f44;
BB69_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB69_27;
ld.volatile.shared.f32 %f38, [%r5+32];
min.f32 %f44, %f44, %f38;
st.volatile.shared.f32 [%r5], %f44;
BB69_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB69_29;
ld.volatile.shared.f32 %f39, [%r5+16];
min.f32 %f44, %f44, %f39;
st.volatile.shared.f32 [%r5], %f44;
BB69_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB69_31;
ld.volatile.shared.f32 %f40, [%r5+8];
min.f32 %f44, %f44, %f40;
st.volatile.shared.f32 [%r5], %f44;
BB69_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB69_33;
ld.volatile.shared.f32 %f41, [%r5+4];
min.f32 %f42, %f44, %f41;
st.volatile.shared.f32 [%r5], %f42;
BB69_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB69_35;
ld.shared.f32 %f43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 4;
add.s64 %rd11, %rd9, %rd10;
st.global.f32 [%rd11], %f43;
BB69_35:
ret;
}
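//
// Note: reduce_min_d/_f above reuse the reduce_max_f skeleton unchanged;
// only the combiner (min.f64/min.f32) and the identity differ: +DBL_MAX
// (0d7FEFFFFFFFFFFFFF) and +FLT_MAX (0f7F7FFFFF) instead of the negative
// extremes used for max.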
// .globl reduce_row_min_d
.visible .entry reduce_row_min_d(
.param .u64 reduce_row_min_d_param_0,
.param .u64 reduce_row_min_d_param_1,
.param .u32 reduce_row_min_d_param_2,
.param .u32 reduce_row_min_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_min_d_param_0];
ld.param.u64 %rd2, [reduce_row_min_d_param_1];
ld.param.u32 %r5, [reduce_row_min_d_param_2];
ld.param.u32 %r4, [reduce_row_min_d_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB70_35;
mov.u32 %r71, %tid.x;
mov.f64 %fd6, 0d7FEFFFFFFFFFFFFF;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB70_4;
cvta.to.global.u64 %rd3, %rd1;
BB70_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
min.f64 %fd6, %fd6, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB70_3;
BB70_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB70_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB70_7;
ld.shared.f64 %fd29, [%r13+4096];
min.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB70_7:
bar.sync 0;
BB70_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB70_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB70_11;
ld.shared.f64 %fd30, [%r13+2048];
min.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB70_11:
bar.sync 0;
BB70_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB70_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB70_15;
ld.shared.f64 %fd31, [%r13+1024];
min.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB70_15:
bar.sync 0;
BB70_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB70_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB70_19;
ld.shared.f64 %fd32, [%r13+512];
min.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB70_19:
bar.sync 0;
BB70_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB70_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB70_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
min.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB70_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB70_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
min.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB70_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB70_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
min.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB70_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB70_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
min.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB70_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB70_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
min.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB70_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB70_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
min.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
BB70_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB70_35;
ld.shared.f64 %fd40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd40;
BB70_35:
ret;
}
// .globl reduce_row_min_f
.visible .entry reduce_row_min_f(
.param .u64 reduce_row_min_f_param_0,
.param .u64 reduce_row_min_f_param_1,
.param .u32 reduce_row_min_f_param_2,
.param .u32 reduce_row_min_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<56>;
.reg .b32 %r<72>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [reduce_row_min_f_param_0];
ld.param.u64 %rd2, [reduce_row_min_f_param_1];
ld.param.u32 %r5, [reduce_row_min_f_param_2];
ld.param.u32 %r4, [reduce_row_min_f_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB71_35;
mov.u32 %r71, %tid.x;
mov.f32 %f6, 0f7F7FFFFF;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB71_4;
cvta.to.global.u64 %rd3, %rd1;
BB71_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
min.f32 %f6, %f6, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB71_3;
BB71_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 2;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f32 [%r13], %f6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB71_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB71_7;
ld.shared.f32 %f29, [%r13+2048];
min.f32 %f6, %f6, %f29;
st.shared.f32 [%r13], %f6;
BB71_7:
bar.sync 0;
BB71_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB71_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB71_11;
ld.shared.f32 %f30, [%r13+1024];
min.f32 %f6, %f6, %f30;
st.shared.f32 [%r13], %f6;
BB71_11:
bar.sync 0;
BB71_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB71_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB71_15;
ld.shared.f32 %f31, [%r13+512];
min.f32 %f6, %f6, %f31;
st.shared.f32 [%r13], %f6;
BB71_15:
bar.sync 0;
BB71_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB71_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB71_19;
ld.shared.f32 %f32, [%r13+256];
min.f32 %f6, %f6, %f32;
st.shared.f32 [%r13], %f6;
BB71_19:
bar.sync 0;
BB71_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB71_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB71_23;
ld.volatile.shared.f32 %f33, [%r13+128];
min.f32 %f6, %f6, %f33;
st.volatile.shared.f32 [%r13], %f6;
BB71_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB71_25;
ld.volatile.shared.f32 %f34, [%r13+64];
min.f32 %f6, %f6, %f34;
st.volatile.shared.f32 [%r13], %f6;
BB71_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB71_27;
ld.volatile.shared.f32 %f35, [%r13+32];
min.f32 %f6, %f6, %f35;
st.volatile.shared.f32 [%r13], %f6;
BB71_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB71_29;
ld.volatile.shared.f32 %f36, [%r13+16];
min.f32 %f6, %f6, %f36;
st.volatile.shared.f32 [%r13], %f6;
BB71_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB71_31;
ld.volatile.shared.f32 %f37, [%r13+8];
min.f32 %f6, %f6, %f37;
st.volatile.shared.f32 [%r13], %f6;
BB71_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB71_33;
ld.volatile.shared.f32 %f38, [%r13+4];
min.f32 %f39, %f6, %f38;
st.volatile.shared.f32 [%r13], %f39;
BB71_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB71_35;
ld.shared.f32 %f40, [memory];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r6, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f40;
BB71_35:
ret;
}
// .globl reduce_col_min_d
.visible .entry reduce_col_min_d(
.param .u64 reduce_col_min_d_param_0,
.param .u64 reduce_col_min_d_param_1,
.param .u32 reduce_col_min_d_param_2,
.param .u32 reduce_col_min_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<9>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_min_d_param_0];
ld.param.u64 %rd3, [reduce_col_min_d_param_1];
ld.param.u32 %r5, [reduce_col_min_d_param_2];
ld.param.u32 %r6, [reduce_col_min_d_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB72_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB72_4;
mov.u32 %r10, %r1;
BB72_3:
mul.wide.u32 %rd4, %r10, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
min.f64 %fd8, %fd8, %fd6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB72_3;
BB72_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd8;
BB72_5:
ret;
}
// .globl reduce_col_min_f
.visible .entry reduce_col_min_f(
.param .u64 reduce_col_min_f_param_0,
.param .u64 reduce_col_min_f_param_1,
.param .u32 reduce_col_min_f_param_2,
.param .u32 reduce_col_min_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<9>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [reduce_col_min_f_param_0];
ld.param.u64 %rd3, [reduce_col_min_f_param_1];
ld.param.u32 %r5, [reduce_col_min_f_param_2];
ld.param.u32 %r6, [reduce_col_min_f_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB73_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f32 %f8, 0f7F7FFFFF;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB73_4;
mov.u32 %r10, %r1;
BB73_3:
mul.wide.u32 %rd4, %r10, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
min.f32 %f8, %f8, %f6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB73_3;
BB73_4:
cvta.to.global.u64 %rd6, %rd3;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f8;
BB73_5:
ret;
}
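//
// Note: reduce_row_min_d/_f and reduce_col_min_d/_f mirror the row/column
// max kernels above instruction for instruction, swapping in min.f64/min.f32
// and the positive largest-finite identities.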
// .globl reduce_prod_d
.visible .entry reduce_prod_d(
.param .u64 reduce_prod_d_param_0,
.param .u64 reduce_prod_d_param_1,
.param .u32 reduce_prod_d_param_2
)
{
.reg .pred %p<20>;
.reg .b32 %r<36>;
.reg .f64 %fd<60>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_prod_d_param_0];
ld.param.u64 %rd2, [reduce_prod_d_param_1];
ld.param.u32 %r6, [reduce_prod_d_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f64 %fd44, 0d3FF0000000000000;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB74_4;
BB74_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
mul.f64 %fd44, %fd44, %fd30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB74_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd31, [%rd8];
mul.f64 %fd44, %fd44, %fd31;
BB74_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB74_1;
BB74_4:
shl.b32 %r16, %r7, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB74_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB74_7;
ld.shared.f64 %fd32, [%r5+4096];
mul.f64 %fd44, %fd44, %fd32;
st.shared.f64 [%r5], %fd44;
BB74_7:
bar.sync 0;
BB74_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB74_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB74_11;
ld.shared.f64 %fd33, [%r5+2048];
mul.f64 %fd44, %fd44, %fd33;
st.shared.f64 [%r5], %fd44;
BB74_11:
bar.sync 0;
BB74_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB74_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB74_15;
ld.shared.f64 %fd34, [%r5+1024];
mul.f64 %fd44, %fd44, %fd34;
st.shared.f64 [%r5], %fd44;
BB74_15:
bar.sync 0;
BB74_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB74_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB74_19;
ld.shared.f64 %fd35, [%r5+512];
mul.f64 %fd44, %fd44, %fd35;
st.shared.f64 [%r5], %fd44;
BB74_19:
bar.sync 0;
BB74_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB74_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB74_23;
ld.volatile.shared.f64 %fd36, [%r5+256];
mul.f64 %fd44, %fd44, %fd36;
st.volatile.shared.f64 [%r5], %fd44;
BB74_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB74_25;
ld.volatile.shared.f64 %fd37, [%r5+128];
mul.f64 %fd44, %fd44, %fd37;
st.volatile.shared.f64 [%r5], %fd44;
BB74_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB74_27;
ld.volatile.shared.f64 %fd38, [%r5+64];
mul.f64 %fd44, %fd44, %fd38;
st.volatile.shared.f64 [%r5], %fd44;
BB74_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB74_29;
ld.volatile.shared.f64 %fd39, [%r5+32];
mul.f64 %fd44, %fd44, %fd39;
st.volatile.shared.f64 [%r5], %fd44;
BB74_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB74_31;
ld.volatile.shared.f64 %fd40, [%r5+16];
mul.f64 %fd44, %fd44, %fd40;
st.volatile.shared.f64 [%r5], %fd44;
BB74_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB74_33;
ld.volatile.shared.f64 %fd41, [%r5+8];
mul.f64 %fd42, %fd44, %fd41;
st.volatile.shared.f64 [%r5], %fd42;
BB74_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB74_35;
ld.shared.f64 %fd43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 8;
add.s64 %rd11, %rd9, %rd10;
st.global.f64 [%rd11], %fd43;
BB74_35:
ret;
}
// .globl reduce_prod_f
.visible .entry reduce_prod_f(
.param .u64 reduce_prod_f_param_0,
.param .u64 reduce_prod_f_param_1,
.param .u32 reduce_prod_f_param_2
)
{
.reg .pred %p<20>;
.reg .f32 %f<60>;
.reg .b32 %r<36>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [reduce_prod_f_param_0];
ld.param.u64 %rd2, [reduce_prod_f_param_1];
ld.param.u32 %r6, [reduce_prod_f_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f32 %f44, 0f3F800000;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB75_4;
BB75_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
mul.f32 %f44, %f44, %f30;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p2, %r3, %r6;
@%p2 bra BB75_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f31, [%rd8];
mul.f32 %f44, %f44, %f31;
BB75_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p3, %r35, %r6;
@%p3 bra BB75_1;
BB75_4:
shl.b32 %r16, %r7, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f44;
bar.sync 0;
setp.lt.u32 %p4, %r10, 1024;
@%p4 bra BB75_8;
setp.gt.u32 %p5, %r7, 511;
@%p5 bra BB75_7;
ld.shared.f32 %f32, [%r5+2048];
mul.f32 %f44, %f44, %f32;
st.shared.f32 [%r5], %f44;
BB75_7:
bar.sync 0;
BB75_8:
setp.lt.u32 %p6, %r10, 512;
@%p6 bra BB75_12;
setp.gt.u32 %p7, %r7, 255;
@%p7 bra BB75_11;
ld.shared.f32 %f33, [%r5+1024];
mul.f32 %f44, %f44, %f33;
st.shared.f32 [%r5], %f44;
BB75_11:
bar.sync 0;
BB75_12:
setp.lt.u32 %p8, %r10, 256;
@%p8 bra BB75_16;
setp.gt.u32 %p9, %r7, 127;
@%p9 bra BB75_15;
ld.shared.f32 %f34, [%r5+512];
mul.f32 %f44, %f44, %f34;
st.shared.f32 [%r5], %f44;
BB75_15:
bar.sync 0;
BB75_16:
setp.lt.u32 %p10, %r10, 128;
@%p10 bra BB75_20;
setp.gt.u32 %p11, %r7, 63;
@%p11 bra BB75_19;
ld.shared.f32 %f35, [%r5+256];
mul.f32 %f44, %f44, %f35;
st.shared.f32 [%r5], %f44;
BB75_19:
bar.sync 0;
BB75_20:
setp.gt.u32 %p12, %r7, 31;
@%p12 bra BB75_33;
setp.lt.u32 %p13, %r10, 64;
@%p13 bra BB75_23;
ld.volatile.shared.f32 %f36, [%r5+128];
mul.f32 %f44, %f44, %f36;
st.volatile.shared.f32 [%r5], %f44;
BB75_23:
setp.lt.u32 %p14, %r10, 32;
@%p14 bra BB75_25;
ld.volatile.shared.f32 %f37, [%r5+64];
mul.f32 %f44, %f44, %f37;
st.volatile.shared.f32 [%r5], %f44;
BB75_25:
setp.lt.u32 %p15, %r10, 16;
@%p15 bra BB75_27;
ld.volatile.shared.f32 %f38, [%r5+32];
mul.f32 %f44, %f44, %f38;
st.volatile.shared.f32 [%r5], %f44;
BB75_27:
setp.lt.u32 %p16, %r10, 8;
@%p16 bra BB75_29;
ld.volatile.shared.f32 %f39, [%r5+16];
mul.f32 %f44, %f44, %f39;
st.volatile.shared.f32 [%r5], %f44;
BB75_29:
setp.lt.u32 %p17, %r10, 4;
@%p17 bra BB75_31;
ld.volatile.shared.f32 %f40, [%r5+8];
mul.f32 %f44, %f44, %f40;
st.volatile.shared.f32 [%r5], %f44;
BB75_31:
setp.lt.u32 %p18, %r10, 2;
@%p18 bra BB75_33;
ld.volatile.shared.f32 %f41, [%r5+4];
mul.f32 %f42, %f44, %f41;
st.volatile.shared.f32 [%r5], %f42;
BB75_33:
setp.ne.s32 %p19, %r7, 0;
@%p19 bra BB75_35;
ld.shared.f32 %f43, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 4;
add.s64 %rd11, %rd9, %rd10;
st.global.f32 [%rd11], %f43;
BB75_35:
ret;
}
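//
// Note: reduce_prod_d/_f follow the reduce_max_f skeleton with multiplication
// as the combiner and 1.0 (0d3FF0000000000000 / 0f3F800000) as the identity.
// Since floating-point multiplication is not associative, the exact result
// can vary with the launch configuration, which changes how the partial
// products are grouped.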
// .globl reduce_row_mean_d
.visible .entry reduce_row_mean_d(
.param .u64 reduce_row_mean_d_param_0,
.param .u64 reduce_row_mean_d_param_1,
.param .u32 reduce_row_mean_d_param_2,
.param .u32 reduce_row_mean_d_param_3
)
{
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<58>;
.reg .b64 %rd<10>;
ld.param.u64 %rd1, [reduce_row_mean_d_param_0];
ld.param.u64 %rd2, [reduce_row_mean_d_param_1];
ld.param.u32 %r5, [reduce_row_mean_d_param_2];
ld.param.u32 %r4, [reduce_row_mean_d_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB76_35;
mov.u32 %r71, %tid.x;
mov.f64 %fd6, 0d0000000000000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB76_4;
cvta.to.global.u64 %rd3, %rd1;
BB76_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd28, [%rd5];
add.f64 %fd6, %fd6, %fd28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB76_3;
BB76_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 3;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f64 [%r13], %fd6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB76_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB76_7;
ld.shared.f64 %fd29, [%r13+4096];
add.f64 %fd6, %fd6, %fd29;
st.shared.f64 [%r13], %fd6;
BB76_7:
bar.sync 0;
BB76_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB76_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB76_11;
ld.shared.f64 %fd30, [%r13+2048];
add.f64 %fd6, %fd6, %fd30;
st.shared.f64 [%r13], %fd6;
BB76_11:
bar.sync 0;
BB76_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB76_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB76_15;
ld.shared.f64 %fd31, [%r13+1024];
add.f64 %fd6, %fd6, %fd31;
st.shared.f64 [%r13], %fd6;
BB76_15:
bar.sync 0;
BB76_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB76_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB76_19;
ld.shared.f64 %fd32, [%r13+512];
add.f64 %fd6, %fd6, %fd32;
st.shared.f64 [%r13], %fd6;
BB76_19:
bar.sync 0;
BB76_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB76_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB76_23;
ld.volatile.shared.f64 %fd33, [%r13+256];
add.f64 %fd6, %fd6, %fd33;
st.volatile.shared.f64 [%r13], %fd6;
BB76_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB76_25;
ld.volatile.shared.f64 %fd34, [%r13+128];
add.f64 %fd6, %fd6, %fd34;
st.volatile.shared.f64 [%r13], %fd6;
BB76_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB76_27;
ld.volatile.shared.f64 %fd35, [%r13+64];
add.f64 %fd6, %fd6, %fd35;
st.volatile.shared.f64 [%r13], %fd6;
BB76_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB76_29;
ld.volatile.shared.f64 %fd36, [%r13+32];
add.f64 %fd6, %fd6, %fd36;
st.volatile.shared.f64 [%r13], %fd6;
BB76_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB76_31;
ld.volatile.shared.f64 %fd37, [%r13+16];
add.f64 %fd6, %fd6, %fd37;
st.volatile.shared.f64 [%r13], %fd6;
BB76_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB76_33;
ld.volatile.shared.f64 %fd38, [%r13+8];
add.f64 %fd39, %fd6, %fd38;
st.volatile.shared.f64 [%r13], %fd39;
BB76_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB76_35;
ld.shared.f64 %fd40, [memory];
cvt.u64.u32 %rd6, %r4;
cvt.rn.f64.s64 %fd41, %rd6;
div.rn.f64 %fd42, %fd40, %fd41;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.u32 %rd8, %r6, 8;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd42;
BB76_35:
ret;
}
// .globl reduce_row_mean_f
.visible .entry reduce_row_mean_f(
.param .u64 reduce_row_mean_f_param_0,
.param .u64 reduce_row_mean_f_param_1,
.param .u32 reduce_row_mean_f_param_2,
.param .u32 reduce_row_mean_f_param_3
)
{
.reg .pred %p<20>;
.reg .f32 %f<58>;
.reg .b32 %r<72>;
.reg .b64 %rd<10>;
ld.param.u64 %rd1, [reduce_row_mean_f_param_0];
ld.param.u64 %rd2, [reduce_row_mean_f_param_1];
ld.param.u32 %r5, [reduce_row_mean_f_param_2];
ld.param.u32 %r4, [reduce_row_mean_f_param_3];
mov.u32 %r6, %ctaid.x;
setp.ge.u32 %p1, %r6, %r5;
@%p1 bra BB77_35;
mov.u32 %r71, %tid.x;
mov.f32 %f6, 0f00000000;
setp.ge.u32 %p2, %r71, %r4;
@%p2 bra BB77_4;
cvta.to.global.u64 %rd3, %rd1;
BB77_3:
mad.lo.s32 %r8, %r6, %r4, %r71;
mul.wide.u32 %rd4, %r8, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f28, [%rd5];
add.f32 %f6, %f6, %f28;
mov.u32 %r9, %ntid.x;
add.s32 %r71, %r9, %r71;
setp.lt.u32 %p3, %r71, %r4;
@%p3 bra BB77_3;
BB77_4:
mov.u32 %r10, %tid.x;
shl.b32 %r11, %r10, 2;
mov.u32 %r12, memory;
add.s32 %r13, %r12, %r11;
st.shared.f32 [%r13], %f6;
bar.sync 0;
mov.u32 %r14, %ntid.x;
setp.lt.u32 %p4, %r14, 1024;
@%p4 bra BB77_8;
setp.gt.u32 %p5, %r10, 511;
@%p5 bra BB77_7;
ld.shared.f32 %f29, [%r13+2048];
add.f32 %f6, %f6, %f29;
st.shared.f32 [%r13], %f6;
BB77_7:
bar.sync 0;
BB77_8:
setp.lt.u32 %p6, %r14, 512;
@%p6 bra BB77_12;
setp.gt.u32 %p7, %r10, 255;
@%p7 bra BB77_11;
ld.shared.f32 %f30, [%r13+1024];
add.f32 %f6, %f6, %f30;
st.shared.f32 [%r13], %f6;
BB77_11:
bar.sync 0;
BB77_12:
setp.lt.u32 %p8, %r14, 256;
@%p8 bra BB77_16;
setp.gt.u32 %p9, %r10, 127;
@%p9 bra BB77_15;
ld.shared.f32 %f31, [%r13+512];
add.f32 %f6, %f6, %f31;
st.shared.f32 [%r13], %f6;
BB77_15:
bar.sync 0;
BB77_16:
setp.lt.u32 %p10, %r14, 128;
@%p10 bra BB77_20;
setp.gt.u32 %p11, %r10, 63;
@%p11 bra BB77_19;
ld.shared.f32 %f32, [%r13+256];
add.f32 %f6, %f6, %f32;
st.shared.f32 [%r13], %f6;
BB77_19:
bar.sync 0;
BB77_20:
setp.gt.u32 %p12, %r10, 31;
@%p12 bra BB77_33;
setp.lt.u32 %p13, %r14, 64;
@%p13 bra BB77_23;
ld.volatile.shared.f32 %f33, [%r13+128];
add.f32 %f6, %f6, %f33;
st.volatile.shared.f32 [%r13], %f6;
BB77_23:
setp.lt.u32 %p14, %r14, 32;
@%p14 bra BB77_25;
ld.volatile.shared.f32 %f34, [%r13+64];
add.f32 %f6, %f6, %f34;
st.volatile.shared.f32 [%r13], %f6;
BB77_25:
setp.lt.u32 %p15, %r14, 16;
@%p15 bra BB77_27;
ld.volatile.shared.f32 %f35, [%r13+32];
add.f32 %f6, %f6, %f35;
st.volatile.shared.f32 [%r13], %f6;
BB77_27:
setp.lt.u32 %p16, %r14, 8;
@%p16 bra BB77_29;
ld.volatile.shared.f32 %f36, [%r13+16];
add.f32 %f6, %f6, %f36;
st.volatile.shared.f32 [%r13], %f6;
BB77_29:
setp.lt.u32 %p17, %r14, 4;
@%p17 bra BB77_31;
ld.volatile.shared.f32 %f37, [%r13+8];
add.f32 %f6, %f6, %f37;
st.volatile.shared.f32 [%r13], %f6;
BB77_31:
setp.lt.u32 %p18, %r14, 2;
@%p18 bra BB77_33;
ld.volatile.shared.f32 %f38, [%r13+4];
add.f32 %f39, %f6, %f38;
st.volatile.shared.f32 [%r13], %f39;
BB77_33:
setp.ne.s32 %p19, %r10, 0;
@%p19 bra BB77_35;
ld.shared.f32 %f40, [memory];
cvt.u64.u32 %rd6, %r4;
cvt.rn.f32.s64 %f41, %rd6;
div.rn.f32 %f42, %f40, %f41;
cvta.to.global.u64 %rd7, %rd2;
mul.wide.u32 %rd8, %r6, 4;
add.s64 %rd9, %rd7, %rd8;
st.global.f32 [%rd9], %f42;
BB77_35:
ret;
}
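//
// Note: reduce_row_mean_d/_f are the row-sum kernels with one extra step:
// after the tree reduction, thread 0 divides the row total by the column
// count before storing (the cvt.u64.u32 / cvt.rn.f64.s64 / div.rn sequence
// in BB76_33). In CUDA terms, roughly (illustrative names):
//
//   if (threadIdx.x == 0) out[row] = srow[0] / (double)cols;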
// .globl reduce_col_mean_d
.visible .entry reduce_col_mean_d(
.param .u64 reduce_col_mean_d_param_0,
.param .u64 reduce_col_mean_d_param_1,
.param .u32 reduce_col_mean_d_param_2,
.param .u32 reduce_col_mean_d_param_3
)
{
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<11>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [reduce_col_mean_d_param_0];
ld.param.u64 %rd3, [reduce_col_mean_d_param_1];
ld.param.u32 %r5, [reduce_col_mean_d_param_2];
ld.param.u32 %r6, [reduce_col_mean_d_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB78_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f64 %fd10, 0d0000000000000000;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB78_4;
mov.u32 %r10, %r1;
BB78_3:
mul.wide.u32 %rd4, %r10, 8;
add.s64 %rd5, %rd1, %rd4;
ld.global.f64 %fd6, [%rd5];
add.f64 %fd10, %fd10, %fd6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB78_3;
BB78_4:
cvt.u64.u32 %rd6, %r5;
cvt.rn.f64.s64 %fd7, %rd6;
div.rn.f64 %fd8, %fd10, %fd7;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.u32 %rd8, %r1, 8;
add.s64 %rd9, %rd7, %rd8;
st.global.f64 [%rd9], %fd8;
BB78_5:
ret;
}
// .globl reduce_col_mean_f
.visible .entry reduce_col_mean_f(
.param .u64 reduce_col_mean_f_param_0,
.param .u64 reduce_col_mean_f_param_1,
.param .u32 reduce_col_mean_f_param_2,
.param .u32 reduce_col_mean_f_param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<11>;
.reg .b32 %r<11>;
.reg .b64 %rd<10>;
ld.param.u64 %rd2, [reduce_col_mean_f_param_0];
ld.param.u64 %rd3, [reduce_col_mean_f_param_1];
ld.param.u32 %r5, [reduce_col_mean_f_param_2];
ld.param.u32 %r6, [reduce_col_mean_f_param_3];
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB79_5;
mul.lo.s32 %r2, %r6, %r5;
cvta.to.global.u64 %rd1, %rd2;
mov.f32 %f10, 0f00000000;
setp.ge.u32 %p2, %r1, %r2;
@%p2 bra BB79_4;
mov.u32 %r10, %r1;
BB79_3:
mul.wide.u32 %rd4, %r10, 4;
add.s64 %rd5, %rd1, %rd4;
ld.global.f32 %f6, [%rd5];
add.f32 %f10, %f10, %f6;
add.s32 %r10, %r10, %r6;
setp.lt.u32 %p3, %r10, %r2;
@%p3 bra BB79_3;
BB79_4:
cvt.u64.u32 %rd6, %r5;
cvt.rn.f32.s64 %f7, %rd6;
div.rn.f32 %f8, %f10, %f7;
cvta.to.global.u64 %rd7, %rd3;
mul.wide.u32 %rd8, %r1, 4;
add.s64 %rd9, %rd7, %rd8;
st.global.f32 [%rd9], %f8;
BB79_5:
ret;
}
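//
// Note: reduce_col_mean_d/_f likewise extend the column-sum loop: each
// thread accumulates its column with + from a 0.0 identity and divides by
// the row count (param_2) before the single global store.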
// .globl matrix_exp_d
.visible .entry matrix_exp_d(
.param .u64 matrix_exp_d_param_0,
.param .u64 matrix_exp_d_param_1,
.param .u32 matrix_exp_d_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<3>;
.reg .b32 %r<21>;
.reg .f64 %fd<41>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_exp_d_param_0];
ld.param.u64 %rd2, [matrix_exp_d_param_1];
ld.param.u32 %r5, [matrix_exp_d_param_2];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB80_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
mov.f64 %fd6, 0d4338000000000000;
mov.f64 %fd7, 0d3FF71547652B82FE;
fma.rn.f64 %fd8, %fd1, %fd7, %fd6;
{
.reg .b32 %temp;
mov.b64 {%r2, %temp}, %fd8;
}
mov.f64 %fd9, 0dC338000000000000;
add.rn.f64 %fd10, %fd8, %fd9;
mov.f64 %fd11, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd12, %fd10, %fd11, %fd1;
mov.f64 %fd13, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd14, %fd10, %fd13, %fd12;
mov.f64 %fd15, 0d3E928AF3FCA213EA;
mov.f64 %fd16, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd17, %fd16, %fd14, %fd15;
mov.f64 %fd18, 0d3EC71DEE62401315;
fma.rn.f64 %fd19, %fd17, %fd14, %fd18;
mov.f64 %fd20, 0d3EFA01997C89EB71;
fma.rn.f64 %fd21, %fd19, %fd14, %fd20;
mov.f64 %fd22, 0d3F2A01A014761F65;
fma.rn.f64 %fd23, %fd21, %fd14, %fd22;
mov.f64 %fd24, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd25, %fd23, %fd14, %fd24;
mov.f64 %fd26, 0d3F81111111122322;
fma.rn.f64 %fd27, %fd25, %fd14, %fd26;
mov.f64 %fd28, 0d3FA55555555502A1;
fma.rn.f64 %fd29, %fd27, %fd14, %fd28;
mov.f64 %fd30, 0d3FC5555555555511;
fma.rn.f64 %fd31, %fd29, %fd14, %fd30;
mov.f64 %fd32, 0d3FE000000000000B;
fma.rn.f64 %fd33, %fd31, %fd14, %fd32;
mov.f64 %fd34, 0d3FF0000000000000;
fma.rn.f64 %fd35, %fd33, %fd14, %fd34;
fma.rn.f64 %fd36, %fd35, %fd14, %fd34;
{
.reg .b32 %temp;
mov.b64 {%r3, %temp}, %fd36;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r4}, %fd36;
}
shl.b32 %r9, %r2, 20;
add.s32 %r10, %r4, %r9;
mov.b64 %fd40, {%r3, %r10};
{
.reg .b32 %temp;
mov.b64 {%temp, %r11}, %fd1;
}
mov.b32 %f2, %r11;
abs.f32 %f1, %f2;
setp.lt.f32 %p2, %f1, 0f4086232B;
@%p2 bra BB80_4;
setp.lt.f64 %p3, %fd1, 0d0000000000000000;
add.f64 %fd37, %fd1, 0d7FF0000000000000;
selp.f64 %fd40, 0d0000000000000000, %fd37, %p3;
setp.geu.f32 %p4, %f1, 0f40874800;
@%p4 bra BB80_4;
shr.u32 %r12, %r2, 31;
add.s32 %r13, %r2, %r12;
shr.s32 %r14, %r13, 1;
shl.b32 %r15, %r14, 20;
add.s32 %r16, %r15, %r4;
mov.b64 %fd38, {%r3, %r16};
sub.s32 %r17, %r2, %r14;
shl.b32 %r18, %r17, 20;
add.s32 %r19, %r18, 1072693248;
mov.u32 %r20, 0;
mov.b64 %fd39, {%r20, %r19};
mul.f64 %fd40, %fd38, %fd39;
BB80_4:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd40;
BB80_5:
ret;
}
// .globl matrix_exp_f
.visible .entry matrix_exp_f(
.param .u64 matrix_exp_f_param_0,
.param .u64 matrix_exp_f_param_1,
.param .u32 matrix_exp_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<15>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_exp_f_param_0];
ld.param.u64 %rd2, [matrix_exp_f_param_1];
ld.param.u32 %r2, [matrix_exp_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB81_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
mul.f32 %f2, %f1, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f3, %f2;
mov.f32 %f4, 0fBF317200;
fma.rn.f32 %f5, %f3, %f4, %f1;
mov.f32 %f6, 0fB5BFBE8E;
fma.rn.f32 %f7, %f3, %f6, %f5;
mul.f32 %f8, %f7, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f9, %f8;
add.f32 %f10, %f3, 0f00000000;
ex2.approx.f32 %f11, %f10;
mul.f32 %f12, %f9, %f11;
setp.lt.f32 %p2, %f1, 0fC2D20000;
selp.f32 %f13, 0f00000000, %f12, %p2;
setp.gt.f32 %p3, %f1, 0f42D20000;
selp.f32 %f14, 0f7F800000, %f13, %p3;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f14;
BB81_2:
ret;
}
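//
// Note: matrix_exp_d/_f and the unary kernels that follow all share one
// elementwise template: compute a global index, bounds-check against n, and
// map element i of the input to element i of the output. A hedged sketch of
// that template (the functor formulation is an assumption, not the verified
// source):
//
//   template <typename T, typename Op>
//   __global__ void unary_op(const T *in, T *out, unsigned int n, Op op) {
//     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) out[i] = op(in[i]);
//   }
//
// The bodies differ only in what the op inlines to: matrix_exp_d carries the
// full double-precision exp() polynomial plus overflow/underflow fixups,
// while matrix_exp_f range-reduces by log2(e) (0f3FB8AA3B) and leans on the
// hardware ex2.approx instruction, clamping the result to 0 for inputs below
// -105 and to +inf for inputs above 105 (0fC2D20000 / 0f42D20000).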
// .globl matrix_sqrt_d
.visible .entry matrix_sqrt_d(
.param .u64 matrix_sqrt_d_param_0,
.param .u64 matrix_sqrt_d_param_1,
.param .u32 matrix_sqrt_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_sqrt_d_param_0];
ld.param.u64 %rd2, [matrix_sqrt_d_param_1];
ld.param.u32 %r2, [matrix_sqrt_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB82_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
sqrt.rn.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB82_2:
ret;
}
// .globl matrix_sqrt_f
.visible .entry matrix_sqrt_f(
.param .u64 matrix_sqrt_f_param_0,
.param .u64 matrix_sqrt_f_param_1,
.param .u32 matrix_sqrt_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_sqrt_f_param_0];
ld.param.u64 %rd2, [matrix_sqrt_f_param_1];
ld.param.u32 %r2, [matrix_sqrt_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB83_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
sqrt.rn.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB83_2:
ret;
}
// .globl matrix_round_d
.visible .entry matrix_round_d(
.param .u64 matrix_round_d_param_0,
.param .u64 matrix_round_d_param_1,
.param .u32 matrix_round_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<11>;
.reg .f64 %fd<7>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_round_d_param_0];
ld.param.u64 %rd2, [matrix_round_d_param_1];
ld.param.u32 %r2, [matrix_round_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB84_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
}
and.b32 %r7, %r6, -2147483648;
mov.f64 %fd2, 0d3FE0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd2;
}
or.b32 %r9, %r8, %r7;
{
.reg .b32 %temp;
mov.b64 {%r10, %temp}, %fd2;
}
mov.b64 %fd3, {%r10, %r9};
add.rz.f64 %fd4, %fd1, %fd3;
cvt.rzi.f64.f64 %fd5, %fd4;
cvt.rzi.s64.f64 %rd6, %fd5;
cvt.rn.f64.s64 %fd6, %rd6;
cvta.to.global.u64 %rd7, %rd2;
add.s64 %rd8, %rd7, %rd4;
st.global.f64 [%rd8], %fd6;
BB84_2:
ret;
}
// .globl matrix_round_f
.visible .entry matrix_round_f(
.param .u64 matrix_round_f_param_0,
.param .u64 matrix_round_f_param_1,
.param .u32 matrix_round_f_param_2
)
{
.reg .pred %p<8>;
.reg .f32 %f<7>;
.reg .b32 %r<17>;
.reg .b64 %rd<25>;
ld.param.u64 %rd6, [matrix_round_f_param_0];
ld.param.u64 %rd7, [matrix_round_f_param_1];
ld.param.u32 %r5, [matrix_round_f_param_2];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB85_9;
cvta.to.global.u64 %rd8, %rd6;
mul.wide.s32 %rd9, %r1, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.u32 %r2, [%rd10];
shl.b32 %r9, %r2, 1;
setp.gt.u32 %p2, %r9, -16777216;
mov.f32 %f3, 0fDF000000;
@%p2 bra BB85_2;
bra.uni BB85_3;
BB85_2:
mov.f32 %f6, %f3;
bra.uni BB85_8;
BB85_3:
setp.gt.s32 %p3, %r2, 1593835519;
mov.f32 %f6, 0f5F000000;
@%p3 bra BB85_8;
setp.gt.u32 %p4, %r2, -553648129;
mov.f32 %f6, %f3;
@%p4 bra BB85_8;
bfe.u32 %r3, %r2, 23, 8;
mov.u32 %r10, 189;
sub.s32 %r4, %r10, %r3;
shl.b32 %r11, %r2, 8;
shr.u32 %r12, %r11, 1;
or.b32 %r13, %r12, 1073741824;
cvt.u64.u32 %rd12, %r13;
shl.b64 %rd24, %rd12, 32;
setp.gt.s32 %p5, %r4, 63;
mov.u64 %rd23, 0;
@%p5 bra BB85_7;
setp.eq.s32 %p6, %r3, 189;
mov.u32 %r14, 64;
sub.s32 %r15, %r14, %r4;
shl.b64 %rd13, %rd24, %r15;
cvt.u64.u32 %rd14, %r4;
selp.b64 %rd15, 0, %rd14, %p6;
cvt.u32.u64 %r16, %rd15;
shr.u64 %rd23, %rd24, %r16;
selp.b64 %rd24, 0, %rd13, %p6;
BB85_7:
shr.u64 %rd16, %rd24, 63;
add.s64 %rd17, %rd16, %rd23;
neg.s64 %rd18, %rd17;
setp.lt.s32 %p7, %r2, 0;
selp.b64 %rd19, %rd18, %rd17, %p7;
cvt.rn.f32.s64 %f6, %rd19;
BB85_8:
cvta.to.global.u64 %rd20, %rd7;
add.s64 %rd22, %rd20, %rd9;
st.global.f32 [%rd22], %f6;
BB85_9:
ret;
}
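//
// Note: matrix_round_d adds a copysign'd 0.5 (built by splicing the input's
// sign bit onto 0d3FE0000000000000), truncates, and round-trips through a
// signed 64-bit integer, i.e. it behaves like (double)llround(x): halves
// round away from zero. matrix_round_f appears to inline the integer slow
// path of llroundf for the same semantics, with explicit clamps at +/-2^63
// (0f5F000000 / 0fDF000000) for out-of-range and NaN inputs.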
// .globl matrix_abs_d
.visible .entry matrix_abs_d(
.param .u64 matrix_abs_d_param_0,
.param .u64 matrix_abs_d_param_1,
.param .u32 matrix_abs_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_abs_d_param_0];
ld.param.u64 %rd2, [matrix_abs_d_param_1];
ld.param.u32 %r2, [matrix_abs_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB86_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
abs.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB86_2:
ret;
}
// .globl matrix_abs_f
.visible .entry matrix_abs_f(
.param .u64 matrix_abs_f_param_0,
.param .u64 matrix_abs_f_param_1,
.param .u32 matrix_abs_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_abs_f_param_0];
ld.param.u64 %rd2, [matrix_abs_f_param_1];
ld.param.u32 %r2, [matrix_abs_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB87_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB87_2:
ret;
}
// .globl matrix_log_d
.visible .entry matrix_log_d(
.param .u64 matrix_log_d_param_0,
.param .u64 matrix_log_d_param_1,
.param .u32 matrix_log_d_param_2
)
{
.reg .pred %p<6>;
.reg .f32 %f<2>;
.reg .b32 %r<33>;
.reg .f64 %fd<59>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_log_d_param_0];
ld.param.u64 %rd2, [matrix_log_d_param_1];
ld.param.u32 %r12, [matrix_log_d_param_2];
mov.u32 %r13, %ctaid.x;
mov.u32 %r14, %ntid.x;
mov.u32 %r15, %tid.x;
mad.lo.s32 %r1, %r14, %r13, %r15;
setp.ge.u32 %p1, %r1, %r12;
@%p1 bra BB88_9;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd56, [%rd5];
{
.reg .b32 %temp;
mov.b64 {%temp, %r29}, %fd56;
}
{
.reg .b32 %temp;
mov.b64 {%r30, %temp}, %fd56;
}
mov.u32 %r31, -1023;
setp.gt.s32 %p2, %r29, 1048575;
@%p2 bra BB88_3;
mul.f64 %fd56, %fd56, 0d4350000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r29}, %fd56;
}
{
.reg .b32 %temp;
mov.b64 {%r30, %temp}, %fd56;
}
mov.u32 %r31, -1077;
BB88_3:
add.s32 %r18, %r29, -1;
setp.lt.u32 %p3, %r18, 2146435071;
@%p3 bra BB88_5;
bra.uni BB88_4;
BB88_5:
shr.u32 %r20, %r29, 20;
add.s32 %r32, %r31, %r20;
and.b32 %r21, %r29, -2146435073;
or.b32 %r22, %r21, 1072693248;
mov.b64 %fd57, {%r30, %r22};
setp.lt.s32 %p5, %r22, 1073127583;
@%p5 bra BB88_7;
{
.reg .b32 %temp;
mov.b64 {%r23, %temp}, %fd57;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r24}, %fd57;
}
add.s32 %r25, %r24, -1048576;
mov.b64 %fd57, {%r23, %r25};
add.s32 %r32, %r32, 1;
BB88_7:
add.f64 %fd12, %fd57, 0d3FF0000000000000;
rcp.approx.ftz.f64 %fd13, %fd12;
neg.f64 %fd14, %fd12;
mov.f64 %fd15, 0d3FF0000000000000;
fma.rn.f64 %fd16, %fd14, %fd13, %fd15;
fma.rn.f64 %fd17, %fd16, %fd16, %fd16;
fma.rn.f64 %fd18, %fd17, %fd13, %fd13;
add.f64 %fd19, %fd57, 0dBFF0000000000000;
mul.f64 %fd20, %fd19, %fd18;
fma.rn.f64 %fd21, %fd19, %fd18, %fd20;
mul.f64 %fd22, %fd21, %fd21;
mov.f64 %fd23, 0d3ED0EE258B7A8B04;
mov.f64 %fd24, 0d3EB1380B3AE80F1E;
fma.rn.f64 %fd25, %fd24, %fd22, %fd23;
mov.f64 %fd26, 0d3EF3B2669F02676F;
fma.rn.f64 %fd27, %fd25, %fd22, %fd26;
mov.f64 %fd28, 0d3F1745CBA9AB0956;
fma.rn.f64 %fd29, %fd27, %fd22, %fd28;
mov.f64 %fd30, 0d3F3C71C72D1B5154;
fma.rn.f64 %fd31, %fd29, %fd22, %fd30;
mov.f64 %fd32, 0d3F624924923BE72D;
fma.rn.f64 %fd33, %fd31, %fd22, %fd32;
mov.f64 %fd34, 0d3F8999999999A3C4;
fma.rn.f64 %fd35, %fd33, %fd22, %fd34;
mov.f64 %fd36, 0d3FB5555555555554;
fma.rn.f64 %fd37, %fd35, %fd22, %fd36;
sub.f64 %fd38, %fd19, %fd21;
add.f64 %fd39, %fd38, %fd38;
neg.f64 %fd40, %fd21;
fma.rn.f64 %fd41, %fd40, %fd19, %fd39;
mul.f64 %fd42, %fd18, %fd41;
mul.f64 %fd43, %fd22, %fd37;
fma.rn.f64 %fd44, %fd43, %fd21, %fd42;
xor.b32 %r26, %r32, -2147483648;
mov.u32 %r27, -2147483648;
mov.u32 %r28, 1127219200;
mov.b64 %fd45, {%r26, %r28};
mov.b64 %fd46, {%r27, %r28};
sub.f64 %fd47, %fd45, %fd46;
mov.f64 %fd48, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd49, %fd47, %fd48, %fd21;
neg.f64 %fd50, %fd47;
fma.rn.f64 %fd51, %fd50, %fd48, %fd49;
sub.f64 %fd52, %fd51, %fd21;
sub.f64 %fd53, %fd44, %fd52;
mov.f64 %fd54, 0d3C7ABC9E3B39803F;
fma.rn.f64 %fd55, %fd47, %fd54, %fd53;
add.f64 %fd58, %fd49, %fd55;
bra.uni BB88_8;
BB88_4:
mov.f64 %fd10, 0d7FF0000000000000;
fma.rn.f64 %fd11, %fd56, %fd10, %fd10;
{
.reg .b32 %temp;
mov.b64 {%temp, %r19}, %fd56;
}
mov.b32 %f1, %r19;
setp.eq.f32 %p4, %f1, 0f00000000;
selp.f64 %fd58, 0dFFF0000000000000, %fd11, %p4;
BB88_8:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd58;
BB88_9:
ret;
}
// .globl matrix_log_f
.visible .entry matrix_log_f(
.param .u64 matrix_log_f_param_0,
.param .u64 matrix_log_f_param_1,
.param .u32 matrix_log_f_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<36>;
.reg .b32 %r<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_log_f_param_0];
ld.param.u64 %rd2, [matrix_log_f_param_1];
ld.param.u32 %r2, [matrix_log_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB89_4;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f5, [%rd5];
setp.lt.f32 %p2, %f5, 0f00800000;
mul.f32 %f6, %f5, 0f4B000000;
selp.f32 %f1, %f6, %f5, %p2;
selp.f32 %f7, 0fC1B80000, 0f00000000, %p2;
mov.b32 %r6, %f1;
add.s32 %r7, %r6, -1059760811;
and.b32 %r8, %r7, -8388608;
sub.s32 %r9, %r6, %r8;
mov.b32 %f8, %r9;
cvt.rn.f32.s32 %f9, %r8;
mov.f32 %f10, 0f34000000;
fma.rn.f32 %f11, %f9, %f10, %f7;
add.f32 %f12, %f8, 0fBF800000;
mov.f32 %f13, 0f3E1039F6;
mov.f32 %f14, 0fBE055027;
fma.rn.f32 %f15, %f14, %f12, %f13;
mov.f32 %f16, 0fBDF8CDCC;
fma.rn.f32 %f17, %f15, %f12, %f16;
mov.f32 %f18, 0f3E0F2955;
fma.rn.f32 %f19, %f17, %f12, %f18;
mov.f32 %f20, 0fBE2AD8B9;
fma.rn.f32 %f21, %f19, %f12, %f20;
mov.f32 %f22, 0f3E4CED0B;
fma.rn.f32 %f23, %f21, %f12, %f22;
mov.f32 %f24, 0fBE7FFF22;
fma.rn.f32 %f25, %f23, %f12, %f24;
mov.f32 %f26, 0f3EAAAA78;
fma.rn.f32 %f27, %f25, %f12, %f26;
mov.f32 %f28, 0fBF000000;
fma.rn.f32 %f29, %f27, %f12, %f28;
mul.f32 %f30, %f12, %f29;
fma.rn.f32 %f31, %f30, %f12, %f12;
mov.f32 %f32, 0f3F317218;
fma.rn.f32 %f35, %f11, %f32, %f31;
setp.lt.u32 %p3, %r6, 2139095040;
@%p3 bra BB89_3;
mov.f32 %f33, 0f7F800000;
fma.rn.f32 %f35, %f1, %f33, %f33;
BB89_3:
cvta.to.global.u64 %rd6, %rd2;
setp.eq.f32 %p4, %f1, 0f00000000;
selp.f32 %f34, 0fFF800000, %f35, %p4;
add.s64 %rd8, %rd6, %rd4;
st.global.f32 [%rd8], %f34;
BB89_4:
ret;
}
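//
// Note: matrix_log_d/_f inline the natural-log routine rather than calling
// one: subnormal inputs are first rescaled (the 0d4350000000000000 = 2^54
// and 0f4B000000 = 2^23 multiplies, with the exponent bias adjusted to
// match), the argument is split into mantissa and exponent by integer bit
// surgery, a polynomial in (m - 1) is evaluated with fused multiply-adds,
// and e*ln(2) is added back. Zero maps to -inf via the final selp, and
// negative or non-finite inputs fall through the x*inf + inf fixup to
// NaN / +inf.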
// .globl matrix_floor_d
.visible .entry matrix_floor_d(
.param .u64 matrix_floor_d_param_0,
.param .u64 matrix_floor_d_param_1,
.param .u32 matrix_floor_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_floor_d_param_0];
ld.param.u64 %rd2, [matrix_floor_d_param_1];
ld.param.u32 %r2, [matrix_floor_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB90_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvt.rmi.f64.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB90_2:
ret;
}
// .globl matrix_floor_f
.visible .entry matrix_floor_f(
.param .u64 matrix_floor_f_param_0,
.param .u64 matrix_floor_f_param_1,
.param .u32 matrix_floor_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_floor_f_param_0];
ld.param.u64 %rd2, [matrix_floor_f_param_1];
ld.param.u32 %r2, [matrix_floor_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB91_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.rmi.f32.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB91_2:
ret;
}
// .globl matrix_ceil_d
.visible .entry matrix_ceil_d(
.param .u64 matrix_ceil_d_param_0,
.param .u64 matrix_ceil_d_param_1,
.param .u32 matrix_ceil_d_param_2
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_ceil_d_param_0];
ld.param.u64 %rd2, [matrix_ceil_d_param_1];
ld.param.u32 %r2, [matrix_ceil_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB92_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvt.rpi.f64.f64 %fd2, %fd1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f64 [%rd7], %fd2;
BB92_2:
ret;
}
// .globl matrix_ceil_f
.visible .entry matrix_ceil_f(
.param .u64 matrix_ceil_f_param_0,
.param .u64 matrix_ceil_f_param_1,
.param .u32 matrix_ceil_f_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_ceil_f_param_0];
ld.param.u64 %rd2, [matrix_ceil_f_param_1];
ld.param.u32 %r2, [matrix_ceil_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB93_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.rpi.f32.f32 %f2, %f1;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f2;
BB93_2:
ret;
}
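//
// Note: matrix_sqrt, matrix_abs, matrix_floor, and matrix_ceil are the
// trivial instances of the elementwise template: each body lowers to a
// single PTX instruction (sqrt.rn, abs, cvt.rmi for round-toward-minus-inf,
// cvt.rpi for round-toward-plus-inf) between the global load and store.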
// .globl matrix_sin_d
.visible .entry matrix_sin_d(
.param .u64 matrix_sin_d_param_0,
.param .u64 matrix_sin_d_param_1,
.param .u32 matrix_sin_d_param_2
)
{
.local .align 4 .b8 __local_depot94[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<7>;
.reg .b32 %r<19>;
.reg .f64 %fd<42>;
.reg .b64 %rd<15>;
mov.u64 %SPL, __local_depot94;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd2, [matrix_sin_d_param_0];
ld.param.u64 %rd3, [matrix_sin_d_param_1];
ld.param.u32 %r5, [matrix_sin_d_param_2];
add.u64 %rd4, %SP, 0;
add.u64 %rd1, %SPL, 0;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB94_11;
cvta.to.global.u64 %rd5, %rd2;
mul.wide.s32 %rd6, %r1, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd38, [%rd7];
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd38;
}
and.b32 %r10, %r9, 2147483647;
setp.ne.s32 %p2, %r10, 2146435072;
@%p2 bra BB94_4;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd38;
}
setp.ne.s32 %p3, %r11, 0;
@%p3 bra BB94_4;
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
BB94_4:
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r18, %fd15;
st.local.u32 [%rd1], %r18;
cvt.rn.f64.s32 %fd16, %r18;
neg.f64 %fd17, %fd16;
mov.f64 %fd18, 0d3FF921FB54442D18;
fma.rn.f64 %fd19, %fd17, %fd18, %fd38;
mov.f64 %fd20, 0d3C91A62633145C00;
fma.rn.f64 %fd21, %fd17, %fd20, %fd19;
mov.f64 %fd22, 0d397B839A252049C0;
fma.rn.f64 %fd39, %fd17, %fd22, %fd21;
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd38;
}
and.b32 %r13, %r12, 2145386496;
setp.lt.u32 %p4, %r13, 1105199104;
@%p4 bra BB94_6;
// Callseq Start 3
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.f64 [param0+0], %fd38;
.param .b64 param1;
st.param.b64 [param1+0], %rd4;
.param .b64 retval0;
call.uni (retval0),
__internal_trig_reduction_slowpathd,
(
param0,
param1
);
ld.param.f64 %fd39, [retval0+0];
//{
}// Callseq End 3
ld.local.u32 %r18, [%rd1];
BB94_6:
and.b32 %r14, %r18, 1;
shl.b32 %r15, %r14, 3;
setp.eq.s32 %p5, %r14, 0;
selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
add.s32 %r16, %r15, 1;
mul.wide.s32 %rd9, %r16, 8;
mov.u64 %rd10, __cudart_sin_cos_coeffs;
add.s64 %rd11, %rd10, %rd9;
ld.const.f64 %fd24, [%rd11];
mul.rn.f64 %fd7, %fd39, %fd39;
fma.rn.f64 %fd25, %fd23, %fd7, %fd24;
ld.const.f64 %fd26, [%rd11+8];
fma.rn.f64 %fd27, %fd25, %fd7, %fd26;
ld.const.f64 %fd28, [%rd11+16];
fma.rn.f64 %fd29, %fd27, %fd7, %fd28;
ld.const.f64 %fd30, [%rd11+24];
fma.rn.f64 %fd31, %fd29, %fd7, %fd30;
ld.const.f64 %fd32, [%rd11+32];
fma.rn.f64 %fd33, %fd31, %fd7, %fd32;
ld.const.f64 %fd34, [%rd11+40];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
@%p5 bra BB94_8;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
BB94_8:
and.b32 %r17, %r18, 2;
setp.eq.s32 %p6, %r17, 0;
@%p6 bra BB94_10;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
BB94_10:
cvta.to.global.u64 %rd12, %rd3;
add.s64 %rd14, %rd12, %rd6;
st.global.f64 [%rd14], %fd40;
BB94_11:
ret;
}
// .globl matrix_sin_f
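// matrix_sin_f: elementwise sinf(x). Fast path: reduce by q = rint(x * 2/pi)
// (0f3F22F983) against a three-part single-precision pi/2. Above the cutoff
// 0f47CE4780 (about 1.05e5) the BB95_3..BB95_10 blocks perform what appears
// to be a Payne-Hanek style reduction: the nounroll loop multiplies the
// mantissa by six 32-bit words of 2/pi from __cudart_i2opi_f, the quadrant
// and remainder are extracted from the accumulated product, and the remainder
// is scaled by pi/2 * 2^-64 (0d3BF921FB54442D19). A shared polynomial in
// BB95_12..BB95_14 then evaluates sin or cos of the remainder according to
// quadrant parity.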
.visible .entry matrix_sin_f(
.param .u64 matrix_sin_f_param_0,
.param .u64 matrix_sin_f_param_1,
.param .u32 matrix_sin_f_param_2
)
{
.local .align 4 .b8 __local_depot95[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<13>;
.reg .f32 %f<38>;
.reg .b32 %r<69>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
mov.u64 %SPL, __local_depot95;
ld.param.u64 %rd7, [matrix_sin_f_param_0];
ld.param.u64 %rd8, [matrix_sin_f_param_1];
ld.param.u32 %r29, [matrix_sin_f_param_2];
mov.u32 %r30, %ntid.x;
mov.u32 %r31, %ctaid.x;
mov.u32 %r32, %tid.x;
mad.lo.s32 %r1, %r30, %r31, %r32;
setp.ge.u32 %p1, %r1, %r29;
@%p1 bra BB95_17;
cvta.to.global.u64 %rd9, %rd7;
mul.wide.s32 %rd10, %r1, 4;
add.s64 %rd11, %rd9, %rd10;
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f15, %f1, 0f3F22F983;
cvt.rni.s32.f32 %r68, %f15;
cvt.rn.f32.s32 %f16, %r68;
mov.f32 %f17, 0fBFC90FDA;
fma.rn.f32 %f18, %f16, %f17, %f1;
mov.f32 %f19, 0fB3A22168;
fma.rn.f32 %f20, %f16, %f19, %f18;
mov.f32 %f21, 0fA7C234C5;
fma.rn.f32 %f35, %f16, %f21, %f20;
abs.f32 %f3, %f1;
setp.leu.f32 %p2, %f3, 0f47CE4780;
@%p2 bra BB95_12;
setp.eq.f32 %p3, %f3, 0f7F800000;
@%p3 bra BB95_11;
bra.uni BB95_3;
BB95_11:
mov.f32 %f24, 0f00000000;
mul.rn.f32 %f35, %f1, %f24;
bra.uni BB95_12;
BB95_3:
mov.b32 %r3, %f1;
shl.b32 %r35, %r3, 8;
or.b32 %r4, %r35, -2147483648;
mov.u32 %r62, 0;
mov.u64 %rd22, __cudart_i2opi_f;
mov.u32 %r61, -6;
mov.u64 %rd23, %rd1;
BB95_4:
.pragma "nounroll";
ld.const.u32 %r38, [%rd22];
// inline asm
{
mad.lo.cc.u32 %r36, %r38, %r4, %r62;
madc.hi.u32 %r62, %r38, %r4, 0;
}
// inline asm
st.local.u32 [%rd23], %r36;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
add.s32 %r61, %r61, 1;
setp.ne.s32 %p4, %r61, 0;
@%p4 bra BB95_4;
bfe.u32 %r41, %r3, 23, 8;
add.s32 %r42, %r41, -128;
shr.u32 %r43, %r42, 5;
and.b32 %r9, %r3, -2147483648;
st.local.u32 [%rd1+24], %r62;
bfe.u32 %r10, %r3, 23, 5;
mov.u32 %r44, 6;
sub.s32 %r45, %r44, %r43;
mul.wide.s32 %rd14, %r45, 4;
add.s64 %rd6, %rd1, %rd14;
ld.local.u32 %r64, [%rd6];
ld.local.u32 %r63, [%rd6+-4];
setp.eq.s32 %p5, %r10, 0;
@%p5 bra BB95_7;
mov.u32 %r46, 32;
sub.s32 %r47, %r46, %r10;
shr.u32 %r48, %r63, %r47;
shl.b32 %r49, %r64, %r10;
add.s32 %r64, %r48, %r49;
ld.local.u32 %r50, [%rd6+-8];
shr.u32 %r51, %r50, %r47;
shl.b32 %r52, %r63, %r10;
add.s32 %r63, %r51, %r52;
BB95_7:
shr.u32 %r53, %r63, 30;
shl.b32 %r54, %r64, 2;
add.s32 %r66, %r54, %r53;
shl.b32 %r18, %r63, 2;
shr.u32 %r55, %r66, 31;
shr.u32 %r56, %r64, 30;
add.s32 %r19, %r55, %r56;
setp.eq.s32 %p6, %r55, 0;
@%p6 bra BB95_8;
not.b32 %r57, %r66;
neg.s32 %r65, %r18;
setp.eq.s32 %p7, %r18, 0;
selp.u32 %r58, 1, 0, %p7;
add.s32 %r66, %r58, %r57;
xor.b32 %r67, %r9, -2147483648;
bra.uni BB95_10;
BB95_8:
mov.u32 %r65, %r18;
mov.u32 %r67, %r9;
BB95_10:
cvt.u64.u32 %rd15, %r66;
shl.b64 %rd16, %rd15, 32;
cvt.u64.u32 %rd17, %r65;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f22, %fd2;
neg.f32 %f23, %f22;
setp.eq.s32 %p8, %r67, 0;
selp.f32 %f35, %f22, %f23, %p8;
setp.eq.s32 %p9, %r9, 0;
neg.s32 %r59, %r19;
selp.b32 %r68, %r19, %r59, %p9;
BB95_12:
and.b32 %r28, %r68, 1;
setp.eq.s32 %p10, %r28, 0;
selp.f32 %f7, %f35, 0f3F800000, %p10;
mul.rn.f32 %f8, %f35, %f35;
mov.f32 %f26, 0f00000000;
fma.rn.f32 %f9, %f8, %f7, %f26;
mov.f32 %f36, 0fB94D4153;
@%p10 bra BB95_14;
mov.f32 %f27, 0fBAB607ED;
mov.f32 %f28, 0f37CBAC00;
fma.rn.f32 %f36, %f28, %f8, %f27;
BB95_14:
selp.f32 %f29, 0f3C0885E4, 0f3D2AAABB, %p10;
fma.rn.f32 %f30, %f36, %f8, %f29;
selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
fma.rn.f32 %f32, %f30, %f8, %f31;
fma.rn.f32 %f37, %f32, %f9, %f7;
and.b32 %r60, %r68, 2;
setp.eq.s32 %p12, %r60, 0;
@%p12 bra BB95_16;
mov.f32 %f34, 0fBF800000;
fma.rn.f32 %f37, %f37, %f34, %f26;
BB95_16:
cvta.to.global.u64 %rd19, %rd8;
add.s64 %rd21, %rd19, %rd10;
st.global.f32 [%rd21], %f37;
BB95_17:
ret;
}
// .globl matrix_sinh_d
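// matrix_sinh_d: elementwise sinh(x). For |x| < 1.0 (high word below
// 0x3FF00000) an odd polynomial in x*x is used (BB96_3). Otherwise BB96_2
// computes u = expm1(|x|)/2 via ln(2)-based reduction plus a polynomial and
// forms sinh(|x|) = u + u/(2u + 1), saturating to +Inf once |x| exceeds
// about 710.476 (0d408633CE8FB9F87E). BB96_4 copies the input sign bit back
// onto the result.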
.visible .entry matrix_sinh_d(
.param .u64 matrix_sinh_d_param_0,
.param .u64 matrix_sinh_d_param_1,
.param .u32 matrix_sinh_d_param_2
)
{
.reg .pred %p<7>;
.reg .b32 %r<24>;
.reg .f64 %fd<68>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_sinh_d_param_0];
ld.param.u64 %rd2, [matrix_sinh_d_param_1];
ld.param.u32 %r3, [matrix_sinh_d_param_2];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra BB96_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd5, [%rd5];
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd5;
}
and.b32 %r7, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd5;
}
mov.b64 %fd1, {%r8, %r7};
setp.lt.u32 %p2, %r7, 1072693248;
@%p2 bra BB96_3;
bra.uni BB96_2;
BB96_3:
mul.f64 %fd51, %fd1, %fd1;
mov.f64 %fd52, 0d3DE611A561D87DEF;
mov.f64 %fd53, 0d3D6B4C75AB274C53;
fma.rn.f64 %fd54, %fd53, %fd51, %fd52;
mov.f64 %fd55, 0d3E5AE64671B18F5C;
fma.rn.f64 %fd56, %fd54, %fd51, %fd55;
mov.f64 %fd57, 0d3EC71DE3A465B1E4;
fma.rn.f64 %fd58, %fd56, %fd51, %fd57;
mov.f64 %fd59, 0d3F2A01A01A02899D;
fma.rn.f64 %fd60, %fd58, %fd51, %fd59;
mov.f64 %fd61, 0d3F811111111110A6;
fma.rn.f64 %fd62, %fd60, %fd51, %fd61;
mov.f64 %fd63, 0d3FC5555555555556;
fma.rn.f64 %fd64, %fd62, %fd51, %fd63;
mul.f64 %fd65, %fd51, %fd64;
fma.rn.f64 %fd67, %fd65, %fd1, %fd1;
bra.uni BB96_4;
BB96_2:
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd1;
}
mov.f64 %fd6, 0d4338000000000000;
mov.f64 %fd7, 0d3FF71547652B82FE;
fma.rn.f64 %fd8, %fd1, %fd7, %fd6;
{
.reg .b32 %temp;
mov.b64 {%r10, %temp}, %fd8;
}
add.s32 %r11, %r10, -1;
mov.f64 %fd9, 0dC338000000000000;
add.rn.f64 %fd10, %fd8, %fd9;
mov.f64 %fd11, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd12, %fd10, %fd11, %fd1;
mov.f64 %fd13, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd14, %fd10, %fd13, %fd12;
add.s32 %r12, %r9, %r9;
setp.lt.u32 %p3, %r12, 2142496327;
selp.b32 %r13, 0, %r11, %p3;
mov.u32 %r14, 0;
selp.f64 %fd15, %fd1, %fd14, %p3;
mov.f64 %fd16, 0d3E5AF86D8EBD13CD;
mov.f64 %fd17, 0d3E21F4076ACD15B6;
fma.rn.f64 %fd18, %fd17, %fd15, %fd16;
mov.f64 %fd19, 0d3E927E5092BA033D;
fma.rn.f64 %fd20, %fd18, %fd15, %fd19;
mov.f64 %fd21, 0d3EC71DDE6C5F9DA1;
fma.rn.f64 %fd22, %fd20, %fd15, %fd21;
mov.f64 %fd23, 0d3EFA01A018D034E6;
fma.rn.f64 %fd24, %fd22, %fd15, %fd23;
mov.f64 %fd25, 0d3F2A01A01B3B6940;
fma.rn.f64 %fd26, %fd24, %fd15, %fd25;
mov.f64 %fd27, 0d3F56C16C16C1B5DD;
fma.rn.f64 %fd28, %fd26, %fd15, %fd27;
mov.f64 %fd29, 0d3F8111111110F74D;
fma.rn.f64 %fd30, %fd28, %fd15, %fd29;
mov.f64 %fd31, 0d3FA555555555554D;
fma.rn.f64 %fd32, %fd30, %fd15, %fd31;
mov.f64 %fd33, 0d3FC5555555555557;
fma.rn.f64 %fd34, %fd32, %fd15, %fd33;
mov.f64 %fd35, 0d3FE0000000000000;
fma.rn.f64 %fd36, %fd34, %fd15, %fd35;
mul.f64 %fd37, %fd15, %fd36;
fma.rn.f64 %fd38, %fd37, %fd15, %fd15;
setp.eq.s32 %p4, %r13, 1024;
selp.b32 %r15, -1, 0, %p4;
add.s32 %r16, %r15, %r13;
shl.b32 %r17, %r16, 20;
add.s32 %r18, %r17, 1072693248;
mov.b64 %fd39, {%r14, %r18};
mov.u32 %r19, 1071644672;
mov.b64 %fd40, {%r14, %r19};
sub.f64 %fd41, %fd39, %fd40;
fma.rn.f64 %fd42, %fd38, %fd39, %fd41;
add.f64 %fd43, %fd42, %fd42;
selp.f64 %fd44, %fd43, %fd42, %p4;
setp.eq.s32 %p5, %r12, 0;
selp.f64 %fd45, %fd15, %fd44, %p5;
mov.f64 %fd46, 0d3FF0000000000000;
mov.f64 %fd47, 0d4000000000000000;
fma.rn.f64 %fd48, %fd47, %fd45, %fd46;
div.rn.f64 %fd49, %fd45, %fd48;
add.f64 %fd50, %fd49, %fd45;
setp.ltu.f64 %p6, %fd1, 0d408633CE8FB9F87E;
selp.f64 %fd67, %fd50, 0d7FF0000000000000, %p6;
BB96_4:
cvta.to.global.u64 %rd6, %rd2;
and.b32 %r20, %r2, -2147483648;
{
.reg .b32 %temp;
mov.b64 {%temp, %r21}, %fd67;
}
or.b32 %r22, %r21, %r20;
{
.reg .b32 %temp;
mov.b64 {%r23, %temp}, %fd67;
}
mov.b64 %fd66, {%r23, %r22};
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd66;
BB96_5:
ret;
}
// .globl matrix_sinh_f
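// matrix_sinh_f: elementwise sinhf(x). |x| < 1.0 takes a short odd polynomial
// (BB97_3); otherwise BB97_2 builds e = exp(|x|)/4 from two ex2.approx
// factors and combines 2*e - 0.125/e, which equals (exp(|x|) - exp(-|x|))/2
// up to the approximation error. The result overflows to +Inf at
// |x| >= 90.0 (0f42B40000), and the input sign is OR-ed back into the
// result bits.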
.visible .entry matrix_sinh_f(
.param .u64 matrix_sinh_f_param_0,
.param .u64 matrix_sinh_f_param_1,
.param .u32 matrix_sinh_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_sinh_f_param_0];
ld.param.u64 %rd2, [matrix_sinh_f_param_1];
ld.param.u32 %r2, [matrix_sinh_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB97_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
setp.ltu.f32 %p2, %f2, 0f3F800000;
@%p2 bra BB97_3;
bra.uni BB97_2;
BB97_3:
mul.f32 %f22, %f1, %f1;
mov.f32 %f23, 0f394FFF49;
mov.f32 %f24, 0f363D0ADA;
fma.rn.f32 %f25, %f24, %f22, %f23;
mov.f32 %f26, 0f3C08889A;
fma.rn.f32 %f27, %f25, %f22, %f26;
mov.f32 %f28, 0f3E2AAAAB;
fma.rn.f32 %f29, %f27, %f22, %f28;
mul.f32 %f30, %f22, %f29;
fma.rn.f32 %f31, %f30, %f1, %f1;
bra.uni BB97_4;
BB97_2:
mul.f32 %f6, %f2, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f7, %f6;
mov.f32 %f8, 0fBF317200;
fma.rn.f32 %f9, %f7, %f8, %f2;
mov.f32 %f10, 0fB5BFBE8E;
fma.rn.f32 %f11, %f7, %f10, %f9;
mul.f32 %f12, %f11, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f13, %f12;
add.f32 %f14, %f7, 0fC0000000;
ex2.approx.f32 %f15, %f14;
mul.f32 %f16, %f13, %f15;
mov.f32 %f17, 0f3E000000;
div.approx.f32 %f18, %f17, %f16;
neg.f32 %f19, %f18;
mov.f32 %f20, 0f40000000;
fma.rn.f32 %f21, %f20, %f16, %f19;
mov.b32 %r6, %f21;
setp.ltu.f32 %p3, %f2, 0f42B40000;
selp.b32 %r7, %r6, 2139095040, %p3;
mov.b32 %r8, %f1;
and.b32 %r9, %r8, -2147483648;
or.b32 %r10, %r7, %r9;
mov.b32 %f31, %r10;
BB97_4:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
st.global.f32 [%rd8], %f31;
BB97_5:
ret;
}
// .globl matrix_cos_d
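// matrix_cos_d: elementwise cos(x). Identical reduction and polynomial
// machinery to matrix_sin_d above; the only difference is that the quadrant
// index is incremented first (add.s32 %r5, %r19, 1 in BB98_6), i.e. cos(x)
// is evaluated as the sin pipeline phase-shifted by pi/2.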
.visible .entry matrix_cos_d(
.param .u64 matrix_cos_d_param_0,
.param .u64 matrix_cos_d_param_1,
.param .u32 matrix_cos_d_param_2
)
{
.local .align 4 .b8 __local_depot98[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<7>;
.reg .b32 %r<20>;
.reg .f64 %fd<42>;
.reg .b64 %rd<15>;
mov.u64 %SPL, __local_depot98;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd2, [matrix_cos_d_param_0];
ld.param.u64 %rd3, [matrix_cos_d_param_1];
ld.param.u32 %r6, [matrix_cos_d_param_2];
add.u64 %rd4, %SP, 0;
add.u64 %rd1, %SPL, 0;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB98_11;
cvta.to.global.u64 %rd5, %rd2;
mul.wide.s32 %rd6, %r1, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd38, [%rd7];
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd38;
}
and.b32 %r11, %r10, 2147483647;
setp.ne.s32 %p2, %r11, 2146435072;
@%p2 bra BB98_4;
{
.reg .b32 %temp;
mov.b64 {%r12, %temp}, %fd38;
}
setp.ne.s32 %p3, %r12, 0;
@%p3 bra BB98_4;
mov.f64 %fd14, 0d0000000000000000;
mul.rn.f64 %fd38, %fd38, %fd14;
BB98_4:
mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r19, %fd15;
st.local.u32 [%rd1], %r19;
cvt.rn.f64.s32 %fd16, %r19;
neg.f64 %fd17, %fd16;
mov.f64 %fd18, 0d3FF921FB54442D18;
fma.rn.f64 %fd19, %fd17, %fd18, %fd38;
mov.f64 %fd20, 0d3C91A62633145C00;
fma.rn.f64 %fd21, %fd17, %fd20, %fd19;
mov.f64 %fd22, 0d397B839A252049C0;
fma.rn.f64 %fd39, %fd17, %fd22, %fd21;
{
.reg .b32 %temp;
mov.b64 {%temp, %r13}, %fd38;
}
and.b32 %r14, %r13, 2145386496;
setp.lt.u32 %p4, %r14, 1105199104;
@%p4 bra BB98_6;
// Callseq Start 4
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.f64 [param0+0], %fd38;
.param .b64 param1;
st.param.b64 [param1+0], %rd4;
.param .b64 retval0;
call.uni (retval0),
__internal_trig_reduction_slowpathd,
(
param0,
param1
);
ld.param.f64 %fd39, [retval0+0];
//{
}// Callseq End 4
ld.local.u32 %r19, [%rd1];
BB98_6:
add.s32 %r5, %r19, 1;
and.b32 %r15, %r5, 1;
shl.b32 %r16, %r15, 3;
setp.eq.s32 %p5, %r15, 0;
selp.f64 %fd23, 0d3DE5DB65F9785EBA, 0dBDA8FF8320FD8164, %p5;
add.s32 %r17, %r16, 1;
mul.wide.s32 %rd9, %r17, 8;
mov.u64 %rd10, __cudart_sin_cos_coeffs;
add.s64 %rd11, %rd10, %rd9;
ld.const.f64 %fd24, [%rd11];
mul.rn.f64 %fd7, %fd39, %fd39;
fma.rn.f64 %fd25, %fd23, %fd7, %fd24;
ld.const.f64 %fd26, [%rd11+8];
fma.rn.f64 %fd27, %fd25, %fd7, %fd26;
ld.const.f64 %fd28, [%rd11+16];
fma.rn.f64 %fd29, %fd27, %fd7, %fd28;
ld.const.f64 %fd30, [%rd11+24];
fma.rn.f64 %fd31, %fd29, %fd7, %fd30;
ld.const.f64 %fd32, [%rd11+32];
fma.rn.f64 %fd33, %fd31, %fd7, %fd32;
ld.const.f64 %fd34, [%rd11+40];
fma.rn.f64 %fd8, %fd33, %fd7, %fd34;
fma.rn.f64 %fd40, %fd8, %fd39, %fd39;
@%p5 bra BB98_8;
mov.f64 %fd35, 0d3FF0000000000000;
fma.rn.f64 %fd40, %fd8, %fd7, %fd35;
BB98_8:
and.b32 %r18, %r5, 2;
setp.eq.s32 %p6, %r18, 0;
@%p6 bra BB98_10;
mov.f64 %fd36, 0d0000000000000000;
mov.f64 %fd37, 0dBFF0000000000000;
fma.rn.f64 %fd40, %fd40, %fd37, %fd36;
BB98_10:
cvta.to.global.u64 %rd12, %rd3;
add.s64 %rd14, %rd12, %rd6;
st.global.f64 [%rd14], %fd40;
BB98_11:
ret;
}
// .globl matrix_cos_f
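// matrix_cos_f: elementwise cosf(x); the matrix_sin_f body with the quadrant
// index incremented by one at BB99_12 (cos(x) = sin(x + pi/2)).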
.visible .entry matrix_cos_f(
.param .u64 matrix_cos_f_param_0,
.param .u64 matrix_cos_f_param_1,
.param .u32 matrix_cos_f_param_2
)
{
.local .align 4 .b8 __local_depot99[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<13>;
.reg .f32 %f<38>;
.reg .b32 %r<70>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
mov.u64 %SPL, __local_depot99;
ld.param.u64 %rd7, [matrix_cos_f_param_0];
ld.param.u64 %rd8, [matrix_cos_f_param_1];
ld.param.u32 %r30, [matrix_cos_f_param_2];
mov.u32 %r31, %ntid.x;
mov.u32 %r32, %ctaid.x;
mov.u32 %r33, %tid.x;
mad.lo.s32 %r1, %r31, %r32, %r33;
setp.ge.u32 %p1, %r1, %r30;
@%p1 bra BB99_17;
cvta.to.global.u64 %rd9, %rd7;
mul.wide.s32 %rd10, %r1, 4;
add.s64 %rd11, %rd9, %rd10;
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f15, %f1, 0f3F22F983;
cvt.rni.s32.f32 %r69, %f15;
cvt.rn.f32.s32 %f16, %r69;
mov.f32 %f17, 0fBFC90FDA;
fma.rn.f32 %f18, %f16, %f17, %f1;
mov.f32 %f19, 0fB3A22168;
fma.rn.f32 %f20, %f16, %f19, %f18;
mov.f32 %f21, 0fA7C234C5;
fma.rn.f32 %f35, %f16, %f21, %f20;
abs.f32 %f3, %f1;
setp.leu.f32 %p2, %f3, 0f47CE4780;
@%p2 bra BB99_12;
setp.eq.f32 %p3, %f3, 0f7F800000;
@%p3 bra BB99_11;
bra.uni BB99_3;
BB99_11:
mov.f32 %f24, 0f00000000;
mul.rn.f32 %f35, %f1, %f24;
bra.uni BB99_12;
BB99_3:
mov.b32 %r3, %f1;
shl.b32 %r36, %r3, 8;
or.b32 %r4, %r36, -2147483648;
mov.u32 %r63, 0;
mov.u64 %rd22, __cudart_i2opi_f;
mov.u32 %r62, -6;
mov.u64 %rd23, %rd1;
BB99_4:
.pragma "nounroll";
ld.const.u32 %r39, [%rd22];
// inline asm
{
mad.lo.cc.u32 %r37, %r39, %r4, %r63;
madc.hi.u32 %r63, %r39, %r4, 0;
}
// inline asm
st.local.u32 [%rd23], %r37;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
add.s32 %r62, %r62, 1;
setp.ne.s32 %p4, %r62, 0;
@%p4 bra BB99_4;
bfe.u32 %r42, %r3, 23, 8;
add.s32 %r43, %r42, -128;
shr.u32 %r44, %r43, 5;
and.b32 %r9, %r3, -2147483648;
st.local.u32 [%rd1+24], %r63;
bfe.u32 %r10, %r3, 23, 5;
mov.u32 %r45, 6;
sub.s32 %r46, %r45, %r44;
mul.wide.s32 %rd14, %r46, 4;
add.s64 %rd6, %rd1, %rd14;
ld.local.u32 %r65, [%rd6];
ld.local.u32 %r64, [%rd6+-4];
setp.eq.s32 %p5, %r10, 0;
@%p5 bra BB99_7;
mov.u32 %r47, 32;
sub.s32 %r48, %r47, %r10;
shr.u32 %r49, %r64, %r48;
shl.b32 %r50, %r65, %r10;
add.s32 %r65, %r49, %r50;
ld.local.u32 %r51, [%rd6+-8];
shr.u32 %r52, %r51, %r48;
shl.b32 %r53, %r64, %r10;
add.s32 %r64, %r52, %r53;
BB99_7:
shr.u32 %r54, %r64, 30;
shl.b32 %r55, %r65, 2;
add.s32 %r67, %r55, %r54;
shl.b32 %r18, %r64, 2;
shr.u32 %r56, %r67, 31;
shr.u32 %r57, %r65, 30;
add.s32 %r19, %r56, %r57;
setp.eq.s32 %p6, %r56, 0;
@%p6 bra BB99_8;
not.b32 %r58, %r67;
neg.s32 %r66, %r18;
setp.eq.s32 %p7, %r18, 0;
selp.u32 %r59, 1, 0, %p7;
add.s32 %r67, %r59, %r58;
xor.b32 %r68, %r9, -2147483648;
bra.uni BB99_10;
BB99_8:
mov.u32 %r66, %r18;
mov.u32 %r68, %r9;
BB99_10:
cvt.u64.u32 %rd15, %r67;
shl.b64 %rd16, %rd15, 32;
cvt.u64.u32 %rd17, %r66;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f22, %fd2;
neg.f32 %f23, %f22;
setp.eq.s32 %p8, %r68, 0;
selp.f32 %f35, %f22, %f23, %p8;
setp.eq.s32 %p9, %r9, 0;
neg.s32 %r60, %r19;
selp.b32 %r69, %r19, %r60, %p9;
BB99_12:
add.s32 %r28, %r69, 1;
and.b32 %r29, %r28, 1;
setp.eq.s32 %p10, %r29, 0;
selp.f32 %f7, %f35, 0f3F800000, %p10;
mul.rn.f32 %f8, %f35, %f35;
mov.f32 %f26, 0f00000000;
fma.rn.f32 %f9, %f8, %f7, %f26;
mov.f32 %f36, 0fB94D4153;
@%p10 bra BB99_14;
mov.f32 %f27, 0fBAB607ED;
mov.f32 %f28, 0f37CBAC00;
fma.rn.f32 %f36, %f28, %f8, %f27;
BB99_14:
selp.f32 %f29, 0f3C0885E4, 0f3D2AAABB, %p10;
fma.rn.f32 %f30, %f36, %f8, %f29;
selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
fma.rn.f32 %f32, %f30, %f8, %f31;
fma.rn.f32 %f37, %f32, %f9, %f7;
and.b32 %r61, %r28, 2;
setp.eq.s32 %p12, %r61, 0;
@%p12 bra BB99_16;
mov.f32 %f34, 0fBF800000;
fma.rn.f32 %f37, %f37, %f34, %f26;
BB99_16:
cvta.to.global.u64 %rd19, %rd8;
add.s64 %rd21, %rd19, %rd10;
st.global.f32 [%rd21], %f37;
BB99_17:
ret;
}
// .globl matrix_cosh_d
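// matrix_cosh_d: elementwise cosh(x). BB100_3 computes e = exp(|x|)/4 via
// ln(2)-based reduction and a polynomial, then half the result as
// e + 0.0625/e (0d3FB0000000000000 = 1/16) using a Newton-refined
// rcp.approx.ftz.f64; the final add.f64 doubles it into
// cosh(x) = (exp(|x|) + exp(-|x|))/2. Oversized arguments (BB100_2) yield
// +Inf, NaN propagates, and no sign handling is needed since cosh is even.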
.visible .entry matrix_cosh_d(
.param .u64 matrix_cosh_d_param_0,
.param .u64 matrix_cosh_d_param_1,
.param .u32 matrix_cosh_d_param_2
)
{
.reg .pred %p<4>;
.reg .b32 %r<16>;
.reg .f64 %fd<46>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_cosh_d_param_0];
ld.param.u64 %rd2, [matrix_cosh_d_param_1];
ld.param.u32 %r2, [matrix_cosh_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB100_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
}
and.b32 %r7, %r6, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r7};
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd2;
}
setp.lt.u32 %p2, %r9, 1082536911;
@%p2 bra BB100_3;
bra.uni BB100_2;
BB100_3:
mov.f64 %fd6, 0d4338000000000000;
mov.f64 %fd7, 0d3FF71547652B82FE;
fma.rn.f64 %fd8, %fd2, %fd7, %fd6;
{
.reg .b32 %temp;
mov.b64 {%r10, %temp}, %fd8;
}
mov.f64 %fd9, 0dC338000000000000;
add.rn.f64 %fd10, %fd8, %fd9;
mov.f64 %fd11, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd12, %fd10, %fd11, %fd2;
mov.f64 %fd13, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd14, %fd10, %fd13, %fd12;
mov.f64 %fd15, 0d3E928AF3FCA213EA;
mov.f64 %fd16, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd17, %fd16, %fd14, %fd15;
mov.f64 %fd18, 0d3EC71DEE62401315;
fma.rn.f64 %fd19, %fd17, %fd14, %fd18;
mov.f64 %fd20, 0d3EFA01997C89EB71;
fma.rn.f64 %fd21, %fd19, %fd14, %fd20;
mov.f64 %fd22, 0d3F2A01A014761F65;
fma.rn.f64 %fd23, %fd21, %fd14, %fd22;
mov.f64 %fd24, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd25, %fd23, %fd14, %fd24;
mov.f64 %fd26, 0d3F81111111122322;
fma.rn.f64 %fd27, %fd25, %fd14, %fd26;
mov.f64 %fd28, 0d3FA55555555502A1;
fma.rn.f64 %fd29, %fd27, %fd14, %fd28;
mov.f64 %fd30, 0d3FC5555555555511;
fma.rn.f64 %fd31, %fd29, %fd14, %fd30;
mov.f64 %fd32, 0d3FE000000000000B;
fma.rn.f64 %fd33, %fd31, %fd14, %fd32;
mov.f64 %fd34, 0d3FF0000000000000;
fma.rn.f64 %fd35, %fd33, %fd14, %fd34;
fma.rn.f64 %fd36, %fd35, %fd14, %fd34;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd36;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd36;
}
shl.b32 %r13, %r10, 20;
add.s32 %r14, %r13, %r12;
add.s32 %r15, %r14, -2097152;
mov.b64 %fd37, {%r11, %r15};
rcp.approx.ftz.f64 %fd38, %fd37;
neg.f64 %fd39, %fd37;
fma.rn.f64 %fd40, %fd39, %fd38, %fd34;
fma.rn.f64 %fd41, %fd40, %fd40, %fd40;
fma.rn.f64 %fd42, %fd41, %fd38, %fd38;
mov.f64 %fd43, 0d3FB0000000000000;
fma.rn.f64 %fd45, %fd42, %fd43, %fd37;
bra.uni BB100_4;
BB100_2:
setp.gtu.f64 %p3, %fd1, 0d7FF0000000000000;
selp.f64 %fd45, %fd1, 0d7FF0000000000000, %p3;
BB100_4:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
add.f64 %fd44, %fd45, %fd45;
st.global.f64 [%rd8], %fd44;
BB100_5:
ret;
}
// .globl matrix_cosh_f
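// matrix_cosh_f: elementwise coshf(x) with no small-argument branch:
// e = exp(|x|)/4 from two ex2.approx factors, result = 2*e + 0.125/e
// (= (exp(|x|) + exp(-|x|))/2), saturating to +Inf at |x| >= 90.0
// (0f42B40000).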
.visible .entry matrix_cosh_f(
.param .u64 matrix_cosh_f_param_0,
.param .u64 matrix_cosh_f_param_1,
.param .u32 matrix_cosh_f_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<19>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_cosh_f_param_0];
ld.param.u64 %rd2, [matrix_cosh_f_param_1];
ld.param.u32 %r2, [matrix_cosh_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB101_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
mul.f32 %f3, %f2, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f4, %f3;
mov.f32 %f5, 0fBF317200;
fma.rn.f32 %f6, %f4, %f5, %f2;
mov.f32 %f7, 0fB5BFBE8E;
fma.rn.f32 %f8, %f4, %f7, %f6;
mul.f32 %f9, %f8, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f10, %f9;
add.f32 %f11, %f4, 0fC0000000;
ex2.approx.f32 %f12, %f11;
mul.f32 %f13, %f10, %f12;
mov.f32 %f14, 0f3E000000;
div.approx.f32 %f15, %f14, %f13;
mov.f32 %f16, 0f40000000;
fma.rn.f32 %f17, %f16, %f13, %f15;
setp.ltu.f32 %p2, %f2, 0f42B40000;
selp.f32 %f18, %f17, 0f7F800000, %p2;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f18;
BB101_2:
ret;
}
// .globl matrix_tan_d
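// matrix_tan_d: elementwise tan(x). Uses the same pi/2 reduction as the
// sin/cos kernels (fast path plus __internal_trig_reduction_slowpathd),
// followed by a long odd polynomial in r*r for tan(r). For odd quadrant
// indices (BB102_7) the result becomes -1/tan(r), with the reciprocal
// seeded by rcp.approx.ftz.f64 and refined by Newton-Raphson fma steps.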
.visible .entry matrix_tan_d(
.param .u64 matrix_tan_d_param_0,
.param .u64 matrix_tan_d_param_1,
.param .u32 matrix_tan_d_param_2
)
{
.local .align 4 .b8 __local_depot102[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<6>;
.reg .b32 %r<16>;
.reg .f64 %fd<65>;
.reg .b64 %rd<12>;
mov.u64 %SPL, __local_depot102;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd2, [matrix_tan_d_param_0];
ld.param.u64 %rd3, [matrix_tan_d_param_1];
ld.param.u32 %r5, [matrix_tan_d_param_2];
add.u64 %rd4, %SP, 0;
add.u64 %rd1, %SPL, 0;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r6, %r7, %r8;
setp.ge.u32 %p1, %r1, %r5;
@%p1 bra BB102_9;
cvta.to.global.u64 %rd5, %rd2;
mul.wide.s32 %rd6, %r1, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd62, [%rd7];
{
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd62;
}
and.b32 %r10, %r9, 2147483647;
setp.ne.s32 %p2, %r10, 2146435072;
@%p2 bra BB102_4;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd62;
}
setp.ne.s32 %p3, %r11, 0;
@%p3 bra BB102_4;
mov.f64 %fd11, 0d0000000000000000;
mul.rn.f64 %fd62, %fd62, %fd11;
BB102_4:
mul.f64 %fd12, %fd62, 0d3FE45F306DC9C883;
cvt.rni.s32.f64 %r15, %fd12;
st.local.u32 [%rd1], %r15;
cvt.rn.f64.s32 %fd13, %r15;
neg.f64 %fd14, %fd13;
mov.f64 %fd15, 0d3FF921FB54442D18;
fma.rn.f64 %fd16, %fd14, %fd15, %fd62;
mov.f64 %fd17, 0d3C91A62633145C00;
fma.rn.f64 %fd18, %fd14, %fd17, %fd16;
mov.f64 %fd19, 0d397B839A252049C0;
fma.rn.f64 %fd63, %fd14, %fd19, %fd18;
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd62;
}
and.b32 %r13, %r12, 2145386496;
setp.lt.u32 %p4, %r13, 1105199104;
@%p4 bra BB102_6;
// Callseq Start 5
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.f64 [param0+0], %fd62;
.param .b64 param1;
st.param.b64 [param1+0], %rd4;
.param .b64 retval0;
call.uni (retval0),
__internal_trig_reduction_slowpathd,
(
param0,
param1
);
ld.param.f64 %fd63, [retval0+0];
//{
}// Callseq End 5
ld.local.u32 %r15, [%rd1];
BB102_6:
mul.f64 %fd20, %fd63, %fd63;
mov.f64 %fd21, 0dBEF9757C5B27EBB1;
mov.f64 %fd22, 0d3EE48DAC2799BCB9;
fma.rn.f64 %fd23, %fd22, %fd20, %fd21;
mov.f64 %fd24, 0d3F0980E90FD91E04;
fma.rn.f64 %fd25, %fd23, %fd20, %fd24;
mov.f64 %fd26, 0dBEFAE2B0417D7E1D;
fma.rn.f64 %fd27, %fd25, %fd20, %fd26;
mov.f64 %fd28, 0d3F119F5341BFBA57;
fma.rn.f64 %fd29, %fd27, %fd20, %fd28;
mov.f64 %fd30, 0d3F15E791A00F6919;
fma.rn.f64 %fd31, %fd29, %fd20, %fd30;
mov.f64 %fd32, 0d3F2FF2E7FADEC73A;
fma.rn.f64 %fd33, %fd31, %fd20, %fd32;
mov.f64 %fd34, 0d3F434BC1B206DA62;
fma.rn.f64 %fd35, %fd33, %fd20, %fd34;
mov.f64 %fd36, 0d3F57DB18EF2F83F9;
fma.rn.f64 %fd37, %fd35, %fd20, %fd36;
mov.f64 %fd38, 0d3F6D6D2E7AE49FBC;
fma.rn.f64 %fd39, %fd37, %fd20, %fd38;
mov.f64 %fd40, 0d3F8226E3A816A776;
fma.rn.f64 %fd41, %fd39, %fd20, %fd40;
mov.f64 %fd42, 0d3F9664F485D25660;
fma.rn.f64 %fd43, %fd41, %fd20, %fd42;
mov.f64 %fd44, 0d3FABA1BA1BABF31D;
fma.rn.f64 %fd45, %fd43, %fd20, %fd44;
mov.f64 %fd46, 0d3FC11111111105D2;
fma.rn.f64 %fd47, %fd45, %fd20, %fd46;
mov.f64 %fd48, 0d3FD555555555555E;
fma.rn.f64 %fd49, %fd47, %fd20, %fd48;
mul.f64 %fd7, %fd20, %fd49;
fma.rn.f64 %fd64, %fd7, %fd63, %fd63;
and.b32 %r14, %r15, 1;
setp.eq.b32 %p5, %r14, 1;
@!%p5 bra BB102_8;
bra.uni BB102_7;
BB102_7:
sub.f64 %fd50, %fd64, %fd63;
neg.f64 %fd51, %fd50;
fma.rn.f64 %fd52, %fd7, %fd63, %fd51;
neg.f64 %fd53, %fd64;
rcp.approx.ftz.f64 %fd54, %fd64;
mov.f64 %fd55, 0d3FF0000000000000;
fma.rn.f64 %fd56, %fd53, %fd54, %fd55;
fma.rn.f64 %fd57, %fd56, %fd56, %fd56;
fma.rn.f64 %fd58, %fd57, %fd54, %fd54;
neg.f64 %fd59, %fd58;
fma.rn.f64 %fd60, %fd64, %fd59, %fd55;
fma.rn.f64 %fd61, %fd59, %fd52, %fd60;
fma.rn.f64 %fd64, %fd61, %fd59, %fd59;
BB102_8:
cvta.to.global.u64 %rd9, %rd3;
add.s64 %rd11, %rd9, %rd6;
st.global.f64 [%rd11], %fd64;
BB102_9:
ret;
}
// .globl matrix_tan_f
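// matrix_tan_f: elementwise tanf(x): the single-precision reduction from
// matrix_sin_f, an odd polynomial for tan(r), and, for odd quadrants,
// -1/tan(r) via the inline rcp.approx.ftz.f32 asm (BB103_13). The compare
// against 0f3A00B43C appears to special-case one reduced argument whose
// polynomial result would otherwise round incorrectly.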
.visible .entry matrix_tan_f(
.param .u64 matrix_tan_f_param_0,
.param .u64 matrix_tan_f_param_1,
.param .u32 matrix_tan_f_param_2
)
{
.local .align 4 .b8 __local_depot103[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<12>;
.reg .f32 %f<39>;
.reg .b32 %r<68>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
mov.u64 %SPL, __local_depot103;
ld.param.u64 %rd7, [matrix_tan_f_param_0];
ld.param.u64 %rd8, [matrix_tan_f_param_1];
ld.param.u32 %r28, [matrix_tan_f_param_2];
mov.u32 %r29, %ntid.x;
mov.u32 %r30, %ctaid.x;
mov.u32 %r31, %tid.x;
mad.lo.s32 %r1, %r29, %r30, %r31;
setp.ge.u32 %p1, %r1, %r28;
@%p1 bra BB103_15;
cvta.to.global.u64 %rd9, %rd7;
mul.wide.s32 %rd10, %r1, 4;
add.s64 %rd11, %rd9, %rd10;
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f10, %f1, 0f3F22F983;
cvt.rni.s32.f32 %r67, %f10;
cvt.rn.f32.s32 %f11, %r67;
mov.f32 %f12, 0fBFC90FDA;
fma.rn.f32 %f13, %f11, %f12, %f1;
mov.f32 %f14, 0fB3A22168;
fma.rn.f32 %f15, %f11, %f14, %f13;
mov.f32 %f16, 0fA7C234C5;
fma.rn.f32 %f37, %f11, %f16, %f15;
abs.f32 %f3, %f1;
setp.leu.f32 %p2, %f3, 0f47CE4780;
@%p2 bra BB103_12;
setp.eq.f32 %p3, %f3, 0f7F800000;
@%p3 bra BB103_11;
bra.uni BB103_3;
BB103_11:
mov.f32 %f19, 0f00000000;
mul.rn.f32 %f37, %f1, %f19;
bra.uni BB103_12;
BB103_3:
mov.b32 %r3, %f1;
shl.b32 %r34, %r3, 8;
or.b32 %r4, %r34, -2147483648;
mov.u32 %r61, 0;
mov.u64 %rd22, __cudart_i2opi_f;
mov.u32 %r60, -6;
mov.u64 %rd23, %rd1;
BB103_4:
.pragma "nounroll";
ld.const.u32 %r37, [%rd22];
// inline asm
{
mad.lo.cc.u32 %r35, %r37, %r4, %r61;
madc.hi.u32 %r61, %r37, %r4, 0;
}
// inline asm
st.local.u32 [%rd23], %r35;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
add.s32 %r60, %r60, 1;
setp.ne.s32 %p4, %r60, 0;
@%p4 bra BB103_4;
bfe.u32 %r40, %r3, 23, 8;
add.s32 %r41, %r40, -128;
shr.u32 %r42, %r41, 5;
and.b32 %r9, %r3, -2147483648;
st.local.u32 [%rd1+24], %r61;
bfe.u32 %r10, %r3, 23, 5;
mov.u32 %r43, 6;
sub.s32 %r44, %r43, %r42;
mul.wide.s32 %rd14, %r44, 4;
add.s64 %rd6, %rd1, %rd14;
ld.local.u32 %r63, [%rd6];
ld.local.u32 %r62, [%rd6+-4];
setp.eq.s32 %p5, %r10, 0;
@%p5 bra BB103_7;
mov.u32 %r45, 32;
sub.s32 %r46, %r45, %r10;
shr.u32 %r47, %r62, %r46;
shl.b32 %r48, %r63, %r10;
add.s32 %r63, %r47, %r48;
ld.local.u32 %r49, [%rd6+-8];
shr.u32 %r50, %r49, %r46;
shl.b32 %r51, %r62, %r10;
add.s32 %r62, %r50, %r51;
BB103_7:
shr.u32 %r52, %r62, 30;
shl.b32 %r53, %r63, 2;
add.s32 %r65, %r53, %r52;
shl.b32 %r18, %r62, 2;
shr.u32 %r54, %r65, 31;
shr.u32 %r55, %r63, 30;
add.s32 %r19, %r54, %r55;
setp.eq.s32 %p6, %r54, 0;
@%p6 bra BB103_8;
not.b32 %r56, %r65;
neg.s32 %r64, %r18;
setp.eq.s32 %p7, %r18, 0;
selp.u32 %r57, 1, 0, %p7;
add.s32 %r65, %r57, %r56;
xor.b32 %r66, %r9, -2147483648;
bra.uni BB103_10;
BB103_8:
mov.u32 %r64, %r18;
mov.u32 %r66, %r9;
BB103_10:
cvt.u64.u32 %rd15, %r65;
shl.b64 %rd16, %rd15, 32;
cvt.u64.u32 %rd17, %r64;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f17, %fd2;
neg.f32 %f18, %f17;
setp.eq.s32 %p8, %r66, 0;
selp.f32 %f37, %f17, %f18, %p8;
setp.eq.s32 %p9, %r9, 0;
neg.s32 %r58, %r19;
selp.b32 %r67, %r19, %r58, %p9;
BB103_12:
mul.f32 %f20, %f37, %f37;
mov.f32 %f21, 0f3B560000;
mov.f32 %f22, 0f3C190000;
fma.rn.f32 %f23, %f22, %f20, %f21;
mov.f32 %f24, 0f3CC70000;
fma.rn.f32 %f25, %f23, %f20, %f24;
mov.f32 %f26, 0f3D5B0000;
fma.rn.f32 %f27, %f25, %f20, %f26;
mov.f32 %f28, 0f3E089438;
fma.rn.f32 %f29, %f27, %f20, %f28;
mov.f32 %f30, 0f3EAAAA88;
fma.rn.f32 %f31, %f29, %f20, %f30;
mul.rn.f32 %f32, %f20, %f37;
fma.rn.f32 %f33, %f31, %f32, %f37;
abs.f32 %f34, %f37;
setp.eq.f32 %p10, %f34, 0f3A00B43C;
selp.f32 %f38, %f37, %f33, %p10;
and.b32 %r59, %r67, 1;
setp.eq.b32 %p11, %r59, 1;
@!%p11 bra BB103_14;
bra.uni BB103_13;
BB103_13:
// inline asm
rcp.approx.ftz.f32 %f35,%f38;
// inline asm
neg.f32 %f38, %f35;
BB103_14:
cvta.to.global.u64 %rd19, %rd8;
add.s64 %rd21, %rd19, %rd10;
st.global.f32 [%rd21], %f38;
BB103_15:
ret;
}
// .globl matrix_tanh_d
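// matrix_tanh_d: elementwise tanh(x). |x| below about 0.655
// (0d3FE4F92224DD2F1A) uses an odd polynomial (BB104_3); otherwise BB104_2
// evaluates tanh(|x|) = 1 - 2/(exp(2|x|) + 1) with an ex2.approx-seeded
// exponential and a Newton-refined reciprocal, saturating to 1.0 for |x|
// above roughly 19.06. The input sign bit is OR-ed back in before the store.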
.visible .entry matrix_tanh_d(
.param .u64 matrix_tanh_d_param_0,
.param .u64 matrix_tanh_d_param_1,
.param .u32 matrix_tanh_d_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<5>;
.reg .b32 %r<13>;
.reg .f64 %fd<72>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_tanh_d_param_0];
ld.param.u64 %rd2, [matrix_tanh_d_param_1];
ld.param.u32 %r4, [matrix_tanh_d_param_2];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB104_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
and.b32 %r3, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r3};
setp.ltu.f64 %p2, %fd2, 0d3FE4F92224DD2F1A;
@%p2 bra BB104_3;
bra.uni BB104_2;
BB104_3:
mul.f64 %fd47, %fd1, %fd1;
mov.f64 %fd48, 0d3F14359F420AFC3D;
mov.f64 %fd49, 0dBEF0BC46E2F5E964;
fma.rn.f64 %fd50, %fd49, %fd47, %fd48;
mov.f64 %fd51, 0dBF2DF9F0728C5D84;
fma.rn.f64 %fd52, %fd50, %fd47, %fd51;
mov.f64 %fd53, 0d3F4337D1CEC4F033;
fma.rn.f64 %fd54, %fd52, %fd47, %fd53;
mov.f64 %fd55, 0dBF57D6E9674335B3;
fma.rn.f64 %fd56, %fd54, %fd47, %fd55;
mov.f64 %fd57, 0d3F6D6D000D7AAD3D;
fma.rn.f64 %fd58, %fd56, %fd47, %fd57;
mov.f64 %fd59, 0dBF8226E1F3CF1EF5;
fma.rn.f64 %fd60, %fd58, %fd47, %fd59;
mov.f64 %fd61, 0d3F9664F47EC0C8CF;
fma.rn.f64 %fd62, %fd60, %fd47, %fd61;
mov.f64 %fd63, 0dBFABA1BA1B80AB40;
fma.rn.f64 %fd64, %fd62, %fd47, %fd63;
mov.f64 %fd65, 0d3FC111111110FA4A;
fma.rn.f64 %fd66, %fd64, %fd47, %fd65;
mov.f64 %fd67, 0dBFD5555555555550;
fma.rn.f64 %fd68, %fd66, %fd47, %fd67;
mov.f64 %fd69, 0d0000000000000000;
fma.rn.f64 %fd70, %fd68, %fd47, %fd69;
fma.rn.f64 %fd71, %fd70, %fd1, %fd1;
bra.uni BB104_4;
BB104_2:
add.f64 %fd6, %fd2, %fd2;
cvt.rn.f32.f64 %f1, %fd6;
mul.f32 %f2, %f1, 0f3FB8AA3B;
cvt.rni.f32.f32 %f3, %f2;
cvt.f64.f32 %fd7, %f3;
neg.f64 %fd8, %fd7;
mov.f64 %fd9, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd10, %fd8, %fd9, %fd6;
mov.f64 %fd11, 0d3E928A27F89B6999;
mov.f64 %fd12, 0d3E5AE904A4741B81;
fma.rn.f64 %fd13, %fd12, %fd10, %fd11;
mov.f64 %fd14, 0d3EC71DE715FF7E07;
fma.rn.f64 %fd15, %fd13, %fd10, %fd14;
mov.f64 %fd16, 0d3EFA019A6B0AC45A;
fma.rn.f64 %fd17, %fd15, %fd10, %fd16;
mov.f64 %fd18, 0d3F2A01A017EED94F;
fma.rn.f64 %fd19, %fd17, %fd10, %fd18;
mov.f64 %fd20, 0d3F56C16C17F2A71B;
fma.rn.f64 %fd21, %fd19, %fd10, %fd20;
mov.f64 %fd22, 0d3F811111111173C4;
fma.rn.f64 %fd23, %fd21, %fd10, %fd22;
mov.f64 %fd24, 0d3FA555555555211A;
fma.rn.f64 %fd25, %fd23, %fd10, %fd24;
mov.f64 %fd26, 0d3FC5555555555540;
fma.rn.f64 %fd27, %fd25, %fd10, %fd26;
mov.f64 %fd28, 0d3FE0000000000005;
fma.rn.f64 %fd29, %fd27, %fd10, %fd28;
mul.f64 %fd30, %fd10, %fd29;
fma.rn.f64 %fd31, %fd30, %fd10, %fd10;
ex2.approx.ftz.f32 %f4, %f3;
cvt.f64.f32 %fd32, %f4;
mov.f64 %fd33, 0d3FF0000000000000;
sub.f64 %fd34, %fd33, %fd32;
neg.f64 %fd35, %fd31;
fma.rn.f64 %fd36, %fd35, %fd32, %fd34;
mov.f64 %fd37, 0d4000000000000000;
sub.f64 %fd38, %fd37, %fd36;
rcp.approx.ftz.f64 %fd39, %fd38;
neg.f64 %fd40, %fd38;
fma.rn.f64 %fd41, %fd40, %fd39, %fd33;
fma.rn.f64 %fd42, %fd41, %fd41, %fd41;
fma.rn.f64 %fd43, %fd42, %fd39, %fd39;
neg.f64 %fd44, %fd43;
fma.rn.f64 %fd45, %fd37, %fd44, %fd33;
setp.gt.u32 %p3, %r3, 1077088193;
selp.f64 %fd46, 0d3FF0000000000000, %fd45, %p3;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd46;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd46;
}
and.b32 %r11, %r2, -2147483648;
or.b32 %r12, %r10, %r11;
mov.b64 %fd71, {%r9, %r12};
BB104_4:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd71;
BB104_5:
ret;
}
// .globl matrix_tanh_f
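// matrix_tanh_f: elementwise tanhf(x). |x| < 0.6 (0f3F19999A): odd
// polynomial (BB105_3). Otherwise tanh(|x|) = 1 - 2/(exp2(2*log2(e)*|x|) + 1)
// (0f4038AA3B = 2*log2(e)) via the inline rcp.approx.ftz.f32 asm, saturating
// to 1.0 for |x| >= about 9.01 (0f41102CB4) before the sign is restored.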
.visible .entry matrix_tanh_f(
.param .u64 matrix_tanh_f_param_0,
.param .u64 matrix_tanh_f_param_1,
.param .u32 matrix_tanh_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<24>;
.reg .b32 %r<11>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_tanh_f_param_0];
ld.param.u64 %rd2, [matrix_tanh_f_param_1];
ld.param.u32 %r2, [matrix_tanh_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB105_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
setp.ltu.f32 %p2, %f2, 0f3F19999A;
@%p2 bra BB105_3;
bra.uni BB105_2;
BB105_3:
mul.f32 %f13, %f1, %f1;
mov.f32 %f14, 0fBD563CAE;
mov.f32 %f15, 0f3C80F082;
fma.rn.f32 %f16, %f15, %f13, %f14;
mov.f32 %f17, 0f3E085941;
fma.rn.f32 %f18, %f16, %f13, %f17;
mov.f32 %f19, 0fBEAAA9ED;
fma.rn.f32 %f20, %f18, %f13, %f19;
mov.f32 %f21, 0f00000000;
fma.rn.f32 %f22, %f20, %f13, %f21;
fma.rn.f32 %f23, %f22, %f1, %f1;
bra.uni BB105_4;
BB105_2:
mul.f32 %f8, %f2, 0f4038AA3B;
ex2.approx.ftz.f32 %f9, %f8;
add.f32 %f7, %f9, 0f3F800000;
// inline asm
rcp.approx.ftz.f32 %f6,%f7;
// inline asm
mov.f32 %f10, 0f3F800000;
mov.f32 %f11, 0fC0000000;
fma.rn.f32 %f12, %f6, %f11, %f10;
mov.b32 %r6, %f12;
setp.ltu.f32 %p3, %f2, 0f41102CB4;
selp.b32 %r7, %r6, 1065353216, %p3;
mov.b32 %r8, %f1;
and.b32 %r9, %r8, -2147483648;
or.b32 %r10, %r7, %r9;
mov.b32 %f23, %r10;
BB105_4:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
st.global.f32 [%rd8], %f23;
BB105_5:
ret;
}
// .globl matrix_asin_d
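// matrix_asin_d: elementwise asin(x). The range test reads only the high
// word of the double reinterpreted as an f32 (mov.b32 %f1, %r2), a bit-level
// shortcut that orders the same way as the full compare here. Small |x|
// (BB106_3): odd polynomial in x*x. Large |x| (BB106_2): the half-angle form
// asin(|x|) = pi/2 - 2*asin(sqrt((1-|x|)/2)), with the square root built from
// rsqrt.approx.ftz.f64 plus Newton refinement, pi/4 (0d3FE921FB54442D18)
// added twice, and |x| > 1 mapped to NaN. The input sign bit is restored at
// the end.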
.visible .entry matrix_asin_d(
.param .u64 matrix_asin_d_param_0,
.param .u64 matrix_asin_d_param_1,
.param .u32 matrix_asin_d_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<3>;
.reg .b32 %r<15>;
.reg .f64 %fd<83>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_asin_d_param_0];
ld.param.u64 %rd2, [matrix_asin_d_param_1];
ld.param.u32 %r3, [matrix_asin_d_param_2];
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra BB106_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
mov.b32 %f1, %r2;
abs.f32 %f2, %f1;
setp.lt.f32 %p2, %f2, 0f3FE26666;
@%p2 bra BB106_3;
bra.uni BB106_2;
BB106_3:
mul.f64 %fd55, %fd1, %fd1;
mov.f64 %fd56, 0dBFB3823B180754AF;
mov.f64 %fd57, 0d3FB0066BDC1895E9;
fma.rn.f64 %fd58, %fd57, %fd55, %fd56;
mov.f64 %fd59, 0d3FB11E52CC2F79AE;
fma.rn.f64 %fd60, %fd58, %fd55, %fd59;
mov.f64 %fd61, 0dBF924EAF3526861B;
fma.rn.f64 %fd62, %fd60, %fd55, %fd61;
mov.f64 %fd63, 0d3F91DF02A31E6CB7;
fma.rn.f64 %fd64, %fd62, %fd55, %fd63;
mov.f64 %fd65, 0d3F847D18B0EEC6CC;
fma.rn.f64 %fd66, %fd64, %fd55, %fd65;
mov.f64 %fd67, 0d3F8D0AF961BA53B0;
fma.rn.f64 %fd68, %fd66, %fd55, %fd67;
mov.f64 %fd69, 0d3F91BF7734CF1C48;
fma.rn.f64 %fd70, %fd68, %fd55, %fd69;
mov.f64 %fd71, 0d3F96E91483144EF7;
fma.rn.f64 %fd72, %fd70, %fd55, %fd71;
mov.f64 %fd73, 0d3F9F1C6E0A4F9F81;
fma.rn.f64 %fd74, %fd72, %fd55, %fd73;
mov.f64 %fd75, 0d3FA6DB6DC27FA92B;
fma.rn.f64 %fd76, %fd74, %fd55, %fd75;
mov.f64 %fd77, 0d3FB333333320F91B;
fma.rn.f64 %fd78, %fd76, %fd55, %fd77;
mov.f64 %fd79, 0d3FC5555555555F4D;
fma.rn.f64 %fd80, %fd78, %fd55, %fd79;
mul.f64 %fd81, %fd55, %fd80;
fma.rn.f64 %fd82, %fd81, %fd1, %fd1;
bra.uni BB106_4;
BB106_2:
abs.f64 %fd7, %fd1;
mov.f64 %fd8, 0d3FE0000000000000;
mov.f64 %fd9, 0dBFE0000000000000;
fma.rn.f64 %fd6, %fd9, %fd7, %fd8;
// inline asm
rsqrt.approx.ftz.f64 %fd5, %fd6;
// inline asm
{
.reg .b32 %temp;
mov.b64 {%r7, %temp}, %fd5;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd5;
}
add.s32 %r9, %r8, -1048576;
mov.b64 %fd10, {%r7, %r9};
mul.f64 %fd11, %fd6, %fd5;
neg.f64 %fd12, %fd11;
fma.rn.f64 %fd13, %fd11, %fd12, %fd6;
fma.rn.f64 %fd14, %fd13, %fd10, %fd11;
neg.f64 %fd15, %fd14;
mov.f64 %fd16, 0d3FF0000000000000;
fma.rn.f64 %fd17, %fd5, %fd15, %fd16;
fma.rn.f64 %fd18, %fd17, %fd10, %fd10;
fma.rn.f64 %fd19, %fd14, %fd15, %fd6;
fma.rn.f64 %fd20, %fd19, %fd18, %fd14;
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd6;
}
setp.lt.s32 %p3, %r10, 0;
selp.f64 %fd21, 0dFFF8000000000000, %fd20, %p3;
setp.equ.f64 %p4, %fd6, 0d0000000000000000;
selp.f64 %fd22, %fd6, %fd21, %p4;
mov.f64 %fd23, 0dBFB3823B180754AF;
mov.f64 %fd24, 0d3FB0066BDC1895E9;
fma.rn.f64 %fd25, %fd24, %fd6, %fd23;
mov.f64 %fd26, 0d3FB11E52CC2F79AE;
fma.rn.f64 %fd27, %fd25, %fd6, %fd26;
mov.f64 %fd28, 0dBF924EAF3526861B;
fma.rn.f64 %fd29, %fd27, %fd6, %fd28;
mov.f64 %fd30, 0d3F91DF02A31E6CB7;
fma.rn.f64 %fd31, %fd29, %fd6, %fd30;
mov.f64 %fd32, 0d3F847D18B0EEC6CC;
fma.rn.f64 %fd33, %fd31, %fd6, %fd32;
mov.f64 %fd34, 0d3F8D0AF961BA53B0;
fma.rn.f64 %fd35, %fd33, %fd6, %fd34;
mov.f64 %fd36, 0d3F91BF7734CF1C48;
fma.rn.f64 %fd37, %fd35, %fd6, %fd36;
mov.f64 %fd38, 0d3F96E91483144EF7;
fma.rn.f64 %fd39, %fd37, %fd6, %fd38;
mov.f64 %fd40, 0d3F9F1C6E0A4F9F81;
fma.rn.f64 %fd41, %fd39, %fd6, %fd40;
mov.f64 %fd42, 0d3FA6DB6DC27FA92B;
fma.rn.f64 %fd43, %fd41, %fd6, %fd42;
mov.f64 %fd44, 0d3FB333333320F91B;
fma.rn.f64 %fd45, %fd43, %fd6, %fd44;
mov.f64 %fd46, 0d3FC5555555555F4D;
fma.rn.f64 %fd47, %fd45, %fd6, %fd46;
mul.f64 %fd48, %fd6, %fd47;
mul.f64 %fd49, %fd22, 0dC000000000000000;
mov.f64 %fd50, 0d3C91A62633145C07;
fma.rn.f64 %fd51, %fd49, %fd48, %fd50;
add.f64 %fd52, %fd49, 0d3FE921FB54442D18;
add.f64 %fd53, %fd52, %fd51;
add.f64 %fd54, %fd53, 0d3FE921FB54442D18;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd54;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd54;
}
and.b32 %r13, %r2, -2147483648;
or.b32 %r14, %r12, %r13;
mov.b64 %fd82, {%r11, %r14};
BB106_4:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd82;
BB106_5:
ret;
}
// .globl matrix_asin_f
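// matrix_asin_f: elementwise asinf(x). For |x| > 0.57 (0f3F11EB85) it uses
// t = sqrt((1-|x|)/2) and asin(|x|) = pi/2 - 2*(t + t*poly(t*t)); otherwise
// the same polynomial is applied to |x| directly. The trailing
// setp.gtu/selp pair skips the sign restore when the result is NaN.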
.visible .entry matrix_asin_f(
.param .u64 matrix_asin_f_param_0,
.param .u64 matrix_asin_f_param_1,
.param .u32 matrix_asin_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<26>;
.reg .b32 %r<10>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_asin_f_param_0];
ld.param.u64 %rd2, [matrix_asin_f_param_1];
ld.param.u32 %r2, [matrix_asin_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB107_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
mov.f32 %f3, 0f3F800000;
sub.f32 %f4, %f3, %f2;
mul.f32 %f5, %f4, 0f3F000000;
sqrt.rn.f32 %f6, %f5;
setp.gt.f32 %p2, %f2, 0f3F11EB85;
selp.f32 %f7, %f6, %f2, %p2;
mul.f32 %f8, %f7, %f7;
mov.f32 %f9, 0f3C94D2E9;
mov.f32 %f10, 0f3D53F941;
fma.rn.f32 %f11, %f10, %f8, %f9;
mov.f32 %f12, 0f3D3F841F;
fma.rn.f32 %f13, %f11, %f8, %f12;
mov.f32 %f14, 0f3D994929;
fma.rn.f32 %f15, %f13, %f8, %f14;
mov.f32 %f16, 0f3E2AAB94;
fma.rn.f32 %f17, %f15, %f8, %f16;
mul.f32 %f18, %f8, %f17;
fma.rn.f32 %f19, %f18, %f7, %f7;
mov.f32 %f20, 0f3FC90FDB;
mov.f32 %f21, 0fC0000000;
fma.rn.f32 %f22, %f21, %f19, %f20;
selp.f32 %f23, %f22, %f19, %p2;
setp.gtu.f32 %p3, %f23, 0f7F800000;
mov.b32 %r6, %f23;
mov.b32 %r7, %f1;
and.b32 %r8, %r7, -2147483648;
or.b32 %r9, %r6, %r8;
mov.b32 %f24, %r9;
selp.f32 %f25, %f23, %f24, %p3;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f25;
BB107_2:
ret;
}
// .globl matrix_acos_d
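// matrix_acos_d: elementwise acos(x). Small |x| (BB108_9): pi/2 minus or
// plus the asin polynomial result, depending on the input sign. Larger |x|
// (BB108_2): z = 1 - |x|, acos(|x|) = 2*sqrt(z/2)*(1 + z*poly(z)) with a
// Newton-refined rsqrt; out-of-domain |x| > 1 falls into the 0 * Inf path
// and produces NaN, and negative inputs finish with pi - result
// (0d400921FB54442D18).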
.visible .entry matrix_acos_d(
.param .u64 matrix_acos_d_param_0,
.param .u64 matrix_acos_d_param_1,
.param .u32 matrix_acos_d_param_2
)
{
.reg .pred %p<7>;
.reg .b32 %r<17>;
.reg .f64 %fd<97>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_acos_d_param_0];
ld.param.u64 %rd2, [matrix_acos_d_param_1];
ld.param.u32 %r4, [matrix_acos_d_param_2];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB108_14;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd16, [%rd5];
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd16;
}
abs.f64 %fd1, %fd16;
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
}
setp.lt.s32 %p2, %r8, 1071801958;
@%p2 bra BB108_9;
bra.uni BB108_2;
BB108_9:
mul.f64 %fd62, %fd1, %fd1;
mov.f64 %fd63, 0dBFB3823B180754AF;
mov.f64 %fd64, 0d3FB0066BDC1895E9;
fma.rn.f64 %fd65, %fd64, %fd62, %fd63;
mov.f64 %fd66, 0d3FB11E52CC2F79AE;
fma.rn.f64 %fd67, %fd65, %fd62, %fd66;
mov.f64 %fd68, 0dBF924EAF3526861B;
fma.rn.f64 %fd69, %fd67, %fd62, %fd68;
mov.f64 %fd70, 0d3F91DF02A31E6CB7;
fma.rn.f64 %fd71, %fd69, %fd62, %fd70;
mov.f64 %fd72, 0d3F847D18B0EEC6CC;
fma.rn.f64 %fd73, %fd71, %fd62, %fd72;
mov.f64 %fd74, 0d3F8D0AF961BA53B0;
fma.rn.f64 %fd75, %fd73, %fd62, %fd74;
mov.f64 %fd76, 0d3F91BF7734CF1C48;
fma.rn.f64 %fd77, %fd75, %fd62, %fd76;
mov.f64 %fd78, 0d3F96E91483144EF7;
fma.rn.f64 %fd79, %fd77, %fd62, %fd78;
mov.f64 %fd80, 0d3F9F1C6E0A4F9F81;
fma.rn.f64 %fd81, %fd79, %fd62, %fd80;
mov.f64 %fd82, 0d3FA6DB6DC27FA92B;
fma.rn.f64 %fd83, %fd81, %fd62, %fd82;
mov.f64 %fd84, 0d3FB333333320F91B;
fma.rn.f64 %fd85, %fd83, %fd62, %fd84;
mov.f64 %fd86, 0d3FC5555555555F4D;
fma.rn.f64 %fd87, %fd85, %fd62, %fd86;
mul.f64 %fd88, %fd62, %fd87;
fma.rn.f64 %fd10, %fd88, %fd1, %fd1;
setp.lt.s32 %p6, %r2, 0;
@%p6 bra BB108_11;
mov.f64 %fd89, 0dBC91A62633145C07;
add.rn.f64 %fd90, %fd10, %fd89;
neg.f64 %fd95, %fd90;
bra.uni BB108_12;
BB108_2:
mov.f64 %fd19, 0d3FF0000000000000;
sub.f64 %fd2, %fd19, %fd1;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd2;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r3}, %fd2;
}
add.s32 %r10, %r3, -1048576;
mov.b64 %fd18, {%r9, %r10};
// inline asm
rsqrt.approx.ftz.f64 %fd17, %fd18;
// inline asm
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd17;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r12}, %fd17;
}
add.s32 %r13, %r12, -1048576;
mov.b64 %fd20, {%r11, %r13};
mul.f64 %fd21, %fd18, %fd17;
neg.f64 %fd22, %fd21;
fma.rn.f64 %fd23, %fd21, %fd22, %fd18;
fma.rn.f64 %fd24, %fd23, %fd20, %fd21;
neg.f64 %fd25, %fd24;
fma.rn.f64 %fd26, %fd17, %fd25, %fd19;
fma.rn.f64 %fd27, %fd26, %fd20, %fd20;
fma.rn.f64 %fd28, %fd24, %fd25, %fd18;
fma.rn.f64 %fd3, %fd28, %fd27, %fd24;
setp.lt.s32 %p3, %r3, 1;
@%p3 bra BB108_4;
{
.reg .b32 %temp;
mov.b64 {%temp, %r14}, %fd3;
}
add.s32 %r15, %r14, 1048576;
{
.reg .b32 %temp;
mov.b64 {%r16, %temp}, %fd3;
}
mov.b64 %fd29, {%r16, %r15};
mov.f64 %fd30, 0dBEBAC2FE66FAAC4B;
mov.f64 %fd31, 0d3EC715B371155F70;
fma.rn.f64 %fd32, %fd31, %fd2, %fd30;
mov.f64 %fd33, 0d3ED9A9B88EFCD9B8;
fma.rn.f64 %fd34, %fd32, %fd2, %fd33;
mov.f64 %fd35, 0d3EDD0F40A8A0C4C3;
fma.rn.f64 %fd36, %fd34, %fd2, %fd35;
mov.f64 %fd37, 0d3EF46D4CFA9E0E1F;
fma.rn.f64 %fd38, %fd36, %fd2, %fd37;
mov.f64 %fd39, 0d3F079C168D1E2422;
fma.rn.f64 %fd40, %fd38, %fd2, %fd39;
mov.f64 %fd41, 0d3F1C9A88C3BCA540;
fma.rn.f64 %fd42, %fd40, %fd2, %fd41;
mov.f64 %fd43, 0d3F31C4E64BD476DF;
fma.rn.f64 %fd44, %fd42, %fd2, %fd43;
mov.f64 %fd45, 0d3F46E8BA60009C8F;
fma.rn.f64 %fd46, %fd44, %fd2, %fd45;
mov.f64 %fd47, 0d3F5F1C71C62B05A2;
fma.rn.f64 %fd48, %fd46, %fd2, %fd47;
mov.f64 %fd49, 0d3F76DB6DB6DC9F2C;
fma.rn.f64 %fd50, %fd48, %fd2, %fd49;
mov.f64 %fd51, 0d3F9333333333329C;
fma.rn.f64 %fd52, %fd50, %fd2, %fd51;
mov.f64 %fd53, 0d3FB5555555555555;
fma.rn.f64 %fd54, %fd52, %fd2, %fd53;
mul.f64 %fd55, %fd2, %fd54;
fma.rn.f64 %fd94, %fd55, %fd29, %fd29;
bra.uni BB108_5;
BB108_11:
mov.f64 %fd91, 0d3C91A62633145C07;
add.rn.f64 %fd95, %fd10, %fd91;
BB108_12:
mov.f64 %fd92, 0d3FF921FB54442D18;
add.rn.f64 %fd94, %fd92, %fd95;
bra.uni BB108_13;
BB108_4:
mov.f64 %fd56, 0d0000000000000000;
mul.rn.f64 %fd94, %fd1, %fd56;
BB108_5:
setp.gt.s32 %p4, %r3, -1;
@%p4 bra BB108_7;
mov.f64 %fd57, 0d7FF0000000000000;
mul.rn.f64 %fd94, %fd94, %fd57;
BB108_7:
setp.gt.s32 %p5, %r2, -1;
@%p5 bra BB108_13;
mov.f64 %fd58, 0dBCA1A62633145C07;
add.rn.f64 %fd59, %fd94, %fd58;
neg.f64 %fd60, %fd59;
mov.f64 %fd61, 0d400921FB54442D18;
add.rn.f64 %fd94, %fd61, %fd60;
BB108_13:
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd94;
BB108_14:
ret;
}
// .globl matrix_acos_f
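// matrix_acos_f: elementwise acosf(x), sharing the asinf core: r = the
// polynomial result, then acos = pi/2 - r (0f3FC90FDB) for small |x|, 2*r
// for |x| > 0.57, and pi - result (0f40490FDB) for negative inputs.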
.visible .entry matrix_acos_f(
.param .u64 matrix_acos_f_param_0,
.param .u64 matrix_acos_f_param_1,
.param .u32 matrix_acos_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<27>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [matrix_acos_f_param_0];
ld.param.u64 %rd2, [matrix_acos_f_param_1];
ld.param.u32 %r2, [matrix_acos_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB109_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
mov.f32 %f3, 0f3F800000;
sub.f32 %f4, %f3, %f2;
mul.f32 %f5, %f4, 0f3F000000;
sqrt.rn.f32 %f6, %f5;
setp.gt.f32 %p2, %f2, 0f3F11EB85;
selp.f32 %f7, %f6, %f2, %p2;
mul.f32 %f8, %f7, %f7;
mov.f32 %f9, 0f3C94D2E9;
mov.f32 %f10, 0f3D53F941;
fma.rn.f32 %f11, %f10, %f8, %f9;
mov.f32 %f12, 0f3D3F841F;
fma.rn.f32 %f13, %f11, %f8, %f12;
mov.f32 %f14, 0f3D994929;
fma.rn.f32 %f15, %f13, %f8, %f14;
mov.f32 %f16, 0f3E2AAB94;
fma.rn.f32 %f17, %f15, %f8, %f16;
mul.f32 %f18, %f8, %f17;
fma.rn.f32 %f19, %f18, %f7, %f7;
add.f32 %f20, %f19, %f19;
mov.f32 %f21, 0f3FC90FDB;
sub.f32 %f22, %f21, %f19;
selp.f32 %f23, %f20, %f22, %p2;
setp.lt.f32 %p3, %f1, 0f00000000;
mov.f32 %f24, 0f40490FDB;
sub.f32 %f25, %f24, %f23;
selp.f32 %f26, %f25, %f23, %p3;
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f26;
BB109_2:
ret;
}
// .globl matrix_atan_d
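// matrix_atan_d: elementwise atan(x). For |x| > 1 the argument is inverted
// first with a Newton-refined rcp.approx.ftz.f64 (Inf maps to 0); a long odd
// polynomial then gives atan(t), pi/2 - atan(t) is selected when the input
// was inverted, and the input sign bit is copied back onto the result.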
.visible .entry matrix_atan_d(
.param .u64 matrix_atan_d_param_0,
.param .u64 matrix_atan_d_param_1,
.param .u32 matrix_atan_d_param_2
)
{
.reg .pred %p<5>;
.reg .b32 %r<11>;
.reg .f64 %fd<56>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_atan_d_param_0];
ld.param.u64 %rd2, [matrix_atan_d_param_1];
ld.param.u32 %r2, [matrix_atan_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB110_4;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
abs.f64 %fd2, %fd1;
setp.leu.f64 %p2, %fd2, 0d3FF0000000000000;
mov.f64 %fd55, %fd2;
@%p2 bra BB110_3;
rcp.approx.ftz.f64 %fd5, %fd2;
neg.f64 %fd6, %fd2;
mov.f64 %fd7, 0d3FF0000000000000;
fma.rn.f64 %fd8, %fd6, %fd5, %fd7;
fma.rn.f64 %fd9, %fd8, %fd8, %fd8;
fma.rn.f64 %fd10, %fd9, %fd5, %fd5;
setp.eq.f64 %p3, %fd2, 0d7FF0000000000000;
selp.f64 %fd55, 0d0000000000000000, %fd10, %p3;
BB110_3:
cvta.to.global.u64 %rd6, %rd2;
mul.f64 %fd11, %fd55, %fd55;
mov.f64 %fd12, 0d3F2D3B63DBB65B49;
mov.f64 %fd13, 0dBEF53E1D2A25FF7E;
fma.rn.f64 %fd14, %fd13, %fd11, %fd12;
mov.f64 %fd15, 0dBF5312788DDE082E;
fma.rn.f64 %fd16, %fd14, %fd11, %fd15;
mov.f64 %fd17, 0d3F6F9690C8249315;
fma.rn.f64 %fd18, %fd16, %fd11, %fd17;
mov.f64 %fd19, 0dBF82CF5AABC7CF0D;
fma.rn.f64 %fd20, %fd18, %fd11, %fd19;
mov.f64 %fd21, 0d3F9162B0B2A3BFDE;
fma.rn.f64 %fd22, %fd20, %fd11, %fd21;
mov.f64 %fd23, 0dBF9A7256FEB6FC6B;
fma.rn.f64 %fd24, %fd22, %fd11, %fd23;
mov.f64 %fd25, 0d3FA171560CE4A489;
fma.rn.f64 %fd26, %fd24, %fd11, %fd25;
mov.f64 %fd27, 0dBFA4F44D841450E4;
fma.rn.f64 %fd28, %fd26, %fd11, %fd27;
mov.f64 %fd29, 0d3FA7EE3D3F36BB95;
fma.rn.f64 %fd30, %fd28, %fd11, %fd29;
mov.f64 %fd31, 0dBFAAD32AE04A9FD1;
fma.rn.f64 %fd32, %fd30, %fd11, %fd31;
mov.f64 %fd33, 0d3FAE17813D66954F;
fma.rn.f64 %fd34, %fd32, %fd11, %fd33;
mov.f64 %fd35, 0dBFB11089CA9A5BCD;
fma.rn.f64 %fd36, %fd34, %fd11, %fd35;
mov.f64 %fd37, 0d3FB3B12B2DB51738;
fma.rn.f64 %fd38, %fd36, %fd11, %fd37;
mov.f64 %fd39, 0dBFB745D022F8DC5C;
fma.rn.f64 %fd40, %fd38, %fd11, %fd39;
mov.f64 %fd41, 0d3FBC71C709DFE927;
fma.rn.f64 %fd42, %fd40, %fd11, %fd41;
mov.f64 %fd43, 0dBFC2492491FA1744;
fma.rn.f64 %fd44, %fd42, %fd11, %fd43;
mov.f64 %fd45, 0d3FC99999999840D2;
fma.rn.f64 %fd46, %fd44, %fd11, %fd45;
mov.f64 %fd47, 0dBFD555555555544C;
fma.rn.f64 %fd48, %fd46, %fd11, %fd47;
mul.f64 %fd49, %fd11, %fd48;
fma.rn.f64 %fd50, %fd49, %fd55, %fd55;
mov.f64 %fd51, 0d3FF921FB54442D18;
sub.f64 %fd52, %fd51, %fd50;
setp.gt.f64 %p4, %fd2, 0d3FF0000000000000;
selp.f64 %fd53, %fd52, %fd50, %p4;
{
.reg .b32 %temp;
mov.b64 {%r6, %temp}, %fd53;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r7}, %fd53;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd1;
}
and.b32 %r9, %r8, -2147483648;
or.b32 %r10, %r7, %r9;
mov.b64 %fd54, {%r6, %r10};
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd54;
BB110_4:
ret;
}
// .globl matrix_atan_f
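// matrix_atan_f: elementwise atanf(x). |x| > 1 is inverted with rcp.rn.f32,
// then atan(t) = t + t^3 * P(t^2) / Q(t^2) from the two short fma chains,
// pi/2 - result (0f3FC90FDB) for inverted arguments, sign restored, and NaN
// inputs passed through by the final setp.gtu/selp pair.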
.visible .entry matrix_atan_f(
.param .u64 matrix_atan_f_param_0,
.param .u64 matrix_atan_f_param_1,
.param .u32 matrix_atan_f_param_2
)
{
.reg .pred %p<5>;
.reg .f32 %f<26>;
.reg .b32 %r<10>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_atan_f_param_0];
ld.param.u64 %rd2, [matrix_atan_f_param_1];
ld.param.u32 %r2, [matrix_atan_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB111_4;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
abs.f32 %f2, %f1;
setp.leu.f32 %p2, %f2, 0f3F800000;
mov.f32 %f25, %f2;
@%p2 bra BB111_3;
rcp.rn.f32 %f25, %f2;
BB111_3:
cvta.to.global.u64 %rd6, %rd2;
mul.rn.f32 %f5, %f25, %f25;
mov.f32 %f6, 0fC0B59883;
mov.f32 %f7, 0fBF52C7EA;
fma.rn.f32 %f8, %f5, %f7, %f6;
mov.f32 %f9, 0fC0D21907;
fma.rn.f32 %f10, %f8, %f5, %f9;
mul.f32 %f11, %f5, %f10;
mul.f32 %f12, %f25, %f11;
add.f32 %f13, %f5, 0f41355DC0;
mov.f32 %f14, 0f41E6BD60;
fma.rn.f32 %f15, %f13, %f5, %f14;
mov.f32 %f16, 0f419D92C8;
fma.rn.f32 %f17, %f15, %f5, %f16;
rcp.rn.f32 %f18, %f17;
fma.rn.f32 %f19, %f12, %f18, %f25;
mov.f32 %f20, 0f3FC90FDB;
sub.f32 %f21, %f20, %f19;
setp.gt.f32 %p3, %f2, 0f3F800000;
selp.f32 %f22, %f21, %f19, %p3;
mov.b32 %r6, %f22;
mov.b32 %r7, %f1;
and.b32 %r8, %r7, -2147483648;
or.b32 %r9, %r6, %r8;
mov.b32 %f23, %r9;
setp.gtu.f32 %p4, %f2, 0f7F800000;
selp.f32 %f24, %f22, %f23, %p4;
add.s64 %rd8, %rd6, %rd4;
st.global.f32 [%rd8], %f24;
BB111_4:
ret;
}
// .globl matrix_sign_d
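// matrix_sign_d: elementwise signum for doubles: exact zero stores an
// integer 0, anything else stores copysign(1.0, x) assembled by bit
// operations on the high word. Note that NaN also takes the copysign branch
// and comes out as +/-1.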
.visible .entry matrix_sign_d(
.param .u64 matrix_sign_d_param_0,
.param .u64 matrix_sign_d_param_1,
.param .u32 matrix_sign_d_param_2
)
{
.reg .pred %p<3>;
.reg .b32 %r<12>;
.reg .f64 %fd<4>;
.reg .b64 %rd<9>;
ld.param.u64 %rd2, [matrix_sign_d_param_0];
ld.param.u64 %rd3, [matrix_sign_d_param_1];
ld.param.u32 %r2, [matrix_sign_d_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB112_4;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd1, [%rd6];
setp.eq.f64 %p2, %fd1, 0d0000000000000000;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
@%p2 bra BB112_3;
bra.uni BB112_2;
BB112_3:
mov.u64 %rd8, 0;
st.global.u64 [%rd1], %rd8;
bra.uni BB112_4;
BB112_2:
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
}
and.b32 %r7, %r6, -2147483648;
mov.f64 %fd2, 0d3FF0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd2;
}
and.b32 %r9, %r8, 2147483647;
or.b32 %r10, %r9, %r7;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd2;
}
mov.b64 %fd3, {%r11, %r10};
st.global.f64 [%rd1], %fd3;
BB112_4:
ret;
}
// .globl matrix_sign_f
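// matrix_sign_f: single-precision signum; the input is widened to double,
// run through the same copysign(1.0, x) bit manipulation as matrix_sign_d,
// and narrowed back with cvt.rn.f32.f64.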
.visible .entry matrix_sign_f(
.param .u64 matrix_sign_f_param_0,
.param .u64 matrix_sign_f_param_1,
.param .u32 matrix_sign_f_param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<3>;
.reg .b32 %r<13>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
ld.param.u64 %rd2, [matrix_sign_f_param_0];
ld.param.u64 %rd3, [matrix_sign_f_param_1];
ld.param.u32 %r2, [matrix_sign_f_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB113_4;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
setp.eq.f32 %p2, %f1, 0f00000000;
cvta.to.global.u64 %rd7, %rd3;
add.s64 %rd1, %rd7, %rd5;
@%p2 bra BB113_3;
bra.uni BB113_2;
BB113_3:
mov.u32 %r12, 0;
st.global.u32 [%rd1], %r12;
bra.uni BB113_4;
BB113_2:
cvt.f64.f32 %fd1, %f1;
{
.reg .b32 %temp;
mov.b64 {%temp, %r6}, %fd1;
}
and.b32 %r7, %r6, -2147483648;
mov.f64 %fd2, 0d3FF0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r8}, %fd2;
}
and.b32 %r9, %r8, 2147483647;
or.b32 %r10, %r9, %r7;
{
.reg .b32 %temp;
mov.b64 {%r11, %temp}, %fd2;
}
mov.b64 %fd3, {%r11, %r10};
cvt.rn.f32.f64 %f2, %fd3;
st.global.f32 [%rd1], %f2;
BB113_4:
ret;
}
// .globl matrix_sigmoid_d
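// matrix_sigmoid_d: elementwise logistic function via the identity
// sigmoid(x) = 0.5 * tanh(x/2) + 0.5: the body is the matrix_tanh_d code
// applied to x * 0.5 (the first mul.f64), and the final fma.rn.f64 with
// 0d3FE0000000000000 performs the 0.5 * t + 0.5 step.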
.visible .entry matrix_sigmoid_d(
.param .u64 matrix_sigmoid_d_param_0,
.param .u64 matrix_sigmoid_d_param_1,
.param .u32 matrix_sigmoid_d_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<5>;
.reg .b32 %r<13>;
.reg .f64 %fd<74>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_sigmoid_d_param_0];
ld.param.u64 %rd2, [matrix_sigmoid_d_param_1];
ld.param.u32 %r4, [matrix_sigmoid_d_param_2];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB114_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd6, [%rd5];
mul.f64 %fd1, %fd6, 0d3FE0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
and.b32 %r3, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r3};
setp.ltu.f64 %p2, %fd2, 0d3FE4F92224DD2F1A;
@%p2 bra BB114_3;
bra.uni BB114_2;
BB114_3:
mul.f64 %fd48, %fd1, %fd1;
mov.f64 %fd49, 0d3F14359F420AFC3D;
mov.f64 %fd50, 0dBEF0BC46E2F5E964;
fma.rn.f64 %fd51, %fd50, %fd48, %fd49;
mov.f64 %fd52, 0dBF2DF9F0728C5D84;
fma.rn.f64 %fd53, %fd51, %fd48, %fd52;
mov.f64 %fd54, 0d3F4337D1CEC4F033;
fma.rn.f64 %fd55, %fd53, %fd48, %fd54;
mov.f64 %fd56, 0dBF57D6E9674335B3;
fma.rn.f64 %fd57, %fd55, %fd48, %fd56;
mov.f64 %fd58, 0d3F6D6D000D7AAD3D;
fma.rn.f64 %fd59, %fd57, %fd48, %fd58;
mov.f64 %fd60, 0dBF8226E1F3CF1EF5;
fma.rn.f64 %fd61, %fd59, %fd48, %fd60;
mov.f64 %fd62, 0d3F9664F47EC0C8CF;
fma.rn.f64 %fd63, %fd61, %fd48, %fd62;
mov.f64 %fd64, 0dBFABA1BA1B80AB40;
fma.rn.f64 %fd65, %fd63, %fd48, %fd64;
mov.f64 %fd66, 0d3FC111111110FA4A;
fma.rn.f64 %fd67, %fd65, %fd48, %fd66;
mov.f64 %fd68, 0dBFD5555555555550;
fma.rn.f64 %fd69, %fd67, %fd48, %fd68;
mov.f64 %fd70, 0d0000000000000000;
fma.rn.f64 %fd71, %fd69, %fd48, %fd70;
fma.rn.f64 %fd73, %fd71, %fd1, %fd1;
bra.uni BB114_4;
BB114_2:
add.f64 %fd7, %fd2, %fd2;
cvt.rn.f32.f64 %f1, %fd7;
mul.f32 %f2, %f1, 0f3FB8AA3B;
cvt.rni.f32.f32 %f3, %f2;
cvt.f64.f32 %fd8, %f3;
neg.f64 %fd9, %fd8;
mov.f64 %fd10, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd11, %fd9, %fd10, %fd7;
mov.f64 %fd12, 0d3E928A27F89B6999;
mov.f64 %fd13, 0d3E5AE904A4741B81;
fma.rn.f64 %fd14, %fd13, %fd11, %fd12;
mov.f64 %fd15, 0d3EC71DE715FF7E07;
fma.rn.f64 %fd16, %fd14, %fd11, %fd15;
mov.f64 %fd17, 0d3EFA019A6B0AC45A;
fma.rn.f64 %fd18, %fd16, %fd11, %fd17;
mov.f64 %fd19, 0d3F2A01A017EED94F;
fma.rn.f64 %fd20, %fd18, %fd11, %fd19;
mov.f64 %fd21, 0d3F56C16C17F2A71B;
fma.rn.f64 %fd22, %fd20, %fd11, %fd21;
mov.f64 %fd23, 0d3F811111111173C4;
fma.rn.f64 %fd24, %fd22, %fd11, %fd23;
mov.f64 %fd25, 0d3FA555555555211A;
fma.rn.f64 %fd26, %fd24, %fd11, %fd25;
mov.f64 %fd27, 0d3FC5555555555540;
fma.rn.f64 %fd28, %fd26, %fd11, %fd27;
mov.f64 %fd29, 0d3FE0000000000005;
fma.rn.f64 %fd30, %fd28, %fd11, %fd29;
mul.f64 %fd31, %fd11, %fd30;
fma.rn.f64 %fd32, %fd31, %fd11, %fd11;
ex2.approx.ftz.f32 %f4, %f3;
cvt.f64.f32 %fd33, %f4;
mov.f64 %fd34, 0d3FF0000000000000;
sub.f64 %fd35, %fd34, %fd33;
neg.f64 %fd36, %fd32;
fma.rn.f64 %fd37, %fd36, %fd33, %fd35;
mov.f64 %fd38, 0d4000000000000000;
sub.f64 %fd39, %fd38, %fd37;
rcp.approx.ftz.f64 %fd40, %fd39;
neg.f64 %fd41, %fd39;
fma.rn.f64 %fd42, %fd41, %fd40, %fd34;
fma.rn.f64 %fd43, %fd42, %fd42, %fd42;
fma.rn.f64 %fd44, %fd43, %fd40, %fd40;
neg.f64 %fd45, %fd44;
fma.rn.f64 %fd46, %fd38, %fd45, %fd34;
setp.gt.u32 %p3, %r3, 1077088193;
selp.f64 %fd47, 0d3FF0000000000000, %fd46, %p3;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd47;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd47;
}
and.b32 %r11, %r2, -2147483648;
or.b32 %r12, %r10, %r11;
mov.b64 %fd73, {%r9, %r12};
BB114_4:
cvta.to.global.u64 %rd6, %rd2;
fma.rn.f64 %fd72, %fd73, 0d3FE0000000000000, 0d3FE0000000000000;
add.s64 %rd8, %rd6, %rd4;
st.global.f64 [%rd8], %fd72;
BB114_5:
ret;
}
// .globl matrix_sigmoid_f
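// matrix_sigmoid_f: single-precision variant of matrix_sigmoid_d; the
// float input is widened to double, the same tanh-based sigmoid is
// evaluated, and the result is rounded back to float (cvt.rn.f32.f64).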
.visible .entry matrix_sigmoid_f(
.param .u64 matrix_sigmoid_f_param_0,
.param .u64 matrix_sigmoid_f_param_1,
.param .u32 matrix_sigmoid_f_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<7>;
.reg .b32 %r<13>;
.reg .f64 %fd<74>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [matrix_sigmoid_f_param_0];
ld.param.u64 %rd2, [matrix_sigmoid_f_param_1];
ld.param.u32 %r4, [matrix_sigmoid_f_param_2];
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
setp.ge.u32 %p1, %r1, %r4;
@%p1 bra BB115_5;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvt.f64.f32 %fd6, %f1;
mul.f64 %fd1, %fd6, 0d3FE0000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r2}, %fd1;
}
and.b32 %r3, %r2, 2147483647;
{
.reg .b32 %temp;
mov.b64 {%r8, %temp}, %fd1;
}
mov.b64 %fd2, {%r8, %r3};
setp.ltu.f64 %p2, %fd2, 0d3FE4F92224DD2F1A;
@%p2 bra BB115_3;
bra.uni BB115_2;
BB115_3:
mul.f64 %fd48, %fd1, %fd1;
mov.f64 %fd49, 0d3F14359F420AFC3D;
mov.f64 %fd50, 0dBEF0BC46E2F5E964;
fma.rn.f64 %fd51, %fd50, %fd48, %fd49;
mov.f64 %fd52, 0dBF2DF9F0728C5D84;
fma.rn.f64 %fd53, %fd51, %fd48, %fd52;
mov.f64 %fd54, 0d3F4337D1CEC4F033;
fma.rn.f64 %fd55, %fd53, %fd48, %fd54;
mov.f64 %fd56, 0dBF57D6E9674335B3;
fma.rn.f64 %fd57, %fd55, %fd48, %fd56;
mov.f64 %fd58, 0d3F6D6D000D7AAD3D;
fma.rn.f64 %fd59, %fd57, %fd48, %fd58;
mov.f64 %fd60, 0dBF8226E1F3CF1EF5;
fma.rn.f64 %fd61, %fd59, %fd48, %fd60;
mov.f64 %fd62, 0d3F9664F47EC0C8CF;
fma.rn.f64 %fd63, %fd61, %fd48, %fd62;
mov.f64 %fd64, 0dBFABA1BA1B80AB40;
fma.rn.f64 %fd65, %fd63, %fd48, %fd64;
mov.f64 %fd66, 0d3FC111111110FA4A;
fma.rn.f64 %fd67, %fd65, %fd48, %fd66;
mov.f64 %fd68, 0dBFD5555555555550;
fma.rn.f64 %fd69, %fd67, %fd48, %fd68;
mov.f64 %fd70, 0d0000000000000000;
fma.rn.f64 %fd71, %fd69, %fd48, %fd70;
fma.rn.f64 %fd73, %fd71, %fd1, %fd1;
bra.uni BB115_4;
BB115_2:
add.f64 %fd7, %fd2, %fd2;
cvt.rn.f32.f64 %f2, %fd7;
mul.f32 %f3, %f2, 0f3FB8AA3B;
cvt.rni.f32.f32 %f4, %f3;
cvt.f64.f32 %fd8, %f4;
neg.f64 %fd9, %fd8;
mov.f64 %fd10, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd11, %fd9, %fd10, %fd7;
mov.f64 %fd12, 0d3E928A27F89B6999;
mov.f64 %fd13, 0d3E5AE904A4741B81;
fma.rn.f64 %fd14, %fd13, %fd11, %fd12;
mov.f64 %fd15, 0d3EC71DE715FF7E07;
fma.rn.f64 %fd16, %fd14, %fd11, %fd15;
mov.f64 %fd17, 0d3EFA019A6B0AC45A;
fma.rn.f64 %fd18, %fd16, %fd11, %fd17;
mov.f64 %fd19, 0d3F2A01A017EED94F;
fma.rn.f64 %fd20, %fd18, %fd11, %fd19;
mov.f64 %fd21, 0d3F56C16C17F2A71B;
fma.rn.f64 %fd22, %fd20, %fd11, %fd21;
mov.f64 %fd23, 0d3F811111111173C4;
fma.rn.f64 %fd24, %fd22, %fd11, %fd23;
mov.f64 %fd25, 0d3FA555555555211A;
fma.rn.f64 %fd26, %fd24, %fd11, %fd25;
mov.f64 %fd27, 0d3FC5555555555540;
fma.rn.f64 %fd28, %fd26, %fd11, %fd27;
mov.f64 %fd29, 0d3FE0000000000005;
fma.rn.f64 %fd30, %fd28, %fd11, %fd29;
mul.f64 %fd31, %fd11, %fd30;
fma.rn.f64 %fd32, %fd31, %fd11, %fd11;
ex2.approx.ftz.f32 %f5, %f4;
cvt.f64.f32 %fd33, %f5;
mov.f64 %fd34, 0d3FF0000000000000;
sub.f64 %fd35, %fd34, %fd33;
neg.f64 %fd36, %fd32;
fma.rn.f64 %fd37, %fd36, %fd33, %fd35;
mov.f64 %fd38, 0d4000000000000000;
sub.f64 %fd39, %fd38, %fd37;
rcp.approx.ftz.f64 %fd40, %fd39;
neg.f64 %fd41, %fd39;
fma.rn.f64 %fd42, %fd41, %fd40, %fd34;
fma.rn.f64 %fd43, %fd42, %fd42, %fd42;
fma.rn.f64 %fd44, %fd43, %fd40, %fd40;
neg.f64 %fd45, %fd44;
fma.rn.f64 %fd46, %fd38, %fd45, %fd34;
setp.gt.u32 %p3, %r3, 1077088193;
selp.f64 %fd47, 0d3FF0000000000000, %fd46, %p3;
{
.reg .b32 %temp;
mov.b64 {%r9, %temp}, %fd47;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r10}, %fd47;
}
and.b32 %r11, %r2, -2147483648;
or.b32 %r12, %r10, %r11;
mov.b64 %fd73, {%r9, %r12};
BB115_4:
cvta.to.global.u64 %rd6, %rd2;
fma.rn.f64 %fd72, %fd73, 0d3FE0000000000000, 0d3FE0000000000000;
cvt.rn.f32.f64 %f6, %fd72;
add.s64 %rd8, %rd6, %rd4;
st.global.f32 [%rd8], %f6;
BB115_5:
ret;
}
// .globl prepare_lstm_input_d
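//
// prepare_lstm_input_d: permutes an LSTM input batch from the row-major
// [N, T*D] matrix (one row per sequence) into the time-major [T, N, D]
// layout used by cuDNN. With N = param_2, D = param_3, T*D = param_4
// and size = N*T*D = param_5, source element n*(T*D) + t*D + d lands at
// (t*N + n)*D + d. Hedged CUDA sketch (parameter names assumed):
//
// extern "C" __global__ void prepare_lstm_input_d(const double *x,
//     double *out, int N, int D, int TD, int size) {
//   int i = blockIdx.x * blockDim.x + threadIdx.x;
//   if (i < size) {
//     int n = i / TD, t = (i % TD) / D, d = (i % TD) % D;
//     out[(t * N + n) * D + d] = x[i];  // [N,T,D] -> [T,N,D]
//   }
// }
//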
.visible .entry prepare_lstm_input_d(
.param .u64 prepare_lstm_input_d_param_0,
.param .u64 prepare_lstm_input_d_param_1,
.param .u32 prepare_lstm_input_d_param_2,
.param .u32 prepare_lstm_input_d_param_3,
.param .u32 prepare_lstm_input_d_param_4,
.param .u32 prepare_lstm_input_d_param_5
)
{
.reg .pred %p<2>;
.reg .b32 %r<15>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [prepare_lstm_input_d_param_0];
ld.param.u64 %rd2, [prepare_lstm_input_d_param_1];
ld.param.u32 %r2, [prepare_lstm_input_d_param_2];
ld.param.u32 %r3, [prepare_lstm_input_d_param_3];
ld.param.u32 %r4, [prepare_lstm_input_d_param_4];
ld.param.u32 %r5, [prepare_lstm_input_d_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB116_2;
cvta.to.global.u64 %rd3, %rd1;
rem.s32 %r9, %r1, %r4;
div.s32 %r10, %r9, %r3;
rem.s32 %r11, %r9, %r3;
mul.wide.s32 %rd4, %r1, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
div.s32 %r12, %r1, %r4;
mad.lo.s32 %r13, %r10, %r2, %r12;
mad.lo.s32 %r14, %r13, %r3, %r11;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r14, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB116_2:
ret;
}
// .globl prepare_lstm_input_f
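// prepare_lstm_input_f: single-precision variant of
// prepare_lstm_input_d (identical index arithmetic, 4-byte elements).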
.visible .entry prepare_lstm_input_f(
.param .u64 prepare_lstm_input_f_param_0,
.param .u64 prepare_lstm_input_f_param_1,
.param .u32 prepare_lstm_input_f_param_2,
.param .u32 prepare_lstm_input_f_param_3,
.param .u32 prepare_lstm_input_f_param_4,
.param .u32 prepare_lstm_input_f_param_5
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<15>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [prepare_lstm_input_f_param_0];
ld.param.u64 %rd2, [prepare_lstm_input_f_param_1];
ld.param.u32 %r2, [prepare_lstm_input_f_param_2];
ld.param.u32 %r3, [prepare_lstm_input_f_param_3];
ld.param.u32 %r4, [prepare_lstm_input_f_param_4];
ld.param.u32 %r5, [prepare_lstm_input_f_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB117_2;
cvta.to.global.u64 %rd3, %rd1;
rem.s32 %r9, %r1, %r4;
div.s32 %r10, %r9, %r3;
rem.s32 %r11, %r9, %r3;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
div.s32 %r12, %r1, %r4;
mad.lo.s32 %r13, %r10, %r2, %r12;
mad.lo.s32 %r14, %r13, %r3, %r11;
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r14, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB117_2:
ret;
}
// .globl prepare_lstm_weight_d
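//
// prepare_lstm_weight_d: packs the [D+M, 4M] LSTM weight matrix
// (param_0) and the 4M-element bias (param_1) into a flat cuDNN-style
// weight space (param_2), where D = param_3 and M = param_4. Three
// index ranges are handled: idx < 4*M*D copies input-to-hidden weights
// (BB118_5), the next 4*M*M copies hidden-to-hidden weights (BB118_4),
// and indices in [4M*(D+M), 4M*(D+M+1)) copy the bias (BB118_1). In
// each range the setp/selp pair remaps gate indices 0,1,2,3 to
// 0,1,3,2, i.e. the third and fourth gate blocks are exchanged,
// consistent with translating between an i,f,o,g gate order and
// cuDNN's i,f,g,o order (the concrete orders are an inference from the
// permutation, not stated in this file).
//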
.visible .entry prepare_lstm_weight_d(
.param .u64 prepare_lstm_weight_d_param_0,
.param .u64 prepare_lstm_weight_d_param_1,
.param .u64 prepare_lstm_weight_d_param_2,
.param .u32 prepare_lstm_weight_d_param_3,
.param .u32 prepare_lstm_weight_d_param_4
)
{
.reg .pred %p<8>;
.reg .b32 %r<48>;
.reg .f64 %fd<3>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [prepare_lstm_weight_d_param_0];
ld.param.u64 %rd3, [prepare_lstm_weight_d_param_1];
ld.param.u64 %rd4, [prepare_lstm_weight_d_param_2];
ld.param.u32 %r45, [prepare_lstm_weight_d_param_3];
ld.param.u32 %r21, [prepare_lstm_weight_d_param_4];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r22, %ntid.x;
mov.u32 %r23, %ctaid.x;
mov.u32 %r24, %tid.x;
mad.lo.s32 %r1, %r22, %r23, %r24;
add.s32 %r2, %r21, %r45;
shl.b32 %r3, %r21, 2;
mul.lo.s32 %r4, %r2, %r3;
setp.lt.s32 %p1, %r1, %r4;
@%p1 bra BB118_3;
bra.uni BB118_1;
BB118_3:
mul.lo.s32 %r5, %r21, %r45;
mul.lo.s32 %r47, %r21, %r21;
shl.b32 %r7, %r5, 2;
setp.lt.s32 %p5, %r1, %r7;
@%p5 bra BB118_5;
bra.uni BB118_4;
BB118_5:
rem.s32 %r44, %r1, %r5;
div.s32 %r42, %r44, %r21;
mov.u32 %r43, %r42;
mov.u32 %r46, %r1;
mov.u32 %r47, %r5;
bra.uni BB118_6;
BB118_1:
add.s32 %r25, %r2, 1;
mul.lo.s32 %r26, %r25, %r3;
setp.ge.s32 %p2, %r1, %r26;
@%p2 bra BB118_7;
cvta.to.global.u64 %rd5, %rd3;
sub.s32 %r27, %r1, %r4;
div.s32 %r28, %r27, %r21;
setp.lt.s32 %p3, %r28, 2;
setp.eq.s32 %p4, %r28, 2;
selp.b32 %r29, 3, 2, %p4;
selp.b32 %r30, %r28, %r29, %p3;
rem.s32 %r31, %r27, %r21;
mad.lo.s32 %r32, %r30, %r21, %r31;
mul.wide.s32 %rd6, %r32, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
mul.wide.s32 %rd8, %r1, 8;
add.s64 %rd9, %rd1, %rd8;
st.global.f64 [%rd9], %fd1;
bra.uni BB118_7;
BB118_4:
sub.s32 %r46, %r1, %r7;
rem.s32 %r44, %r46, %r47;
div.s32 %r43, %r44, %r21;
add.s32 %r42, %r43, %r45;
mov.u32 %r45, %r21;
BB118_6:
cvta.to.global.u64 %rd10, %rd2;
div.s32 %r33, %r46, %r47;
setp.eq.s32 %p6, %r33, 2;
selp.b32 %r34, 3, 2, %p6;
setp.lt.s32 %p7, %r33, 2;
selp.b32 %r35, %r33, %r34, %p7;
rem.s32 %r36, %r44, %r21;
sub.s32 %r37, %r1, %r44;
add.s32 %r38, %r37, %r43;
mad.lo.s32 %r39, %r36, %r45, %r38;
mad.lo.s32 %r40, %r42, %r3, %r36;
mad.lo.s32 %r41, %r35, %r21, %r40;
mul.wide.s32 %rd11, %r41, 8;
add.s64 %rd12, %rd10, %rd11;
ld.global.f64 %fd2, [%rd12];
mul.wide.s32 %rd13, %r39, 8;
add.s64 %rd14, %rd1, %rd13;
st.global.f64 [%rd14], %fd2;
BB118_7:
ret;
}
// .globl prepare_lstm_weight_f
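// prepare_lstm_weight_f: single-precision variant of
// prepare_lstm_weight_d (same ranges and gate permutation, 4-byte
// elements).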
.visible .entry prepare_lstm_weight_f(
.param .u64 prepare_lstm_weight_f_param_0,
.param .u64 prepare_lstm_weight_f_param_1,
.param .u64 prepare_lstm_weight_f_param_2,
.param .u32 prepare_lstm_weight_f_param_3,
.param .u32 prepare_lstm_weight_f_param_4
)
{
.reg .pred %p<8>;
.reg .f32 %f<3>;
.reg .b32 %r<48>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [prepare_lstm_weight_f_param_0];
ld.param.u64 %rd3, [prepare_lstm_weight_f_param_1];
ld.param.u64 %rd4, [prepare_lstm_weight_f_param_2];
ld.param.u32 %r45, [prepare_lstm_weight_f_param_3];
ld.param.u32 %r21, [prepare_lstm_weight_f_param_4];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r22, %ntid.x;
mov.u32 %r23, %ctaid.x;
mov.u32 %r24, %tid.x;
mad.lo.s32 %r1, %r22, %r23, %r24;
add.s32 %r2, %r21, %r45;
shl.b32 %r3, %r21, 2;
mul.lo.s32 %r4, %r2, %r3;
setp.lt.s32 %p1, %r1, %r4;
@%p1 bra BB119_3;
bra.uni BB119_1;
BB119_3:
mul.lo.s32 %r5, %r21, %r45;
mul.lo.s32 %r47, %r21, %r21;
shl.b32 %r7, %r5, 2;
setp.lt.s32 %p5, %r1, %r7;
@%p5 bra BB119_5;
bra.uni BB119_4;
BB119_5:
rem.s32 %r44, %r1, %r5;
div.s32 %r42, %r44, %r21;
mov.u32 %r43, %r42;
mov.u32 %r46, %r1;
mov.u32 %r47, %r5;
bra.uni BB119_6;
BB119_1:
add.s32 %r25, %r2, 1;
mul.lo.s32 %r26, %r25, %r3;
setp.ge.s32 %p2, %r1, %r26;
@%p2 bra BB119_7;
cvta.to.global.u64 %rd5, %rd3;
sub.s32 %r27, %r1, %r4;
div.s32 %r28, %r27, %r21;
setp.lt.s32 %p3, %r28, 2;
setp.eq.s32 %p4, %r28, 2;
selp.b32 %r29, 3, 2, %p4;
selp.b32 %r30, %r28, %r29, %p3;
rem.s32 %r31, %r27, %r21;
mad.lo.s32 %r32, %r30, %r21, %r31;
mul.wide.s32 %rd6, %r32, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
mul.wide.s32 %rd8, %r1, 4;
add.s64 %rd9, %rd1, %rd8;
st.global.f32 [%rd9], %f1;
bra.uni BB119_7;
BB119_4:
sub.s32 %r46, %r1, %r7;
rem.s32 %r44, %r46, %r47;
div.s32 %r43, %r44, %r21;
add.s32 %r42, %r43, %r45;
mov.u32 %r45, %r21;
BB119_6:
cvta.to.global.u64 %rd10, %rd2;
div.s32 %r33, %r46, %r47;
setp.eq.s32 %p6, %r33, 2;
selp.b32 %r34, 3, 2, %p6;
setp.lt.s32 %p7, %r33, 2;
selp.b32 %r35, %r33, %r34, %p7;
rem.s32 %r36, %r44, %r21;
sub.s32 %r37, %r1, %r44;
add.s32 %r38, %r37, %r43;
mad.lo.s32 %r39, %r36, %r45, %r38;
mad.lo.s32 %r40, %r42, %r3, %r36;
mad.lo.s32 %r41, %r35, %r21, %r40;
mul.wide.s32 %rd11, %r41, 4;
add.s64 %rd12, %rd10, %rd11;
ld.global.f32 %f2, [%rd12];
mul.wide.s32 %rd13, %r39, 4;
add.s64 %rd14, %rd1, %rd13;
st.global.f32 [%rd14], %f2;
BB119_7:
ret;
}
// .globl compute_nnz_d
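//
// compute_nnz_d: counts nonzero doubles, one partial count per thread
// block. Each thread runs a grid-stride loop that consumes two elements
// per trip (idx and idx + ntid, advancing by 2*ntid*nctaid) and adds
// 1.0 per nonzero; partials are then combined in the dynamic shared
// buffer "memory" by a tree reduction (bar.sync-separated halving
// stages down to 64 threads, then volatile warp-synchronous steps for
// the last 32 lanes), and thread 0 writes one double per block, so a
// second pass must still sum the per-block results. Hedged CUDA sketch
// of the same shape (buffer and parameter names assumed):
//
// extern "C" __global__ void compute_nnz_d(const double *in,
//     double *blockSums, unsigned int n) {
//   extern __shared__ double sdata[];
//   unsigned int tid = threadIdx.x;
//   unsigned int i = blockIdx.x * blockDim.x * 2 + tid;
//   double sum = 0.0;
//   while (i < n) {                       // two loads per trip
//     sum += (in[i] != 0.0) ? 1.0 : 0.0;
//     if (i + blockDim.x < n)
//       sum += (in[i + blockDim.x] != 0.0) ? 1.0 : 0.0;
//     i += blockDim.x * gridDim.x * 2;
//   }
//   sdata[tid] = sum;
//   __syncthreads();
//   if (blockDim.x >= 1024) { if (tid < 512) sdata[tid] += sdata[tid + 512]; __syncthreads(); }
//   if (blockDim.x >=  512) { if (tid < 256) sdata[tid] += sdata[tid + 256]; __syncthreads(); }
//   if (blockDim.x >=  256) { if (tid < 128) sdata[tid] += sdata[tid + 128]; __syncthreads(); }
//   if (blockDim.x >=  128) { if (tid <  64) sdata[tid] += sdata[tid +  64]; __syncthreads(); }
//   if (tid < 32) {                       // warp-synchronous tail
//     volatile double *v = sdata;
//     if (blockDim.x >= 64) v[tid] += v[tid + 32];
//     if (blockDim.x >= 32) v[tid] += v[tid + 16];
//     if (blockDim.x >= 16) v[tid] += v[tid +  8];
//     if (blockDim.x >=  8) v[tid] += v[tid +  4];
//     if (blockDim.x >=  4) v[tid] += v[tid +  2];
//     if (blockDim.x >=  2) v[tid] += v[tid +  1];
//   }
//   if (tid == 0) blockSums[blockIdx.x] = sdata[0];
// }
//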
.visible .entry compute_nnz_d(
.param .u64 compute_nnz_d_param_0,
.param .u64 compute_nnz_d_param_1,
.param .u32 compute_nnz_d_param_2
)
{
.reg .pred %p<22>;
.reg .b32 %r<36>;
.reg .f64 %fd<62>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [compute_nnz_d_param_0];
ld.param.u64 %rd2, [compute_nnz_d_param_1];
ld.param.u32 %r6, [compute_nnz_d_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f64 %fd46, 0d0000000000000000;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB120_4;
BB120_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd30, [%rd5];
setp.neu.f64 %p2, %fd30, 0d0000000000000000;
selp.f64 %fd31, 0d3FF0000000000000, 0d0000000000000000, %p2;
add.f64 %fd46, %fd46, %fd31;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p3, %r3, %r6;
@%p3 bra BB120_3;
mul.wide.u32 %rd7, %r3, 8;
add.s64 %rd8, %rd3, %rd7;
ld.global.f64 %fd32, [%rd8];
setp.neu.f64 %p4, %fd32, 0d0000000000000000;
selp.f64 %fd33, 0d3FF0000000000000, 0d0000000000000000, %p4;
add.f64 %fd46, %fd46, %fd33;
BB120_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p5, %r35, %r6;
@%p5 bra BB120_1;
BB120_4:
shl.b32 %r16, %r7, 3;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f64 [%r5], %fd46;
bar.sync 0;
setp.lt.u32 %p6, %r10, 1024;
@%p6 bra BB120_8;
setp.gt.u32 %p7, %r7, 511;
@%p7 bra BB120_7;
ld.shared.f64 %fd34, [%r5+4096];
add.f64 %fd46, %fd46, %fd34;
st.shared.f64 [%r5], %fd46;
BB120_7:
bar.sync 0;
BB120_8:
setp.lt.u32 %p8, %r10, 512;
@%p8 bra BB120_12;
setp.gt.u32 %p9, %r7, 255;
@%p9 bra BB120_11;
ld.shared.f64 %fd35, [%r5+2048];
add.f64 %fd46, %fd46, %fd35;
st.shared.f64 [%r5], %fd46;
BB120_11:
bar.sync 0;
BB120_12:
setp.lt.u32 %p10, %r10, 256;
@%p10 bra BB120_16;
setp.gt.u32 %p11, %r7, 127;
@%p11 bra BB120_15;
ld.shared.f64 %fd36, [%r5+1024];
add.f64 %fd46, %fd46, %fd36;
st.shared.f64 [%r5], %fd46;
BB120_15:
bar.sync 0;
BB120_16:
setp.lt.u32 %p12, %r10, 128;
@%p12 bra BB120_20;
setp.gt.u32 %p13, %r7, 63;
@%p13 bra BB120_19;
ld.shared.f64 %fd37, [%r5+512];
add.f64 %fd46, %fd46, %fd37;
st.shared.f64 [%r5], %fd46;
BB120_19:
bar.sync 0;
BB120_20:
setp.gt.u32 %p14, %r7, 31;
@%p14 bra BB120_33;
setp.lt.u32 %p15, %r10, 64;
@%p15 bra BB120_23;
ld.volatile.shared.f64 %fd38, [%r5+256];
add.f64 %fd46, %fd46, %fd38;
st.volatile.shared.f64 [%r5], %fd46;
BB120_23:
setp.lt.u32 %p16, %r10, 32;
@%p16 bra BB120_25;
ld.volatile.shared.f64 %fd39, [%r5+128];
add.f64 %fd46, %fd46, %fd39;
st.volatile.shared.f64 [%r5], %fd46;
BB120_25:
setp.lt.u32 %p17, %r10, 16;
@%p17 bra BB120_27;
ld.volatile.shared.f64 %fd40, [%r5+64];
add.f64 %fd46, %fd46, %fd40;
st.volatile.shared.f64 [%r5], %fd46;
BB120_27:
setp.lt.u32 %p18, %r10, 8;
@%p18 bra BB120_29;
ld.volatile.shared.f64 %fd41, [%r5+32];
add.f64 %fd46, %fd46, %fd41;
st.volatile.shared.f64 [%r5], %fd46;
BB120_29:
setp.lt.u32 %p19, %r10, 4;
@%p19 bra BB120_31;
ld.volatile.shared.f64 %fd42, [%r5+16];
add.f64 %fd46, %fd46, %fd42;
st.volatile.shared.f64 [%r5], %fd46;
BB120_31:
setp.lt.u32 %p20, %r10, 2;
@%p20 bra BB120_33;
ld.volatile.shared.f64 %fd43, [%r5+8];
add.f64 %fd44, %fd46, %fd43;
st.volatile.shared.f64 [%r5], %fd44;
BB120_33:
setp.ne.s32 %p21, %r7, 0;
@%p21 bra BB120_35;
ld.shared.f64 %fd45, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 8;
add.s64 %rd11, %rd9, %rd10;
st.global.f64 [%rd11], %fd45;
BB120_35:
ret;
}
// .globl compute_nnz_f
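// compute_nnz_f: single-precision variant of compute_nnz_d; identical
// reduction with float accumulators and 4-byte shared-memory slots
// (hence the halved ld.shared offsets).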
.visible .entry compute_nnz_f(
.param .u64 compute_nnz_f_param_0,
.param .u64 compute_nnz_f_param_1,
.param .u32 compute_nnz_f_param_2
)
{
.reg .pred %p<22>;
.reg .f32 %f<62>;
.reg .b32 %r<36>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [compute_nnz_f_param_0];
ld.param.u64 %rd2, [compute_nnz_f_param_1];
ld.param.u32 %r6, [compute_nnz_f_param_2];
mov.u32 %r7, %tid.x;
mov.u32 %r8, %ctaid.x;
shl.b32 %r9, %r8, 1;
mov.u32 %r10, %ntid.x;
mad.lo.s32 %r35, %r9, %r10, %r7;
mov.f32 %f46, 0f00000000;
setp.ge.u32 %p1, %r35, %r6;
@%p1 bra BB121_4;
BB121_1:
cvta.to.global.u64 %rd3, %rd1;
mul.wide.u32 %rd4, %r35, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f30, [%rd5];
setp.neu.f32 %p2, %f30, 0f00000000;
selp.f32 %f31, 0f3F800000, 0f00000000, %p2;
add.f32 %f46, %f46, %f31;
add.s32 %r3, %r35, %r10;
setp.ge.u32 %p3, %r3, %r6;
@%p3 bra BB121_3;
mul.wide.u32 %rd7, %r3, 4;
add.s64 %rd8, %rd3, %rd7;
ld.global.f32 %f32, [%rd8];
setp.neu.f32 %p4, %f32, 0f00000000;
selp.f32 %f33, 0f3F800000, 0f00000000, %p4;
add.f32 %f46, %f46, %f33;
BB121_3:
shl.b32 %r13, %r10, 1;
mov.u32 %r14, %nctaid.x;
mad.lo.s32 %r35, %r13, %r14, %r35;
setp.lt.u32 %p5, %r35, %r6;
@%p5 bra BB121_1;
BB121_4:
shl.b32 %r16, %r7, 2;
mov.u32 %r17, memory;
add.s32 %r5, %r17, %r16;
st.shared.f32 [%r5], %f46;
bar.sync 0;
setp.lt.u32 %p6, %r10, 1024;
@%p6 bra BB121_8;
setp.gt.u32 %p7, %r7, 511;
@%p7 bra BB121_7;
ld.shared.f32 %f34, [%r5+2048];
add.f32 %f46, %f46, %f34;
st.shared.f32 [%r5], %f46;
BB121_7:
bar.sync 0;
BB121_8:
setp.lt.u32 %p8, %r10, 512;
@%p8 bra BB121_12;
setp.gt.u32 %p9, %r7, 255;
@%p9 bra BB121_11;
ld.shared.f32 %f35, [%r5+1024];
add.f32 %f46, %f46, %f35;
st.shared.f32 [%r5], %f46;
BB121_11:
bar.sync 0;
BB121_12:
setp.lt.u32 %p10, %r10, 256;
@%p10 bra BB121_16;
setp.gt.u32 %p11, %r7, 127;
@%p11 bra BB121_15;
ld.shared.f32 %f36, [%r5+512];
add.f32 %f46, %f46, %f36;
st.shared.f32 [%r5], %f46;
BB121_15:
bar.sync 0;
BB121_16:
setp.lt.u32 %p12, %r10, 128;
@%p12 bra BB121_20;
setp.gt.u32 %p13, %r7, 63;
@%p13 bra BB121_19;
ld.shared.f32 %f37, [%r5+256];
add.f32 %f46, %f46, %f37;
st.shared.f32 [%r5], %f46;
BB121_19:
bar.sync 0;
BB121_20:
setp.gt.u32 %p14, %r7, 31;
@%p14 bra BB121_33;
setp.lt.u32 %p15, %r10, 64;
@%p15 bra BB121_23;
ld.volatile.shared.f32 %f38, [%r5+128];
add.f32 %f46, %f46, %f38;
st.volatile.shared.f32 [%r5], %f46;
BB121_23:
setp.lt.u32 %p16, %r10, 32;
@%p16 bra BB121_25;
ld.volatile.shared.f32 %f39, [%r5+64];
add.f32 %f46, %f46, %f39;
st.volatile.shared.f32 [%r5], %f46;
BB121_25:
setp.lt.u32 %p17, %r10, 16;
@%p17 bra BB121_27;
ld.volatile.shared.f32 %f40, [%r5+32];
add.f32 %f46, %f46, %f40;
st.volatile.shared.f32 [%r5], %f46;
BB121_27:
setp.lt.u32 %p18, %r10, 8;
@%p18 bra BB121_29;
ld.volatile.shared.f32 %f41, [%r5+16];
add.f32 %f46, %f46, %f41;
st.volatile.shared.f32 [%r5], %f46;
BB121_29:
setp.lt.u32 %p19, %r10, 4;
@%p19 bra BB121_31;
ld.volatile.shared.f32 %f42, [%r5+8];
add.f32 %f46, %f46, %f42;
st.volatile.shared.f32 [%r5], %f46;
BB121_31:
setp.lt.u32 %p20, %r10, 2;
@%p20 bra BB121_33;
ld.volatile.shared.f32 %f43, [%r5+4];
add.f32 %f44, %f46, %f43;
st.volatile.shared.f32 [%r5], %f44;
BB121_33:
setp.ne.s32 %p21, %r7, 0;
@%p21 bra BB121_35;
ld.shared.f32 %f45, [memory];
cvta.to.global.u64 %rd9, %rd2;
mul.wide.u32 %rd10, %r8, 4;
add.s64 %rd11, %rd9, %rd10;
st.global.f32 [%rd11], %f45;
BB121_35:
ret;
}
// .globl prepare_lstm_output_d
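//
// prepare_lstm_output_d: inverse layout shuffle of prepare_lstm_input_d.
// Gathers cuDNN's time-major output [T, N, M] (param_1) back into the
// batch-major [N, T*M] matrix (param_0); with N = param_2, T = param_3,
// M = param_4, output element n*(T*M) + t*M + m is read from
// (t*N + n)*M + m.
//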
.visible .entry prepare_lstm_output_d(
.param .u64 prepare_lstm_output_d_param_0,
.param .u64 prepare_lstm_output_d_param_1,
.param .u32 prepare_lstm_output_d_param_2,
.param .u32 prepare_lstm_output_d_param_3,
.param .u32 prepare_lstm_output_d_param_4,
.param .u32 prepare_lstm_output_d_param_5
)
{
.reg .pred %p<2>;
.reg .b32 %r<16>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [prepare_lstm_output_d_param_0];
ld.param.u64 %rd2, [prepare_lstm_output_d_param_1];
ld.param.u32 %r2, [prepare_lstm_output_d_param_2];
ld.param.u32 %r3, [prepare_lstm_output_d_param_3];
ld.param.u32 %r4, [prepare_lstm_output_d_param_4];
ld.param.u32 %r5, [prepare_lstm_output_d_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB122_2;
cvta.to.global.u64 %rd3, %rd2;
mul.lo.s32 %r9, %r4, %r3;
div.s32 %r10, %r1, %r9;
rem.s32 %r11, %r1, %r9;
div.s32 %r12, %r11, %r4;
rem.s32 %r13, %r11, %r4;
mad.lo.s32 %r14, %r12, %r2, %r10;
mad.lo.s32 %r15, %r14, %r4, %r13;
mul.wide.s32 %rd4, %r15, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvta.to.global.u64 %rd6, %rd1;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB122_2:
ret;
}
// .globl prepare_lstm_output_f
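// prepare_lstm_output_f: single-precision variant of
// prepare_lstm_output_d (same indexing, 4-byte elements).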
.visible .entry prepare_lstm_output_f(
.param .u64 prepare_lstm_output_f_param_0,
.param .u64 prepare_lstm_output_f_param_1,
.param .u32 prepare_lstm_output_f_param_2,
.param .u32 prepare_lstm_output_f_param_3,
.param .u32 prepare_lstm_output_f_param_4,
.param .u32 prepare_lstm_output_f_param_5
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<16>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [prepare_lstm_output_f_param_0];
ld.param.u64 %rd2, [prepare_lstm_output_f_param_1];
ld.param.u32 %r2, [prepare_lstm_output_f_param_2];
ld.param.u32 %r3, [prepare_lstm_output_f_param_3];
ld.param.u32 %r4, [prepare_lstm_output_f_param_4];
ld.param.u32 %r5, [prepare_lstm_output_f_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB123_2;
cvta.to.global.u64 %rd3, %rd2;
mul.lo.s32 %r9, %r4, %r3;
div.s32 %r10, %r1, %r9;
rem.s32 %r11, %r1, %r9;
div.s32 %r12, %r11, %r4;
rem.s32 %r13, %r11, %r4;
mad.lo.s32 %r14, %r12, %r2, %r10;
mad.lo.s32 %r15, %r14, %r4, %r13;
mul.wide.s32 %rd4, %r15, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvta.to.global.u64 %rd6, %rd1;
mul.wide.s32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB123_2:
ret;
}
// .globl prepare_lstm_backward_gradients_d
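//
// prepare_lstm_backward_gradients_d: scatters the incoming output
// gradient (param_0) into a time-major [T, N, M] buffer (param_1) for
// the cuDNN backward pass. When the final flag (param_6, plausibly a
// "return sequences" switch) is nonzero, the gradient is a full
// [N, T, M] tensor and element (n, t, m) moves to (t*N + n)*M + m
// (BB124_3); when it is zero, the gradient is only the last-step
// [N, M] slice and is written at offset (T-1)*N*M + idx (BB124_1), the
// other time slices presumably having been zeroed beforehand.
//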
.visible .entry prepare_lstm_backward_gradients_d(
.param .u64 prepare_lstm_backward_gradients_d_param_0,
.param .u64 prepare_lstm_backward_gradients_d_param_1,
.param .u32 prepare_lstm_backward_gradients_d_param_2,
.param .u32 prepare_lstm_backward_gradients_d_param_3,
.param .u32 prepare_lstm_backward_gradients_d_param_4,
.param .u32 prepare_lstm_backward_gradients_d_param_5,
.param .u32 prepare_lstm_backward_gradients_d_param_6
)
{
.reg .pred %p<5>;
.reg .b32 %r<20>;
.reg .f64 %fd<3>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [prepare_lstm_backward_gradients_d_param_0];
ld.param.u64 %rd4, [prepare_lstm_backward_gradients_d_param_1];
ld.param.u32 %r2, [prepare_lstm_backward_gradients_d_param_2];
ld.param.u32 %r3, [prepare_lstm_backward_gradients_d_param_3];
ld.param.u32 %r4, [prepare_lstm_backward_gradients_d_param_4];
ld.param.u32 %r5, [prepare_lstm_backward_gradients_d_param_5];
ld.param.u32 %r6, [prepare_lstm_backward_gradients_d_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.lt.s32 %p1, %r1, %r5;
setp.ne.s32 %p2, %r6, 0;
and.pred %p3, %p1, %p2;
cvta.to.global.u64 %rd5, %rd3;
mul.wide.s32 %rd6, %r1, 8;
add.s64 %rd2, %rd5, %rd6;
@%p3 bra BB124_3;
bra.uni BB124_1;
BB124_3:
mul.lo.s32 %r13, %r4, %r3;
div.s32 %r14, %r1, %r13;
rem.s32 %r15, %r1, %r13;
div.s32 %r16, %r15, %r4;
rem.s32 %r17, %r15, %r4;
ld.global.f64 %fd2, [%rd2];
mad.lo.s32 %r18, %r16, %r2, %r14;
mad.lo.s32 %r19, %r18, %r4, %r17;
mul.wide.s32 %rd9, %r19, 8;
add.s64 %rd10, %rd1, %rd9;
st.global.f64 [%rd10], %fd2;
bra.uni BB124_4;
BB124_1:
setp.ge.s32 %p4, %r1, %r5;
@%p4 bra BB124_4;
ld.global.f64 %fd1, [%rd2];
add.s32 %r10, %r3, -1;
mul.lo.s32 %r11, %r10, %r2;
mad.lo.s32 %r12, %r11, %r4, %r1;
mul.wide.s32 %rd7, %r12, 8;
add.s64 %rd8, %rd1, %rd7;
st.global.f64 [%rd8], %fd1;
BB124_4:
ret;
}
// .globl prepare_lstm_backward_gradients_f
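// prepare_lstm_backward_gradients_f: single-precision variant of the
// kernel above (same two paths, 4-byte elements).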
.visible .entry prepare_lstm_backward_gradients_f(
.param .u64 prepare_lstm_backward_gradients_f_param_0,
.param .u64 prepare_lstm_backward_gradients_f_param_1,
.param .u32 prepare_lstm_backward_gradients_f_param_2,
.param .u32 prepare_lstm_backward_gradients_f_param_3,
.param .u32 prepare_lstm_backward_gradients_f_param_4,
.param .u32 prepare_lstm_backward_gradients_f_param_5,
.param .u32 prepare_lstm_backward_gradients_f_param_6
)
{
.reg .pred %p<5>;
.reg .f32 %f<3>;
.reg .b32 %r<20>;
.reg .b64 %rd<11>;
ld.param.u64 %rd3, [prepare_lstm_backward_gradients_f_param_0];
ld.param.u64 %rd4, [prepare_lstm_backward_gradients_f_param_1];
ld.param.u32 %r2, [prepare_lstm_backward_gradients_f_param_2];
ld.param.u32 %r3, [prepare_lstm_backward_gradients_f_param_3];
ld.param.u32 %r4, [prepare_lstm_backward_gradients_f_param_4];
ld.param.u32 %r5, [prepare_lstm_backward_gradients_f_param_5];
ld.param.u32 %r6, [prepare_lstm_backward_gradients_f_param_6];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r7, %r8, %r9;
setp.lt.s32 %p1, %r1, %r5;
setp.ne.s32 %p2, %r6, 0;
and.pred %p3, %p1, %p2;
cvta.to.global.u64 %rd5, %rd3;
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd2, %rd5, %rd6;
@%p3 bra BB125_3;
bra.uni BB125_1;
BB125_3:
mul.lo.s32 %r13, %r4, %r3;
div.s32 %r14, %r1, %r13;
rem.s32 %r15, %r1, %r13;
div.s32 %r16, %r15, %r4;
rem.s32 %r17, %r15, %r4;
ld.global.f32 %f2, [%rd2];
mad.lo.s32 %r18, %r16, %r2, %r14;
mad.lo.s32 %r19, %r18, %r4, %r17;
mul.wide.s32 %rd9, %r19, 4;
add.s64 %rd10, %rd1, %rd9;
st.global.f32 [%rd10], %f2;
bra.uni BB125_4;
BB125_1:
setp.ge.s32 %p4, %r1, %r5;
@%p4 bra BB125_4;
ld.global.f32 %f1, [%rd2];
add.s32 %r10, %r3, -1;
mul.lo.s32 %r11, %r10, %r2;
mad.lo.s32 %r12, %r11, %r4, %r1;
mul.wide.s32 %rd7, %r12, 4;
add.s64 %rd8, %rd1, %rd7;
st.global.f32 [%rd8], %f1;
BB125_4:
ret;
}
// .globl prepare_lstm_dweight_d
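//
// prepare_lstm_dweight_d: inverse of prepare_lstm_weight_d. Reads the
// flat cuDNN-style weight-space gradient (param_2) and scatters it back
// into the [D+M, 4M] dW matrix (param_0) and the 4M db vector
// (param_1), applying the same 2<->3 gate-block exchange (which is its
// own inverse). The index arithmetic matches prepare_lstm_weight_d with
// the load and store operands swapped.
//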
.visible .entry prepare_lstm_dweight_d(
.param .u64 prepare_lstm_dweight_d_param_0,
.param .u64 prepare_lstm_dweight_d_param_1,
.param .u64 prepare_lstm_dweight_d_param_2,
.param .u32 prepare_lstm_dweight_d_param_3,
.param .u32 prepare_lstm_dweight_d_param_4
)
{
.reg .pred %p<8>;
.reg .b32 %r<48>;
.reg .f64 %fd<3>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [prepare_lstm_dweight_d_param_0];
ld.param.u64 %rd3, [prepare_lstm_dweight_d_param_1];
ld.param.u64 %rd4, [prepare_lstm_dweight_d_param_2];
ld.param.u32 %r45, [prepare_lstm_dweight_d_param_3];
ld.param.u32 %r21, [prepare_lstm_dweight_d_param_4];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r22, %ntid.x;
mov.u32 %r23, %ctaid.x;
mov.u32 %r24, %tid.x;
mad.lo.s32 %r1, %r22, %r23, %r24;
add.s32 %r2, %r21, %r45;
shl.b32 %r3, %r21, 2;
mul.lo.s32 %r4, %r2, %r3;
setp.lt.s32 %p1, %r1, %r4;
@%p1 bra BB126_3;
bra.uni BB126_1;
BB126_3:
mul.lo.s32 %r5, %r21, %r45;
mul.lo.s32 %r47, %r21, %r21;
shl.b32 %r7, %r5, 2;
setp.lt.s32 %p5, %r1, %r7;
@%p5 bra BB126_5;
bra.uni BB126_4;
BB126_5:
rem.s32 %r44, %r1, %r5;
div.s32 %r42, %r44, %r21;
mov.u32 %r43, %r42;
mov.u32 %r46, %r1;
mov.u32 %r47, %r5;
bra.uni BB126_6;
BB126_1:
add.s32 %r25, %r2, 1;
mul.lo.s32 %r26, %r25, %r3;
setp.ge.s32 %p2, %r1, %r26;
@%p2 bra BB126_7;
cvta.to.global.u64 %rd5, %rd3;
sub.s32 %r27, %r1, %r4;
div.s32 %r28, %r27, %r21;
setp.lt.s32 %p3, %r28, 2;
setp.eq.s32 %p4, %r28, 2;
selp.b32 %r29, 3, 2, %p4;
selp.b32 %r30, %r28, %r29, %p3;
rem.s32 %r31, %r27, %r21;
mad.lo.s32 %r32, %r30, %r21, %r31;
mul.wide.s32 %rd6, %r1, 8;
add.s64 %rd7, %rd1, %rd6;
ld.global.f64 %fd1, [%rd7];
mul.wide.s32 %rd8, %r32, 8;
add.s64 %rd9, %rd5, %rd8;
st.global.f64 [%rd9], %fd1;
bra.uni BB126_7;
BB126_4:
sub.s32 %r46, %r1, %r7;
rem.s32 %r44, %r46, %r47;
div.s32 %r43, %r44, %r21;
add.s32 %r42, %r43, %r45;
mov.u32 %r45, %r21;
BB126_6:
cvta.to.global.u64 %rd10, %rd2;
div.s32 %r33, %r46, %r47;
setp.eq.s32 %p6, %r33, 2;
selp.b32 %r34, 3, 2, %p6;
setp.lt.s32 %p7, %r33, 2;
selp.b32 %r35, %r33, %r34, %p7;
rem.s32 %r36, %r44, %r21;
sub.s32 %r37, %r1, %r44;
add.s32 %r38, %r37, %r43;
mad.lo.s32 %r39, %r36, %r45, %r38;
mad.lo.s32 %r40, %r42, %r3, %r36;
mad.lo.s32 %r41, %r35, %r21, %r40;
mul.wide.s32 %rd11, %r39, 8;
add.s64 %rd12, %rd1, %rd11;
ld.global.f64 %fd2, [%rd12];
mul.wide.s32 %rd13, %r41, 8;
add.s64 %rd14, %rd10, %rd13;
st.global.f64 [%rd14], %fd2;
BB126_7:
ret;
}
// .globl prepare_lstm_dweight_f
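// prepare_lstm_dweight_f: single-precision variant of
// prepare_lstm_dweight_d.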
.visible .entry prepare_lstm_dweight_f(
.param .u64 prepare_lstm_dweight_f_param_0,
.param .u64 prepare_lstm_dweight_f_param_1,
.param .u64 prepare_lstm_dweight_f_param_2,
.param .u32 prepare_lstm_dweight_f_param_3,
.param .u32 prepare_lstm_dweight_f_param_4
)
{
.reg .pred %p<8>;
.reg .f32 %f<3>;
.reg .b32 %r<48>;
.reg .b64 %rd<15>;
ld.param.u64 %rd2, [prepare_lstm_dweight_f_param_0];
ld.param.u64 %rd3, [prepare_lstm_dweight_f_param_1];
ld.param.u64 %rd4, [prepare_lstm_dweight_f_param_2];
ld.param.u32 %r45, [prepare_lstm_dweight_f_param_3];
ld.param.u32 %r21, [prepare_lstm_dweight_f_param_4];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r22, %ntid.x;
mov.u32 %r23, %ctaid.x;
mov.u32 %r24, %tid.x;
mad.lo.s32 %r1, %r22, %r23, %r24;
add.s32 %r2, %r21, %r45;
shl.b32 %r3, %r21, 2;
mul.lo.s32 %r4, %r2, %r3;
setp.lt.s32 %p1, %r1, %r4;
@%p1 bra BB127_3;
bra.uni BB127_1;
BB127_3:
mul.lo.s32 %r5, %r21, %r45;
mul.lo.s32 %r47, %r21, %r21;
shl.b32 %r7, %r5, 2;
setp.lt.s32 %p5, %r1, %r7;
@%p5 bra BB127_5;
bra.uni BB127_4;
BB127_5:
rem.s32 %r44, %r1, %r5;
div.s32 %r42, %r44, %r21;
mov.u32 %r43, %r42;
mov.u32 %r46, %r1;
mov.u32 %r47, %r5;
bra.uni BB127_6;
BB127_1:
add.s32 %r25, %r2, 1;
mul.lo.s32 %r26, %r25, %r3;
setp.ge.s32 %p2, %r1, %r26;
@%p2 bra BB127_7;
cvta.to.global.u64 %rd5, %rd3;
sub.s32 %r27, %r1, %r4;
div.s32 %r28, %r27, %r21;
setp.lt.s32 %p3, %r28, 2;
setp.eq.s32 %p4, %r28, 2;
selp.b32 %r29, 3, 2, %p4;
selp.b32 %r30, %r28, %r29, %p3;
rem.s32 %r31, %r27, %r21;
mad.lo.s32 %r32, %r30, %r21, %r31;
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd7, %rd1, %rd6;
ld.global.f32 %f1, [%rd7];
mul.wide.s32 %rd8, %r32, 4;
add.s64 %rd9, %rd5, %rd8;
st.global.f32 [%rd9], %f1;
bra.uni BB127_7;
BB127_4:
sub.s32 %r46, %r1, %r7;
rem.s32 %r44, %r46, %r47;
div.s32 %r43, %r44, %r21;
add.s32 %r42, %r43, %r45;
mov.u32 %r45, %r21;
BB127_6:
cvta.to.global.u64 %rd10, %rd2;
div.s32 %r33, %r46, %r47;
setp.eq.s32 %p6, %r33, 2;
selp.b32 %r34, 3, 2, %p6;
setp.lt.s32 %p7, %r33, 2;
selp.b32 %r35, %r33, %r34, %p7;
rem.s32 %r36, %r44, %r21;
sub.s32 %r37, %r1, %r44;
add.s32 %r38, %r37, %r43;
mad.lo.s32 %r39, %r36, %r45, %r38;
mad.lo.s32 %r40, %r42, %r3, %r36;
mad.lo.s32 %r41, %r35, %r21, %r40;
mul.wide.s32 %rd11, %r39, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f2, [%rd12];
mul.wide.s32 %rd13, %r41, 4;
add.s64 %rd14, %rd10, %rd13;
st.global.f32 [%rd14], %f2;
BB127_7:
ret;
}
// .globl prepare_lstm_dinput_d
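//
// prepare_lstm_dinput_d: inverse of prepare_lstm_input_d. Gathers the
// time-major input gradient [T, N, D] (param_1) back into the row-major
// [N, T*D] dX matrix (param_0): output element n*(T*D) + t*D + d is
// read from (t*N + n)*D + d, with the same N, D, T*D, size parameters.
//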
.visible .entry prepare_lstm_dinput_d(
.param .u64 prepare_lstm_dinput_d_param_0,
.param .u64 prepare_lstm_dinput_d_param_1,
.param .u32 prepare_lstm_dinput_d_param_2,
.param .u32 prepare_lstm_dinput_d_param_3,
.param .u32 prepare_lstm_dinput_d_param_4,
.param .u32 prepare_lstm_dinput_d_param_5
)
{
.reg .pred %p<2>;
.reg .b32 %r<15>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [prepare_lstm_dinput_d_param_0];
ld.param.u64 %rd2, [prepare_lstm_dinput_d_param_1];
ld.param.u32 %r2, [prepare_lstm_dinput_d_param_2];
ld.param.u32 %r3, [prepare_lstm_dinput_d_param_3];
ld.param.u32 %r4, [prepare_lstm_dinput_d_param_4];
ld.param.u32 %r5, [prepare_lstm_dinput_d_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB128_2;
cvta.to.global.u64 %rd3, %rd2;
rem.s32 %r9, %r1, %r4;
div.s32 %r10, %r9, %r3;
rem.s32 %r11, %r9, %r3;
div.s32 %r12, %r1, %r4;
mad.lo.s32 %r13, %r10, %r2, %r12;
mad.lo.s32 %r14, %r13, %r3, %r11;
mul.wide.s32 %rd4, %r14, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvta.to.global.u64 %rd6, %rd1;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB128_2:
ret;
}
// .globl prepare_lstm_dinput_f
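// prepare_lstm_dinput_f: single-precision variant of
// prepare_lstm_dinput_d.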
.visible .entry prepare_lstm_dinput_f(
.param .u64 prepare_lstm_dinput_f_param_0,
.param .u64 prepare_lstm_dinput_f_param_1,
.param .u32 prepare_lstm_dinput_f_param_2,
.param .u32 prepare_lstm_dinput_f_param_3,
.param .u32 prepare_lstm_dinput_f_param_4,
.param .u32 prepare_lstm_dinput_f_param_5
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<15>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [prepare_lstm_dinput_f_param_0];
ld.param.u64 %rd2, [prepare_lstm_dinput_f_param_1];
ld.param.u32 %r2, [prepare_lstm_dinput_f_param_2];
ld.param.u32 %r3, [prepare_lstm_dinput_f_param_3];
ld.param.u32 %r4, [prepare_lstm_dinput_f_param_4];
ld.param.u32 %r5, [prepare_lstm_dinput_f_param_5];
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
setp.ge.s32 %p1, %r1, %r5;
@%p1 bra BB129_2;
cvta.to.global.u64 %rd3, %rd2;
rem.s32 %r9, %r1, %r4;
div.s32 %r10, %r9, %r3;
rem.s32 %r11, %r9, %r3;
div.s32 %r12, %r1, %r4;
mad.lo.s32 %r13, %r10, %r2, %r12;
mad.lo.s32 %r14, %r13, %r3, %r11;
mul.wide.s32 %rd4, %r14, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvta.to.global.u64 %rd6, %rd1;
mul.wide.s32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB129_2:
ret;
}
// .globl colwise_reshape_d
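//
// colwise_reshape_d: reshapes between two row-major-stored matrices
// while preserving column-major element order. For output cell (i, j)
// of a rows_out x cols_out matrix, the column-major rank
// j*rows_out + i is mapped back to input cell
// (rank % rows_in, rank / rows_in) of a rows_in x cols_in matrix.
// Hedged CUDA sketch (parameter names inferred from the index
// arithmetic):
//
// extern "C" __global__ void colwise_reshape_d(const double *in,
//     double *out, unsigned int size, unsigned int rowsIn,
//     unsigned int colsIn, unsigned int rowsOut, unsigned int colsOut) {
//   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//   if (i < size) {
//     unsigned int rank = (i % colsOut) * rowsOut + i / colsOut;
//     out[i] = in[(rank % rowsIn) * colsIn + rank / rowsIn];
//   }
// }
//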
.visible .entry colwise_reshape_d(
.param .u64 colwise_reshape_d_param_0,
.param .u64 colwise_reshape_d_param_1,
.param .u32 colwise_reshape_d_param_2,
.param .u32 colwise_reshape_d_param_3,
.param .u32 colwise_reshape_d_param_4,
.param .u32 colwise_reshape_d_param_5,
.param .u32 colwise_reshape_d_param_6
)
{
.reg .pred %p<2>;
.reg .b32 %r<16>;
.reg .f64 %fd<2>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [colwise_reshape_d_param_0];
ld.param.u64 %rd2, [colwise_reshape_d_param_1];
ld.param.u32 %r6, [colwise_reshape_d_param_2];
ld.param.u32 %r2, [colwise_reshape_d_param_3];
ld.param.u32 %r3, [colwise_reshape_d_param_4];
ld.param.u32 %r4, [colwise_reshape_d_param_5];
ld.param.u32 %r5, [colwise_reshape_d_param_6];
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r8, %r7, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB130_2;
cvta.to.global.u64 %rd3, %rd1;
rem.u32 %r10, %r1, %r5;
div.u32 %r11, %r1, %r5;
mad.lo.s32 %r12, %r10, %r4, %r11;
rem.u32 %r13, %r12, %r2;
div.u32 %r14, %r12, %r2;
mad.lo.s32 %r15, %r13, %r3, %r14;
mul.wide.u32 %rd4, %r15, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 8;
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
BB130_2:
ret;
}
// .globl colwise_reshape_f
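// colwise_reshape_f: single-precision variant of colwise_reshape_d.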
.visible .entry colwise_reshape_f(
.param .u64 colwise_reshape_f_param_0,
.param .u64 colwise_reshape_f_param_1,
.param .u32 colwise_reshape_f_param_2,
.param .u32 colwise_reshape_f_param_3,
.param .u32 colwise_reshape_f_param_4,
.param .u32 colwise_reshape_f_param_5,
.param .u32 colwise_reshape_f_param_6
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<16>;
.reg .b64 %rd<9>;
ld.param.u64 %rd1, [colwise_reshape_f_param_0];
ld.param.u64 %rd2, [colwise_reshape_f_param_1];
ld.param.u32 %r6, [colwise_reshape_f_param_2];
ld.param.u32 %r2, [colwise_reshape_f_param_3];
ld.param.u32 %r3, [colwise_reshape_f_param_4];
ld.param.u32 %r4, [colwise_reshape_f_param_5];
ld.param.u32 %r5, [colwise_reshape_f_param_6];
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %tid.x;
mad.lo.s32 %r1, %r8, %r7, %r9;
setp.ge.u32 %p1, %r1, %r6;
@%p1 bra BB131_2;
cvta.to.global.u64 %rd3, %rd1;
rem.u32 %r10, %r1, %r5;
div.u32 %r11, %r1, %r5;
mad.lo.s32 %r12, %r10, %r4, %r11;
rem.u32 %r13, %r12, %r2;
div.u32 %r14, %r12, %r2;
mad.lo.s32 %r15, %r13, %r3, %r14;
mul.wide.u32 %rd4, %r15, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.f32 %f1, [%rd5];
cvta.to.global.u64 %rd6, %rd2;
mul.wide.s32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
BB131_2:
ret;
}
// .globl update_nesterov_x_d
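//
// update_nesterov_x_d: the position update of SGD with Nesterov
// momentum. With x = param_0, updated velocity v = param_1, previous
// velocity v_prev = param_2, momentum mu = param_3 and out = param_4,
// each element computes out = x - mu*v_prev + (1 + mu)*v. Hedged CUDA
// sketch (names assumed):
//
// extern "C" __global__ void update_nesterov_x_d(const double *x,
//     const double *v, const double *vPrev, double mu, double *out,
//     unsigned int size) {
//   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//   if (i < size)
//     out[i] = x[i] - mu * vPrev[i] + (1.0 + mu) * v[i];
// }
//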
.visible .entry update_nesterov_x_d(
.param .u64 update_nesterov_x_d_param_0,
.param .u64 update_nesterov_x_d_param_1,
.param .u64 update_nesterov_x_d_param_2,
.param .f64 update_nesterov_x_d_param_3,
.param .u64 update_nesterov_x_d_param_4,
.param .u32 update_nesterov_x_d_param_5
)
{
.reg .pred %p<2>;
.reg .b32 %r<6>;
.reg .f64 %fd<9>;
.reg .b64 %rd<14>;
ld.param.u64 %rd1, [update_nesterov_x_d_param_0];
ld.param.u64 %rd2, [update_nesterov_x_d_param_1];
ld.param.u64 %rd3, [update_nesterov_x_d_param_2];
ld.param.f64 %fd1, [update_nesterov_x_d_param_3];
ld.param.u64 %rd4, [update_nesterov_x_d_param_4];
ld.param.u32 %r2, [update_nesterov_x_d_param_5];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB132_2;
cvta.to.global.u64 %rd5, %rd1;
mul.wide.s32 %rd6, %r1, 8;
add.s64 %rd7, %rd5, %rd6;
cvta.to.global.u64 %rd8, %rd3;
add.s64 %rd9, %rd8, %rd6;
ld.global.f64 %fd2, [%rd9];
mul.f64 %fd3, %fd2, %fd1;
ld.global.f64 %fd4, [%rd7];
sub.f64 %fd5, %fd4, %fd3;
cvta.to.global.u64 %rd10, %rd2;
add.s64 %rd11, %rd10, %rd6;
ld.global.f64 %fd6, [%rd11];
add.f64 %fd7, %fd1, 0d3FF0000000000000;
fma.rn.f64 %fd8, %fd7, %fd6, %fd5;
cvta.to.global.u64 %rd12, %rd4;
add.s64 %rd13, %rd12, %rd6;
st.global.f64 [%rd13], %fd8;
BB132_2:
ret;
}
// .globl update_nesterov_x_f
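// update_nesterov_x_f: single-precision variant; float operands are
// widened to double, the same update is computed, and the result is
// rounded back to float. Note that mu remains a double parameter
// (.param .f64).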
.visible .entry update_nesterov_x_f(
.param .u64 update_nesterov_x_f_param_0,
.param .u64 update_nesterov_x_f_param_1,
.param .u64 update_nesterov_x_f_param_2,
.param .f64 update_nesterov_x_f_param_3,
.param .u64 update_nesterov_x_f_param_4,
.param .u32 update_nesterov_x_f_param_5
)
{
.reg .pred %p<2>;
.reg .f32 %f<5>;
.reg .b32 %r<6>;
.reg .f64 %fd<9>;
.reg .b64 %rd<14>;
ld.param.u64 %rd1, [update_nesterov_x_f_param_0];
ld.param.u64 %rd2, [update_nesterov_x_f_param_1];
ld.param.u64 %rd3, [update_nesterov_x_f_param_2];
ld.param.f64 %fd1, [update_nesterov_x_f_param_3];
ld.param.u64 %rd4, [update_nesterov_x_f_param_4];
ld.param.u32 %r2, [update_nesterov_x_f_param_5];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.u32 %p1, %r1, %r2;
@%p1 bra BB133_2;
cvta.to.global.u64 %rd5, %rd1;
mul.wide.s32 %rd6, %r1, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
cvt.f64.f32 %fd2, %f1;
cvta.to.global.u64 %rd8, %rd3;
add.s64 %rd9, %rd8, %rd6;
ld.global.f32 %f2, [%rd9];
cvt.f64.f32 %fd3, %f2;
mul.f64 %fd4, %fd3, %fd1;
sub.f64 %fd5, %fd2, %fd4;
cvta.to.global.u64 %rd10, %rd2;
add.s64 %rd11, %rd10, %rd6;
ld.global.f32 %f3, [%rd11];
cvt.f64.f32 %fd6, %f3;
add.f64 %fd7, %fd1, 0d3FF0000000000000;
fma.rn.f64 %fd8, %fd7, %fd6, %fd5;
cvt.rn.f32.f64 %f4, %fd8;
cvta.to.global.u64 %rd12, %rd4;
add.s64 %rd13, %rd12, %rd6;
st.global.f32 [%rd13], %f4;
BB133_2:
ret;
}
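//
// __internal_trig_reduction_slowpathd: Payne-Hanek style argument
// reduction for sin/cos of large |x|. The 53-bit mantissa (shifted and
// given an explicit leading bit via the or.b64 with the sign mask) is
// multiplied against 64-bit chunks of 2/pi from __cudart_i2opi_d, with
// 128-bit partial products built from the inline 32-bit mad/madc
// sequences. The quadrant is taken from the top two result bits
// (rounded by bit 61) and stored through param_1; the remaining
// fraction is normalized with clz, multiplied by pi/4 in 0.64 fixed
// point (the constant -3958705157555305931 == 0xC90FDAA22168C235), and
// reassembled into a double with the proper exponent and sign. NaN/Inf
// inputs (biased exponent 2047) are returned unchanged.
//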
.func (.param .b64 func_retval0) __internal_trig_reduction_slowpathd(
.param .b64 __internal_trig_reduction_slowpathd_param_0,
.param .b64 __internal_trig_reduction_slowpathd_param_1
)
{
.local .align 8 .b8 __local_depot134[40];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
.reg .b32 %r<42>;
.reg .f64 %fd<5>;
.reg .b64 %rd<101>;
mov.u64 %SPL, __local_depot134;
ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0];
ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1];
add.u64 %rd1, %SPL, 0;
{
.reg .b32 %temp;
mov.b64 {%temp, %r1}, %fd4;
}
and.b32 %r40, %r1, -2147483648;
shr.u32 %r3, %r1, 20;
bfe.u32 %r4, %r1, 20, 11;
setp.eq.s32 %p1, %r4, 2047;
@%p1 bra BB134_13;
add.s32 %r15, %r4, -1024;
shr.u32 %r16, %r15, 6;
mov.u32 %r17, 15;
sub.s32 %r5, %r17, %r16;
mov.u32 %r18, 19;
sub.s32 %r19, %r18, %r16;
mov.u32 %r20, 18;
min.s32 %r6, %r20, %r19;
mov.u64 %rd94, 0;
setp.ge.s32 %p2, %r5, %r6;
mov.u64 %rd93, %rd1;
@%p2 bra BB134_4;
bfe.u32 %r21, %r1, 20, 11;
add.s32 %r22, %r21, -1024;
shr.u32 %r23, %r22, 6;
sub.s32 %r25, %r17, %r23;
mul.wide.s32 %rd41, %r25, 8;
mov.u64 %rd42, __cudart_i2opi_d;
add.s64 %rd89, %rd42, %rd41;
mov.b64 %rd43, %fd4;
shl.b64 %rd44, %rd43, 11;
or.b64 %rd5, %rd44, -9223372036854775808;
mov.u64 %rd94, 0;
mov.u64 %rd93, %rd1;
mov.u64 %rd91, %rd1;
mov.u32 %r39, %r5;
BB134_3:
.pragma "nounroll";
ld.const.u64 %rd47, [%rd89];
// inline asm
{
.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi, clo, chi;
mov.b64 {alo,ahi}, %rd47;
mov.b64 {blo,bhi}, %rd5;
mov.b64 {clo,chi}, %rd94;
mad.lo.cc.u32 r0, alo, blo, clo;
madc.hi.cc.u32 r1, alo, blo, chi;
madc.hi.u32 r2, alo, bhi, 0;
mad.lo.cc.u32 r1, alo, bhi, r1;
madc.hi.cc.u32 r2, ahi, blo, r2;
madc.hi.u32 r3, ahi, bhi, 0;
mad.lo.cc.u32 r1, ahi, blo, r1;
madc.lo.cc.u32 r2, ahi, bhi, r2;
addc.u32 r3, r3, 0;
mov.b64 %rd45, {r0,r1};
mov.b64 %rd94, {r2,r3};
}
// inline asm
st.local.u64 [%rd91], %rd45;
add.s32 %r39, %r39, 1;
sub.s32 %r26, %r39, %r5;
mul.wide.s32 %rd50, %r26, 8;
add.s64 %rd91, %rd1, %rd50;
add.s64 %rd93, %rd93, 8;
add.s64 %rd89, %rd89, 8;
setp.lt.s32 %p3, %r39, %r6;
@%p3 bra BB134_3;
BB134_4:
st.local.u64 [%rd93], %rd94;
ld.local.u64 %rd95, [%rd1+16];
ld.local.u64 %rd96, [%rd1+24];
and.b32 %r9, %r3, 63;
setp.eq.s32 %p4, %r9, 0;
@%p4 bra BB134_6;
mov.u32 %r27, 64;
sub.s32 %r28, %r27, %r9;
shl.b64 %rd51, %rd96, %r9;
shr.u64 %rd52, %rd95, %r28;
or.b64 %rd96, %rd51, %rd52;
shl.b64 %rd53, %rd95, %r9;
ld.local.u64 %rd54, [%rd1+8];
shr.u64 %rd55, %rd54, %r28;
or.b64 %rd95, %rd55, %rd53;
BB134_6:
shr.u64 %rd56, %rd96, 62;
cvt.u32.u64 %r29, %rd56;
shr.u64 %rd57, %rd95, 62;
shl.b64 %rd58, %rd96, 2;
or.b64 %rd98, %rd58, %rd57;
shl.b64 %rd97, %rd95, 2;
shr.u64 %rd59, %rd96, 61;
cvt.u32.u64 %r30, %rd59;
and.b32 %r31, %r30, 1;
add.s32 %r32, %r31, %r29;
neg.s32 %r33, %r32;
setp.eq.s32 %p5, %r40, 0;
selp.b32 %r34, %r32, %r33, %p5;
cvta.to.local.u64 %rd60, %rd37;
st.local.u32 [%rd60], %r34;
setp.eq.s32 %p6, %r31, 0;
@%p6 bra BB134_8;
mov.u64 %rd64, 0;
// inline asm
{
.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
mov.b64 {a0,a1}, %rd64;
mov.b64 {a2,a3}, %rd64;
mov.b64 {b0,b1}, %rd97;
mov.b64 {b2,b3}, %rd98;
sub.cc.u32 r0, a0, b0;
subc.cc.u32 r1, a1, b1;
subc.cc.u32 r2, a2, b2;
subc.u32 r3, a3, b3;
mov.b64 %rd97, {r0,r1};
mov.b64 %rd98, {r2,r3};
}
// inline asm
xor.b32 %r40, %r40, -2147483648;
BB134_8:
clz.b64 %r41, %rd98;
setp.eq.s32 %p7, %r41, 0;
@%p7 bra BB134_10;
shl.b64 %rd67, %rd98, %r41;
mov.u32 %r35, 64;
sub.s32 %r36, %r35, %r41;
shr.u64 %rd68, %rd97, %r36;
or.b64 %rd98, %rd68, %rd67;
BB134_10:
mov.u64 %rd72, -3958705157555305931;
// inline asm
{
.reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;
mov.b64 {alo,ahi}, %rd98;
mov.b64 {blo,bhi}, %rd72;
mul.lo.u32 r0, alo, blo;
mul.hi.u32 r1, alo, blo;
mad.lo.cc.u32 r1, alo, bhi, r1;
madc.hi.u32 r2, alo, bhi, 0;
mad.lo.cc.u32 r1, ahi, blo, r1;
madc.hi.cc.u32 r2, ahi, blo, r2;
madc.hi.u32 r3, ahi, bhi, 0;
mad.lo.cc.u32 r2, ahi, bhi, r2;
addc.u32 r3, r3, 0;
mov.b64 %rd69, {r0,r1};
mov.b64 %rd100, {r2,r3};
}
// inline asm
setp.lt.s64 %p8, %rd100, 1;
@%p8 bra BB134_12;
// inline asm
{
.reg .u32 r0, r1, r2, r3, a0, a1, a2, a3, b0, b1, b2, b3;
mov.b64 {a0,a1}, %rd69;
mov.b64 {a2,a3}, %rd100;
mov.b64 {b0,b1}, %rd69;
mov.b64 {b2,b3}, %rd100;
add.cc.u32 r0, a0, b0;
addc.cc.u32 r1, a1, b1;
addc.cc.u32 r2, a2, b2;
addc.u32 r3, a3, b3;
mov.b64 %rd73, {r0,r1};
mov.b64 %rd100, {r2,r3};
}
// inline asm
add.s32 %r41, %r41, 1;
BB134_12:
cvt.u64.u32 %rd79, %r40;
shl.b64 %rd80, %rd79, 32;
mov.u32 %r37, 1022;
sub.s32 %r38, %r37, %r41;
cvt.u64.u32 %rd81, %r38;
shl.b64 %rd82, %rd81, 52;
add.s64 %rd83, %rd100, 1;
shr.u64 %rd84, %rd83, 10;
add.s64 %rd85, %rd84, 1;
shr.u64 %rd86, %rd85, 1;
add.s64 %rd87, %rd86, %rd82;
or.b64 %rd88, %rd87, %rd80;
mov.b64 %fd4, %rd88;
BB134_13:
st.param.f64 [func_retval0+0], %fd4;
ret;
}
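//
// __internal_accurate_pow: pow(a, b) as exp(b * log(a)) carried in
// head/tail (roughly double-double) precision. The input is split into
// exponent and a mantissa near 1, log is evaluated from an atanh-style
// series (coefficients 1/12, 1/80, 1/448, ...) with compensated
// summation, the multiply by b keeps a rounding remainder (%fd5) that
// is folded back in at the end (fma %fd136, %fd5, %fd136, skipped if
// the result is infinite), and the exp stage uses the usual 2^k
// scaling. Arguments of the exp whose magnitude falls between roughly
// 708.4 and 745.1 (the 0f4086232B / 0f40874800 high-word compares) are
// rescaled with a split exponent to stay accurate near overflow and in
// the denormal range; beyond that the result saturates to +inf or 0
// depending on sign.
//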
.func (.param .b64 func_retval0) __internal_accurate_pow(
.param .b64 __internal_accurate_pow_param_0,
.param .b64 __internal_accurate_pow_param_1
)
{
.reg .pred %p<9>;
.reg .f32 %f<3>;
.reg .b32 %r<53>;
.reg .f64 %fd<138>;
ld.param.f64 %fd12, [__internal_accurate_pow_param_0];
ld.param.f64 %fd13, [__internal_accurate_pow_param_1];
{
.reg .b32 %temp;
mov.b64 {%temp, %r50}, %fd12;
}
{
.reg .b32 %temp;
mov.b64 {%r49, %temp}, %fd12;
}
shr.u32 %r51, %r50, 20;
setp.ne.s32 %p1, %r51, 0;
@%p1 bra BB135_2;
mul.f64 %fd14, %fd12, 0d4350000000000000;
{
.reg .b32 %temp;
mov.b64 {%temp, %r50}, %fd14;
}
{
.reg .b32 %temp;
mov.b64 {%r49, %temp}, %fd14;
}
shr.u32 %r16, %r50, 20;
add.s32 %r51, %r16, -54;
BB135_2:
add.s32 %r52, %r51, -1023;
and.b32 %r17, %r50, -2146435073;
or.b32 %r18, %r17, 1072693248;
mov.b64 %fd135, {%r49, %r18};
setp.lt.u32 %p2, %r18, 1073127583;
@%p2 bra BB135_4;
{
.reg .b32 %temp;
mov.b64 {%r19, %temp}, %fd135;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r20}, %fd135;
}
add.s32 %r21, %r20, -1048576;
mov.b64 %fd135, {%r19, %r21};
add.s32 %r52, %r51, -1022;
BB135_4:
add.f64 %fd15, %fd135, 0d3FF0000000000000;
rcp.approx.ftz.f64 %fd16, %fd15;
neg.f64 %fd17, %fd15;
mov.f64 %fd18, 0d3FF0000000000000;
fma.rn.f64 %fd19, %fd17, %fd16, %fd18;
fma.rn.f64 %fd20, %fd19, %fd19, %fd19;
fma.rn.f64 %fd21, %fd20, %fd16, %fd16;
add.f64 %fd22, %fd135, 0dBFF0000000000000;
mul.f64 %fd23, %fd22, %fd21;
fma.rn.f64 %fd24, %fd22, %fd21, %fd23;
mul.f64 %fd25, %fd24, %fd24;
mov.f64 %fd26, 0d3ED0F5D241AD3B5A;
mov.f64 %fd27, 0d3EB0F5FF7D2CAFE2;
fma.rn.f64 %fd28, %fd27, %fd25, %fd26;
mov.f64 %fd29, 0d3EF3B20A75488A3F;
fma.rn.f64 %fd30, %fd28, %fd25, %fd29;
mov.f64 %fd31, 0d3F1745CDE4FAECD5;
fma.rn.f64 %fd32, %fd30, %fd25, %fd31;
mov.f64 %fd33, 0d3F3C71C7258A578B;
fma.rn.f64 %fd34, %fd32, %fd25, %fd33;
mov.f64 %fd35, 0d3F6249249242B910;
fma.rn.f64 %fd36, %fd34, %fd25, %fd35;
mov.f64 %fd37, 0d3F89999999999DFB;
fma.rn.f64 %fd38, %fd36, %fd25, %fd37;
sub.f64 %fd39, %fd22, %fd24;
add.f64 %fd40, %fd39, %fd39;
neg.f64 %fd41, %fd24;
fma.rn.f64 %fd42, %fd41, %fd22, %fd40;
mul.f64 %fd43, %fd21, %fd42;
fma.rn.f64 %fd44, %fd25, %fd38, 0d3FB5555555555555;
mov.f64 %fd45, 0d3FB5555555555555;
sub.f64 %fd46, %fd45, %fd44;
fma.rn.f64 %fd47, %fd25, %fd38, %fd46;
add.f64 %fd48, %fd47, 0d0000000000000000;
add.f64 %fd49, %fd48, 0dBC46A4CB00B9E7B0;
add.f64 %fd50, %fd44, %fd49;
sub.f64 %fd51, %fd44, %fd50;
add.f64 %fd52, %fd49, %fd51;
mul.rn.f64 %fd53, %fd24, %fd24;
neg.f64 %fd54, %fd53;
fma.rn.f64 %fd55, %fd24, %fd24, %fd54;
{
.reg .b32 %temp;
mov.b64 {%r22, %temp}, %fd43;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r23}, %fd43;
}
add.s32 %r24, %r23, 1048576;
mov.b64 %fd56, {%r22, %r24};
fma.rn.f64 %fd57, %fd24, %fd56, %fd55;
mul.rn.f64 %fd58, %fd53, %fd24;
neg.f64 %fd59, %fd58;
fma.rn.f64 %fd60, %fd53, %fd24, %fd59;
fma.rn.f64 %fd61, %fd53, %fd43, %fd60;
fma.rn.f64 %fd62, %fd57, %fd24, %fd61;
mul.rn.f64 %fd63, %fd50, %fd58;
neg.f64 %fd64, %fd63;
fma.rn.f64 %fd65, %fd50, %fd58, %fd64;
fma.rn.f64 %fd66, %fd50, %fd62, %fd65;
fma.rn.f64 %fd67, %fd52, %fd58, %fd66;
add.f64 %fd68, %fd63, %fd67;
sub.f64 %fd69, %fd63, %fd68;
add.f64 %fd70, %fd67, %fd69;
add.f64 %fd71, %fd24, %fd68;
sub.f64 %fd72, %fd24, %fd71;
add.f64 %fd73, %fd68, %fd72;
add.f64 %fd74, %fd70, %fd73;
add.f64 %fd75, %fd43, %fd74;
add.f64 %fd76, %fd71, %fd75;
sub.f64 %fd77, %fd71, %fd76;
add.f64 %fd78, %fd75, %fd77;
xor.b32 %r25, %r52, -2147483648;
mov.u32 %r26, -2147483648;
mov.u32 %r27, 1127219200;
mov.b64 %fd79, {%r25, %r27};
mov.b64 %fd80, {%r26, %r27};
sub.f64 %fd81, %fd79, %fd80;
mov.f64 %fd82, 0d3FE62E42FEFA39EF;
fma.rn.f64 %fd83, %fd81, %fd82, %fd76;
neg.f64 %fd84, %fd81;
fma.rn.f64 %fd85, %fd84, %fd82, %fd83;
sub.f64 %fd86, %fd85, %fd76;
sub.f64 %fd87, %fd78, %fd86;
mov.f64 %fd88, 0d3C7ABC9E3B39803F;
fma.rn.f64 %fd89, %fd81, %fd88, %fd87;
add.f64 %fd90, %fd83, %fd89;
sub.f64 %fd91, %fd83, %fd90;
add.f64 %fd92, %fd89, %fd91;
{
.reg .b32 %temp;
mov.b64 {%temp, %r28}, %fd13;
}
add.s32 %r29, %r28, %r28;
setp.gt.u32 %p3, %r29, -33554433;
and.b32 %r30, %r28, -15728641;
selp.b32 %r31, %r30, %r28, %p3;
{
.reg .b32 %temp;
mov.b64 {%r32, %temp}, %fd13;
}
mov.b64 %fd93, {%r32, %r31};
mul.rn.f64 %fd94, %fd90, %fd93;
neg.f64 %fd95, %fd94;
fma.rn.f64 %fd96, %fd90, %fd93, %fd95;
fma.rn.f64 %fd97, %fd92, %fd93, %fd96;
add.f64 %fd4, %fd94, %fd97;
sub.f64 %fd98, %fd94, %fd4;
add.f64 %fd5, %fd97, %fd98;
mov.f64 %fd99, 0d4338000000000000;
mov.f64 %fd100, 0d3FF71547652B82FE;
fma.rn.f64 %fd101, %fd4, %fd100, %fd99;
{
.reg .b32 %temp;
mov.b64 {%r13, %temp}, %fd101;
}
mov.f64 %fd102, 0dC338000000000000;
add.rn.f64 %fd103, %fd101, %fd102;
mov.f64 %fd104, 0dBFE62E42FEFA39EF;
fma.rn.f64 %fd105, %fd103, %fd104, %fd4;
mov.f64 %fd106, 0dBC7ABC9E3B39803F;
fma.rn.f64 %fd107, %fd103, %fd106, %fd105;
mov.f64 %fd108, 0d3E928AF3FCA213EA;
mov.f64 %fd109, 0d3E5ADE1569CE2BDF;
fma.rn.f64 %fd110, %fd109, %fd107, %fd108;
mov.f64 %fd111, 0d3EC71DEE62401315;
fma.rn.f64 %fd112, %fd110, %fd107, %fd111;
mov.f64 %fd113, 0d3EFA01997C89EB71;
fma.rn.f64 %fd114, %fd112, %fd107, %fd113;
mov.f64 %fd115, 0d3F2A01A014761F65;
fma.rn.f64 %fd116, %fd114, %fd107, %fd115;
mov.f64 %fd117, 0d3F56C16C1852B7AF;
fma.rn.f64 %fd118, %fd116, %fd107, %fd117;
mov.f64 %fd119, 0d3F81111111122322;
fma.rn.f64 %fd120, %fd118, %fd107, %fd119;
mov.f64 %fd121, 0d3FA55555555502A1;
fma.rn.f64 %fd122, %fd120, %fd107, %fd121;
mov.f64 %fd123, 0d3FC5555555555511;
fma.rn.f64 %fd124, %fd122, %fd107, %fd123;
mov.f64 %fd125, 0d3FE000000000000B;
fma.rn.f64 %fd126, %fd124, %fd107, %fd125;
fma.rn.f64 %fd127, %fd126, %fd107, %fd18;
fma.rn.f64 %fd128, %fd127, %fd107, %fd18;
{
.reg .b32 %temp;
mov.b64 {%r14, %temp}, %fd128;
}
{
.reg .b32 %temp;
mov.b64 {%temp, %r15}, %fd128;
}
shl.b32 %r33, %r13, 20;
add.s32 %r34, %r15, %r33;
mov.b64 %fd136, {%r14, %r34};
{
.reg .b32 %temp;
mov.b64 {%temp, %r35}, %fd4;
}
mov.b32 %f2, %r35;
abs.f32 %f1, %f2;
setp.lt.f32 %p4, %f1, 0f4086232B;
@%p4 bra BB135_7;
setp.lt.f64 %p5, %fd4, 0d0000000000000000;
add.f64 %fd129, %fd4, 0d7FF0000000000000;
selp.f64 %fd136, 0d0000000000000000, %fd129, %p5;
setp.geu.f32 %p6, %f1, 0f40874800;
@%p6 bra BB135_7;
mov.f64 %fd134, 0d4338000000000000;
mov.f64 %fd133, 0d3FF71547652B82FE;
fma.rn.f64 %fd132, %fd4, %fd133, %fd134;
{
.reg .b32 %temp;
mov.b64 {%r48, %temp}, %fd132;
}
shr.u32 %r36, %r48, 31;
add.s32 %r37, %r48, %r36;
shr.s32 %r38, %r37, 1;
shl.b32 %r39, %r38, 20;
add.s32 %r40, %r39, %r15;
mov.b64 %fd130, {%r14, %r40};
sub.s32 %r41, %r48, %r38;
shl.b32 %r42, %r41, 20;
add.s32 %r43, %r42, 1072693248;
mov.u32 %r44, 0;
mov.b64 %fd131, {%r44, %r43};
mul.f64 %fd136, %fd130, %fd131;
BB135_7:
{
.reg .b32 %temp;
mov.b64 {%temp, %r45}, %fd136;
}
and.b32 %r46, %r45, 2147483647;
setp.ne.s32 %p7, %r46, 2146435072;
@%p7 bra BB135_9;
{
.reg .b32 %temp;
mov.b64 {%r47, %temp}, %fd136;
}
setp.eq.s32 %p8, %r47, 0;
@%p8 bra BB135_10;
BB135_9:
fma.rn.f64 %fd136, %fd136, %fd5, %fd136;
BB135_10:
st.param.f64 [func_retval0+0], %fd136;
ret;
}