// blob: e5fe247ec158c702ec77bce9a54aa1b3f230bd48 [file] [log] [blame]
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
// _sum_float64_avx2 adds up len float64 values starting at buf and stores the
// total in the float64 that res points to.
//
// Arguments (24 bytes on the Go stack frame, no locals, nothing returned):
//   buf+0(FP)   address of the first float64        -> DI (rdi)
//   len+8(FP)   number of float64 elements          -> SI (rsi)
//   res+16(FP)  address of the float64 result cell  -> DX (rdx)
// The matching Go prototype is declared outside this file.
//
// Strategy:
//   * len == 0          : 0.0 is stored.
//   * len <  32         : plain scalar loop (vaddsd).
//   * otherwise         : the first r9 = len rounded down to a multiple of 32
//                         elements are summed into eight YMM accumulators
//                         (ymm0..ymm7, 4 doubles each = 32 doubles per block).
//                         (blocks mod 8) blocks are consumed one at a time,
//                         the rest eight blocks (256 doubles) per iteration.
//                         The accumulators are then folded into xmm0 and the
//                         remaining len - r9 elements are added by the scalar
//                         loop.
// Because partial sums are kept in separate lanes, the additions are performed
// in a different order than a left-to-right scalar sum, so the result can
// differ from such a sum in floating-point rounding.
//
// Registers written: AX, CX, SI, R8, R9, Y0-Y7 and flags. DI and DX are only read.
//
// Instructions are emitted as raw opcode bytes (LONG/WORD/BYTE/QUAD); the
// comment on each such line is the Intel-syntax disassembly of those bytes.

TEXT ·_sum_float64_avx2(SB), $0-24

	MOVQ buf+0(FP), DI
	MOVQ len+8(FP), SI
	MOVQ res+16(FP), DX

	// xmm0 is the running total; it is also the value stored when len == 0.
	LONG $0xc057f9c5 // vxorpd xmm0, xmm0, xmm0
	WORD $0x8548; BYTE $0xf6 // test rsi, rsi
	JE LBB0_14

	// Fewer than 32 elements: skip the vector path entirely.
	LONG $0x1ffe8348 // cmp rsi, 31
	JBE LBB0_2

	// r9 = len with the low five bits cleared = elements covered by the vector path.
	WORD $0x8949; BYTE $0xf1 // mov r9, rsi
	LONG $0xe0e18349 // and r9, -32
	JE LBB0_2

	// r8 = r9 - 32, so (r8 >> 5) + 1 is the number of 32-element blocks.
	// Only the low three bits are kept: rax = blocks mod 8, the number of
	// blocks handled one at a time before the 8-block loop.
	LONG $0xe0418d4d // lea r8, [r9 - 32]
	WORD $0x8944; BYTE $0xc0 // mov eax, r8d
	WORD $0xe8c1; BYTE $0x05 // shr eax, 5
	WORD $0xc0ff // inc eax
	LONG $0x07e08348 // and rax, 7
	JE LBB0_7

	// rax = -(blocks mod 8); it is counted up to zero by the loop below.
	// rcx = element index; ymm0..ymm7 = cleared accumulators.
	WORD $0xf748; BYTE $0xd8 // neg rax
	LONG $0xc057fdc5 // vxorpd ymm0, ymm0, ymm0
	WORD $0xc931 // xor ecx, ecx
	LONG $0xc957f5c5 // vxorpd ymm1, ymm1, ymm1
	LONG $0xd257edc5 // vxorpd ymm2, ymm2, ymm2
	LONG $0xdb57e5c5 // vxorpd ymm3, ymm3, ymm3
	LONG $0xe457ddc5 // vxorpd ymm4, ymm4, ymm4
	LONG $0xed57d5c5 // vxorpd ymm5, ymm5, ymm5
	LONG $0xf657cdc5 // vxorpd ymm6, ymm6, ymm6
	LONG $0xff57c5c5 // vxorpd ymm7, ymm7, ymm7

// Single-block loop: one block of 32 doubles (8 x 32 bytes) per iteration.
LBB0_9:
	LONG $0x0458fdc5; BYTE $0xcf // vaddpd ymm0, ymm0, yword [rdi + 8*rcx]
	LONG $0x4c58f5c5; WORD $0x20cf // vaddpd ymm1, ymm1, yword [rdi + 8*rcx + 32]
	LONG $0x5458edc5; WORD $0x40cf // vaddpd ymm2, ymm2, yword [rdi + 8*rcx + 64]
	LONG $0x5c58e5c5; WORD $0x60cf // vaddpd ymm3, ymm3, yword [rdi + 8*rcx + 96]
	QUAD $0x000080cfa458ddc5; BYTE $0x00 // vaddpd ymm4, ymm4, yword [rdi + 8*rcx + 128]
	QUAD $0x0000a0cfac58d5c5; BYTE $0x00 // vaddpd ymm5, ymm5, yword [rdi + 8*rcx + 160]
	QUAD $0x0000c0cfb458cdc5; BYTE $0x00 // vaddpd ymm6, ymm6, yword [rdi + 8*rcx + 192]
	QUAD $0x0000e0cfbc58c5c5; BYTE $0x00 // vaddpd ymm7, ymm7, yword [rdi + 8*rcx + 224]
	LONG $0x20c18348 // add rcx, 32
	WORD $0xff48; BYTE $0xc0 // inc rax
	JNE LBB0_9
	JMP LBB0_10

// Scalar-only path: every element is summed by the scalar loop, starting at index 0.
LBB0_2:
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d

// Scalar loop setup: rax = &buf[r9], rsi = len - r9 (non-zero on every path
// that reaches this label, which the count-down loop below relies on).
LBB0_3:
	LONG $0xcf048d4a // lea rax, [rdi + 8*r9]
	WORD $0x294c; BYTE $0xce // sub rsi, r9

// Scalar loop: one double per iteration, accumulated into the low lane of xmm0.
LBB0_4:
	LONG $0x0058fbc5 // vaddsd xmm0, xmm0, qword [rax]
	LONG $0x08c08348 // add rax, 8
	WORD $0xff48; BYTE $0xce // dec rsi
	JNE LBB0_4

// Common exit: write the total to *res and clear the upper YMM halves before returning.
LBB0_14:
	LONG $0x0211fbc5 // vmovsd qword [rdx], xmm0
	VZEROUPPER
	RET

// Block count is a multiple of 8: no single-block iterations are needed.
// Start at element 0 with cleared accumulators.
LBB0_7:
	WORD $0xc931 // xor ecx, ecx
	LONG $0xc057fdc5 // vxorpd ymm0, ymm0, ymm0
	LONG $0xc957f5c5 // vxorpd ymm1, ymm1, ymm1
	LONG $0xd257edc5 // vxorpd ymm2, ymm2, ymm2
	LONG $0xdb57e5c5 // vxorpd ymm3, ymm3, ymm3
	LONG $0xe457ddc5 // vxorpd ymm4, ymm4, ymm4
	LONG $0xed57d5c5 // vxorpd ymm5, ymm5, ymm5
	LONG $0xf657cdc5 // vxorpd ymm6, ymm6, ymm6
	LONG $0xff57c5c5 // vxorpd ymm7, ymm7, ymm7

// r8 < 224 means fewer than eight blocks in total, all of which were already
// consumed by the single-block loop, so go straight to the reduction.
LBB0_10:
	LONG $0xe0f88149; WORD $0x0000; BYTE $0x00 // cmp r8, 224
	JB LBB0_13

	// rax = vector-path elements still to process (a multiple of 256).
	// rcx becomes a byte pointer biased by +1792 so that the eight blocks of
	// one iteration are addressed with displacements -1792 .. +224.
	WORD $0x894c; BYTE $0xc8 // mov rax, r9
	WORD $0x2948; BYTE $0xc8 // sub rax, rcx
	QUAD $0x00000700cf8c8d48 // lea rcx, [rdi + 8*rcx + 1792]

// Eight-block loop: 256 doubles (2048 bytes) per iteration.
LBB0_12:
	QUAD $0xfffff9e0b958c5c5 // vaddpd ymm7, ymm7, yword [rcx - 1568]
	QUAD $0xfffff9c0b158cdc5 // vaddpd ymm6, ymm6, yword [rcx - 1600]
	QUAD $0xfffff9a0a958d5c5 // vaddpd ymm5, ymm5, yword [rcx - 1632]
	QUAD $0xfffff980a158ddc5 // vaddpd ymm4, ymm4, yword [rcx - 1664]
	QUAD $0xfffff9609958e5c5 // vaddpd ymm3, ymm3, yword [rcx - 1696]
	QUAD $0xfffff9409158edc5 // vaddpd ymm2, ymm2, yword [rcx - 1728]
	QUAD $0xfffff9208958f5c5 // vaddpd ymm1, ymm1, yword [rcx - 1760]
	QUAD $0xfffff9008158fdc5 // vaddpd ymm0, ymm0, yword [rcx - 1792]
	QUAD $0xfffffa008158fdc5 // vaddpd ymm0, ymm0, yword [rcx - 1536]
	QUAD $0xfffffa208958f5c5 // vaddpd ymm1, ymm1, yword [rcx - 1504]
	QUAD $0xfffffa409158edc5 // vaddpd ymm2, ymm2, yword [rcx - 1472]
	QUAD $0xfffffa609958e5c5 // vaddpd ymm3, ymm3, yword [rcx - 1440]
	QUAD $0xfffffa80a158ddc5 // vaddpd ymm4, ymm4, yword [rcx - 1408]
	QUAD $0xfffffaa0a958d5c5 // vaddpd ymm5, ymm5, yword [rcx - 1376]
	QUAD $0xfffffac0b158cdc5 // vaddpd ymm6, ymm6, yword [rcx - 1344]
	QUAD $0xfffffae0b958c5c5 // vaddpd ymm7, ymm7, yword [rcx - 1312]
	QUAD $0xfffffbe0b958c5c5 // vaddpd ymm7, ymm7, yword [rcx - 1056]
	QUAD $0xfffffbc0b158cdc5 // vaddpd ymm6, ymm6, yword [rcx - 1088]
	QUAD $0xfffffba0a958d5c5 // vaddpd ymm5, ymm5, yword [rcx - 1120]
	QUAD $0xfffffb80a158ddc5 // vaddpd ymm4, ymm4, yword [rcx - 1152]
	QUAD $0xfffffb609958e5c5 // vaddpd ymm3, ymm3, yword [rcx - 1184]
	QUAD $0xfffffb409158edc5 // vaddpd ymm2, ymm2, yword [rcx - 1216]
	QUAD $0xfffffb208958f5c5 // vaddpd ymm1, ymm1, yword [rcx - 1248]
	QUAD $0xfffffb008158fdc5 // vaddpd ymm0, ymm0, yword [rcx - 1280]
	QUAD $0xfffffc008158fdc5 // vaddpd ymm0, ymm0, yword [rcx - 1024]
	QUAD $0xfffffc208958f5c5 // vaddpd ymm1, ymm1, yword [rcx - 992]
	QUAD $0xfffffc409158edc5 // vaddpd ymm2, ymm2, yword [rcx - 960]
	QUAD $0xfffffc609958e5c5 // vaddpd ymm3, ymm3, yword [rcx - 928]
	QUAD $0xfffffc80a158ddc5 // vaddpd ymm4, ymm4, yword [rcx - 896]
	QUAD $0xfffffca0a958d5c5 // vaddpd ymm5, ymm5, yword [rcx - 864]
	QUAD $0xfffffcc0b158cdc5 // vaddpd ymm6, ymm6, yword [rcx - 832]
	QUAD $0xfffffce0b958c5c5 // vaddpd ymm7, ymm7, yword [rcx - 800]
	QUAD $0xfffffde0b958c5c5 // vaddpd ymm7, ymm7, yword [rcx - 544]
	QUAD $0xfffffdc0b158cdc5 // vaddpd ymm6, ymm6, yword [rcx - 576]
	QUAD $0xfffffda0a958d5c5 // vaddpd ymm5, ymm5, yword [rcx - 608]
	QUAD $0xfffffd80a158ddc5 // vaddpd ymm4, ymm4, yword [rcx - 640]
	QUAD $0xfffffd609958e5c5 // vaddpd ymm3, ymm3, yword [rcx - 672]
	QUAD $0xfffffd409158edc5 // vaddpd ymm2, ymm2, yword [rcx - 704]
	QUAD $0xfffffd208958f5c5 // vaddpd ymm1, ymm1, yword [rcx - 736]
	QUAD $0xfffffd008158fdc5 // vaddpd ymm0, ymm0, yword [rcx - 768]
	QUAD $0xfffffe008158fdc5 // vaddpd ymm0, ymm0, yword [rcx - 512]
	QUAD $0xfffffe208958f5c5 // vaddpd ymm1, ymm1, yword [rcx - 480]
	QUAD $0xfffffe409158edc5 // vaddpd ymm2, ymm2, yword [rcx - 448]
	QUAD $0xfffffe609958e5c5 // vaddpd ymm3, ymm3, yword [rcx - 416]
	QUAD $0xfffffe80a158ddc5 // vaddpd ymm4, ymm4, yword [rcx - 384]
	QUAD $0xfffffea0a958d5c5 // vaddpd ymm5, ymm5, yword [rcx - 352]
	QUAD $0xfffffec0b158cdc5 // vaddpd ymm6, ymm6, yword [rcx - 320]
	QUAD $0xfffffee0b958c5c5 // vaddpd ymm7, ymm7, yword [rcx - 288]
	LONG $0x7958c5c5; BYTE $0xe0 // vaddpd ymm7, ymm7, yword [rcx - 32]
	LONG $0x7158cdc5; BYTE $0xc0 // vaddpd ymm6, ymm6, yword [rcx - 64]
	LONG $0x6958d5c5; BYTE $0xa0 // vaddpd ymm5, ymm5, yword [rcx - 96]
	LONG $0x6158ddc5; BYTE $0x80 // vaddpd ymm4, ymm4, yword [rcx - 128]
	QUAD $0xffffff609958e5c5 // vaddpd ymm3, ymm3, yword [rcx - 160]
	QUAD $0xffffff409158edc5 // vaddpd ymm2, ymm2, yword [rcx - 192]
	QUAD $0xffffff208958f5c5 // vaddpd ymm1, ymm1, yword [rcx - 224]
	QUAD $0xffffff008158fdc5 // vaddpd ymm0, ymm0, yword [rcx - 256]
	LONG $0x0158fdc5 // vaddpd ymm0, ymm0, yword [rcx]
	LONG $0x4958f5c5; BYTE $0x20 // vaddpd ymm1, ymm1, yword [rcx + 32]
	LONG $0x5158edc5; BYTE $0x40 // vaddpd ymm2, ymm2, yword [rcx + 64]
	LONG $0x5958e5c5; BYTE $0x60 // vaddpd ymm3, ymm3, yword [rcx + 96]
	QUAD $0x00000080a158ddc5 // vaddpd ymm4, ymm4, yword [rcx + 128]
	QUAD $0x000000a0a958d5c5 // vaddpd ymm5, ymm5, yword [rcx + 160]
	QUAD $0x000000c0b158cdc5 // vaddpd ymm6, ymm6, yword [rcx + 192]
	QUAD $0x000000e0b958c5c5 // vaddpd ymm7, ymm7, yword [rcx + 224]
	LONG $0x00c18148; WORD $0x0008; BYTE $0x00 // add rcx, 2048
	LONG $0xff000548; WORD $0xffff // add rax, -256
	JNE LBB0_12

// Reduction: fold the eight accumulators pairwise into ymm0, add its upper
// 128-bit half onto the lower half, then add the two remaining lanes so the
// low double of xmm0 holds the vector-path total.
LBB0_13:
	LONG $0xcd58f5c5 // vaddpd ymm1, ymm1, ymm5
	LONG $0xdf58e5c5 // vaddpd ymm3, ymm3, ymm7
	LONG $0xc458fdc5 // vaddpd ymm0, ymm0, ymm4
	LONG $0xd658edc5 // vaddpd ymm2, ymm2, ymm6
	LONG $0xc258fdc5 // vaddpd ymm0, ymm0, ymm2
	LONG $0xcb58f5c5 // vaddpd ymm1, ymm1, ymm3
	LONG $0xc158fdc5 // vaddpd ymm0, ymm0, ymm1
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
	LONG $0xc158fdc5 // vaddpd ymm0, ymm0, ymm1
	LONG $0xc07cfdc5 // vhaddpd ymm0, ymm0, ymm0

	// If len is not a multiple of 32, add the remaining len - r9 elements
	// with the scalar loop; otherwise the total is complete.
	WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
	JNE LBB0_3
	JMP LBB0_14