blob: 04b4cc8536189a4a3c78166a296bd494ec281400 [file] [log] [blame]
.text
.intel_syntax noprefix
.file "_lib/uint64.c"
.globl sum_uint64_sse4
.p2align 4, 0x90
.type sum_uint64_sse4,@function
sum_uint64_sse4: # @sum_uint64_sse4
# BB#0:
push rbp
mov rbp, rsp
and rsp, -8
test rsi, rsi
je .LBB0_1
# BB#2:
cmp rsi, 3
jbe .LBB0_3
# BB#6:
mov r9, rsi
and r9, -4
je .LBB0_3
# BB#7:
lea r8, [r9 - 4]
mov eax, r8d
shr eax, 2
inc eax
and rax, 3
je .LBB0_8
# BB#9:
neg rax
pxor xmm0, xmm0
xor ecx, ecx
pxor xmm1, xmm1
.p2align 4, 0x90
.LBB0_10: # =>This Inner Loop Header: Depth=1
movdqu xmm2, xmmword ptr [rdi + 8*rcx]
movdqu xmm3, xmmword ptr [rdi + 8*rcx + 16]
paddq xmm0, xmm2
paddq xmm1, xmm3
add rcx, 4
inc rax
jne .LBB0_10
jmp .LBB0_11
.LBB0_3:
xor r9d, r9d
xor eax, eax
.LBB0_4:
lea rcx, [rdi + 8*r9]
sub rsi, r9
.p2align 4, 0x90
.LBB0_5: # =>This Inner Loop Header: Depth=1
add rax, qword ptr [rcx]
add rcx, 8
dec rsi
jne .LBB0_5
jmp .LBB0_15
.LBB0_1:
xor eax, eax
.LBB0_15:
mov qword ptr [rdx], rax
mov rsp, rbp
pop rbp
ret
.LBB0_8:
xor ecx, ecx
pxor xmm0, xmm0
pxor xmm1, xmm1
.LBB0_11:
cmp r8, 12
jb .LBB0_14
# BB#12:
mov rax, r9
sub rax, rcx
lea rcx, [rdi + 8*rcx + 112]
.p2align 4, 0x90
.LBB0_13: # =>This Inner Loop Header: Depth=1
movdqu xmm2, xmmword ptr [rcx - 112]
movdqu xmm3, xmmword ptr [rcx - 96]
movdqu xmm4, xmmword ptr [rcx - 80]
movdqu xmm5, xmmword ptr [rcx - 64]
paddq xmm2, xmm0
paddq xmm3, xmm1
movdqu xmm6, xmmword ptr [rcx - 48]
movdqu xmm7, xmmword ptr [rcx - 32]
paddq xmm6, xmm4
paddq xmm6, xmm2
paddq xmm7, xmm5
paddq xmm7, xmm3
movdqu xmm0, xmmword ptr [rcx - 16]
movdqu xmm1, xmmword ptr [rcx]
paddq xmm0, xmm6
paddq xmm1, xmm7
sub rcx, -128
add rax, -16
jne .LBB0_13
.LBB0_14:
paddq xmm0, xmm1
pshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
paddq xmm1, xmm0
movq rax, xmm1
cmp r9, rsi
jne .LBB0_4
jmp .LBB0_15
.Lfunc_end0:
.size sum_uint64_sse4, .Lfunc_end0-sum_uint64_sse4
.ident "Apple LLVM version 9.0.0 (clang-900.0.39.2)"
.section ".note.GNU-stack","",@progbits