#include <machine/asm.h>
#include "amd64_archlevel.h"
#define ALIGN_TEXT .p2align 4, 0x90
	.weak	stpcpy
	.set	stpcpy, __stpcpy
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)
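
# __stpcpy comes in two variants: a scalar one that copies eight bytes
# per iteration and an SSE2 "baseline" one that works sixteen bytes at
# a time.  The ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS macros (see
# amd64_archlevel.h) arrange for the variant matching the CPU's
# supported architecture level to be used.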
ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101, %r8	# bit masks used to detect
	movabsq	$0x8080808080808080, %r9	# NUL bytes a word at a time

	# copy one byte at a time until the source is 8-byte aligned,
	# stopping early if a NUL byte was copied
.Lalign:
	testb	$7, %sil
	je	.Lword_aligned
	movb	(%rsi), %dl
	incq	%rsi
	movb	%dl, (%rdi)
	incq	%rdi
	testb	%dl, %dl
	jne	.Lalign

	movq	%rdi, %rax		# NUL byte copied during alignment:
	dec	%rax			# return a pointer to it
	ret
	# copy eight bytes at a time; (x - 0x01..01) & 0x80..80 is
	# zero only if the word x cannot contain a NUL byte
	ALIGN_TEXT
.Lloop:
	movq	%rdx, (%rdi)
	addq	$8, %rdi
.Lword_aligned:
	movq	(%rsi), %rdx
	movq	%rdx, %rcx
	addq	$8, %rsi
	subq	%r8, %rcx
	testq	%r9, %rcx
	je	.Lloop
	# the word in %rdx may contain a NUL byte: store and test it
	# one byte at a time.  The test above can also trigger on
	# bytes with their high bit set, so if no NUL byte turns up,
	# resume the main loop.
	movb	%dl, (%rdi)
	testb	%dl, %dl
	je	.Ldone
	incq	%rdi
	shrq	$8, %rdx

	movb	%dl, (%rdi)
	testb	%dl, %dl
	je	.Ldone
	incq	%rdi
	shrq	$8, %rdx

	movb	%dl, (%rdi)
	testb	%dl, %dl
	je	.Ldone
	incq	%rdi
	shrq	$8, %rdx

	movb	%dl, (%rdi)
	testb	%dl, %dl
	je	.Ldone
	incq	%rdi
	shrq	$8, %rdx

	movb	%dl, (%rdi)
	testb	%dl, %dl
	je	.Ldone
	incq	%rdi
	shrq	$8, %rdx

	movb	%dl, (%rdi)
	testb	%dl, %dl
	je	.Ldone
	incq	%rdi
	shrq	$8, %rdx

	movb	%dl, (%rdi)
	testb	%dl, %dl
	je	.Ldone
	incq	%rdi
	shrq	$8, %rdx

	movb	%dl, (%rdi)		# eighth and last byte
	incq	%rdi
	testb	%dl, %dl
	jne	.Lword_aligned		# no NUL byte: false positive, go on
	decq	%rdi			# point back at the NUL byte just stored

.Ldone:
	movq	%rdi, %rax		# return a pointer to the NUL byte
	ret
ARCHEND(__stpcpy, scalar)
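
# Baseline (SSE2) implementation: round the source down to a 16 byte
# boundary so all loads are aligned, scan the string sixteen bytes at a
# time with pcmpeqb/pmovmskb, and write the data out through unaligned
# stores at the matching destination offset.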
ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	bsf	%eax, %eax		# find match if any
	jnz	.Lrunt

	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	add	$16, %rsi		# advance rsi to second unrolled half
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to destination's NUL byte
	ret
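
	# A NUL byte was found within the first 32 bytes of the string.
	# Copy the string in two possibly overlapping chunks; each
	# .Lxxyy label below handles lengths of xx to yy bytes,
	# counting the terminating NUL byte.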
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte

.L1632:	cmp	$16, %edi
	jb	.L0815
	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

.L0815:	cmp	$8, %edi
	jb	.L0407
	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

.L0407:	cmp	$4, %edi
	jb	.L0203
	mov	(%rsi), %ecx
	mov	-4(%rsi, %rdi, 1), %edi
	mov	%ecx, (%rdx)
	mov	%edi, -3(%rax)
	ret

.L0203:	cmp	$2, %edi
	jb	.L0101
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes

.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)
	.section .note.GNU-stack,"",%progbits