/*
 * char *stpncpy(char *restrict dst, const char *restrict src, size_t n)
 *
 * Copy at most n bytes of the string src into dst and zero-fill the
 * remainder of the n-byte buffer; return a pointer to the terminating
 * NUL byte written to dst, or to &dst[n] if src is n bytes or longer.
 *
 * ABI: SysV AMD64.  Two implementations follow; the one used is chosen
 * at load time through the amd64_archlevel.h dispatch machinery.
 */
#include <machine/asm.h>
#include "amd64_archlevel.h"
#define ALIGN_TEXT .p2align 4, 0x90 /* align hot labels to 16 B, pad with NOPs */
.weak stpncpy
.set stpncpy, __stpncpy # public name is a weak alias for __stpncpy
/* dispatch table: resolve __stpncpy to the best variant for this CPU */
ARCHFUNCS(__stpncpy)
ARCHFUNC(__stpncpy, scalar)
ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)
/*
 * Scalar variant: composed from the C library's memchr/memcpy/memset.
 * Pseudo-C:
 *	p = memchr(src, '\0', n);
 *	if (p == NULL) { memcpy(dst, src, n); return (dst + n); }
 *	len = p - src;
 *	memcpy(dst, src, len);
 *	return (memset(dst + len, 0, n - len));	// memset returns dst + len
 */
ARCHENTRY(__stpncpy, scalar)
push %rbp # establish stack frame
mov %rsp, %rbp
push %rdx # -8(%rbp): buffer length n
push %rdi # -16(%rbp): destination dst
push %rsi # -24(%rbp): source src
push %rax # dummy push for alignment (rsp now 16-byte aligned for calls)
mov %rsi, %rdi
xor %esi, %esi
call CNAME(__memchr) # memchr(src, '\0', len)
pop %rcx # dummy pop
pop %rsi # restore src
mov -16(%rbp), %rdi # restore dst
test %rax, %rax # NUL found?
jz .Lfullcopy
mov %rax, %rdx
sub %rsi, %rdx # copy until the NUL byte: rdx = strlen(src)
add %rdx, -16(%rbp) # advance destination by string length
sub %rdx, -8(%rbp) # and shorten buffer size by string length
call CNAME(memcpy) # memcpy(dst, src, strlen(src))
pop %rdi # rdi = dst + strlen(src)
pop %rdx # rdx = n - strlen(src)
xor %esi, %esi
pop %rbp
jmp CNAME(memset) # clear remaining buffer; tail call -- memset's
 # return value (dst + strlen) is ours, too
.Lfullcopy:
mov -8(%rbp), %rdx # no NUL within n bytes: copy all of them
call CNAME(memcpy) # copy whole string; returns dst in rax
add -8(%rbp), %rax # point to dest[n]
leave
ret
ARCHEND(__stpncpy, scalar)
.section .rodata
/*
 * 16 bytes of 0xff followed by 16 bytes of 0x00.  An unaligned 16-byte
 * load from .Lmask+16-i yields i bytes of 0xff followed by zeroes:
 * a mask that keeps the first i bytes of a chunk and clears the rest
 * (used by the baseline variant to truncate the final chunk at the NUL).
 */
.Lmask: .quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0000000000000000
.quad 0x0000000000000000
/*
 * Baseline (SSE2) variant.  The source is processed in 16-byte chunks
 * from a pointer rounded down to 16-byte alignment.  Register roles:
 *	rdi	destination, biased down by the source misalignment rcx
 *		so that offset i from rsi corresponds to offset i from rdi
 *	rsi	source, rounded down to 16-byte alignment
 *	rdx	buffer length n as passed in
 *	rcx	misalignment of the source (0..15)
 *	r10	n + rcx: buffer length measured from the alignment boundary
 *	r8	bitmask of NUL-byte positions in the chunk(s) inspected
 *	r9	bitmask of head bytes that belong to the string
 *	xmm1	all-zeroes (reset as needed), used for comparisons and fills
 * The scratch area at bounce(%rsp) lies entirely within the 128-byte
 * red zone; this is a leaf function, so that is safe on SysV AMD64.
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce (-3*16-8) /* on-stack bounce buffer, in the red zone */
test %rdx, %rdx # no bytes to copy?
jz .L0
mov %esi, %ecx
and $~0xf, %rsi # align source to 16 bytes
movdqa (%rsi), %xmm0 # load head
and $0xf, %ecx # offset from alignment
mov $-1, %r9d
lea -33(%rcx), %rax # set up overflow-proof comparison rdx+rcx<=32
shl %cl, %r9d # mask of bytes belonging to the string
sub %rcx, %rdi # adjust RDI to correspond to RSI
pxor %xmm1, %xmm1
movdqa %xmm0, bounce(%rsp) # stash copy of head on the stack
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %r8d # bitmask of NUL bytes in the head
lea (%rdx, %rcx, 1), %r10 # buffer length from alignment boundary
add %rdx, %rax # less than 2 chunks (32 bytes) to play with?
jnc .Lrunt # if yes, use special runt processing
movdqu %xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
and %r9d, %r8d # end of string within head?
jnz .Lheadnul
movdqu (%rsi, %rcx, 1), %xmm2 # load head from source buffer
movdqu %xmm2, (%rdi, %rcx, 1) # and deposit it
add $16, %rsi
add $16, %rdi
sub $32, %r10
/* main loop: copy two 16-byte chunks per iteration until a NUL byte
 * is found or fewer than two chunks of buffer remain */
ALIGN_TEXT
0: movdqa (%rsi), %xmm0
pxor %xmm1, %xmm1
pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
pmovmskb %xmm1, %r8d
test %r8d, %r8d
jnz 3f
movdqu %xmm0, (%rdi)
cmp $16, %r10 # more than a full chunk left?
jbe 1f
movdqa 16(%rsi), %xmm0
add $32, %rdi # advance pointers to next chunk
add $32, %rsi
pxor %xmm1, %xmm1
pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
pmovmskb %xmm1, %r8d
test %r8d, %r8d
jnz 2f
movdqu %xmm0, -16(%rdi)
sub $32, %r10 # more than another full chunk left?
ja 0b
sub $16, %rdi # undo second advancement
sub $16, %rsi
add $16, %r10d # restore number of remaining bytes
/* at most one chunk of buffer remains past the chunk at (%rdi);
 * the final 16 bytes of the destination were already cleared above */
1: pxor %xmm1, %xmm1
pcmpeqb 16(%rsi), %xmm1 # NUL byte in source tail?
pmovmskb %xmm1, %r8d
bts %r10d, %r8d # treat end of buffer as NUL
tzcnt %r8d, %r8d # where is the NUL byte?
movdqu (%rsi, %r8, 1), %xmm0 # load source tail before NUL
 # (overlapping load, ends at the NUL)
lea 16(%rdi, %r8, 1), %rax # point return value to NUL byte
movdqu %xmm0, (%rdi, %r8, 1) # store tail into the buffer
ret
2: sub $16, %rdi # undo second advancement
sub $16, %rsi
sub $16, %r10
/* NUL found mid-buffer: xmm0 holds the chunk containing it */
3: tzcnt %r8d, %r8d # where did the string end?
lea .Lmask+16(%rip), %rcx
lea (%rdi, %r8, 1), %rax # where the NUL byte will be
neg %r8
movdqu (%rcx, %r8, 1), %xmm1 # mask with FF where the string is,
 # 00 where it is not
pand %xmm1, %xmm0 # mask out bytes after the string
movdqu %xmm0, (%rdi) # store masked current chunk
pxor %xmm1, %xmm1
sub $16, %r10 # another full chunk left?
jbe 1f
/* zero-fill loop: clear up to two chunks of buffer per iteration */
ALIGN_TEXT
0: movdqu %xmm1, 16(%rdi)
cmp $16, %r10
jbe 1f
movdqu %xmm1, 32(%rdi)
add $32, %rdi
sub $32, %r10
ja 0b
1: ret
/* NUL byte located within the (possibly misaligned) head chunk */
.Lheadnul:
movdqu bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
tzcnt %r8d, %r8d # find location of NUL byte
movdqu %xmm0, (%rdi, %rcx, 1) # deposit head in the destination
movdqu %xmm1, (%rdi, %r8, 1) # clear out following bytes
movdqu %xmm1, 16(%rdi) # clear out second chunk
lea (%rdi, %r8, 1), %rax # make RAX point to the NUL byte
add $32, %rdi # advance past first two chunks
sub $32+16, %r10 # advance past first three chunks
jbe 1f # did we pass the end of the buffer?
ALIGN_TEXT
0: movdqu %xmm1, (%rdi) # clear out buffer chunk
cmp $16, %r10
jbe 1f
movdqu %xmm1, 16(%rdi)
add $32, %rdi
sub $32, %r10
ja 0b
1: ret
/*
 * Runt path: rdx + rcx <= 32, i.e. the whole operation fits in the
 * first two aligned chunks.  The string is assembled and zero-padded
 * in the red-zone bounce buffer, then copied to the destination with
 * size-bucketed (possibly overlapping) loads and stores.
 */
.Lrunt: movdqa %xmm1, bounce+16(%rsp) # clear out rest of on-stack copy
and %r9d, %r8d # mask out head before string
bts %r10, %r8 # treat end of buffer as end of string
test $0x1ffff, %r8d # end of string within first chunk or right after?
jnz 0f # if yes, do not inspect second buffer
movdqa 16(%rsi), %xmm0 # load second chunk of input
movdqa %xmm0, bounce+16(%rsp) # stash copy on stack
pcmpeqb %xmm1, %xmm0 # NUL in second chunk?
pmovmskb %xmm0, %r9d
shl $16, %r9d
or %r9, %r8 # merge found NUL bytes into NUL mask
0: tzcnt %r8, %r8 # location of the NUL (or end of buffer)
movdqu %xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
lea bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
lea (%rdi, %r8, 1), %rax # return pointer to NUL byte
cmp $16, %edx # at least 16 bytes to transfer?
jae .L1631
mov (%rsi), %r8 # load string head
cmp $8, %edx # at least 8 bytes to transfer?
jae .L0815
cmp $4, %edx # at least 4 bytes to transfer?
jae .L0407
movzwl -2(%rsi, %rdx, 1), %esi # load last two bytes of string
mov %r8b, (%rdi, %rcx, 1) # store first byte
cmp $2, %edx # at least 2 bytes to transfer?
jb .L1
mov %si, -2(%rdi, %r10, 1) # store last two bytes of string
.L1: ret
.L1631: movdqu (%rsi), %xmm0 # load first 16 bytes of string
movdqu -16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
movdqu %xmm0, (%rdi, %rcx, 1)
movdqu %xmm1, -16(%rdi, %r10, 1)
ret
.L0815: mov -8(%rsi, %rdx, 1), %rdx # load last 8 bytes of string
mov %r8, (%rdi, %rcx, 1)
mov %rdx, -8(%rdi, %r10, 1)
ret
.L0407: mov -4(%rsi, %rdx, 1), %edx # load last four bytes of string
mov %r8d, (%rdi, %rcx, 1)
mov %edx, -4(%rdi, %r10, 1)
ret
.L0: mov %rdi, %rax # n == 0: return dst unchanged
ret
ARCHEND(__stpncpy, baseline)
.section .note.GNU-stack,"",%progbits