#include <machine/asm.h>
#include "amd64_archlevel.h"
/* Align to a 16-byte boundary (2^4), padding with 0x90 bytes; placed before the loop labels below. */
#define ALIGN_TEXT .p2align 4, 0x90
/* Export strlcpy as a weak symbol that is set equal to __strlcpy. */
.weak strlcpy
.set strlcpy, __strlcpy
/*
 * List the available __strlcpy variants: scalar and baseline.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are not defined in this file;
 * presumably they come from amd64_archlevel.h. How a variant is chosen at
 * run time is not visible here -- confirm against that header.
 */
ARCHFUNCS(__strlcpy)
ARCHFUNC(__strlcpy, scalar)
ARCHFUNC(__strlcpy, baseline)
ENDARCHFUNCS(__strlcpy)
/*
 * __strlcpy, scalar variant: built from strlen and memcpy.
 *
 * In:   %rdi = destination buffer
 *       %rsi = source string
 *       %rdx = size of the destination buffer in bytes
 * Out:  %rax = strlen(source)
 *
 * If the buffer size is 0 nothing is written.  Otherwise
 * min(size - 1, strlen(source)) bytes are copied and a NUL byte is
 * stored right behind them.
 *
 * The arguments are parked in callee-saved registers across the strlen
 * call.  One frame-pointer push plus four register pushes keep the
 * stack 16-byte aligned at both call sites.
 */
ARCHENTRY(__strlcpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rbx			# rbx: source length (return value)
	push	%r12			# r12: destination pointer
	push	%r13			# r13: source pointer
	push	%r14			# r14: number of bytes to copy
	mov	%rdi, %r12
	mov	%rsi, %r13
	mov	%rdx, %r14
	mov	%rsi, %rdi
	call	CNAME(strlen)		# rax = length of the source string
	mov	%rax, %rbx
	sub	$1, %r14		# keep the last buffer byte for the NUL
	jb	9f			# buffer size was 0: copy nothing
	cmp	%rbx, %r14		# more room than string bytes?
	cmova	%rbx, %r14		# then copy just the string bytes
	movb	$0, (%r12, %r14, 1)	# terminate the output
	mov	%r12, %rdi
	mov	%r13, %rsi
	mov	%r14, %rdx
	call	CNAME(memcpy)		# copy r14 bytes from source to destination
9:	mov	%rbx, %rax		# return the source length
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
ARCHEND(__strlcpy, scalar)
/*
 * __strlcpy, baseline variant: scans and copies the source in 16-byte
 * chunks using the xmm registers.
 *
 * In:   %rdi = destination buffer
 *       %rsi = source string
 *       %rdx = size of the destination buffer in bytes
 * Out:  %rax = length of the source string
 *
 * Register roles after the prologue:
 *   %r9   original (possibly unaligned) source pointer
 *   %rsi  source pointer rounded down to a 16-byte boundary, later
 *         advanced chunk by chunk
 *   %ecx  offset of the string start within its first chunk (0--15)
 *   %rdx  remaining buffer space, not counting the byte reserved for
 *         the NUL terminator
 *   %xmm1 all-zero comparand for pcmpeqb (re-zeroed after each use)
 *
 * No callee-saved register is touched and nothing is pushed on the stack.
 * The only control transfer to another function is the tail jump to strlen
 * at .L0.
 */
ARCHENTRY(__strlcpy, baseline)
sub $1, %rdx # do not count NUL byte in buffer length
jb .L0 # go to special code path if len was 0
/*
 * Head: examine the aligned chunk that contains the first string byte.
 * Match bits for bytes in front of the string start are masked away, so
 * only a NUL that belongs to the string can trigger .Lhead_nul.
 */
mov %esi, %ecx
pxor %xmm1, %xmm1
mov %rsi, %r9 # stash a copy of the source pointer for later
and $~0xf, %rsi
pcmpeqb (%rsi), %xmm1 # NUL found in head?
mov $-1, %r8d
and $0xf, %ecx
shl %cl, %r8d # mask of bytes in the string
pmovmskb %xmm1, %eax
and %r8d, %eax
jnz .Lhead_nul
/*
 * The string continues into the next aligned chunk, so that chunk and the
 * 16 unaligned bytes starting at the string head can both be loaded.
 * r8 = 32 - ecx is the number of string bytes covered by the head and
 * the second chunk together.  Unless the buffer has strictly more room
 * than that, the buffer ends inside these two chunks.
 */
movdqa 16(%rsi), %xmm3 # load second string chunk
movdqu (%r9), %xmm2 # load unaligned string head
mov $32, %r8d
sub %ecx, %r8d # head length + length of second chunk
pxor %xmm1, %xmm1
pcmpeqb %xmm3, %xmm1 # NUL found in second chunk?
sub %r8, %rdx # enough space left for the second chunk?
jbe .Lhead_buf_end
/* The buffer reaches past the second chunk; does the string end in it? */
pmovmskb %xmm1, %eax
test %eax, %eax
jnz .Lsecond_nul
/*
 * Neither the string nor the buffer ends within the first two chunks.
 * Store both of them, then turn RDI into the difference dst - src so
 * that (%rsi, %rdi) addresses the destination byte matching (%rsi).
 */
movdqa 32(%rsi), %xmm0 # load next string chunk
pxor %xmm1, %xmm1
movdqu %xmm2, (%rdi) # deposit head into buffer
sub %rcx, %rdi # adjust RDI to correspond to RSI
movdqu %xmm3, 16(%rdi) # deposit second chunk
sub %rsi, %rdi # express RDI as distance from RSI
add $32, %rsi # advance RSI past first two chunks
sub $16, %rdx # enough left for another round?
jbe 1f
/*
 * Main loop, unrolled twice.  At label 0: xmm0 holds the not yet stored
 * chunk at (%rsi), xmm1 is zero, and RDX (> 0) is the buffer space that
 * remains after that chunk has been stored.
 */
ALIGN_TEXT
0: pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
pmovmskb %xmm1, %eax
test %eax, %eax
jnz 3f
movdqu %xmm0, (%rsi, %rdi)
movdqa 16(%rsi), %xmm0 # load next string chunk
pxor %xmm1, %xmm1
cmp $16, %rdx # more than a full chunk left?
jbe 2f
add $32, %rsi # advance pointers to next chunk
pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
pmovmskb %xmm1, %eax
test %eax, %eax
jnz 4f
movdqu %xmm0, -16(%rsi, %rdi)
movdqa (%rsi), %xmm0 # load next string chunk
pxor %xmm1, %xmm1
sub $32, %rdx
ja 0b
/*
 * Reached with RDX in -15..0 and xmm0 = chunk at (%rsi).  Step RSI back so
 * that xmm0 corresponds to 16(%rsi), and bias EDX into the range 1..16.
 */
1: sub $16, %rsi # undo second advancement
add $16, %edx
/*
 * 1--16 bytes of buffer space remain and xmm0 (the chunk at 16(%rsi)) has
 * not been stored.  r8 = min(position of NUL in chunk, EDX) is the number
 * of bytes of this chunk that still fit.  The copy moves the 16 bytes that
 * end at that cut-off, which rewrites some bytes copied earlier, and the
 * terminator is stored right behind them.
 */
2: pcmpeqb %xmm1, %xmm0 # NUL byte encountered?
pmovmskb %xmm0, %r8d
mov %r8d, %eax
bts %edx, %r8d # treat end of buffer as end of string
tzcnt %r8d, %r8d # find tail length
add %rsi, %rdi # restore RDI
movdqu (%rsi, %r8, 1), %xmm0 # load string tail
movdqu %xmm0, (%rdi, %r8, 1) # store string tail
movb $0, 16(%rdi, %r8, 1) # NUL terminate
/*
 * The output is complete.  Keep scanning the source, 32 bytes per
 * iteration, only to determine the string length for the return value.
 */
test %eax, %eax # end of string already reached?
jnz 1f
ALIGN_TEXT
0: pcmpeqb 32(%rsi), %xmm1
pmovmskb %xmm1, %eax
pxor %xmm1, %xmm1
test %eax, %eax
jnz 2f
pcmpeqb 48(%rsi), %xmm1
pmovmskb %xmm1, %eax
add $32, %rsi
pxor %xmm1, %xmm1
test %eax, %eax
jz 0b
1: sub $16, %rsi # undo second advancement
2: tzcnt %eax, %eax # where is the NUL byte?
sub %r9, %rsi
lea 32(%rsi, %rax, 1), %rax # return string length
ret
/*
 * The string ended inside the chunk held in xmm0 while buffer space
 * remains.  Label 4 is the exit from the second half of the unrolled
 * loop; it first steps RSI back so that the chunk is at (%rsi) again.
 */
4: sub $16, %rsi # undo second advancement
add $16, %rdx # restore number of remaining bytes
/*
 * Copy the 16 bytes that end with the NUL byte (overlapping data that was
 * stored before) and return (chunk address - string start) + NUL offset.
 */
3: tzcnt %eax, %eax # find length of string tail
movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
add %rsi, %rdi # restore destination pointer
movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
sub %r9, %rsi # string length to current chunk
add %rsi, %rax # plus length of current chunk
ret
/*
 * The buffer ends within the head or the second chunk.  Build a mask,
 * indexed from the alignment boundary, that holds the NUL matches of the
 * second chunk (bits 16--31) and one extra bit at the buffer limit; its
 * lowest set bit yields the number of bytes to copy once ECX is
 * subtracted.  The terminator is written first, the string end is located
 * for the return value, and the data is copied last at .L0031.
 */
.Lhead_buf_end:
pmovmskb %xmm1, %r8d
add $32, %edx # restore edx to (len-1) + ecx
mov %r8d, %eax
shl $16, %r8d # place 2nd chunk NUL mask into bits 16--31
bts %rdx, %r8 # treat end of buffer as end of string
tzcnt %r8, %rdx # find string/buffer len from alignment boundary
sub %ecx, %edx # find actual string/buffer len
movb $0, (%rdi, %rdx, 1) # write NUL terminator
/* Locate the end of the string; same scan as the one after label 2 above. */
test %eax, %eax # end of string already reached?
jnz 1f
ALIGN_TEXT
0: pcmpeqb 32(%rsi), %xmm1
pmovmskb %xmm1, %eax
pxor %xmm1, %xmm1
test %eax, %eax
jnz 2f
pcmpeqb 48(%rsi), %xmm1
pmovmskb %xmm1, %eax
add $32, %rsi
pxor %xmm1, %xmm1
test %eax, %eax
jz 0b
1: sub $16, %rsi
2: tzcnt %eax, %eax
sub %r9, %rsi
lea 32(%rsi, %rax, 1), %rax # return string length
/* RAX = string length, RDX = number of bytes to copy: go copy them. */
jmp .L0031
/*
 * The string ends in the second chunk.  Its length is the NUL offset
 * within that chunk plus the 16 - ECX string bytes of the head.
 */
.Lsecond_nul:
add %r8, %rdx # restore buffer length
tzcnt %eax, %eax # where is the NUL byte?
lea -16(%rcx), %r8d
sub %r8d, %eax # string length
cmp %rax, %rdx # is the string shorter than the buffer?
cmova %rax, %rdx # copy only min(buflen, srclen) bytes
movb $0, (%rdi, %rdx, 1) # write NUL terminator
/*
 * Copy RDX bytes from the original source pointer.  With 16 or more bytes
 * this uses two 16-byte moves, one from the start and one ending at the
 * last byte, which together cover any length from 16 to 32.
 */
.L0031: cmp $16, %rdx # at least 16 bytes to copy (not incl NUL)?
jb .L0015
movdqu (%r9), %xmm0 # load first 16 bytes
movdqu -16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
movdqu %xmm0, (%rdi)
movdqu %xmm1, -16(%rdi, %rdx, 1)
ret
/* The string ends in the head chunk, so it is at most 15 bytes long. */
.Lhead_nul:
tzcnt %eax, %eax # where is the NUL byte?
sub %ecx, %eax # ... from the beginning of the string?
cmp %rax, %rdx # is the string shorter than the buffer?
cmova %rax, %rdx # copy only min(buflen, srclen) bytes
movb $0, (%rdi, %rdx, 1) # write NUL terminator
/*
 * Copy RDX (fewer than 16) bytes; RAX already holds the return value.
 * Each size class copies one block from the start and one block ending at
 * the last byte; the two blocks may overlap.
 */
.L0015: cmp $8, %rdx # at least 8 bytes to copy?
jae .L0815
cmp $4, %rdx # at least 4 bytes to copy?
jae .L0407
cmp $2, %rdx # at least 2 bytes to copy?
jae .L0203
/*
 * 0 or 1 bytes: store the first source byte unconditionally, then write
 * the terminator once more so that it replaces that byte when RDX is 0.
 */
movzbl (%r9), %ecx # load first byte from src
mov %cl, (%rdi) # deposit into destination
movb $0, (%rdi, %rdx, 1) # add NUL terminator (again)
ret
/* 2--3 bytes: two 2-byte moves */
.L0203: movzwl (%r9), %ecx
movzwl -2(%r9, %rdx, 1), %esi
mov %cx, (%rdi)
mov %si, -2(%rdi, %rdx, 1)
ret
/* 4--7 bytes: two 4-byte moves */
.L0407: mov (%r9), %ecx
mov -4(%r9, %rdx, 1), %esi
mov %ecx, (%rdi)
mov %esi, -4(%rdi, %rdx, 1)
ret
/* 8--15 bytes: two 8-byte moves */
.L0815: mov (%r9), %rcx
mov -8(%r9, %rdx, 1), %rsi
mov %rcx, (%rdi)
mov %rsi, -8(%rdi, %rdx, 1)
ret
/*
 * Buffer size was 0: nothing is written.  Tail-jump to strlen on the
 * source string so that its result becomes the return value.
 */
.L0: mov %rsi, %rdi
jmp CNAME(strlen)
ARCHEND(__strlcpy, baseline)
/* Emit an empty .note.GNU-stack section with no flags set (so without the executable flag). */
.section .note.GNU-stack,"",%progbits