#include <machine/asm.h>

// void *memccpy(void *dst /*x0*/, const void *src /*x1*/, int c /*w2*/, size_t n /*x3*/)
//
// Copy bytes from src to dst, stopping after the first occurrence of the
// byte c has been copied, or after n bytes, whichever comes first.
// Returns a pointer to the byte after the copy of c in dst, or NULL if c
// did not occur in the first n bytes of src.
//
// Strategy: scan src in 16-byte chunks with NEON.  Loads are performed
// through a 16-byte-aligned pointer derived from src, so a chunk load never
// touches a page that src itself does not touch.  CMEQ against a splat of c
// yields 0xff/0x00 per byte; SHRN #4 compresses that to a 64-bit mask with
// 4 bits per source byte, which is searched with RBIT + CLZ.
//
// Register roles:
//   x0  dst cursor / return value     x9   original dst
//   x1  original src                  x10  src rounded down to 16
//   x11 src & 15 (misalignment)       x3   remaining count (n-1 at entry)
//   v0  c splatted across 16 lanes    x6   constant 0xf (one nibble)
//   x5/x8 match masks and, later, the byte index of the first match
//
// NOTE(review): scratch uses of w18 in the small-copy tails were changed to
// w17 — x18 is the AAPCS64 platform register (shadow call stack on Android,
// reserved on Windows/Apple) and must not be clobbered.  x17 (IP1) is
// caller-clobbered and dead on every path reaching those labels (its only
// other write, in .L1732, is on a path that returns immediately).
.weak memccpy
.set memccpy, __memccpy
.text
ENTRY(__memccpy)
	subs x3, x3, #1              // n == 0?  (x3 = n - 1 from here on)
	b.lo .L0                     // yes: return NULL
	dup v0.16b, w2               // v0 = {c, c, ..., c}
	mov x9, x0                   // remember dst; x0 will be advanced
	bic x10, x1, #0xf            // x10 = src rounded down to 16
	and x11, x1, #0xf            // x11 = src % 16
	ldr q1, [x10]                // aligned chunk 0 (overlaps start of src)
	cmeq v1.16b, v1.16b, v0.16b  // per-byte compare against c
	mov x8, #-1
	mov x6, #0xf                 // one nibble; used to build limit masks
	lsl x12, x11, #2             // 4 mask bits per byte
	lsl x8, x8, x12              // x8 = mask hiding bytes before src
	shrn v1.8b, v1.8h, #4        // compress compare result: nibble/byte
	fmov x5, d1                  // x5 = match mask for chunk 0
	sub x12, x11, #32
	adds x12, x12, x3            // x12 = (src%16) + n - 33
	b.cc .Lrunt                  // under 33 bytes to inspect: short path
	ands x8, x8, x5              // c within chunk 0 at/after src?
	b.eq 0f                      // no: try chunk 1

	// c found in the first 16 bytes of src: copy through it and return.
	rbit x8, x8
	clz x8, x8                   // bit index of first match
	lsr x8, x8, #2               // -> byte index within chunk 0
	sub x8, x8, x11              // -> byte index relative to src
	add x0, x0, x8
	add x4, x9, x8               // x4 = dst + idx(c), last byte to write
	add x5, x1, x8               // x5 = src + idx(c), last byte to read
	add x0, x0, #1               // return value: dst + idx(c) + 1
	b .L0816                     // copy the x8+1 (1..16) bytes
0:
	ldr q3, [x10, #16]           // aligned chunk 1
	ldr q2, [x1]                 // src[0..15] (unaligned), kept for copying
	cmeq v1.16b, v3.16b, v0.16b
	shrn v1.8b, v1.8h, #4
	fmov x5, d1                  // match mask for chunk 1
	cbz x5, 0f                   // no c in the first 32 aligned bytes

	// c found in chunk 1: compute its index relative to src and copy.
	rbit x8, x5
	clz x8, x8
	lsr x8, x8, #2               // byte index within chunk 1
	sub x11, x11, #16
	sub x8, x8, x11              // byte index relative to src (1..31)
	add x0, x0, x8
	add x0, x0, #1               // return value: dst + idx(c) + 1
	add x4, x9, x8               // dst + idx(c)
	add x5, x1, x8               // src + idx(c)
	b .L1732                     // copy the x8+1 bytes (dispatches on size)
0:
	// No c in the first 32 aligned bytes: store the first 32 source
	// bytes, then loop over 32 bytes per iteration with aligned loads,
	// checking each chunk for c before storing it.
	ldr q1, [x10, #32]           // first loop chunk
	str q2, [x0]                 // dst[0..15] = src[0..15] (unaligned)
	sub x0, x0, x11              // align dst cursor with the aligned src
	mov x3, x12                  // bytes left to inspect past byte 31
	str q3, [x0, #16]            // chunk 1 (overlaps the previous store)
	add x10, x10, #32
	add x0, x0, #32
	subs x3, x3, #16
	b.lo 1f                      // <16 bytes remain: go finish up
	.p2align 4
0:
	// Main loop.  Invariant: q1 holds the chunk at [x10]/[x0] which has
	// not been checked or stored yet.
	cmeq v2.16b, v1.16b, v0.16b
	shrn v2.8b, v2.8h, #4
	fmov x5, d2
	cbnz x5, 3f                  // c is in this chunk
	str q1, [x0]
	ldr q1, [x10, #16]
	cmp x3, #16
	b.lo 2f                      // not enough left for a full chunk
	add x10, x10, #32
	add x0, x0, #32
	cmeq v2.16b, v1.16b, v0.16b
	shrn v2.8b, v2.8h, #4
	fmov x5, d2
	cbnz x5, 4f                  // c is in this chunk
	str q1, [x0, #-16]
	ldr q1, [x10]
	subs x3, x3, #32
	b.hs 0b
1:
	// Rewind so that, as at label 2, x10/x0 point 16 bytes before the
	// chunk held in q1.
	sub x10, x10, #16
	add x3, x3, #16
	sub x0, x0, #16
2:
	// Final partial chunk: at most x3+1 of its bytes are in range.
	// Plant an artificial "match" nibble just past the limit so the
	// search finds the real c or the n limit, whichever comes first.
	cmeq v2.16b, v1.16b, v0.16b
	shrn v2.8b, v2.8h, #4
	fmov x4, d2                  // x4 = real match mask
	lsl x5, x3, #2
	lsl x5, x6, x5               // 0xf nibble at the limit position
	orr x8, x4, x5
	rbit x8, x8
	clz x7, x8                   // bit position of first match-or-limit
	lsr x8, x7, #2               // -> number of in-range bytes to copy
	lsl x5, x6, x7               // nibble mask at the found position
	add x8, x8, #1
	add x0, x0, x8
	ldr q1, [x10, x8]            // overlapping 16-byte move whose last
	str q1, [x0]                 //   byte is the match/limit byte
	add x0, x0, #16              // tentative return: dst + idx(c) + 1
	tst x4, x5                   // real c, or just the n limit?
	csel x0, x0, xzr, ne         // NULL if c was never found
	ret
4:
	// c in the second chunk of an unrolled iteration: undo the advance
	// so x10/x0 address the chunk held in q1/v2's source.
	sub x10, x10, #16
	sub x0, x0, #16
3:
	// c found at an in-range position: copy up to and including it with
	// one overlapping 16-byte move (bytes before it are already stored).
	rbit x8, x5
	clz x8, x8
	lsr x3, x8, #2               // byte index of c within the chunk
	add x0, x0, x3
	add x10, x10, x3
	ldr q1, [x10, #-15]          // 16 bytes ending exactly at c
	str q1, [x0, #-15]
	add x0, x0, #1               // return one past the copied c
	ret
.Lrunt:
	// Short input: (src%16) + n <= 32, so the answer lies entirely in
	// the first two aligned chunks.  x12 = (src%16) + n - 33 (negative;
	// register shifts below take the amount mod 64).
	add x13, x11, x3             // x13 = (src%16) + n - 1
	mov x7, x5                   // x7 = real match mask, chunk 0
	lsl x4, x12, #2
	lsl x4, x6, x4               // limit nibble
	cmp x13, #16
	csel x4, x4, xzr, lo         // only apply it if the limit is in chunk 0
	orr x5, x5, x4
	ands x8, x8, x5              // match-or-limit at/after src in chunk 0?
	b.ne 0f
	// Nothing in chunk 0: the match or the limit is in chunk 1.
	ldr q4, [x10, #16]
	cmeq v1.16b, v4.16b, v0.16b
	shrn v1.8b, v1.8h, #4
	fmov x8, d1
	mov x7, x8                   // x7 = real match mask, chunk 1
	lsl x4, x12, #2
	lsl x4, x6, x4               // limit nibble within chunk 1
	orr x8, x8, x4
	rbit x8, x8
	clz x4, x8
	lsr x8, x4, #2
	add x8, x8, #16              // byte index within the 32 aligned bytes
	b 1f
0:
	rbit x8, x8
	clz x4, x8
	lsr x8, x4, #2               // byte index within chunk 0
1:
	add x0, x0, x8
	sub x0, x0, x11
	add x0, x0, #1               // tentative return: dst + idx + 1
	lsl x5, x6, x4               // nibble mask at the found position
	ands x7, x7, x5              // real c, or just the n limit?
	csel x0, xzr, x0, eq         // NULL if c was never found
	sub x8, x8, x11              // byte index relative to src
	add x4, x9, x8               // dst + idx (last byte to write)
	add x5, x1, x8               // src + idx (last byte to read)
	// fall through: copy x8+1 bytes
.L1732:
	// Copy x8+1 bytes where x8+1 may be up to 32: for 17..32 bytes use
	// two overlapping 16-byte moves; otherwise dispatch to .L0816.
	cmp x8, #16
	b.lo .L0816
	add x5, x5, #1               // one past the last source byte
	add x4, x4, #1               // one past the last dest byte
	ldp x16, x17, [x1]           // head 16 bytes
	ldp x12, x13, [x5, #-16]     // tail 16 bytes (may overlap the head)
	stp x16, x17, [x9]
	stp x12, x13, [x4, #-16]
	ret
.L0816:
	// Copy x8+1 bytes, 1..16.
	tbz x8, #3, .L0407           // x8 < 8: at most 8 bytes
	ldr x16, [x1]                // 9..16 bytes: two overlapping 8-byte moves
	ldr x17, [x5, #-7]
	str x16, [x9]
	str x17, [x4, #-7]
	ret
	.p2align 4
.L0407:
	// Copy x8+1 bytes, 4..8: two overlapping 4-byte moves.
	cmp x8, #3
	b.lo .L0103
	ldr w16, [x1]
	ldr w17, [x5, #-3]           // w17, not w18: x18 is platform-reserved
	str w16, [x9]
	str w17, [x4, #-3]
	ret
	.p2align 4
.L0103:
	// Copy x8+1 bytes, 1..3: first, middle and last byte (they may alias).
	lsr x14, x8, #1              // index of the middle byte
	ldrb w16, [x1]
	ldrb w15, [x5]               // last byte
	ldrb w17, [x1, x14]          // w17, not w18: x18 is platform-reserved
	strb w16, [x9]
	strb w17, [x9, x14]
	strb w15, [x4]
	ret
.L0:
	eor x0, x0, x0               // n == 0: return NULL
	ret
END(__memccpy)