#include <linux/linkage.h>
#include <linux/cfi_types.h>
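//
// AES-CTR (with a 64-bit counter, "ctr64") and AES-XCTR for x86_64.  The
// _aes_ctr_crypt macro below emits the body of each function and is
// instantiated three times: AES-NI + AVX (VL = 16), VAES + AVX2 (VL = 32),
// and VAES + AVX512 (VL = 64), where VL is the vector length in bytes.
//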
.section .rodata
.p2align 4
// Byte-reflection mask for vpshufb: converts each 128-bit lane between the
// little-endian counter kept in LE_CTR and the big-endian AES-CTR block.
.Lbswap_mask:
.octa 0x000102030405060708090a0b0c0d0e0f
// Four consecutive 128-bit lanes holding the 64-bit values 0, 1, 2, and 3,
// used to give each lane of a wide counter vector its own offset.  .Lone,
// .Ltwo, and .Lfour also serve as broadcast sources for the per-vector
// counter increments selected below based on VL.
.Lctr_pattern:
.quad 0, 0
.Lone:
.quad 1, 0
.Ltwo:
.quad 2, 0
.quad 3, 0 // fourth lane of .Lctr_pattern; needs no label of its own
.Lfour:
.quad 4, 0
.text
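// Unaligned vector move of VL bytes; the EVEX-encoded vmovdqu8 is required
// for 64-byte (zmm) operands.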
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
.else
vmovdqu8 \src, \dst
.endif
.endm
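// Aligned/register vector move of VL bytes; vmovdqa64 is the EVEX form
// required for zmm operands.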
.macro _vmovdqa src, dst
.if VL < 64
vmovdqa \src, \dst
.else
vmovdqa64 \src, \dst
.endif
.endm
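// Broadcast the 128-bit memory operand \src to every 128-bit lane of \dst;
// with VL == 16 this degenerates to a plain 16-byte load.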
.macro _vbroadcast128 src, dst
.if VL == 16
vmovdqu \src, \dst
.elseif VL == 32
vbroadcasti128 \src, \dst
.else
vbroadcasti32x4 \src, \dst
.endif
.endm
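// Vector XOR of VL bytes; vpxord is the EVEX form required for zmm operands.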
.macro _vpxor src1, src2, dst
.if VL < 64
vpxor \src1, \src2, \dst
.else
vpxord \src1, \src2, \dst
.endif
.endm
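// Load 1 <= %ecx <= 15 bytes from \src into the xmm register \dst, zeroizing
// the remaining bytes, without reading past the end of the buffer.  Clobbers
// %rax, %rcx, and \tmp64 (\tmp32 must be its 32-bit form).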
.macro _load_partial_block src, dst, tmp64, tmp32
sub $8, %ecx
jle .Lle8\@
vmovq (\src), \dst
mov (\src, %rcx), %rax
neg %ecx
shl $3, %ecx
shr %cl, %rax
vpinsrq $1, %rax, \dst, \dst
jmp .Ldone\@
.Lle8\@:
add $4, %ecx
jl .Llt4\@
mov (\src), %eax
mov (\src, %rcx), \tmp32
jmp .Lcombine\@
.Llt4\@:
add $2, %ecx
movzbl (\src), %eax
jl .Lmovq\@
movzwl (\src, %rcx), \tmp32
.Lcombine\@:
shl $3, %ecx
shl %cl, \tmp64
or \tmp64, %rax
.Lmovq\@:
vmovq %rax, \dst
.Ldone\@:
.endm
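// Store 1 <= %ecx <= 15 bytes from the xmm register \src to \dst without
// writing past the end of the buffer.  Clobbers %rax, %rcx, and \tmp64
// (\tmp32 must be its 32-bit form).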
.macro _store_partial_block src, dst, tmp64, tmp32
sub $8, %ecx
jl .Llt8\@
vpextrq $1, \src, %rax
mov %ecx, \tmp32
shl $3, %ecx
ror %cl, %rax
mov %rax, (\dst, \tmp64)
vmovq \src, (\dst)
jmp .Ldone\@
.Llt8\@:
add $4, %ecx
jl .Llt4\@
vpextrd $1, \src, %eax
mov %ecx, \tmp32
shl $3, %ecx
ror %cl, %eax
mov %eax, (\dst, \tmp64)
vmovd \src, (\dst)
jmp .Ldone\@
.Llt4\@:
vpextrb $0, \src, 0(\dst)
cmp $-2, %ecx
jl .Ldone\@
vpextrb $1, \src, 1(\dst)
je .Ldone\@
vpextrb $2, \src, 2(\dst)
.Ldone\@:
.endm
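// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1 and
// XOR each with round key 0.  For XCTR the little-endian counter is XORed
// with the IV; for CTR64 it is byte-reflected to big-endian.  Unless \final,
// LE_CTR is then advanced past both vectors.  (vpternlogd $0x96 is the
// three-way XOR a ^ b ^ c, saving one instruction on AVX512.)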
.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
.if \is_xctr
.if USE_AVX512
vmovdqa64 LE_CTR, AESDATA\i0
vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0
.else
vpxor XCTR_IV, LE_CTR, AESDATA\i0
vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
.endif
vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
.if USE_AVX512
vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1
.else
vpxor XCTR_IV, AESDATA\i1, AESDATA\i1
vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.else
vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0
_vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1
_vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.if !\final
vpaddq LE_CTR_INC2, LE_CTR, LE_CTR
.endif
.endm
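// Run all AES rounds except the last on the given AESDATA vectors, walking
// the key schedule from round key 1 up to (but not including) the last round
// key and broadcasting each one to all 128-bit lanes.  Clobbers %rax and
// RNDKEY.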
.macro _aesenc_loop vecs:vararg
mov KEY, %rax
1:
_vbroadcast128 (%rax), RNDKEY
.irp i, \vecs
vaesenc RNDKEY, AESDATA\i, AESDATA\i
.endr
add $16, %rax
cmp %rax, RNDKEYLAST_PTR
jne 1b
.endm
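// Do the last AES round for the given AESDATA vectors.  The source data is
// XORed into the last round key first, so vaesenclast yields keystream XOR
// data directly; the results are then stored to DST.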
.macro _aesenclast_and_xor vecs:vararg
.irp i, \vecs
_vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY
vaesenclast RNDKEY, AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
_vmovdqu AESDATA\i, \i*VL(DST)
.endr
.endm
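// XOR the source data into the given finished keystream vectors and store the
// results to DST.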
.macro _xor_data vecs:vararg
.irp i, \vecs
_vpxor \i*VL(SRC), AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
_vmovdqu AESDATA\i, \i*VL(DST)
.endr
.endm
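// Emit the body of one CTR64 or XCTR function.  Arguments arrive per the
// SysV ABI: %rdi = AES key schedule, %rsi = source, %rdx = destination,
// %ecx = length in bytes, %r8 = pointer to the little-endian counter pair
// (CTR64) or to the 16-byte IV (XCTR), %r9 = starting block counter (XCTR
// only).  A rough sketch of the assumed C prototypes (names and exact types
// inferred from the register usage, not authoritative):
//
//   void aes_ctr64_crypt_*(const struct crypto_aes_ctx *key,
//                          const u8 *src, u8 *dst, int len,
//                          const u64 le_ctr[2]);
//   void aes_xctr_crypt_*(const struct crypto_aes_ctx *key,
//                         const u8 *src, u8 *dst, int len,
//                         const u8 iv[16], u64 ctr);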
.macro _aes_ctr_crypt is_xctr
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.if VL == 16
.set V\i, %xmm\i
.elseif VL == 32
.set V\i, %ymm\i
.elseif VL == 64
.set V\i, %zmm\i
.else
.error "Unsupported Vector Length (VL)"
.endif
.endr
.set KEY, %rdi
.set KEY32, %edi
.set SRC, %rsi
.set DST, %rdx
.set LEN, %ecx
.set LEN64, %rcx
.set LEN8, %cl
.if \is_xctr
.set XCTR_IV_PTR, %r8
.set XCTR_CTR, %r9
.else
.set LE_CTR_PTR, %r8
.endif
.set RNDKEYLAST_PTR, %r10
.set AESDATA0, V0
.set AESDATA0_XMM, %xmm0
.set AESDATA1, V1
.set AESDATA1_XMM, %xmm1
.set AESDATA2, V2
.set AESDATA3, V3
.set AESDATA4, V4
.set AESDATA5, V5
.set AESDATA6, V6
.set AESDATA7, V7
.if \is_xctr
.set XCTR_IV, V8
.else
.set BSWAP_MASK, V8
.endif
.set LE_CTR, V9
.set LE_CTR_XMM, %xmm9
.set LE_CTR_INC1, V10
.set LE_CTR_INC2, V11
.set RNDKEY0, V12
.set RNDKEYLAST, V13
.set RNDKEY, V14
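// Initialize LE_CTR with one little-endian 64-bit block counter per 128-bit
// lane.  XCTR starts from the counter passed in %r9, giving consecutive
// values per lane; CTR64 broadcasts the caller's counter block and, when
// VL > 16, adds .Lctr_pattern so each lane is offset by its lane index.
// XCTR also broadcasts the IV, while CTR64 loads the byte-reflection mask.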
.if \is_xctr
.if VL == 16
vmovq XCTR_CTR, LE_CTR
.elseif VL == 32
vmovq XCTR_CTR, LE_CTR_XMM
inc XCTR_CTR
vmovq XCTR_CTR, AESDATA0_XMM
vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR
.else
vpbroadcastq XCTR_CTR, LE_CTR
vpsrldq $8, LE_CTR, LE_CTR
vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR
.endif
_vbroadcast128 (XCTR_IV_PTR), XCTR_IV
.else
_vbroadcast128 (LE_CTR_PTR), LE_CTR
.if VL > 16
vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR
.endif
_vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK
.endif
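// Each vector holds VL/16 blocks, so the per-vector counter increment
// LE_CTR_INC1 is 1, 2, or 4; LE_CTR_INC2 = 2*LE_CTR_INC1 advances LE_CTR
// past a pair of vectors.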
.if VL == 16
_vbroadcast128 .Lone(%rip), LE_CTR_INC1
.elseif VL == 32
_vbroadcast128 .Ltwo(%rip), LE_CTR_INC1
.else
_vbroadcast128 .Lfour(%rip), LE_CTR_INC1
.endif
vpsllq $1, LE_CTR_INC1, LE_CTR_INC2
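// The AES key length in bytes is assumed to be at byte offset 480 of the key
// struct (the layout of struct crypto_aes_ctx).  With rounds = keylen/4 + 6,
// the last round key is at offset 16*rounds = 6*16 + keylen*4 from the start
// of the round keys.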
movl 480(KEY), %eax
lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR
_vbroadcast128 (KEY), RNDKEY0
_vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST
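// Round key 0 and the last round key stay in RNDKEY0 and RNDKEYLAST, so
// advance KEY to round key 1 for _aesenc_loop.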
add $16, KEY
add $-8*VL, LEN
jl .Lloop_8x_done\@
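// Main loop: process 8 vectors (8*VL bytes) per iteration.  LEN is kept
// biased by -8*VL so a sign check detects when fewer than 8*VL bytes remain.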
.Lloop_8x\@:
_prepare_2_ctr_vecs \is_xctr, 0, 1
_prepare_2_ctr_vecs \is_xctr, 2, 3
_prepare_2_ctr_vecs \is_xctr, 4, 5
_prepare_2_ctr_vecs \is_xctr, 6, 7
_aesenc_loop 0,1,2,3,4,5,6,7
_aesenclast_and_xor 0,1,2,3,4,5,6,7
sub $-8*VL, SRC
sub $-8*VL, DST
add $-8*VL, LEN
jge .Lloop_8x\@
.Lloop_8x_done\@:
sub $-8*VL, LEN
jz .Ldone\@
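// 1 <= LEN < 8*VL bytes remain.  Counter vectors are prepared in pairs and
// the AES rounds are run on 2, 4, or 8 vectors depending on how much data is
// left; any vectors finished without a data XOR become the keystream for the
// final whole and partial vectors.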
_prepare_2_ctr_vecs \is_xctr, 0, 1
_prepare_2_ctr_vecs \is_xctr, 2, 3
cmp $4*VL, LEN
jle .Lenc_tail_atmost4vecs\@
_prepare_2_ctr_vecs \is_xctr, 4, 5
_prepare_2_ctr_vecs \is_xctr, 6, 7, final=1
_aesenc_loop 0,1,2,3,4,5,6,7
_aesenclast_and_xor 0,1,2,3
vaesenclast RNDKEYLAST, AESDATA4, AESDATA0
vaesenclast RNDKEYLAST, AESDATA5, AESDATA1
vaesenclast RNDKEYLAST, AESDATA6, AESDATA2
vaesenclast RNDKEYLAST, AESDATA7, AESDATA3
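// The keystream for the remaining at-most-4*VL bytes was finished without a
// data XOR and moved down into AESDATA0-3.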
sub $-4*VL, SRC
sub $-4*VL, DST
add $-4*VL, LEN
cmp $1*VL-1, LEN
jle .Lxor_tail_partial_vec_0\@
_xor_data 0
cmp $2*VL-1, LEN
jle .Lxor_tail_partial_vec_1\@
_xor_data 1
cmp $3*VL-1, LEN
jle .Lxor_tail_partial_vec_2\@
_xor_data 2
cmp $4*VL-1, LEN
jle .Lxor_tail_partial_vec_3\@
_xor_data 3
jmp .Ldone\@
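// 1 <= LEN <= 4*VL: only vectors 0-3 were prepared; run the AES rounds on
// either 4 or 2 of them.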
.Lenc_tail_atmost4vecs\@:
cmp $2*VL, LEN
jle .Lenc_tail_atmost2vecs\@
_aesenc_loop 0,1,2,3
_aesenclast_and_xor 0,1
vaesenclast RNDKEYLAST, AESDATA2, AESDATA0
vaesenclast RNDKEYLAST, AESDATA3, AESDATA1
sub $-2*VL, SRC
sub $-2*VL, DST
add $-2*VL, LEN
jmp .Lxor_tail_upto2vecs\@
.Lenc_tail_atmost2vecs\@:
_aesenc_loop 0,1
vaesenclast RNDKEYLAST, AESDATA0, AESDATA0
vaesenclast RNDKEYLAST, AESDATA1, AESDATA1
.Lxor_tail_upto2vecs\@:
cmp $1*VL-1, LEN
jle .Lxor_tail_partial_vec_0\@
_xor_data 0
cmp $2*VL-1, LEN
jle .Lxor_tail_partial_vec_1\@
_xor_data 1
jmp .Ldone\@
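// Entry points for a final partial vector: subtract the bytes already covered
// by whole vectors (done if none remain), advance SRC and DST, and move the
// keystream for the partial vector into AESDATA0.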
.Lxor_tail_partial_vec_1\@:
add $-1*VL, LEN
jz .Ldone\@
sub $-1*VL, SRC
sub $-1*VL, DST
_vmovdqa AESDATA1, AESDATA0
jmp .Lxor_tail_partial_vec_0\@
.Lxor_tail_partial_vec_2\@:
add $-2*VL, LEN
jz .Ldone\@
sub $-2*VL, SRC
sub $-2*VL, DST
_vmovdqa AESDATA2, AESDATA0
jmp .Lxor_tail_partial_vec_0\@
.Lxor_tail_partial_vec_3\@:
add $-3*VL, LEN
jz .Ldone\@
sub $-3*VL, SRC
sub $-3*VL, DST
_vmovdqa AESDATA3, AESDATA0
.Lxor_tail_partial_vec_0\@:
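// XOR the final 1 <= LEN < VL bytes.  With AVX512, a masked load/store
// handles it directly; otherwise handle a leading 16-byte chunk when VL == 32,
// then use the partial-block helpers for the last 1-15 bytes, reusing KEY
// (no longer needed) as scratch and %r10d to carry LEN across the helper
// that clobbers %ecx.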
.if USE_AVX512
mov $-1, %rax
bzhi LEN64, %rax, %rax
kmovq %rax, %k1
vmovdqu8 (SRC), AESDATA1{%k1}{z}
vpxord AESDATA1, AESDATA0, AESDATA0
vmovdqu8 AESDATA0, (DST){%k1}
.else
.if VL == 32
cmp $16, LEN
jl 1f
vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM
vmovdqu AESDATA1_XMM, (DST)
add $16, SRC
add $16, DST
sub $16, LEN
jz .Ldone\@
vextracti128 $1, AESDATA0, AESDATA0_XMM
1:
.endif
mov LEN, %r10d
_load_partial_block SRC, AESDATA1_XMM, KEY, KEY32
vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
mov %r10d, %ecx
_store_partial_block AESDATA0_XMM, DST, KEY, KEY32
.endif
.Ldone\@:
.if VL > 16
vzeroupper
.endif
RET
.endm
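// Instantiate the CTR64 and XCTR functions for each supported vector length.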
.set VL, 16
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
_aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_aesni_avx)
.set VL, 32
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
_aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)
.set VL, 64
.set USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
_aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)