root/arch/x86/crypto/blowfish-x86_64-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Structure of the crypto context: byte offsets relative to CTX.
 *
 * The offsets describe (16 + 2) 32-bit entries starting at p, followed by
 * four consecutive tables of 256 32-bit entries each (s0..s3).
 * NOTE(review): presumably this mirrors the layout of the C-side context
 * structure (P-array followed by the S-boxes) - keep both in sync.
 */
#define p       0
#define s0      ((16 + 2) * 4)
#define s1      ((16 + 2 + (1 * 256)) * 4)
#define s2      ((16 + 2 + (2 * 256)) * 4)
#define s3      ((16 + 2 + (3 * 256)) * 4)

/*
 * Register macros.
 *
 * Every register is listed together with the narrower views of it that the
 * code uses: "d" is the 32-bit view, "bl" the low byte and "bh" the second
 * byte (bits 8-15).
 */

/* pointers; note that RIO is the same register as RT1 */
#define CTX %r12
#define RIO %rsi

/* block register 0 */
#define RX0 %rax
#define RX0d %eax
#define RX0bl %al
#define RX0bh %ah

/* block register 1 */
#define RX1 %rbx
#define RX1d %ebx
#define RX1bl %bl
#define RX1bh %bh

/* block register 2 */
#define RX2 %rcx
#define RX2d %ecx
#define RX2bl %cl
#define RX2bh %ch

/* block register 3 */
#define RX3 %rdx
#define RX3d %edx
#define RX3bl %dl
#define RX3bh %dh

/* temporaries */
#define RT0 %rdi
#define RT0d %edi

#define RT1 %rsi
#define RT1d %esi

#define RT2 %r8
#define RT2d %r8d

#define RT3 %r9
#define RT3d %r9d

/* preloaded round key (4-way code) */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/
/*
 * One Feistel step on the block held in RX0.
 *
 * With a, b, c, d being the bytes of the low 32 bits of RX0 (most
 * significant first), RT0d is set to
 *
 *      ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 *
 * The rotates by 16, 16 and 32 add up to a rotate by 32, i.e. the two 32-bit
 * halves of RX0 end up swapped, and the value above is xored into the new
 * low half (the upper 32 bits of RT0 are zero after the 32-bit operations).
 *
 * Clobbers RT0, RT1, RT2 and the flags.  RT1 is the same register as RIO, so
 * RIO has to be reloaded after this macro has been used.
 */
#define F() \
        rorq $16,               RX0; \
        movzbl RX0bh,           RT0d; \
        movzbl RX0bl,           RT1d; \
        rolq $16,               RX0; \
        movl s0(CTX,RT0,4),     RT0d; \
        addl s1(CTX,RT1,4),     RT0d; \
        movzbl RX0bh,           RT1d; \
        movzbl RX0bl,           RT2d; \
        rolq $32,               RX0; \
        xorl s2(CTX,RT1,4),     RT0d; \
        addl s3(CTX,RT2,4),     RT0d; \
        xorq RT0,               RX0;

/*
 * Xor the 64 bits found at p + 4*n, i.e. the two consecutive 32-bit entries
 * n and n+1, into RX0.  Entry n goes into the low half, entry n+1 into the
 * high half.
 */
#define add_roundkey_enc(n) \
        xorq p+4*(n)(CTX),      RX0;

/*
 * Encryption double round: add key entries n and n+1 to RX0, then run F()
 * twice.  Clobbers whatever F() clobbers (RT0, RT1 alias RIO, RT2).
 */
#define round_enc(n) \
        add_roundkey_enc(n); \
        \
        F(); \
        F();

/*
 * Decryption key addition: load the 32-bit entries n-1 and n of the p array
 * as one 64-bit value, swap its halves so that entry n ends up in the low
 * 32 bits and entry n-1 in the high 32 bits, and xor the result into RX0.
 *
 * The argument is parenthesized before the subtraction, like in
 * preload_roundkey_dec(), so an expression can be passed safely.
 *
 * Clobbers RT0 and the flags.
 */
#define add_roundkey_dec(n) \
        movq p+4*((n)-1)(CTX),  RT0; \
        rorq $32,               RT0; \
        xorq RT0,               RX0;

/*
 * Decryption double round: add key entries n and n-1 to RX0 (see
 * add_roundkey_dec()), then run F() twice.  Clobbers whatever F() clobbers
 * (RT0, RT1 alias RIO, RT2).
 *
 * The last line deliberately has no line continuation, so the macro ends
 * here regardless of what follows it in the file.
 */
#define round_dec(n) \
        add_roundkey_dec(n); \
        \
        F(); \
        F();

/*
 * Load the 8 bytes at RIO into RX0.  After the rotate and the byte swap the
 * low half of RX0 holds bytes 0-3 read as a big-endian 32-bit word and the
 * high half holds bytes 4-7 read as a big-endian 32-bit word.
 */
#define read_block() \
        movq (RIO),             RX0; \
        rorq $32,               RX0; \
        bswapq                  RX0;

/*
 * Byte-swap RX0 and store it at RIO: the high half of RX0 is written first
 * as a big-endian 32-bit word, followed by the low half.  RX0 is left in
 * its byte-swapped form.
 */
#define write_block() \
        bswapq                  RX0; \
        movq RX0,               (RIO);

SYM_FUNC_START(blowfish_enc_blk)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *
         * Encrypts the single 8-byte block at src and stores the result at
         * dst.  No stack is used: the callee-saved %r12 (CTX) is parked in
         * %r11 and dst is parked in %r10 for the duration of the rounds.
         */
        movq %r12, %r11;

        movq %rdi, CTX;
        /* dst has to leave %rsi: RIO is needed for src and F() clobbers it */
        movq %rsi, %r10;
        movq %rdx, RIO;

        read_block();

        /* eight double rounds consume key entries 0..15 */
        round_enc(0);
        round_enc(2);
        round_enc(4);
        round_enc(6);
        round_enc(8);
        round_enc(10);
        round_enc(12);
        round_enc(14);
        /* final key addition with entries 16 and 17 */
        add_roundkey_enc(16);

        /* CTX is no longer needed: restore %r12, then point RIO at dst */
        movq %r11, %r12;
        movq %r10, RIO;

        write_block();
        RET;
SYM_FUNC_END(blowfish_enc_blk)

SYM_FUNC_START(blowfish_dec_blk)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *
         * Decrypts the single 8-byte block at src and stores the result at
         * dst.  No stack is used: the callee-saved %r12 (CTX) is parked in
         * %r11 and dst is parked in %r10 for the duration of the rounds.
         */
        movq %r12, %r11;

        movq %rdi, CTX;
        /* dst has to leave %rsi: RIO is needed for src and F() clobbers it */
        movq %rsi, %r10;
        movq %rdx, RIO;

        read_block();

        /* eight double rounds walk the key entries downwards from 17 to 2 */
        round_dec(17);
        round_dec(15);
        round_dec(13);
        round_dec(11);
        round_dec(9);
        round_dec(7);
        round_dec(5);
        round_dec(3);
        /* final key addition with entries 1 and 0 */
        add_roundkey_dec(1);

        /* point RIO at dst again for the store */
        movq %r10, RIO;
        write_block();

        /* restore the callee-saved register used as CTX */
        movq %r11, %r12;

        RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x has to be one of RX0..RX3, because the byte views are formed by pasting
 * "bh" and "bl" onto the macro argument.
 *
 * With a, b, c, d being the bytes of the low 32 bits of x (most significant
 * first): RT1 = c and RT3 = d are extracted first, then x is rotated right
 * by 16 to extract RT0 = a and RT2 = b.  The second rotate by 16 completes
 * the swap of the two 32-bit halves of x.  Finally
 *
 *      ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 *
 * is computed in RT0d and xored into the new low half of x.
 *
 * Clobbers RT0, RT1, RT2, RT3 and the flags.  RT1 is the same register as
 * RIO, so RIO has to be reloaded after this macro has been used.
 */
#define F4(x) \
        movzbl x ## bh,         RT1d; \
        movzbl x ## bl,         RT3d; \
        rorq $16,               x; \
        movzbl x ## bh,         RT0d; \
        movzbl x ## bl,         RT2d; \
        rorq $16,               x; \
        movl s0(CTX,RT0,4),     RT0d; \
        addl s1(CTX,RT2,4),     RT0d; \
        xorl s2(CTX,RT1,4),     RT0d; \
        addl s3(CTX,RT3,4),     RT0d; \
        xorq RT0,               x;

/*
 * Xor the round key pair currently held in RKEY into all four block
 * registers.  RKEY must have been loaded by one of the preload macros.
 */
#define add_preloaded_roundkey4() \
        xorq RKEY,              RX0; \
        xorq RKEY,              RX1; \
        xorq RKEY,              RX2; \
        xorq RKEY,              RX3;

/*
 * Load the 32-bit key entries n and n+1 as one 64-bit value into RKEY
 * (entry n in the low half).
 */
#define preload_roundkey_enc(n) \
        movq p+4*(n)(CTX),      RKEY;

/*
 * Apply the key pair already held in RKEY to all four blocks, then fetch
 * the pair starting at entry n + 2 for the next double round.  The key for
 * position n itself is not loaded here; it must have been preloaded.
 */
#define add_roundkey_enc4(n) \
        add_preloaded_roundkey4(); \
        preload_roundkey_enc(n + 2);

/*
 * Encryption double round on four blocks: key addition (plus preload of the
 * next key pair), then two passes of F4() over RX0..RX3.  Each pass handles
 * the four independent blocks back to back.
 */
#define round_enc4(n) \
        add_roundkey_enc4(n); \
        \
        F4(RX0); \
        F4(RX1); \
        F4(RX2); \
        F4(RX3); \
        \
        F4(RX0); \
        F4(RX1); \
        F4(RX2); \
        F4(RX3);

/*
 * Load the 32-bit key entries n-1 and n as one 64-bit value into RKEY and
 * swap its halves, so that entry n ends up in the low half and entry n-1 in
 * the high half.  Changes the flags (rotate).
 */
#define preload_roundkey_dec(n) \
        movq p+4*((n)-1)(CTX),  RKEY; \
        rorq $32,               RKEY;

/*
 * Apply the key pair already held in RKEY to all four blocks, then fetch
 * the pair for position n - 2 for the next double round.  The key for
 * position n itself is not loaded here; it must have been preloaded.
 */
#define add_roundkey_dec4(n) \
        add_preloaded_roundkey4(); \
        preload_roundkey_dec(n - 2);

/*
 * Decryption double round on four blocks: key addition (plus preload of the
 * next key pair), then two passes of F4() over RX0..RX3.  Each pass handles
 * the four independent blocks back to back.
 */
#define round_dec4(n) \
        add_roundkey_dec4(n); \
        \
        F4(RX0); \
        F4(RX1); \
        F4(RX2); \
        F4(RX3); \
        \
        F4(RX0); \
        F4(RX1); \
        F4(RX2); \
        F4(RX3);

/*
 * Load the four consecutive 8-byte blocks at RIO into RX0..RX3 and bring
 * each of them into the internal form: bytes 0-3 of a block, read as a
 * big-endian 32-bit word, in the low half and bytes 4-7 in the high half.
 *
 * The work is grouped by operation: all loads, then all half swaps, then
 * all byte swaps.
 */
#define read_block4() \
        movq (RIO),             RX0; \
        movq 8(RIO),            RX1; \
        movq 16(RIO),           RX2; \
        movq 24(RIO),           RX3; \
        \
        rorq $32,               RX0; \
        rorq $32,               RX1; \
        rorq $32,               RX2; \
        rorq $32,               RX3; \
        \
        bswapq                  RX0; \
        bswapq                  RX1; \
        bswapq                  RX2; \
        bswapq                  RX3;

/*
 * Byte-swap RX0..RX3 and store them as four consecutive 8-byte blocks at
 * RIO.  All byte swaps are done first, then the stores follow in ascending
 * address order.  The registers are left in their byte-swapped form.
 */
#define write_block4() \
        bswapq                  RX0; \
        bswapq                  RX1; \
        bswapq                  RX2; \
        bswapq                  RX3; \
        \
        movq RX0,               (RIO); \
        movq RX1,               8(RIO); \
        movq RX2,               16(RIO); \
        movq RX3,               24(RIO);

/*
 * Xor the byte-swapped 64-bit words at RIO, RIO + 8 and RIO + 16 into RX1,
 * RX2 and RX3 respectively; RX0 is left untouched.  Used by
 * __blowfish_dec_blk_4way when its cbc argument is set, with RIO pointing
 * at src.
 *
 * Clobbers RT0, RT2, RT3 and the flags.
 */
#define xor_block4() \
        movq (RIO),             RT0; \
        bswapq                  RT0; \
        xorq RT0,               RX1; \
        \
        movq 8(RIO),            RT2; \
        bswapq                  RT2; \
        xorq RT2,               RX2; \
        \
        movq 16(RIO),           RT3; \
        bswapq                  RT3; \
        xorq RT3,               RX3;

SYM_FUNC_START(blowfish_enc_blk_4way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *
         * Encrypts the four consecutive 8-byte blocks at src and stores the
         * results at dst.
         */
        /* callee-saved registers used below: %r12 is CTX, %rbx is RX1 */
        pushq %r12;
        pushq %rbx;

        movq %rdi, CTX
        /* dst has to leave %rsi: RIO is needed for src and F4() clobbers it */
        movq %rsi, %r11;
        movq %rdx, RIO;

        /* the first key pair must be in RKEY before the first round */
        preload_roundkey_enc(0);

        read_block4();

        /* each double round applies RKEY and preloads the next key pair */
        round_enc4(0);
        round_enc4(2);
        round_enc4(4);
        round_enc4(6);
        round_enc4(8);
        round_enc4(10);
        round_enc4(12);
        round_enc4(14);
        /* final key addition: RKEY holds entries 16 and 17 at this point */
        add_preloaded_roundkey4();

        /* point RIO at dst for the stores */
        movq %r11, RIO;
        write_block4();

        popq %rbx;
        popq %r12;
        RET;
SYM_FUNC_END(blowfish_enc_blk_4way)

SYM_FUNC_START(__blowfish_dec_blk_4way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: cbc (bool)
         *
         * Decrypts the four consecutive 8-byte blocks at src and stores the
         * results at dst.  If cbc is set, the first three 8-byte words of
         * src are additionally xored into output blocks 1..3 before the
         * store (see xor_block4()); output block 0 is never xored here.
         */
        /* callee-saved registers used below: %r12 is CTX, %rbx is RX1 */
        pushq %r12;
        pushq %rbx;
        /* cbc and src live in %rcx/%rdx, which become RX2/RX3: keep copies */
        pushq %rcx;
        pushq %rdx;

        movq %rdi, CTX;
        /* dst has to leave %rsi: RIO is needed for src and F4() clobbers it */
        movq %rsi, %r11;
        movq %rdx, RIO;

        /* the first key pair must be in RKEY before the first round */
        preload_roundkey_dec(17);
        read_block4();

        /* each double round applies RKEY and preloads the next key pair */
        round_dec4(17);
        round_dec4(15);
        round_dec4(13);
        round_dec4(11);
        round_dec4(9);
        round_dec4(7);
        round_dec4(5);
        round_dec4(3);
        /* final key addition: RKEY holds entries 1 and 0 at this point */
        add_preloaded_roundkey4();

        /* RIO = src again; CTX is dead now, so %r12 can take the cbc flag */
        popq RIO;
        popq %r12;
        /*
         * Only the low byte of a bool argument is defined by the calling
         * convention, so test just that byte instead of the full register.
         */
        testb %r12b, %r12b;
        jz .L_no_cbc_xor;

        xor_block4();

.L_no_cbc_xor:
        /* point RIO at dst for the stores */
        movq %r11, RIO;
        write_block4();

        popq %rbx;
        popq %r12;

        RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)