/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX and scalar instructions
#  - 26-bit limbs
#  - handles multiple 64-byte blocks
#
# Block size 16 bytes
# key = (r, s)
# clamp r: r &= 0x0FFFFFFC0FFFFFFC (high 64 bits), 0x0FFFFFFC0FFFFFFF (low 64 bits)
# p = 2^130 - 5
# a += m
# a = (a * r) % p
# a += s
#
# Improve performance by expanding the polynomial into a sum of products:
#     h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
#  07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r
#             and s3, s2, s1, s0 in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
#  Each word in a vector holds one of the "r/s" members used in [a * r/s].
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
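# A quick numeric check of the sum-of-products identity above, using a toy
# modulus (illustrative only; the real code works mod p = 2^130 - 5 with the
# limb arithmetic described below):
#
#       #include <assert.h>
#       #include <stdint.h>
#
#       int main(void)
#       {
#               const uint64_t p = 1000003, r = 12345;
#               const uint64_t m[4] = {11, 22, 33, 44};
#               uint64_t h = 0, r2, r3, r4, i;
#
#               for (i = 0; i < 4; i++)         /* serial (Horner) form */
#                       h = (h + m[i]) * r % p;
#
#               r2 = r * r % p;
#               r3 = r2 * r % p;
#               r4 = r3 * r % p;
#               assert(h == (m[0] * r4 + m[1] * r3 + m[2] * r2 + m[3] * r) % p);
#               return 0;
#       }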
#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = pointer to the state: h at offset 0, 32-byte key (r, s) at offset 24
#  r3 = k
#  r4 = m
#  r5 = mlen
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text

.macro  SAVE_GPR GPR OFFSET FRAME
        std     \GPR,\OFFSET(\FRAME)
.endm

.macro  SAVE_VRS VRS OFFSET FRAME
        li      16, \OFFSET
        stvx    \VRS, 16, \FRAME
.endm

.macro  SAVE_VSX VSX OFFSET FRAME
        li      16, \OFFSET
        stxvx   \VSX, 16, \FRAME
.endm

.macro  RESTORE_GPR GPR OFFSET FRAME
        ld      \GPR,\OFFSET(\FRAME)
.endm

.macro  RESTORE_VRS VRS OFFSET FRAME
        li      16, \OFFSET
        lvx     \VRS, 16, \FRAME
.endm

.macro  RESTORE_VSX VSX OFFSET FRAME
        li      16, \OFFSET
        lxvx    \VSX, 16, \FRAME
.endm

.macro SAVE_REGS
        mflr 0
        std 0, 16(1)
        stdu 1,-752(1)

        SAVE_GPR 14, 112, 1
        SAVE_GPR 15, 120, 1
        SAVE_GPR 16, 128, 1
        SAVE_GPR 17, 136, 1
        SAVE_GPR 18, 144, 1
        SAVE_GPR 19, 152, 1
        SAVE_GPR 20, 160, 1
        SAVE_GPR 21, 168, 1
        SAVE_GPR 22, 176, 1
        SAVE_GPR 23, 184, 1
        SAVE_GPR 24, 192, 1
        SAVE_GPR 25, 200, 1
        SAVE_GPR 26, 208, 1
        SAVE_GPR 27, 216, 1
        SAVE_GPR 28, 224, 1
        SAVE_GPR 29, 232, 1
        SAVE_GPR 30, 240, 1
        SAVE_GPR 31, 248, 1

        addi    9, 1, 256
        SAVE_VRS 20, 0, 9
        SAVE_VRS 21, 16, 9
        SAVE_VRS 22, 32, 9
        SAVE_VRS 23, 48, 9
        SAVE_VRS 24, 64, 9
        SAVE_VRS 25, 80, 9
        SAVE_VRS 26, 96, 9
        SAVE_VRS 27, 112, 9
        SAVE_VRS 28, 128, 9
        SAVE_VRS 29, 144, 9
        SAVE_VRS 30, 160, 9
        SAVE_VRS 31, 176, 9

        SAVE_VSX 14, 192, 9
        SAVE_VSX 15, 208, 9
        SAVE_VSX 16, 224, 9
        SAVE_VSX 17, 240, 9
        SAVE_VSX 18, 256, 9
        SAVE_VSX 19, 272, 9
        SAVE_VSX 20, 288, 9
        SAVE_VSX 21, 304, 9
        SAVE_VSX 22, 320, 9
        SAVE_VSX 23, 336, 9
        SAVE_VSX 24, 352, 9
        SAVE_VSX 25, 368, 9
        SAVE_VSX 26, 384, 9
        SAVE_VSX 27, 400, 9
        SAVE_VSX 28, 416, 9
        SAVE_VSX 29, 432, 9
        SAVE_VSX 30, 448, 9
        SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
        addi    9, 1, 256
        RESTORE_VRS 20, 0, 9
        RESTORE_VRS 21, 16, 9
        RESTORE_VRS 22, 32, 9
        RESTORE_VRS 23, 48, 9
        RESTORE_VRS 24, 64, 9
        RESTORE_VRS 25, 80, 9
        RESTORE_VRS 26, 96, 9
        RESTORE_VRS 27, 112, 9
        RESTORE_VRS 28, 128, 9
        RESTORE_VRS 29, 144, 9
        RESTORE_VRS 30, 160, 9
        RESTORE_VRS 31, 176, 9

        RESTORE_VSX 14, 192, 9
        RESTORE_VSX 15, 208, 9
        RESTORE_VSX 16, 224, 9
        RESTORE_VSX 17, 240, 9
        RESTORE_VSX 18, 256, 9
        RESTORE_VSX 19, 272, 9
        RESTORE_VSX 20, 288, 9
        RESTORE_VSX 21, 304, 9
        RESTORE_VSX 22, 320, 9
        RESTORE_VSX 23, 336, 9
        RESTORE_VSX 24, 352, 9
        RESTORE_VSX 25, 368, 9
        RESTORE_VSX 26, 384, 9
        RESTORE_VSX 27, 400, 9
        RESTORE_VSX 28, 416, 9
        RESTORE_VSX 29, 432, 9
        RESTORE_VSX 30, 448, 9
        RESTORE_VSX 31, 464, 9

        RESTORE_GPR 14, 112, 1
        RESTORE_GPR 15, 120, 1
        RESTORE_GPR 16, 128, 1
        RESTORE_GPR 17, 136, 1
        RESTORE_GPR 18, 144, 1
        RESTORE_GPR 19, 152, 1
        RESTORE_GPR 20, 160, 1
        RESTORE_GPR 21, 168, 1
        RESTORE_GPR 22, 176, 1
        RESTORE_GPR 23, 184, 1
        RESTORE_GPR 24, 192, 1
        RESTORE_GPR 25, 200, 1
        RESTORE_GPR 26, 208, 1
        RESTORE_GPR 27, 216, 1
        RESTORE_GPR 28, 224, 1
        RESTORE_GPR 29, 232, 1
        RESTORE_GPR 30, 240, 1
        RESTORE_GPR 31, 248, 1

        addi    1, 1, 752
        ld 0, 16(1)
        mtlr 0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
#    [r^2, r^3, r^1, r^4]
#    [m3,  m2,  m4,  m1]
#
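# A plain-C sketch of the p[0..4] limb products above (hypothetical helper,
# not part of this file).  Limbs are below 2^26, so each five-term sum fits
# comfortably in 64 bits:
#
#       #include <stdint.h>
#
#       static void limb_mul(uint64_t p[5], const uint32_t a[5],
#                            const uint32_t r[5])
#       {
#               const uint64_t s1 = r[1] * 5, s2 = r[2] * 5;
#               const uint64_t s3 = r[3] * 5, s4 = r[4] * 5;
#
#               p[0] = (uint64_t)a[0] * r[0] + a[1] * s4 + a[2] * s3
#                       + a[3] * s2 + a[4] * s1;
#               p[1] = (uint64_t)a[0] * r[1] + (uint64_t)a[1] * r[0]
#                       + a[2] * s4 + a[3] * s3 + a[4] * s2;
#               p[2] = (uint64_t)a[0] * r[2] + (uint64_t)a[1] * r[1]
#                       + (uint64_t)a[2] * r[0] + a[3] * s4 + a[4] * s3;
#               p[3] = (uint64_t)a[0] * r[3] + (uint64_t)a[1] * r[2]
#                       + (uint64_t)a[2] * r[1] + (uint64_t)a[3] * r[0]
#                       + a[4] * s4;
#               p[4] = (uint64_t)a[0] * r[4] + (uint64_t)a[1] * r[3]
#                       + (uint64_t)a[2] * r[2] + (uint64_t)a[3] * r[1]
#                       + (uint64_t)a[4] * r[0];
#       }
#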
# multiply odd and even words
.macro mul_odd
        vmulouw 14, 4, 26
        vmulouw 10, 5, 3
        vmulouw 11, 6, 2
        vmulouw 12, 7, 1
        vmulouw 13, 8, 0
        vmulouw 15, 4, 27
        vaddudm 14, 14, 10
        vaddudm 14, 14, 11
        vmulouw 10, 5, 26
        vmulouw 11, 6, 3
        vaddudm 14, 14, 12
        vaddudm 14, 14, 13      # x0
        vaddudm 15, 15, 10
        vaddudm 15, 15, 11
        vmulouw 12, 7, 2
        vmulouw 13, 8, 1
        vaddudm 15, 15, 12
        vaddudm 15, 15, 13      # x1
        vmulouw 16, 4, 28
        vmulouw 10, 5, 27
        vmulouw 11, 6, 26
        vaddudm 16, 16, 10
        vaddudm 16, 16, 11
        vmulouw 12, 7, 3
        vmulouw 13, 8, 2
        vaddudm 16, 16, 12
        vaddudm 16, 16, 13      # x2
        vmulouw 17, 4, 29
        vmulouw 10, 5, 28
        vmulouw 11, 6, 27
        vaddudm 17, 17, 10
        vaddudm 17, 17, 11
        vmulouw 12, 7, 26
        vmulouw 13, 8, 3
        vaddudm 17, 17, 12
        vaddudm 17, 17, 13      # x3
        vmulouw 18, 4, 30
        vmulouw 10, 5, 29
        vmulouw 11, 6, 28
        vaddudm 18, 18, 10
        vaddudm 18, 18, 11
        vmulouw 12, 7, 27
        vmulouw 13, 8, 26
        vaddudm 18, 18, 12
        vaddudm 18, 18, 13      # x4
.endm

.macro mul_even
        vmuleuw 9, 4, 26
        vmuleuw 10, 5, 3
        vmuleuw 11, 6, 2
        vmuleuw 12, 7, 1
        vmuleuw 13, 8, 0
        vaddudm 14, 14, 9
        vaddudm 14, 14, 10
        vaddudm 14, 14, 11
        vaddudm 14, 14, 12
        vaddudm 14, 14, 13      # x0

        vmuleuw 9, 4, 27
        vmuleuw 10, 5, 26
        vmuleuw 11, 6, 3
        vmuleuw 12, 7, 2
        vmuleuw 13, 8, 1
        vaddudm 15, 15, 9
        vaddudm 15, 15, 10
        vaddudm 15, 15, 11
        vaddudm 15, 15, 12
        vaddudm 15, 15, 13      # x1

        vmuleuw 9, 4, 28
        vmuleuw 10, 5, 27
        vmuleuw 11, 6, 26
        vmuleuw 12, 7, 3
        vmuleuw 13, 8, 2
        vaddudm 16, 16, 9
        vaddudm 16, 16, 10
        vaddudm 16, 16, 11
        vaddudm 16, 16, 12
        vaddudm 16, 16, 13      # x2

        vmuleuw 9, 4, 29
        vmuleuw 10, 5, 28
        vmuleuw 11, 6, 27
        vmuleuw 12, 7, 26
        vmuleuw 13, 8, 3
        vaddudm 17, 17, 9
        vaddudm 17, 17, 10
        vaddudm 17, 17, 11
        vaddudm 17, 17, 12
        vaddudm 17, 17, 13      # x3

        vmuleuw 9, 4, 30
        vmuleuw 10, 5, 29
        vmuleuw 11, 6, 28
        vmuleuw 12, 7, 27
        vmuleuw 13, 8, 26
        vaddudm 18, 18, 9
        vaddudm 18, 18, 10
        vaddudm 18, 18, 11
        vaddudm 18, 18, 12
        vaddudm 18, 18, 13      # x4
.endm

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
#    [r, r^3, r^2, r^4]
#    vs0 = [r0,...]
#    vs1 = [r1,...]
#    vs2 = [r2,...]
#    vs3 = [r3,...]
#    vs4 = [r4,...]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
.macro poly1305_setup_r

        # save r
        xxlor   26, 58, 58
        xxlor   27, 59, 59
        xxlor   28, 60, 60
        xxlor   29, 61, 61
        xxlor   30, 62, 62

        xxlxor  31, 31, 31

#    [r, r^3, r^2, r^4]
        # compute r^2
        vmr     4, 26
        vmr     5, 27
        vmr     6, 28
        vmr     7, 29
        vmr     8, 30
        bl      do_mul          # r^2 r^1
        xxpermdi 58, 58, 36, 0x3                # r0
        xxpermdi 59, 59, 37, 0x3                # r1
        xxpermdi 60, 60, 38, 0x3                # r2
        xxpermdi 61, 61, 39, 0x3                # r3
        xxpermdi 62, 62, 40, 0x3                # r4
        xxpermdi 36, 36, 36, 0x3
        xxpermdi 37, 37, 37, 0x3
        xxpermdi 38, 38, 38, 0x3
        xxpermdi 39, 39, 39, 0x3
        xxpermdi 40, 40, 40, 0x3
        vspltisb 13, 2
        vsld    9, 27, 13
        vsld    10, 28, 13
        vsld    11, 29, 13
        vsld    12, 30, 13
        vaddudm 0, 9, 27
        vaddudm 1, 10, 28
        vaddudm 2, 11, 29
        vaddudm 3, 12, 30

        bl      do_mul          # r^4 r^3
        vmrgow  26, 26, 4
        vmrgow  27, 27, 5
        vmrgow  28, 28, 6
        vmrgow  29, 29, 7
        vmrgow  30, 30, 8
        vspltisb 13, 2
        vsld    9, 27, 13
        vsld    10, 28, 13
        vsld    11, 29, 13
        vsld    12, 30, 13
        vaddudm 0, 9, 27
        vaddudm 1, 10, 28
        vaddudm 2, 11, 29
        vaddudm 3, 12, 30

        # r^2 r^4
        xxlor   0, 58, 58
        xxlor   1, 59, 59
        xxlor   2, 60, 60
        xxlor   3, 61, 61
        xxlor   4, 62, 62
        xxlor   5, 32, 32
        xxlor   6, 33, 33
        xxlor   7, 34, 34
        xxlor   8, 35, 35

        vspltw  9, 26, 3
        vspltw  10, 26, 2
        vmrgow  26, 10, 9
        vspltw  9, 27, 3
        vspltw  10, 27, 2
        vmrgow  27, 10, 9
        vspltw  9, 28, 3
        vspltw  10, 28, 2
        vmrgow  28, 10, 9
        vspltw  9, 29, 3
        vspltw  10, 29, 2
        vmrgow  29, 10, 9
        vspltw  9, 30, 3
        vspltw  10, 30, 2
        vmrgow  30, 10, 9

        vsld    9, 27, 13
        vsld    10, 28, 13
        vsld    11, 29, 13
        vsld    12, 30, 13
        vaddudm 0, 9, 27
        vaddudm 1, 10, 28
        vaddudm 2, 11, 29
        vaddudm 3, 12, 30
.endm
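
# In C terms, poly1305_setup_r computes the limb vectors for r^1..r^4 the
# same way the code does (r^3 = r * r^2, r^4 = (r^2)^2), using the
# hypothetical limb_mul()/carry_reduce() helpers sketched elsewhere in
# this file:
#
#       #include <stdint.h>
#       #include <string.h>
#
#       static void setup_r_powers(uint32_t rp[4][5], const uint32_t r[5])
#       {
#               uint64_t t[5];
#
#               memcpy(rp[0], r, 5 * sizeof(uint32_t));              /* r^1 */
#               limb_mul(t, rp[0], rp[0]); carry_reduce(rp[1], t);   /* r^2 */
#               limb_mul(t, rp[0], rp[1]); carry_reduce(rp[2], t);   /* r^3 */
#               limb_mul(t, rp[1], rp[1]); carry_reduce(rp[3], t);   /* r^4 */
#       }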

SYM_FUNC_START_LOCAL(do_mul)
        mul_odd

        # do reduction ( h %= p )
        # carry reduction
        vspltisb 9, 2
        vsrd    10, 14, 31
        vsrd    11, 17, 31
        vand    7, 17, 25
        vand    4, 14, 25
        vaddudm 18, 18, 11
        vsrd    12, 18, 31
        vaddudm 15, 15, 10

        vsrd    11, 15, 31
        vand    8, 18, 25
        vand    5, 15, 25
        vaddudm 4, 4, 12
        vsld    10, 12, 9
        vaddudm 6, 16, 11

        vsrd    13, 6, 31
        vand    6, 6, 25
        vaddudm 4, 4, 10
        vsrd    10, 4, 31
        vaddudm 7, 7, 13

        vsrd    11, 7, 31
        vand    7, 7, 25
        vand    4, 4, 25
        vaddudm 5, 5, 10
        vaddudm 8, 8, 11
        blr
SYM_FUNC_END(do_mul)
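
# The same carry reduction in plain C (illustrative; the asm interleaves two
# carry chains, starting at limbs 0 and 3, for better scheduling):
#
#       #include <stdint.h>
#
#       static void carry_reduce(uint32_t h[5], uint64_t p[5])
#       {
#               uint64_t c;
#               int i;
#
#               /* propagate 26-bit carries; the final *5 uses
#                * 2^130 == 5 (mod p) to fold the top carry back in */
#               c = p[0] >> 26; p[0] &= 0x3ffffff; p[1] += c;
#               c = p[1] >> 26; p[1] &= 0x3ffffff; p[2] += c;
#               c = p[2] >> 26; p[2] &= 0x3ffffff; p[3] += c;
#               c = p[3] >> 26; p[3] &= 0x3ffffff; p[4] += c;
#               c = p[4] >> 26; p[4] &= 0x3ffffff; p[0] += c * 5;
#               c = p[0] >> 26; p[0] &= 0x3ffffff; p[1] += c;
#
#               for (i = 0; i < 5; i++)
#                       h[i] = (uint32_t)p[i];
#       }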

#
# init key
#
.macro do_poly1305_init
        addis   10, 2, rmask@toc@ha
        addi    10, 10, rmask@toc@l

        ld      11, 0(10)
        ld      12, 8(10)

        li      14, 16
        li      15, 32
        addis   10, 2, cnum@toc@ha
        addi    10, 10, cnum@toc@l
        lvx     25, 0, 10       # v25 - mask (0x3ffffff per doubleword)
        lvx     31, 14, 10      # v31 = 26 (shift amount for 26-bit limbs)
        lvx     19, 15, 10      # v19 = 1 << 24
        lxv     24, 48(10)      # vs24
        lxv     25, 64(10)      # vs25

        # initialize
        # load key from r3 to vectors
        ld      9, 24(3)
        ld      10, 32(3)
        and.    9, 9, 11
        and.    10, 10, 12

        # break r into 26-bit limbs
        extrdi  14, 9, 26, 38
        extrdi  15, 9, 26, 12
        extrdi  16, 9, 12, 0
        mtvsrdd 58, 0, 14
        insrdi  16, 10, 14, 38
        mtvsrdd 59, 0, 15
        extrdi  17, 10, 26, 24
        mtvsrdd 60, 0, 16
        extrdi  18, 10, 24, 0
        mtvsrdd 61, 0, 17
        mtvsrdd 62, 0, 18

        # rr0 = r1 * 5, rr1 = r2 * 5, rr2 = r3 * 5, rr3 = r4 * 5
        li      9, 5
        mtvsrdd 36, 0, 9
        vmulouw 0, 27, 4                # v0 = rr0
        vmulouw 1, 28, 4                # v1 = rr1
        vmulouw 2, 29, 4                # v2 = rr2
        vmulouw 3, 30, 4                # v3 = rr3
.endm
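
# In C terms, do_poly1305_init clamps r and splits it into five 26-bit limbs
# roughly like this (hypothetical helper; assumes a little-endian host):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       static void key_to_limbs(uint32_t r[5], const uint8_t key[16])
#       {
#               uint64_t lo, hi;
#
#               memcpy(&lo, key, 8);
#               memcpy(&hi, key + 8, 8);
#               lo &= 0x0ffffffc0fffffffULL;            /* clamp */
#               hi &= 0x0ffffffc0ffffffcULL;
#
#               r[0] = lo & 0x3ffffff;
#               r[1] = (lo >> 26) & 0x3ffffff;
#               r[2] = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
#               r[3] = (hi >> 14) & 0x3ffffff;
#               r[4] = (uint32_t)(hi >> 40);
#       }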

#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = pointer to the state: h at offset 0, 32-byte key (r, s) at offset 24
#  r3 = k
#  r4 = m
#  r5 = mlen
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
        cmpdi   5, 64
        blt     Out_no_poly1305

        SAVE_REGS

        do_poly1305_init

        li      21, 0   # offset into the message

        poly1305_setup_r

        # load previous H state and
        # break/convert it into 26-bit limbs
        ld      9, 0(3)
        ld      10, 8(3)
        ld      19, 16(3)
        sldi    19, 19, 24
        mtvsrdd 41, 0, 19
        extrdi  14, 9, 26, 38
        extrdi  15, 9, 26, 12
        extrdi  16, 9, 12, 0
        mtvsrdd 36, 0, 14
        insrdi  16, 10, 14, 38
        mtvsrdd 37, 0, 15
        extrdi  17, 10, 26, 24
        mtvsrdd 38, 0, 16
        extrdi  18, 10, 24, 0
        mtvsrdd 39, 0, 17
        mtvsrdd 40, 0, 18
        vor     8, 8, 9

        # input m1 m2
        add     20, 4, 21
        xxlor   49, 24, 24
        xxlor   50, 25, 25
        lxvw4x  43, 0, 20
        addi    17, 20, 16
        lxvw4x  44, 0, 17
        vperm   14, 11, 12, 17
        vperm   15, 11, 12, 18
        vand    9, 14, 25       # a0
        vsrd    10, 14, 31      # >> 26
        vsrd    11, 10, 31      # 12 bits left
        vand    10, 10, 25      # a1
        vspltisb 13, 12
        vand    16, 15, 25
        vsld    12, 16, 13
        vor     11, 11, 12
        vand    11, 11, 25      # a2
        vspltisb 13, 14
        vsrd    12, 15, 13      # >> 14
        vsrd    13, 12, 31      # >> 26, a4
        vand    12, 12, 25      # a3

        vaddudm 20, 4, 9
        vaddudm 21, 5, 10
        vaddudm 22, 6, 11
        vaddudm 23, 7, 12
        vaddudm 24, 8, 13

        # m3 m4
        addi    17, 17, 16
        lxvw4x  43, 0, 17
        addi    17, 17, 16
        lxvw4x  44, 0, 17
        vperm   14, 11, 12, 17
        vperm   15, 11, 12, 18
        vand    9, 14, 25       # a0
        vsrd    10, 14, 31      # >> 26
        vsrd    11, 10, 31      # 12 bits left
        vand    10, 10, 25      # a1
        vspltisb 13, 12
        vand    16, 15, 25
        vsld    12, 16, 13
        vspltisb 13, 14
        vor     11, 11, 12
        vand    11, 11, 25      # a2
        vsrd    12, 15, 13      # >> 14
        vsrd    13, 12, 31      # >> 26, a4
        vand    12, 12, 25      # a3

        # Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
        vmrgow  4, 9, 20
        vmrgow  5, 10, 21
        vmrgow  6, 11, 22
        vmrgow  7, 12, 23
        vmrgow  8, 13, 24
        vaddudm 8, 8, 19

        addi    5, 5, -64       # len -= 64
        addi    21, 21, 64      # offset += 64

        li      9, 64
        divdu   31, 5, 9

        cmpdi   31, 0
        ble     Skip_block_loop

        mtctr   31

# h4 =   m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
# Rewrite this sum of products as two interleaved lanes that each step by
# r^2 (h0 enters only the first lane; see the toy check below):
# h1 = (h0 + m1) * r^2, h2 = m2 * r^2
# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2
#    --> h3 = (h0 + m1) * r^4 + m3 * r^2,  h4 = m2 * r^4 + m4 * r^2
#  .... Repeat
# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2  -->
# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1  --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
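# A toy check of this two-lane schedule (small prime instead of 2^130 - 5;
# illustrative only):
#
#       #include <assert.h>
#       #include <stdint.h>
#
#       int main(void)
#       {
#               const uint64_t p = 1000003, r = 12345, r2 = r * r % p;
#               const uint64_t m[8] = {1, 2, 3, 4, 5, 6, 7, 8};
#               uint64_t h = 99, serial = h, even, odd, i;
#
#               for (i = 0; i < 8; i++)
#                       serial = (serial + m[i]) * r % p;
#
#               even = h + m[0];        /* lane for m1, m3, m5, m7 */
#               odd = m[1];             /* lane for m2, m4, m6, m8 */
#               for (i = 2; i < 8; i += 2) {
#                       even = (even * r2 + m[i]) % p;
#                       odd = (odd * r2 + m[i + 1]) % p;
#               }
#
#               /* merge: even lane weighted by r^2, odd lane by r */
#               assert((even * r2 % p + odd * r) % p == serial);
#               return 0;
#       }
#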
loop_4blocks:

        # Multiply odd words and even words
        mul_odd
        mul_even
        # carry reduction
        vspltisb 9, 2
        vsrd    10, 14, 31
        vsrd    11, 17, 31
        vand    7, 17, 25
        vand    4, 14, 25
        vaddudm 18, 18, 11
        vsrd    12, 18, 31
        vaddudm 15, 15, 10

        vsrd    11, 15, 31
        vand    8, 18, 25
        vand    5, 15, 25
        vaddudm 4, 4, 12
        vsld    10, 12, 9
        vaddudm 6, 16, 11

        vsrd    13, 6, 31
        vand    6, 6, 25
        vaddudm 4, 4, 10
        vsrd    10, 4, 31
        vaddudm 7, 7, 13

        vsrd    11, 7, 31
        vand    7, 7, 25
        vand    4, 4, 25
        vaddudm 5, 5, 10
        vaddudm 8, 8, 11

        # input m1  m2  m3  m4
        add     20, 4, 21
        xxlor   49, 24, 24
        xxlor   50, 25, 25
        lxvw4x  43, 0, 20
        addi    17, 20, 16
        lxvw4x  44, 0, 17
        vperm   14, 11, 12, 17
        vperm   15, 11, 12, 18
        addi    17, 17, 16
        lxvw4x  43, 0, 17
        addi    17, 17, 16
        lxvw4x  44, 0, 17
        vperm   17, 11, 12, 17
        vperm   18, 11, 12, 18

        vand    20, 14, 25      # a0
        vand    9, 17, 25       # a0
        vsrd    21, 14, 31      # >> 26
        vsrd    22, 21, 31      # 12 bits left
        vsrd    10, 17, 31      # >> 26
        vsrd    11, 10, 31      # 12 bits left

        vand    21, 21, 25      # a1
        vand    10, 10, 25      # a1

        vspltisb 13, 12
        vand    16, 15, 25
        vsld    23, 16, 13
        vor     22, 22, 23
        vand    22, 22, 25      # a2
        vand    16, 18, 25
        vsld    12, 16, 13
        vor     11, 11, 12
        vand    11, 11, 25      # a2
        vspltisb 13, 14
        vsrd    23, 15, 13      # >> 14
        vsrd    24, 23, 31      # >> 26, a4
        vand    23, 23, 25      # a3
        vsrd    12, 18, 13      # >> 14
        vsrd    13, 12, 31      # >> 26, a4
        vand    12, 12, 25      # a3

        vaddudm 4, 4, 20
        vaddudm 5, 5, 21
        vaddudm 6, 6, 22
        vaddudm 7, 7, 23
        vaddudm 8, 8, 24

        # Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
        vmrgow  4, 9, 4
        vmrgow  5, 10, 5
        vmrgow  6, 11, 6
        vmrgow  7, 12, 7
        vmrgow  8, 13, 8
        vaddudm 8, 8, 19

        addi    5, 5, -64       # len -= 64
        addi    21, 21, 64      # offset += 64

        bdnz    loop_4blocks

Skip_block_loop:
        xxlor   58, 0, 0
        xxlor   59, 1, 1
        xxlor   60, 2, 2
        xxlor   61, 3, 3
        xxlor   62, 4, 4
        xxlor   32, 5, 5
        xxlor   33, 6, 6
        xxlor   34, 7, 7
        xxlor   35, 8, 8

        # Multiply odd words and even words
        mul_odd
        mul_even

        # Sum the products.
        xxpermdi 41, 31, 46, 0
        xxpermdi 42, 31, 47, 0
        vaddudm 4, 14, 9
        xxpermdi 36, 31, 36, 3
        vaddudm 5, 15, 10
        xxpermdi 37, 31, 37, 3
        xxpermdi 43, 31, 48, 0
        vaddudm 6, 16, 11
        xxpermdi 38, 31, 38, 3
        xxpermdi 44, 31, 49, 0
        vaddudm 7, 17, 12
        xxpermdi 39, 31, 39, 3
        xxpermdi 45, 31, 50, 0
        vaddudm 8, 18, 13
        xxpermdi 40, 31, 40, 3

        # carry reduction
        vspltisb 9, 2
        vsrd    10, 4, 31
        vsrd    11, 7, 31
        vand    7, 7, 25
        vand    4, 4, 25
        vaddudm 8, 8, 11
        vsrd    12, 8, 31
        vaddudm 5, 5, 10

        vsrd    11, 5, 31
        vand    8, 8, 25
        vand    5, 5, 25
        vaddudm 4, 4, 12
        vsld    10, 12, 9
        vaddudm 6, 6, 11

        vsrd    13, 6, 31
        vand    6, 6, 25
        vaddudm 4, 4, 10
        vsrd    10, 4, 31
        vaddudm 7, 7, 13

        vsrd    11, 7, 31
        vand    7, 7, 25
        vand    4, 4, 25
        vaddudm 5, 5, 10
        vsrd    10, 5, 31
        vand    5, 5, 25
        vaddudm 6, 6, 10
        vaddudm 8, 8, 11

        b       do_final_update

do_final_update:
        # combine 26 bit limbs
        # v4, v5, v6, v7 and v8 are 26 bit vectors
        vsld    5, 5, 31
        vor     20, 4, 5
        vspltisb 11, 12
        vsrd    12, 6, 11
        vsld    6, 6, 31
        vsld    6, 6, 31
        vor     20, 20, 6
        vspltisb 11, 14
        vsld    7, 7, 11
        vor     21, 7, 12
        mfvsrld 16, 40          # save last 2 bytes
        vsld    8, 8, 11
        vsld    8, 8, 31
        vor     21, 21, 8
        mfvsrld 17, 52
        mfvsrld 19, 53
        srdi    16, 16, 24

        std     17, 0(3)
        std     19, 8(3)
        stw     16, 16(3)

Out_loop:
        li      3, 0

        RESTORE_REGS

        blr

Out_no_poly1305:
        li      3, 0
        blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement poly1305 using 64 x 64 bit multiplications.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
        #  mask 0x0FFFFFFC0FFFFFFF (low 64 bits, for r0)
        #  mask 0x0FFFFFFC0FFFFFFC (high 64 bits, for r1)
        addis   10, 2, rmask@toc@ha
        addi    10, 10, rmask@toc@l
        ld      11, 0(10)
        ld      12, 8(10)

        # initialize
        # load key from r3
        ld      9, 24(3)
        ld      10, 32(3)
        and.    9, 9, 11        # clamp r0
        and.    10, 10, 12      # clamp r1

        srdi    21, 10, 2
        add     19, 21, 10      # s1 = r1 + (r1 >> 2), i.e. (r1 >> 2) * 5

        # setup r and s
        li      25, 0
        mtvsrdd 32+0, 9, 19     # r0, s1
        mtvsrdd 32+1, 10, 9     # r1, r0
        mtvsrdd 32+2, 19, 25    # s1
        mtvsrdd 32+3, 9, 25     # r0

        blr
SYM_FUNC_END(Poly1305_init_64)
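
# A C sketch of the same init (hypothetical helper; little-endian host
# assumed).  s1 = r1 + (r1 >> 2) equals (r1 >> 2) * 5 because clamping
# clears the low two bits of r1:
#
#       #include <stdint.h>
#       #include <string.h>
#
#       static void init_64(uint64_t r[2], uint64_t *s1, const uint8_t key[16])
#       {
#               memcpy(&r[0], key, 8);
#               memcpy(&r[1], key + 8, 8);
#               r[0] &= 0x0ffffffc0fffffffULL;
#               r[1] &= 0x0ffffffc0ffffffcULL;
#               *s1 = r[1] + (r[1] >> 2);
#       }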

# Poly1305_mult
# v6 = (h0, h1), v8 = h2, v9 = 0 (addend for vmsumudm)
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7 = d0, v10 = d1, v11 = d2
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
        #
        #       d0 = h0 * r0 + h1 * s1
        vmsumudm        7, 6, 0, 9              # h0 * r0, h1 * s1

        #       d1 = h0 * r1 + h1 * r0 + h2 * s1
        vmsumudm        11, 6, 1, 9             # h0 * r1, h1 * r0
        vmsumudm        10, 8, 2, 11            # d1 += h2 * s1

        #       d2 = h2 * r0
        vmsumudm        11, 8, 3, 9             # d2 = h2 * r0
        blr
SYM_FUNC_END(Poly1305_mult)

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
        mfvsrld 27, 32+7
        mfvsrld 28, 32+10
        mfvsrld 29, 32+11
        mfvsrd  20, 32+7        # h0.h
        mfvsrd  21, 32+10       # h1.h

        addc    28, 28, 20
        adde    29, 29, 21
        srdi    22, 29, 0x2
        sldi    23, 22, 0x2
        add     23, 23, 22      # (h2 >> 2) * 5
        addc    27, 27, 23      # h0
        addze   28, 28          # h1
        andi.   29, 29, 0x3     # h2
        blr
SYM_FUNC_END(Carry_reduction)
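
# Plain-C model of Poly1305_mult followed by Carry_reduction (illustrative;
# uses the unsigned __int128 GCC/Clang extension, h = h2:h1:h0, and
# s1 = (r1 >> 2) * 5 as set up by Poly1305_init_64):
#
#       #include <stdint.h>
#
#       typedef unsigned __int128 u128;
#
#       static void mul_reduce_64(uint64_t h[3], const uint64_t r[2],
#                                 uint64_t s1)
#       {
#               u128 d0, d1, d2;
#               uint64_t c;
#
#               d0 = (u128)h[0] * r[0] + (u128)h[1] * s1;
#               d1 = (u128)h[0] * r[1] + (u128)h[1] * r[0] + (u128)h[2] * s1;
#               d2 = (u128)h[2] * r[0];
#
#               /* accumulate the 128-bit columns */
#               h[0] = (uint64_t)d0;
#               d1 += d0 >> 64;
#               h[1] = (uint64_t)d1;
#               h[2] = (uint64_t)d2 + (uint64_t)(d1 >> 64);
#
#               /* partial reduction: 2^130 == 5 (mod p) */
#               c = (h[2] >> 2) * 5;
#               h[2] &= 3;
#               h[0] += c;
#               h[1] += (h[0] < c);     /* carry into h1 */
#       }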

#
# poly1305 multiplication
# h *= r, h %= p
#       d0 = h0 * r0 + h1 * s1
#       d1 = h0 * r1 + h1 * r0 + h2 * s1
#       d2 = h2 * r0
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
#   - no highbit if final leftover block (highbit = 0)
#
SYM_FUNC_START(poly1305_64s)
        cmpdi   5, 0
        ble     Out_no_poly1305_64

        mflr 0
        std 0, 16(1)
        stdu 1,-400(1)

        SAVE_GPR 14, 112, 1
        SAVE_GPR 15, 120, 1
        SAVE_GPR 16, 128, 1
        SAVE_GPR 17, 136, 1
        SAVE_GPR 18, 144, 1
        SAVE_GPR 19, 152, 1
        SAVE_GPR 20, 160, 1
        SAVE_GPR 21, 168, 1
        SAVE_GPR 22, 176, 1
        SAVE_GPR 23, 184, 1
        SAVE_GPR 24, 192, 1
        SAVE_GPR 25, 200, 1
        SAVE_GPR 26, 208, 1
        SAVE_GPR 27, 216, 1
        SAVE_GPR 28, 224, 1
        SAVE_GPR 29, 232, 1
        SAVE_GPR 30, 240, 1
        SAVE_GPR 31, 248, 1

        # Init poly1305
        bl Poly1305_init_64

        li 25, 0                        # offset into the input

        add 11, 25, 4

        # load h
        # h0, h1, h2
        ld      27, 0(3)
        ld      28, 8(3)
        lwz     29, 16(3)

        li      30, 16
        divdu   31, 5, 30

        mtctr   31

        mr      24, 6           # highbit

Loop_block_64:
        vxor    9, 9, 9         # v9 = 0, the all-zero addend for vmsumudm

        ld      20, 0(11)
        ld      21, 8(11)
        addi    11, 11, 16

        addc    27, 27, 20
        adde    28, 28, 21
        adde    29, 29, 24

        li      22, 0
        mtvsrdd 32+6, 27, 28    # h0, h1
        mtvsrdd 32+8, 29, 22    # h2

        bl      Poly1305_mult

        bl      Carry_reduction

        bdnz    Loop_block_64

        std     27, 0(3)
        std     28, 8(3)
        stw     29, 16(3)

        li      3, 0

        RESTORE_GPR 14, 112, 1
        RESTORE_GPR 15, 120, 1
        RESTORE_GPR 16, 128, 1
        RESTORE_GPR 17, 136, 1
        RESTORE_GPR 18, 144, 1
        RESTORE_GPR 19, 152, 1
        RESTORE_GPR 20, 160, 1
        RESTORE_GPR 21, 168, 1
        RESTORE_GPR 22, 176, 1
        RESTORE_GPR 23, 184, 1
        RESTORE_GPR 24, 192, 1
        RESTORE_GPR 25, 200, 1
        RESTORE_GPR 26, 208, 1
        RESTORE_GPR 27, 216, 1
        RESTORE_GPR 28, 224, 1
        RESTORE_GPR 29, 232, 1
        RESTORE_GPR 30, 240, 1
        RESTORE_GPR 31, 248, 1

        addi    1, 1, 400
        ld 0, 16(1)
        mtlr 0

        blr

Out_no_poly1305_64:
        li      3, 0
        blr
SYM_FUNC_END(poly1305_64s)
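
# The block loop above, in C terms (sketch; mul_reduce_64() is the
# hypothetical helper shown earlier, and hibit is 1 for full 16-byte blocks,
# representing the appended 2^128 bit, or 0 for a padded final block):
#
#       #include <stddef.h>
#       #include <stdint.h>
#       #include <string.h>
#
#       typedef unsigned __int128 u128;
#
#       static void blocks_64(uint64_t h[3], const uint64_t r[2], uint64_t s1,
#                             const uint8_t *m, size_t nblocks, uint64_t hibit)
#       {
#               uint64_t m0, m1;
#               u128 t;
#
#               while (nblocks--) {
#                       memcpy(&m0, m, 8);      /* little-endian host */
#                       memcpy(&m1, m + 8, 8);
#                       m += 16;
#
#                       /* h += block, with the high bit appended */
#                       t = (u128)h[0] + m0;
#                       h[0] = (uint64_t)t;
#                       t = (u128)h[1] + m1 + (uint64_t)(t >> 64);
#                       h[1] = (uint64_t)t;
#                       h[2] += hibit + (uint64_t)(t >> 64);
#
#                       mul_reduce_64(h, r, s1);
#               }
#       }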

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = (h % p) + s
#
SYM_FUNC_START(poly1305_emit_64)
        ld      10, 0(3)
        ld      11, 8(3)
        ld      12, 16(3)

        # final reduction: compute h + 5; if bit 130 is set then h >= p,
        # so use h + 5 (which equals h - p modulo 2^130)
        mr      6, 10
        mr      7, 11
        mr      8, 12
        addic.  6, 6, 5
        addze   7, 7
        addze   8, 8
        srdi    9, 8, 2         # overflow?
        cmpdi   9, 0
        beq     Skip_h64
        mr      10, 6
        mr      11, 7
        mr      12, 8

Skip_h64:
        ld      6, 0(4)
        ld      7, 8(4)
        addc    10, 10, 6
        adde    11, 11, 7
        addze   12, 12

        std     10, 0(5)
        std     11, 8(5)
        blr
SYM_FUNC_END(poly1305_emit_64)
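
# C model of the emit step (sketch; little-endian host assumed):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       typedef unsigned __int128 u128;
#
#       static void emit_64(const uint64_t h[3], const uint64_t s[2],
#                           uint8_t mac[16])
#       {
#               uint64_t h0 = h[0], h1 = h[1], g0, g1, g2;
#               u128 t;
#
#               /* g = h + 5; if bit 130 is set then h >= p and
#                * h mod p = g mod 2^130, so keep g instead of h */
#               t = (u128)h0 + 5;
#               g0 = (uint64_t)t;
#               t = (u128)h1 + (uint64_t)(t >> 64);
#               g1 = (uint64_t)t;
#               g2 = h[2] + (uint64_t)(t >> 64);
#               if (g2 >> 2) {
#                       h0 = g0;
#                       h1 = g1;
#               }
#
#               /* mac = (h + s) mod 2^128 */
#               t = (u128)h0 + s[0];
#               h0 = (uint64_t)t;
#               h1 = h1 + s[1] + (uint64_t)(t >> 64);
#               memcpy(mac, &h0, 8);
#               memcpy(mac + 8, &h1, 8);
#       }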

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte   0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long   0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long   0x1a, 0x00, 0x1a, 0x00
.long   0x01000000, 0x01000000, 0x01000000, 0x01000000
.long   0x00010203, 0x04050607, 0x10111213, 0x14151617
.long   0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
SYM_DATA_END(RMASK)