root/usr/src/common/crypto/arcfour/sun4v/arcfour_crypt.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include "../arcfour.h"

/*
 * Initialize the key stream 'key' using the key value.
 *
 * key       - state to initialize.  arr[] is set to the identity
 *             permutation and then shuffled by the key schedule; the
 *             stream indices i and j are reset to 0.
 * keyval    - key bytes.  They are only read, and are read while arr[] is
 *             being shuffled, so keyval must not point into *key.
 * keyvallen - number of bytes at keyval.  A key shorter than 256 bytes is
 *             repeated as often as needed; of a longer key only the first
 *             256 bytes are used.
 *
 * A keyvallen of zero or less carries no key material.  In that case keyval
 * is not read at all and the state is left as the unkeyed identity
 * permutation with i = j = 0.  (Without this check the wrap test below
 * would never keep the index inside the caller's buffer and up to 256
 * bytes would be read from it.)
 */
void
arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
{
        uchar_t tmp;
        int i, j, k;

        for (i = 0; i < 256; i++)
                key->arr[i] = (uchar_t)i;
        key->i = 0;
        key->j = 0;

        if (keyvallen <= 0)
                return;

        /*
         * Walk the key with a wrapping index k instead of first expanding
         * it into a 256-byte scratch array; the sequence of key bytes used
         * is the same, but no copy of the key is left behind on the stack.
         */
        j = 0;
        k = 0;
        for (i = 0; i < 256; i++) {
                if (k == keyvallen)
                        k = 0;

                j = (j + key->arr[i] + keyval[k]) % 256;
                k++;

                /* swap arr[i] and arr[j] */
                tmp = key->arr[i];
                key->arr[i] = key->arr[j];
                key->arr[j] = tmp;
        }
}


/*
 * Encipher 'in' using 'key'.
 * in and out can point to the same location
 *
 * key - cipher state.  key->arr, key->i and key->j are read and updated, so
 *       a later call carries on from where this one stopped.
 * in  - input bytes.
 * out - receives len bytes; each one is the matching input byte XORed with
 *       one generated keystream byte.
 * len - number of bytes to process.  Nothing is done when it is 0.
 *
 * The work is split in up to three parts:
 *   1. bytes are handled one at a time until 'in' is 8-byte aligned;
 *   2. without sun4v defined, or when 'out' is not 8-byte aligned at that
 *      point, the rest is handled one byte at a time as well;
 *   3. otherwise (sun4v only) the keystream is generated 8 bytes at a time
 *      and applied with 8-byte loads and stores.
 *
 * NOTE(review): part 3 packs BYTE 0 of each group into the most significant
 * byte of a 64-bit word, and writes pairs of state bytes with a single
 * 16-bit store whose high half is the lower-indexed byte.  This looks like
 * it relies on big-endian byte order and on key->arr being at least 2-byte
 * aligned -- confirm against the ARCFour_key definition and the build.
 */
void
arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
{
        size_t ii;
        unsigned long long in0, merge = 0, merge0 = 0, merge1, mask = 0;
        uchar_t i, j, *base, jj, *base1, tmp;
        unsigned int tmp0, tmp1, i_accum, shift = 0, i1;

        int index;

        base = key->arr;

        index = (((uintptr_t)in) & 0x7);

        /*
         * Get the 'in' on an 8-byte alignment: process single bytes until
         * 'in' reaches the next 8-byte boundary or len runs out.
         */
        if (index > 0) {
                i = key->i;
                j = key->j;

                for (index = 8 - index; (index-- > 0) && len > 0;
                    len--, in++, out++) {

                        /* i and j are uchar_t, so they wrap modulo 256 */
                        i = i + 1;
                        j = j + key->arr[i];
                        /* swap arr[i] and arr[j] */
                        tmp = key->arr[i];
                        key->arr[i] = key->arr[j];
                        key->arr[j] = tmp;
                        /* keystream byte is arr[(arr[i] + arr[j]) % 256] */
                        tmp = key->arr[i] + key->arr[j];
                        *out = *in ^ key->arr[tmp];
                }
                key->i = i;
                key->j = j;

        }
        if (len == 0)
                return;

        /* See if we're fortunate and 'out' got aligned as well */


        /*
         * Niagara optimized version for
         * the cases where the input and output buffers are aligned on
         * a multiple of 8-byte boundary.
         *
         * Without sun4v defined the byte-at-a-time loop below is compiled
         * unconditionally; with sun4v it is used only when 'out' is not
         * 8-byte aligned.
         */
#ifdef  sun4v
        if ((((uintptr_t)out) & 7) != 0) {
#endif  /* sun4v */
                i = key->i;
                j = key->j;
                for (ii = 0; ii < len; ii++) {
                        i = i + 1;
                        tmp0 = base[i];
                        j = j + tmp0;
                        tmp1 = base[j];
                        /* swap base[i] and base[j] */
                        base[i] = (uchar_t)tmp1;
                        base[j] = (uchar_t)tmp0;
                        tmp0 += tmp1;
                        tmp0 = tmp0 & 0xff;
                        out[ii] = in[ii] ^ base[tmp0];
                }
                key->i = i;
                key->j = j;
#ifdef  sun4v
        } else {
                i = key->i;
                j = key->j;

                /*
                 * Want to align base[i] on a 2B boundary -- allows updates
                 * via [i] to be performed in 2B chunks (reducing # of stores).
                 * Requires appropriate alias detection.
                 *
                 * If the next index (i + 1) is odd, generate one keystream
                 * byte now so that the main loop always starts on an even
                 * index.  That byte is not applied yet: it is parked in the
                 * top byte of merge0, and shift/mask are set so that every
                 * 8B group below is emitted one byte late.
                 */

                if (((i+1) % 2) != 0) {
                        i = i + 1;
                        tmp0 = base[i];
                        j = j + tmp0;
                        tmp1 = base[j];

                        base[i] = (uchar_t)tmp1;
                        base[j] = (uchar_t)tmp0;

                        tmp0 += tmp1;
                        tmp0 = tmp0 & 0xff;

                        merge0 = (unsigned long long)(base[tmp0]) << 56;
                        shift = 8; mask = 0xff;
                }

                /*
                 * Note - if a keystream byte was generated above, the
                 * keystream is now one byte ahead of in and out.  Each 8B
                 * update of [out] then has to combine the byte carried in
                 * merge0 with the first seven bytes just generated, and one
                 * generated byte is always left over (a 1B overrun).
                 * Need to drop out of loop early as a result: the bound
                 * (len-1) & ~7 always leaves between 1 and 8 bytes for the
                 * code after the loop.
                 */

                for (ii = 0, i1 = i; ii < ((len-1)  & (~7));
                    ii += 8, i1 = i1&0xff) {

                        /*
                         * If i1 is less than 248, the eight increments below
                         * cannot wrap around (i % 256), so don't need to
                         * bother with masking i after each increment
                         */
                        if (i1 < 248) {

                                /* BYTE 0 */
                                i1 = (i1 + 1);

                                /*
                                 * Creating this base pointer reduces subsequent
                                 * arithmetic ops required to load [i]
                                 *
                                 * N.B. don't need to check if [j] aliases.
                                 * [i] and [j] end up with the same values
                                 * anyway.
                                 */
                                base1 = &base[i1];

                                tmp0 = base1[0];
                                j = j + tmp0;

                                tmp1 = base[j];
                                /*
                                 * Don't store [i] yet; keep the value that
                                 * belongs there in i_accum
                                 */
                                i_accum = tmp1;
                                base[j] = (uchar_t)tmp0;

                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;

                                /*
                                 * Check [tmp0] doesn't alias with [i]:
                                 * base[i1] has not been written yet, so if
                                 * the output index is i1 the pending value
                                 * in i_accum is used instead of memory.
                                 */

                                /*
                                 * Updating [out] in 8B chunks; BYTE 0 goes
                                 * into the top byte of merge
                                 */
                                if (i1 == tmp0) {
                                        merge =
                                            (unsigned long long)(i_accum) << 56;
                                } else {
                                        merge =
                                            (unsigned long long)(base[tmp0]) <<
                                            56;
                                }

                                /* BYTE 1 */
                                tmp0 = base1[1];

                                j = j + tmp0;

                                /*
                                 * [j] can now alias with [i] and [i-1]
                                 * If alias abort speculation: write the
                                 * pending BYTE 0 value and do this step with
                                 * ordinary single-byte stores.
                                 */
                                if ((i1 ^ j) < 2) {
                                        base1[0] = (uchar_t)i_accum;

                                        tmp1 = base[j];

                                        base1[1] = (uchar_t)tmp1;
                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        merge |= (unsigned long long)
                                            (base[tmp0]) << 48;
                                } else {

                                        tmp1 = base[j];

                                        /*
                                         * i_accum now holds both new [i]
                                         * values: BYTE 0's in bits 15..8 and
                                         * BYTE 1's in bits 7..0
                                         */
                                        i_accum = i_accum << 8;
                                        i_accum |= tmp1;

                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        /*
                                         * Speculation succeeded! Update [i]
                                         * in 2B chunk
                                         */
                                        /* LINTED E_BAD_PTR_CAST_ALIGN */
                                        *((unsigned short *) &base[i1]) =
                                            i_accum;

                                        merge |=
                                            (unsigned long long)(base[tmp0]) <<
                                            48;
                                }


                                /*
                                 * Too expensive to perform [i] speculation for
                                 * every byte. Just need to reduce frequency
                                 * of stores until store buffer full stalls
                                 * are not the bottleneck.
                                 *
                                 * BYTE 2 to BYTE 5 are therefore plain steps
                                 * with single-byte stores.
                                 */

                                /* BYTE 2 */
                                tmp0 = base1[2];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base1[2] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp1 += tmp0;
                                tmp1 = tmp1 & 0xff;
                                merge |= (unsigned long long)(base[tmp1]) << 40;

                                /* BYTE 3 */
                                tmp0 = base1[3];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base1[3] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;
                                merge |= (unsigned long long)(base[tmp0]) << 32;

                                /* BYTE 4 */
                                tmp0 = base1[4];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base1[4] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;
                                merge |= (unsigned long long)(base[tmp0]) << 24;

                                /* BYTE 5 */
                                tmp0 = base1[5];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base1[5] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;
                                merge |= (unsigned long long)(base[tmp0]) << 16;

                                /* BYTE 6 (i1 becomes the index of this byte) */
                                i1 = (i1+6);
                                tmp0 = base1[6];
                                j = j + tmp0;
                                tmp1 = base[j];
                                /* store to [i] is deferred again */
                                i_accum = tmp1;
                                base[j] = (uchar_t)tmp0;

                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;

                                if (i1 == tmp0) {
                                        merge |=
                                            (unsigned long long)(i_accum) << 8;
                                } else {
                                        merge |=
                                            (unsigned long long)(base[tmp0]) <<
                                            8;
                                }

                                /* BYTE 7 */
                                tmp0 = base1[7];

                                /*
                                 * Perform [i] speculation again. Identical
                                 * to that performed for BYTE0 and BYTE1.
                                 */
                                j = j + tmp0;
                                if ((i1 ^ j) < 2) {
                                        base1[6] = (uchar_t)i_accum;
                                        tmp1 = base[j];

                                        base1[7] = (uchar_t)tmp1;
                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        merge |=
                                            (unsigned long long)(base[tmp0]);

                                } else {
                                        tmp1 = base[j];

                                        i_accum = i_accum << 8;
                                        i_accum |= tmp1;

                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        /* LINTED E_BAD_PTR_CAST_ALIGN */
                                        *((unsigned short *) &base[i1]) =
                                            i_accum;

                                        merge |=
                                            (unsigned long long)(base[tmp0]);
                                }
                                /* i1 now is the index used for BYTE 7 */
                                i1++;
                        } else {
                                /*
                                 * i is too close to wrap-around to allow
                                 * masking to be disregarded
                                 */

                                /*
                                 * Same old speculation for BYTE 0 and BYTE 1;
                                 * jj remembers the (even) index of the byte
                                 * whose store is deferred
                                 */

                                /* BYTE 0 */
                                i1 = (i1 + 1) & 0xff;
                                jj = (uchar_t)i1;

                                tmp0 = base[i1];
                                j = j + tmp0;

                                tmp1 = base[j];
                                i_accum = tmp1;
                                base[j] = (uchar_t)tmp0;

                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;

                                if (i1 == tmp0) {
                                        merge =
                                            (unsigned long long)(i_accum) << 56;
                                } else {
                                        merge =
                                            (unsigned long long)(base[tmp0]) <<
                                            56;
                                }

                                /* BYTE 1 */
                                tmp0 = base[i1+1];

                                j = j + tmp0;

                                if ((jj ^ j) < 2) {
                                        base[jj] = (uchar_t)i_accum;

                                        tmp1 = base[j];

                                        base[i1+1] = (uchar_t)tmp1;
                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        merge |=
                                            (unsigned long long)(base[tmp0]) <<
                                            48;
                                } else {

                                        tmp1 = base[j];

                                        i_accum = i_accum << 8;
                                        i_accum |= tmp1;

                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        /* LINTED E_BAD_PTR_CAST_ALIGN */
                                        *((unsigned short *) &base[jj]) =
                                            i_accum;

                                        merge |=
                                            (unsigned long long)(base[tmp0]) <<
                                            48;
                                }

                                /* BYTE 2 */
                                /*
                                 * The index used for BYTE 0 is always even
                                 * when the loop is entered (to satisfy
                                 * alignment), so i can only wrap around
                                 * on the even bytes. So just need to perform
                                 * mask every 2nd byte
                                 */
                                i1 = (i1 + 2) & 0xff;
                                tmp0 = base[i1];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base[i1] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;
                                merge |= (unsigned long long)(base[tmp0]) << 40;

                                /* BYTE 3 */
                                tmp0 = base[i1+1];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base[i1+1] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;
                                merge |= (unsigned long long)(base[tmp0]) << 32;

                                /* BYTE 4 */
                                i1 = (i1 + 2) & 0xff;
                                tmp0 = base[i1];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base[i1] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;
                                merge |= (unsigned long long)(base[tmp0]) << 24;

                                /* BYTE 5 */
                                tmp0 = base[i1+1];
                                j = j + tmp0;
                                tmp1 = base[j];
                                base[i1+1] = (uchar_t)tmp1;
                                base[j] = (uchar_t)tmp0;
                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;
                                merge |= (unsigned long long)(base[tmp0]) << 16;

                                /* BYTE 6 (store to [i] deferred, as BYTE 0) */
                                i1 = (i1+2) &0xff;
                                jj = (uchar_t)i1;
                                tmp0 = base[i1];

                                j = j + tmp0;

                                tmp1 = base[j];
                                i_accum = tmp1;
                                base[j] = (uchar_t)tmp0;


                                tmp0 += tmp1;
                                tmp0 = tmp0 & 0xff;

                                if (i1 == tmp0) {
                                        merge |=
                                            (unsigned long long)(i_accum) << 8;
                                } else {
                                        merge |=
                                            (unsigned long long)(base[tmp0]) <<
                                            8;
                                }

                                /* BYTE 7 */
                                i1++;
                                tmp0 = base[i1];

                                j = j + tmp0;
                                if ((jj ^ j) < 2) {
                                        base[jj] = (uchar_t)i_accum;
                                        tmp1 = base[j];

                                        base[i1] = (uchar_t)tmp1;
                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        merge |=
                                            (unsigned long long)(base[tmp0]);

                                } else {

                                        tmp1 = base[j];

                                        i_accum = i_accum << 8;
                                        i_accum |= tmp1;

                                        base[j] = (uchar_t)tmp0;

                                        tmp0 += tmp1;
                                        tmp0 = tmp0 & 0xff;

                                        /* LINTED E_BAD_PTR_CAST_ALIGN */
                                        *((unsigned short *) &base[jj]) =
                                            i_accum;

                                        merge |=
                                            (unsigned long long)(base[tmp0]);
                                }
                        }

                        /*
                         * Perform update to [out]
                         * Remember could be alignment issues
                         *
                         * merge1 is the keystream for these 8 bytes: the
                         * byte carried in merge0 (if any) followed by the
                         * bytes just generated, shifted down to make room.
                         * The last generated byte is then carried in merge0
                         * for the next group.  With shift and mask both 0
                         * this reduces to merge1 = merge and merge0 = 0.
                         */
                        /* LINTED E_BAD_PTR_CAST_ALIGN */
                        in0 = *((unsigned long long *) (&in[ii]));

                        merge1 = merge0 | (merge >> shift);

                        merge0 = (merge & mask) << 56;

                        in0 = in0 ^ merge1;

                        /* LINTED E_BAD_PTR_CAST_ALIGN */
                        *((unsigned long long *) (&out[ii])) = in0;
                }

                i = (uchar_t)i1;

                /*
                 * Handle any overrun: a keystream byte that was generated
                 * but not yet applied is still in the top byte of merge0
                 */
                if (shift) {
                        out[ii] = in[ii] ^ (merge0 >> 56);
                        ii++;
                }

                /*
                 * Handle final few bytes, one at a time
                 */
                for (; ii < len; ii++) {
                        i = i + 1;
                        tmp0 = base[i];
                        j = j + tmp0;
                        tmp1 = base[j];

                        base[i] = (uchar_t)tmp1;
                        base[j] = (uchar_t)tmp0;

                        tmp0 += tmp1;
                        tmp0 = tmp0 & 0xff;
                        out[ii] = in[ii] ^ base[tmp0];
                }
                key->i = i;
                key->j = j;
        }
#endif /* sun4v */
}