root/usr/src/common/bignum/bignumimpl.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Configuration guide
 * -------------------
 *
 * There are 4 preprocessor symbols used to configure the bignum
 * implementation.  This file contains no logic to configure based on
 * processor; we leave that to the Makefiles to specify.
 *
 * USE_FLOATING_POINT
 *   Meaning: There is support for a fast floating-point implementation of
 *   Montgomery multiply.
 *
 * PSR_MUL
 *   Meaning: There are processor-specific versions of the low level
 *   functions to implement big_mul.  Those functions are: big_mul_set_vec,
 *   big_mul_add_vec, big_mul_vec, and big_sqr_vec.  PSR_MUL implies support
 *   for all 4 functions.  You cannot pick and choose which subset of these
 *   functions to support; that would lead to a rat's nest of #ifdefs.
 *
 * HWCAP
 *   Meaning: Call multiply support functions through a function pointer.
 *   On x86, there are multiple implementations for different hardware
 *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
 *   a function is first used.  So, the support functions are called through
 *   a function pointer.  There is no need for that on Sparc, because there
 *   is only one implementation; support functions are called directly.
 *   Later, if there were some new VIS instruction, or something, and a
 *   run-time test were needed, rather than variant kernel modules and
 *   libraries, then HWCAP would be defined for Sparc, as well.
 *
 * UMUL64
 *   Meaning: It is safe to use generic C code that assumes the existence
 *   of a 32 x 32 --> 64 bit unsigned multiply.  If this is not defined,
 *   then the generic code for big_mul_add_vec() must necessarily be very slow,
 *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
 *
 */


#include <sys/types.h>
#include "bignum.h"

#ifdef  _KERNEL
#include <sys/ddi.h>
#include <sys/mdesc.h>
#include <sys/crypto/common.h>

#include <sys/kmem.h>
#include <sys/param.h>
#include <sys/sunddi.h>

#else
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#define ASSERT  assert
#endif  /* _KERNEL */

#ifdef __amd64
#ifdef _KERNEL
#include <sys/x86_archext.h>    /* cpuid_getvendor() */
#include <sys/cpuvar.h>
#else
#include <sys/auxv.h>           /* getisax() */
#endif  /* _KERNEL */
#endif  /* __amd64 */


#ifdef  _LP64 /* truncate 64-bit size_t to 32-bits */
#define UI32(ui)        ((uint32_t)ui)
#else /* size_t already 32-bits */
#define UI32(ui)        (ui)
#endif


#ifdef  _KERNEL
#define big_malloc(size)        kmem_alloc(size, KM_NOSLEEP)
#define big_free(ptr, size)     kmem_free(ptr, size)

/*
 * big_realloc()
 * Allocate memory of newsize bytes and copy oldsize bytes
 * to the newly-allocated memory, then free the
 * previously-allocated memory.
 * Note: newsize must be > oldsize
 */
void *
big_realloc(void *from, size_t oldsize, size_t newsize)
{
        void *rv;

        rv = kmem_alloc(newsize, KM_NOSLEEP);
        if (rv != NULL)
                bcopy(from, rv, oldsize);
        kmem_free(from, oldsize);
        return (rv);
}

#else   /* _KERNEL */

#ifndef MALLOC_DEBUG

#define big_malloc(size)        malloc(size)
#define big_free(ptr, size)     free(ptr)

#else

void
big_free(void *ptr, size_t size)
{
        printf("freed %d bytes at %p\n", size, ptr);
        free(ptr);
}

void *
big_malloc(size_t size)
{
        void *rv;
        rv = malloc(size);
        printf("malloced %d bytes, addr:%p\n", size, rv);
        return (rv);
}
#endif /* MALLOC_DEBUG */

#define big_realloc(x, y, z) realloc((x), (z))


/*
 * printbignum()
 * Print a BIGNUM type to stdout.
 */
void
printbignum(char *aname, BIGNUM *a)
{
        int i;

        (void) printf("\n%s\n%d\n", aname, a->sign*a->len);
        for (i = a->len - 1; i >= 0; i--) {
#ifdef BIGNUM_CHUNK_32
                (void) printf("%08x ", a->value[i]);
                if (((i & (BITSINBYTE - 1)) == 0) && (i != 0)) {
                        (void) printf("\n");
                }
#else
                (void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
                    (uint32_t)((a->value[i]) & 0xffffffff));
                if (((i & 3) == 0) && (i != 0)) { /* end of this chunk */
                        (void) printf("\n");
                }
#endif
        }
        (void) printf("\n");
}

#endif  /* _KERNEL */


#ifdef  __amd64
/*
 * Return 1 if executing on Intel, otherwise 0 (e.g., AMD64).
 * Cache the result, as the CPU can't change.
 *
 * Note: the userland version uses getisax() and checks for an AMD-64-only
 * feature.  The kernel version uses cpuid_getvendor().
 */
static int
bignum_on_intel(void)
{
        static int      cached_result = -1;

        if (cached_result == -1) { /* first time */
#ifdef _KERNEL
                cached_result = (cpuid_getvendor(CPU) == X86_VENDOR_Intel);
#else
                uint_t  ui;

                (void) getisax(&ui, 1);
                cached_result = ((ui & AV_386_AMD_MMX) == 0);
#endif  /* _KERNEL */
        }

        return (cached_result);
}
#endif  /* __amd64 */


/*
 * big_init()
 * Initialize and allocate memory for a BIGNUM type.
 *
 * big_init(number, size) is equivalent to big_init1(number, size, NULL, 0)
 *
 * Note: call big_finish() to free memory allocated by big_init().
 *
 * Input:
 * number       Uninitialized memory for BIGNUM
 * size         Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
 *
 * Output:
 * number       Initialized BIGNUM
 *
 * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
 */
BIG_ERR_CODE
big_init(BIGNUM *number, int size)
{
        number->value = big_malloc(BIGNUM_WORDSIZE * size);
        if (number->value == NULL) {
                return (BIG_NO_MEM);
        }
        number->size = size;
        number->len = 0;
        number->sign = 1;
        number->malloced = 1;
        return (BIG_OK);
}


/*
 * big_init1()
 * Initialize and, if needed, allocate memory for a BIGNUM type.
 * Use the buffer passed, buf, if any, instad of allocating memory
 * if it's at least "size" bytes.
 *
 * Note: call big_finish() to free memory allocated by big_init().
 *
 * Input:
 * number       Uninitialized memory for BIGNUM
 * size         Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
 * buf          Buffer for storing a BIGNUM.
 *              If NULL, big_init1() will allocate a buffer
 * bufsize      Size, in BIG_CHUNK_SIZE_bit words, of buf
 *
 * Output:
 * number       Initialized BIGNUM
 *
 * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
 */
BIG_ERR_CODE
big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
{
        if ((buf == NULL) || (size > bufsize)) {
                number->value = big_malloc(BIGNUM_WORDSIZE * size);
                if (number->value == NULL) {
                        return (BIG_NO_MEM);
                }
                number->size = size;
                number->malloced = 1;
        } else {
                number->value = buf;
                number->size = bufsize;
                number->malloced = 0;
        }
        number->len = 0;
        number->sign = 1;

        return (BIG_OK);
}


/*
 * big_finish()
 * Free memory, if any, allocated by big_init() or big_init1().
 */
void
big_finish(BIGNUM *number)
{
        if (number->malloced == 1) {
                big_free(number->value, BIGNUM_WORDSIZE * number->size);
                number->malloced = 0;
        }
}


/*
 * bn->size should be at least
 * (len + BIGNUM_WORDSIZE - 1) / BIGNUM_WORDSIZE bytes
 * converts from byte-big-endian format to bignum format (words in
 * little endian order, but bytes within the words big endian)
 */
void
bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
{
        int             i, j;
        uint32_t        offs;
        const uint32_t  slen = UI32(len);
        BIG_CHUNK_TYPE  word;
        uchar_t         *knwordp;

        if (slen == 0) {
                bn->len = 1;
                bn->value[0] = 0;
                return;
        }

        offs = slen % BIGNUM_WORDSIZE;
        bn->len = slen / BIGNUM_WORDSIZE;

        for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
                knwordp = &(kn[slen - BIGNUM_WORDSIZE * (i + 1)]);
                word = knwordp[0];
                for (j = 1; j < BIGNUM_WORDSIZE; j++) {
                        word = (word << BITSINBYTE) + knwordp[j];
                }
                bn->value[i] = word;
        }
        if (offs > 0) {
                word = kn[0];
                for (i = 1; i < offs; i++) word = (word << BITSINBYTE) + kn[i];
                bn->value[bn->len++] = word;
        }
        while ((bn->len > 1) && (bn->value[bn->len - 1] == 0)) {
                bn->len --;
        }
}


/*
 * copies the least significant len bytes if
 * len < bn->len * BIGNUM_WORDSIZE
 * converts from bignum format to byte-big-endian format.
 * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
 */
void
bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
{
        int             i, j;
        uint32_t        offs;
        const uint32_t  slen = UI32(len);
        BIG_CHUNK_TYPE  word;

        if (len < BIGNUM_WORDSIZE * bn->len) {
                for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
                        word = bn->value[i];
                        for (j = 0; j < BIGNUM_WORDSIZE; j++) {
                                kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
                                    word & 0xff;
                                word = word >> BITSINBYTE;
                        }
                }
                offs = slen % BIGNUM_WORDSIZE;
                if (offs > 0) {
                        word = bn->value[slen / BIGNUM_WORDSIZE];
                        for (i =  slen % BIGNUM_WORDSIZE; i > 0; i --) {
                                kn[i - 1] = word & 0xff;
                                word = word >> BITSINBYTE;
                        }
                }
        } else {
                for (i = 0; i < bn->len; i++) {
                        word = bn->value[i];
                        for (j = 0; j < BIGNUM_WORDSIZE; j++) {
                                kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
                                    word & 0xff;
                                word = word >> BITSINBYTE;
                        }
                }
                for (i = 0; i < slen - BIGNUM_WORDSIZE * bn->len; i++) {
                        kn[i] = 0;
                }
        }
}


int
big_bitlength(BIGNUM *a)
{
        int             l = 0, b = 0;
        BIG_CHUNK_TYPE  c;

        l = a->len - 1;
        while ((l > 0) && (a->value[l] == 0)) {
                l--;
        }
        b = BIG_CHUNK_SIZE;
        c = a->value[l];
        while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
                c = c << 1;
                b--;
        }

        return (l * BIG_CHUNK_SIZE + b);
}


/*
 * big_copy()
 * Copy BIGNUM src to dest, allocating memory if needed.
 */
BIG_ERR_CODE
big_copy(BIGNUM *dest, BIGNUM *src)
{
        BIG_CHUNK_TYPE  *newptr;
        int             i, len;

        len = src->len;
        while ((len > 1) && (src->value[len - 1] == 0)) {
                len--;
        }
        src->len = len;
        if (dest->size < len) {
                if (dest->malloced == 1) {
                        newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
                            BIGNUM_WORDSIZE * dest->size,
                            BIGNUM_WORDSIZE * len);
                } else {
                        newptr = (BIG_CHUNK_TYPE *)
                            big_malloc(BIGNUM_WORDSIZE * len);
                        if (newptr != NULL) {
                                dest->malloced = 1;
                        }
                }
                if (newptr == NULL) {
                        return (BIG_NO_MEM);
                }
                dest->value = newptr;
                dest->size = len;
        }
        dest->len = len;
        dest->sign = src->sign;
        for (i = 0; i < len; i++) {
                dest->value[i] = src->value[i];
        }

        return (BIG_OK);
}


/*
 * big_extend()
 * Allocate memory to extend BIGNUM number to size bignum chunks,
 * if not at least that size already.
 */
BIG_ERR_CODE
big_extend(BIGNUM *number, int size)
{
        BIG_CHUNK_TYPE  *newptr;
        int             i;

        if (number->size >= size)
                return (BIG_OK);
        if (number->malloced) {
                number->value = big_realloc(number->value,
                    BIGNUM_WORDSIZE * number->size,
                    BIGNUM_WORDSIZE * size);
        } else {
                newptr = big_malloc(BIGNUM_WORDSIZE * size);
                if (newptr != NULL) {
                        for (i = 0; i < number->size; i++) {
                                newptr[i] = number->value[i];
                        }
                }
                number->value = newptr;
        }

        if (number->value == NULL) {
                return (BIG_NO_MEM);
        }

        number->size = size;
        number->malloced = 1;
        return (BIG_OK);
}


/* returns 1 if n == 0 */
int
big_is_zero(BIGNUM *n)
{
        int     i, result;

        result = 1;
        for (i = 0; i < n->len; i++) {
                if (n->value[i] != 0) {
                        result = 0;
                }
        }
        return (result);
}


BIG_ERR_CODE
big_add_abs(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
{
        int             i, shorter, longer;
        BIG_CHUNK_TYPE  cy, ai;
        BIG_CHUNK_TYPE  *r, *a, *b, *c;
        BIG_ERR_CODE    err;
        BIGNUM          *longerarg;

        if (aa->len > bb->len) {
                shorter = bb->len;
                longer = aa->len;
                longerarg = aa;
        } else {
                shorter = aa->len;
                longer = bb->len;
                longerarg = bb;
        }
        if (result->size < longer + 1) {
                err = big_extend(result, longer + 1);
                if (err != BIG_OK) {
                        return (err);
                }
        }

        r = result->value;
        a = aa->value;
        b = bb->value;
        c = longerarg->value;
        cy = 0;
        for (i = 0; i < shorter; i++) {
                ai = a[i];
                r[i] = ai + b[i] + cy;
                if (r[i] > ai) {
                        cy = 0;
                } else if (r[i] < ai) {
                        cy = 1;
                }
        }
        for (; i < longer; i++) {
                ai = c[i];
                r[i] = ai + cy;
                if (r[i] >= ai) {
                        cy = 0;
                }
        }
        if (cy == 1) {
                r[i] = cy;
                result->len = longer + 1;
        } else {
                result->len = longer;
        }
        result->sign = 1;
        return (BIG_OK);
}


/* caller must make sure that result has at least len words allocated */
void
big_sub_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, BIG_CHUNK_TYPE *b, int len)
{
        int             i;
        BIG_CHUNK_TYPE  cy, ai;

        cy = 1;
        for (i = 0; i < len; i++) {
                ai = a[i];
                r[i] = ai + (~b[i]) + cy;
                if (r[i] > ai) {
                        cy = 0;
                } else if (r[i] < ai) {
                        cy = 1;
                }
        }
}


/* result=aa-bb  it is assumed that aa>=bb */
BIG_ERR_CODE
big_sub_pos(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
{
        int             i, shorter;
        BIG_CHUNK_TYPE  cy = 1, ai;
        BIG_CHUNK_TYPE  *r, *a, *b;
        BIG_ERR_CODE    err = BIG_OK;

        if (aa->len > bb->len) {
                shorter = bb->len;
        } else {
                shorter = aa->len;
        }
        if (result->size < aa->len) {
                err = big_extend(result, aa->len);
                if (err != BIG_OK) {
                        return (err);
                }
        }

        r = result->value;
        a = aa->value;
        b = bb->value;
        result->len = aa->len;
        cy = 1;
        for (i = 0; i < shorter; i++) {
                ai = a[i];
                r[i] = ai + (~b[i]) + cy;
                if (r[i] > ai) {
                        cy = 0;
                } else if (r[i] < ai) {
                        cy = 1;
                }
        }
        for (; i < aa->len; i++) {
                ai = a[i];
                r[i] = ai + (~0) + cy;
                if (r[i] < ai) {
                        cy = 1;
                }
        }
        result->sign = 1;

        if (cy == 0) {
                return (BIG_INVALID_ARGS);
        } else {
                return (BIG_OK);
        }
}


/* returns -1 if |aa|<|bb|, 0 if |aa|==|bb| 1 if |aa|>|bb| */
int
big_cmp_abs(BIGNUM *aa, BIGNUM *bb)
{
        int     i;

        if (aa->len > bb->len) {
                for (i = aa->len - 1; i > bb->len - 1; i--) {
                        if (aa->value[i] > 0) {
                                return (1);
                        }
                }
        } else if (aa->len < bb->len) {
                for (i = bb->len - 1; i > aa->len - 1; i--) {
                        if (bb->value[i] > 0) {
                                return (-1);
                        }
                }
        } else {
                i = aa->len - 1;
        }
        for (; i >= 0; i--) {
                if (aa->value[i] > bb->value[i]) {
                        return (1);
                } else if (aa->value[i] < bb->value[i]) {
                        return (-1);
                }
        }

        return (0);
}


BIG_ERR_CODE
big_sub(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
{
        BIG_ERR_CODE    err;

        if ((bb->sign == -1) && (aa->sign == 1)) {
                if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
                        return (err);
                }
                result->sign = 1;
        } else if ((aa->sign == -1) && (bb->sign == 1)) {
                if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
                        return (err);
                }
                result->sign = -1;
        } else if ((aa->sign == 1) && (bb->sign == 1)) {
                if (big_cmp_abs(aa, bb) >= 0) {
                        if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = 1;
                } else {
                        if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = -1;
                }
        } else {
                if (big_cmp_abs(aa, bb) >= 0) {
                        if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = -1;
                } else {
                        if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = 1;
                }
        }
        return (BIG_OK);
}


BIG_ERR_CODE
big_add(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
{
        BIG_ERR_CODE    err;

        if ((bb->sign == -1) && (aa->sign == -1)) {
                if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
                        return (err);
                }
                result->sign = -1;
        } else if ((aa->sign == 1) && (bb->sign == 1)) {
                if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
                        return (err);
                }
                result->sign = 1;
        } else if ((aa->sign == 1) && (bb->sign == -1)) {
                if (big_cmp_abs(aa, bb) >= 0) {
                        if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = 1;
                } else {
                        if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = -1;
                }
        } else {
                if (big_cmp_abs(aa, bb) >= 0) {
                        if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = -1;
                } else {
                        if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
                                return (err);
                        }
                        result->sign = 1;
                }
        }
        return (BIG_OK);
}


/* result = aa/2 */
BIG_ERR_CODE
big_half_pos(BIGNUM *result, BIGNUM *aa)
{
        BIG_ERR_CODE    err;
        int             i;
        BIG_CHUNK_TYPE  cy, cy1;
        BIG_CHUNK_TYPE  *a, *r;

        if (result->size < aa->len) {
                err = big_extend(result, aa->len);
                if (err != BIG_OK) {
                        return (err);
                }
        }

        result->len = aa->len;
        a = aa->value;
        r = result->value;
        cy = 0;
        for (i = aa->len - 1; i >= 0; i--) {
                cy1 = a[i] << (BIG_CHUNK_SIZE - 1);
                r[i] = (cy | (a[i] >> 1));
                cy = cy1;
        }
        if (r[result->len - 1] == 0) {
                result->len--;
        }

        return (BIG_OK);
}

/* result  =  aa*2 */
BIG_ERR_CODE
big_double(BIGNUM *result, BIGNUM *aa)
{
        BIG_ERR_CODE    err;
        int             i, rsize;
        BIG_CHUNK_TYPE  cy, cy1;
        BIG_CHUNK_TYPE  *a, *r;

        if ((aa->len > 0) &&
            ((aa->value[aa->len - 1] & BIG_CHUNK_HIGHBIT) != 0)) {
                rsize = aa->len + 1;
        } else {
                rsize = aa->len;
        }

        if (result->size < rsize) {
                err = big_extend(result, rsize);
                if (err != BIG_OK)
                        return (err);
        }

        a = aa->value;
        r = result->value;
        if (rsize == aa->len + 1) {
                r[rsize - 1] = 1;
        }
        cy = 0;
        for (i = 0; i < aa->len; i++) {
                cy1 = a[i] >> (BIG_CHUNK_SIZE - 1);
                r[i] = (cy | (a[i] << 1));
                cy = cy1;
        }
        result->len = rsize;
        return (BIG_OK);
}


/*
 * returns aa mod b, aa must be nonneg, b must be a max
 * (BIG_CHUNK_SIZE / 2)-bit integer
 */
static uint32_t
big_modhalf_pos(BIGNUM *aa, uint32_t b)
{
        int             i;
        BIG_CHUNK_TYPE  rem;

        if (aa->len == 0) {
                return (0);
        }
        rem = aa->value[aa->len - 1] % b;
        for (i = aa->len - 2; i >= 0; i--) {
                rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
                    (aa->value[i] >> (BIG_CHUNK_SIZE / 2))) % b;
                rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
                    (aa->value[i] & BIG_CHUNK_LOWHALFBITS)) % b;
        }

        return ((uint32_t)rem);
}


/*
 * result = aa - (2^BIG_CHUNK_SIZE)^lendiff * bb
 * result->size should be at least aa->len at entry
 * aa, bb, and result should be positive
 */
void
big_sub_pos_high(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
{
        int i, lendiff;
        BIGNUM res1, aa1;

        lendiff = aa->len - bb->len;
        res1.size = result->size - lendiff;
        res1.malloced = 0;
        res1.value = result->value + lendiff;
        aa1.size = aa->size - lendiff;
        aa1.value = aa->value + lendiff;
        aa1.len = bb->len;
        aa1.sign = 1;
        (void) big_sub_pos(&res1, &aa1, bb);
        if (result->value != aa->value) {
                for (i = 0; i < lendiff; i++) {
                        result->value[i] = aa->value[i];
                }
        }
        result->len = aa->len;
}


/*
 * returns 1, 0, or -1 depending on whether |aa| > , ==, or <
 *                                      (2^BIG_CHUNK_SIZE)^lendiff * |bb|
 * aa->len should be >= bb->len
 */
int
big_cmp_abs_high(BIGNUM *aa, BIGNUM *bb)
{
        int             lendiff;
        BIGNUM          aa1;

        lendiff = aa->len - bb->len;
        aa1.len = bb->len;
        aa1.size = aa->size - lendiff;
        aa1.malloced = 0;
        aa1.value = aa->value + lendiff;
        return (big_cmp_abs(&aa1, bb));
}


/*
 * result = aa * b where b is a max. (BIG_CHUNK_SIZE / 2)-bit positive integer.
 * result should have enough space allocated.
 */
static void
big_mulhalf_low(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
{
        int             i;
        BIG_CHUNK_TYPE  t1, t2, ai, cy;
        BIG_CHUNK_TYPE  *a, *r;

        a = aa->value;
        r = result->value;
        cy = 0;
        for (i = 0; i < aa->len; i++) {
                ai = a[i];
                t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
                t2 = (ai >> (BIG_CHUNK_SIZE / 2)) * b +
                    (t1 >> (BIG_CHUNK_SIZE / 2));
                r[i] = (t1 & BIG_CHUNK_LOWHALFBITS) |
                    (t2 << (BIG_CHUNK_SIZE / 2));
                cy = t2 >> (BIG_CHUNK_SIZE / 2);
        }
        r[i] = cy;
        result->len = aa->len + 1;
        result->sign = aa->sign;
}


/*
 * result = aa * b * 2^(BIG_CHUNK_SIZE / 2) where b is a max.
 * (BIG_CHUNK_SIZE / 2)-bit positive integer.
 * result should have enough space allocated.
 */
static void
big_mulhalf_high(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
{
        int             i;
        BIG_CHUNK_TYPE  t1, t2, ai, cy, ri;
        BIG_CHUNK_TYPE  *a, *r;

        a = aa->value;
        r = result->value;
        cy = 0;
        ri = 0;
        for (i = 0; i < aa->len; i++) {
                ai = a[i];
                t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
                t2 = (ai >>  (BIG_CHUNK_SIZE / 2)) * b +
                    (t1 >>  (BIG_CHUNK_SIZE / 2));
                r[i] = (t1 <<  (BIG_CHUNK_SIZE / 2)) + ri;
                ri = t2 & BIG_CHUNK_LOWHALFBITS;
                cy = t2 >> (BIG_CHUNK_SIZE / 2);
        }
        r[i] = (cy <<  (BIG_CHUNK_SIZE / 2)) + ri;
        result->len = aa->len + 1;
        result->sign = aa->sign;
}


/* it is assumed that result->size is big enough */
void
big_shiftleft(BIGNUM *result, BIGNUM *aa, int offs)
{
        int             i;
        BIG_CHUNK_TYPE  cy, ai;

        if (offs == 0) {
                if (result != aa) {
                        (void) big_copy(result, aa);
                }
                return;
        }
        cy = 0;
        for (i = 0; i < aa->len; i++) {
                ai = aa->value[i];
                result->value[i] = (ai << offs) | cy;
                cy = ai >> (BIG_CHUNK_SIZE - offs);
        }
        if (cy != 0) {
                result->len = aa->len + 1;
                result->value[result->len - 1] = cy;
        } else {
                result->len = aa->len;
        }
        result->sign = aa->sign;
}


/* it is assumed that result->size is big enough */
void
big_shiftright(BIGNUM *result, BIGNUM *aa, int offs)
{
        int              i;
        BIG_CHUNK_TYPE  cy, ai;

        if (offs == 0) {
                if (result != aa) {
                        (void) big_copy(result, aa);
                }
                return;
        }
        cy = aa->value[0] >> offs;
        for (i = 1; i < aa->len; i++) {
                ai = aa->value[i];
                result->value[i - 1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
                cy = ai >> offs;
        }
        result->len = aa->len;
        result->value[result->len - 1] = cy;
        result->sign = aa->sign;
}


/*
 * result = aa/bb   remainder = aa mod bb
 * it is assumed that aa and bb are positive
 */
BIG_ERR_CODE
big_div_pos(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
{
        BIG_ERR_CODE    err = BIG_OK;
        int             i, alen, blen, tlen, rlen, offs;
        BIG_CHUNK_TYPE  higha, highb, coeff;
        BIG_CHUNK_TYPE  *a, *b;
        BIGNUM          bbhigh, bblow, tresult, tmp1, tmp2;
        BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  tmp2value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  tresultvalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  bblowvalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  bbhighvalue[BIGTMPSIZE];

        a = aa->value;
        b = bb->value;
        alen = aa->len;
        blen = bb->len;
        while ((alen > 1) && (a[alen - 1] == 0)) {
                alen = alen - 1;
        }
        aa->len = alen;
        while ((blen > 1) && (b[blen - 1] == 0)) {
                blen = blen - 1;
        }
        bb->len = blen;
        if ((blen == 1) && (b[0] == 0)) {
                return (BIG_DIV_BY_0);
        }

        if (big_cmp_abs(aa, bb) < 0) {
                if ((remainder != NULL) &&
                    ((err = big_copy(remainder, aa)) != BIG_OK)) {
                        return (err);
                }
                if (result != NULL) {
                        result->len = 1;
                        result->sign = 1;
                        result->value[0] = 0;
                }
                return (BIG_OK);
        }

        if ((err = big_init1(&bblow, blen + 1,
            bblowvalue, arraysize(bblowvalue))) != BIG_OK)
                return (err);

        if ((err = big_init1(&bbhigh, blen + 1,
            bbhighvalue, arraysize(bbhighvalue))) != BIG_OK)
                goto ret1;

        if ((err = big_init1(&tmp1, alen + 2,
            tmp1value, arraysize(tmp1value))) != BIG_OK)
                goto ret2;

        if ((err = big_init1(&tmp2, blen + 2,
            tmp2value, arraysize(tmp2value))) != BIG_OK)
                goto ret3;

        if ((err = big_init1(&tresult, alen - blen + 2,
            tresultvalue, arraysize(tresultvalue))) != BIG_OK)
                goto ret4;

        offs = 0;
        highb = b[blen - 1];
        if (highb >= (BIG_CHUNK_HALF_HIGHBIT << 1)) {
                highb = highb >> (BIG_CHUNK_SIZE / 2);
                offs = (BIG_CHUNK_SIZE / 2);
        }
        while ((highb & BIG_CHUNK_HALF_HIGHBIT) == 0) {
                highb = highb << 1;
                offs++;
        }

        big_shiftleft(&bblow, bb, offs);

        if (offs <= (BIG_CHUNK_SIZE / 2 - 1)) {
                big_shiftleft(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
        } else {
                big_shiftright(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
        }
        if (bbhigh.value[bbhigh.len - 1] == 0) {
                bbhigh.len--;
        } else {
                bbhigh.value[bbhigh.len] = 0;
        }

        highb = bblow.value[bblow.len - 1];

        big_shiftleft(&tmp1, aa, offs);
        rlen = tmp1.len - bblow.len + 1;
        tresult.len = rlen;

        tmp1.len++;
        tlen = tmp1.len;
        tmp1.value[tmp1.len - 1] = 0;
        for (i = 0; i < rlen; i++) {
                higha = (tmp1.value[tlen - 1] << (BIG_CHUNK_SIZE / 2)) +
                    (tmp1.value[tlen - 2] >> (BIG_CHUNK_SIZE / 2));
                coeff = higha / (highb + 1);
                big_mulhalf_high(&tmp2, &bblow, coeff);
                big_sub_pos_high(&tmp1, &tmp1, &tmp2);
                bbhigh.len++;
                while (tmp1.value[tlen - 1] > 0) {
                        big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
                        coeff++;
                }
                bbhigh.len--;
                tlen--;
                tmp1.len--;
                while (big_cmp_abs_high(&tmp1, &bbhigh) >= 0) {
                        big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
                        coeff++;
                }
                tresult.value[rlen - i - 1] = coeff << (BIG_CHUNK_SIZE / 2);
                higha = tmp1.value[tlen - 1];
                coeff = higha / (highb + 1);
                big_mulhalf_low(&tmp2, &bblow, coeff);
                tmp2.len--;
                big_sub_pos_high(&tmp1, &tmp1, &tmp2);
                while (big_cmp_abs_high(&tmp1, &bblow) >= 0) {
                        big_sub_pos_high(&tmp1, &tmp1, &bblow);
                        coeff++;
                }
                tresult.value[rlen - i - 1] =
                    tresult.value[rlen - i - 1] + coeff;
        }

        big_shiftright(&tmp1, &tmp1, offs);

        err = BIG_OK;

        if ((remainder != NULL) &&
            ((err = big_copy(remainder, &tmp1)) != BIG_OK))
                goto ret;

        if (result != NULL)
                err = big_copy(result, &tresult);

ret:
        big_finish(&tresult);
ret4:
        big_finish(&tmp1);
ret3:
        big_finish(&tmp2);
ret2:
        big_finish(&bbhigh);
ret1:
        big_finish(&bblow);
        return (err);
}


/*
 * If there is no processor-specific integer implementation of
 * the lower level multiply functions, then this code is provided
 * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
 * big_sqr_vec().
 *
 * There are two generic implementations.  One that assumes that
 * there is hardware and C compiler support for a 32 x 32 --> 64
 * bit unsigned multiply, but otherwise is not specific to any
 * processor, platform, or ISA.
 *
 * The other makes very few assumptions about hardware capabilities.
 * It does not even assume that there is any implementation of a
 * 32 x 32 --> 64 bit multiply that is accessible to C code and
 * appropriate to use.  It falls constructs 32 x 32 --> 64 bit
 * multiplies from 16 x 16 --> 32 bit multiplies.
 *
 */

#if !defined(PSR_MUL)

#ifdef UMUL64

#if (BIG_CHUNK_SIZE == 32)

#define UNROLL8

#define MUL_SET_VEC_ROUND_PREFETCH(R) \
        p = pf * d; \
        pf = (uint64_t)a[R + 1]; \
        t = p + cy; \
        r[R] = (uint32_t)t; \
        cy = t >> 32

#define MUL_SET_VEC_ROUND_NOPREFETCH(R) \
        p = pf * d; \
        t = p + cy; \
        r[R] = (uint32_t)t; \
        cy = t >> 32

#define MUL_ADD_VEC_ROUND_PREFETCH(R) \
        t = (uint64_t)r[R]; \
        p = pf * d; \
        pf = (uint64_t)a[R + 1]; \
        t = p + t + cy; \
        r[R] = (uint32_t)t; \
        cy = t >> 32

#define MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
        t = (uint64_t)r[R]; \
        p = pf * d; \
        t = p + t + cy; \
        r[R] = (uint32_t)t; \
        cy = t >> 32

#ifdef UNROLL8

#define UNROLL 8

/*
 * r = a * b
 * where r and a are vectors; b is a single 32-bit digit
 */

uint32_t
big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
{
        uint64_t d, pf, p, t, cy;

        if (len == 0)
                return (0);
        cy = 0;
        d = (uint64_t)b;
        pf = (uint64_t)a[0];
        while (len > UNROLL) {
                MUL_SET_VEC_ROUND_PREFETCH(0);
                MUL_SET_VEC_ROUND_PREFETCH(1);
                MUL_SET_VEC_ROUND_PREFETCH(2);
                MUL_SET_VEC_ROUND_PREFETCH(3);
                MUL_SET_VEC_ROUND_PREFETCH(4);
                MUL_SET_VEC_ROUND_PREFETCH(5);
                MUL_SET_VEC_ROUND_PREFETCH(6);
                MUL_SET_VEC_ROUND_PREFETCH(7);
                r += UNROLL;
                a += UNROLL;
                len -= UNROLL;
        }
        if (len == UNROLL) {
                MUL_SET_VEC_ROUND_PREFETCH(0);
                MUL_SET_VEC_ROUND_PREFETCH(1);
                MUL_SET_VEC_ROUND_PREFETCH(2);
                MUL_SET_VEC_ROUND_PREFETCH(3);
                MUL_SET_VEC_ROUND_PREFETCH(4);
                MUL_SET_VEC_ROUND_PREFETCH(5);
                MUL_SET_VEC_ROUND_PREFETCH(6);
                MUL_SET_VEC_ROUND_NOPREFETCH(7);
                return ((uint32_t)cy);
        }
        while (len > 1) {
                MUL_SET_VEC_ROUND_PREFETCH(0);
                ++r;
                ++a;
                --len;
        }
        if (len > 0) {
                MUL_SET_VEC_ROUND_NOPREFETCH(0);
        }
        return ((uint32_t)cy);
}

/*
 * r += a * b
 * where r and a are vectors; b is a single 32-bit digit
 */

uint32_t
big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
{
        uint64_t d, pf, p, t, cy;

        if (len == 0)
                return (0);
        cy = 0;
        d = (uint64_t)b;
        pf = (uint64_t)a[0];
        while (len > 8) {
                MUL_ADD_VEC_ROUND_PREFETCH(0);
                MUL_ADD_VEC_ROUND_PREFETCH(1);
                MUL_ADD_VEC_ROUND_PREFETCH(2);
                MUL_ADD_VEC_ROUND_PREFETCH(3);
                MUL_ADD_VEC_ROUND_PREFETCH(4);
                MUL_ADD_VEC_ROUND_PREFETCH(5);
                MUL_ADD_VEC_ROUND_PREFETCH(6);
                MUL_ADD_VEC_ROUND_PREFETCH(7);
                r += 8;
                a += 8;
                len -= 8;
        }
        if (len == 8) {
                MUL_ADD_VEC_ROUND_PREFETCH(0);
                MUL_ADD_VEC_ROUND_PREFETCH(1);
                MUL_ADD_VEC_ROUND_PREFETCH(2);
                MUL_ADD_VEC_ROUND_PREFETCH(3);
                MUL_ADD_VEC_ROUND_PREFETCH(4);
                MUL_ADD_VEC_ROUND_PREFETCH(5);
                MUL_ADD_VEC_ROUND_PREFETCH(6);
                MUL_ADD_VEC_ROUND_NOPREFETCH(7);
                return ((uint32_t)cy);
        }
        while (len > 1) {
                MUL_ADD_VEC_ROUND_PREFETCH(0);
                ++r;
                ++a;
                --len;
        }
        if (len > 0) {
                MUL_ADD_VEC_ROUND_NOPREFETCH(0);
        }
        return ((uint32_t)cy);
}
#endif /* UNROLL8 */

void
big_sqr_vec(uint32_t *r, uint32_t *a, int len)
{
        uint32_t        *tr, *ta;
        int             tlen, row, col;
        uint64_t        p, s, t, t2, cy;
        uint32_t        d;

        tr = r + 1;
        ta = a;
        tlen = len - 1;
        tr[tlen] = big_mul_set_vec(tr, ta + 1, tlen, ta[0]);
        while (--tlen > 0) {
                tr += 2;
                ++ta;
                tr[tlen] = big_mul_add_vec(tr, ta + 1, tlen, ta[0]);
        }
        s = (uint64_t)a[0];
        s = s * s;
        r[0] = (uint32_t)s;
        cy = s >> 32;
        p = ((uint64_t)r[1] << 1) + cy;
        r[1] = (uint32_t)p;
        cy = p >> 32;
        row = 1;
        col = 2;
        while (row < len) {
                s = (uint64_t)a[row];
                s = s * s;
                p = (uint64_t)r[col] << 1;
                t = p + s;
                d = (uint32_t)t;
                t2 = (uint64_t)d + cy;
                r[col] = (uint32_t)t2;
                cy = (t >> 32) + (t2 >> 32);
                if (row == len - 1)
                        break;
                p = ((uint64_t)r[col + 1] << 1) + cy;
                r[col + 1] = (uint32_t)p;
                cy = p >> 32;
                ++row;
                col += 2;
        }
        r[col + 1] = (uint32_t)cy;
}

#else /* BIG_CHUNK_SIZE == 64 */

/*
 * r = r + a * digit, r and a are vectors of length len
 * returns the carry digit
 */
BIG_CHUNK_TYPE
big_mul_add_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
    BIG_CHUNK_TYPE digit)
{
        BIG_CHUNK_TYPE  cy, cy1, retcy, dlow, dhigh;
        int             i;

        cy1 = 0;
        dlow = digit & BIG_CHUNK_LOWHALFBITS;
        dhigh = digit >> (BIG_CHUNK_SIZE / 2);
        for (i = 0; i < len; i++) {
                cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
                    dlow * (a[i] & BIG_CHUNK_LOWHALFBITS) +
                    (r[i] & BIG_CHUNK_LOWHALFBITS);
                cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
                    dlow * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
                    (r[i] >> (BIG_CHUNK_SIZE / 2));
                r[i] = (cy & BIG_CHUNK_LOWHALFBITS) |
                    (cy1 << (BIG_CHUNK_SIZE / 2));
        }
        retcy = cy1 >> (BIG_CHUNK_SIZE / 2);

        cy1 = r[0] & BIG_CHUNK_LOWHALFBITS;
        for (i = 0; i < len - 1; i++) {
                cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
                    dhigh * (a[i] & BIG_CHUNK_LOWHALFBITS) +
                    (r[i] >> (BIG_CHUNK_SIZE / 2));
                r[i] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
                    (cy << (BIG_CHUNK_SIZE / 2));
                cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
                    dhigh * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
                    (r[i + 1] & BIG_CHUNK_LOWHALFBITS);
        }
        cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
            dhigh * (a[len - 1] & BIG_CHUNK_LOWHALFBITS) +
            (r[len - 1] >> (BIG_CHUNK_SIZE / 2));
        r[len - 1] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
            (cy << (BIG_CHUNK_SIZE / 2));
        retcy = (cy >> (BIG_CHUNK_SIZE / 2)) +
            dhigh * (a[len - 1] >> (BIG_CHUNK_SIZE / 2)) + retcy;

        return (retcy);
}


/*
 * r = a * digit, r and a are vectors of length len
 * returns the carry digit
 */
BIG_CHUNK_TYPE
big_mul_set_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
    BIG_CHUNK_TYPE digit)
{
        int     i;

        ASSERT(r != a);
        for (i = 0; i < len; i++) {
                r[i] = 0;
        }
        return (big_mul_add_vec(r, a, len, digit));
}

void
big_sqr_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len)
{
        int i;

        ASSERT(r != a);
        r[len] = big_mul_set_vec(r, a, len, a[0]);
        for (i = 1; i < len; ++i)
                r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
}

#endif /* BIG_CHUNK_SIZE == 32/64 */


#else /* ! UMUL64 */

#if (BIG_CHUNK_SIZE != 32)
#error "Don't use 64-bit chunks without defining UMUL64"
#endif


/*
 * r = r + a * digit, r and a are vectors of length len
 * returns the carry digit
 */
uint32_t
big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
        uint32_t cy, cy1, retcy, dlow, dhigh;
        int i;

        cy1 = 0;
        dlow = digit & 0xffff;
        dhigh = digit >> 16;
        for (i = 0; i < len; i++) {
                cy = (cy1 >> 16) + dlow * (a[i] & 0xffff) + (r[i] & 0xffff);
                cy1 = (cy >> 16) + dlow * (a[i]>>16) + (r[i] >> 16);
                r[i] = (cy & 0xffff) | (cy1 << 16);
        }
        retcy = cy1 >> 16;

        cy1 = r[0] & 0xffff;
        for (i = 0; i < len - 1; i++) {
                cy = (cy1 >> 16) + dhigh * (a[i] & 0xffff) + (r[i] >> 16);
                r[i] = (cy1 & 0xffff) | (cy << 16);
                cy1 = (cy >> 16) + dhigh * (a[i] >> 16) + (r[i + 1] & 0xffff);
        }
        cy = (cy1 >> 16) + dhigh * (a[len - 1] & 0xffff) + (r[len - 1] >> 16);
        r[len - 1] = (cy1 & 0xffff) | (cy << 16);
        retcy = (cy >> 16) + dhigh * (a[len - 1] >> 16) + retcy;

        return (retcy);
}


/*
 * r = a * digit, r and a are vectors of length len
 * returns the carry digit
 */
uint32_t
big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
        int     i;

        ASSERT(r != a);
        for (i = 0; i < len; i++) {
                r[i] = 0;
        }

        return (big_mul_add_vec(r, a, len, digit));
}

void
big_sqr_vec(uint32_t *r, uint32_t *a, int len)
{
        int i;

        ASSERT(r != a);
        r[len] = big_mul_set_vec(r, a, len, a[0]);
        for (i = 1; i < len; ++i)
                r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
}

#endif /* UMUL64 */

void
big_mul_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int alen,
    BIG_CHUNK_TYPE *b, int blen)
{
        int i;

        r[alen] = big_mul_set_vec(r, a, alen, b[0]);
        for (i = 1; i < blen; ++i)
                r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
}


#endif /* ! PSR_MUL */


/*
 * result = aa * bb  result->value should be big enough to hold the result
 *
 * Implementation: Standard grammar school algorithm
 *
 */
BIG_ERR_CODE
big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
{
        BIGNUM          tmp1;
        BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  *r, *t, *a, *b;
        BIG_ERR_CODE    err;
        int             i, alen, blen, rsize, sign, diff;

        if (aa == bb) {
                diff = 0;
        } else {
                diff = big_cmp_abs(aa, bb);
                if (diff < 0) {
                        BIGNUM *tt;
                        tt = aa;
                        aa = bb;
                        bb = tt;
                }
        }
        a = aa->value;
        b = bb->value;
        alen = aa->len;
        blen = bb->len;
        while ((alen > 1) && (a[alen - 1] == 0)) {
                alen--;
        }
        aa->len = alen;
        while ((blen > 1) && (b[blen - 1] == 0)) {
                blen--;
        }
        bb->len = blen;

        rsize = alen + blen;
        ASSERT(rsize > 0);
        if (result->size < rsize) {
                err = big_extend(result, rsize);
                if (err != BIG_OK) {
                        return (err);
                }
                /* aa or bb might be an alias to result */
                a = aa->value;
                b = bb->value;
        }
        r = result->value;

        if (((alen == 1) && (a[0] == 0)) || ((blen == 1) && (b[0] == 0))) {
                result->len = 1;
                result->sign = 1;
                r[0] = 0;
                return (BIG_OK);
        }
        sign = aa->sign * bb->sign;
        if ((alen == 1) && (a[0] == 1)) {
                for (i = 0; i < blen; i++) {
                        r[i] = b[i];
                }
                result->len = blen;
                result->sign = sign;
                return (BIG_OK);
        }
        if ((blen == 1) && (b[0] == 1)) {
                for (i = 0; i < alen; i++) {
                        r[i] = a[i];
                }
                result->len = alen;
                result->sign = sign;
                return (BIG_OK);
        }

        if ((err = big_init1(&tmp1, rsize,
            tmp1value, arraysize(tmp1value))) != BIG_OK) {
                return (err);
        }
        t = tmp1.value;

        for (i = 0; i < rsize; i++) {
                t[i] = 0;
        }

        if (diff == 0 && alen > 2) {
                BIG_SQR_VEC(t, a, alen);
        } else if (blen > 0) {
                BIG_MUL_VEC(t, a, alen, b, blen);
        }

        if (t[rsize - 1] == 0) {
                tmp1.len = rsize - 1;
        } else {
                tmp1.len = rsize;
        }

        err = big_copy(result, &tmp1);

        result->sign = sign;

        big_finish(&tmp1);

        return (err);
}


/*
 * big_mont_mul()
 * Montgomery multiplication.
 *
 * Caller must ensure that  a < n,  b < n,  ret->size >=  2 * n->len + 1,
 * and that ret is not n.
 */
BIG_ERR_CODE
big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
{
        int             i, j, nlen, needsubtract;
        BIG_CHUNK_TYPE  *nn, *rr, *rrplusi;
        BIG_CHUNK_TYPE  digit, c;
        BIG_ERR_CODE    err;
#ifdef  __amd64
#define BIG_CPU_UNKNOWN 0
#define BIG_CPU_AMD     1
#define BIG_CPU_INTEL   2
        static int      big_cpu = BIG_CPU_UNKNOWN;
        BIG_CHUNK_TYPE  carry[BIGTMPSIZE];

        if (big_cpu == BIG_CPU_UNKNOWN) {
                big_cpu = 1 + bignum_on_intel();
        }
#endif  /* __amd64 */

        nlen = n->len;
        nn = n->value;

        rr = ret->value;

        if ((err = big_mul(ret, a, b)) != BIG_OK) {
                return (err);
        }

        rr = ret->value;
        for (i = ret->len; i < 2 * nlen + 1; i++) {
                rr[i] = 0;
        }

#ifdef  __amd64 /* pipelining optimization for Intel 64, but not AMD64 */
        if ((big_cpu == BIG_CPU_INTEL) && (nlen <= BIGTMPSIZE)) {
                /*
                 * Perform the following in two for loops to reduce the
                 * dependency between computing the carryover bits with
                 * BIG_MUL_ADD_VEC() and adding them, thus improving pipelining.
                 */
                for (i = 0; i < nlen; i++) {
                        rrplusi = rr + i;
                        digit = *rrplusi * n0;
                        carry[i] = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
                }
                for (i = 0; i < nlen; i++) {
                        j = i + nlen;
                        rr[j] += carry[i];
                        while (rr[j] < carry[i]) {
                                rr[++j] += 1;
                                carry[i] = 1;
                        }
                }
        } else
#endif  /* __amd64 */
        { /* no pipelining optimization */
                for (i = 0; i < nlen; i++) {
                        rrplusi = rr + i;
                        digit = *rrplusi * n0;
                        c = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
                        j = i + nlen;
                        rr[j] += c;
                        while (rr[j] < c) {
                                rr[++j] += 1;
                                c = 1;
                        }
                }
        }

        needsubtract = 0;
        if ((rr[2 * nlen]  != 0))
                needsubtract = 1;
        else {
                for (i = 2 * nlen - 1; i >= nlen; i--) {
                        if (rr[i] > nn[i - nlen]) {
                                needsubtract = 1;
                                break;
                        } else if (rr[i] < nn[i - nlen]) {
                                break;
                        }
                }
        }
        if (needsubtract)
                big_sub_vec(rr, rr + nlen, nn, nlen);
        else {
                for (i = 0; i < nlen; i++) {
                        rr[i] = rr[i + nlen];
                }
        }

        /* Remove leading zeros, but keep at least 1 digit: */
        for (i = nlen - 1; (i > 0) && (rr[i] == 0); i--)
                ;
        ret->len = i + 1;

        return (BIG_OK);
}


BIG_CHUNK_TYPE
big_n0(BIG_CHUNK_TYPE n)
{
        int             i;
        BIG_CHUNK_TYPE  result, tmp;

        result = 0;
        tmp = BIG_CHUNK_ALLBITS;
        for (i = 0; i < BIG_CHUNK_SIZE; i++) {
                if ((tmp & 1) == 1) {
                        result = (result >> 1) | BIG_CHUNK_HIGHBIT;
                        tmp = tmp - n;
                } else {
                        result = (result >> 1);
                }
                tmp = tmp >> 1;
        }

        return (result);
}


int
big_numbits(BIGNUM *n)
{
        int             i, j;
        BIG_CHUNK_TYPE  t;

        for (i = n->len - 1; i > 0; i--) {
                if (n->value[i] != 0) {
                        break;
                }
        }
        t = n->value[i];
        for (j = BIG_CHUNK_SIZE; j > 0; j--) {
                if ((t & BIG_CHUNK_HIGHBIT) == 0) {
                        t = t << 1;
                } else {
                        return (BIG_CHUNK_SIZE * i + j);
                }
        }
        return (0);
}


/* caller must make sure that a < n */
BIG_ERR_CODE
big_mont_rr(BIGNUM *result, BIGNUM *n)
{
        BIGNUM          rr;
        BIG_CHUNK_TYPE  rrvalue[BIGTMPSIZE];
        int             len, i;
        BIG_ERR_CODE    err;

        rr.malloced = 0;
        len = n->len;

        if ((err = big_init1(&rr, 2 * len + 1,
            rrvalue, arraysize(rrvalue))) != BIG_OK) {
                return (err);
        }

        for (i = 0; i < 2 * len; i++) {
                rr.value[i] = 0;
        }
        rr.value[2 * len] = 1;
        rr.len = 2 * len + 1;
        if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
                goto ret;
        }
        err = big_copy(result, &rr);
ret:
        big_finish(&rr);
        return (err);
}


/* caller must make sure that a < n */
BIG_ERR_CODE
big_mont_conv(BIGNUM *result, BIGNUM *a, BIGNUM *n, BIG_CHUNK_TYPE n0,
    BIGNUM *n_rr)
{
        BIGNUM          rr;
        BIG_CHUNK_TYPE  rrvalue[BIGTMPSIZE];
        int             len, i;
        BIG_ERR_CODE    err;

        rr.malloced = 0;
        len = n->len;

        if ((err = big_init1(&rr, 2 * len + 1, rrvalue, arraysize(rrvalue)))
            != BIG_OK) {
                return (err);
        }

        if (n_rr == NULL) {
                for (i = 0; i < 2 * len; i++) {
                        rr.value[i] = 0;
                }
                rr.value[2 * len] = 1;
                rr.len = 2 * len + 1;
                if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
                        goto ret;
                }
                n_rr = &rr;
        }

        if ((err = big_mont_mul(&rr, n_rr, a, n, n0)) != BIG_OK) {
                goto ret;
        }
        err = big_copy(result, &rr);

ret:
        big_finish(&rr);
        return (err);
}


#ifdef  USE_FLOATING_POINT
#define big_modexp_ncp_float    big_modexp_ncp_sw
#else
#define big_modexp_ncp_int      big_modexp_ncp_sw
#endif

#define MAX_EXP_BIT_GROUP_SIZE 6
#define APOWERS_MAX_SIZE (1 << (MAX_EXP_BIT_GROUP_SIZE - 1))

/* ARGSUSED */
static BIG_ERR_CODE
big_modexp_ncp_int(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
    BIGNUM *tmp, BIG_CHUNK_TYPE n0)

{
        BIGNUM          apowers[APOWERS_MAX_SIZE];
        BIGNUM          tmp1;
        BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
        int             i, j, k, l, m, p;
        uint32_t        bit, bitind, bitcount, groupbits, apowerssize;
        uint32_t        nbits;
        BIG_ERR_CODE    err;

        nbits = big_numbits(e);
        if (nbits < 50) {
                groupbits = 1;
                apowerssize = 1;
        } else {
                groupbits = MAX_EXP_BIT_GROUP_SIZE;
                apowerssize = 1 << (groupbits - 1);
        }


        if ((err = big_init1(&tmp1, 2 * n->len + 1,
            tmp1value, arraysize(tmp1value))) != BIG_OK) {
                return (err);
        }

        /* clear the malloced bit to help cleanup */
        for (i = 0; i < apowerssize; i++) {
                apowers[i].malloced = 0;
        }

        for (i = 0; i < apowerssize; i++) {
                if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
                    BIG_OK) {
                        goto ret;
                }
        }

        (void) big_copy(&(apowers[0]), ma);

        if ((err = big_mont_mul(&tmp1, ma, ma, n, n0)) != BIG_OK) {
                goto ret;
        }
        (void) big_copy(ma, &tmp1);

        for (i = 1; i < apowerssize; i++) {
                if ((err = big_mont_mul(&tmp1, ma,
                    &(apowers[i - 1]), n, n0)) != BIG_OK) {
                        goto ret;
                }
                (void) big_copy(&apowers[i], &tmp1);
        }

        bitind = nbits % BIG_CHUNK_SIZE;
        k = 0;
        l = 0;
        p = 0;
        bitcount = 0;
        for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
                for (j = bitind - 1; j >= 0; j--) {
                        bit = (e->value[i] >> j) & 1;
                        if ((bitcount == 0) && (bit == 0)) {
                                if ((err = big_mont_mul(tmp,
                                    tmp, tmp, n, n0)) != BIG_OK) {
                                        goto ret;
                                }
                        } else {
                                bitcount++;
                                p = p * 2 + bit;
                                if (bit == 1) {
                                        k = k + l + 1;
                                        l = 0;
                                } else {
                                        l++;
                                }
                                if (bitcount == groupbits) {
                                        for (m = 0; m < k; m++) {
                                                if ((err = big_mont_mul(tmp,
                                                    tmp, tmp, n, n0)) !=
                                                    BIG_OK) {
                                                        goto ret;
                                                }
                                        }
                                        if ((err = big_mont_mul(tmp, tmp,
                                            &(apowers[p >> (l + 1)]),
                                            n, n0)) != BIG_OK) {
                                                goto ret;
                                        }
                                        for (m = 0; m < l; m++) {
                                                if ((err = big_mont_mul(tmp,
                                                    tmp, tmp, n, n0)) !=
                                                    BIG_OK) {
                                                        goto ret;
                                                }
                                        }
                                        k = 0;
                                        l = 0;
                                        p = 0;
                                        bitcount = 0;
                                }
                        }
                }
                bitind = BIG_CHUNK_SIZE;
        }

        for (m = 0; m < k; m++) {
                if ((err = big_mont_mul(tmp, tmp, tmp, n, n0)) != BIG_OK) {
                        goto ret;
                }
        }
        if (p != 0) {
                if ((err = big_mont_mul(tmp, tmp,
                    &(apowers[p >> (l + 1)]), n, n0)) != BIG_OK) {
                        goto ret;
                }
        }
        for (m = 0; m < l; m++) {
                if ((err = big_mont_mul(result, tmp, tmp, n, n0)) != BIG_OK) {
                        goto ret;
                }
        }

ret:
        for (i = apowerssize - 1; i >= 0; i--) {
                big_finish(&(apowers[i]));
        }
        big_finish(&tmp1);

        return (err);
}


#ifdef USE_FLOATING_POINT

#ifdef _KERNEL

#include <sys/sysmacros.h>
#include <sys/regset.h>
#include <sys/fpu/fpusystm.h>

/* the alignment for block stores to save fp registers */
#define FPR_ALIGN       (64)

extern void big_savefp(kfpu_t *);
extern void big_restorefp(kfpu_t *);

#endif /* _KERNEL */

/*
 * This version makes use of floating point for performance
 */
static BIG_ERR_CODE
big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
    BIGNUM *tmp, BIG_CHUNK_TYPE n0)
{

        int             i, j, k, l, m, p;
        uint32_t        bit, bitind, bitcount, nlen;
        double          dn0;
        double          *dn, *dt, *d16r, *d32r;
        uint32_t        *nint, *prod;
        double          *apowers[APOWERS_MAX_SIZE];
        uint32_t        nbits, groupbits, apowerssize;
        BIG_ERR_CODE    err = BIG_OK;

#ifdef _KERNEL
        uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
        kfpu_t *fpu;

#ifdef DEBUG
        if (!fpu_exists)
                return (BIG_GENERAL_ERR);
#endif

        fpu =  (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, FPR_ALIGN);
        big_savefp(fpu);

#endif /* _KERNEL */

        nbits = big_numbits(e);
        if (nbits < 50) {
                groupbits = 1;
                apowerssize = 1;
        } else {
                groupbits = MAX_EXP_BIT_GROUP_SIZE;
                apowerssize = 1 << (groupbits - 1);
        }

        nlen = (BIG_CHUNK_SIZE / 32) * n->len;
        dn0 = (double)(n0 & 0xffff);

        dn = dt = d16r = d32r = NULL;
        nint = prod = NULL;
        for (i = 0; i < apowerssize; i++) {
                apowers[i] = NULL;
        }

        if ((dn = big_malloc(nlen * sizeof (double))) == NULL) {
                err = BIG_NO_MEM;
                goto ret;
        }
        if ((dt = big_malloc((4 * nlen + 2) * sizeof (double))) == NULL) {
                err = BIG_NO_MEM;
                goto ret;
        }
        if ((nint = big_malloc(nlen * sizeof (uint32_t))) == NULL) {
                err = BIG_NO_MEM;
                goto ret;
        }
        if ((prod = big_malloc((nlen + 1) * sizeof (uint32_t))) == NULL) {
                err = BIG_NO_MEM;
                goto ret;
        }
        if ((d16r = big_malloc((2 * nlen + 1) * sizeof (double))) == NULL) {
                err = BIG_NO_MEM;
                goto ret;
        }
        if ((d32r = big_malloc(nlen * sizeof (double))) == NULL) {
                err = BIG_NO_MEM;
                goto ret;
        }
        for (i = 0; i < apowerssize; i++) {
                if ((apowers[i] = big_malloc((2 * nlen + 1) *
                    sizeof (double))) == NULL) {
                        err = BIG_NO_MEM;
                        goto ret;
                }
        }

#if (BIG_CHUNK_SIZE == 32)
        for (i = 0; i < ma->len; i++) {
                nint[i] = ma->value[i];
        }
        for (; i < nlen; i++) {
                nint[i] = 0;
        }
#else
        for (i = 0; i < ma->len; i++) {
                nint[2 * i] = (uint32_t)(ma->value[i] & 0xffffffffULL);
                nint[2 * i + 1] = (uint32_t)(ma->value[i] >> 32);
        }
        for (i = ma->len * 2; i < nlen; i++) {
                nint[i] = 0;
        }
#endif
        conv_i32_to_d32_and_d16(d32r, apowers[0], nint, nlen);

#if (BIG_CHUNK_SIZE == 32)
        for (i = 0; i < n->len; i++) {
                nint[i] = n->value[i];
        }
        for (; i < nlen; i++) {
                nint[i] = 0;
        }
#else
        for (i = 0; i < n->len; i++) {
                nint[2 * i] = (uint32_t)(n->value[i] & 0xffffffffULL);
                nint[2 * i + 1] = (uint32_t)(n->value[i] >> 32);
        }
        for (i = n->len * 2; i < nlen; i++) {
                nint[i] = 0;
        }
#endif
        conv_i32_to_d32(dn, nint, nlen);

        mont_mulf_noconv(prod, d32r, apowers[0], dt, dn, nint, nlen, dn0);
        conv_i32_to_d32(d32r, prod, nlen);
        for (i = 1; i < apowerssize; i++) {
                mont_mulf_noconv(prod, d32r, apowers[i - 1],
                    dt, dn, nint, nlen, dn0);
                conv_i32_to_d16(apowers[i], prod, nlen);
        }

#if (BIG_CHUNK_SIZE == 32)
        for (i = 0; i < tmp->len; i++) {
                prod[i] = tmp->value[i];
        }
        for (; i < nlen + 1; i++) {
                prod[i] = 0;
        }
#else
        for (i = 0; i < tmp->len; i++) {
                prod[2 * i] = (uint32_t)(tmp->value[i] & 0xffffffffULL);
                prod[2 * i + 1] = (uint32_t)(tmp->value[i] >> 32);
        }
        for (i = tmp->len * 2; i < nlen + 1; i++) {
                prod[i] = 0;
        }
#endif

        bitind = nbits % BIG_CHUNK_SIZE;
        k = 0;
        l = 0;
        p = 0;
        bitcount = 0;
        for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
                for (j = bitind - 1; j >= 0; j--) {
                        bit = (e->value[i] >> j) & 1;
                        if ((bitcount == 0) && (bit == 0)) {
                                conv_i32_to_d32_and_d16(d32r, d16r,
                                    prod, nlen);
                                mont_mulf_noconv(prod, d32r, d16r,
                                    dt, dn, nint, nlen, dn0);
                        } else {
                                bitcount++;
                                p = p * 2 + bit;
                                if (bit == 1) {
                                        k = k + l + 1;
                                        l = 0;
                                } else {
                                        l++;
                                }
                                if (bitcount == groupbits) {
                                        for (m = 0; m < k; m++) {
                                                conv_i32_to_d32_and_d16(d32r,
                                                    d16r, prod, nlen);
                                                mont_mulf_noconv(prod, d32r,
                                                    d16r, dt, dn, nint,
                                                    nlen, dn0);
                                        }
                                        conv_i32_to_d32(d32r, prod, nlen);
                                        mont_mulf_noconv(prod, d32r,
                                            apowers[p >> (l + 1)],
                                            dt, dn, nint, nlen, dn0);
                                        for (m = 0; m < l; m++) {
                                                conv_i32_to_d32_and_d16(d32r,
                                                    d16r, prod, nlen);
                                                mont_mulf_noconv(prod, d32r,
                                                    d16r, dt, dn, nint,
                                                    nlen, dn0);
                                        }
                                        k = 0;
                                        l = 0;
                                        p = 0;
                                        bitcount = 0;
                                }
                        }
                }
                bitind = BIG_CHUNK_SIZE;
        }

        for (m = 0; m < k; m++) {
                conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
                mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
        }
        if (p != 0) {
                conv_i32_to_d32(d32r, prod, nlen);
                mont_mulf_noconv(prod, d32r, apowers[p >> (l + 1)],
                    dt, dn, nint, nlen, dn0);
        }
        for (m = 0; m < l; m++) {
                conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
                mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
        }

#if (BIG_CHUNK_SIZE == 32)
        for (i = 0; i < nlen; i++) {
                result->value[i] = prod[i];
        }
        for (i = nlen - 1; (i > 0) && (prod[i] == 0); i--)
                ;
#else
        for (i = 0; i < nlen / 2; i++) {
                result->value[i] = (uint64_t)(prod[2 * i]) +
                    (((uint64_t)(prod[2 * i + 1])) << 32);
        }
        for (i = nlen / 2 - 1; (i > 0) && (result->value[i] == 0); i--)
                ;
#endif
        result->len = i + 1;

ret:
        for (i = apowerssize - 1; i >= 0; i--) {
                if (apowers[i] != NULL)
                        big_free(apowers[i], (2 * nlen + 1) * sizeof (double));
        }
        if (d32r != NULL) {
                big_free(d32r, nlen * sizeof (double));
        }
        if (d16r != NULL) {
                big_free(d16r, (2 * nlen + 1) * sizeof (double));
        }
        if (prod != NULL) {
                big_free(prod, (nlen + 1) * sizeof (uint32_t));
        }
        if (nint != NULL) {
                big_free(nint, nlen * sizeof (uint32_t));
        }
        if (dt != NULL) {
                big_free(dt, (4 * nlen + 2) * sizeof (double));
        }
        if (dn != NULL) {
                big_free(dn, nlen * sizeof (double));
        }

#ifdef _KERNEL
        big_restorefp(fpu);
#endif

        return (err);
}

#endif /* USE_FLOATING_POINT */


BIG_ERR_CODE
big_modexp_ext(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr,
    big_modexp_ncp_info_t *info)
{
        BIGNUM          ma, tmp, rr;
        BIG_CHUNK_TYPE  mavalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  rrvalue[BIGTMPSIZE];
        BIG_ERR_CODE    err;
        BIG_CHUNK_TYPE  n0;

        if ((err = big_init1(&ma, n->len, mavalue, arraysize(mavalue))) !=
            BIG_OK) {
                return (err);
        }
        ma.len = 1;
        ma.value[0] = 0;

        if ((err = big_init1(&tmp, 2 * n->len + 1,
            tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
                goto ret1;
        }

        /* clear the malloced bit to help cleanup */
        rr.malloced = 0;

        if (n_rr == NULL) {
                if ((err = big_init1(&rr, 2 * n->len + 1,
                    rrvalue, arraysize(rrvalue))) != BIG_OK) {
                        goto ret2;
                }
                if (big_mont_rr(&rr, n) != BIG_OK) {
                        goto ret;
                }
                n_rr = &rr;
        }

        n0 = big_n0(n->value[0]);

        if (big_cmp_abs(a, n) > 0) {
                if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) {
                        goto ret;
                }
                err = big_mont_conv(&ma, &ma, n, n0, n_rr);
        } else {
                err = big_mont_conv(&ma, a, n, n0, n_rr);
        }
        if (err != BIG_OK) {
                goto ret;
        }

        tmp.len = 1;
        tmp.value[0] = 1;
        if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) {
                goto ret;
        }

        if ((info != NULL) && (info->func != NULL)) {
                err = (*(info->func))(&tmp, &ma, e, n, &tmp, n0,
                    info->ncp, info->reqp);
        } else {
                err = big_modexp_ncp_sw(&tmp, &ma, e, n, &tmp, n0);
        }
        if (err != BIG_OK) {
                goto ret;
        }

        ma.value[0] = 1;
        ma.len = 1;
        if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) {
                goto ret;
        }
        err = big_copy(result, &tmp);

ret:
        if (rr.malloced) {
                big_finish(&rr);
        }
ret2:
        big_finish(&tmp);
ret1:
        big_finish(&ma);

        return (err);
}

BIG_ERR_CODE
big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr)
{
        return (big_modexp_ext(result, a, e, n, n_rr, NULL));
}


BIG_ERR_CODE
big_modexp_crt_ext(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
    BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
    BIGNUM *p_rr, BIGNUM *q_rr, big_modexp_ncp_info_t *info)
{
        BIGNUM          ap, aq, tmp;
        int             alen, biglen, sign;
        BIG_ERR_CODE    err;

        if (p->len > q->len) {
                biglen = p->len;
        } else {
                biglen = q->len;
        }

        if ((err = big_init1(&ap, p->len, NULL, 0)) != BIG_OK) {
                return (err);
        }
        if ((err = big_init1(&aq, q->len, NULL, 0)) != BIG_OK) {
                goto ret1;
        }
        if ((err = big_init1(&tmp, biglen + q->len + 1, NULL, 0)) != BIG_OK) {
                goto ret2;
        }

        /*
         * check whether a is too short - to avoid timing attacks
         */
        alen = a->len;
        while ((alen > p->len) && (a->value[alen - 1] == 0)) {
                alen--;
        }
        if (alen < p->len + q->len) {
                /*
                 * a is too short, add p*q to it before
                 * taking it modulo p and q
                 * this will also affect timing, but this difference
                 * does not depend on p or q, only on a
                 * (in "normal" operation, this path will never be
                 * taken, so it is not a performance penalty
                 */
                if ((err = big_mul(&tmp, p, q)) != BIG_OK) {
                        goto ret;
                }
                if ((err = big_add(&tmp, &tmp, a)) != BIG_OK) {
                        goto ret;
                }
                if ((err = big_div_pos(NULL, &ap, &tmp, p)) != BIG_OK) {
                        goto ret;
                }
                if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
                        goto ret;
                }
        } else {
                if ((err = big_div_pos(NULL, &ap, a, p)) != BIG_OK) {
                        goto ret;
                }
                if ((err = big_div_pos(NULL, &aq, a, q)) != BIG_OK) {
                        goto ret;
                }
        }

        if ((err = big_modexp_ext(&ap, &ap, dmodpminus1, p, p_rr, info)) !=
            BIG_OK) {
                goto ret;
        }
        if ((err = big_modexp_ext(&aq, &aq, dmodqminus1, q, q_rr, info)) !=
            BIG_OK) {
                goto ret;
        }
        if ((err = big_sub(&tmp, &aq, &ap)) != BIG_OK) {
                goto ret;
        }
        if ((err = big_mul(&tmp, &tmp, pinvmodq)) != BIG_OK) {
                goto ret;
        }
        sign = tmp.sign;
        tmp.sign = 1;
        if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
                goto ret;
        }
        if ((sign == -1) && (!big_is_zero(&aq))) {
                (void) big_sub_pos(&aq, q, &aq);
        }
        if ((err = big_mul(&tmp, &aq, p)) != BIG_OK) {
                goto ret;
        }
        err = big_add_abs(result, &ap, &tmp);

ret:
        big_finish(&tmp);
ret2:
        big_finish(&aq);
ret1:
        big_finish(&ap);

        return (err);
}


BIG_ERR_CODE
big_modexp_crt(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
    BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
    BIGNUM *p_rr, BIGNUM *q_rr)
{
        return (big_modexp_crt_ext(result, a, dmodpminus1, dmodqminus1,
            p, q, pinvmodq, p_rr, q_rr, NULL));
}


static BIG_CHUNK_TYPE onearr[1] = {(BIG_CHUNK_TYPE)1};
#if     !defined(NO_BIG_ONE)
BIGNUM big_One = {1, 1, 1, 0, onearr};
#endif

static BIG_CHUNK_TYPE twoarr[1] = {(BIG_CHUNK_TYPE)2};
#if     !defined(NO_BIG_TWO)
BIGNUM big_Two = {1, 1, 1, 0, twoarr};
#endif

static BIG_CHUNK_TYPE fourarr[1] = {(BIG_CHUNK_TYPE)4};
static BIGNUM big_Four = {1, 1, 1, 0, fourarr};


BIG_ERR_CODE
big_sqrt_pos(BIGNUM *result, BIGNUM *n)
{
        BIGNUM          *high, *low, *mid, *t;
        BIGNUM          t1, t2, t3, prod;
        BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  prodvalue[BIGTMPSIZE];
        int             i, diff;
        uint32_t        nbits, nrootbits, highbits;
        BIG_ERR_CODE    err;

        nbits = big_numbits(n);

        if ((err = big_init1(&t1, n->len + 1,
            t1value, arraysize(t1value))) != BIG_OK)
                return (err);
        if ((err = big_init1(&t2, n->len + 1,
            t2value, arraysize(t2value))) != BIG_OK)
                goto ret1;
        if ((err = big_init1(&t3, n->len + 1,
            t3value, arraysize(t3value))) != BIG_OK)
                goto ret2;
        if ((err = big_init1(&prod, n->len + 1,
            prodvalue, arraysize(prodvalue))) != BIG_OK)
                goto ret3;

        nrootbits = (nbits + 1) / 2;
        t1.len = t2.len = t3.len = (nrootbits - 1) / BIG_CHUNK_SIZE + 1;
        for (i = 0; i < t1.len; i++) {
                t1.value[i] = 0;
                t2.value[i] = BIG_CHUNK_ALLBITS;
        }
        highbits = nrootbits - BIG_CHUNK_SIZE * (t1.len - 1);
        if (highbits == BIG_CHUNK_SIZE) {
                t1.value[t1.len - 1] = BIG_CHUNK_HIGHBIT;
                t2.value[t2.len - 1] = BIG_CHUNK_ALLBITS;
        } else {
                t1.value[t1.len - 1] = (BIG_CHUNK_TYPE)1 << (highbits - 1);
                t2.value[t2.len - 1] = 2 * t1.value[t1.len - 1] - 1;
        }

        high = &t2;
        low = &t1;
        mid = &t3;

        if ((err = big_mul(&prod, high, high)) != BIG_OK) {
                goto ret;
        }
        diff = big_cmp_abs(&prod, n);
        if (diff <= 0) {
                err = big_copy(result, high);
                goto ret;
        }

        (void) big_sub_pos(mid, high, low);
        while (big_cmp_abs(&big_One, mid) != 0) {
                (void) big_add_abs(mid, high, low);
                (void) big_half_pos(mid, mid);
                if ((err = big_mul(&prod, mid, mid)) != BIG_OK)
                        goto ret;
                diff = big_cmp_abs(&prod, n);
                if (diff > 0) {
                        t = high;
                        high = mid;
                        mid = t;
                } else if (diff < 0) {
                        t = low;
                        low = mid;
                        mid = t;
                } else {
                        err = big_copy(result, low);
                        goto ret;
                }
                (void) big_sub_pos(mid, high, low);
        }

        err = big_copy(result, low);
ret:
        if (prod.malloced) big_finish(&prod);
ret3:
        if (t3.malloced) big_finish(&t3);
ret2:
        if (t2.malloced) big_finish(&t2);
ret1:
        if (t1.malloced) big_finish(&t1);

        return (err);
}


BIG_ERR_CODE
big_Jacobi_pos(int *jac, BIGNUM *nn, BIGNUM *mm)
{
        BIGNUM          *t, *tmp2, *m, *n;
        BIGNUM          t1, t2, t3;
        BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
        int             len, err;

        if (big_is_zero(nn) ||
            (((nn->value[0] & 1) | (mm->value[0] & 1)) == 0)) {
                *jac = 0;
                return (BIG_OK);
        }

        if (nn->len > mm->len) {
                len = nn->len;
        } else {
                len = mm->len;
        }

        if ((err = big_init1(&t1, len,
            t1value, arraysize(t1value))) != BIG_OK) {
                return (err);
        }
        if ((err = big_init1(&t2, len,
            t2value, arraysize(t2value))) != BIG_OK) {
                goto ret1;
        }
        if ((err = big_init1(&t3, len,
            t3value, arraysize(t3value))) != BIG_OK) {
                goto ret2;
        }

        n = &t1;
        m = &t2;
        tmp2 = &t3;

        (void) big_copy(n, nn);
        (void) big_copy(m, mm);

        *jac = 1;
        while (big_cmp_abs(&big_One, m) != 0) {
                if (big_is_zero(n)) {
                        *jac = 0;
                        goto ret;
                }
                if ((m->value[0] & 1) == 0) {
                        if (((n->value[0] & 7) == 3) ||
                            ((n->value[0] & 7) == 5))
                                *jac = -*jac;
                        (void) big_half_pos(m, m);
                } else if ((n->value[0] & 1) == 0) {
                        if (((m->value[0] & 7) == 3) ||
                            ((m->value[0] & 7) == 5))
                                *jac = -*jac;
                        (void) big_half_pos(n, n);
                } else {
                        if (((m->value[0] & 3) == 3) &&
                            ((n->value[0] & 3) == 3)) {
                                *jac = -*jac;
                        }
                        if ((err = big_div_pos(NULL, tmp2, m, n)) != BIG_OK) {
                                goto ret;
                        }
                        t = tmp2;
                        tmp2 = m;
                        m = n;
                        n = t;
                }
        }
        err = BIG_OK;

ret:
        if (t3.malloced) big_finish(&t3);
ret2:
        if (t2.malloced) big_finish(&t2);
ret1:
        if (t1.malloced) big_finish(&t1);

        return (err);
}


BIG_ERR_CODE
big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
{
        int             i;
        uint32_t        m, w;
        BIG_CHUNK_TYPE  bit;
        BIGNUM          ki, tmp, tmp2;
        BIG_CHUNK_TYPE  kivalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  tmp2value[BIGTMPSIZE];
        BIG_ERR_CODE    err;

        if (big_cmp_abs(k, &big_One) == 0) {
                (void) big_copy(Lk, p);
                (void) big_copy(Lkminus1, &big_Two);
                return (BIG_OK);
        }

        if ((err = big_init1(&ki, k->len + 1,
            kivalue, arraysize(kivalue))) != BIG_OK)
                return (err);

        if ((err = big_init1(&tmp, 2 * n->len + 1,
            tmpvalue, arraysize(tmpvalue))) != BIG_OK)
                goto ret1;

        if ((err = big_init1(&tmp2, n->len,
            tmp2value, arraysize(tmp2value))) != BIG_OK)
                goto ret2;

        m = big_numbits(k);
        ki.len = (m - 1) / BIG_CHUNK_SIZE + 1;
        w = (m - 1) / BIG_CHUNK_SIZE;
        bit = (BIG_CHUNK_TYPE)1 << ((m - 1) % BIG_CHUNK_SIZE);
        for (i = 0; i < ki.len; i++) {
                ki.value[i] = 0;
        }
        ki.value[ki.len - 1] = bit;
        if (big_cmp_abs(k, &ki) != 0) {
                (void) big_double(&ki, &ki);
        }
        (void) big_sub_pos(&ki, &ki, k);

        (void) big_copy(Lk, p);
        (void) big_copy(Lkminus1, &big_Two);

        for (i = 0; i < m; i++) {
                if ((err = big_mul(&tmp, Lk, Lkminus1)) != BIG_OK) {
                        goto ret;
                }
                (void) big_add_abs(&tmp, &tmp, n);
                (void) big_sub_pos(&tmp, &tmp, p);
                if ((err = big_div_pos(NULL, &tmp2, &tmp, n)) != BIG_OK) {
                        goto ret;
                }
                if ((ki.value[w] & bit) != 0) {
                        if ((err = big_mul(&tmp, Lkminus1, Lkminus1)) !=
                            BIG_OK) {
                                goto ret;
                        }
                        (void) big_add_abs(&tmp, &tmp, n);
                        (void) big_sub_pos(&tmp, &tmp, &big_Two);
                        if ((err = big_div_pos(NULL, Lkminus1, &tmp, n)) !=
                            BIG_OK) {
                                goto ret;
                        }
                        (void) big_copy(Lk, &tmp2);
                } else {
                        if ((err = big_mul(&tmp, Lk, Lk)) != BIG_OK) {
                                goto ret;
                        }
                        (void) big_add_abs(&tmp, &tmp, n);
                        (void) big_sub_pos(&tmp, &tmp, &big_Two);
                        if ((err = big_div_pos(NULL, Lk, &tmp, n)) != BIG_OK) {
                                goto ret;
                        }
                        (void) big_copy(Lkminus1, &tmp2);
                }
                bit = bit >> 1;
                if (bit == 0) {
                        bit = BIG_CHUNK_HIGHBIT;
                        w--;
                }
        }

        err = BIG_OK;

ret:
        if (tmp2.malloced) big_finish(&tmp2);
ret2:
        if (tmp.malloced) big_finish(&tmp);
ret1:
        if (ki.malloced) big_finish(&ki);

        return (err);
}


BIG_ERR_CODE
big_isprime_pos_ext(BIGNUM *n, big_modexp_ncp_info_t *info)
{
        BIGNUM          o, nminus1, tmp, Lkminus1, Lk;
        BIG_CHUNK_TYPE  ovalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  nminus1value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
        BIG_CHUNK_TYPE  Lkminus1value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  Lkvalue[BIGTMPSIZE];
        BIG_ERR_CODE    err;
        int             e, i, jac;

        if (big_cmp_abs(n, &big_One) == 0) {
                return (BIG_FALSE);
        }
        if (big_cmp_abs(n, &big_Two) == 0) {
                return (BIG_TRUE);
        }
        if ((n->value[0] & 1) == 0) {
                return (BIG_FALSE);
        }

        if ((err = big_init1(&o, n->len, ovalue, arraysize(ovalue))) !=
            BIG_OK) {
                return (err);
        }

        if ((err = big_init1(&nminus1, n->len,
            nminus1value, arraysize(nminus1value))) != BIG_OK) {
                goto ret1;
        }

        if ((err = big_init1(&tmp, 2 * n->len,
            tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
                goto ret2;
        }

        if ((err = big_init1(&Lkminus1, n->len,
            Lkminus1value, arraysize(Lkminus1value))) != BIG_OK) {
                goto ret3;
        }

        if ((err = big_init1(&Lk, n->len,
            Lkvalue, arraysize(Lkvalue))) != BIG_OK) {
                goto ret4;
        }

        (void) big_sub_pos(&o, n, &big_One);    /* cannot fail */
        (void) big_copy(&nminus1, &o);          /* cannot fail */
        e = 0;
        while ((o.value[0] & 1) == 0) {
                e++;
                (void) big_half_pos(&o, &o);  /* cannot fail */
        }
        if ((err = big_modexp_ext(&tmp, &big_Two, &o, n, NULL, info)) !=
            BIG_OK) {
                goto ret;
        }

        i = 0;
        while ((i < e) &&
            (big_cmp_abs(&tmp, &big_One) != 0) &&
            (big_cmp_abs(&tmp, &nminus1) != 0)) {
                if ((err =
                    big_modexp_ext(&tmp, &tmp, &big_Two, n, NULL, info)) !=
                    BIG_OK)
                        goto ret;
                i++;
        }

        if (!((big_cmp_abs(&tmp, &nminus1) == 0) ||
            ((i == 0) && (big_cmp_abs(&tmp, &big_One) == 0)))) {
                err = BIG_FALSE;
                goto ret;
        }

        if ((err = big_sqrt_pos(&tmp, n)) != BIG_OK) {
                goto ret;
        }

        if ((err = big_mul(&tmp, &tmp, &tmp)) != BIG_OK) {
                goto ret;
        }
        if (big_cmp_abs(&tmp, n) == 0) {
                err = BIG_FALSE;
                goto ret;
        }

        (void) big_copy(&o, &big_Two);
        do {
                (void) big_add_abs(&o, &o, &big_One);
                if ((err = big_mul(&tmp, &o, &o)) != BIG_OK) {
                        goto ret;
                }
                (void) big_sub_pos(&tmp, &tmp, &big_Four);
                if ((err = big_Jacobi_pos(&jac, &tmp, n)) != BIG_OK) {
                        goto ret;
                }
        } while (jac != -1);

        (void) big_add_abs(&tmp, n, &big_One);
        if ((err = big_Lucas(&Lkminus1, &Lk, &o, &tmp, n)) != BIG_OK) {
                goto ret;
        }

        if ((big_cmp_abs(&Lkminus1, &o) == 0) &&
            (big_cmp_abs(&Lk, &big_Two) == 0)) {
                err = BIG_TRUE;
        } else {
                err = BIG_FALSE;
        }

ret:
        if (Lk.malloced) big_finish(&Lk);
ret4:
        if (Lkminus1.malloced) big_finish(&Lkminus1);
ret3:
        if (tmp.malloced) big_finish(&tmp);
ret2:
        if (nminus1.malloced) big_finish(&nminus1);
ret1:
        if (o.malloced) big_finish(&o);

        return (err);
}


BIG_ERR_CODE
big_isprime_pos(BIGNUM *n)
{
        return (big_isprime_pos_ext(n, NULL));
}


#define SIEVESIZE 1000


BIG_ERR_CODE
big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
{
        static const uint32_t smallprimes[] = {
            3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
            51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97 };
        BIG_ERR_CODE    err;
        int             sieve[SIEVESIZE];
        int             i;
        uint32_t        off, p;

        if ((err = big_copy(result, n)) != BIG_OK) {
                return (err);
        }
        result->value[0] |= 1;
        /* CONSTCOND */
        while (1) {
                for (i = 0; i < SIEVESIZE; i++) sieve[i] = 0;
                for (i = 0;
                    i < sizeof (smallprimes) / sizeof (smallprimes[0]); i++) {
                        p = smallprimes[i];
                        off = big_modhalf_pos(result, p);
                        off = p - off;
                        if ((off % 2) == 1) {
                                off = off + p;
                        }
                        off = off / 2;
                        while (off < SIEVESIZE) {
                                sieve[off] = 1;
                                off = off + p;
                        }
                }

                for (i = 0; i < SIEVESIZE; i++) {
                        if (sieve[i] == 0) {
                                err = big_isprime_pos_ext(result, info);
                                if (err != BIG_FALSE) {
                                        if (err != BIG_TRUE) {
                                                return (err);
                                        } else {
                                                goto out;
                                        }
                                }

                        }
                        if ((err = big_add_abs(result, result, &big_Two)) !=
                            BIG_OK) {
                                return (err);
                        }
                }
        }

out:
        return (BIG_OK);
}


BIG_ERR_CODE
big_nextprime_pos(BIGNUM *result, BIGNUM *n)
{
        return (big_nextprime_pos_ext(result, n, NULL));
}


BIG_ERR_CODE
big_nextprime_pos_slow(BIGNUM *result, BIGNUM *n)
{
        BIG_ERR_CODE err;


        if ((err = big_copy(result, n)) != BIG_OK)
                return (err);
        result->value[0] |= 1;
        while ((err = big_isprime_pos_ext(result, NULL)) != BIG_TRUE) {
                if (err != BIG_FALSE)
                        return (err);
                if ((err = big_add_abs(result, result, &big_Two)) != BIG_OK)
                        return (err);
        }
        return (BIG_OK);
}


/*
 * given m and e, computes the rest in the equation
 * gcd(m, e) = cm * m + ce * e
 */
BIG_ERR_CODE
big_ext_gcd_pos(BIGNUM *gcd, BIGNUM *cm, BIGNUM *ce, BIGNUM *m, BIGNUM *e)
{
        BIGNUM          *xi, *ri, *riminus1, *riminus2, *t;
        BIGNUM          *vmi, *vei, *vmiminus1, *veiminus1;
        BIGNUM          t1, t2, t3, t4, t5, t6, t7, t8, tmp;
        BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t4value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t5value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t6value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t7value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  t8value[BIGTMPSIZE];
        BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
        BIG_ERR_CODE    err;
        int             len;

        if (big_cmp_abs(m, e) >= 0) {
                len = m->len;
        } else {
                len = e->len;
        }

        if ((err = big_init1(&t1, len,
            t1value, arraysize(t1value))) != BIG_OK) {
                return (err);
        }
        if ((err = big_init1(&t2, len,
            t2value, arraysize(t2value))) != BIG_OK) {
                goto ret1;
        }
        if ((err = big_init1(&t3, len,
            t3value, arraysize(t3value))) != BIG_OK) {
                goto ret2;
        }
        if ((err = big_init1(&t4, len,
            t4value, arraysize(t3value))) != BIG_OK) {
                goto ret3;
        }
        if ((err = big_init1(&t5, len,
            t5value, arraysize(t5value))) != BIG_OK) {
                goto ret4;
        }
        if ((err = big_init1(&t6, len,
            t6value, arraysize(t6value))) != BIG_OK) {
                goto ret5;
        }
        if ((err = big_init1(&t7, len,
            t7value, arraysize(t7value))) != BIG_OK) {
                goto ret6;
        }
        if ((err = big_init1(&t8, len,
            t8value, arraysize(t8value))) != BIG_OK) {
                goto ret7;
        }

        if ((err = big_init1(&tmp, 2 * len,
            tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
                goto ret8;
        }

        ri = &t1;
        ri->value[0] = 1;
        ri->len = 1;
        xi = &t2;
        riminus1 = &t3;
        riminus2 = &t4;
        vmi = &t5;
        vei = &t6;
        vmiminus1 = &t7;
        veiminus1 = &t8;

        (void) big_copy(vmiminus1, &big_One);
        (void) big_copy(vmi, &big_One);
        (void) big_copy(veiminus1, &big_One);
        (void) big_copy(xi, &big_One);
        vei->len = 1;
        vei->value[0] = 0;

        (void) big_copy(riminus1, m);
        (void) big_copy(ri, e);

        while (!big_is_zero(ri)) {
                t = riminus2;
                riminus2 = riminus1;
                riminus1 = ri;
                ri = t;
                if ((err = big_mul(&tmp, vmi, xi)) != BIG_OK) {
                        goto ret;
                }
                if ((err = big_sub(vmiminus1, vmiminus1, &tmp)) != BIG_OK) {
                        goto ret;
                }
                t = vmiminus1;
                vmiminus1 = vmi;
                vmi = t;
                if ((err = big_mul(&tmp, vei, xi)) != BIG_OK) {
                        goto ret;
                }
                if ((err = big_sub(veiminus1, veiminus1, &tmp)) != BIG_OK) {
                        goto ret;
                }
                t = veiminus1;
                veiminus1 = vei;
                vei = t;
                if ((err = big_div_pos(xi, ri, riminus2, riminus1)) !=
                    BIG_OK) {
                        goto ret;
                }
        }
        if ((gcd != NULL) && ((err = big_copy(gcd, riminus1)) != BIG_OK)) {
                goto ret;
        }
        if ((cm != NULL) && ((err = big_copy(cm, vmi)) != BIG_OK)) {
                goto ret;
        }
        if (ce != NULL) {
                err = big_copy(ce, vei);
        }
ret:
        if (tmp.malloced) big_finish(&tmp);
ret8:
        if (t8.malloced) big_finish(&t8);
ret7:
        if (t7.malloced) big_finish(&t7);
ret6:
        if (t6.malloced) big_finish(&t6);
ret5:
        if (t5.malloced) big_finish(&t5);
ret4:
        if (t4.malloced) big_finish(&t4);
ret3:
        if (t3.malloced) big_finish(&t3);
ret2:
        if (t2.malloced) big_finish(&t2);
ret1:
        if (t1.malloced) big_finish(&t1);

        return (err);
}

/*
 * Get a rlen-bit random number in BIGNUM format.  Caller-supplied
 * (*rfunc)(void *dbuf, size_t dlen) must return 0 for success and
 * -1 for failure.  Note:  (*rfunc)() takes length in bytes, not bits.
 */
BIG_ERR_CODE
big_random(BIGNUM *r, size_t rlen, int (*rfunc)(void *, size_t))
{
        size_t  rwords, rbytes;
        int     shift;

        if (r == NULL || rlen == 0 || rfunc == NULL)
                return (BIG_INVALID_ARGS);

        /*
         * Convert rlen bits to r->len words (32- or 64-bit), rbytes bytes
         * and extend r if it's not big enough to hold the random number.
         */
        rwords = BITLEN2BIGNUMLEN(rlen);
        rbytes = rwords * sizeof (BIG_CHUNK_TYPE);
        if (big_extend(r, rwords) != BIG_OK)
                return (BIG_NO_MEM);
#ifdef BIGNUM_CHUNK_32
        r->len = rwords;
#else
        r->len = (uint32_t)rwords;
#endif

        if ((*rfunc)(r->value, rbytes) < 0)
                return (BIG_NO_RANDOM);

        r->value[rwords - 1] |= BIG_CHUNK_HIGHBIT;

        /*
         * If the bit length is not a word boundary, shift the most
         * significant word so that we have an exactly rlen-long number.
         */
        if ((shift = rlen % BIG_CHUNK_SIZE) != 0)
                r->value[rwords - 1] >>= (BIG_CHUNK_SIZE - shift);

        r->sign = 1;    /* non-negative */

        return (BIG_OK);
}