usr/src/cmd/spell/huff.c

root/usr/src/cmd/spell/huff.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

#define BYTE 8
#define QW 1            /* width of bas-q digit in bits */

/*
 * this stuff should be local and hidden; it was made
 * accessible outside for dirty reasons: 20% faster spell
 */
#include "huff.h"
struct huff huffcode;

/*
 * Infinite Huffman code
 *
 * Let the messages be exponentially distributed with ratio r:
 *      P {message k} = r^k*(1-r),      k = 0, 1, ...
 * Let the messages be coded in base q, and suppose
 *      r^n = 1/q
 * If each decade(base q) contains n codes, then
 * the messages assigned to each decade will be q times
 * as probable as the next. Moreover the code for the tail of
 * the distribution after truncating one decade should look
 * just like the original, but longer by one leading digit q-1.
 *      q(z+n) = z + (q-1)q^w
 * where z is first code of decade, w is width of code, in shortest
 * full decade. Examples, base 2:
 *      r^1 = 1/2       r^5 = 1/2
 *      0               0110
 *      10              0111
 *      110             1000
 *      1110            1001
 *      ...             1010
 *                      10110
 *      w = 1, z = 0            w = 4, z = 0110
 * Rewriting slightly
 *      (q-1)z + q*n = (q-1)q^w
 * whence z is a multiple of q and n is a multiple of q-1. Let
 *      z = cq, n = d(q-1)
 * We pick w to be the least integer such that
 *      d = n/(q-1) <= q^(w-1)
 * Then solve for c
 *      c = q^(w-1) - d
 * If c is not zero, the first decade may be preceded by
 * even shorter (w-1)-digit codes 0, 1, ..., c-1. Thus
 * the example code with r^5 = 1/2 becomes
 *      000
 *      001
 *      010
 *      0110
 *      0111
 *      1000
 *      1001
 *      1010
 *      10110
 *      ...
 *      110110
 *      ...
 * The expected number of base-q digits in a codeword is then
 *      w - 1 + r^c/(1-r^n)
 * The present routines require q to be a power of 2
 */
/*
 * There is a lot of hanky-panky with left justification against
 * sign instead of simple left justification because
 * unsigned long is not available
 */
#define L (BYTE*(sizeof (long))-1)      /* length of signless long */
#define MASK (~((unsigned long)1<<L))   /* mask out sign */

/*
 * decode the prefix of word y (which is left justified against sign)
 * place mesage number into place pointed to by kp
 * return length (in bits) of decoded prefix or 0 if code is out of
 * range
 */
int
decode(long y, long *pk)
{
        int l;
        long v;
        if (y < cs) {
                *pk = y >> (long)(L+QW-w);
                return (w-QW);
        }
        for (l = w, v = v0; y >= qcs;
            y = ((unsigned long)y << QW) & MASK, v += n)
                if ((l += QW) > L)
                        return (0);
        *pk = v + (y>>(long)(L-w));
        return (l);
}

/*
 * encode message k and put result (right justified) into
 * place pointed to by py.
 * return length (in bits) of result,
 * or 0 if code is too long
 */

int
encode(long k, long *py)
{
        int l;
        long y;
        if (k < c) {
                *py = k;
                return (w-QW);
        }
        for (k -= c, y = 1, l = w; k >= n; k -= n, y <<= QW)
                if ((l += QW) > L)
                        return (0);
        *py = ((y-1)<<w) + cq + k;
        return (l);
}


/*
 * Initialization code, given expected value of k
 * E(k) = r/(1-r) = a
 * and given base width b
 * return expected length of coded messages
 */
static struct qlog {
        long p;
        double u;
} z;

static struct qlog
qlog(double x, double y, long p, double u)      /* find smallest p so x^p<=y */
{

        if (u/x <= y) {
                z.p = 0;
                z.u = 1;
        } else {
                z = qlog(x, y, p+p, u*u);
                if (u*z.u/x > y) {
                        z.p += p;
                        z.u *= u;
                }
        }
        return (z);
}

double
huff(float a)
{
        int i, q;
        long d, j;
        double r = a/(1.0 + a);
        double rc, rq;

        for (i = 0, q = 1, rq = r; i < QW; i++, q *= 2, rq *= rq)
                continue;
        rq /= r;        /* rq = r^(q-1) */
        (void) qlog(rq, 1./q, 1L, rq);
        d = z.p;
        n = d*(q-1);
        if (n != d * (q - 1))
                abort();        /* time to make n long */
        for (w = QW, j = 1; j < d; w += QW, j *= q)
                continue;
        c = j - d;
        cq = c*q;
        cs = cq<<(L-w);
        qcs = (((long)(q-1)<<w) + cq) << (L-QW-w);
        v0 = c - cq;
        for (i = 0, rc = 1; i < c; i++, rc *= r)        /* rc = r^c */
                continue;
        return (w + QW*(rc/(1-z.u) - 1));
}

void
whuff(void)
{
        (void) fwrite((char *) & huffcode, sizeof (huffcode), 1, stdout);
}

int
rhuff(FILE *f)
{
        return (read(fileno(f), (char *)&huffcode, sizeof (huffcode)) ==
            sizeof (huffcode));
}
Illumos