root/usr/src/cmd/sendmail/db/include/mp.h
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *      Sleepycat Software.  All rights reserved.
 *
 *      @(#)mp.h        10.37 (Sleepycat) 1/1/99
 */

/*
 * Opaque handles for the structures defined later in this header.  Each
 * typedef also introduces its struct tag as an incomplete type, so pointers
 * to these types can be declared before the full definitions are seen.
 */
typedef struct __bh             BH;             /* Buffer header. */
typedef struct __db_mpreg       DB_MPREG;       /* Pgin/pgout registry. */
typedef struct __mpool          MPOOL;          /* Shared pool region. */
typedef struct __mpoolfile      MPOOLFILE;      /* Shared per-file info. */

                                        /* Default mpool name. */
#define DB_DEFAULT_MPOOL_FILE   "__db_mpool.share"

/*
 * We default to 256K (32 8K pages) if the user doesn't specify, and
 * require a minimum of 20K.
 *
 * DB_CACHESIZE_DEF is only defined here when it is not already defined, so
 * a different default can be supplied before this header is included.
 * DB_CACHESIZE_MIN is defined unconditionally.
 */
#ifndef DB_CACHESIZE_DEF
#define DB_CACHESIZE_DEF        (256 * 1024)
#endif
#define DB_CACHESIZE_MIN        ( 20 * 1024)

/*
 * Offset value 0 is used as the "no such offset" marker for shared memory
 * offsets.
 */
#define INVALID         0               /* Invalid shared memory offset. */

/*
 * There are three ways we do locking in the mpool code:
 *
 * Locking a handle mutex to provide concurrency for DB_THREAD operations.
 * Locking the region mutex to provide mutual exclusion while reading and
 *    writing structures in the shared region.
 * Locking buffer header mutexes during I/O.
 *
 * The first will not be further described here.  We use the shared mpool
 * region lock to provide mutual exclusion while reading/modifying all of
 * the data structures, including the buffer headers.  We use a per-buffer
 * header lock to wait on buffer I/O.  The order of locking is as follows:
 *
 * Searching for a buffer:
 *      Acquire the region lock.
 *      Find the buffer header.
 *      Increment the reference count (guarantee the buffer stays).
 *      While the BH_LOCKED flag is set (I/O is going on) {
 *          Release the region lock.
 *              Explicitly yield the processor if it's not the first pass
 *              through this loop; otherwise we could simply spin, since
 *              we'd just be switching between the two locks.
 *          Request the buffer lock.
 *          The I/O will complete...
 *          Acquire the buffer lock.
 *          Release the buffer lock.
 *          Acquire the region lock.
 *      }
 *      Return the buffer.
 *
 * Reading/writing a buffer:
 *      Acquire the region lock.
 *      Find/create the buffer header.
 *      If reading, increment the reference count (guarantee the buffer stays).
 *      Set the BH_LOCKED flag.
 *      Acquire the buffer lock (guaranteed not to block).
 *      Release the region lock.
 *      Do the I/O and/or initialize the buffer contents.
 *      Release the buffer lock.
 *          At this point, the buffer lock is available, but the logical
 *          operation (flagged by BH_LOCKED) is not yet completed.  For
 *          this reason, among others, threads checking the BH_LOCKED flag
 *          must loop around their test.
 *      Acquire the region lock.
 *      Clear the BH_LOCKED flag.
 *      Release the region lock.
 *      Return/discard the buffer.
 *
 * Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are not
 * reacquired when a region lock is reacquired because they couldn't have been
 * closed/discarded and because they never move in memory.
 */
/*
 * LOCKINIT --
 *      Initialize the mutex at mutexp, but only when the pool handle has
 *      MP_LOCKHANDLE and/or MP_LOCKREGION set (as tested by F_ISSET).  The
 *      lock offset handed to __db_mutex_init is computed by
 *      MUTEX_LOCK_OFFSET from the region address and the mutex address.
 *      The return value of __db_mutex_init is deliberately discarded.
 *
 *      dbmp is evaluated more than once; do not pass an expression with
 *      side effects.
 *
 *      The body is wrapped in do { } while (0) so the macro behaves as a
 *      single statement: a bare "if" here would capture the "else" of a
 *      caller's surrounding if/else.
 */
#define LOCKINIT(dbmp, mutexp)                                          \
        do {                                                            \
                if (F_ISSET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION))       \
                        (void)__db_mutex_init(mutexp,                   \
                            MUTEX_LOCK_OFFSET((dbmp)->reginfo.addr,     \
                            mutexp));                                   \
        } while (0)

/*
 * LOCKHANDLE, UNLOCKHANDLE --
 *      Acquire/release a handle mutex, but only when MP_LOCKHANDLE is set
 *      in the pool handle (as tested by F_ISSET).  When the flag is clear
 *      both macros do nothing.  The region file descriptor
 *      (dbmp->reginfo.fd) is passed through to the mutex routines, and
 *      their return values are deliberately discarded.
 *
 *      dbmp is evaluated more than once; do not pass an expression with
 *      side effects.
 *
 *      Each body is wrapped in do { } while (0) so the macro behaves as a
 *      single statement: a bare "if" here would capture the "else" of a
 *      caller's surrounding if/else.
 */
#define LOCKHANDLE(dbmp, mutexp)                                        \
        do {                                                            \
                if (F_ISSET(dbmp, MP_LOCKHANDLE))                       \
                        (void)__db_mutex_lock(mutexp,                   \
                            (dbmp)->reginfo.fd);                        \
        } while (0)
#define UNLOCKHANDLE(dbmp, mutexp)                                      \
        do {                                                            \
                if (F_ISSET(dbmp, MP_LOCKHANDLE))                       \
                        (void)__db_mutex_unlock(mutexp,                 \
                            (dbmp)->reginfo.fd);                        \
        } while (0)

/*
 * LOCKREGION, UNLOCKREGION --
 *      Acquire/release the shared region mutex, but only when
 *      MP_LOCKREGION is set in the pool handle (as tested by F_ISSET).
 *      When the flag is clear both macros do nothing.
 *
 *      The mutex is reached by casting dbmp->mp to RLAYOUT * and taking
 *      its lock member, so the pointed-to object must begin with an
 *      RLAYOUT.  The region file descriptor (dbmp->reginfo.fd) is passed
 *      through to the mutex routines, and their return values are
 *      deliberately discarded.
 *
 *      dbmp is evaluated more than once; do not pass an expression with
 *      side effects.
 *
 *      Each body is wrapped in do { } while (0) so the macro behaves as a
 *      single statement: a bare "if" here would capture the "else" of a
 *      caller's surrounding if/else.
 */
#define LOCKREGION(dbmp)                                                \
        do {                                                            \
                if (F_ISSET(dbmp, MP_LOCKREGION))                       \
                        (void)__db_mutex_lock(                          \
                            &((RLAYOUT *)(dbmp)->mp)->lock,             \
                            (dbmp)->reginfo.fd);                        \
        } while (0)
#define UNLOCKREGION(dbmp)                                              \
        do {                                                            \
                if (F_ISSET(dbmp, MP_LOCKREGION))                       \
                        (void)__db_mutex_unlock(                        \
                            &((RLAYOUT *)(dbmp)->mp)->lock,             \
                            (dbmp)->reginfo.fd);                        \
        } while (0)

/*
 * LOCKBUFFER, UNLOCKBUFFER --
 *      Acquire/release the mutex embedded in a buffer header (bhp->mutex),
 *      but only when MP_LOCKREGION is set in the pool handle (as tested by
 *      F_ISSET).  Note that buffer locking is keyed on the region flag, not
 *      on a separate buffer flag.  When the flag is clear both macros do
 *      nothing.  The region file descriptor (dbmp->reginfo.fd) is passed
 *      through to the mutex routines, and their return values are
 *      deliberately discarded.
 *
 *      dbmp is evaluated more than once; do not pass an expression with
 *      side effects.
 *
 *      Each body is wrapped in do { } while (0) so the macro behaves as a
 *      single statement: a bare "if" here would capture the "else" of a
 *      caller's surrounding if/else.
 */
#define LOCKBUFFER(dbmp, bhp)                                           \
        do {                                                            \
                if (F_ISSET(dbmp, MP_LOCKREGION))                       \
                        (void)__db_mutex_lock(&(bhp)->mutex,            \
                            (dbmp)->reginfo.fd);                        \
        } while (0)
#define UNLOCKBUFFER(dbmp, bhp)                                         \
        do {                                                            \
                if (F_ISSET(dbmp, MP_LOCKREGION))                       \
                        (void)__db_mutex_unlock(&(bhp)->mutex,          \
                            (dbmp)->reginfo.fd);                        \
        } while (0)

/*
 * Check for region catastrophic shutdown.
 *
 * If the panic field of the shared region's RLAYOUT header
 * (dbmp->mp->rlayout.panic) is non-zero, this macro executes a
 * "return (DB_RUNRECOVERY)" from the ENCLOSING function.  It can therefore
 * only be used inside functions whose return type can carry
 * DB_RUNRECOVERY, and any cleanup the caller needs must happen before the
 * macro is invoked.
 *
 * NOTE(review): the expansion is a bare brace block rather than a
 * do { } while (0) statement, so "if (x) MP_PANIC_CHECK(dbmp); else ..."
 * does not compile.  Use it only as a stand-alone statement.
 */
#define MP_PANIC_CHECK(dbmp) {                                          \
        if ((dbmp)->mp->rlayout.panic)                                  \
                return (DB_RUNRECOVERY);                                \
}

/*
 * DB_MPOOL --
 *      Per-process memory pool structure.
 *
 *      This is the handle the LOCK* and UNLOCK* macros in this header take
 *      as their dbmp argument: they test the flags field, pass reginfo.fd
 *      to the mutex routines, and reach the region lock through mp.
 */
struct __db_mpool {
/* These fields need to be protected for multi-threaded support. */
        db_mutex_t      *mutexp;        /* Structure lock. */

                                        /* List of pgin/pgout routines. */
        LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;

                                        /* List of DB_MPOOLFILE's. */
        TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;

/* These fields are not protected. */
        DB_ENV     *dbenv;              /* Reference to error information. */
        REGINFO     reginfo;            /* Region information. */

        /*
         * LOCKREGION/UNLOCKREGION cast this pointer to RLAYOUT * to find
         * the region lock, and MP_PANIC_CHECK reads mp->rlayout.panic.
         */
        MPOOL      *mp;                 /* Address of the shared MPOOL. */

        void       *addr;               /* Address of shalloc() region. */

        DB_HASHTAB *htab;               /* Hash table of bucket headers. */

        /*
         * MP_LOCKHANDLE gates LOCKHANDLE/UNLOCKHANDLE; MP_LOCKREGION gates
         * LOCKREGION/UNLOCKREGION and LOCKBUFFER/UNLOCKBUFFER.  LOCKINIT
         * initializes a mutex if either flag is set.
         */
#define MP_LOCKHANDLE   0x01            /* Threaded, lock handles and region. */
#define MP_LOCKREGION   0x02            /* Concurrent access, lock region. */
        u_int32_t  flags;
};

/*
 * DB_MPREG --
 *      DB_MPOOL registry of pgin/pgout functions.
 *
 *      One entry associates a file type with a pair of callbacks.  Entries
 *      are chained through q; the list head is the dbregq field of
 *      struct __db_mpool.
 */
struct __db_mpreg {
        LIST_ENTRY(__db_mpreg) q;       /* Linked list. */

        /*
         * NOTE(review): presumably compared against the ftype stored in
         * the shared MPOOLFILE to select the callbacks -- confirm against
         * the callers.
         */
        int ftype;                      /* File type. */
                                        /* Pgin, pgout routines. */
        /*
         * Both callbacks take a page number, an untyped pointer and a DBT,
         * and return an int.
         */
        int (DB_CALLBACK *pgin) __P((db_pgno_t, void *, DBT *));
        int (DB_CALLBACK *pgout) __P((db_pgno_t, void *, DBT *));
};

/*
 * DB_MPOOLFILE --
 *      Per-process DB_MPOOLFILE information.
 *
 *      Each handle is linked into its owning DB_MPOOL (dbmp) through q and
 *      points at the shared MPOOLFILE (mfp) that describes the underlying
 *      file.  The fields fall into three protection groups, marked by the
 *      section comments below; pinref is the one exception and is covered
 *      by the region lock instead of the structure lock.
 */
struct __db_mpoolfile {
/* These fields need to be protected for multi-threaded support. */
        db_mutex_t      *mutexp;        /* Structure lock. */

        int        fd;                  /* Underlying file descriptor. */

        u_int32_t ref;                  /* Reference count. */

        /*
         * !!!
         * This field is a special case -- it's protected by the region lock
         * NOT the thread lock.  The reason for this is that we always have
         * the region lock immediately before or after we modify the field,
         * and we don't want to use the structure lock to protect it because
         * then I/O (which is done with the structure lock held because of
         * the race between the seek and write of the file descriptor) will
         * block any other put/get calls using this DB_MPOOLFILE structure.
         */
        u_int32_t pinref;               /* Pinned block reference count. */

/* These fields are not protected. */
        TAILQ_ENTRY(__db_mpoolfile) q;  /* Linked list of DB_MPOOLFILE's. */

        DB_MPOOL  *dbmp;                /* Overlying DB_MPOOL. */
        MPOOLFILE *mfp;                 /* Underlying MPOOLFILE. */

        /* addr/len describe the mapping when the file is mmap'd. */
        void      *addr;                /* Address of mmap'd region. */
        size_t     len;                 /* Length of mmap'd region. */

/* These fields need to be protected for multi-threaded support. */
#define MP_READONLY     0x01            /* File is readonly. */
#define MP_UPGRADE      0x02            /* File descriptor is readwrite. */
#define MP_UPGRADE_FAIL 0x04            /* Upgrade wasn't possible. */
        u_int32_t  flags;
};

/*
 * MPOOL --
 *      Shared memory pool region.  One of these is allocated in shared
 *      memory, and describes the pool.
 *
 *      Because the structure is shared, its layout must not change.  The
 *      table and list members are stored as offsets/SH_TAILQ heads rather
 *      than as ordinary pointers.
 */
struct __mpool {
        /*
         * rlayout must remain the first member: LOCKREGION and UNLOCKREGION
         * cast a pointer to this structure to RLAYOUT * to reach the region
         * lock.  MP_PANIC_CHECK reads rlayout.panic.
         */
        RLAYOUT     rlayout;            /* General region information. */

        SH_TAILQ_HEAD(__bhq) bhq;       /* LRU list of buckets. */
        SH_TAILQ_HEAD(__bhfq) bhfq;     /* Free buckets. */
        SH_TAILQ_HEAD(__mpfq) mpfq;     /* List of MPOOLFILEs. */

        /*
         * We make the assumption that the early pages of the file are far
         * more likely to be retrieved than the later pages, which means
         * that the top bits are more interesting for hashing since they're
         * less likely to collide.  On the other hand, since 512 4K pages
         * represents a 2MB file, only the bottom 9 bits of the page number
         * are likely to be set.  We XOR in the offset in the MPOOL of the
         * MPOOLFILE that backs this particular page, since that should also
         * be unique for the page.
         *
         * BUCKET yields an index in the range [0, htab_buckets): the page
         * number XORed with the MPOOLFILE offset shifted left by 9 bits,
         * reduced modulo the number of hash buckets.  mp->htab_buckets must
         * be non-zero when the macro is used.
         */
#define BUCKET(mp, mf_offset, pgno)                                     \
        (((pgno) ^ ((mf_offset) << 9)) % (mp)->htab_buckets)

        size_t      htab;               /* Hash table offset. */
        size_t      htab_buckets;       /* Number of hash table entries. */

        DB_LSN      lsn;                /* Maximum checkpoint LSN. */
        u_int32_t   lsn_cnt;            /* Checkpoint buffers left to write. */

        DB_MPOOL_STAT stat;             /* Global mpool statistics. */

#define MP_LSN_RETRY    0x01            /* Retry all BH_WRITE buffers. */
        u_int32_t  flags;
};

/*
 * MPOOLFILE --
 *      Shared DB_MPOOLFILE information.
 *
 *      One of these describes a file known to the pool; per-process
 *      DB_MPOOLFILE handles refer to it through their mfp field.  Entries
 *      are chained through q onto the mpfq list in struct __mpool.
 *
 *      NOTE(review): the *_off members are size_t "locations", presumably
 *      offsets into the shared region rather than pointers -- confirm
 *      against the code that fills them in.
 */
struct __mpoolfile {
        SH_TAILQ_ENTRY  q;              /* List of MPOOLFILEs */

        u_int32_t ref;                  /* Reference count. */

        int       ftype;                /* File type. */

        int32_t   lsn_off;              /* Page's LSN offset. */
        u_int32_t clear_len;            /* Bytes to clear on page create. */

        size_t    path_off;             /* File name location. */
        size_t    fileid_off;           /* File identification location. */

        size_t    pgcookie_len;         /* Pgin/pgout cookie length. */
        size_t    pgcookie_off;         /* Pgin/pgout cookie location. */

        u_int32_t lsn_cnt;              /* Checkpoint buffers left to write. */

        db_pgno_t last_pgno;            /* Last page in the file. */
        db_pgno_t orig_last_pgno;       /* Original last page in the file. */

#define MP_CAN_MMAP     0x01            /* If the file can be mmap'd. */
#define MP_TEMP         0x02            /* Backing file is a temporary. */
        u_int32_t  flags;

        DB_MPOOL_FSTAT stat;            /* Per-file mpool statistics. */
};

/*
 * BH --
 *      Buffer header.
 *
 *      A header precedes the page data of each cached buffer.  The mutex
 *      member is the per-buffer lock taken by LOCKBUFFER/UNLOCKBUFFER, and
 *      the BH_LOCKED flag marks a buffer with I/O in progress, as described
 *      in the locking notes at the top of this header.
 */
struct __bh {
        db_mutex_t      mutex;          /* Structure lock. */

        u_int16_t       ref;            /* Reference count. */

        /* All flag values must fit in the 16-bit flags field below. */
#define BH_CALLPGIN     0x001           /* Page needs to be reworked... */
#define BH_DIRTY        0x002           /* Page was modified. */
#define BH_DISCARD      0x004           /* Page is useless. */
#define BH_LOCKED       0x008           /* Page is locked (I/O in progress). */
#define BH_TRASH        0x010           /* Page is garbage. */
#define BH_WRITE        0x020           /* Page scheduled for writing. */
        u_int16_t  flags;

        SH_TAILQ_ENTRY  q;              /* LRU queue. */
        SH_TAILQ_ENTRY  hq;             /* MPOOL hash bucket queue. */

        /* pgno and mf_offset are the two inputs to the BUCKET hash macro. */
        db_pgno_t pgno;                 /* Underlying MPOOLFILE page number. */
        size_t    mf_offset;            /* Associated MPOOLFILE offset. */

        /*
         * !!!
         * This array must be size_t aligned -- the DB access methods put PAGE
         * and other structures into it, and expect to be able to access them
         * directly.  (We guarantee size_t alignment in the db_mpool(3) manual
         * page as well.)
         *
         * The [1] dimension is a placeholder for variable length data; the
         * real length is decided by whoever allocates the header.
         * NOTE(review): do not change this to a flexible array member
         * without auditing every use of sizeof(BH), since that would change
         * the structure's size.
         */
        u_int8_t   buf[1];              /* Variable length data. */
};

#include "mp_ext.h"