/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*      All Rights Reserved */

/*
 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
 * Copyright 2015 Joyent, Inc.
 */

#include <sys/flock_impl.h>
#include <sys/vfs.h>
#include <sys/t_lock.h>         /* for <sys/callb.h> */
#include <sys/callb.h>
#include <sys/clconf.h>
#include <sys/cladm.h>
#include <sys/nbmlock.h>
#include <sys/cred.h>
#include <sys/policy.h>

/*
 * The following variables are for statistics purposes and they are
 * not protected by locks. They may not be accurate but will at least be
 * close to the actual values.
 */

int     flk_lock_allocs;
int     flk_lock_frees;
int     edge_allocs;
int     edge_frees;
int     flk_proc_vertex_allocs;
int     flk_proc_edge_allocs;
int     flk_proc_vertex_frees;
int     flk_proc_edge_frees;

static kmutex_t flock_lock;

#ifdef DEBUG
int check_debug = 0;
#define CHECK_ACTIVE_LOCKS(gp)  if (check_debug) \
                                        check_active_locks(gp);
#define CHECK_SLEEPING_LOCKS(gp)        if (check_debug) \
                                                check_sleeping_locks(gp);
#define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)   \
                if (check_debug)        \
                        check_owner_locks(gp, pid, sysid, vp);
#define CHECK_LOCK_TRANSITION(old_state, new_state) \
        { \
                if (check_lock_transition(old_state, new_state)) { \
                        cmn_err(CE_PANIC, "Illegal lock transition \
                            from %d to %d", old_state, new_state); \
                } \
        }
#else

#define CHECK_ACTIVE_LOCKS(gp)
#define CHECK_SLEEPING_LOCKS(gp)
#define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
#define CHECK_LOCK_TRANSITION(old_state, new_state)

#endif /* DEBUG */

struct kmem_cache       *flk_edge_cache;

graph_t         *lock_graph[HASH_SIZE];
proc_graph_t    pgraph;

/*
 * Clustering.
 *
 * NLM REGISTRY TYPE IMPLEMENTATION
 *
 * Assumptions:
 *  1.  Nodes in a cluster are numbered starting at 1; node ids are always
 *      non-negative integers; the maximum node id is returned by
 *      clconf_maximum_nodeid().
 *  2.  We use this node id to identify the node an NLM server runs on.
 */

/*
 * The NLM registry object keeps track of the NLM servers that have
 * requested locks at the LLM with which this registry is associated.
 * Servers are identified by their nlmids, which are the node ids of the
 * cluster nodes they run on.
 *
 * Representation of abstraction:
 *    rep = record[     states: array[nlm_state],
 *                      lock: mutex]
 *
 *    Representation invariants:
 *      1. index i of rep.states is between 0 and n - 1 where n is the number
 *         of elements in the array, which happens to be the maximum number
 *         of nodes in the cluster configuration + 1.
 *      2. map nlmid to index i of rep.states
 *              0   -> 0
 *              1   -> 1
 *              2   -> 2
 *              n-1 -> clconf_maximum_nodeid()
 *      3.  This 1-1 mapping is quite convenient and it avoids errors resulting
 *          from forgetting to subtract 1 from the index.
 *      4.  The reason we keep the 0th index is the following.  A legitimate
 *          cluster configuration includes making a UFS file system NFS
 *          exportable.  The code is structured so that if you're in a cluster
 *          you do one thing; otherwise, you do something else.  The problem
 *          is what to do when you are in a cluster with PXFS loaded, but the
 *          file being locked is on UFS rather than PXFS.  The upper two bytes
 *          of the sysid encode the node id of the node where the NLM server
 *          runs; these bytes are zero for UFS.  Since the nodeid is used to
 *          index into the registry, we can record the NLM server state
 *          information at index 0 using the same mechanism used for PXFS
 *          file locks!
 */
static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
static kmutex_t nlm_reg_lock;                   /* lock to protect array */
static uint_t nlm_status_size;                  /* size of state array */
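
/*
 * Worked example (assuming clconf_maximum_nodeid() returns 64): the
 * registry has nlm_status_size == 65 entries, indexed 0..64.  An NLM
 * server request whose sysid encodes nlmid 3 in its upper bytes is
 * tracked in nlm_reg_status[GETNLMID(l_sysid)], i.e. nlm_reg_status[3];
 * a UFS (non-PXFS) lock has those bytes zero, so GETNLMID() yields 0 and
 * slot 0 is used.
 */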

/*
 * Although we need a global lock dependency graph (and associated data
 * structures), we also need a per-zone notion of whether the lock manager is
 * running, and so whether to allow lock manager requests or not.
 *
 * Thus, on a per-zone basis we maintain a ``global'' variable
 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 * manager is determined to be changing state (starting or stopping).
 *
 * Each graph/zone pair also has a copy of this variable, which is protected by
 * the graph's mutex.
 *
 * The per-graph copies are used to synchronize lock requests with shutdown
 * requests.  The global copy is used to initialize the per-graph field when a
 * new graph is created.
 */
struct flock_globals {
        flk_lockmgr_status_t flk_lockmgr_status;
        flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
};

zone_key_t flock_zone_key;

static void create_flock(lock_descriptor_t *, flock64_t *);
static lock_descriptor_t        *flk_get_lock(void);
static void     flk_free_lock(lock_descriptor_t *lock);
static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
static int flk_process_request(lock_descriptor_t *);
static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
static edge_t *flk_get_edge(void);
static int flk_wait_execute_request(lock_descriptor_t *);
static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
static void flk_insert_active_lock(lock_descriptor_t *);
static void flk_delete_active_lock(lock_descriptor_t *, int);
static void flk_insert_sleeping_lock(lock_descriptor_t *);
static void flk_graph_uncolor(graph_t *);
static void flk_wakeup(lock_descriptor_t *, int);
static void flk_free_edge(edge_t *);
static void flk_recompute_dependencies(lock_descriptor_t *,
                        lock_descriptor_t **,  int, int);
static int flk_find_barriers(lock_descriptor_t *);
static void flk_update_barriers(lock_descriptor_t *);
static int flk_color_reachables(lock_descriptor_t *);
static int flk_canceled(lock_descriptor_t *);
static void flk_delete_locks_by_sysid(lock_descriptor_t *);
static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
static void wait_for_lock(lock_descriptor_t *);
static void unlock_lockmgr_granted(struct flock_globals *);
static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);

/* Clustering hooks */
static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
static void cl_flk_wakeup_sleeping_nlm_locks(int);
static void cl_flk_unlock_nlm_granted(int);

#ifdef DEBUG
static int check_lock_transition(int, int);
static void check_sleeping_locks(graph_t *);
static void check_active_locks(graph_t *);
static int no_path(lock_descriptor_t *, lock_descriptor_t *);
static void path(lock_descriptor_t *, lock_descriptor_t *);
static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
#endif

/*      proc_graph function definitions */
static int flk_check_deadlock(lock_descriptor_t *);
static void flk_proc_graph_uncolor(void);
static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
static proc_edge_t *flk_get_proc_edge(void);
static void flk_proc_release(proc_vertex_t *);
static void flk_free_proc_edge(proc_edge_t *);
static void flk_update_proc_graph(edge_t *, int);

/* Non-blocking mandatory locking */
static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
                        u_offset_t);

static struct flock_globals *
flk_get_globals(void)
{
        /*
         * The KLM module had better be loaded if we're attempting to handle
         * lockmgr requests.
         */
        ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
        return (zone_getspecific(flock_zone_key, curproc->p_zone));
}

static flk_lockmgr_status_t
flk_get_lockmgr_status(void)
{
        struct flock_globals *fg;

        ASSERT(MUTEX_HELD(&flock_lock));

        if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
                /*
                 * KLM module not loaded; lock manager definitely not running.
                 */
                return (FLK_LOCKMGR_DOWN);
        }
        fg = flk_get_globals();
        return (fg->flk_lockmgr_status);
}

/*
 * This implements Open File Description (not descriptor) style record locking.
 * These locks can also be thought of as pid-less since they are not tied to a
 * specific process, and are thus preserved across fork.
 *
 * Called directly from fcntl.
 *
 * See reclock() for the implementation of the traditional POSIX style record
 * locking scheme (pid-ful). This function is derived from reclock() but
 * simplified and modified to work for OFD style locking.
 *
 * The two primary advantages of OFD style locking are:
 * 1) It is per-file description, so closing a file descriptor that refers to a
 *    different file description for the same file will not drop the lock (i.e.
 *    two opens of the same file get different descriptions, but a dup or fork
 *    will refer to the same description).
 * 2) Locks are preserved across fork(2).
 *
 * Because these locks are per-description a lock ptr lives at the f_filocks
 * member of the file_t and the lock_descriptor includes a file_t pointer
 * to enable unique lock identification and management.
 *
 * Since these locks are pid-less we cannot do deadlock detection with the
 * current process-oriented implementation. This is consistent with OFD locking
 * behavior on other operating systems such as Linux. Since we don't do
 * deadlock detection we never interact with the process graph that is
 * maintained for deadlock detection on the traditional POSIX-style locks.
 *
 * Future Work:
 *
 * The current implementation does not support record locks. That is,
 * currently the single lock must cover the entire file. This is validated in
 * fcntl. To support record locks the f_filock pointer in the file_t needs to
 * be changed to a list of pointers to the locks. That list needs to be
 * managed independently of the lock list on the vnode itself and it needs to
 * be maintained as record locks are created, split, coalesced and deleted.
 *
 * The current implementation does not support remote file systems (e.g.
 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
 * interact with the NLM is not clear since the NLM protocol/implementation
 * appears to be oriented around locks associated with a process. A further
 * problem is that a design is needed for what nlm_send_siglost() should do and
 * where it will send SIGLOST. More recent versions of Linux apparently try to
 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
 * that work with the NLM. It is not clear that this provides the correct
 * semantics in all cases.
 */
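
/*
 * Minimal userland sketch of OFD locking (illustrative only; not part of
 * this file, error handling is omitted, and fd is an open descriptor for
 * the file).  Note that the current implementation requires whole-file
 * locks, so l_whence, l_start and l_len must all be zero:
 *
 *      #include <fcntl.h>
 *
 *      struct flock fl = { 0 };
 *
 *      fl.l_type = F_WRLCK;
 *      fl.l_whence = SEEK_SET;
 *      fl.l_start = 0;
 *      fl.l_len = 0;
 *      (void) fcntl(fd, F_OFD_SETLKW, &fl);
 *
 * The lock belongs to the file description: descriptors obtained via dup(2)
 * or inherited across fork(2) refer to the same lock, and it is dropped only
 * by an explicit F_UNLCK or when the last descriptor for that description
 * is closed.
 */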
int
ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
{
        int cmd = 0;
        vnode_t *vp;
        lock_descriptor_t       stack_lock_request;
        lock_descriptor_t       *lock_request;
        int error = 0;
        graph_t *gp;
        int serialize = 0;

        if (fcmd != F_OFD_GETLK)
                cmd = SETFLCK;

        if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
                cmd |= SLPFLCK;

        /* only whole-file locks are supported; see the block comment above */
        VERIFY(lckdat->l_whence == 0);
        VERIFY(lckdat->l_start == 0);
        VERIFY(lckdat->l_len == 0);

        vp = fp->f_vnode;

        /*
         * For reclock, fs_frlock() would normally have set these in a few
         * places, but for us it's cleaner to centralize it here. Note that
         * IGN_PID is -1; we use 0 for our pid-less locks.
         */
        lckdat->l_pid = 0;
        lckdat->l_sysid = 0;

        /*
         * Check access permissions
         */
        if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
            ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
            (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
                return (EBADF);

        /*
         * for query and unlock we use the stack_lock_request
         */
        if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
                lock_request = &stack_lock_request;
                (void) bzero((caddr_t)lock_request,
                    sizeof (lock_descriptor_t));

                /*
                 * following is added to make the assertions in
                 * flk_execute_request() pass
                 */
                lock_request->l_edge.edge_in_next = &lock_request->l_edge;
                lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
                lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
                lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
                lock_request->l_status = FLK_INITIAL_STATE;
        } else {
                lock_request = flk_get_lock();
                fp->f_filock = (struct filock *)lock_request;
        }
        lock_request->l_state = 0;
        lock_request->l_vnode = vp;
        lock_request->l_zoneid = getzoneid();
        lock_request->l_ofd = fp;

        /*
         * Convert the request range into the canonical start and end
         * values then check the validity of the lock range.
         */
        error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
            &lock_request->l_end, offset);
        if (error)
                goto done;

        error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
            MAXEND);
        if (error)
                goto done;

        ASSERT(lock_request->l_end >= lock_request->l_start);

        lock_request->l_type = lckdat->l_type;
        if (cmd & SLPFLCK)
                lock_request->l_state |= WILLING_TO_SLEEP_LOCK;

        if (!(cmd & SETFLCK)) {
                if (lock_request->l_type == F_RDLCK ||
                    lock_request->l_type == F_WRLCK)
                        lock_request->l_state |= QUERY_LOCK;
        }
        lock_request->l_flock = (*lckdat);

        /*
         * We are ready for processing the request
         */

        if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
            nbl_need_check(vp)) {
                nbl_start_crit(vp, RW_WRITER);
                serialize = 1;
        }

        /* Get the lock graph for a particular vnode */
        gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

        mutex_enter(&gp->gp_mutex);

        lock_request->l_state |= REFERENCED_LOCK;
        lock_request->l_graph = gp;

        switch (lock_request->l_type) {
        case F_RDLCK:
        case F_WRLCK:
                if (IS_QUERY_LOCK(lock_request)) {
                        flk_get_first_blocking_lock(lock_request);
                        if (lock_request->l_ofd != NULL)
                                lock_request->l_flock.l_pid = -1;
                        (*lckdat) = lock_request->l_flock;
                } else {
                        /* process the request now */
                        error = flk_process_request(lock_request);
                }
                break;

        case F_UNLCK:
                /* unlock request will not block so execute it immediately */
                error = flk_execute_request(lock_request);
                break;

        default:
                error = EINVAL;
                break;
        }

        if (lock_request == &stack_lock_request) {
                flk_set_state(lock_request, FLK_DEAD_STATE);
        } else {
                lock_request->l_state &= ~REFERENCED_LOCK;
                if ((error != 0) || IS_DELETED(lock_request)) {
                        flk_set_state(lock_request, FLK_DEAD_STATE);
                        flk_free_lock(lock_request);
                }
        }

        mutex_exit(&gp->gp_mutex);
        if (serialize)
                nbl_end_crit(vp);

        return (error);

done:
        flk_set_state(lock_request, FLK_DEAD_STATE);
        if (lock_request != &stack_lock_request)
                flk_free_lock(lock_request);
        return (error);
}

/*
 * Remove any lock on the vnode belonging to the given file_t.
 * Called from closef() on last close; the file_t is locked.
 *
 * This is modeled on the cleanlocks() function but only removes the single
 * lock associated with fp.
 */
void
ofdcleanlock(file_t *fp)
{
        lock_descriptor_t *fplock, *lock, *nlock;
        vnode_t *vp;
        graph_t *gp;

        ASSERT(MUTEX_HELD(&fp->f_tlock));

        if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
                return;

        fp->f_filock = NULL;
        vp = fp->f_vnode;

        gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

        if (gp == NULL)
                return;
        mutex_enter(&gp->gp_mutex);

        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);

        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                do {
                        nlock = lock->l_next;
                        if (fplock == lock) {
                                CANCEL_WAKEUP(lock);
                                break;
                        }
                        lock = nlock;
                } while (lock->l_vnode == vp);
        }

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                do {
                        nlock = lock->l_next;
                        if (fplock == lock) {
                                flk_delete_active_lock(lock, 0);
                                flk_wakeup(lock, 1);
                                flk_free_lock(lock);
                                break;
                        }
                        lock = nlock;
                } while (lock->l_vnode == vp);
        }

        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);
        mutex_exit(&gp->gp_mutex);
}

/*
 * Routine called from fs_frlock in fs/fs_subr.c
 *
 * This implements traditional POSIX style record locking. The two primary
 * drawbacks to this style of locking are:
 * 1) It is per-process, so any close of a file descriptor that refers to the
 *    file will drop the lock (e.g. lock /etc/passwd, then call a library
 *    function that opens /etc/passwd to read the file; when the library
 *    closes its file descriptor, the application loses its lock and does
 *    not know it).
 * 2) Locks are not preserved across fork(2).
 *
 * Because these locks are only associated with a PID, they are per-process.
 * This is why any close will drop the lock and is also why, once the process
 * forks, the lock is not inherited by the new process. These locks can
 * be considered PID-ful.
 *
 * See ofdlock() for the implementation of a similar but improved locking
 * scheme.
 */
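
/*
 * Minimal userland sketch of the per-process pitfall described above
 * (illustrative only; not part of this file, error handling is omitted,
 * and fd is an open descriptor for the file):
 *
 *      #include <fcntl.h>
 *
 *      struct flock fl = { 0 };
 *
 *      fl.l_type = F_WRLCK;
 *      fl.l_whence = SEEK_SET;
 *      fl.l_start = 0;
 *      fl.l_len = 0;
 *      (void) fcntl(fd, F_SETLKW, &fl);
 *
 * If a library routine now opens and closes the same file, this process's
 * lock is silently dropped, because the lock is tied to the (pid, file)
 * pair rather than to a particular file description.
 */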
int
reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
    flk_callback_t *flk_cbp)
{
        lock_descriptor_t       stack_lock_request;
        lock_descriptor_t       *lock_request;
        int error = 0;
        graph_t *gp;
        int                     nlmid;

        /*
         * Check access permissions
         */
        if ((cmd & SETFLCK) &&
            ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
            (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
                return (EBADF);

        /*
         * for query and unlock we use the stack_lock_request
         */

        if ((lckdat->l_type == F_UNLCK) ||
            !((cmd & INOFLCK) || (cmd & SETFLCK))) {
                lock_request = &stack_lock_request;
                (void) bzero((caddr_t)lock_request,
                    sizeof (lock_descriptor_t));

                /*
                 * The following is added to make the assertions in
                 * flk_execute_request() pass.
                 */

                lock_request->l_edge.edge_in_next = &lock_request->l_edge;
                lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
                lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
                lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
                lock_request->l_status = FLK_INITIAL_STATE;
        } else {
                lock_request = flk_get_lock();
        }
        lock_request->l_state = 0;
        lock_request->l_vnode = vp;
        lock_request->l_zoneid = getzoneid();

        /*
         * Convert the request range into the canonical start and end
         * values.  The NLM protocol supports locking over the entire
         * 32-bit range, so there's no range checking for remote requests,
         * but we still need to verify that local requests obey the rules.
         */
        /* Clustering */
        if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
                ASSERT(lckdat->l_whence == 0);
                lock_request->l_start = lckdat->l_start;
                lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
                    lckdat->l_start + (lckdat->l_len - 1);
        } else {
                /* check the validity of the lock range */
                error = flk_convert_lock_data(vp, lckdat,
                    &lock_request->l_start, &lock_request->l_end,
                    offset);
                if (error) {
                        goto done;
                }
                error = flk_check_lock_data(lock_request->l_start,
                    lock_request->l_end, MAXEND);
                if (error) {
                        goto done;
                }
        }

        ASSERT(lock_request->l_end >= lock_request->l_start);

        lock_request->l_type = lckdat->l_type;
        if (cmd & INOFLCK)
                lock_request->l_state |= IO_LOCK;
        if (cmd & SLPFLCK)
                lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
        if (cmd & RCMDLCK)
                lock_request->l_state |= LOCKMGR_LOCK;
        if (cmd & NBMLCK)
                lock_request->l_state |= NBMAND_LOCK;
        /*
         * Clustering: set flag for PXFS locks
         * We do not _only_ check for the PCMDLCK flag because PXFS locks could
         * also be of type 'RCMDLCK'.
         * We do not _only_ check the GETPXFSID() macro because local PXFS
         * clients use a pxfsid of zero to permit deadlock detection in the LLM.
         */

        if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
                lock_request->l_state |= PXFS_LOCK;
        }
        if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
                if (lock_request->l_type == F_RDLCK ||
                    lock_request->l_type == F_WRLCK)
                        lock_request->l_state |= QUERY_LOCK;
        }
        lock_request->l_flock = (*lckdat);
        lock_request->l_callbacks = flk_cbp;

        /*
         * We are ready for processing the request
         */
        if (IS_LOCKMGR(lock_request)) {
                /*
                 * If the lock request is an NLM server request ....
                 */
                if (nlm_status_size == 0) { /* not booted as cluster */
                        mutex_enter(&flock_lock);
                        /*
                         * Bail out if this is a lock manager request and the
                         * lock manager is not supposed to be running.
                         */
                        if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
                                mutex_exit(&flock_lock);
                                error = ENOLCK;
                                goto done;
                        }
                        mutex_exit(&flock_lock);
                } else {                        /* booted as a cluster */
                        nlmid = GETNLMID(lock_request->l_flock.l_sysid);
                        ASSERT(nlmid < nlm_status_size && nlmid >= 0);

                        mutex_enter(&nlm_reg_lock);
                        /*
                         * If the NLM registry does not know about this
                         * NLM server making the request, add its nlmid
                         * to the registry.
                         */
                        if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
                            nlmid)) {
                                FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
                        } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
                            nlmid)) {
                                /*
                                 * If the NLM server is already known (has made
                                 * previous lock requests) and its state is
                                 * not NLM_UP (means that NLM server is
                                 * shutting down), then bail out with an
                                 * error to deny the lock request.
                                 */
                                mutex_exit(&nlm_reg_lock);
                                error = ENOLCK;
                                goto done;
                        }
                        mutex_exit(&nlm_reg_lock);
                }
        }

        /* Now get the lock graph for a particular vnode */
        gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

        /*
         * We drop the rwlock here, otherwise this might end up causing a
         * deadlock if this IO_LOCK sleeps. (bugid # 1183392).
         */

        if (IS_IO_LOCK(lock_request)) {
                VOP_RWUNLOCK(vp,
                    (lock_request->l_type == F_RDLCK) ?
                    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
        }
        mutex_enter(&gp->gp_mutex);

        lock_request->l_state |= REFERENCED_LOCK;
        lock_request->l_graph = gp;

        switch (lock_request->l_type) {
        case F_RDLCK:
        case F_WRLCK:
                if (IS_QUERY_LOCK(lock_request)) {
                        flk_get_first_blocking_lock(lock_request);
                        if (lock_request->l_ofd != NULL)
                                lock_request->l_flock.l_pid = -1;
                        (*lckdat) = lock_request->l_flock;
                        break;
                }

                /* process the request now */

                error = flk_process_request(lock_request);
                break;

        case F_UNLCK:
                /* unlock request will not block so execute it immediately */

                if (IS_LOCKMGR(lock_request) &&
                    flk_canceled(lock_request)) {
                        error = 0;
                } else {
                        error = flk_execute_request(lock_request);
                }
                break;

        case F_UNLKSYS:
                /*
                 * Recovery mechanism to release lock manager locks when an
                 * NFS client crashes and restarts. The NFS server will clear
                 * old locks and grant new locks.
                 */

                if (lock_request->l_flock.l_sysid == 0) {
                        mutex_exit(&gp->gp_mutex);
                        return (EINVAL);
                }
                if (secpolicy_nfs(CRED()) != 0) {
                        mutex_exit(&gp->gp_mutex);
                        return (EPERM);
                }
                flk_delete_locks_by_sysid(lock_request);
                lock_request->l_state &= ~REFERENCED_LOCK;
                flk_set_state(lock_request, FLK_DEAD_STATE);
                flk_free_lock(lock_request);
                mutex_exit(&gp->gp_mutex);
                return (0);

        default:
                error = EINVAL;
                break;
        }

        /* Clustering: For blocked PXFS locks, return */
        if (error == PXFS_LOCK_BLOCKED) {
                lock_request->l_state &= ~REFERENCED_LOCK;
                mutex_exit(&gp->gp_mutex);
                return (error);
        }

        /*
         * Now that we have seen the status of locks in the system for
         * this vnode, we acquire the rwlock if it is an IO_LOCK.
         */

        if (IS_IO_LOCK(lock_request)) {
                (void) VOP_RWLOCK(vp,
                    (lock_request->l_type == F_RDLCK) ?
                    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
                if (!error) {
                        lckdat->l_type = F_UNLCK;

                        /*
                         * This wakeup is needed, otherwise
                         * if the IO_LOCK has slept, the dependents on this
                         * lock will not be woken up at all. (bugid # 1185482).
                         */

                        flk_wakeup(lock_request, 1);
                        flk_set_state(lock_request, FLK_DEAD_STATE);
                        flk_free_lock(lock_request);
                }
                /*
                 * Otherwise, if an error occurred, either
                 * flk_process_request() returned EDEADLK, in which case
                 * there will be no dependents for this lock, or
                 * flk_wait_execute_request() returned EINTR, in which case
                 * flk_cancel_sleeping_lock() will already have been done.
                 * The same is true for EBADF.
                 */
        }

        if (lock_request == &stack_lock_request) {
                flk_set_state(lock_request, FLK_DEAD_STATE);
        } else {
                lock_request->l_state &= ~REFERENCED_LOCK;
                if ((error != 0) || IS_DELETED(lock_request)) {
                        flk_set_state(lock_request, FLK_DEAD_STATE);
                        flk_free_lock(lock_request);
                }
        }

        mutex_exit(&gp->gp_mutex);
        return (error);

done:
        flk_set_state(lock_request, FLK_DEAD_STATE);
        if (lock_request != &stack_lock_request)
                flk_free_lock(lock_request);
        return (error);
}

/*
 * Invoke the callbacks in the given list.  If before sleeping, invoke in
 * list order.  If after sleeping, invoke in reverse order.
 *
 * CPR (suspend/resume) support: if one of the callbacks returns a
 * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
 * while it is sleeping.  There should be at most one callb_cpr_t for the
 * thread.
 * XXX This is unnecessarily complicated.  The CPR information should just
 * get passed in directly through VOP_FRLOCK and reclock, rather than
 * sneaking it in via a callback.
 */

callb_cpr_t *
flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
{
        callb_cpr_t *cpr_callbackp = NULL;
        callb_cpr_t *one_result;
        flk_callback_t *cb;

        if (cblist == NULL)
                return (NULL);

        if (when == FLK_BEFORE_SLEEP) {
                cb = cblist;
                do {
                        one_result = (*cb->cb_callback)(when, cb->cb_data);
                        if (one_result != NULL) {
                                ASSERT(cpr_callbackp == NULL);
                                cpr_callbackp = one_result;
                        }
                        cb = cb->cb_next;
                } while (cb != cblist);
        } else {
                cb = cblist->cb_prev;
                do {
                        one_result = (*cb->cb_callback)(when, cb->cb_data);
                        if (one_result != NULL) {
                                cpr_callbackp = one_result;
                        }
                        cb = cb->cb_prev;
                } while (cb != cblist->cb_prev);
        }

        return (cpr_callbackp);
}

/*
 * Initialize a flk_callback_t to hold the given callback.
 */

void
flk_init_callback(flk_callback_t *flk_cb,
    callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
{
        flk_cb->cb_next = flk_cb;
        flk_cb->cb_prev = flk_cb;
        flk_cb->cb_callback = cb_fcn;
        flk_cb->cb_data = cbdata;
}

/*
 * Initialize an flk_callback_t and then link it onto the tail of an
 * existing list (which may be NULL).
 */

void
flk_add_callback(flk_callback_t *newcb,
    callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
    void *cbdata, flk_callback_t *cblist)
{
        flk_init_callback(newcb, cb_fcn, cbdata);

        if (cblist == NULL)
                return;

        newcb->cb_prev = cblist->cb_prev;
        newcb->cb_next = cblist;
        cblist->cb_prev->cb_next = newcb;
        cblist->cb_prev = newcb;
}
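
/*
 * Illustrative sketch of how a caller might pass a callback list to
 * reclock() (hypothetical names; the real NFS/NLM callers differ in
 * detail).  The callback is invoked once with FLK_BEFORE_SLEEP and once
 * with FLK_AFTER_SLEEP, and may return a callb_cpr_t to make the sleeping
 * thread CPR-safe:
 *
 *      static callb_cpr_t *
 *      my_frlock_callback(flk_cb_when_t when, void *data)
 *      {
 *              if (when == FLK_BEFORE_SLEEP) {
 *                      ... release resources that must not be held ...
 *              } else {
 *                      ... FLK_AFTER_SLEEP: reacquire them ...
 *              }
 *              return (NULL);
 *      }
 *
 *      flk_callback_t flk_cb;
 *
 *      flk_init_callback(&flk_cb, my_frlock_callback, my_data);
 *      error = reclock(vp, &bf, cmd, flag, offset, &flk_cb);
 */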

/*
 * Remove the callback from a list.
 */

void
flk_del_callback(flk_callback_t *flk_cb)
{
        flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
        flk_cb->cb_prev->cb_next = flk_cb->cb_next;

        flk_cb->cb_prev = flk_cb;
        flk_cb->cb_next = flk_cb;
}

/*
 * Initialize the flk_edge_cache data structure and create the
 * nlm_reg_status array.
 */

void
flk_init(void)
{
        uint_t  i;

        flk_edge_cache = kmem_cache_create("flk_edges",
            sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
        if (flk_edge_cache == NULL) {
                cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
        }
        /*
         * Create the NLM registry object.
         */

        if (cluster_bootflags & CLUSTER_BOOTED) {
                /*
                 * This routine tells you the maximum node id that will be used
                 * in the cluster.  This number will be the size of the nlm
                 * registry status array.  We add 1 because we will be using
                 * all entries indexed from 0 to maxnodeid; e.g., from 0
                 * to 64, for a total of 65 entries.
                 */
                nlm_status_size = clconf_maximum_nodeid() + 1;
        } else {
                nlm_status_size = 0;
        }

        if (nlm_status_size != 0) {     /* booted as a cluster */
                nlm_reg_status = (flk_nlm_status_t *)
                    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
                    KM_SLEEP);

                /* initialize all NLM states in array to NLM_UNKNOWN */
                for (i = 0; i < nlm_status_size; i++) {
                        nlm_reg_status[i] = FLK_NLM_UNKNOWN;
                }
        }
}

/*
 * Zone constructor/destructor callbacks to be executed when a zone is
 * created/destroyed.
 */
/* ARGSUSED */
void *
flk_zone_init(zoneid_t zoneid)
{
        struct flock_globals *fg;
        uint_t i;

        fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
        fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
        for (i = 0; i < HASH_SIZE; i++)
                fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
        return (fg);
}

/* ARGSUSED */
void
flk_zone_fini(zoneid_t zoneid, void *data)
{
        struct flock_globals *fg = data;

        kmem_free(fg, sizeof (*fg));
}

/*
 * Get a lock_descriptor structure with initialization of edge lists.
 */

static lock_descriptor_t *
flk_get_lock(void)
{
        lock_descriptor_t       *l;

        l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);

        cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
        l->l_edge.edge_in_next = &l->l_edge;
        l->l_edge.edge_in_prev = &l->l_edge;
        l->l_edge.edge_adj_next = &l->l_edge;
        l->l_edge.edge_adj_prev = &l->l_edge;
        l->pvertex = -1;
        l->l_status = FLK_INITIAL_STATE;
        flk_lock_allocs++;
        return (l);
}

/*
 * Free a lock_descriptor structure. If some thread still has a reference
 * to it (as in reclock()), just set the DELETED_LOCK flag instead.
 */

void
flk_free_lock(lock_descriptor_t *lock)
{
        file_t *fp;

        ASSERT(IS_DEAD(lock));

        if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
                fp->f_filock = NULL;

        if (IS_REFERENCED(lock)) {
                lock->l_state |= DELETED_LOCK;
                return;
        }
        flk_lock_frees++;
        kmem_free((void *)lock, sizeof (lock_descriptor_t));
}

void
flk_set_state(lock_descriptor_t *lock, int new_state)
{
        /*
         * Locks in the sleeping list may be woken up in a number of ways,
         * and more than once.  If a sleeping lock is signaled awake more
         * than once, then it may or may not change state depending on its
         * current state.
         * Also note that NLM locks that are sleeping could be moved to an
         * interrupted state more than once if the unlock request is
         * retransmitted by the NLM client - the second time around, this is
         * just a nop.
         * The ordering of being signaled awake is:
         * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
         * The checks below implement this ordering.
         */
        if (IS_INTERRUPTED(lock)) {
                if ((new_state == FLK_CANCELLED_STATE) ||
                    (new_state == FLK_GRANTED_STATE) ||
                    (new_state == FLK_INTERRUPTED_STATE)) {
                        return;
                }
        }
        if (IS_CANCELLED(lock)) {
                if ((new_state == FLK_GRANTED_STATE) ||
                    (new_state == FLK_CANCELLED_STATE)) {
                        return;
                }
        }
        CHECK_LOCK_TRANSITION(lock->l_status, new_state);
        if (IS_PXFS(lock)) {
                cl_flk_state_transition_notify(lock, lock->l_status, new_state);
        }
        lock->l_status = new_state;
}

/*
 * Routine that checks whether there are any blocking locks in the system.
 *
 * The policy followed is that if a write lock is sleeping, we don't allow
 * read locks ahead of this write lock, even though there may not be any
 * active locks covering the read locks' region.
 *
 * The flk_add_edge() function adds an edge between l1 and l2 iff there
 * is no path between l1 and l2. This is done to have a "minimum
 * storage representation" of the dependency graph.
 *
 * Another property of the graph is that, since only the new request throws
 * edges to the existing locks in the graph, the graph is always topologically
 * ordered.
 */

static int
flk_process_request(lock_descriptor_t *request)
{
        graph_t *gp = request->l_graph;
        lock_descriptor_t *lock;
        int request_blocked_by_active = 0;
        int request_blocked_by_granted = 0;
        int request_blocked_by_sleeping = 0;
        vnode_t *vp = request->l_vnode;
        int     error = 0;
        int request_will_wait = 0;
        int found_covering_lock = 0;
        lock_descriptor_t *covered_by = NULL;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        request_will_wait = IS_WILLING_TO_SLEEP(request);

        /*
         * check active locks
         */

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);


        if (lock) {
                do {
                        if (BLOCKS(lock, request)) {
                                if (!request_will_wait)
                                        return (EAGAIN);
                                request_blocked_by_active = 1;
                                break;
                        }
                        /*
                         * Grant lock if it is for the same owner holding active
                         * lock that covers the request.
                         */

                        if (SAME_OWNER(lock, request) &&
                            COVERS(lock, request) &&
                            (request->l_type == F_RDLCK))
                                return (flk_execute_request(request));
                        lock = lock->l_next;
                } while (lock->l_vnode == vp);
        }

        if (!request_blocked_by_active) {
                lock_descriptor_t *lk[1];
                lock_descriptor_t *first_glock = NULL;
                /*
                 * Shall we grant this?! NO!!
                 * What about those locks that were just granted and are still
                 * in the sleep queue?  Those threads have been woken up, so
                 * the locks are almost active.
                 */
                SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
                if (lock) {
                        do {
                                if (BLOCKS(lock, request)) {
                                        if (IS_GRANTED(lock)) {
                                                request_blocked_by_granted = 1;
                                        } else {
                                                request_blocked_by_sleeping = 1;
                                        }
                                }

                                lock = lock->l_next;
                        } while ((lock->l_vnode == vp));
                        first_glock = lock->l_prev;
                        ASSERT(first_glock->l_vnode == vp);
                }

                if (request_blocked_by_granted)
                        goto block;

                if (!request_blocked_by_sleeping) {
                        /*
                         * If the request isn't going to be blocked by a
                         * sleeping request, we know that it isn't going to
                         * be blocked; we can just execute the request --
                         * without performing costly deadlock detection.
                         */
                        ASSERT(!request_blocked_by_active);
                        return (flk_execute_request(request));
                } else if (request->l_type == F_RDLCK) {
                        /*
                         * If we have a sleeping writer in the requested
                         * lock's range, block.
                         */
                        goto block;
                }

                lk[0] = request;
                request->l_state |= RECOMPUTE_LOCK;
                SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
                if (lock) {
                        do {
                                flk_recompute_dependencies(lock, lk, 1, 0);
                                lock = lock->l_next;
                        } while (lock->l_vnode == vp);
                }
                lock = first_glock;
                if (lock) {
                        do {
                                if (IS_GRANTED(lock)) {
                                        flk_recompute_dependencies(lock,
                                            lk, 1, 0);
                                }
                                lock = lock->l_prev;
                        } while ((lock->l_vnode == vp));
                }
                request->l_state &= ~RECOMPUTE_LOCK;
                if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
                        return (EDEADLK);
                return (flk_execute_request(request));
        }

block:
        if (request_will_wait)
                flk_graph_uncolor(gp);

        /* check sleeping locks */

        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        /*
         * If we find a sleeping write lock that is a superset of the
         * region wanted by request, we can be assured that by adding an
         * edge to this write lock we have paths to all locks in the
         * graph that block the request, except in one case, and that is why
         * there is another check for SAME_OWNER in the loop below. The
         * exception is when the process that owns the sleeping write lock
         * 'l1' has other locks l2, l3, l4 that are in the system and arrived
         * before l1. l1 does not have paths to these locks as they are from
         * the same process. We break when we find a second covering sleeping
         * lock l5 owned by a process different from that owning l1, because
         * none of l2, l3, l4, etc. can have arrived before l5; if one had,
         * l1 would have produced a deadlock already.
         */

        if (lock) {
                do {
                        if (BLOCKS(lock, request)) {
                                if (!request_will_wait)
                                        return (EAGAIN);
                                if (COVERS(lock, request) &&
                                    lock->l_type == F_WRLCK) {
                                        if (found_covering_lock &&
                                            !SAME_OWNER(lock, covered_by)) {
                                                found_covering_lock++;
                                                break;
                                        }
                                        found_covering_lock = 1;
                                        covered_by = lock;
                                }
                                if (found_covering_lock &&
                                    !SAME_OWNER(lock, covered_by)) {
                                        lock = lock->l_next;
                                        continue;
                                }
                                if ((error = flk_add_edge(request, lock,
                                    !found_covering_lock, 0)))
                                        return (error);
                        }
                        lock = lock->l_next;
                } while (lock->l_vnode == vp);
        }

/*
 * found_covering_lock == 2 iff at this point 'request' has paths
 * to all locks that block 'request'. found_covering_lock == 1 iff at this
 * point 'request' has paths to all locks that block 'request' whose owners
 * are not the same as the one that covers 'request' (covered_by above), and
 * there can be locks whose owner is the same as covered_by in the active list.
 */

        if (request_blocked_by_active && found_covering_lock != 2) {
                SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
                ASSERT(lock != NULL);
                do {
                        if (BLOCKS(lock, request)) {
                                if (found_covering_lock &&
                                    !SAME_OWNER(lock, covered_by)) {
                                        lock = lock->l_next;
                                        continue;
                                }
                                if ((error = flk_add_edge(request, lock,
                                    CHECK_CYCLE, 0)))
                                        return (error);
                        }
                        lock = lock->l_next;
                } while (lock->l_vnode == vp);
        }

        if (NOT_BLOCKED(request)) {
                /*
                 * request not dependent on any other locks
                 * so execute this request
                 */
                return (flk_execute_request(request));
        } else {
                /*
                 * check for deadlock
                 */
                if (flk_check_deadlock(request))
                        return (EDEADLK);
                /*
                 * this thread has to sleep
                 */
                return (flk_wait_execute_request(request));
        }
}

/*
 * The actual execution of the request in the simple case is only to
 * insert the 'request' in the list of active locks if it is not an
 * UNLOCK.
 * We have to consider the existing active locks' relation to
 * this 'request' if they are owned by the same process. flk_relation() does
 * this job and sees to it that the dependency graph information is maintained
 * properly.
 */

int
flk_execute_request(lock_descriptor_t *request)
{
        graph_t *gp = request->l_graph;
        vnode_t *vp = request->l_vnode;
        lock_descriptor_t       *lock, *lock1;
        int done_searching = 0;

        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);

        ASSERT(MUTEX_HELD(&gp->gp_mutex));

        flk_set_state(request, FLK_START_STATE);

        ASSERT(NOT_BLOCKED(request));

        /* IO_LOCK requests are only to check status */

        if (IS_IO_LOCK(request))
                return (0);

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock == NULL && request->l_type == F_UNLCK)
                return (0);
        if (lock == NULL) {
                flk_insert_active_lock(request);
                return (0);
        }

        do {
                lock1 = lock->l_next;
                if (SAME_OWNER(request, lock)) {
                        done_searching = flk_relation(lock, request);
                }
                lock = lock1;
        } while (lock->l_vnode == vp && !done_searching);

        /*
         * insert in active queue
         */

        if (request->l_type != F_UNLCK)
                flk_insert_active_lock(request);

        return (0);
}

/*
 * 'request' is blocked by someone, therefore we put it on the sleep queue.
 */
static int
flk_wait_execute_request(lock_descriptor_t *request)
{
        graph_t *gp = request->l_graph;
        callb_cpr_t     *cprp;          /* CPR info from callback */
        struct flock_globals *fg;
        int index;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        ASSERT(IS_WILLING_TO_SLEEP(request));

        flk_insert_sleeping_lock(request);

        index = 0;      /* quiesce compiler warning. */
        fg = NULL;
        if (IS_LOCKMGR(request)) {
                index = HASH_INDEX(request->l_vnode);
                fg = flk_get_globals();

                if (nlm_status_size == 0) {     /* not booted as a cluster */
                        if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
                                flk_cancel_sleeping_lock(request, 1);
                                return (ENOLCK);
                        }
                } else {                        /* booted as a cluster */
                        /*
                         * If the request is an NLM server lock request,
                         * and the NLM state of the lock request is not
                         * NLM_UP (because the NLM server is shutting
                         * down), then cancel the sleeping lock and
                         * return error ENOLCK that will encourage the
                         * client to retransmit.
                         */
                        if (!IS_NLM_UP(request)) {
                                flk_cancel_sleeping_lock(request, 1);
                                return (ENOLCK);
                        }
                }
        }

        /* Clustering: For blocking PXFS locks, return */
        if (IS_PXFS(request)) {
                /*
                 * PXFS locks sleep on the client side.
                 * The callback argument is used to wake up the sleeper
                 * when the lock is granted.
                 * We return -1 (rather than an errno value) to indicate
                         * that the client side should sleep.
                 */
                return (PXFS_LOCK_BLOCKED);
        }

        if (request->l_callbacks != NULL) {
                /*
                 * To make sure the shutdown code works correctly, either
                 * the callback must happen after putting the lock on the
                 * sleep list, or we must check the shutdown status after
                 * returning from the callback (and before sleeping).  At
                 * least for now, we'll use the first option.  If a
                 * shutdown or signal or whatever happened while the graph
                 * mutex was dropped, that will be detected by
                 * wait_for_lock().
                 */
                mutex_exit(&gp->gp_mutex);

                cprp = flk_invoke_callbacks(request->l_callbacks,
                    FLK_BEFORE_SLEEP);

                mutex_enter(&gp->gp_mutex);

                if (cprp == NULL) {
                        wait_for_lock(request);
                } else {
                        mutex_enter(cprp->cc_lockp);
                        CALLB_CPR_SAFE_BEGIN(cprp);
                        mutex_exit(cprp->cc_lockp);
                        wait_for_lock(request);
                        mutex_enter(cprp->cc_lockp);
                        CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
                        mutex_exit(cprp->cc_lockp);
                }

                mutex_exit(&gp->gp_mutex);
                (void) flk_invoke_callbacks(request->l_callbacks,
                    FLK_AFTER_SLEEP);
                mutex_enter(&gp->gp_mutex);
        } else {
                wait_for_lock(request);
        }

        if (IS_LOCKMGR(request)) {
                /*
                 * If the lock manager is shutting down, return an
                 * error that will encourage the client to retransmit.
                 */
                if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
                    !IS_GRANTED(request)) {
                        flk_cancel_sleeping_lock(request, 1);
                        return (ENOLCK);
                }
        }

        if (IS_INTERRUPTED(request)) {
                /* we got a signal, or act like we did */
                flk_cancel_sleeping_lock(request, 1);
                return (EINTR);
        }

        /* Cancelled if some other thread has closed the file */

        if (IS_CANCELLED(request)) {
                flk_cancel_sleeping_lock(request, 1);
                return (EBADF);
        }

        request->l_state &= ~GRANTED_LOCK;
        REMOVE_SLEEP_QUEUE(request);
        return (flk_execute_request(request));
}

/*
 * This routine adds an edge between from and to because from depends
 * on to. If asked to check for deadlock, it checks whether any lock
 * reachable from "from_lock" is owned by the same process
 * as "from_lock".
 * NOTE: It is the caller's responsibility to make sure that the color
 * of the graph is consistent between the calls to flk_add_edge as done
 * in flk_process_request. This routine does not color and check for
 * deadlock explicitly.
 */

static int
flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
    int check_cycle, int update_graph)
{
        edge_t  *edge;
        edge_t  *ep;
        lock_descriptor_t       *vertex;
        lock_descriptor_t *vertex_stack;

        STACK_INIT(vertex_stack);

        /*
         * If the to vertex already has the mark color, just return and
         * don't add an edge, as it is already reachable from the from
         * vertex.
         */

        if (COLORED(to_lock))
                return (0);

        edge = flk_get_edge();

        /*
         * set the from and to vertex
         */

        edge->from_vertex = from_lock;
        edge->to_vertex = to_lock;

        /*
         * put in adjacency list of from vertex
         */

        from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
        edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
        edge->edge_adj_prev = &from_lock->l_edge;
        from_lock->l_edge.edge_adj_next = edge;

        /*
         * put in list of to vertex
         */

        to_lock->l_edge.edge_in_next->edge_in_prev = edge;
        edge->edge_in_next = to_lock->l_edge.edge_in_next;
        to_lock->l_edge.edge_in_next = edge;
        edge->edge_in_prev = &to_lock->l_edge;


        if (update_graph) {
                flk_update_proc_graph(edge, 0);
                return (0);
        }
        if (!check_cycle) {
                return (0);
        }

        STACK_PUSH(vertex_stack, from_lock, l_stack);

        while ((vertex = STACK_TOP(vertex_stack)) != NULL) {

                STACK_POP(vertex_stack, l_stack);

                for (ep = FIRST_ADJ(vertex);
                    ep != HEAD(vertex);
                    ep = NEXT_ADJ(ep)) {
                        if (COLORED(ep->to_vertex))
                                continue;
                        COLOR(ep->to_vertex);
                        if (SAME_OWNER(ep->to_vertex, from_lock))
                                goto dead_lock;
                        STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
                }
        }
        return (0);

dead_lock:

        /*
         * remove all edges
         */

        ep = FIRST_ADJ(from_lock);

        while (ep != HEAD(from_lock)) {
                IN_LIST_REMOVE(ep);
                from_lock->l_sedge = NEXT_ADJ(ep);
                ADJ_LIST_REMOVE(ep);
                flk_free_edge(ep);
                ep = from_lock->l_sedge;
        }
        return (EDEADLK);
}

/*
 * Get an edge structure for representing the dependency between two locks.
 */

static edge_t *
flk_get_edge()
{
        edge_t  *ep;

        ASSERT(flk_edge_cache != NULL);

        ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
        edge_allocs++;
        return (ep);
}

/*
 * Free the edge structure.
 */

static void
flk_free_edge(edge_t *ep)
{
        edge_frees++;
        kmem_cache_free(flk_edge_cache, (void *)ep);
}

/*
 * Check the relationship of 'request' with 'lock' and perform the
 * recomputation of dependencies, break the lock if required, and return
 * 1 if 'request' cannot have any relationship with any further active
 * locks.
 * 'lock' and 'request' are compared and, in case of overlap, 'lock' is
 * deleted and new locks are formed to represent the non-overlapping
 * portions of the original 'lock'. This function has side effects:
 * 'lock' may be freed and new locks may be added to the active list.
 */

static int
flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
{
        int lock_effect;
        lock_descriptor_t *lock1, *lock2;
        lock_descriptor_t *topology[3];
        int nvertex = 0;
        int i;
        edge_t  *ep;
        graph_t *gp = (lock->l_graph);


        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);

        ASSERT(MUTEX_HELD(&gp->gp_mutex));

        topology[0] = topology[1] = topology[2] = NULL;

        if (request->l_type == F_UNLCK)
                lock_effect = FLK_UNLOCK;
        else if (request->l_type == F_RDLCK &&
            lock->l_type == F_WRLCK)
                lock_effect = FLK_DOWNGRADE;
        else if (request->l_type == F_WRLCK &&
            lock->l_type == F_RDLCK)
                lock_effect = FLK_UPGRADE;
        else
                lock_effect = FLK_STAY_SAME;

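        /*
         * 'lock' lies entirely before 'request'.  Coalesce only if the
         * two ranges are adjacent and the lock type stays the same;
         * otherwise this lock and the request are unrelated.
         */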
        if (lock->l_end < request->l_start) {
                if (lock->l_end == request->l_start - 1 &&
                    lock_effect == FLK_STAY_SAME) {
                        topology[0] = request;
                        request->l_start = lock->l_start;
                        nvertex = 1;
                        goto recompute;
                } else {
                        return (0);
                }
        }

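        /*
         * 'lock' lies entirely after 'request'.  Coalesce if the ranges
         * are adjacent and the type stays the same; otherwise return 1,
         * since active locks are kept sorted by start offset and no
         * later lock can overlap 'request'.
         */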
        if (lock->l_start > request->l_end) {
                if (request->l_end == lock->l_start - 1 &&
                    lock_effect == FLK_STAY_SAME) {
                        topology[0] = request;
                        request->l_end = lock->l_end;
                        nvertex = 1;
                        goto recompute;
                } else {
                        return (1);
                }
        }

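        /*
         * The ranges overlap and 'request' ends before 'lock' does; any
         * surviving piece of 'lock' lies to the right of 'request', and
         * also to the left when 'request' starts inside 'lock'.
         */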
        if (request->l_end < lock->l_end) {
                if (request->l_start > lock->l_start) {
                        if (lock_effect == FLK_STAY_SAME) {
                                request->l_start = lock->l_start;
                                request->l_end = lock->l_end;
                                topology[0] = request;
                                nvertex = 1;
                        } else {
                                lock1 = flk_get_lock();
                                lock2 = flk_get_lock();
                                COPY(lock1, lock);
                                COPY(lock2, lock);
                                lock1->l_start = lock->l_start;
                                lock1->l_end = request->l_start - 1;
                                lock2->l_start = request->l_end + 1;
                                lock2->l_end = lock->l_end;
                                topology[0] = lock1;
                                topology[1] = lock2;
                                topology[2] = request;
                                nvertex = 3;
                        }
                } else if (request->l_start < lock->l_start) {
                        if (lock_effect == FLK_STAY_SAME) {
                                request->l_end = lock->l_end;
                                topology[0] = request;
                                nvertex = 1;
                        } else {
                                lock1 = flk_get_lock();
                                COPY(lock1, lock);
                                lock1->l_start = request->l_end + 1;
                                topology[0] = lock1;
                                topology[1] = request;
                                nvertex = 2;
                        }
                } else  {
                        if (lock_effect == FLK_STAY_SAME) {
                                request->l_start = lock->l_start;
                                request->l_end = lock->l_end;
                                topology[0] = request;
                                nvertex = 1;
                        } else {
                                lock1 = flk_get_lock();
                                COPY(lock1, lock);
                                lock1->l_start = request->l_end + 1;
                                topology[0] = lock1;
                                topology[1] = request;
                                nvertex = 2;
                        }
                }
        } else if (request->l_end > lock->l_end) {
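                /*
                 * The ranges overlap and 'request' extends past the end
                 * of 'lock'; any surviving piece of 'lock' lies to the
                 * left of 'request'.
                 */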
                if (request->l_start > lock->l_start)  {
                        if (lock_effect == FLK_STAY_SAME) {
                                request->l_start = lock->l_start;
                                topology[0] = request;
                                nvertex = 1;
                        } else {
                                lock1 = flk_get_lock();
                                COPY(lock1, lock);
                                lock1->l_end = request->l_start - 1;
                                topology[0] = lock1;
                                topology[1] = request;
                                nvertex = 2;
                        }
                } else if (request->l_start < lock->l_start)  {
                        topology[0] = request;
                        nvertex = 1;
                } else {
                        topology[0] = request;
                        nvertex = 1;
                }
        } else {
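                /*
                 * 'request' and 'lock' end at the same offset.  An exact
                 * match that unlocks simply deletes 'lock' and wakes up
                 * its waiters.
                 */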
                if (request->l_start > lock->l_start) {
                        if (lock_effect == FLK_STAY_SAME) {
                                request->l_start = lock->l_start;
                                topology[0] = request;
                                nvertex = 1;
                        } else {
                                lock1 = flk_get_lock();
                                COPY(lock1, lock);
                                lock1->l_end = request->l_start - 1;
                                topology[0] = lock1;
                                topology[1] = request;
                                nvertex = 2;
                        }
                } else if (request->l_start < lock->l_start) {
                        topology[0] = request;
                        nvertex = 1;
                } else {
                        if (lock_effect !=  FLK_UNLOCK) {
                                topology[0] = request;
                                nvertex = 1;
                        } else {
                                flk_delete_active_lock(lock, 0);
                                flk_wakeup(lock, 1);
                                flk_free_lock(lock);
                                CHECK_SLEEPING_LOCKS(gp);
                                CHECK_ACTIVE_LOCKS(gp);
                                return (1);
                        }
                }
        }

recompute:

        /*
         * For an unlock we don't pass 'request' in for recomputing
         * dependencies, because no lock will ever add an edge to it.
         */

        if (lock_effect == FLK_UNLOCK) {
                topology[nvertex-1] = NULL;
                nvertex--;
        }
        for (i = 0; i < nvertex; i++) {
                topology[i]->l_state |= RECOMPUTE_LOCK;
                topology[i]->l_color = NO_COLOR;
        }

        ASSERT(FIRST_ADJ(lock) == HEAD(lock));

        /*
         * Remove each in-edge of this vertex 'lock' from the adjacency
         * list of its blocked (from) vertex.
         */

        ep = FIRST_IN(lock);
        while (ep != HEAD(lock)) {
                ADJ_LIST_REMOVE(ep);
                ep = NEXT_IN(ep);
        }

        flk_delete_active_lock(lock, 0);

        /* We are ready for recomputing the dependencies now */

        flk_recompute_dependencies(lock, topology, nvertex, 1);

        for (i = 0; i < nvertex; i++) {
                topology[i]->l_state &= ~RECOMPUTE_LOCK;
                topology[i]->l_color = NO_COLOR;
        }


        if (lock_effect == FLK_UNLOCK) {
                nvertex++;
        }
        for (i = 0; i < nvertex - 1; i++) {
                flk_insert_active_lock(topology[i]);
        }


        if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
                flk_wakeup(lock, 0);
        } else {
                ep = FIRST_IN(lock);
                while (ep != HEAD(lock)) {
                        lock->l_sedge = NEXT_IN(ep);
                        IN_LIST_REMOVE(ep);
                        flk_update_proc_graph(ep, 1);
                        flk_free_edge(ep);
                        ep = lock->l_sedge;
                }
        }
        flk_free_lock(lock);

        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);
        return (0);
}

/*
 * Insert a lock into the active queue.
 */

static void
flk_insert_active_lock(lock_descriptor_t *new_lock)
{
        graph_t *gp = new_lock->l_graph;
        vnode_t *vp = new_lock->l_vnode;
        lock_descriptor_t *first_lock, *lock;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
        first_lock = lock;

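        /*
         * Active locks for a vnode are kept sorted by starting offset;
         * find the insertion point after any locks that start earlier.
         */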
        if (first_lock != NULL) {
                for (; (lock->l_vnode == vp &&
                    lock->l_start < new_lock->l_start); lock = lock->l_next)
                        ;
        } else {
                lock = ACTIVE_HEAD(gp);
        }

        lock->l_prev->l_next = new_lock;
        new_lock->l_next = lock;
        new_lock->l_prev = lock->l_prev;
        lock->l_prev = new_lock;

        if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
                vp->v_filocks = (struct filock *)new_lock;
        }
        flk_set_state(new_lock, FLK_ACTIVE_STATE);
        new_lock->l_state |= ACTIVE_LOCK;

        CHECK_ACTIVE_LOCKS(gp);
        CHECK_SLEEPING_LOCKS(gp);
}

/*
 * Delete an active lock. Depending on the value of the second parameter,
 * either just remove the lock from the active list or both remove it and
 * free it.
 */

static void
flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
{
        vnode_t *vp = lock->l_vnode;
        graph_t *gp = lock->l_graph;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        if (free_lock)
                ASSERT(NO_DEPENDENTS(lock));
        ASSERT(NOT_BLOCKED(lock));
        ASSERT(IS_ACTIVE(lock));

        ASSERT((vp->v_filocks != NULL));

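        /*
         * If this lock heads the vnode's lock list, advance v_filocks to
         * the next lock on the same vnode, or NULL if there is none.
         */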
        if (vp->v_filocks == (struct filock *)lock) {
                vp->v_filocks = (struct filock *)
                    ((lock->l_next->l_vnode == vp) ? lock->l_next :
                    NULL);
        }
        lock->l_next->l_prev = lock->l_prev;
        lock->l_prev->l_next = lock->l_next;
        lock->l_next = lock->l_prev = NULL;
        flk_set_state(lock, FLK_DEAD_STATE);
        lock->l_state &= ~ACTIVE_LOCK;

        if (free_lock)
                flk_free_lock(lock);
        CHECK_ACTIVE_LOCKS(gp);
        CHECK_SLEEPING_LOCKS(gp);
}

/*
 * Insert into the sleep queue.
 */

static void
flk_insert_sleeping_lock(lock_descriptor_t *request)
{
        graph_t *gp = request->l_graph;
        vnode_t *vp = request->l_vnode;
        lock_descriptor_t       *lock;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        ASSERT(IS_INITIAL(request));

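        /*
         * The sleep queue is kept sorted by vnode address; find the
         * insertion point for this vnode.
         */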
        for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
            lock->l_vnode < vp); lock = lock->l_next)
                ;

        lock->l_prev->l_next = request;
        request->l_prev = lock->l_prev;
        lock->l_prev = request;
        request->l_next = lock;
        flk_set_state(request, FLK_SLEEPING_STATE);
        request->l_state |= SLEEPING_LOCK;
}

/*
 * Cancelling a sleeping lock implies removing a vertex from the
 * dependency graph and therefore we should recompute the dependencies
 * of all vertices that have a path  to this vertex, w.r.t. all
 * vertices reachable from this vertex.
 */

void
flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
{
        graph_t *gp = request->l_graph;
        vnode_t *vp = request->l_vnode;
        lock_descriptor_t **topology = NULL;
        edge_t  *ep;
        lock_descriptor_t *vertex, *lock;
        int nvertex = 0;
        int i;
        lock_descriptor_t *vertex_stack;

        STACK_INIT(vertex_stack);

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        /*
         * Count the number of vertex pointers that have to be allocated:
         * all vertices that are reachable from 'request'.
         */

        STACK_PUSH(vertex_stack, request, l_stack);

        while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
                STACK_POP(vertex_stack, l_stack);
                for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
                    ep = NEXT_ADJ(ep)) {
                        if (IS_RECOMPUTE(ep->to_vertex))
                                continue;
                        ep->to_vertex->l_state |= RECOMPUTE_LOCK;
                        STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
                        nvertex++;
                }
        }

        /*
         * allocate memory for holding the vertex pointers
         */

        if (nvertex) {
                topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
                    KM_SLEEP);
        }

        /*
         * One more pass to actually store the vertices in the
         * allocated array.
         * We check sleeping locks first and then active locks so that
         * the topology array will be in topological order.
         */

        nvertex = 0;
        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                do {
                        if (IS_RECOMPUTE(lock)) {
                                lock->l_index = nvertex;
                                topology[nvertex++] = lock;
                        }
                        lock->l_color = NO_COLOR;
                        lock = lock->l_next;
                } while (lock->l_vnode == vp);
        }

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                do {
                        if (IS_RECOMPUTE(lock)) {
                                lock->l_index = nvertex;
                                topology[nvertex++] = lock;
                        }
                        lock->l_color = NO_COLOR;
                        lock = lock->l_next;
                } while (lock->l_vnode == vp);
        }

        /*
         * Remove the in and out edges of 'request'.
         * They are freed after updating the proc graph below.
         */

        for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
                ADJ_LIST_REMOVE(ep);
        }


        if (remove_from_queue)
                REMOVE_SLEEP_QUEUE(request);

        /* we are ready to recompute */

        flk_recompute_dependencies(request, topology, nvertex, 1);

        ep = FIRST_ADJ(request);
        while (ep != HEAD(request)) {
                IN_LIST_REMOVE(ep);
                request->l_sedge = NEXT_ADJ(ep);
                ADJ_LIST_REMOVE(ep);
                flk_update_proc_graph(ep, 1);
                flk_free_edge(ep);
                ep = request->l_sedge;
        }


        /*
         * unset the RECOMPUTE flag in those vertices
         */

        for (i = 0; i < nvertex; i++) {
                topology[i]->l_state &= ~RECOMPUTE_LOCK;
        }

        /*
         * free the topology
         */
        if (nvertex)
                kmem_free((void *)topology,
                    (nvertex * sizeof (lock_descriptor_t *)));
        /*
         * Some locks may be unblocked now.
         */

        flk_wakeup(request, 0);

        /*
         * we expect to have a correctly recomputed graph  now.
         */
        flk_set_state(request, FLK_DEAD_STATE);
        flk_free_lock(request);
        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);

}

/*
 * Uncoloring the graph simply means incrementing the graph's mark value;
 * only when the mark wraps around do we explicitly reset the color of
 * every vertex in the graph.
 */

static void
flk_graph_uncolor(graph_t *gp)
{
        lock_descriptor_t *lock;

        if (gp->mark == UINT_MAX) {
                gp->mark = 1;
                for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
                    lock = lock->l_next)
                        lock->l_color  = 0;

                for (lock = SLEEPING_HEAD(gp)->l_next;
                    lock != SLEEPING_HEAD(gp); lock = lock->l_next)
                        lock->l_color  = 0;
        } else {
                gp->mark++;
        }
}

/*
 * Wake up locks that are blocked on the given lock.
 */

static void
flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
{
        edge_t  *ep;
        graph_t *gp = lock->l_graph;
        lock_descriptor_t       *lck;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        if (NO_DEPENDENTS(lock))
                return;
        ep = FIRST_IN(lock);
        do {
                /*
                 * Delete the edge from the adjacency list of the from
                 * vertex. If that vertex has no more adjacent edges,
                 * wake up its process.
                 */
                lck = ep->from_vertex;
                if (adj_list_remove)
                        ADJ_LIST_REMOVE(ep);
                flk_update_proc_graph(ep, 1);
                if (NOT_BLOCKED(lck)) {
                        GRANT_WAKEUP(lck);
                }
                lock->l_sedge = NEXT_IN(ep);
                IN_LIST_REMOVE(ep);
                flk_free_edge(ep);
                ep = lock->l_sedge;
        } while (ep != HEAD(lock));
        ASSERT(NO_DEPENDENTS(lock));
}

/*
 * Each dependent of 'request' is checked for dependency against the
 * locks in topology (called topology because the array is, and should be,
 * in topological order for this algorithm; if it is not, the inner loop
 * below might add more edges than necessary. A topological ordering of
 * the vertices satisfies the property that all edges go from left to
 * right, i.e., topology[i] can have an edge to topology[j] only if i < j).
 * If lock l1 in the dependent set of 'request' is dependent on (blocked
 * by) lock l2 in topology but does not have a path to it, we add an edge
 * in the inner loop below.
 *
 * We don't want to add an edge between l1 and l2 if there already exists
 * a path from l1 to l2, so care has to be taken with those vertices that
 * have two paths to 'request'. These vertices are referred to here as
 * barrier locks.
 *
 * The barriers (those vertices that originally had two paths to
 * 'request') have to be found, because otherwise we may end up adding
 * unnecessary edges to vertices in topology: a barrier vertex could get
 * an edge to a vertex in topology even though it already has a path
 * to it.
 */

static void
flk_recompute_dependencies(lock_descriptor_t *request,
    lock_descriptor_t **topology, int nvertex, int update_graph)
{
        lock_descriptor_t *vertex, *lock;
        graph_t *gp = request->l_graph;
        int i, count;
        int barrier_found = 0;
        edge_t  *ep;
        lock_descriptor_t *vertex_stack;

        STACK_INIT(vertex_stack);

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        if (nvertex == 0)
                return;
        flk_graph_uncolor(request->l_graph);
        barrier_found = flk_find_barriers(request);
        request->l_state |= RECOMPUTE_DONE;

        STACK_PUSH(vertex_stack, request, l_stack);
        request->l_sedge = FIRST_IN(request);


        while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
                if (vertex->l_state & RECOMPUTE_DONE) {
                        count = 0;
                        goto next_in_edge;
                }
                if (IS_BARRIER(vertex)) {
                        /* decrement the barrier count */
                        if (vertex->l_index) {
                                vertex->l_index--;
                                /* this guy will be pushed again anyway ? */
                                STACK_POP(vertex_stack, l_stack);
                                if (vertex->l_index == 0)  {
                                        /*
                                         * The barrier is over; we can
                                         * recompute dependencies for
                                         * this lock on the next stack
                                         * pop.
                                         */
                                        vertex->l_state &= ~BARRIER_LOCK;
                                }
                                continue;
                        }
                }
                vertex->l_state |= RECOMPUTE_DONE;
                flk_graph_uncolor(gp);
                count = flk_color_reachables(vertex);
                for (i = 0; i < nvertex; i++) {
                        lock = topology[i];
                        if (COLORED(lock))
                                continue;
                        if (BLOCKS(lock, vertex)) {
                                (void) flk_add_edge(vertex, lock,
                                    NO_CHECK_CYCLE, update_graph);
                                COLOR(lock);
                                count++;
                                count += flk_color_reachables(lock);
                        }

                }

next_in_edge:
                if (count == nvertex ||
                    vertex->l_sedge == HEAD(vertex)) {
                        /* prune the tree below this */
                        STACK_POP(vertex_stack, l_stack);
                        vertex->l_state &= ~RECOMPUTE_DONE;
                        /* update the barrier locks below this! */
                        if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
                                flk_graph_uncolor(gp);
                                flk_update_barriers(vertex);
                        }
                        continue;
                }

                ep = vertex->l_sedge;
                lock = ep->from_vertex;
                STACK_PUSH(vertex_stack, lock, l_stack);
                lock->l_sedge = FIRST_IN(lock);
                vertex->l_sedge = NEXT_IN(ep);
        }

}

/*
 * Color all vertices reachable from 'vertex' and return a count of those
 * that belong to topology (i.e., have RECOMPUTE_LOCK set in their state)
 * and were not already colored.
 *
 * Note: we need to use a different stack_link, l_stack1, because this is
 * called from flk_recompute_dependencies(), which already uses a stack
 * with l_stack as the stack_link.
 */

static int
flk_color_reachables(lock_descriptor_t *vertex)
{
        lock_descriptor_t *ver, *lock;
        int count;
        edge_t  *ep;
        lock_descriptor_t *vertex_stack;

        STACK_INIT(vertex_stack);

        STACK_PUSH(vertex_stack, vertex, l_stack1);
        count = 0;
        while ((ver = STACK_TOP(vertex_stack)) != NULL) {

                STACK_POP(vertex_stack, l_stack1);
                for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
                    ep = NEXT_ADJ(ep)) {
                        lock = ep->to_vertex;
                        if (COLORED(lock))
                                continue;
                        COLOR(lock);
                        if (IS_RECOMPUTE(lock))
                                count++;
                        STACK_PUSH(vertex_stack, lock, l_stack1);
                }

        }
        return (count);
}

/*
 * Called from flk_recompute_dependencies(), this routine decrements
 * the barrier count of barrier vertices that are reachable (following
 * in-edges) from 'lock', i.e. the dependents of 'lock'.
 */

static void
flk_update_barriers(lock_descriptor_t *lock)
{
        lock_descriptor_t *vertex, *lck;
        edge_t  *ep;
        lock_descriptor_t *vertex_stack;

        STACK_INIT(vertex_stack);

        STACK_PUSH(vertex_stack, lock, l_stack1);

        while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
                STACK_POP(vertex_stack, l_stack1);
                for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
                    ep = NEXT_IN(ep)) {
                        lck = ep->from_vertex;
                        if (COLORED(lck)) {
                                if (IS_BARRIER(lck)) {
                                        ASSERT(lck->l_index > 0);
                                        lck->l_index--;
                                        if (lck->l_index == 0)
                                                lck->l_state &= ~BARRIER_LOCK;
                                }
                                continue;
                        }
                        COLOR(lck);
                        if (IS_BARRIER(lck)) {
                                ASSERT(lck->l_index > 0);
                                lck->l_index--;
                                if (lck->l_index == 0)
                                        lck->l_state &= ~BARRIER_LOCK;
                        }
                        STACK_PUSH(vertex_stack, lck, l_stack1);
                }
        }
}

/*
 * Finds all vertices that are reachable (following in-edges) from 'lock'
 * more than once, marks them as barrier vertices, and increments their
 * barrier count.  The barrier count is one less than the total number of
 * paths from 'lock' to that vertex.
 */

static int
flk_find_barriers(lock_descriptor_t *lock)
{
        lock_descriptor_t *vertex, *lck;
        int found = 0;
        edge_t  *ep;
        lock_descriptor_t *vertex_stack;

        STACK_INIT(vertex_stack);

        STACK_PUSH(vertex_stack, lock, l_stack1);

        while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
                STACK_POP(vertex_stack, l_stack1);
                for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
                    ep = NEXT_IN(ep)) {
                        lck = ep->from_vertex;
                        if (COLORED(lck)) {
                                /* this is a barrier */
                                lck->l_state |= BARRIER_LOCK;
                                /* index will have barrier count */
                                lck->l_index++;
                                if (!found)
                                        found = 1;
                                continue;
                        }
                        COLOR(lck);
                        lck->l_index = 0;
                        STACK_PUSH(vertex_stack, lck, l_stack1);
                }
        }
        return (found);
}

/*
 * Finds the first lock that is mainly responsible for blocking this
 * request.  If there is no such lock, request->l_flock.l_type is set to
 * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
 * of the blocking lock.
 *
 * Note: It is possible a request is blocked by a sleeping lock because
 * of the fairness policy used in flk_process_request() to construct the
 * dependencies. (see comments before flk_process_request()).
 */

static void
flk_get_first_blocking_lock(lock_descriptor_t *request)
{
        graph_t *gp = request->l_graph;
        vnode_t *vp = request->l_vnode;
        lock_descriptor_t *lock, *blocker;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        blocker = NULL;
        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                do {
                        if (BLOCKS(lock, request)) {
                                blocker = lock;
                                break;
                        }
                        lock = lock->l_next;
                } while (lock->l_vnode == vp);
        }

        if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
                /*
                 * No active lock is blocking this request, but if a read
                 * lock is requested, it may also get blocked by a waiting
                 * writer. So search all sleeping locks and see if there is
                 * a writer waiting.
                 */
                SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
                if (lock) {
                        do {
                                if (BLOCKS(lock, request)) {
                                        blocker = lock;
                                        break;
                                }
                                lock = lock->l_next;
                        } while (lock->l_vnode == vp);
                }
        }

        if (blocker) {
                report_blocker(blocker, request);
        } else
                request->l_flock.l_type = F_UNLCK;
}

/*
 * Get the graph_t structure associated with a vnode.
 * If 'initialize' is FLK_INIT_GRAPH, and the graph_t structure for this
 * vnode has not yet been initialized, then a new one is allocated and
 * returned.
 */
graph_t *
flk_get_lock_graph(vnode_t *vp, int initialize)
{
        graph_t *gp;
        graph_t *gp_alloc = NULL;
        int index = HASH_INDEX(vp);

        if (initialize == FLK_USE_GRAPH) {
                mutex_enter(&flock_lock);
                gp = lock_graph[index];
                mutex_exit(&flock_lock);
                return (gp);
        }

        ASSERT(initialize == FLK_INIT_GRAPH);

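        /*
         * Optimistically pre-allocate and initialize a graph without
         * holding flock_lock; the slot is re-checked under the lock
         * below.
         */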
        if (lock_graph[index] == NULL) {

                gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);

                /* Initialize the graph */

                gp_alloc->active_locks.l_next =
                    gp_alloc->active_locks.l_prev =
                    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
                gp_alloc->sleeping_locks.l_next =
                    gp_alloc->sleeping_locks.l_prev =
                    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
                gp_alloc->index = index;
                mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
        }

        mutex_enter(&flock_lock);

        gp = lock_graph[index];

        /* Recheck the value within flock_lock */
        if (gp == NULL) {
                struct flock_globals *fg;

                /* We must have previously allocated the graph_t structure */
                ASSERT(gp_alloc != NULL);
                lock_graph[index] = gp = gp_alloc;
                /*
                 * The lockmgr status is only needed if KLM is loaded.
                 */
                if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
                        fg = flk_get_globals();
                        fg->lockmgr_status[index] = fg->flk_lockmgr_status;
                }
        }

        mutex_exit(&flock_lock);

        if ((gp_alloc != NULL) && (gp != gp_alloc)) {
                /* There was a race to allocate the graph_t and we lost */
                mutex_destroy(&gp_alloc->gp_mutex);
                kmem_free(gp_alloc, sizeof (graph_t));
        }

        return (gp);
}

/*
 * PSARC case 1997/292
 */
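/*
 * Determine whether the given vnode has any active or sleeping lock
 * manager locks whose NLM id matches 'nlmid'.  Returns non-zero if so,
 * and 0 immediately if this node is not booted as a cluster.
 */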
int
cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
{
        lock_descriptor_t *lock;
        int result = 0;
        graph_t *gp;
        int                     lock_nlmid;

        /*
         * Check to see if node is booted as a cluster. If not, return.
         */
        if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
                return (0);
        }

        gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
        if (gp == NULL) {
                return (0);
        }

        mutex_enter(&gp->gp_mutex);

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                while (lock->l_vnode == vp) {
                        /* get NLM id from sysid */
                        lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

                        /*
                         * If NLM server request _and_ nlmid of lock matches
                         * nlmid of argument, then we've found a remote lock.
                         */
                        if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
                                result = 1;
                                goto done;
                        }
                        lock = lock->l_next;
                }
        }

        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                while (lock->l_vnode == vp) {
                        /* get NLM id from sysid */
                        lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

                        /*
                         * If NLM server request _and_ nlmid of lock matches
                         * nlmid of argument, then we've found a remote lock.
                         */
                        if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
                                result = 1;
                                goto done;
                        }
                        lock = lock->l_next;
                }
        }

done:
        mutex_exit(&gp->gp_mutex);
        return (result);
}

/*
 * Determine whether there are any locks for the given vnode with a remote
 * sysid.  Returns zero if not, non-zero if there are.
 *
 * Note that the return value from this function is potentially invalid
 * once it has been returned.  The caller is responsible for providing its
 * own synchronization mechanism to ensure that the return value is useful
 * (e.g., see nfs_lockcompletion()).
 */
int
flk_has_remote_locks(vnode_t *vp)
{
        lock_descriptor_t *lock;
        int result = 0;
        graph_t *gp;

        gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
        if (gp == NULL) {
                return (0);
        }

        mutex_enter(&gp->gp_mutex);

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                while (lock->l_vnode == vp) {
                        if (IS_REMOTE(lock)) {
                                result = 1;
                                goto done;
                        }
                        lock = lock->l_next;
                }
        }

        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                while (lock->l_vnode == vp) {
                        if (IS_REMOTE(lock)) {
                                result = 1;
                                goto done;
                        }
                        lock = lock->l_next;
                }
        }

done:
        mutex_exit(&gp->gp_mutex);
        return (result);
}

/*
 * Determine whether there are any locks for the given vnode with a remote
 * sysid matching given sysid.
 * Used by the new (open source) NFS Lock Manager (NLM)
 */
int
flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
{
        lock_descriptor_t *lock;
        int result = 0;
        graph_t *gp;

        if (sysid == 0)
                return (0);

        gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
        if (gp == NULL) {
                return (0);
        }

        mutex_enter(&gp->gp_mutex);

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                while (lock->l_vnode == vp) {
                        if (lock->l_flock.l_sysid == sysid) {
                                result = 1;
                                goto done;
                        }
                        lock = lock->l_next;
                }
        }

        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                while (lock->l_vnode == vp) {
                        if (lock->l_flock.l_sysid == sysid) {
                                result = 1;
                                goto done;
                        }
                        lock = lock->l_next;
                }
        }

done:
        mutex_exit(&gp->gp_mutex);
        return (result);
}

/*
 * Determine if there are any locks owned by the given sysid.
 * Returns zero if not, non-zero if there are.  Note that this return code
 * could be derived from flk_get_{sleeping,active}_locks, but this routine
 * avoids all the memory allocations of those routines.
 *
 * This routine has the same synchronization issues as
 * flk_has_remote_locks.
 */

int
flk_sysid_has_locks(int sysid, int lck_type)
{
        int             has_locks = 0;
        lock_descriptor_t       *lock;
        graph_t         *gp;
        int             i;

        for (i = 0; i < HASH_SIZE && !has_locks; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);
                if (gp == NULL) {
                        continue;
                }

                mutex_enter(&gp->gp_mutex);

                if (lck_type & FLK_QUERY_ACTIVE) {
                        for (lock = ACTIVE_HEAD(gp)->l_next;
                            lock != ACTIVE_HEAD(gp) && !has_locks;
                            lock = lock->l_next) {
                                if (lock->l_flock.l_sysid == sysid)
                                        has_locks = 1;
                        }
                }

                if (lck_type & FLK_QUERY_SLEEPING) {
                        for (lock = SLEEPING_HEAD(gp)->l_next;
                            lock != SLEEPING_HEAD(gp) && !has_locks;
                            lock = lock->l_next) {
                                if (lock->l_flock.l_sysid == sysid)
                                        has_locks = 1;
                        }
                }
                mutex_exit(&gp->gp_mutex);
        }

        return (has_locks);
}


/*
 * PSARC case 1997/292
 *
 * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is a 16-bit
 *  quantity, the real sysid generated by the NLM server; the upper half
 *  identifies the node of the cluster where the NLM server ran.
 *  This routine is only called by an NLM server running in a cluster.
 * Effects: Remove all locks held on behalf of the client identified
 *  by "sysid."
 */
void
cl_flk_remove_locks_by_sysid(int sysid)
{
        graph_t *gp;
        int i;
        lock_descriptor_t *lock, *nlock;

        /*
         * Check to see if node is booted as a cluster. If not, return.
         */
        if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
                return;
        }

        ASSERT(sysid != 0);
        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);

                if (gp == NULL)
                        continue;

                mutex_enter(&gp->gp_mutex);     /*  get mutex on lock graph */

                /* signal sleeping requests so that they bail out */
                lock = SLEEPING_HEAD(gp)->l_next;
                while (lock != SLEEPING_HEAD(gp)) {
                        nlock = lock->l_next;
                        if (lock->l_flock.l_sysid == sysid) {
                                INTERRUPT_WAKEUP(lock);
                        }
                        lock = nlock;
                }

                /* delete active locks */
                lock = ACTIVE_HEAD(gp)->l_next;
                while (lock != ACTIVE_HEAD(gp)) {
                        nlock = lock->l_next;
                        if (lock->l_flock.l_sysid == sysid) {
                                flk_delete_active_lock(lock, 0);
                                flk_wakeup(lock, 1);
                                flk_free_lock(lock);
                        }
                        lock = nlock;
                }
                mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
        }
}

/*
 * Delete all locks in the system that belong to the sysid of the request.
 */

static void
flk_delete_locks_by_sysid(lock_descriptor_t *request)
{
        int     sysid  = request->l_flock.l_sysid;
        lock_descriptor_t *lock, *nlock;
        graph_t *gp;
        int i;

        ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
        ASSERT(sysid != 0);

        mutex_exit(&request->l_graph->gp_mutex);

        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);

                if (gp == NULL)
                        continue;

                mutex_enter(&gp->gp_mutex);

                /* signal sleeping requests so that they bail out */
                lock = SLEEPING_HEAD(gp)->l_next;
                while (lock != SLEEPING_HEAD(gp)) {
                        nlock = lock->l_next;
                        if (lock->l_flock.l_sysid == sysid) {
                                INTERRUPT_WAKEUP(lock);
                        }
                        lock = nlock;
                }

                /* delete active locks */
                lock = ACTIVE_HEAD(gp)->l_next;
                while (lock != ACTIVE_HEAD(gp)) {
                        nlock = lock->l_next;
                        if (lock->l_flock.l_sysid == sysid) {
                                flk_delete_active_lock(lock, 0);
                                flk_wakeup(lock, 1);
                                flk_free_lock(lock);
                        }
                        lock = nlock;
                }
                mutex_exit(&gp->gp_mutex);
        }

        mutex_enter(&request->l_graph->gp_mutex);
}

/*
 * Clustering: Deletes PXFS locks
 * Effects: Delete all locks on files in the given file system and with the
 *  given PXFS id.
 */
void
cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
{
        lock_descriptor_t *lock, *nlock;
        graph_t *gp;
        int i;

        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);

                if (gp == NULL)
                        continue;

                mutex_enter(&gp->gp_mutex);

                /* signal sleeping requests so that they bail out */
                lock = SLEEPING_HEAD(gp)->l_next;
                while (lock != SLEEPING_HEAD(gp)) {
                        nlock = lock->l_next;
                        if (lock->l_vnode->v_vfsp == vfsp) {
                                ASSERT(IS_PXFS(lock));
                                if (GETPXFSID(lock->l_flock.l_sysid) ==
                                    pxfsid) {
                                        flk_set_state(lock,
                                            FLK_CANCELLED_STATE);
                                        flk_cancel_sleeping_lock(lock, 1);
                                }
                        }
                        lock = nlock;
                }

                /* delete active locks */
                lock = ACTIVE_HEAD(gp)->l_next;
                while (lock != ACTIVE_HEAD(gp)) {
                        nlock = lock->l_next;
                        if (lock->l_vnode->v_vfsp == vfsp) {
                                ASSERT(IS_PXFS(lock));
                                if (GETPXFSID(lock->l_flock.l_sysid) ==
                                    pxfsid) {
                                        flk_delete_active_lock(lock, 0);
                                        flk_wakeup(lock, 1);
                                        flk_free_lock(lock);
                                }
                        }
                        lock = nlock;
                }
                mutex_exit(&gp->gp_mutex);
        }
}

/*
 * Search for a sleeping lock manager lock which matches exactly this lock
 * request; if one is found, fake a signal to cancel it.
 *
 * Return 1 if a matching lock was found, 0 otherwise.
 */

static int
flk_canceled(lock_descriptor_t *request)
{
        lock_descriptor_t *lock, *nlock;
        graph_t *gp = request->l_graph;
        vnode_t *vp = request->l_vnode;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        ASSERT(IS_LOCKMGR(request));
        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                while (lock->l_vnode == vp) {
                        nlock = lock->l_next;
                        if (SAME_OWNER(lock, request) &&
                            lock->l_start == request->l_start &&
                            lock->l_end == request->l_end) {
                                INTERRUPT_WAKEUP(lock);
                                return (1);
                        }
                        lock = nlock;
                }
        }
        return (0);
}

/*
 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
 * That is, since OFD locks are pid-less we'll never match on the incoming
 * pid. OFD locks are removed earlier in the close() path via closef() and
 * ofdcleanlock().
 */
void
cleanlocks(vnode_t *vp, pid_t pid, int sysid)
{
        graph_t *gp;
        lock_descriptor_t *lock, *nlock;
        lock_descriptor_t *link_stack;

        STACK_INIT(link_stack);

        gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

        if (gp == NULL)
                return;
        mutex_enter(&gp->gp_mutex);

        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);

        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                do {
                        nlock = lock->l_next;
                        if ((lock->l_flock.l_pid == pid ||
                            pid == IGN_PID) &&
                            lock->l_flock.l_sysid == sysid) {
                                CANCEL_WAKEUP(lock);
                        }
                        lock = nlock;
                } while (lock->l_vnode == vp);
        }

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                do {
                        nlock = lock->l_next;
                        if ((lock->l_flock.l_pid == pid ||
                            pid == IGN_PID) &&
                            lock->l_flock.l_sysid == sysid) {
                                flk_delete_active_lock(lock, 0);
                                STACK_PUSH(link_stack, lock, l_stack);
                        }
                        lock = nlock;
                } while (lock->l_vnode == vp);
        }

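        /*
         * Now that the scan of the active list is complete, wake up the
         * waiters of each removed lock and free it.
         */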
        while ((lock = STACK_TOP(link_stack)) != NULL) {
                STACK_POP(link_stack, l_stack);
                flk_wakeup(lock, 1);
                flk_free_lock(lock);
        }

        CHECK_SLEEPING_LOCKS(gp);
        CHECK_ACTIVE_LOCKS(gp);
        CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
        mutex_exit(&gp->gp_mutex);
}


/*
 * Called from 'fs' read and write routines for files that have mandatory
 * locking enabled.
 */

int
chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
    caller_context_t *ct)
{
        register int    i;
        struct flock64  bf;
        int             error = 0;

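        /*
         * Build a lock query for the I/O range: a write is checked as
         * F_WRLCK (conflicts with any lock), a read as F_RDLCK (conflicts
         * only with write locks).  If nothing blocks the range, l_type
         * comes back as F_UNLCK and the I/O is allowed.
         */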
        bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
        bf.l_whence = 0;
        bf.l_start = offset;
        bf.l_len = len;
        if (ct == NULL) {
                bf.l_pid = curproc->p_pid;
                bf.l_sysid = 0;
        } else {
                bf.l_pid = ct->cc_pid;
                bf.l_sysid = ct->cc_sysid;
        }
        i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
        if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
            bf.l_type != F_UNLCK)
                error = i ? i : EAGAIN;
        return (error);
}

/*
 * convoff - convert the given lock data (l_start, l_whence) to be
 * relative to the given whence.
 */
int
convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
{
        int             error;
        struct vattr    vattr;

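        /*
         * Whence values follow lseek(2): 0 is the start of the file,
         * 1 is the given offset, and 2 is the end of the file; the file
         * size is fetched only when an end-relative value is involved.
         */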
        if ((lckdat->l_whence == 2) || (whence == 2)) {
                vattr.va_mask = AT_SIZE;
                if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
                        return (error);
        }

        switch (lckdat->l_whence) {
        case 1:
                lckdat->l_start += offset;
                break;
        case 2:
                lckdat->l_start += vattr.va_size;
                /* FALLTHRU */
        case 0:
                break;
        default:
                return (EINVAL);
        }

        if (lckdat->l_start < 0)
                return (EINVAL);

        switch (whence) {
        case 1:
                lckdat->l_start -= offset;
                break;
        case 2:
                lckdat->l_start -= vattr.va_size;
                /* FALLTHRU */
        case 0:
                break;
        default:
                return (EINVAL);
        }

        lckdat->l_whence = (short)whence;
        return (0);
}


/*      proc_graph function definitions */

/*
 * Check for a deadlock caused by the new 'lock'. If a deadlock is found,
 * the edges of this lock are freed and 1 is returned.
 */

static int
flk_check_deadlock(lock_descriptor_t *lock)
{
        proc_vertex_t   *start_vertex, *pvertex;
        proc_vertex_t *dvertex;
        proc_edge_t *pep, *ppep;
        edge_t  *ep, *nep;
        proc_vertex_t *process_stack;

        /*
         * OFD style locks are not associated with any process so there is
         * no proc graph for these. Thus we cannot, and do not, do deadlock
         * detection.
         */
        if (lock->l_ofd != NULL)
                return (0);

        STACK_INIT(process_stack);

        mutex_enter(&flock_lock);
        start_vertex = flk_get_proc_vertex(lock);
        ASSERT(start_vertex != NULL);

        /* construct the edges from this process to other processes */

        ep = FIRST_ADJ(lock);
        while (ep != HEAD(lock)) {
                proc_vertex_t *adj_proc;

                adj_proc = flk_get_proc_vertex(ep->to_vertex);
                for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
                        if (pep->to_proc == adj_proc) {
                                ASSERT(pep->refcount);
                                pep->refcount++;
                                break;
                        }
                }
                if (pep == NULL) {
                        pep = flk_get_proc_edge();
                        pep->to_proc = adj_proc;
                        pep->refcount = 1;
                        adj_proc->incount++;
                        pep->next = start_vertex->edge;
                        start_vertex->edge = pep;
                }
                ep = NEXT_ADJ(ep);
        }

        ep = FIRST_IN(lock);

        while (ep != HEAD(lock)) {
                proc_vertex_t *in_proc;

                in_proc = flk_get_proc_vertex(ep->from_vertex);

                for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
                        if (pep->to_proc == start_vertex) {
                                ASSERT(pep->refcount);
                                pep->refcount++;
                                break;
                        }
                }
                if (pep == NULL) {
                        pep = flk_get_proc_edge();
                        pep->to_proc = start_vertex;
                        pep->refcount = 1;
                        start_vertex->incount++;
                        pep->next = in_proc->edge;
                        in_proc->edge = pep;
                }
                ep = NEXT_IN(ep);
        }

        if (start_vertex->incount == 0) {
                mutex_exit(&flock_lock);
                return (0);
        }

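        /*
         * Walk the process graph from start_vertex with an explicit
         * stack.  Reaching a vertex that has been arrived at but not yet
         * departed from means we have found a cycle, i.e. a deadlock.
         */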
        flk_proc_graph_uncolor();

        start_vertex->p_sedge = start_vertex->edge;

        STACK_PUSH(process_stack, start_vertex, p_stack);

        while ((pvertex = STACK_TOP(process_stack)) != NULL) {
                for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
                        dvertex = pep->to_proc;
                        if (!PROC_ARRIVED(dvertex)) {
                                STACK_PUSH(process_stack, dvertex, p_stack);
                                dvertex->p_sedge = dvertex->edge;
                                PROC_ARRIVE(pvertex);
                                pvertex->p_sedge = pep->next;
                                break;
                        }
                        if (!PROC_DEPARTED(dvertex))
                                goto deadlock;
                }
                if (pep == NULL) {
                        PROC_DEPART(pvertex);
                        STACK_POP(process_stack, p_stack);
                }
        }
        mutex_exit(&flock_lock);
        return (0);

deadlock:

        /* we remove all lock edges and proc edges */

        ep = FIRST_ADJ(lock);
        while (ep != HEAD(lock)) {
                proc_vertex_t *adj_proc;
                adj_proc = flk_get_proc_vertex(ep->to_vertex);
                nep = NEXT_ADJ(ep);
                IN_LIST_REMOVE(ep);
                ADJ_LIST_REMOVE(ep);
                flk_free_edge(ep);
                ppep = start_vertex->edge;
                for (pep = start_vertex->edge; pep != NULL; ppep = pep,
                    pep = ppep->next) {
                        if (pep->to_proc == adj_proc) {
                                pep->refcount--;
                                if (pep->refcount == 0) {
                                        if (pep == ppep) {
                                                start_vertex->edge = pep->next;
                                        } else {
                                                ppep->next = pep->next;
                                        }
                                        adj_proc->incount--;
                                        flk_proc_release(adj_proc);
                                        flk_free_proc_edge(pep);
                                }
                                break;
                        }
                }
                ep = nep;
        }
        ep = FIRST_IN(lock);
        while (ep != HEAD(lock)) {
                proc_vertex_t *in_proc;
                in_proc = flk_get_proc_vertex(ep->from_vertex);
                nep = NEXT_IN(ep);
                IN_LIST_REMOVE(ep);
                ADJ_LIST_REMOVE(ep);
                flk_free_edge(ep);
                ppep = in_proc->edge;
                for (pep = in_proc->edge; pep != NULL; ppep = pep,
                    pep = ppep->next) {
                        if (pep->to_proc == start_vertex) {
                                pep->refcount--;
                                if (pep->refcount == 0) {
                                        if (pep == ppep) {
                                                in_proc->edge = pep->next;
                                        } else {
                                                ppep->next = pep->next;
                                        }
                                        start_vertex->incount--;
                                        flk_proc_release(in_proc);
                                        flk_free_proc_edge(pep);
                                }
                                break;
                        }
                }
                ep = nep;
        }
        flk_proc_release(start_vertex);
        mutex_exit(&flock_lock);
        return (1);
}

/*
 * Get a proc vertex. If the lock's cached pvertex index refers to the
 * correct proc vertex in the list we return that; otherwise we search the
 * list and, failing that, allocate a new one. If necessary, we also grow
 * the list of vertices.
 */

static proc_vertex_t *
flk_get_proc_vertex(lock_descriptor_t *lock)
{
        int i;
        proc_vertex_t   *pv;
        proc_vertex_t   **palloc;

        ASSERT(MUTEX_HELD(&flock_lock));
        if (lock->pvertex != -1) {
                ASSERT(lock->pvertex >= 0);
                pv = pgraph.proc[lock->pvertex];
                if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
                        return (pv);
                }
        }
        for (i = 0; i < pgraph.gcount; i++) {
                pv = pgraph.proc[i];
                if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
                        lock->pvertex = pv->index = i;
                        return (pv);
                }
        }
        pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
        pv->pid = lock->l_flock.l_pid;
        pv->sysid = lock->l_flock.l_sysid;
        flk_proc_vertex_allocs++;
        if (pgraph.free != 0) {
                for (i = 0; i < pgraph.gcount; i++) {
                        if (pgraph.proc[i] == NULL) {
                                pgraph.proc[i] = pv;
                                lock->pvertex = pv->index = i;
                                pgraph.free--;
                                return (pv);
                        }
                }
        }
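        /*
         * No free slot was found; grow the vertex array by PROC_CHUNK
         * entries, copy over the old pointers, and install the new vertex
         * in the first slot of the newly added chunk.
         */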
        palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
            sizeof (proc_vertex_t *), KM_SLEEP);

        if (pgraph.proc) {
                bcopy(pgraph.proc, palloc,
                    pgraph.gcount * sizeof (proc_vertex_t *));

                kmem_free(pgraph.proc,
                    pgraph.gcount * sizeof (proc_vertex_t *));
        }
        pgraph.proc = palloc;
        pgraph.free += (PROC_CHUNK - 1);
        pv->index = lock->pvertex = pgraph.gcount;
        pgraph.gcount += PROC_CHUNK;
        pgraph.proc[pv->index] = pv;
        return (pv);
}

/*
 * Allocate a proc edge.
 */

static proc_edge_t *
flk_get_proc_edge()
{
        proc_edge_t *pep;

        pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
        flk_proc_edge_allocs++;
        return (pep);
}

/*
 * Free the proc edge. Called whenever its reference count goes to zero.
 */

static void
flk_free_proc_edge(proc_edge_t *pep)
{
        ASSERT(pep->refcount == 0);
        kmem_free((void *)pep, sizeof (proc_edge_t));
        flk_proc_edge_frees++;
}

/*
 * Uncolor the proc graph.  An explicit reset of every vertex is done only
 * when the mark value hits its maximum; otherwise the mark is simply bumped.
 */

static void
flk_proc_graph_uncolor()
{
        int i;

        if (pgraph.mark == UINT_MAX) {
                for (i = 0; i < pgraph.gcount; i++)
                        if (pgraph.proc[i] != NULL) {
                                pgraph.proc[i]->atime = 0;
                                pgraph.proc[i]->dtime = 0;
                        }
                pgraph.mark = 1;
        } else {
                pgraph.mark++;
        }
}

/*
 * Release the proc vertex iff it has neither incoming nor outgoing edges.
 */

static void
flk_proc_release(proc_vertex_t *proc)
{
        ASSERT(MUTEX_HELD(&flock_lock));
        if (proc->edge == NULL && proc->incount == 0) {
                pgraph.proc[proc->index] = NULL;
                pgraph.free++;
                kmem_free(proc, sizeof (proc_vertex_t));
                flk_proc_vertex_frees++;
        }
}

/*
 * Update the process graph to reflect a change in a lock graph.
 * Note: this function should be called only after the lock graph has been
 * correctly recomputed; otherwise a deadlock might be missed.  For example,
 * flk_relation() calls this function after flk_recompute_dependencies();
 * otherwise, a process trying to lock a vnode hashed into another graph
 * could sleep forever.
 */

static void
flk_update_proc_graph(edge_t *ep, int delete)
{
        proc_vertex_t *toproc, *fromproc;
        proc_edge_t *pep, *prevpep;

        mutex_enter(&flock_lock);

        /*
         * OFD style locks are not associated with any process so there is
         * no proc graph for these.
         */
        if (ep->from_vertex->l_ofd != NULL) {
                mutex_exit(&flock_lock);
                return;
        }

        toproc = flk_get_proc_vertex(ep->to_vertex);
        fromproc = flk_get_proc_vertex(ep->from_vertex);

        if (!delete)
                goto add;
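        /*
         * Deleting a dependency: find the proc edge from fromproc to
         * toproc and drop one reference.  When the reference count hits
         * zero the edge is unlinked and freed, and the vertices are
         * released if they have no remaining edges.
         */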
        pep = prevpep = fromproc->edge;

        ASSERT(pep != NULL);
        while (pep != NULL) {
                if (pep->to_proc == toproc) {
                        ASSERT(pep->refcount > 0);
                        pep->refcount--;
                        if (pep->refcount == 0) {
                                if (pep == prevpep) {
                                        fromproc->edge = pep->next;
                                } else {
                                        prevpep->next = pep->next;
                                }
                                toproc->incount--;
                                flk_proc_release(toproc);
                                flk_free_proc_edge(pep);
                        }
                        break;
                }
                prevpep = pep;
                pep = pep->next;
        }
        flk_proc_release(fromproc);
        mutex_exit(&flock_lock);
        return;
add:
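        /*
         * Adding a dependency: if a proc edge from fromproc to toproc
         * already exists, just bump its reference count; otherwise
         * allocate a new edge and link it onto fromproc's edge list.
         */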

        pep = fromproc->edge;

        while (pep != NULL) {
                if (pep->to_proc == toproc) {
                        ASSERT(pep->refcount > 0);
                        pep->refcount++;
                        break;
                }
                pep = pep->next;
        }
        if (pep == NULL) {
                pep = flk_get_proc_edge();
                pep->to_proc = toproc;
                pep->refcount = 1;
                toproc->incount++;
                pep->next = fromproc->edge;
                fromproc->edge = pep;
        }
        mutex_exit(&flock_lock);
}

/*
 * Set the control status for lock manager requests.
 */

/*
 * PSARC case 1997/292
 *
 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
 * Effects: Set the state of the NLM server identified by "nlmid"
 *   in the NLM registry to state "nlm_state."
 *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
 *   NLM server to this LLM.
 *   Note that when this routine is called with NLM_SHUTTING_DOWN there
 *   may be lock requests that have gotten started but not finished.  In
 *   particular, there may be blocking requests that are in the callback code
 *   before sleeping (so they're not holding the lock for the graph).  If
 *   such a thread reacquires the graph's lock (to go to sleep) after the
 *   NLM state in the NLM registry is set to a non-up value, it will notice
 *   the status and bail out.  If the request gets granted before the thread
 *   can check the NLM registry, let it continue normally.  It will get
 *   flushed when we are called with NLM_DOWN.
 *
 * Modifies: nlm_reg_obj (global)
 * Arguments:
 *    nlmid     (IN):    id uniquely identifying an NLM server
 *    nlm_state (IN):    NLM server state to change "nlmid" to
 */
void
cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
{
        /*
         * Check to see if node is booted as a cluster. If not, return.
         */
        if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
                return;
        }

        /*
         * Check for development/debugging.  It is possible to boot a node
         * in non-cluster mode, and then run a special script, currently
         * available only to developers, to bring up the node as part of a
         * cluster.  The problem is that running such a script does not
         * result in the routine flk_init() being called and hence global array
         * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
         * but the LLM needs to do an additional check to see if the global
         * array has been created or not. If nlm_reg_status is NULL, then
         * return, else continue.
         */
        if (nlm_reg_status == NULL) {
                return;
        }

        ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
        mutex_enter(&nlm_reg_lock);

        if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
                /*
                 * If the NLM server "nlmid" is unknown in the NLM registry,
                 * add it to the registry in the nlm shutting down state.
                 */
                FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
                    FLK_NLM_SHUTTING_DOWN);
        } else {
                /*
                 * Change the state of the NLM server identified by "nlmid"
                 * in the NLM registry to the argument "nlm_state."
                 */
                FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
                    nlm_state);
        }

        /*
         *  The reason we must register the NLM server that is shutting down
         *  with an LLM that doesn't already know about it (never sent a lock
         *  request) is to handle correctly a race between shutdown and a new
         *  lock request.  Suppose that a shutdown request from the NLM server
         *  invokes this routine at the LLM, and a thread is spawned to
         *  service the request. Now suppose a new lock request is in
         *  progress and has already passed the first line of defense in
         *  reclock(), which denies new lock requests from NLM servers
         *  that are not in the NLM_UP state.  After the current routine
         *  is invoked for both phases of shutdown, the routine will return,
         *  having done nothing, and the lock request will proceed and
         *  probably be granted.  The problem is that the shutdown was ignored
         *  by the lock request because there was no record of that NLM server
         *  shutting down.  We would be in the peculiar position of thinking
         *  that we've shut down the NLM server and that all locks at all
         *  LLMs have been discarded, when in fact one lock is still held.
         *  The solution is to record the existence of the NLM server and
         *  change its state immediately to NLM_SHUTTING_DOWN.  The lock
         *  request in progress may proceed because the next phase, NLM_DOWN,
         *  will catch this lock and discard it.
         */
        mutex_exit(&nlm_reg_lock);

        switch (nlm_state) {
        case FLK_NLM_UP:
                /*
                 * Change the NLM state of all locks still held on behalf of
                 * the NLM server identified by "nlmid" to NLM_UP.
                 */
                cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
                break;

        case FLK_NLM_SHUTTING_DOWN:
                /*
                 * Wake up all sleeping locks for the NLM server identified
                 * by "nlmid." Note that eventually all woken threads will
                 * have their lock requests cancelled and descriptors
                 * removed from the sleeping lock list.  Note that the NLM
                 * server state associated with each lock descriptor is
                 * changed to FLK_NLM_SHUTTING_DOWN.
                 */
                cl_flk_wakeup_sleeping_nlm_locks(nlmid);
                break;

        case FLK_NLM_DOWN:
                /*
                 * Discard all active, granted locks for this NLM server
                 * identified by "nlmid."
                 */
                cl_flk_unlock_nlm_granted(nlmid);
                break;

        default:
                panic("cl_set_nlm_status: bad status (%d)", nlm_state);
        }
}

/*
 * Set the control status for lock manager requests.
 *
 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
 * may be lock requests that have gotten started but not finished.  In
 * particular, there may be blocking requests that are in the callback code
 * before sleeping (so they're not holding the lock for the graph).  If
 * such a thread reacquires the graph's lock (to go to sleep) after
 * flk_lockmgr_status is set to a non-up value, it will notice the status
 * and bail out.  If the request gets granted before the thread can check
 * flk_lockmgr_status, let it continue normally.  It will get flushed when
 * we are called with FLK_LOCKMGR_DOWN.
 */

void
flk_set_lockmgr_status(flk_lockmgr_status_t status)
{
        int i;
        graph_t *gp;
        struct flock_globals *fg;

        fg = flk_get_globals();
        ASSERT(fg != NULL);

        mutex_enter(&flock_lock);
        fg->flk_lockmgr_status = status;
        mutex_exit(&flock_lock);

        /*
         * If the lock manager is coming back up, all that's needed is to
         * propagate this information to the graphs.  If the lock manager
         * is going down, additional action is required, and each graph's
         * copy of the state is updated atomically with this other action.
         */
        switch (status) {
        case FLK_LOCKMGR_UP:
                for (i = 0; i < HASH_SIZE; i++) {
                        mutex_enter(&flock_lock);
                        gp = lock_graph[i];
                        mutex_exit(&flock_lock);
                        if (gp == NULL)
                                continue;
                        mutex_enter(&gp->gp_mutex);
                        fg->lockmgr_status[i] = status;
                        mutex_exit(&gp->gp_mutex);
                }
                break;
        case FLK_WAKEUP_SLEEPERS:
                wakeup_sleeping_lockmgr_locks(fg);
                break;
        case FLK_LOCKMGR_DOWN:
                unlock_lockmgr_granted(fg);
                break;
        default:
                panic("flk_set_lockmgr_status: bad status (%d)", status);
                break;
        }
}

/*
 * This routine returns all the locks that are active or sleeping and are
 * associated with a particular set of identifiers.  If lock_state != 0, then
 * only locks that match the lock_state are returned. If lock_state == 0, then
 * all locks are returned. If pid == NOPID, the pid is ignored.  If
 * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
 * vnode pointer is ignored.
 *
 * A list containing the vnode pointer and an flock structure
 * describing the lock is returned.  Each element in the list is
 * dynamically allocated and must be freed by the caller.  The
 * last item in the list is denoted by a NULL value in the ll_next
 * field.
 *
 * The vnode pointers returned are held.  The caller is responsible
 * for releasing these.  Note that the returned list is only a snapshot of
 * the current lock information, and that it is a snapshot of a moving
 * target (only one graph is locked at a time).
 */

locklist_t *
get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
    pid_t pid, const vnode_t *vp, zoneid_t zoneid)
{
        lock_descriptor_t       *lock;
        lock_descriptor_t       *graph_head;
        locklist_t              listhead;
        locklist_t              *llheadp;
        locklist_t              *llp;
        locklist_t              *lltp;
        graph_t                 *gp;
        int                     i;
        int                     first_index; /* graph index */
        int                     num_indexes; /* graph index */

        ASSERT((list_type == FLK_ACTIVE_STATE) ||
            (list_type == FLK_SLEEPING_STATE));

        /*
         * Get a pointer to something to use as a list head while building
         * the rest of the list.
         */
        llheadp = &listhead;
        lltp = llheadp;
        llheadp->ll_next = (locklist_t *)NULL;

        /* Figure out which graphs we want to look at. */
        if (vp == NULL) {
                first_index = 0;
                num_indexes = HASH_SIZE;
        } else {
                first_index = HASH_INDEX(vp);
                num_indexes = 1;
        }

        for (i = first_index; i < first_index + num_indexes; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);
                if (gp == NULL) {
                        continue;
                }
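                /*
                 * Snapshot the matching locks from this graph while holding
                 * gp_mutex; each match is copied into a freshly allocated
                 * locklist entry with the vnode held.
                 */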

                mutex_enter(&gp->gp_mutex);
                graph_head = (list_type == FLK_ACTIVE_STATE) ?
                    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
                for (lock = graph_head->l_next;
                    lock != graph_head;
                    lock = lock->l_next) {
                        if (use_sysid && lock->l_flock.l_sysid != sysid)
                                continue;
                        if (pid != NOPID && lock->l_flock.l_pid != pid)
                                continue;
                        if (vp != NULL && lock->l_vnode != vp)
                                continue;
                        if (lock_state && !(lock_state & lock->l_state))
                                continue;
                        if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
                                continue;
                        /*
                         * A matching lock was found.  Allocate
                         * space for a new locklist entry and fill
                         * it in.
                         */
                        llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
                        lltp->ll_next = llp;
                        VN_HOLD(lock->l_vnode);
                        llp->ll_vp = lock->l_vnode;
                        create_flock(lock, &(llp->ll_flock));
                        llp->ll_next = (locklist_t *)NULL;
                        lltp = llp;
                }
                mutex_exit(&gp->gp_mutex);
        }

        llp = llheadp->ll_next;
        return (llp);
}

/*
 * These two functions are simply interfaces to get_lock_list.  They return
 * a list of sleeping or active locks for the given sysid and pid.  See
 * get_lock_list for details.
 *
 * In either case we don't particularly care to specify the zone of interest;
 * the sysid-space is global across zones, so the sysid will map to exactly one
 * zone, and we'll return information for that zone.
 */

locklist_t *
flk_get_sleeping_locks(int sysid, pid_t pid)
{
        return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
            ALL_ZONES));
}

locklist_t *
flk_get_active_locks(int sysid, pid_t pid)
{
        return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
            ALL_ZONES));
}

/*
 * Another interface to get_lock_list.  This one returns all the active
 * locks for a given vnode.  Again, see get_lock_list for details.
 *
 * We don't need to specify which zone's locks we're interested in.  The matter
 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
 * be used by multiple zones, so the list of locks will all be from the right
 * zone.
 */

locklist_t *
flk_active_locks_for_vp(const vnode_t *vp)
{
        return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
            ALL_ZONES));
}

/*
 * Another interface to get_lock_list.  This one returns all the active
 * nbmand locks for a given vnode.  Again, see get_lock_list for details.
 *
 * See the comment for flk_active_locks_for_vp() for why we don't care to
 * specify the particular zone of interest.
 */
locklist_t *
flk_active_nbmand_locks_for_vp(const vnode_t *vp)
{
        return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
            NOPID, vp, ALL_ZONES));
}

/*
 * Another interface to get_lock_list.  This one returns all the active
 * nbmand locks for a given pid.  Again, see get_lock_list for details.
 *
 * The zone doesn't need to be specified here; the locks held by a
 * particular process will either be local (ie, non-NFS) or from the zone
 * the process is executing in.  This is because other parts of the system
 * ensure that an NFS vnode can't be used in a zone other than that in
 * which it was opened.
 */
locklist_t *
flk_active_nbmand_locks(pid_t pid)
{
        return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
            pid, NULL, ALL_ZONES));
}

/*
 * Free up all entries in the locklist.
 */
void
flk_free_locklist(locklist_t *llp)
{
        locklist_t *next_llp;

        while (llp) {
                next_llp = llp->ll_next;
                VN_RELE(llp->ll_vp);
                kmem_free(llp, sizeof (*llp));
                llp = next_llp;
        }
}

static void
cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
{
        /*
         * For each graph "lg" in the hash table lock_graph do
         * a.  Get the list of sleeping locks
         * b.  For each lock descriptor in the list do
         *      i.   If the requested lock is an NLM server request AND
         *              the nlmid is the same as the routine argument then
         *              change the lock descriptor's state field to
         *              "nlm_state."
         * c.  Get the list of active locks
         * d.  For each lock descriptor in the list do
         *      i.   If the requested lock is an NLM server request AND
         *              the nlmid is the same as the routine argument then
         *              change the lock descriptor's state field to
         *              "nlm_state."
         */

        int                     i;
        graph_t                 *gp;                    /* lock graph */
        lock_descriptor_t       *lock;                  /* lock */
        lock_descriptor_t       *nlock = NULL;          /* next lock */
        int                     lock_nlmid;

        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);
                if (gp == NULL) {
                        continue;
                }

                /* Get list of sleeping locks in current lock graph. */
                mutex_enter(&gp->gp_mutex);
                for (lock = SLEEPING_HEAD(gp)->l_next;
                    lock != SLEEPING_HEAD(gp);
                    lock = nlock) {
                        nlock = lock->l_next;
                        /* get NLM id */
                        lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

                        /*
                         * If NLM server request AND nlmid of lock matches
                         * nlmid of argument, then set the NLM state of the
                         * lock to "nlm_state."
                         */
                        if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
                                SET_NLM_STATE(lock, nlm_state);
                        }
                }

                /* Get list of active locks in current lock graph. */
                for (lock = ACTIVE_HEAD(gp)->l_next;
                    lock != ACTIVE_HEAD(gp);
                    lock = nlock) {
                        nlock = lock->l_next;
                        /* get NLM id */
                        lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

                        /*
                         * If NLM server request AND nlmid of lock matches
                         * nlmid of argument, then set the NLM state of the
                         * lock to "nlm_state."
                         */
                        if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
                                ASSERT(IS_ACTIVE(lock));
                                SET_NLM_STATE(lock, nlm_state);
                        }
                }
                mutex_exit(&gp->gp_mutex);
        }
}

/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
 *   identified by "nlmid." Poke those lock requests.
 */
static void
cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
{
        lock_descriptor_t *lock;
        lock_descriptor_t *nlock = NULL; /* next lock */
        int i;
        graph_t *gp;
        int     lock_nlmid;

        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);
                if (gp == NULL) {
                        continue;
                }

                mutex_enter(&gp->gp_mutex);
                for (lock = SLEEPING_HEAD(gp)->l_next;
                    lock != SLEEPING_HEAD(gp);
                    lock = nlock) {
                        nlock = lock->l_next;
                        /*
                         * If NLM server request _and_ nlmid of lock matches
                         * nlmid of argument, then set the NLM state of the
                         * lock to NLM_SHUTTING_DOWN, and wake up the
                         * sleeping request.
                         */
                        if (IS_LOCKMGR(lock)) {
                                /* get NLM id */
                                lock_nlmid =
                                    GETNLMID(lock->l_flock.l_sysid);
                                if (nlmid == lock_nlmid) {
                                        SET_NLM_STATE(lock,
                                            FLK_NLM_SHUTTING_DOWN);
                                        INTERRUPT_WAKEUP(lock);
                                }
                        }
                }
                mutex_exit(&gp->gp_mutex);
        }
}

/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
 * Effects:  Find all active (granted) lock manager locks _only_ for the
 *   NLM server identified by "nlmid" and release them.
 */
static void
cl_flk_unlock_nlm_granted(int nlmid)
{
        lock_descriptor_t *lock;
        lock_descriptor_t *nlock = NULL; /* next lock */
        int i;
        graph_t *gp;
        int     lock_nlmid;

        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);
                if (gp == NULL) {
                        continue;
                }

                mutex_enter(&gp->gp_mutex);
                for (lock = ACTIVE_HEAD(gp)->l_next;
                    lock != ACTIVE_HEAD(gp);
                    lock = nlock) {
                        nlock = lock->l_next;
                        ASSERT(IS_ACTIVE(lock));

                        /*
                         * If it's an NLM server request _and_ the nlmid of
                         * the lock matches the nlmid argument, then remove
                         * the active lock from the list, wake up blocked
                         * threads, and free the storage for the lock.
                         * Note that there's no need to mark the NLM state
                         * of this lock to NLM_DOWN because the lock will
                         * be deleted anyway and its storage freed.
                         */
                        if (IS_LOCKMGR(lock)) {
                                /* get NLM id */
                                lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
                                if (nlmid == lock_nlmid) {
                                        flk_delete_active_lock(lock, 0);
                                        flk_wakeup(lock, 1);
                                        flk_free_lock(lock);
                                }
                        }
                }
                mutex_exit(&gp->gp_mutex);
        }
}

/*
 * Find all sleeping lock manager requests and poke them.
 */
static void
wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
{
        lock_descriptor_t *lock;
        lock_descriptor_t *nlock = NULL; /* next lock */
        int i;
        graph_t *gp;
        zoneid_t zoneid = getzoneid();

        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);
                if (gp == NULL) {
                        continue;
                }

                mutex_enter(&gp->gp_mutex);
                fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
                for (lock = SLEEPING_HEAD(gp)->l_next;
                    lock != SLEEPING_HEAD(gp);
                    lock = nlock) {
                        nlock = lock->l_next;
                        if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
                                INTERRUPT_WAKEUP(lock);
                        }
                }
                mutex_exit(&gp->gp_mutex);
        }
}


/*
 * Find all active (granted) lock manager locks and release them.
 */
static void
unlock_lockmgr_granted(struct flock_globals *fg)
{
        lock_descriptor_t *lock;
        lock_descriptor_t *nlock = NULL; /* next lock */
        int i;
        graph_t *gp;
        zoneid_t zoneid = getzoneid();

        for (i = 0; i < HASH_SIZE; i++) {
                mutex_enter(&flock_lock);
                gp = lock_graph[i];
                mutex_exit(&flock_lock);
                if (gp == NULL) {
                        continue;
                }

                mutex_enter(&gp->gp_mutex);
                fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
                for (lock = ACTIVE_HEAD(gp)->l_next;
                    lock != ACTIVE_HEAD(gp);
                    lock = nlock) {
                        nlock = lock->l_next;
                        if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
                                ASSERT(IS_ACTIVE(lock));
                                flk_delete_active_lock(lock, 0);
                                flk_wakeup(lock, 1);
                                flk_free_lock(lock);
                        }
                }
                mutex_exit(&gp->gp_mutex);
        }
}


/*
 * Wait until a lock is granted, cancelled, or interrupted.
 */

static void
wait_for_lock(lock_descriptor_t *request)
{
        graph_t *gp = request->l_graph;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));

        while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
            !(IS_INTERRUPTED(request))) {
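                /*
                 * cv_wait_sig() returns 0 if the wait was interrupted by a
                 * signal; mark the request interrupted so the loop exits
                 * and the caller can clean up.
                 */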
                if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
                        flk_set_state(request, FLK_INTERRUPTED_STATE);
                        request->l_state |= INTERRUPTED_LOCK;
                }
        }
}

/*
 * Create an flock structure from the existing lock information
 *
 * This routine is used to create flock structures for the lock manager
 * to use in a reclaim request.  Since the lock was originated on this
 * host, it must be conforming to UNIX semantics, so no checking is
 * done to make sure it falls within the lower half of the 32-bit range.
 */

static void
create_flock(lock_descriptor_t *lp, flock64_t *flp)
{
        ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
        ASSERT(lp->l_end >= lp->l_start);

        flp->l_type = lp->l_type;
        flp->l_whence = 0;
        flp->l_start = lp->l_start;
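        /*
         * An l_end of MAX_U_OFFSET_T means "to end of file", which maps
         * back to an l_len of 0 in the flock64 representation.
         */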
        flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
            (lp->l_end - lp->l_start + 1);
        flp->l_sysid = lp->l_flock.l_sysid;
        flp->l_pid = lp->l_flock.l_pid;
}

/*
 * Convert flock_t data describing a lock range into unsigned long starting
 * and ending points, which are put into lock_request.  Returns 0 or an
 * errno value.
 */

int
flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
    u_offset_t *start, u_offset_t *end, offset_t offset)
{
        struct vattr    vattr;
        int     error;

        /*
         * Determine the starting point of the request
         */
        switch (flp->l_whence) {
        case 0:         /* SEEK_SET */
                *start = (u_offset_t)flp->l_start;
                break;
        case 1:         /* SEEK_CUR */
                *start = (u_offset_t)(flp->l_start + offset);
                break;
        case 2:         /* SEEK_END */
                vattr.va_mask = AT_SIZE;
                if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
                        return (error);
                *start = (u_offset_t)(flp->l_start + vattr.va_size);
                break;
        default:
                return (EINVAL);
        }

        /*
         * Determine the range covered by the request.
         */
        if (flp->l_len == 0)
                *end = MAX_U_OFFSET_T;
        else if ((offset_t)flp->l_len > 0) {
                *end = (u_offset_t)(*start + (flp->l_len - 1));
        } else {
                /*
                 * Negative length; why do we even allow this?
                 * Because this allows easy specification of
                 * the last n bytes of the file.
                 */
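                /*
                 * For example, l_start == 100 with l_len == -10 yields the
                 * range [91, 100]: the ten bytes ending at offset 100.
                 */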
                *end = *start;
                *start += (u_offset_t)flp->l_len;
                (*start)++;
        }
        return (0);
}

/*
 * Check the validity of lock data.  This can be used by the NFS
 * frlock routines to check data before contacting the server.  The
 * server must support semantics that aren't as restrictive as
 * the UNIX API, so the NFS client is required to check.
 * The maximum is now passed in by the caller.
 */

int
flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
{
        /*
         * The end (length) for local locking should never be greater
         * than MAXEND. However, the representation for
         * the entire file is MAX_U_OFFSET_T.
         */
        if ((start > max) ||
            ((end > max) && (end != MAX_U_OFFSET_T))) {
                return (EINVAL);
        }
        if (start > end) {
                return (EINVAL);
        }
        return (0);
}

/*
 * Fill in request->l_flock with information about the lock blocking the
 * request.  The complexity here is that lock manager requests are allowed
 * to see into the upper part of the 32-bit address range, whereas local
 * requests are only allowed to see signed values.
 *
 * What should be done when "blocker" is a lock manager lock that uses the
 * upper portion of the 32-bit range, but "request" is local?  Since the
 * request has already been determined to have been blocked by the blocker,
 * at least some portion of "blocker" must be in the range of the request,
 * or the request extends to the end of file.  For the first case, the
 * portion in the lower range is returned with the indication that it goes
 * "to EOF."  For the second case, the last byte of the lower range is
 * returned with the indication that it goes "to EOF."
 */

static void
report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
{
        flock64_t *flrp;                        /* l_flock portion of request */

        ASSERT(blocker != NULL);

        flrp = &request->l_flock;
        flrp->l_whence = 0;
        flrp->l_type = blocker->l_type;
        flrp->l_pid = blocker->l_flock.l_pid;
        flrp->l_sysid = blocker->l_flock.l_sysid;
        request->l_ofd = blocker->l_ofd;

        if (IS_LOCKMGR(request)) {
                flrp->l_start = blocker->l_start;
                if (blocker->l_end == MAX_U_OFFSET_T)
                        flrp->l_len = 0;
                else
                        flrp->l_len = blocker->l_end - blocker->l_start + 1;
        } else {
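                /*
                 * Local request: it may only see offsets in the signed
                 * (lower) half of the range, so a blocker starting beyond
                 * MAXEND is reported as MAXEND "to EOF".
                 */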
                if (blocker->l_start > MAXEND) {
                        flrp->l_start = MAXEND;
                        flrp->l_len = 0;
                } else {
                        flrp->l_start = blocker->l_start;
                        if (blocker->l_end == MAX_U_OFFSET_T)
                                flrp->l_len = 0;
                        else
                                flrp->l_len = blocker->l_end -
                                    blocker->l_start + 1;
                }
        }
}

/*
 * PSARC case 1997/292
 */
/*
 * This is the public routine exported by flock.h.
 */
void
cl_flk_change_nlm_state_to_unknown(int nlmid)
{
        /*
         * Check to see if node is booted as a cluster. If not, return.
         */
        if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
                return;
        }

        /*
         * See comment in cl_flk_set_nlm_status().
         */
        if (nlm_reg_status == NULL) {
                return;
        }

        /*
         * protect NLM registry state with a mutex.
         */
        ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
        mutex_enter(&nlm_reg_lock);
        FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
        mutex_exit(&nlm_reg_lock);
}

/*
 * Return non-zero if the given I/O request conflicts with an active NBMAND
 * lock.
 * If svmand is non-zero, it means look at all active locks, not just NBMAND
 * locks.
 */

int
nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
    ssize_t length, int svmand, caller_context_t *ct)
{
        int conflict = 0;
        graph_t                 *gp;
        lock_descriptor_t       *lock;
        pid_t pid;
        int sysid;

        if (ct == NULL) {
                pid = curproc->p_pid;
                sysid = 0;
        } else {
                pid = ct->cc_pid;
                sysid = ct->cc_sysid;
        }

        mutex_enter(&flock_lock);
        gp = lock_graph[HASH_INDEX(vp)];
        mutex_exit(&flock_lock);
        if (gp == NULL)
                return (0);

        mutex_enter(&gp->gp_mutex);
        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
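        /*
         * Walk the active locks on this vnode.  A conflict requires a lock
         * held by a different owner (sysid, pid) whose type and range block
         * the requested I/O; unless svmand is set, only NBMAND locks are
         * considered.
         */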

        for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
                if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
                    (lock->l_flock.l_sysid != sysid ||
                    lock->l_flock.l_pid != pid) &&
                    lock_blocks_io(op, offset, length,
                    lock->l_type, lock->l_start, lock->l_end)) {
                        DTRACE_PROBE1(conflict_lock,
                            lock_descriptor_t *, lock);
                        conflict = 1;
                        break;
                }
        }
        mutex_exit(&gp->gp_mutex);

        return (conflict);
}

/*
 * Return non-zero if the given I/O request conflicts with the given lock.
 */

static int
lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
{
        ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
        ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);

        if (op == NBL_READ && lock_type == F_RDLCK)
                return (0);
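        /*
         * The ranges overlap if either the lock starts within the I/O
         * range or the I/O starts within the lock range.
         */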

        if (offset <= lock_start && lock_start < offset + length)
                return (1);
        if (lock_start <= offset && offset <= lock_end)
                return (1);

        return (0);
}

#ifdef DEBUG
static void
check_active_locks(graph_t *gp)
{
        lock_descriptor_t *lock, *lock1;
        edge_t  *ep;

        for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
            lock = lock->l_next) {
                ASSERT(IS_ACTIVE(lock));
                ASSERT(NOT_BLOCKED(lock));
                ASSERT(!IS_BARRIER(lock));

                ep = FIRST_IN(lock);

                while (ep != HEAD(lock)) {
                        ASSERT(IS_SLEEPING(ep->from_vertex));
                        ASSERT(!NOT_BLOCKED(ep->from_vertex));
                        ep = NEXT_IN(ep);
                }

                for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
                    lock1 = lock1->l_next) {
                        if (lock1->l_vnode == lock->l_vnode) {
                                if (BLOCKS(lock1, lock)) {
                                        cmn_err(CE_PANIC,
                                            "active lock %p blocks %p",
                                            (void *)lock1, (void *)lock);
                                } else if (BLOCKS(lock, lock1)) {
                                        cmn_err(CE_PANIC,
                                            "active lock %p blocks %p",
                                            (void *)lock, (void *)lock1);
                                }
                        }
                }
        }
}

/*
 * Effect: This function checks to see if the transition from 'old_state' to
 *      'new_state' is a valid one.  It returns 0 if the transition is valid
 *      and 1 if it is not.
 *      For a map of valid transitions, see sys/flock_impl.h
 */
static int
check_lock_transition(int old_state, int new_state)
{
        switch (old_state) {
        case FLK_INITIAL_STATE:
                if ((new_state == FLK_START_STATE) ||
                    (new_state == FLK_SLEEPING_STATE) ||
                    (new_state == FLK_ACTIVE_STATE) ||
                    (new_state == FLK_DEAD_STATE)) {
                        return (0);
                } else {
                        return (1);
                }
        case FLK_START_STATE:
                if ((new_state == FLK_ACTIVE_STATE) ||
                    (new_state == FLK_DEAD_STATE)) {
                        return (0);
                } else {
                        return (1);
                }
        case FLK_ACTIVE_STATE:
                if (new_state == FLK_DEAD_STATE) {
                        return (0);
                } else {
                        return (1);
                }
        case FLK_SLEEPING_STATE:
                if ((new_state == FLK_GRANTED_STATE) ||
                    (new_state == FLK_INTERRUPTED_STATE) ||
                    (new_state == FLK_CANCELLED_STATE)) {
                        return (0);
                } else {
                        return (1);
                }
        case FLK_GRANTED_STATE:
                if ((new_state == FLK_START_STATE) ||
                    (new_state == FLK_INTERRUPTED_STATE) ||
                    (new_state == FLK_CANCELLED_STATE)) {
                        return (0);
                } else {
                        return (1);
                }
        case FLK_CANCELLED_STATE:
                if ((new_state == FLK_INTERRUPTED_STATE) ||
                    (new_state == FLK_DEAD_STATE)) {
                        return (0);
                } else {
                        return (1);
                }
        case FLK_INTERRUPTED_STATE:
                if (new_state == FLK_DEAD_STATE) {
                        return (0);
                } else {
                        return (1);
                }
        case FLK_DEAD_STATE:
                /* May be set more than once */
                if (new_state == FLK_DEAD_STATE) {
                        return (0);
                } else {
                        return (1);
                }
        default:
                return (1);
        }
}

static void
check_sleeping_locks(graph_t *gp)
{
        lock_descriptor_t *lock1, *lock2;
        edge_t *ep;
        for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
            lock1 = lock1->l_next) {
                ASSERT(!IS_BARRIER(lock1));
                for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
                    lock2 = lock2->l_next) {
                        if (lock1->l_vnode == lock2->l_vnode) {
                                if (BLOCKS(lock2, lock1)) {
                                        ASSERT(!IS_GRANTED(lock1));
                                        ASSERT(!NOT_BLOCKED(lock1));
                                        path(lock1, lock2);
                                }
                        }
                }

                for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
                    lock2 = lock2->l_next) {
                        ASSERT(!IS_BARRIER(lock1));
                        if (lock1->l_vnode == lock2->l_vnode) {
                                if (BLOCKS(lock2, lock1)) {
                                        ASSERT(!IS_GRANTED(lock1));
                                        ASSERT(!NOT_BLOCKED(lock1));
                                        path(lock1, lock2);
                                }
                        }
                }
                ep = FIRST_ADJ(lock1);
                while (ep != HEAD(lock1)) {
                        ASSERT(BLOCKS(ep->to_vertex, lock1));
                        ep = NEXT_ADJ(ep);
                }
        }
}

static int
level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
{
        edge_t  *ep;
        lock_descriptor_t       *vertex;
        lock_descriptor_t *vertex_stack;

        STACK_INIT(vertex_stack);
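        /*
         * Push lock1's adjacent vertices and then search depth-first;
         * return 1 if lock2 is reachable through a path of two or more
         * edges.
         */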

        flk_graph_uncolor(lock1->l_graph);
        ep = FIRST_ADJ(lock1);
        ASSERT(ep != HEAD(lock1));
        while (ep != HEAD(lock1)) {
                if (no_path)
                        ASSERT(ep->to_vertex != lock2);
                STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
                COLOR(ep->to_vertex);
                ep = NEXT_ADJ(ep);
        }

        while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
                STACK_POP(vertex_stack, l_dstack);
                for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
                    ep = NEXT_ADJ(ep)) {
                        if (COLORED(ep->to_vertex))
                                continue;
                        COLOR(ep->to_vertex);
                        if (ep->to_vertex == lock2)
                                return (1);

                        STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
                }
        }
        return (0);
}

static void
check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
        lock_descriptor_t *lock;

        /* Ignore OFD style locks since they're not process-wide. */
        if (pid == 0)
                return;

        SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

        if (lock) {
                while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
                        if (lock->l_flock.l_pid == pid &&
                            lock->l_flock.l_sysid == sysid)
                                cmn_err(CE_PANIC,
                                    "owner pid %d's lock %p in active queue",
                                    pid, (void *)lock);
                        lock = lock->l_next;
                }
        }
        SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

        if (lock) {
                while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
                        if (lock->l_flock.l_pid == pid &&
                            lock->l_flock.l_sysid == sysid)
                                cmn_err(CE_PANIC,
                                    "owner pid %d's lock %p in sleep queue",
                                    pid, (void *)lock);
                        lock = lock->l_next;
                }
        }
}

static int
level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
        edge_t *ep = FIRST_ADJ(lock1);

        while (ep != HEAD(lock1)) {
                if (ep->to_vertex == lock2)
                        return (1);
                else
                        ep = NEXT_ADJ(ep);
        }
        return (0);
}

static int
no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
        return (!level_two_path(lock1, lock2, 1));
}

static void
path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
        if (level_one_path(lock1, lock2)) {
                if (level_two_path(lock1, lock2, 0) != 0) {
                        cmn_err(CE_WARN,
                            "one edge one path from lock1 %p lock2 %p",
                            (void *)lock1, (void *)lock2);
                }
        } else if (no_path(lock1, lock2)) {
                cmn_err(CE_PANIC,
                    "No path from  lock1 %p to lock2 %p",
                    (void *)lock1, (void *)lock2);
        }
}
#endif /* DEBUG */