root/usr/src/uts/common/fs/nfs/nfs4_srv_deleg.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 */

#include <sys/systm.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/nfs4.h>
#include <nfs/lm.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/sdt.h>

#include <sys/pathname.h>

#include <sys/strsubr.h>
#include <sys/ddi.h>

#include <sys/vnode.h>
#include <sys/sdt.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>

/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * a cap on read delegations -- confirm against its users.
 */
#define MAX_READ_DELEGATIONS 5

/* NOTE(review): not referenced in the visible part of this file. */
static int rfs4_deleg_disabled;
/*
 * Upper bound on the number of attempts rfs4_cbinfo_hold() makes to get a
 * new callback path set up before it gives up and returns NULL.
 */
static int rfs4_max_setup_cb_tries = 5;

#ifdef DEBUG

/* Incremented by rfs4_do_cb_null() each time a CB_NULL call succeeds. */
int rfs4_cb_null;
/* NOTE(review): debug knobs; their consumers are not visible here. */
int rfs4_cb_debug;
int rfs4_deleg_debug;

#endif

/* Forward declarations of file-local routines. */
static void rfs4_recall_file(rfs4_file_t *,
    void (*recall)(rfs4_deleg_state_t *, bool_t),
    bool_t, rfs4_client_t *);
static  void            rfs4_revoke_file(rfs4_file_t *);
static  void            rfs4_cb_chflush(rfs4_cbinfo_t *);
static  CLIENT          *rfs4_cb_getch(rfs4_cbinfo_t *);
static  void            rfs4_cb_freech(rfs4_cbinfo_t *, CLIENT *, bool_t);
static rfs4_deleg_state_t *rfs4_deleg_state(rfs4_state_t *,
    open_delegation_type4, int *);

/*
 * Convert a universal address (a host address followed by ".p1.p2",
 * e.g. "192.168.1.10.8.1") into a transport specific address using
 * inet_pton().
 *
 *   af - address family handed to inet_pton()
 *   ua - NUL-terminated universal address; one character is patched
 *        temporarily while parsing and is always restored before
 *        returning
 *   ap - receives the binary host address written by inet_pton()
 *   pp - receives the port (p1 * 256 + p2) in network byte order
 *
 * Returns 0 on success.  Returns EINVAL when the string does not hold
 * two trailing port components, when inet_pton() rejects the host part,
 * or when a port component is empty, non-numeric or larger than 255.
 * *pp is only written on success.
 */
static int
uaddr2sockaddr(int af, char *ua, void *ap, in_port_t *pp)
{
        int dots = 0, i, j, len, host_ok;
        int ndigits = 0;
        unsigned int octet = 0, port = 0;

        len = strlen(ua);

        /*
         * Walk backwards to the second '.' from the end; everything
         * before it is the host address, everything after is "p1.p2".
         */
        for (i = len - 1; i >= 0; i--) {
                if (ua[i] == '.' && ++dots == 2)
                        break;
        }
        if (i < 0)
                return (EINVAL);

        /*
         * Terminate the host part in place just long enough for
         * inet_pton() to parse it, then put the '.' straight back so
         * the caller's string is intact on every return path.
         */
        ua[i] = '\0';
        host_ok = (inet_pton(af, ua, ap) == 1);
        ua[i] = '.';
        if (!host_ok)
                return (EINVAL);

        for (j = i + 1; j < len; j++) {
                if (ua[j] == '.') {
                        /* end of p1: it becomes the high-order octet */
                        if (ndigits == 0)
                                return (EINVAL);
                        port = octet << 8;
                        octet = 0;
                        ndigits = 0;
                } else if (ua[j] >= '0' && ua[j] <= '9') {
                        octet = octet * 10 + (unsigned int)(ua[j] - '0');
                        /* each component must fit in a single octet */
                        if (octet > 255)
                                return (EINVAL);
                        ndigits++;
                } else {
                        return (EINVAL);
                }
        }
        if (ndigits == 0)
                return (EINVAL);
        port += octet;

        *pp = htons((in_port_t)port);
        return (0);
}

/*
 * Update the delegation policy of the given NFSv4 server instance with
 * the value of "new_policy".
 *
 * The store is done with deleg_policy_lock held as writer, so it is
 * serialized against readers that entered through
 * rfs4_hold_deleg_policy().
 */
void
rfs4_set_deleg_policy(nfs4_srv_t *nsrv4, srv_deleg_policy_t new_policy)
{
        rw_enter(&nsrv4->deleg_policy_lock, RW_WRITER);
        nsrv4->nfs4_deleg_policy = new_policy;
        rw_exit(&nsrv4->deleg_policy_lock);
}

/*
 * Enter deleg_policy_lock as a reader.  The lock is still held on
 * return; the caller drops it with rfs4_rele_deleg_policy().
 */
void
rfs4_hold_deleg_policy(nfs4_srv_t *nsrv4)
{
        rw_enter(&nsrv4->deleg_policy_lock, RW_READER);
}

/*
 * Drop deleg_policy_lock; the counterpart of rfs4_hold_deleg_policy().
 */
void
rfs4_rele_deleg_policy(nfs4_srv_t *nsrv4)
{
        rw_exit(&nsrv4->deleg_policy_lock);
}

/*
 * Return the delegation policy of the NFSv4 server instance returned by
 * nfs4_get_srv().
 *
 * The value is read without taking deleg_policy_lock, so it is only a
 * snapshot; a caller that needs the policy to stay put brackets its work
 * with rfs4_hold_deleg_policy()/rfs4_rele_deleg_policy().
 */
srv_deleg_policy_t
nfs4_get_deleg_policy(void)
{
        nfs4_srv_t *nsrv4 = nfs4_get_srv();

        return (nsrv4->nfs4_deleg_policy);
}


/*
 * Tear down the callback info of a client struct that is going away.
 *
 * Both the current and the not-yet-adopted ("newer") callback location
 * strings are freed, and any client handles still sitting in the handle
 * cache are destroyed.  The pointers inside *cbp are left as they were,
 * so the callback info must not be used again afterwards.
 */
void
rfs4_cbinfo_free(rfs4_cbinfo_t *cbp)
{
        char *cur_addr = cbp->cb_callback.cb_location.r_addr;
        char *cur_netid = cbp->cb_callback.cb_location.r_netid;
        char *pend_addr = cbp->cb_newer.cb_callback.cb_location.r_addr;
        char *pend_netid = cbp->cb_newer.cb_callback.cb_location.r_netid;

        /* location currently in use, if any */
        if (cur_addr != NULL)
                kmem_free(cur_addr, strlen(cur_addr) + 1);
        if (cur_netid != NULL)
                kmem_free(cur_netid, strlen(cur_netid) + 1);

        /* location supplied by the client but not yet adopted, if any */
        if (pend_addr != NULL)
                kmem_free(pend_addr, strlen(pend_addr) + 1);
        if (pend_netid != NULL)
                kmem_free(pend_netid, strlen(pend_netid) + 1);

        /* cached client handles */
        if (cbp->cb_chc_free != 0)
                rfs4_cb_chflush(cbp);
}

/*
 * The server uses this to check the callback path supplied by the
 * client.  The callback connection is marked "in progress" while this
 * work is going on and then eventually marked either OK or BAD.
 * This work can be done as part of a separate thread and at the end
 * of this the thread will exit or it may be done such that the caller
 * will continue with other work.
 *
 * The caller must have placed a hold on the client struct; every exit
 * path below drops it with rfs4_client_rele().
 *
 * Only one thread at a time runs the body: cb_nullcaller is the claim
 * flag, and a thread that finds it already set leaves immediately.
 *
 * NOTE(review): every exit path calls zthread_exit(), yet
 * rfs4_cbinfo_hold() invokes this function directly rather than through
 * zthread_create() and expects it to return -- confirm that this is safe.
 */
static void
rfs4_do_cb_null(rfs4_client_t *cp)
{
        struct timeval tv;
        CLIENT *ch;
        rfs4_cbstate_t newstate;
        rfs4_cbinfo_t *cbp = &cp->rc_cbinfo;

        mutex_enter(cbp->cb_lock);
        /* If another thread is doing CB_NULL RPC then return */
        if (cbp->cb_nullcaller == TRUE) {
                mutex_exit(cbp->cb_lock);
                rfs4_client_rele(cp);
                zthread_exit();
        }

        /* Mark the cbinfo as having a thread in the NULL callback */
        cbp->cb_nullcaller = TRUE;

        /*
         * Are there other threads still using the cbinfo client
         * handles?  If so, this thread must wait before going and
         * mucking around with the callback information.
         * rfs4_cbinfo_rele() broadcasts cb_cv_nullcaller when the
         * reference count drops to zero.
         */
        while (cbp->cb_refcnt != 0)
                cv_wait(cbp->cb_cv_nullcaller, cbp->cb_lock);

        /*
         * This thread itself may find that new callback info has
         * arrived and is set up to handle this case and redrive the
         * call to the client's callback server.
         *
         * cb_lock is held whenever control reaches the retry label.
         */
retry:
        if (cbp->cb_newer.cb_new == TRUE &&
            cbp->cb_newer.cb_confirmed == TRUE) {
                char *addr = cbp->cb_callback.cb_location.r_addr;
                char *netid = cbp->cb_callback.cb_location.r_netid;

                /*
                 * Free the old stuff if it exists; may be the first
                 * time through this path
                 */
                if (addr)
                        kmem_free(addr, strlen(addr) + 1);
                if (netid)
                        kmem_free(netid, strlen(netid) + 1);

                /*
                 * Move over the addr/netid; ownership of the strings
                 * passes from cb_newer to cb_callback.
                 */
                cbp->cb_callback.cb_location.r_addr =
                    cbp->cb_newer.cb_callback.cb_location.r_addr;
                cbp->cb_newer.cb_callback.cb_location.r_addr = NULL;
                cbp->cb_callback.cb_location.r_netid =
                    cbp->cb_newer.cb_callback.cb_location.r_netid;
                cbp->cb_newer.cb_callback.cb_location.r_netid = NULL;

                /* Get the program number */
                cbp->cb_callback.cb_program =
                    cbp->cb_newer.cb_callback.cb_program;
                cbp->cb_newer.cb_callback.cb_program = 0;

                /* Don't forget the protocol's "cb_ident" field */
                cbp->cb_ident = cbp->cb_newer.cb_ident;
                cbp->cb_newer.cb_ident = 0;

                /* no longer new */
                cbp->cb_newer.cb_new = FALSE;
                cbp->cb_newer.cb_confirmed = FALSE;

                /*
                 * get rid of the old client handles that may exist;
                 * they were created for the previous location
                 */
                rfs4_cb_chflush(cbp);

                /* CB_NONE lets the check below fall through to the RPC */
                cbp->cb_state = CB_NONE;
                cbp->cb_timefailed = 0; /* reset the clock */
                cbp->cb_notified_of_cb_path_down = TRUE;
        }

        /*
         * Anything other than CB_NONE means there is no untested path
         * left to probe: wake the waiters, give up the claim and leave.
         */
        if (cbp->cb_state != CB_NONE) {
                cv_broadcast(cbp->cb_cv);       /* let the others know */
                cbp->cb_nullcaller = FALSE;
                mutex_exit(cbp->cb_lock);
                rfs4_client_rele(cp);
                zthread_exit();
        }

        /* mark rfs4_client_t as CALLBACK NULL in progress */
        cbp->cb_state = CB_INPROG;
        mutex_exit(cbp->cb_lock);

        /*
         * get/generate a client handle; rfs4_cb_getch() takes cb_lock
         * itself, which is why the lock was dropped above
         */
        if ((ch = rfs4_cb_getch(cbp)) == NULL) {
                mutex_enter(cbp->cb_lock);
                cbp->cb_state = CB_BAD;
                cbp->cb_timefailed = gethrestime_sec(); /* observability */
                goto retry;
        }


        /* The CB_NULL probe is given 30 seconds to complete. */
        tv.tv_sec = 30;
        tv.tv_usec = 0;
        if (clnt_call(ch, CB_NULL, xdr_void, NULL, xdr_void, NULL, tv) != 0) {
                newstate = CB_BAD;
        } else {
                newstate = CB_OK;
#ifdef  DEBUG
                rfs4_cb_null++;
#endif
        }

        /* Check to see if the client has specified new callback info */
        mutex_enter(cbp->cb_lock);
        rfs4_cb_freech(cbp, ch, TRUE);  /* TRUE: cb_lock is already held */
        if (cbp->cb_newer.cb_new == TRUE &&
            cbp->cb_newer.cb_confirmed == TRUE) {
                goto retry;     /* give the CB_NULL another chance */
        }

        cbp->cb_state = newstate;
        if (cbp->cb_state == CB_BAD)
                cbp->cb_timefailed = gethrestime_sec(); /* observability */

        cv_broadcast(cbp->cb_cv);       /* start up the other threads */
        cbp->cb_nullcaller = FALSE;
        mutex_exit(cbp->cb_lock);
        rfs4_client_rele(cp);
        zthread_exit();
}

/*
 * Given a client struct, inspect the callback info to see if the
 * callback path is up and available.
 *
 * If new callback path is available and no one has set it up then
 * try to set it up. If setup is not successful after
 * rfs4_max_setup_cb_tries attempts, with a delay(hz) pause after each
 * failed attempt but the last, then gives up and returns NULL.
 *
 * If callback path is being initialized, then wait for the CB_NULL RPC
 * call to occur.
 *
 * Returns the client's callback info with cb_refcnt incremented when
 * the path state is CB_OK; the caller gives the reference back with
 * rfs4_cbinfo_rele().  Returns NULL otherwise.  cb_lock is not held on
 * return in either case.
 */
static rfs4_cbinfo_t *
rfs4_cbinfo_hold(rfs4_client_t *cp)
{
        rfs4_cbinfo_t *cbp = &cp->rc_cbinfo;
        int retries = 0;

        mutex_enter(cbp->cb_lock);

        while (cbp->cb_newer.cb_new == TRUE && cbp->cb_nullcaller == FALSE) {
                /*
                 * Looks like a new callback path may be available and
                 * no one has set it up.
                 */
                mutex_exit(cbp->cb_lock);
                /* rfs4_do_cb_null() drops this hold on all of its paths */
                rfs4_dbe_hold(cp->rc_dbe);
                /*
                 * NOTE(review): rfs4_do_cb_null() ends in zthread_exit()
                 * on every path, while the code below expects it to
                 * return -- confirm this direct call is safe.
                 */
                rfs4_do_cb_null(cp); /* caller will release client hold */

                mutex_enter(cbp->cb_lock);
                /*
                 * If callback path is no longer new, or it's being setup
                 * then stop and wait for it to be done.
                 */
                if (cbp->cb_newer.cb_new == FALSE || cbp->cb_nullcaller == TRUE)
                        break;
                mutex_exit(cbp->cb_lock);

                /* the lock was dropped just above, so a bare return is fine */
                if (++retries >= rfs4_max_setup_cb_tries)
                        return (NULL);
                delay(hz);
                mutex_enter(cbp->cb_lock);
        }

        /*
         * Is there a thread working on doing the CB_NULL RPC?
         *
         * NOTE(review): this is a single wait, not a loop; cb_nullcaller
         * is not re-checked after the wakeup -- confirm that is intended.
         */
        if (cbp->cb_nullcaller == TRUE)
                cv_wait(cbp->cb_cv, cbp->cb_lock);  /* if so, wait on it */

        /* If the callback path is not okay (up and running), just quit */
        if (cbp->cb_state != CB_OK) {
                mutex_exit(cbp->cb_lock);
                return (NULL);
        }

        /* Let someone know we are using the current callback info */
        cbp->cb_refcnt++;
        mutex_exit(cbp->cb_lock);
        return (cbp);
}

/*
 * Give back a reference obtained with rfs4_cbinfo_hold().
 *
 * newstate lets the caller record the outcome of its RPC: any value
 * other than CB_NOCHANGE is stored as the new callback path state, and
 * CB_FAILED additionally stamps the failure time and clears the
 * "notified of path down" flag.
 *
 * Returns TRUE when this was the last reference, no thread has claimed
 * the CB_NULL work and the client has supplied new, confirmed callback
 * information, so that the caller may retry its RPC over the new path.
 * Returns FALSE otherwise.
 */
static int
rfs4_cbinfo_rele(rfs4_cbinfo_t *cbp, rfs4_cbstate_t newstate)
{
        int redrive = FALSE;

        mutex_enter(cbp->cb_lock);

        /* record the caller's verdict on the callback path, if any */
        if (newstate != CB_NOCHANGE)
                cbp->cb_state = newstate;
        if (newstate == CB_FAILED) {
                cbp->cb_timefailed = gethrestime_sec(); /* observability */
                cbp->cb_notified_of_cb_path_down = FALSE;
        }

        /* drop our reference; the rest only matters for the last user */
        if (--cbp->cb_refcnt == 0) {
                if (cbp->cb_nullcaller) {
                        /*
                         * A CB_NULL thread is parked until the handles
                         * are idle; tell it that it may proceed.
                         */
                        cv_broadcast(cbp->cb_cv_nullcaller);
                } else if (cbp->cb_newer.cb_new == TRUE &&
                    cbp->cb_newer.cb_confirmed == TRUE) {
                        /*
                         * Nobody is set to probe the new callback info,
                         * so hand that job back to our caller.
                         */
                        redrive = TRUE;
                }
        }

        mutex_exit(cbp->cb_lock);

        return (redrive);
}

/*
 * Given the information in the callback info struct, create a client
 * handle that can be used by the server for its callback path.
 *
 * The callback netid must be one of "tcp", "udp", "tcp6" or "udp6"; it
 * selects the transport device and address family.  The callback
 * universal address is converted with uaddr2sockaddr().
 *
 * Returns the new client handle, with reserved-port binding turned off,
 * or NULL when the callback location is missing, the netid is not
 * recognized, the transport device cannot be looked up, the address
 * cannot be parsed or the handle cannot be created.
 *
 * cb_lock is taken for the duration of the call, so the caller must not
 * already hold it.
 */
static CLIENT *
rfs4_cbch_init(rfs4_cbinfo_t *cbp)
{
        struct knetconfig knc;
        vnode_t *vp;
        struct sockaddr_in addr4;
        struct sockaddr_in6 addr6;
        void *addr, *taddr;
        in_port_t *pp;
        int af;
        char *devnam;
        struct netbuf nb;
        int size;
        CLIENT *ch = NULL;
        int useresvport = 0;

        mutex_enter(cbp->cb_lock);

        if (cbp->cb_callback.cb_location.r_netid == NULL ||
            cbp->cb_callback.cb_location.r_addr == NULL) {
                goto cb_init_out;
        }

        /* Map the netid onto transport semantics, device and family. */
        if (strcmp(cbp->cb_callback.cb_location.r_netid, "tcp") == 0) {
                knc.knc_semantics = NC_TPI_COTS;
                knc.knc_protofmly = "inet";
                knc.knc_proto = "tcp";
                devnam = "/dev/tcp";
                af = AF_INET;
        } else if (strcmp(cbp->cb_callback.cb_location.r_netid, "udp")
            == 0) {
                knc.knc_semantics = NC_TPI_CLTS;
                knc.knc_protofmly = "inet";
                knc.knc_proto = "udp";
                devnam = "/dev/udp";
                af = AF_INET;
        } else if (strcmp(cbp->cb_callback.cb_location.r_netid, "tcp6")
            == 0) {
                knc.knc_semantics = NC_TPI_COTS;
                knc.knc_protofmly = "inet6";
                knc.knc_proto = "tcp";
                devnam = "/dev/tcp6";
                af = AF_INET6;
        } else if (strcmp(cbp->cb_callback.cb_location.r_netid, "udp6")
            == 0) {
                knc.knc_semantics = NC_TPI_CLTS;
                knc.knc_protofmly = "inet6";
                knc.knc_proto = "udp";
                devnam = "/dev/udp6";
                af = AF_INET6;
        } else {
                goto cb_init_out;
        }

        /* The knetconfig needs the device number of the transport. */
        if (lookupname(devnam, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp) != 0) {

                goto cb_init_out;
        }

        if (vp->v_type != VCHR) {
                VN_RELE(vp);
                goto cb_init_out;
        }

        knc.knc_rdev = vp->v_rdev;

        VN_RELE(vp);

        /*
         * Point addr/pp at the address and port fields of whichever
         * sockaddr matches the family; taddr is the sockaddr itself.
         */
        if (af == AF_INET) {
                size = sizeof (addr4);
                bzero(&addr4, size);
                addr4.sin_family = (sa_family_t)af;
                addr = &addr4.sin_addr;
                pp = &addr4.sin_port;
                taddr = &addr4;
        } else /* AF_INET6 */ {
                size = sizeof (addr6);
                bzero(&addr6, size);
                addr6.sin6_family = (sa_family_t)af;
                addr = &addr6.sin6_addr;
                pp = &addr6.sin6_port;
                taddr = &addr6;
        }

        if (uaddr2sockaddr(af,
            cbp->cb_callback.cb_location.r_addr, addr, pp)) {

                goto cb_init_out;
        }


        nb.maxlen = nb.len = size;
        nb.buf = (char *)taddr;

        if (clnt_tli_kcreate(&knc, &nb, cbp->cb_callback.cb_program,
            NFS_CB, 0, 0, curthread->t_cred, &ch)) {
                /*
                 * No handle was created.  Leave now: CLNT_CONTROL()
                 * below dereferences the handle and must never be
                 * reached with a NULL one.
                 */
                ch = NULL;
                goto cb_init_out;
        }

        /* turn off reserved port usage */
        (void) CLNT_CONTROL(ch, CLSET_BINDRESVPORT, (char *)&useresvport);

cb_init_out:
        mutex_exit(cbp->cb_lock);
        return (ch);
}

/*
 * Empty the client handle cache: every cached handle is removed from
 * its slot and destroyed, together with its auth handle when it has
 * one.  On return cb_chc_free is zero.
 *
 * No locking is done here.
 */
static void
rfs4_cb_chflush(rfs4_cbinfo_t *cbp)
{
        CLIENT *handle;

        while (cbp->cb_chc_free != 0) {
                /* pop the topmost slot and clear it */
                handle = cbp->cb_chc[--cbp->cb_chc_free];
                cbp->cb_chc[cbp->cb_chc_free] = NULL;

                if (handle == NULL)
                        continue;

                if (handle->cl_auth != NULL)
                        auth_destroy(handle->cl_auth);
                clnt_destroy(handle);
        }
}

/*
 * Hand out a client handle for the callback path.
 *
 * A handle is taken from the small per-client cache when one is
 * available, in which case its XID is reset through CLSET_XID;
 * otherwise a fresh handle is created with rfs4_cbch_init(), which may
 * return NULL.
 *
 * cb_lock is taken internally, so the caller must not hold it.
 */
static CLIENT *
rfs4_cb_getch(rfs4_cbinfo_t *cbp)
{
        CLIENT *handle;
        uint32_t xid = 0;

        mutex_enter(cbp->cb_lock);

        if (cbp->cb_chc_free == 0) {
                /* nothing cached, so make one now */
                mutex_exit(cbp->cb_lock);
                return (rfs4_cbch_init(cbp));
        }

        handle = cbp->cb_chc[--cbp->cb_chc_free];
        mutex_exit(cbp->cb_lock);

        (void) CLNT_CONTROL(handle, CLSET_XID, (char *)&xid);
        return (handle);
}

/*
 * Give a client handle back: it is parked in the small cache when
 * there is room (fewer than RFS4_CBCH_MAX entries), otherwise it is
 * destroyed along with its auth handle.
 *
 * lockheld tells whether the caller already owns cb_lock; when it is
 * FALSE the lock is taken and dropped here around the cache update.
 */
static void
rfs4_cb_freech(rfs4_cbinfo_t *cbp, CLIENT *ch, bool_t lockheld)
{
        bool_t cached = FALSE;

        if (lockheld == FALSE)
                mutex_enter(cbp->cb_lock);

        if (cbp->cb_chc_free < RFS4_CBCH_MAX) {
                cbp->cb_chc[cbp->cb_chc_free++] = ch;
                cached = TRUE;
        }

        if (lockheld == FALSE)
                mutex_exit(cbp->cb_lock);

        if (cached)
                return;

        /* the cache is full, so this handle is simply thrown away */
        if (ch->cl_auth)
                auth_destroy(ch->cl_auth);
        clnt_destroy(ch);
}

/*
 * Record the callback information a client supplied.
 *
 * The address and netid strings are copied (only when both are present
 * and non-empty) and stored, together with the program number and
 * cb_ident, in the "newer" slot of the client's callback info; whatever
 * strings were in that slot before are freed.  The slot is flagged as
 * new when a usable location was given and is always left unconfirmed,
 * so it is not adopted by this routine itself.
 */
void
rfs4_client_setcb(rfs4_client_t *cp, cb_client4 *cb, uint32_t cb_ident)
{
        rfs4_cbinfo_t *cbp = &cp->rc_cbinfo;
        char *new_addr = NULL;
        char *new_netid = NULL;
        char *stale;
        size_t sz;

        /* private copies are made before the lock is taken */
        if (cb->cb_location.r_addr && cb->cb_location.r_addr[0] != '\0' &&
            cb->cb_location.r_netid && cb->cb_location.r_netid[0] != '\0') {
                sz = strlen(cb->cb_location.r_addr) + 1;
                new_addr = kmem_alloc(sz, KM_SLEEP);
                bcopy(cb->cb_location.r_addr, new_addr, sz);

                sz = strlen(cb->cb_location.r_netid) + 1;
                new_netid = kmem_alloc(sz, KM_SLEEP);
                bcopy(cb->cb_location.r_netid, new_netid, sz);
        }

        mutex_enter(cbp->cb_lock);

        cbp->cb_newer.cb_callback.cb_program = cb->cb_program;

        /* swap in the address, releasing the one it replaces */
        stale = cbp->cb_newer.cb_callback.cb_location.r_addr;
        if (stale != NULL)
                kmem_free(stale, strlen(stale) + 1);
        cbp->cb_newer.cb_callback.cb_location.r_addr = new_addr;

        /* and the same for the netid */
        stale = cbp->cb_newer.cb_callback.cb_location.r_netid;
        if (stale != NULL)
                kmem_free(stale, strlen(stale) + 1);
        cbp->cb_newer.cb_callback.cb_location.r_netid = new_netid;

        cbp->cb_newer.cb_ident = cb_ident;

        /* only a complete location counts as new callback info */
        cbp->cb_newer.cb_new =
            (new_addr && *new_addr && new_netid && *new_netid) ? TRUE : FALSE;
        cbp->cb_newer.cb_confirmed = FALSE;

        mutex_exit(cbp->cb_lock);
}

/*
 * The server uses this when processing SETCLIENTID_CONFIRM.  If the
 * client has new callback information pending, mark it confirmed and
 * start a thread running rfs4_do_cb_null() to test the callback path.
 * A hold is placed on the client struct on behalf of that thread.
 * Nothing is done when no new callback information is pending.
 */
void
rfs4_deleg_cb_check(rfs4_client_t *cp)
{
        if (cp->rc_cbinfo.cb_newer.cb_new != FALSE) {
                cp->rc_cbinfo.cb_newer.cb_confirmed = TRUE;

                /* this hold belongs to the thread created below */
                rfs4_dbe_hold(cp->rc_dbe);

                (void) zthread_create(NULL, 0, rfs4_do_cb_null, cp, 0,
                    minclsyspri);
        }
}

/*
 * Free the file handle buffer attached to a CB_RECALL argument op, if
 * there is one.
 */
static void
rfs4args_cb_recall_free(nfs_cb_argop4 *argop)
{
        if (argop->nfs_cb_argop4_u.opcbrecall.fh.nfs_fh4_val == NULL)
                return;

        kmem_free(argop->nfs_cb_argop4_u.opcbrecall.fh.nfs_fh4_val,
            argop->nfs_cb_argop4_u.opcbrecall.fh.nfs_fh4_len);
}

/*
 * Free the file handle buffer attached to a CB_GETATTR argument op, if
 * there is one.
 */
static void
rfs4args_cb_getattr_free(nfs_cb_argop4 *argop)
{
        if (argop->nfs_cb_argop4_u.opcbgetattr.fh.nfs_fh4_val == NULL)
                return;

        kmem_free(argop->nfs_cb_argop4_u.opcbgetattr.fh.nfs_fh4_val,
            argop->nfs_cb_argop4_u.opcbgetattr.fh.nfs_fh4_len);
}

/*
 * Release everything hanging off a callback compound: the per-op
 * allocations inside the argument array, the tag, the argument array
 * itself and, when resp is non-NULL, the decoded result.
 *
 * Ops that have no special allocations are simply skipped, so the tag,
 * the array and the result are released no matter which ops the
 * compound contains.
 */
static void
rfs4freeargres(CB_COMPOUND4args *args, CB_COMPOUND4res *resp)
{
        int i, arglen;
        nfs_cb_argop4 *argop;

        /*
         * First free any special args alloc'd for specific ops.
         */
        arglen = args->array_len;
        argop = args->array;
        for (i = 0; i < arglen; i++, argop++) {

                switch (argop->argop) {
                case OP_CB_RECALL:
                        rfs4args_cb_recall_free(argop);
                        break;

                case OP_CB_GETATTR:
                        rfs4args_cb_getattr_free(argop);
                        break;

                default:
                        /*
                         * Nothing op-specific to free.  Keep going
                         * rather than returning, which would leak the
                         * tag, the op array and the result below.
                         */
                        break;
                }
        }

        if (args->tag.utf8string_len > 0)
                UTF8STRING_FREE(args->tag)

        kmem_free(args->array, arglen * sizeof (nfs_cb_argop4));
        if (resp)
                xdr_free(xdr_CB_COMPOUND4res, (caddr_t)resp);
}

/*
 * General callback routine for the server to the client: send the
 * CB_COMPOUND described by args to the client's callback server and
 * decode the reply into res.
 *
 * The tag and array pointers of res are cleared before the first
 * attempt.  The call is attempted again for as long as releasing the
 * callback info reports that new callback information has shown up.
 *
 * Returns the status of the last clnt_call(), or RPC_FAILED when the
 * callback path was not available or no client handle could be had.
 */
static enum clnt_stat
rfs4_do_callback(rfs4_client_t *cp, CB_COMPOUND4args *args,
    CB_COMPOUND4res *res, struct timeval timeout)
{
        /* stays RPC_FAILED unless an RPC is actually made */
        enum clnt_stat  stat = RPC_FAILED;
        rfs4_cbinfo_t *cbp;
        CLIENT *ch;

        res->tag.utf8string_val = NULL;
        res->array = NULL;

        for (;;) {
                cbp = rfs4_cbinfo_hold(cp);
                if (cbp == NULL)
                        break;

                ch = rfs4_cb_getch(cbp);
                if (ch != NULL) {
                        /*
                         * the ident is picked up on each attempt since
                         * rfs4_cbinfo_hold() may have changed it
                         */
                        args->callback_ident = cbp->cb_ident;

                        stat = clnt_call(ch, CB_COMPOUND,
                            xdr_CB_COMPOUND4args_srv, (caddr_t)args,
                            xdr_CB_COMPOUND4res, (caddr_t)res, timeout);

                        /* done with the handle */
                        rfs4_cb_freech(cbp, ch, FALSE);
                }

                /*
                 * Anything but success marks the path as failed.  Go
                 * around again only when the release reports fresh
                 * callback info that may make the call succeed.
                 */
                if (rfs4_cbinfo_rele(cbp,
                    (stat == RPC_SUCCESS ? CB_NOCHANGE : CB_FAILED)) != TRUE)
                        break;
        }

        return (stat);
}

/*
 * Attribute fetch used by the NFSv4 server where a file may be write
 * delegated.  At present this is just VOP_GETATTR() with the errno
 * translated by puterrno4(); no CB_GETATTR is sent to the delegation
 * holder.  The call site is kept so that the server can be extended
 * later to handle write delegation space guarantees.
 */
nfsstat4
rfs4_vop_getattr(vnode_t *vp, vattr_t *vap, int flag, cred_t *cr)
{
        int rc = VOP_GETATTR(vp, vap, flag, cr, NULL);

        return (puterrno4(rc));
}

/*
 * This is used everywhere in the v2/v3 server to allow the
 * integration of all NFS versions and the support of delegation.  For
 * now, just call the VOP_GETATTR().  If the NFSv4 server is enhanced
 * in the future to provide space guarantees for write delegations
 * then this call site should be expanded to interact with the client.
 *
 * Returns whatever VOP_GETATTR() returns, untranslated.
 */
int
rfs4_delegated_getattr(vnode_t *vp, vattr_t *vap, int flag, cred_t *cr)
{
        return (VOP_GETATTR(vp, vap, flag, cr, NULL));
}

/*
 * Place the actual cb_recall otw call to client.
 *
 *   dsp   - delegation state being recalled; supplies the stateid, the
 *           file handle and the client to call
 *   trunc - value for the "truncate" field of the CB_RECALL arguments
 *
 * A single-op CB_COMPOUND holding CB_RECALL is built and sent through
 * rfs4_do_callback() with a timeout of 80% of the lease period.  When
 * the RPC fails, or the client answers with anything other than
 * NFS4_OK, the delegation is returned on the client's behalf via
 * rfs4_return_deleg().  All memory allocated for the call is released
 * before returning.
 */
static void
rfs4_do_cb_recall(rfs4_deleg_state_t *dsp, bool_t trunc)
{
        CB_COMPOUND4args        cb4_args;
        CB_COMPOUND4res         cb4_res;
        CB_RECALL4args          *rec_argp;
        CB_RECALL4res           *rec_resp;
        nfs_cb_argop4           *argop;
        int                     numops;
        int                     argoplist_size;
        struct timeval          timeout;
        nfs_fh4                 *fhp;
        enum clnt_stat          call_stat;

        /*
         * Start from an all-zero result.  rfs4_do_callback() only
         * clears two pointers in it, and when the call never reaches
         * the wire nothing else is filled in; array_len is nevertheless
         * examined below and by the free routine.
         */
        bzero(&cb4_res, sizeof (cb4_res));

        /*
         * set up the compound args
         */
        numops = 1;     /* CB_RECALL only */

        argoplist_size = numops * sizeof (nfs_cb_argop4);
        argop = kmem_zalloc(argoplist_size, KM_SLEEP);
        argop->argop = OP_CB_RECALL;
        rec_argp = &argop->nfs_cb_argop4_u.opcbrecall;

        (void) str_to_utf8("cb_recall", &cb4_args.tag);
        cb4_args.minorversion = CB4_MINORVERSION;
        /* cb4_args.callback_ident is set in rfs4_do_callback() */
        cb4_args.array_len = numops;
        cb4_args.array = argop;

        /*
         * fill in the args struct
         */
        bcopy(&dsp->rds_delegid.stateid, &rec_argp->stateid, sizeof (stateid4));
        rec_argp->truncate = trunc;

        /* the args carry their own copy of the file handle */
        fhp = &dsp->rds_finfo->rf_filehandle;
        rec_argp->fh.nfs_fh4_val = kmem_alloc(sizeof (char) *
            fhp->nfs_fh4_len, KM_SLEEP);
        nfs_fh4_copy(fhp, &rec_argp->fh);

        /* Keep track of when we did this for observability */
        dsp->rds_time_recalled = gethrestime_sec();

        /*
         * Set up the timeout for the callback and make the actual call.
         * Timeout will be 80% of the lease period for this server.
         */
        timeout.tv_sec = (rfs4_lease_time * 80) / 100;
        timeout.tv_usec = 0;

        DTRACE_NFSV4_3(cb__recall__start, rfs4_client_t *, dsp->rds_client,
            rfs4_deleg_state_t *, dsp, CB_RECALL4args *, rec_argp);

        call_stat = rfs4_do_callback(dsp->rds_client, &cb4_args, &cb4_res,
            timeout);

        /* only point at a result op when one was really decoded */
        rec_resp = (cb4_res.array_len == 0 || cb4_res.array == NULL) ? NULL :
            &cb4_res.array[0].nfs_cb_resop4_u.opcbrecall;

        DTRACE_NFSV4_3(cb__recall__done, rfs4_client_t *, dsp->rds_client,
            rfs4_deleg_state_t *, dsp, CB_RECALL4res *, rec_resp);

        /* the client could not or would not honor the recall */
        if (call_stat != RPC_SUCCESS || cb4_res.status != NFS4_OK) {
                rfs4_return_deleg(dsp, TRUE);
        }

        rfs4freeargres(&cb4_args, &cb4_res);
}

/*
 * Argument block handed to each do_recall() thread.  It is allocated by
 * do_recall_file() and freed by do_recall().
 */
struct recall_arg {
        /* delegation state to recall; a hold on it is taken for the thread */
        rfs4_deleg_state_t *dsp;
        /* recall routine, invoked as recall(dsp, trunc) when non-NULL */
        void (*recall)(rfs4_deleg_state_t *, bool_t trunc);
        /* passed through unchanged to the recall routine */
        bool_t trunc;
};

/*
 * Thread body that recalls one delegation.
 *
 * Invokes arg->recall on the delegation state unless the delegation
 * type has already become OPEN_DELEGATE_NONE, then decrements the
 * file's recall count and signals rd_recall_cv when the count reaches
 * zero.  Before exiting, the thread releases the hold on the
 * delegation state that was taken for it and frees arg.
 */
static void
do_recall(struct recall_arg *arg)
{
        rfs4_deleg_state_t *dsp = arg->dsp;
        rfs4_file_t *fp = dsp->rds_finfo;
        callb_cpr_t cpr_info;
        kmutex_t cpr_lock;

        mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
        CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recall");

        /*
         * It is possible that before this thread starts
         * the client has sent us a return_delegation, and
         * if that is the case we do not need to send the
         * recall callback.
         */
        if (dsp->rds_dtype != OPEN_DELEGATE_NONE) {
                DTRACE_PROBE3(nfss__i__recall,
                    struct recall_arg *, arg,
                    struct rfs4_deleg_state_t *, dsp,
                    struct rfs4_file_t *, fp);

                if (arg->recall)
                        (void) (*arg->recall)(dsp, arg->trunc);
        }

        mutex_enter(fp->rf_dinfo.rd_recall_lock);
        /*
         * Recall count may go negative if the parent thread that is
         * creating the individual callback threads does not modify
         * the recall_count field before the callback thread actually
         * gets a response from the CB_RECALL
         */
        fp->rf_dinfo.rd_recall_count--;
        if (fp->rf_dinfo.rd_recall_count == 0)
                cv_signal(fp->rf_dinfo.rd_recall_cv);
        mutex_exit(fp->rf_dinfo.rd_recall_lock);

        /*
         * CALLB_CPR_EXIT() is entered with cpr_lock held.
         * NOTE(review): presumably the macro drops the lock, since
         * mutex_destroy() follows immediately -- confirm against the
         * callb definitions.
         */
        mutex_enter(&cpr_lock);
        CALLB_CPR_EXIT(&cpr_info);
        mutex_destroy(&cpr_lock);

        rfs4_deleg_state_rele(dsp); /* release the hold for this thread */
        kmem_free(arg, sizeof (struct recall_arg));
        zthread_exit();
}

/*
 * Argument block handed to the do_recall_file() thread.  It is allocated
 * by rfs4_recall_file() and freed by do_recall_file().
 */
struct master_recall_args {
    /* file whose delegations are recalled; held for the thread */
    rfs4_file_t *fp;
    /* recall routine passed on to every do_recall() thread */
    void (*recall)(rfs4_deleg_state_t *, bool_t);
    /* passed on unchanged to every do_recall() thread */
    bool_t trunc;
};

/*
 * Thread body that recalls every delegation on one file.
 *
 * If a recall is already in progress (the recall count is non-zero),
 * the hold on the file is dropped, map is freed and the thread exits.
 * Otherwise one do_recall() thread is started for each valid entry of
 * the file's delegation state list, and this thread then waits on
 * rd_recall_cv until the recall count is back to zero before dropping
 * the hold on the file, freeing map and exiting.
 */
static void
do_recall_file(struct master_recall_args *map)
{
        rfs4_file_t *fp = map->fp;
        rfs4_deleg_state_t *dsp;
        struct recall_arg *arg;
        callb_cpr_t cpr_info;
        kmutex_t cpr_lock;
        int32_t recall_count;

        rfs4_dbe_lock(fp->rf_dbe);

        /* Recall already in progress ? */
        mutex_enter(fp->rf_dinfo.rd_recall_lock);
        if (fp->rf_dinfo.rd_recall_count != 0) {
                mutex_exit(fp->rf_dinfo.rd_recall_lock);
                /* drop the hold taken for this thread; dbe lock is held */
                rfs4_dbe_rele_nolock(fp->rf_dbe);
                rfs4_dbe_unlock(fp->rf_dbe);
                kmem_free(map, sizeof (struct master_recall_args));
                zthread_exit();
        }

        mutex_exit(fp->rf_dinfo.rd_recall_lock);

        mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
        CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "v4RecallFile");

        /*
         * Walk the delegation state list with the file's dbe lock held,
         * starting one recall thread per valid entry.
         */
        recall_count = 0;
        for (dsp = list_head(&fp->rf_delegstatelist); dsp != NULL;
            dsp = list_next(&fp->rf_delegstatelist, dsp)) {

                rfs4_dbe_lock(dsp->rds_dbe);
                /*
                 * if this delegation state
                 * is being reaped skip it
                 */
                if (rfs4_dbe_is_invalid(dsp->rds_dbe)) {
                        rfs4_dbe_unlock(dsp->rds_dbe);
                        continue;
                }

                /* hold for receiving thread */
                rfs4_dbe_hold(dsp->rds_dbe);
                rfs4_dbe_unlock(dsp->rds_dbe);

                /* freed by do_recall() once it is done with it */
                arg = kmem_alloc(sizeof (struct recall_arg), KM_SLEEP);
                arg->recall = map->recall;
                arg->trunc = map->trunc;
                arg->dsp = dsp;

                recall_count++;

                (void) zthread_create(NULL, 0, do_recall, arg, 0,
                    minclsyspri);
        }

        rfs4_dbe_unlock(fp->rf_dbe);

        mutex_enter(fp->rf_dinfo.rd_recall_lock);
        /*
         * Recall count may go negative if the parent thread that is
         * creating the individual callback threads does not modify
         * the recall_count field before the callback thread actually
         * gets a response from the CB_RECALL
         *
         * Adding the number of threads started here brings the count
         * back in line; do_recall() signals rd_recall_cv when its
         * decrement takes the count to zero.
         */
        fp->rf_dinfo.rd_recall_count += recall_count;
        while (fp->rf_dinfo.rd_recall_count)
                cv_wait(fp->rf_dinfo.rd_recall_cv, fp->rf_dinfo.rd_recall_lock);

        mutex_exit(fp->rf_dinfo.rd_recall_lock);

        DTRACE_PROBE1(nfss__i__recall_done, rfs4_file_t *, fp);
        rfs4_file_rele(fp);     /* drop the hold taken for this thread */
        kmem_free(map, sizeof (struct master_recall_args));
        /* CALLB_CPR_EXIT() is entered with cpr_lock held */
        mutex_enter(&cpr_lock);
        CALLB_CPR_EXIT(&cpr_info);
        mutex_destroy(&cpr_lock);
        zthread_exit();
}

/*
 * Kick off an asynchronous recall of the delegation(s) held on "fp".
 *
 * If the file is not currently delegated this is a no-op.  Otherwise the
 * recall bookkeeping in fp->rf_dinfo is updated under the file's entry
 * lock, an extra hold is taken on the file for the worker, and a
 * do_recall_file thread is created with "fp", "recall" and "trunc"
 * packaged as its argument.
 *
 * "cp" identifies the client whose request conflicted with the
 * delegation; it may be NULL, in which case rd_conflicted_client is
 * left untouched.
 */
static void
rfs4_recall_file(rfs4_file_t *fp,
    void (*recall)(rfs4_deleg_state_t *, bool_t trunc),
    bool_t trunc, rfs4_client_t *cp)
{
        struct master_recall_args *mra;
        rfs4_dinfo_t *dinfo = &fp->rf_dinfo;

        rfs4_dbe_lock(fp->rf_dbe);

        /* Nothing to recall when the file is not delegated. */
        if (dinfo->rd_dtype == OPEN_DELEGATE_NONE) {
                rfs4_dbe_unlock(fp->rf_dbe);
                return;
        }

        /* This reference is handed to the thread created below. */
        rfs4_dbe_hold(fp->rf_dbe);

        /*
         * Only stamp the start of recall processing the first time
         * through; the revocation decision is based on this value, so
         * a repeated recall must not restart the clock.
         */
        if (dinfo->rd_time_recalled == 0)
                dinfo->rd_time_recalled = gethrestime_sec();

        /* Remembered for the delegation policy decision. */
        dinfo->rd_ever_recalled = TRUE;

        /* The client that triggered the recall is not always known. */
        if (cp != NULL)
                dinfo->rd_conflicted_client = cp->rc_clientid;

        rfs4_dbe_unlock(fp->rf_dbe);

        mra = kmem_alloc(sizeof (struct master_recall_args), KM_SLEEP);
        mra->fp = fp;
        mra->recall = recall;
        mra->trunc = trunc;

        (void) zthread_create(NULL, 0, do_recall_file, mra, 0,
            minclsyspri);
}

/*
 * Recall (or, if a previous recall has gone unanswered for too long,
 * revoke) the delegation(s) on "fp".
 *
 * When no recall is outstanding (rd_time_recalled == 0) a recall is
 * started immediately.  Otherwise:
 *  - if both the time since the recall started and the time since
 *    rd_time_lastwrite exceed rfs4_lease_time, the delegations are
 *    revoked;
 *  - else if the previous recall started no more than 20% of a lease
 *    period ago, nothing is done;
 *  - else the recall is issued again.
 *
 * "trunc" and "cp" are passed through to rfs4_recall_file().
 */
void
rfs4_recall_deleg(rfs4_file_t *fp, bool_t trunc, rfs4_client_t *cp)
{
        time_t since_recall, since_write;

        /* No recall outstanding: simply start one. */
        if (fp->rf_dinfo.rd_time_recalled == 0) {
                rfs4_recall_file(fp, rfs4_do_cb_recall, trunc, cp);
                return;
        }

        since_recall = gethrestime_sec() - fp->rf_dinfo.rd_time_recalled;
        since_write = gethrestime_sec() - fp->rf_dinfo.rd_time_lastwrite;

        /* The client has had its chance; take the delegation away. */
        if (since_recall > rfs4_lease_time &&
            since_write > rfs4_lease_time) {
                rfs4_revoke_file(fp);
                return;
        }

        /* Avoid re-sending a recall that was issued only moments ago. */
        if (since_recall > ((rfs4_lease_time * 20) / 100))
                rfs4_recall_file(fp, rfs4_do_cb_recall, trunc, cp);
}

/*
 * rfs4_check_recall is called from rfs4_do_open to decide whether the
 * current open conflicts with an existing delegation on the file.
 *
 * Returns TRUE when a recall is needed, FALSE otherwise:
 *  - no delegation:    never a conflict;
 *  - read delegation:  a conflict unless "access" is exactly
 *                      OPEN4_SHARE_ACCESS_READ;
 *  - write delegation: a conflict when rfs4_is_deleg() reports that a
 *                      different client holds the delegation.
 *
 * Assumes the entry locks for sp and sp->rs_finfo are held.
 */
bool_t
rfs4_check_recall(rfs4_state_t *sp, uint32_t access)
{
        open_delegation_type4 dtype = sp->rs_finfo->rf_dinfo.rd_dtype;

        if (dtype == OPEN_DELEGATE_READ) {
                /* A pure read open can coexist with a read delegation. */
                return (access == OPEN4_SHARE_ACCESS_READ ? FALSE : TRUE);
        }

        if (dtype == OPEN_DELEGATE_WRITE) {
                /* Only a conflict if some other client holds it. */
                return (rfs4_is_deleg(sp));
        }

        /* OPEN_DELEGATE_NONE (or anything unexpected): nothing to do. */
        return (FALSE);
}

/*
 * Return the "best" allowable delegation available given the file's
 * current delegation type and the share access/deny modes recorded in
 * "sp".  At the point that this routine is called we know that the
 * access and deny modes are consistent with the file modes.
 */
static open_delegation_type4
rfs4_check_delegation(rfs4_state_t *sp, rfs4_file_t *fp)
{
        open_delegation_type4 current = fp->rf_dinfo.rd_dtype;
        uint32_t access = sp->rs_share_access;
        uint32_t deny = sp->rs_share_deny;
        int my_readers, my_writers;

        if (current == OPEN_DELEGATE_WRITE)
                return (OPEN_DELEGATE_WRITE);

        if (current == OPEN_DELEGATE_READ) {
                /*
                 * The file is delegated for read; if we want to write
                 * or to deny others read access we can't delegate it.
                 * We shouldn't get here in that case since the
                 * delegation should have been recalled already.
                 */
                if ((access & OPEN4_SHARE_ACCESS_WRITE) ||
                    (deny & OPEN4_SHARE_DENY_READ))
                        return (OPEN_DELEGATE_NONE);
                return (OPEN_DELEGATE_READ);
        }

        /* Shouldn't get here with any other delegation type */
        if (current != OPEN_DELEGATE_NONE)
                return (OPEN_DELEGATE_NONE);

        /*
         * The file is not delegated.  Work out how many read and write
         * opens this OPEN itself accounts for; if the file's counters
         * show more than that, someone else has it open and no
         * delegation may be provided to the client.
         */
        my_writers = (access & OPEN4_SHARE_ACCESS_WRITE) ? 1 : 0;
        my_readers = (access & OPEN4_SHARE_ACCESS_READ) ? 1 : 0;

        if (fp->rf_access_read > my_readers ||
            fp->rf_access_write > my_writers)
                return (OPEN_DELEGATE_NONE);

        /*
         * If the client is going to write, or if the client has
         * exclusive access, return a write delegation.
         */
        if ((access & OPEN4_SHARE_ACCESS_WRITE) ||
            (deny & (OPEN4_SHARE_DENY_READ | OPEN4_SHARE_DENY_WRITE)))
                return (OPEN_DELEGATE_WRITE);

        /*
         * If we don't want to write or we haven't denied read access
         * to others, return a read delegation.
         */
        if ((access & ~OPEN4_SHARE_ACCESS_WRITE) ||
            (deny & ~OPEN4_SHARE_DENY_READ))
                return (OPEN_DELEGATE_READ);

        /* Shouldn't get here */
        return (OPEN_DELEGATE_NONE);
}

/*
 * Given the desired delegation type and the "history" of the file
 * determine the actual delegation type to return.
 *
 * nsrv4  - server instance whose nfs4_deleg_policy is consulted
 * dtype  - delegation type the caller would like to grant
 * dinfo  - delegation history of the file
 * cid    - clientid of the client requesting the open
 *
 * Returns either "dtype" unchanged or OPEN_DELEGATE_NONE.
 */
static open_delegation_type4
rfs4_delegation_policy(nfs4_srv_t *nsrv4, open_delegation_type4 dtype,
    rfs4_dinfo_t *dinfo, clientid4 cid)
{
        /* Anything other than the normal policy means "don't delegate". */
        if (nsrv4->nfs4_deleg_policy != SRV_NORMAL_DELEGATE)
                return (OPEN_DELEGATE_NONE);

        /*
         * If this file has never been recalled there is no delegation
         * race to worry about.  If it has, and the recall was caused by
         * some client other than the one asking now, refuse to delegate
         * while the previous delegation was returned less than a lease
         * period ago.  This avoids a "delegation race" between the
         * original client and the new/conflicting client.
         */
        if (dinfo->rd_ever_recalled == TRUE &&
            dinfo->rd_conflicted_client != cid) {
                time_t since_return;

                since_return = gethrestime_sec() - dinfo->rd_time_returned;
                if (since_return < rfs4_lease_time)
                        return (OPEN_DELEGATE_NONE);
        }

        /*
         * Limit the number of read grants.  Note the comparison is
         * strict: a read delegation is refused only once rd_rdgrants
         * already exceeds MAX_READ_DELEGATIONS.
         */
        if (dtype == OPEN_DELEGATE_READ &&
            dinfo->rd_rdgrants > MAX_READ_DELEGATIONS)
                return (OPEN_DELEGATE_NONE);

        /*
         * Should consider limiting total number of read/write
         * delegations the server will permit.
         */
        return (dtype);
}

/*
 * Try to grant a delegation for an open, given the open state "sp".
 *
 * Returns the delegation state that was set up by rfs4_deleg_state(),
 * or NULL when no delegation is granted.
 *
 * dreq    - what the caller is asking for: DELEG_NONE, DELEG_READ or
 *           DELEG_WRITE (the open-previous case, where the server "must"
 *           grant that type) or DELEG_ANY (server's choice).
 * recall  - in/out flag handed on to rfs4_deleg_state(); it is set to 1
 *           here for DELEG_READ/DELEG_WRITE requests.
 *
 * The state and associated file entry must be locked.
 */
rfs4_deleg_state_t *
rfs4_grant_delegation(delegreq_t dreq, rfs4_state_t *sp, int *recall)
{
        rfs4_file_t *fp = sp->rs_finfo;
        rfs4_client_t *cp;
        nfs4_srv_t *nsrv4;
        open_delegation_type4 want;
        int disabled;

        ASSERT(rfs4_dbe_islocked(sp->rs_dbe));
        ASSERT(rfs4_dbe_islocked(fp->rf_dbe));

        nsrv4 = nfs4_get_srv();

        /* Is the server even providing delegations? */
        if (nsrv4->nfs4_deleg_policy == SRV_NEVER_DELEGATE ||
            dreq == DELEG_NONE)
                return (NULL);

        /* Check to see if delegations have been temporarily disabled */
        mutex_enter(&nsrv4->deleg_lock);
        disabled = rfs4_deleg_disabled;
        mutex_exit(&nsrv4->deleg_lock);
        if (disabled)
                return (NULL);

        /* Don't grant a delegation if a deletion is impending. */
        if (fp->rf_dinfo.rd_hold_grant > 0)
                return (NULL);

        /*
         * Don't grant a delegation if there are any lock manager
         * (NFSv2/v3) locks for the file.  This is a bit of a hack (e.g.,
         * if there are only read locks we should be able to grant a
         * read-only delegation), but it's good enough for now.
         *
         * MT safety: the lock manager checks for conflicting delegations
         * before processing a lock request.  That check will block until
         * we are done here.  So if the lock manager acquires a lock after
         * we decide to grant the delegation, the delegation will get
         * immediately recalled (if there's a conflict), so we're safe.
         */
        if (lm_vp_active(fp->rf_vp))
                return (NULL);

        /*
         * Open-previous: the client is reclaiming a delegation it held,
         * so the server "must" grant the requested type.
         */
        if (dreq == DELEG_READ || dreq == DELEG_WRITE) {
                want = (open_delegation_type4)dreq;
                *recall = 1;
                return (rfs4_deleg_state(sp, want, recall));
        }

        /* DELEG_NONE was handled above; anything else is unknown. */
        if (dreq != DELEG_ANY)
                return (NULL);

        /*
         * If a valid callback path does not exist, no delegation may
         * be granted.
         */
        cp = sp->rs_owner->ro_client;
        if (cp->rc_cbinfo.cb_state != CB_OK)
                return (NULL);

        /*
         * If the original operation which caused time_rm_delayed
         * to be set hasn't been retried and completed for one
         * full lease period, clear it and allow delegations to
         * get granted again.
         */
        if (fp->rf_dinfo.rd_time_rm_delayed > 0 &&
            gethrestime_sec() >
            fp->rf_dinfo.rd_time_rm_delayed + rfs4_lease_time)
                fp->rf_dinfo.rd_time_rm_delayed = 0;

        /*
         * If we are waiting for a delegation to be returned then
         * don't delegate this file.  We do this for correctness, and
         * also because a file that is being recalled would likely be
         * recalled again.
         */
        if (fp->rf_dinfo.rd_time_recalled != 0 ||
            fp->rf_dinfo.rd_time_rm_delayed != 0)
                return (NULL);

        /* Get the "best" delegation candidate */
        want = rfs4_check_delegation(sp, fp);
        if (want == OPEN_DELEGATE_NONE)
                return (NULL);

        /*
         * Based on policy and the history of the file get the
         * actual delegation.
         */
        want = rfs4_delegation_policy(nsrv4, want, &fp->rf_dinfo,
            cp->rc_clientid);
        if (want == OPEN_DELEGATE_NONE)
                return (NULL);

        /* set the delegation for the state */
        return (rfs4_deleg_state(sp, want, recall));
}

/*
 * Fill in the open_delegation4 part of an OPEN response from the
 * delegation state "dsp".
 *
 * dp->delegation_type is always set from dsp->rds_dtype.  For a read or
 * write delegation the matching union arm receives the delegation
 * stateid, the "recall" flag and a permissions ACE; a write delegation
 * additionally gets a space limit of NFS_LIMIT_SIZE with filesize 0.
 * For any other delegation type nothing else is written and nothing is
 * allocated.
 *
 * ace    - ACE to copy into the response, or NULL to use the default
 *          "deny everyone" ACE.
 * recall - value stored (as a bool_t) in the response's recall field.
 *
 * We need to allocate a new copy of the who string.  This string will
 * be freed by the rfs4_op_open dis_resfree routine.  We need to do this
 * allocation since replays will be allocated and rfs4_compound can't
 * tell the difference between a replay and an initial open.  N.B. if an
 * ace is passed in, it is the caller's responsibility to free it.
 */
void
rfs4_set_deleg_response(rfs4_deleg_state_t *dsp, open_delegation4 *dp,
    nfsace4 *ace, int recall)
{
        open_delegation_type4 dtype = dsp->rds_dtype;
        open_write_delegation4 *wp;
        open_read_delegation4 *rp;
        nfs_space_limit4 *spl;
        nfsace4 nace;

        dp->delegation_type = dtype;

        /*
         * Only read and write delegations carry an ACE.  Bail out
         * before allocating the who string for anything else; the copy
         * would not be attached to the response and so would never be
         * freed.
         */
        if (dtype != OPEN_DELEGATE_READ && dtype != OPEN_DELEGATE_WRITE)
                return;

        if (ace == NULL) {
                /*
                 * Default is to deny all access, the client will have
                 * to contact the server.  XXX Do we want to actually
                 * set a deny for every one, or do we simply want to
                 * construct an entity that will match no one?
                 */
                nace.type = ACE4_ACCESS_DENIED_ACE_TYPE;
                nace.flag = 0;
                nace.access_mask = ACE4_VALID_MASK_BITS;
                (void) str_to_utf8(ACE4_WHO_EVERYONE, &nace.who);
        } else {
                nace.type = ace->type;
                nace.flag = ace->flag;
                nace.access_mask = ace->access_mask;
                (void) utf8_copy(&ace->who, &nace.who);
        }

        if (dtype == OPEN_DELEGATE_READ) {
                rp = &dp->open_delegation4_u.read;
                rp->stateid = dsp->rds_delegid.stateid;
                rp->recall = (bool_t)recall;
                rp->permissions = nace;
        } else {
                /* OPEN_DELEGATE_WRITE */
                wp = &dp->open_delegation4_u.write;
                wp->stateid = dsp->rds_delegid.stateid;
                wp->recall = (bool_t)recall;
                spl = &wp->space_limit;
                spl->limitby = NFS_LIMIT_SIZE;
                spl->nfs_space_limit4_u.filesize = 0;
                wp->permissions = nace;
        }
}

/*
 * Check if the file is delegated via the provided file struct.
 * Return TRUE if it is delegated in a conflicting way.  This is
 * intended for use by the v4 server.  The v2/v3 server code should use
 * rfs4_check_delegated().
 *
 * A conflict exists when the file is delegated and either "mode" is
 * FWRITE or the delegation is a write delegation.  On a conflict the
 * delegation is recalled, unless "cp" is non-NULL and either the
 * file's delegation list is empty or the first delegation on it belongs
 * to the clientid "*cp"; in those two cases FALSE is returned at once.
 *
 * do_delay - when set, wait NFS4_DELEGATION_CONFLICT_DELAY after
 *            issuing the recall in the hope that the delegation is
 *            returned quickly, and report TRUE only if it is still
 *            outstanding.  When clear, TRUE is returned right after
 *            the recall is issued.
 * is_rm    - when set and FALSE is returned because the file is not
 *            (or no longer) delegated in a conflicting way,
 *            rd_hold_grant is incremented.
 * trunc    - passed through to rfs4_recall_deleg().
 *
 * Whenever TRUE is returned rd_time_rm_delayed has been set to the
 * current time.
 */
bool_t
rfs4_check_delegated_byfp(int mode, rfs4_file_t *fp,
    bool_t trunc, bool_t do_delay, bool_t is_rm, clientid4 *cp)
{
        nfs4_srv_t *nsrv4 = nfs4_get_srv();
        rfs4_deleg_state_t *first;
        bool_t conflict;

        /* Is delegation enabled? */
        if (nsrv4->nfs4_deleg_policy == SRV_NEVER_DELEGATE)
                return (FALSE);

        rfs4_dbe_lock(fp->rf_dbe);

        /*
         * There is a conflict if the file is delegated and either a
         * write delegation exists or write access is being requested
         * against any type of existing delegation.
         */
        conflict = (fp->rf_dinfo.rd_dtype != OPEN_DELEGATE_NONE &&
            (mode == FWRITE ||
            fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_WRITE));

        if (conflict) {
                if (cp != NULL) {
                        first = list_head(&fp->rf_delegstatelist);
                        /*
                         * Nothing on the list, or the requestor already
                         * owns the delegation: no conflict to report.
                         */
                        if (first == NULL ||
                            first->rds_client->rc_clientid == *(cp)) {
                                rfs4_dbe_unlock(fp->rf_dbe);
                                return (FALSE);
                        }
                }

                /* The recall must be started without the entry lock. */
                rfs4_dbe_unlock(fp->rf_dbe);
                rfs4_recall_deleg(fp, trunc, NULL);

                if (do_delay)
                        delay(NFS4_DELEGATION_CONFLICT_DELAY);

                rfs4_dbe_lock(fp->rf_dbe);

                /*
                 * Without a delay the conflict is reported
                 * unconditionally; with one, only if the delegation
                 * has not been returned in the meantime.
                 */
                if (!do_delay ||
                    fp->rf_dinfo.rd_dtype != OPEN_DELEGATE_NONE) {
                        fp->rf_dinfo.rd_time_rm_delayed = gethrestime_sec();
                        rfs4_dbe_unlock(fp->rf_dbe);
                        return (TRUE);
                }
        }

        /* Entry lock is held here on every path. */
        if (is_rm)
                fp->rf_dinfo.rd_hold_grant++;
        rfs4_dbe_unlock(fp->rf_dbe);
        return (FALSE);
}

/*
 * Check if the file is delegated in the case of a v2 or v3 access.
 * Return TRUE if it is delegated which in turn means that v2 should
 * drop the request and in the case of v3 JUKEBOX should be returned.
 *
 * The delegation policy is held for the duration of the check.  The
 * file is looked up without creating it, and the check itself is done
 * by rfs4_check_delegated_byfp() with the delay enabled, is_rm clear
 * and no requesting clientid.
 */
bool_t
rfs4_check_delegated(int mode, vnode_t *vp, bool_t trunc)
{
        nfs4_srv_t *nsrv4 = nfs4_get_srv();
        rfs4_file_t *fp = NULL;
        bool_t create = FALSE;
        bool_t delegated = FALSE;

        rfs4_hold_deleg_policy(nsrv4);

        /* Only bother looking the file up if delegation is enabled. */
        if (nsrv4->nfs4_deleg_policy != SRV_NEVER_DELEGATE)
                fp = rfs4_findfile(vp, NULL, &create);

        if (fp != NULL) {
                if (rfs4_check_delegated_byfp(mode, fp, trunc,
                    TRUE, FALSE, NULL))
                        delegated = TRUE;
                rfs4_file_rele(fp);
        }

        rfs4_rele_deleg_policy(nsrv4);
        return (delegated);
}

/*
 * Drop one hold on the file's rd_hold_grant counter, which prevents a
 * delegation from being granted while a remove or a rename is in
 * progress, and clear rd_time_rm_delayed.
 *
 * Does nothing when the server's policy is SRV_NEVER_DELEGATE.
 */
void
rfs4_clear_dont_grant(rfs4_file_t *fp)
{
        nfs4_srv_t *nsrv4 = nfs4_get_srv();
        rfs4_dinfo_t *dinfo = &fp->rf_dinfo;

        if (nsrv4->nfs4_deleg_policy != SRV_NEVER_DELEGATE) {
                rfs4_dbe_lock(fp->rf_dbe);
                ASSERT(dinfo->rd_hold_grant > 0);
                dinfo->rd_hold_grant--;
                dinfo->rd_time_rm_delayed = 0;
                rfs4_dbe_unlock(fp->rf_dbe);
        }
}

/*
 * State support for delegation.
 * Set the state delegation type for this state;
 * This routine is called from open via rfs4_grant_delegation and the entry
 * locks on sp and sp->rs_finfo are assumed.
 *
 * sp     - open state the delegation is being granted for
 * dtype  - delegation type to grant (read or write)
 * recall - in/out.  On entry a non-zero value means the caller is in the
 *          "open previous" case, where a conflicting open does not stop
 *          the grant.  Once the conflict checks are reached it is reset
 *          to 0 and then set to 1 if a conflict was found in the open
 *          previous case.  It is left untouched on the earlier return
 *          paths.
 *
 * Returns the delegation state on success or NULL if no delegation can
 * be granted.  Both entry locks are dropped around the call to
 * rfs4_finddeleg() and are held again on every return path.
 */
static rfs4_deleg_state_t *
rfs4_deleg_state(rfs4_state_t *sp, open_delegation_type4 dtype, int *recall)
{
        rfs4_file_t *fp = sp->rs_finfo;
        bool_t create = TRUE;
        rfs4_deleg_state_t *dsp;
        vnode_t *vp;
        int open_prev = *recall;        /* caller's "open previous" flag */
        int ret;
        int fflags = 0;

        ASSERT(rfs4_dbe_islocked(sp->rs_dbe));
        ASSERT(rfs4_dbe_islocked(fp->rf_dbe));

        /*
         * Shouldn't happen: a recall is in progress, or the file is
         * read delegated and something other than a read delegation is
         * being requested.
         */
        if (fp->rf_dinfo.rd_recall_count != 0 ||
            (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_READ &&
            dtype != OPEN_DELEGATE_READ)) {
                return (NULL);
        }

        /* Unlock to avoid deadlock */
        rfs4_dbe_unlock(fp->rf_dbe);
        rfs4_dbe_unlock(sp->rs_dbe);

        /* "create" is TRUE going in; it is tested against FALSE below */
        dsp = rfs4_finddeleg(sp, &create);

        /* Retake the locks in the order state, then file */
        rfs4_dbe_lock(sp->rs_dbe);
        rfs4_dbe_lock(fp->rf_dbe);

        if (dsp == NULL)
                return (NULL);

        /*
         * It is possible that since we dropped the lock
         * in order to call finddeleg, the rfs4_file_t
         * was marked such that we should not grant a
         * delegation, if so bail out.
         */
        if (fp->rf_dinfo.rd_hold_grant > 0) {
                rfs4_deleg_state_rele(dsp);
                return (NULL);
        }

        /*
         * "create" was cleared, so this is presumably a delegation state
         * that already existed rather than a new one.  Hand it back only
         * if it belongs to this client and is of the requested type.
         */
        if (create == FALSE) {
                if (sp->rs_owner->ro_client == dsp->rds_client &&
                    dsp->rds_dtype == dtype) {
                        return (dsp);
                } else {
                        rfs4_deleg_state_rele(dsp);
                        return (NULL);
                }
        }

        /*
         * Check that this file has not been delegated to another
         * client.  The file state is examined again because the locks
         * were dropped above.
         */
        if (fp->rf_dinfo.rd_recall_count != 0 ||
            fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_WRITE ||
            (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_READ &&
            dtype != OPEN_DELEGATE_READ)) {
                rfs4_deleg_state_rele(dsp);
                return (NULL);
        }

        vp = fp->rf_vp;
        /* vnevent_support returns 0 if file system supports vnevents */
        if (vnevent_support(vp, NULL)) {
                rfs4_deleg_state_rele(dsp);
                return (NULL);
        }

        /* Calculate the fflags for this OPEN. */
        if (sp->rs_share_access & OPEN4_SHARE_ACCESS_READ)
                fflags |= FREAD;
        if (sp->rs_share_access & OPEN4_SHARE_ACCESS_WRITE)
                fflags |= FWRITE;

        *recall = 0;
        /*
         * Before granting a delegation we need to know if anyone else has
         * opened the file in a conflicting mode.  However, first we need to
         * know how we opened the file to check the counts properly.
         *
         * The same conflict test is made twice in each branch below: once
         * before the monitor is installed and once after.
         */
        if (dtype == OPEN_DELEGATE_READ) {
                /* A read delegation conflicts with any other writer */
                if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
                    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
                    vn_is_mapped(vp, V_WRITE)) {
                        if (open_prev) {
                                *recall = 1;
                        } else {
                                rfs4_deleg_state_rele(dsp);
                                return (NULL);
                        }
                }
                ret = fem_install(vp, deleg_rdops, (void *)fp, OPUNIQ,
                    rfs4_mon_hold, rfs4_mon_rele);
                /*
                 * Repeat the test now that the monitor is installed.
                 * NOTE(review): the uninstall below is issued whether or
                 * not the fem_install() call above returned 0; confirm
                 * that this is intended when the monitor was already
                 * installed.
                 */
                if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
                    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
                    vn_is_mapped(vp, V_WRITE)) {
                        if (open_prev) {
                                *recall = 1;
                        } else {
                                (void) fem_uninstall(vp, deleg_rdops,
                                    (void *)fp);
                                rfs4_deleg_state_rele(dsp);
                                return (NULL);
                        }
                }
                /*
                 * Because a client can hold onto a delegation after the
                 * file has been closed, we need to keep track of the
                 * access to this file.  Otherwise the CIFS server would
                 * not know about the client accessing the file and could
                 * inappropriately grant an OPLOCK.
                 * fem_install() returns EBUSY when asked to install a
                 * OPUNIQ monitor more than once.  Therefore, check the
                 * return code because we only want this done once.
                 */
                if (ret == 0)
                        vn_open_upgrade(vp, FREAD);
        } else { /* WRITE */
                /* A write delegation conflicts with any other open or map */
                if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
                    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
                    ((fflags & FREAD) && vn_has_other_opens(vp, V_READ)) ||
                    (((fflags & FREAD) == 0) && vn_is_opened(vp, V_READ)) ||
                    vn_is_mapped(vp, V_RDORWR)) {
                        if (open_prev) {
                                *recall = 1;
                        } else {
                                rfs4_deleg_state_rele(dsp);
                                return (NULL);
                        }
                }
                ret = fem_install(vp, deleg_wrops, (void *)fp, OPUNIQ,
                    rfs4_mon_hold, rfs4_mon_rele);
                /*
                 * Repeat the test now that the monitor is installed.
                 * NOTE(review): as in the read case, the uninstall below
                 * does not depend on the fem_install() return value.
                 */
                if (((fflags & FWRITE) && vn_has_other_opens(vp, V_WRITE)) ||
                    (((fflags & FWRITE) == 0) && vn_is_opened(vp, V_WRITE)) ||
                    ((fflags & FREAD) && vn_has_other_opens(vp, V_READ)) ||
                    (((fflags & FREAD) == 0) && vn_is_opened(vp, V_READ)) ||
                    vn_is_mapped(vp, V_RDORWR)) {
                        if (open_prev) {
                                *recall = 1;
                        } else {
                                (void) fem_uninstall(vp, deleg_wrops,
                                    (void *)fp);
                                rfs4_deleg_state_rele(dsp);
                                return (NULL);
                        }
                }
                /*
                 * Because a client can hold onto a delegation after the
                 * file has been closed, we need to keep track of the
                 * access to this file.  Otherwise the CIFS server would
                 * not know about the client accessing the file and could
                 * inappropriately grant an OPLOCK.
                 * fem_install() returns EBUSY when asked to install a
                 * OPUNIQ monitor more than once.  Therefore, check the
                 * return code because we only want this done once.
                 */
                if (ret == 0)
                        vn_open_upgrade(vp, FREAD|FWRITE);
        }
        /* Place on delegation list for file */
        ASSERT(!list_link_active(&dsp->rds_node));
        list_insert_tail(&fp->rf_delegstatelist, dsp);

        /* Record the granted type on both the state and the file */
        dsp->rds_dtype = fp->rf_dinfo.rd_dtype = dtype;

        /* Update delegation stats for this file */
        fp->rf_dinfo.rd_time_lastgrant = gethrestime_sec();

        /* reset since this is a new delegation */
        fp->rf_dinfo.rd_conflicted_client = 0;
        fp->rf_dinfo.rd_ever_recalled = FALSE;

        /* Per-type grant counts; rfs4_return_deleg() decrements them */
        if (dtype == OPEN_DELEGATE_READ)
                fp->rf_dinfo.rd_rdgrants++;
        else
                fp->rf_dinfo.rd_wrgrants++;

        return (dsp);
}

/*
 * State routine for the server when a delegation is returned.
 *
 * Takes "dsp" off its file's delegation list (doing nothing if it is
 * no longer on the list), updates the file's delegation bookkeeping and
 * then invalidates the delegation state.  When the last delegation on
 * the file goes away the file is marked undelegated, waiters on the
 * file entry are woken and the vnode monitor is removed.
 *
 * "revoked" is TRUE when the server is taking the delegation away
 * rather than the client returning it; in that case the revocation
 * time is recorded on dsp and the client's rc_deleg_revoked counter is
 * bumped.
 */
void
rfs4_return_deleg(rfs4_deleg_state_t *dsp, bool_t revoked)
{
        rfs4_file_t *fp = dsp->rds_finfo;
        rfs4_dinfo_t *dinfo = &fp->rf_dinfo;
        open_delegation_type4 prev_type;

        rfs4_dbe_lock(fp->rf_dbe);

        /* nothing to do if no longer on list */
        if (!list_link_active(&dsp->rds_node)) {
                rfs4_dbe_unlock(fp->rf_dbe);
                return;
        }

        /* Remove state from recall list */
        list_remove(&fp->rf_delegstatelist, dsp);

        if (list_is_empty(&fp->rf_delegstatelist)) {
                prev_type = dinfo->rd_dtype;
                dinfo->rd_dtype = OPEN_DELEGATE_NONE;
                rfs4_dbe_cv_broadcast(fp->rf_dbe);

                /*
                 * Once a delegation is no longer held by any client,
                 * the monitor is uninstalled.  At this point, the
                 * client must send OPEN otw, so we don't need the
                 * reference on the vnode anymore.  The open downgrade
                 * removes the reference put on earlier.
                 *
                 * If the file system was unshared, the vp will be NULL
                 * and there is nothing to undo.
                 */
                if (fp->rf_vp != NULL) {
                        switch (prev_type) {
                        case OPEN_DELEGATE_READ:
                                (void) fem_uninstall(fp->rf_vp, deleg_rdops,
                                    (void *)fp);
                                vn_open_downgrade(fp->rf_vp, FREAD);
                                break;
                        case OPEN_DELEGATE_WRITE:
                                (void) fem_uninstall(fp->rf_vp, deleg_wrops,
                                    (void *)fp);
                                vn_open_downgrade(fp->rf_vp, FREAD|FWRITE);
                                break;
                        default:
                                break;
                        }
                }
        }

        /* Give back the per-type grant this delegation accounted for. */
        if (dsp->rds_dtype == OPEN_DELEGATE_READ)
                dinfo->rd_rdgrants--;
        else if (dsp->rds_dtype == OPEN_DELEGATE_WRITE)
                dinfo->rd_wrgrants--;

        /* used in the policy decision */
        dinfo->rd_time_returned = gethrestime_sec();

        /*
         * reset the time_recalled field so future delegations are not
         * accidentally revoked
         */
        if ((dinfo->rd_rdgrants + dinfo->rd_wrgrants) == 0)
                dinfo->rd_time_recalled = 0;

        rfs4_dbe_unlock(fp->rf_dbe);

        /* Now retire the delegation state itself. */
        rfs4_dbe_lock(dsp->rds_dbe);
        dsp->rds_dtype = OPEN_DELEGATE_NONE;
        if (revoked == TRUE)
                dsp->rds_time_revoked = gethrestime_sec();
        rfs4_dbe_invalidate(dsp->rds_dbe);
        rfs4_dbe_unlock(dsp->rds_dbe);

        if (revoked == TRUE) {
                rfs4_client_t *cp = dsp->rds_client;

                rfs4_dbe_lock(cp->rc_dbe);
                cp->rc_deleg_revoked++;         /* observability */
                rfs4_dbe_unlock(cp->rc_dbe);
        }
}

/*
 * Revoke every delegation currently on the file's delegation list by
 * calling rfs4_return_deleg() with revoked set to TRUE on each one.
 */
static void
rfs4_revoke_file(rfs4_file_t *fp)
{
        rfs4_deleg_state_t *victim;

        /*
         * The lock for rfs4_file_t must be held when traversing the
         * delegation list but that lock needs to be released to call
         * rfs4_return_deleg().  So always work on the head of the list:
         * take a hold on it while locked, drop the lock, return the
         * delegation (which removes it from the list), release the
         * hold and start over.
         */
        rfs4_dbe_lock(fp->rf_dbe);
        for (;;) {
                victim = list_head(&fp->rf_delegstatelist);
                if (victim == NULL)
                        break;

                rfs4_dbe_hold(victim->rds_dbe);
                rfs4_dbe_unlock(fp->rf_dbe);

                rfs4_return_deleg(victim, TRUE);
                rfs4_deleg_state_rele(victim);

                rfs4_dbe_lock(fp->rf_dbe);
        }
        rfs4_dbe_unlock(fp->rf_dbe);
}

/*
 * A delegation is assumed to be present on the file associated with
 * "sp".  Check whether every delegation on that file is associated
 * with the same client as referenced by "sp".  If any delegation
 * belongs to a different client, TRUE is returned.  If all delegations
 * DO match the client (or no delegation is present), FALSE is returned.
 * Assume the state entry and file entry are locked.
 */
bool_t
rfs4_is_deleg(rfs4_state_t *sp)
{
        rfs4_file_t *fp = sp->rs_finfo;
        rfs4_client_t *me = sp->rs_owner->ro_client;
        rfs4_deleg_state_t *cur;

        ASSERT(rfs4_dbe_islocked(fp->rf_dbe));

        cur = list_head(&fp->rf_delegstatelist);
        while (cur != NULL) {
                /* Held by somebody else: that is a conflict. */
                if (cur->rds_client != me)
                        return (TRUE);
                cur = list_next(&fp->rf_delegstatelist, cur);
        }

        return (FALSE);
}

/*
 * Temporarily disable the granting of delegations by bumping the
 * rfs4_deleg_disabled count under the server's deleg_lock.  Each call
 * is undone by one call to rfs4_enable_delegation().
 */
void
rfs4_disable_delegation(void)
{
        nfs4_srv_t *nsrv4 = nfs4_get_srv();

        mutex_enter(&nsrv4->deleg_lock);
        rfs4_deleg_disabled += 1;
        mutex_exit(&nsrv4->deleg_lock);
}

/*
 * Undo one rfs4_disable_delegation() call by dropping the
 * rfs4_deleg_disabled count under the server's deleg_lock.  The count
 * is asserted to be positive beforehand.
 */
void
rfs4_enable_delegation(void)
{
        nfs4_srv_t *nsrv4 = nfs4_get_srv();

        mutex_enter(&nsrv4->deleg_lock);
        ASSERT(rfs4_deleg_disabled > 0);
        rfs4_deleg_disabled -= 1;
        mutex_exit(&nsrv4->deleg_lock);
}

void
rfs4_mon_hold(void *arg)
{
        rfs4_file_t *fp = arg;

        rfs4_dbe_hold(fp->rf_dbe);
}

void
rfs4_mon_rele(void *arg)
{
        rfs4_file_t *fp = arg;

        rfs4_dbe_rele_nolock(fp->rf_dbe);
}