root/fs/smb/client/transport.c
// SPDX-License-Identifier: LGPL-2.1
/*
 *
 *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *   Jeremy Allison (jra@samba.org) 2006.
 *
 */

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/wait.h>
#include <linux/net.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/tcp.h>
#include <linux/bvec.h>
#include <linux/highmem.h>
#include <linux/uaccess.h>
#include <linux/processor.h>
#include <linux/mempool.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/task_work.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
#include "smb2proto.h"
#include "smbdirect.h"
#include "compress.h"

/*
 * Mid callback that wakes the task waiting on @mid.
 *
 * A mid that is still in MID_RESPONSE_RECEIVED is first promoted to
 * MID_RESPONSE_READY; any other state is left untouched. The pointer stored
 * in mid->callback_data is then passed to wake_up_process().
 * @server is not used by this function.
 */
void
cifs_wake_up_task(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        if (mid->mid_state == MID_RESPONSE_RECEIVED)
                mid->mid_state = MID_RESPONSE_READY;
        wake_up_process(mid->callback_data);
}

/*
 * __release_mid - tear down a mid entry and give it back to cifs_mid_pool
 * @server:   server the mid was issued on
 * @midEntry: mid to free
 *
 * In order, this:
 *  - hands the mid to ->handle_cancelled_mid() when a response buffer is
 *    attached, the waiter was cancelled, the mid is in a "response"
 *    state and the server ops provide that hook;
 *  - marks the mid MID_FREE, decrements mid_count and releases the response
 *    buffer with the large or small buffer helper depending on ->large_buf;
 *  - with CONFIG_CIFS_STATS2, updates the per-command round-trip statistics
 *    and reports slow responses;
 *  - calls put_task_struct() on ->creator and frees the entry.
 */
void __release_mid(struct TCP_Server_Info *server, struct mid_q_entry *midEntry)
{
#ifdef CONFIG_CIFS_STATS2
        __le16 command = server->vals->lock_cmd;
        __u16 smb_cmd = le16_to_cpu(midEntry->command);
        unsigned long now;
        unsigned long roundtrip_time;
#endif

        if (midEntry->resp_buf && (midEntry->wait_cancelled) &&
            (midEntry->mid_state == MID_RESPONSE_RECEIVED ||
             midEntry->mid_state == MID_RESPONSE_READY) &&
            server->ops->handle_cancelled_mid)
                server->ops->handle_cancelled_mid(midEntry, server);

        midEntry->mid_state = MID_FREE;
        atomic_dec(&mid_count);
        if (midEntry->large_buf)
                cifs_buf_release(midEntry->resp_buf);
        else
                cifs_small_buf_release(midEntry->resp_buf);
#ifdef CONFIG_CIFS_STATS2
        now = jiffies;
        /*
         * jiffies wraps, so a plain '<' would misreport a perfectly valid
         * allocation time taken shortly before the wrap; use the wrap-safe
         * comparison helper instead.
         */
        if (time_before(now, midEntry->when_alloc))
                cifs_server_dbg(VFS, "Invalid mid allocation time\n");
        roundtrip_time = now - midEntry->when_alloc;

        if (smb_cmd < NUMBER_OF_SMB2_COMMANDS) {
                /* first sample for this command seeds both extremes */
                if (atomic_read(&server->num_cmds[smb_cmd]) == 0) {
                        server->slowest_cmd[smb_cmd] = roundtrip_time;
                        server->fastest_cmd[smb_cmd] = roundtrip_time;
                } else {
                        if (server->slowest_cmd[smb_cmd] < roundtrip_time)
                                server->slowest_cmd[smb_cmd] = roundtrip_time;
                        else if (server->fastest_cmd[smb_cmd] > roundtrip_time)
                                server->fastest_cmd[smb_cmd] = roundtrip_time;
                }
                cifs_stats_inc(&server->num_cmds[smb_cmd]);
                server->time_per_cmd[smb_cmd] += roundtrip_time;
        }
        /*
         * commands taking longer than one second (default) can be indications
         * that something is wrong, unless it is quite a slow link or a very
         * busy server. Note that this calc is unlikely or impossible to wrap
         * as long as slow_rsp_threshold is not set way above recommended max
         * value (32767 ie 9 hours) and is generally harmless even if wrong
         * since only affects debug counters - so leaving the calc as simple
         * comparison rather than doing multiple conversions and overflow
         * checks
         *
         * The dialect's lock command (server->vals->lock_cmd) is excluded
         * from slow-response reporting.
         */
        if ((slow_rsp_threshold != 0) &&
            time_after(now, midEntry->when_alloc + (slow_rsp_threshold * HZ)) &&
            (midEntry->command != command)) {
                /*
                 * smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command
                 * NB: le16_to_cpu returns unsigned so can not be negative below
                 */
                if (smb_cmd < NUMBER_OF_SMB2_COMMANDS)
                        cifs_stats_inc(&server->smb2slowcmd[smb_cmd]);

                trace_smb3_slow_rsp(smb_cmd, midEntry->mid, midEntry->pid,
                               midEntry->when_sent, midEntry->when_received);
                if (cifsFYI & CIFS_TIMER) {
                        /*
                         * Print the CPU-endian command code: ->command is
                         * __le16 and would show up byte-swapped on big-endian
                         * hosts if passed to %d directly.
                         */
                        pr_debug("slow rsp: cmd %d mid %llu",
                                 smb_cmd, midEntry->mid);
                        cifs_info("A: 0x%lx S: 0x%lx R: 0x%lx\n",
                                  now - midEntry->when_alloc,
                                  now - midEntry->when_sent,
                                  now - midEntry->when_received);
                }
        }
#endif
        put_task_struct(midEntry->creator);

        mempool_free(midEntry, &cifs_mid_pool);
}

void
delete_mid(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        spin_lock(&server->mid_queue_lock);

        if (!mid->deleted_from_q) {
                list_del_init(&mid->qhead);
                mid->deleted_from_q = true;
        }
        spin_unlock(&server->mid_queue_lock);

        release_mid(server, mid);
}

/*
 * smb_send_kvec - push the contents of a msghdr onto the server socket
 * @server:     Server to send the data to
 * @smb_msg:    Message to send; its msg_flags are overwritten here
 * @sent:       receives the number of bytes that went out on the socket
 *
 * This is the basic "send data to server" primitive. It should be called
 * with srv_mutex held and leaves all error handling to the caller.
 *
 * Return: 0 once the whole message has been sent, -EAGAIN when the socket
 * stayed stuck for the whole retry budget, or the negative error returned by
 * sock_sendmsg().
 */
int
smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
              size_t *sent)
{
        struct socket *sock = server->ssocket;
        bool retryable;
        int stalls = 0;
        int rc = 0;

        *sent = 0;

        smb_msg->msg_flags = server->noblocksnd ?
                        MSG_DONTWAIT + MSG_NOSIGNAL : MSG_NOSIGNAL;

        while (msg_data_left(smb_msg)) {
                rc = sock_sendmsg(sock, smb_msg);

                if (rc > 0) {
                        /* at least part of the data went out */
                        *sent += rc;
                        /* start counting afresh in case the next send stalls */
                        stalls = 0;
                        continue;
                }

                if (rc == 0) {
                        /*
                         * Should never happen; giving the socket time to
                         * clear before trying again is the only obvious
                         * option.
                         */
                        cifs_server_dbg(VFS, "tcp sent no data\n");
                        msleep(500);
                        continue;
                }

                /*
                 * Retry policy: a blocking send is attempted 3 times, since
                 * each attempt can block for 5 seconds. A nonblocking send
                 * needs more attempts, with growing sleeps in between so the
                 * socket has time to clear. Either way the total wait is
                 * about 15 seconds, the same time SendReceive[2] waits for
                 * most responses (SMB writes past end of file and blocking
                 * locks excepted). NFS waits slightly longer than CIFS, but
                 * that delays detection of unresponsive servers, and 15
                 * seconds is plenty for a modern network to send a packet.
                 * When the retries are exhausted the caller will usually
                 * kill the socket and reconnect, which may clear the network
                 * problem.
                 *
                 * Even with regular signals masked, EINTR can be propagated
                 * from sk_stream_wait_memory() when TIF_NOTIFY_SIGNAL is
                 * used for task work (certain io_uring completions do that).
                 * EINTR with pending task work is therefore handled like
                 * EAGAIN to avoid unnecessary reconnects.
                 */
                retryable = rc == -EAGAIN ||
                            unlikely(rc == -EINTR && task_work_pending(current));
                if (!retryable)
                        return rc;

                stalls++;
                if (stalls >= 14 || (!server->noblocksnd && (stalls > 2))) {
                        cifs_server_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
                                 sock);
                        return -EAGAIN;
                }
                msleep(1 << stalls);
        }
        return 0;
}

unsigned long
smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst)
{
        unsigned int i;
        struct kvec *iov;
        int nvec;
        unsigned long buflen = 0;

        if (!is_smb1(server) && rqst->rq_nvec >= 2 &&
            rqst->rq_iov[0].iov_len == 4) {
                iov = &rqst->rq_iov[1];
                nvec = rqst->rq_nvec - 1;
        } else {
                iov = rqst->rq_iov;
                nvec = rqst->rq_nvec;
        }

        /* total up iov array first */
        for (i = 0; i < nvec; i++)
                buflen += iov[i].iov_len;

        buflen += iov_iter_count(&rqst->rq_iter);
        return buflen;
}

/*
 * __smb_send_rqst - put one or more requests on the wire as a single message
 * @server:   server to send to
 * @num_rqst: number of entries in @rqst
 * @rqst:     requests to send back to back
 *
 * With RDMA enabled the requests are handed to smbd_send(). Otherwise a
 * 4-byte big-endian length marker covering all requests is sent first,
 * followed by each request's kvecs and then its rq_iter data, with the socket
 * corked and all signals blocked for the duration of the send.
 *
 * A send that stops part way asks cifsd to reconnect, since the server could
 * otherwise take the next SMB for the remainder of this one.
 *
 * Return: 0 on success; -EAGAIN when there is no socket (or no RDMA
 * connection); -ERESTARTSYS when a fatal signal was already pending before
 * anything was sent; -EINTR and -EAGAIN from the send path are passed
 * through; every other failure is reported as -ECONNABORTED after a
 * reconnect has been requested.
 */
int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
                    struct smb_rqst *rqst)
{
        int rc;
        struct kvec *iov;
        int n_vec;
        unsigned int send_length = 0;
        unsigned int i, j;
        sigset_t mask, oldmask;
        size_t total_len = 0, sent, size;
        struct socket *ssocket = server->ssocket;
        struct msghdr smb_msg = {};
        __be32 rfc1002_marker;

        cifs_in_send_inc(server);
        if (cifs_rdma_enabled(server)) {
                /* return -EAGAIN when connecting or reconnecting */
                rc = -EAGAIN;
                if (server->smbd_conn)
                        rc = smbd_send(server, num_rqst, rqst);
                goto smbd_done;
        }

        rc = -EAGAIN;
        if (ssocket == NULL)
                goto out;

        rc = -ERESTARTSYS;
        if (fatal_signal_pending(current)) {
                cifs_dbg(FYI, "signal pending before send request\n");
                goto out;
        }

        rc = 0;
        /* cork the socket */
        tcp_sock_set_cork(ssocket->sk, true);

        /* the marker carries the combined length of every request */
        for (j = 0; j < num_rqst; j++)
                send_length += smb_rqst_len(server, &rqst[j]);
        rfc1002_marker = cpu_to_be32(send_length);

        /*
         * We should not allow signals to interrupt the network send because
         * any partial send will cause session reconnects thus increasing
         * latency of system calls and overload a server with unnecessary
         * requests.
         */

        sigfillset(&mask);
        sigprocmask(SIG_BLOCK, &mask, &oldmask);

        /* Generate a rfc1002 marker */
        {
                struct kvec hiov = {
                        .iov_base = &rfc1002_marker,
                        .iov_len  = 4
                };
                iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4);
                rc = smb_send_kvec(server, &smb_msg, &sent);
                if (rc < 0)
                        goto unmask;

                total_len += sent;
                /* from here on send_length includes the marker itself */
                send_length += 4;
        }

        cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length);

        for (j = 0; j < num_rqst; j++) {
                iov = rqst[j].rq_iov;
                n_vec = rqst[j].rq_nvec;

                size = 0;
                for (i = 0; i < n_vec; i++) {
                        dump_smb(iov[i].iov_base, iov[i].iov_len);
                        size += iov[i].iov_len;
                }

                iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size);

                rc = smb_send_kvec(server, &smb_msg, &sent);
                if (rc < 0)
                        goto unmask;

                total_len += sent;

                /* then whatever data the request carries in its iterator */
                if (iov_iter_count(&rqst[j].rq_iter) > 0) {
                        smb_msg.msg_iter = rqst[j].rq_iter;
                        rc = smb_send_kvec(server, &smb_msg, &sent);
                        if (rc < 0)
                                break;
                        total_len += sent;
                }
        }

unmask:
        sigprocmask(SIG_SETMASK, &oldmask, NULL);

        /*
         * If signal is pending but we have already sent the whole packet to
         * the server we need to return success status to allow a corresponding
         * mid entry to be kept in the pending requests queue thus allowing
         * to handle responses from the server by the client.
         *
         * If only part of the packet has been sent there is no need to hide
         * interrupt because the session will be reconnected anyway, so there
         * won't be any response from the server to handle.
         */

        if (signal_pending(current) && (total_len != send_length)) {
                cifs_dbg(FYI, "signal is pending after attempt to send\n");
                rc = -ERESTARTSYS;
        }

        /* uncork it */
        tcp_sock_set_cork(ssocket->sk, false);

        if ((total_len > 0) && (total_len != send_length)) {
                cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n",
                         send_length, total_len);
                /*
                 * If we have only sent part of an SMB then the next SMB could
                 * be taken as the remainder of this one. We need to kill the
                 * socket so the server throws away the partial SMB
                 */
                cifs_signal_cifsd_for_reconnect(server, false);
                trace_smb3_partial_send_reconnect(server->current_mid,
                                                  server->conn_id, server->hostname);
        }
smbd_done:
        /*
         * there's hardly any use for the layers above to know the
         * actual error code here. All they should do at this point is
         * to retry the connection and hope it goes away.
         */
        if (rc < 0 && rc != -EINTR && rc != -EAGAIN) {
                cifs_server_dbg(VFS, "Error %d sending data on socket to server\n",
                         rc);
                rc = -ECONNABORTED;
                cifs_signal_cifsd_for_reconnect(server, false);
        } else if (rc > 0)
                rc = 0;
out:
        /* every exit path balances the cifs_in_send_inc() at the top */
        cifs_in_send_dec(server);
        return rc;
}

static int
smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
              struct smb_rqst *rqst, int flags)
{
        struct smb2_transform_hdr tr_hdr;
        struct smb_rqst new_rqst[MAX_COMPOUND] = {};
        struct kvec iov = {
                .iov_base = &tr_hdr,
                .iov_len = sizeof(tr_hdr),
        };
        int rc;

        if (flags & CIFS_COMPRESS_REQ)
                return smb_compress(server, &rqst[0], __smb_send_rqst);

        if (!(flags & CIFS_TRANSFORM_REQ))
                return __smb_send_rqst(server, num_rqst, rqst);

        if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1))
                return smb_EIO1(smb_eio_trace_tx_max_compound, num_rqst);

        if (!server->ops->init_transform_rq) {
                cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n");
                return smb_EIO(smb_eio_trace_tx_need_transform);
        }

        new_rqst[0].rq_iov = &iov;
        new_rqst[0].rq_nvec = 1;

        rc = server->ops->init_transform_rq(server, num_rqst + 1,
                                            new_rqst, rqst);
        if (!rc) {
                rc = __smb_send_rqst(server, num_rqst + 1, new_rqst);
                smb3_free_compound_rqst(num_rqst, &new_rqst[1]);
        }
        return rc;
}

/*
 * wait_for_free_credits - reserve credits for a request, sleeping if needed
 * @server:      server whose credit pool is used
 * @num_credits: number of credits the request needs
 * @timeout:     maximum wait in milliseconds; a negative value selects
 *               MAX_JIFFY_OFFSET
 * @flags:       request flags; CIFS_OP_MASK selects the credit field and
 *               CIFS_TIMEOUT_MASK the blocking behaviour
 * @instance:    set to server->reconnect_instance when credits were taken,
 *               0 otherwise
 *
 * Return: 0 on success, -EAGAIN for an echo request when no credits are
 * available, -ENOENT when the server is in CifsExiting, -EBUSY when the wait
 * timed out and -ERESTARTSYS when the killable wait reported it.
 */
static int
wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
                      const int timeout, const int flags,
                      unsigned int *instance)
{
        long rc;
        int *credits;
        int optype;
        long int t;
        int scredits, in_flight;

        if (timeout < 0)
                t = MAX_JIFFY_OFFSET;
        else
                t = msecs_to_jiffies(timeout);

        optype = flags & CIFS_OP_MASK;

        *instance = 0;

        credits = server->ops->get_credits_field(server, optype);
        /* Since an echo is already inflight, no need to wait to send another */
        if (*credits <= 0 && optype == CIFS_ECHO_OP)
                return -EAGAIN;

        spin_lock(&server->req_lock);
        if ((flags & CIFS_TIMEOUT_MASK) == CIFS_NON_BLOCKING) {
                /* oplock breaks must not be held up */
                server->in_flight++;
                if (server->in_flight > server->max_in_flight)
                        server->max_in_flight = server->in_flight;
                *credits -= 1;
                *instance = server->reconnect_instance;
                /* snapshot the counters for tracing once the lock is dropped */
                scredits = *credits;
                in_flight = server->in_flight;
                spin_unlock(&server->req_lock);

                trace_smb3_nblk_credits(server->current_mid,
                                server->conn_id, server->hostname, scredits, -1, in_flight);
                cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
                                __func__, 1, scredits);

                return 0;
        }

        /*
         * Each pass through the loop is entered with req_lock held: it is
         * dropped to check tcpStatus under srv_lock and then taken again.
         */
        while (1) {
                spin_unlock(&server->req_lock);

                spin_lock(&server->srv_lock);
                if (server->tcpStatus == CifsExiting) {
                        spin_unlock(&server->srv_lock);
                        return -ENOENT;
                }
                spin_unlock(&server->srv_lock);

                spin_lock(&server->req_lock);
                if (*credits < num_credits) {
                        scredits = *credits;
                        spin_unlock(&server->req_lock);

                        cifs_num_waiters_inc(server);
                        rc = wait_event_killable_timeout(server->request_q,
                                has_credits(server, credits, num_credits), t);
                        cifs_num_waiters_dec(server);
                        /* a zero return is treated as the timeout expiring */
                        if (!rc) {
                                spin_lock(&server->req_lock);
                                scredits = *credits;
                                in_flight = server->in_flight;
                                spin_unlock(&server->req_lock);

                                trace_smb3_credit_timeout(server->current_mid,
                                                server->conn_id, server->hostname, scredits,
                                                num_credits, in_flight);
                                cifs_server_dbg(VFS, "wait timed out after %d ms\n",
                                                timeout);
                                return -EBUSY;
                        }
                        if (rc == -ERESTARTSYS)
                                return -ERESTARTSYS;
                        /* retake the lock and re-evaluate from the top */
                        spin_lock(&server->req_lock);
                } else {
                        /*
                         * For normal commands, reserve the last MAX_COMPOUND
                         * credits to compound requests.
                         * Otherwise these compounds could be permanently
                         * starved for credits by single-credit requests.
                         *
                         * To prevent spinning CPU, block this thread until
                         * there are >MAX_COMPOUND credits available.
                         * But only do this if we already have a lot of
                         * credits in flight to avoid triggering this check
                         * for servers that are slow to hand out credits on
                         * new sessions.
                         */
                        if (!optype && num_credits == 1 &&
                            server->in_flight > 2 * MAX_COMPOUND &&
                            *credits <= MAX_COMPOUND) {
                                spin_unlock(&server->req_lock);

                                cifs_num_waiters_inc(server);
                                rc = wait_event_killable_timeout(
                                        server->request_q,
                                        has_credits(server, credits,
                                                    MAX_COMPOUND + 1),
                                        t);
                                cifs_num_waiters_dec(server);
                                if (!rc) {
                                        spin_lock(&server->req_lock);
                                        scredits = *credits;
                                        in_flight = server->in_flight;
                                        spin_unlock(&server->req_lock);

                                        trace_smb3_credit_timeout(
                                                        server->current_mid,
                                                        server->conn_id, server->hostname,
                                                        scredits, num_credits, in_flight);
                                        cifs_server_dbg(VFS, "wait timed out after %d ms\n",
                                                        timeout);
                                        return -EBUSY;
                                }
                                if (rc == -ERESTARTSYS)
                                        return -ERESTARTSYS;
                                spin_lock(&server->req_lock);
                                continue;
                        }

                        /*
                         * Can not count locking commands against total
                         * as they are allowed to block on server.
                         */

                        /* update # of requests on the wire to server */
                        if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) {
                                *credits -= num_credits;
                                server->in_flight += num_credits;
                                if (server->in_flight > server->max_in_flight)
                                        server->max_in_flight = server->in_flight;
                                *instance = server->reconnect_instance;
                        }
                        scredits = *credits;
                        in_flight = server->in_flight;
                        spin_unlock(&server->req_lock);

                        trace_smb3_waitff_credits(server->current_mid,
                                        server->conn_id, server->hostname, scredits,
                                        -(num_credits), in_flight);
                        cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
                                        __func__, num_credits, scredits);
                        break;
                }
        }
        return 0;
}

/*
 * Reserve a single credit for a request.
 *
 * Thin wrapper around wait_for_free_credits() asking for one credit with a
 * negative timeout, which that function maps to MAX_JIFFY_OFFSET. @flags and
 * @instance are passed through unchanged; the return value is that of
 * wait_for_free_credits().
 */
int wait_for_free_request(struct TCP_Server_Info *server, const int flags,
                          unsigned int *instance)
{
        return wait_for_free_credits(server, 1, -1, flags,
                                     instance);
}

static int
wait_for_compound_request(struct TCP_Server_Info *server, int num,
                          const int flags, unsigned int *instance)
{
        int *credits;
        int scredits, in_flight;

        credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK);

        spin_lock(&server->req_lock);
        scredits = *credits;
        in_flight = server->in_flight;

        if (*credits < num) {
                /*
                 * If the server is tight on resources or just gives us less
                 * credits for other reasons (e.g. requests are coming out of
                 * order and the server delays granting more credits until it
                 * processes a missing mid) and we exhausted most available
                 * credits there may be situations when we try to send
                 * a compound request but we don't have enough credits. At this
                 * point the client needs to decide if it should wait for
                 * additional credits or fail the request. If at least one
                 * request is in flight there is a high probability that the
                 * server will return enough credits to satisfy this compound
                 * request.
                 *
                 * Return immediately if no requests in flight since we will be
                 * stuck on waiting for credits.
                 */
                if (server->in_flight == 0) {
                        spin_unlock(&server->req_lock);
                        trace_smb3_insufficient_credits(server->current_mid,
                                        server->conn_id, server->hostname, scredits,
                                        num, in_flight);
                        cifs_dbg(FYI, "%s: %d requests in flight, needed %d total=%d\n",
                                        __func__, in_flight, num, scredits);
                        return -EDEADLK;
                }
        }
        spin_unlock(&server->req_lock);

        return wait_for_free_credits(server, num, 60000, flags,
                                     instance);
}

/*
 * cifs_wait_mtu_credits - credit-wait variant that never waits
 * @server:  server the I/O is for
 * @size:    requested I/O size
 * @num:     receives @size unchanged
 * @credits: set to a zero-valued credit tagged with the server's current
 *           reconnect_instance
 *
 * Return: always 0.
 */
int
cifs_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
                      size_t *num, struct cifs_credits *credits)
{
        credits->instance = server->reconnect_instance;
        credits->value = 0;
        *num = size;

        return 0;
}

/*
 * wait_for_response - sleep until @mid has left the "in progress" states
 * @server: server whose response_q is waited on
 * @mid:    request being waited for
 *
 * Sleeps while the mid is MID_REQUEST_SUBMITTED or MID_RESPONSE_RECEIVED.
 * The sleep is TASK_INTERRUPTIBLE when the mid was submitted with
 * CIFS_INTERRUPTIBLE_WAIT and TASK_KILLABLE otherwise; TASK_FREEZABLE_UNSAFE
 * is always added.
 *
 * Return: 0 when the wait condition was met, -ERESTARTSYS if the wait
 * reported an error.
 */
int wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        const unsigned int base_state =
                (mid->sr_flags & CIFS_INTERRUPTIBLE_WAIT) ?
                        TASK_INTERRUPTIBLE : TASK_KILLABLE;
        int err;

        err = wait_event_state(server->response_q,
                               !(mid->mid_state == MID_REQUEST_SUBMITTED ||
                                 mid->mid_state == MID_RESPONSE_RECEIVED),
                               (base_state | TASK_FREEZABLE_UNSAFE));

        return err < 0 ? -ERESTARTSYS : 0;
}

/*
 * cifs_call_async - send an SMB request without waiting for the reply
 * @server:        server to send to
 * @rqst:          request to send
 * @receive:       stored in the mid as its receive handler
 * @callback:      stored in the mid as its completion callback
 * @handle:        stored in the mid as its handle routine
 * @cbdata:        stored in the mid as callback_data
 * @flags:         request flags; CIFS_HAS_CREDITS means the caller already
 *                 holds credits, described by @exist_credits
 * @exist_credits: credits held by the caller (only read with CIFS_HAS_CREDITS)
 *
 * The mid is queued on pending_mid_q and the result is delivered through the
 * callbacks stored in it. Dealing with timeouts is the caller's job.
 *
 * Return: 0 once the request has been sent; -EAGAIN if the server
 * reconnected after the credits were obtained; otherwise the error from
 * waiting for credits, setting up the mid or sending.
 */
int
cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
                mid_receive_t receive, mid_callback_t callback,
                mid_handle_t handle, void *cbdata, const int flags,
                const struct cifs_credits *exist_credits)
{
        const int optype = flags & CIFS_OP_MASK;
        struct cifs_credits credits = { .value = 0, .instance = 0 };
        struct mid_q_entry *mid;
        unsigned int instance;
        int rc;

        if (flags & CIFS_HAS_CREDITS) {
                instance = exist_credits->instance;
        } else {
                rc = wait_for_free_request(server, flags, &instance);
                if (rc)
                        return rc;
                /* remember the credit taken here so it can be handed back */
                credits.value = 1;
                credits.instance = instance;
        }

        cifs_server_lock(server);

        /*
         * Credits obtained before a reconnect belong to the previous session
         * and must not be used for this request. Let the caller deal with
         * that case by returning -EAGAIN.
         */
        if (instance != server->reconnect_instance) {
                rc = -EAGAIN;
                goto unlock;
        }

        mid = server->ops->setup_async_request(server, rqst);
        if (IS_ERR(mid)) {
                rc = PTR_ERR(mid);
                goto unlock;
        }

        mid->sr_flags = flags;
        mid->receive = receive;
        mid->callback = callback;
        mid->callback_data = cbdata;
        mid->handle = handle;
        mid->mid_state = MID_REQUEST_SUBMITTED;

        /* make the mid visible on the pending_mid_q */
        spin_lock(&server->mid_queue_lock);
        list_add_tail(&mid->qhead, &server->pending_mid_q);
        spin_unlock(&server->mid_queue_lock);

        /*
         * The send time has to be recorded before the I/O is issued: for an
         * async call the response may arrive, and the mid may be freed on
         * another thread, before the send returns.
         */
        cifs_save_when_sent(mid);
        rc = smb_send_rqst(server, 1, rqst, flags);
        if (rc < 0) {
                /* undo the mid and sequence number bookkeeping */
                revert_current_mid(server, mid->credits);
                server->sequence_number -= 2;
                delete_mid(server, mid);
        }

unlock:
        cifs_server_unlock(server);
        if (rc == 0)
                return 0;

        /* nothing is in flight for this call: hand the credit back */
        add_credits_and_wake_if(server, &credits, optype);
        return rc;
}

/*
 * cifs_sync_mid_result - turn the final state of a mid into a return code
 * @mid:    mid whose state is inspected
 * @server: server the mid belongs to
 *
 * A mid in MID_RESPONSE_READY yields 0 and is left alone. In every other
 * state the mid is released before returning; a state that is not handled
 * explicitly is additionally taken off the queue and reported.
 *
 * Return: 0, -EAGAIN, -EHOSTDOWN, the mid's stored mid_rc, or an EIO-class
 * error for malformed responses and unexpected states.
 */
int cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
        int rc = 0;

        cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
                 __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);

        spin_lock(&server->mid_queue_lock);
        switch (mid->mid_state) {
        case MID_RESPONSE_READY:
                /* success: the mid is not released here */
                spin_unlock(&server->mid_queue_lock);
                return 0;
        case MID_RETRY_NEEDED:
                rc = -EAGAIN;
                break;
        case MID_RESPONSE_MALFORMED:
                rc = smb_EIO(smb_eio_trace_rx_sync_mid_malformed);
                break;
        case MID_SHUTDOWN:
                rc = -EHOSTDOWN;
                break;
        case MID_RC:
                rc = mid->mid_rc;
                break;
        default:
                /* unexpected state: unlink the mid if it is still queued */
                if (!mid->deleted_from_q) {
                        mid->deleted_from_q = true;
                        list_del_init(&mid->qhead);
                }
                spin_unlock(&server->mid_queue_lock);
                cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
                         __func__, mid->mid, mid->mid_state);
                rc = smb_EIO1(smb_eio_trace_rx_sync_mid_invalid, mid->mid_state);
                release_mid(server, mid);
                return rc;
        }
        spin_unlock(&server->mid_queue_lock);

        release_mid(server, mid);
        return rc;
}

static void
cifs_compound_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        struct cifs_credits credits = {
                .value = server->ops->get_credits(mid),
                .instance = server->reconnect_instance,
        };

        add_credits(server, &credits, mid->optype);

        if (mid->mid_state == MID_RESPONSE_RECEIVED)
                mid->mid_state = MID_RESPONSE_READY;
}

/*
 * Mid callback that does the cifs_compound_callback() processing for @mid
 * and then wakes the waiting task through cifs_wake_up_task().
 */
static void
cifs_compound_last_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        cifs_compound_callback(server, mid);
        cifs_wake_up_task(server, mid);
}

/*
 * Mid callback that does the cifs_compound_callback() processing for @mid
 * and then releases the mid instead of waking anyone.
 * NOTE(review): going by the name, this is installed once the waiter has
 * given up on the request - confirm against the code that sets it.
 */
static void
cifs_cancelled_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        cifs_compound_callback(server, mid);
        release_mid(server, mid);
}

/*
 * cifs_pick_channel - choose the channel to use for the next operation
 *
 * @ses: session reference
 *
 * Walks all channels of @ses, starting at a position that advances on every
 * call, and skips channels that have no server, are terminating or are
 * marked as needing reconnect. Among the rest, the one with the fewest
 * requests in flight wins. When no channel qualifies, the primary channel
 * (index 0) is used.
 *
 * Return: server pointer of the chosen channel, or NULL if @ses is NULL.
 */
struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
{
        struct TCP_Server_Info *candidate = NULL;
        unsigned int lowest = UINT_MAX;
        uint best = 0;
        int n, first, slot;

        if (!ses)
                return NULL;

        spin_lock(&ses->chan_lock);
        first = atomic_inc_return(&ses->chan_seq);
        for (n = 0; n < ses->chan_count; n++) {
                slot = (first + n) % ses->chan_count;
                candidate = ses->chans[slot].server;

                if (!candidate || candidate->terminate ||
                    CIFS_CHAN_NEEDS_RECONNECT(ses, slot))
                        continue;

                /*
                 * server->in_flight is read without req_lock on purpose.
                 * Racing on it can at worst select a channel that is not
                 * the least loaded one, and skipping the lock keeps the
                 * wait time of this function down.
                 */
                if (candidate->in_flight >= lowest)
                        continue;

                lowest = candidate->in_flight;
                best = slot;
        }

        candidate = ses->chans[best].server;
        spin_unlock(&ses->chan_lock);

        return candidate;
}

/*
 * compound_send_recv - send a chain of requests and collect the responses
 *
 * @xid:           transaction id, only passed on to send_cancel()
 * @ses:           session the requests belong to
 * @server:        channel the requests are sent on
 * @flags:         request flags; the CIFS_OP_MASK bits select the operation
 *                 type, and CIFS_NO_SRV_RSP, CIFS_LOG_ERROR and
 *                 CIFS_NO_RSP_BUF are honoured here
 * @num_rqst:      number of entries in @rqst (and in the output arrays)
 * @rqst:          the requests to send
 * @resp_buf_type: out: per-request buffer type; every entry is preset to
 *                 CIFS_NO_BUFFER and updated when a response buffer is
 *                 handed back through @resp_iov
 * @resp_iov:      out: per-request response buffer and length; may be NULL
 *                 except for session-establishment traffic
 *
 * Steps: wait for credits for the whole chain, allocate and queue one mid
 * per request under the server lock, send the chain, wait for every
 * response (cancelling the remaining waits if one is interrupted), then
 * validate each response and pass its buffer to the caller.
 *
 * Unless CIFS_NO_RSP_BUF is set, a buffer reported through @resp_iov is
 * detached from its mid and therefore no longer freed here.
 *
 * Return: 0 on success, otherwise a negative error code. Errors generated
 * locally include -ENOENT when the server is exiting, -EAGAIN when a
 * reconnect happened after the credits were obtained, and -EINVAL when the
 * session-establishment preconditions are violated; other values come from
 * the helpers called.
 */
int
compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
                   struct TCP_Server_Info *server,
                   const int flags, const int num_rqst, struct smb_rqst *rqst,
                   int *resp_buf_type, struct kvec *resp_iov)
{
        int i, j, optype, rc = 0;
        struct mid_q_entry *mid[MAX_COMPOUND];
        bool cancelled_mid[MAX_COMPOUND] = {false};
        struct cifs_credits credits[MAX_COMPOUND] = {
                { .value = 0, .instance = 0 }
        };
        unsigned int instance;
        char *buf;

        optype = flags & CIFS_OP_MASK;

        for (i = 0; i < num_rqst; i++)
                resp_buf_type[i] = CIFS_NO_BUFFER;  /* no response buf yet */

        if (!ses || !ses->server || !server) {
                cifs_dbg(VFS, "Null session\n");
                return smb_EIO(smb_eio_trace_null_pointers);
        }

        spin_lock(&server->srv_lock);
        if (server->tcpStatus == CifsExiting) {
                spin_unlock(&server->srv_lock);
                return -ENOENT;
        }
        spin_unlock(&server->srv_lock);

        /*
         * Wait for all the requests to become available.
         * This approach still leaves the possibility to be stuck waiting for
         * credits if the server doesn't grant credits to the outstanding
         * requests and if the client is completely idle, not generating any
         * other requests.
         * This can be handled by the eventual session reconnect.
         */
        rc = wait_for_compound_request(server, num_rqst, flags,
                                       &instance);
        if (rc)
                return rc;

        /* one credit per request, all tagged with the instance just obtained */
        for (i = 0; i < num_rqst; i++) {
                credits[i].value = 1;
                credits[i].instance = instance;
        }

        /*
         * Make sure that we sign in the same order that we send on this socket
         * and avoid races inside tcp sendmsg code that could cause corruption
         * of smb data.
         */

        cifs_server_lock(server);

        /*
         * All the parts of the compound chain must have obtained their
         * credits from the same session. We can not use credits obtained from
         * the previous session to send this request. Check if there were
         * reconnects after we obtained credits and return -EAGAIN in such
         * cases to let callers handle it.
         */
        if (instance != server->reconnect_instance) {
                cifs_server_unlock(server);
                for (j = 0; j < num_rqst; j++)
                        add_credits(server, &credits[j], optype);
                return -EAGAIN;
        }

        for (i = 0; i < num_rqst; i++) {
                mid[i] = server->ops->setup_request(ses, server, &rqst[i]);
                if (IS_ERR(mid[i])) {
                        /* undo the mids set up so far for this chain */
                        revert_current_mid(server, i);
                        for (j = 0; j < i; j++)
                                delete_mid(server, mid[j]);
                        cifs_server_unlock(server);

                        /* Update # of requests on wire to server */
                        for (j = 0; j < num_rqst; j++)
                                add_credits(server, &credits[j], optype);
                        return PTR_ERR(mid[i]);
                }

                mid[i]->sr_flags = flags;
                mid[i]->mid_state = MID_REQUEST_SUBMITTED;
                mid[i]->optype = optype;
                /*
                 * Invoke callback for every part of the compound chain
                 * to calculate credits properly. Wake up this thread only when
                 * the last element is received.
                 */
                if (i < num_rqst - 1)
                        mid[i]->callback = cifs_compound_callback;
                else
                        mid[i]->callback = cifs_compound_last_callback;
        }
        rc = smb_send_rqst(server, num_rqst, rqst, flags);

        for (i = 0; i < num_rqst; i++)
                cifs_save_when_sent(mid[i]);

        if (rc < 0) {
                revert_current_mid(server, num_rqst);
                server->sequence_number -= 2;
        }

        cifs_server_unlock(server);

        /*
         * If sending failed for some reason or it is an oplock break that we
         * will not receive a response to - return credits back
         */
        if (rc < 0 || (flags & CIFS_NO_SRV_RSP)) {
                for (i = 0; i < num_rqst; i++)
                        add_credits(server, &credits[i], optype);
                goto out;
        }

        /*
         * At this point the request is passed to the network stack - we assume
         * that any credits taken from the server structure on the client have
         * been spent and we can't return them back. Once we receive responses
         * we will collect credits granted by the server in the mid callbacks
         * and add those credits to the server structure.
         */

        /*
         * Compounding is never used during session establish.
         */
        spin_lock(&ses->ses_lock);
        if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
                spin_unlock(&ses->ses_lock);

                /*
                 * The mids are already queued, so leave through the common
                 * cleanup path instead of returning directly; otherwise
                 * nothing would ever dequeue and release them.
                 */
                if (WARN_ON_ONCE(num_rqst != 1 || !resp_iov)) {
                        rc = -EINVAL;
                        goto out;
                }

                cifs_server_lock(server);
                smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec);
                cifs_server_unlock(server);

                spin_lock(&ses->ses_lock);
        }
        spin_unlock(&ses->ses_lock);

        for (i = 0; i < num_rqst; i++) {
                rc = wait_for_response(server, mid[i]);
                if (rc != 0)
                        break;
        }
        if (rc != 0) {
                /* i is the first mid whose wait failed; cancel it and the rest */
                for (; i < num_rqst; i++) {
                        cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
                                 mid[i]->mid, le16_to_cpu(mid[i]->command));
                        send_cancel(ses, server, &rqst[i], mid[i], xid);
                        spin_lock(&mid[i]->mid_lock);
                        mid[i]->wait_cancelled = true;
                        if (mid[i]->mid_state == MID_REQUEST_SUBMITTED ||
                            mid[i]->mid_state == MID_RESPONSE_RECEIVED) {
                                /*
                                 * Response still outstanding or not yet
                                 * processed: let the callback release the
                                 * mid and skip it in the cleanup below.
                                 */
                                mid[i]->callback = cifs_cancelled_callback;
                                cancelled_mid[i] = true;
                                credits[i].value = 0;
                        }
                        spin_unlock(&mid[i]->mid_lock);
                }
        }

        for (i = 0; i < num_rqst; i++) {
                if (rc < 0)
                        goto out;

                rc = cifs_sync_mid_result(mid[i], server);
                if (rc != 0) {
                        /* mark this mid as cancelled to not free it below */
                        cancelled_mid[i] = true;
                        goto out;
                }

                if (!mid[i]->resp_buf ||
                    mid[i]->mid_state != MID_RESPONSE_READY) {
                        rc = smb_EIO1(smb_eio_trace_rx_mid_unready, mid[i]->mid_state);
                        cifs_dbg(FYI, "Bad MID state?\n");
                        goto out;
                }

                rc = server->ops->check_receive(mid[i], server,
                                                flags & CIFS_LOG_ERROR);

                if (resp_iov) {
                        buf = (char *)mid[i]->resp_buf;
                        resp_iov[i].iov_base = buf;
                        resp_iov[i].iov_len = mid[i]->resp_buf_size;

                        if (mid[i]->large_buf)
                                resp_buf_type[i] = CIFS_LARGE_BUFFER;
                        else
                                resp_buf_type[i] = CIFS_SMALL_BUFFER;

                        /* mark it so buf will not be freed by delete_mid */
                        if ((flags & CIFS_NO_RSP_BUF) == 0)
                                mid[i]->resp_buf = NULL;
                }
        }

        /*
         * Compounding is never used during session establish.
         */
        spin_lock(&ses->ses_lock);
        if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
                struct kvec iov = {
                        .iov_base = resp_iov[0].iov_base,
                        .iov_len = resp_iov[0].iov_len
                };
                spin_unlock(&ses->ses_lock);
                cifs_server_lock(server);
                smb311_update_preauth_hash(ses, server, &iov, 1);
                cifs_server_unlock(server);
                spin_lock(&ses->ses_lock);
        }
        spin_unlock(&ses->ses_lock);

out:
        /*
         * This will dequeue all mids. After this it is important that the
         * demultiplex_thread will not process any of these mids any further.
         * This is prevented above by using a noop callback that will not
         * wake this thread except for the very last PDU.
         */
        for (i = 0; i < num_rqst; i++) {
                if (!cancelled_mid[i])
                        delete_mid(server, mid[i]);
        }

        return rc;
}

/*
 * Send a single request and wait for its response. This is
 * compound_send_recv() with a request count of 1; @rqst, @resp_buf_type and
 * @resp_iov therefore each refer to exactly one element, and the return
 * value is passed through unchanged.
 */
int
cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
               struct TCP_Server_Info *server,
               struct smb_rqst *rqst, int *resp_buf_type, const int flags,
               struct kvec *resp_iov)
{
        return compound_send_recv(xid, ses, server, flags, 1,
                                  rqst, resp_buf_type, resp_iov);
}


/*
 * Throw away whatever part of the current PDU has not been read yet, i.e.
 * the bytes between server->total_read and server->pdu_size. The data is
 * drained through cifs_discard_from_socket() in pieces of at most
 * CIFSMaxBufSize + MAX_HEADER_SIZE(server) bytes, and server->total_read is
 * advanced as the bytes are consumed.
 *
 * Returns 0 once everything has been discarded, or the negative value
 * returned by cifs_discard_from_socket() on failure.
 */
int
cifs_discard_remaining_data(struct TCP_Server_Info *server)
{
        unsigned int pdu_len = server->pdu_size;
        size_t left = pdu_len - server->total_read;

        while (left > 0) {
                size_t chunk = min_t(size_t, left,
                                     CIFSMaxBufSize + MAX_HEADER_SIZE(server));
                ssize_t got = cifs_discard_from_socket(server, chunk);

                if (got < 0)
                        return got;

                server->total_read += got;
                left -= got;
        }

        return 0;
}

/*
 * Finish off a read response whose payload is not going to be consumed:
 * drain the rest of the PDU, dequeue @mid (passing @malformed on to
 * dequeue_mid()) and transfer ownership of the server's small buffer to
 * the mid. The mid is dequeued even when draining fails.
 *
 * Returns the result of cifs_discard_remaining_data().
 */
static int
__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
                     bool malformed)
{
        int rc = cifs_discard_remaining_data(server);

        dequeue_mid(server, mid, malformed);

        /* the header just read now belongs to the mid, not the server */
        mid->resp_buf = server->smallbuf;
        server->smallbuf = NULL;

        return rc;
}

/*
 * Discard the rest of a read response. The mid is reported to
 * __cifs_readv_discard() as malformed whenever the subrequest stored in
 * mid->callback_data already carries a non-zero result.
 */
static int
cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        struct cifs_io_subrequest *rdata = mid->callback_data;

        /* rdata->result is converted to bool: any non-zero value means true */
        return  __cifs_readv_discard(server, mid, rdata->result);
}

/*
 * Receive the remainder of a read response for @mid from the socket.
 *
 * The rest of the READ_RSP header is read into server->smallbuf, the
 * header is validated, and the payload is then read directly into the
 * subrequest's iterator (rdata->subreq.io_iter), unless an RDMA memory
 * registration is in use, in which case no payload is read here. Any
 * failure detected while parsing is recorded in rdata->result and the rest
 * of the PDU is discarded.
 *
 * On the normal completion path the mid is dequeued and takes ownership of
 * server->smallbuf.
 *
 * Returns a negative value when a socket read fails, -1 when the session
 * has expired (after triggering a reconnect) or the response is only a
 * "status pending" notification, the result of the discard helpers when
 * the frame is dropped, and otherwise the length of the final payload read.
 */
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
        int length, len;
        unsigned int data_offset, data_len;
        struct cifs_io_subrequest *rdata = mid->callback_data;
        char *buf = server->smallbuf;
        unsigned int buflen = server->pdu_size;
        bool use_rdma_mr = false;

        cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%zu\n",
                 __func__, mid->mid, rdata->subreq.start, rdata->subreq.len);

        /*
         * read the rest of READ_RSP header (sans Data array), or whatever we
         * can if there's not enough data. At this point, we've read down to
         * the Mid.
         */
        len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
                                                        HEADER_SIZE(server) + 1;

        length = cifs_read_from_socket(server,
                                       buf + HEADER_SIZE(server) - 1, len);
        if (length < 0)
                return length;
        server->total_read += length;

        /* an expired session forces a reconnect; this PDU is abandoned */
        if (server->ops->is_session_expired &&
            server->ops->is_session_expired(buf)) {
                cifs_reconnect(server, true);
                return -1;
        }

        /* interim "pending" reply: drop it, the mid stays queued */
        if (server->ops->is_status_pending &&
            server->ops->is_status_pending(buf, server)) {
                cifs_discard_remaining_data(server);
                return -1;
        }

        /* set up the first iov for signature check and to get credits */
        rdata->iov[0].iov_base = buf;
        rdata->iov[0].iov_len = server->total_read;
        cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
                 rdata->iov[0].iov_base, rdata->iov[0].iov_len);

        /* Was the SMB read successful? */
        rdata->result = server->ops->map_error(buf, false);
        if (rdata->result != 0) {
                cifs_dbg(FYI, "%s: server returned error %d\n",
                         __func__, rdata->result);
                /* normal error on read response */
                return __cifs_readv_discard(server, mid, false);
        }

        /* Is there enough to get to the rest of the READ_RSP header? */
        if (server->total_read < server->vals->read_rsp_size) {
                cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
                         __func__, server->total_read,
                         server->vals->read_rsp_size);
                rdata->result = smb_EIO2(smb_eio_trace_read_rsp_short,
                                         server->total_read, server->vals->read_rsp_size);
                return cifs_readv_discard(server, mid);
        }

        data_offset = server->ops->read_data_offset(buf);
        if (data_offset < server->total_read) {
                /*
                 * win2k8 sometimes sends an offset of 0 when the read
                 * is beyond the EOF. Treat it as if the data starts just after
                 * the header.
                 */
                cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
                         __func__, data_offset);
                data_offset = server->total_read;
        } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
                /* data_offset is beyond the end of smallbuf */
                cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
                         __func__, data_offset);
                rdata->result = smb_EIO1(smb_eio_trace_read_overlarge,
                                         data_offset);
                return cifs_readv_discard(server, mid);
        }

        cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
                 __func__, server->total_read, data_offset);

        /* bytes between what has been read so far and the start of the data */
        len = data_offset - server->total_read;
        if (len > 0) {
                /* read any junk before data into the rest of smallbuf */
                length = cifs_read_from_socket(server,
                                               buf + server->total_read, len);
                if (length < 0)
                        return length;
                server->total_read += length;
                rdata->iov[0].iov_len = server->total_read;
        }

        /* how much data is in the response? */
#ifdef CONFIG_CIFS_SMB_DIRECT
        use_rdma_mr = rdata->mr;
#endif
        data_len = server->ops->read_data_length(buf, use_rdma_mr);
        /* the bounds check against the PDU size is skipped for RDMA reads */
        if (!use_rdma_mr && (data_offset + data_len > buflen)) {
                /* data_len is corrupt -- discard frame */
                rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed,
                                         data_offset + data_len, buflen);
                return cifs_readv_discard(server, mid);
        }

#ifdef CONFIG_CIFS_SMB_DIRECT
        if (rdata->mr)
                length = data_len; /* An RDMA read is already done. */
        else
#endif
                length = cifs_read_iter_from_socket(server, &rdata->subreq.io_iter,
                                                    data_len);
        if (length > 0)
                rdata->got_bytes += length;
        /*
         * NOTE(review): length is added even when it is negative, which
         * lowers total_read; presumably that is what sends a failed read
         * into the discard path below -- confirm this is intended.
         */
        server->total_read += length;

        cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
                 server->total_read, buflen, data_len);

        /* discard anything left over */
        if (server->total_read < buflen)
                return cifs_readv_discard(server, mid);

        /* whole PDU consumed: dequeue the mid and hand it the header buffer */
        dequeue_mid(server, mid, false);
        mid->resp_buf = server->smallbuf;
        server->smallbuf = NULL;
        return length;
}