root/fs/netfs/write_collect.c
// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem write subrequest result collection, assessment
 * and retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include "internal.h"

/* Notes made in the collector */
#define HIT_PENDING             0x01    /* A front op was still pending */
#define NEED_REASSESS           0x02    /* Need to loop round and reassess */
#define MADE_PROGRESS           0x04    /* Made progress cleaning up a stream or the folio set */
#define NEED_UNLOCK             0x08    /* The pagecache needs unlocking */
#define NEED_RETRY              0x10    /* A front op requests retrying */
#define SAW_FAILURE             0x20    /* At least one stream hit a permanent failure */

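/*
 * Dump the state of a request, its streams and their remaining subrequests
 * for debugging.
 */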
static void netfs_dump_request(const struct netfs_io_request *rreq)
{
        pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n",
               rreq->debug_id, refcount_read(&rreq->ref), rreq->flags,
               rreq->origin, rreq->error);
        pr_err("  st=%llx tsl=%zx/%llx/%llx\n",
               rreq->start, rreq->transferred, rreq->submitted, rreq->len);
        pr_err("  cci=%llx/%llx/%llx\n",
               rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));
        pr_err("  iw=%pSR\n", rreq->netfs_ops->issue_write);
        for (int i = 0; i < NR_IO_STREAMS; i++) {
                const struct netfs_io_subrequest *sreq;
                const struct netfs_io_stream *s = &rreq->io_streams[i];

                pr_err("  str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n",
                       s->stream_nr, s->source, s->error,
                       s->avail, s->active, s->need_retry, s->failed);
                pr_err("  str[%x] ct=%llx t=%zx\n",
                       s->stream_nr, s->collected_to, s->transferred);
                list_for_each_entry(sreq, &s->subrequests, rreq_link) {
                        pr_err("  sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n",
                               sreq->stream_nr, sreq->debug_index, sreq->source,
                               sreq->start, sreq->transferred, sreq->len,
                               refcount_read(&sreq->ref), sreq->flags);
                }
        }
}

/*
 * Handle the successful completion of the write of a folio to the server
 * and/or the cache.  Note that we are not allowed to lock the folio here on
 * pain of deadlocking with truncate.  The return value is the number of
 * writeback group references that the caller must release.
 */
int netfs_folio_written_back(struct folio *folio)
{
        enum netfs_folio_trace why = netfs_folio_trace_clear;
        struct netfs_inode *ictx = netfs_inode(folio->mapping->host);
        struct netfs_folio *finfo;
        struct netfs_group *group = NULL;
        int gcount = 0;

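        /* The folio's private data holds either a streaming-write record
         * (which itself carries the group pointer) or the group pointer
         * directly; check for the record first.
         */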
        if ((finfo = netfs_folio_info(folio))) {
                /* Streaming writes cannot be redirtied whilst under writeback,
                 * so discard the streaming record.
                 */
                unsigned long long fend;

                fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
                if (fend > ictx->zero_point)
                        ictx->zero_point = fend;

                folio_detach_private(folio);
                group = finfo->netfs_group;
                gcount++;
                kfree(finfo);
                why = netfs_folio_trace_clear_s;
                goto end_wb;
        }

        if ((group = netfs_folio_group(folio))) {
                if (group == NETFS_FOLIO_COPY_TO_CACHE) {
                        why = netfs_folio_trace_clear_cc;
                        folio_detach_private(folio);
                        goto end_wb;
                }

                /* Need to detach the group pointer if the folio didn't get
                 * redirtied.  If it has been redirtied, then it must belong to
                 * the same group.
                 */
                why = netfs_folio_trace_redirtied;
                if (!folio_test_dirty(folio)) {
                        folio_detach_private(folio);
                        gcount++;
                        why = netfs_folio_trace_clear_g;
                }
        }

end_wb:
        trace_netfs_folio(folio, why);
        folio_end_writeback(folio);
        return gcount;
}

/*
 * Unlock any folios we've finished with.
 */
static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
                                          unsigned int *notes)
{
        struct folio_queue *folioq = wreq->buffer.tail;
        unsigned long long collected_to = wreq->collected_to;
        unsigned int slot = wreq->buffer.first_tail_slot;

        if (WARN_ON_ONCE(!folioq)) {
                pr_err("[!] Writeback unlock found empty rolling buffer!\n");
                netfs_dump_request(wreq);
                return;
        }

        if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
                if (netfs_pgpriv2_unlock_copied_folios(wreq))
                        *notes |= MADE_PROGRESS;
                return;
        }

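        /* The tail slot may have overrun the end of its folioq; if so, drop
         * the spent folioq and move on to the next one.
         */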
        if (slot >= folioq_nr_slots(folioq)) {
                folioq = rolling_buffer_delete_spent(&wreq->buffer);
                if (!folioq)
                        return;
                slot = 0;
        }

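        /* Walk the folios in buffer order, unlocking each one that has been
         * entirely collected and stopping at the first one that hasn't.
         */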
        for (;;) {
                struct folio *folio;
                struct netfs_folio *finfo;
                unsigned long long fpos, fend;
                size_t fsize, flen;

                folio = folioq_folio(folioq, slot);
                if (WARN_ONCE(!folio_test_writeback(folio),
                              "R=%08x: folio %lx is not under writeback\n",
                              wreq->debug_id, folio->index))
                        trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);

                fpos = folio_pos(folio);
                fsize = folio_size(folio);
                finfo = netfs_folio_info(folio);
                flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;

                fend = min_t(unsigned long long, fpos + flen, wreq->i_size);

                trace_netfs_collect_folio(wreq, folio, fend, collected_to);

                /* Unlock any folio we've transferred all of. */
                if (collected_to < fend)
                        break;

                wreq->nr_group_rel += netfs_folio_written_back(folio);
                wreq->cleaned_to = fpos + fsize;
                *notes |= MADE_PROGRESS;

                /* Clean up the head folioq.  If we clear an entire folioq, then
                 * we can get rid of it provided it's not also the tail folioq
                 * being filled by the issuer.
                 */
                folioq_clear(folioq, slot);
                slot++;
                if (slot >= folioq_nr_slots(folioq)) {
                        folioq = rolling_buffer_delete_spent(&wreq->buffer);
                        if (!folioq)
                                goto done;
                        slot = 0;
                }

                if (fpos + fsize >= collected_to)
                        break;
        }

        wreq->buffer.tail = folioq;
done:
        wreq->buffer.first_tail_slot = slot;
}

/*
 * Collect and assess the results of various write subrequests.  We may need to
 * retry some of the results - or even do an RMW cycle for content crypto.
 *
 * Note that we have a number of parallel, overlapping lists of subrequests,
 * one to the server and one to the local cache for example, which may not be
 * the same size or starting position and may not even correspond in boundary
 * alignment.
 */
static void netfs_collect_write_results(struct netfs_io_request *wreq)
{
        struct netfs_io_subrequest *front, *remove;
        struct netfs_io_stream *stream;
        unsigned long long collected_to, issued_to;
        unsigned int notes;
        int s;

        _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
        trace_netfs_collect(wreq);
        trace_netfs_rreq(wreq, netfs_rreq_trace_collect);

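        /* Loop round, reassessing the streams until we've collected
         * everything that's currently complete; the issuer may add more
         * subrequests whilst we're doing this.
         */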
reassess_streams:
        issued_to = atomic64_read(&wreq->issued_to);
        smp_rmb(); /* Read issued_to before the stream lists. */
        collected_to = ULLONG_MAX;
        if (wreq->origin == NETFS_WRITEBACK ||
            wreq->origin == NETFS_WRITETHROUGH ||
            wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
                notes = NEED_UNLOCK;
        else
                notes = 0;

        /* Remove completed subrequests from the front of the streams and
         * advance the completion point on each stream.  We stop when we hit
         * something that's in progress.  The issuer thread may be adding stuff
         * to the tail whilst we're doing this.
         */
        for (s = 0; s < NR_IO_STREAMS; s++) {
                stream = &wreq->io_streams[s];
                /* Read active flag before list pointers */
                if (!smp_load_acquire(&stream->active))
                        continue;

                front = list_first_entry_or_null(&stream->subrequests,
                                                 struct netfs_io_subrequest, rreq_link);
                while (front) {
                        trace_netfs_collect_sreq(wreq, front);
                        //_debug("sreq [%x] %llx %zx/%zx",
                        //       front->debug_index, front->start, front->transferred, front->len);

                        if (stream->collected_to < front->start) {
                                trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
                                stream->collected_to = front->start;
                        }

                        /* Stall if the front is still undergoing I/O. */
                        if (netfs_check_subreq_in_progress(front)) {
                                notes |= HIT_PENDING;
                                break;
                        }
                        smp_rmb(); /* Read counters after I-P flag. */

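                        /* If the stream has already failed, just advance the
                         * collection point over the subrequest and cancel it.
                         */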
                        if (stream->failed) {
                                stream->collected_to = front->start + front->len;
                                notes |= MADE_PROGRESS | SAW_FAILURE;
                                goto cancel;
                        }
                        if (front->start + front->transferred > stream->collected_to) {
                                stream->collected_to = front->start + front->transferred;
                                stream->transferred = stream->collected_to - wreq->start;
                                stream->transferred_valid = true;
                                notes |= MADE_PROGRESS;
                        }
                        if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
                                stream->failed = true;
                                stream->error = front->error;
                                if (stream->source == NETFS_UPLOAD_TO_SERVER)
                                        mapping_set_error(wreq->mapping, front->error);
                                notes |= NEED_REASSESS | SAW_FAILURE;
                                break;
                        }
                        if (front->transferred < front->len) {
                                stream->need_retry = true;
                                notes |= NEED_RETRY | MADE_PROGRESS;
                                break;
                        }

                cancel:
                        /* Remove if completely consumed. */
                        spin_lock(&wreq->lock);

                        remove = front;
                        list_del_init(&front->rreq_link);
                        front = list_first_entry_or_null(&stream->subrequests,
                                                         struct netfs_io_subrequest, rreq_link);
                        spin_unlock(&wreq->lock);
                        netfs_put_subrequest(remove,
                                             notes & SAW_FAILURE ?
                                             netfs_sreq_trace_put_cancel :
                                             netfs_sreq_trace_put_done);
                }

                /* If we have an empty stream, we need to jump it forward,
                 * otherwise the collection point will never advance.
                 */
                if (!front && issued_to > stream->collected_to) {
                        trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
                        stream->collected_to = issued_to;
                }

                if (stream->collected_to < collected_to)
                        collected_to = stream->collected_to;
        }

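        /* Advance the request's collection point to the lowest point reached
         * by all the active streams, never moving it backwards.
         */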
        if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
                wreq->collected_to = collected_to;

        for (s = 0; s < NR_IO_STREAMS; s++) {
                stream = &wreq->io_streams[s];
                if (stream->active)
                        trace_netfs_collect_stream(wreq, stream);
        }

        trace_netfs_collect_state(wreq, wreq->collected_to, notes);

        /* Unlock any folios that we have now finished with. */
        if (notes & NEED_UNLOCK) {
                if (wreq->cleaned_to < wreq->collected_to)
                        netfs_writeback_unlock_folios(wreq, &notes);
        } else {
                wreq->cleaned_to = wreq->collected_to;
        }

        // TODO: Discard encryption buffers

        if (notes & NEED_RETRY)
                goto need_retry;

        if (notes & MADE_PROGRESS) {
                netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
                //cond_resched();
                goto reassess_streams;
        }

        if (notes & NEED_REASSESS) {
                //cond_resched();
                goto reassess_streams;
        }

out:
        netfs_put_group_many(wreq->group, wreq->nr_group_rel);
        wreq->nr_group_rel = 0;
        _leave(" = %x", notes);
        return;

need_retry:
        /* Okay...  We're going to have to retry one or both streams.  Note
         * that any partially completed op will have had any wholly transferred
         * folios removed from it.
         */
        _debug("retry");
        netfs_retry_writes(wreq);
        goto out;
}

/*
 * Perform the collection of subrequests, folios and encryption buffers.
 */
bool netfs_write_collection(struct netfs_io_request *wreq)
{
        struct netfs_inode *ictx = netfs_inode(wreq->inode);
        size_t transferred;
        bool transferred_valid = false;
        int s;

        _enter("R=%x", wreq->debug_id);

        netfs_collect_write_results(wreq);

        /* We're done when the app thread has finished posting subreqs and all
         * the queues in all the streams are empty.
         */
        if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags))
                return false;
        smp_rmb(); /* Read ALL_QUEUED before lists. */

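        /* The request's overall transfer count is the smallest amount
         * completed by any active stream.
         */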
        transferred = LONG_MAX;
        for (s = 0; s < NR_IO_STREAMS; s++) {
                struct netfs_io_stream *stream = &wreq->io_streams[s];
                if (!stream->active)
                        continue;
                if (!list_empty(&stream->subrequests))
                        return false;
                if (stream->transferred_valid &&
                    stream->transferred < transferred) {
                        transferred = stream->transferred;
                        transferred_valid = true;
                }
        }

        /* Okay, declare that all I/O is complete. */
        if (transferred_valid)
                wreq->transferred = transferred;
        trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);

        if (wreq->io_streams[1].active &&
            wreq->io_streams[1].failed &&
            ictx->ops->invalidate_cache) {
                /* Cache write failure doesn't prevent writeback completion
                 * unless we're in disconnected mode.
                 */
                ictx->ops->invalidate_cache(wreq);
        }

        _debug("finished");
        netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
        /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */

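        /* If the request derives from a kiocb, advance the file position and
         * invoke any asynchronous completion handler.
         */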
        if (wreq->iocb) {
                size_t written = min(wreq->transferred, wreq->len);
                wreq->iocb->ki_pos += written;
                if (wreq->iocb->ki_complete) {
                        trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
                        wreq->iocb->ki_complete(
                                wreq->iocb, wreq->error ? wreq->error : written);
                }
                wreq->iocb = VFS_PTR_POISON;
        }

        netfs_clear_subrequests(wreq);
        return true;
}

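/*
 * Workqueue function to drive the collection of results for a write request.
 */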
void netfs_write_collection_worker(struct work_struct *work)
{
        struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);

        netfs_see_request(rreq, netfs_rreq_trace_see_work);
        if (netfs_check_rreq_in_progress(rreq)) {
                if (netfs_write_collection(rreq))
                        /* Drop the ref from the IN_PROGRESS flag. */
                        netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
                else
                        netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
        }
}

/**
 * netfs_write_subrequest_terminated - Note the termination of a write operation.
 * @_op: The I/O subrequest that has terminated.
 * @transferred_or_error: The amount of data transferred or an error code.
 *
 * This tells the library that a contributory write I/O operation has
 * terminated, one way or another, and that it should collect the results.
 *
 * The caller indicates in @transferred_or_error the outcome of the operation,
 * supplying a positive value to indicate the number of bytes transferred or a
 * negative error code.  The library will look after reissuing I/O operations
 * as appropriate.
 *
 * When this is called, ownership of the subrequest is transferred back to the
 * library, along with a ref.
 *
 * Note that %_op is a void* so that the function can be passed to
 * kiocb::term_func without the need for a casting wrapper.
 */
void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error)
{
        struct netfs_io_subrequest *subreq = _op;
        struct netfs_io_request *wreq = subreq->rreq;

        _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);

        switch (subreq->source) {
        case NETFS_UPLOAD_TO_SERVER:
                netfs_stat(&netfs_n_wh_upload_done);
                break;
        case NETFS_WRITE_TO_CACHE:
                netfs_stat(&netfs_n_wh_write_done);
                break;
        default:
                BUG();
        }

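        /* On error, mark the subrequest as failed (unless it's going to be
         * retried) and pause the request to stall further submission whilst
         * the failure is assessed.
         */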
        if (IS_ERR_VALUE(transferred_or_error)) {
                subreq->error = transferred_or_error;
                /* If a retry has been requested, the error doesn't matter as
                 * the subrequest will be reissued. */
                if (!test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
                        set_bit(NETFS_SREQ_FAILED, &subreq->flags);
                        trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
                }

                switch (subreq->source) {
                case NETFS_WRITE_TO_CACHE:
                        netfs_stat(&netfs_n_wh_write_failed);
                        break;
                case NETFS_UPLOAD_TO_SERVER:
                        netfs_stat(&netfs_n_wh_upload_failed);
                        break;
                default:
                        break;
                }
                trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
                set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
        } else {
                if (WARN(transferred_or_error > subreq->len - subreq->transferred,
                         "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
                         wreq->debug_id, subreq->debug_index,
                         transferred_or_error, subreq->len, subreq->transferred))
                        transferred_or_error = subreq->len - subreq->transferred;

                subreq->error = 0;
                subreq->transferred += transferred_or_error;

                if (subreq->transferred < subreq->len)
                        set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
        }

        trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
        netfs_subreq_clear_in_progress(subreq);
        netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_write_subrequest_terminated);