root/fs/fserror.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2025 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/mempool.h>
#include <linux/fserror.h>

#define FSERROR_DEFAULT_EVENT_POOL_SIZE         (32)

static struct mempool fserror_events_pool;

void fserror_mount(struct super_block *sb)
{
        /*
         * The pending error counter is biased by 1 so that we don't wake_var
         * until we're actually trying to unmount.
         */
        refcount_set(&sb->s_pending_errors, 1);
}

void fserror_unmount(struct super_block *sb)
{
        /*
         * If we don't drop the pending error count to zero, then wait for it
         * to drop below 1, which means that the pending errors cleared and
         * hopefully we didn't saturate with 1 billion+ concurrent events.
         */
        if (!refcount_dec_and_test(&sb->s_pending_errors))
                wait_var_event(&sb->s_pending_errors,
                               refcount_read(&sb->s_pending_errors) < 1);
}

static inline void fserror_pending_dec(struct super_block *sb)
{
        if (refcount_dec_and_test(&sb->s_pending_errors))
                wake_up_var(&sb->s_pending_errors);
}

static inline void fserror_free_event(struct fserror_event *event)
{
        fserror_pending_dec(event->sb);
        mempool_free(event, &fserror_events_pool);
}

static void fserror_worker(struct work_struct *work)
{
        struct fserror_event *event =
                        container_of(work, struct fserror_event, work);
        struct super_block *sb = event->sb;

        if (sb->s_flags & SB_ACTIVE) {
                struct fs_error_report report = {
                        /* send positive error number to userspace */
                        .error = -event->error,
                        .inode = event->inode,
                        .sb = event->sb,
                };

                if (sb->s_op->report_error)
                        sb->s_op->report_error(event);

                fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL,
                         NULL, 0);
        }

        iput(event->inode);
        fserror_free_event(event);
}

static inline struct fserror_event *fserror_alloc_event(struct super_block *sb,
                                                        gfp_t gfp_flags)
{
        struct fserror_event *event = NULL;

        /*
         * If pending_errors already reached zero or is no longer active,
         * the superblock is being deactivated so there's no point in
         * continuing.
         *
         * The order of the check of s_pending_errors and SB_ACTIVE are
         * mandated by order of accesses in generic_shutdown_super and
         * fserror_unmount.  Barriers are implicitly provided by the refcount
         * manipulations in this function and fserror_unmount.
         */
        if (!refcount_inc_not_zero(&sb->s_pending_errors))
                return NULL;
        if (!(sb->s_flags & SB_ACTIVE))
                goto out_pending;

        event = mempool_alloc(&fserror_events_pool, gfp_flags);
        if (!event)
                goto out_pending;

        /* mempool_alloc doesn't support GFP_ZERO */
        memset(event, 0, sizeof(*event));
        event->sb = sb;
        INIT_WORK(&event->work, fserror_worker);

        return event;

out_pending:
        fserror_pending_dec(sb);
        return NULL;
}

/**
 * fserror_report - report a filesystem error of some kind
 *
 * @sb:         superblock of the filesystem
 * @inode:      inode within that filesystem, if applicable
 * @type:       type of error encountered
 * @pos:        start of inode range affected, if applicable
 * @len:        length of inode range affected, if applicable
 * @error:      error number encountered, must be negative
 * @gfp:        memory allocation flags for conveying the event to a worker,
 *              since this function can be called from atomic contexts
 *
 * Report details of a filesystem error to the super_operations::report_error
 * callback if present; and to fsnotify for distribution to userspace.  @sb,
 * @gfp, @type, and @error must all be specified.  For file I/O errors, the
 * @inode, @pos, and @len fields must also be specified.  For file metadata
 * errors, @inode must be specified.  If @inode is not NULL, then @inode->i_sb
 * must point to @sb.
 *
 * Reporting work is deferred to a workqueue to ensure that ->report_error is
 * called from process context without any locks held.  An active reference to
 * the inode is maintained until event handling is complete, and unmount will
 * wait for queued events to drain.
 */
void fserror_report(struct super_block *sb, struct inode *inode,
                    enum fserror_type type, loff_t pos, u64 len, int error,
                    gfp_t gfp)
{
        struct fserror_event *event;

        /* sb and inode must be from the same filesystem */
        WARN_ON_ONCE(inode && inode->i_sb != sb);

        /* error number must be negative */
        WARN_ON_ONCE(error >= 0);

        event = fserror_alloc_event(sb, gfp);
        if (!event)
                goto lost;

        event->type = type;
        event->pos = pos;
        event->len = len;
        event->error = error;

        /*
         * Can't iput from non-sleeping context, so grabbing another reference
         * to the inode must be the last thing before submitting the event.
         */
        if (inode) {
                event->inode = igrab(inode);
                if (!event->inode)
                        goto lost_event;
        }

        /*
         * Use schedule_work here even if we're already in process context so
         * that fsnotify and super_operations::report_error implementations are
         * guaranteed to run in process context without any locks held.  Since
         * errors are supposed to be rare, the overhead shouldn't kill us any
         * more than the failing device will.
         */
        schedule_work(&event->work);
        return;

lost_event:
        fserror_free_event(event);
lost:
        if (inode)
                pr_err_ratelimited(
 "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d",
                       sb->s_id, inode->i_ino, type, pos, len, error);
        else
                pr_err_ratelimited(
 "%s: lost filesystem error report for type %u error %d",
                       sb->s_id, type, error);
}
EXPORT_SYMBOL_GPL(fserror_report);

static int __init fserror_init(void)
{
        return mempool_init_kmalloc_pool(&fserror_events_pool,
                                         FSERROR_DEFAULT_EVENT_POOL_SIZE,
                                         sizeof(struct fserror_event));
}
fs_initcall(fserror_init);