root/fs/xfs/xfs_healthmon.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_quota_defs.h"
#include "xfs_rtgroup.h"
#include "xfs_health.h"
#include "xfs_healthmon.h"
#include "xfs_fsops.h"
#include "xfs_notify_failure.h"
#include "xfs_file.h"
#include "xfs_ioctl.h"

#include <linux/anon_inodes.h>
#include <linux/eventpoll.h>
#include <linux/poll.h>
#include <linux/fserror.h>

/*
 * Live Health Monitoring
 * ======================
 *
 * Autonomous self-healing of XFS filesystems requires a means for the kernel
 * to send filesystem health events to a monitoring daemon in userspace.  To
 * accomplish this, we establish a thread_with_file kthread object to handle
 * translating internal events about filesystem health into a format that can
 * be parsed easily by userspace.  When those internal events occur, the core
 * filesystem code calls this health monitor to convey the events to userspace.
 * Userspace reads events from the file descriptor returned by the ioctl.
 *
 * The healthmon abstraction has a weak reference to the host filesystem mount
 * so that the queueing and processing of the events do not pin the mount and
 * cannot slow down the main filesystem.  The healthmon object can exist past
 * the end of the filesystem mount.
 */

/* sign of a detached health monitor */
#define DETACHED_MOUNT_COOKIE           ((uintptr_t)0)

/* Constrain the number of event objects that can build up in memory. */
#define XFS_HEALTHMON_MAX_EVENTS        (SZ_32K / \
                                         sizeof(struct xfs_healthmon_event))

/* Constrain the size of the output buffer for read_iter. */
#define XFS_HEALTHMON_MAX_OUTBUF        SZ_64K

/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
static DEFINE_SPINLOCK(xfs_healthmon_lock);

/* Take a reference on the mount's healthmon object; NULL if there is none. */
static struct xfs_healthmon *
xfs_healthmon_get(
        struct xfs_mount                *mp)
{
        struct xfs_healthmon            *monitor;

        /* The RCU read section pairs with kfree_rcu in xfs_healthmon_put. */
        rcu_read_lock();
        monitor = rcu_dereference(mp->m_healthmon);
        if (monitor != NULL && !refcount_inc_not_zero(&monitor->ref))
                monitor = NULL;
        rcu_read_unlock();

        return monitor;
}

/*
 * Release the reference to a healthmon object.  If there are no more holders,
 * free the health monitor after an RCU grace period to eliminate possibility
 * of races with xfs_healthmon_get.
 */
static void
xfs_healthmon_put(
        struct xfs_healthmon            *hm)
{
        if (refcount_dec_and_test(&hm->ref)) {
                struct xfs_healthmon_event      *event;
                struct xfs_healthmon_event      *next = hm->first_event;

                /* Last reference: discard any events still queued. */
                while ((event = next) != NULL) {
                        trace_xfs_healthmon_drop(hm, event);
                        next = event->next;
                        kfree(event);
                }

                /* The preallocated unmount event may never have been sent. */
                kfree(hm->unmount_event);
                kfree(hm->buffer);
                mutex_destroy(&hm->lock);
                /* Defer the final free so a concurrent _get cannot race. */
                kfree_rcu_mightsleep(hm);
        }
}

/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
STATIC int
xfs_healthmon_attach(
        struct xfs_mount        *mp,
        struct xfs_healthmon    *hm)
{
        spin_lock(&xfs_healthmon_lock);
        if (rcu_access_pointer(mp->m_healthmon) != NULL) {
                spin_unlock(&xfs_healthmon_lock);
                return -EEXIST;
        }

        refcount_inc(&hm->ref);
        rcu_assign_pointer(mp->m_healthmon, hm);
        hm->mount_cookie = (uintptr_t)mp->m_super;
        spin_unlock(&xfs_healthmon_lock);

        return 0;
}

/* Detach a xfs mount from a specific healthmon instance. */
STATIC void
xfs_healthmon_detach(
        struct xfs_healthmon    *hm)
{
        struct xfs_mount        *mp;

        spin_lock(&xfs_healthmon_lock);
        if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
                /* Already detached by unmount or a racing ->release. */
                spin_unlock(&xfs_healthmon_lock);
                return;
        }

        /* The cookie is the superblock pointer; recover the mount from it. */
        mp = XFS_M((struct super_block *)hm->mount_cookie);
        rcu_assign_pointer(mp->m_healthmon, NULL);
        hm->mount_cookie = DETACHED_MOUNT_COOKIE;
        spin_unlock(&xfs_healthmon_lock);

        /*
         * Wake up any readers that might remain.  This can happen if unmount
         * races with the healthmon fd owner entering ->read_iter, having
         * already emptied the event queue.
         *
         * In the ->release case there shouldn't be any readers because the
         * only users of the waiter are read and poll.
         */
        wake_up_all(&hm->wait);

        trace_xfs_healthmon_detach(hm);
        /* Drop the reference that xfs_healthmon_attach took for the mount. */
        xfs_healthmon_put(hm);
}

/* Account for one more queued event, both current and lifetime counts. */
static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
{
        hm->total_events++;
        hm->events++;
}

/* Account for one more dropped event, both pending and lifetime counts. */
static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
{
        hm->total_lost++;
        hm->lost_prev_event++;
}

/*
 * If possible, merge a new event into an existing event.  Returns whether or
 * not it merged anything.
 */
static bool
xfs_healthmon_merge_events(
        struct xfs_healthmon_event              *existing,
        const struct xfs_healthmon_event        *new)
{
        /* Empty queue: nothing to merge into. */
        if (!existing)
                return false;

        /* type and domain must match to merge events */
        if (existing->type != new->type ||
            existing->domain != new->domain)
                return false;

        switch (existing->type) {
        case XFS_HEALTHMON_RUNNING:
        case XFS_HEALTHMON_UNMOUNT:
                /* should only ever be one of these events anyway */
                return false;

        case XFS_HEALTHMON_LOST:
                /* Fold the new loss count into the existing LOST event. */
                existing->lostcount += new->lostcount;
                return true;

        case XFS_HEALTHMON_SICK:
        case XFS_HEALTHMON_CORRUPT:
        case XFS_HEALTHMON_HEALTHY:
                /* Health reports merge only within the same object. */
                switch (existing->domain) {
                case XFS_HEALTHMON_FS:
                        existing->fsmask |= new->fsmask;
                        return true;
                case XFS_HEALTHMON_AG:
                case XFS_HEALTHMON_RTGROUP:
                        /* Same group number required to union the masks. */
                        if (existing->group == new->group){
                                existing->grpmask |= new->grpmask;
                                return true;
                        }
                        return false;
                case XFS_HEALTHMON_INODE:
                        /* Same inode and generation required. */
                        if (existing->ino == new->ino &&
                            existing->gen == new->gen) {
                                existing->imask |= new->imask;
                                return true;
                        }
                        return false;
                default:
                        ASSERT(0);
                        return false;
                }
                /* NOTE(review): unreachable; inner switch always returns. */
                return false;

        case XFS_HEALTHMON_SHUTDOWN:
                /* yes, we can race to shutdown */
                existing->flags |= new->flags;
                return true;

        case XFS_HEALTHMON_MEDIA_ERROR:
                /* physically adjacent errors can merge */
                if (existing->daddr + existing->bbcount == new->daddr) {
                        /* New range extends the existing one upwards. */
                        existing->bbcount += new->bbcount;
                        return true;
                }
                if (new->daddr + new->bbcount == existing->daddr) {
                        /* New range abuts the existing one from below. */
                        existing->daddr = new->daddr;
                        existing->bbcount += new->bbcount;
                        return true;
                }
                return false;

        case XFS_HEALTHMON_BUFREAD:
        case XFS_HEALTHMON_BUFWRITE:
        case XFS_HEALTHMON_DIOREAD:
        case XFS_HEALTHMON_DIOWRITE:
        case XFS_HEALTHMON_DATALOST:
                /* logically adjacent file ranges can merge */
                if (existing->fino != new->fino || existing->fgen != new->fgen)
                        return false;

                if (existing->fpos + existing->flen == new->fpos) {
                        /* New range follows the existing one. */
                        existing->flen += new->flen;
                        return true;
                }

                if (new->fpos + new->flen == existing->fpos) {
                        /* New range precedes the existing one. */
                        existing->fpos = new->fpos;
                        existing->flen += new->flen;
                        return true;
                }
                return false;
        }

        return false;
}

/*
 * Insert an event onto the start of the queue.  Used only for high-priority
 * notifications (unmount) that must be seen before anything already queued.
 * Caller relinquishes ownership of @event.
 */
static inline void
__xfs_healthmon_insert(
        struct xfs_healthmon            *hm,
        struct xfs_healthmon_event      *event)
{
        struct timespec64               now;

        /* Stamp the event with coarse wallclock time, in nanoseconds. */
        ktime_get_coarse_real_ts64(&now);
        event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;

        /*
         * Link the new event at the head of the list.  The head pointer must
         * be updated unconditionally: the previous code only set it when the
         * queue was empty, so inserting into a non-empty queue left the event
         * unreachable -- leaked and never delivered to userspace.
         */
        event->next = hm->first_event;
        hm->first_event = event;
        if (!hm->last_event)
                hm->last_event = event;
        xfs_healthmon_bump_events(hm);
        wake_up(&hm->wait);

        trace_xfs_healthmon_insert(hm, event);
}

/* Push an event onto the end of the queue. */
static inline void
__xfs_healthmon_push(
        struct xfs_healthmon            *hm,
        struct xfs_healthmon_event      *event)
{
        struct timespec64               now;

        ktime_get_coarse_real_ts64(&now);
        event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;

        if (!hm->first_event)
                hm->first_event = event;
        if (hm->last_event)
                hm->last_event->next = event;
        hm->last_event = event;
        event->next = NULL;
        xfs_healthmon_bump_events(hm);
        wake_up(&hm->wait);

        trace_xfs_healthmon_push(hm, event);
}

/* Deal with any previously lost events */
static int
xfs_healthmon_clear_lost_prev(
        struct xfs_healthmon            *hm)
{
        struct xfs_healthmon_event      lost_event = {
                .type                   = XFS_HEALTHMON_LOST,
                .domain                 = XFS_HEALTHMON_MOUNT,
                .lostcount              = hm->lost_prev_event,
        };
        struct xfs_healthmon_event      *event = NULL;

        if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
                trace_xfs_healthmon_merge(hm, hm->last_event);
                wake_up(&hm->wait);
                goto cleared;
        }

        if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
                event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event),
                                GFP_NOFS);
        if (!event)
                return -ENOMEM;

        __xfs_healthmon_push(hm, event);
cleared:
        hm->lost_prev_event = 0;
        return 0;
}

/*
 * Push an event onto the end of the list after dealing with lost events and
 * possibly full queues.
 */
STATIC int
xfs_healthmon_push(
        struct xfs_healthmon                    *hm,
        const struct xfs_healthmon_event        *template)
{
        struct xfs_healthmon_event              *event = NULL;
        int                                     error = 0;

        /*
         * Locklessly check if the health monitor has already detached from the
         * mount.  If so, ignore the event.  If we race with deactivation,
         * we'll queue the event but never send it.
         */
        if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
                return -ESHUTDOWN;

        mutex_lock(&hm->lock);

        /* Report previously lost events before we do anything else */
        if (hm->lost_prev_event) {
                error = xfs_healthmon_clear_lost_prev(hm);
                if (error)
                        goto out_unlock;
        }

        /* Try to merge with the newest event */
        if (xfs_healthmon_merge_events(hm->last_event, template)) {
                trace_xfs_healthmon_merge(hm, hm->last_event);
                wake_up(&hm->wait);
                goto out_unlock;
        }

        /* Only create a heap event object if we're not already at capacity. */
        if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
                event = kmemdup(template, sizeof(struct xfs_healthmon_event),
                                GFP_NOFS);
        if (!event) {
                /*
                 * No memory (or a full queue) means we lose the event; the
                 * loss is recorded and reported to userspace later.
                 */
                trace_xfs_healthmon_lost_event(hm);
                xfs_healthmon_bump_lost(hm);
                error = -ENOMEM;
                goto out_unlock;
        }

        __xfs_healthmon_push(hm, event);

out_unlock:
        mutex_unlock(&hm->lock);
        return error;
}

/*
 * Report that the filesystem is being unmounted, then detach the xfs mount
 * from this healthmon instance.
 */
void
xfs_healthmon_unmount(
        struct xfs_mount                *mp)
{
        struct xfs_healthmon            *hm = xfs_healthmon_get(mp);

        /* Nobody is monitoring this mount; nothing to report. */
        if (!hm)
                return;

        trace_xfs_healthmon_report_unmount(hm);

        /*
         * Insert the unmount notification at the start of the event queue so
         * that userspace knows the filesystem went away as soon as possible.
         * There's nothing actionable for userspace after an unmount.  Once
         * we've inserted the unmount event, hm no longer owns that event.
         */
        __xfs_healthmon_insert(hm, hm->unmount_event);
        hm->unmount_event = NULL;

        /* Detach, then drop the reference we took above. */
        xfs_healthmon_detach(hm);
        xfs_healthmon_put(hm);
}

/* Compute the reporting mask for non-unmount metadata health events. */
static inline unsigned int
metadata_event_mask(
        struct xfs_healthmon            *hm,
        enum xfs_healthmon_type         type,
        unsigned int                    old_mask,
        unsigned int                    new_mask)
{
        /* Verbose monitors hear about everything, unfiltered. */
        if (hm->verbose)
                return new_mask;

        switch (type) {
        case XFS_HEALTHMON_SICK:
                /* Runtime corruption reports always go out. */
                return new_mask;
        case XFS_HEALTHMON_CORRUPT:
                /* Report only bits that fsck hadn't already flagged. */
                return new_mask & ~old_mask;
        case XFS_HEALTHMON_HEALTHY:
                /* Report only bits that transitioned from sick to healthy. */
                return new_mask & old_mask;
        default:
                ASSERT(0);
                return 0;
        }
}

/* Report XFS_FS_SICK_* events to healthmon */
void
xfs_healthmon_report_fs(
        struct xfs_mount                *mp,
        enum xfs_healthmon_type         type,
        unsigned int                    old_mask,
        unsigned int                    new_mask)
{
        struct xfs_healthmon            *hm = xfs_healthmon_get(mp);
        struct xfs_healthmon_event      event = {
                .type                   = type,
                .domain                 = XFS_HEALTHMON_FS,
        };

        /* Nobody is listening; drop the report on the floor. */
        if (!hm)
                return;

        /* Filter the report and mask off secondary-metadata bits. */
        event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
                        ~XFS_SICK_FS_SECONDARY;
        trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);

        if (event.fsmask != 0)
                xfs_healthmon_push(hm, &event);

        xfs_healthmon_put(hm);
}

/* Report XFS_SICK_(AG|RG)* flags to healthmon */
void
xfs_healthmon_report_group(
        struct xfs_group                *xg,
        enum xfs_healthmon_type         type,
        unsigned int                    old_mask,
        unsigned int                    new_mask)
{
        struct xfs_healthmon_event      event = {
                .type                   = type,
                .group                  = xg->xg_gno,
        };
        struct xfs_healthmon            *hm = xfs_healthmon_get(xg->xg_mount);
        unsigned int                    mask;

        /* Nobody is listening; drop the report on the floor. */
        if (!hm)
                return;

        /* Filter the report, then strip the per-group secondary bits. */
        mask = metadata_event_mask(hm, type, old_mask, new_mask);
        switch (xg->xg_type) {
        case XG_TYPE_RTG:
                event.domain = XFS_HEALTHMON_RTGROUP;
                event.grpmask = mask & ~XFS_SICK_RG_SECONDARY;
                break;
        case XG_TYPE_AG:
                event.domain = XFS_HEALTHMON_AG;
                event.grpmask = mask & ~XFS_SICK_AG_SECONDARY;
                break;
        default:
                ASSERT(0);
                break;
        }

        trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);

        if (event.grpmask != 0)
                xfs_healthmon_push(hm, &event);

        xfs_healthmon_put(hm);
}

/* Report XFS_SICK_INO_* flags to healthmon */
void
xfs_healthmon_report_inode(
        struct xfs_inode                *ip,
        enum xfs_healthmon_type         type,
        unsigned int                    old_mask,
        unsigned int                    new_mask)
{
        struct xfs_healthmon_event      event = {
                .type                   = type,
                .domain                 = XFS_HEALTHMON_INODE,
                .ino                    = ip->i_ino,
                .gen                    = VFS_I(ip)->i_generation,
        };
        struct xfs_healthmon            *hm = xfs_healthmon_get(ip->i_mount);

        /* Nobody is listening; drop the report on the floor. */
        if (!hm)
                return;

        /* Filter the report and mask off secondary-metadata bits. */
        event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
                        ~XFS_SICK_INO_SECONDARY;
        /*
         * NOTE(review): the fs and group reporters trace (old_mask, new_mask)
         * but this one traces the filtered imask in place of new_mask --
         * confirm that inconsistency is intentional.
         */
        trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event);

        if (event.imask)
                xfs_healthmon_push(hm, &event);

        xfs_healthmon_put(hm);
}

/* Queue a filesystem shutdown event for delivery to userspace. */
void
xfs_healthmon_report_shutdown(
        struct xfs_mount                *mp,
        uint32_t                        flags)
{
        struct xfs_healthmon            *hm = xfs_healthmon_get(mp);
        struct xfs_healthmon_event      event = {
                .type                   = XFS_HEALTHMON_SHUTDOWN,
                .domain                 = XFS_HEALTHMON_MOUNT,
                .flags                  = flags,
        };

        /* Nobody is listening; drop the report on the floor. */
        if (!hm)
                return;

        trace_xfs_healthmon_report_shutdown(hm, flags);

        xfs_healthmon_push(hm, &event);
        xfs_healthmon_put(hm);
}

/* Map the failed device to the healthmon reporting domain. */
static inline enum xfs_healthmon_domain
media_error_domain(
        enum xfs_device                 fdev)
{
        if (fdev == XFS_DEV_DATA)
                return XFS_HEALTHMON_DATADEV;
        if (fdev == XFS_DEV_LOG)
                return XFS_HEALTHMON_LOGDEV;
        if (fdev == XFS_DEV_RT)
                return XFS_HEALTHMON_RTDEV;

        ASSERT(0);
        return 0;
}

/* Add a media error event to the reporting queue. */
void
xfs_healthmon_report_media(
        struct xfs_mount                *mp,
        enum xfs_device                 fdev,
        xfs_daddr_t                     daddr,
        uint64_t                        bbcount)
{
        struct xfs_healthmon_event      event = {
                .type                   = XFS_HEALTHMON_MEDIA_ERROR,
                .domain                 = media_error_domain(fdev),
                .daddr                  = daddr,
                .bbcount                = bbcount,
        };
        struct xfs_healthmon            *hm = xfs_healthmon_get(mp);

        if (!hm)
                return;

        trace_xfs_healthmon_report_media(hm, fdev, &event);

        xfs_healthmon_push(hm, &event);
        xfs_healthmon_put(hm);
}

/* Translate a VFS fserror action into the healthmon event type. */
static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
{
        if (action == FSERR_BUFFERED_READ)
                return XFS_HEALTHMON_BUFREAD;
        if (action == FSERR_BUFFERED_WRITE)
                return XFS_HEALTHMON_BUFWRITE;
        if (action == FSERR_DIRECTIO_READ)
                return XFS_HEALTHMON_DIOREAD;
        if (action == FSERR_DIRECTIO_WRITE)
                return XFS_HEALTHMON_DIOWRITE;
        if (action == FSERR_DATA_LOST)
                return XFS_HEALTHMON_DATALOST;

        /* FSERR_METADATA is filtered out by xfs_fs_report_error. */
        ASSERT(0);
        return -1;
}

/* Queue a file I/O error event for delivery to userspace. */
void
xfs_healthmon_report_file_ioerror(
        struct xfs_inode                *ip,
        const struct fserror_event      *p)
{
        struct xfs_healthmon            *hm = xfs_healthmon_get(ip->i_mount);
        struct xfs_healthmon_event      event = {
                .type                   = file_ioerr_type(p->type),
                .domain                 = XFS_HEALTHMON_FILERANGE,
                .fino                   = ip->i_ino,
                .fgen                   = VFS_I(ip)->i_generation,
                .fpos                   = p->pos,
                .flen                   = p->len,
                /* send positive error number to userspace */
                .error                  = -p->error,
        };

        /* Nobody is listening; drop the report on the floor. */
        if (!hm)
                return;

        trace_xfs_healthmon_report_file_ioerror(hm, p);

        xfs_healthmon_push(hm, &event);
        xfs_healthmon_put(hm);
}

/* Rewind both outbuf cursors; the buffer holds no live data afterwards. */
static inline void
xfs_healthmon_reset_outbuf(
        struct xfs_healthmon            *hm)
{
        hm->bufhead = 0;
        hm->buftail = 0;
}

/* One in-kernel flag bit and the uapi bit it is reported as. */
struct flags_map {
        unsigned int            in_mask;
        unsigned int            out_mask;
};

/* Translation table from SHUTDOWN_* reasons to uapi shutdown bits. */
static const struct flags_map shutdown_map[] = {
        { SHUTDOWN_META_IO_ERROR,       XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
        { SHUTDOWN_LOG_IO_ERROR,        XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
        { SHUTDOWN_FORCE_UMOUNT,        XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
        { SHUTDOWN_CORRUPT_INCORE,      XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
        { SHUTDOWN_CORRUPT_ONDISK,      XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
        { SHUTDOWN_DEVICE_REMOVED,      XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
};

/* Translate a bitmask through a flags_map table, bit by bit. */
static inline unsigned int
__map_flags(
        const struct flags_map  *map,
        size_t                  array_len,
        unsigned int            flags)
{
        unsigned int            out = 0;
        size_t                  i;

        for (i = 0; i < array_len; i++) {
                if (flags & map[i].in_mask)
                        out |= map[i].out_mask;
        }

        return out;
}

/* Convenience wrapper that passes the table length automatically. */
#define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))

/* Translate SHUTDOWN_* reasons into uapi XFS_HEALTH_SHUTDOWN_* bits. */
static inline unsigned int shutdown_mask(unsigned int in)
{
        return map_flags(shutdown_map, in);
}

/* Translate internal event domains into uapi domain codes. */
static const unsigned int domain_map[] = {
        [XFS_HEALTHMON_MOUNT]           = XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
        [XFS_HEALTHMON_FS]              = XFS_HEALTH_MONITOR_DOMAIN_FS,
        [XFS_HEALTHMON_AG]              = XFS_HEALTH_MONITOR_DOMAIN_AG,
        [XFS_HEALTHMON_INODE]           = XFS_HEALTH_MONITOR_DOMAIN_INODE,
        [XFS_HEALTHMON_RTGROUP]         = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
        [XFS_HEALTHMON_DATADEV]         = XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
        [XFS_HEALTHMON_RTDEV]           = XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
        [XFS_HEALTHMON_LOGDEV]          = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
        [XFS_HEALTHMON_FILERANGE]       = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
};

/* Translate internal event types into uapi type codes. */
static const unsigned int type_map[] = {
        [XFS_HEALTHMON_RUNNING]         = XFS_HEALTH_MONITOR_TYPE_RUNNING,
        [XFS_HEALTHMON_LOST]            = XFS_HEALTH_MONITOR_TYPE_LOST,
        [XFS_HEALTHMON_SICK]            = XFS_HEALTH_MONITOR_TYPE_SICK,
        [XFS_HEALTHMON_CORRUPT]         = XFS_HEALTH_MONITOR_TYPE_CORRUPT,
        [XFS_HEALTHMON_HEALTHY]         = XFS_HEALTH_MONITOR_TYPE_HEALTHY,
        [XFS_HEALTHMON_UNMOUNT]         = XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
        [XFS_HEALTHMON_SHUTDOWN]        = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
        [XFS_HEALTHMON_MEDIA_ERROR]     = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
        [XFS_HEALTHMON_BUFREAD]         = XFS_HEALTH_MONITOR_TYPE_BUFREAD,
        [XFS_HEALTHMON_BUFWRITE]        = XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
        [XFS_HEALTHMON_DIOREAD]         = XFS_HEALTH_MONITOR_TYPE_DIOREAD,
        [XFS_HEALTHMON_DIOWRITE]        = XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
        [XFS_HEALTHMON_DATALOST]        = XFS_HEALTH_MONITOR_TYPE_DATALOST,
};

/* Render event as a V0 structure */
STATIC int
xfs_healthmon_format_v0(
        struct xfs_healthmon            *hm,
        const struct xfs_healthmon_event *event)
{
        struct xfs_health_monitor_event hme = {
                .time_ns                = event->time_ns,
        };

        trace_xfs_healthmon_format(hm, event);

        /* Reject events whose codes fall outside the translation tables. */
        if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
            event->type < 0   || event->type >= ARRAY_SIZE(type_map))
                return -EFSCORRUPTED;

        hme.domain = domain_map[event->domain];
        hme.type = type_map[event->type];

        /* fill in the event-specific details */
        switch (event->domain) {
        case XFS_HEALTHMON_MOUNT:
                switch (event->type) {
                case XFS_HEALTHMON_LOST:
                        hme.e.lost.count = event->lostcount;
                        break;
                case XFS_HEALTHMON_SHUTDOWN:
                        hme.e.shutdown.reasons = shutdown_mask(event->flags);
                        break;
                default:
                        /* RUNNING/UNMOUNT carry no extra payload. */
                        break;
                }
                break;
        case XFS_HEALTHMON_FS:
                hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
                break;
        case XFS_HEALTHMON_RTGROUP:
                hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
                hme.e.group.gno = event->group;
                break;
        case XFS_HEALTHMON_AG:
                hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
                hme.e.group.gno = event->group;
                break;
        case XFS_HEALTHMON_INODE:
                hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
                hme.e.inode.ino = event->ino;
                hme.e.inode.gen = event->gen;
                break;
        case XFS_HEALTHMON_DATADEV:
        case XFS_HEALTHMON_LOGDEV:
        case XFS_HEALTHMON_RTDEV:
                hme.e.media.daddr = event->daddr;
                hme.e.media.bbcount = event->bbcount;
                break;
        case XFS_HEALTHMON_FILERANGE:
                hme.e.filerange.ino = event->fino;
                hme.e.filerange.gen = event->fgen;
                hme.e.filerange.pos = event->fpos;
                hme.e.filerange.len = event->flen;
                hme.e.filerange.error = abs(event->error);
                break;
        default:
                break;
        }

        /* xfs_healthmon_format_pop checked the space before popping. */
        ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);

        /* copy formatted object to the outbuf */
        if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
                memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
                hm->bufhead += sizeof(hme);
        }

        return 0;
}

/* How many bytes are waiting in the outbuf to be copied? */
static inline size_t
xfs_healthmon_outbuf_bytes(
        struct xfs_healthmon    *hm)
{
        /* Head at or behind tail means the buffer is drained. */
        if (hm->bufhead <= hm->buftail)
                return 0;
        return hm->bufhead - hm->buftail;
}

/*
 * Do we have something for userspace to read?  This can mean unmount events,
 * events pending in the queue, or pending bytes in the outbuf.
 */
static inline bool
xfs_healthmon_has_eventdata(
        struct xfs_healthmon    *hm)
{
        /*
         * If the health monitor is already detached from the xfs_mount, we
         * want reads to return 0 bytes even if there are no events, because
         * userspace interprets that as EOF.  If we race with deactivation,
         * read_iter will take the necessary locks to discover that there are
         * no events to send.
         */
        if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
                return true;

        /* Events waiting to be formatted? */
        if (hm->events > 0)
                return true;

        /* Unread bytes already formatted into the buffer? */
        return xfs_healthmon_outbuf_bytes(hm) > 0;
}

/* Try to copy the rest of the outbuf to the iov iter. */
STATIC ssize_t
xfs_healthmon_copybuf(
        struct xfs_healthmon    *hm,
        struct iov_iter         *to)
{
        size_t                  avail;
        size_t                  copied = 0;

        trace_xfs_healthmon_copybuf(hm, to);

        avail = xfs_healthmon_outbuf_bytes(hm);
        if (avail > 0) {
                copied = copy_to_iter(hm->buffer + hm->buftail, avail, to);
                if (copied == 0)
                        return -EFAULT;

                hm->buftail += copied;
        }

        /* Buffer fully drained?  Rewind the cursors to reclaim the space. */
        if (xfs_healthmon_outbuf_bytes(hm) == 0)
                xfs_healthmon_reset_outbuf(hm);

        return copied;
}

/*
 * Return a health monitoring event for formatting into the output buffer if
 * there's enough space in the outbuf and an event waiting for us.  Caller
 * must hold i_rwsem on the healthmon file.
 */
static inline struct xfs_healthmon_event *
xfs_healthmon_format_pop(
        struct xfs_healthmon    *hm)
{
        struct xfs_healthmon_event *event;

        /* No room for another formatted event; leave the queue alone. */
        if (hm->bufhead + sizeof(*event) > hm->bufsize)
                return NULL;

        /* The queue is shared with event producers; lock to unlink. */
        mutex_lock(&hm->lock);
        event = hm->first_event;
        if (event) {
                if (hm->last_event == event)
                        hm->last_event = NULL;
                hm->first_event = event->next;
                hm->events--;

                trace_xfs_healthmon_pop(hm, event);
        }
        mutex_unlock(&hm->lock);
        /* Caller now owns the event and must kfree it. */
        return event;
}

/* Allocate the formatting buffer, sized by the reader's first request. */
STATIC int
xfs_healthmon_alloc_outbuf(
        struct xfs_healthmon    *hm,
        size_t                  user_bufsize)
{
        size_t                  bufsize =
                min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));
        void                    *outbuf;

        /* Try the requested size first, then fall back to a single page. */
        outbuf = kzalloc(bufsize, GFP_KERNEL);
        if (!outbuf && bufsize != PAGE_SIZE) {
                bufsize = PAGE_SIZE;
                outbuf = kzalloc(bufsize, GFP_KERNEL);
        }
        if (!outbuf)
                return -ENOMEM;

        hm->buffer = outbuf;
        hm->bufsize = bufsize;
        hm->bufhead = 0;
        hm->buftail = 0;

        return 0;
}

/*
 * Convey queued event data to userspace.  First copy any remaining bytes in
 * the outbuf, then format the oldest event into the outbuf and copy that too.
 */
STATIC ssize_t
xfs_healthmon_read_iter(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct file             *file = iocb->ki_filp;
        struct inode            *inode = file_inode(file);
        struct xfs_healthmon    *hm = file->private_data;
        struct xfs_healthmon_event *event;
        size_t                  copied = 0;
        ssize_t                 ret = 0;

        /*
         * i_rwsem serializes readers; nonblocking readers must not sleep
         * waiting for data or for the lock.
         */
        if (file->f_flags & O_NONBLOCK) {
                if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
                        return -EAGAIN;
        } else {
                ret = wait_event_interruptible(hm->wait,
                                xfs_healthmon_has_eventdata(hm));
                if (ret)
                        return ret;

                inode_lock(inode);
        }

        /* First read on this fd: size the outbuf from the request. */
        if (hm->bufsize == 0) {
                ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
                if (ret)
                        goto out_unlock;
        }

        trace_xfs_healthmon_read_start(hm);

        /*
         * If there's anything left in the output buffer, copy that before
         * formatting more events.
         */
        ret = xfs_healthmon_copybuf(hm, to);
        if (ret < 0)
                goto out_unlock;
        copied += ret;

        while (iov_iter_count(to) > 0) {
                /* Format the next events into the outbuf until it's full. */
                while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
                        ret = xfs_healthmon_format_v0(hm, event);
                        kfree(event);
                        if (ret)
                                goto out_unlock;
                }

                /* Copy anything formatted into outbuf to userspace */
                ret = xfs_healthmon_copybuf(hm, to);
                if (ret <= 0)
                        break;

                copied += ret;
        }

out_unlock:
        trace_xfs_healthmon_read_finish(hm);
        inode_unlock(inode);
        /* Report progress if any was made, else the error (or 0 == EOF). */
        return copied ?: ret;
}

/* Poll for available events. */
STATIC __poll_t
xfs_healthmon_poll(
        struct file                     *file,
        struct poll_table_struct        *wait)
{
        struct xfs_healthmon            *hm = file->private_data;

        poll_wait(file, &hm->wait, wait);

        /* Readable whenever events or buffered bytes are pending. */
        return xfs_healthmon_has_eventdata(hm) ? EPOLLIN : 0;
}

/* Last fput of the monitor fd: detach from the mount and drop our ref. */
STATIC int
xfs_healthmon_release(
        struct inode            *inode,
        struct file             *file)
{
        struct xfs_healthmon    *monitor = file->private_data;

        trace_xfs_healthmon_release(monitor);

        /*
         * We might be closing the healthmon file before the filesystem
         * unmounts, because userspace processes can terminate at any time and
         * for any reason.  Null out xfs_mount::m_healthmon so that another
         * process can create another health monitor file.
         */
        xfs_healthmon_detach(monitor);
        xfs_healthmon_put(monitor);

        return 0;
}

/* Validate ioctl parameters: known flags, v0 format, zeroed padding. */
static inline bool
xfs_healthmon_validate(
        const struct xfs_health_monitor *hmo)
{
        return !(hmo->flags & ~XFS_HEALTH_MONITOR_ALL) &&
               hmo->format == XFS_HEALTH_MONITOR_FMT_V0 &&
               !memchr_inv(&hmo->pad, 0, sizeof(hmo->pad));
}

/* Emit some data about the health monitoring fd. */
/* Dump health monitor state into /proc/<pid>/fdinfo for this fd. */
static void
xfs_healthmon_show_fdinfo(
        struct seq_file         *m,
        struct file             *file)
{
        struct xfs_healthmon    *hm = file->private_data;
        const char              *state;

        mutex_lock(&hm->lock);
        /* A detached cookie means the monitored fs has gone away. */
        state = hm->mount_cookie == DETACHED_MOUNT_COOKIE ? "dead" : "alive";
        seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
                        state,
                        MAJOR(hm->dev), MINOR(hm->dev),
                        hm->total_events,
                        hm->total_lost);
        mutex_unlock(&hm->lock);
}

/* Reconfigure the health monitor. */
/* Update the health monitor's settings from a userspace request. */
STATIC long
xfs_healthmon_reconfigure(
        struct file                     *file,
        unsigned int                    cmd,
        void __user                     *arg)
{
        struct xfs_healthmon            *hm = file->private_data;
        struct xfs_health_monitor       hmo;
        bool                            verbose;

        if (copy_from_user(&hmo, arg, sizeof(hmo)))
                return -EFAULT;
        if (!xfs_healthmon_validate(&hmo))
                return -EINVAL;

        /* Only the verbosity flag is reconfigurable; compute it unlocked. */
        verbose = (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE) != 0;

        mutex_lock(&hm->lock);
        hm->verbose = verbose;
        mutex_unlock(&hm->lock);
        return 0;
}

/* Does the fd point to the same filesystem as the one we're monitoring? */
/* Decide if the fd passed in @arg refers to the filesystem being monitored. */
STATIC long
xfs_healthmon_file_on_monitored_fs(
        struct file                     *file,
        unsigned int                    cmd,
        void __user                     *arg)
{
        struct xfs_healthmon            *hm = file->private_data;
        struct xfs_health_file_on_monitored_fs hms;
        uintptr_t                       cookie;
        long                            ret = 0;

        if (copy_from_user(&hms, arg, sizeof(hms)))
                return -EFAULT;
        if (hms.flags)
                return -EINVAL;

        CLASS(fd, hms_fd)(hms.fd);
        if (fd_empty(hms_fd))
                return -EBADF;

        /*
         * The mount cookie is the address of the superblock we attached to;
         * compare it against the superblock backing the caller's fd.
         */
        cookie = (uintptr_t)file_inode(fd_file(hms_fd))->i_sb;

        mutex_lock(&hm->lock);
        if (hm->mount_cookie != cookie)
                ret = -ESTALE;
        mutex_unlock(&hm->lock);

        return ret;
}

/* Handle ioctls for the health monitoring thread. */
/* Dispatch ioctls issued against the health monitoring fd. */
STATIC long
xfs_healthmon_ioctl(
        struct file                     *file,
        unsigned int                    cmd,
        unsigned long                   p)
{
        void __user                     *arg = (void __user *)p;

        switch (cmd) {
        case XFS_IOC_HEALTH_MONITOR:
                return xfs_healthmon_reconfigure(file, cmd, arg);
        case XFS_IOC_HEALTH_FD_ON_MONITORED_FS:
                return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);
        default:
                return -ENOTTY;
        }
}

/* File operations for the anonymous health monitoring file. */
static const struct file_operations xfs_healthmon_fops = {
        .owner          = THIS_MODULE,
        .show_fdinfo    = xfs_healthmon_show_fdinfo,
        .read_iter      = xfs_healthmon_read_iter,
        .poll           = xfs_healthmon_poll,
        .release        = xfs_healthmon_release,
        .unlocked_ioctl = xfs_healthmon_ioctl,
};

/*
 * Create a health monitoring file.  Returns an index to the fd table or a
 * negative errno.
 */
long
xfs_ioc_health_monitor(
        struct file                     *file,
        struct xfs_health_monitor __user *arg)
{
        struct xfs_health_monitor       hmo;
        struct xfs_healthmon_event      *running_event;
        struct xfs_healthmon            *hm;
        struct xfs_inode                *ip = XFS_I(file_inode(file));
        struct xfs_mount                *mp = ip->i_mount;
        int                             ret;

        /*
         * The only intended user of the health monitoring system should be the
         * xfs_healer daemon running on behalf of the whole filesystem in the
         * initial user namespace.  IOWs, we don't allow unprivileged userspace
         * (they can use fsnotify) nor do we allow containers.
         */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        /* The ioctl must be issued against the root inode of the fs. */
        if (ip->i_ino != mp->m_sb.sb_rootino)
                return -EPERM;
        if (current_user_ns() != &init_user_ns)
                return -EPERM;

        if (copy_from_user(&hmo, arg, sizeof(hmo)))
                return -EFAULT;

        if (!xfs_healthmon_validate(&hmo))
                return -EINVAL;

        hm = kzalloc_obj(*hm);
        if (!hm)
                return -ENOMEM;
        hm->dev = mp->m_super->s_dev;
        /* One reference, owned by us until the fd install hands it off. */
        refcount_set(&hm->ref, 1);

        mutex_init(&hm->lock);
        init_waitqueue_head(&hm->wait);

        if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
                hm->verbose = true;

        /* Queue up the first event that lets the client know we're running. */
        running_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
        if (!running_event) {
                ret = -ENOMEM;
                goto out_hm;
        }
        running_event->type = XFS_HEALTHMON_RUNNING;
        running_event->domain = XFS_HEALTHMON_MOUNT;
        /* Ownership of running_event passes to hm's queue here. */
        __xfs_healthmon_insert(hm, running_event);

        /*
         * Preallocate the unmount event so that we can't fail to notify the
         * filesystem later.  This is key for triggering fast exit of the
         * xfs_healer daemon.
         */
        hm->unmount_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
        if (!hm->unmount_event) {
                ret = -ENOMEM;
                goto out_hm;
        }
        hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
        hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;

        /*
         * Try to attach this health monitor to the xfs_mount.  The monitor is
         * considered live and will receive events if this succeeds.
         */
        ret = xfs_healthmon_attach(mp, hm);
        if (ret)
                goto out_hm;

        /*
         * Create the anonymous file and install a fd for it.  If it succeeds,
         * the file owns hm and can go away at any time, so we must not access
         * it again.  This must go last because we can't undo a fd table
         * installation.
         */
        ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
                        O_CLOEXEC | O_RDONLY);
        if (ret < 0)
                goto out_mp;

        /* Only mp is touched here; hm may already have been released. */
        trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);

        return ret;

out_mp:
        /* Undo the attach so the mount stops sending us events. */
        xfs_healthmon_detach(hm);
out_hm:
        /* Nobody else can hold a reference yet; this put frees hm. */
        ASSERT(refcount_read(&hm->ref) == 1);
        xfs_healthmon_put(hm);
        return ret;
}