// SPDX-License-Identifier: GPL-2.0-only
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/set_memory.h>
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
#include <linux/kexec.h>
#include <linux/vmcore_info.h>

#include <asm/fred.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
#include <asm/tdx.h>

#include "internal.h"

/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT                100     /* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

#define ATTR_LEN               16
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {
        struct device_attribute attr;                   /* device attribute */
        char                    attrname[ATTR_LEN];     /* attribute name */
        u8                      bank;                   /* bank number */
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];

struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
        .bootlog  = -1,
        .monarch_timeout = -1
};

static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
static unsigned long mce_need_notify;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);

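/* Fill in the basic record fields: CPU signature, vendor, MCG_CAP and timestamp. */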
void mce_prep_record_common(struct mce *m)
{
        m->cpuid        = cpuid_eax(1);
        m->cpuvendor    = boot_cpu_data.x86_vendor;
        m->mcgcap       = native_rdmsrq(MSR_IA32_MCG_CAP);
        /* need the internal __ version to avoid deadlocks */
        m->time         = __ktime_get_real_seconds();
}

void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
{
        m->cpu          = cpu;
        m->extcpu       = cpu;
        m->apicid       = cpu_data(cpu).topo.initial_apicid;
        m->microcode    = cpu_data(cpu).microcode;
        m->ppin         = topology_ppin(cpu);
        m->socketid     = topology_physical_package_id(cpu);
}

/* Do the initial setup of a struct mce_hw_err record */
void mce_prep_record(struct mce_hw_err *err)
{
        struct mce *m = &err->m;

        memset(err, 0, sizeof(struct mce_hw_err));
        mce_prep_record_common(m);
        mce_prep_record_per_cpu(smp_processor_id(), m);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

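/* Queue the error record in the genpool and kick irq_work to process it. */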
void mce_log(struct mce_hw_err *err)
{
        if (mce_gen_pool_add(err))
                irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);

void mce_register_decode_chain(struct notifier_block *nb)
{
        if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
                    nb->priority > MCE_PRIO_HIGHEST))
                return;

        blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
        blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

static void __print_mce(struct mce_hw_err *err)
{
        struct mce *m = &err->m;

        pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
                 m->extcpu,
                 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
                 m->mcgstatus, m->bank, m->status);

        if (m->ip) {
                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                        m->cs, m->ip);

                if (m->cs == __KERNEL_CS)
                        pr_cont("{%pS}", (void *)(unsigned long)m->ip);
                pr_cont("\n");
        }

        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
        if (m->addr)
                pr_cont("ADDR %llx ", m->addr);
        if (m->misc)
                pr_cont("MISC %llx ", m->misc);
        if (m->ppin)
                pr_cont("PPIN %llx ", m->ppin);

        if (mce_flags.smca) {
                if (m->synd)
                        pr_cont("SYND %llx ", m->synd);
                if (err->vendor.amd.synd1)
                        pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
                if (err->vendor.amd.synd2)
                        pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
                if (m->ipid)
                        pr_cont("IPID %llx ", m->ipid);
        }

        pr_cont("\n");

        /*
         * Note this output is parsed by external tools and old fields
         * should not be changed.
         */
        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
                m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
                m->microcode);
}

static void print_mce(struct mce_hw_err *err)
{
        struct mce *m = &err->m;

        __print_mce(err);

        if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
                pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_panicked;

static int fake_panic;
static atomic_t mce_fake_panicked;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
        long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

        preempt_disable();
        local_irq_enable();
        while (timeout-- > 0)
                udelay(1);
        if (panic_timeout == 0)
                panic_timeout = mca_cfg.panic_timeout;
        panic("Panicking machine check CPU died");
}

static const char *mce_dump_aux_info(struct mce *m)
{
        if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
                return tdx_dump_mce_info(m);

        return NULL;
}

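/*
 * Print all queued error records plus the final one, write them to the APEI
 * error log, and panic. With fake_panic set, only print without panicking.
 */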
static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
{
        struct llist_node *pending;
        struct mce_evt_llist *l;
        int apei_err = 0;
        const char *memmsg;

        /*
         * Allow instrumentation around external facilities usage. Not that it
         * matters a whole lot since the machine is going to panic anyway.
         */
        instrumentation_begin();

        if (!fake_panic) {
                /*
                 * Make sure only one CPU runs in machine check panic
                 */
                if (atomic_inc_return(&mce_panicked) > 1)
                        wait_for_panic();
                barrier();

                bust_spinlocks(1);
                console_verbose();
        } else {
                /* Don't log too much for fake panic */
                if (atomic_inc_return(&mce_fake_panicked) > 1)
                        goto out;
        }
        pending = mce_gen_pool_prepare_records();
        /* First print corrected ones that are still unlogged */
        llist_for_each_entry(l, pending, llnode) {
                struct mce_hw_err *err = &l->err;
                struct mce *m = &err->m;
                if (!(m->status & MCI_STATUS_UC)) {
                        print_mce(err);
                        if (!apei_err)
                                apei_err = apei_write_mce(m);
                }
        }
        /* Now print uncorrected but with the final one last */
        llist_for_each_entry(l, pending, llnode) {
                struct mce_hw_err *err = &l->err;
                struct mce *m = &err->m;
                if (!(m->status & MCI_STATUS_UC))
                        continue;
                if (!final || mce_cmp(m, &final->m)) {
                        print_mce(err);
                        if (!apei_err)
                                apei_err = apei_write_mce(m);
                }
        }
        if (final) {
                print_mce(final);
                if (!apei_err)
                        apei_err = apei_write_mce(&final->m);
        }
        if (exp)
                pr_emerg(HW_ERR "Machine check: %s\n", exp);

        memmsg = mce_dump_aux_info(&final->m);
        if (memmsg)
                pr_emerg(HW_ERR "Machine check: %s\n", memmsg);

        if (!fake_panic) {
                if (panic_timeout == 0)
                        panic_timeout = mca_cfg.panic_timeout;

                /*
                 * Kdump skips the poisoned page in order to avoid
                 * touching the error bits again. Poison the page even
                 * if the error is fatal and the machine is about to
                 * panic.
                 */
                if (kexec_crash_loaded()) {
                        if (final && (final->m.status & MCI_STATUS_ADDRV)) {
                                struct page *p;
                                p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
                                if (p)
                                        SetPageHWPoison(p);
                        }
                }
                panic(msg);
        } else
                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);

out:
        instrumentation_end();
}

/* Support code for software error injection */

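/* Map an MCA MSR to the offset of the corresponding field in struct mce. */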
static int msr_to_offset(u32 msr)
{
        unsigned bank = __this_cpu_read(injectm.bank);

        if (msr == mca_cfg.rip_msr)
                return offsetof(struct mce, ip);
        if (msr == mca_msr_reg(bank, MCA_STATUS))
                return offsetof(struct mce, status);
        if (msr == mca_msr_reg(bank, MCA_ADDR))
                return offsetof(struct mce, addr);
        if (msr == mca_msr_reg(bank, MCA_MISC))
                return offsetof(struct mce, misc);
        if (msr == MSR_IA32_MCG_STATUS)
                return offsetof(struct mce, mcgstatus);
        return -1;
}

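/*
 * Exception fixup for a faulting RDMSR/WRMSR of an MCA register: accesses to
 * these MSRs should never fault, so report the access and panic.
 */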
void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{
        if (wrmsr) {
                pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
                         (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
                         regs->ip, (void *)regs->ip);
        } else {
                pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
                         (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
        }

        show_stack_regs(regs);

        panic("MCA architectural violation!\n");

        while (true)
                cpu_relax();
}

/* MSR access wrappers used for error injection */
noinstr u64 mce_rdmsrq(u32 msr)
{
        EAX_EDX_DECLARE_ARGS(val, low, high);

        if (__this_cpu_read(injectm.finished)) {
                int offset;
                u64 ret;

                instrumentation_begin();

                offset = msr_to_offset(msr);
                if (offset < 0)
                        ret = 0;
                else
                        ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);

                instrumentation_end();

                return ret;
        }

        /*
         * RDMSR on MCA MSRs should not fault. If they do, this is very much an
         * architectural violation and needs to be reported to hw vendor. Panic
         * the box to not allow any further progress.
         */
        asm volatile("1: rdmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
                     : EAX_EDX_RET(val, low, high) : "c" (msr));


        return EAX_EDX_VAL(val, low, high);
}

noinstr void mce_wrmsrq(u32 msr, u64 v)
{
        u32 low, high;

        if (__this_cpu_read(injectm.finished)) {
                int offset;

                instrumentation_begin();

                offset = msr_to_offset(msr);
                if (offset >= 0)
                        *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;

                instrumentation_end();

                return;
        }

        low  = (u32)v;
        high = (u32)(v >> 32);

        /* See comment in mce_rdmsrq() */
        asm volatile("1: wrmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
                     : : "c" (msr), "a"(low), "d" (high) : "memory");
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
{
        struct mce *m;
        /*
         * Enable instrumentation around mce_prep_record() which calls external
         * facilities.
         */
        instrumentation_begin();
        mce_prep_record(err);
        instrumentation_end();

        m = &err->m;
        m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
        if (regs) {
                /*
                 * Get the address of the instruction at the time of
                 * the machine check error.
                 */
                if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
                        m->ip = regs->ip;
                        m->cs = regs->cs;

                        /*
                         * When in VM86 mode make the cs look like ring 3
                         * always. This is a lie, but it's better than passing
                         * the additional vm86 bit around everywhere.
                         */
                        if (v8086_mode(regs))
                                m->cs |= 3;
                }
                /* Use accurate RIP reporting if available. */
                if (mca_cfg.rip_msr)
                        m->ip = mce_rdmsrq(mca_cfg.rip_msr);
        }
}

bool mce_available(struct cpuinfo_x86 *c)
{
        if (mca_cfg.disabled)
                return false;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
        if (!mce_gen_pool_empty())
                schedule_work(&mce_work);
}

static void mce_irq_work_cb(struct irq_work *entry)
{
        mce_schedule_work();
}

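/*
 * Check whether the error record carries an address which is usable for
 * recovery action, e.g. memory_failure().
 */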
bool mce_usable_address(struct mce *m)
{
        if (!(m->status & MCI_STATUS_ADDRV))
                return false;

        switch (m->cpuvendor) {
        case X86_VENDOR_AMD:
                return amd_mce_usable_address(m);

        case X86_VENDOR_INTEL:
        case X86_VENDOR_ZHAOXIN:
                return intel_mce_usable_address(m);

        default:
                return true;
        }
}
EXPORT_SYMBOL_GPL(mce_usable_address);

bool mce_is_memory_error(struct mce *m)
{
        switch (m->cpuvendor) {
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
                return amd_mce_is_memory_error(m);

        case X86_VENDOR_INTEL:
        case X86_VENDOR_ZHAOXIN:
                /*
                 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
                 *
                 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
                 * indicating a memory error. Bit 8 is used for indicating a
                 * cache hierarchy error. The combination of bit 2 and bit 3
                 * is used for indicating a 'generic' cache hierarchy error.
                 * But we can't just blindly check the above bits, because if
                 * bit 11 is set, then it is a bus/interconnect error - and
                 * either way the above bits just give more detail on what
                 * bus/interconnect error happened. Note that bit 12 can be
                 * ignored, as it's the "filter" bit.
                 */
                return (m->status & 0xef80) == BIT(7) ||
                       (m->status & 0xef00) == BIT(8) ||
                       (m->status & 0xeffc) == 0xc;

        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

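/*
 * Without SER or a valid MISC register assume the whole page is affected.
 * Otherwise check whether the recoverable range spans at least a page.
 */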
static bool whole_page(struct mce *m)
{
        if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
                return true;

        return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
}

bool mce_is_correctable(struct mce *m)
{
        if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
                return false;

        if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
                return false;

        if (m->status & MCI_STATUS_UC)
                return false;

        return true;
}
EXPORT_SYMBOL_GPL(mce_is_correctable);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
static bool mce_notify_irq(void)
{
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

        if (test_and_clear_bit(0, &mce_need_notify)) {
                mce_work_trigger();

                if (__ratelimit(&ratelimit))
                        pr_info(HW_ERR "Machine check events logged\n");

                return true;
        }

        return false;
}

static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
                              void *data)
{
        struct mce_hw_err *err = to_mce_hw_err(data);

        if (!err)
                return NOTIFY_DONE;

        /* Emit the trace record: */
        trace_mce_record(err);

        set_bit(0, &mce_need_notify);

        mce_notify_irq();

        return NOTIFY_DONE;
}

static struct notifier_block early_nb = {
        .notifier_call  = mce_early_notifier,
        .priority       = MCE_PRIO_EARLY,
};

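/*
 * Try to offline the page for Action Optional and deferred errors which
 * carry a usable address.
 */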
static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
                              void *data)
{
        struct mce *mce = (struct mce *)data;
        unsigned long pfn;

        if (!mce || !mce_usable_address(mce))
                return NOTIFY_DONE;

        if (mce->severity != MCE_AO_SEVERITY &&
            mce->severity != MCE_DEFERRED_SEVERITY)
                return NOTIFY_DONE;

        pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
        if (!memory_failure(pfn, 0)) {
                set_mce_nospec(pfn);
                mce->kflags |= MCE_HANDLED_UC;
        }

        return NOTIFY_OK;
}

static struct notifier_block mce_uc_nb = {
        .notifier_call  = uc_decode_notifier,
        .priority       = MCE_PRIO_UC,
};

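/*
 * Runs last in the chain: print records which no earlier notifier handled
 * (or all of them with mca_cfg.print_all).
 */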
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
                                void *data)
{
        struct mce_hw_err *err = to_mce_hw_err(data);

        if (!err)
                return NOTIFY_DONE;

        if (mca_cfg.print_all || !(err->m.kflags))
                __print_mce(err);

        return NOTIFY_DONE;
}

static struct notifier_block mce_default_nb = {
        .notifier_call  = mce_default_notifier,
        /* lowest prio, we want it to run last. */
        .priority       = MCE_PRIO_LOWEST,
};

/*
 * Read ADDR and MISC registers.
 */
static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
{
        struct mce *m = &err->m;

        if (m->status & MCI_STATUS_MISCV)
                m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));

        if (m->status & MCI_STATUS_ADDRV) {
                if (m->kflags & MCE_CHECK_DFR_REGS)
                        m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
                else
                        m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));

                /*
                 * Mask the reported address by the reported granularity.
                 */
                if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
                        m->addr >>= shift;
                        m->addr <<= shift;
                }

                smca_extract_err_addr(m);
        }

        if (mce_flags.smca) {
                m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i));

                if (m->status & MCI_STATUS_SYNDV) {
                        m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i));
                        err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i));
                        err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i));
                }
        }
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * We have three scenarios for checking for Deferred errors:
 *
 * 1) Non-SMCA systems check MCA_STATUS and log error if found.
 * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
 *    clear MCA_DESTAT.
 * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
 *    log it.
 */
static bool smca_should_log_poll_error(struct mce *m)
{
        if (m->status & MCI_STATUS_VAL)
                return true;

        m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
        if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
                m->kflags |= MCE_CHECK_DFR_REGS;
                return true;
        }

        return false;
}

/*
 * Newer Intel systems that support software error
 * recovery need to make additional checks. Other
 * CPUs should skip over uncorrected errors, but log
 * everything else.
 */
static bool ser_should_log_poll_error(struct mce *m)
{
        /* Log "not enabled" (speculative) errors */
        if (!(m->status & MCI_STATUS_EN))
                return true;

        /*
         * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
         * UC == 1 && PCC == 0 && S == 0
         */
        if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
                return true;

        return false;
}

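/* Decide whether an error found while polling this bank should be logged. */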
static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
{
        struct mce *m = &err->m;

        if (mce_flags.smca)
                return smca_should_log_poll_error(m);

        /* If this entry is not valid, ignore it. */
        if (!(m->status & MCI_STATUS_VAL))
                return false;

        /*
         * If we are logging everything (at CPU online) or this
         * is a corrected error, then we must log it.
         */
        if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
                return true;

        if (mca_cfg.ser)
                return ser_should_log_poll_error(m);

        if (m->status & MCI_STATUS_UC)
                return false;

        return true;
}

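/* Clear the bank's status register; AMD needs vendor-specific handling. */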
static void clear_bank(struct mce *m)
{
        if (m->cpuvendor == X86_VENDOR_AMD)
                return amd_clear_bank(m);

        mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
        struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
        struct mce_hw_err err;
        struct mce *m;
        int i;

        this_cpu_inc(mce_poll_count);

        mce_gather_info(&err, NULL);
        m = &err.m;

        if (flags & MCP_TIMESTAMP)
                m->tsc = rdtsc();

        for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
                if (!mce_banks[i].ctl || !test_bit(i, *b))
                        continue;

                m->misc = 0;
                m->addr = 0;
                m->bank = i;

                barrier();
                m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));

                /*
                 * Update storm tracking here, before checking for the
                 * MCI_STATUS_VAL bit. Valid corrected errors count
                 * towards declaring, or maintaining, storm status. No
                 * error in a bank counts towards avoiding, or ending,
                 * storm status.
                 */
                if (!mca_cfg.cmci_disabled)
                        mce_track_storm(m);

                /* Verify that the error should be logged based on hardware conditions. */
                if (!should_log_poll_error(flags, &err))
                        continue;

                mce_read_aux(&err, i);
                m->severity = mce_severity(m, NULL, NULL, false);
                /*
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */

                if (mca_cfg.dont_log_ce && !mce_usable_address(m))
                        goto clear_it;

                if (flags & MCP_QUEUE_LOG)
                        mce_gen_pool_add(&err);
                else
                        mce_log(&err);

clear_it:
                clear_bank(m);
        }

        /*
         * Don't clear MCG_STATUS here because it's only defined for
         * exceptions.
         */

        sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static __always_inline void
quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
        if (bank != 0)
                return;
        if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
                return;
        if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
                          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
                          MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
                          MCACOD)) !=
                         (MCI_STATUS_UC|MCI_STATUS_EN|
                          MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
                          MCI_STATUS_AR|MCACOD_INSTR))
                return;

        m->mcgstatus |= MCG_STATUS_EIPV;
        m->ip = regs->ip;
        m->cs = regs->cs;
}

/*
 * Disable fast string copy and return from the MCE handler upon the first SRAR
 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
 * CPUs.
 * The fast string copy instructions ("REP; MOVS*") could consume an
 * uncorrectable memory error in the cache line _right after_ the desired region
 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
 * "REP; MOVS*".
 * This mitigation addresses the issue completely with the caveat of performance
 * degradation on the CPU affected. This is still better than the OS crashing on
 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
 * kernel context (e.g., copy_page).
 *
 * Returns true when fast string copy on CPU has been disabled.
 */
static noinstr bool quirk_skylake_repmov(void)
{
        u64 mcgstatus   = mce_rdmsrq(MSR_IA32_MCG_STATUS);
        u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
        u64 mc1_status;

        /*
         * Apply the quirk only to local machine checks, i.e., no broadcast
         * sync is needed.
         */
        if (!(mcgstatus & MCG_STATUS_LMCES) ||
            !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
                return false;

        mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1));

        /* Check for a software-recoverable data fetch error. */
        if ((mc1_status &
             (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
              MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
              MCI_STATUS_AR | MCI_STATUS_S)) ==
             (MCI_STATUS_VAL |                   MCI_STATUS_UC | MCI_STATUS_EN |
              MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
              MCI_STATUS_AR | MCI_STATUS_S)) {
                misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
                mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
                mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0);

                instrumentation_begin();
                pr_err_once("Erratum detected, disable fast string copy instructions.\n");
                instrumentation_end();

                return true;
        }

        return false;
}

/*
 * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
 * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
 *
 * However, the context is still valid, so save the "cs" register for later use.
 *
 * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
 *
 * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
 */
static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
        if (bank != 1)
                return;
        if (!(m->status & MCI_STATUS_POISON))
                return;

        m->cs = regs->cs;
}

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
                                          struct pt_regs *regs)
{
        struct mce *m = &err->m;
        char *tmp = *msg;
        int i;

        for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
                m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
                if (!(m->status & MCI_STATUS_VAL))
                        continue;

                arch___set_bit(i, validp);
                if (mce_flags.snb_ifu_quirk)
                        quirk_sandybridge_ifu(i, m, regs);

                if (mce_flags.zen_ifu_quirk)
                        quirk_zen_ifu(i, m, regs);

                m->bank = i;
                if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
                        mce_read_aux(err, i);
                        *msg = tmp;
                        return 1;
                }
        }
        return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until mce_executing is equal to its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Track which CPUs entered the MCA broadcast synchronization and which did
 * not, in order to print holdouts.
 */
static cpumask_t mce_missing_cpus = CPU_MASK_ALL;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static noinstr int mce_timed_out(u64 *t, const char *msg)
{
        int ret = 0;

        /* Enable instrumentation around calls to external facilities */
        instrumentation_begin();

        /*
         * The others already did panic for some reason.
         * Bail out like in a timeout.
         * rmb() to tell the compiler that system_state
         * might have been modified by someone else.
         */
        rmb();
        if (atomic_read(&mce_panicked))
                wait_for_panic();
        if (!mca_cfg.monarch_timeout)
                goto out;
        if ((s64)*t < SPINUNIT) {
                if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
                        pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
                                 cpumask_pr_args(&mce_missing_cpus));
                mce_panic(msg, NULL, NULL);

                ret = 1;
                goto out;
        }
        *t -= SPINUNIT;

out:
        touch_nmi_watchdog();

        instrumentation_end();

        return ret;
}

/*
 * The Monarch's reign.  The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
        struct mce_hw_err *err = NULL;
        struct mce *m = NULL;
        int global_worst = 0;
        char *msg = NULL;
        int cpu;

        /*
         * This CPU is the Monarch and the other CPUs have run
         * through their handlers.
         * Grade the severity of the errors of all the CPUs.
         */
        for_each_possible_cpu(cpu) {
                struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
                struct mce *mtmp = &etmp->m;

                if (mtmp->severity > global_worst) {
                        global_worst = mtmp->severity;
                        err = &per_cpu(hw_errs_seen, cpu);
                        m = &err->m;
                }
        }

        /*
         * Cannot recover? Panic here then.
         * This dumps all the mces in the log buffer and stops the
         * other CPUs.
         */
        if (m && global_worst >= MCE_PANIC_SEVERITY) {
                /* call mce_severity() to get "msg" for panic */
                mce_severity(m, NULL, &msg, true);
                mce_panic("Fatal machine check", err, msg);
        }

        /*
         * For a UC error somewhere we let the CPU which detects it handle it.
         * We must also let the others continue, otherwise the handling
         * CPU could deadlock on a lock.
         */

        /*
         * No machine check event found. Must be some external
         * source or one CPU is hung. Panic.
         */
        if (global_worst <= MCE_KEEP_SEVERITY)
                mce_panic("Fatal machine check from unknown source", NULL, NULL);

        /*
         * Now clear all the hw_errs_seen so that they don't reappear on
         * the next mce.
         */
        for_each_possible_cpu(cpu)
                memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it lets the CPUs run
 * their scanning loops one by one, in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static noinstr int mce_start(int *no_way_out)
{
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
        int order, ret = -1;

        if (!timeout)
                return ret;

        raw_atomic_add(*no_way_out, &global_nwo);
        /*
         * Rely on the implied barrier below, such that global_nwo
         * is updated before mce_callin.
         */
        order = raw_atomic_inc_return(&mce_callin);
        arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);

        /* Enable instrumentation around calls to external facilities */
        instrumentation_begin();

        /*
         * Wait for everyone.
         */
        while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
                if (mce_timed_out(&timeout,
                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
                        raw_atomic_set(&global_nwo, 0);
                        goto out;
                }
                ndelay(SPINUNIT);
        }

        /*
         * mce_callin should be read before global_nwo
         */
        smp_rmb();

        if (order == 1) {
                /*
                 * Monarch: Starts executing now, the others wait.
                 */
                raw_atomic_set(&mce_executing, 1);
        } else {
                /*
                 * Subject: Now start the scanning loop one by one in
                 * the original callin order.
                 * This way when there are any shared banks it will be
                 * only seen by one CPU before cleared, avoiding duplicates.
                 */
                while (raw_atomic_read(&mce_executing) < order) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
                                raw_atomic_set(&global_nwo, 0);
                                goto out;
                        }
                        ndelay(SPINUNIT);
                }
        }

        /*
         * Cache the global no_way_out state.
         */
        *no_way_out = raw_atomic_read(&global_nwo);

        ret = order;

out:
        instrumentation_end();

        return ret;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static noinstr int mce_end(int order)
{
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
        int ret = -1;

        /* Allow instrumentation around external facilities. */
        instrumentation_begin();

        if (!timeout)
                goto reset;
        if (order < 0)
                goto reset;

        /*
         * Allow others to run.
         */
        atomic_inc(&mce_executing);

        if (order == 1) {
                /*
                 * Monarch: Wait for everyone to go through their scanning
                 * loops.
                 */
                while (atomic_read(&mce_executing) <= num_online_cpus()) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU unable to finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                mce_reign();
                barrier();
                ret = 0;
        } else {
                /*
                 * Subject: Wait for Monarch to finish.
                 */
                while (atomic_read(&mce_executing) != 0) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU did not finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                /*
                 * Don't reset anything. That's done by the Monarch.
                 */
                ret = 0;
                goto out;
        }

        /*
         * Reset all global state.
         */
reset:
        atomic_set(&global_nwo, 0);
        atomic_set(&mce_callin, 0);
        cpumask_setall(&mce_missing_cpus);
        barrier();

        /*
         * Let others run again.
         */
        atomic_set(&mce_executing, 0);

out:
        instrumentation_end();

        return ret;
}

static __always_inline void mce_clear_state(unsigned long *toclear)
{
        int i;

        for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
                if (arch_test_bit(i, toclear))
                        mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
        }
}

/*
 * Cases where we avoid rendezvous handler timeout:
 * 1) If this CPU is offline.
 *
 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 *  skip those CPUs which remain looping in the 1st kernel - see
 *  crash_nmi_callback().
 *
 * Note: there still is a small window between kexec-ing and the new,
 * kdump kernel establishing a new #MC handler where a broadcasted MCE
 * might not get handled properly.
 */
static noinstr bool mce_check_crashing_cpu(void)
{
        unsigned int cpu = smp_processor_id();

        if (arch_cpu_is_offline(cpu) ||
            (crashing_cpu != -1 && crashing_cpu != cpu)) {
                u64 mcgstatus;

                mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS);

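                /* A Zhaoxin local machine check must still be handled here. */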
                if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
                        if (mcgstatus & MCG_STATUS_LMCES)
                                return false;
                }

                if (mcgstatus & MCG_STATUS_RIPV) {
                        native_wrmsrq(MSR_IA32_MCG_STATUS, 0);
                        return true;
                }
        }
        return false;
}

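/*
 * Scan the MCA banks for this exception, log the errors found and track the
 * worst severity. The returned count decides whether the kernel is tainted.
 */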
static __always_inline int
__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
                struct mce_hw_err *final, unsigned long *toclear,
                unsigned long *valid_banks, int no_way_out, int *worst)
{
        struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
        struct mca_config *cfg = &mca_cfg;
        int severity, i, taint = 0;
        struct mce *m = &err->m;

        for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
                arch___clear_bit(i, toclear);
                if (!arch_test_bit(i, valid_banks))
                        continue;

                if (!mce_banks[i].ctl)
                        continue;

                m->misc = 0;
                m->addr = 0;
                m->bank = i;

                m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
                if (!(m->status & MCI_STATUS_VAL))
                        continue;

                /*
                 * Corrected or non-signaled errors are handled by
                 * machine_check_poll(). Leave them alone, unless this panics.
                 */
                if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
                        !no_way_out)
                        continue;

                /* Set taint even when machine check was not enabled. */
                taint++;

                severity = mce_severity(m, regs, NULL, true);

                /*
                 * When the machine check is meant for the corrected/deferred
                 * handler, don't touch it here, unless we're panicking.
                 */
                if ((severity == MCE_KEEP_SEVERITY ||
                     severity == MCE_UCNA_SEVERITY) && !no_way_out)
                        continue;

                arch___set_bit(i, toclear);

                /* Machine check event was not enabled. Clear, but ignore. */
                if (severity == MCE_NO_SEVERITY)
                        continue;

                mce_read_aux(err, i);

                /* assuming valid severity level != 0 */
                m->severity = severity;

                /*
                 * Enable instrumentation around the mce_log() call which is
                 * done in #MC context, where instrumentation is disabled.
                 */
                instrumentation_begin();
                mce_log(err);
                instrumentation_end();

                if (severity > *worst) {
                        *final = *err;
                        *worst = severity;
                }
        }

        /* mce_clear_state will clear *final, save locally for use later */
        *err = *final;

        return taint;
}

static void kill_me_now(struct callback_head *ch)
{
        struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);

        p->mce_count = 0;
        force_sig(SIGBUS);
}

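/*
 * task_work callback for a machine check hit in user mode: try to recover
 * the page via memory_failure() and send SIGBUS only if that fails.
 */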
static void kill_me_maybe(struct callback_head *cb)
{
        struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
        int flags = MF_ACTION_REQUIRED;
        unsigned long pfn;
        int ret;

        p->mce_count = 0;
        pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);

        if (!p->mce_ripv)
                flags |= MF_MUST_KILL;

        pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
        ret = memory_failure(pfn, flags);
        if (!ret) {
                set_mce_nospec(pfn);
                sync_core();
                return;
        }

        /*
         * -EHWPOISON from memory_failure() means that it already sent SIGBUS
         * to the current process with the proper error info;
         * -EOPNOTSUPP means hwpoison_filter() filtered the error event.
         *
         * In both cases, no further processing is required.
         */
        if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
                return;

        pr_err("Memory error not recovered");
        kill_me_now(cb);
}

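/*
 * task_work callback for poison consumed by the kernel while accessing user
 * memory: offline the page but never kill the task.
 */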
static void kill_me_never(struct callback_head *cb)
{
        struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
        unsigned long pfn;

        p->mce_count = 0;
        pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
        pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
        if (!memory_failure(pfn, 0))
                set_mce_nospec(pfn);
}

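/*
 * Save the error details on the first machine check for this task and
 * schedule the given callback to run via task_work on return to user space.
 */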
static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
{
        int count = ++current->mce_count;
        struct mce *m = &err->m;

        /* First call, save all the details */
        if (count == 1) {
                current->mce_addr = m->addr;
                current->mce_kflags = m->kflags;
                current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
                current->mce_whole_page = whole_page(m);
                current->mce_kill_me.func = func;
        }

        /* Ten is likely overkill. Don't expect more than two faults before task_work() */
        if (count > 10)
                mce_panic("Too many consecutive machine checks while accessing user data",
                          err, msg);

        /* Second or later call, make sure page address matches the one from first call */
        if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
                mce_panic("Consecutive machine checks to different user pages", err, msg);

        /* Do not call task_work_add() more than once */
        if (count > 1)
                return;

        task_work_add(current, &current->mce_kill_me, TWA_RESUME);
}

/* Handle unconfigured int18 (should never happen) */
static noinstr void unexpected_machine_check(struct pt_regs *regs)
{
        instrumentation_begin();
        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
               smp_processor_id());
        instrumentation_end();
}

/*
 * The actual machine check handler. This only handles real exceptions when
 * something got corrupted coming in through int 18.
 *
 * This is executed in #MC context not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 *
 * Tracing and kprobes are disabled: if we interrupted a kernel context
 * with IF=1, we need to minimize stack usage.  There are also recursion
 * issues: if the machine check was due to a failure of the memory
 * backing the user stack, tracing that reads the user stack will cause
 * potentially infinite recursion.
 *
 * Currently, the #MC handler calls out to a number of external facilities
 * and, therefore, allows instrumentation around them. The optimal thing to
 * have would be to do the absolutely minimal work required in #MC context
 * and have instrumentation disabled only around that. Further processing can
 * then happen in process context where instrumentation is allowed. Achieving
 * that requires careful auditing and modifications. Until then, the code
 * allows instrumentation temporarily, where required.
 */
noinstr void do_machine_check(struct pt_regs *regs)
{
        int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
        DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
        DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
        struct mce_hw_err *final;
        struct mce_hw_err err;
        char *msg = NULL;
        struct mce *m;

        if (unlikely(mce_flags.p5))
                return pentium_machine_check(regs);
        else if (unlikely(mce_flags.winchip))
                return winchip_machine_check(regs);
        else if (unlikely(!mca_cfg.initialized))
                return unexpected_machine_check(regs);

        if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
                goto clear;

        /*
         * Establish sequential order between the CPUs entering the machine
         * check handler.
         */
        order = -1;

        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.
         */
        no_way_out = 0;

        /*
         * If kill_current_task is not set, there might be a way to recover from this
         * error.
         */
        kill_current_task = 0;

        /*
         * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
         * on Intel.
         */
        lmce = 1;

        this_cpu_inc(mce_exception_count);

        mce_gather_info(&err, regs);
        m = &err.m;
        m->tsc = rdtsc();

        final = this_cpu_ptr(&hw_errs_seen);
        *final = err;

        no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);

        barrier();

        /*
         * When there is no restart IP we might need to kill or panic.
         * Assume the worst for now, but if we find the
         * severity is MCE_AR_SEVERITY we have other options.
         */
        if (!(m->mcgstatus & MCG_STATUS_RIPV))
                kill_current_task = 1;
        /*
         * Check whether this MCE is signaled only to this logical processor,
         * on Intel and Zhaoxin only.
         */
        if (m->cpuvendor == X86_VENDOR_INTEL ||
            m->cpuvendor == X86_VENDOR_ZHAOXIN)
                lmce = m->mcgstatus & MCG_STATUS_LMCES;

        /*
         * A local machine check may already know that we have to panic.
         * A broadcast machine check begins its rendezvous in mce_start().
         * Go through all banks in mutual exclusion with the other CPUs. This
         * way we don't report duplicated events on shared banks because the
         * first one to see it will clear it.
         */
        if (lmce) {
                if (no_way_out)
                        mce_panic("Fatal local machine check", &err, msg);
        } else {
                order = mce_start(&no_way_out);
        }

        taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);

        if (!no_way_out)
                mce_clear_state(toclear);

        /*
         * Do most of the synchronization with other CPUs.
         * When there's any problem use only local no_way_out state.
         */
        if (!lmce) {
                if (mce_end(order) < 0) {
                        if (!no_way_out)
                                no_way_out = worst >= MCE_PANIC_SEVERITY;

                        if (no_way_out)
                                mce_panic("Fatal machine check on current CPU", &err, msg);
                }
        } else {
                /*
                 * If there was a fatal machine check we should have
                 * already called mce_panic earlier in this function.
                 * Since we re-read the banks, we might have found
                 * something new. Check again to see if we found a
                 * fatal error. We call "mce_severity()" again to
                 * make sure we have the right "msg".
                 */
                if (worst >= MCE_PANIC_SEVERITY) {
                        mce_severity(m, regs, &msg, true);
                        mce_panic("Local fatal machine check!", &err, msg);
                }
        }

        /*
         * Enable instrumentation around the external facilities like task_work_add()
         * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
         * properly would need a lot more involved reorganization.
         */
        instrumentation_begin();

        if (taint)
                add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

        if (worst != MCE_AR_SEVERITY && !kill_current_task)
                goto out;

        /* Fault was in user mode and we need to take some action */
        if ((m->cs & 3) == 3) {
                /* If this triggers there is no way to recover. Die hard. */
                BUG_ON(!on_thread_stack() || !user_mode(regs));

                if (!mce_usable_address(m))
                        queue_task_work(&err, msg, kill_me_now);
                else
                        queue_task_work(&err, msg, kill_me_maybe);

        } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
                /*
                 * Saved RIP on stack makes it look like the machine check
                 * was taken in the kernel on the instruction following
                 * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
                 * that the machine check was taken inside SEAM non-root
                 * mode.  CPU core has already marked that guest as dead.
                 * It is OK for the kernel to resume execution at the
                 * apparent point of the machine check as the fault did
                 * not occur there. Mark the page as poisoned so it won't
                 * be added to free list when the guest is terminated.
                 */
                if (mce_usable_address(m)) {
                        struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);

                        if (p)
                                SetPageHWPoison(p);
                }
        } else {
                /*
                 * Handle an MCE which has happened in kernel space but from
                 * which the kernel can recover: ex_has_fault_handler() has
                 * already verified that the rIP at which the error happened is
                 * a rIP from which the kernel can recover (by jumping to
                 * recovery code specified in _ASM_EXTABLE_FAULT()) and the
                 * corresponding exception handler which would do that is the
                 * proper one.
                 */
                if (m->kflags & MCE_IN_KERNEL_RECOV) {
                        if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
                                mce_panic("Failed kernel mode recovery", &err, msg);
                }

                if (m->kflags & MCE_IN_KERNEL_COPYIN)
                        queue_task_work(&err, msg, kill_me_never);
        }

out:
        /* Given it didn't panic, mark it as recoverable */
        hwerr_log_error_type(HWERR_RECOV_OTHERS);

        instrumentation_end();

clear:
        mce_wrmsrq(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
{
        /* mce_severity() should not hand us an ACTION_REQUIRED error */
        BUG_ON(flags & MF_ACTION_REQUIRED);
        pr_err("Uncorrected memory error in page 0x%lx ignored\n"
               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
               pfn);

        return 0;
}
#endif

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

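/*
 * (Re)arm the polling timer. An already pending timer is only ever
 * moved earlier, never pushed out.
 */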
static void __start_timer(struct timer_list *t, unsigned long interval)
{
        unsigned long when = jiffies + interval;
        unsigned long flags;

        local_irq_save(flags);

        if (!timer_pending(t) || time_before(when, t->expires))
                mod_timer(t, round_jiffies(when));

        local_irq_restore(flags);
}

static void mc_poll_banks_default(void)
{
        machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
}

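/*
 * Bank polling hook. Vendor code may replace the default poller,
 * e.g. to integrate CMCI storm handling.
 */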
void (*mc_poll_banks)(void) = mc_poll_banks_default;

static bool should_enable_timer(unsigned long iv)
{
        return !mca_cfg.ignore_ce && iv;
}

static void mce_timer_fn(struct timer_list *t)
{
        struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
        unsigned long iv;

        WARN_ON(cpu_t != t);

        iv = __this_cpu_read(mce_next_interval);

        if (mce_available(this_cpu_ptr(&cpu_info)))
                mc_poll_banks();

        /*
         * Alert userspace if needed. If we logged an MCE, reduce the polling
         * interval, otherwise increase the polling interval.
         */
        if (mce_notify_irq())
                iv = max(iv / 2, (unsigned long) HZ/100);
        else
                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));

        if (mce_get_storm_mode()) {
                __start_timer(t, HZ);
        } else if (should_enable_timer(iv)) {
                __this_cpu_write(mce_next_interval, iv);
                __start_timer(t, iv);
        }
}

/*
 * When a storm starts on any bank on this CPU, switch to polling
 * once per second. When the storm ends, revert to the default
 * polling interval.
 */
void mce_timer_kick(bool storm)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);

        mce_set_storm_mode(storm);

        if (storm)
                __start_timer(t, HZ);
        else
                __this_cpu_write(mce_next_interval, check_interval * HZ);
}

/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
static void mce_timer_delete_all(void)
{
        int cpu;

        for_each_online_cpu(cpu)
                timer_delete_sync(&per_cpu(mce_timer, cpu));
}

static void __mcheck_cpu_mce_banks_init(void)
{
        struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
        u8 n_banks = this_cpu_read(mce_num_banks);
        int i;

        for (i = 0; i < n_banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                /*
                 * Init them all by default.
                 *
                 * The required vendor quirks will be applied before
                 * __mcheck_cpu_init_prepare_banks() does the final bank setup.
                 */
                b->ctl = -1ULL;
                b->init = true;
        }
}

/*
 * Initialize Machine Checks for a CPU.
 */
static void __mcheck_cpu_cap_init(void)
{
        u64 cap;
        u8 b;

        rdmsrq(MSR_IA32_MCG_CAP, cap);

        b = cap & MCG_BANKCNT_MASK;

        if (b > MAX_NR_BANKS) {
                pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
                        smp_processor_id(), MAX_NR_BANKS, b);
                b = MAX_NR_BANKS;
        }

        this_cpu_write(mce_num_banks, b);

        __mcheck_cpu_mce_banks_init();
}

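/*
 * If the global MCG_CTL MSR is present, enable all error reporting
 * features advertised in MCG_CAP by setting every control bit.
 */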
static void __mcheck_cpu_init_generic(void)
{
        u64 cap;

        rdmsrq(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
}

static void __mcheck_cpu_init_prepare_banks(void)
{
        struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
        u64 msrval;
        int i;

        /*
         * Log the machine checks left over from the previous reset. Log them
         * only, do not start processing them. That will happen in mcheck_late_init()
         * when all consumers have been registered on the notifier chain.
         */
        if (mca_cfg.bootlog) {
                mce_banks_t all_banks;

                bitmap_fill(all_banks, MAX_NR_BANKS);
                machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
        }

        for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
                struct mce_bank *b = &mce_banks[i];

                if (!b->init)
                        continue;

                wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
                wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);

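                /*
                 * Read the control register back: if the value did not
                 * stick, the bank is presumably not writable on this
                 * system, so treat it as uninitialized from here on.
                 */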
                rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
                b->init = !!msrval;
        }
}

static void amd_apply_global_quirks(struct cpuinfo_x86 *c)
{
        if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
                /*
                 * Lots of broken BIOSes around that don't clear them
                 * by default and leave crap in there. Don't log:
                 */
                mca_cfg.bootlog = 0;
        }

        /*
         * overflow_recov is supported for F15h Models 00h-0fh
         * even though we don't have a CPUID bit for it.
         */
        if (c->x86 == 0x15 && c->x86_model <= 0xf)
                mce_flags.overflow_recov = 1;

        if (c->x86 >= 0x17 && c->x86 <= 0x1A)
                mce_flags.zen_ifu_quirk = 1;
}

static void intel_apply_global_quirks(struct cpuinfo_x86 *c)
{
        /* Older CPUs (prior to family 6) don't need quirks. */
        if (c->x86_vfm < INTEL_PENTIUM_PRO)
                return;

        /*
         * All newer Intel systems support MCE broadcasting. Enable
         * synchronization with a one second timeout.
         */
        if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
                mca_cfg.monarch_timeout = USEC_PER_SEC;

        /*
         * There are also broken BIOSes on some Pentium M and
         * earlier systems:
         */
        if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
                mca_cfg.bootlog = 0;

        if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
                mce_flags.snb_ifu_quirk = 1;

        /*
         * Skylake, Cascade Lake and Cooper Lake require a quirk on
         * rep movs.
         */
        if (c->x86_vfm == INTEL_SKYLAKE_X)
                mce_flags.skx_repmov_quirk = 1;
}

static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c)
{
        /*
         * All newer Zhaoxin CPUs support MCE broadcasting. Enable
         * synchronization with a one second timeout.
         */
        if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
                if (mca_cfg.monarch_timeout < 0)
                        mca_cfg.monarch_timeout = USEC_PER_SEC;
        }
}

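/*
 * Family 5 CPUs predate the MCA architecture; hand them off to the
 * old-style P5/WinChip machine check code.
 */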
static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
        if (c->x86 != 5)
                return false;

        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                intel_p5_mcheck_init(c);
                mce_flags.p5 = 1;
                return true;
        case X86_VENDOR_CENTAUR:
                winchip_mcheck_init(c);
                mce_flags.winchip = 1;
                return true;
        default:
                return false;
        }

        return false;
}

static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
{
        struct mca_config *cfg = &mca_cfg;

        /*
         * All newer Centaur CPUs support MCE broadcasting. Enable
         * synchronization with a one second timeout.
         */
        if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
             c->x86 > 6) {
                if (cfg->monarch_timeout < 0)
                        cfg->monarch_timeout = USEC_PER_SEC;
        }
}

static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
{
        struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);

        /*
         * These CPUs have MCA bank 8 which reports only one error type called
         * SVAD (System View Address Decoder). The reporting of that error is
         * controlled by IA32_MC8.CTL.0.
         *
         * If enabled, prefetching on these CPUs will cause an SVAD MCE when
         * virtual machines start and result in a system panic. Always disable
         * the bank 8 SVAD error by default.
         */
        if ((c->x86 == 7 && c->x86_model == 0x1b) ||
            (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
                if (this_cpu_read(mce_num_banks) > 8)
                        mce_banks[8].ctl = 0;
        }

        intel_init_cmci();
        intel_init_lmce();
}

static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
        intel_clear_lmce();
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;

        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
                mce_amd_feature_init(c);
                break;

        case X86_VENDOR_CENTAUR:
                mce_centaur_feature_init(c);
                break;

        case X86_VENDOR_ZHAOXIN:
                mce_zhaoxin_feature_init(c);
                break;

        default:
                break;
        }
}

static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_clear(c);
                break;

        case X86_VENDOR_ZHAOXIN:
                mce_zhaoxin_feature_clear(c);
                break;

        default:
                break;
        }
}

static void mce_start_timer(struct timer_list *t)
{
        unsigned long iv = check_interval * HZ;

        if (should_enable_timer(iv)) {
                this_cpu_write(mce_next_interval, iv);
                __start_timer(t, iv);
        }
}

static void __mcheck_cpu_setup_timer(void)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);

        timer_setup(t, mce_timer_fn, TIMER_PINNED);
}

static void __mcheck_cpu_init_timer(void)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);

        timer_setup(t, mce_timer_fn, TIMER_PINNED);
        mce_start_timer(t);
}

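/*
 * Give vendor code a chance to filter out error records that are known
 * to be spurious before they get logged.
 */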
bool filter_mce(struct mce *m)
{
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                return amd_filter_mce(m);
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                return intel_filter_mce(m);

        return false;
}

static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
        irqentry_state_t irq_state;

        WARN_ON_ONCE(user_mode(regs));

        /*
         * Only required when from kernel mode. See
         * mce_check_crashing_cpu() for details.
         */
        if (mca_cfg.initialized && mce_check_crashing_cpu())
                return;

        irq_state = irqentry_nmi_enter(regs);

        do_machine_check(regs);

        irqentry_nmi_exit(regs, irq_state);
}

static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
        irqentry_enter_from_user_mode(regs);

        do_machine_check(regs);

        irqentry_exit_to_user_mode(regs);
}

#ifdef CONFIG_X86_64
/* MCE hit kernel mode */
DEFINE_IDTENTRY_MCE(exc_machine_check)
{
        unsigned long dr7;

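        /*
         * #MC can hit with hardware breakpoints armed. Disable them via
         * DR7 for the duration of the handler to avoid #DB recursion at
         * a point where it cannot be handled safely.
         */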
        dr7 = local_db_save();
        exc_machine_check_kernel(regs);
        local_db_restore(dr7);
}

/* The user mode variant. */
DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
{
        unsigned long dr7;

        dr7 = local_db_save();
        exc_machine_check_user(regs);
        local_db_restore(dr7);
}

#ifdef CONFIG_X86_FRED
/*
 * Depending on the ring level at which it occurred, i.e., user or
 * kernel context, #MCE needs to be handled on a different stack: a
 * user #MCE on the current task stack, a kernel #MCE on a dedicated
 * stack.
 *
 * This is exactly how FRED event delivery invokes an exception
 * handler: a ring 3 event on the level 0 stack, i.e., the current
 * task stack; a ring 0 event on the #MCE dedicated stack specified in
 * the IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check
 * entry stub doesn't do a stack switch.
 */
DEFINE_FREDENTRY_MCE(exc_machine_check)
{
        unsigned long dr7;

        dr7 = local_db_save();
        if (user_mode(regs))
                exc_machine_check_user(regs);
        else
                exc_machine_check_kernel(regs);
        local_db_restore(dr7);
}
#endif
#else
/* 32bit unified entry point */
DEFINE_IDTENTRY_RAW(exc_machine_check)
{
        unsigned long dr7;

        dr7 = local_db_save();
        if (user_mode(regs))
                exc_machine_check_user(regs);
        else
                exc_machine_check_kernel(regs);
        local_db_restore(dr7);
}
#endif

void mca_bsp_init(struct cpuinfo_x86 *c)
{
        u64 cap;

        if (!mce_available(c))
                return;

        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
                mca_cfg.disabled = 1;
                pr_info("unknown CPU type - not enabling MCE support\n");
                return;
        }

        mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV);
        mce_flags.succor         = cpu_feature_enabled(X86_FEATURE_SUCCOR);
        mce_flags.smca           = cpu_feature_enabled(X86_FEATURE_SMCA);

        if (mce_flags.smca)
                smca_bsp_init();

        rdmsrq(MSR_IA32_MCG_CAP, cap);

        /* Use accurate RIP reporting if available. */
        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
                mca_cfg.rip_msr = MSR_IA32_MCG_EIP;

        if (cap & MCG_SER_P)
                mca_cfg.ser = 1;

        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                amd_apply_global_quirks(c);
                break;
        case X86_VENDOR_INTEL:
                intel_apply_global_quirks(c);
                break;
        case X86_VENDOR_ZHAOXIN:
                zhaoxin_apply_global_quirks(c);
                break;
        }

        if (mca_cfg.monarch_timeout < 0)
                mca_cfg.monarch_timeout = 0;
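        /*
         * Unless boot logging was explicitly disabled, set a default
         * reboot timeout that the MCE panic path can fall back to when
         * no global panic_timeout has been configured.
         */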
        if (mca_cfg.bootlog != 0)
                mca_cfg.panic_timeout = 30;
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
        if (mca_cfg.disabled)
                return;

        if (__mcheck_cpu_ancient_init(c))
                return;

        if (!mce_available(c))
                return;

        __mcheck_cpu_cap_init();

        if (!mce_gen_pool_init()) {
                mca_cfg.disabled = 1;
                pr_emerg("Couldn't allocate MCE records pool!\n");
                return;
        }

        mca_cfg.initialized = 1;

        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_vendor(c);
        __mcheck_cpu_init_prepare_banks();
        __mcheck_cpu_setup_timer();
        cr4_set_bits(X86_CR4_MCE);
}

/*
 * Called for each booted CPU to clear some machine check opt-ins.
 */
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{
        if (mca_cfg.disabled)
                return;

        if (!mce_available(c))
                return;

        /*
         * A possible place to clear settings generic to x86:
         * __mcheck_cpu_clear_generic(c);
         */
        __mcheck_cpu_clear_vendor(c);
}

static void __mce_disable_bank(void *arg)
{
        int bank = *((int *)arg);
        __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
        cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
        if (bank >= this_cpu_read(mce_num_banks)) {
                pr_warn(FW_BUG
                        "Ignoring request to disable invalid MCA bank %d.\n",
                        bank);
                return;
        }
        set_bit(bank, mce_banks_ce_disabled);
        on_each_cpu(__mce_disable_bank, &bank, 1);
}

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=no_lmce Disables LMCE
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=print_all Print all machine check logs to console
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *      monarchtimeout is how long to wait for other CPUs on machine
 *      check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
 *     and older.
 * mce=nobootlog Don't log MCEs from before booting.
 * mce=bios_cmci_threshold Don't program the CMCI threshold
 * mce=recovery force enable copy_mc_fragile()
 */
static int __init mcheck_enable(char *str)
{
        struct mca_config *cfg = &mca_cfg;

        if (*str == 0) {
                enable_p5_mce();
                return 1;
        }
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                cfg->disabled = 1;
        else if (!strcmp(str, "no_cmci"))
                cfg->cmci_disabled = true;
        else if (!strcmp(str, "no_lmce"))
                cfg->lmce_disabled = 1;
        else if (!strcmp(str, "dont_log_ce"))
                cfg->dont_log_ce = true;
        else if (!strcmp(str, "print_all"))
                cfg->print_all = true;
        else if (!strcmp(str, "ignore_ce"))
                cfg->ignore_ce = true;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                cfg->bootlog = (str[0] == 'b');
        else if (!strcmp(str, "bios_cmci_threshold"))
                cfg->bios_cmci_threshold = 1;
        else if (!strcmp(str, "recovery"))
                cfg->recovery = 1;
        else if (isdigit(str[0]))
                get_option(&str, &(cfg->monarch_timeout));
        else {
                pr_info("mce argument %s ignored. Please use /sys\n", str);
                return 0;
        }
        return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
        mce_register_decode_chain(&early_nb);
        mce_register_decode_chain(&mce_uc_nb);
        mce_register_decode_chain(&mce_default_nb);

        INIT_WORK(&mce_work, mce_gen_pool_process);
        init_irq_work(&mce_irq_work, mce_irq_work_cb);

        return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static void mce_disable_error_reporting(void)
{
        struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
        int i;

        for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrq(mca_msr_reg(i, MCA_CTL), 0);
        }
        return;
}

static void vendor_disable_error_reporting(void)
{
        /*
         * Don't clear on Intel, AMD, Hygon or Zhaoxin CPUs. Some of these
         * MSRs are socket-wide. Disabling them for just a single offlined CPU
         * is bad, since it will inhibit reporting for all shared resources on
         * the socket like the last level cache (LLC), the integrated memory
         * controller (iMC), etc.
         */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
            boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
            boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
            boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
                return;

        mce_disable_error_reporting();
}

static int mce_syscore_suspend(void *data)
{
        vendor_disable_error_reporting();
        return 0;
}

static void mce_syscore_shutdown(void *data)
{
        vendor_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void *data)
{
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
        __mcheck_cpu_init_prepare_banks();
        cr4_set_bits(X86_CR4_MCE);
}

static const struct syscore_ops mce_syscore_ops = {
        .suspend        = mce_syscore_suspend,
        .shutdown       = mce_syscore_shutdown,
        .resume         = mce_syscore_resume,
};

static struct syscore mce_syscore = {
        .ops = &mce_syscore_ops,
};

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_prepare_banks();
        __mcheck_cpu_init_timer();
        cr4_set_bits(X86_CR4_MCE);
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        mce_timer_delete_all();
        on_each_cpu(mce_cpu_restart, NULL, 1);
        mce_schedule_work();
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;
        cmci_clear();
}

static void mce_enable_ce(void *all)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;
        cmci_reenable();
        cmci_recheck();
        if (all)
                __mcheck_cpu_init_timer();
}

static const struct bus_type mce_subsys = {
        .name           = "machinecheck",
        .dev_name       = "machinecheck",
};

DEFINE_PER_CPU(struct device *, mce_device);

static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
{
        return container_of(attr, struct mce_bank_dev, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
                         char *buf)
{
        u8 bank = attr_to_bank(attr)->bank;
        struct mce_bank *b;

        if (bank >= per_cpu(mce_num_banks, s->id))
                return -EINVAL;

        b = &per_cpu(mce_banks_array, s->id)[bank];

        if (!b->init)
                return -ENODEV;

        return sprintf(buf, "%llx\n", b->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
                        const char *buf, size_t size)
{
        u8 bank = attr_to_bank(attr)->bank;
        struct mce_bank *b;
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        if (bank >= per_cpu(mce_num_banks, s->id))
                return -EINVAL;

        b = &per_cpu(mce_banks_array, s->id)[bank];
        if (!b->init)
                return -ENODEV;

        b->ctl = new;

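        /* Propagate the new control value by reinitializing the banks on all CPUs. */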
        mutex_lock(&mce_sysfs_mutex);
        mce_restart();
        mutex_unlock(&mce_sysfs_mutex);

        return size;
}

static ssize_t set_ignore_ce(struct device *s,
                             struct device_attribute *attr,
                             const char *buf, size_t size)
{
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        mutex_lock(&mce_sysfs_mutex);
        if (mca_cfg.ignore_ce ^ !!new) {
                if (new) {
                        /* disable ce features */
                        mce_timer_delete_all();
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mca_cfg.ignore_ce = true;
                } else {
                        /* enable ce features */
                        mca_cfg.ignore_ce = false;
                        on_each_cpu(mce_enable_ce, (void *)1, 1);
                }
        }
        mutex_unlock(&mce_sysfs_mutex);

        return size;
}

static ssize_t set_cmci_disabled(struct device *s,
                                 struct device_attribute *attr,
                                 const char *buf, size_t size)
{
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        mutex_lock(&mce_sysfs_mutex);
        if (mca_cfg.cmci_disabled ^ !!new) {
                if (new) {
                        /* disable cmci */
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mca_cfg.cmci_disabled = true;
                } else {
                        /* enable cmci */
                        mca_cfg.cmci_disabled = false;
                        on_each_cpu(mce_enable_ce, NULL, 1);
                }
        }
        mutex_unlock(&mce_sysfs_mutex);

        return size;
}

static ssize_t store_int_with_restart(struct device *s,
                                      struct device_attribute *attr,
                                      const char *buf, size_t size)
{
        unsigned long old_check_interval = check_interval;
        ssize_t ret = device_store_ulong(s, attr, buf, size);

        if (check_interval == old_check_interval)
                return ret;

        mutex_lock(&mce_sysfs_mutex);
        mce_restart();
        mutex_unlock(&mce_sysfs_mutex);

        return ret;
}

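/*
 * sysfs attributes created on each per-CPU machinecheck device; they
 * all operate on global configuration state.
 */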
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);

static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
        &mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
        &mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
        &dev_attr_trigger,
#endif
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_print_all.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
        kfree(dev);
}

/* Per CPU device init. All of the CPUs still share the same bank device: */
static int mce_device_create(unsigned int cpu)
{
        struct device *dev;
        int err;
        int i, j;

        dev = per_cpu(mce_device, cpu);
        if (dev)
                return 0;

        dev = kzalloc_obj(*dev);
        if (!dev)
                return -ENOMEM;
        dev->id  = cpu;
        dev->bus = &mce_subsys;
        dev->release = &mce_device_release;

        err = device_register(dev);
        if (err) {
                put_device(dev);
                return err;
        }

        for (i = 0; mce_device_attrs[i]; i++) {
                err = device_create_file(dev, mce_device_attrs[i]);
                if (err)
                        goto error;
        }
        for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
                err = device_create_file(dev, &mce_bank_devs[j].attr);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = dev;

        return 0;
error2:
        while (--j >= 0)
                device_remove_file(dev, &mce_bank_devs[j].attr);
error:
        while (--i >= 0)
                device_remove_file(dev, mce_device_attrs[i]);

        device_unregister(dev);

        return err;
}

static void mce_device_remove(unsigned int cpu)
{
        struct device *dev = per_cpu(mce_device, cpu);
        int i;

        if (!cpumask_test_cpu(cpu, mce_device_initialized))
                return;

        for (i = 0; mce_device_attrs[i]; i++)
                device_remove_file(dev, mce_device_attrs[i]);

        for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
                device_remove_file(dev, &mce_bank_devs[i].attr);

        device_unregister(dev);
        cpumask_clear_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;

        if (!cpuhp_tasks_frozen)
                cmci_clear();

        vendor_disable_error_reporting();
}

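/*
 * Re-enable error reporting when a CPU comes (back) online: restore
 * CMCI (unless tasks are frozen for suspend/resume) and the per-bank
 * control registers.
 */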
static void mce_reenable_cpu(void)
{
        struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
        int i;

        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;

        if (!cpuhp_tasks_frozen)
                cmci_reenable();
        for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
        }
}

static int mce_cpu_dead(unsigned int cpu)
{
        /* intentionally ignoring frozen here */
        if (!cpuhp_tasks_frozen)
                cmci_rediscover();
        return 0;
}

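/*
 * CPU hotplug online callback: create the sysfs devices, restore the
 * bank controls and start the polling timer for the incoming CPU.
 */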
static int mce_cpu_online(unsigned int cpu)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);

        mce_device_create(cpu);
        mce_threshold_create_device(cpu);
        mce_reenable_cpu();
        mce_start_timer(t);
        return 0;
}

static int mce_cpu_pre_down(unsigned int cpu)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);

        mce_disable_cpu();
        timer_delete_sync(t);
        mce_threshold_remove_device(cpu);
        mce_device_remove(cpu);
        return 0;
}

static __init void mce_init_banks(void)
{
        int i;

        for (i = 0; i < MAX_NR_BANKS; i++) {
                struct mce_bank_dev *b = &mce_bank_devs[i];
                struct device_attribute *a = &b->attr;

                b->bank = i;

                sysfs_attr_init(&a->attr);
                a->attr.name    = b->attrname;
                snprintf(b->attrname, ATTR_LEN, "bank%d", i);

                a->attr.mode    = 0644;
                a->show         = show_bank;
                a->store        = set_bank;
        }
}

/*
 * When running on XEN, this initcall is ordered against the XEN mcelog
 * initcall:
 *
 *   device_initcall(xen_late_init_mcelog);
 *   device_initcall_sync(mcheck_init_device);
 */
static __init int mcheck_init_device(void)
{
        int err;

        /*
         * Check if we have a spare virtual bit. This will only become
         * a problem if/when we move beyond 5-level page tables.
         */
        MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);

        if (!mce_available(&boot_cpu_data)) {
                err = -EIO;
                goto err_out;
        }

        if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
                err = -ENOMEM;
                goto err_out;
        }

        mce_init_banks();

        err = subsys_system_register(&mce_subsys, NULL);
        if (err)
                goto err_out_mem;

        err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
                                mce_cpu_dead);
        if (err)
                goto err_out_mem;

        /*
         * Invokes mce_cpu_online() on all CPUs which are online when
         * the state is installed.
         */
        err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
                                mce_cpu_online, mce_cpu_pre_down);
        if (err < 0)
                goto err_out_online;

        register_syscore(&mce_syscore);

        return 0;

err_out_online:
        cpuhp_remove_state(CPUHP_X86_MCE_DEAD);

err_out_mem:
        free_cpumask_var(mce_device_initialized);

err_out:
        pr_err("Unable to init MCE device (rc: %d)\n", err);

        return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = 1;
        return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
        static struct dentry *dmce;

        if (!dmce)
                dmce = debugfs_create_dir("mce", NULL);

        return dmce;
}

static void mce_reset(void)
{
        atomic_set(&mce_fake_panicked, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
        cpumask_setall(&mce_missing_cpus);
}

static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}

static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
                         "%llu\n");

static void __init mcheck_debugfs_init(void)
{
        struct dentry *dmce;

        dmce = mce_get_debugfs_dir();
        debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
                                   &fake_panic_fops);
}
#else
static void __init mcheck_debugfs_init(void) { }
#endif

static int __init mcheck_late_init(void)
{
        if (mca_cfg.recovery)
                enable_copy_mc_fragile();

        mcheck_debugfs_init();

        /*
         * Flush out everything that has been logged during early boot, now that
         * everything has been initialized (workqueues, decoders, ...).
         */
        mce_schedule_work();

        return 0;
}
late_initcall(mcheck_late_init);