root/arch/x86/kernel/cpu/topology.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU/APIC topology
 *
 * The APIC IDs describe the system topology in multiple domain levels.
 * The CPUID topology parser provides the information which part of the
 * APIC ID is associated to the individual levels:
 *
 * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
 *
 * The root space contains the package (socket) IDs.
 *
 * Not enumerated levels consume 0 bits space, but conceptually they are
 * always represented. If e.g. only CORE and THREAD levels are enumerated
 * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
 *
 * If SMT is not supported, then the THREAD domain is still used. It then
 * has the same physical ID as the CORE domain and is the only child of
 * the core domain.
 *
 * This allows a unified view on the system independent of the enumerated
 * domain levels without requiring any conditionals in the code.
 */
#define pr_fmt(fmt) "CPU topo: " fmt
#include <linux/cpu.h>

#include <xen/xen.h>

#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/smp.h>
#include <asm/numa.h>

#include "cpu.h"

/*
 * Per-CPU lookup tables, indexed by logical CPU number:
 *
 *  - x86_cpu_to_apicid: physical APIC ID of the CPU, BAD_APICID by default
 *  - x86_cpu_to_acpiid: ACPI ID of the CPU, CPU_ACPIID_INVALID by default
 *
 * Both are written by topo_set_cpuids().
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);

/*
 * Bitmap of physically present CPUs, indexed by APIC ID. A bit is set when
 * a present APIC is registered or hotplugged and cleared again on hot-unplug.
 */
DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;

/*
 * CPU number to APIC ID map. Used for CPU number allocation and parallel
 * CPU bringup. Slots without an assigned APIC ID hold BAD_APICID.
 */
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };

/*
 * Bitmaps to mark registered APICs at each topology domain. The bit index
 * is the APIC ID with the bits below the respective domain level masked
 * out, see topo_apicid().
 */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;

/*
 * Keep track of assigned, disabled and rejected CPUs, plus the APIC IDs of
 * the boot CPU and of the real BSP.
 *
 * nr_assigned_cpus is preset to 1 as CPU #0 is reserved for the boot CPU.
 * Both APIC IDs are BAD_APICID until they have been established.
 */
static struct {
        unsigned int            nr_assigned_cpus;
        unsigned int            nr_disabled_cpus;
        unsigned int            nr_rejected_cpus;
        u32                     boot_cpu_apic_id;
        u32                     real_bsp_apic_id;
} topo_info __ro_after_init = {
        .nr_assigned_cpus       = 1,
        .boot_cpu_apic_id       = BAD_APICID,
        .real_bsp_apic_id       = BAD_APICID,
};

/* Number of bits set, i.e. registered IDs, in the bitmap of topology domain @_dom */
#define domain_weight(_dom)     bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)

/*
 * Check whether @phys_id is the APIC ID which cpuid_to_apicid[] holds for
 * CPU number @cpu.
 */
bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
        u64 apic_id = cpuid_to_apicid[cpu];

        return apic_id == phys_id;
}

/*
 * Add @cpu to the primary thread mask when @apicid has none of the bits
 * selected by (__max_threads_per_core - 1) set.
 */
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
        unsigned int thread_bits = apicid & (__max_threads_per_core - 1);

        if (thread_bits)
                return;

        cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}

/*
 * Reduce @apicid to its ID at domain level @dom by clearing all bits which
 * belong to the levels below @dom. At the SMT level there is nothing below,
 * so the APIC ID is handed back unmodified.
 */
static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
{
        u32 mask;

        if (dom == TOPO_SMT_DOMAIN)
                return apicid;

        /* The shift of the next lower level is the first bit of @dom */
        mask = UINT_MAX << x86_topo_system.dom_shifts[dom - 1];
        return apicid & mask;
}

/*
 * Search the already assigned CPU numbers for @apic_id.
 *
 * Returns the CPU number on success and -ENODEV if @apic_id has no CPU
 * number assigned.
 */
static int topo_lookup_cpuid(u32 apic_id)
{
        int cpu;

        /* An established CPU# to APICID mapping is persistent */
        for (cpu = 0; cpu < topo_info.nr_assigned_cpus; cpu++) {
                if (cpuid_to_apicid[cpu] != apic_id)
                        continue;
                return cpu;
        }

        return -ENODEV;
}

/*
 * Get the CPU number for @apic_id. An existing assignment is reused,
 * otherwise the next free CPU number is allocated.
 */
static __init int topo_get_cpunr(u32 apic_id)
{
        int cpu = topo_lookup_cpuid(apic_id);

        /* No mapping yet: hand out the next number and advance the counter */
        if (cpu < 0)
                cpu = topo_info.nr_assigned_cpus++;

        return cpu;
}

/*
 * Store @apic_id and @acpi_id in the per CPU lookup tables of @cpu, if
 * those are built in, and mark @cpu present.
 */
static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
        early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
        early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
#endif
        set_cpu_present(cpu, true);
}

/*
 * Evaluate the firmware enumerated @apic_id to find out whether the boot
 * CPU is the real BSP.
 *
 * Once topo_info.real_bsp_apic_id has been set, further calls return false
 * right away.
 *
 * Returns true if @apic_id belongs to the real BSP of a crash kernel and
 * must therefore not be registered, false otherwise.
 */
static __init bool check_for_real_bsp(u32 apic_id)
{
        bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
        u64 msr;

        /*
         * There is no real good way to detect whether this is a kdump()
         * kernel, but except on the Voyager SMP monstrosity which is no
         * longer supported, the real BSP APIC ID is the first one which is
         * enumerated by firmware. That allows to detect whether the boot
         * CPU is the real BSP. If it is not, then do not register the APIC
         * because sending INIT to the real BSP would reset the whole
         * system.
         *
         * The first APIC ID which is enumerated by firmware is detectable
         * because the boot CPU APIC ID is registered before that without
         * invoking this code.
         */
        if (topo_info.real_bsp_apic_id != BAD_APICID)
                return false;

        /*
         * Check whether the enumeration order is broken by evaluating the
         * BSP bit in the APICBASE MSR. If the CPU does not have the
         * APICBASE MSR then the BSP detection is not possible and the
         * kernel must rely on the firmware enumeration order.
         */
        if (has_apic_base) {
                rdmsrq(MSR_IA32_APICBASE, msr);
                is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
        }

        if (apic_id == topo_info.boot_cpu_apic_id) {
                /*
                 * If the boot CPU has the APIC BSP bit set then the
                 * firmware enumeration is agreeing. If the CPU does not
                 * have the APICBASE MSR then the only choice is to trust
                 * the enumeration order.
                 */
                if (is_bsp || !has_apic_base) {
                        topo_info.real_bsp_apic_id = apic_id;
                        return false;
                }
                /*
                 * If the boot APIC is enumerated first, but the APICBASE
                 * MSR does not have the BSP bit set, then there is no way
                 * to discover the real BSP here. Assume a crash kernel and
                 * limit the number of CPUs to 1 as an INIT to the real BSP
                 * would reset the machine.
                 */
                pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
                pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
                set_nr_cpu_ids(1);
                goto fwbug;
        }

        /* The first enumerated APIC ID is not the one of the boot CPU */
        pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
                topo_info.boot_cpu_apic_id, apic_id);

        if (is_bsp) {
                /*
                 * The boot CPU has the APIC BSP bit set. Use it and complain
                 * about the broken firmware enumeration.
                 */
                topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
                goto fwbug;
        }

        pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");

        /* Remember the real BSP and tell the caller to reject this APIC ID */
        topo_info.real_bsp_apic_id = apic_id;
        return true;

fwbug:
        pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
        return false;
}

/*
 * Count the bits set in @map within the ID range which starts at @lvlid
 * and spans 1 << x86_topo_system.dom_shifts[@at_level] IDs.
 *
 * NOTE(review): topo_apicid() masks with dom_shifts[dom - 1] while the span
 * here is derived from dom_shifts[at_level]. Confirm that both describe the
 * same range for the levels which are passed in.
 */
static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
                                    unsigned long *map)
{
        /* Exclusive end of the ID range */
        unsigned int end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
        unsigned int id = find_next_bit(map, end, lvlid);
        unsigned int cnt = 0;

        /* Open coded as there is no bitmap_weight_range() */
        while (id < end) {
                cnt++;
                id = find_next_bit(map, end, id + 1);
        }
        return cnt;
}

/*
 * Register @apic_id in the topology domain maps.
 *
 * A present APIC is additionally marked in phys_cpu_present_map and gets a
 * CPU number assigned. A non-present APIC is only accounted as disabled.
 */
static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
        int dom;

        if (!present) {
                topo_info.nr_disabled_cpus++;
        } else {
                int cpu;

                set_bit(apic_id, phys_cpu_present_map);

                /*
                 * The boot CPU APIC can show up twice: it is registered
                 * before the APICs are enumerated via firmware parsers or
                 * VM guest mechanisms. It always maps to CPU #0.
                 */
                cpu = apic_id == topo_info.boot_cpu_apic_id ? 0 : topo_get_cpunr(apic_id);

                cpuid_to_apicid[cpu] = apic_id;
                topo_set_cpuids(cpu, apic_id, acpi_id);
        }

        /*
         * Present and possible CPUs are both recorded in the domain maps.
         * cpu_possible_map is updated in topology_init_possible_cpus()
         * once the enumeration is complete.
         */
        for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
                u32 domid = topo_apicid(apic_id, dom);

                set_bit(domid, apic_maps[dom].map);
        }
}

/**
 * topology_register_apic - Register an APIC in early topology maps
 * @apic_id:    The APIC ID to set up
 * @acpi_id:    The ACPI ID associated to the APIC
 * @present:    True if the corresponding CPU is present
 *
 * The APIC is rejected and accounted in the rejected CPU counter when
 * @apic_id exceeds the kernel limit, when it is the real BSP of a crash
 * kernel or when the CPU numbers are exhausted.
 */
void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
        if (apic_id >= MAX_LOCAL_APIC) {
                pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
                goto reject;
        }

        if (check_for_real_bsp(apic_id))
                goto reject;

        /* The boot CPU owns CPU #0, everything else needs a free CPU number */
        if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
                pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
                goto reject;
        }

        topo_register_apic(apic_id, acpi_id, present);
        return;

reject:
        topo_info.nr_rejected_cpus++;
}

/**
 * topology_register_boot_apic - Register the boot CPU APIC
 * @apic_id:    The APIC ID to set up
 *
 * Separate so CPU #0 can be assigned. The APIC is registered as present
 * and without a valid ACPI ID.
 */
void __init topology_register_boot_apic(u32 apic_id)
{
        /* The boot CPU APIC ID must only be established once */
        WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);

        /* Set before registering so topo_register_apic() maps it to CPU #0 */
        topo_info.boot_cpu_apic_id = apic_id;
        topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
}

/**
 * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
 * @apicid:             The APIC ID for which to lookup the logical ID
 * @at_level:           The topology domain level to use
 *
 * @apicid must be a full APIC ID, not the normalized variant. The bits
 * below the domain level specified by @at_level may all be clear, so both
 * real APIC IDs and backshifted normalized APIC IDs work correctly.
 *
 * The logical ID is the number of registered IDs at @at_level which are
 * smaller than the level ID of @apicid.
 *
 * Returns:
 *  - >= 0:     The requested logical ID
 *  - -ERANGE:  @apicid is out of range
 *  - -ENODEV:  @apicid is not registered
 */
int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
{
        /* Level ID of @apicid, i.e. the bits below @at_level are removed */
        unsigned int lvlid = topo_apicid(apicid, at_level);
        const unsigned long *map;

        if (lvlid >= MAX_LOCAL_APIC)
                return -ERANGE;

        map = apic_maps[at_level].map;
        if (!test_bit(lvlid, map))
                return -ENODEV;

        /* The weight of the bits before @lvlid is the logical ID */
        return bitmap_weight(map, lvlid);
}
EXPORT_SYMBOL_GPL(topology_get_logical_id);

/**
 * topology_unit_count - Retrieve the count of specified units at a given topology domain level
 * @apicid:             The APIC ID which specifies the search range
 * @which_units:        The domain level specifying the units to count
 * @at_level:           The domain level at which @which_units have to be counted
 *
 * This returns the number of possible units according to the enumerated
 * information.
 *
 * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
 * counts the number of possible cores in the package to which @apicid
 * belongs.
 *
 * @at_level must obviously be greater than @which_units to produce useful
 * results. If @at_level is equal to @which_units the result is
 * unsurprisingly 1. If @at_level is less than @which_units the result
 * is by definition undefined and the function returns 0.
 *
 * The function also returns 0 if the level ID of @apicid is out of range
 * or not registered at @at_level.
 */
unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
                                 enum x86_topology_domains at_level)
{
        /* Level ID of @apicid, i.e. the bits below @at_level are removed */
        unsigned int lvlid = topo_apicid(apicid, at_level);

        /* Out of range, not registered or an undefined request */
        if (lvlid >= MAX_LOCAL_APIC || !test_bit(lvlid, apic_maps[at_level].map) ||
            which_units > at_level)
                return 0;

        if (which_units == at_level)
                return 1;

        return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
}

#ifdef CONFIG_SMP
/*
 * Return the CPU number of the primary thread of the core to which @cpu
 * belongs, or the error code of topo_lookup_cpuid() if there is none.
 */
int topology_get_primary_thread(unsigned int cpu)
{
        /* The core domain level APIC ID of @cpu is the primary thread */
        u32 core_id = topo_apicid(cpuid_to_apicid[cpu], TOPO_CORE_DOMAIN);

        /* Hand back the CPU number which is assigned to it */
        return topo_lookup_cpuid(core_id);
}
#endif

#ifdef CONFIG_ACPI_HOTPLUG_CPU
/**
 * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
 * @apic_id:    The APIC ID to set up
 * @acpi_id:    The ACPI ID associated to the APIC
 *
 * Returns:
 *  - >= 0:     The CPU number which is assigned to @apic_id
 *  - -EINVAL:  @apic_id exceeds the kernel limit
 *  - -ENODEV:  @apic_id was not registered during enumeration
 *  - -ENOSPC:  @apic_id has no CPU number assigned
 */
int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
{
        int cpu;

        if (apic_id >= MAX_LOCAL_APIC)
                return -EINVAL;

        /* Only APIC IDs which were registered during enumeration are valid */
        if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
                return -ENODEV;

        cpu = topo_lookup_cpuid(apic_id);
        if (cpu >= 0) {
                set_bit(apic_id, phys_cpu_present_map);
                topo_set_cpuids(cpu, apic_id, acpi_id);
                cpu_mark_primary_thread(cpu, apic_id);
                return cpu;
        }

        return -ENOSPC;
}

/**
 * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
 * @cpu:        The CPU number for which the APIC ID is removed
 *
 * Nothing happens if @cpu has no APIC ID assigned. The CPU number to APIC
 * ID assignment in cpuid_to_apicid[] is left in place.
 */
void topology_hotunplug_apic(unsigned int cpu)
{
        u32 apic_id = cpuid_to_apicid[cpu];

        if (apic_id != BAD_APICID) {
                per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
                clear_bit(apic_id, phys_cpu_present_map);
                set_cpu_present(cpu, false);
        }
}
#endif

#ifdef CONFIG_X86_LOCAL_APIC
/* Upper limit of possible CPUs. Set via the 'possible_cpus=' early parameter. */
static unsigned int max_possible_cpus __initdata = NR_CPUS;

/**
 * topology_apply_cmdline_limits_early - Apply topology command line limits early
 *
 * Ensure that command line limits are in effect before firmware parsing
 * takes place. nr_cpu_ids is only ever lowered here.
 */
void __init topology_apply_cmdline_limits_early(void)
{
        /* 'maxcpus=0' 'nosmp' 'nolapic' */
        const bool force_up = !setup_max_cpus || apic_is_disabled;
        unsigned int limit = force_up ? 1 : nr_cpu_ids;

        /* 'possible_cpus=N' */
        limit = min_t(unsigned int, max_possible_cpus, limit);

        if (limit >= nr_cpu_ids)
                return;

        pr_info("Limiting to %u possible CPUs\n", limit);
        set_nr_cpu_ids(limit);
}

/*
 * Tell whether the system has to be restricted to a single CPU: that is
 * the case without a SMP configuration or, except on XEN PV, when the
 * local APIC is disabled.
 */
static __init bool restrict_to_up(void)
{
        if (smp_found_config) {
                /*
                 * XEN PV does not advertise the local APIC properly, but
                 * it provides a fake topology so that the infrastructure
                 * works. The APIC based restriction is therefore not
                 * applied to it.
                 */
                return !xen_pv_domain() && apic_is_disabled;
        }

        return true;
}

/**
 * topology_init_possible_cpus - Finalize the CPU accounting after enumeration
 *
 * Limits the assigned and disabled CPU counts to the allowed number of
 * CPUs, derives and prints the topology size information from the domain
 * maps, assigns CPU numbers to the registered but not present APICs and
 * sets up the possible and present CPU masks.
 */
void __init topology_init_possible_cpus(void)
{
        unsigned int assigned = topo_info.nr_assigned_cpus;
        unsigned int disabled = topo_info.nr_disabled_cpus;
        unsigned int cnta, cntb, cpu, allowed = 1;
        unsigned int total = assigned + disabled;
        u32 apicid, firstid;

        /*
         * If there was no APIC registered, then fake one so that the
         * topology bitmap is populated. That ensures that the code below
         * is valid and the various query interfaces can be used
         * unconditionally. This does not affect the actual APIC code in
         * any way because either the local APIC address has not been
         * registered or the local APIC was disabled on the command line.
         */
        if (topo_info.boot_cpu_apic_id == BAD_APICID)
                topology_register_boot_apic(0);

        /* @allowed stays at 1 when the system is restricted to UP */
        if (!restrict_to_up()) {
                if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
                        disabled += assigned - nr_cpu_ids;
                        assigned = nr_cpu_ids;
                }
                allowed = min_t(unsigned int, total, nr_cpu_ids);
        }

        if (total > allowed)
                pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);

        /* Split @allowed into assigned CPUs and the remaining disabled ones */
        assigned = min_t(unsigned int, allowed, assigned);
        disabled = allowed - assigned;

        topo_info.nr_assigned_cpus = assigned;
        topo_info.nr_disabled_cpus = disabled;

        total_cpus = allowed;
        set_nr_cpu_ids(allowed);

        /* @cnta holds the package count up to the dies per package math */
        cnta = domain_weight(TOPO_PKG_DOMAIN);
        __max_logical_packages = cnta;

        pr_info("Max. logical packages: %3u\n", __max_logical_packages);

        cntb = num_phys_nodes();
        __num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);

        pr_info("Max. logical nodes:    %3u\n", cntb);
        pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package);

        cntb = domain_weight(TOPO_DIE_DOMAIN);
        __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));

        pr_info("Max. logical dies:     %3u\n", cntb);
        pr_info("Max. dies per package: %3u\n", __max_dies_per_package);

        cnta = domain_weight(TOPO_CORE_DOMAIN);
        cntb = domain_weight(TOPO_SMT_DOMAIN);
        /*
         * Can't use order delta here as order(cnta) can be equal
         * order(cntb) even if cnta != cntb.
         */
        __max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
        pr_info("Max. threads per core: %3u\n", __max_threads_per_core);

        /* The per package counts are taken from the package of the lowest registered APIC ID */
        firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
        __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
        pr_info("Num. cores per package:   %3u\n", __num_cores_per_package);
        __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
        pr_info("Num. threads per package: %3u\n", __num_threads_per_package);

        pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
        if (topo_info.nr_rejected_cpus)
                pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);

        /* Start over with CPU #0 only. The loop at the end fills in the rest. */
        init_cpu_present(cpumask_of(0));
        init_cpu_possible(cpumask_of(0));

        /* Assign CPU numbers to non-present CPUs */
        for (apicid = 0; disabled; disabled--, apicid++) {
                /* Next APIC ID which is registered, but not physically present */
                apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
                                              MAX_LOCAL_APIC, apicid);
                if (apicid >= MAX_LOCAL_APIC)
                        break;
                cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
        }

        for (cpu = 0; cpu < allowed; cpu++) {
                apicid = cpuid_to_apicid[cpu];

                set_cpu_possible(cpu, true);

                /* Possible, but there is no APIC ID to evaluate */
                if (apicid == BAD_APICID)
                        continue;

                cpu_mark_primary_thread(cpu, apicid);
                set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
        }
}

/*
 * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
 *
 * Reduces the present and possible masks to CPU #0 and leaves at most the
 * boot CPU APIC ID in phys_cpu_present_map.
 */
void __init topology_reset_possible_cpus_up(void)
{
        const u32 boot_id = topo_info.boot_cpu_apic_id;

        init_cpu_present(cpumask_of(0));
        init_cpu_possible(cpumask_of(0));

        bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);

        /* Without a registered boot CPU APIC the map stays empty */
        if (boot_id == BAD_APICID)
                return;

        set_bit(boot_id, phys_cpu_present_map);
}

/*
 * Parse the 'possible_cpus=N' early parameter into max_possible_cpus.
 *
 * The value is parsed into a local int as get_option() operates on an int
 * pointer. Input which get_option() cannot parse and values below 1 are
 * rejected and leave max_possible_cpus untouched. A limit of 0 would
 * otherwise be taken over by topology_apply_cmdline_limits_early() and end
 * up in set_nr_cpu_ids(0).
 *
 * Returns 0 on success and -EINVAL if the argument is missing or invalid.
 */
static int __init setup_possible_cpus(char *str)
{
        int limit;

        if (!get_option(&str, &limit) || limit < 1)
                return -EINVAL;

        max_possible_cpus = limit;
        return 0;
}
early_param("possible_cpus", setup_possible_cpus);
#endif