root/arch/s390/kernel/hiperdispatch.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define pr_fmt(fmt) "hd: " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are being accessed within
 * hiperdispatch.
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent the overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value hiperdispatch falls back to giving high
 * capacities to entitled CPUs. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread
 * performance. Comparing the throughput of:
 * - single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly 30% (can change
 * between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 * therefore delaying the throughput loss caused by using SMP threads.
 */

#include <linux/cpufeature.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

/* Default value of hd_delay_factor, the multiplier applied to HD_DELAY_INTERVAL on enable */
#define HD_DELAY_FACTOR                 (4)
/* Delay, in jiffies, used when the capacity work re-queues itself */
#define HD_DELAY_INTERVAL               (HZ / 4)
/* Default value of hd_steal_threshold, in percent */
#define HD_STEAL_THRESHOLD              10
/* Divisor of the running steal average; a new sample contributes 1/HD_STEAL_AVG_WEIGHT */
#define HD_STEAL_AVG_WEIGHT             16

static cpumask_t hd_vl_coremask;        /* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;       /* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;      /* Current CORE count with high capacity */
static int hd_entitled_cores;           /* Total vertical high and medium CORE count */
static int hd_online_cores;             /* Current online CORE count */

static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;      /* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;       /* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;       /* Total occurrence count of hiperdispatch adjustments */

/* Averaged steal percentage at or above which only entitled COREs keep high capacity */
static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
/* Multiplier for HD_DELAY_INTERVAL used for the first work run after enabling */
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
/* Non-zero while hiperdispatch is switched on */
static int hd_enabled;

/* Periodic work that evaluates steal time and requests capacity changes */
static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

/*
 * Store the requested hiperdispatch mode in hd_enabled.
 * The request is treated as "off" whenever cpu_has_topology() is false.
 *
 * Returns 1 if hd_enabled was changed, 0 if it already held that value.
 */
static int hd_set_hiperdispatch_mode(int enable)
{
        int requested = enable;

        if (!cpu_has_topology())
                requested = 0;
        if (requested != hd_enabled) {
                hd_enabled = requested;
                return 1;
        }
        return 0;
}

/*
 * Forget everything collected by hd_add_core(): both CORE counters are
 * zeroed and both masks are emptied.
 */
void hd_reset_state(void)
{
        hd_online_cores = 0;
        hd_entitled_cores = 0;
        cpumask_clear(&hd_vmvl_cpumask);
        cpumask_clear(&hd_vl_coremask);
}

/*
 * Account one online CORE, identified by @cpu, in the hiperdispatch state.
 *
 * - every CORE bumps hd_online_cores
 * - vertical high and vertical medium COREs count as entitled
 * - vertical low COREs are recorded (by @cpu only) in hd_vl_coremask
 * - all sibling CPUs of vertical medium and low COREs are added to
 *   hd_vmvl_cpumask
 * Any other polarization only affects the online count.
 */
void hd_add_core(int cpu)
{
        int pol = smp_cpu_get_polarization(cpu);
        const struct cpumask *threads = topology_sibling_cpumask(cpu);

        hd_online_cores++;
        if (pol == POLARIZATION_VH || pol == POLARIZATION_VM)
                hd_entitled_cores++;
        if (pol == POLARIZATION_VL)
                cpumask_set_cpu(cpu, &hd_vl_coremask);
        if (pol == POLARIZATION_VM || pol == POLARIZATION_VL)
                cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, threads);
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

/*
 * Add the time elapsed since the previous call to either hd_high_time or
 * hd_low_time (both in milliseconds), depending on whether all online
 * COREs currently have high capacity.
 * All callers in this file hold hd_counter_mutex.
 */
static void hd_update_times(void)
{
        static ktime_t last_stamp;
        unsigned long *total;
        ktime_t stamp;

        /*
         * While hiperdispatch is inactive the stamp is kept at 0, which
         * marks the next active call as the first one: it only records
         * a new stamp and accounts nothing.
         */
        if (!hd_entitled_cores || !hd_enabled) {
                last_stamp = ktime_set(0, 0);
                return;
        }
        stamp = ktime_get();
        if (ktime_after(last_stamp, 0)) {
                if (hd_high_capacity_cores == hd_online_cores)
                        total = &hd_high_time;
                else
                        total = &hd_low_time;
                *total += ktime_ms_delta(stamp, last_stamp);
        }
        last_stamp = stamp;
}

/*
 * Hand out capacities to the vertical low COREs.
 *
 * The number of COREs to upscale is the current hd_high_capacity_cores
 * minus hd_entitled_cores. Walking hd_vl_coremask, that many COREs get
 * CPU_CAPACITY_HIGH and all remaining ones get CPU_CAPACITY_LOW.
 * hd_high_capacity_cores is rebuilt as the entitled count plus the
 * number of COREs that were actually upscaled.
 */
static void hd_update_capacities(void)
{
        int budget = hd_high_capacity_cores - hd_entitled_cores;
        int core;

        hd_high_capacity_cores = hd_entitled_cores;
        for_each_cpu(core, &hd_vl_coremask) {
                if (budget > 0) {
                        smp_set_core_capacity(core, CPU_CAPACITY_HIGH);
                        hd_high_capacity_cores++;
                        budget--;
                } else {
                        smp_set_core_capacity(core, CPU_CAPACITY_LOW);
                }
        }
}

/*
 * Stop the periodic capacity work and put the bookkeeping back to its
 * idle state: every online CORE counts as high capacity and the stored
 * steal total is dropped, so the next steal calculation starts fresh.
 */
void hd_disable_hiperdispatch(void)
{
        /* The work must be gone before the state it uses is reset. */
        cancel_delayed_work_sync(&hd_capacity_work);
        hd_previous_steal = 0;
        hd_high_capacity_cores = hd_online_cores;
}

/*
 * Start hiperdispatch if the current state allows it.
 *
 * The debug time counters are always brought up to date first. The
 * capacity work is then armed and the capacities are updated only if
 * hiperdispatch is switched on, entitled COREs exist, and there are more
 * online COREs than entitled ones.
 *
 * Returns 1 if hiperdispatch was started, 0 otherwise.
 */
int hd_enable_hiperdispatch(void)
{
        unsigned long delay;

        mutex_lock(&hd_counter_mutex);
        hd_update_times();
        mutex_unlock(&hd_counter_mutex);

        if (!hd_enabled || !hd_entitled_cores || hd_online_cores <= hd_entitled_cores)
                return 0;

        delay = HD_DELAY_INTERVAL * hd_delay_factor;
        mod_delayed_work(system_dfl_wq, &hd_capacity_work, delay);
        hd_update_capacities();
        return 1;
}

/*
 * Feed one steal sample into the running average and return the new
 * average. The previous average is weighted with
 * (HD_STEAL_AVG_WEIGHT - 1), the new sample with 1.
 */
static unsigned long hd_steal_avg(unsigned long new)
{
        static unsigned long average;
        unsigned long history = average * (HD_STEAL_AVG_WEIGHT - 1);

        average = (history + new) / HD_STEAL_AVG_WEIGHT;
        return average;
}

/*
 * Return the steal time seen since the previous call, as a per-CPU
 * percentage over the CPUs in hd_vmvl_cpumask.
 *
 * Returns 0 if the mask is empty, if no previous steal total is stored
 * (hd_previous_steal == 0) or if the total did not grow. The stored
 * total and the sample timestamp are refreshed whenever the mask is
 * not empty.
 */
static unsigned long hd_calculate_steal_percentage(void)
{
        unsigned long total_steal = 0, elapsed_ns, result = 0;
        static ktime_t last_sample;
        int nr_cpus = 0, cpu;
        ktime_t stamp;

        for_each_cpu(cpu, &hd_vmvl_cpumask) {
                total_steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
                nr_cpus++;
        }
        /*
         * Without vertical medium and low CPUs there is nothing to
         * measure; vertical high CPUs shouldn't experience steal time.
         */
        if (!nr_cpus)
                return 0;
        stamp = ktime_get();
        elapsed_ns = ktime_to_ns(ktime_sub(stamp, last_sample));
        if (hd_previous_steal != 0 && total_steal > hd_previous_steal)
                result = (total_steal - hd_previous_steal) * 100 / elapsed_ns / nr_cpus;
        hd_previous_steal = total_steal;
        last_sample = stamp;
        return result;
}

/*
 * Periodic work: sample steal time, decide how many COREs should have
 * high capacity and request a topology update if that number changed.
 * Runs under smp_cpu_state_mutex and re-queues itself with
 * HD_DELAY_INTERVAL unless it bails out early.
 */
static void hd_capacity_work_fn(struct work_struct *work)
{
        unsigned long sample, avg_steal, target;

        mutex_lock(&smp_cpu_state_mutex);
        /*
         * With no more online than entitled COREs there is nothing to
         * adjust; request a topology update so hiperdispatch gets
         * disabled, and do not re-queue.
         * This is normally caught during the topology update itself, but
         * on cpu unhotplug the topology and cpu mask updates happen in
         * reverse order, so hd_enable_hiperdispatch() may have seen
         * stale data.
         */
        if (hd_online_cores <= hd_entitled_cores) {
                topology_schedule_update();
                mutex_unlock(&smp_cpu_state_mutex);
                return;
        }
        sample = hd_calculate_steal_percentage();
        avg_steal = hd_steal_avg(sample);
        /* Below the threshold all COREs may run with high capacity. */
        target = avg_steal < hd_steal_threshold ? hd_online_cores : hd_entitled_cores;
        if (hd_high_capacity_cores != target) {
                trace_s390_hd_rebuild_domains(hd_high_capacity_cores, target);
                hd_high_capacity_cores = target;
                atomic64_inc(&hd_adjustments);
                topology_schedule_update();
        }
        trace_s390_hd_work_fn(avg_steal, hd_entitled_cores, hd_high_capacity_cores);
        mutex_unlock(&smp_cpu_state_mutex);
        schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

/*
 * Handler for the "hiperdispatch" sysctl.
 *
 * The value is read from / parsed into a local variable through a
 * temporary table entry that limits it to the range 0..1. On a
 * successful write the new mode is stored under smp_cpu_state_mutex and
 * a topology update is scheduled if the mode changed.
 *
 * Returns the result of proc_douintvec_minmax() for reads and failures,
 * 0 after a successful write.
 */
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
                                     void *buffer, size_t *lenp, loff_t *ppos)
{
        int requested = hd_enabled;
        struct ctl_table tmp_entry = {
                .procname       = ctl->procname,
                .data           = &requested,
                .maxlen         = sizeof(int),
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        };
        int ret;

        ret = proc_douintvec_minmax(&tmp_entry, write, buffer, lenp, ppos);
        if (ret < 0)
                return ret;
        if (!write)
                return ret;

        mutex_lock(&smp_cpu_state_mutex);
        if (hd_set_hiperdispatch_mode(requested))
                topology_schedule_update();
        mutex_unlock(&smp_cpu_state_mutex);
        return 0;
}

/*
 * Sysctl table with the single "hiperdispatch" entry. No data pointer is
 * set here; hiperdispatch_ctl_handler() supplies its own storage.
 */
static const struct ctl_table hiperdispatch_ctl_table[] = {
        {
                .procname       = "hiperdispatch",
                .mode           = 0644,
                .proc_handler   = hiperdispatch_ctl_handler,
        },
};

/* sysfs read: print the current steal threshold followed by a newline. */
static ssize_t hd_steal_threshold_show(struct device *dev,
                                       struct device_attribute *attr,
                                       char *buf)
{
        unsigned int threshold = hd_steal_threshold;

        return sysfs_emit(buf, "%u\n", threshold);
}

/*
 * sysfs write: set the steal threshold.
 * Accepts an unsigned integer (base auto-detected) in the range 0..100.
 *
 * Returns @count on success, the kstrtouint() error for unparsable
 * input, or -ERANGE for values above 100.
 */
static ssize_t hd_steal_threshold_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf,
                                        size_t count)
{
        unsigned int threshold;
        int err = kstrtouint(buf, 0, &threshold);

        if (err)
                return err;
        if (threshold > 100)
                return -ERANGE;
        hd_steal_threshold = threshold;
        return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

/* sysfs read: print the current delay factor followed by a newline. */
static ssize_t hd_delay_factor_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *buf)
{
        unsigned int factor = hd_delay_factor;

        return sysfs_emit(buf, "%u\n", factor);
}

/*
 * sysfs write: set the delay factor.
 * Accepts any non-zero unsigned integer (base auto-detected).
 *
 * Returns @count on success, the kstrtouint() error for unparsable
 * input, or -ERANGE for 0.
 */
static ssize_t hd_delay_factor_store(struct device *dev,
                                     struct device_attribute *attr,
                                     const char *buf,
                                     size_t count)
{
        unsigned int factor;
        int err = kstrtouint(buf, 0, &factor);

        if (err)
                return err;
        if (factor == 0)
                return -ERANGE;
        hd_delay_factor = factor;
        return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

/* NULL-terminated list of the hiperdispatch tunables exposed via sysfs */
static struct attribute *hd_attrs[] = {
        &dev_attr_hd_steal_threshold.attr,
        &dev_attr_hd_delay_factor.attr,
        NULL,
};

/* Groups the tunables above in a sysfs subdirectory named "hiperdispatch" */
static const struct attribute_group hd_attr_group = {
        .name  = "hiperdispatch",
        .attrs = hd_attrs,
};

/*
 * debugfs getter: bring the time counters up to date and report
 * hd_high_time, the time spent with all cpus at high capacity.
 * Always returns 0.
 */
static int hd_greedy_time_get(void *unused, u64 *val)
{
        u64 total;

        mutex_lock(&hd_counter_mutex);
        hd_update_times();
        total = hd_high_time;
        mutex_unlock(&hd_counter_mutex);

        *val = total;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

/*
 * debugfs getter: bring the time counters up to date and report
 * hd_low_time, the time spent while vl cpus have low capacity.
 * Always returns 0.
 */
static int hd_conservative_time_get(void *unused, u64 *val)
{
        u64 total;

        mutex_lock(&hd_counter_mutex);
        hd_update_times();
        total = hd_low_time;
        mutex_unlock(&hd_counter_mutex);

        *val = total;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

/*
 * debugfs getter: report how many capacity adjustments the work function
 * has requested so far. Always returns 0.
 */
static int hd_adjustment_count_get(void *unused, u64 *val)
{
        u64 adjustments = atomic64_read(&hd_adjustments);

        *val = adjustments;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

/*
 * Create the "hiperdispatch" debugfs directory below arch_debugfs_dir
 * and populate it with the three read-only (0400) counter files.
 */
static void __init hd_create_debugfs_counters(void)
{
        struct dentry *hd_dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);

        debugfs_create_file("conservative_time_ms", 0400, hd_dir, NULL, &hd_conservative_time_fops);
        debugfs_create_file("greedy_time_ms", 0400, hd_dir, NULL, &hd_greedy_time_fops);
        debugfs_create_file("adjustment_count", 0400, hd_dir, NULL, &hd_adjustments_fops);
}

/*
 * Attach the hiperdispatch attribute group to the root device of the cpu
 * subsystem. Nothing is done if there is no root device; a failure to
 * create the group is only logged.
 */
static void __init hd_create_attributes(void)
{
        struct device *cpu_root = bus_get_dev_root(&cpu_subsys);
        int err;

        if (!cpu_root)
                return;
        err = sysfs_create_group(&cpu_root->kobj, &hd_attr_group);
        if (err)
                pr_warn("Unable to create hiperdispatch attribute group\n");
        /* Drop the reference taken by bus_get_dev_root(). */
        put_device(cpu_root);
}

/*
 * Late init: optionally switch hiperdispatch on and register its user
 * interfaces (sysctl, debugfs counters, sysfs attributes).
 * Always returns 0; a failed sysctl registration is only logged.
 */
static int __init hd_init(void)
{
        /* Built with CONFIG_HIPERDISPATCH_ON: request the enabled mode right away */
        if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
                hd_set_hiperdispatch_mode(1);
                /* Scheduled regardless of whether the mode actually changed */
                topology_schedule_update();
        }
        if (!register_sysctl("s390", hiperdispatch_ctl_table))
                pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
        hd_create_debugfs_counters();
        hd_create_attributes();
        return 0;
}
late_initcall(hd_init);