root/sys/x86/cpufreq/hwpstate_intel.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018 Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted providing that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sbuf.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/smp.h>
#include <sys/proc.h>
#include <sys/sched.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/cputypes.h>
#include <machine/specialreg.h>

#include <contrib/dev/acpica/include/acpi.h>

#include <dev/acpica/acpivar.h>

#include <x86/cpufreq/hwpstate_common.h>
#include <x86/cpufreq/hwpstate_intel_internal.h>

#include "acpi_if.h"
#include "cpufreq_if.h"

/*
 * Defined outside this file (presumably the TSC frequency, judging by the
 * name).  No code visible in this file references it.
 */
extern uint64_t tsc_freq;

/*
 * Device interface methods implemented below and wired up in
 * intel_hwpstate_methods.  intel_hwpstate_identify() is non-static and is not
 * prototyped here; its declaration presumably comes from one of the headers
 * included above.
 */
static int      intel_hwpstate_probe(device_t dev);
static int      intel_hwpstate_attach(device_t dev);
static int      intel_hwpstate_detach(device_t dev);
static int      intel_hwpstate_suspend(device_t dev);
static int      intel_hwpstate_resume(device_t dev);

/* cpufreq driver interface methods implemented below. */
static int      intel_hwpstate_get(device_t dev, struct cf_setting *cf);
static int      intel_hwpstate_type(device_t dev, int *type);

/*
 * Method table for the hwpstate_intel driver: maps the generic device
 * interface and the cpufreq driver interface onto this file's
 * implementations.  Referenced from hwpstate_intel_driver.
 */
static device_method_t intel_hwpstate_methods[] = {
        /* Device interface */
        DEVMETHOD(device_identify,      intel_hwpstate_identify),
        DEVMETHOD(device_probe,         intel_hwpstate_probe),
        DEVMETHOD(device_attach,        intel_hwpstate_attach),
        DEVMETHOD(device_detach,        intel_hwpstate_detach),
        DEVMETHOD(device_suspend,       intel_hwpstate_suspend),
        DEVMETHOD(device_resume,        intel_hwpstate_resume),

        /* cpufreq interface */
        DEVMETHOD(cpufreq_drv_get,      intel_hwpstate_get),
        DEVMETHOD(cpufreq_drv_type,     intel_hwpstate_type),

        /* Table terminator */
        DEVMETHOD_END
};

/*
 * Per-device driver state.  Its size is handed to the bus framework through
 * hwpstate_intel_driver, and it is retrieved with device_get_softc().
 */
struct hwp_softc {
        /* Owning device; stored by intel_hwpstate_attach(). */
        device_t                dev;
        /*
         * Capability flags latched in intel_hwpstate_attach() from
         * cpu_power_eax.  hwp_notifications and hwp_activity_window are
         * recorded but not read by any code visible in this file.
         */
        bool                    hwp_notifications;
        bool                    hwp_activity_window;
        /* CPUTPM1_HWP_PERF_PREF: EPP field of HWP_REQUEST is usable. */
        bool                    hwp_pref_ctrl;
        /* CPUTPM1_HWP_PKG: package-level request MSR is available. */
        bool                    hwp_pkg_ctrl;
        /* hwp_pkg_ctrl && hwpstate_pkg_ctrl_enable: pkg control in use. */
        bool                    hwp_pkg_ctrl_en;
        /* CPUID_PERF_BIAS in cpu_power_ecx: ENERGY_PERF_BIAS MSR usable. */
        bool                    hwp_perf_bias;
        /* True once hwp_energy_perf_bias below holds a value read from HW. */
        bool                    hwp_perf_bias_cached;

        /*
         * Cached register images.  req never carries the PACKAGE_CONTROL
         * bit; callers OR it in at write time when hwp_pkg_ctrl_en is set.
         */
        uint64_t                req; /* Cached copy of HWP_REQUEST */
        uint64_t                hwp_energy_perf_bias;   /* Cache PERF_BIAS */

        /*
         * Performance levels decoded from IA32_HWP_CAPABILITIES in
         * set_autonomous_hwp().  Only high and low are consumed by the code
         * visible in this file (as the request's maximum and minimum).
         */
        uint8_t                 high;
        uint8_t                 guaranteed;
        uint8_t                 efficient;
        uint8_t                 low;
};

/*
 * Driver description: name, method table, and softc size for the
 * hwpstate_intel driver.
 */
static driver_t hwpstate_intel_driver = {
        "hwpstate_intel",
        intel_hwpstate_methods,
        sizeof(struct hwp_softc),
};

/*
 * Register the driver as a child of the "cpu" bus with no module event
 * handler, and publish module version 1.
 */
DRIVER_MODULE(hwpstate_intel, cpu, hwpstate_intel_driver, NULL, NULL);
MODULE_VERSION(hwpstate_intel, 1);

static int
intel_hwp_dump_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
        device_t dev;
        struct pcpu *pc;
        struct sbuf *sb;
        struct hwp_softc *sc;
        uint64_t data, data2;
        int ret;

        sc = (struct hwp_softc *)arg1;
        dev = sc->dev;

        pc = cpu_get_pcpu(dev);
        if (pc == NULL)
                return (ENXIO);

        sb = sbuf_new(NULL, NULL, 1024, SBUF_FIXEDLEN | SBUF_INCLUDENUL);
        sbuf_putc(sb, '\n');
        thread_lock(curthread);
        sched_bind(curthread, pc->pc_cpuid);
        thread_unlock(curthread);

        rdmsr_safe(MSR_IA32_PM_ENABLE, &data);
        sbuf_printf(sb, "CPU%d: HWP %sabled\n", pc->pc_cpuid,
            ((data & 1) ? "En" : "Dis"));

        if (data == 0) {
                ret = 0;
                goto out;
        }

        rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &data);
        sbuf_printf(sb, "\tHighest Performance: %03ju\n", data & 0xff);
        sbuf_printf(sb, "\tGuaranteed Performance: %03ju\n", (data >> 8) & 0xff);
        sbuf_printf(sb, "\tEfficient Performance: %03ju\n", (data >> 16) & 0xff);
        sbuf_printf(sb, "\tLowest Performance: %03ju\n", (data >> 24) & 0xff);

        rdmsr_safe(MSR_IA32_HWP_REQUEST, &data);
        data2 = 0;
        if (sc->hwp_pkg_ctrl && (data & IA32_HWP_REQUEST_PACKAGE_CONTROL))
                rdmsr_safe(MSR_IA32_HWP_REQUEST_PKG, &data2);

        sbuf_putc(sb, '\n');

#define pkg_print(x, name, offset) do {                                 \
        if (!sc->hwp_pkg_ctrl || (data & x) != 0)                       \
                sbuf_printf(sb, "\t%s: %03u\n", name,                   \
                    (unsigned)(data >> offset) & 0xff);                 \
        else                                                            \
                sbuf_printf(sb, "\t%s: %03u\n", name,                   \
                    (unsigned)(data2 >> offset) & 0xff);                \
} while (0)

        pkg_print(IA32_HWP_REQUEST_EPP_VALID,
            "Requested Efficiency Performance Preference", 24);
        pkg_print(IA32_HWP_REQUEST_DESIRED_VALID,
            "Requested Desired Performance", 16);
        pkg_print(IA32_HWP_REQUEST_MAXIMUM_VALID,
            "Requested Maximum Performance", 8);
        pkg_print(IA32_HWP_REQUEST_MINIMUM_VALID,
            "Requested Minimum Performance", 0);
#undef pkg_print

        sbuf_putc(sb, '\n');

out:
        thread_lock(curthread);
        sched_unbind(curthread);
        thread_unlock(curthread);

        ret = sbuf_finish(sb);
        if (ret == 0)
                ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
        sbuf_delete(sb);

        return (ret);
}

/*
 * Scale a percentage in [0, 100] onto the 8-bit raw range [0, 0xff].
 * The division truncates toward zero.
 */
static inline int
percent_to_raw(int x)
{
        int scaled;

        MPASS(x >= 0 && x <= 100);
        scaled = x * 0xff;
        return (scaled / 100);
}

/*
 * Take a value expressed in tenths (x * 10, expected in [0, 1000]) and return
 * the nearest whole x; exact halves round up.
 *
 * Rounding to nearest is what lets friendly percentages survive a trip
 * through the raw encoding and back.  With plain truncation, user-supplied
 * values such as 25, 50 and 75 would read back as 24, 49 and 74.
 */
static inline int
round10(int xtimes10)
{
        int biased;

        /* Adding half the divisor turns truncation into round-to-nearest. */
        biased = xtimes10 + 5;
        return (biased / 10);
}

/*
 * Map an 8-bit raw value in [0, 0xff] back to a percentage.  The value is
 * first computed in tenths of a percent and then rounded to the nearest
 * whole percent by round10().
 */
static inline int
raw_to_percent(int x)
{
        int tenths;

        MPASS(x >= 0 && x <= 0xff);
        tenths = (x * 1000) / 0xff;
        return (round10(tenths));
}

/*
 * MSR_IA32_ENERGY_PERF_BIAS uses a narrower raw range than the 8-bit scale
 * above: 0-0xf.  Scale a percentage in [0, 100] onto that range.
 */
static inline int
percent_to_raw_perf_bias(int x)
{
        int scaled;

        MPASS(x >= 0 && x <= 100);
        /*
         * Bias by half the divisor before dividing so that the percentages
         * reported for each raw value convert back to that same raw value.
         */
        scaled = 0xf * x;
        return ((scaled + 50) / 100);
}

/*
 * Map a raw perf-bias value in [0, 0xf] to a percentage that is always a
 * multiple of 5.
 */
static inline int
raw_to_percent_perf_bias(int x)
{
        int steps;

        MPASS(x >= 0 && x <= 0xf);
        /*
         * One raw step is 6.67%; count whole 5% steps instead so the result
         * is a round, human-friendly number.
         */
        steps = (x * 20) / 0xf;
        return (steps * 5);
}

/*
 * Sysctl handler for the per-device "epp" knob.
 *
 * Reports the current efficiency/performance preference as a percentage and,
 * when a new value is supplied, programs it into the hardware.  The value
 * lives in the EPP field of the HWP request when hwp_pref_ctrl is set, and in
 * MSR_IA32_ENERGY_PERF_BIAS otherwise.  oid_arg1 is the device.
 *
 * The thread is bound to the device's CPU for the MSR accesses and unbound
 * before returning.
 *
 * Returns ENODEV if neither mechanism is available, ENXIO if the device has
 * no pcpu, EINVAL for a new value above 100, or the error from
 * sysctl_handle_int() / rdmsr_safe() / wrmsr_safe().
 */
static int
sysctl_epp_select(SYSCTL_HANDLER_ARGS)
{
        struct hwp_softc *sc;
        device_t dev;
        struct pcpu *pc;
        uint64_t epb, newval;
        uint32_t val;
        int ret;

        dev = oidp->oid_arg1;
        sc = device_get_softc(dev);
        if (!sc->hwp_pref_ctrl && !sc->hwp_perf_bias)
                return (ENODEV);

        pc = cpu_get_pcpu(dev);
        if (pc == NULL)
                return (ENXIO);

        thread_lock(curthread);
        sched_bind(curthread, pc->pc_cpuid);
        thread_unlock(curthread);

        if (sc->hwp_pref_ctrl) {
                val = (sc->req & IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE) >> 24;
                val = raw_to_percent(val);
        } else {
                /*
                 * If cpuid indicates EPP is not supported, the HWP controller
                 * uses MSR_IA32_ENERGY_PERF_BIAS instead (Intel SDM §14.4.4).
                 * This register is per-core (but not HT).
                 */
                if (!sc->hwp_perf_bias_cached) {
                        ret = rdmsr_safe(MSR_IA32_ENERGY_PERF_BIAS, &epb);
                        if (ret)
                                goto out;
                        sc->hwp_energy_perf_bias = epb;
                        sc->hwp_perf_bias_cached = true;
                }
                val = sc->hwp_energy_perf_bias &
                    IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK;
                val = raw_to_percent_perf_bias(val);
        }

        /* val is unsigned, so only the upper bound needs checking. */
        MPASS(val <= 100);

        ret = sysctl_handle_int(oidp, &val, 0, req);
        if (ret || req->newptr == NULL)
                goto out;

        /* A negative int from userland shows up here as a huge unsigned. */
        if (val > 100) {
                ret = EINVAL;
                goto out;
        }

        /*
         * Build the new register image in a temporary and update the cached
         * copy only once the MSR write has succeeded.  Otherwise a failed
         * write would leave the cache describing a value the hardware never
         * accepted, which would then be reported on read and re-applied on
         * resume.
         */
        if (sc->hwp_pref_ctrl) {
                val = percent_to_raw(val);

                newval =
                    ((sc->req & ~IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE)
                    | ((uint64_t)val << 24));

                if (sc->hwp_pkg_ctrl_en)
                        ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, newval);
                else
                        ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, newval);
                if (ret == 0)
                        sc->req = newval;
        } else {
                val = percent_to_raw_perf_bias(val);
                MPASS((val & ~IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK) == 0);

                newval =
                    ((sc->hwp_energy_perf_bias &
                    ~IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK) | val);
                ret = wrmsr_safe(MSR_IA32_ENERGY_PERF_BIAS, newval);
                if (ret == 0)
                        sc->hwp_energy_perf_bias = newval;
        }

out:
        thread_lock(curthread);
        sched_unbind(curthread);
        thread_unlock(curthread);

        return (ret);
}

/*
 * Identify method: add a "hwpstate_intel" child under parent when the CPU is
 * an Intel part advertising HWP.  The driver argument is unused.
 *
 * Nothing is added if such a child already exists, if the CPU vendor is not
 * Intel, if the driver has been disabled through resource hints, or if the
 * HWP feature bit is clear.
 */
void
intel_hwpstate_identify(driver_t *driver, device_t parent)
{
        /* The checks run in this order and stop at the first that fails. */
        if (device_find_child(parent, "hwpstate_intel", DEVICE_UNIT_ANY) != NULL ||
            cpu_vendor_id != CPU_VENDOR_INTEL ||
            resource_disabled("hwpstate_intel", 0))
                return;

        /*
         * Intel SDM 14.4.1 (HWP Programming Interfaces):
         *   Availability of HWP baseline resource and capability,
         *   CPUID.06H:EAX[bit 7]: If this bit is set, HWP provides several new
         *   architectural MSRs: IA32_PM_ENABLE, IA32_HWP_CAPABILITIES,
         *   IA32_HWP_REQUEST, IA32_HWP_STATUS.
         */
        if ((cpu_power_eax & CPUTPM1_HWP) != 0) {
                /* The child takes the same unit number as the parent. */
                if (BUS_ADD_CHILD(parent, 10, "hwpstate_intel",
                    device_get_unit(parent)) == NULL)
                        device_printf(parent,
                            "hwpstate_intel: add child failed\n");
        }
}

/*
 * Probe method: set the device description and return
 * BUS_PROBE_NOWILDCARD.  No hardware checks are made here; the feature
 * gating happens in intel_hwpstate_identify().
 */
static int
intel_hwpstate_probe(device_t dev)
{
        const int prio = BUS_PROBE_NOWILDCARD;

        device_set_desc(dev, "Intel Speed Shift");
        return (prio);
}

static int
set_autonomous_hwp(struct hwp_softc *sc)
{
        struct pcpu *pc;
        device_t dev;
        uint64_t caps;
        int ret;

        dev = sc->dev;

        pc = cpu_get_pcpu(dev);
        if (pc == NULL)
                return (ENXIO);

        thread_lock(curthread);
        sched_bind(curthread, pc->pc_cpuid);
        thread_unlock(curthread);

        /* XXX: Many MSRs aren't readable until feature is enabled */
        ret = wrmsr_safe(MSR_IA32_PM_ENABLE, 1);
        if (ret) {
                /*
                 * This is actually a package-level MSR, and only the first
                 * write is not ignored.  So it is harmless to enable it across
                 * all devices, and this allows us not to care especially in
                 * which order cores (and packages) are probed.  This error
                 * condition should not happen given we gate on the HWP CPUID
                 * feature flag, if the Intel SDM is correct.
                 */
                device_printf(dev, "Failed to enable HWP for cpu%d (%d)\n",
                    pc->pc_cpuid, ret);
                goto out;
        }

        ret = rdmsr_safe(MSR_IA32_HWP_REQUEST, &sc->req);
        if (ret) {
                device_printf(dev,
                    "Failed to read HWP request MSR for cpu%d (%d)\n",
                    pc->pc_cpuid, ret);
                goto out;
        }

        ret = rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &caps);
        if (ret) {
                device_printf(dev,
                    "Failed to read HWP capabilities MSR for cpu%d (%d)\n",
                    pc->pc_cpuid, ret);
                goto out;
        }

        /*
         * High and low are static; "guaranteed" is dynamic; and efficient is
         * also dynamic.
         */
        sc->high = IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(caps);
        sc->guaranteed = IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(caps);
        sc->efficient = IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(caps);
        sc->low = IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(caps);

        /* hardware autonomous selection determines the performance target */
        sc->req &= ~IA32_HWP_DESIRED_PERFORMANCE;

        /* enable HW dynamic selection of window size */
        sc->req &= ~IA32_HWP_ACTIVITY_WINDOW;

        /* IA32_HWP_REQUEST.Minimum_Performance = IA32_HWP_CAPABILITIES.Lowest_Performance */
        sc->req &= ~IA32_HWP_MINIMUM_PERFORMANCE;
        sc->req |= sc->low;

        /* IA32_HWP_REQUEST.Maximum_Performance = IA32_HWP_CAPABILITIES.Highest_Performance. */
        sc->req &= ~IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE;
        sc->req |= sc->high << 8;

        /* If supported, request package-level control for this CPU. */
        if (sc->hwp_pkg_ctrl_en)
                ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req |
                    IA32_HWP_REQUEST_PACKAGE_CONTROL);
        else
                ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req);
        if (ret) {
                device_printf(dev,
                    "Failed to setup%s autonomous HWP for cpu%d\n",
                    sc->hwp_pkg_ctrl_en ? " PKG" : "", pc->pc_cpuid);
                goto out;
        }

        /* If supported, write the PKG-wide control MSR. */
        if (sc->hwp_pkg_ctrl_en) {
                /*
                 * "The structure of the IA32_HWP_REQUEST_PKG MSR
                 * (package-level) is identical to the IA32_HWP_REQUEST MSR
                 * with the exception of the Package Control field, which does
                 * not exist." (Intel SDM §14.4.4)
                 */
                ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req);
                if (ret) {
                        device_printf(dev,
                            "Failed to set autonomous HWP for package\n");
                }
        }

out:
        thread_lock(curthread);
        sched_unbind(curthread);
        thread_unlock(curthread);

        return (ret);
}

/*
 * Attach method: record which HWP features the CPU advertises, switch the
 * CPU to autonomous HWP, create the two sysctl nodes (a debug dump and the
 * "epp" knob), and register with cpufreq.
 *
 * Returns the error from set_autonomous_hwp() if that fails (no sysctls are
 * created in that case), otherwise the result of cpufreq_register().
 */
static int
intel_hwpstate_attach(device_t dev)
{
        struct hwp_softc *sc;
        int error;

        sc = device_get_softc(dev);
        sc->dev = dev;

        /* Feature bits from cpu_power_eax */
        if ((cpu_power_eax & CPUTPM1_HWP_NOTIFICATION) != 0)
                sc->hwp_notifications = true;
        if ((cpu_power_eax & CPUTPM1_HWP_ACTIVITY_WINDOW) != 0)
                sc->hwp_activity_window = true;
        if ((cpu_power_eax & CPUTPM1_HWP_PERF_PREF) != 0)
                sc->hwp_pref_ctrl = true;
        if ((cpu_power_eax & CPUTPM1_HWP_PKG) != 0)
                sc->hwp_pkg_ctrl = true;

        /* Feature bit from cpu_power_ecx */
        if ((cpu_power_ecx & CPUID_PERF_BIAS) != 0)
                sc->hwp_perf_bias = true;

        /*
         * Package-level control is used only when the CPU supports it and
         * the administrator has not turned it off.
         */
        sc->hwp_pkg_ctrl_en = (sc->hwp_pkg_ctrl && hwpstate_pkg_ctrl_enable);

        error = set_autonomous_hwp(sc);
        if (error != 0)
                return (error);

        /* Read-only state dump, hung off the static "debug" tree. */
        SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
            SYSCTL_STATIC_CHILDREN(_debug), OID_AUTO, device_get_nameunit(dev),
            CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
            sc, 0, intel_hwp_dump_sysctl_handler, "A", "");

        /* Read-write preference knob under the device's own sysctl tree. */
        SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
            SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
            "epp", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, dev, 0,
            sysctl_epp_select, "I",
            "Efficiency/Performance Preference "
            "(range from 0, most performant, through 100, most efficient)");

        error = cpufreq_register(dev);
        return (error);
}

/*
 * Detach method: unregister the device from cpufreq and pass along whatever
 * cpufreq_unregister() returns.
 */
static int
intel_hwpstate_detach(device_t dev)
{
        int error;

        error = cpufreq_unregister(dev);
        return (error);
}

/*
 * cpufreq "get" method: fill *set with the current setting for dev's CPU.
 *
 * Only the frequency is reported, taken from cpu_est_clockrate() and divided
 * by 10^6 (presumably converting Hz to MHz).  If the estimate fails the
 * frequency stays CPUFREQ_VAL_UNKNOWN and the call still succeeds.  Voltage,
 * power and latency are always CPUFREQ_VAL_UNKNOWN.
 *
 * Returns EINVAL if set is NULL, ENXIO if the device has no pcpu, else 0.
 */
static int
intel_hwpstate_get(device_t dev, struct cf_setting *set)
{
        struct pcpu *pc;
        uint64_t rate;

        if (set == NULL)
                return (EINVAL);

        if ((pc = cpu_get_pcpu(dev)) == NULL)
                return (ENXIO);

        /* Start with every field unknown, then fill in what is known. */
        memset(set, CPUFREQ_VAL_UNKNOWN, sizeof(*set));
        set->dev = dev;
        set->volts = CPUFREQ_VAL_UNKNOWN;
        set->power = CPUFREQ_VAL_UNKNOWN;
        set->lat = CPUFREQ_VAL_UNKNOWN;

        /* Best effort: a failed estimate is not reported as an error. */
        if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0)
                set->freq = rate / 1000000;

        return (0);
}

/*
 * cpufreq "type" method: store this driver's type flags in *type.
 * The dev argument is not used.
 *
 * Returns EINVAL if type is NULL, else 0.
 */
static int
intel_hwpstate_type(device_t dev, int *type)
{
        if (type != NULL) {
                *type = CPUFREQ_TYPE_ABSOLUTE | CPUFREQ_FLAG_INFO_ONLY |
                    CPUFREQ_FLAG_UNCACHED;
                return (0);
        }

        return (EINVAL);
}

/*
 * Suspend method: does nothing and always succeeds.  The state that
 * intel_hwpstate_resume() writes back is already cached in the softc.
 */
static int
intel_hwpstate_suspend(device_t dev)
{
        (void)dev;

        return (0);
}

/*
 * Resume method: replay a subset of set_autonomous_hwp() from the cached
 * softc state; untested.  Testers saw a bogus MSR_IA32_HWP_REQUEST after
 * resume when this was not done.
 *
 * While bound to the device's CPU this re-enables HWP, rewrites the per-CPU
 * request (and the package request when package control is enabled) from
 * sc->req, and, when EPP is unavailable and a perf-bias value has been
 * cached, rewrites MSR_IA32_ENERGY_PERF_BIAS.
 *
 * Returns 0 on success, ENXIO if the device has no pcpu, or the error from
 * the first failing wrmsr_safe().
 */
static int
intel_hwpstate_resume(device_t dev)
{
        struct hwp_softc *sc;
        struct pcpu *pc;
        uint64_t cpu_req;
        int ret;

        sc = device_get_softc(dev);

        pc = cpu_get_pcpu(dev);
        if (pc == NULL)
                return (ENXIO);

        thread_lock(curthread);
        sched_bind(curthread, pc->pc_cpuid);
        thread_unlock(curthread);

        ret = wrmsr_safe(MSR_IA32_PM_ENABLE, 1);
        if (ret != 0) {
                device_printf(dev,
                    "Failed to enable HWP for cpu%d after suspend (%d)\n",
                    pc->pc_cpuid, ret);
                goto out;
        }

        /* PACKAGE_CONTROL is added at write time; sc->req never holds it. */
        cpu_req = sc->req;
        if (sc->hwp_pkg_ctrl_en)
                cpu_req |= IA32_HWP_REQUEST_PACKAGE_CONTROL;
        ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, cpu_req);
        if (ret != 0) {
                device_printf(dev,
                    "Failed to set%s autonomous HWP for cpu%d after suspend\n",
                    sc->hwp_pkg_ctrl_en ? " PKG" : "", pc->pc_cpuid);
                goto out;
        }

        if (sc->hwp_pkg_ctrl_en) {
                ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req);
                if (ret != 0) {
                        device_printf(dev,
                            "Failed to set autonomous HWP for package after "
                            "suspend\n");
                        goto out;
                }
        }

        /*
         * The perf-bias MSR is only restored when it stands in for EPP and
         * a value has actually been cached.
         */
        if (sc->hwp_pref_ctrl || !sc->hwp_perf_bias_cached)
                goto out;

        ret = wrmsr_safe(MSR_IA32_ENERGY_PERF_BIAS, sc->hwp_energy_perf_bias);
        if (ret != 0)
                device_printf(dev,
                    "Failed to set energy perf bias for cpu%d after "
                    "suspend\n", pc->pc_cpuid);

out:
        thread_lock(curthread);
        sched_unbind(curthread);
        thread_unlock(curthread);

        return (ret);
}