root/drivers/thermal/devfreq_cooling.c
// SPDX-License-Identifier: GPL-2.0
/*
 * devfreq_cooling: Thermal cooling device implementation for devices using
 *                  devfreq
 *
 * Copyright (C) 2014-2015 ARM Limited
 *
 * TODO:
 *    - If OPPs are added or removed after devfreq cooling has
 *      registered, the devfreq cooling won't react to it.
 */

#include <linux/devfreq.h>
#include <linux/devfreq_cooling.h>
#include <linux/energy_model.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pm_opp.h>
#include <linux/pm_qos.h>
#include <linux/thermal.h>
#include <linux/units.h>

#include "thermal_trace.h"

#define SCALE_ERROR_MITIGATION  100

/**
 * struct devfreq_cooling_device - Devfreq cooling device
 *              devfreq_cooling_device registered.
 * @cdev:       Pointer to associated thermal cooling device.
 * @cooling_ops: devfreq callbacks to thermal cooling device ops
 * @devfreq:    Pointer to associated devfreq device.
 * @cooling_state:      Current cooling state.
 * @freq_table: Pointer to a table with the frequencies sorted in descending
 *              order.  You can index the table by cooling device state
 * @max_state:  It is the last index, that is, one less than the number of the
 *              OPPs
 * @power_ops:  Pointer to devfreq_cooling_power, a more precised model.
 * @res_util:   Resource utilization scaling factor for the power.
 *              It is multiplied by 100 to minimize the error. It is used
 *              for estimation of the power budget instead of using
 *              'utilization' (which is 'busy_time' / 'total_time').
 *              The 'res_util' range is from 100 to power * 100 for the
 *              corresponding 'state'.
 * @capped_state:       index to cooling state with in dynamic power budget
 * @req_max_freq:       PM QoS request for limiting the maximum frequency
 *                      of the devfreq device.
 * @em_pd:              Energy Model for the associated Devfreq device
 */
struct devfreq_cooling_device {
        struct thermal_cooling_device *cdev;
        struct thermal_cooling_device_ops cooling_ops;
        struct devfreq *devfreq;
        unsigned long cooling_state;
        u32 *freq_table;
        size_t max_state;
        struct devfreq_cooling_power *power_ops;
        u32 res_util;
        int capped_state;
        struct dev_pm_qos_request req_max_freq;
        struct em_perf_domain *em_pd;
};

static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev,
                                         unsigned long *state)
{
        struct devfreq_cooling_device *dfc = cdev->devdata;

        *state = dfc->max_state;

        return 0;
}

static int devfreq_cooling_get_cur_state(struct thermal_cooling_device *cdev,
                                         unsigned long *state)
{
        struct devfreq_cooling_device *dfc = cdev->devdata;

        *state = dfc->cooling_state;

        return 0;
}

static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
                                         unsigned long state)
{
        struct devfreq_cooling_device *dfc = cdev->devdata;
        struct devfreq *df = dfc->devfreq;
        struct device *dev = df->dev.parent;
        struct em_perf_state *table;
        unsigned long freq;
        int perf_idx;

        if (state == dfc->cooling_state)
                return 0;

        dev_dbg(dev, "Setting cooling state %lu\n", state);

        if (state > dfc->max_state)
                return -EINVAL;

        if (dfc->em_pd) {
                perf_idx = dfc->max_state - state;

                rcu_read_lock();
                table = em_perf_state_from_pd(dfc->em_pd);
                freq = table[perf_idx].frequency * 1000;
                rcu_read_unlock();
        } else {
                freq = dfc->freq_table[state];
        }

        dev_pm_qos_update_request(&dfc->req_max_freq,
                                  DIV_ROUND_UP(freq, HZ_PER_KHZ));

        dfc->cooling_state = state;

        return 0;
}

/**
 * get_perf_idx() - get the performance index corresponding to a frequency
 * @em_pd:      Pointer to device's Energy Model
 * @freq:       frequency in kHz
 *
 * Return: the performance index associated with the @freq, or
 * -EINVAL if it wasn't found.
 */
static int get_perf_idx(struct em_perf_domain *em_pd, unsigned long freq)
{
        struct em_perf_state *table;
        int i, idx = -EINVAL;

        rcu_read_lock();
        table = em_perf_state_from_pd(em_pd);
        for (i = 0; i < em_pd->nr_perf_states; i++) {
                if (table[i].frequency != freq)
                        continue;

                idx = i;
                break;
        }
        rcu_read_unlock();

        return idx;
}

static unsigned long get_voltage(struct devfreq *df, unsigned long freq)
{
        struct device *dev = df->dev.parent;
        unsigned long voltage;
        struct dev_pm_opp *opp;

        opp = dev_pm_opp_find_freq_exact(dev, freq, true);
        if (PTR_ERR(opp) == -ERANGE)
                opp = dev_pm_opp_find_freq_exact(dev, freq, false);

        if (IS_ERR(opp)) {
                dev_err_ratelimited(dev, "Failed to find OPP for frequency %lu: %ld\n",
                                    freq, PTR_ERR(opp));
                return 0;
        }

        voltage = dev_pm_opp_get_voltage(opp) / 1000; /* mV */
        dev_pm_opp_put(opp);

        if (voltage == 0) {
                dev_err_ratelimited(dev,
                                    "Failed to get voltage for frequency %lu\n",
                                    freq);
        }

        return voltage;
}

static void _normalize_load(struct devfreq_dev_status *status)
{
        if (status->total_time > 0xfffff) {
                status->total_time >>= 10;
                status->busy_time >>= 10;
        }

        status->busy_time <<= 10;
        status->busy_time /= status->total_time ? : 1;

        status->busy_time = status->busy_time ? : 1;
        status->total_time = 1024;
}

static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,
                                               u32 *power)
{
        struct devfreq_cooling_device *dfc = cdev->devdata;
        struct devfreq *df = dfc->devfreq;
        struct devfreq_dev_status status;
        struct em_perf_state *table;
        unsigned long state;
        unsigned long freq;
        unsigned long voltage;
        int res, perf_idx;

        mutex_lock(&df->lock);
        status = df->last_status;
        mutex_unlock(&df->lock);

        freq = status.current_frequency;

        if (dfc->power_ops && dfc->power_ops->get_real_power) {
                voltage = get_voltage(df, freq);
                if (voltage == 0) {
                        res = -EINVAL;
                        goto fail;
                }

                res = dfc->power_ops->get_real_power(df, power, freq, voltage);
                if (!res) {
                        state = dfc->max_state - dfc->capped_state;

                        /* Convert EM power into milli-Watts first */
                        rcu_read_lock();
                        table = em_perf_state_from_pd(dfc->em_pd);
                        dfc->res_util = table[state].power;
                        rcu_read_unlock();

                        dfc->res_util /= MICROWATT_PER_MILLIWATT;

                        dfc->res_util *= SCALE_ERROR_MITIGATION;

                        if (*power > 1)
                                dfc->res_util /= *power;
                } else {
                        goto fail;
                }
        } else {
                /* Energy Model frequencies are in kHz */
                perf_idx = get_perf_idx(dfc->em_pd, freq / 1000);
                if (perf_idx < 0) {
                        res = -EAGAIN;
                        goto fail;
                }

                _normalize_load(&status);

                /* Convert EM power into milli-Watts first */
                rcu_read_lock();
                table = em_perf_state_from_pd(dfc->em_pd);
                *power = table[perf_idx].power;
                rcu_read_unlock();

                *power /= MICROWATT_PER_MILLIWATT;
                /* Scale power for utilization */
                *power *= status.busy_time;
                *power >>= 10;
        }

        trace_thermal_power_devfreq_get_power(cdev, &status, freq, *power);

        return 0;
fail:
        /* It is safe to set max in this case */
        dfc->res_util = SCALE_ERROR_MITIGATION;
        return res;
}

static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
                                       unsigned long state, u32 *power)
{
        struct devfreq_cooling_device *dfc = cdev->devdata;
        struct em_perf_state *table;
        int perf_idx;

        if (state > dfc->max_state)
                return -EINVAL;

        perf_idx = dfc->max_state - state;

        rcu_read_lock();
        table = em_perf_state_from_pd(dfc->em_pd);
        *power = table[perf_idx].power;
        rcu_read_unlock();

        *power /= MICROWATT_PER_MILLIWATT;

        return 0;
}

static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
                                       u32 power, unsigned long *state)
{
        struct devfreq_cooling_device *dfc = cdev->devdata;
        struct devfreq *df = dfc->devfreq;
        struct devfreq_dev_status status;
        unsigned long freq, em_power_mw;
        struct em_perf_state *table;
        s32 est_power;
        int i;

        mutex_lock(&df->lock);
        status = df->last_status;
        mutex_unlock(&df->lock);

        freq = status.current_frequency;

        if (dfc->power_ops && dfc->power_ops->get_real_power) {
                /* Scale for resource utilization */
                est_power = power * dfc->res_util;
                est_power /= SCALE_ERROR_MITIGATION;
        } else {
                /* Scale dynamic power for utilization */
                _normalize_load(&status);
                est_power = power << 10;
                est_power /= status.busy_time;
        }

        /*
         * Find the first cooling state that is within the power
         * budget. The EM power table is sorted ascending.
         */
        rcu_read_lock();
        table = em_perf_state_from_pd(dfc->em_pd);
        for (i = dfc->max_state; i > 0; i--) {
                /* Convert EM power to milli-Watts to make safe comparison */
                em_power_mw = table[i].power;
                em_power_mw /= MICROWATT_PER_MILLIWATT;
                if (est_power >= em_power_mw)
                        break;
        }
        rcu_read_unlock();

        *state = dfc->max_state - i;
        dfc->capped_state = *state;

        trace_thermal_power_devfreq_limit(cdev, freq, *state, power);
        return 0;
}

/**
 * devfreq_cooling_gen_tables() - Generate frequency table.
 * @dfc:        Pointer to devfreq cooling device.
 * @num_opps:   Number of OPPs
 *
 * Generate frequency table which holds the frequencies in descending
 * order. That way its indexed by cooling device state. This is for
 * compatibility with drivers which do not register Energy Model.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc,
                                      int num_opps)
{
        struct devfreq *df = dfc->devfreq;
        struct device *dev = df->dev.parent;
        unsigned long freq;
        int i;

        dfc->freq_table = kcalloc(num_opps, sizeof(*dfc->freq_table),
                             GFP_KERNEL);
        if (!dfc->freq_table)
                return -ENOMEM;

        for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) {
                struct dev_pm_opp *opp;

                opp = dev_pm_opp_find_freq_floor(dev, &freq);
                if (IS_ERR(opp)) {
                        kfree(dfc->freq_table);
                        return PTR_ERR(opp);
                }

                dev_pm_opp_put(opp);
                dfc->freq_table[i] = freq;
        }

        return 0;
}

/**
 * of_devfreq_cooling_register_power() - Register devfreq cooling device,
 *                                      with OF and power information.
 * @np: Pointer to OF device_node.
 * @df: Pointer to devfreq device.
 * @dfc_power:  Pointer to devfreq_cooling_power.
 *
 * Register a devfreq cooling device.  The available OPPs must be
 * registered on the device.
 *
 * If @dfc_power is provided, the cooling device is registered with the
 * power extensions.  For the power extensions to work correctly,
 * devfreq should use the simple_ondemand governor, other governors
 * are not currently supported.
 */
struct thermal_cooling_device *
of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
                                  struct devfreq_cooling_power *dfc_power)
{
        struct thermal_cooling_device *cdev;
        struct device *dev = df->dev.parent;
        struct devfreq_cooling_device *dfc;
        struct em_perf_domain *em;
        struct thermal_cooling_device_ops *ops;
        char *name;
        int err, num_opps;


        dfc = kzalloc_obj(*dfc);
        if (!dfc)
                return ERR_PTR(-ENOMEM);

        dfc->devfreq = df;

        ops = &dfc->cooling_ops;
        ops->get_max_state = devfreq_cooling_get_max_state;
        ops->get_cur_state = devfreq_cooling_get_cur_state;
        ops->set_cur_state = devfreq_cooling_set_cur_state;

        em = em_pd_get(dev);
        if (em && !em_is_artificial(em)) {
                dfc->em_pd = em;
                ops->get_requested_power =
                        devfreq_cooling_get_requested_power;
                ops->state2power = devfreq_cooling_state2power;
                ops->power2state = devfreq_cooling_power2state;

                dfc->power_ops = dfc_power;

                num_opps = em_pd_nr_perf_states(dfc->em_pd);
        } else {
                /* Backward compatibility for drivers which do not use IPA */
                dev_dbg(dev, "missing proper EM for cooling device\n");

                num_opps = dev_pm_opp_get_opp_count(dev);

                err = devfreq_cooling_gen_tables(dfc, num_opps);
                if (err)
                        goto free_dfc;
        }

        if (num_opps <= 0) {
                err = -EINVAL;
                goto free_dfc;
        }

        /* max_state is an index, not a counter */
        dfc->max_state = num_opps - 1;

        err = dev_pm_qos_add_request(dev, &dfc->req_max_freq,
                                     DEV_PM_QOS_MAX_FREQUENCY,
                                     PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
        if (err < 0)
                goto free_table;

        err = -ENOMEM;
        name = kasprintf(GFP_KERNEL, "devfreq-%s", dev_name(dev));
        if (!name)
                goto remove_qos_req;

        cdev = thermal_of_cooling_device_register(np, name, dfc, ops);
        kfree(name);

        if (IS_ERR(cdev)) {
                err = PTR_ERR(cdev);
                dev_err(dev,
                        "Failed to register devfreq cooling device (%d)\n",
                        err);
                goto remove_qos_req;
        }

        dfc->cdev = cdev;

        return cdev;

remove_qos_req:
        dev_pm_qos_remove_request(&dfc->req_max_freq);
free_table:
        kfree(dfc->freq_table);
free_dfc:
        kfree(dfc);

        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(of_devfreq_cooling_register_power);

/**
 * of_devfreq_cooling_register() - Register devfreq cooling device,
 *                                with OF information.
 * @np: Pointer to OF device_node.
 * @df: Pointer to devfreq device.
 */
struct thermal_cooling_device *
of_devfreq_cooling_register(struct device_node *np, struct devfreq *df)
{
        return of_devfreq_cooling_register_power(np, df, NULL);
}
EXPORT_SYMBOL_GPL(of_devfreq_cooling_register);

/**
 * devfreq_cooling_register() - Register devfreq cooling device.
 * @df: Pointer to devfreq device.
 */
struct thermal_cooling_device *devfreq_cooling_register(struct devfreq *df)
{
        return of_devfreq_cooling_register(NULL, df);
}
EXPORT_SYMBOL_GPL(devfreq_cooling_register);

/**
 * devfreq_cooling_em_register() - Register devfreq cooling device with
 *              power information and automatically register Energy Model (EM)
 * @df:         Pointer to devfreq device.
 * @dfc_power:  Pointer to devfreq_cooling_power.
 *
 * Register a devfreq cooling device and automatically register EM. The
 * available OPPs must be registered for the device.
 *
 * If @dfc_power is provided, the cooling device is registered with the
 * power extensions. It is using the simple Energy Model which requires
 * "dynamic-power-coefficient" a devicetree property. To not break drivers
 * which miss that DT property, the function won't bail out when the EM
 * registration failed. The cooling device will be registered if everything
 * else is OK.
 */
struct thermal_cooling_device *
devfreq_cooling_em_register(struct devfreq *df,
                            struct devfreq_cooling_power *dfc_power)
{
        struct thermal_cooling_device *cdev;
        struct device *dev;
        int ret;

        if (IS_ERR_OR_NULL(df))
                return ERR_PTR(-EINVAL);

        dev = df->dev.parent;

        ret = dev_pm_opp_of_register_em(dev, NULL);
        if (ret)
                dev_dbg(dev, "Unable to register EM for devfreq cooling device (%d)\n",
                        ret);

        cdev = of_devfreq_cooling_register_power(dev->of_node, df, dfc_power);

        if (IS_ERR_OR_NULL(cdev))
                em_dev_unregister_perf_domain(dev);

        return cdev;
}
EXPORT_SYMBOL_GPL(devfreq_cooling_em_register);

/**
 * devfreq_cooling_unregister() - Unregister devfreq cooling device.
 * @cdev: Pointer to devfreq cooling device to unregister.
 *
 * Unregisters devfreq cooling device and related Energy Model if it was
 * present.
 */
void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
{
        struct devfreq_cooling_device *dfc;
        struct device *dev;

        if (IS_ERR_OR_NULL(cdev))
                return;

        dfc = cdev->devdata;
        dev = dfc->devfreq->dev.parent;

        thermal_cooling_device_unregister(dfc->cdev);
        dev_pm_qos_remove_request(&dfc->req_max_freq);

        em_dev_unregister_perf_domain(dev);

        kfree(dfc->freq_table);
        kfree(dfc);
}
EXPORT_SYMBOL_GPL(devfreq_cooling_unregister);