root/drivers/gpu/drm/amd/ras/rascore/ras_process.c
// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "ras.h"
#include "ras_process.h"

/*
 * Size, in bytes, requested from kfifo_alloc() for the event FIFO:
 * room for 128 struct ras_event_req records.
 */
#define RAS_EVENT_FIFO_SIZE (128 * sizeof(struct ras_event_req))

/*
 * Upper bound, in milliseconds (converted with msecs_to_jiffies()), on how
 * long the RAS processing thread waits for an interrupt request before it
 * runs a processing pass anyway.
 */
#define RAS_POLLING_ECC_TIMEOUT  300

/*
 * Queue one event request into the event FIFO.
 *
 * The request is copied by value under fifo_spinlock.
 *
 * Return: 0 when the FIFO accepted data, -ENOSPC when nothing was copied.
 */
static int ras_process_put_event(struct ras_core_context *ras_core,
                struct ras_event_req *req)
{
        struct ras_process *proc = &ras_core->ras_proc;
        int copied = kfifo_in_spinlocked(&proc->event_fifo, req,
                                         sizeof(*req), &proc->fifo_spinlock);

        if (copied)
                return 0;

        RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n");
        return -ENOSPC;
}

/*
 * Queue a request that carries only a GPU reset cause; every other field
 * of the request is zero.
 *
 * Return: the result of ras_process_put_event().
 */
static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core,
                        uint32_t reset_cause)
{
        struct ras_event_req reset_req = { .reset = reset_cause };

        return ras_process_put_event(ras_core, &reset_req);
}

/*
 * Pop one event request from the event FIFO into @req, under fifo_spinlock.
 *
 * Return: whatever kfifo_out_spinlocked() reports as copied; 0 means the
 * FIFO had nothing to hand out.
 */
static int ras_process_get_event(struct ras_core_context *ras_core,
                struct ras_event_req *req)
{
        return kfifo_out_spinlocked(&ras_core->ras_proc.event_fifo,
                                    req, sizeof(*req),
                                    &ras_core->ras_proc.fifo_spinlock);
}

static void ras_process_clear_event_fifo(struct ras_core_context *ras_core)
{
        struct ras_event_req req;
        int ret;

        do {
                ret = ras_process_get_event(ras_core, &req);
        } while (ret);
}

/*
 * Polling budget used while waiting for UMC error data: the counter is
 * armed with this value and decremented once per empty poll, with a 1 ms
 * sleep after each decrement that does not reach zero.
 */
#define AMDGPU_RAS_WAITING_DATA_READY  200
/*
 * Collect UMC ECC data for @event_count pending UMC interrupts.
 *
 * Each pass refreshes the ECC info and queries the UMC block. Passes repeat
 * until the accumulated new_de_count reaches @event_count, or until the
 * polling budget is exhausted without new data. The budget is re-armed
 * whenever new data shows up. With @event_count == 0 exactly one pass runs.
 *
 * If any new_de_count was accumulated and ras_core_gpu_is_rma() is true, a
 * GPU_RESET_CAUSE_RMA reset event is queued (its result is not checked).
 *
 * Return: 0 on completion or timeout, or the error code returned by
 * ras_core_update_ecc_info() / ras_core_query_block_ecc_data().
 */
static int ras_process_umc_event(struct ras_core_context *ras_core,
                                uint32_t event_count)
{
        struct ras_ecc_count ecc;
        uint32_t total_de = 0;
        uint32_t budget = 0;
        int rc;

        for (;;) {
                memset(&ecc, 0, sizeof(ecc));

                rc = ras_core_update_ecc_info(ras_core);
                if (rc)
                        return rc;

                rc = ras_core_query_block_ecc_data(ras_core, RAS_BLOCK_ID__UMC, &ecc);
                if (rc)
                        return rc;

                if (ecc.new_de_count) {
                        /* Progress was made: accumulate and disarm the budget. */
                        total_de += ecc.new_de_count;
                        budget = 0;
                } else {
                        /* Arm the budget only when data is actually expected. */
                        if (!budget && event_count)
                                budget = AMDGPU_RAS_WAITING_DATA_READY;

                        if (budget) {
                                budget--;
                                if (!budget)
                                        break;

                                msleep(1);
                        }
                }

                if (total_de >= event_count)
                        break;
        }

        if (total_de && ras_core_gpu_is_rma(ras_core))
                ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA);

        return 0;
}

/*
 * Consume the queued non-UMC event requests.
 *
 * Every request popped from the event FIFO is reported through
 * RAS_EVENT_ID__POISON_CONSUMPTION and its reset cause is OR-ed into a
 * combined mask. Requests whose reset cause is exactly GPU_RESET_CAUSE_RMA
 * are not logged. If the combined mask is non-zero, one
 * RAS_EVENT_ID__RESET_GPU notification is issued with it.
 *
 * Return: 0 when no reset was requested; -RAS_CORE_GPU_IN_MODE1_RESET when
 * the reset notification succeeded and the mask contains
 * GPU_RESET_CAUSE_RMA; otherwise the result of the reset notification.
 */
static int ras_process_non_umc_event(struct ras_core_context *ras_core)
{
        struct ras_process *ras_proc = &ras_core->ras_proc;
        struct ras_event_req req;
        /*
         * NOTE(review): kfifo_len() is used as the iteration bound. If
         * event_fifo is a byte FIFO this is a byte count rather than a
         * record count; the surplus iterations find the FIFO empty and are
         * skipped below. Confirm against the event_fifo declaration.
         */
        uint32_t event_count = kfifo_len(&ras_proc->event_fifo);
        uint32_t reset_flags = 0;
        uint32_t i;
        int ret;

        for (i = 0; i < event_count; i++) {
                memset(&req, 0, sizeof(req));

                /*
                 * The dequeue result is a copied-size indication, not an
                 * error code, so it must not leak into the return value.
                 */
                if (!ras_process_get_event(ras_core, &req))
                        continue;

                ras_core_event_notify(ras_core,
                        RAS_EVENT_ID__POISON_CONSUMPTION, &req);

                reset_flags |= req.reset;

                if (req.reset == GPU_RESET_CAUSE_RMA)
                        continue;

                if (req.reset) {
                        RAS_DEV_INFO(ras_core->dev,
                                "{%llu} GPU reset for %s RAS poison consumption is issued!\n",
                                req.seqno, ras_core_get_ras_block_name(req.block));
                } else {
                        RAS_DEV_INFO(ras_core->dev,
                                "{%llu} %s RAS poison consumption is issued!\n",
                                req.seqno, ras_core_get_ras_block_name(req.block));
                }
        }

        /* Nothing asked for a reset: report plain success. */
        if (!reset_flags)
                return 0;

        ret = ras_core_event_notify(ras_core,
                        RAS_EVENT_ID__RESET_GPU, &reset_flags);
        if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA))
                return -RAS_CORE_GPU_IN_MODE1_RESET;

        return ret;
}

/*
 * Run one complete RAS event processing pass.
 *
 * Sequence: notify RAS_EVENT_PROC_BEGIN (abort on failure), clear the ACA
 * fatal flag, log pending UMC bad banks, process UMC interrupts until the
 * pending counter reads zero, then process queued non-UMC events. When
 * either stage reports -RAS_CORE_GPU_IN_MODE1_RESET, the event FIFO is
 * drained and the UMC interrupt counter is reset. RAS_EVENT_PROC_END is
 * always notified once BEGIN succeeded.
 *
 * Return: the BEGIN notification error, or the result of the last
 * processing stage that ran.
 */
int ras_process_handle_ras_event(struct ras_core_context *ras_core)
{
        struct ras_process *ras_proc = &ras_core->ras_proc;
        uint32_t pending;
        int rc;

        rc = ras_core_event_notify(ras_core,
                        RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
        if (rc)
                return rc;

        ras_aca_clear_fatal_flag(ras_core);
        ras_umc_log_pending_bad_bank(ras_core);

        for (;;) {
                /* Snapshot the counter; new interrupts may arrive meanwhile. */
                pending = atomic_read(&ras_proc->umc_interrupt_count);

                rc = ras_process_umc_event(ras_core, pending);
                if (rc == -RAS_CORE_GPU_IN_MODE1_RESET)
                        break;

                /* Retire only the interrupts covered by this snapshot. */
                if (pending)
                        atomic_sub(pending, &ras_proc->umc_interrupt_count);

                if (!atomic_read(&ras_proc->umc_interrupt_count))
                        break;
        }

        if (rc != -RAS_CORE_GPU_IN_MODE1_RESET) {
                if (kfifo_len(&ras_proc->event_fifo))
                        rc = ras_process_non_umc_event(ras_core);
        }

        if (rc == -RAS_CORE_GPU_IN_MODE1_RESET) {
                /* Drop everything still queued and forget pending UMC interrupts. */
                ras_process_clear_event_fifo(ras_core);
                atomic_set(&ras_proc->umc_interrupt_count, 0);
        }

        ras_core_event_notify(ras_core,
                        RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
        return rc;
}

/*
 * Wake-up predicate for the RAS processing thread.
 *
 * Return: 1 when the thread was asked to stop or an interrupt request is
 * pending, 0 otherwise.
 */
static int thread_wait_condition(void *param)
{
        struct ras_process *proc = param;

        if (kthread_should_stop())
                return 1;

        return atomic_read(&proc->ras_interrupt_req) != 0;
}

/*
 * Main loop of the RAS processing kernel thread.
 *
 * The thread sleeps until thread_wait_condition() holds or
 * RAS_POLLING_ECC_TIMEOUT milliseconds elapse. Unless the core is
 * uninitialized or the GPU is in reset, it then runs one processing pass,
 * through sys_fn->async_handle_ras_event when that hook is provided and
 * through ras_process_handle_ras_event() otherwise.
 *
 * Return: always 0, once kthread_should_stop() is observed.
 */
static int ras_process_thread(void *context)
{
        struct ras_core_context *ras_core = context;
        struct ras_process *proc = &ras_core->ras_proc;

        while (!kthread_should_stop()) {
                ras_wait_event_interruptible_timeout(&proc->ras_process_wq,
                        thread_wait_condition, proc,
                        msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT));

                if (kthread_should_stop())
                        break;

                if (!ras_core->is_initialized)
                        continue;

                /* Acknowledge all requests seen so far before handling them. */
                atomic_set(&proc->ras_interrupt_req, 0);

                if (ras_core_gpu_in_reset(ras_core))
                        continue;

                if (!ras_core->sys_fn || !ras_core->sys_fn->async_handle_ras_event)
                        ras_process_handle_ras_event(ras_core);
                else
                        ras_core->sys_fn->async_handle_ras_event(ras_core, NULL);
        }

        return 0;
}

int ras_process_init(struct ras_core_context *ras_core)
{
        struct ras_process *ras_proc = &ras_core->ras_proc;
        int ret;

        ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL);
        if (ret)
                return ret;

        spin_lock_init(&ras_proc->fifo_spinlock);

        init_waitqueue_head(&ras_proc->ras_process_wq);

        ras_proc->ras_process_thread = kthread_run(ras_process_thread,
                                                        (void *)ras_core, "ras_process_thread");
        if (!ras_proc->ras_process_thread) {
                RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n");
                ret =  -ENOMEM;
                goto err;
        }

        return 0;

err:
        ras_process_fini(ras_core);
        return ret;
}

/*
 * Tear down RAS event processing: stop the processing thread if one is
 * running and free the event FIFO.
 *
 * Return: always 0.
 */
int ras_process_fini(struct ras_core_context *ras_core)
{
        struct ras_process *ras_proc = &ras_core->ras_proc;

        /*
         * kthread_run() signals failure with an ERR_PTR() value, so the
         * stored handle may be an error pointer rather than NULL; such a
         * value must never be passed to kthread_stop().
         */
        if (!IS_ERR_OR_NULL(ras_proc->ras_process_thread))
                kthread_stop(ras_proc->ras_process_thread);
        ras_proc->ras_process_thread = NULL;

        kfifo_free(&ras_proc->event_fifo);

        return 0;
}

/*
 * Record one UMC interrupt and wake the processing thread.
 *
 * @req is not used: UMC interrupts are only counted, not queued.
 *
 * Return: always 0.
 */
static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core,
                        struct ras_event_req *req)
{
        atomic_inc(&ras_core->ras_proc.umc_interrupt_count);
        atomic_inc(&ras_core->ras_proc.ras_interrupt_req);

        wake_up(&ras_core->ras_proc.ras_process_wq);

        return 0;
}

/*
 * Queue a non-UMC event request and, if it was accepted, wake the
 * processing thread.
 *
 * Return: 0 on success, or the error from ras_process_put_event(), in which
 * case the thread is not woken.
 */
static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core,
                struct ras_event_req *req)
{
        struct ras_process *proc = &ras_core->ras_proc;
        int rc = ras_process_put_event(ras_core, req);

        if (rc)
                return rc;

        atomic_inc(&proc->ras_interrupt_req);
        wake_up(&proc->ras_process_wq);

        return 0;
}

/*
 * Entry point for submitting an interrupt request to RAS processing.
 *
 * @is_umc selects the path: UMC interrupts are counted, all others are
 * queued in the event FIFO.
 *
 * Return: -EINVAL when @ras_core is NULL, -EPERM when the core is not
 * initialized, otherwise the result of the selected path.
 */
int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
        struct ras_event_req *req, bool is_umc)
{
        if (!ras_core)
                return -EINVAL;

        if (!ras_core->is_initialized)
                return -EPERM;

        return is_umc ? ras_process_add_umc_interrupt_req(ras_core, req) :
                        ras_process_add_non_umc_interrupt_req(ras_core, req);
}