root/drivers/gpu/drm/amd/ras/rascore/ras_core.c
// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "ras.h"
#include "ras_core_status.h"

/* Size passed to kfifo_alloc() for each seqno FIFO: room for 128 uint64_t values. */
#define RAS_SEQNO_FIFO_SIZE (128 * sizeof(uint64_t))

/*
 * Evaluates to 1 when @x is a leap year (divisible by 4 and not by 100, or
 * divisible by 400), otherwise 0.
 *
 * Every use of the argument is parenthesized so that an expression such as
 * IS_LEAP_YEAR(base + offset) is grouped correctly. The argument is still
 * evaluated more than once, so it must not have side effects.
 */
#define IS_LEAP_YEAR(x) ((((x) % 4 == 0) && ((x) % 100 != 0)) || ((x) % 400 == 0))

/*
 * Printable names of the RAS blocks. ras_core_get_ras_block_name() indexes
 * this table directly with an enum ras_block_id value, so the entry order
 * presumably has to match that enum's declaration order (the enum is not
 * visible in this file - verify before reordering or inserting entries).
 */
static const char * const ras_block_name[] = {
        "umc",
        "sdma",
        "gfx",
        "mmhub",
        "athub",
        "pcie_bif",
        "hdp",
        "xgmi_wafl",
        "df",
        "smn",
        "sem",
        "mp0",
        "mp1",
        "fuse",
        "mca",
        "vcn",
        "jpeg",
        "ih",
        "mpio",
};

/*
 * Look up the printable name of a RAS block.
 *
 * Returns the ras_block_name[] entry selected by @block_id, or an empty
 * string when @block_id lies outside the table.
 */
const char *ras_core_get_ras_block_name(enum ras_block_id block_id)
{
        return (block_id < ARRAY_SIZE(ras_block_name)) ?
                ras_block_name[block_id] : "";
}

/*
 * Split a timestamp, counted in seconds since 1970-01-01 00:00:00, into
 * calendar fields.
 *
 * @ras_core:  RAS core context (not used by the conversion itself)
 * @timestamp: seconds since the Unix epoch
 * @tm:        output; tm_year receives the full year (e.g. 2025), tm_mon is
 *             1-based (1..12), tm_mday is 1-based, tm_hour/tm_min/tm_sec are
 *             the time of day
 *
 * Returns 0 on success or -EINVAL when @tm is NULL.
 */
int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core,
                        uint64_t timestamp, struct ras_time *tm)
{
        int days_in_month[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
        const uint64_t seconds_per_day = 24 * 60 * 60;
        const uint32_t seconds_per_hour = 60 * 60;
        const uint32_t seconds_per_minute = 60;
        uint64_t remainder = 0;
        uint32_t remaining_seconds;
        uint32_t year = 1970;
        uint32_t month = 0;
        int days;

        if (!tm)
                return -EINVAL;

        /*
         * div64_u64_rem() stores a full 64-bit remainder. Hand it a real
         * uint64_t: pointing it at a 32-bit variable through a cast would
         * write past that variable on the stack.
         */
        days = (int)div64_u64_rem(timestamp, seconds_per_day, &remainder);

        /* The remainder is always below 86400, so it fits in 32 bits. */
        remaining_seconds = (uint32_t)remainder;

        /* utc_timestamp follows the Unix epoch */
        while (days >= 365) {
                if (IS_LEAP_YEAR(year)) {
                        /* Day index 365 is Dec 31 of a leap year. */
                        if (days < 366)
                                break;
                        days -= 366;
                } else {
                        days -= 365;
                }
                year++;
        }

        /* February gains a day in leap years. */
        days_in_month[1] += IS_LEAP_YEAR(year);

        /* Consume whole months; 'days' ends up as the 0-based day of month. */
        while (days >= days_in_month[month]) {
                days -= days_in_month[month];
                month++;
        }

        tm->tm_year = year;
        tm->tm_mon = month + 1;
        tm->tm_mday = days + 1;
        tm->tm_hour = remaining_seconds / seconds_per_hour;
        tm->tm_min = (remaining_seconds % seconds_per_hour) / seconds_per_minute;
        tm->tm_sec = remaining_seconds % seconds_per_minute;

        return 0;
}

/*
 * Report whether the GPU status returned by the system callback has the
 * RAS_GPU_STATUS__IN_RESET bit set. Returns false when no
 * check_gpu_status callback is configured.
 */
bool ras_core_gpu_in_reset(struct ras_core_context *ras_core)
{
        uint32_t gpu_status = 0;

        if (!ras_core->sys_fn || !ras_core->sys_fn->check_gpu_status)
                return false;

        ras_core->sys_fn->check_gpu_status(ras_core, &gpu_status);

        return (gpu_status & RAS_GPU_STATUS__IN_RESET) != 0;
}

/*
 * Report whether the GPU status returned by the system callback has the
 * RAS_GPU_STATUS__IS_VF bit set. Returns false when no check_gpu_status
 * callback is configured.
 */
bool ras_core_gpu_is_vf(struct ras_core_context *ras_core)
{
        uint32_t gpu_status = 0;

        if (!ras_core->sys_fn || !ras_core->sys_fn->check_gpu_status)
                return false;

        ras_core->sys_fn->check_gpu_status(ras_core, &gpu_status);

        return (gpu_status & RAS_GPU_STATUS__IS_VF) != 0;
}

/*
 * Return the is_rma flag stored in the context, or false when @ras_core
 * is NULL.
 */
bool ras_core_gpu_is_rma(struct ras_core_context *ras_core)
{
        return ras_core ? ras_core->is_rma : false;
}

/*
 * Push one sequence number into the FIFO selected by @fifo_type, holding
 * seqno_lock for the insertion.
 *
 * Returns 0 when data was stored, -EINVAL when @fifo_type selects no FIFO
 * or nothing could be written.
 */
static int ras_core_seqno_fifo_write(struct ras_core_context *ras_core,
                enum ras_seqno_fifo fifo_type, uint64_t seqno)
{
        struct kfifo *target;
        unsigned int copied;

        switch (fifo_type) {
        case SEQNO_FIFO_POISON_CREATION:
                target = &ras_core->de_seqno_fifo;
                break;
        case SEQNO_FIFO_POISON_CONSUMPTION:
                target = &ras_core->consumption_seqno_fifo;
                break;
        default:
                return -EINVAL;
        }

        copied = kfifo_in_spinlocked(target, &seqno, sizeof(seqno),
                                     &ras_core->seqno_lock);

        return copied ? 0 : -EINVAL;
}

/*
 * Fetch the oldest sequence number from the FIFO selected by @fifo_type.
 *
 * When @pop is true the entry is removed under seqno_lock; otherwise it is
 * only peeked and stays queued.
 * NOTE(review): the peek path does not take seqno_lock - confirm that is
 * intended.
 *
 * Returns 0 when *@seqno was filled, -EINVAL when @fifo_type selects no
 * FIFO or no data was available.
 */
static int ras_core_seqno_fifo_read(struct ras_core_context *ras_core,
                enum ras_seqno_fifo fifo_type, uint64_t *seqno, bool pop)
{
        struct kfifo *source;
        unsigned int copied;

        switch (fifo_type) {
        case SEQNO_FIFO_POISON_CREATION:
                source = &ras_core->de_seqno_fifo;
                break;
        case SEQNO_FIFO_POISON_CONSUMPTION:
                source = &ras_core->consumption_seqno_fifo;
                break;
        default:
                return -EINVAL;
        }

        if (pop)
                copied = kfifo_out_spinlocked(source, seqno, sizeof(*seqno),
                                              &ras_core->seqno_lock);
        else
                copied = kfifo_out_peek(source, seqno, sizeof(*seqno));

        return copied ? 0 : -EINVAL;
}

/*
 * Ask the system gen_seqno callback for a sequence number of the given
 * @type. Returns 0 when no such callback is configured; otherwise returns
 * whatever the callback stored (the local starts out as 0).
 */
uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core,
                        enum ras_seqno_type type)
{
        uint64_t new_seqno = 0;

        if (!ras_core->sys_fn || !ras_core->sys_fn->gen_seqno)
                return 0;

        ras_core->sys_fn->gen_seqno(ras_core, type, &new_seqno);

        return new_seqno;
}

/*
 * Queue @seqno in the FIFO that belongs to @seqno_type.
 *
 * RAS_SEQNO_TYPE_DE maps to the poison-creation FIFO and
 * RAS_SEQNO_TYPE_POISON_CONSUMPTION to the poison-consumption FIFO.
 *
 * Returns 0 on success, -EINVAL for any other type or when the write
 * failed.
 */
int ras_core_put_seqno(struct ras_core_context *ras_core,
                enum ras_seqno_type seqno_type, uint64_t seqno)
{
        if (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX)
                return -EINVAL;

        switch (seqno_type) {
        case RAS_SEQNO_TYPE_DE:
                return ras_core_seqno_fifo_write(ras_core,
                                SEQNO_FIFO_POISON_CREATION, seqno);
        case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
                return ras_core_seqno_fifo_write(ras_core,
                                SEQNO_FIFO_POISON_CONSUMPTION, seqno);
        default:
                return -EINVAL;
        }
}

/*
 * Obtain a sequence number for @seqno_type.
 *
 * A queued number is taken from the matching FIFO first (removed when @pop
 * is true, peeked otherwise). If the type has no FIFO or the read fails, a
 * fresh number is generated through ras_core_gen_seqno().
 *
 * Returns the sequence number, or 0 when @seqno_type is out of range.
 */
uint64_t ras_core_get_seqno(struct ras_core_context *ras_core,
                        enum ras_seqno_type seqno_type, bool pop)
{
        uint64_t seq_no;
        int status;

        if (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX)
                return 0;

        switch (seqno_type) {
        case RAS_SEQNO_TYPE_DE:
                status = ras_core_seqno_fifo_read(ras_core,
                                SEQNO_FIFO_POISON_CREATION, &seq_no, pop);
                break;
        case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
                status = ras_core_seqno_fifo_read(ras_core,
                                SEQNO_FIFO_POISON_CONSUMPTION, &seq_no, pop);
                break;
        default:
                /* This type has no FIFO backing it. */
                status = -ENODATA;
                break;
        }

        /* Nothing queued: fall back to a newly generated number. */
        if (status)
                seq_no = ras_core_gen_seqno(ras_core, seqno_type);

        return seq_no;
}

/*
 * Load the bad pages recorded in the EEPROM, then sync the EEPROM info.
 *
 * Does nothing when the EEPROM holds no records or when the saved EEPROM
 * count is already at least the record count.
 *
 * Returns 0 on success or when there is nothing to do, otherwise the error
 * from ras_umc_load_bad_pages().
 */
static int ras_core_eeprom_recovery(struct ras_core_context *ras_core)
{
        int record_count = ras_eeprom_get_record_count(ras_core);
        int err;

        /*
         * Nothing stored, or everything already loaded: avoid bad page to
         * be loaded again after gpu reset.
         */
        if (!record_count ||
            ras_umc_get_saved_eeprom_count(ras_core) >= record_count)
                return 0;

        err = ras_umc_load_bad_pages(ras_core);
        if (err) {
                RAS_DEV_ERR(ras_core->dev, "ras_umc_load_bad_pages failed: %d\n", err);
                return err;
        }

        ras_eeprom_sync_info(ras_core);

        return 0;
}

/*
 * Allocate a zeroed RAS core context together with a private copy of
 * @init_config.
 *
 * @init_config: configuration to copy; the caller keeps ownership of it
 *
 * Returns the new context, or NULL when @init_config is NULL or an
 * allocation fails. Release the result with ras_core_destroy().
 */
struct ras_core_context *ras_core_create(struct ras_core_config *init_config)
{
        struct ras_core_context *ras_core;
        struct ras_core_config *config;

        /* Reject a missing config instead of copying from a NULL pointer. */
        if (!init_config)
                return NULL;

        ras_core = kzalloc_obj(*ras_core);
        if (!ras_core)
                return NULL;

        config = kzalloc_obj(*config);
        if (!config) {
                kfree(ras_core);
                return NULL;
        }

        memcpy(config, init_config, sizeof(*config));
        ras_core->config = config;

        return ras_core;
}

/*
 * Free a context created by ras_core_create(), including its config copy.
 * A NULL @ras_core is accepted and ignored.
 */
void ras_core_destroy(struct ras_core_context *ras_core)
{
        if (!ras_core)
                return;

        kfree(ras_core->config);
        kfree(ras_core);
}

/*
 * Software-side initialization of the RAS core.
 *
 * Takes the system callback table from the config, allocates the two
 * sequence-number FIFOs, initializes seqno_lock and then brings up the aca,
 * umc, cmd, log ring and psp software state in that order.
 *
 * On failure, everything that had already been set up is torn down again in
 * reverse order, so no FIFO or sub-module state is left behind.
 * NOTE(review): callers should therefore not call ras_core_sw_fini() after
 * a failed ras_core_sw_init() - confirm against the call sites.
 *
 * Returns 0 on success, -EINVAL when the config or its sys_fn table is
 * missing, otherwise the error of the step that failed.
 */
int ras_core_sw_init(struct ras_core_context *ras_core)
{
        int ret;

        if (!ras_core->config) {
                RAS_DEV_ERR(ras_core->dev, "No ras core config!\n");
                return -EINVAL;
        }

        ras_core->sys_fn = ras_core->config->sys_fn;
        if (!ras_core->sys_fn)
                return -EINVAL;

        ret = kfifo_alloc(&ras_core->de_seqno_fifo,
                 RAS_SEQNO_FIFO_SIZE, GFP_KERNEL);
        if (ret)
                return ret;

        ret = kfifo_alloc(&ras_core->consumption_seqno_fifo,
                 RAS_SEQNO_FIFO_SIZE, GFP_KERNEL);
        if (ret)
                goto err_free_de_fifo;

        spin_lock_init(&ras_core->seqno_lock);

        ret = ras_aca_sw_init(ras_core);
        if (ret)
                goto err_free_consumption_fifo;

        ret = ras_umc_sw_init(ras_core);
        if (ret)
                goto err_aca_fini;

        ret = ras_cmd_init(ras_core);
        if (ret)
                goto err_umc_fini;

        ret = ras_log_ring_sw_init(ras_core);
        if (ret)
                goto err_cmd_fini;

        ret = ras_psp_sw_init(ras_core);
        if (ret)
                goto err_log_ring_fini;

        return 0;

err_log_ring_fini:
        ras_log_ring_sw_fini(ras_core);
err_cmd_fini:
        ras_cmd_fini(ras_core);
err_umc_fini:
        ras_umc_sw_fini(ras_core);
err_aca_fini:
        ras_aca_sw_fini(ras_core);
err_free_consumption_fifo:
        kfifo_free(&ras_core->consumption_seqno_fifo);
err_free_de_fifo:
        kfifo_free(&ras_core->de_seqno_fifo);
        return ret;
}

/*
 * Software-side teardown of the RAS core.
 *
 * Frees both sequence-number FIFOs, then calls the psp, log ring, cmd, umc
 * and aca software fini routines (the reverse of the order in which
 * ras_core_sw_init() initializes them). Always returns 0.
 */
int ras_core_sw_fini(struct ras_core_context *ras_core)
{
        kfifo_free(&ras_core->de_seqno_fifo);
        kfifo_free(&ras_core->consumption_seqno_fifo);

        ras_psp_sw_fini(ras_core);
        ras_log_ring_sw_fini(ras_core);
        ras_cmd_fini(ras_core);
        ras_umc_sw_fini(ras_core);
        ras_aca_sw_fini(ras_core);

        return 0;
}

/*
 * Hardware-side initialization of the RAS core.
 *
 * Copies the eeprom/poison support flags from the config, then initializes
 * psp, aca, mp1, nbio, umc, gfx and eeprom in that order. It then runs the
 * EEPROM bad-page recovery, checks the EEPROM storage status and starts
 * RAS processing. is_initialized is set only when every step succeeded.
 *
 * On failure, every block that was already initialized is finalized again
 * in reverse order. That includes the eeprom block when one of the steps
 * after ras_eeprom_hw_init() fails.
 *
 * Returns 0 on success, otherwise the error of the step that failed.
 */
int ras_core_hw_init(struct ras_core_context *ras_core)
{
        int ret;

        ras_core->ras_eeprom_supported =
                        ras_core->config->ras_eeprom_supported;

        ras_core->poison_supported = ras_core->config->poison_supported;

        ret = ras_psp_hw_init(ras_core);
        if (ret)
                return ret;

        ret = ras_aca_hw_init(ras_core);
        if (ret)
                goto err_psp_fini;

        ret = ras_mp1_hw_init(ras_core);
        if (ret)
                goto err_aca_fini;

        ret = ras_nbio_hw_init(ras_core);
        if (ret)
                goto err_mp1_fini;

        ret = ras_umc_hw_init(ras_core);
        if (ret)
                goto err_nbio_fini;

        ret = ras_gfx_hw_init(ras_core);
        if (ret)
                goto err_umc_fini;

        ret = ras_eeprom_hw_init(ras_core);
        if (ret)
                goto err_gfx_fini;

        /* From here on the eeprom block is up and must be finalized on error. */
        ret = ras_core_eeprom_recovery(ras_core);
        if (ret) {
                RAS_DEV_ERR(ras_core->dev,
                        "Failed to recovery ras core, ret:%d\n", ret);
                goto err_eeprom_fini;
        }

        ret = ras_eeprom_check_storage_status(ras_core);
        if (ret)
                goto err_eeprom_fini;

        ret = ras_process_init(ras_core);
        if (ret)
                goto err_eeprom_fini;

        ras_core->is_initialized = true;

        return 0;

err_eeprom_fini:
        ras_eeprom_hw_fini(ras_core);
err_gfx_fini:
        ras_gfx_hw_fini(ras_core);
err_umc_fini:
        ras_umc_hw_fini(ras_core);
err_nbio_fini:
        ras_nbio_hw_fini(ras_core);
err_mp1_fini:
        ras_mp1_hw_fini(ras_core);
err_aca_fini:
        ras_aca_hw_fini(ras_core);
err_psp_fini:
        ras_psp_hw_fini(ras_core);
        return ret;
}

/*
 * Hardware-side teardown of the RAS core.
 *
 * Clears is_initialized first, stops RAS processing, then finalizes the
 * eeprom, gfx, nbio, umc, mp1, aca and psp blocks in that order.
 * Always returns 0.
 */
int ras_core_hw_fini(struct ras_core_context *ras_core)
{
        /* Mark the core as not ready before any block is taken down. */
        ras_core->is_initialized = false;

        ras_process_fini(ras_core);
        ras_eeprom_hw_fini(ras_core);
        ras_gfx_hw_fini(ras_core);
        ras_nbio_hw_fini(ras_core);
        ras_umc_hw_fini(ras_core);
        ras_mp1_hw_fini(ras_core);
        ras_aca_hw_fini(ras_core);
        ras_psp_hw_fini(ras_core);

        return 0;
}

/*
 * Forward an NBIO interrupt to ras_nbio_handle_irq_error() and return its
 * result unchanged. @data is passed through without being inspected here.
 */
bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data)
{
        return ras_nbio_handle_irq_error(ras_core, data);
}

/*
 * React to a fatal error: mark the fatal flag in the aca layer, then send
 * the RAS_EVENT_ID__FATAL_ERROR_DETECTED notification.
 *
 * Returns the result of ras_core_event_notify().
 */
int ras_core_handle_fatal_error(struct ras_core_context *ras_core)
{
        ras_aca_mark_fatal_flag(ras_core);

        return ras_core_event_notify(ras_core,
                        RAS_EVENT_ID__FATAL_ERROR_DETECTED, NULL);
}

/*
 * Query the current memory partition (NPS) mode through the nbio
 * get_memory_partition_mode callback.
 *
 * Returns the callback's value, or 0 after logging an error when the
 * callback is not available.
 */
uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core)
{
        if (!ras_core->ras_nbio.ip_func ||
            !ras_core->ras_nbio.ip_func->get_memory_partition_mode) {
                RAS_DEV_ERR(ras_core->dev, "Failed to get gpu memory nps mode!\n");
                return 0;
        }

        return ras_core->ras_nbio.ip_func->get_memory_partition_mode(ras_core);
}

/*
 * Refresh the ECC information via ras_aca_update_ecc(): first for
 * RAS_ERR_TYPE__CE, then - only if that succeeded - for RAS_ERR_TYPE__UE.
 *
 * Returns 0 when both updates succeed, otherwise the first error.
 */
int ras_core_update_ecc_info(struct ras_core_context *ras_core)
{
        int err = ras_aca_update_ecc(ras_core, RAS_ERR_TYPE__CE, NULL);

        if (err)
                return err;

        return ras_aca_update_ecc(ras_core, RAS_ERR_TYPE__UE, NULL);
}

/*
 * Read the ECC counters of one RAS block into @ecc_count and, when the read
 * succeeds, clear that block's new-ECC count.
 *
 * Returns 0 on success, -EINVAL when @ras_core or @ecc_count is NULL or
 * @block is not below RAS_BLOCK_ID__LAST, otherwise the error from
 * ras_aca_get_block_ecc_count().
 */
int ras_core_query_block_ecc_data(struct ras_core_context *ras_core,
                        enum ras_block_id block, struct ras_ecc_count *ecc_count)
{
        int err;

        if (!ras_core || !ecc_count || block >= RAS_BLOCK_ID__LAST)
                return -EINVAL;

        err = ras_aca_get_block_ecc_count(ras_core, block, ecc_count);
        if (err)
                return err;

        ras_aca_clear_block_new_ecc_count(ras_core, block);

        return 0;
}

/*
 * Store @enable in the context's ras_core_enabled flag. Always returns 0.
 * @ras_core is dereferenced without a NULL check.
 */
int ras_core_set_status(struct ras_core_context *ras_core, bool enable)
{
        ras_core->ras_core_enabled = enable;

        return 0;
}

/*
 * Return the ras_core_enabled flag last stored by ras_core_set_status().
 * @ras_core is dereferenced without a NULL check.
 */
bool ras_core_is_enabled(struct ras_core_context *ras_core)
{
        return ras_core->ras_core_enabled;
}

/*
 * Fetch the current timestamp from the system get_utc_second_timestamp
 * callback.
 *
 * Returns the callback's value. Returns 0 when @ras_core is NULL, and 0
 * after logging an error when the callback is not configured.
 */
uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core)
{
        /* Bail out first: the error log below dereferences ras_core->dev. */
        if (!ras_core)
                return 0;

        if (ras_core->sys_fn &&
                ras_core->sys_fn->get_utc_second_timestamp)
                return ras_core->sys_fn->get_utc_second_timestamp(ras_core);

        RAS_DEV_ERR(ras_core->dev, "Failed to get system timestamp!\n");
        return 0;
}

/*
 * Validate the pointers and delegate to ras_umc_translate_soc_pa_and_bank(),
 * passing @bank_to_pa through to select the translation direction.
 *
 * Returns -EINVAL when any pointer argument is NULL, otherwise the result
 * of the umc helper.
 */
int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
        uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa)
{
        if (ras_core && soc_pa && bank_addr)
                return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa,
                                                         bank_addr, bank_to_pa);

        return -EINVAL;
}

/*
 * Ask the system detect_ras_interrupt callback whether a RAS interrupt has
 * been detected.
 *
 * Returns the callback's value. Returns false when @ras_core is NULL, and
 * false after logging an error when the callback is not configured.
 */
bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core)
{
        /* Bail out first: the error log below dereferences ras_core->dev. */
        if (!ras_core)
                return false;

        if (ras_core->sys_fn &&
                ras_core->sys_fn->detect_ras_interrupt)
                return ras_core->sys_fn->detect_ras_interrupt(ras_core);

        RAS_DEV_ERR(ras_core->dev, "Failed to detect ras interrupt!\n");
        return false;
}

/*
 * Forward a GPU memory request to the system get_gpu_mem callback.
 *
 * Returns the callback's result, or -EACCES after logging an error when
 * the callback is not configured.
 */
int ras_core_get_gpu_mem(struct ras_core_context *ras_core,
        enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
{
        if (!ras_core->sys_fn || !ras_core->sys_fn->get_gpu_mem) {
                RAS_DEV_ERR(ras_core->dev, "Not config get gpu memory API!\n");
                return -EACCES;
        }

        return ras_core->sys_fn->get_gpu_mem(ras_core, mem_type, gpu_mem);
}

/*
 * Forward a GPU memory release to the system put_gpu_mem callback.
 *
 * Returns the callback's result, or -EACCES after logging an error when
 * the callback is not configured.
 */
int ras_core_put_gpu_mem(struct ras_core_context *ras_core,
        enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
{
        if (!ras_core->sys_fn || !ras_core->sys_fn->put_gpu_mem) {
                RAS_DEV_ERR(ras_core->dev, "Not config put gpu memory API!!\n");
                return -EACCES;
        }

        return ras_core->sys_fn->put_gpu_mem(ras_core, mem_type, gpu_mem);
}

/*
 * Report whether the core finished ras_core_hw_init(): returns the
 * is_initialized flag, or false when @ras_core is NULL.
 */
bool ras_core_is_ready(struct ras_core_context *ras_core)
{
        if (!ras_core)
                return false;

        return ras_core->is_initialized;
}

/*
 * Thin wrapper: returns the result of ras_eeprom_check_safety_watermark()
 * for this context unchanged.
 */
bool ras_core_check_safety_watermark(struct ras_core_context *ras_core)
{
        return ras_eeprom_check_safety_watermark(ras_core);
}

/*
 * Invoke the system gpu_reset_lock callback with (true, true).
 * NOTE(review): the two flags presumably mean "down" and "try" - confirm
 * against the callback's declaration.
 *
 * Returns the callback's result, or 1 when no callback is configured.
 */
int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core)
{
        if (!ras_core->sys_fn || !ras_core->sys_fn->gpu_reset_lock)
                return 1;

        return ras_core->sys_fn->gpu_reset_lock(ras_core, true, true);
}

/*
 * Invoke the system gpu_reset_lock callback with (true, false); a no-op
 * when no callback is configured. The callback's return value is ignored.
 */
void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core)
{
        if (!ras_core->sys_fn || !ras_core->sys_fn->gpu_reset_lock)
                return;

        ras_core->sys_fn->gpu_reset_lock(ras_core, true, false);
}

/*
 * Invoke the system gpu_reset_lock callback with (false, false); a no-op
 * when no callback is configured. The callback's return value is ignored.
 */
void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core)
{
        if (!ras_core->sys_fn || !ras_core->sys_fn->gpu_reset_lock)
                return;

        ras_core->sys_fn->gpu_reset_lock(ras_core, false, false);
}

/*
 * Deliver event @event_id with its optional @data to the system
 * ras_notifier callback.
 *
 * Returns the callback's result, or -RAS_CORE_NOT_SUPPORTED when
 * @ras_core is NULL or no notifier is configured.
 */
int ras_core_event_notify(struct ras_core_context *ras_core,
                enum ras_notify_event event_id, void *data)
{
        if (!ras_core || !ras_core->sys_fn ||
            !ras_core->sys_fn->ras_notifier)
                return -RAS_CORE_NOT_SUPPORTED;

        return ras_core->sys_fn->ras_notifier(ras_core, event_id, data);
}

/*
 * Fill @dev_info through the system get_device_system_info callback.
 *
 * Returns the callback's result, or -RAS_CORE_NOT_SUPPORTED when
 * @ras_core is NULL or the callback is not configured.
 */
int ras_core_get_device_system_info(struct ras_core_context *ras_core,
                struct device_system_info *dev_info)
{
        if (!ras_core || !ras_core->sys_fn ||
            !ras_core->sys_fn->get_device_system_info)
                return -RAS_CORE_NOT_SUPPORTED;

        return ras_core->sys_fn->get_device_system_info(ras_core, dev_info);
}

/*
 * Expand a SoC physical address into page frame numbers for the current
 * NPS mode.
 *
 * A zeroed eeprom_umc_record carrying only the PFN derived from @soc_pa is
 * handed to ras_umc_convert_record_to_nps_pages() together with the
 * current NPS mode; results go into @page_pfn, bounded by @max_pages.
 *
 * Returns the value of ras_umc_convert_record_to_nps_pages(), or -EINVAL
 * when an argument is NULL/zero or the current NPS mode is 0 or above
 * UMC_MEMORY_PARTITION_MODE_NPS8.
 */
int ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core,
                uint64_t soc_pa, uint64_t *page_pfn, uint32_t max_pages)
{
        struct eeprom_umc_record record;
        uint32_t nps_mode;

        if (!ras_core || !page_pfn || !max_pages)
                return -EINVAL;

        nps_mode = ras_core_get_curr_nps_mode(ras_core);
        if (nps_mode == 0 || nps_mode > UMC_MEMORY_PARTITION_MODE_NPS8)
                return -EINVAL;

        memset(&record, 0, sizeof(record));
        record.cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(soc_pa);

        return ras_umc_convert_record_to_nps_pages(ras_core,
                                &record, nps_mode, page_pfn, max_pages);
}