drivers/gpu/drm/amd/ras/rascore/ras.h

root/drivers/gpu/drm/amd/ras/rascore/ras.h
/* SPDX-License-Identifier: MIT */
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#ifndef __RAS_H__
#define __RAS_H__
#include "ras_sys.h"
#include "ras_umc.h"
#include "ras_aca.h"
#include "ras_eeprom.h"
#include "ras_core_status.h"
#include "ras_process.h"
#include "ras_gfx.h"
#include "ras_cmd.h"
#include "ras_nbio.h"
#include "ras_mp1.h"
#include "ras_psp.h"
#include "ras_log_ring.h"
#include "ras_eeprom_fw.h"

#define RAS_HW_ERR              "[Hardware Error]: "

#define RAS_GPU_PAGE_SHIFT  12
#define RAS_ADDR_TO_PFN(addr) ((addr) >> RAS_GPU_PAGE_SHIFT)
#define RAS_PFN_TO_ADDR(pfn) ((pfn) << RAS_GPU_PAGE_SHIFT)

#define RAS_CORE_RESET_GPU 0x10000

#define GPU_RESET_CAUSE_POISON  (RAS_CORE_RESET_GPU | 0x0001)
#define GPU_RESET_CAUSE_FATAL   (RAS_CORE_RESET_GPU | 0x0002)
#define GPU_RESET_CAUSE_RMA     (RAS_CORE_RESET_GPU | 0x0004)

enum ras_gpu_health_status {
        RAS_GPU_HEALTH_NONE = 0,
        RAS_GPU_HEALTH_USABLE = 1,
        RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
        RAS_GPU_IN_BAD_STATUS = 3,
};

enum ras_core_fw_feature_flags {
        RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0),
};

enum ras_block_id {
        RAS_BLOCK_ID__UMC = 0,
        RAS_BLOCK_ID__SDMA,
        RAS_BLOCK_ID__GFX,
        RAS_BLOCK_ID__MMHUB,
        RAS_BLOCK_ID__ATHUB,
        RAS_BLOCK_ID__PCIE_BIF,
        RAS_BLOCK_ID__HDP,
        RAS_BLOCK_ID__XGMI_WAFL,
        RAS_BLOCK_ID__DF,
        RAS_BLOCK_ID__SMN,
        RAS_BLOCK_ID__SEM,
        RAS_BLOCK_ID__MP0,
        RAS_BLOCK_ID__MP1,
        RAS_BLOCK_ID__FUSE,
        RAS_BLOCK_ID__MCA,
        RAS_BLOCK_ID__VCN,
        RAS_BLOCK_ID__JPEG,
        RAS_BLOCK_ID__IH,
        RAS_BLOCK_ID__MPIO,

        RAS_BLOCK_ID__LAST
};

enum ras_ecc_err_type {
        RAS_ECC_ERR__NONE                = 0,
        RAS_ECC_ERR__PARITY              = 1,
        RAS_ECC_ERR__SINGLE_CORRECTABLE  = 2,
        RAS_ECC_ERR__MULTI_UNCORRECTABLE = 4,
        RAS_ECC_ERR__POISON              = 8,
};

enum ras_err_type {
        RAS_ERR_TYPE__UE = 0,
        RAS_ERR_TYPE__CE,
        RAS_ERR_TYPE__DE,
        RAS_ERR_TYPE__LAST
};

enum ras_seqno_type {
        RAS_SEQNO_TYPE_INVALID = 0,
        RAS_SEQNO_TYPE_UE,
        RAS_SEQNO_TYPE_CE,
        RAS_SEQNO_TYPE_DE,
        RAS_SEQNO_TYPE_POISON_CONSUMPTION,
        RAS_SEQNO_TYPE_COUNT_MAX,
};

enum ras_seqno_fifo {
        SEQNO_FIFO_INVALID = 0,
        SEQNO_FIFO_POISON_CREATION,
        SEQNO_FIFO_POISON_CONSUMPTION,
        SEQNO_FIFO_COUNT_MAX
};

enum ras_notify_event {
        RAS_EVENT_ID__NONE,
        RAS_EVENT_ID__BAD_PAGE_DETECTED,
        RAS_EVENT_ID__POISON_CONSUMPTION,
        RAS_EVENT_ID__RESERVE_BAD_PAGE,
        RAS_EVENT_ID__DEVICE_RMA,
        RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM,
        RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP,
        RAS_EVENT_ID__FATAL_ERROR_DETECTED,
        RAS_EVENT_ID__RESET_GPU,
        RAS_EVENT_ID__RESET_VF,
        RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN,
        RAS_EVENT_ID__RAS_EVENT_PROC_END,
};

enum ras_gpu_status {
        RAS_GPU_STATUS__NOT_READY = 0,
        RAS_GPU_STATUS__READY = 0x1,
        RAS_GPU_STATUS__IN_RESET = 0x2,
        RAS_GPU_STATUS__IS_RMA = 0x4,
        RAS_GPU_STATUS__IS_VF = 0x8,
};

enum ras_fw_eeprom_cmd {
        RAS_SMU_GetRASTableVersion = 0,
        RAS_SMU_GetBadPageCount,
        RAS_SMU_SetTimestamp,
        RAS_SMU_GetTimestamp,
        RAS_SMU_GetBadPageIpid,
        RAS_SMU_EraseRasTable,
        RAS_SMU_GetBadPageMcaAddr,
};

struct ras_core_context;
struct ras_bank_ecc;
struct ras_umc;
struct ras_aca;
struct ras_process;
struct ras_nbio;
struct ras_log_ring;
struct ras_psp;

struct ras_mp1_sys_func {
        int (*mp1_get_valid_bank_count)(struct ras_core_context *ras_core,
                        u32 msg, u32 *count);
        int (*mp1_dump_valid_bank)(struct ras_core_context *ras_core,
                        u32 msg, u32 idx, u32 reg_idx, u64 *val);
        int (*mp1_send_eeprom_msg)(struct ras_core_context *ras_core,
                        enum ras_fw_eeprom_cmd index, uint32_t param, uint32_t *read_arg);
        int (*mp1_get_ras_enabled_mask)(struct ras_core_context *ras_core,
                        uint64_t *enabled_mask);
};

struct ras_eeprom_sys_func {
        int (*eeprom_i2c_xfer)(struct ras_core_context *ras_core,
                        u32 eeprom_addr, u8 *eeprom_buf, u32 buf_size, bool read);
        int (*update_eeprom_i2c_config)(struct ras_core_context *ras_core);
};

struct ras_nbio_sys_func {
        int (*set_ras_controller_irq_state)(struct ras_core_context *ras_core,
                        bool state);
        int (*set_ras_err_event_athub_irq_state)(struct ras_core_context *ras_core,
                        bool state);
};

struct ras_time {
        int tm_sec;
        int tm_min;
        int tm_hour;
        int tm_mday;
        int tm_mon;
        long tm_year;
};

struct device_system_info {
        uint32_t device_id;
        uint32_t vendor_id;
        uint32_t socket_id;
};

enum gpu_mem_type {
        GPU_MEM_TYPE_DEFAULT,
        GPU_MEM_TYPE_RAS_PSP_RING,
        GPU_MEM_TYPE_RAS_PSP_CMD,
        GPU_MEM_TYPE_RAS_PSP_FENCE,
        GPU_MEM_TYPE_RAS_TA_FW,
        GPU_MEM_TYPE_RAS_TA_CMD,
};

struct ras_psp_sys_func {
        int (*get_ras_psp_system_status)(struct ras_core_context *ras_core,
                struct ras_psp_sys_status *status);
        int (*get_ras_ta_init_param)(struct ras_core_context *ras_core,
                struct ras_ta_init_param *ras_ta_param);
};

struct ras_sys_func {
        int (*gpu_reset_lock)(struct ras_core_context *ras_core,
                        bool down, bool try);
        int (*check_gpu_status)(struct ras_core_context *ras_core,
                        uint32_t *status);
        int (*gen_seqno)(struct ras_core_context *ras_core,
                        enum ras_seqno_type seqno_type, uint64_t *seqno);
        int (*async_handle_ras_event)(struct ras_core_context *ras_core, void *data);
        int (*ras_notifier)(struct ras_core_context *ras_core,
                    enum ras_notify_event event_id, void *data);
        u64 (*get_utc_second_timestamp)(struct ras_core_context *ras_core);
        int (*get_device_system_info)(struct ras_core_context *ras_core,
                        struct device_system_info *dev_info);
        bool (*detect_ras_interrupt)(struct ras_core_context *ras_core);
        int (*get_gpu_mem)(struct ras_core_context *ras_core,
                enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
        int (*put_gpu_mem)(struct ras_core_context *ras_core,
                enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
};

struct ras_ecc_count {
        uint64_t new_ce_count;
        uint64_t total_ce_count;
        uint64_t new_ue_count;
        uint64_t total_ue_count;
        uint64_t new_de_count;
        uint64_t total_de_count;
};

struct ras_bank_ecc {
        uint32_t nps;
        uint64_t seq_no;
        uint64_t status;
        uint64_t ipid;
        uint64_t addr;
        uint64_t ts;
};

struct ras_bank_ecc_node {
        struct list_head node;
        struct ras_bank_ecc ecc;
};

struct ras_aca_config {
        u32 socket_num_per_hive;
        u32 aid_num_per_socket;
        u32 xcd_num_per_aid;
};

struct ras_mp1_config {
        const struct ras_mp1_sys_func *mp1_sys_fn;
};

struct ras_nbio_config {
        const struct ras_nbio_sys_func *nbio_sys_fn;
};

struct ras_psp_config {
        const struct ras_psp_sys_func *psp_sys_fn;
};

struct ras_umc_config {
        uint32_t umc_vram_type;
};

struct ras_eeprom_config {
        const struct ras_eeprom_sys_func *eeprom_sys_fn;
        int eeprom_record_threshold_config;
        uint32_t eeprom_record_threshold_count;
        void *eeprom_i2c_adapter;
        u32 eeprom_i2c_addr;
        u32 eeprom_i2c_port;
        u16 max_i2c_read_len;
        u16 max_i2c_write_len;
};

struct ras_core_config {
        u32 aca_ip_version;
        u32 umc_ip_version;
        u32 mp1_ip_version;
        u32 gfx_ip_version;
        u32 nbio_ip_version;
        u32 psp_ip_version;

        bool poison_supported;
        bool ras_eeprom_supported;
        const struct ras_sys_func *sys_fn;

        struct ras_aca_config aca_cfg;
        struct ras_mp1_config mp1_cfg;
        struct ras_nbio_config nbio_cfg;
        struct ras_psp_config psp_cfg;
        struct ras_eeprom_config eeprom_cfg;
        struct ras_umc_config umc_cfg;
};

struct ras_core_context {
        void *dev;
        struct ras_core_config *config;
        u32 socket_num_per_hive;
        u32 aid_num_per_socket;
        u32 xcd_num_per_aid;
        int max_ue_banks_per_query;
        int max_ce_banks_per_query;
        struct ras_aca ras_aca;

        bool ras_eeprom_supported;
        struct ras_eeprom_control ras_eeprom;
        struct ras_fw_eeprom_control ras_fw_eeprom;

        struct ras_psp ras_psp;
        struct ras_umc ras_umc;
        struct ras_nbio ras_nbio;
        struct ras_gfx ras_gfx;
        struct ras_mp1 ras_mp1;
        struct ras_process ras_proc;
        struct ras_log_ring ras_log_ring;

        const struct ras_sys_func *sys_fn;

        /* is poison mode supported */
        bool poison_supported;

        bool is_rma;
        bool is_initialized;

        struct kfifo de_seqno_fifo;
        struct kfifo consumption_seqno_fifo;
        spinlock_t seqno_lock;

        bool ras_core_enabled;

        u64 ras_fw_features;
};

struct ras_core_context *ras_core_create(struct ras_core_config *init_config);
void ras_core_destroy(struct ras_core_context *ras_core);
int ras_core_sw_init(struct ras_core_context *ras_core);
int ras_core_sw_fini(struct ras_core_context *ras_core);
int ras_core_hw_init(struct ras_core_context *ras_core);
int ras_core_hw_fini(struct ras_core_context *ras_core);
bool ras_core_is_ready(struct ras_core_context *ras_core);
uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core,
                        enum ras_seqno_type seqno_type);
uint64_t ras_core_get_seqno(struct ras_core_context *ras_core,
                        enum ras_seqno_type seqno_type, bool pop);

int ras_core_put_seqno(struct ras_core_context *ras_core,
                enum ras_seqno_type seqno_type, uint64_t seqno);

int ras_core_update_ecc_info(struct ras_core_context *ras_core);
int ras_core_query_block_ecc_data(struct ras_core_context *ras_core,
                enum ras_block_id block, struct ras_ecc_count *ecc_count);

bool ras_core_gpu_in_reset(struct ras_core_context *ras_core);
bool ras_core_gpu_is_rma(struct ras_core_context *ras_core);
bool ras_core_gpu_is_vf(struct ras_core_context *ras_core);
bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data);
int ras_core_handle_fatal_error(struct ras_core_context *ras_core);

uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core);
const char *ras_core_get_ras_block_name(enum ras_block_id block_id);
int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core,
                        uint64_t timestamp, struct ras_time *tm);

int ras_core_set_status(struct ras_core_context *ras_core, bool enable);
bool ras_core_is_enabled(struct ras_core_context *ras_core);
uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core);
int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
        uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa);
bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core);
int ras_core_get_gpu_mem(struct ras_core_context *ras_core,
                enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
int ras_core_put_gpu_mem(struct ras_core_context *ras_core,
                enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
bool ras_core_check_safety_watermark(struct ras_core_context *ras_core);
int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core);
void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core);
void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core);
int ras_core_event_notify(struct ras_core_context *ras_core,
                enum ras_notify_event event_id, void *data);
int ras_core_get_device_system_info(struct ras_core_context *ras_core,
                struct device_system_info *dev_info);
int ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core,
                uint64_t soc_pa, uint64_t *page_pfn, uint32_t max_pages);
#endif
Linux