root/drivers/iommu/generic_pt/fmt/amdv1.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * AMD IOMMU v1 page table
 *
 * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
 * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
 *
 * Note the level numbering here matches the core code, so level 0 is the same
 * as mode 1.
 *
 */
#ifndef __GENERIC_PT_FMT_AMDV1_H
#define __GENERIC_PT_FMT_AMDV1_H

#include "defs_amdv1.h"
#include "../pt_defs.h"

#include <asm/page.h>
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/mem_encrypt.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/string.h>

enum {
        PT_ITEM_WORD_SIZE = sizeof(u64),
        /*
         * The IOMMUFD selftest uses the AMDv1 format with some alterations It
         * uses a 2k page size to test cases where the CPU page size is not the
         * same.
         */
#ifdef AMDV1_IOMMUFD_SELFTEST
        PT_MAX_VA_ADDRESS_LG2 = 56,
        PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
        PT_MAX_TOP_LEVEL = 4,
        PT_GRANULE_LG2SZ = 11,
#else
        PT_MAX_VA_ADDRESS_LG2 = 64,
        PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
        PT_MAX_TOP_LEVEL = 5,
        PT_GRANULE_LG2SZ = 12,
#endif
        PT_TABLEMEM_LG2SZ = 12,

        /* The DTE only has these bits for the top phyiscal address */
        PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};

/* PTE bits */
enum {
        AMDV1PT_FMT_PR = BIT(0),
        AMDV1PT_FMT_D = BIT(6),
        AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
        AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
        AMDV1PT_FMT_FC = BIT_ULL(60),
        AMDV1PT_FMT_IR = BIT_ULL(61),
        AMDV1PT_FMT_IW = BIT_ULL(62),
};

/*
 * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
 * these defines to avoid it.
 */
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7

static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
{
        u64 entry = pts->entry;

        if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
                entry = __sme_clr(entry);
        return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_table_pa amdv1pt_table_pa

/* Returns the oa for the start of the contiguous entry */
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
        u64 entry = pts->entry;
        pt_oaddr_t oa;

        if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
                entry = __sme_clr(entry);
        oa = FIELD_GET(AMDV1PT_FMT_OA, entry);

        if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
                unsigned int sz_bits = oaffz(oa);

                oa = oalog2_set_mod(oa, 0, sz_bits);
        } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
                              AMDV1PT_FMT_NL_DEFAULT))
                return 0;
        return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa

static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
        /*
         * Table 15: Page Table Level Parameters
         * The top most level cannot have translation entries
         */
        return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf

/* Body in pt_fmt_defaults.h */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);

static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
        u32 code;

        if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
            AMDV1PT_FMT_NL_DEFAULT)
                return ilog2(1);

        PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
                   AMDV1PT_FMT_NL_SIZE);

        /*
         * The contiguous size is encoded in the length of a string of 1's in
         * the low bits of the OA. Reverse the equation:
         *  code = log2_to_int(num_contig_lg2 + item_lg2sz -
         *              PT_GRANULE_LG2SZ - 1) - 1
         * Which can be expressed as:
         *  num_contig_lg2 = oalog2_ffz(code) + 1 -
         *              item_lg2sz - PT_GRANULE_LG2SZ
         *
         * Assume the bit layout is correct and remove the masking. Reorganize
         * the equation to move all the arithmetic before the ffz.
         */
        code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
                              pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
        return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2

static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
        /*
         * Top entry covers bits [63:57] only, this is handled through
         * max_vasz_lg2.
         */
        if (PT_WARN_ON(pts->level == 5))
                return 7;
        return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2

static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
        unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

        if (!amdv1pt_can_have_leaf(pts))
                return 0;

        /*
         * Table 14: Example Page Size Encodings
         * Address bits 51:32 can be used to encode page sizes greater than 4
         * Gbytes. Address bits 63:52 are zero-extended.
         *
         * 512GB Pages are not supported due to a hardware bug.
         * Otherwise every power of two size is supported.
         */
        return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
                           isz_lg2) & ~SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes

static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
{
        const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
        unsigned int next_level;
        u64 entry;

        pts->entry = entry = READ_ONCE(*tablep);
        if (!(entry & AMDV1PT_FMT_PR))
                return PT_ENTRY_EMPTY;

        next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
        if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
            next_level == AMDV1PT_FMT_NL_SIZE)
                return PT_ENTRY_OA;
        return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw amdv1pt_load_entry_raw

static __always_inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
                           unsigned int oasz_lg2,
                           const struct pt_write_attrs *attrs)
{
        unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
        u64 *tablep = pt_cur_table(pts, u64) + pts->index;
        u64 entry;

        if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
                return;

        entry = AMDV1PT_FMT_PR |
                FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
                attrs->descriptor_bits;

        if (oasz_lg2 == isz_lg2) {
                entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
                                    AMDV1PT_FMT_NL_DEFAULT);
                WRITE_ONCE(*tablep, entry);
        } else {
                unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
                u64 *end = tablep + log2_to_int(num_contig_lg2);

                entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
                                    AMDV1PT_FMT_NL_SIZE) |
                         FIELD_PREP(AMDV1PT_FMT_OA,
                                    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
                                                  1) -
                                            1);

                /* See amdv1pt_clear_entries() */
                if (num_contig_lg2 <= ilog2(32)) {
                        for (; tablep != end; tablep++)
                                WRITE_ONCE(*tablep, entry);
                } else {
                        memset64(tablep, entry, log2_to_int(num_contig_lg2));
                }
        }
        pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry

static inline bool amdv1pt_install_table(struct pt_state *pts,
                                         pt_oaddr_t table_pa,
                                         const struct pt_write_attrs *attrs)
{
        u64 entry;

        /*
         * IR and IW are ANDed from the table levels along with the PTE. We
         * always control permissions from the PTE, so always set IR and IW for
         * tables.
         */
        entry = AMDV1PT_FMT_PR |
                FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
                FIELD_PREP(AMDV1PT_FMT_OA,
                           log2_div(table_pa, PT_GRANULE_LG2SZ)) |
                AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
        if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
                entry = __sme_set(entry);
        return pt_table_install64(pts, entry);
}
#define pt_install_table amdv1pt_install_table

static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
                                           struct pt_write_attrs *attrs)
{
        attrs->descriptor_bits =
                pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
}
#define pt_attr_from_entry amdv1pt_attr_from_entry

static inline void amdv1pt_clear_entries(struct pt_state *pts,
                                         unsigned int num_contig_lg2)
{
        u64 *tablep = pt_cur_table(pts, u64) + pts->index;
        u64 *end = tablep + log2_to_int(num_contig_lg2);

        /*
         * gcc generates rep stos for the io-pgtable code, and this difference
         * can show in microbenchmarks with larger contiguous page sizes.
         * rep is slower for small cases.
         */
        if (num_contig_lg2 <= ilog2(32)) {
                for (; tablep != end; tablep++)
                        WRITE_ONCE(*tablep, 0);
        } else {
                memset64(tablep, 0, log2_to_int(num_contig_lg2));
        }
}
#define pt_clear_entries amdv1pt_clear_entries

static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
{
        unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
        u64 *tablep = pt_cur_table(pts, u64) +
                      log2_set_mod(pts->index, 0, num_contig_lg2);
        u64 *end = tablep + log2_to_int(num_contig_lg2);

        for (; tablep != end; tablep++)
                if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
                        return true;
        return false;
}
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty

static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
{
        unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
        u64 *tablep = pt_cur_table(pts, u64) +
                      log2_set_mod(pts->index, 0, num_contig_lg2);
        u64 *end = tablep + log2_to_int(num_contig_lg2);

        for (; tablep != end; tablep++)
                WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
}
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean

static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
{
        u64 *tablep = pt_cur_table(pts, u64) + pts->index;
        u64 new = pts->entry | AMDV1PT_FMT_D;

        return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_amdv1

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
        return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
                        ->amdpt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
        return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
}

static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
                                         struct pt_write_attrs *attrs,
                                         unsigned int iommu_prot)
{
        u64 pte = 0;

        if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
                pte |= AMDV1PT_FMT_FC;
        if (iommu_prot & IOMMU_READ)
                pte |= AMDV1PT_FMT_IR;
        if (iommu_prot & IOMMU_WRITE)
                pte |= AMDV1PT_FMT_IW;

        /*
         * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
         * control this. For now if the tables use sme_set then so do the ptes.
         */
        if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES) &&
            !(iommu_prot & IOMMU_MMIO))
                pte = __sme_set(pte);

        attrs->descriptor_bits = pte;
        return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot

static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
                                         const struct pt_iommu_amdv1_cfg *cfg)
{
        struct pt_amdv1 *table = &iommu_table->amdpt;
        unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;

        if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
                return -EINVAL;

        if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
            cfg->starting_level != PT_MAX_TOP_LEVEL)
                max_vasz_lg2 = PT_GRANULE_LG2SZ +
                               (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
                                       (cfg->starting_level + 1);

        table->common.max_vasz_lg2 =
                min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
        table->common.max_oasz_lg2 =
                min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
        pt_top_set_level(&table->common, cfg->starting_level);
        return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init

#ifndef PT_FMT_VARIANT
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
                          const struct pt_range *top_range,
                          struct pt_iommu_amdv1_hw_info *info)
{
        info->host_pt_root = virt_to_phys(top_range->top_table);
        PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
        info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
        /* Matches what io_pgtable does */
        [0] = { .starting_level = 2 },
};
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = 0 };
#endif

#endif