#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}
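/*
 * Assert that mmu_lock is held in the expected mode: read (shared) or write
 * (exclusive).  Returns true so the helper can be used inside iterator macro
 * conditions.
 */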
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
bool shared)
{
if (shared)
lockdep_assert_held_read(&kvm->mmu_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
return true;
}
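/*
 * Invalidate all valid roots, which also schedules them for zapping and drops
 * the TDP MMU's reference to each root, then zap them.  Wait for in-flight RCU
 * callbacks (which free shadow pages) to finish before the VM is torn down.
 */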
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
#ifdef CONFIG_KVM_PROVE_MMU
KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
#endif
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
rcu_barrier();
}
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
free_page((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
rcu_head);
tdp_mmu_free_sp(sp);
}
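/*
 * Drop a reference to @root.  When the last reference is put, remove the root
 * from the list of TDP MMU roots and free it after an RCU grace period.  The
 * final reference can only be put for an invalid root, as the TDP MMU itself
 * holds a reference to each valid root.
 */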
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
enum kvm_tdp_mmu_root_types types)
{
if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
return false;
if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
return false;
if (likely(!is_mirror_sp(root)))
return types & KVM_DIRECT_ROOTS;
return types & KVM_MIRROR_ROOTS;
}
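/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL) that matches @types, with a reference acquired on the returned root.
 * The reference to @prev_root, if any, is dropped.  Returns NULL when the end
 * of tdp_mmu_roots is reached.
 */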
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
enum kvm_tdp_mmu_root_types types)
{
struct kvm_mmu_page *next_root;
lockdep_assert_held(&kvm->mmu_lock);
rcu_read_lock();
if (prev_root)
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
&prev_root->link,
typeof(*prev_root), link);
else
next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
typeof(*next_root), link);
while (next_root) {
if (tdp_mmu_root_match(next_root, types) &&
kvm_tdp_mmu_get_root(next_root))
break;
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
&next_root->link, typeof(*next_root), link);
}
rcu_read_unlock();
if (prev_root)
kvm_tdp_mmu_put_root(kvm, prev_root);
return next_root;
}
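/*
 * Iterate over TDP MMU roots while holding a reference to each root, which
 * makes it safe to yield and drop mmu_lock within the loop body.  If the loop
 * is exited early, the caller must drop the reference to the current root.
 */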
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
_root = tdp_mmu_next_root(_kvm, _root, _types)) \
if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
_root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
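/*
 * Iterate over TDP MMU roots without acquiring references, and thus without
 * yielding.  The plain variant requires mmu_lock held for write, which keeps
 * the list stable; the _rcu variant walks the list under RCU protection and
 * must be used inside an RCU read-side critical section.
 */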
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
!tdp_mmu_root_match((_root), (_types)))) { \
} else
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
!tdp_mmu_root_match((_root), (_types))) { \
} else
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
return sp;
}
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
gfn_t gfn, union kvm_mmu_page_role role)
{
INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
sp->role = role;
sp->gfn = gfn;
sp->ptep = sptep;
sp->tdp_mmu_page = true;
trace_kvm_mmu_get_page(sp, true);
}
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
struct tdp_iter *iter)
{
struct kvm_mmu_page *parent_sp;
union kvm_mmu_page_role role;
parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
role = parent_sp->role;
role.level--;
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
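/*
 * Find a usable root matching the vCPU's current role, or allocate a new one.
 * A new root starts with a refcount of two: one reference for the vCPU and
 * one for the TDP MMU itself, which is held until the root is invalidated and
 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
 */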
void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
union kvm_mmu_page_role role = mmu->root_role;
int as_id = kvm_mmu_role_as_id(role);
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
if (mirror)
role.is_mirror = true;
read_lock(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
if (root->role.word == role.word)
goto out_read_unlock;
}
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
if (root->role.word == role.word &&
!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
goto out_spin_unlock;
}
root = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_sp(root, NULL, 0, role);
refcount_set(&root->tdp_mmu_root_count, 2);
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
out_spin_unlock:
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
read_unlock(&kvm->mmu_lock);
if (mirror) {
mmu->mirror_root_hpa = __pa(root->spt);
} else {
mmu->root.hpa = __pa(root->spt);
mmu->root.pgd = 0;
}
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
#ifdef CONFIG_KVM_PROVE_MMU
atomic64_inc(&kvm->arch.tdp_mmu_pages);
#endif
}
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, -1);
#ifdef CONFIG_KVM_PROVE_MMU
atomic64_dec(&kvm->arch.tdp_mmu_pages);
#endif
}
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
tdp_unaccount_mmu_page(kvm, sp);
if (!sp->nx_huge_page_disallowed)
return;
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
sp->nx_huge_page_disallowed = false;
untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
int level)
{
if (!is_last_spte(old_spte, level))
return;
lockdep_assert_held_write(&kvm->mmu_lock);
kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
}
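/*
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterate over its SPTEs, replacing each with FROZEN_SPTE and handling the
 * resulting changes, so that child page tables are freed.  If @shared is
 * true, other threads may be modifying SPTEs concurrently and atomic updates
 * are used.
 */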
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
tdp_mmu_unlink_sp(kvm, sp);
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
tdp_ptep_t sptep = pt + i;
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
u64 old_spte;
if (shared) {
for (;;) {
old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
if (!is_frozen_spte(old_spte))
break;
cpu_relax();
}
} else {
old_spte = kvm_tdp_mmu_read_spte(sptep);
if (!is_shadow_present_pte(old_spte))
continue;
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
FROZEN_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
old_spte, FROZEN_SPTE, level, shared);
if (is_mirror_sp(sp)) {
KVM_BUG_ON(shared, kvm);
remove_external_spte(kvm, gfn, old_spte, level);
}
}
if (is_mirror_sp(sp) &&
WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
sp->external_spt))) {
sp->external_spt = NULL;
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
{
if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
WARN_ON_ONCE(sp->role.level + 1 != level);
WARN_ON_ONCE(sp->gfn != gfn);
return sp->external_spt;
}
return NULL;
}
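/*
 * Install a present SPTE in a mirror page table.  The SPTE is first frozen to
 * lock out other updates while the external page table (e.g. for TDX) is
 * updated via the set_external_spte/link_external_spt hooks, then unfrozen to
 * the new value on success or restored to the old value on failure.
 */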
static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
gfn_t gfn, u64 old_spte,
u64 new_spte, int level)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool is_leaf = is_present && is_last_spte(new_spte, level);
int ret = 0;
KVM_BUG_ON(was_present, kvm);
lockdep_assert_held(&kvm->mmu_lock);
if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
return -EBUSY;
if (is_leaf) {
ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
} else {
void *external_spt = get_external_spt(gfn, new_spte, level);
KVM_BUG_ON(!external_spt, kvm);
ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
}
if (ret)
__kvm_tdp_mmu_write_spte(sptep, old_spte);
else
__kvm_tdp_mmu_write_spte(sptep, new_spte);
return ret;
}
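/*
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 */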
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
WARN_ON_ONCE(level < PG_LEVEL_4K);
WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
if (was_leaf && is_leaf && pfn_changed) {
pr_err("Invalid SPTE change: cannot replace a present leaf\n"
"SPTE with another present leaf SPTE mapping a\n"
"different PFN!\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
BUG();
}
if (old_spte == new_spte)
return;
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
if (is_leaf)
check_spte_writable_invariants(new_spte);
if (!was_present && !is_present) {
if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
!is_mmio_spte(kvm, new_spte) &&
!is_frozen_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
"are MMIO SPTEs, or the new SPTE is\n"
"a temporary frozen SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
}
if (is_leaf != was_leaf)
kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
if (was_present && !was_leaf &&
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
int ret;
if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
return -EBUSY;
ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
iter->old_spte, new_spte, iter->level);
if (ret)
return ret;
} else {
u64 *sptep = rcu_dereference(iter->sptep);
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;
}
return 0;
}
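/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically via cmpxchg and
 * handle the associated bookkeeping.  Requires mmu_lock held for read.
 *
 * Returns 0 if the SPTE was set, or -EBUSY if the SPTE could not be set
 * because it was modified by a different task; in that case iter->old_spte is
 * refreshed to the current value of the SPTE.
 */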
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
int ret;
lockdep_assert_held_read(&kvm->mmu_lock);
ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
if (ret)
return ret;
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
new_spte, iter->level, true);
return 0;
}
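/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE under mmu_lock held for write and
 * handle the associated bookkeeping.  Returns the old SPTE value, which may
 * differ from @old_spte if the SPTE has volatile bits that were modified by a
 * different task.
 */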
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
if (is_mirror_sptep(sptep)) {
KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
remove_external_spte(kvm, gfn, old_spte, level);
}
return old_spte;
}
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte)
{
WARN_ON_ONCE(iter->yielded);
iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
iter->old_spte, new_spte,
iter->gfn, iter->level);
}
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
if (!is_shadow_present_pte(_iter.old_spte) || \
!is_last_spte(_iter.old_spte, _iter.level)) \
continue; \
else
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
return false;
return iter->next_last_level_gfn != iter->yielded_gfn;
}
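/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler, but only if forward progress has been made since the last
 * yield.  If @flush is set, a remote TLB flush is performed before yielding.
 * Returns true (with iter->yielded set) if this function yielded; the caller
 * must continue to the next loop iteration, which restarts the walk.
 */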
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter,
bool flush, bool shared)
{
KVM_MMU_WARN_ON(iter->yielded);
if (!tdp_mmu_iter_need_resched(kvm, iter))
return false;
if (flush)
kvm_flush_remote_tlbs(kvm);
rcu_read_unlock();
if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
cond_resched_rwlock_write(&kvm->mmu_lock);
rcu_read_lock();
WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
iter->yielded = true;
return true;
}
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
return kvm_mmu_max_gfn() + 1;
}
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared, int zap_level)
{
struct tdp_iter iter;
for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
if (!is_shadow_present_pte(iter.old_spte))
continue;
if (iter.level > zap_level)
continue;
if (!shared)
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
goto retry;
}
}
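/*
 * Zap an entire root in multiple passes, from the finest granularity (4KiB)
 * up to the root level, so that need_resched() and mmu_lock contention are
 * honored more promptly.  When KVM_PROVE_MMU is enabled, the 4KiB and 2MiB
 * passes are skipped so that zapping of fully-populated 1GiB regions gets
 * exercised.
 */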
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
rcu_read_lock();
if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
}
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
rcu_read_unlock();
}
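/*
 * Zap the parent SPTE pointing at @sp, but only if it still points at @sp's
 * page table; with mmu_lock held for read, another task may have already
 * replaced or frozen the SPTE.  Returns true if the SPTE was zapped.
 */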
bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
struct tdp_iter iter = {
.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
.sptep = sp->ptep,
.level = sp->role.level + 1,
.gfn = sp->gfn,
.as_id = kvm_mmu_page_as_id(sp),
};
lockdep_assert_held_read(&kvm->mmu_lock);
if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
return false;
if (WARN_ON_ONCE(!sp->ptep))
return false;
if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
return false;
if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
return false;
}
return true;
}
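/*
 * Zap leaf SPTEs in the GFN range [start, end), yielding if allowed.  Returns
 * true if the caller needs to flush TLBs; zapping SPTEs in an invalid root
 * never requires a flush.
 */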
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield, bool flush)
{
struct tdp_iter iter;
end = min(end, tdp_mmu_max_gfn_exclusive());
lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
continue;
}
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
if (!root->role.invalid)
flush = true;
}
rcu_read_unlock();
return flush;
}
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
tdp_mmu_zap_root(kvm, root, false);
}
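/*
 * Zap all roots that were previously scheduled for zapping by
 * kvm_tdp_mmu_invalidate_roots(), and drop the TDP MMU's reference to each
 * such root.
 */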
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
struct kvm_mmu_page *root;
if (shared)
read_lock(&kvm->mmu_lock);
else
write_lock(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root) {
if (!root->tdp_mmu_scheduled_root_to_zap)
continue;
root->tdp_mmu_scheduled_root_to_zap = false;
KVM_BUG_ON(!root->role.invalid, kvm);
tdp_mmu_zap_root(kvm, root, shared);
kvm_tdp_mmu_put_root(kvm, root);
}
if (shared)
read_unlock(&kvm->mmu_lock);
else
write_unlock(&kvm->mmu_lock);
}
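/*
 * Mark all roots of the requested types as invalid and schedule them for
 * zapping; the actual zap is deferred to kvm_tdp_mmu_zap_invalidated_roots().
 * Marking a root invalid forces vCPUs to load a new root before re-entering
 * the guest.
 */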
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
enum kvm_tdp_mmu_root_types root_types)
{
struct kvm_mmu_page *root;
if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
root_types &= ~KVM_INVALID_ROOTS;
if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
refcount_read(&kvm->users_count) && kvm->created_vcpus)
lockdep_assert_held_write(&kvm->mmu_lock);
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
if (!tdp_mmu_root_match(root, root_types))
continue;
if (!root->role.invalid) {
root->tdp_mmu_scheduled_root_to_zap = true;
root->role.invalid = true;
}
}
}
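/*
 * Install a new leaf SPTE (or an MMIO SPTE if there is no memslot) for the
 * faulting GFN at the target level, and return a RET_PF_* value describing
 * how the fault was handled.
 */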
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault,
struct tdp_iter *iter)
{
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
u64 new_spte;
int ret = RET_PF_FIXED;
bool wrprot = false;
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
return RET_PF_RETRY;
if (is_shadow_present_pte(iter->old_spte) &&
(fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
is_last_spte(iter->old_spte, iter->level)) {
WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
return RET_PF_SPURIOUS;
}
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
fault->pfn, iter->old_spte, fault->prefetch,
false, fault->map_writable, &new_spte);
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
(!is_last_spte(iter->old_spte, iter->level) ||
WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
if (wrprot && fault->write)
ret = RET_PF_WRITE_PROTECTED;
if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
ret = RET_PF_EMULATE;
} else {
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
}
return ret;
}
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
int ret = 0;
if (shared) {
ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
if (ret)
return ret;
} else {
tdp_mmu_iter_set_spte(kvm, iter, spte);
}
tdp_account_mmu_page(kvm, sp);
return 0;
}
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared);
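/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */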
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
int ret = RET_PF_RETRY;
KVM_MMU_WARN_ON(!root || root->role.invalid);
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
rcu_read_lock();
for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
if (is_frozen_spte(iter.old_spte))
goto retry;
if (iter.level == fault->goal_level)
goto map_target_level;
if (is_shadow_present_pte(iter.old_spte) &&
!is_large_pte(iter.old_spte))
continue;
sp = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_child_sp(sp, &iter);
if (is_mirror_sp(sp))
kvm_mmu_alloc_external_spt(vcpu, sp);
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
if (is_shadow_present_pte(iter.old_spte)) {
KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
} else {
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
}
if (r) {
tdp_mmu_free_sp(sp);
goto retry;
}
if (fault->huge_page_disallowed &&
fault->req_level >= iter.level) {
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
if (sp->nx_huge_page_disallowed)
track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
}
WARN_ON_ONCE(iter.level == fault->goal_level);
goto retry;
map_target_level:
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
retry:
rcu_read_unlock();
return ret;
}
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
range->may_block, flush);
return flush;
}
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
shadow_accessed_mask);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
new_spte = mark_spte_for_access_track(iter->old_spte);
if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
return;
}
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
iter->old_spte, new_spte);
}
static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
bool test_only)
{
enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;
types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
WARN_ON(types & ~KVM_VALID_ROOTS);
guard(rcu)();
for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
if (test_only)
return true;
ret = true;
kvm_tdp_mmu_age_spte(kvm, &iter);
}
}
return ret;
}
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}
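/*
 * Remove write access from all present leaf SPTEs at or above @min_level that
 * map GFNs in [start, end).  Returns true if an SPTE has been changed and the
 * TLBs need to be flushed.
 */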
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, int min_level)
{
struct tdp_iter iter;
u64 new_spte;
bool spte_set = false;
rcu_read_lock();
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level) ||
!(iter.old_spte & PT_WRITABLE_MASK))
continue;
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
goto retry;
spte_set = true;
}
rcu_read_unlock();
return spte_set;
}
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot, int min_level)
{
struct kvm_mmu_page *root;
bool spte_set = false;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
return spte_set;
}
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
struct kvm_mmu_page *sp;
sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
if (!sp)
return NULL;
sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sp->spt) {
kmem_cache_free(mmu_page_header_cache, sp);
return NULL;
}
return sp;
}
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
const u64 huge_spte = iter->old_spte;
const int level = iter->level;
int ret, i;
for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
if (ret)
goto out;
kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
out:
trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
return ret;
}
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
struct kvm_mmu_page *root,
gfn_t start, gfn_t end,
int target_level, bool shared)
{
struct kvm_mmu_page *sp = NULL;
struct tdp_iter iter;
rcu_read_lock();
for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
continue;
if (!sp) {
rcu_read_unlock();
if (shared)
read_unlock(&kvm->mmu_lock);
else
write_unlock(&kvm->mmu_lock);
sp = tdp_mmu_alloc_sp_for_split();
if (shared)
read_lock(&kvm->mmu_lock);
else
write_lock(&kvm->mmu_lock);
if (!sp) {
trace_kvm_mmu_split_huge_page(iter.gfn,
iter.old_spte,
iter.level, -ENOMEM);
return -ENOMEM;
}
rcu_read_lock();
iter.yielded = true;
continue;
}
tdp_mmu_init_child_sp(sp, &iter);
if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
goto retry;
sp = NULL;
}
rcu_read_unlock();
if (sp)
tdp_mmu_free_sp(sp);
return 0;
}
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot,
gfn_t start, gfn_t end,
int target_level, bool shared)
{
struct kvm_mmu_page *root;
int r = 0;
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
if (r) {
kvm_tdp_mmu_put_root(kvm, root);
break;
}
}
}
static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
{
return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
}
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
PT_WRITABLE_MASK : shadow_dirty_mask;
struct tdp_iter iter;
rcu_read_lock();
tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (!(iter.old_spte & dbit))
continue;
if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
goto retry;
}
rcu_read_unlock();
}
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
}
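/*
 * Clear the dirty status of the 4KiB SPTEs selected by @mask, which covers
 * the range of pages starting at @gfn.  If @wrprot is true, or if A/D bits
 * are not in use, write-protect the SPTEs instead of clearing the Dirty bit.
 */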
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
PT_WRITABLE_MASK : shadow_dirty_mask;
struct tdp_iter iter;
lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
break;
KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (iter.level > PG_LEVEL_4K ||
!(mask & (1UL << (iter.gfn - gfn))))
continue;
mask &= ~(1UL << (iter.gfn - gfn));
if (!(iter.old_spte & dbit))
continue;
iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
iter.old_spte, dbit,
iter.level);
trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
iter.old_spte,
iter.old_spte & ~dbit);
}
rcu_read_unlock();
}
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot)
{
struct kvm_mmu_page *root;
for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
struct tdp_iter *parent,
u64 *huge_spte)
{
struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
gfn_t start = parent->gfn;
gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
struct tdp_iter iter;
tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
if (tdp_mmu_iter_need_resched(kvm, parent))
return -EAGAIN;
*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
return 0;
}
return -ENOENT;
}
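/*
 * Recover huge mappings in the given memslot range: replace non-leaf SPTEs
 * whose region can now be mapped at a higher level (e.g. after dirty logging
 * has been disabled) with a single huge SPTE.
 */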
static void recover_huge_pages_range(struct kvm *kvm,
struct kvm_mmu_page *root,
const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
int max_mapping_level;
bool flush = false;
u64 huge_spte;
int r;
if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
return;
rcu_read_lock();
for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
continue;
}
if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
!is_shadow_present_pte(iter.old_spte))
continue;
if (is_last_spte(iter.old_spte, iter.level))
continue;
if (iter.gfn < start || iter.gfn >= end)
continue;
max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
if (max_mapping_level < iter.level)
continue;
r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
if (r == -EAGAIN)
goto retry;
else if (r)
continue;
if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
goto retry;
flush = true;
}
if (flush)
kvm_flush_remote_tlbs_memslot(kvm, slot);
rcu_read_unlock();
}
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
recover_huge_pages_range(kvm, root, slot);
}
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, int min_level)
{
struct tdp_iter iter;
u64 new_spte;
bool spte_set = false;
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
rcu_read_lock();
for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
new_spte = iter.old_spte &
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
if (new_spte == iter.old_spte)
break;
tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
rcu_read_unlock();
return spte_set;
}
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
int min_level)
{
struct kvm_mmu_page *root;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
return spte_set;
}
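/*
 * Record the SPTEs encountered while walking to @addr, for use by the caller
 * (e.g. get_mmio_spte()).  Returns the level of the lowest SPTE found, or -1
 * if no SPTE was found.  The caller is responsible for RCU protection, e.g.
 * via kvm_tdp_mmu_walk_lockless_begin().
 */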
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level)
{
struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
*root_level = vcpu->arch.mmu->root_role.level;
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
return leaf;
}
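/*
 * Return the last (lowest level) SPTE pointer in the walk to @gfn, for use by
 * the fast page fault handler; only direct (non-mirror) roots are walked.
 * The pointer is rcu_dereference()d here because the common fast page fault
 * code does not carry the __rcu annotation; the caller must be in an RCU
 * read-side critical section.
 */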
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
struct tdp_iter iter;
tdp_ptep_t sptep = NULL;
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
return rcu_dereference(sptep);
}