root/arch/s390/mm/gmap_helpers.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  Helper functions for KVM guest address space mapping code
 *
 *    Copyright IBM Corp. 2007, 2025
 */

#include <linux/export.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/pagewalk.h>
#include <linux/ksm.h>
#include <asm/gmap_helpers.h>

/**
 * ptep_zap_softleaf_entry() - discard a software leaf entry.
 * @mm: the mm whose counters account for the entry
 * @entry: the software leaf entry that needs to be zapped
 *
 * Fixes up the mm counters for the given software leaf entry and releases
 * it. If the leaf entry was an actual swap entry (and not a migration
 * entry, for example), the actual swapped page is also discarded from swap.
 */
static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
{
        if (softleaf_is_swap(entry)) {
                /* Real swap entries are accounted in MM_SWAPENTS. */
                dec_mm_counter(mm, MM_SWAPENTS);
        } else if (softleaf_is_migration(entry)) {
                /*
                 * Migration entries are accounted in the counter that
                 * belongs to the folio they refer to.
                 */
                struct folio *folio = softleaf_to_folio(entry);

                dec_mm_counter(mm, mm_counter(folio));
        }

        /* Drop one reference to the entry, whatever its kind. */
        swap_put_entries_direct(entry, 1);
}

/**
 * gmap_helper_zap_one_page() - discard a page if it was swapped.
 * @mm: the mm
 * @vmaddr: the userspace virtual address that needs to be discarded
 *
 * If the pte for the given address is a swap pte, release the entry it
 * holds and clear the pte. Nothing is done if no VMA contains @vmaddr, if
 * the VMA is a hugetlb mapping, or if the pte cannot be looked up.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
        struct vm_area_struct *vma;
        spinlock_t *ptl;
        pte_t *ptep;

        mmap_assert_locked(mm);

        /* Look up the VMA that contains the address; skip hugetlb VMAs. */
        vma = vma_lookup(mm, vmaddr);
        if (!vma)
                return;
        if (is_vm_hugetlb_page(vma))
                return;

        /* Map the pte and take the page table lock that protects it. */
        ptep = get_locked_pte(mm, vmaddr, &ptl);
        if (unlikely(!ptep))
                return;

        /* Only swap ptes are of interest, everything else is left alone. */
        if (!pte_swap(*ptep))
                goto out_unlock;

        ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
        pte_clear(mm, vmaddr, ptep);

out_unlock:
        pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);

/**
 * gmap_helper_discard() - discard user pages in the given range
 * @mm: the mm
 * @vmaddr: starting userspace address
 * @end: end address (first address outside the range)
 *
 * All userspace pages in the range [@vmaddr, @end) are discarded and
 * unmapped. Parts of the range that are not covered by any VMA are skipped,
 * and VMAs backed by hugetlb are left untouched.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
{
        struct vm_area_struct *vma;
        unsigned long start, stop;

        mmap_assert_locked(mm);

        while (vmaddr < end) {
                vma = find_vma_intersection(mm, vmaddr, end);
                if (!vma)
                        return;
                if (!is_vm_hugetlb_page(vma)) {
                        /*
                         * The VMA that was found only has to intersect
                         * [vmaddr, end): it may begin above vmaddr if
                         * vmaddr lies in an unmapped hole. Clamp both ends
                         * so that the range handed to
                         * zap_page_range_single() lies within this VMA.
                         */
                        start = max(vmaddr, vma->vm_start);
                        stop = min(end, vma->vm_end);
                        zap_page_range_single(vma, start, stop - start, NULL);
                }
                /* Continue right after this VMA. */
                vmaddr = vma->vm_end;
        }
}
EXPORT_SYMBOL_GPL(gmap_helper_discard);

/**
 * gmap_helper_try_set_pte_unused() - mark a pte entry as unused
 * @mm: the mm
 * @vmaddr: the userspace address whose pte is to be marked
 *
 * Mark the pte corresponding to the given address as unused. This will cause
 * core mm code to just drop this page instead of swapping it.
 *
 * This is a best-effort operation: the function returns without touching
 * anything if one of the page table levels for @vmaddr is empty or not
 * present, if the pud or pmd is a leaf entry, if the pte cannot be mapped,
 * if the page table lock is contended, or if the pmd changed in the
 * meantime.
 *
 * This function needs to be called with interrupts disabled (for example
 * while holding a spinlock), or while holding the mmap lock. Normally this
 * function is called as a result of an unmap operation, and thus KVM common
 * code will already hold kvm->mmu_lock in write mode.
 *
 * Context: Needs to be called while holding the mmap lock or with interrupts
 *          disabled.
 */
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
{
        pmd_t *pmdp, pmd, pmdval;
        pud_t *pudp, pud;
        p4d_t *p4dp, p4d;
        pgd_t *pgdp, pgd;
        spinlock_t *ptl;        /* Lock for the host (userspace) page table */
        pte_t *ptep;

        /*
         * Walk the page table top-down without taking any lock. Each level
         * is read once into a local copy and the walk stops as soon as a
         * level is empty or not present.
         */
        pgdp = pgd_offset(mm, vmaddr);
        pgd = pgdp_get(pgdp);
        if (pgd_none(pgd) || !pgd_present(pgd))
                return;

        p4dp = p4d_offset(pgdp, vmaddr);
        p4d = p4dp_get(p4dp);
        if (p4d_none(p4d) || !p4d_present(p4d))
                return;

        /* A leaf pud has no pte below it, so there is nothing to mark. */
        pudp = pud_offset(p4dp, vmaddr);
        pud = pudp_get(pudp);
        if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
                return;

        /* Likewise for a leaf pmd. */
        pmdp = pmd_offset(pudp, vmaddr);
        pmd = pmdp_get_lockless(pmdp);
        if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
                return;

        /*
         * Map the pte without taking its lock. The pmd value that was used
         * for the mapping is returned in pmdval and the lock in ptl.
         */
        ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
        if (!ptep)
                return;

        /*
         * Several paths exist that take the ptl lock and then call the
         * mmu_notifier, which takes the mmu_lock. The unmap path, instead,
         * takes the mmu_lock in write mode first, and then potentially
         * calls this function, which takes the ptl lock. This can lead to a
         * deadlock.
         * The unused page mechanism is only an optimization: if the
         * _PAGE_UNUSED bit is not set, the unused page is swapped as normal
         * instead of being discarded.
         * If the lock is contended, the bit is not set and the deadlock is
         * avoided.
         */
        if (spin_trylock(ptl)) {
                /*
                 * Make sure the pte we are touching is still the correct
                 * one. In theory this check should not be needed, but
                 * better safe than sorry.
                 * Disabling interrupts or holding the mmap lock is enough to
                 * guarantee that no concurrent updates to the page tables
                 * are possible.
                 */
                if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
                        __atomic64_or(_PAGE_UNUSED, (long *)ptep);
                spin_unlock(ptl);
        }

        /* Undo the mapping set up by pte_offset_map_rw_nolock(). */
        pte_unmap(ptep);
}
EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);

/*
 * Page walk pte callback that looks for a mapped zeropage.
 *
 * walk->private points to an unsigned long that receives the address of
 * the zeropage once one is found.
 *
 * Returns 0 if the pte does not map a zeropage, 1 if it maps a zeropage in
 * a COW mapping (the address is then stored through walk->private), and
 * -EFAULT if it maps a zeropage in a mapping that is not a COW mapping.
 */
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
                                   unsigned long end, struct mm_walk *walk)
{
        unsigned long *zeropage_addr = walk->private;

        /* Not a zeropage: nothing to report. */
        if (!is_zero_pfn(pte_pfn(*pte)))
                return 0;

        /*
         * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
         * right thing and likely don't care: FAULT_FLAG_UNSHARE
         * currently only works in COW mappings, which is also where
         * mm_forbids_zeropage() is checked.
         */
        if (!is_cow_mapping(walk->vma->vm_flags))
                return -EFAULT;

        /* Report the zeropage. */
        *zeropage_addr = addr;
        return 1;
}

/*
 * Page walk operations used to search a VMA for a mapped zeropage; only a
 * pte-level callback is provided.
 */
static const struct mm_walk_ops find_zeropage_ops = {
        .walk_lock      = PGWALK_WRLOCK,
        .pte_entry      = find_zeropage_pte_entry,
};

/**
 * __gmap_helper_unshare_zeropages() - unshare all shared zeropages
 * @mm: the mm whose zero pages are to be unshared
 *
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing was
 * set to 0, to avoid future mappings of shared zeropages.
 *
 * Core mm guarantees s390 the following: even if mm were to remove a page
 * table concurrently, so that pte_offset_map_lock() called from
 * walk_page_range_vma() would fail, it will never insert a page table
 * containing empty zero pages once mm_forbids_zeropage(mm) holds, i.e. once
 * mm->context.allow_cow_sharing is set to 0.
 *
 * Return: 0 on success, -ENOMEM if handle_mm_fault() reported
 * VM_FAULT_OOM, or the negative value returned by the page walk (for
 * example -EFAULT from find_zeropage_pte_entry()).
 */
static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);
        unsigned long addr;
        vm_fault_t fault;
        int rc;

        for_each_vma(vmi, vma) {
                /*
                 * We could only look at COW mappings, but it's more future
                 * proof to catch unexpected zeropages in other mappings and
                 * fail.
                 */
                if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
                        continue;

                /*
                 * Search the VMA for zeropages, unsharing one per
                 * iteration, until the walk reaches the end of the VMA
                 * without finding any.
                 */
                addr = vma->vm_start;
                for (;;) {
                        rc = walk_page_range_vma(vma, addr, vma->vm_end,
                                                 &find_zeropage_ops, &addr);
                        if (rc < 0)
                                return rc;
                        if (!rc)
                                break;

                        /* addr was updated by find_zeropage_pte_entry() */
                        fault = handle_mm_fault(vma, addr,
                                                FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
                                                NULL);
                        if (fault & VM_FAULT_OOM)
                                return -ENOMEM;

                        /*
                         * See break_ksm(): even after handle_mm_fault()
                         * returned 0, we must start the lookup from the
                         * current address, because handle_mm_fault() may
                         * back out if there's any difficulty.
                         *
                         * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are
                         * unexpected but maybe they could trigger in the
                         * future on concurrent truncation. In that case,
                         * the shared zeropage would be gone and we can
                         * simply retry and make progress.
                         */
                        cond_resched();
                }
        }

        return 0;
}

/**
 * gmap_helper_disable_cow_sharing() - disable all COW sharing
 *
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 *
 * Context: needs to be called while holding the mmap lock of current->mm in
 *          write mode.
 *
 * Return: 0 on success or if COW sharing was already disabled, otherwise
 * the error code of the step that failed; in that case
 * mm->context.allow_cow_sharing is set back to 1.
 */
int gmap_helper_disable_cow_sharing(void)
{
        struct mm_struct *mm = current->mm;
        int rc;

        mmap_assert_write_locked(mm);

        /* Nothing to do if COW sharing is already disabled. */
        if (!mm->context.allow_cow_sharing)
                return 0;

        /* Clear the flag first, so that no new shared zeropages show up. */
        mm->context.allow_cow_sharing = 0;

        /* Replace all shared zeropages by anonymous pages. */
        rc = __gmap_helper_unshare_zeropages(mm);
        if (rc)
                goto out_reenable;

        /*
         * Make sure to disable KSM (if enabled for the whole process or
         * individual VMAs). Note that nothing currently hinders user space
         * from re-enabling it.
         */
        rc = ksm_disable(mm);
        if (rc)
                goto out_reenable;

        return 0;

out_reenable:
        /* One of the steps failed: allow COW sharing again. */
        mm->context.allow_cow_sharing = 1;
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);