#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>
#include <linux/oom.h>
#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
#include "internal.h"
#include "swap.h"
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
{
struct anon_vma *anon_vma;
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
anon_vma->num_children = 0;
anon_vma->num_active_vmas = 0;
anon_vma->parent = anon_vma;
anon_vma->root = anon_vma;
}
return anon_vma;
}
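/*
 * The anon_vma cache is SLAB_TYPESAFE_BY_RCU, so a lockless reader that found
 * this anon_vma under rcu_read_lock() may still be serialized on the root
 * rwsem.  If the rwsem is locked, take and release it once so any such reader
 * is done with the object before it is freed.
 */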
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
VM_BUG_ON(atomic_read(&anon_vma->refcount));
might_sleep();
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
anon_vma_unlock_write(anon_vma);
}
kmem_cache_free(anon_vma_cachep, anon_vma);
}
static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
static void anon_vma_chain_assign(struct vm_area_struct *vma,
struct anon_vma_chain *avc,
struct anon_vma *anon_vma)
{
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
}
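/*
 * Attach an anon_vma to a VMA on its first anonymous fault.  Prefer reusing a
 * mergeable neighbour's anon_vma over allocating a fresh one; rechecking
 * vma->anon_vma under mm->page_table_lock resolves races with a concurrent
 * fault doing the same thing.
 */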
int __anon_vma_prepare(struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
mmap_assert_locked(mm);
might_sleep();
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
anon_vma = find_mergeable_anon_vma(vma);
allocated = NULL;
if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
anon_vma->num_children++;
allocated = anon_vma;
}
anon_vma_lock_write(anon_vma);
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_assign(vma, avc, anon_vma);
anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
}
spin_unlock(&mm->page_table_lock);
anon_vma_unlock_write(anon_vma);
if (unlikely(allocated))
put_anon_vma(allocated);
if (unlikely(avc))
anon_vma_chain_free(avc);
return 0;
out_enomem_free_avc:
anon_vma_chain_free(avc);
out_enomem:
return -ENOMEM;
}
static void check_anon_vma_clone(struct vm_area_struct *dst,
struct vm_area_struct *src,
enum vma_operation operation)
{
mmap_assert_write_locked(src->vm_mm);
VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma &&
!dst->anon_vma);
VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
!list_is_singular(&src->anon_vma_chain));
#ifdef CONFIG_PER_VMA_LOCK
VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
vma_is_attached(dst));
#endif
}
static void maybe_reuse_anon_vma(struct vm_area_struct *dst,
struct anon_vma *anon_vma)
{
if (dst->anon_vma)
return;
if (anon_vma->num_active_vmas > 0)
return;
if (anon_vma->num_children > 1)
return;
dst->anon_vma = anon_vma;
anon_vma->num_active_vmas++;
}
static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
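/*
 * Link @dst to every anon_vma that @src is linked to.  Used when a VMA is
 * copied at fork time or split/merged/moved; on fork, an existing anon_vma
 * with no active VMAs and at most one child may be reused for @dst to limit
 * anon_vma proliferation.
 */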
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
enum vma_operation operation)
{
struct anon_vma_chain *avc, *pavc;
struct anon_vma *active_anon_vma = src->anon_vma;
check_anon_vma_clone(dst, src, operation);
if (!active_anon_vma)
return 0;
list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto enomem_failure;
anon_vma_chain_assign(dst, avc, pavc->anon_vma);
}
	anon_vma_lock_write(active_anon_vma);
list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
if (operation == VMA_OP_FORK)
maybe_reuse_anon_vma(dst, anon_vma);
}
if (operation != VMA_OP_FORK)
dst->anon_vma->num_active_vmas++;
anon_vma_unlock_write(active_anon_vma);
return 0;
enomem_failure:
cleanup_partial_anon_vmas(dst);
return -ENOMEM;
}
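/*
 * Attach the child VMA @vma to the anon_vma hierarchy of its parent @pvma and
 * give it a fresh anon_vma of its own, so pages COWed into the child are only
 * reachable through the child's rmap.
 */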
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
struct anon_vma_chain *avc;
struct anon_vma *anon_vma;
int rc;
if (!pvma->anon_vma)
return 0;
vma->anon_vma = NULL;
anon_vma = anon_vma_alloc();
if (!anon_vma)
return -ENOMEM;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc) {
put_anon_vma(anon_vma);
return -ENOMEM;
}
rc = anon_vma_clone(vma, pvma, VMA_OP_FORK);
if (rc || vma->anon_vma) {
put_anon_vma(anon_vma);
anon_vma_chain_free(avc);
return rc;
}
anon_vma->num_active_vmas = 1;
anon_vma->root = pvma->anon_vma->root;
anon_vma->parent = pvma->anon_vma;
get_anon_vma(anon_vma->root);
vma->anon_vma = anon_vma;
anon_vma_chain_assign(vma, avc, anon_vma);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
}
static void cleanup_partial_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
vma->anon_vma = NULL;
}
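/*
 * Unlink @vma from all anon_vmas it is chained to.  Interval-tree removal
 * happens under the root lock; anon_vmas that become empty are left on the
 * chain and their references are dropped in a second pass outside the lock,
 * because freeing may sleep.
 */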
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
struct anon_vma *active_anon_vma = vma->anon_vma;
mmap_assert_locked(vma->vm_mm);
if (!active_anon_vma) {
VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain));
return;
}
anon_vma_lock_write(active_anon_vma);
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
anon_vma->parent->num_children--;
continue;
}
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
active_anon_vma->num_active_vmas--;
vma->anon_vma = NULL;
anon_vma_unlock_write(active_anon_vma);
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
VM_WARN_ON(anon_vma->num_children);
VM_WARN_ON(anon_vma->num_active_vmas);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
}
static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT_CACHED;
}
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
anon_vma_ctor);
anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
SLAB_PANIC|SLAB_ACCOUNT);
}
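/*
 * Take a reference on the anon_vma of an anonymous folio under RCU.  Returns
 * NULL if the folio is no longer anonymous or no longer mapped, i.e. the
 * anon_vma may already be on its way out.
 */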
struct anon_vma *folio_get_anon_vma(const struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
if (!folio_mapped(folio)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
}
out:
rcu_read_unlock();
return anon_vma;
}
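/*
 * Like folio_get_anon_vma(), but also acquire the anon_vma root rwsem for
 * reading.  A trylock fast path avoids taking a reference; if the caller set
 * rwc->try_lock, contention is reported instead of sleeping on the lock.
 */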
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
unsigned long anon_mapping;
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
if (!folio_mapped(folio)) {
up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
}
if (rwc && rwc->try_lock) {
anon_vma = NULL;
rwc->contended = true;
goto out;
}
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
if (!folio_mapped(folio)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
}
rcu_read_unlock();
anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) {
anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma);
anon_vma = NULL;
}
return anon_vma;
out:
rcu_read_unlock();
return anon_vma;
}
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
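/*
 * Batched TLB flushing for reclaim (TTU_BATCH_FLUSH): PTEs are cleared and the
 * required flushes accumulated in current->tlb_ubc, then issued in one go by
 * try_to_unmap_flush() / try_to_unmap_flush_dirty().
 */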
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
if (!tlb_ubc->flush_required)
return;
arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
}
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
if (tlb_ubc->writable)
try_to_unmap_flush();
}
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16
#define TLB_FLUSH_BATCH_PENDING_MASK \
((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
#define TLB_FLUSH_BATCH_PENDING_LARGE \
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
unsigned long start, unsigned long end)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
bool writable = pte_dirty(pteval);
if (!pte_accessible(mm, pteval))
return;
arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
tlb_ubc->flush_required = true;
barrier();
batch = atomic_read(&mm->tlb_flush_batched);
retry:
if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
goto retry;
} else {
atomic_inc(&mm->tlb_flush_batched);
}
if (writable)
tlb_ubc->writable = true;
}
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
if (!(flags & TTU_BATCH_FLUSH))
return false;
return arch_tlbbatch_should_defer(mm);
}
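/*
 * mm->tlb_flush_batched packs a "pending" count in the low bits and a
 * "flushed" count above TLB_FLUSH_BATCH_FLUSHED_SHIFT.  If the two disagree, a
 * reclaim-batched flush is still outstanding and the mm must be flushed before
 * its page table entries are modified or reused.
 */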
void flush_tlb_batched_pending(struct mm_struct *mm)
{
int batch = atomic_read(&mm->tlb_flush_batched);
int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
if (pending != flushed) {
flush_tlb_mm(mm);
atomic_cmpxchg(&mm->tlb_flush_batched, batch,
pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
unsigned long start, unsigned long end)
{
}
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
return false;
}
#endif
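/*
 * Return the virtual address of @page within @vma, or -EFAULT if this folio
 * cannot be mapped there (wrong anon_vma root or wrong address_space).
 */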
unsigned long page_address_in_vma(const struct folio *folio,
const struct page *page, const struct vm_area_struct *vma)
{
if (folio_test_anon(folio)) {
struct anon_vma *anon_vma = folio_anon_vma(folio);
if (!vma->anon_vma || !anon_vma ||
vma->anon_vma->root != anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
} else if (vma->vm_file->f_mapping != folio->mapping) {
return -EFAULT;
}
return vma_address(vma, page_pgoff(folio, page), 1);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd = NULL;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
goto out;
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
goto out;
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
goto out;
pmd = pmd_offset(pud, address);
out:
return pmd;
}
struct folio_referenced_arg {
int mapcount;
int referenced;
vm_flags_t vm_flags;
struct mem_cgroup *memcg;
};
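/*
 * Per-VMA worker for folio_referenced(): test-and-clear the accessed state of
 * every PTE/PMD mapping the folio in this VMA.  Mlocked VMAs stop the walk so
 * the folio can be re-mlocked; a swapbacked anonymous folio mapped only by an
 * exiting or OOM-reaped mm is flagged (pra->referenced = -1) and the walk ends.
 */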
static bool folio_referenced_one(struct folio *folio,
struct vm_area_struct *vma, unsigned long address, void *arg)
{
struct folio_referenced_arg *pra = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
int ptes = 0, referenced = 0;
unsigned int nr;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
nr = 1;
if (vma->vm_flags & VM_LOCKED) {
ptes++;
pra->mapcount--;
if (pvmw.pte && ptes != pvmw.nr_pages)
continue;
if (pvmw.flags & PVMW_PGTABLE_CROSSED)
continue;
mlock_vma_folio(folio, vma);
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
return false;
}
if ((!atomic_read(&vma->vm_mm->mm_users) ||
check_stable_address_space(vma->vm_mm)) &&
folio_test_anon(folio) && folio_test_swapbacked(folio) &&
!folio_maybe_mapped_shared(folio)) {
pra->referenced = -1;
page_vma_mapped_walk_done(&pvmw);
return false;
}
if (lru_gen_enabled() && pvmw.pte) {
if (lru_gen_look_around(&pvmw))
referenced++;
} else if (pvmw.pte) {
if (folio_test_large(folio)) {
unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
pte_t pteval = ptep_get(pvmw.pte);
nr = folio_pte_batch(folio, pvmw.pte,
pteval, max_nr);
}
ptes += nr;
if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
referenced++;
pvmw.pte += nr - 1;
pvmw.address += (nr - 1) * PAGE_SIZE;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
referenced++;
} else {
WARN_ON_ONCE(1);
}
pra->mapcount -= nr;
if (ptes == pvmw.nr_pages) {
page_vma_mapped_walk_done(&pvmw);
break;
}
}
if (referenced)
folio_clear_idle(folio);
if (folio_test_clear_young(folio))
referenced++;
if (referenced) {
pra->referenced++;
pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
}
if (!pra->mapcount)
return false;
return true;
}
static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
{
struct folio_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;
if (!vma_has_recency(vma))
return true;
if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
return true;
return false;
}
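/*
 * folio_referenced - count the mappings in which this folio was recently
 * referenced, clearing the accessed bits as we go.  Returns -1 when the rmap
 * lock was contended (try_lock mode) and fills *vm_flags with flags of the
 * VMAs that reference the folio.
 */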
int folio_referenced(struct folio *folio, int is_locked,
struct mem_cgroup *memcg, vm_flags_t *vm_flags)
{
bool we_locked = false;
struct folio_referenced_arg pra = {
.mapcount = folio_mapcount(folio),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
.rmap_one = folio_referenced_one,
.arg = (void *)&pra,
.anon_lock = folio_lock_anon_vma_read,
.try_lock = true,
.invalid_vma = invalid_folio_referenced_vma,
};
*vm_flags = 0;
if (!pra.mapcount)
return 0;
if (!folio_raw_mapping(folio))
return 0;
if (!is_locked) {
we_locked = folio_trylock(folio);
if (!we_locked)
return 1;
}
rmap_walk(folio, &rwc);
*vm_flags = pra.vm_flags;
if (we_locked)
folio_unlock(folio);
return rwc.contended ? -1 : pra.referenced;
}
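/*
 * Write-protect and clean every present PTE/PMD found by the given page-vma
 * mapped walk, so the next write faults and re-dirties the folio.  Returns the
 * number of entries cleaned.
 */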
static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
int cleaned = 0;
struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
unsigned long address = pvmw->address;
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
vma->vm_mm, address, vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
address = pvmw->address;
if (pvmw->pte) {
pte_t *pte = pvmw->pte;
pte_t entry = ptep_get(pte);
if (!pte_present(entry))
continue;
if (!pte_dirty(entry) && !pte_write(entry))
continue;
flush_cache_page(vma, address, pte_pfn(entry));
entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t *pmd = pvmw->pmd;
pmd_t entry = pmdp_get(pmd);
if (!pmd_present(entry))
continue;
if (!pmd_dirty(entry) && !pmd_write(entry))
continue;
flush_cache_range(vma, address,
address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
ret = 1;
#else
WARN_ON_ONCE(1);
#endif
}
if (ret)
cleaned++;
}
mmu_notifier_invalidate_range_end(&range);
return cleaned;
}
static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
int *cleaned = arg;
*cleaned += page_vma_mkclean_one(&pvmw);
return true;
}
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
if (vma->vm_flags & VM_SHARED)
return false;
return true;
}
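/*
 * folio_mkclean - walk all shared mappings of a locked folio and write-protect
 * them, for writeback and dirty accounting.  Returns the number of entries
 * cleaned.
 */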
int folio_mkclean(struct folio *folio)
{
int cleaned = 0;
struct address_space *mapping;
struct rmap_walk_control rwc = {
.arg = (void *)&cleaned,
.rmap_one = page_mkclean_one,
.invalid_vma = invalid_mkclean_vma,
};
BUG_ON(!folio_test_locked(folio));
if (!folio_mapped(folio))
return 0;
mapping = folio_mapping(folio);
if (!mapping)
return 0;
rmap_walk(folio, &rwc);
return cleaned;
}
EXPORT_SYMBOL_GPL(folio_mkclean);
struct wrprotect_file_state {
int cleaned;
pgoff_t pgoff;
unsigned long pfn;
unsigned long nr_pages;
};
static bool mapping_wrprotect_range_one(struct folio *folio,
struct vm_area_struct *vma, unsigned long address, void *arg)
{
struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
struct page_vma_mapped_walk pvmw = {
.pfn = state->pfn,
.nr_pages = state->nr_pages,
.pgoff = state->pgoff,
.vma = vma,
.address = address,
.flags = PVMW_SYNC,
};
state->cleaned += page_vma_mkclean_one(&pvmw);
return true;
}
static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
pgoff_t pgoff_start, unsigned long nr_pages,
struct rmap_walk_control *rwc, bool locked);
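/*
 * Write-protect all mappings of the pfn range @pfn .. @pfn + @nr_pages that
 * are reachable through @mapping at @pgoff, without requiring a folio.
 * Returns the number of entries cleaned.
 */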
int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
unsigned long pfn, unsigned long nr_pages)
{
struct wrprotect_file_state state = {
.cleaned = 0,
.pgoff = pgoff,
.pfn = pfn,
.nr_pages = nr_pages,
};
struct rmap_walk_control rwc = {
.arg = (void *)&state,
.rmap_one = mapping_wrprotect_range_one,
.invalid_vma = invalid_mkclean_vma,
};
if (!mapping)
return 0;
__rmap_walk_file(NULL, mapping, pgoff, nr_pages, &rwc,
false);
return state.cleaned;
}
EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma)
{
struct page_vma_mapped_walk pvmw = {
.pfn = pfn,
.nr_pages = nr_pages,
.pgoff = pgoff,
.vma = vma,
.flags = PVMW_SYNC,
};
if (invalid_mkclean_vma(vma, NULL))
return 0;
pvmw.address = vma_address(vma, pgoff, nr_pages);
VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
return page_vma_mkclean_one(&pvmw);
}
static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
{
int idx;
if (nr) {
idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
lruvec_stat_mod_folio(folio, idx, nr);
}
if (nr_pmdmapped) {
if (folio_test_anon(folio)) {
idx = NR_ANON_THPS;
lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
} else {
idx = folio_test_swapbacked(folio) ?
NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
__mod_node_page_state(folio_pgdat(folio), idx,
nr_pmdmapped);
}
}
}
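/*
 * Core mapcount bookkeeping for adding PTE/PMD/PUD mappings of @nr_pages pages
 * of @folio.  Computes how many pages went from unmapped to mapped (nr) and
 * how many became PMD-mapped (nr_pmdmapped) so the right NR_*_MAPPED /
 * NR_*_THPS counters can be updated.
 */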
static __always_inline void __folio_add_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
int first = 0, nr = 0, nr_pmdmapped = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_inc_and_test(&folio->_mapcount);
break;
}
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
if (nr == orig_nr_pages)
nr = folio_large_nr_pages(folio);
else
nr = 0;
break;
}
do {
first += atomic_inc_and_test(&page->_mapcount);
} while (page++, --nr_pages > 0);
if (first &&
atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
nr = first;
folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
case PGTABLE_LEVEL_PMD:
case PGTABLE_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
if (level == PGTABLE_LEVEL_PMD && first)
nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_inc_return_large_mapcount(folio, vma);
if (nr == 1)
nr = folio_large_nr_pages(folio);
else
nr = 0;
break;
}
if (first) {
nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
nr_pages = folio_large_nr_pages(folio);
if (level == PGTABLE_LEVEL_PMD)
nr_pmdmapped = nr_pages;
nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
if (unlikely(nr < 0))
nr = 0;
} else {
nr = 0;
}
}
folio_inc_large_mapcount(folio, vma);
break;
default:
BUILD_BUG();
}
__folio_mod_stat(folio, nr, nr_pmdmapped);
}
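/*
 * Point a locked anonymous folio at @vma's anon_vma, e.g. after write-fault
 * reuse makes the folio exclusive to this VMA.
 */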
void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
{
void *anon_vma = vma->anon_vma;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
anon_vma += FOLIO_MAPPING_ANON;
WRITE_ONCE(folio->mapping, anon_vma);
}
static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, bool exclusive)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
if (!exclusive)
anon_vma = anon_vma->root;
anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
folio->index = linear_page_index(vma, address);
}
static void __page_check_anon_rmap(const struct folio *folio,
const struct page *page, struct vm_area_struct *vma,
unsigned long address)
{
VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
folio);
VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
page);
}
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
unsigned long address, rmap_t flags, enum pgtable_level level)
{
int i;
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
__folio_add_rmap(folio, page, nr_pages, vma, level);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
if (flags & RMAP_EXCLUSIVE) {
switch (level) {
case PGTABLE_LEVEL_PTE:
for (i = 0; i < nr_pages; i++)
SetPageAnonExclusive(page + i);
break;
case PGTABLE_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
case PGTABLE_LEVEL_PUD:
WARN_ON_ONCE(1);
break;
default:
BUILD_BUG();
}
}
VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
atomic_read(&folio->_mapcount) > 0, folio);
for (i = 0; i < nr_pages; i++) {
struct page *cur_page = page + i;
VM_WARN_ON_FOLIO(folio_test_large(folio) &&
folio_entire_mapcount(folio) > 1 &&
PageAnonExclusive(cur_page), folio);
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
continue;
VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
PageAnonExclusive(cur_page), folio);
}
if (folio_nr_pages(folio) == nr_pages)
mlock_vma_folio(folio, vma);
}
void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma, unsigned long address,
rmap_t flags)
{
__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
PGTABLE_LEVEL_PTE);
}
void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
}
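/*
 * Add rmap state for a brand-new folio being mapped into @vma for the first
 * time.  Nothing else can see the folio yet, so the mapcounts are initialised
 * directly rather than atomically incremented.
 */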
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
const bool exclusive = flags & RMAP_EXCLUSIVE;
int nr = 1, nr_pmdmapped = 0;
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
__folio_set_swapbacked(folio);
__folio_set_anon(folio, vma, address, exclusive);
if (likely(!folio_test_large(folio))) {
atomic_set(&folio->_mapcount, 0);
if (exclusive)
SetPageAnonExclusive(&folio->page);
} else if (!folio_test_pmd_mappable(folio)) {
int i;
nr = folio_large_nr_pages(folio);
for (i = 0; i < nr; i++) {
struct page *page = folio_page(folio, i);
if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
atomic_set(&page->_mapcount, 0);
if (exclusive)
SetPageAnonExclusive(page);
}
folio_set_large_mapcount(folio, nr, vma);
if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
atomic_set(&folio->_nr_pages_mapped, nr);
} else {
nr = folio_large_nr_pages(folio);
atomic_set(&folio->_entire_mapcount, 0);
folio_set_large_mapcount(folio, 1, vma);
if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
if (exclusive)
SetPageAnonExclusive(&folio->page);
nr_pmdmapped = nr;
}
VM_WARN_ON_ONCE(address < vma->vm_start ||
address + (nr << PAGE_SHIFT) > vma->vm_end);
__folio_mod_stat(folio, nr, nr_pmdmapped);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
static __always_inline void __folio_add_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum pgtable_level level)
{
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
__folio_add_rmap(folio, page, nr_pages, vma, level);
if (folio_nr_pages(folio) == nr_pages)
mlock_vma_folio(folio, vma);
}
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
__folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
}
void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
}
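/*
 * Mirror of __folio_add_rmap() for unmapping: update the mapcounts, work out
 * how many pages went from mapped to unmapped for the stat update, and queue a
 * partially mapped anonymous folio for deferred splitting.
 */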
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
int last = 0, nr = 0, nr_pmdmapped = 0;
bool partially_mapped = false;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_add_negative(-1, &folio->_mapcount);
break;
}
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
if (!nr) {
nr = folio_large_nr_pages(folio);
} else {
partially_mapped = nr < folio_large_nr_pages(folio) &&
!folio_entire_mapcount(folio);
nr = 0;
}
break;
}
folio_sub_large_mapcount(folio, nr_pages, vma);
do {
last += atomic_add_negative(-1, &page->_mapcount);
} while (page++, --nr_pages > 0);
if (last &&
atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
nr = last;
partially_mapped = nr && atomic_read(mapped);
break;
case PGTABLE_LEVEL_PMD:
case PGTABLE_LEVEL_PUD:
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (level == PGTABLE_LEVEL_PMD && last)
nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_dec_return_large_mapcount(folio, vma);
if (!nr) {
nr = folio_large_nr_pages(folio);
} else {
partially_mapped = last &&
nr < folio_large_nr_pages(folio);
nr = 0;
}
break;
}
folio_dec_large_mapcount(folio, vma);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
nr_pages = folio_large_nr_pages(folio);
if (level == PGTABLE_LEVEL_PMD)
nr_pmdmapped = nr_pages;
				nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
if (unlikely(nr < 0))
nr = 0;
} else {
nr = 0;
}
}
partially_mapped = nr && nr < nr_pmdmapped;
break;
default:
BUILD_BUG();
}
if (partially_mapped && folio_test_anon(folio) &&
!folio_test_partially_mapped(folio) &&
!folio_is_device_private(folio))
deferred_split_folio(folio, true);
__folio_mod_stat(folio, -nr, -nr_pmdmapped);
munlock_vma_folio(folio, vma);
}
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
__folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
__folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
}
void folio_remove_rmap_pud(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
}
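/*
 * How many consecutive PTEs of @folio can try_to_unmap_one() clear as one
 * batch?  Falls back to a single PTE whenever batching would be unsafe or
 * pointless (hwpoison handling, small folios, userfaultfd-wp, unused PTEs),
 * and never crosses the VMA or the current page table.
 */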
static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
struct page_vma_mapped_walk *pvmw,
enum ttu_flags flags, pte_t pte)
{
unsigned long end_addr, addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
unsigned int max_nr;
if (flags & TTU_HWPOISON)
return 1;
if (!folio_test_large(folio))
return 1;
end_addr = pmd_addr_end(addr, vma->vm_end);
max_nr = (end_addr - addr) >> PAGE_SHIFT;
if (folio_test_anon(folio) && folio_test_swapbacked(folio))
return 1;
if (pte_unused(pte))
return 1;
if (userfaultfd_wp(vma))
return 1;
return folio_pte_batch_flags(folio, vma, pvmw->pte, &pte, max_nr,
FPB_RESPECT_WRITE | FPB_RESPECT_SOFT_DIRTY);
}
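/*
 * Per-VMA unmap worker: walk every mapping of @folio in @vma, clear the PTEs
 * and replace them with swap, hwpoison or no entries as appropriate, dropping
 * the corresponding folio references.  Returning false aborts the rmap walk.
 */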
static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
bool anon_exclusive, ret = true;
pte_t pteval;
struct page *subpage;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
int ptes = 0;
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
range.end = vma_address_end(&pvmw);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, range.end);
if (folio_test_hugetlb(folio)) {
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
hsz = huge_page_size(hstate_vma(vma));
}
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
ptes++;
ret = false;
if (pvmw.pte && ptes != pvmw.nr_pages)
continue;
if (pvmw.flags & PVMW_PGTABLE_CROSSED)
goto walk_done;
mlock_vma_folio(folio, vma);
goto walk_done;
}
if (!pvmw.pte) {
if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
goto walk_done;
goto walk_abort;
}
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_locked(vma, pvmw.address,
pvmw.pmd, false);
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
}
}
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
pteval = ptep_get(pvmw.pte);
if (likely(pte_present(pteval))) {
pfn = pte_pfn(pteval);
} else {
const softleaf_t entry = softleaf_from_pte(pteval);
pfn = softleaf_to_pfn(entry);
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio)) {
bool anon = folio_test_anon(folio);
VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
flush_cache_range(vma, range.start, range.end);
if (!anon) {
struct mmu_gather tlb;
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
tlb_gather_mmu_vma(&tlb, vma);
if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
huge_pmd_unshare_flush(&tlb, vma);
tlb_finish_mmu(&tlb);
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
folio_mark_dirty(folio);
} else if (likely(pte_present(pteval))) {
nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
end_addr = address + nr_pages * PAGE_SIZE;
flush_cache_range(vma, address, end_addr);
pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
if (should_defer_flush(mm, flags))
set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
else
flush_tlb_range(vma, address, end_addr);
if (pte_dirty(pteval))
folio_mark_dirty(folio);
} else {
pte_clear(mm, address, pvmw.pte);
}
pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
update_hiwater_rss(mm);
if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
} else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
!userfaultfd_armed(vma)) {
dec_mm_counter(mm, mm_counter(folio));
} else if (folio_test_anon(folio)) {
swp_entry_t entry = page_swap_entry(subpage);
pte_t swp_pte;
if (unlikely(folio_test_swapbacked(folio) !=
folio_test_swapcache(folio))) {
WARN_ON_ONCE(1);
goto walk_abort;
}
if (!folio_test_swapbacked(folio)) {
int ref_count, map_count;
smp_mb();
ref_count = folio_ref_count(folio);
map_count = folio_mapcount(folio);
smp_rmb();
if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
folio_set_swapbacked(folio);
goto walk_abort;
} else if (ref_count != 1 + map_count) {
set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
goto walk_abort;
}
add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
goto discard;
}
if (folio_dup_swap(folio, subpage) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
folio_put_swap(folio, subpage);
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
if (anon_exclusive &&
folio_try_share_anon_rmap_pte(folio, subpage)) {
folio_put_swap(folio, subpage);
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, MM_ANONPAGES);
inc_mm_counter(mm, MM_SWAPENTS);
swp_pte = swp_entry_to_pte(entry);
if (anon_exclusive)
swp_pte = pte_swp_mkexclusive(swp_pte);
if (likely(pte_present(pteval))) {
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
} else {
if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
}
set_pte_at(mm, address, pvmw.pte, swp_pte);
} else {
add_mm_counter(mm, mm_counter_file(folio), -nr_pages);
}
discard:
if (unlikely(folio_test_hugetlb(folio))) {
hugetlb_remove_rmap(folio);
} else {
folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
}
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
folio_put_refs(folio, nr_pages);
if (nr_pages == folio_nr_pages(folio))
goto walk_done;
continue;
walk_abort:
ret = false;
walk_done:
page_vma_mapped_walk_done(&pvmw);
break;
}
mmu_notifier_invalidate_range_end(&range);
return ret;
}
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
return vma_is_temporary_stack(vma);
}
static int folio_not_mapped(struct folio *folio)
{
return !folio_mapped(folio);
}
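/*
 * try_to_unmap - remove all page table mappings of a locked folio.  The caller
 * checks folio_mapped() afterwards to see whether the unmap succeeded.
 */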
void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
.done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(folio, &rwc);
else
rmap_walk(folio, &rwc);
}
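/*
 * Per-VMA worker for try_to_migrate(): replace each mapping of @folio with a
 * migration swap entry that preserves write, young, dirty, soft-dirty and
 * uffd-wp state, so the PTE can be restored once the folio has moved.
 */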
static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
bool anon_exclusive, writable, ret = true;
pte_t pteval;
struct page *subpage;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
unsigned long hsz = 0;
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
range.end = vma_address_end(&pvmw);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, range.end);
if (folio_test_hugetlb(folio)) {
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
hsz = huge_page_size(hstate_vma(vma));
}
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
if (!pvmw.pte) {
__maybe_unused unsigned long pfn;
__maybe_unused pmd_t pmdval;
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_locked(vma, pvmw.address,
pvmw.pmd, true);
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
pmdval = pmdp_get(pvmw.pmd);
if (likely(pmd_present(pmdval)))
pfn = pmd_pfn(pmdval);
else
pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval));
subpage = folio_page(folio, pfn - folio_pfn(folio));
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
!folio_test_pmd_mappable(folio), folio);
if (set_pmd_migration_entry(&pvmw, subpage)) {
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
continue;
#endif
}
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
pteval = ptep_get(pvmw.pte);
if (likely(pte_present(pteval))) {
pfn = pte_pfn(pteval);
} else {
const softleaf_t entry = softleaf_from_pte(pteval);
pfn = softleaf_to_pfn(entry);
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio)) {
bool anon = folio_test_anon(folio);
flush_cache_range(vma, range.start, range.end);
if (!anon) {
struct mmu_gather tlb;
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
tlb_gather_mmu_vma(&tlb, vma);
if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
huge_pmd_unshare_flush(&tlb, vma);
tlb_finish_mmu(&tlb);
page_vma_mapped_walk_done(&pvmw);
break;
}
hugetlb_vma_unlock_write(vma);
tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
folio_mark_dirty(folio);
writable = pte_write(pteval);
} else if (likely(pte_present(pteval))) {
flush_cache_page(vma, address, pfn);
if (should_defer_flush(mm, flags)) {
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
if (pte_dirty(pteval))
folio_mark_dirty(folio);
writable = pte_write(pteval);
} else {
const softleaf_t entry = softleaf_from_pte(pteval);
pte_clear(mm, address, pvmw.pte);
writable = softleaf_is_device_private_write(entry);
}
VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
!anon_exclusive, folio);
update_hiwater_rss(mm);
if (PageHWPoison(subpage)) {
VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
} else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
!userfaultfd_armed(vma)) {
dec_mm_counter(mm, mm_counter(folio));
} else {
swp_entry_t entry;
pte_t swp_pte;
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte,
pteval, hsz);
else
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
if (folio_test_hugetlb(folio)) {
if (anon_exclusive &&
hugetlb_try_share_anon_rmap(folio)) {
set_huge_pte_at(mm, address, pvmw.pte,
pteval, hsz);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
} else if (anon_exclusive &&
folio_try_share_anon_rmap_pte(folio, subpage)) {
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
if (writable)
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else if (anon_exclusive)
entry = make_readable_exclusive_migration_entry(
page_to_pfn(subpage));
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
if (likely(pte_present(pteval))) {
if (pte_young(pteval))
entry = make_migration_entry_young(entry);
if (pte_dirty(pteval))
entry = make_migration_entry_dirty(entry);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
} else {
swp_pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
}
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
hsz);
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
folio_order(folio));
}
if (unlikely(folio_test_hugetlb(folio)))
hugetlb_remove_rmap(folio);
else
folio_remove_rmap_pte(folio, subpage, vma);
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
folio_put(folio);
}
mmu_notifier_invalidate_range_end(&range);
return ret;
}
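/*
 * try_to_migrate - replace all mappings of a folio with migration entries.
 * Only TTU_RMAP_LOCKED, TTU_SPLIT_HUGE_PMD, TTU_SYNC and TTU_BATCH_FLUSH are
 * meaningful here; zone-device folios are only handled if they are device
 * private or device coherent.
 */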
void try_to_migrate(struct folio *folio, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_migrate_one,
.arg = (void *)flags,
.done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC | TTU_BATCH_FLUSH)))
return;
if (folio_is_zone_device(folio) &&
(!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
return;
if (!folio_test_ksm(folio) && folio_test_anon(folio))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(folio, &rwc);
else
rmap_walk(folio, &rwc);
}
#ifdef CONFIG_DEVICE_PRIVATE
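/*
 * Mark the page mapped at @addr as exclusive to a device: grab it via GUP,
 * lock it, and replace its writable PTE with a device-exclusive swap entry,
 * retrying if the PTE changed under us.  Only anonymous, non-hugetlb folios
 * are supported.
 */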
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
void *owner, struct folio **foliop)
{
struct mmu_notifier_range range;
struct folio *folio, *fw_folio;
struct vm_area_struct *vma;
struct folio_walk fw;
struct page *page;
swp_entry_t entry;
pte_t swp_pte;
int ret;
mmap_assert_locked(mm);
addr = PAGE_ALIGN_DOWN(addr);
retry:
page = get_user_page_vma_remote(mm, addr,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
&vma);
if (IS_ERR(page))
return page;
folio = page_folio(page);
if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
folio_put(folio);
return ERR_PTR(-EOPNOTSUPP);
}
ret = folio_lock_killable(folio);
if (ret) {
folio_put(folio);
return ERR_PTR(ret);
}
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
mm, addr, addr + PAGE_SIZE, owner);
mmu_notifier_invalidate_range_start(&range);
fw_folio = folio_walk_start(&fw, vma, addr, 0);
if (fw_folio != folio || fw.page != page ||
fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
if (fw_folio)
folio_walk_end(&fw, vma);
mmu_notifier_invalidate_range_end(&range);
folio_unlock(folio);
folio_put(folio);
goto retry;
}
flush_cache_page(vma, addr, page_to_pfn(page));
fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
if (pte_dirty(fw.pte))
folio_mark_dirty(folio);
entry = make_device_exclusive_entry(page_to_pfn(page));
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(fw.pte))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, addr, fw.ptep, swp_pte);
folio_walk_end(&fw, vma);
mmu_notifier_invalidate_range_end(&range);
*foliop = folio;
return page;
}
EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
}
static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
if (rwc->anon_lock)
return rwc->anon_lock(folio, rwc);
anon_vma = folio_anon_vma(folio);
if (!anon_vma)
return NULL;
if (anon_vma_trylock_read(anon_vma))
goto out;
if (rwc->try_lock) {
anon_vma = NULL;
rwc->contended = true;
goto out;
}
anon_vma_lock_read(anon_vma);
out:
return anon_vma;
}
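/*
 * rmap_walk_anon - visit every VMA that might map an anonymous folio, found
 * via the anon_vma interval tree keyed by the folio's page offset.
 */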
static void rmap_walk_anon(struct folio *folio,
struct rmap_walk_control *rwc, bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
if (locked) {
anon_vma = folio_anon_vma(folio);
VM_BUG_ON_FOLIO(!anon_vma, folio);
} else {
anon_vma = rmap_walk_anon_lock(folio, rwc);
}
if (!anon_vma)
return;
pgoff_start = folio_pgoff(folio);
pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(vma, pgoff_start,
folio_nr_pages(folio));
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
if (!rwc->rmap_one(folio, vma, address, rwc->arg))
break;
if (rwc->done && rwc->done(folio))
break;
}
if (!locked)
anon_vma_unlock_read(anon_vma);
}
static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
pgoff_t pgoff_start, unsigned long nr_pages,
struct rmap_walk_control *rwc, bool locked)
{
pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
struct vm_area_struct *vma;
VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
if (!locked) {
if (i_mmap_trylock_read(mapping))
goto lookup;
if (rwc->try_lock) {
rwc->contended = true;
return;
}
i_mmap_lock_read(mapping);
}
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(vma, pgoff_start, nr_pages);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
if (!rwc->rmap_one(folio, vma, address, rwc->arg))
goto done;
if (rwc->done && rwc->done(folio))
goto done;
}
done:
if (!locked)
i_mmap_unlock_read(mapping);
}
static void rmap_walk_file(struct folio *folio,
struct rmap_walk_control *rwc, bool locked)
{
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (!folio->mapping)
return;
__rmap_walk_file(folio, folio->mapping, folio->index,
folio_nr_pages(folio), rwc, locked);
}
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))
rmap_walk_ksm(folio, rwc);
else if (folio_test_anon(folio))
rmap_walk_anon(folio, rwc, false);
else
rmap_walk_file(folio, rwc, false);
}
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
{
VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
if (folio_test_anon(folio))
rmap_walk_anon(folio, rwc, true);
else
rmap_walk_file(folio, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
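/*
 * hugetlb rmap only tracks entire-folio mappings; per-page mapcounts are never
 * used for hugetlb folios.
 */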
void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
atomic_inc(&folio->_entire_mapcount);
atomic_inc(&folio->_large_mapcount);
if (flags & RMAP_EXCLUSIVE)
SetPageAnonExclusive(&folio->page);
VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
PageAnonExclusive(&folio->page), folio);
}
void hugetlb_add_new_anon_rmap(struct folio *folio,
struct vm_area_struct *vma, unsigned long address)
{
VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(&folio->_entire_mapcount, 0);
atomic_set(&folio->_large_mapcount, 0);
folio_clear_hugetlb_restore_reserve(folio);
__folio_set_anon(folio, vma, address, true);
SetPageAnonExclusive(&folio->page);
}
#endif