#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>
#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>
#include "internal.h"
#include "pgalloc-track.h"
/*
 * ioremap_max_page_shift is the largest page shift vmap_page_range() passes
 * down when it builds a mapping.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
/* Default limit; written only by the early parameter handler below. */
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
/* "nohugeiomap" early parameter: lower the limit to PAGE_SHIFT. @str is unused. */
static int __init set_nohugeiomap(char *str)
{
ioremap_max_page_shift = PAGE_SHIFT;
return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else
/* Without CONFIG_HAVE_ARCH_HUGE_VMAP the limit is the constant PAGE_SHIFT. */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif
/*
 * vmap_allow_huge: boolean switch for huge vmalloc mappings.
 * NOTE(review): its readers are not in this part of the file.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
/* Enabled by default; written only by the early parameter handler below. */
static bool __ro_after_init vmap_allow_huge = true;
/* "nohugevmalloc" early parameter: clear vmap_allow_huge. @str is unused. */
static int __init set_nohugevmalloc(char *str)
{
vmap_allow_huge = false;
return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else
/* Without CONFIG_HAVE_ARCH_HUGE_VMALLOC the switch is constant false. */
static const bool vmap_allow_huge = false;
#endif
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)kasan_reset_tag(x);
return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);
/*
 * Pairs a lock-less list head with a work item.
 * NOTE(review): presumably queues deferred frees for a worker; the users
 * are not in this part of the file -- confirm.
 */
struct vfree_deferred {
struct llist_head list;
struct work_struct wq;
};
/* One instance per CPU. */
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
/*
 * Fill the PTE level below @pmd for [addr, end), mapping the physically
 * contiguous range that starts at @phys_addr with protection @prot.
 * PGTBL_PTE_MODIFIED is recorded in @mask on success.
 *
 * Return: 0 on success, -EINVAL if (end - addr) is not page aligned,
 * -ENOMEM if the PTE table cannot be allocated.  Meeting a PTE that is
 * already populated ends in BUG().
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pte_t *pte;
u64 pfn;
struct page *page;
/* Bytes covered per iteration; only reassigned under CONFIG_HUGETLB_PAGE. */
unsigned long size = PAGE_SIZE;
if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr)))
return -EINVAL;
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
/* All PTE writes happen between lazy_mmu_mode_enable() and _disable(). */
lazy_mmu_mode_enable();
do {
/* Never overwrite a live entry: dump the target page if it is valid, then BUG. */
if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
dump_page(page, "remapping already mapped page");
}
BUG();
}
#ifdef CONFIG_HUGETLB_PAGE
/*
 * arch_vmap_pte_range_map_size() decides how much this entry covers;
 * anything other than PAGE_SIZE is installed as a huge PTE.
 */
size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
if (size != PAGE_SIZE) {
pte_t entry = pfn_pte(pfn, prot);
entry = arch_make_huge_pte(entry, ilog2(size), 0);
set_huge_pte_at(&init_mm, addr, pte, entry, size);
pfn += PFN_DOWN(size);
/* "continue" still evaluates the advance expressions in while (). */
continue;
}
#endif
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
/*
 * Try to map [addr, end) with a single huge PMD entry.
 *
 * The attempt is skipped (returns 0) unless @max_page_shift allows PMD-sized
 * mappings, the architecture supports them for @prot, the span is exactly
 * PMD_SIZE, and both @addr and @phys_addr are PMD_SIZE aligned.  If the
 * entry is already present, pmd_free_pte_page() has to succeed first.
 *
 * Return: the result of pmd_set_huge(), or 0 when the attempt is skipped.
 */
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PMD_SHIFT ||
	    !arch_vmap_pmd_supported(prot) ||
	    (end - addr) != PMD_SIZE ||
	    !IS_ALIGNED(addr, PMD_SIZE) ||
	    !IS_ALIGNED(phys_addr, PMD_SIZE))
		return 0;

	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
		return 0;

	return pmd_set_huge(pmd, phys_addr, prot);
}
/*
 * Map [addr, end) to @phys_addr at the PMD level below @pud.
 *
 * Each step first tries a huge mapping via vmap_try_huge_pmd(); when that
 * is not taken the step is handed to vmap_pte_range().  Modified levels are
 * accumulated in @mask.
 *
 * Return: 0 on success, -ENOMEM if the PMD table cannot be allocated, or
 * the error reported by vmap_pte_range().
 */
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	unsigned long boundary;
	int ret = 0;
	pmd_t *pmd = pmd_alloc_track(&init_mm, pud, addr, mask);

	if (!pmd)
		return -ENOMEM;

	do {
		boundary = pmd_addr_end(addr, end);

		if (vmap_try_huge_pmd(pmd, addr, boundary, phys_addr, prot,
				      max_page_shift)) {
			/* The PMD entry itself was written. */
			*mask |= PGTBL_PMD_MODIFIED;
		} else {
			ret = vmap_pte_range(pmd, addr, boundary, phys_addr,
					     prot, max_page_shift, mask);
			if (ret)
				break;
		}
	} while (pmd++, phys_addr += (boundary - addr), addr = boundary,
		 addr != end);

	return ret;
}
/*
 * Try to map [addr, end) with a single huge PUD entry.
 *
 * Return: the result of pud_set_huge(), or 0 when any precondition fails:
 * @max_page_shift below PUD_SHIFT, no architecture support for @prot, a span
 * other than PUD_SIZE, a misaligned @addr or @phys_addr, or a present entry
 * for which pud_free_pmd_page() fails.
 */
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	/* Size and capability checks, evaluated in the original order. */
	if (max_page_shift < PUD_SHIFT || !arch_vmap_pud_supported(prot))
		return 0;
	if ((end - addr) != PUD_SIZE)
		return 0;

	/* Both the virtual and the physical address must be PUD aligned. */
	if (!IS_ALIGNED(addr, PUD_SIZE) || !IS_ALIGNED(phys_addr, PUD_SIZE))
		return 0;

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

	return pud_set_huge(pud, phys_addr, prot);
}
/*
 * Map [addr, end) to @phys_addr at the PUD level below @p4d.
 *
 * A huge PUD mapping is attempted for each step; otherwise the step
 * descends into vmap_pmd_range().  Modified levels are accumulated in @mask.
 *
 * Return: 0 on success, -ENOMEM if the PUD table cannot be allocated, or
 * the error reported by vmap_pmd_range().
 */
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	unsigned long boundary;
	int ret = 0;
	pud_t *pud = pud_alloc_track(&init_mm, p4d, addr, mask);

	if (!pud)
		return -ENOMEM;

	do {
		boundary = pud_addr_end(addr, end);

		if (vmap_try_huge_pud(pud, addr, boundary, phys_addr, prot,
				      max_page_shift)) {
			*mask |= PGTBL_PUD_MODIFIED;
		} else {
			ret = vmap_pmd_range(pud, addr, boundary, phys_addr,
					     prot, max_page_shift, mask);
			if (ret)
				break;
		}
	} while (pud++, phys_addr += (boundary - addr), addr = boundary,
		 addr != end);

	return ret;
}
/*
 * Try to map [addr, end) with a single huge P4D entry.
 *
 * Return: the result of p4d_set_huge(), or 0 when the attempt is not made
 * (shift limit, missing architecture support, wrong span, misalignment, or
 * a present entry for which p4d_free_pud_page() fails).
 */
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < P4D_SHIFT ||
	    !arch_vmap_p4d_supported(prot) ||
	    (end - addr) != P4D_SIZE ||
	    !IS_ALIGNED(addr, P4D_SIZE) ||
	    !IS_ALIGNED(phys_addr, P4D_SIZE))
		return 0;

	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
		return 0;

	return p4d_set_huge(p4d, phys_addr, prot);
}
/*
 * Map [addr, end) to @phys_addr at the P4D level below @pgd.
 *
 * A huge P4D mapping is attempted for each step; otherwise the step
 * descends into vmap_pud_range().  Modified levels are accumulated in @mask.
 *
 * Return: 0 on success, -ENOMEM if the P4D table cannot be allocated, or
 * the error reported by vmap_pud_range().
 */
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	unsigned long boundary;
	int ret = 0;
	p4d_t *p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);

	if (!p4d)
		return -ENOMEM;

	do {
		boundary = p4d_addr_end(addr, end);

		if (vmap_try_huge_p4d(p4d, addr, boundary, phys_addr, prot,
				      max_page_shift)) {
			*mask |= PGTBL_P4D_MODIFIED;
		} else {
			ret = vmap_pud_range(p4d, addr, boundary, phys_addr,
					     prot, max_page_shift, mask);
			if (ret)
				break;
		}
	} while (p4d++, phys_addr += (boundary - addr), addr = boundary,
		 addr != end);

	return ret;
}
/*
 * Map the kernel virtual range [addr, end) to the physical range starting
 * at @phys_addr, using mappings of at most @max_page_shift.  No cache flush
 * is done here.
 *
 * May sleep.  @addr >= @end is a BUG.  If any modified level is part of
 * ARCH_PAGE_TABLE_SYNC_MASK, arch_sync_kernel_mappings() is called for the
 * whole range, including after a failure.
 *
 * Return: 0 on success or the error from vmap_p4d_range().
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	const unsigned long first = addr;
	pgtbl_mod_mask modified = 0;
	unsigned long boundary;
	pgd_t *pgd;
	int ret;

	might_sleep();
	BUG_ON(addr >= end);

	pgd = pgd_offset_k(addr);
	for (;;) {
		boundary = pgd_addr_end(addr, end);
		ret = vmap_p4d_range(pgd, addr, boundary, phys_addr, prot,
				     max_page_shift, &modified);
		if (ret)
			break;

		/* Advance to the next PGD entry and matching physical offset. */
		pgd++;
		phys_addr += boundary - addr;
		addr = boundary;
		if (addr == end)
			break;
	}

	if (modified & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(first, end);

	return ret;
}
/**
 * vmap_page_range - map a physical range into kernel virtual address space
 * @addr: start of the virtual range
 * @end: end of the virtual range (exclusive)
 * @phys_addr: physical address to map at @addr
 * @prot: page protection; pgprot_nx() is applied before mapping
 *
 * The mapping size is capped by ioremap_max_page_shift.
 * flush_cache_vmap() runs whether or not the mapping succeeded;
 * kmsan_ioremap_page_range() is called only after a successful mapping.
 *
 * Return: 0 on success, otherwise the first error encountered.
 */
int vmap_page_range(unsigned long addr, unsigned long end,
		    phys_addr_t phys_addr, pgprot_t prot)
{
	int ret = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
				     ioremap_max_page_shift);

	flush_cache_vmap(addr, end);
	if (ret)
		return ret;

	return kmsan_ioremap_page_range(addr, end, phys_addr, prot,
					ioremap_max_page_shift);
}
/**
 * ioremap_page_range - map a physical range into a VM_IOREMAP area
 * @addr: start of the virtual range; must be the start of the area
 * @end: end of the virtual range; must be the end of the area
 * @phys_addr: physical address to map at @addr
 * @prot: page protection
 *
 * Return: -EINVAL (after a one-time warning) if no vm area is found at @addr
 * or it lacks VM_IOREMAP; -ERANGE (after a one-time warning) if [addr, end)
 * does not match the area exactly; otherwise the result of vmap_page_range().
 */
int ioremap_page_range(unsigned long addr, unsigned long end,
		       phys_addr_t phys_addr, pgprot_t prot)
{
	struct vm_struct *area = find_vm_area((void *)addr);

	if (!area || !(area->flags & VM_IOREMAP)) {
		WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
		return -EINVAL;
	}

	/* The request has to cover the vm area exactly. */
	if (addr == (unsigned long)area->addr &&
	    (void *)end == area->addr + get_vm_area_size(area))
		return vmap_page_range(addr, end, phys_addr, prot);

	WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
		  addr, end, (long)area->addr,
		  (long)area->addr + get_vm_area_size(area));
	return -ERANGE;
}
/*
 * Clear the PTEs covering [addr, end) below @pmd and record
 * PGTBL_PTE_MODIFIED in @mask.
 */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
pte_t ptent;
/* Bytes covered per iteration; only reassigned under CONFIG_HUGETLB_PAGE. */
unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
/* All PTE clears happen between lazy_mmu_mode_enable() and _disable(). */
lazy_mmu_mode_enable();
do {
#ifdef CONFIG_HUGETLB_PAGE
size = arch_vmap_pte_range_unmap_size(addr, pte);
if (size != PAGE_SIZE) {
/* Warn on a misaligned start and realign both @addr and @pte downwards. */
if (WARN_ON(!IS_ALIGNED(addr, size))) {
addr = ALIGN_DOWN(addr, size);
pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
}
ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
/* Clamp the step so that the loop still terminates exactly at @end. */
if (WARN_ON(end - addr < size))
size = end - addr;
} else
#endif
/*
 * With CONFIG_HUGETLB_PAGE this statement is the body of the "else"
 * above; without it, it is the only statement clearing the entry.
 */
ptent = ptep_get_and_clear(&init_mm, addr, pte);
/* A cleared entry is expected to have been either none or present. */
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
}
/*
 * Unmap [addr, end) at the PMD level below @pud.
 *
 * A huge PMD is cleared in place (warning if the step is smaller than
 * PMD_SIZE).  Otherwise, unless the entry is none/bad, the walk descends
 * into vunmap_pte_range() and then offers to reschedule.
 */
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long boundary;
	int huge_cleared;

	do {
		boundary = pmd_addr_end(addr, end);

		huge_cleared = pmd_clear_huge(pmd);
		if (huge_cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (huge_cleared) {
			WARN_ON(boundary - addr < PMD_SIZE);
		} else if (!pmd_none_or_clear_bad(pmd)) {
			vunmap_pte_range(pmd, addr, boundary, mask);
			cond_resched();
		}
	} while (pmd++, addr = boundary, addr != end);
}
/*
 * Unmap [addr, end) at the PUD level below @p4d.
 *
 * A huge PUD is cleared in place (warning if the step is smaller than
 * PUD_SIZE).  Otherwise, unless the entry is none/bad, the walk descends
 * into vunmap_pmd_range().
 */
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud = pud_offset(p4d, addr);
	unsigned long boundary;
	int huge_cleared;

	do {
		boundary = pud_addr_end(addr, end);

		huge_cleared = pud_clear_huge(pud);
		if (huge_cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (huge_cleared)
			WARN_ON(boundary - addr < PUD_SIZE);
		else if (!pud_none_or_clear_bad(pud))
			vunmap_pmd_range(pud, addr, boundary, mask);
	} while (pud++, addr = boundary, addr != end);
}
/*
 * Unmap [addr, end) at the P4D level below @pgd.
 *
 * p4d_clear_huge() is called for every entry and its result is ignored.
 * Entries that are not none/bad afterwards are walked with
 * vunmap_pud_range().
 */
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d = p4d_offset(pgd, addr);
	unsigned long boundary;

	do {
		boundary = p4d_addr_end(addr, end);

		p4d_clear_huge(p4d);
		if (p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (!p4d_none_or_clear_bad(p4d))
			vunmap_pud_range(p4d, addr, boundary, mask);
	} while (p4d++, addr = boundary, addr != end);
}
/*
 * Clear the kernel page-table entries for [start, end) without flushing
 * caches or the TLB.
 *
 * @start >= @end is a BUG.  If any modified level is part of
 * ARCH_PAGE_TABLE_SYNC_MASK, arch_sync_kernel_mappings() is called for the
 * whole range.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
	pgtbl_mod_mask modified = 0;
	unsigned long cur = start;
	unsigned long boundary;
	pgd_t *pgd;

	BUG_ON(cur >= end);

	pgd = pgd_offset_k(cur);
	do {
		boundary = pgd_addr_end(cur, end);

		if (pgd_bad(*pgd))
			modified |= PGTBL_PGD_MODIFIED;

		if (!pgd_none_or_clear_bad(pgd))
			vunmap_p4d_range(pgd, cur, boundary, &modified);
	} while (pgd++, cur = boundary, cur != end);

	if (modified & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}
/*
 * Unmap [start, end) without flushing: the KMSAN hook runs first, then the
 * page-table entries are cleared by __vunmap_range_noflush().
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
kmsan_vunmap_range_noflush(start, end);
__vunmap_range_noflush(start, end);
}
/**
 * vunmap_range - unmap a kernel virtual address range
 * @addr: start of the range
 * @end: end of the range (exclusive)
 *
 * Calls flush_cache_vunmap() before the entries are cleared and
 * flush_tlb_kernel_range() afterwards.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
flush_cache_vunmap(addr, end);
vunmap_range_noflush(addr, end);
flush_tlb_kernel_range(addr, end);
}
/*
 * Install PTEs for [addr, end) below @pmd from the @pages array.
 *
 * *@nr is the index of the next page to map and is advanced once per
 * installed PTE.  PGTBL_PTE_MODIFIED is recorded in @mask whenever the PTE
 * table was obtained, including on a later failure.
 *
 * Return: 0 on success, -ENOMEM if the PTE table cannot be allocated or a
 * page pointer is NULL, -EBUSY if a PTE is already populated, -EINVAL if
 * the page's pfn is not valid.  The last three also trigger WARN_ON().
 */
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	int ret = 0;
	pte_t *pte = pte_alloc_kernel_track(pmd, addr, mask);

	if (!pte)
		return -ENOMEM;

	lazy_mmu_mode_enable();

	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(ptep_get(pte))))
			ret = -EBUSY;
		else if (WARN_ON(!page))
			ret = -ENOMEM;
		else if (WARN_ON(!pfn_valid(page_to_pfn(page))))
			ret = -EINVAL;

		if (ret)
			break;

		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);

	lazy_mmu_mode_disable();
	*mask |= PGTBL_PTE_MODIFIED;

	return ret;
}
/*
 * Map @pages into [addr, end) at the PMD level below @pud.
 *
 * Return: 0 on success.  Both a failed PMD allocation and any error from
 * vmap_pages_pte_range() are reported as -ENOMEM.
 */
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	unsigned long boundary;
	pmd_t *pmd = pmd_alloc_track(&init_mm, pud, addr, mask);

	if (!pmd)
		return -ENOMEM;

	for (;;) {
		boundary = pmd_addr_end(addr, end);
		if (vmap_pages_pte_range(pmd, addr, boundary, prot, pages, nr,
					 mask))
			return -ENOMEM;

		pmd++;
		addr = boundary;
		if (addr == end)
			return 0;
	}
}
/*
 * Map @pages into [addr, end) at the PUD level below @p4d.
 *
 * Return: 0 on success.  Both a failed PUD allocation and any error from
 * vmap_pages_pmd_range() are reported as -ENOMEM.
 */
static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	unsigned long boundary;
	pud_t *pud = pud_alloc_track(&init_mm, p4d, addr, mask);

	if (!pud)
		return -ENOMEM;

	for (;;) {
		boundary = pud_addr_end(addr, end);
		if (vmap_pages_pmd_range(pud, addr, boundary, prot, pages, nr,
					 mask))
			return -ENOMEM;

		pud++;
		addr = boundary;
		if (addr == end)
			return 0;
	}
}
/*
 * Map @pages into [addr, end) at the P4D level below @pgd.
 *
 * Return: 0 on success.  Both a failed P4D allocation and any error from
 * vmap_pages_pud_range() are reported as -ENOMEM.
 */
static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	unsigned long boundary;
	p4d_t *p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);

	if (!p4d)
		return -ENOMEM;

	for (;;) {
		boundary = p4d_addr_end(addr, end);
		if (vmap_pages_pud_range(p4d, addr, boundary, prot, pages, nr,
					 mask))
			return -ENOMEM;

		p4d++;
		addr = boundary;
		if (addr == end)
			return 0;
	}
}
/*
 * Map @pages one PTE at a time into [addr, end), without flushing caches.
 *
 * @addr >= @end is a BUG.  If any modified level is part of
 * ARCH_PAGE_TABLE_SYNC_MASK, arch_sync_kernel_mappings() is called for the
 * whole range, including after a failure.
 *
 * Return: 0 on success or the error from vmap_pages_p4d_range().
 */
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
{
	const unsigned long first = addr;
	pgtbl_mod_mask modified = 0;
	unsigned long boundary;
	int mapped = 0;		/* index of the next entry of @pages */
	int ret = 0;
	pgd_t *pgd;

	BUG_ON(addr >= end);

	pgd = pgd_offset_k(addr);
	for (;;) {
		boundary = pgd_addr_end(addr, end);

		if (pgd_bad(*pgd))
			modified |= PGTBL_PGD_MODIFIED;

		ret = vmap_pages_p4d_range(pgd, addr, boundary, prot, pages,
					   &mapped, &modified);
		if (ret)
			break;

		pgd++;
		addr = boundary;
		if (addr == end)
			break;
	}

	if (modified & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(first, end);

	return ret;
}
/*
 * Map @pages into [addr, end) with mappings of size (1 << @page_shift),
 * without flushing caches and without the KMSAN hook.
 *
 * With @page_shift == PAGE_SHIFT, or without CONFIG_HAVE_ARCH_HUGE_VMALLOC,
 * the pages are mapped one by one.  Otherwise every (1 << @page_shift)
 * chunk is mapped from the physical address of its first page (pages[i]).
 *
 * A @page_shift below PAGE_SHIFT triggers a warning and is then treated like
 * PAGE_SHIFT.  Letting it reach the chunked loop would compute
 * "page_shift - PAGE_SHIFT" as a wrapped unsigned value and shift by it,
 * which is undefined.
 *
 * Return: 0 on success or the first error from the mapping helpers.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

	WARN_ON(page_shift < PAGE_SHIFT);

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift <= PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					page_to_phys(pages[i]), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}
/*
 * Map @pages into [addr, end) without flushing caches.
 *
 * kmsan_vmap_pages_range_noflush() runs first; if it fails, its error is
 * returned and the real mapping is not attempted.
 *
 * Return: 0 on success or the first error encountered.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift,
		gfp_t gfp_mask)
{
	int ret;

	ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
					     page_shift, gfp_mask);
	if (!ret)
		ret = __vmap_pages_range_noflush(addr, end, prot, pages,
						 page_shift);

	return ret;
}
/*
 * Map @pages into [addr, end) and then call flush_cache_vmap() on the
 * range.  The flush happens whether or not the mapping succeeded.
 *
 * Return: the result of vmap_pages_range_noflush().
 */
static int __vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift,
		gfp_t gfp_mask)
{
	const int ret = vmap_pages_range_noflush(addr, end, prot, pages,
						 page_shift, gfp_mask);

	flush_cache_vmap(addr, end);

	return ret;
}
/*
 * Map @pages into [addr, end) with mappings of size (1 << @page_shift).
 * Forwards to __vmap_pages_range() with GFP_KERNEL as the allocation mask.
 *
 * Return: 0 on success or a negative error code from the lower layers.
 */
int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL);
}
/*
 * Validate a request to (un)map [start, end) inside a sparse vm area.
 *
 * May sleep.
 *
 * Return: 0 if the request is acceptable; -EINVAL (with a one-time warning)
 * if @area has VM_FLUSH_RESET_PERMS or VM_NO_GUARD set or lacks VM_SPARSE;
 * -E2BIG if the range spans more pages than totalram_pages(); -ERANGE if
 * the range is not fully inside @area.
 */
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
				unsigned long end)
{
	const unsigned long nr_pages = (end - start) >> PAGE_SHIFT;

	might_sleep();

	if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
		return -EINVAL;
	if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
		return -EINVAL;
	if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
		return -EINVAL;

	if (nr_pages > totalram_pages())
		return -E2BIG;

	if (start >= (unsigned long)area->addr &&
	    (void *)end <= area->addr + get_vm_area_size(area))
		return 0;

	return -ERANGE;
}
/**
 * vm_area_map_pages - map pages into a sparse vm area
 * @area: vm area to map into; checked by check_sparse_vm_area()
 * @start: start of the virtual range
 * @end: end of the virtual range (exclusive)
 * @pages: pages to map, using PAGE_KERNEL and PAGE_SHIFT
 *
 * Return: 0 on success, the validation error, or the mapping error.
 */
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
		      unsigned long end, struct page **pages)
{
	int ret = check_sparse_vm_area(area, start, end);

	if (!ret)
		ret = vmap_pages_range(start, end, PAGE_KERNEL, pages,
				       PAGE_SHIFT);

	return ret;
}
/**
 * vm_area_unmap_pages - unmap a range of a sparse vm area
 * @area: vm area to unmap from; checked by check_sparse_vm_area()
 * @start: start of the virtual range
 * @end: end of the virtual range (exclusive)
 *
 * Nothing is unmapped when the validation fails.
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
			 unsigned long end)
{
	if (!check_sparse_vm_area(area, start, end))
		vunmap_range(start, end);
}
/*
 * Test whether @x is a vmalloc address or, when both CONFIG_EXECMEM and
 * MODULES_VADDR are defined, an address in [MODULES_VADDR, MODULES_END).
 *
 * Return: 1 for an address in the module range, otherwise the result of
 * is_vmalloc_addr().
 */
int is_vmalloc_or_module_addr(const void *x)
{
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
	unsigned long untagged = (unsigned long)kasan_reset_tag(x);

	if (untagged >= MODULES_VADDR && untagged < MODULES_END)
		return 1;
#endif

	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
/**
 * vmalloc_to_page - look up the struct page behind a kernel virtual address
 * @vmalloc_addr: address expected to be a vmalloc or module address
 *                (VIRTUAL_BUG_ON() otherwise)
 *
 * Walks the kernel page tables from pgd_offset_k() one level at a time.
 * Leaf entries at the P4D, PUD or PMD level are resolved by adding the page
 * offset of @vmalloc_addr inside that huge mapping.
 *
 * Return: the page, or NULL if a level is empty, an entry is bad (warns
 * once), the PGD is a leaf (warns once), or the final PTE is not present.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
unsigned long addr = (unsigned long) vmalloc_addr;
struct page *page = NULL;
pgd_t *pgd = pgd_offset_k(addr);
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
/* PGD level: a leaf here is unexpected and only warned about. */
if (pgd_none(*pgd))
return NULL;
if (WARN_ON_ONCE(pgd_leaf(*pgd)))
return NULL;
if (WARN_ON_ONCE(pgd_bad(*pgd)))
return NULL;
/* P4D level. */
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
if (p4d_leaf(*p4d))
return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
if (WARN_ON_ONCE(p4d_bad(*p4d)))
return NULL;
/* PUD level. */
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return NULL;
if (pud_leaf(*pud))
return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
if (WARN_ON_ONCE(pud_bad(*pud)))
return NULL;
/* PMD level. */
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return NULL;
if (pmd_leaf(*pmd))
return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
/* PTE level: only a present entry yields a page. */
ptep = pte_offset_kernel(pmd, addr);
pte = ptep_get(ptep);
if (pte_present(pte))
page = pte_page(pte);
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
/**
 * vmalloc_to_pfn - translate a kernel virtual address to a page frame number
 * @vmalloc_addr: address passed on to vmalloc_to_page()
 *
 * NOTE(review): vmalloc_to_page() can return NULL and that result is fed
 * to page_to_pfn() unchecked -- callers presumably only pass mapped
 * addresses; confirm.
 *
 * Return: page_to_pfn() of the page found by vmalloc_to_page().
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
/* Compile-time switches for the self-check code guarded by #if below. */
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
/* Taken by free_vmap_area() around the merge into the free tree and list. */
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* NOTE(review): not referenced in this part of the file. */
static bool vmap_initialized __read_mostly;
/* Slab cache that struct vmap_area objects are allocated from and freed to. */
static struct kmem_cache *vmap_area_cachep;
/* Free areas, kept both as a list and as an augmented rb-tree. */
static LIST_HEAD(free_vmap_area_list);
static struct rb_root free_vmap_area_root = RB_ROOT;
/*
 * Per-CPU spare vmap_area: taken by va_clip() for an NE_FIT_TYPE split and
 * refilled by preload_this_cpu_lock().
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
/* An rb-tree, a list head and a spinlock grouped together. */
struct rb_list {
struct rb_root root;
struct list_head head;
spinlock_t lock;
};
/* Number of pools per node; size_to_va_pool() indexes them by (size - 1) / PAGE_SIZE. */
#define MAX_VA_SIZE_PAGES 256
/* One pool: a list of areas and its length (len is updated with WRITE_ONCE()). */
struct vmap_pool {
struct list_head head;
unsigned long len;
};
/*
 * Per-node state.  "single" is the statically allocated node that
 * vmap_nodes points at initially.
 */
static struct vmap_node {
/* Pools of areas; list updates are done under pool_lock. */
struct vmap_pool pool[MAX_VA_SIZE_PAGES];
spinlock_t pool_lock;
/* NOTE(review): not referenced in this part of the file. */
bool skip_populate;
/* Busy areas; busy.root is searched and modified under busy.lock. */
struct rb_list busy;
/* NOTE(review): lazy, purge_list, purge_work and nr_purged are not used in this part of the file. */
struct rb_list lazy;
struct list_head purge_list;
struct work_struct purge_work;
unsigned long nr_purged;
} single;
/* Node array and its length; by default just the single static node. */
static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
/* Divisor used by addr_to_node_id() before taking the node modulo. */
static __read_mostly unsigned int vmap_zone_size = 1;
/* Iterate @vn over vmap_nodes[0] .. vmap_nodes[nr_vmap_nodes - 1]. */
#define for_each_vmap_node(vn) \
for ((vn) = &vmap_nodes[0]; \
(vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
/*
 * Map an address to a node index: the address is divided by vmap_zone_size
 * and the quotient is reduced modulo nr_vmap_nodes.
 */
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
	unsigned long zone = addr / vmap_zone_size;

	return zone % nr_vmap_nodes;
}
/* Return the vmap_node selected by addr_to_node_id() for @addr. */
static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
	unsigned int id = addr_to_node_id(addr);

	return vmap_nodes + id;
}
/* Return the vmap_node for @id; ids wrap around modulo nr_vmap_nodes. */
static inline struct vmap_node *
id_to_node(unsigned int id)
{
	unsigned int wrapped = id % nr_vmap_nodes;

	return vmap_nodes + wrapped;
}
/*
 * Convert a node pointer back to its index in vmap_nodes.
 *
 * Return: the index, or 0 (after a one-time warning) when @node does not
 * point into the array.
 */
static inline unsigned int
node_to_id(struct vmap_node *node)
{
	const unsigned int id = node - vmap_nodes;

	if (unlikely(id >= nr_vmap_nodes)) {
		WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
		return 0;
	}

	return id;
}
/*
 * Encode a node id as (node_id + 1) shifted left by BITS_PER_BYTE, so that
 * a valid id never encodes to 0.
 *
 * Return: the encoded value, or 0 (after a one-time warning) when @node_id
 * is not below nr_vmap_nodes.
 */
static unsigned int
encode_vn_id(unsigned int node_id)
{
	if (node_id >= nr_vmap_nodes) {
		WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
		return 0;
	}

	return (node_id + 1) << BITS_PER_BYTE;
}
/*
 * Reverse encode_vn_id(): shift @val right by BITS_PER_BYTE and subtract 1.
 *
 * A @val whose shifted part is zero decodes to UINT_MAX and is rejected
 * silently; any other out-of-range id produces a one-time warning.  The id
 * is unsigned, so it is printed with %u, matching encode_vn_id().
 *
 * Return: the node id, or nr_vmap_nodes when no valid id is encoded
 * (a value that is_vn_id_valid() rejects).
 */
static unsigned int
decode_vn_id(unsigned int val)
{
	unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

	if (node_id < nr_vmap_nodes)
		return node_id;

	WARN_ONCE(node_id != UINT_MAX,
		"Decode wrong node id (%u)\n", node_id);

	return nr_vmap_nodes;
}
/* A node id is valid when it is below nr_vmap_nodes. */
static bool
is_vn_id_valid(unsigned int node_id)
{
	return node_id < nr_vmap_nodes;
}
/* Size of @va: the distance from va_start to va_end. */
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
return (va->va_end - va->va_start);
}
/*
 * Return the subtree_max_size stored in the vmap_area that embeds @node,
 * or 0 when rb_entry_safe() yields no area for @node.
 */
static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va = rb_entry_safe(node, struct vmap_area, rb_node);

	if (!va)
		return 0;

	return va->subtree_max_size;
}
/*
 * Augmented rb-tree callbacks for the free tree: subtree_max_size is kept
 * per node and derived from va_size().
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
/* Forward declaration; the definition is outside this part of the file. */
static void reclaim_and_purge_vmap_areas(void);
/* NOTE(review): notifier chain with no users in this part of the file. */
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
/* Work item whose handler is drain_vmap_area_work() (defined elsewhere). */
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
/* Counter returned by vmalloc_nr_pages(). */
static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
/* NOTE(review): not referenced in this part of the file. */
static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
/* Return the current value of the nr_vmalloc_pages counter. */
unsigned long vmalloc_nr_pages(void)
{
return atomic_long_read(&nr_vmalloc_pages);
}
/*
 * Find the vmap_area in @root whose [va_start, va_end) contains @addr.
 * The address is passed through kasan_reset_tag() before the search.
 *
 * Return: the matching area, or NULL if none contains @addr.
 */
static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
	struct rb_node *cur;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	for (cur = root->rb_node; cur; ) {
		struct vmap_area *va = rb_entry(cur, struct vmap_area, rb_node);

		if (addr >= va->va_start && addr < va->va_end)
			return va;

		cur = (addr < va->va_start) ? cur->rb_left : cur->rb_right;
	}

	return NULL;
}
/*
 * Find an area in @root that ends above @addr: either the area that
 * contains @addr or, failing that, the last candidate seen while descending
 * to the left.  The address is passed through kasan_reset_tag() first.
 *
 * Return: the area found, or NULL if every visited area ends at or
 * below @addr.
 */
static struct vmap_area *
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
	struct vmap_area *best = NULL;
	struct rb_node *cur = root->rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	while (cur) {
		struct vmap_area *va = rb_entry(cur, struct vmap_area, rb_node);

		if (va->va_end <= addr) {
			/* Entirely below @addr: only the right side can qualify. */
			cur = cur->rb_right;
			continue;
		}

		best = va;
		if (va->va_start <= addr)
			break;

		cur = cur->rb_left;
	}

	return best;
}
/*
 * Across all nodes, find the busy area with the lowest va_start among those
 * ending above @addr, and return with that area's node locked.
 *
 * On success *@va points to the area and the returned node's busy.lock is
 * held; the caller has to release it.  On failure NULL is returned with no
 * lock held.
 */
static struct vmap_node *
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
unsigned long va_start_lowest;
struct vmap_node *vn;
repeat:
/* 0 doubles as "nothing found yet". */
va_start_lowest = 0;
/* Pass 1: scan every node under its own lock, remembering the lowest start. */
for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
if (*va)
if (!va_start_lowest || (*va)->va_start < va_start_lowest)
va_start_lowest = (*va)->va_start;
spin_unlock(&vn->busy.lock);
}
/*
 * Pass 2: the locks were dropped in between, so look the winner up again
 * under its node lock and start over if it is no longer there.
 */
if (va_start_lowest) {
vn = addr_to_node(va_start_lowest);
spin_lock(&vn->busy.lock);
*va = __find_vmap_area(va_start_lowest, &vn->busy.root);
if (*va)
return vn;
spin_unlock(&vn->busy.lock);
goto repeat;
}
return NULL;
}
/*
 * Find the rb-tree slot where @va has to be linked.
 *
 * The descent starts at @root when it is non-NULL, otherwise at @from.
 * *@parent receives the parent node of the slot (NULL for an empty tree).
 *
 * Return: the address of the child pointer to link @va at, or NULL (after
 * a warning) if @va overlaps an area already in the tree; *@parent is not
 * written in that case.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
struct rb_root *root, struct rb_node *from,
struct rb_node **parent)
{
struct vmap_area *tmp_va;
struct rb_node **link;
if (root) {
link = &root->rb_node;
/* Empty tree: the root pointer itself is the slot. */
if (unlikely(!*link)) {
*parent = NULL;
return link;
}
} else {
/* Start from the local copy of @from; it is replaced on the first step down. */
link = &from;
}
/* Descend until an empty child slot is reached. */
do {
tmp_va = rb_entry(*link, struct vmap_area, rb_node);
if (va->va_end <= tmp_va->va_start)
link = &(*link)->rb_left;
else if (va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
else {
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
return NULL;
}
} while (*link);
/* tmp_va is the last node visited, i.e. the parent of the empty slot. */
*parent = &tmp_va->rb_node;
return link;
}
static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
struct list_head *list;
if (unlikely(!parent))
return NULL;
list = &rb_entry(parent, struct vmap_area, rb_node)->list;
return (&parent->rb_right == link ? list->next : list);
}
/*
 * Link @va into the rb-tree at (@parent, @link) and into the list.
 *
 * With a parent, the list position is derived from it: after the parent's
 * list node for a right-child slot, after the parent's predecessor
 * otherwise.  Without a parent, @va is added right after @head.
 * @augment selects the augmented insert used for the free tree.
 */
static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
struct rb_node *parent, struct rb_node **link,
struct list_head *head, bool augment)
{
if (likely(parent)) {
head = &rb_entry(parent, struct vmap_area, rb_node)->list;
if (&parent->rb_right != link)
head = head->prev;
}
rb_link_node(&va->rb_node, parent, link);
if (augment) {
rb_insert_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
/* Reset here; callers recompute it (see insert_vmap_area_augment()). */
va->subtree_max_size = 0;
} else {
rb_insert_color(&va->rb_node, root);
}
list_add(&va->list, head);
}
/* Link @va into a plain (non-augmented) tree and its list. */
static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
struct rb_node *parent, struct rb_node **link,
struct list_head *head)
{
__link_va(va, root, parent, link, head, false);
}
/* Link @va into an augmented tree and its list. */
static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
struct rb_node *parent, struct rb_node **link,
struct list_head *head)
{
__link_va(va, root, parent, link, head, true);
}
/*
 * Remove @va from the rb-tree @root and from its list.
 *
 * Warns and does nothing if the rb node is already marked empty.
 * Afterwards the list node is re-initialised and the rb node is cleared.
 * @augment selects the augmented erase used for the free tree.
 */
static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
	struct rb_node *node = &va->rb_node;

	if (WARN_ON(RB_EMPTY_NODE(node)))
		return;

	if (!augment)
		rb_erase(node, root);
	else
		rb_erase_augmented(node, root, &free_vmap_area_rb_augment_cb);

	list_del_init(&va->list);
	RB_CLEAR_NODE(node);
}
/* Remove @va from a plain (non-augmented) tree and its list. */
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
__unlink_va(va, root, false);
}
/* Remove @va from an augmented tree and its list. */
static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
__unlink_va(va, root, true);
}
#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Recompute what subtree_max_size should be for @va: the maximum of its
 * own size and the values stored in its two children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
return max3(va_size(va),
get_subtree_max_size(va->rb_node.rb_left),
get_subtree_max_size(va->rb_node.rb_right));
}
/*
 * Debug check: walk the free list and report every area whose stored
 * subtree_max_size differs from the recomputed value.
 */
static void
augment_tree_propagate_check(void)
{
struct vmap_area *va;
unsigned long computed_size;
list_for_each_entry(va, &free_vmap_area_list, list) {
computed_size = compute_subtree_max_size(va);
if (computed_size != va->subtree_max_size)
pr_emerg("tree is corrupted: %lu, %lu\n",
va_size(va), va->subtree_max_size);
}
}
#endif
/*
 * Propagate subtree_max_size upwards starting at @va, using the generated
 * augment callback with a NULL stop node.  With
 * DEBUG_AUGMENT_PROPAGATE_CHECK enabled the whole free list is verified
 * afterwards.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
#if DEBUG_AUGMENT_PROPAGATE_CHECK
augment_tree_propagate_check();
#endif
}
/*
 * Insert @va into the plain tree @root and the list @head.  Nothing is
 * inserted when find_va_links() reports an overlap.
 */
static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node *parent;
	struct rb_node **link = find_va_links(va, root, NULL, &parent);

	if (!link)
		return;

	link_va(va, root, parent, link, head);
}
/*
 * Insert @va into the augmented tree @root and the list @head, then
 * propagate subtree_max_size from it.
 *
 * When @from is non-NULL the slot search starts at that node instead of
 * the root.  Nothing is inserted when find_va_links() reports an overlap.
 */
static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node *parent;
	struct rb_node **link;

	link = from ? find_va_links(va, NULL, from, &parent)
		    : find_va_links(va, root, NULL, &parent);
	if (!link)
		return;

	link_va_augment(va, root, parent, link, head);
	augment_tree_propagate_from(va);
}
/*
 * Put @va into the tree @root and list @head, merging it with an adjacent
 * neighbour on either side when their boundaries touch.
 *
 * When a merge happens, @va itself is freed back to vmap_area_cachep and
 * the surviving neighbour is returned, so callers must use the return
 * value instead of @va afterwards.
 *
 * Return: the area that now covers the range, or NULL if find_va_links()
 * reports an overlap.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head, bool augment)
{
struct vmap_area *sibling;
struct list_head *next;
struct rb_node **link;
struct rb_node *parent;
bool merged = false;
link = find_va_links(va, root, NULL, &parent);
if (!link)
return NULL;
/* NULL means the tree is empty: there is nothing to merge with. */
next = get_va_next_sibling(parent, link);
if (unlikely(next == NULL))
goto insert;
/* Try the following area: @va ends exactly where it starts. */
if (next != head) {
sibling = list_entry(next, struct vmap_area, list);
if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
kmem_cache_free(vmap_area_cachep, va);
va = sibling;
merged = true;
}
}
/* Try the preceding area: it ends exactly where @va starts. */
if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list);
if (sibling->va_end == va->va_start) {
/*
 * After a merge above, va is the following area and is linked in the
 * tree, so it must be unlinked before it is folded into the
 * preceding one and freed.
 */
if (merged)
__unlink_va(va, root, augment);
sibling->va_end = va->va_end;
kmem_cache_free(vmap_area_cachep, va);
va = sibling;
merged = true;
}
}
insert:
if (!merged)
__link_va(va, root, parent, link, head, augment);
return va;
}
/*
 * Merge-or-insert @va into a plain (non-augmented) tree and list.
 * Returns the resulting area or NULL, as __merge_or_add_vmap_area() does.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
return __merge_or_add_vmap_area(va, root, head, false);
}
static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
va = __merge_or_add_vmap_area(va, root, head, true);
if (va)
augment_tree_propagate_from(va);
return va;
}
/*
 * Check whether a request of @size bytes aligned to @align and starting no
 * lower than @vstart fits inside the free area @va.
 *
 * The candidate start is the larger of va->va_start and @vstart, aligned
 * up.  A request is rejected when the candidate end wraps around or the
 * aligned start falls below @vstart (alignment overflow).
 */
static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long base = (va->va_start > vstart) ? va->va_start : vstart;
	unsigned long nva_start_addr = ALIGN(base, align);
	unsigned long nva_end_addr = nva_start_addr + size;

	if (nva_end_addr < nva_start_addr || nva_start_addr < vstart)
		return false;

	return nva_end_addr <= va->va_end;
}
/*
 * Search the augmented free tree @root for an area that can hold @size
 * bytes aligned to @align at or above @vstart, preferring left subtrees.
 *
 * subtree_max_size is used to skip subtrees that cannot contain a large
 * enough area.  With @adjust_search_size the pruning threshold is
 * size + align - 1 instead of size.
 *
 * Return: a matching area or NULL.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
unsigned long align, unsigned long vstart, bool adjust_search_size)
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
node = root->rb_node;
length = adjust_search_size ? size + align - 1 : size;
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
/* Go left while the left subtree may fit and this node starts above @vstart. */
if (get_subtree_max_size(node->rb_left) >= length &&
vstart < va->va_start) {
node = node->rb_left;
} else {
if (is_within_this_va(va, size, align, vstart))
return va;
if (get_subtree_max_size(node->rb_right) >= length) {
node = node->rb_right;
continue;
}
/*
 * Dead end: climb towards the root, testing each ancestor and taking
 * the first right subtree that may still fit.  If the root is passed,
 * node becomes NULL and the outer loop ends as well.
 */
while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node);
if (is_within_this_va(va, size, align, vstart))
return va;
if (get_subtree_max_size(node->rb_right) >= length &&
vstart <= va->va_start) {
/* Raise vstart past this ancestor's start before descending right. */
vstart = va->va_start + 1;
node = node->rb_right;
break;
}
}
}
}
return NULL;
}
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>
/*
 * Reference implementation for the debug check: return the first area on
 * the list @head that passes is_within_this_va(), or NULL.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
list_for_each_entry(va, head, list) {
if (!is_within_this_va(va, size, align, vstart))
continue;
return va;
}
return NULL;
}
/*
 * Debug check: pick a random vstart above VMALLOC_START and report when
 * the tree search and the linear list search disagree.
 */
static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
unsigned int rnd;
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
va_1, va_2, vstart);
}
#endif
/*
 * How a requested range sits inside a free area, as decided by
 * classify_va_fit_type().
 */
enum fit_type {
/* The range is not fully inside the area. */
NOTHING_FIT = 0,
/* The range covers the whole area. */
FL_FIT_TYPE = 1,
/* The range starts at the area's start and ends before its end. */
LE_FIT_TYPE = 2,
/* The range ends at the area's end and starts after its start. */
RE_FIT_TYPE = 3,
/* The range touches neither edge of the area. */
NE_FIT_TYPE = 4
};
static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
unsigned long nva_start_addr, unsigned long size)
{
enum fit_type type;
if (nva_start_addr < va->va_start ||
nva_start_addr + size > va->va_end)
return NOTHING_FIT;
if (va->va_start == nva_start_addr) {
if (va->va_end == nva_start_addr + size)
type = FL_FIT_TYPE;
else
type = LE_FIT_TYPE;
} else if (va->va_end == nva_start_addr + size) {
type = RE_FIT_TYPE;
} else {
type = NE_FIT_TYPE;
}
return type;
}
/*
 * Carve [nva_start_addr, nva_start_addr + size) out of the free area @va.
 *
 * Depending on the fit type, @va is removed and freed, shrunk from one
 * side, or split in two.  A split needs a second vmap_area, taken from the
 * per-CPU ne_fit_preload_node or, failing that, allocated with GFP_NOWAIT.
 *
 * Return: 0 on success, -ENOMEM if the split node cannot be allocated,
 * -EINVAL if the range does not fit inside @va.
 */
static __always_inline int
va_clip(struct rb_root *root, struct list_head *head,
		struct vmap_area *va, unsigned long nva_start_addr,
		unsigned long size)
{
	struct vmap_area *left = NULL;

	switch (classify_va_fit_type(va, nva_start_addr, size)) {
	case FL_FIT_TYPE:
		/* The whole area is consumed: nothing left to propagate. */
		unlink_va_augment(va, root);
		kmem_cache_free(vmap_area_cachep, va);
		return 0;
	case LE_FIT_TYPE:
		va->va_start += size;
		break;
	case RE_FIT_TYPE:
		va->va_end = nva_start_addr;
		break;
	case NE_FIT_TYPE:
		left = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!left)) {
			left = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!left)
				return -ENOMEM;
		}

		/* New node keeps the left remainder, @va keeps the right one. */
		left->va_start = va->va_start;
		left->va_end = nva_start_addr;
		va->va_start = nva_start_addr + size;
		break;
	default:
		return -EINVAL;
	}

	augment_tree_propagate_from(va);
	if (left)
		insert_vmap_area_augment(left, &va->rb_node, root, head);

	return 0;
}
/*
 * Allocate @size bytes aligned to @align from the free area @va, starting
 * no lower than @vstart and ending no higher than @vend.
 *
 * Return: the start address of the allocation, or a negative error code
 * cast to unsigned long: -ERANGE if the range would end above @vend,
 * otherwise the error from va_clip() (which also warns once).
 */
static unsigned long
va_alloc(struct vmap_area *va,
		struct rb_root *root, struct list_head *head,
		unsigned long size, unsigned long align,
		unsigned long vstart, unsigned long vend)
{
	unsigned long base = (va->va_start > vstart) ? va->va_start : vstart;
	unsigned long nva_start_addr = ALIGN(base, align);
	int err;

	if (nva_start_addr + size > vend)
		return -ERANGE;

	err = va_clip(root, head, va, nva_start_addr, size);
	if (WARN_ON_ONCE(err))
		return err;

	return nva_start_addr;
}
/*
 * Find a suitable free area in @root/@head and allocate @size bytes
 * aligned to @align within [vstart, vend) from it.
 *
 * The search size is padded for alignment only when @align exceeds
 * PAGE_SIZE and the request does not span the whole [vstart, vend) window.
 *
 * Return: the start address, -ENOENT (as unsigned long) when no area
 * matches, or the error value from va_alloc().
 */
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
	unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	const bool adjust_search_size =
		align > PAGE_SIZE && (vend - vstart) != size;
	unsigned long nva_start_addr;
	struct vmap_area *va;

	va = find_vmap_lowest_match(root, size, align, vstart,
				    adjust_search_size);
	if (unlikely(!va))
		return -ENOENT;

	nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	if (!IS_ERR_VALUE(nva_start_addr))
		find_vmap_lowest_match_check(root, head, size, align);
#endif

	return nva_start_addr;
}
/*
 * Release a busy vmap area: unlink it from the busy tree of the node that
 * owns its start address, then insert or merge it into the global free
 * tree/list.
 */
static void free_vmap_area(struct vmap_area *va)
{
	struct vmap_node *node = addr_to_node(va->va_start);

	/* Step 1: drop it from the node's busy tree. */
	spin_lock(&node->busy.lock);
	unlink_va(va, &node->busy.root);
	spin_unlock(&node->busy.lock);

	/* Step 2: give the range back to the global free space. */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
				       &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}
/*
 * Make sure this CPU's ne_fit_preload_node slot holds a spare vmap_area
 * and take @lock.
 *
 * The spare object is allocated before the lock is taken. The function
 * returns with @lock held; the caller is responsible for releasing it.
 */
static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
	struct vmap_area *spare = NULL;
	struct vmap_area *expected;

	/* Allocate only when the per-CPU slot currently looks empty. */
	if (!this_cpu_read(ne_fit_preload_node))
		spare = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(lock);

	if (!spare)
		return;

	/* Install the spare if the slot is still NULL, otherwise drop it. */
	expected = NULL;
	if (!__this_cpu_try_cmpxchg(ne_fit_preload_node, &expected, spare))
		kmem_cache_free(vmap_area_cachep, spare);
}
/*
 * Map a size in bytes to the matching per-node pool.
 *
 * The pool index is (size - 1) / PAGE_SIZE. Returns NULL when that index
 * is not below MAX_VA_SIZE_PAGES.
 */
static struct vmap_pool *
size_to_va_pool(struct vmap_node *vn, unsigned long size)
{
	unsigned int slot = (size - 1) / PAGE_SIZE;

	return slot < MAX_VA_SIZE_PAGES ? &vn->pool[slot] : NULL;
}
static bool
node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
{
struct vmap_pool *vp;
vp = size_to_va_pool(n, va_size(va));
if (!vp)
return false;
spin_lock(&n->pool_lock);
list_add(&va->list, &vp->head);
WRITE_ONCE(vp->len, vp->len + 1);
spin_unlock(&n->pool_lock);
return true;
}
static struct vmap_area *
node_pool_del_va(struct vmap_node *vn, unsigned long size,
unsigned long align, unsigned long vstart,
unsigned long vend)
{
struct vmap_area *va = NULL;
struct vmap_pool *vp;
int err = 0;
vp = size_to_va_pool(vn, size);
if (!vp || list_empty(&vp->head))
return NULL;
spin_lock(&vn->pool_lock);
if (!list_empty(&vp->head)) {
va = list_first_entry(&vp->head, struct vmap_area, list);
if (IS_ALIGNED(va->va_start, align)) {
err |= (va_size(va) != size);
err |= (va->va_start < vstart);
err |= (va->va_end > vend);
if (!WARN_ON_ONCE(err)) {
list_del_init(&va->list);
WRITE_ONCE(vp->len, vp->len - 1);
} else {
va = NULL;
}
} else {
list_move_tail(&va->list, &vp->head);
va = NULL;
}
}
spin_unlock(&vn->pool_lock);
return va;
}
/*
 * Try to satisfy a request from the pool of the node selected by the
 * current CPU.
 *
 * @addr:  set to the start of the returned area, or to -EINVAL otherwise.
 * @vn_id: set to 0 when the pools are bypassed, otherwise to the encoded
 *         id of the selected node (also when the pool had nothing to give).
 *
 * The pools are bypassed when the window is not exactly
 * [VMALLOC_START, VMALLOC_END] or when there is a single vmap node.
 *
 * Returns the area taken from the pool, or NULL.
 */
static struct vmap_area *
node_alloc(unsigned long size, unsigned long align,
		unsigned long vstart, unsigned long vend,
		unsigned long *addr, unsigned int *vn_id)
{
	struct vmap_area *va;
	unsigned int id;

	*vn_id = 0;
	*addr = -EINVAL;

	if (vstart != VMALLOC_START || vend != VMALLOC_END ||
			nr_vmap_nodes == 1)
		return NULL;

	id = raw_smp_processor_id() % nr_vmap_nodes;
	va = node_pool_del_va(id_to_node(id), size, align, vstart, vend);
	*vn_id = encode_vn_id(id);

	if (!va)
		return NULL;

	*addr = va->va_start;
	return va;
}
/*
 * Fill in @vm from @va and link the two together.
 *
 * Both vm->size and vm->requested_size are set to the size of @va.
 */
static inline void setup_vmalloc_vm(struct vm_struct *vm,
	struct vmap_area *va, unsigned long flags, const void *caller)
{
	vm->flags = flags;
	vm->addr = (void *)va->va_start;
	vm->requested_size = va_size(va);
	vm->size = vm->requested_size;
	vm->caller = caller;

	/* Back-pointer from the area to its descriptor. */
	va->vm = vm;
}
/*
 * Allocate a region of kernel virtual address space of @size bytes,
 * aligned to @align, inside [@vstart, @vend], and insert it into the busy
 * tree of the node that owns its start address.
 *
 * @size:     must be non-zero and a multiple of the page size.
 * @align:    must be a power of two.
 * @node:     NUMA node passed to the slab allocation of the descriptor.
 * @gfp_mask: only the GFP_RECLAIM_MASK bits are used.
 * @va_flags: OR-ed with the node id returned by node_alloc() and stored
 *            in va->flags.
 * @vm:       optional vm_struct; when non-NULL its addr/size are filled in
 *            and it is linked to the new area.
 *
 * Returns the new vmap_area, or an ERR_PTR():
 *   -EINVAL  bad @size or @align,
 *   -EBUSY   vmap not initialized yet, or no address range could be found,
 *   -ENOMEM  the descriptor could not be allocated,
 *   or the error returned by kasan_populate_vmalloc().
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
unsigned long va_flags, struct vm_struct *vm)
{
struct vmap_node *vn;
struct vmap_area *va;
unsigned long freed;
unsigned long addr;
unsigned int vn_id;
bool allow_block;
int purged = 0;
int ret;
if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
return ERR_PTR(-EINVAL);
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
/* Only the reclaim-related GFP bits are kept. */
gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
allow_block = gfpflags_allow_blocking(gfp_mask);
might_sleep_if(allow_block);
/*
 * Fast path: try the per-node pool. On a miss, "addr" holds an error
 * value and a fresh descriptor is allocated for the slow path below.
 */
va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
if (!va) {
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
}
retry:
/*
 * Slow path: allocate from the global free tree. The lock is taken
 * inside preload_this_cpu_lock() and released here.
 */
if (IS_ERR_VALUE(addr)) {
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
if (allow_block)
cond_resched();
}
trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
/*
 * Still no address: try to reclaim if the caller may block, otherwise
 * fail straight away.
 */
if (IS_ERR_VALUE(addr)) {
if (allow_block)
goto overflow;
goto out_free_va;
}
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
va->flags = (va_flags | vn_id);
if (vm) {
vm->addr = (void *)va->va_start;
vm->size = va_size(va);
va->vm = vm;
}
/* Publish the area in the busy tree of the node owning its address. */
vn = addr_to_node(va->va_start);
spin_lock(&vn->busy.lock);
insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
spin_unlock(&vn->busy.lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size, gfp_mask);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
}
return va;
overflow:
/* First attempt after a failure: purge lazily freed areas and retry. */
if (!purged) {
reclaim_and_purge_vmap_areas();
purged = 1;
goto retry;
}
/*
 * Ask registered notifiers to release address space. If any reports
 * progress through "freed", allow another purge + retry round.
 */
freed = 0;
blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
if (freed > 0) {
purged = 0;
goto retry;
}
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
size, vstart, vend);
out_free_va:
kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
/*
 * Add @nb to vmap_notify_list, the blocking notifier chain that
 * alloc_vmap_area() invokes when it cannot find address space.
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
/*
 * Remove @nb from the vmap_notify_list notifier chain.
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
/*
 * Threshold, in pages, for lazily freed areas: 32MB worth of pages
 * multiplied by fls() of the number of online CPUs.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int scale = fls(num_online_cpus());

	return scale * (32UL * 1024 * 1024 / PAGE_SIZE);
}
/* Mutex that __purge_vmap_area_lazy() asserts is held by its caller. */
static DEFINE_MUTEX(vmap_purge_lock);
/* Forward declaration; the definition appears further down in this file. */
static void purge_fragmented_blocks_allcpus(void);
static void
reclaim_list_global(struct list_head *head)
{
struct vmap_area *va, *n;
if (list_empty(head))
return;
spin_lock(&free_vmap_area_lock);
list_for_each_entry_safe(va, n, head, list)
merge_or_add_vmap_area_augment(va,
&free_vmap_area_root, &free_vmap_area_list);
spin_unlock(&free_vmap_area_lock);
}
/*
 * Shrink the per-size pools of @vn and hand the removed areas back to the
 * global free space.
 *
 * @vn:         node whose pools are decayed.
 * @full_decay: when true every pooled area is removed; otherwise a
 *              quarter (len >> 2) of each pool is removed.
 */
static void
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
LIST_HEAD(decay_list);
struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
unsigned long n_decay, pool_len;
int i;
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
LIST_HEAD(tmp_list);
if (list_empty(&vn->pool[i].head))
continue;
/* Detach the whole pool onto a private list under the lock. */
spin_lock(&vn->pool_lock);
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
pool_len = n_decay = vn->pool[i].len;
WRITE_ONCE(vn->pool[i].len, 0);
/* Partial decay removes only a quarter of the entries. */
if (!full_decay)
n_decay >>= 2;
/* pool_len now holds the number of entries that stay pooled. */
pool_len -= n_decay;
/* Move the first n_decay entries into a local tree/list, merging them. */
list_for_each_entry_safe(va, nva, &tmp_list, list) {
if (!n_decay--)
break;
list_del_init(&va->list);
merge_or_add_vmap_area(va, &decay_root, &decay_list);
}
/* Re-attach whatever was not decayed and restore the length. */
if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
/* Give the decayed (already merged) areas back to the global heap. */
reclaim_list_global(&decay_list);
}
/* Number of areas processed between forced cond_resched() calls below. */
#define KASAN_RELEASE_BATCH_SIZE 32
/*
 * Call kasan_release_vmalloc() for every area on @vn->purge_list, then
 * once more over the whole span with KASAN_VMALLOC_TLB_FLUSH.
 *
 * The span is taken from the first entry's va_start and the last entry's
 * va_end.
 * NOTE(review): this presumes purge_list is non-empty and ordered by
 * address - confirm against the callers.
 */
static void
kasan_release_vmalloc_node(struct vmap_node *vn)
{
struct vmap_area *va;
unsigned long start, end;
unsigned int batch_count = 0;
start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
list_for_each_entry(va, &vn->purge_list, list) {
/* Areas outside vmalloc/module space are skipped. */
if (is_vmalloc_or_module_addr((void *) va->va_start))
kasan_release_vmalloc(va->va_start, va->va_end,
va->va_start, va->va_end,
KASAN_VMALLOC_PAGE_RANGE);
/*
 * Yield when a reschedule is pending or after a full batch. The
 * counter is only incremented when need_resched() is false.
 */
if (need_resched() || (++batch_count >= KASAN_RELEASE_BATCH_SIZE)) {
cond_resched();
batch_count = 0;
}
}
kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
}
static void purge_vmap_node(struct work_struct *work)
{
struct vmap_node *vn = container_of(work,
struct vmap_node, purge_work);
unsigned long nr_purged_pages = 0;
struct vmap_area *va, *n_va;
LIST_HEAD(local_list);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
kasan_release_vmalloc_node(vn);
vn->nr_purged = 0;
list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
unsigned long nr = va_size(va) >> PAGE_SHIFT;
unsigned int vn_id = decode_vn_id(va->flags);
list_del_init(&va->list);
nr_purged_pages += nr;
vn->nr_purged++;
if (is_vn_id_valid(vn_id) && !vn->skip_populate)
if (node_pool_add_va(vn, va))
continue;
list_add(&va->list, &local_list);
}
atomic_long_sub(nr_purged_pages, &vmap_lazy_nr);
reclaim_list_global(&local_list);
}
/*
 * Purge all lazily freed areas of every vmap node.
 *
 * @start, @end:     initial bounds of the TLB flush range; widened to cover
 *                   every area that is purged.
 * @full_pool_decay: passed to decay_va_pool_node(); also stored in
 *                   vn->skip_populate so purged areas are not put back into
 *                   the node pools.
 *
 * The caller must hold vmap_purge_lock.
 *
 * Returns true when at least one area was purged.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
bool full_pool_decay)
{
unsigned long nr_purged_areas = 0;
unsigned int nr_purge_helpers;
/* Static: one bit per vmap node that has something to purge. */
static cpumask_t purge_nodes;
unsigned int nr_purge_nodes;
struct vmap_node *vn;
int i;
lockdep_assert_held(&vmap_purge_lock);
purge_nodes = CPU_MASK_NONE;
for_each_vmap_node(vn) {
INIT_LIST_HEAD(&vn->purge_list);
vn->skip_populate = full_pool_decay;
decay_va_pool_node(vn, full_pool_decay);
if (RB_EMPTY_ROOT(&vn->lazy.root))
continue;
/* Detach the node's lazy tree/list onto its purge_list. */
spin_lock(&vn->lazy.lock);
WRITE_ONCE(vn->lazy.root.rb_node, NULL);
list_replace_init(&vn->lazy.head, &vn->purge_list);
spin_unlock(&vn->lazy.lock);
start = min(start, list_first_entry(&vn->purge_list,
struct vmap_area, list)->va_start);
end = max(end, list_last_entry(&vn->purge_list,
struct vmap_area, list)->va_end);
cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes);
if (nr_purge_nodes > 0) {
flush_tlb_kernel_range(start, end);
/*
 * Number of nodes handed to workqueue helpers: derived from the lazy
 * backlog, clamped to [1, nr_purge_nodes], minus one for the current
 * context.
 */
nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;
for_each_cpu(i, &purge_nodes) {
vn = &vmap_nodes[i];
if (nr_purge_helpers > 0) {
INIT_WORK(&vn->purge_work, purge_vmap_node);
if (cpumask_test_cpu(i, cpu_online_mask))
schedule_work_on(i, &vn->purge_work);
else
schedule_work(&vn->purge_work);
nr_purge_helpers--;
} else {
/* A NULL func marks this node as purged inline, not by a worker. */
vn->purge_work.func = NULL;
purge_vmap_node(&vn->purge_work);
nr_purged_areas += vn->nr_purged;
}
}
/* Wait for the helpers and collect their counts. */
for_each_cpu(i, &purge_nodes) {
vn = &vmap_nodes[i];
if (vn->purge_work.func) {
flush_work(&vn->purge_work);
nr_purged_areas += vn->nr_purged;
}
}
}
trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
return nr_purged_areas > 0;
}
/*
 * Under vmap_purge_lock, purge the fragmented vmap blocks of all CPUs and
 * then all lazily freed areas, with full pool decay.
 */
static void reclaim_and_purge_vmap_areas(void)
{
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
/* ULONG_MAX/0 is an empty flush range that the purge widens as needed. */
__purge_vmap_area_lazy(ULONG_MAX, 0, true);
mutex_unlock(&vmap_purge_lock);
}
/*
 * Work handler: purge lazily freed areas under vmap_purge_lock, with
 * partial (not full) pool decay. @work is not used.
 */
static void drain_vmap_area_work(struct work_struct *work)
{
mutex_lock(&vmap_purge_lock);
__purge_vmap_area_lazy(ULONG_MAX, 0, false);
mutex_unlock(&vmap_purge_lock);
}
/*
 * Queue @va on a node's lazy tree/list instead of freeing it right away.
 *
 * The area goes to the node encoded in va->flags when that id is valid,
 * otherwise to the node that owns its start address. When the global
 * count of lazy pages exceeds lazy_max_pages(), drain_vmap_work is
 * scheduled.
 *
 * @va must not be linked on any list; otherwise a one-time warning is
 * emitted and nothing is done.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy_max = lazy_max_pages();
	unsigned long va_start = va->va_start;
	unsigned int vn_id = decode_vn_id(va->flags);
	unsigned long nr_pages, nr_lazy;
	struct vmap_node *vn;

	if (WARN_ON_ONCE(!list_empty(&va->list)))
		return;

	nr_pages = va_size(va) >> PAGE_SHIFT;
	nr_lazy = atomic_long_add_return_relaxed(nr_pages, &vmap_lazy_nr);

	/* Prefer the node recorded at allocation time. */
	if (is_vn_id_valid(vn_id))
		vn = id_to_node(vn_id);
	else
		vn = addr_to_node(va->va_start);

	spin_lock(&vn->lazy.lock);
	insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
	spin_unlock(&vn->lazy.lock);

	/* va_start was saved up front; @va is not touched past this point. */
	trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);

	if (unlikely(nr_lazy > nr_lazy_max))
		schedule_work(&drain_vmap_work);
}
/*
 * Unmap the range covered by @va and queue the area for lazy freeing.
 * The TLB is flushed immediately only when debug_pagealloc is enabled.
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	unsigned long start = va->va_start;
	unsigned long end = va->va_end;

	flush_cache_vunmap(start, end);
	vunmap_range_noflush(start, end);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(start, end);

	free_vmap_area_noflush(va);
}
/*
 * Look up the busy area containing @addr.
 *
 * The search starts at the node that @addr maps to and then walks all
 * other nodes in decreasing index order (with wrap-around) until it is
 * back at the starting node.
 *
 * Returns the area, or NULL when none is found or vmap is not yet
 * initialized.
 */
struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;
	struct vmap_node *vn;
	int first, idx;

	if (unlikely(!vmap_initialized))
		return NULL;

	first = idx = addr_to_node_id(addr);
	do {
		vn = &vmap_nodes[idx];

		spin_lock(&vn->busy.lock);
		va = __find_vmap_area(addr, &vn->busy.root);
		spin_unlock(&vn->busy.lock);

		if (va)
			return va;

		idx = (idx + nr_vmap_nodes - 1) % nr_vmap_nodes;
	} while (idx != first);

	return NULL;
}
/*
 * Look up the busy area containing @addr and unlink it from its node's
 * busy tree while still holding that node's lock.
 *
 * Nodes are visited in the same order as in find_vmap_area(). Returns the
 * unlinked area, or NULL when no node has a matching area.
 */
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
	struct vmap_area *va;
	struct vmap_node *vn;
	int first, idx;

	first = idx = addr_to_node_id(addr);
	do {
		vn = &vmap_nodes[idx];

		spin_lock(&vn->busy.lock);
		va = __find_vmap_area(addr, &vn->busy.root);
		if (va)
			unlink_va(va, &vn->busy.root);
		spin_unlock(&vn->busy.lock);

		if (va)
			return va;

		idx = (idx + nr_vmap_nodes - 1) % nr_vmap_nodes;
	} while (idx != first);

	return NULL;
}
/*
 * Nominal vmalloc space size (128MB on 32-bit, 128GB otherwise); it only
 * feeds VMALLOC_PAGES, which is used to size VMAP_BBMAP_BITS below.
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE (128UL*1024*1024)
#else
#define VMALLOC_SPACE (128UL*1024*1024*1024)
#endif
#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
/* Largest request, in pages, that vb_alloc()/vb_free() accept. */
#define VMAP_MAX_ALLOC BITS_PER_LONG
/* Upper and lower clamp for the number of pages tracked per vmap block. */
#define VMAP_BBMAP_BITS_MAX 1024
#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
/* Plain macro min/max: each argument may be evaluated more than once. */
#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y))
#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y))
/*
 * Pages per vmap block: VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16,
 * clamped to [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
 */
#define VMAP_BBMAP_BITS \
VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
/* Size of one vmap block in bytes. */
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
/* Non-forced purging requires fewer free pages than this in the block. */
#define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4)
/* Flag bits passed as va_flags to alloc_vmap_area(); mask covers both. */
#define VMAP_RAM 0x1
#define VMAP_BLOCK 0x2
#define VMAP_FLAGS_MASK 0x3
/* Per-CPU bookkeeping for vmap blocks. */
struct vmap_block_queue {
/* Taken around additions to and removals from the "free" list. */
spinlock_t lock;
/* Blocks with free space, linked through vmap_block.free_list. */
struct list_head free;
/* Blocks indexed by addr_to_vb_idx(); selected via addr_to_vb_xa(). */
struct xarray vmap_blocks;
};
/* One VMAP_BLOCK_SIZE region that is sub-allocated by vb_alloc(). */
struct vmap_block {
/* Protects the counters, bitmap and dirty range below. */
spinlock_t lock;
/* The address range backing this block. */
struct vmap_area *va;
/* Page counts: still allocatable / freed by vb_free(). */
unsigned long free, dirty;
/* One bit per page currently handed out. */
DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
/* Page-index range freed since it was last reset by _vm_unmap_aliases(). */
unsigned long dirty_min, dirty_max;
/* Link in vmap_block_queue.free. */
struct list_head free_list;
/* Used by kfree_rcu() in free_vmap_block(). */
struct rcu_head rcu_head;
/* Link on a local purge list (see purge_fragmented_block()). */
struct list_head purge;
/* CPU whose queue the block was added to in new_vmap_block(). */
unsigned int cpu;
};
/* One vmap_block_queue per CPU. */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
 * Pick the per-CPU xarray that holds the vmap block containing @addr.
 *
 * The CPU index is the block number modulo nr_cpu_ids; when that CPU is
 * not possible, the next possible CPU from cpu_possible_mask is used.
 */
static struct xarray *
addr_to_vb_xa(unsigned long addr)
{
	int cpu = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;

	if (!cpu_possible(cpu))
		cpu = cpumask_next(cpu, cpu_possible_mask);

	return &per_cpu(vmap_block_queue, cpu).vmap_blocks;
}
/*
 * Convert @addr into a vmap block index: the number of whole blocks
 * between VMALLOC_START (rounded down to a block boundary) and @addr.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
	unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	unsigned long offset = addr - base;

	return offset / VMAP_BLOCK_SIZE;
}
/*
 * Return the address @pages_off pages into the block starting at
 * @va_start. BUGs if the result falls into a different vmap block.
 */
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long vaddr = va_start + (pages_off << PAGE_SHIFT);

	BUG_ON(addr_to_vb_idx(vaddr) != addr_to_vb_idx(va_start));

	return (void *)vaddr;
}
/*
 * Allocate a new vmap block and reserve its first (1 << @order) pages.
 *
 * @order:    order of the allocation that is carved out immediately.
 * @gfp_mask: flags for the block descriptor, the address range and the
 *            xarray insertion.
 *
 * Returns the virtual address of the start of the block, or an ERR_PTR()
 * (-ENOMEM, the alloc_vmap_area() error, or the xa_insert() error).
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
struct xarray *xa;
unsigned long vb_idx;
int node, err;
void *vaddr;
node = numa_node_id();
vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node);
if (unlikely(!vb))
return ERR_PTR(-ENOMEM);
/* The backing range is block-sized and block-aligned. */
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
VMAP_RAM|VMAP_BLOCK, NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
}
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
/* The initial allocation must leave at least one page free. */
BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
/* Empty dirty range: min above max. */
vb->dirty_min = VMAP_BBMAP_BITS;
vb->dirty_max = 0;
bitmap_set(vb->used_map, 0, (1UL << order));
INIT_LIST_HEAD(&vb->free_list);
vb->cpu = raw_smp_processor_id();
/* Make the block findable by address before queueing it as free. */
xa = addr_to_vb_xa(va->va_start);
vb_idx = addr_to_vb_idx(va->va_start);
err = xa_insert(xa, vb_idx, vb, gfp_mask);
if (err) {
kfree(vb);
free_vmap_area(va);
return ERR_PTR(err);
}
/* Queue on the CPU recorded in vb->cpu, not necessarily the current one. */
vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
return vaddr;
}
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_node *vn;
struct vmap_block *tmp;
struct xarray *xa;
xa = addr_to_vb_xa(vb->va->va_start);
tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
vn = addr_to_node(vb->va->va_start);
spin_lock(&vn->busy.lock);
unlink_va(vb->va, &vn->busy.root);
spin_unlock(&vn->busy.lock);
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
}
/*
 * Move @vb onto @purge_list if it is purgeable.
 *
 * A block qualifies when every page is either free or dirty, but not all
 * pages are dirty. Unless @force_purge is set it must also have fewer
 * than VMAP_PURGE_THRESHOLD free pages.
 *
 * On success the block is marked fully dirty with no free pages, removed
 * from its CPU's free list and appended to @purge_list.
 *
 * Returns true when the block was queued for purging.
 */
static bool purge_fragmented_block(struct vmap_block *vb,
		struct list_head *purge_list, bool force_purge)
{
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);

	/* Still has pages in use, or is already fully dirty. */
	if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
	    vb->dirty == VMAP_BBMAP_BITS)
		return false;

	/* Leave usable blocks alone unless the caller insists. */
	if (!force_purge && vb->free >= VMAP_PURGE_THRESHOLD)
		return false;

	/* No free pages left to hand out ... */
	WRITE_ONCE(vb->free, 0);
	/* ... and fully dirty, so it no longer qualifies above. */
	WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
	vb->dirty_min = 0;
	vb->dirty_max = VMAP_BBMAP_BITS;

	spin_lock(&vbq->lock);
	list_del_rcu(&vb->free_list);
	spin_unlock(&vbq->lock);

	list_add_tail(&vb->purge, purge_list);
	return true;
}
/* Free every block that was collected on @purge_list. */
static void free_purged_blocks(struct list_head *purge_list)
{
	struct vmap_block *block, *next;

	list_for_each_entry_safe(block, next, purge_list, purge) {
		list_del(&block->purge);
		free_vmap_block(block);
	}
}
/*
 * Force-purge the purgeable blocks on the free list of @cpu.
 *
 * The list is walked under RCU with an unlocked pre-check; the decision
 * is re-made under vb->lock by purge_fragmented_block(). Collected blocks
 * are freed after leaving the RCU read section.
 */
static void purge_fragmented_blocks(int cpu)
{
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
	struct vmap_block *vb;
	LIST_HEAD(purge);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long nr_free = READ_ONCE(vb->free);
		unsigned long nr_dirty = READ_ONCE(vb->dirty);
		bool candidate = nr_free + nr_dirty == VMAP_BBMAP_BITS &&
				 nr_dirty != VMAP_BBMAP_BITS;

		if (!candidate)
			continue;

		spin_lock(&vb->lock);
		purge_fragmented_block(vb, &purge, true);
		spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	free_purged_blocks(&purge);
}
static void purge_fragmented_blocks_allcpus(void)
{
int cpu;
for_each_possible_cpu(cpu)
purge_fragmented_blocks(cpu);
}
/*
 * Allocate @size bytes of virtual address space from a vmap block.
 *
 * @size:     page-aligned, non-zero, at most PAGE_SIZE * VMAP_MAX_ALLOC.
 * @gfp_mask: used only when a new block has to be created.
 *
 * Returns the allocated address, ERR_PTR(-EINVAL) for a zero size, or the
 * ERR_PTR() from new_vmap_block().
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
void *vaddr = NULL;
unsigned int order;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
if (WARN_ON(size == 0)) {
return ERR_PTR(-EINVAL);
}
order = get_order(size);
rcu_read_lock();
/* Search the free list of the CPU we are currently running on. */
vbq = raw_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
/* Unlocked pre-check, repeated under vb->lock below. */
if (READ_ONCE(vb->free) < (1UL << order))
continue;
spin_lock(&vb->lock);
if (vb->free < (1UL << order)) {
spin_unlock(&vb->lock);
continue;
}
/* Space is handed out linearly: the next offset follows the used part. */
pages_off = VMAP_BBMAP_BITS - vb->free;
vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
WRITE_ONCE(vb->free, vb->free - (1UL << order));
bitmap_set(vb->used_map, pages_off, (1UL << order));
/* A block with nothing left leaves the free list. */
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
}
spin_unlock(&vb->lock);
break;
}
rcu_read_unlock();
/* No existing block could satisfy the request: create one. */
if (!vaddr)
vaddr = new_vmap_block(order, gfp_mask);
return vaddr;
}
/*
 * Release a range previously obtained from a vmap block.
 *
 * @addr: start of the range.
 * @size: page-aligned size, at most PAGE_SIZE * VMAP_MAX_ALLOC.
 *
 * The range is unmapped and accounted as dirty in its block; the block is
 * freed once all of its pages are dirty.
 */
static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
unsigned int order;
struct vmap_block *vb;
struct xarray *xa;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
flush_cache_vunmap(addr, addr + size);
order = get_order(size);
/* Page offset of @addr inside its block. */
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
xa = addr_to_vb_xa(addr);
/*
 * NOTE(review): the lookup result is dereferenced without a NULL check;
 * presumably @addr always belongs to a live block - confirm with callers.
 */
vb = xa_load(xa, addr_to_vb_idx(addr));
spin_lock(&vb->lock);
bitmap_clear(vb->used_map, offset, (1UL << order));
spin_unlock(&vb->lock);
/* Unmapping is done with vb->lock dropped. */
vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
/* Grow the dirty range to include the pages just released. */
vb->dirty_min = min(vb->dirty_min, offset);
vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
if (vb->dirty == VMAP_BBMAP_BITS) {
BUG_ON(vb->free);
spin_unlock(&vb->lock);
free_vmap_block(vb);
} else
spin_unlock(&vb->lock);
}
/*
 * Purge fragmented vmap blocks and lazily freed areas, and flush the TLB
 * for dirty vmap-block ranges.
 *
 * @start, @end: initial flush range, widened by every dirty range found.
 * @flush:       non-zero to request a TLB flush even when no dirty range
 *               is found.
 *
 * Does nothing before vmap is initialized. Takes vmap_purge_lock.
 */
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
LIST_HEAD(purge_list);
int cpu;
if (unlikely(!vmap_initialized))
return;
mutex_lock(&vmap_purge_lock);
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
unsigned long idx;
rcu_read_lock();
xa_for_each(&vbq->vmap_blocks, idx, vb) {
spin_lock(&vb->lock);
/*
 * Purge the block if it qualifies (not forced). Otherwise, if it
 * has a pending dirty range, fold that range into [start, end).
 */
if (!purge_fragmented_block(vb, &purge_list, false) &&
vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
s = va_start + (vb->dirty_min << PAGE_SHIFT);
e = va_start + (vb->dirty_max << PAGE_SHIFT);
start = min(s, start);
end = max(e, end);
/* Reset to an empty range so it is not accounted twice. */
vb->dirty_min = VMAP_BBMAP_BITS;
vb->dirty_max = 0;
flush = 1;
}
spin_unlock(&vb->lock);
}
rcu_read_unlock();
}
free_purged_blocks(&purge_list);
/*
 * When the lazy purge reports purged areas the explicit flush here is
 * skipped; otherwise flush only if requested or a dirty range was found.
 */
if (!__purge_vmap_area_lazy(start, end, false) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
/*
 * Run _vm_unmap_aliases() with an empty initial range (ULONG_MAX, 0) and
 * no forced flush.
 */
void vm_unmap_aliases(void)
{
_vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/*
 * Unmap a range set up by vm_map_ram().
 *
 * @mem:   start address; must be non-NULL, page-aligned and within
 *         [VMALLOC_START, VMALLOC_END], otherwise this BUGs.
 * @count: number of pages.
 *
 * Ranges of up to VMAP_MAX_ALLOC pages are released through vb_free();
 * larger ones are looked up as a whole area and freed. May sleep.
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr = (unsigned long)kasan_reset_tag(mem);
	struct vmap_area *va;

	might_sleep();
	BUG_ON(!addr);
	BUG_ON(addr < VMALLOC_START);
	BUG_ON(addr > VMALLOC_END);
	BUG_ON(!PAGE_ALIGNED(addr));

	kasan_poison_vmalloc(mem, size);

	if (unlikely(count > VMAP_MAX_ALLOC)) {
		/* Large mapping: it owns a vmap area of its own. */
		va = find_unlink_vmap_area(addr);
		if (WARN_ON_ONCE(!va))
			return;

		debug_check_no_locks_freed((void *)va->va_start, va_size(va));
		free_unmap_vmap_area(va);
		return;
	}

	/* Small mapping: it lives inside a vmap block. */
	debug_check_no_locks_freed(mem, size);
	vb_free(addr, size);
}
EXPORT_SYMBOL(vm_unmap_ram);
/*
 * Map @count pages into contiguous kernel virtual address space.
 *
 * @pages: array of pages to map.
 * @count: number of pages.
 * @node:  NUMA node used for the area descriptor on the large path.
 *
 * Requests of up to VMAP_MAX_ALLOC pages come from vmap blocks; larger
 * ones get a dedicated vmap area. Returns the mapped address (as returned
 * by kasan_unpoison_vmalloc()), or NULL on failure.
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr;
	void *mem;

	if (unlikely(count > VMAP_MAX_ALLOC)) {
		struct vmap_area *va;

		va = alloc_vmap_area(size, PAGE_SIZE,
				VMALLOC_START, VMALLOC_END,
				node, GFP_KERNEL, VMAP_RAM,
				NULL);
		if (IS_ERR(va))
			return NULL;

		addr = va->va_start;
		mem = (void *)addr;
	} else {
		mem = vb_alloc(size, GFP_KERNEL);
		if (IS_ERR(mem))
			return NULL;

		addr = (unsigned long)mem;
	}

	if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
				pages, PAGE_SHIFT) < 0) {
		/* Undo the address-space reservation made above. */
		vm_unmap_ram(mem, count);
		return NULL;
	}

	/* The pages are mapped now; mark the range accessible for KASAN. */
	mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);

	return mem;
}
EXPORT_SYMBOL(vm_map_ram);
/*
 * Init-time singly linked list of early vm areas; filled by
 * vm_area_add_early() and vm_area_register_early().
 */
static struct vm_struct *vmlist __initdata;
/*
 * Return the page order recorded in @vm, or 0 when
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC is not set.
 */
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
return vm->page_order;
#else
return 0;
#endif
}
/* Non-static wrapper around vm_area_page_order(). */
unsigned int get_vm_area_page_order(struct vm_struct *vm)
{
return vm_area_page_order(vm);
}
/*
 * Record @order in @vm. Without CONFIG_HAVE_ARCH_HUGE_VMALLOC nothing is
 * stored and any non-zero @order is a BUG.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
vm->page_order = order;
#else
BUG_ON(order != 0);
#endif
}
/*
 * Insert @vm, whose address and size are already set, into the early
 * vmlist, keeping the list sorted by address.
 *
 * Must run before vmap is initialized. Any overlap with an existing
 * entry is a BUG.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
	struct vm_struct **link = &vmlist;
	struct vm_struct *cur;

	BUG_ON(vmap_initialized);

	while ((cur = *link) != NULL) {
		if (cur->addr >= vm->addr) {
			/* @vm goes in front of @cur and must end before it. */
			BUG_ON(cur->addr < vm->addr + vm->size);
			break;
		}

		/* @cur lies below @vm and must end before @vm starts. */
		BUG_ON(cur->addr + cur->size > vm->addr);
		link = &cur->next;
	}

	vm->next = *link;
	*link = vm;
}
/*
 * Find room for @vm in the early vmlist, set vm->addr and insert it.
 *
 * The search starts at VMALLOC_START aligned to @align and takes the
 * first gap before an existing entry that is at least vm->size bytes,
 * or the space after the last entry. BUGs when the chosen address would
 * make the area end past VMALLOC_END, or when vmap is already
 * initialized.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
	unsigned long candidate = ALIGN(VMALLOC_START, align);
	struct vm_struct **link = &vmlist;
	struct vm_struct *cur;

	BUG_ON(vmap_initialized);

	while ((cur = *link) != NULL) {
		/* The gap in front of @cur is large enough: insert here. */
		if ((unsigned long)cur->addr - candidate >= vm->size)
			break;

		/* Otherwise continue right after @cur. */
		candidate = ALIGN((unsigned long)cur->addr + cur->size, align);
		link = &cur->next;
	}

	BUG_ON(candidate > VMALLOC_END - vm->size);

	vm->addr = (void *)candidate;
	vm->next = *link;
	*link = vm;

	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
/*
 * Clear VM_UNINITIALIZED in @vm->flags. The write barrier orders earlier
 * stores before the flag update.
 */
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
}
/*
 * Allocate a vm_struct and reserve address space for it.
 *
 * @size:     requested size in bytes; rounded up to (1 << @shift).
 * @align:    alignment; overridden for VM_IOREMAP by a value derived from
 *            the size, clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER] bits.
 * @shift:    log2 of the size granularity.
 * @flags:    VM_* flags stored in the area.
 * @start, @end: address window to allocate from.
 * @node:     NUMA node for the descriptor allocations.
 * @gfp_mask: allocation flags.
 * @caller:   stored in the area.
 *
 * Unless VM_NO_GUARD is set, one extra page is added to the reserved
 * range. Must not be called from interrupt context.
 *
 * Returns the new area, or NULL on failure or when the rounded size is 0.
 */
struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long shift, unsigned long flags,
		unsigned long start, unsigned long end, int node,
		gfp_t gfp_mask, const void *caller)
{
	unsigned long requested_size = size;
	struct vm_struct *area;
	struct vmap_area *va;

	BUG_ON(in_interrupt());

	size = ALIGN(size, 1ul << shift);
	if (unlikely(!size))
		return NULL;

	if (flags & VM_IOREMAP) {
		int order = clamp_t(int, get_count_order_long(size),
				    PAGE_SHIFT, IOREMAP_MAX_ORDER);

		align = 1ul << order;
	}

	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	/* Guard page at the end of the range. */
	if (!(flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	area->flags = flags;
	area->caller = caller;
	area->requested_size = requested_size;

	/* alloc_vmap_area() fills in area->addr and area->size. */
	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	/* Areas without VM_ALLOC are unpoisoned for KASAN right here. */
	if (!(flags & VM_ALLOC))
		area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
						    KASAN_VMALLOC_PROT_NORMAL);

	return area;
}
/*
 * Reserve a vm area inside [@start, @end] on behalf of @caller, with
 * alignment 1, page granularity, no NUMA preference and GFP_KERNEL.
 *
 * Returns the area, or NULL on failure.
 */
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
/*
 * Reserve a vm area anywhere in [VMALLOC_START, VMALLOC_END], recording
 * this function's return address as the caller.
 *
 * Returns the area, or NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
/*
 * Same as get_vm_area(), but with an explicit @caller to record.
 *
 * Returns the area, or NULL on failure.
 */
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
/*
 * Return the vm_struct linked to the busy area containing @addr, or NULL
 * when no such area exists. The result can also be NULL for an area that
 * has no vm_struct attached.
 */
struct vm_struct *find_vm_area(const void *addr)
{
	struct vmap_area *va = find_vmap_area((unsigned long)addr);

	return va ? va->vm : NULL;
}
/*
 * Find the vm area at @addr, unlink it, unmap it and queue its address
 * range for lazy freeing.
 *
 * @addr must be page-aligned; otherwise a warning is printed and NULL is
 * returned. May sleep.
 *
 * Returns the vm_struct that described the area (not freed here), or
 * NULL when nothing was found or the area has no vm_struct.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
	struct vmap_area *va;
	struct vm_struct *vm;

	might_sleep();

	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
			addr))
		return NULL;

	va = find_unlink_vmap_area((unsigned long)addr);
	if (!va)
		return NULL;

	vm = va->vm;
	if (!vm)
		return NULL;

	debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
	debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
	kasan_free_module_shadow(vm);
	kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));

	free_unmap_vmap_area(va);
	return vm;
}
/*
 * Call @set_direct_map on every page of @area for which page_address()
 * returns a non-NULL address. Return values of the callback are ignored.
 */
static inline void set_area_direct_map(const struct vm_struct *area,
				       int (*set_direct_map)(struct page *page))
{
	int idx;

	for (idx = 0; idx < area->nr_pages; idx++) {
		struct page *page = area->pages[idx];

		if (!page_address(page))
			continue;

		set_direct_map(page);
	}
}
/*
 * Reset the direct-map permissions of the pages backing @area.
 *
 * The address range covered by the pages' direct mappings is collected
 * first so that the flush done by _vm_unmap_aliases() includes it. The
 * direct map entries are then invalidated, the flush is performed, and
 * the entries are restored to their defaults.
 */
static void vm_reset_perms(struct vm_struct *area)
{
	unsigned int page_order = vm_area_page_order(area);
	unsigned int step = 1U << page_order;
	unsigned long start = ULONG_MAX, end = 0;
	int flush_dmap = 0;
	int i;

	/* Visit the first small page of each mapped chunk. */
	for (i = 0; i < area->nr_pages; i += step) {
		unsigned long addr = (unsigned long)page_address(area->pages[i]);
		unsigned long chunk_size = PAGE_SIZE << page_order;

		if (!addr)
			continue;

		start = min(addr, start);
		end = max(addr + chunk_size, end);
		flush_dmap = 1;
	}

	set_area_direct_map(area, set_direct_map_invalid_noflush);
	_vm_unmap_aliases(start, end, flush_dmap);
	set_area_direct_map(area, set_direct_map_default_noflush);
}
/*
 * Work handler: take everything queued on this vfree_deferred list and
 * vfree() each entry. The list node lives at the start of the memory
 * being freed, so the node pointer is the address to free.
 */
static void delayed_vfree_work(struct work_struct *w)
{
	struct vfree_deferred *deferred = container_of(w, struct vfree_deferred, wq);
	struct llist_node *batch = llist_del_all(&deferred->list);
	struct llist_node *node, *next;

	llist_for_each_safe(node, next, batch)
		vfree(node);
}
/*
 * Queue @addr for freeing from a workqueue instead of freeing it here.
 *
 * The memory itself is used as the list node. The work item is scheduled
 * only when llist_add() reports that the list was empty before. A NULL
 * @addr is ignored. Must not be called from NMI context.
 */
void vfree_atomic(const void *addr)
{
	struct vfree_deferred *deferred = raw_cpu_ptr(&vfree_deferred);

	BUG_ON(in_nmi());
	kmemleak_free(addr);

	if (!addr)
		return;

	if (llist_add((struct llist_node *)addr, &deferred->list))
		schedule_work(&deferred->wq);
}
/*
 * Release the vm area at @addr together with the pages recorded in it.
 *
 * In interrupt context the work is deferred through vfree_atomic().
 * Otherwise the function may sleep. A NULL @addr is a no-op; an address
 * with no matching vm area triggers a warning.
 */
void vfree(const void *addr)
{
struct vm_struct *vm;
int i;
if (unlikely(in_interrupt())) {
vfree_atomic(addr);
return;
}
BUG_ON(in_nmi());
kmemleak_free(addr);
might_sleep();
if (!addr)
return;
vm = remove_vm_area(addr);
if (unlikely(!vm)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
/* memcg accounting is adjusted through the first page only. */
if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
/* Every entry is freed as an individual order-0 page. */
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
BUG_ON(!page);
__free_page(page);
cond_resched();
}
/* VM_MAP_PUT_PAGES areas are excluded from the nr_vmalloc_pages counter. */
if (!(vm->flags & VM_MAP_PUT_PAGES))
atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
kvfree(vm->pages);
kfree(vm);
}
EXPORT_SYMBOL(vfree);
/*
 * Remove the mapping at @addr and free its vm_struct. The pages that
 * were mapped are not freed here.
 *
 * Must not be called from interrupt context; may sleep. A NULL @addr is
 * a no-op; an address with no matching vm area triggers a warning.
 */
void vunmap(const void *addr)
{
	struct vm_struct *vm;

	BUG_ON(in_interrupt());
	might_sleep();

	if (!addr)
		return;

	vm = remove_vm_area(addr);
	if (likely(vm)) {
		kfree(vm);
		return;
	}

	WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
			addr);
}
EXPORT_SYMBOL(vunmap);
/*
 * Map an array of pages into contiguous kernel virtual address space.
 *
 * @pages: pages to map.
 * @count: number of pages; requests above totalram_pages() are refused.
 * @flags: vm area flags. VM_FLUSH_RESET_PERMS is rejected with a warning;
 *         VM_NO_GUARD is stripped with a warning. With VM_MAP_PUT_PAGES
 *         the @pages array and @count are recorded in the area.
 * @prot:  page protection; mapped with pgprot_nx() applied.
 *
 * May sleep. Returns the mapped address, or NULL on failure.
 */
void *vmap(struct page **pages, unsigned int count,
	   unsigned long flags, pgprot_t prot)
{
	struct vm_struct *area;
	unsigned long start;
	unsigned long nbytes;

	might_sleep();

	if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
		return NULL;

	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
		flags &= ~VM_NO_GUARD;

	if (count > totalram_pages())
		return NULL;

	nbytes = (unsigned long)count << PAGE_SHIFT;
	area = get_vm_area_caller(nbytes, flags, __builtin_return_address(0));
	if (!area)
		return NULL;

	start = (unsigned long)area->addr;
	if (vmap_pages_range(start, start + nbytes, pgprot_nx(prot),
				pages, PAGE_SHIFT) < 0) {
		vunmap(area->addr);
		return NULL;
	}

	if (flags & VM_MAP_PUT_PAGES) {
		area->pages = pages;
		area->nr_pages = count;
	}

	return area->addr;
}
EXPORT_SYMBOL(vmap);
#ifdef CONFIG_VMAP_PFN
/* Cursor handed to vmap_pfn_apply() through apply_to_page_range(). */
struct vmap_pfn_data {
/* Array of page frame numbers to map. */
unsigned long *pfns;
/* Protection used for every PTE. */
pgprot_t prot;
/* Index of the next entry of pfns to map. */
unsigned int idx;
};
/*
 * apply_to_page_range() callback: install a special PTE for the next pfn
 * of the vmap_pfn_data cursor in @private, then advance the cursor.
 *
 * Returns 0 on success, or -EINVAL (with a one-time warning) when
 * pfn_valid() is true for the pfn.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
	struct vmap_pfn_data *cursor = private;
	unsigned long pfn = cursor->pfns[cursor->idx];
	pte_t entry;

	if (WARN_ON_ONCE(pfn_valid(pfn)))
		return -EINVAL;

	entry = pte_mkspecial(pfn_pte(pfn, cursor->prot));
	set_pte_at(&init_mm, addr, pte, entry);

	cursor->idx++;
	return 0;
}
/*
 * Map an array of page frame numbers into contiguous kernel virtual
 * address space, using a VM_IOREMAP area.
 *
 * @pfns:  page frame numbers; each is rejected by vmap_pfn_apply() if
 *         pfn_valid() is true for it.
 * @count: number of entries in @pfns.
 * @prot:  page protection; used with pgprot_nx() applied.
 *
 * Returns the mapped address, or NULL on failure.
 */
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
	struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
	struct vm_struct *area;
	unsigned long start;

	area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
			__builtin_return_address(0));
	if (!area)
		return NULL;

	start = (unsigned long)area->addr;
	if (apply_to_page_range(&init_mm, start,
			count * PAGE_SIZE, vmap_pfn_apply, &data)) {
		free_vm_area(area);
		return NULL;
	}

	flush_cache_vmap(start, start + count * PAGE_SIZE);

	return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif
/*
 * Derive the GFP flags for a page allocation attempt: __GFP_NOWARN is
 * always added, and __GFP_NOFAIL is removed when @large is true.
 */
static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large)
{
	gfp_t adjusted = flags | __GFP_NOWARN;

	return large ? (adjusted & ~__GFP_NOFAIL) : adjusted;
}
/*
 * Fill @pages with up to @nr_pages small pages.
 *
 * @gfp:      allocation flags.
 * @nid:      NUMA node, or NUMA_NO_NODE.
 * @order:    order of the chunks the caller maps; pages are allocated in
 *            chunks of at least this order and split into small pages.
 * @nr_pages: number of small pages wanted.
 * @pages:    output array with room for @nr_pages entries.
 *
 * Three phases are used: opportunistic allocations of orders above
 * @order, the bulk allocator (order 0 only), and finally one allocation
 * of @order at a time.
 *
 * Returns the number of entries of @pages that were populated, which may
 * be less than @nr_pages.
 */
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
unsigned int nr_remaining = nr_pages;
unsigned int max_attempt_order = MAX_PAGE_ORDER;
struct page *page;
int i;
unsigned int large_order = ilog2(nr_remaining);
/*
 * Flags for the opportunistic phase: direct reclaim is masked out.
 * large_order is passed as the "large" bool, so it is true when non-zero.
 */
gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM;
large_order = min(max_attempt_order, large_order);
/*
 * Phase 1: try the largest order that fits the remaining count, capped
 * by max_attempt_order. A failure lowers the cap for later attempts.
 */
while (large_order > order && nr_remaining) {
if (nid == NUMA_NO_NODE)
page = alloc_pages_noprof(large_gfp, large_order);
else
page = alloc_pages_node_noprof(nid, large_gfp, large_order);
if (unlikely(!page)) {
max_attempt_order = --large_order;
continue;
}
/* Split so each small page can be tracked on its own. */
split_page(page, large_order);
for (i = 0; i < (1U << large_order); i++)
pages[nr_allocated + i] = page + i;
nr_allocated += 1U << large_order;
nr_remaining = nr_pages - nr_allocated;
large_order = ilog2(nr_remaining);
large_order = min(max_attempt_order, large_order);
}
/* Phase 2 (order 0 only): bulk allocation in requests of at most 100. */
if (!order) {
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
nr_pages_request = min(100U, nr_pages - nr_allocated);
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
nr = alloc_pages_bulk_mempolicy_noprof(gfp,
nr_pages_request,
pages + nr_allocated);
else
nr = alloc_pages_bulk_node_noprof(gfp, nid,
nr_pages_request,
pages + nr_allocated);
nr_allocated += nr;
/* A short result ends the bulk phase; phase 3 takes over. */
if (nr != nr_pages_request)
break;
}
}
/* Phase 3: one allocation of @order at a time for whatever is missing. */
while (nr_allocated < nr_pages) {
/* Give up on a fatal signal unless the caller asked for NOFAIL. */
if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)
page = alloc_pages_noprof(gfp, order);
else
page = alloc_pages_node_noprof(nid, gfp, order);
if (unlikely(!page))
break;
if (order)
split_page(page, order);
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
nr_allocated += 1U << order;
}
return nr_allocated;
}
/* vm areas queued by defer_vm_area_cleanup(), linked through llnode. */
static LLIST_HEAD(pending_vm_area_cleanup);

/*
 * Work handler: release every area queued on pending_vm_area_cleanup.
 * Areas with a pages array are released with vfree(); areas without one
 * with free_vm_area().
 */
static void cleanup_vm_area_work(struct work_struct *work)
{
	struct llist_node *batch = llist_del_all(&pending_vm_area_cleanup);
	struct vm_struct *area, *next;

	if (!batch)
		return;

	llist_for_each_entry_safe(area, next, batch, llnode) {
		if (area->pages)
			vfree(area->addr);
		else
			free_vm_area(area);
	}
}
static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
static void defer_vm_area_cleanup(struct vm_struct *area)
{
if (llist_add(&area->llnode, &pending_vm_area_cleanup))
schedule_work(&cleanup_vm_area);
}
/*
 * Enter the memalloc scope that corresponds to @gfp_mask:
 *   - blocking not allowed:              noreclaim scope,
 *   - __GFP_IO set but __GFP_FS clear:   nofs scope,
 *   - both __GFP_FS and __GFP_IO clear:  noio scope.
 *
 * Returns the value from the matching memalloc_*_save() call, to be
 * passed to memalloc_restore_scope(), or 0 when no scope was entered.
 */
unsigned int
memalloc_apply_gfp_scope(gfp_t gfp_mask)
{
	const gfp_t fs_io = gfp_mask & (__GFP_FS | __GFP_IO);

	if (!gfpflags_allow_blocking(gfp_mask))
		return memalloc_noreclaim_save();

	if (fs_io == __GFP_IO)
		return memalloc_nofs_save();

	if (fs_io == 0)
		return memalloc_noio_save();

	return 0;
}
/*
 * Leave the scope entered by memalloc_apply_gfp_scope(). A @flags value
 * of 0 means no scope was entered and nothing is restored.
 */
void
memalloc_restore_scope(unsigned int flags)
{
	if (!flags)
		return;

	memalloc_flags_restore(flags);
}
/*
 * Back @area with physical pages and map them at area->addr.
 *
 * @area:       vm_struct whose address range is to be populated
 * @gfp_mask:   allocation flags supplied by the caller
 * @prot:       page protection used for the mapping
 * @page_shift: shift of the mapping granule; the page order recorded in
 *              @area is page_shift - PAGE_SHIFT
 * @node:       node handed to the page-array and page allocators
 *
 * Returns area->addr on success. On every failure the area is queued
 * with defer_vm_area_cleanup() and NULL is returned.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, unsigned int page_shift,
int node)
{
/* Reclaim-related bits of the caller's mask plus zeroing. */
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
bool nofail = gfp_mask & __GFP_NOFAIL;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
unsigned int flags;
int ret;
/* One struct page pointer per PAGE_SIZE page of the area. */
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
/* The nofail retry loop below is not used when the mask cannot block. */
if (!gfpflags_allow_blocking(gfp_mask))
nofail = false;
/* Highmem pages are acceptable unless a DMA zone was requested. */
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
/*
 * A page array larger than one page is itself obtained from vmalloc
 * (a nested call); smaller arrays come from kmalloc.
 */
if (array_size > PAGE_SIZE) {
area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
area->caller);
} else {
area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
}
if (!area->pages) {
/*
 * NOTE(review): the message reads "failed to allocated"; the wording
 * is runtime output and is deliberately left untouched here.
 */
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
goto fail;
}
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
page_order = vm_area_page_order(area);
/* nr_pages records how many pages were actually obtained. */
area->nr_pages = vm_area_alloc_pages(
vmalloc_gfp_adjust(gfp_mask, page_order), node,
page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
/* The memcg statistic is charged through the first page only. */
if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
area->nr_pages);
/* Fewer pages than requested: give up on this attempt. */
if (area->nr_pages != nr_small_pages) {
/*
 * Warn only for order-0 attempts with no fatal signal pending;
 * __vmalloc_node_range_noprof() retries a failed higher-order
 * attempt at PAGE_SHIFT.
 */
if (!fatal_signal_pending(current) && page_order == 0)
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocate pages",
area->nr_pages * PAGE_SIZE);
goto fail;
}
/* Apply the scope derived from gfp_mask around the mapping step. */
flags = memalloc_apply_gfp_scope(gfp_mask);
do {
ret = __vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift, nested_gfp);
/* With nofail, sleep briefly and retry until mapping succeeds. */
if (nofail && (ret < 0))
schedule_timeout_uninterruptible(1);
} while (nofail && (ret < 0));
memalloc_restore_scope(flags);
if (ret < 0) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
}
return area->addr;
fail:
/* Teardown of the partially set up area is left to the worker. */
defer_vm_area_cleanup(area);
return NULL;
}
/*
 * GFP bits accepted by the vmalloc entry points that sanitize their mask
 * (__vmalloc_noprof() and vmalloc_huge_node_noprof()). Any bit outside
 * this set is stripped, with a one-time warning, by vmalloc_fix_flags().
 */
#define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\
__GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\
GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
GFP_USER | __GFP_NOLOCKDEP)
/*
 * Reduce @flags to the bits listed in GFP_VMALLOC_SUPPORTED and emit a
 * one-time warning that shows both the rejected bits and the resulting
 * mask. Returns the reduced mask.
 */
static gfp_t vmalloc_fix_flags(gfp_t flags)
{
	gfp_t unsupported = flags & ~GFP_VMALLOC_SUPPORTED;
	gfp_t fixed = flags & GFP_VMALLOC_SUPPORTED;

	WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
		  unsupported, &unsupported, fixed, &fixed);

	return fixed;
}
/*
 * __vmalloc_node_range_noprof - reserve a vm area and back it with pages
 * @size:     requested size in bytes; 0 triggers a one-time warning and
 *            returns NULL
 * @align:    requested alignment
 * @start:    passed through to __get_vm_area_node()
 * @end:      passed through to __get_vm_area_node()
 * @gfp_mask: allocation flags
 * @prot:     page protection for the mapping
 * @vm_flags: extra vm_struct flags, OR-ed with VM_ALLOC | VM_UNINITIALIZED
 * @node:     node for the allocations
 * @caller:   recorded as the caller of the area
 *
 * When huge mappings are allowed and requested, a larger mapping shift is
 * tried first; if that attempt fails the whole operation is retried with
 * PAGE_SHIFT and the original alignment.
 *
 * Returns the address of the area, or NULL on failure.
 */
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
struct vm_struct *area;
void *ret;
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
/* Kept so that the PAGE_SHIFT fallback can undo the huge alignment. */
unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
if (WARN_ON_ONCE(!size))
return NULL;
/* Refuse requests that need more pages than totalram_pages(). */
if ((size >> PAGE_SHIFT) > totalram_pages()) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, exceeds total pages",
size);
return NULL;
}
/* Choose a larger mapping shift when the caller opted in. */
if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
shift = PMD_SHIFT;
else
shift = arch_vmap_pte_supported_shift(size);
/* The area must be aligned to at least the mapping granule. */
align = max(original_align, 1UL << shift);
}
again:
area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed%s",
size, (nofail) ? ". Retrying." : "");
/* __GFP_NOFAIL: sleep briefly and try again with the same shift. */
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
}
goto fail;
}
/* Extra preparation applies only to PAGE_KERNEL mappings. */
if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
if (kasan_hw_tags_enabled()) {
/* Switch to the tagged protection before anything is mapped. */
prot = arch_vmap_pgprot_tagged(prot);
/*
 * Ask the page allocator to skip KASAN handling and zeroing;
 * KASAN_VMALLOC_INIT may be requested further down instead.
 */
gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
}
kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
}
/* Allocate the pages and map them into the area. */
ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!ret)
goto fail;
kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
/* Request initialisation from KASAN only when zeroing was skipped. */
if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* The returned pointer replaces area->addr. */
area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/* The area is fully set up; drop VM_UNINITIALIZED. */
clear_vm_uninitialized_flag(area);
if (!(vm_flags & VM_DEFER_KMEMLEAK))
kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);
return area->addr;
fail:
/* A failed larger-shift attempt is retried once with PAGE_SHIFT. */
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
align = original_align;
goto again;
}
return NULL;
}
/*
 * __vmalloc_node_noprof - allocate from the VMALLOC_START..VMALLOC_END range
 * @size:     requested size in bytes
 * @align:    requested alignment
 * @gfp_mask: allocation flags
 * @node:     node for the allocation
 * @caller:   recorded as the caller of the area
 *
 * Uses PAGE_KERNEL protection and passes no extra vm flags.
 * Returns whatever __vmalloc_node_range_noprof() returns.
 */
void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, int node, const void *caller)
{
	const unsigned long vm_flags = 0;

	return __vmalloc_node_range_noprof(size, align,
					   VMALLOC_START, VMALLOC_END,
					   gfp_mask, PAGE_KERNEL, vm_flags,
					   node, caller);
}
#ifdef CONFIG_TEST_VMALLOC_MODULE
EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
#endif
/*
 * __vmalloc_noprof - vmalloc with a caller-supplied GFP mask
 * @size:     requested size in bytes
 * @gfp_mask: allocation flags; bits outside GFP_VMALLOC_SUPPORTED are
 *            stripped by vmalloc_fix_flags()
 *
 * Allocates on any node (NUMA_NO_NODE) with alignment 1 and records the
 * return address of this function as the caller.
 */
void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
	const void *caller = __builtin_return_address(0);

	if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
		gfp_mask = vmalloc_fix_flags(gfp_mask);

	return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__vmalloc_noprof);
/*
 * vmalloc_noprof - GFP_KERNEL vmalloc on any node
 * @size: requested size in bytes
 *
 * Alignment is 1 and the return address of this function is recorded as
 * the caller.
 */
void *vmalloc_noprof(unsigned long size)
{
	const void *caller = __builtin_return_address(0);

	return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vmalloc_noprof);
/*
 * vmalloc_huge_node_noprof - vmalloc that permits huge mappings
 * @size:     requested size in bytes
 * @gfp_mask: allocation flags; bits outside GFP_VMALLOC_SUPPORTED are
 *            stripped by vmalloc_fix_flags()
 * @node:     node for the allocation
 *
 * Passes VM_ALLOW_HUGE_VMAP so that __vmalloc_node_range_noprof() may
 * pick a mapping shift larger than PAGE_SHIFT.
 */
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
	const void *caller = __builtin_return_address(0);

	if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
		gfp_mask = vmalloc_fix_flags(gfp_mask);

	return __vmalloc_node_range_noprof(size, 1,
					   VMALLOC_START, VMALLOC_END,
					   gfp_mask, PAGE_KERNEL,
					   VM_ALLOW_HUGE_VMAP, node, caller);
}
EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
void *vzalloc_noprof(unsigned long size)
{
return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_noprof);
void *vmalloc_user_noprof(unsigned long size)
{
return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
VM_USERMAP, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user_noprof);
/*
 * vmalloc_node_noprof - GFP_KERNEL vmalloc on a given node
 * @size: requested size in bytes
 * @node: node for the allocation
 *
 * Alignment is 1 and the return address of this function is recorded as
 * the caller.
 */
void *vmalloc_node_noprof(unsigned long size, int node)
{
	const void *caller = __builtin_return_address(0);

	return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, caller);
}
EXPORT_SYMBOL(vmalloc_node_noprof);
void *vzalloc_node_noprof(unsigned long size, int node)
{
return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node_noprof);
/*
 * vrealloc_node_align_noprof - resize a vmalloc allocation
 * @p:     existing allocation, or NULL to allocate a fresh one
 * @size:  new requested size; 0 frees @p and returns NULL
 * @align: requested alignment; an existing @p that is not already aligned
 *         to it is rejected with a warning
 * @flags: allocation flags
 * @nid:   node for a new allocation
 *
 * Shrinking, and growing within the space already backing the area,
 * happen in place and return @p. Otherwise a new area is allocated, the
 * old requested size is copied over and @p is freed.
 *
 * Returns the resulting pointer, or NULL. When NULL is returned because a
 * check or the new allocation failed, @p has not been freed.
 */
void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
gfp_t flags, int nid)
{
struct vm_struct *vm = NULL;
size_t alloced_size = 0;
size_t old_size = 0;
void *n;
if (!size) {
vfree(p);
return NULL;
}
if (p) {
vm = find_vm_area(p);
if (unlikely(!vm)) {
WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
return NULL;
}
/* Size backing the area versus size last requested by the user. */
alloced_size = get_vm_area_size(vm);
old_size = vm->requested_size;
if (WARN(alloced_size < old_size,
"vrealloc() has mismatched area vs requested sizes (%p)\n", p))
return NULL;
if (WARN(!IS_ALIGNED((unsigned long)p, align),
"will not reallocate with a bigger alignment (0x%lx)\n", align))
return NULL;
/*
 * With __GFP_THISNODE and an explicit node, an area whose first page
 * lives on a different node is always reallocated.
 */
if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
nid != page_to_nid(vmalloc_to_page(p)))
goto need_realloc;
}
/*
 * Shrink in place. Not reachable with p == NULL: old_size is then 0
 * and size is non-zero.
 */
if (size <= old_size) {
/* Clear the bytes being given up when either init option is on. */
if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
vm->requested_size = size;
kasan_vrealloc(p, old_size, size);
return (void *)p;
}
/* Grow in place: the area already spans enough bytes. */
if (size <= alloced_size) {
vm->requested_size = size;
kasan_vrealloc(p, old_size, size);
return (void *)p;
}
need_realloc:
n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));
if (!n)
return NULL;
if (p) {
/* Only the previously requested bytes are carried over. */
memcpy(n, p, old_size);
vfree(p);
}
return n;
}
EXPORT_SYMBOL(vrealloc_node_align_noprof);
/*
 * GFP mask used by vmalloc_32_noprof() and vmalloc_32_user_noprof().
 * 64-bit builds use GFP_DMA32 when CONFIG_ZONE_DMA32 is set, otherwise
 * GFP_DMA when CONFIG_ZONE_DMA is set; every other configuration uses
 * GFP_DMA32.
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
/*
 * vmalloc_32_noprof - vmalloc using the GFP_VMALLOC32 mask
 * @size: requested size in bytes
 *
 * Allocates on any node with alignment 1 and records the return address
 * of this function as the caller.
 */
void *vmalloc_32_noprof(unsigned long size)
{
	const void *caller = __builtin_return_address(0);

	return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
				     caller);
}
EXPORT_SYMBOL(vmalloc_32_noprof);
void *vmalloc_32_user_noprof(unsigned long size)
{
return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
VM_USERMAP, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user_noprof);
/*
 * Feed @count zero bytes into @iter, at most PAGE_SIZE per step, sourced
 * from ZERO_PAGE(0). Stops at the first short copy.
 *
 * Returns the number of bytes actually delivered.
 */
static size_t zero_iter(struct iov_iter *iter, size_t count)
{
	size_t done = 0;

	while (done < count) {
		size_t chunk = min_t(size_t, count - done, PAGE_SIZE);
		size_t got = copy_page_to_iter_nofault(ZERO_PAGE(0), 0,
						       chunk, iter);

		done += got;

		/* Short copy: the iterator cannot take any more. */
		if (got < chunk)
			break;
	}

	return done;
}
/*
 * Copy @count bytes starting at vmalloc address @addr into @iter, one
 * page (or page fragment) at a time. Addresses for which
 * vmalloc_to_page() returns no page are delivered as zeroes.
 *
 * Returns the number of bytes delivered; a short copy ends the loop.
 */
static size_t aligned_vread_iter(struct iov_iter *iter,
				 const char *addr, size_t count)
{
	size_t left;

	for (left = count; left > 0; ) {
		unsigned long in_page = offset_in_page(addr);
		unsigned long want = PAGE_SIZE - in_page;
		struct page *pg;
		size_t got;

		/* Never read past the end of the request. */
		if (want > left)
			want = left;

		pg = vmalloc_to_page(addr);
		if (pg)
			got = copy_page_to_iter_nofault(pg, in_page, want, iter);
		else
			got = zero_iter(iter, want);

		addr += got;
		left -= got;

		if (got != want)
			break;
	}

	return count - left;
}
/*
 * Read @count bytes at @addr from an area whose vmap_area carries
 * VMAP_RAM, delivering them to @iter.
 *
 * @flags holds the va flags masked by the caller. Without VMAP_BLOCK the
 * range is read directly. With VMAP_BLOCK the owning vmap_block is looked
 * up and only the ranges set in its used_map are read; gaps before a used
 * range, and whatever remains after the last one, are zero-filled.
 *
 * Returns the number of bytes delivered to @iter.
 */
static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
size_t count, unsigned long flags)
{
char *start;
struct vmap_block *vb;
struct xarray *xa;
unsigned long offset;
unsigned int rs, re;
size_t remains, n;
/* Not managed through a vmap_block: plain page-by-page read. */
if (!(flags & VMAP_BLOCK))
return aligned_vread_iter(iter, addr, count);
remains = count;
xa = addr_to_vb_xa((unsigned long) addr);
vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
/* No block found: the whole request is zero-filled. */
if (!vb)
goto finished_zero;
spin_lock(&vb->lock);
/* No used ranges at all: drop the lock and zero-fill. */
if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
spin_unlock(&vb->lock);
goto finished_zero;
}
for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
size_t copied;
if (remains == 0)
goto finished;
/* Address at which this used range begins. */
start = vmap_block_vaddr(vb->va->va_start, rs);
/* Zero-fill the gap between the read position and the used range. */
if (addr < start) {
size_t to_zero = min_t(size_t, start - addr, remains);
size_t zeroed = zero_iter(iter, to_zero);
addr += zeroed;
remains -= zeroed;
if (remains == 0 || zeroed != to_zero)
goto finished;
}
/* The read may begin part-way into a page. */
offset = offset_in_page(addr);
/*
 * NOTE(review): whether `re` is inclusive or exclusive is decided by
 * for_each_set_bitrange(); confirm that the "+ 1" matches it.
 */
n = ((re - rs + 1) << PAGE_SHIFT) - offset;
if (n > remains)
n = remains;
copied = aligned_vread_iter(iter, start + offset, n);
addr += copied;
remains -= copied;
if (copied != n)
goto finished;
}
spin_unlock(&vb->lock);
finished_zero:
/* Reached without vb->lock held: zero-fill whatever is left. */
return count - remains + zero_iter(iter, remains);
finished:
/* Reached with vb->lock held: done, or a copy came up short. */
spin_unlock(&vb->lock);
return count - remains;
}
/*
 * vread_iter - read vmalloc-space memory into an iterator
 * @iter:  destination iterator
 * @addr:  start address
 * @count: number of bytes requested
 *
 * Walks the busy vmap areas at or above @addr. Readable areas are copied,
 * gaps between areas are zero-filled, and areas flagged VM_IOREMAP or
 * VM_SPARSE are delivered as zeroes. Areas that are still
 * VM_UNINITIALIZED, or that have neither a vm_struct nor flags, are
 * skipped.
 *
 * Returns the number of bytes delivered to @iter, which is less than
 * @count when a copy into the iterator comes up short.
 */
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *vm;
char *vaddr;
size_t n, size, flags, remains;
unsigned long next;
addr = kasan_reset_tag(addr);
/* Clamp count so that addr + count does not wrap around. */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
remains = count;
/* On success the returned node's busy.lock is held (unlocked below). */
vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
if (!vn)
goto finished_zero;
/* The request ends before the first area begins. */
if ((unsigned long)addr + remains <= va->va_start)
goto finished_zero;
do {
size_t copied;
if (remains == 0)
goto finished;
vm = va->vm;
flags = va->flags & VMAP_FLAGS_MASK;
/* VMAP_BLOCK on its own, without VMAP_RAM, is unexpected. */
WARN_ON(flags == VMAP_BLOCK);
if (!vm && !flags)
goto next_va;
if (vm && (vm->flags & VM_UNINITIALIZED))
goto next_va;
/* Read barrier after the VM_UNINITIALIZED check, before the area is used. */
smp_rmb();
vaddr = (char *) va->va_start;
size = vm ? get_vm_area_size(vm) : va_size(va);
/* The read position is already past this area. */
if (addr >= vaddr + size)
goto next_va;
/* Zero-fill the hole in front of this area. */
if (addr < vaddr) {
size_t to_zero = min_t(size_t, vaddr - addr, remains);
size_t zeroed = zero_iter(iter, to_zero);
addr += zeroed;
remains -= zeroed;
if (remains == 0 || zeroed != to_zero)
goto finished;
}
/* Bytes of this area that fall inside the request. */
n = vaddr + size - addr;
if (n > remains)
n = remains;
if (flags & VMAP_RAM)
copied = vmap_ram_vread_iter(iter, addr, n, flags);
else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
copied = aligned_vread_iter(iter, addr, n);
else
/* VM_IOREMAP / VM_SPARSE areas are reported as zeroes. */
copied = zero_iter(iter, n);
addr += copied;
remains -= copied;
if (copied != n)
goto finished;
next_va:
/* Release this node's lock before looking up the next area. */
next = va->va_end;
spin_unlock(&vn->busy.lock);
} while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
finished_zero:
if (vn)
spin_unlock(&vn->busy.lock);
/* Zero-fill everything that was not covered by an area. */
return count - remains + zero_iter(iter, remains);
finished:
if (vn)
spin_unlock(&vn->busy.lock);
return count - remains;
}
/*
 * remap_vmalloc_range_partial - insert the pages of a vmalloc area into a vma
 * @vma:   vma receiving the pages
 * @uaddr: first user address to populate; must be page aligned
 * @kaddr: start of the vmalloc area; must be page aligned
 * @pgoff: offset into the area, in pages
 * @size:  number of bytes to map; rounded up to a page multiple
 *
 * The area must carry VM_USERMAP or VM_DMA_COHERENT, and the offset plus
 * the rounded size must fit inside it.
 *
 * Returns 0 on success, -EINVAL when a check fails, or the error from
 * vm_insert_page(). On success VM_DONTEXPAND | VM_DONTDUMP are set on
 * @vma.
 */
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
void *kaddr, unsigned long pgoff,
unsigned long size)
{
struct vm_struct *area;
unsigned long off;
unsigned long end_index;
/* Byte offset = pgoff << PAGE_SHIFT; reject if the shift overflows. */
if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
return -EINVAL;
size = PAGE_ALIGN(size);
if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
return -EINVAL;
area = find_vm_area(kaddr);
if (!area)
return -EINVAL;
if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
return -EINVAL;
/* off + size must neither overflow nor exceed the size of the area. */
if (check_add_overflow(size, off, &end_index) ||
end_index > get_vm_area_size(area))
return -EINVAL;
kaddr += off;
/*
 * NOTE(review): the body runs once even when size is 0, after which
 * "size -= PAGE_SIZE" wraps; callers presumably never pass 0 - confirm.
 */
do {
struct page *page = vmalloc_to_page(kaddr);
int ret;
ret = vm_insert_page(vma, uaddr, page);
if (ret)
return ret;
uaddr += PAGE_SIZE;
kaddr += PAGE_SIZE;
size -= PAGE_SIZE;
} while (size > 0);
vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
return 0;
}
/*
 * remap_vmalloc_range - map a vmalloc area over an entire vma
 * @vma:   vma to populate, from vm_start to vm_end
 * @addr:  start of the vmalloc area
 * @pgoff: offset into the area, in pages
 *
 * Returns the result of remap_vmalloc_range_partial().
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
			unsigned long pgoff)
{
	unsigned long uaddr = vma->vm_start;
	unsigned long len = vma->vm_end - vma->vm_start;

	return remap_vmalloc_range_partial(vma, uaddr, addr, pgoff, len);
}
EXPORT_SYMBOL(remap_vmalloc_range);
/*
 * free_vm_area - remove a vm area and free its descriptor
 * @area: area to release
 *
 * BUGs if remove_vm_area() does not hand back @area itself for
 * area->addr.
 */
void free_vm_area(struct vm_struct *area)
{
	struct vm_struct *removed = remove_vm_area(area->addr);

	BUG_ON(removed != area);
	kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);
#ifdef CONFIG_SMP
/*
 * Convert an rb_node into the vmap_area that embeds it, using
 * rb_entry_safe() for the conversion.
 */
static struct vmap_area *node_to_va(struct rb_node *n)
{
	struct vmap_area *va = rb_entry_safe(n, struct vmap_area, rb_node);

	return va;
}
/*
 * Search the free tree for an area related to @addr.
 *
 * Returns the area for which va_start <= addr <= va_end when one is met
 * on the way down. Otherwise returns the last area visited whose
 * va_start is not above @addr, or NULL when no visited area starts at or
 * below @addr.
 */
static struct vmap_area *
pvm_find_va_enclose_addr(unsigned long addr)
{
	struct rb_node *node = free_vmap_area_root.rb_node;
	struct vmap_area *best = NULL;

	while (node) {
		struct vmap_area *cur = rb_entry(node, struct vmap_area, rb_node);

		/* Starts above addr: only the left subtree can qualify. */
		if (cur->va_start > addr) {
			node = node->rb_left;
			continue;
		}

		best = cur;
		if (cur->va_end >= addr)
			break;

		node = node->rb_right;
	}

	return best;
}
/*
 * Starting at *@va and walking the free list backwards, find the first
 * area that still has room below an @align-aligned end address.
 *
 * The candidate end is the area's va_end rounded down to @align, capped
 * at VMALLOC_END rounded down to @align. On a hit that end is returned
 * and *@va is left on the matching area. Returns 0 when *@va is NULL or
 * no area qualifies.
 */
static unsigned long
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
	const unsigned long align_mask = ~(align - 1);
	const unsigned long limit = VMALLOC_END & align_mask;

	if (unlikely(!*va))
		return 0;

	list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) {
		unsigned long candidate = min((*va)->va_end & align_mask, limit);

		if ((*va)->va_start < candidate)
			return candidate;
	}

	return 0;
}
/*
 * pcpu_get_vm_areas - reserve a group of vmalloc areas at fixed relative offsets
 * @offsets: per-area offset from a common base address
 * @sizes:   per-area size
 * @nr_vms:  number of entries in @offsets and @sizes
 * @align:   alignment of the base; must be a power of two with no
 *           sub-page bits set
 *
 * Searches the free vmap space from the top downwards for a base address
 * such that every [base + offsets[i], base + offsets[i] + sizes[i]) lies
 * inside free space, clips those ranges out of the free tree and inserts
 * them into the busy trees as VM_ALLOC areas.
 *
 * Every offset and size must be aligned to @align and the ranges must not
 * overlap (both enforced with BUG_ON).
 *
 * Returns an array of @nr_vms vm_struct pointers, or NULL on failure.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
size_t align)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
/* Validate the parameters. */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
start = offsets[area];
end = start + sizes[area];
BUG_ON(!IS_ALIGNED(offsets[area], align));
BUG_ON(!IS_ALIGNED(sizes[area], align));
/* Track the area with the highest offset. */
if (start > offsets[last_area])
last_area = area;
/* No two requested ranges may overlap. */
for (area2 = area + 1; area2 < nr_vms; area2++) {
unsigned long start2 = offsets[area2];
unsigned long end2 = start2 + sizes[area2];
BUG_ON(start2 < end && start < end2);
}
}
last_end = offsets[last_area] + sizes[last_area];
/* The whole group cannot fit between the aligned vmalloc bounds. */
if (vmalloc_end - vmalloc_start < last_end) {
WARN_ON(true);
return NULL;
}
vms = kzalloc_objs(vms[0], nr_vms);
vas = kzalloc_objs(vas[0], nr_vms);
if (!vas || !vms)
goto err_free2;
for (area = 0; area < nr_vms; area++) {
vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
vms[area] = kzalloc_obj(struct vm_struct);
if (!vas[area] || !vms[area])
goto err_free;
}
retry:
spin_lock(&free_vmap_area_lock);
/* Begin the scan with the highest area, at the top of the space. */
area = term_area = last_area;
start = offsets[area];
end = start + sizes[area];
va = pvm_find_va_enclose_addr(vmalloc_end);
base = pvm_determine_end_from_reverse(&va, align) - end;
while (true) {
/*
 * base may have wrapped below zero; last_end is added to both
 * sides before comparing against the lower bound.
 */
if (base + last_end < vmalloc_start + last_end)
goto overflow;
/* No candidate free area is left. */
if (va == NULL)
goto overflow;
/* The range ends beyond this free area: pick a lower base, recheck. */
if (base + end > va->va_end) {
base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
/* The range starts before this free area: try the previous one. */
if (base + start < va->va_start) {
va = node_to_va(rb_prev(&va->rb_node));
base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
/*
 * This area fits. Step to the previous index (wrapping around); the
 * search is complete on returning to the area that last moved base.
 */
area = (area + nr_vms - 1) % nr_vms;
if (area == term_area)
break;
start = offsets[area];
end = start + sizes[area];
va = pvm_find_va_enclose_addr(base + end);
}
/* A base was found: clip every range out of the free space. */
for (area = 0; area < nr_vms; area++) {
int ret;
start = base + offsets[area];
size = sizes[area];
va = pvm_find_va_enclose_addr(start);
if (WARN_ON_ONCE(va == NULL))
goto recovery;
ret = va_clip(&free_vmap_area_root,
&free_vmap_area_list, va, start, size);
if (WARN_ON_ONCE(unlikely(ret)))
goto recovery;
/* Record the range in the preallocated vmap_area. */
va = vas[area];
va->va_start = start;
va->va_end = start + size;
}
spin_unlock(&free_vmap_area_lock);
for (area = 0; area < nr_vms; area++) {
if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
goto err_free_shadow;
}
/* Insert each area into the busy tree of its node and set up its vm. */
for (area = 0; area < nr_vms; area++) {
struct vmap_node *vn = addr_to_node(vas[area]->va_start);
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL);
/* Only the pointer array is freed; the vmap_areas now sit in the busy trees. */
kfree(vas);
return vms;
recovery:
/*
 * Give the ranges clipped so far back to the free space. They were
 * never inserted into a busy tree, so nothing is removed from there.
 * Each returned vmap_area slot is cleared so that it is reallocated
 * before a retry.
 */
while (area--) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end,
KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
}
overflow:
spin_unlock(&free_vmap_area_lock);
/* Purge lazily freed areas once, then retry the whole search. */
if (!purged) {
reclaim_and_purge_vmap_areas();
purged = true;
/* Refill the slots consumed by the recovery path. */
for (area = 0; area < nr_vms; area++) {
if (vas[area])
continue;
vas[area] = kmem_cache_zalloc(
vmap_area_cachep, GFP_KERNEL);
if (!vas[area])
goto err_free;
}
goto retry;
}
err_free:
for (area = 0; area < nr_vms; area++) {
if (vas[area])
kmem_cache_free(vmap_area_cachep, vas[area]);
kfree(vms[area]);
}
err_free2:
kfree(vas);
kfree(vms);
return NULL;
err_free_shadow:
spin_lock(&free_vmap_area_lock);
/*
 * The release below covers every area, including those whose
 * kasan_populate_vmalloc() call did not run or failed.
 */
for (area = 0; area < nr_vms; area++) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end,
KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
kfree(vms[area]);
}
spin_unlock(&free_vmap_area_lock);
kfree(vas);
kfree(vms);
return NULL;
}
/*
 * pcpu_free_vm_areas - release areas obtained from pcpu_get_vm_areas()
 * @vms:    array of vm_struct pointers
 * @nr_vms: number of entries in @vms
 *
 * Frees each area with free_vm_area() and then the array itself.
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
	int idx = 0;

	while (idx < nr_vms) {
		free_vm_area(vms[idx]);
		idx++;
	}

	kfree(vms);
}
#endif
#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
const void *caller;
struct vm_struct *vm;
struct vmap_area *va;
struct vmap_node *vn;
unsigned long addr;
unsigned int nr_pages;
addr = PAGE_ALIGN((unsigned long) object);
vn = addr_to_node(addr);
if (!spin_trylock(&vn->busy.lock))
return false;
va = __find_vmap_area(addr, &vn->busy.root);
if (!va || !va->vm) {
spin_unlock(&vn->busy.lock);
return false;
}
vm = va->vm;
addr = (unsigned long) vm->addr;
caller = vm->caller;
nr_pages = vm->nr_pages;
spin_unlock(&vn->busy.lock);
pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
nr_pages, addr, caller);
return true;
}
#endif
#ifdef CONFIG_PROC_FS
static void show_numa_info(struct seq_file *m, struct vm_struct *v,
unsigned int *counters)
{
unsigned int nr;
unsigned int step = 1U << vm_area_page_order(v);
if (!counters)
return;
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr += step)
counters[page_to_nid(v->pages[nr])] += step;
for_each_node_state(nr, N_HIGH_MEMORY)
if (counters[nr])
seq_printf(m, " N%u=%u", nr, counters[nr]);
}
/*
 * Print one "unpurged vm_area" line for every area on the lazy list of
 * each vmap node. Each node's lazy lock is held while its list is walked.
 */
static void show_purge_info(struct seq_file *m)
{
	struct vmap_node *vn;

	for_each_vmap_node(vn) {
		struct vmap_area *lazy;

		spin_lock(&vn->lazy.lock);
		list_for_each_entry(lazy, &vn->lazy.head, list)
			seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
				   (void *)lazy->va_start,
				   (void *)lazy->va_end,
				   va_size(lazy));
		spin_unlock(&vn->lazy.lock);
	}
}
static int vmalloc_info_show(struct seq_file *m, void *p)
{
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
unsigned int *counters;
if (IS_ENABLED(CONFIG_NUMA))
counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
if (va->flags & VMAP_RAM)
seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
va_size(va));
continue;
}
v = va->vm;
if (v->flags & VM_UNINITIALIZED)
continue;
smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
if (v->caller)
seq_printf(m, " %pS", v->caller);
if (v->nr_pages)
seq_printf(m, " pages=%d", v->nr_pages);
if (v->phys_addr)
seq_printf(m, " phys=%pa", &v->phys_addr);
if (v->flags & VM_IOREMAP)
seq_puts(m, " ioremap");
if (v->flags & VM_SPARSE)
seq_puts(m, " sparse");
if (v->flags & VM_ALLOC)
seq_puts(m, " vmalloc");
if (v->flags & VM_MAP)
seq_puts(m, " vmap");
if (v->flags & VM_USERMAP)
seq_puts(m, " user");
if (v->flags & VM_DMA_COHERENT)
seq_puts(m, " dma-coherent");
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
if (IS_ENABLED(CONFIG_NUMA))
show_numa_info(m, v, counters);
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
}
show_purge_info(m);
if (IS_ENABLED(CONFIG_NUMA))
kfree(counters);
return 0;
}
/*
 * Create the "vmallocinfo" proc entry (mode 0400) backed by
 * vmalloc_info_show(). The return value of proc_create_single() is not
 * checked; this initcall always returns 0.
 */
static int __init proc_vmalloc_init(void)
{
proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
#endif
/*
 * Build the initial free-space tree from the gaps around the entries of
 * vmlist.
 *
 * The covered range is [1, ULONG_MAX]. The walk assumes each entry starts
 * at or after the end of the previous one; every non-empty gap in front
 * of an entry, and the gap after the last entry, becomes one free
 * vmap_area. A failed vmap_area allocation triggers a one-time warning
 * and that gap is left out.
 */
static void __init vmap_init_free_space(void)
{
	const unsigned long vmap_end = ULONG_MAX;
	unsigned long vmap_start = 1;
	struct vm_struct *busy = vmlist;
	struct vmap_area *free;

	while (busy) {
		const unsigned long busy_start = (unsigned long) busy->addr;

		/* Non-empty gap in front of this busy area. */
		if (busy_start != vmap_start) {
			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
			if (!WARN_ON_ONCE(!free)) {
				free->va_start = vmap_start;
				free->va_end = busy_start;

				insert_vmap_area_augment(free, NULL,
							 &free_vmap_area_root,
							 &free_vmap_area_list);
			}
		}

		vmap_start = busy_start + busy->size;
		busy = busy->next;
	}

	/* Whatever remains above the last busy area. */
	if (vmap_end != vmap_start) {
		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
		if (!WARN_ON_ONCE(!free)) {
			free->va_start = vmap_start;
			free->va_end = vmap_end;

			insert_vmap_area_augment(free, NULL,
						 &free_vmap_area_root,
						 &free_vmap_area_list);
		}
	}
}
/*
 * Set up the vmap node array and initialise every node in it.
 *
 * On 64-bit builds an array with one node per possible CPU, clamped to
 * the range [1, 128], is allocated when that count is above 1. If the
 * allocation fails, vmap_nodes, nr_vmap_nodes and vmap_zone_size keep the
 * values they had on entry.
 */
static void vmap_init_nodes(void)
{
struct vmap_node *vn;
int i;
#if BITS_PER_LONG == 64
int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
vn = kmalloc_objs(*vn, n, GFP_NOWAIT);
if (vn) {
/* Each node zone spans 16 pages. */
vmap_zone_size = (1 << 4) * PAGE_SIZE;
nr_vmap_nodes = n;
vmap_nodes = vn;
} else {
pr_err("Failed to allocate an array. Disable a node layer\n");
}
}
#endif
/* Initialise the busy and lazy trees, lists and locks, plus the pools. */
for_each_vmap_node(vn) {
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
spin_lock_init(&vn->busy.lock);
vn->lazy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->lazy.head);
spin_lock_init(&vn->lazy.lock);
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
INIT_LIST_HEAD(&vn->pool[i].head);
WRITE_ONCE(vn->pool[i].len, 0);
}
spin_lock_init(&vn->pool_lock);
}
}
/*
 * Shrinker count callback: add up the lengths of all pools of all vmap
 * nodes. Returns SHRINK_EMPTY when the sum is zero.
 */
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long total = 0;
	struct vmap_node *vn;

	for_each_vmap_node(vn) {
		int idx = 0;

		while (idx < MAX_VA_SIZE_PAGES) {
			total += READ_ONCE(vn->pool[idx].len);
			idx++;
		}
	}

	if (!total)
		return SHRINK_EMPTY;

	return total;
}
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct vmap_node *vn;
for_each_vmap_node(vn)
decay_va_pool_node(vn, true);
return SHRINK_STOP;
}
/*
 * vmalloc_init - boot-time initialisation of the vmalloc subsystem
 *
 * Creates the vmap_area cache, initialises the per-CPU block queues and
 * deferred-free work items, sets up the vmap nodes, imports the areas
 * already present on vmlist as busy, builds the free space around them,
 * and finally registers the "vmap-node" shrinker.
 */
void __init vmalloc_init(void)
{
struct shrinker *vmap_node_shrinker;
struct vmap_area *va;
struct vmap_node *vn;
struct vm_struct *tmp;
int i;
/* Cache for vmap_area objects, created with SLAB_PANIC. */
vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
for_each_possible_cpu(i) {
struct vmap_block_queue *vbq;
struct vfree_deferred *p;
vbq = &per_cpu(vmap_block_queue, i);
spin_lock_init(&vbq->lock);
INIT_LIST_HEAD(&vbq->free);
p = &per_cpu(vfree_deferred, i);
init_llist_head(&p->list);
INIT_WORK(&p->wq, delayed_vfree_work);
xa_init(&vbq->vmap_blocks);
}
/* The nodes must exist before vmlist entries are inserted into them. */
vmap_init_nodes();
/* Import the areas already registered on vmlist as busy areas. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
/* On allocation failure, warn once and skip this entry. */
if (WARN_ON_ONCE(!va))
continue;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
vn = addr_to_node(va->va_start);
insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
}
/* Everything not covered by vmlist becomes free space. */
vmap_init_free_space();
vmap_initialized = true;
vmap_node_shrinker = shrinker_alloc(0, "vmap-node");
/* Shrinker allocation failed: log it and continue without a shrinker. */
if (!vmap_node_shrinker) {
pr_err("Failed to allocate vmap-node shrinker!\n");
return;
}
vmap_node_shrinker->count_objects = vmap_node_shrink_count;
vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
shrinker_register(vmap_node_shrinker);
}