#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
#include "swap.h"
/*
 * XArray mark that memfd_tag_pins() sets, and memfd_wait_for_pins() later
 * clears, on page cache entries whose folio was seen with extra references.
 * It aliases PAGECACHE_TAG_TOWRITE instead of introducing a new mark.
 */
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
/* Index of the final scan in memfd_wait_for_pins(); scans run 0..LAST_SCAN. */
#define LAST_SCAN 4
/*
 * Tell whether @folio currently carries a reference count that differs
 * from what folio_expected_ref_count() predicts for it. Callers in this
 * file treat such a folio as pinned (see MEMFD_TAG_PINNED).
 */
static bool memfd_folio_has_extra_refs(struct folio *folio)
{
	int actual = folio_ref_count(folio);
	int expected = folio_expected_ref_count(folio);

	return actual != expected;
}
/*
 * Walk every entry reachable from @xas (up to index ULONG_MAX) and set
 * MEMFD_TAG_PINNED on each non-value entry whose folio has extra
 * references.
 *
 * lru_add_drain() runs first.
 * NOTE(review): presumably so that references held by this CPU's LRU
 * batches are not mistaken for pins -- confirm.
 *
 * The xarray lock is taken with interrupts disabled; it is dropped every
 * XA_CHECK_SCHED entries so the walk can reschedule.
 */
static void memfd_tag_pins(struct xa_state *xas)
{
	struct folio *folio;
	int visited = 0;

	lru_add_drain();

	xas_lock_irq(xas);
	xas_for_each(xas, folio, ULONG_MAX) {
		if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
			xas_set_mark(xas, MEMFD_TAG_PINNED);

		if (++visited >= XA_CHECK_SCHED) {
			/* Batch done: let others run before walking on. */
			visited = 0;

			xas_pause(xas);
			xas_unlock_irq(xas);
			cond_resched();
			xas_lock_irq(xas);
		}
	}
	xas_unlock_irq(xas);
}
/*
 * memfd_alloc_folio - obtain the folio backing @idx of a memfd.
 * @memfd: the memfd file.
 * @idx:   page offset into the file; for hugetlb files it is shifted down
 *         by the huge page order to get the huge page index.
 *
 * Non-hugetlb files (and kernels built without CONFIG_HUGETLB_PAGE) simply
 * defer to shmem_read_folio().
 *
 * For hugetlb files a reservation for the single huge page is taken, a
 * folio is allocated from it, zeroed, marked uptodate and inserted into
 * the page cache while holding the hugetlb fault mutex hashed from
 * (mapping, index). On any failure after the reservation succeeded, the
 * reservation is dropped again if hugetlb_reserve_pages() reported a
 * positive count.
 *
 * Return: the folio (unlocked on the hugetlb path) or an ERR_PTR():
 * the negative value from hugetlb_reserve_pages(), -ENOMEM when no folio
 * could be allocated, or the error from hugetlb_add_to_page_cache().
 */
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
#ifdef CONFIG_HUGETLB_PAGE
	if (is_file_hugepages(memfd)) {
		struct inode *inode = file_inode(memfd);
		struct hstate *h = hstate_file(memfd);
		struct folio *folio;
		gfp_t gfp_mask;
		long nr_resv;
		u32 hash;
		int err;

		/*
		 * Exclude highmem and movable allocations.
		 * NOTE(review): presumably because these folios are meant to
		 * be pinned long-term (e.g. for DMA) -- confirm.
		 */
		gfp_mask = htlb_alloc_mask(h) & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
		idx >>= huge_page_order(h);

		nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS);
		if (nr_resv < 0)
			return ERR_PTR(nr_resv);

		folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL,
						    gfp_mask);
		if (!folio) {
			err = -ENOMEM;
			goto err_unresv;
		}

		/* Zero and mark uptodate before the folio becomes visible. */
		folio_zero_user(folio, 0);
		__folio_mark_uptodate(folio);

		/* Insert into the page cache under the hugetlb fault mutex. */
		hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		if (err) {
			folio_put(folio);
			goto err_unresv;
		}

		hugetlb_set_folio_subpool(folio, subpool_inode(inode));
		folio_unlock(folio);
		return folio;

err_unresv:
		if (nr_resv > 0)
			hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
		return ERR_PTR(err);
	}
#endif
	return shmem_read_folio(memfd->f_mapping, idx);
}
/*
 * Tag every folio of @mapping that has extra references, then rescan the
 * tagged entries up to LAST_SCAN + 1 times, waiting for those references
 * to go away.
 *
 * Scan 0 calls lru_add_drain_all(); each later scan first sleeps for
 * (HZ << scan) / 200 jiffies (about 10, 20, 40 and 80 ms). A non-zero
 * return from schedule_timeout_killable() jumps straight to the final
 * scan.
 * NOTE(review): presumably that means a fatal signal cut the sleep
 * short -- confirm.
 *
 * On the final scan every remaining tag is removed, whether or not the
 * folio is still referenced, so no MEMFD_TAG_PINNED marks are left behind.
 *
 * Return: 0 if no tagged folio still had extra references, -EBUSY if the
 * final scan found at least one.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct folio *folio;
	int error = 0;
	int scan;

	memfd_tag_pins(&xas);

	for (scan = 0; scan <= LAST_SCAN; scan++) {
		int visited = 0;

		/* Nothing tagged any more: we are done. */
		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
			break;

		if (scan == 0) {
			lru_add_drain_all();
		} else if (schedule_timeout_killable((HZ << scan) / 200)) {
			scan = LAST_SCAN;
		}

		xas_set(&xas, 0);
		xas_lock_irq(&xas);
		xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
			bool still_pinned = !xa_is_value(folio) &&
					    memfd_folio_has_extra_refs(folio);

			/* Only the last scan reports a lingering pin. */
			if (still_pinned && scan == LAST_SCAN)
				error = -EBUSY;

			/*
			 * Keep the tag only while the folio is still pinned
			 * and another scan will follow.
			 */
			if (!still_pinned || scan == LAST_SCAN)
				xas_clear_mark(&xas, MEMFD_TAG_PINNED);

			if (++visited >= XA_CHECK_SCHED) {
				visited = 0;

				xas_pause(&xas);
				xas_unlock_irq(&xas);
				cond_resched();
				xas_lock_irq(&xas);
			}
		}
		xas_unlock_irq(&xas);
	}

	return error;
}
/*
 * Locate the seals word of @file.
 *
 * Return: pointer to the seals field of the shmem inode info, or (with
 * CONFIG_HUGETLBFS) of the hugetlbfs inode info; NULL when @file is
 * neither kind.
 */
static unsigned int *memfd_file_seals_ptr(struct file *file)
{
	unsigned int *seals = NULL;

	if (shmem_file(file))
		seals = &SHMEM_I(file_inode(file))->seals;
#ifdef CONFIG_HUGETLBFS
	else if (is_file_hugepages(file))
		seals = &HUGETLBFS_I(file_inode(file))->seals;
#endif

	return seals;
}
/*
 * Every seal bit memfd_add_seals() accepts; a request containing any bit
 * outside this mask is rejected there with -EINVAL.
 */
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_EXEC | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
F_SEAL_FUTURE_WRITE)
/*
 * Add @seals to the seal set of @file.
 *
 * The file must have been opened for writing, and @seals may only contain
 * bits from F_ALL_SEALS. Everything else happens under the inode lock.
 * Seals are only ever OR-ed in here; nothing in this function removes one.
 *
 * When F_SEAL_WRITE is being added for the first time, writable mappings
 * are denied on the address space and memfd_wait_for_pins() is given the
 * chance to wait for folio references to drain; if pins remain, the
 * writable-mapping denial is rolled back.
 *
 * Return: 0 on success;
 *         -EPERM  if @file lacks FMODE_WRITE or F_SEAL_SEAL is already set;
 *         -EINVAL for unknown seal bits or a file without a seals word;
 *         otherwise the error from mapping_deny_writable() or
 *         memfd_wait_for_pins().
 */
static int memfd_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	unsigned int *cur_seals;
	bool first_write_seal;
	int ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	inode_lock(inode);

	ret = -EINVAL;
	cur_seals = memfd_file_seals_ptr(file);
	if (!cur_seals)
		goto out_unlock;

	/* Once F_SEAL_SEAL is set the seal set is frozen. */
	ret = -EPERM;
	if (*cur_seals & F_SEAL_SEAL)
		goto out_unlock;

	first_write_seal = (seals & F_SEAL_WRITE) &&
			   !(*cur_seals & F_SEAL_WRITE);
	if (first_write_seal) {
		ret = mapping_deny_writable(file->f_mapping);
		if (ret)
			goto out_unlock;

		ret = memfd_wait_for_pins(file->f_mapping);
		if (ret) {
			/* Still pinned: undo the denial taken just above. */
			mapping_allow_writable(file->f_mapping);
			goto out_unlock;
		}
	}

	/*
	 * Sealing exec on a file that still has an exec permission bit also
	 * applies the shrink, grow and both write seals.
	 *
	 * NOTE(review): these implied write seals are OR-ed in after the
	 * writable-mapping and pin checks above, so they are not subject to
	 * those checks -- confirm this is intended.
	 */
	if ((seals & F_SEAL_EXEC) && (inode->i_mode & 0111))
		seals |= F_SEAL_SHRINK | F_SEAL_GROW |
			 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE;

	*cur_seals |= seals;
	ret = 0;

out_unlock:
	inode_unlock(inode);
	return ret;
}
/*
 * Read the current seal set of @file.
 *
 * Return: the seals word, or -EINVAL when @file has none (see
 * memfd_file_seals_ptr()).
 */
static int memfd_get_seals(struct file *file)
{
	unsigned int *seals = memfd_file_seals_ptr(file);

	if (!seals)
		return -EINVAL;

	return *seals;
}
/*
 * memfd_fcntl - dispatch the sealing fcntl commands.
 * @file: target file.
 * @cmd:  F_ADD_SEALS or F_GET_SEALS.
 * @arg:  seals to add (used by F_ADD_SEALS only).
 *
 * Return: result of memfd_add_seals() / memfd_get_seals(), or -EINVAL for
 * any other @cmd.
 */
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
	switch (cmd) {
	case F_ADD_SEALS:
		return memfd_add_seals(file, arg);
	case F_GET_SEALS:
		return memfd_get_seals(file);
	default:
		return -EINVAL;
	}
}
/* Prefix alloc_name() places in front of the user-supplied memfd name. */
#define MFD_NAME_PREFIX "memfd:"
/* Length of the prefix, not counting its terminating NUL. */
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
/* Longest user-supplied name that still fits in NAME_MAX after the prefix. */
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
/*
 * Flag bits sanitize_flags() accepts; with MFD_HUGETLB it additionally
 * accepts the bits of MFD_HUGE_MASK << MFD_HUGE_SHIFT.
 */
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
/*
 * Apply the memfd_noexec scope of the caller's active pid namespace to
 * @flags.
 *
 * If the caller specified neither MFD_EXEC nor MFD_NOEXEC_SEAL, one of
 * them is chosen: MFD_NOEXEC_SEAL when the scope is at least
 * MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL, MFD_EXEC otherwise. If the scope is at
 * least MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED and MFD_NOEXEC_SEAL is still
 * not set, the request is refused and a rate-limited error is logged.
 *
 * Without CONFIG_SYSCTL, @flags is left untouched.
 *
 * Return: 0 on success, -EACCES when the request is refused.
 */
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
	int scope = pidns_memfd_noexec_scope(task_active_pid_ns(current));

	/* No explicit choice from the caller: pick the default. */
	if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL)))
		*flags |= (scope >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) ?
			  MFD_NOEXEC_SEAL : MFD_EXEC;

	if (scope >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED &&
	    !(*flags & MFD_NOEXEC_SEAL)) {
		pr_err_ratelimited(
			"%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
			current->comm, task_pid_nr(current), scope);
		return -EACCES;
	}
#endif
	return 0;
}
/* True when @seals contains F_SEAL_WRITE or F_SEAL_FUTURE_WRITE. */
static inline bool is_write_sealed(unsigned int seals)
{
	const unsigned int write_seals = F_SEAL_WRITE | F_SEAL_FUTURE_WRITE;

	return (seals & write_seals) != 0;
}
/*
 * Validate the VM flags of a new mapping of a write-sealed file.
 *
 * Mappings without VM_SHARED are always accepted as they are. A mapping
 * with both VM_SHARED and VM_WRITE is refused. A shared mapping without
 * VM_WRITE is accepted, but VM_MAYWRITE is cleared in *@vm_flags_ptr.
 * NOTE(review): presumably this stops a later mprotect() from making the
 * mapping writable -- confirm.
 *
 * Return: 0 if the mapping is allowed, -EPERM otherwise.
 */
static int check_write_seal(vm_flags_t *vm_flags_ptr)
{
	vm_flags_t requested = *vm_flags_ptr;

	if (!(requested & VM_SHARED))
		return 0;

	if (requested & VM_WRITE)
		return -EPERM;

	*vm_flags_ptr &= ~VM_MAYWRITE;

	return 0;
}
/*
 * memfd_check_seals_mmap - enforce write seals for a new mapping of @file.
 * @file:         file being mapped.
 * @vm_flags_ptr: VM flags of the mapping; may be adjusted by
 *                check_write_seal().
 *
 * Files without a seals word, or without a write seal set, are always
 * accepted.
 *
 * Return: 0 if the mapping may proceed, otherwise the error from
 * check_write_seal().
 */
int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
{
	unsigned int *seals_ptr = memfd_file_seals_ptr(file);

	if (!seals_ptr || !is_write_sealed(*seals_ptr))
		return 0;

	return check_write_seal(vm_flags_ptr);
}
/*
 * Validate the memfd_create() flags in *@flags_ptr and let
 * check_sysctl_memfd_noexec() fill in / veto the exec-related bits.
 *
 * Return: 0 on success; -EINVAL for unknown bits or when both MFD_EXEC and
 * MFD_NOEXEC_SEAL are given; otherwise the result of
 * check_sysctl_memfd_noexec().
 */
static int sanitize_flags(unsigned int *flags_ptr)
{
	const unsigned int exec_bits = MFD_EXEC | MFD_NOEXEC_SEAL;
	unsigned int flags = *flags_ptr;
	unsigned int allowed = MFD_ALL_FLAGS;

	/* The huge page size encoding is only valid with MFD_HUGETLB. */
	if (flags & MFD_HUGETLB)
		allowed |= MFD_HUGE_MASK << MFD_HUGE_SHIFT;

	if (flags & ~allowed)
		return -EINVAL;

	/* MFD_EXEC and MFD_NOEXEC_SEAL are mutually exclusive. */
	if ((flags & exec_bits) == exec_bits)
		return -EINVAL;

	return check_sysctl_memfd_noexec(flags_ptr);
}
/*
 * Build the kernel-side name of a new memfd: MFD_NAME_PREFIX followed by
 * the string copied from user space at @uname.
 *
 * Return: a kmalloc()ed buffer of NAME_MAX + 1 bytes that the caller must
 * kfree(), or ERR_PTR(-ENOMEM) if the allocation fails, ERR_PTR(-EFAULT)
 * if the copy from user space fails, ERR_PTR(-EINVAL) if the user string
 * is longer than MFD_NAME_MAX_LEN.
 */
static char *alloc_name(const char __user *uname)
{
	char *name;
	long len;
	int error;

	name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name)
		return ERR_PTR(-ENOMEM);

	memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);

	/*
	 * Ask for one byte more than the maximum, so a result above
	 * MFD_NAME_MAX_LEN identifies a user string that is too long.
	 */
	len = strncpy_from_user(name + MFD_NAME_PREFIX_LEN, uname,
				MFD_NAME_MAX_LEN + 1);
	if (len < 0)
		error = -EFAULT;
	else if (len > MFD_NAME_MAX_LEN)
		error = -EINVAL;
	else
		return name;

	kfree(name);
	return ERR_PTR(error);
}
/*
 * memfd_alloc_file - create the file behind a memfd.
 * @name:  name handed to the hugetlbfs / shmem file setup routine.
 * @flags: MFD_* flags; MFD_HUGETLB selects hugetlbfs (with the huge page
 *         size taken from the MFD_HUGE_SHIFT / MFD_HUGE_MASK field),
 *         anything else selects shmem.
 *
 * After creation the inode goes through
 * security_inode_init_security_anon(), the file gains FMODE_LSEEK,
 * FMODE_PREAD, FMODE_PWRITE and O_LARGEFILE, and the initial seals are
 * set up:
 *   - MFD_NOEXEC_SEAL: exec permission bits are stripped from the inode,
 *     F_SEAL_SEAL is cleared and F_SEAL_EXEC is set;
 *   - otherwise, MFD_ALLOW_SEALING: F_SEAL_SEAL is cleared.
 *
 * Return: the new file, or an ERR_PTR() from the setup routine or the
 * security hook (the file is released in the latter case).
 */
struct file *memfd_alloc_file(const char *name, unsigned int flags)
{
	struct inode *inode;
	struct file *file;
	unsigned int *seals;
	int err;

	if (flags & MFD_HUGETLB)
		file = hugetlb_file_setup(name, 0,
					  mk_vma_flags(VMA_NORESERVE_BIT),
					  HUGETLB_ANONHUGE_INODE,
					  (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK);
	else
		file = shmem_file_setup(name, 0,
					mk_vma_flags(VMA_NORESERVE_BIT));
	if (IS_ERR(file))
		return file;

	inode = file_inode(file);
	err = security_inode_init_security_anon(inode, &QSTR(MEMFD_ANON_NAME),
						NULL);
	if (err) {
		fput(file);
		return ERR_PTR(err);
	}

	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	file->f_flags |= O_LARGEFILE;

	if (flags & MFD_NOEXEC_SEAL)
		inode->i_mode &= ~0111;

	/* Either flag enables sealing; only MFD_NOEXEC_SEAL seals exec too. */
	if (flags & (MFD_NOEXEC_SEAL | MFD_ALLOW_SEALING)) {
		seals = memfd_file_seals_ptr(file);
		if (seals) {
			*seals &= ~F_SEAL_SEAL;
			if (flags & MFD_NOEXEC_SEAL)
				*seals |= F_SEAL_EXEC;
		}
	}

	return file;
}
/*
 * memfd_create() system call.
 * @uname: user-space pointer to the name of the new memfd.
 * @flags: MFD_* flags.
 *
 * Flags are validated (and possibly adjusted) by sanitize_flags(), the
 * prefixed name is built by alloc_name(), and the file produced by
 * memfd_alloc_file() is handed to FD_ADD(), with O_CLOEXEC when
 * MFD_CLOEXEC was requested. The name buffer is scope-managed through
 * __free(kfree).
 *
 * Return: the value of FD_ADD(), or a negative errno from flag validation
 * or name allocation.
 */
SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags)
{
	char *name __free(kfree) = NULL;
	unsigned int fd_flags = 0;
	int error;

	error = sanitize_flags(&flags);
	if (error < 0)
		return error;

	name = alloc_name(uname);
	if (IS_ERR(name))
		return PTR_ERR(name);

	if (flags & MFD_CLOEXEC)
		fd_flags = O_CLOEXEC;

	return FD_ADD(fd_flags, memfd_alloc_file(name, flags));
}