#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/plist.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
#include <linux/mempolicy.h>
#include <linux/mmap_lock.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
/*
 * Global futex hash table: one bucket array per NUMA node, allocated in
 * futex_init(). hashmask/hashshift describe the per-node table size; the
 * accessor macros below keep the rest of the file terse.
 */
static struct {
unsigned long hashmask;
unsigned int hashshift;
struct futex_hash_bucket *queues[MAX_NUMNODES];
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_hashmask (__futex_data.hashmask)
#define futex_hashshift (__futex_data.hashshift)
#define futex_queues (__futex_data.queues)
/*
 * Per-mm private futex hash (CONFIG_FUTEX_PRIVATE_HASH).
 *
 * @state:	reference mode, FR_PERCPU (fast per-CPU counting) or
 *		FR_ATOMIC (draining via mm->futex_atomic)
 * @hash_mask:	number of buckets - 1; 0 requests the global hash
 * @rcu:	frees the structure after a grace period (kvfree_rcu)
 * @mm:		owning mm_struct (kept as void * here)
 * @custom:	true when the size was chosen explicitly via prctl()
 * @queues:	flexible array of hash buckets
 */
struct futex_private_hash {
int state;
unsigned int hash_mask;
struct rcu_head rcu;
void *mm;
bool custom;
struct futex_hash_bucket queues[];
};
#ifdef CONFIG_FAIL_FUTEX
/*
 * Fault-injection state for futexes: the generic attributes plus a knob
 * to exempt private (non-shared) futexes from injected failures.
 */
static struct {
struct fault_attr attr;
bool ignore_private;
} fail_futex = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_private = false,
};
/* Parse the "fail_futex=" boot parameter into fail_futex.attr. */
static int __init setup_fail_futex(char *str)
{
return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);
/*
 * should_fail_futex - fault-injection decision for a futex operation
 * @fshared: true if the futex is shared (file/inode backed)
 *
 * Private futexes are exempt when the "ignore-private" debugfs knob is
 * set; otherwise defer to the generic fault-injection machinery.
 */
bool should_fail_futex(bool fshared)
{
	if (!fshared && fail_futex.ignore_private)
		return false;

	return should_fail(&fail_futex.attr, 1);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
/*
 * Create the "fail_futex" debugfs directory and the "ignore-private"
 * boolean attribute inside it. Registered as a late initcall.
 */
static int __init fail_futex_debugfs(void)
{
	struct dentry *dir;
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;

	dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}
late_initcall(fail_futex_debugfs);
#endif
#endif
/*
 * Map a key to its hash bucket. A non-NULL @fph overrides the RCU lookup
 * of mm->futex_phash for private keys (used during rehash).
 */
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
/* Private-hash reference counting; implementations near the end of the file. */
static bool futex_ref_get(struct futex_private_hash *fph);
static bool futex_ref_put(struct futex_private_hash *fph);
static bool futex_ref_is_dead(struct futex_private_hash *fph);
/* Refcount modes: per-CPU fast path, or atomic counting while draining. */
enum { FR_PERCPU = 0, FR_ATOMIC };
/* Private keys have neither the inode nor the mm-shared offset bit set. */
static inline bool futex_key_is_private(union futex_key *key)
{
return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}
/* Try to take a reference on @fph; fails once the hash is being replaced. */
static bool futex_private_hash_get(struct futex_private_hash *fph)
{
return futex_ref_get(fph);
}
/*
 * Drop a reference on @fph. If this was the final reference, wake anyone
 * waiting (in futex_hash_allocate()) for the old hash to drain.
 */
void futex_private_hash_put(struct futex_private_hash *fph)
{
if (futex_ref_put(fph))
wake_up_var(fph->mm);
}
/*
 * Take a reference on the private hash that owns @hb. Global-hash buckets
 * (hb->priv == NULL) are not reference counted. The caller already holds
 * a reference, so the get must succeed - hence the WARN.
 */
void futex_hash_get(struct futex_hash_bucket *hb)
{
struct futex_private_hash *fph = hb->priv;
if (!fph)
return;
WARN_ON_ONCE(!futex_private_hash_get(fph));
}
/* Release the reference taken by futex_hash_get()/futex_hash(). */
void futex_hash_put(struct futex_hash_bucket *hb)
{
struct futex_private_hash *fph = hb->priv;
if (!fph)
return;
futex_private_hash_put(fph);
}
/*
 * Resolve a private key into a bucket of the mm's private hash.
 * Returns NULL when the key is shared, no private hash is installed, or
 * the installed hash has hash_mask == 0 (explicit "use global hash").
 * Caller must be in an RCU read section unless it passes a pinned @fph.
 */
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
u32 hash;
if (!futex_key_is_private(key))
return NULL;
if (!fph)
fph = rcu_dereference(key->private.mm->futex_phash);
if (!fph || !fph->hash_mask)
return NULL;
/* Hash only the address; the offset is folded in as the initval. */
hash = jhash2((void *)&key->private.address,
sizeof(key->private.address) / 4,
key->both.offset);
return &fph->queues[hash & fph->hash_mask];
}
/*
 * Move every queued waiter from @old to the bucket it hashes to in @new.
 * Each waiter's lock_ptr is switched to the new bucket's lock while both
 * bucket locks are held (new nested inside old, hence SINGLE_DEPTH_NESTING),
 * so concurrent futex_unqueue() observes a consistent lock_ptr.
 */
static void futex_rehash_private(struct futex_private_hash *old,
struct futex_private_hash *new)
{
struct futex_hash_bucket *hb_old, *hb_new;
unsigned int slots = old->hash_mask + 1;
unsigned int i;
for (i = 0; i < slots; i++) {
struct futex_q *this, *tmp;
hb_old = &old->queues[i];
spin_lock(&hb_old->lock);
plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
plist_del(&this->list, &hb_old->chain);
futex_hb_waiters_dec(hb_old);
WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
/* Pass @new explicitly so the lookup does not go through RCU. */
hb_new = __futex_hash(&this->key, new);
futex_hb_waiters_inc(hb_new);
spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
plist_add(&this->list, &hb_new->chain);
this->lock_ptr = &hb_new->lock;
spin_unlock(&hb_new->lock);
}
spin_unlock(&hb_old->lock);
}
}
/*
 * Install @new as mm->futex_phash, rehashing waiters from the current
 * hash if there is one. Must be called with mm->futex_hash_lock held and
 * no replacement already pending.
 *
 * Returns false (and parks @new in futex_phash_new) when the current hash
 * still has references and cannot be replaced yet; true on success.
 */
static bool __futex_pivot_hash(struct mm_struct *mm,
struct futex_private_hash *new)
{
struct futex_private_hash *fph;
WARN_ON_ONCE(mm->futex_phash_new);
fph = rcu_dereference_protected(mm->futex_phash,
lockdep_is_held(&mm->futex_hash_lock));
if (fph) {
if (!futex_ref_is_dead(fph)) {
/* Old hash still referenced; retry once it drains. */
mm->futex_phash_new = new;
return false;
}
futex_rehash_private(fph, new);
}
new->state = FR_PERCPU;
scoped_guard(rcu) {
/* Record a grace-period cookie before publishing the new hash. */
mm->futex_batches = get_state_synchronize_rcu();
rcu_assign_pointer(mm->futex_phash, new);
}
/* Old hash (may be NULL) is freed after a grace period. */
kvfree_rcu(fph, rcu);
return true;
}
/*
 * If a replacement hash is pending, try to install it. Serialized by
 * mm->futex_hash_lock; __futex_pivot_hash() re-parks the replacement if
 * the current hash has not drained yet.
 */
static void futex_pivot_hash(struct mm_struct *mm)
{
scoped_guard(mutex, &mm->futex_hash_lock) {
struct futex_private_hash *fph;
fph = mm->futex_phash_new;
if (fph) {
mm->futex_phash_new = NULL;
__futex_pivot_hash(mm, fph);
}
}
}
/*
 * Return the current task's private futex hash with a reference held, or
 * NULL if none is installed. When the get fails the hash is being
 * replaced: help complete the pivot and retry.
 */
struct futex_private_hash *futex_private_hash(void)
{
struct mm_struct *mm = current->mm;
again:
scoped_guard(rcu) {
struct futex_private_hash *fph;
fph = rcu_dereference(mm->futex_phash);
if (!fph)
return NULL;
if (futex_private_hash_get(fph))
return fph;
}
futex_pivot_hash(mm);
goto again;
}
/*
 * futex_hash - look up the hash bucket for @key, taking a reference on a
 * private hash if the bucket belongs to one. Global buckets (priv == NULL)
 * need no reference. On a failed get the private hash is mid-replacement;
 * finish the pivot and retry.
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
struct futex_private_hash *fph;
struct futex_hash_bucket *hb;
again:
scoped_guard(rcu) {
hb = __futex_hash(key, NULL);
fph = hb->priv;
if (!fph || futex_private_hash_get(fph))
return hb;
}
futex_pivot_hash(key->private.mm);
goto again;
}
#else
/* !CONFIG_FUTEX_PRIVATE_HASH: no private hash, always fall back to global. */
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
return NULL;
}
/* !CONFIG_FUTEX_PRIVATE_HASH: global hash only, no reference counting. */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
return __futex_hash(key, NULL);
}
#endif
#ifdef CONFIG_FUTEX_MPOL
/*
 * Derive a preferred NUMA node for @addr from the VMA's mempolicy.
 * Returns FUTEX_NO_NODE when there is no VMA, no policy, or the policy
 * mode expresses no single-node preference. vm_policy is read with
 * READ_ONCE because callers may run under mmap-lock speculation.
 */
static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = vma_lookup(mm, addr);
struct mempolicy *mpol;
int node = FUTEX_NO_NODE;
if (!vma)
return FUTEX_NO_NODE;
mpol = READ_ONCE(vma->vm_policy);
if (!mpol)
return FUTEX_NO_NODE;
switch (mpol->mode) {
case MPOL_PREFERRED:
node = first_node(mpol->nodes);
break;
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
if (mpol->home_node != NUMA_NO_NODE)
node = mpol->home_node;
break;
default:
break;
}
return node;
}
/*
 * Optimistic (lockless) variant of the node lookup using mmap-lock
 * speculation. Returns a node id / FUTEX_NO_NODE on success, -EBUSY if
 * speculation could not start, or -EAGAIN if the mmap changed underneath
 * us; negative results make the caller take the mmap lock for real.
 */
static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
{
int seq, node;
guard(rcu)();
if (!mmap_lock_speculate_try_begin(mm, &seq))
return -EBUSY;
node = __futex_key_to_node(mm, addr);
if (mmap_lock_speculate_retry(mm, seq))
return -EAGAIN;
return node;
}
/*
 * Resolve the mempolicy node for @addr: try the speculative path first
 * (any result >= FUTEX_NO_NODE, i.e. >= -1, is conclusive) and fall back
 * to a lookup under mmap_read_lock on -EBUSY/-EAGAIN.
 */
static int futex_mpol(struct mm_struct *mm, unsigned long addr)
{
int node;
node = futex_key_to_node_opt(mm, addr);
if (node >= FUTEX_NO_NODE)
return node;
guard(mmap_read_lock)(mm);
return __futex_key_to_node(mm, addr);
}
#else
/* !CONFIG_FUTEX_MPOL: no mempolicy-derived node preference. */
static int futex_mpol(struct mm_struct *mm, unsigned long addr)
{
return FUTEX_NO_NODE;
}
#endif
/*
 * Core key-to-bucket mapping. Private keys without an explicit node go to
 * the mm's private hash when one exists; everything else uses the global
 * per-node tables. When no node was requested, one is derived from the
 * upper hash bits and fixed up to the next possible node if needed.
 */
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph)
{
int node = key->both.node;
u32 hash;
if (node == FUTEX_NO_NODE) {
struct futex_hash_bucket *hb;
hb = __futex_hash_private(key, fph);
if (hb)
return hb;
}
hash = jhash2((u32 *)key,
offsetof(typeof(*key), both.offset) / sizeof(u32),
key->both.offset);
if (node == FUTEX_NO_NODE) {
/* Use high bits for node selection, low bits for the bucket. */
node = (hash >> futex_hashshift) % nr_node_ids;
if (!node_possible(node)) {
node = find_next_bit_wrap(node_possible_map.bits,
nr_node_ids, node);
}
}
return &futex_queues[node][hash & futex_hashmask];
}
/*
 * futex_setup_timer - arm an on-stack hrtimer sleeper for a futex wait
 * @time:	absolute timeout, or NULL for no timeout
 * @timeout:	caller-provided on-stack sleeper to initialize
 * @flags:	futex flags; FLAGS_CLOCKRT selects CLOCK_REALTIME
 * @range_ns:	slack applied to the expiry
 *
 * Returns @timeout when armed, NULL when @time is NULL.
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns)
{
	clockid_t clockid;

	if (!time)
		return NULL;

	clockid = (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC;
	hrtimer_setup_sleeper_on_stack(timeout, clockid, HRTIMER_MODE_ABS);
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}
/*
 * Return the inode's futex sequence number, lazily assigning a unique
 * non-zero value on first use from a global counter. If a concurrent
 * assigner wins the cmpxchg, use its value (left in @old); the loop only
 * repeats on the (warned) case of the counter wrapping to zero.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
static atomic64_t i_seq;
u64 old;
old = atomic64_read(&inode->i_sequence);
if (likely(old))
return old;
for (;;) {
u64 new = atomic64_inc_return(&i_seq);
if (WARN_ON_ONCE(!new))
continue;
old = 0;
if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
return old;
return new;
}
}
/*
 * get_futex_key - build the canonical key identifying a futex
 * @uaddr:	user address of the futex word
 * @flags:	FLAGS_* (shared, size, NUMA, MPOL)
 * @key:	output key
 * @rw:		FUTEX_READ or FUTEX_WRITE; READ allows an RO mapping for
 *		shared futexes (only valid for inode-backed ones)
 *
 * Private futexes are keyed by (mm, address); shared ones by either
 * (mm, address) for anonymous memory or (inode sequence, page offset)
 * for file-backed memory. For FLAGS_NUMA the futex word is doubled in
 * size and the second half carries the node id, which is validated,
 * possibly derived from the VMA mempolicy, and written back if chosen
 * here. Returns 0 or a negative error.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct page *page;
struct folio *folio;
struct address_space *mapping;
int node, err, size, ro = 0;
bool node_updated = false;
bool fshared;
fshared = flags & FLAGS_SHARED;
size = futex_size(flags);
if (flags & FLAGS_NUMA)
size *= 2;
/* The key carries the in-page offset; the futex must be naturally aligned. */
key->both.offset = address % PAGE_SIZE;
if (unlikely((address % size) != 0))
return -EINVAL;
address -= key->both.offset;
if (unlikely(!access_ok(uaddr, size)))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
node = FUTEX_NO_NODE;
if (flags & FLAGS_NUMA) {
/* Second half of the doubled futex word holds the node id. */
u32 __user *naddr = (void *)uaddr + size / 2;
if (get_user_inline(node, naddr))
return -EFAULT;
if ((node != FUTEX_NO_NODE) &&
((unsigned int)node >= MAX_NUMNODES || !node_possible(node)))
return -EINVAL;
}
if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) {
node = futex_mpol(mm, address);
node_updated = true;
}
if (flags & FLAGS_NUMA) {
u32 __user *naddr = (void *)uaddr + size / 2;
if (node == FUTEX_NO_NODE) {
node = numa_node_id();
node_updated = true;
}
/* Publish the node we picked so later waiters agree on it. */
if (node_updated && put_user_inline(node, naddr))
return -EFAULT;
}
key->both.node = node;
/*
 * Private futexes need no page pinning; the (mm, address) pair is
 * the identity.
 */
if (!fshared) {
if (IS_ENABLED(CONFIG_MMU))
key->private.mm = mm;
else
key->private.mm = NULL;
key->private.address = address;
return 0;
}
again:
if (unlikely(should_fail_futex(true)))
return -EFAULT;
err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
/*
 * A read-only mapping can still host an inode-backed shared futex
 * for FUTEX_READ access; remember that we took the RO path.
 */
if (err == -EFAULT && rw == FUTEX_READ) {
err = get_user_pages_fast(address, 1, 0, &page);
ro = 1;
}
if (err < 0)
return err;
else
err = 0;
folio = page_folio(page);
mapping = READ_ONCE(folio->mapping);
if (unlikely(!mapping)) {
int shmem_swizzled;
/*
 * NULL mapping: the folio may be transiently detached (e.g.
 * shmem moving to/from swapcache). Lock it to stabilize and
 * retry if it looks like it was mid-transition.
 */
folio_lock(folio);
shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
folio_unlock(folio);
folio_put(folio);
if (shmem_swizzled)
goto again;
return -EFAULT;
}
if (folio_test_anon(folio)) {
/* Anonymous memory cannot be RO-keyed; reject the RO fallback. */
if (unlikely(should_fail_futex(true)) || ro) {
err = -EFAULT;
goto out;
}
key->both.offset |= FUT_OFF_MMSHARED;
key->private.mm = mm;
key->private.address = address;
} else {
struct inode *inode;
/*
 * Re-check mapping/host under RCU: truncation can clear them
 * concurrently; retry from the top if anything moved.
 */
rcu_read_lock();
if (READ_ONCE(folio->mapping) != mapping) {
rcu_read_unlock();
folio_put(folio);
goto again;
}
inode = READ_ONCE(mapping->host);
if (!inode) {
rcu_read_unlock();
folio_put(folio);
goto again;
}
key->both.offset |= FUT_OFF_INODE;
key->shared.i_seq = get_inode_sequence_number(inode);
key->shared.pgoff = page_pgoff(folio, page);
rcu_read_unlock();
}
out:
folio_put(folio);
return err;
}
/*
 * fault_in_user_writeable - fault in the page at @uaddr writably
 * @uaddr: user address to fault in
 *
 * Slow path used after an atomic user access failed: take the mmap read
 * lock and resolve the fault. Returns 0 on success, negative error
 * otherwise.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int err;

	mmap_read_lock(mm);
	err = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return (err < 0) ? err : 0;
}
/*
 * futex_top_waiter - return the highest-priority waiter on @key
 * @hb:  hash bucket to scan (lock must be held by the caller)
 * @key: futex key to match
 *
 * The plist is priority ordered, so the first match is the top waiter.
 * Returns NULL when nobody waits on @key.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *q;

	plist_for_each_entry(q, &hb->chain, list) {
		if (futex_match(&q->key, key))
			return q;
	}
	return NULL;
}
/*
 * wait_for_owner_exiting - block until a futex owner finishes exiting
 * @ret:	result of the preceding owner lookup; only -EBUSY means an
 *		exiting owner was handed back in @exiting
 * @exiting:	task in futex-exit, with a reference held by the lookup
 *
 * Waits by acquiring and releasing the owner's futex_exit_mutex, which
 * the exiting task holds for the duration of its futex cleanup, then
 * drops the reference.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
if (ret != -EBUSY) {
WARN_ON_ONCE(exiting);
return;
}
if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
return;
mutex_lock(&exiting->futex_exit_mutex);
mutex_unlock(&exiting->futex_exit_mutex);
put_task_struct(exiting);
}
/*
 * Remove @q from its hash bucket. The bucket lock (q->lock_ptr) must be
 * held; the bucket is recovered from the lock pointer via container_of.
 */
void __futex_unqueue(struct futex_q *q)
{
struct futex_hash_bucket *hb;
if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
return;
lockdep_assert_held(q->lock_ptr);
hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
plist_del(&q->list, &hb->chain);
futex_hb_waiters_dec(hb);
}
/*
 * Prepare to queue @q on @hb and take the bucket lock. The waiter count
 * is bumped before the lock so that wakers see a non-zero count as soon
 * as possible and take the slow path.
 */
void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
__acquires(&hb->lock)
{
futex_hb_waiters_inc(hb);
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock);
}
/* Undo futex_q_lock() without having queued: drop the count and the lock. */
void futex_q_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
futex_hb_waiters_dec(hb);
spin_unlock(&hb->lock);
}
/*
 * Enqueue @q on @hb (bucket lock held by the caller) in priority order,
 * capping the plist priority at MAX_RT_PRIO so all SCHED_OTHER tasks
 * share one band, and record the task to wake.
 */
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
struct task_struct *task)
{
int prio;
prio = min(current->normal_prio, MAX_RT_PRIO);
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = task;
}
/*
 * futex_unqueue - remove a waiter that may be woken concurrently
 * @q: the futex_q to dequeue
 *
 * lock_ptr can change under us (a waker or rehash retargets it and sets
 * it NULL on wakeup), so re-check it after acquiring the lock and retry
 * on a mismatch. The RCU guard keeps a private hash bucket's memory
 * valid while we dereference the lock. Returns 1 if we dequeued the
 * entry ourselves, 0 if a waker already did.
 */
int futex_unqueue(struct futex_q *q)
{
spinlock_t *lock_ptr;
int ret = 0;
guard(rcu)();
retry:
lock_ptr = READ_ONCE(q->lock_ptr);
if (lock_ptr != NULL) {
spin_lock(lock_ptr);
if (unlikely(lock_ptr != q->lock_ptr)) {
spin_unlock(lock_ptr);
goto retry;
}
__futex_unqueue(q);
BUG_ON(q->pi_state);
spin_unlock(lock_ptr);
ret = 1;
}
return ret;
}
/*
 * Acquire q->lock_ptr, retrying if a concurrent rehash moved the waiter
 * to another bucket between the read and the lock. Returns with the
 * (stable) bucket lock held.
 */
void futex_q_lockptr_lock(struct futex_q *q)
{
spinlock_t *lock_ptr;
guard(rcu)();
retry:
lock_ptr = READ_ONCE(q->lock_ptr);
spin_lock(lock_ptr);
if (unlikely(lock_ptr != q->lock_ptr)) {
spin_unlock(lock_ptr);
goto retry;
}
}
/*
 * PI variant of unqueue: the entry may already be off the list (requeue
 * PI), and the pi_state reference is always dropped. Caller holds the
 * bucket lock.
 */
void futex_unqueue_pi(struct futex_q *q)
{
if (!plist_node_empty(&q->list))
__futex_unqueue(q);
BUG_ON(!q->pi_state);
put_pi_state(q->pi_state);
q->pi_state = NULL;
}
/* Distinguish the list_op_pending entry from regular robust-list entries. */
#define HANDLE_DEATH_PENDING true
#define HANDLE_DEATH_LIST false
/*
 * handle_futex_death - process one robust-list entry of a dying task
 * @uaddr:	user address of the futex word
 * @curr:	the exiting task
 * @pi:		entry is a PI futex
 * @pending_op:	entry came from list_op_pending
 *
 * If @curr owns the futex, mark it FUTEX_OWNER_DIED (preserving the
 * waiter bit) and wake one waiter. A pending op on an ownerless non-PI
 * futex gets a wakeup too, to cover a lock/unlock racing with exit.
 * Returns 0 on success, -1 on unrecoverable user access failure.
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
u32 uval, nval, mval;
pid_t owner;
int err;
/* Reject misaligned (hence bogus) addresses outright. */
if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
return -1;
retry:
if (get_user(uval, uaddr))
return -1;
owner = uval & FUTEX_TID_MASK;
if (pending_op && !pi && !owner) {
futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
FUTEX_BITSET_MATCH_ANY);
return 0;
}
if (owner != task_pid_vnr(curr))
return 0;
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
switch (err) {
case -EFAULT:
/* Fault the page in writably and retry the cmpxchg. */
if (fault_in_user_writeable(uaddr))
return -1;
goto retry;
case -EAGAIN:
cond_resched();
goto retry;
default:
WARN_ON_ONCE(1);
return err;
}
}
/* Lost a race against userspace modifying the word; re-evaluate. */
if (nval != uval)
goto retry;
if (!pi && (uval & FUTEX_WAITERS)) {
futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
FUTEX_BITSET_MATCH_ANY);
}
return 0;
}
/*
 * fetch_robust_entry - read one robust-list pointer from user space
 * @entry: output: the list entry with the PI bit stripped
 * @head:  user location holding the tagged pointer
 * @pi:    output: the PI bit (bit 0 of the stored value)
 *
 * Returns 0 on success, -EFAULT if the user access fails.
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*pi = uentry & 1;
	*entry = (void __user *)(uentry & ~1UL);

	return 0;
}
/*
 * Walk the exiting task's robust list and mark every futex it still owns
 * as OWNER_DIED. The next entry is fetched before handling the current
 * one because handle_futex_death() can wake a waiter that frees the
 * entry. ROBUST_LIST_LIMIT bounds the walk against malicious cycles;
 * list_op_pending covers an operation interrupted by death.
 */
static void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned int next_pi;
unsigned long futex_offset;
int rc;
if (fetch_robust_entry(&entry, &head->list.next, &pi))
return;
/* Offset from each list entry to its futex word. */
if (get_user(futex_offset, &head->futex_offset))
return;
if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
return;
next_entry = NULL;
while (entry != &head->list) {
/* Fetch the next pointer before the entry can be freed. */
rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
/* The pending entry is handled separately below. */
if (entry != pending) {
if (handle_futex_death((void __user *)entry + futex_offset,
curr, pi, HANDLE_DEATH_LIST))
return;
}
if (rc)
return;
entry = next_entry;
pi = next_pi;
if (!--limit)
break;
cond_resched();
}
if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
curr, pip, HANDLE_DEATH_PENDING);
}
}
#ifdef CONFIG_COMPAT
/*
 * futex_uaddr - compute a compat task's futex address from a list entry
 * @entry:        robust list entry (compat pointer width)
 * @futex_offset: signed offset from the entry to its futex word
 */
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);

	return compat_ptr(base + futex_offset);
}
/*
 * compat_fetch_robust_entry - read one compat robust-list pointer
 * @uentry: output: raw tagged value as stored by user space
 * @entry:  output: decoded entry with the PI bit stripped
 * @head:   user location holding the tagged compat pointer
 * @pi:     output: the PI bit (bit 0)
 *
 * Returns 0 on success, -EFAULT if the user access fails.
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*pi = (unsigned int)(*uentry) & 1;
	*entry = compat_ptr((*uentry) & ~1);

	return 0;
}
/*
 * Compat (32-bit pointers) mirror of exit_robust_list(): same algorithm,
 * but entries are compat_uptr_t and the futex address is reconstructed
 * via futex_uaddr().
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned int next_pi;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
return;
/* Offset from each list entry to its futex word. */
if (get_user(futex_offset, &head->futex_offset))
return;
if (compat_fetch_robust_entry(&upending, &pending,
&head->list_op_pending, &pip))
return;
next_entry = NULL;
while (entry != (struct robust_list __user *) &head->list) {
/* Fetch the next pointer before the entry can be freed. */
rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
(compat_uptr_t __user *)&entry->next, &next_pi);
/* The pending entry is handled separately below. */
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
if (handle_futex_death(uaddr, curr, pi,
HANDLE_DEATH_LIST))
return;
}
if (rc)
return;
uentry = next_uentry;
entry = next_entry;
pi = next_pi;
if (!--limit)
break;
cond_resched();
}
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
}
}
#endif
#ifdef CONFIG_FUTEX_PI
/*
 * Hand every PI futex still owned by the exiting task over to its
 * highest-priority waiter. For each pi_state: pin it with a refcount,
 * then take hb->lock -> pi_mutex.wait_lock -> curr->pi_lock in that
 * order, re-validating that the list head still points at it (another
 * task may have concurrently fixed up ownership). The guard(private_hash)
 * pins the mm's private hash across the walk.
 */
static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
union futex_key key = FUTEX_KEY_INIT;
might_sleep();
WARN_ON(curr != current);
guard(private_hash)();
raw_spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
/* Scope for the hash-bucket class guard. */
if (1) {
CLASS(hb, hb)(&key);
/* A zero refcount means it is already being freed; spin. */
if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
raw_spin_lock(&curr->pi_lock);
/* Someone else dealt with this pi_state meanwhile; move on. */
if (head->next != next) {
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
put_pi_state(pi_state);
continue;
}
WARN_ON(pi_state->owner != curr);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
}
/* Unlock the rtmutex, handing it to the top waiter. */
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
}
#else
/* !CONFIG_FUTEX_PI: nothing to clean up. */
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
/*
 * Release all futex state of @tsk: robust lists (native and compat) and
 * any still-owned PI futexes. The list pointers are cleared so a
 * recursive or repeated cleanup does nothing.
 */
static void futex_cleanup(struct task_struct *tsk)
{
if (unlikely(tsk->robust_list)) {
exit_robust_list(tsk);
tsk->robust_list = NULL;
}
#ifdef CONFIG_COMPAT
if (unlikely(tsk->compat_robust_list)) {
compat_exit_robust_list(tsk);
tsk->compat_robust_list = NULL;
}
#endif
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
}
/*
 * futex_exit_recursive - handle exit recursion during futex cleanup
 * @tsk: task faulting/oopsing while already in futex exit
 *
 * If cleanup was in progress the task still holds futex_exit_mutex;
 * release it and mark the state DEAD so other tasks stop waiting on us.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
if (tsk->futex_state == FUTEX_STATE_EXITING)
mutex_unlock(&tsk->futex_exit_mutex);
tsk->futex_state = FUTEX_STATE_DEAD;
}
/*
 * Enter futex-exit: take futex_exit_mutex (other tasks block on it in
 * wait_for_owner_exiting()) and flip the state to EXITING under pi_lock
 * so attach_to_pi_owner() observes a consistent state.
 */
static void futex_cleanup_begin(struct task_struct *tsk)
{
mutex_lock(&tsk->futex_exit_mutex);
raw_spin_lock_irq(&tsk->pi_lock);
tsk->futex_state = FUTEX_STATE_EXITING;
raw_spin_unlock_irq(&tsk->pi_lock);
}
/*
 * Leave futex-exit: publish the final state (OK after exec, DEAD after
 * exit) and release futex_exit_mutex, unblocking any waiters.
 */
static void futex_cleanup_end(struct task_struct *tsk, int state)
{
tsk->futex_state = state;
mutex_unlock(&tsk->futex_exit_mutex);
}
/*
 * Futex cleanup on exec(): the task lives on, so the state returns to
 * FUTEX_STATE_OK afterwards.
 */
void futex_exec_release(struct task_struct *tsk)
{
futex_cleanup_begin(tsk);
futex_cleanup(tsk);
futex_cleanup_end(tsk, FUTEX_STATE_OK);
}
/* Futex cleanup on exit(): the task is going away, state becomes DEAD. */
void futex_exit_release(struct task_struct *tsk)
{
futex_cleanup_begin(tsk);
futex_cleanup(tsk);
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
/*
 * Initialize one hash bucket. @fph is the owning private hash, or NULL
 * for buckets of the global table.
 */
static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
struct futex_private_hash *fph)
{
#ifdef CONFIG_FUTEX_PRIVATE_HASH
fhb->priv = fph;
#endif
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
}
/* futex_hash_allocate() flag: size was requested explicitly via prctl(). */
#define FH_CUSTOM 0x01
#ifdef CONFIG_FUTEX_PRIVATE_HASH
static void futex_ref_rcu(struct rcu_head *head);
/*
 * Start switching @fph from per-CPU to atomic reference mode: bias the
 * atomic counter with LONG_MAX so it cannot hit zero prematurely, publish
 * FR_ATOMIC (release pairs with the acquire/READ_ONCE in the get/put
 * paths), and schedule the second RCU callback that folds the per-CPU
 * counts in once all per-CPU users are gone.
 */
static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
atomic_long_set(&mm->futex_atomic, LONG_MAX);
smp_store_release(&fph->state, FR_ATOMIC);
call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
}
/*
 * Finish the per-CPU -> atomic switch after the grace period: fold all
 * per-CPU counts into futex_atomic and remove the LONG_MAX bias plus the
 * initial reference (the extra this_cpu_inc offsets the "+1" so the sum
 * works out). If the result is zero the hash is fully drained - wake the
 * waiter in futex_hash_allocate(). Drops the mm reference taken in
 * futex_ref_drop().
 */
static void __futex_ref_atomic_end(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
unsigned int count = 0;
long ret;
int cpu;
WARN_ON_ONCE(fph->state != FR_ATOMIC);
for_each_possible_cpu(cpu) {
unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
count += *ptr;
*ptr = 0;
}
this_cpu_inc(*mm->futex_ref);
ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
if (!ret)
wake_up_var(mm);
WARN_ON_ONCE(ret < 0);
mmput_async(mm);
}
/*
 * Two-phase RCU callback driving the refcount mode switch: the first
 * invocation (state still FR_PERCPU) begins the transition, the second
 * (after another grace period, state FR_ATOMIC) completes it.
 */
static void futex_ref_rcu(struct rcu_head *head)
{
struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
if (fph->state == FR_PERCPU) {
__futex_ref_atomic_begin(fph);
return;
}
__futex_ref_atomic_end(fph);
}
/*
 * Begin retiring @fph (still the published hash). Takes an mm reference
 * (released in __futex_ref_atomic_end()) to keep the mm alive across the
 * RCU callbacks. If the grace period recorded at install time has already
 * elapsed, start the atomic switch directly; otherwise go through one
 * more RCU callback first.
 */
static void futex_ref_drop(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
mmget(mm);
if (poll_state_synchronize_rcu(mm->futex_batches)) {
__futex_ref_atomic_begin(fph);
return;
}
call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
}
/*
 * Take a reference on @fph. Fast path: per-CPU increment while the hash
 * is in FR_PERCPU mode (the preempt guard keeps us on one CPU for the
 * per-CPU op). In FR_ATOMIC mode the get fails once the count reached
 * zero, i.e. the hash is dead.
 */
static bool futex_ref_get(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
guard(preempt)();
if (READ_ONCE(fph->state) == FR_PERCPU) {
__this_cpu_inc(*mm->futex_ref);
return true;
}
return atomic_long_inc_not_zero(&mm->futex_atomic);
}
/*
 * Drop a reference on @fph. Returns true only when, in FR_ATOMIC mode,
 * this put drained the count to zero - the caller then wakes the task
 * waiting for the hash to die.
 */
static bool futex_ref_put(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
guard(preempt)();
if (READ_ONCE(fph->state) == FR_PERCPU) {
__this_cpu_dec(*mm->futex_ref);
return false;
}
return atomic_long_dec_and_test(&mm->futex_atomic);
}
/*
 * A hash is dead once it switched to FR_ATOMIC mode (acquire pairs with
 * the release in __futex_ref_atomic_begin()) and the atomic count has
 * drained to zero; a hash still in FR_PERCPU mode is never dead.
 */
static bool futex_ref_is_dead(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
guard(rcu)();
if (smp_load_acquire(&fph->state) == FR_PERCPU)
return false;
return atomic_long_read(&mm->futex_atomic) == 0;
}
/*
 * Initialize the per-mm futex fields at mm creation. The per-CPU ref
 * counter is allocated lazily in futex_hash_allocate(). Returns 0.
 */
int futex_mm_init(struct mm_struct *mm)
{
mutex_init(&mm->futex_hash_lock);
RCU_INIT_POINTER(mm->futex_phash, NULL);
mm->futex_phash_new = NULL;
mm->futex_ref = NULL;
atomic_long_set(&mm->futex_atomic, 0);
mm->futex_batches = get_state_synchronize_rcu();
return 0;
}
/*
 * futex_hash_free - release all private-hash state of a dying mm
 * @mm: address space being torn down
 *
 * Called from mm teardown; no concurrent users can exist, so raw
 * dereference and immediate frees are fine. free_percpu() and kvfree()
 * both accept NULL, so no guards are required.
 */
void futex_hash_free(struct mm_struct *mm)
{
	free_percpu(mm->futex_ref);
	kvfree(mm->futex_phash_new);
	kvfree(rcu_dereference_raw(mm->futex_phash));
}
/*
 * Condition for wait_var_event() in futex_hash_allocate(): the pending
 * replacement is installable once the currently published hash is dead
 * (or the pending slot has been consumed by someone else).
 */
static bool futex_pivot_pending(struct mm_struct *mm)
{
struct futex_private_hash *fph;
guard(rcu)();
if (!mm->futex_phash_new)
return true;
fph = rcu_dereference(mm->futex_phash);
return futex_ref_is_dead(fph);
}
/*
 * futex_hash_less - ordering between two competing replacement hashes
 *
 * Returns true when @a ranks below @b and should be discarded in favour
 * of @b. Custom (prctl-sized) requests beat automatic ones; a zero
 * hash_mask (explicit request for the global hash) ranks above any sized
 * hash; otherwise the bigger table wins.
 */
static bool futex_hash_less(struct futex_private_hash *a,
			    struct futex_private_hash *b)
{
	/* Differing custom flags: the custom one wins. */
	if (a->custom != b->custom)
		return b->custom;

	/* A zero mask (use-global request) outranks any sized hash. */
	if (!b->hash_mask)
		return true;
	if (!a->hash_mask)
		return false;

	/* Both sized: prefer the larger table; ties keep @a. */
	return a->hash_mask < b->hash_mask;
}
/*
 * futex_hash_allocate - allocate and try to install a private hash
 * @hash_slots:	requested bucket count (power of two >= 2), or 0 to
 *		request the global hash
 * @flags:	FH_CUSTOM when the size comes from prctl()
 *
 * Allocates the new hash and, under futex_hash_lock, decides between it
 * and any already-pending replacement via futex_hash_less(); the loser
 * is freed. Custom requests wait (and retry) until the currently
 * published hash has drained; auto-resize requests just park the
 * replacement. Returns 0, or -EINVAL/-ENOMEM/-EBUSY.
 */
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
struct mm_struct *mm = current->mm;
struct futex_private_hash *fph;
bool custom = flags & FH_CUSTOM;
int i;
if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
return -EINVAL;
/*
 * Once a (custom) zero-sized hash pinned the mm to the global hash,
 * auto-resizing is a no-op and another custom request is refused.
 */
scoped_guard(rcu) {
fph = rcu_dereference(mm->futex_phash);
if (fph && !fph->hash_mask) {
if (custom)
return -EBUSY;
return 0;
}
}
/* Lazily allocate the per-CPU refcount, starting with one reference. */
if (!mm->futex_ref) {
mm->futex_ref = alloc_percpu(unsigned int);
if (!mm->futex_ref)
return -ENOMEM;
this_cpu_inc(*mm->futex_ref);
}
fph = kvzalloc(struct_size(fph, queues, hash_slots),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!fph)
return -ENOMEM;
fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
fph->custom = custom;
fph->mm = mm;
for (i = 0; i < hash_slots; i++)
futex_hash_bucket_init(&fph->queues[i], fph);
if (custom) {
/*
 * Only prctl() waits for the old hash to drain; clone()'s
 * auto-resize must not be delayed.
 */
again:
wait_var_event(mm, futex_pivot_pending(mm));
}
scoped_guard(mutex, &mm->futex_hash_lock) {
struct futex_private_hash *free __free(kvfree) = NULL;
struct futex_private_hash *cur, *new;
cur = rcu_dereference_protected(mm->futex_phash,
lockdep_is_held(&mm->futex_hash_lock));
new = mm->futex_phash_new;
mm->futex_phash_new = NULL;
if (fph) {
if (cur && !cur->hash_mask) {
/* Global hash is pinned; drop our allocation. */
free = fph;
mm->futex_phash_new = new;
return -EBUSY;
}
if (cur && !new) {
/* First replacement: start retiring the current hash. */
futex_ref_drop(cur);
}
if (new) {
/* Keep the better of ours and the pending one. */
if (futex_hash_less(new, fph)) {
free = new;
new = fph;
} else {
free = fph;
}
} else {
new = fph;
}
fph = NULL;
}
if (new) {
/* A custom request retries until the pivot succeeds. */
if (!__futex_pivot_hash(mm, new) && custom)
goto again;
}
}
return 0;
}
/*
 * Auto-size the private hash on thread creation: aim for 4 buckets per
 * thread (capped by online CPUs), clamped to [16, global hash size].
 * Does nothing for custom-sized hashes or when the current hash is
 * already big enough. Returns 0 or a futex_hash_allocate() error.
 */
int futex_hash_allocate_default(void)
{
unsigned int threads, buckets, current_buckets = 0;
struct futex_private_hash *fph;
if (!current->mm)
return 0;
scoped_guard(rcu) {
threads = min_t(unsigned int,
get_nr_threads(current),
num_online_cpus());
fph = rcu_dereference(current->mm->futex_phash);
if (fph) {
if (fph->custom)
return 0;
current_buckets = fph->hash_mask + 1;
}
}
buckets = roundup_pow_of_two(4 * threads);
buckets = clamp(buckets, 16, futex_hashmask + 1);
if (current_buckets >= buckets)
return 0;
return futex_hash_allocate(buckets, 0);
}
/*
 * Report the current private-hash bucket count for PR_FUTEX_HASH_GET_SLOTS;
 * 0 means no private hash (or the global hash is in use).
 */
static int futex_hash_get_slots(void)
{
struct futex_private_hash *fph;
guard(rcu)();
fph = rcu_dereference(current->mm->futex_phash);
if (fph && fph->hash_mask)
return fph->hash_mask + 1;
return 0;
}
#else
/* !CONFIG_FUTEX_PRIVATE_HASH: a private hash cannot be requested. */
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
return -EINVAL;
}
/* !CONFIG_FUTEX_PRIVATE_HASH: there are never any private slots. */
static int futex_hash_get_slots(void)
{
return 0;
}
#endif
/*
 * futex_hash_prctl - PR_FUTEX_HASH prctl() dispatcher
 * @arg2: sub-command (PR_FUTEX_HASH_SET_SLOTS / PR_FUTEX_HASH_GET_SLOTS)
 * @arg3: slot count for SET_SLOTS
 * @arg4: must be 0 for SET_SLOTS
 *
 * Returns 0 / slot count on success, negative error otherwise.
 */
int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
	switch (arg2) {
	case PR_FUTEX_HASH_SET_SLOTS:
		if (arg4 != 0)
			return -EINVAL;
		return futex_hash_allocate(arg3, FH_CUSTOM);

	case PR_FUTEX_HASH_GET_SLOTS:
		return futex_hash_get_slots();

	default:
		return -EINVAL;
	}
}
/*
 * Boot-time setup of the global futex hash: a power-of-two sized bucket
 * table per NUMA node (256 buckets per CPU divided across nodes, minimum
 * 4; fixed 16 for CONFIG_BASE_SMALL), allocated node-local and falling
 * back to vmalloc_huge_node() when the table exceeds MAX_PAGE_ORDER.
 */
static int __init futex_init(void)
{
unsigned long hashsize, i;
unsigned int order, n;
unsigned long size;
#ifdef CONFIG_BASE_SMALL
hashsize = 16;
#else
hashsize = 256 * num_possible_cpus();
hashsize /= num_possible_nodes();
hashsize = max(4, hashsize);
hashsize = roundup_pow_of_two(hashsize);
#endif
/* hashshift lets __futex_hash() derive a node from the upper bits. */
futex_hashshift = ilog2(hashsize);
size = sizeof(struct futex_hash_bucket) * hashsize;
order = get_order(size);
for_each_node(n) {
struct futex_hash_bucket *table;
if (order > MAX_PAGE_ORDER)
table = vmalloc_huge_node(size, GFP_KERNEL, n);
else
table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
BUG_ON(!table);
for (i = 0; i < hashsize; i++)
futex_hash_bucket_init(&table[i], NULL);
futex_queues[n] = table;
}
futex_hashmask = hashsize - 1;
pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
return 0;
}
core_initcall(futex_init);