root/security/landlock/tsync.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 *
 * One instance is set up by landlock_restrict_sibling_threads() and handed to
 * every sibling thread through its struct tsync_work.
 */
struct tsync_shared_context {
        /*
         * The old and tentative new creds of the calling thread.  Sibling
         * threads compare their own creds against old_cred to decide whether
         * they can take a reference on new_cred instead of preparing a copy.
         */
        const struct cred *old_cred;
        const struct cred *new_cred;

        /* True if sibling tasks need to set the no_new_privs flag. */
        bool set_no_new_privs;

        /*
         * An error encountered in preparation step, or 0.
         *
         * Written by sibling threads (failed prepare_creds()) and by the
         * calling thread (failed allocation, interrupted wait).  Sibling
         * threads only read it after ready_to_commit was completed.
         */
        atomic_t preparation_error;

        /*
         * Barrier after preparation step in restrict_one_thread.
         * The calling thread waits for completion.
         *
         * num_preparing counts the threads which still have to pass the
         * barrier; whoever brings it down to 0 completes all_prepared.
         *
         * Re-initialized on every round of looking for newly spawned threads.
         */
        atomic_t num_preparing;
        struct completion all_prepared;

        /*
         * Sibling threads wait for completion.  Completed by the calling
         * thread once it is done looking for sibling threads.
         */
        struct completion ready_to_commit;

        /*
         * Barrier after commit step (used by syscall impl to wait for
         * completion).
         *
         * num_unfinished counts the threads which have not left
         * restrict_one_thread yet, plus one for the calling thread; whoever
         * brings it down to 0 completes all_finished.
         */
        atomic_t num_unfinished;
        struct completion all_finished;
};

/*
 * struct tsync_work - per-thread context for one scheduled task_work
 */
struct tsync_work {
        /* The task_work entry; restrict_one_thread_callback() is its callback. */
        struct callback_head work;
        /*
         * The thread this entry was handed out for.  A reference is taken in
         * tsync_works_provide() and dropped in tsync_works_trim() or
         * tsync_works_release().
         */
        struct task_struct *task;
        /* The state shared between all threads taking part in the sync. */
        struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * ctx is the state shared with the calling thread and with all other sibling
 * threads which take part in the same synchronization.
 *
 * When this is run, the same function gets run in all other threads in the same
 * process (except for the calling thread which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread coordinate
 * through the shared ctx object to do their work in lockstep to implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials.  The commit operation can not fail any
 * more.
 *
 * On every path, this function decrements ctx->num_preparing and
 * ctx->num_unfinished exactly once and completes the matching completion when
 * the respective counter drops to 0.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
        int err;
        struct cred *cred = NULL;

        if (current_cred() == ctx->old_cred) {
                /*
                 * Switch out old_cred with new_cred, if possible.
                 *
                 * In the common case, where all threads initially point to the same
                 * struct cred, this optimization avoids creating separate redundant
                 * credentials objects for each, which would all have the same contents.
                 *
                 * Note: We are intentionally dropping the const qualifier here, because
                 * it is required by commit_creds() and abort_creds().
                 */
                cred = (struct cred *)get_cred(ctx->new_cred);
        } else {
                /* Else, prepare new creds and populate them. */
                cred = prepare_creds();

                if (!cred) {
                        atomic_set(&ctx->preparation_error, -ENOMEM);

                        /*
                         * Even on error, we need to adhere to the protocol and coordinate
                         * with concurrently running invocations.
                         */
                        if (atomic_dec_return(&ctx->num_preparing) == 0)
                                complete_all(&ctx->all_prepared);

                        /*
                         * There are no creds to commit or abort, so do not wait for
                         * ready_to_commit; only report that this thread is finished.
                         */
                        goto out;
                }

                /* Copy the Landlock part of the new creds into our own copy. */
                landlock_cred_copy(landlock_cred(cred),
                                   landlock_cred(ctx->new_cred));
        }

        /*
         * Barrier: Wait until all threads are done preparing.
         * After this point, we can have no more failures.
         */
        if (atomic_dec_return(&ctx->num_preparing) == 0)
                complete_all(&ctx->all_prepared);

        /*
         * Wait for signal from calling thread that it's safe to read the
         * preparation error now and we are ready to commit (or abort).
         */
        wait_for_completion(&ctx->ready_to_commit);

        /* Abort the commit if any of the other threads had an error. */
        err = atomic_read(&ctx->preparation_error);
        if (err) {
                abort_creds(cred);
                goto out;
        }

        /*
         * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
         * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
         * kernel/seccomp.c)
         */
        if (ctx->set_no_new_privs)
                task_set_no_new_privs(current);

        commit_creds(cred);

out:
        /* Notify the calling thread once all threads are done */
        if (atomic_dec_return(&ctx->num_unfinished) == 0)
                complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Looks up the struct tsync_work which embeds the given callback head and
 * calls restrict_one_thread with its struct tsync_shared_context.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
        struct tsync_shared_context *shared;

        shared = container_of(work, struct tsync_work, work)->shared_ctx;
        restrict_one_thread(shared);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
        /* Array of pointers to individually allocated struct tsync_work. */
        struct tsync_work **works;
        /* Number of leading entries handed out by tsync_works_provide(). */
        size_t size;
        /* Number of allocated entries in works; size never exceeds it. */
        size_t capacity;
};

/*
 * tsync_works_provide - hands out the next preallocated tsync_work for a task
 *
 * The returned entry refers to the given task; a reference on the task is
 * taken on behalf of the entry.
 *
 * Running out of preallocated entries is not a bug: new threads may have been
 * started after the capacity was last grown.
 *
 * Returns:
 *   The next unused preallocated entry, with its task pointer filled in.
 *
 *   NULL, if all preallocated entries are already in use.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
                                              struct task_struct *task)
{
        struct tsync_work *slot;

        if (s->size >= s->capacity)
                return NULL;

        slot = s->works[s->size++];
        slot->task = get_task_struct(task);

        return slot;
}

/**
 * tsync_works_trim - Give back the most recently provided tsync_work element
 *
 * @s: TSYNC works to trim.
 *
 * Undo the last tsync_works_provide() call: shrink @s by one element, drop the
 * task reference held by that element and reset the element to zero.
 *
 * This helper does not cancel a queued or running task work, it only touches
 * the bookkeeping in @s.
 */
static void tsync_works_trim(struct tsync_works *s)
{
        struct tsync_work *last;

        if (WARN_ON_ONCE(!s->size))
                return;

        /*
         * Shrinking the array makes the reserved memory reusable for the next
         * thread, if any.  It also guarantees that cancel_tsync_works() and
         * tsync_works_release() never come across a NULL task pointer.
         */
        last = s->works[--s->size];

        /*
         * Wipe the element, so that it does not look like a task_work was
         * handed to this task.
         */
        put_task_struct(last->task);
        *last = (typeof(*last)){};
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed.  (size + n <= capacity)
 *
 * On failure, s stays consistent: the capacity only accounts for the contexts
 * which were actually allocated, and these are kept for later use.
 *
 * Returns:
 *   -EOVERFLOW if size + n does not fit into a size_t
 *   -ENOMEM    if growing the array or allocating one of the contexts fails
 *   0          if the allocation succeeds or no reallocation was needed
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
        size_t wanted;
        struct tsync_work **grown;
        struct tsync_work *fresh;

        if (check_add_overflow(s->size, n, &wanted))
                return -EOVERFLOW;

        /* Nothing to do if the existing capacity already covers the request. */
        if (wanted <= s->capacity)
                return 0;

        grown = krealloc_array(s->works, wanted, sizeof(s->works[0]), flags);
        if (!grown)
                return -ENOMEM;

        s->works = grown;

        /*
         * Fill the new slots one by one.  The capacity is bumped with every
         * allocated context, so it is accurate even if we bail out early.
         */
        while (s->capacity < wanted) {
                fresh = kzalloc_obj(*fresh, flags);
                if (!fresh)
                        return -ENOMEM;

                s->works[s->capacity++] = fresh;
        }

        return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 *
 * Only the entries which were handed out by tsync_works_provide() are
 * considered.
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
                                      const struct task_struct *task)
{
        size_t idx = 0;

        while (idx < s->size) {
                if (s->works[idx++]->task == task)
                        return true;
        }

        return false;
}

/*
 * tsync_works_release - drops all task references and frees memory held by s
 *
 * s itself is not freed; it is reset to the empty array instead.
 */
static void tsync_works_release(struct tsync_works *s)
{
        size_t idx;

        /* Only the entries which were handed out hold a task reference. */
        for (idx = 0; idx < s->size; idx++) {
                struct task_struct *task = s->works[idx]->task;

                if (!WARN_ON_ONCE(!task))
                        put_task_struct(task);
        }

        /* Every preallocated entry has to be freed, handed out or not. */
        for (idx = 0; idx < s->capacity; idx++)
                kfree(s->works[idx]);

        kfree(s->works);
        *s = (struct tsync_works){};
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 *
 * The calling thread, exiting threads and threads which already have an entry
 * in works are not counted.  The thread list is walked under RCU.
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
        const struct task_struct *self = current;
        const struct task_struct *sibling;
        size_t unseen = 0;

        guard(rcu)();

        for_each_thread(self, sibling) {
                /*
                 * Leave out the thread initiating the sync, threads which are
                 * exiting and threads that we have already seen.
                 */
                if (sibling == self || (sibling->flags & PF_EXITING) ||
                    tsync_works_contains_task(works, sibling))
                        continue;

                unseen++;
        }

        return unseen;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 *                      which have not been scheduled yet
 *
 * works provides the preallocated per-thread contexts and records which
 * threads were already handled.  shared_ctx is stored in every context which
 * gets handed out.
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * The walk stops early if works runs out of preallocated contexts; the
 * remaining threads are then left for a later call.
 *
 * Returns:
 *     true, if at least one eligible sibling thread was found
 *     false, otherwise
 */
static bool schedule_task_work(struct tsync_works *works,
                               struct tsync_shared_context *shared_ctx)
{
        int err;
        const struct task_struct *caller;
        struct task_struct *thread;
        struct tsync_work *ctx;
        bool found_more_threads = false;

        caller = current;

        guard(rcu)();

        for_each_thread(caller, thread) {
                /* Skip current, since it is initiating the sync. */
                if (thread == caller)
                        continue;

                /* Skip exited threads. */
                if (thread->flags & PF_EXITING)
                        continue;

                /* Skip threads that we already looked at. */
                if (tsync_works_contains_task(works, thread))
                        continue;

                /*
                 * We found a sibling thread that is not doing its task_work yet, and
                 * which might spawn new threads before our task work runs, so we need
                 * at least one more round in the outer loop.
                 */
                found_more_threads = true;

                ctx = tsync_works_provide(works, thread);
                if (!ctx) {
                        /*
                         * We ran out of preallocated contexts -- we need to try again with
                         * this thread at a later time!
                         * found_more_threads is already true at this point.
                         */
                        break;
                }

                ctx->shared_ctx = shared_ctx;

                /*
                 * Account for the thread before the task work is queued, as the
                 * task work decrements both counters when it runs.
                 */
                atomic_inc(&shared_ctx->num_preparing);
                atomic_inc(&shared_ctx->num_unfinished);

                init_task_work(&ctx->work, restrict_one_thread_callback);
                err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
                if (unlikely(err)) {
                        /*
                         * task_work_add() only fails if the task is about to exit.  We
                         * checked that earlier, but it can happen as a race.  Resume
                         * without setting an error, as the task is probably gone in the
                         * next loop iteration.
                         */
                        tsync_works_trim(works);

                        /* The task work will never run, so undo the accounting. */
                        atomic_dec(&shared_ctx->num_preparing);
                        atomic_dec(&shared_ctx->num_unfinished);
                }
        }

        return found_more_threads;
}

/*
 * cancel_tsync_works - cancel all task works where it is possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running.  If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(const struct tsync_works *works,
                               struct tsync_shared_context *shared_ctx)
{
        size_t i;

        for (i = 0; i < works->size; i++) {
                if (WARN_ON_ONCE(!works->works[i]->task))
                        continue;

                if (!task_work_cancel(works->works[i]->task,
                                      &works->works[i]->work))
                        continue;

                /* After dequeueing, act as if the task work had executed. */

                if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
                        complete_all(&shared_ctx->all_prepared);

                if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
                        complete_all(&shared_ctx->all_finished);
        }
}

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all sibling
 *                                     threads
 *
 * old_cred and new_cred are the old and the tentative new creds of the calling
 * thread.  Sibling threads whose creds are old_cred take a reference on
 * new_cred; all others prepare their own creds and copy the Landlock part of
 * new_cred into them.
 *
 * Returns:
 *   0 if no error was recorded during preparation
 *
 *   the result of restart_syscall(), if another thread currently holds the
 *   exec_update_lock of this process
 *
 *   otherwise the recorded preparation error: -ENOMEM or -EOVERFLOW if
 *   memory could not be (pre)allocated, or -ERESTARTNOINTR if waiting for the
 *   sibling threads was interrupted
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
                                      const struct cred *new_cred)
{
        int err;
        struct tsync_shared_context shared_ctx;
        struct tsync_works works = {};
        size_t newly_discovered_threads;
        bool found_more_threads;

        atomic_set(&shared_ctx.preparation_error, 0);
        init_completion(&shared_ctx.all_prepared);
        init_completion(&shared_ctx.ready_to_commit);
        /*
         * num_unfinished starts at 1 on behalf of the calling thread, so that
         * all_finished can not be completed before we decrement it at the end.
         */
        atomic_set(&shared_ctx.num_unfinished, 1);
        init_completion(&shared_ctx.all_finished);
        shared_ctx.old_cred = old_cred;
        shared_ctx.new_cred = new_cred;
        /* Sibling threads inherit no_new_privs if the calling thread has it. */
        shared_ctx.set_no_new_privs = task_no_new_privs(current);

        /*
         * Serialize concurrent TSYNC operations to prevent deadlocks when
         * multiple threads call landlock_restrict_self() simultaneously.
         * If the lock is already held, we gracefully yield by restarting the
         * syscall. This allows the current thread to process pending
         * task_works before retrying.
         */
        if (!down_write_trylock(&current->signal->exec_update_lock))
                return restart_syscall();

        /*
         * We schedule a pseudo-signal task_work for each of the calling task's
         * sibling threads.  In the task work, each thread:
         *
         * 1) runs prepare_creds() and writes back the error to
         *    shared_ctx.preparation_error, if needed.
         *
         * 2) signals that it's done with prepare_creds() to the calling task.
         *    (completion "all_prepared").
         *
         * 3) waits for the completion "ready_to_commit".  This is sent by the
         *    calling task after ensuring that all sibling threads are done
         *    with the "preparation" stage.
         *
         *    After this barrier is reached, it's safe to read
         *    shared_ctx.preparation_error.
         *
         * 4) reads shared_ctx.preparation_error and then either does commit_creds()
         *    or abort_creds().
         *
         * 5) signals that it's done altogether (barrier synchronization
         *    "all_finished")
         *
         * Unlike seccomp, which modifies sibling tasks directly, we do not need to
         * acquire the cred_guard_mutex and sighand->siglock:
         *
         * - As in our case, all threads are themselves exchanging their own struct
         *   cred through the credentials API, no locks are needed for that.
         * - Our for_each_thread() loops are protected by RCU.
         * - We do not acquire a lock to keep the list of sibling threads stable
         *   between our for_each_thread loops.  If the list of available sibling
         *   threads changes between these for_each_thread loops, we make up for
         *   that by continuing to look for threads until they are all discovered
         *   and have entered their task_work, where they are unable to spawn new
         *   threads.
         */
        do {
                /* In RCU read-lock, count the threads we need. */
                newly_discovered_threads = count_additional_threads(&works);

                if (newly_discovered_threads == 0)
                        break; /* done */

                /*
                 * Preallocate outside of the RCU read-side section, so that
                 * schedule_task_work() does not need to allocate.
                 */
                err = tsync_works_grow_by(&works, newly_discovered_threads,
                                          GFP_KERNEL_ACCOUNT);
                if (err) {
                        atomic_set(&shared_ctx.preparation_error, err);
                        break;
                }

                /*
                 * The "all_prepared" barrier is used locally to the loop body, this use
                 * of for_each_thread().  We can reset it on each loop iteration because
                 * all previous loop iterations are done with it already.
                 *
                 * num_preparing is initialized to 1 so that the counter can not go to 0
                 * and mark the completion as done before all task works are registered.
                 * We decrement it at the end of the loop body.
                 */
                atomic_set(&shared_ctx.num_preparing, 1);
                reinit_completion(&shared_ctx.all_prepared);

                /*
                 * In RCU read-lock, schedule task work on newly discovered sibling
                 * tasks.
                 */
                found_more_threads = schedule_task_work(&works, &shared_ctx);

                /*
                 * Decrement num_preparing for current, to undo that we initialized it
                 * to 1 a few lines above.  If it drops to 0 here, nobody is left
                 * preparing and there is nothing to wait for.
                 */
                if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
                        if (wait_for_completion_interruptible(
                                    &shared_ctx.all_prepared)) {
                                /* In case of interruption, we need to retry the system call. */
                                atomic_set(&shared_ctx.preparation_error,
                                           -ERESTARTNOINTR);

                                /*
                                 * Opportunistic improvement: try to cancel task
                                 * works for tasks that did not start running
                                 * yet. We do not have a guarantee that it
                                 * cancels any of the enqueued task works
                                 * because task_work_run() might already have
                                 * dequeued them.
                                 */
                                cancel_tsync_works(&works, &shared_ctx);

                                /*
                                 * Break the loop with error. The cleanup code
                                 * after the loop unblocks the remaining
                                 * task_works.
                                 */
                                break;
                        }
                }
        } while (found_more_threads &&
                 !atomic_read(&shared_ctx.preparation_error));

        /*
         * We now have either (a) all sibling threads blocking and in "prepared"
         * state in the task work, or (b) the preparation error is set. Ask all
         * threads to commit (or abort).
         */
        complete_all(&shared_ctx.ready_to_commit);

        /*
         * Decrement num_unfinished for current, to undo that we initialized it to 1
         * at the beginning.  shared_ctx and works must stay around until all
         * task works are done with them, so wait for the stragglers.
         */
        if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
                wait_for_completion(&shared_ctx.all_finished);

        tsync_works_release(&works);
        up_write(&current->signal->exec_update_lock);
        /* 0 on success, or the error which made the sibling threads abort. */
        return atomic_read(&shared_ctx.preparation_error);
}