root/kernel/nscommon.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */

#include <linux/ns_common.h>
#include <linux/nstree.h>
#include <linux/proc_ns.h>
#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>

#ifdef CONFIG_DEBUG_VFS
/*
 * Debug-only consistency check between a namespace's type and the
 * operations table it is being initialized with.
 *
 * @ns:  namespace whose ns_type selects the expected operations table
 * @ops: operations table the caller wants to attach to @ns
 *
 * For each namespace type handled below, VFS_WARN_ON_ONCE() is invoked
 * when @ops is not the operations table belonging to that type. Types
 * whose support is compiled out, and any value of ns_type that has no
 * case here, are not checked at all.
 */
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
        switch (ns->ns_type) {
#ifdef CONFIG_CGROUPS
        case CLONE_NEWCGROUP:
                VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
                break;
#endif
#ifdef CONFIG_IPC_NS
        case CLONE_NEWIPC:
                VFS_WARN_ON_ONCE(ops != &ipcns_operations);
                break;
#endif
        /* The mount namespace case is the only one without a config guard. */
        case CLONE_NEWNS:
                VFS_WARN_ON_ONCE(ops != &mntns_operations);
                break;
#ifdef CONFIG_NET_NS
        case CLONE_NEWNET:
                VFS_WARN_ON_ONCE(ops != &netns_operations);
                break;
#endif
#ifdef CONFIG_PID_NS
        case CLONE_NEWPID:
                VFS_WARN_ON_ONCE(ops != &pidns_operations);
                break;
#endif
#ifdef CONFIG_TIME_NS
        case CLONE_NEWTIME:
                VFS_WARN_ON_ONCE(ops != &timens_operations);
                break;
#endif
#ifdef CONFIG_USER_NS
        case CLONE_NEWUSER:
                VFS_WARN_ON_ONCE(ops != &userns_operations);
                break;
#endif
#ifdef CONFIG_UTS_NS
        case CLONE_NEWUTS:
                VFS_WARN_ON_ONCE(ops != &utsns_operations);
                break;
#endif
        }
}
#endif

/*
 * Set up the common part of a namespace.
 *
 * @ns:      namespace to initialize
 * @ns_type: type value recorded in ns->ns_type
 * @ops:     operations table recorded in ns->ops
 * @inum:    inode number to use; when zero, one is obtained from
 *           proc_alloc_inum() instead
 *
 * The regular reference count starts at 1. The active reference count
 * starts at 1 for namespaces that is_ns_init_inum() recognizes and at 0
 * for everything else.
 *
 * Returns 0 on success, or the error reported by proc_alloc_inum().
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
        /* Plain bookkeeping fields. */
        ns->ns_type = ns_type;
        ns->ops = ops;
        ns->ns_id = 0;
        ns->stashed = NULL;
        refcount_set(&ns->__ns_ref, 1);

        /* Tree linkage starts out empty. */
        ns_tree_node_init(&ns->ns_tree_node);
        ns_tree_node_init(&ns->ns_unified_node);
        ns_tree_node_init(&ns->ns_owner_node);
        ns_tree_root_init(&ns->ns_owner_root);

#ifdef CONFIG_DEBUG_VFS
        /* Needs ns->ns_type, which has been stored above. */
        ns_debug(ns, ops);
#endif

        /* A caller-supplied inode number wins; otherwise allocate one. */
        if (!inum) {
                int err = proc_alloc_inum(&ns->inum);

                if (err)
                        return err;
        } else {
                ns->inum = inum;
        }

        /*
         * Initial namespaces are always active, so they begin life with
         * one active reference. Every other namespace begins with none
         * and only gains one once something takes an active reference.
         */
        atomic_set(&ns->__ns_ref_active, is_ns_init_inum(ns) ? 1 : 0);
        return 0;
}

/*
 * Release the inode number recorded in @ns->inum by handing it to
 * proc_free_inum().
 *
 * NOTE(review): __ns_common_init() may have stored a caller-supplied
 * number rather than one from proc_alloc_inum(); presumably such
 * namespaces never reach this function - confirm against callers.
 */
void __ns_common_free(struct ns_common *ns)
{
        proc_free_inum(ns->inum);
}

/*
 * Return the owning user namespace of @ns as a struct ns_common, or
 * NULL when there is nothing for the caller to walk to.
 *
 * NULL is returned when @ns has no operations table, when the ->owner()
 * callback reports no owner, or when the owner is init_user_ns.
 */
struct ns_common *__must_check ns_owner(struct ns_common *ns)
{
        struct user_namespace *owner;

        /* No operations table means no way to ask for an owner. */
        if (unlikely(!ns->ops))
                return NULL;

        VFS_WARN_ON_ONCE(!ns->ops->owner);
        owner = ns->ops->owner(ns);

        /* Only init_user_ns itself is expected to come back ownerless. */
        VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));

        /*
         * Either there is no owner at all, or the owner is init_user_ns,
         * which is always active and therefore skipped.
         */
        if (!owner || owner == &init_user_ns)
                return NULL;

        return to_ns_common(owner);
}

/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * active count itself goes down.
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * The iteration stops once we reach a namespace that still has active
 * references.
 */
void __ns_ref_active_put(struct ns_common *ns)
{
        /* Initial namespaces are always active. */
        if (is_ns_init_id(ns))
                return;

        if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
                VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
                return;
        }

        VFS_WARN_ON_ONCE(is_ns_init_id(ns));
        VFS_WARN_ON_ONCE(!__ns_ref_read(ns));

        for (;;) {
                ns = ns_owner(ns);
                if (!ns)
                        return;
                VFS_WARN_ON_ONCE(is_ns_init_id(ns));
                if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
                        VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
                        return;
                }
        }
}

/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * active count itself goes down. This makes it possible to efficiently
 * resurrect a namespace tree:
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Assume the whole tree is dead but all namespaces are still active:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   -   -
 *                        user_ns2 (0)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      -
 *                        user_ns1 (1)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   +   -
 *                        user_ns2 (1)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If net_ns had a zero reference count and we bumped it we also need to
 * take another reference on its owning user namespace. Similarly, if
 * pid_ns had a zero reference count it also needs to take another
 * reference on its owning user namespace. So both net_ns and pid_ns
 * will each have their own reference on the owning user namespace.
 *
 * If the owning user namespace user_ns1 had a zero reference count then
 * it also needs to take another reference on its owning user namespace
 * and so on.
 */
void __ns_ref_active_get(struct ns_common *ns)
{
        int prev;

        /* Initial namespaces are always active. */
        if (is_ns_init_id(ns))
                return;

        /* If we didn't resurrect the namespace we're done. */
        prev = atomic_fetch_add(1, &ns->__ns_ref_active);
        VFS_WARN_ON_ONCE(prev < 0);
        if (likely(prev))
                return;

        /*
         * We did resurrect it. Walk the ownership hierarchy upwards
         * until we found an owning user namespace that is active.
         */
        for (;;) {
                ns = ns_owner(ns);
                if (!ns)
                        return;

                VFS_WARN_ON_ONCE(is_ns_init_id(ns));
                prev = atomic_fetch_add(1, &ns->__ns_ref_active);
                VFS_WARN_ON_ONCE(prev < 0);
                if (likely(prev))
                        return;
        }
}

bool may_see_all_namespaces(void)
{
        return (task_active_pid_ns(current) == &init_pid_ns) &&
               ns_capable_noaudit(init_pid_ns.user_ns, CAP_SYS_ADMIN);
}