namespaces 学习笔记5：user ns 源码实现

作者: 董泽润 | 来源:发表于2019-10-15 14:51 被阅读0次

namespaces 学习笔记5：user ns 源码实现
namespaces 学习笔记1：mount ns 源码实现
namespaces 学习笔记2：uts ns 源码实现
namespaces 学习笔记3：ipc ns 源码实现
容器安全-Namespaces
namespaces 学习笔记4：什么是 user namesp
Spring Event 实现原理
SpringBoot启动加载Apollo配置过程
Dialog源码学习笔记
[mark] vue source code

TL;DR 上一篇测试了 user ns，本篇从源码层面看如何实现用户映射，以及权限控制

核心结构体

/*
 * State of one user namespace: the ID mapping tables, the link to the
 * parent namespace, and the identity of the creator.
 */
struct user_namespace {
    struct uid_gid_map  uid_map;    /* uid mapping table for this namespace */
    struct uid_gid_map  gid_map;    /* gid mapping table for this namespace */
    struct uid_gid_map  projid_map;
    atomic_t        count;          /* set to 1 by create_user_ns(); presumably a refcount */
    struct user_namespace   *parent;    /* namespace this one was created from */
    int         level;              /* parent's level + 1 */
    kuid_t          owner;          /* euid of the creating credentials */
    kgid_t          group;          /* egid of the creating credentials */
    struct ns_common    ns;
    unsigned long       flags;      /* copied from the parent at creation */

    /* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
    struct key      *persistent_keyring_register;
    struct rw_semaphore persistent_keyring_register_sem;
#endif
    struct work_struct  work;       /* initialised with free_user_ns as its handler */
#ifdef CONFIG_SYSCTL
    struct ctl_table_set    set;
    struct ctl_table_header *sysctls;
#endif
    struct ucounts      *ucounts;
    int ucount_max[UCOUNT_COUNTS];  /* every entry starts at INT_MAX */
} __randomize_layout;

和其它 ns 一样，都有 ns_common 结构体。另外 uid_map, gid_map 用来管理真正的用户映射，parent 指向父 user ns

/*
 * Credentials of a task: the real/saved/effective/filesystem uid and gid,
 * the capability sets, the keyrings (only with CONFIG_KEYS), and the user
 * namespace that the capabilities and keyrings are relative to.
 */
struct cred {
    atomic_t    usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
    atomic_t    subscribers;    /* number of processes subscribed */
    void        *put_addr;
    unsigned    magic;
#define CRED_MAGIC  0x43736564
#define CRED_MAGIC_DEAD 0x44656144
#endif
    kuid_t      uid;        /* real UID of the task */
    kgid_t      gid;        /* real GID of the task */
    kuid_t      suid;       /* saved UID of the task */
    kgid_t      sgid;       /* saved GID of the task */
    kuid_t      euid;       /* effective UID of the task */
    kgid_t      egid;       /* effective GID of the task */
    kuid_t      fsuid;      /* UID for VFS ops */
    kgid_t      fsgid;      /* GID for VFS ops */
    unsigned    securebits; /* SUID-less security management */
    kernel_cap_t    cap_inheritable; /* caps our children can inherit */
    kernel_cap_t    cap_permitted;  /* caps we're permitted */
    kernel_cap_t    cap_effective;  /* caps we can actually use */
    kernel_cap_t    cap_bset;   /* capability bounding set */
    kernel_cap_t    cap_ambient;    /* Ambient capability set */
#ifdef CONFIG_KEYS
    unsigned char   jit_keyring;    /* default keyring to attach requested
                     * keys to */
    struct key  *session_keyring; /* keyring inherited over fork */
    struct key  *process_keyring; /* keyring private to this process */
    struct key  *thread_keyring; /* keyring private to this thread */
    struct key  *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
    void        *security;  /* subjective LSM security */
#endif
    struct user_struct *user;   /* real user ID subscription */
    struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
    struct group_info *group_info;  /* supplementary groups for euid/fsgid */
    struct rcu_head rcu;        /* RCU deletion hook */
} __randomize_layout;

还有一个最重要的结构体 cred, 用到了再说

创建 user ns

与其它几个 ns 不同，user ns 创建是在 _do_fork 比较靠前完成的，由函数 copy_creds 单独创建，共享能共享的 credentials, 但是在特殊情况下还是要新建的。

/*
 * Set up the credentials of the task p being created.
 *
 * This is an abridged excerpt: lines shown as "......" are omitted, including
 * the error_put cleanup label. Visible steps:
 *   1. prepare_creds() obtains a new struct cred; -ENOMEM if that fails.
 *   2. With CLONE_NEWUSER in clone_flags, create_user_ns() builds a new user
 *      namespace on that cred; a negative result jumps to error_put.
 *   3. The per-user process counter is incremented and the cred is installed
 *      as both p->cred and p->real_cred.
 *
 * Returns 0 on the success path shown here.
 */
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
    struct cred *new;
    int ret;
    ......
    new = prepare_creds();
    if (!new)
        return -ENOMEM;

    /* Only a CLONE_NEWUSER clone gets a fresh user namespace. */
    if (clone_flags & CLONE_NEWUSER) {
        ret = create_user_ns(new);
        if (ret < 0)
            goto error_put;
    }
    ......
    atomic_inc(&new->user->processes);
    /* The same cred object serves as both the task's cred and real_cred. */
    p->cred = p->real_cred = get_cred(new);
    alter_cred_subscribers(new, 2);
    validate_creds(new);
    return 0;
    ......
}

prepare_creds 创建初始化结构体 struct cred，说白了就是从父进程拷贝一份而已
判断 flags 是否有 CLONE_NEWUSER 标记，有的话就创建 user ns
更新子进程的 cred 和 real_cred

/*
 * Create a new user namespace as a child of new->user_ns and attach it to
 * the credentials in new.
 *
 * This is an abridged excerpt: the cleanup labels (fail, fail_dec, fail_free,
 * fail_keyring) are in the omitted "......" part. ret is preset before each
 * check, so a failing check leaves with:
 *   -ENOSPC  parent nesting level above 32, or inc_user_namespaces() failed
 *   -EPERM   caller is chrooted, or the creator's euid/egid has no mapping
 *            in the parent namespace
 *   -ENOMEM  namespace allocation or sysctl setup failed
 *   other    whatever non-zero value ns_alloc_inum() returned
 *
 * Returns 0 on success. The new namespace's uid_map/gid_map are not filled
 * in here.
 */
int create_user_ns(struct cred *new)
{
    struct user_namespace *ns, *parent_ns = new->user_ns;
    kuid_t owner = new->euid;
    kgid_t group = new->egid;
    struct ucounts *ucounts;
    int ret, i;

    /* Refuse to nest below a parent whose level already exceeds 32. */
    ret = -ENOSPC;
    if (parent_ns->level > 32)
        goto fail;

    ucounts = inc_user_namespaces(parent_ns, owner);
    if (!ucounts)
        goto fail;

    /*
     * Verify that we can not violate the policy of which files
     * may be accessed that is specified by the root directory,
     * by verifing that the root directory is at the root of the
     * mount namespace which allows all files to be accessed.
     */
    ret = -EPERM;
    if (current_chrooted())
        goto fail_dec;

    /* The creator needs a mapping in the parent user namespace
     * or else we won't be able to reasonably tell userspace who
     * created a user_namespace.
     */
    ret = -EPERM;
    if (!kuid_has_mapping(parent_ns, owner) ||
        !kgid_has_mapping(parent_ns, group))
        goto fail_dec;

    /* Zero-filled allocation: every field not set below starts as 0. */
    ret = -ENOMEM;
    ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
    if (!ns)
        goto fail_dec;

    ret = ns_alloc_inum(&ns->ns);
    if (ret)
        goto fail_free;
    ns->ns.ops = &userns_operations;

    atomic_set(&ns->count, 1);
    /* Leave the new->user_ns reference with the new user namespace. */
    ns->parent = parent_ns;
    ns->level = parent_ns->level + 1;
    ns->owner = owner;
    ns->group = group;
    INIT_WORK(&ns->work, free_user_ns);
    for (i = 0; i < UCOUNT_COUNTS; i++) {
        ns->ucount_max[i] = INT_MAX;
    }
    ns->ucounts = ucounts;

    /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
    mutex_lock(&userns_state_mutex);
    ns->flags = parent_ns->flags;
    mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_PERSISTENT_KEYRINGS
    init_rwsem(&ns->persistent_keyring_register_sem);
#endif
    ret = -ENOMEM;
    if (!setup_userns_sysctls(ns))
        goto fail_keyring;

    /* Point the new credentials at the freshly built namespace. */
    set_cred_user_ns(new, ns);
    return 0;
    ......
}

create_user_ns 作用就是基于父进程的 cred 来创建新的 user_namespace，并更新子进程的 cred 字段，把原 root 用户替换成新的子 user ns 的 root 用户。

owner, group 分别是父进程的有效用户和组，这是容器在父 user ns 中的真正身份
user ns 是可以层级关系的，但是最高不允许超过 32 层
判断当前是否已经 chroot，如果是的话报错失败，这里涉及一个安全漏洞，感兴趣自己搜吧
kuid_has_mapping 判断当前 euid, egid 有效用户是否已经处于映射范围内，不在的话报错。这里面涉及 init_user_ns，当系统启动时，默认全局 user namespace 已经做了映射
kmem_cache_zalloc 创建新的 user namespace 结构体，注意这个是空的，后续才是初始化各个字段，比如 ns.ops 赋值 userns_operations，设置 parent ns, 设置 owner, group 等
set_cred_user_ns 设置 credential 就是所谓的 cap，再设置 ns 字段，此时完成。注意，到现在子 user namespace 是没有 uid_map 或是 gid_map 的

初始 user ns

每个用户都属于某个 user namespace, 所以在系统启动时，就有一个默认的全局 init_user_ns

/*
 * The initial (global) user namespace.
 *
 * Each of uid_map, gid_map and projid_map holds exactly one extent that maps
 * IDs starting at 0 onto IDs starting at 0 for 4294967295 entries, i.e. an
 * identity mapping. The owner and group are the global root uid/gid.
 */
struct user_namespace init_user_ns = {
    .uid_map = {
        .nr_extents = 1,
        {
            .extent[0] = {
                .first = 0,
                .lower_first = 0,
                .count = 4294967295U,
            },
        },
    },
    .gid_map = {
        .nr_extents = 1,
        {
            .extent[0] = {
                .first = 0,
                .lower_first = 0,
                .count = 4294967295U,
            },
        },
    },
    .projid_map = {
        .nr_extents = 1,
        {
            .extent[0] = {
                .first = 0,
                .lower_first = 0,
                .count = 4294967295U,
            },
        },
    },
    .count = ATOMIC_INIT(3),
    .owner = GLOBAL_ROOT_UID,
    .group = GLOBAL_ROOT_GID,
    .ns.inum = PROC_USER_INIT_INO,
#ifdef CONFIG_USER_NS
    .ns.ops = &userns_operations,
#endif
    .flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
    .persistent_keyring_register_sem =
    __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
#endif
};

很多对于系统的操作都只能基于 init_user_ns，这里可以看到uid_map , gid_map 默认是一一对应全部映射了。当调用 create_user_ns 创建新的 user ns 时这两字段是空的，需要由子进程或是拥有同样 euid 的父进程设置。

查看当前用户 id

user ns 的一个作用就是隔离，当我们使用 id 命令查看当前用户时，实际上已经被隔离了。

dongzerun@iZhp36ik63t96xhzjh00ujZ:~$ id
uid=1001(dongzerun) gid=1001(dongzerun) groups=1001(dongzerun)
dongzerun@iZhp36ik63t96xhzjh00ujZ:~$ strace id
......
geteuid()                               = 1001
getuid()                                = 1001
getegid()                               = 1001
getgid()                                = 1001
......

可以看到实际调用的是系统调用 geteuid, getuid, getegid, getgid，那直接看源码

/*
 * getuid system call: translate the current task's uid (current_uid()) into
 * the current user namespace (current_user_ns()) and return the result.
 * The translation is done by from_kuid_munged().
 */
SYSCALL_DEFINE0(getuid)
{
    /* Only we change this so SMP safe */
    return from_kuid_munged(current_user_ns(), current_uid());
}

/*
 * Translate kuid into a uid_t as seen from the user namespace targ.
 *
 * The lookup itself is delegated to from_kuid(). When that returns
 * (uid_t)-1 (treated here as "no usable mapping"), overflowuid is returned
 * instead, so the caller never sees (uid_t)-1 from this function.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
    uid_t uid;
    uid = from_kuid(targ, kuid);

    /* Replace the failure sentinel with the overflow uid. */
    if (uid == (uid_t) -1)
        uid = overflowuid;
    return uid;
}

current_user_ns, current_uid 都是宏，前者从 cred 获取当前的 user ns, 后者获取 uid，注意这里，子进程的 cred 是从父进程拷贝过来的，uid 就是父进程的 uid。但是我们知道 getuid 是获取当前用户 id 的，所以在容器中看到的 id 肯定不是父进程的 uid，需要 from_kuid 从 uid_map 中查找，如果不存在，那么返回 overflowuid，这个就是最开始看到的 nobody 的来源

小结

这块还是比较复杂的，以后看到 docker 的时候再回头补充一下，看看 docker 是如何实现的~~

namespaces 学习笔记5：user ns 源码实现
TL;DR 上一篇测试了 user ns，本篇从源码层面看如何实现用户映射，以及权限控制核心结构体和其它 ns...
namespaces 学习笔记1：mount ns 源码实现
TL;DR 最近想看 docker 相关的实现，自然涉及底层 namespace, 所以边做实验边看源码，感兴趣的...
namespaces 学习笔记2：uts ns 源码实现
TL;DR 最近想看 docker 相关的实现，自然涉及底层 namespace, 所以边做实验边看源码，感兴趣的...
namespaces 学习笔记3：ipc ns 源码实现
TL;DR 这篇讲 ipc namespace，用于隔离共享内存，队列，信号量。虽然这块用的很少，还是要弄清楚原理...
容器安全-Namespaces
namespaces的使用情况 linux每个进程都有自己的namespaces，可以通过/proc/pid/ns...
namespaces 学习笔记4：什么是 user namesp
TL;DR 这篇讲 user namespace，用于隔离用户，组，root 目录以及 capabilities。...
Spring Event 实现原理
笔记简述本学习笔记主要介绍Spring的事件通知是如何实现的，以及源码分析 Demo 事件定义继承 Applic...
SpringBoot启动加载Apollo配置过程
源码流程： 1、遍历yaml配置的namespaces 2、每个namespace创建对应的config 3、第一...
Dialog源码学习笔记
Dialog源码学习笔记 [TOC] (简书这个不支持吗？) Dialog源码学习笔记Dialog中值得学习之-...
[mark] vue source code
vue源码学习笔记

namespaces 学习笔记5：user ns 源码实现

核心结构体

创建 user ns

初始 user ns

查看当前用户 id

小结

相关文章

namespaces 学习笔记5：user ns 源码实现

namespaces 学习笔记1：mount ns 源码实现

namespaces 学习笔记2：uts ns 源码实现

namespaces 学习笔记3：ipc ns 源码实现

容器安全-Namespaces

namespaces 学习笔记4：什么是 user namesp

Spring Event 实现原理

SpringBoot启动加载Apollo配置过程

Dialog源码学习笔记

[mark] vue source code

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读