美文网首页
namespaces 学习笔记5:user ns 源码实现

namespaces 学习笔记5:user ns 源码实现

作者: 董泽润 | 来源:发表于2019-10-15 14:51 被阅读0次

TL;DR 上一篇测试了 user ns,本篇从源码层面看如何实现用户映射,以及权限控制

核心结构体

/*
 * State kept for one user namespace: the ID mapping tables plus ownership,
 * nesting and accounting fields.  create_user_ns() fills in a new instance;
 * init_user_ns is the statically initialised one.
 */
struct user_namespace {
    struct uid_gid_map  uid_map;    /* UID mapping table */
    struct uid_gid_map  gid_map;    /* GID mapping table */
    struct uid_gid_map  projid_map; /* presumably the project-ID mapping -- confirm */
    atomic_t        count;  /* set to 1 by create_user_ns(); presumably a refcount */
    struct user_namespace   *parent;    /* namespace this one was created from */
    int         level;  /* parent->level + 1, assigned in create_user_ns() */
    kuid_t          owner;  /* creator's euid, recorded at creation */
    kgid_t          group;  /* creator's egid, recorded at creation */
    struct ns_common    ns; /* carries the inum and ops of this namespace */
    unsigned long       flags;  /* copied from the parent at creation */

    /* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
    struct key      *persistent_keyring_register;
    struct rw_semaphore persistent_keyring_register_sem;
#endif
    struct work_struct  work;   /* initialised with free_user_ns as its handler */
#ifdef CONFIG_SYSCTL
    struct ctl_table_set    set;
    struct ctl_table_header *sysctls;
#endif
    struct ucounts      *ucounts;   /* value returned by inc_user_namespaces() at creation */
    int ucount_max[UCOUNT_COUNTS];  /* create_user_ns() starts every slot at INT_MAX */
} __randomize_layout;

和其它 ns 一样,user ns 也内嵌了 ns_common 结构体。另外 uid_map, gid_map 用来管理真正的用户映射,parent 指向父 user ns

/*
 * Security context of a task: its user/group identities, capability sets,
 * optional keyrings and LSM data, and the user namespace those are
 * relative to.
 */
struct cred {
    atomic_t    usage;      /* NOTE(review): presumably the reference count -- confirm */
#ifdef CONFIG_DEBUG_CREDENTIALS
    atomic_t    subscribers;    /* number of processes subscribed */
    void        *put_addr;  /* NOTE(review): debug-only; meaning not shown here */
    unsigned    magic;      /* presumably holds one of the CRED_MAGIC* values below */
#define CRED_MAGIC  0x43736564
#define CRED_MAGIC_DEAD 0x44656144
#endif
    kuid_t      uid;        /* real UID of the task */
    kgid_t      gid;        /* real GID of the task */
    kuid_t      suid;       /* saved UID of the task */
    kgid_t      sgid;       /* saved GID of the task */
    kuid_t      euid;       /* effective UID of the task */
    kgid_t      egid;       /* effective GID of the task */
    kuid_t      fsuid;      /* UID for VFS ops */
    kgid_t      fsgid;      /* GID for VFS ops */
    unsigned    securebits; /* SUID-less security management */
    kernel_cap_t    cap_inheritable; /* caps our children can inherit */
    kernel_cap_t    cap_permitted;  /* caps we're permitted */
    kernel_cap_t    cap_effective;  /* caps we can actually use */
    kernel_cap_t    cap_bset;   /* capability bounding set */
    kernel_cap_t    cap_ambient;    /* Ambient capability set */
#ifdef CONFIG_KEYS
    unsigned char   jit_keyring;    /* default keyring to attach requested
                     * keys to */
    struct key  *session_keyring; /* keyring inherited over fork */
    struct key  *process_keyring; /* keyring private to this process */
    struct key  *thread_keyring; /* keyring private to this thread */
    struct key  *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
    void        *security;  /* subjective LSM security */
#endif
    struct user_struct *user;   /* real user ID subscription */
    struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
    struct group_info *group_info;  /* supplementary groups for euid/fsgid */
    struct rcu_head rcu;        /* RCU deletion hook */
} __randomize_layout;

还有一个最重要的结构体 cred, 用到了再说

创建 user ns

与其它几个 ns 不同,user ns 创建是在 _do_fork 比较靠前完成的,由函数 copy_creds 单独创建,共享能共享的 credentials, 但是在特殊情况下还是要新建的。

/*
 * Set up the credentials of the task @p that is being created.
 *
 * This is an excerpt: lines reading "......" stand for code left out of the
 * listing, including the error_put label.
 *
 * @p:           task receiving the credentials
 * @clone_flags: clone flags; CLONE_NEWUSER requests a new user namespace
 *
 * Returns 0 on the path shown, or -ENOMEM when prepare_creds() fails.  A
 * negative result from create_user_ns() jumps to error_put (not shown),
 * which presumably hands that error back to the caller.
 */
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
    struct cred *new;
    int ret;
    ......
    /* Obtain a cred that this function is free to modify. */
    new = prepare_creds();
    if (!new)
        return -ENOMEM;

    /* A new user namespace, if requested, is created on the fresh cred. */
    if (clone_flags & CLONE_NEWUSER) {
        ret = create_user_ns(new);
        if (ret < 0)
            goto error_put;
    }
    ......
    /* Account one more process against the cred's user. */
    atomic_inc(&new->user->processes);
    /* Both the cred and real_cred pointers of the task refer to the new cred. */
    p->cred = p->real_cred = get_cred(new);
    alter_cred_subscribers(new, 2);
    validate_creds(new);
    return 0;
    ......
}
  1. prepare_creds 创建并初始化结构体 struct cred,说白了就是从父进程拷贝一份而已
  2. 判断 flags 是否有 CLONE_NEWUSER 标记,有的话就创建 user ns
  3. 更新子进程的 cred 和 real_cred
/*
 * Create a user namespace that is a child of new->user_ns and install it
 * on the credentials @new.
 *
 * The new namespace records the euid/egid found in @new as its owner and
 * group.  No uid_map/gid_map entries are written here.
 *
 * This is an excerpt: the trailing "......" stands for the cleanup labels
 * (fail_keyring, fail_free, fail_dec, fail), which are not shown.
 *
 * Returns 0 on success.  Before each check that can fail, ret is loaded with
 * the error for that check: -ENOSPC (nesting level or inc_user_namespaces()),
 * -EPERM (chrooted caller or unmapped owner/group), -ENOMEM (allocation or
 * sysctl setup), or whatever ns_alloc_inum() reported.
 */
int create_user_ns(struct cred *new)
{
    struct user_namespace *ns, *parent_ns = new->user_ns;
    kuid_t owner = new->euid;
    kgid_t group = new->egid;
    struct ucounts *ucounts;
    int ret, i;

    /* Refuse to nest any deeper once the parent is already beyond level 32. */
    ret = -ENOSPC;
    if (parent_ns->level > 32)
        goto fail;

    /* A NULL result is reported with the -ENOSPC set above. */
    ucounts = inc_user_namespaces(parent_ns, owner);
    if (!ucounts)
        goto fail;

    /*
     * Verify that we can not violate the policy of which files
     * may be accessed that is specified by the root directory,
     * by verifying that the root directory is at the root of the
     * mount namespace which allows all files to be accessed.
     */
    ret = -EPERM;
    if (current_chrooted())
        goto fail_dec;

    /* The creator needs a mapping in the parent user namespace
     * or else we won't be able to reasonably tell userspace who
     * created a user_namespace.
     */
    ret = -EPERM;
    if (!kuid_has_mapping(parent_ns, owner) ||
        !kgid_has_mapping(parent_ns, group))
        goto fail_dec;

    /* Zeroed allocation: every field not assigned below starts out as 0. */
    ret = -ENOMEM;
    ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
    if (!ns)
        goto fail_dec;

    ret = ns_alloc_inum(&ns->ns);
    if (ret)
        goto fail_free;
    ns->ns.ops = &userns_operations;

    atomic_set(&ns->count, 1);
    /* Leave the new->user_ns reference with the new user namespace. */
    ns->parent = parent_ns;
    ns->level = parent_ns->level + 1;
    ns->owner = owner;
    ns->group = group;
    INIT_WORK(&ns->work, free_user_ns);
    /* Every ucount limit of the new namespace starts at INT_MAX. */
    for (i = 0; i < UCOUNT_COUNTS; i++) {
        ns->ucount_max[i] = INT_MAX;
    }
    ns->ucounts = ucounts;

    /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
    mutex_lock(&userns_state_mutex);
    ns->flags = parent_ns->flags;
    mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_PERSISTENT_KEYRINGS
    init_rwsem(&ns->persistent_keyring_register_sem);
#endif
    ret = -ENOMEM;
    if (!setup_userns_sysctls(ns))
        goto fail_keyring;

    /*
     * Switch the credentials over to the new namespace.
     * NOTE(review): set_cred_user_ns() is not shown; it presumably also
     * resets the capability sets of @new -- confirm.
     */
    set_cred_user_ns(new, ns);
    return 0;
    ......
}

create_user_ns 作用就是基于父进程的 cred 来创建新的 user_namespace, 并更新子进程的 cred 字段,把原 root 用户替换成新的子 user ns 的 root 用户。

  1. owner, group 分别是父进程的有效用户和组,这是容器在父 user ns 中的真正身份
  2. user ns 之间可以有层级(嵌套)关系,但是最多不允许超过 32 层
  3. 判断当前是否已经 chroot,如果是的话报错失败,这里涉及一个安全漏洞,感兴趣自己搜吧
  4. kuid_has_mapping, kgid_has_mapping 判断当前有效用户 euid 和有效组 egid 是否已经处于父 user ns 的映射范围内,不在的话报错。这里面涉及 init_user_ns,当系统启动时,默认全局 user namespace 已经做了映射
  5. kmem_cache_zalloc 创建新的 user namespace 结构体,注意这个是空的,后续才是初始化各个字段,比如 ns.ops 赋值 userns_operations,设置 parent ns, 设置 owner, group 等
  6. set_cred_user_ns 设置 credential(也就是所谓的 cap),再设置 ns 字段,此时创建完成。注意,到现在子 user namespace 是没有 uid_map 或是 gid_map 的

初始 user ns

每个用户都属于某个 user namespace, 所以在系统启动时,就有一个默认的全局 init_user_ns

/*
 * The statically initialised user namespace.  Its three maps each consist
 * of a single extent with first == lower_first == 0 and a count of
 * 4294967295, i.e. every ID in that range maps to itself.  Fields that are
 * not listed here (such as parent and level) are zero-initialised.
 */
struct user_namespace init_user_ns = {
    .uid_map = {
        .nr_extents = 1,
        {
            .extent[0] = {
                .first = 0,
                .lower_first = 0,
                .count = 4294967295U,
            },
        },
    },
    .gid_map = {
        .nr_extents = 1,
        {
            .extent[0] = {
                .first = 0,
                .lower_first = 0,
                .count = 4294967295U,
            },
        },
    },
    .projid_map = {
        .nr_extents = 1,
        {
            .extent[0] = {
                .first = 0,
                .lower_first = 0,
                .count = 4294967295U,
            },
        },
    },
    /* NOTE(review): the reason for an initial count of 3 is not visible here. */
    .count = ATOMIC_INIT(3),
    .owner = GLOBAL_ROOT_UID,
    .group = GLOBAL_ROOT_GID,
    .ns.inum = PROC_USER_INIT_INO,
#ifdef CONFIG_USER_NS
    .ns.ops = &userns_operations,
#endif
    .flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
    .persistent_keyring_register_sem =
    __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
#endif
};

很多对于系统的操作都只能基于 init_user_ns,这里可以看到 uid_map, gid_map 默认是一一对应、全部映射的。当调用 create_user_ns 创建新的 user ns 时这两个字段是空的,需要由子进程或是拥有同样 euid 的父进程设置。

查看当前用户 id

user ns 的一个作用就是隔离,当我们使用 id 命令查看当前用户时,实际上已经被隔离了。

dongzerun@iZhp36ik63t96xhzjh00ujZ:~$ id
uid=1001(dongzerun) gid=1001(dongzerun) groups=1001(dongzerun)
dongzerun@iZhp36ik63t96xhzjh00ujZ:~$ strace id
......
geteuid()                               = 1001
getuid()                                = 1001
getegid()                               = 1001
getgid()                                = 1001
......

可以看到实际调用的是系统调用 geteuid, getuid, getegid, getgid,那直接看源码

/*
 * getuid() system call: translate the caller's real UID (current_uid())
 * into the caller's own user namespace (current_user_ns()) and return it.
 * from_kuid_munged() substitutes overflowuid when no mapping exists.
 */
SYSCALL_DEFINE0(getuid)
{
    /* Only we change this so SMP safe */
    return from_kuid_munged(current_user_ns(), current_uid());
}

/*
 * Translate @kuid into the user namespace @targ for reporting to userspace.
 *
 * Unlike from_kuid(), this never hands back (uid_t) -1: when from_kuid()
 * yields that value, overflowuid is returned in its place.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
    uid_t mapped = from_kuid(targ, kuid);

    /* A usable translation exists: report it unchanged. */
    if (mapped != (uid_t) -1)
        return mapped;

    /* No translation in @targ: fall back to the overflow UID. */
    mapped = overflowuid;
    return mapped;
}

current_user_ns, current_uid 都是宏,前者从 cred 获取当前的 user ns, 后者获取 uid。注意这里,子进程的 cred 是从父进程拷贝过来的,uid 就是父进程的 uid。但是我们知道 getuid 是获取当前用户 id 的,所以在容器中看到的 id 肯定不是父进程的 uid,需要通过 from_kuid 到 uid_map 中查找,如果不存在映射,那么返回 overflowuid,这个就是最开始看到的 nobody 的来源

小结

这块还是比较复杂的,以后看到 docker 的时候再回头补充一下,看看 docker 是如何实现的~~

相关文章

网友评论

      本文标题:namespaces 学习笔记5:user ns 源码实现

      本文链接:https://www.haomeiwen.com/subject/ifuxmctx.html