TL;DR 上一篇测试了 user ns,本篇从源码层面看如何实现用户映射,以及权限控制
核心结构体
/*
 * struct user_namespace - one user namespace instance.
 *
 * Instances are linked to the namespace they were created from through
 * @parent. create_user_ns() fills in a freshly zero-allocated instance from
 * the credentials of the creating task; init_user_ns is the statically
 * initialised initial instance.
 */
struct user_namespace {
/* ID translation tables. init_user_ns installs one identity extent in each. */
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
struct uid_gid_map projid_map;
/* Set to 1 by create_user_ns() -- presumably a reference count; TODO confirm. */
atomic_t count;
/* Namespace this one was created from (the creating cred's user_ns). */
struct user_namespace *parent;
/* Nesting depth: parent->level + 1 (see create_user_ns()). */
int level;
/* Effective UID/GID of the creating credentials at creation time. */
kuid_t owner;
kgid_t group;
/* Common namespace part; carries the inode number and the ops table. */
struct ns_common ns;
/* Copied from the parent's flags when the namespace is created. */
unsigned long flags;
/* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
struct key *persistent_keyring_register;
struct rw_semaphore persistent_keyring_register_sem;
#endif
/* Initialised with free_user_ns as its handler by create_user_ns(). */
struct work_struct work;
#ifdef CONFIG_SYSCTL
struct ctl_table_set set;
struct ctl_table_header *sysctls;
#endif
/* Value returned by inc_user_namespaces(parent_ns, owner) at creation. */
struct ucounts *ucounts;
/* One slot per counter; create_user_ns() sets every slot to INT_MAX. */
int ucount_max[UCOUNT_COUNTS];
} __randomize_layout;
和其它 ns 一样,都有 ns_common
结构体。另外 uid_map
, gid_map
用来管理真正的用户映射,parent
指向父 user ns
/*
 * struct cred - the credential set attached to a task.
 *
 * Groups the task's UID/GID variants, its capability sets, optional keyring
 * and LSM data, and the user namespace those capabilities and keyrings are
 * relative to. copy_creds() installs one instance as both ->cred and
 * ->real_cred of a newly created task.
 */
struct cred {
/* presumably the reference count taken by get_cred() -- TODO confirm */
atomic_t usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
atomic_t subscribers; /* number of processes subscribed */
void *put_addr;
unsigned magic;
#define CRED_MAGIC 0x43736564
#define CRED_MAGIC_DEAD 0x44656144
#endif
kuid_t uid; /* real UID of the task */
kgid_t gid; /* real GID of the task */
kuid_t suid; /* saved UID of the task */
kgid_t sgid; /* saved GID of the task */
kuid_t euid; /* effective UID of the task */
kgid_t egid; /* effective GID of the task */
kuid_t fsuid; /* UID for VFS ops */
kgid_t fsgid; /* GID for VFS ops */
unsigned securebits; /* SUID-less security management */
kernel_cap_t cap_inheritable; /* caps our children can inherit */
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
struct key *session_keyring; /* keyring inherited over fork */
struct key *process_keyring; /* keyring private to this process */
struct key *thread_keyring; /* keyring private to this thread */
struct key *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
void *security; /* subjective LSM security */
#endif
struct user_struct *user; /* real user ID subscription */
/* Replaced via set_cred_user_ns() when create_user_ns() succeeds. */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
struct group_info *group_info; /* supplementary groups for euid/fsgid */
struct rcu_head rcu; /* RCU deletion hook */
} __randomize_layout;
还有一个最重要的结构体 cred
, 用到了再说
创建 user ns
与其它几个 ns 不同,user ns 创建是在 _do_fork
比较靠前完成的,由函数 copy_creds
单独创建,共享能共享的 credentials, 但是在特殊情况下还是要新建的。
/*
 * copy_creds - set up the credentials of a task that is being created.
 * @p:           the new task
 * @clone_flags: clone flags; CLONE_NEWUSER requests a new user namespace
 *
 * This is an excerpt: each "......" line marks code omitted from the
 * listing, including the error_put label.
 *
 * Returns 0 on the path shown, or -ENOMEM when prepare_creds() fails. When
 * create_user_ns() fails, control jumps to error_put (not shown), which
 * presumably releases @new and returns ret -- TODO confirm.
 */
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
struct cred *new;
int ret;
......
/* Obtain a new credential set for the child; NULL means out of memory. */
new = prepare_creds();
if (!new)
return -ENOMEM;
/* Only build a new user namespace when the caller asked for one. */
if (clone_flags & CLONE_NEWUSER) {
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
}
......
/* Count one more process against the user_struct referenced by the cred. */
atomic_inc(&new->user->processes);
/* Both the cred and real_cred pointers of the new task refer to the same cred. */
p->cred = p->real_cred = get_cred(new);
/* The 2 presumably matches the two pointers assigned above -- TODO confirm. */
alter_cred_subscribers(new, 2);
validate_creds(new);
return 0;
......
}
- prepare_creds 创建并初始化结构体 struct cred,说白了就是从父进程拷贝一份而已
- 判断 flags 是否有 CLONE_NEWUSER 标记,有的话就创建 user ns
- 更新子进程的 cred 和 real_cred
/*
 * create_user_ns - create a new user namespace and attach it to @new.
 * @new: credentials being prepared; its current user_ns becomes the parent,
 *       and its euid/egid become the owner/group of the new namespace
 *
 * This is an excerpt: the trailing "......" stands for the error-unwind
 * labels (fail, fail_dec, fail_free, fail_keyring) omitted from the listing.
 *
 * Returns 0 on success. Before jumping to an unwind label, ret is set to:
 *   -ENOSPC  the parent's level is above 32, or inc_user_namespaces() failed
 *   -EPERM   the caller is chrooted, or owner/group has no mapping in the
 *            parent namespace
 *   -ENOMEM  the namespace allocation or setup_userns_sysctls() failed
 *   (other)  whatever non-zero value ns_alloc_inum() returned
 * The labels presumably undo the earlier steps and return ret -- TODO confirm.
 */
int create_user_ns(struct cred *new)
{
struct user_namespace *ns, *parent_ns = new->user_ns;
kuid_t owner = new->euid;
kgid_t group = new->egid;
struct ucounts *ucounts;
int ret, i;
/* Bound the nesting depth of user namespaces. */
ret = -ENOSPC;
if (parent_ns->level > 32)
goto fail;
/* A NULL result is reported as -ENOSPC as well (ret is still set). */
ucounts = inc_user_namespaces(parent_ns, owner);
if (!ucounts)
goto fail;
/*
 * Verify that we can not violate the policy of which files
 * may be accessed that is specified by the root directory,
 * by verifying that the root directory is at the root of the
 * mount namespace which allows all files to be accessed.
 */
ret = -EPERM;
if (current_chrooted())
goto fail_dec;
/* The creator needs a mapping in the parent user namespace
 * or else we won't be able to reasonably tell userspace who
 * created a user_namespace.
 */
ret = -EPERM;
if (!kuid_has_mapping(parent_ns, owner) ||
!kgid_has_mapping(parent_ns, group))
goto fail_dec;
/* Zeroing allocation: the uid/gid/projid maps of the new namespace start out empty. */
ret = -ENOMEM;
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
goto fail_dec;
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
INIT_WORK(&ns->work, free_user_ns);
/* Start every per-namespace counter maximum at INT_MAX. */
for (i = 0; i < UCOUNT_COUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
mutex_lock(&userns_state_mutex);
ns->flags = parent_ns->flags;
mutex_unlock(&userns_state_mutex);
#ifdef CONFIG_PERSISTENT_KEYRINGS
init_rwsem(&ns->persistent_keyring_register_sem);
#endif
ret = -ENOMEM;
if (!setup_userns_sysctls(ns))
goto fail_keyring;
/* Finally point the prepared credentials at the new namespace. */
set_cred_user_ns(new, ns);
return 0;
......
}
create_user_ns
作用就是基于父进程的 cred
来创建新的 user_namespace
, 并更新子进程的 cred
字段,把原 root 用户替换成新的子 user ns
的 root 用户。
- owner, group 分别是父进程的有效用户和组,这是容器在父 user ns 中的真正身份
- user ns 是可以有层级关系(嵌套)的,但嵌套深度有上限:父 user ns 的 level 超过 32 时创建失败,返回 -ENOSPC
- 判断当前是否已经 chroot,如果是的话报错失败,这里涉及一个安全漏洞,感兴趣自己搜吧
- kuid_has_mapping / kgid_has_mapping 判断当前的 euid, egid 是否已经处于父 user ns 的映射范围内,不在的话报错。这里面涉及 init_user_ns,当系统启动时,默认全局 user namespace 已经做了映射
- kmem_cache_zalloc 创建新的 user namespace 结构体,注意这个是空的,后续才是初始化各个字段,比如 ns.ops 赋值 userns_operations,设置 parent ns, 设置 owner, group 等
- set_cred_user_ns 设置 credential 就是所谓的 cap,再设置 ns 字段,此时完成。注意,到现在子 user namespace 是没有 uid_map 或是 gid_map 的
初始 user ns
每个用户都属于某个 user namespace, 所以在系统启动时,就有一个默认的全局 init_user_ns
/*
 * init_user_ns - the initial, statically defined user namespace.
 *
 * Each of the three ID maps holds exactly one extent that starts at 0 on
 * both sides and spans 4294967295 IDs, i.e. an identity mapping. Members
 * without an initialiser here (such as .parent and .level) are
 * zero-initialised, as for any static object.
 */
struct user_namespace init_user_ns = {
/* UID map: IDs from 0 map to the same value. */
.uid_map = {
.nr_extents = 1,
{
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
},
},
/* GID map: same identity extent as the UID map. */
.gid_map = {
.nr_extents = 1,
{
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
},
},
/* Project ID map: same identity extent again. */
.projid_map = {
.nr_extents = 1,
{
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
},
},
.count = ATOMIC_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.ns.inum = PROC_USER_INIT_INO,
/* The ops table is only set when user namespace support is configured. */
#ifdef CONFIG_USER_NS
.ns.ops = &userns_operations,
#endif
.flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
.persistent_keyring_register_sem =
__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
#endif
};
很多对于系统的操作都只能基于 init_user_ns
,这里可以看到 uid_map
, gid_map
默认是从 0 开始一一对应的全量映射。当调用 create_user_ns
创建新的 user ns 时这两个字段是空的,需要由子进程或是拥有同样 euid 的父进程设置。
查看当前用户 id
user ns 的一个作用就是隔离,当我们使用 id
命令查看当前用户时,实际上已经被隔离了。
dongzerun@iZhp36ik63t96xhzjh00ujZ:~$ id
uid=1001(dongzerun) gid=1001(dongzerun) groups=1001(dongzerun)
dongzerun@iZhp36ik63t96xhzjh00ujZ:~$ strace id
......
geteuid() = 1001
getuid() = 1001
getegid() = 1001
getgid() = 1001
......
可以看到实际调用的是系统调用 geteuid
, getuid
, getegid
, getgid
,那直接看源码
/*
 * getuid system call: report the caller's real UID as seen from the
 * caller's own user namespace.
 */
SYSCALL_DEFINE0(getuid)
{
	/* Namespace whose view of the UID the caller should get. */
	struct user_namespace *viewer_ns = current_user_ns();

	/* Only the calling task itself changes this value, so this is SMP safe. */
	return from_kuid_munged(viewer_ns, current_uid());
}
/*
 * from_kuid_munged - translate @kuid into a uid as seen from @targ.
 * @targ: user namespace to express the uid in
 * @kuid: kernel-internal uid to translate
 *
 * Returns what from_kuid() yields, except that a result of (uid_t) -1 is
 * replaced by overflowuid, so this function itself never hands back
 * (uid_t) -1 unless overflowuid has that value.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
	const uid_t translated = from_kuid(targ, kuid);

	return (translated == (uid_t) -1) ? overflowuid : translated;
}
current_user_ns
, current_uid
都是宏,前者从 cred
获取当前的 user ns, 后者获取 uid。注意这里,子进程的 cred
是从父进程拷贝过来的,其中保存的 uid 就是父进程的 uid。但是我们知道 getuid
返回的是当前 user ns 视角下的用户 id,所以在容器中看到的 id 不一定等于父进程的 uid,需要 from_kuid
从 uid_map
中查找;如果没有对应的映射(from_kuid 返回 (uid_t) -1),from_kuid_munged 就改为返回 overflowuid,这个就是最开始看到的 nobody
的来源
小结
这块还是比较复杂的,以后看到 docker 的时候再回头补充一下,看看 docker 是如何实现的~~
网友评论