TL;DR 这篇讲 user namespace,它用于隔离用户 ID、组 ID、root 目录以及 capabilities。用户隔离归根结底是权限控制:可以让进程在容器里是 root 用户,而对应到宿主机上只是一个普通用户。建议先看官方文档
测试例子
代码同样来自耗子叔的文章,添加 CLONE_NEWUSER
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mount.h>
#include <limits.h>
#include <string.h>
#include <errno.h>
#include <sys/capability.h>
/* Stack handed to clone() for the child process; 1 MiB in size. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/* NULL-terminated argv of the program exec'ed inside the container. */
char *const container_args[] = { "/bin/bash", NULL };
/*
 * Entry point of the cloned child: sets a hostname and then replaces the
 * process with the shell named in container_args.
 *
 * arg: unused.
 * Returns 1 if execv() fails; does not return otherwise.
 */
int container_main(void* arg)
{
    (void)arg;

    printf("Container - inside the container!\n");

    /*
     * Set the hostname. The length argument (10) is shorter than the
     * 12-character literal, so the name actually stored is "my-contain",
     * which is what the shell prompt in the sample session shows.
     */
    if (sethostname("my-container", 10) == -1) {
        perror("sethostname");
    }

    /* execv() discards unwritten stdio buffers, so push them out first. */
    fflush(stdout);

    /* Run a shell directly so the isolation can be inspected by hand. */
    execv(container_args[0], container_args);

    /* Only reached when execv() failed; report why. */
    perror("execv");
    printf("Something's wrong!\n");
    return 1;
}
/*
 * Starts container_main() in new UTS and user namespaces via clone() and
 * waits for it to finish.
 *
 * Returns 0 once the child has been reaped, 1 if clone() or waitpid() fails.
 */
int main(void)
{
    printf("Parent - start a container!\n");
    /* Flush now so the child does not inherit a copy of unwritten output. */
    fflush(stdout);

    /*
     * clone() takes the child's entry function and a stack for it. The end
     * of the buffer is passed because the stack grows downwards.
     */
    pid_t container_pid = clone(container_main, container_stack + STACK_SIZE,
                                SIGCHLD | CLONE_NEWUTS | CLONE_NEWUSER, NULL);
    if (container_pid == -1) {
        /* Without this check waitpid(-1, ...) would run and the program
           would claim the container had stopped normally. */
        perror("clone");
        return 1;
    }

    /* Wait for the child to exit. */
    if (waitpid(container_pid, NULL, 0) == -1) {
        perror("waitpid");
        return 1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}
为了便于观察,同样做了 uts ns 隔离,并设置主机名(注意 sethostname 的长度参数写的是 10,所以主机名被截断成了 my-contain)
root@iZhp36ik63t96xhzjh00ujZ:~# gcc user.c && ./a.out
Parent - start a container!
Container - inside the container!
nobody@my-contain:~$ id
uid=65534(nobody) gid=65534(nogroup) groups=65534(nogroup)
nobody@my-contain:~$ reboot
Failed to set wall message, ignoring: Access denied
Failed to reboot system via logind: Access denied
nobody@my-contain:~$
nobody@my-contain:~$ exit
exit
Parent - container stopped!
编译运行后,会发现当前的用户名是 nobody,组也换成了 nogroup。如果此时在容器中执行 reboot
命令会报错,可见已经隔离了。但是问题来了,在容器里的用户和组怎么是 nobody / nogroup
?变成 root 或是其它用户可不可以?
用户映射
用户映射是指,将宿主机真实的用户与容器中的用户一一对应,这里涉及两个文件:/proc/[pid]/uid_map
和 /proc/[pid]/gid_map
root@iZhp36ik63t96xhzjh00ujZ:~# cat /proc/$$/uid_map
0 0 4294967295
root@iZhp36ik63t96xhzjh00ujZ:~# cat /proc/$$/gid_map
0 0 4294967295
root@iZhp36ik63t96xhzjh00ujZ:~#
比如我们看当前 shell 的 uid_map 和 gid_map,内容都是一样的。这是默认值:4294967295 即 2^32-1(32 位无符号整数的最大值,也就是 32 位的 -1),是一个特殊值,全局初始的 init_user_ns
默认就是这个,表示从 0 开始的全部 id 一一映射。先看下文件格式:
inner_ns_start outer_ns_start count
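inner_ns_start
表示容器内的起始 id,outer_ns_start
表示对应外层宿主机的起始 id,而 count
表示个数,可以写成 1 表示只将 inner_ns_start 映射到 outer_ns_start,如果 count 为 100,表示将容器内的 [inner_ns_start, inner_ns_start+100) 依次映射到宿主机的 [outer_ns_start, outer_ns_start+100)。而默认值里的 count 4294967295 即 2^32-1,表示从 0 开始的 id 全部一一映射,只有 4294967295 本身(即 (uid_t)-1,在一些接口中用来表示"没有用户 id")不被映射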
/*
 * Writes one "inside outside length" line to /proc/<pid>/<map_name> and
 * echoes the path and the line on stdout.
 *
 * Returns 0 on success, -1 on failure after reporting the cause on stderr.
 */
static int write_id_map(pid_t pid, const char *map_name,
                        int inside_id, int outside_id, int length)
{
    char path[256];
    int n = snprintf(path, sizeof path, "/proc/%ld/%s", (long)pid, map_name);
    if (n < 0 || (size_t)n >= sizeof path) {
        fprintf(stderr, "ERROR: path of %s for pid %ld is too long\n",
                map_name, (long)pid);
        return -1;
    }
    printf("%s\n", path);

    FILE *map = fopen(path, "w");
    if (map == NULL) {
        fprintf(stderr, "ERROR: open %s: %s\n", path, strerror(errno));
        return -1;
    }

    int failed = 0;
    int saved_errno = 0;
    if (fprintf(map, "%d %d %d\n", inside_id, outside_id, length) < 0) {
        failed = 1;
        saved_errno = errno;
    }
    printf("%d %d %d\n", inside_id, outside_id, length);

    /*
     * The stream is buffered, so the data normally reaches the kernel only
     * when the stream is closed; a rejected mapping shows up here.
     */
    if (fclose(map) == EOF && !failed) {
        failed = 1;
        saved_errno = errno;
    }
    if (failed) {
        fprintf(stderr, "ERROR: write %s: %s\n", path, strerror(saved_errno));
        return -1;
    }
    return 0;
}

/*
 * Writes a single user-ID mapping for process `pid` to /proc/<pid>/uid_map.
 *
 * pid:        process whose uid_map file is written.
 * inside_id:  first ID of the range as seen inside the namespace.
 * outside_id: first ID of the range as seen outside the namespace.
 * length:     number of consecutive IDs in the range.
 *
 * Failures are reported on stderr; nothing is returned.
 */
void set_uid_map(pid_t pid, int inside_id, int outside_id, int length) {
    (void)write_id_map(pid, "uid_map", inside_id, outside_id, length);
}

/*
 * Writes a single group-ID mapping for process `pid` to /proc/<pid>/gid_map.
 * Parameters have the same meaning as for set_uid_map().
 *
 * Failures are reported on stderr; nothing is returned.
 */
void set_gid_map(pid_t pid, int inside_id, int outside_id, int length) {
    (void)write_id_map(pid, "gid_map", inside_id, outside_id, length);
}
/*
 * Entry point of the cloned child: sets a hostname, prints the process
 * capabilities, writes the uid/gid maps for itself and then replaces the
 * process with the shell named in container_args.
 *
 * arg: unused.
 * Returns 1 if execv() fails; does not return otherwise.
 */
int container_main(void* arg)
{
    (void)arg;

    printf("Container - inside the container!\n");

    /*
     * Set the hostname. The length argument (10) is shorter than the
     * 12-character literal, so the name actually stored is "my-contain",
     * which is what the shell prompt in the sample session shows.
     */
    if (sethostname("my-container", 10) == -1) {
        perror("sethostname");
    }

    /* Both libcap results are allocated by the library and must be
       released with cap_free(). */
    cap_t caps = cap_get_proc();
    char *caps_text = (caps != NULL) ? cap_to_text(caps, NULL) : NULL;
    printf("capabilities: %s\n", (caps_text != NULL) ? caps_text : "(unavailable)");
    if (caps_text != NULL) {
        cap_free(caps_text);
    }
    if (caps != NULL) {
        cap_free(caps);
    }

    /*
     * Map ID 0 inside the namespace to ID 0 outside it, one ID each.
     * Nothing is written to /proc/PID/setgroups first, so on kernels that
     * enforce that rule (Linux >= 3.19, see user_namespaces(7)) the gid_map
     * write is expected to be refused; the sample run accordingly still
     * reports eGID 65534.
     */
    set_uid_map(getpid(), 0, 0, 1);
    set_gid_map(getpid(), 0, 0, 1);
    printf("eUID = %ld; eGID = %ld, child_pid=%ld\n", (long)geteuid(), (long)getegid(), (long)getpid());

    /* execv() discards unwritten stdio buffers, so push them out first. */
    fflush(stdout);

    /* Run a shell directly so the isolation can be inspected by hand. */
    execv(container_args[0], container_args);

    /* Only reached when execv() failed; report why. */
    perror("execv");
    printf("Something's wrong!\n");
    return 1;
}
现在修改下代码,增加两个函数 set_uid_map
和 set_gid_map
,再修改 container_main
增加些调试打印信息,并写 uid_map gid_map
/* Map ID 0 inside the namespace to ID 0 outside it, for a range of one ID,
   in the calling process's own uid_map. */
set_uid_map(getpid(), 0, 0, 1);
其中参数 0, 0, 1
表示将宿主机 id 0 映射到容器中的 0,即 root 用户。
root@iZhp36ik63t96xhzjh00ujZ:~# gcc -Wl,--no-as-needed -lcap user.c && ./a.out
Parent - start a container!
Container - inside the container!
capabilities: = cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,cap_block_suspend,cap_audit_read+ep
/proc/3581/uid_map
0 0 1
/proc/3581/gid_map
0 0 1
eUID = 0; eGID = 65534, child_pid=3581
root@my-contain:~# id
uid=0(root) gid=65534(nogroup) groups=65534(nogroup)
root@my-contain:~#
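此时我们发现,在容器中当前用户是 root 了,但是组 gid 居然还是 nogroup。这里面涉及到了 setgroups:从 Linux 3.19 开始,在父 user namespace 中没有 CAP_SETGID 的进程,必须先向 /proc/[pid]/setgroups 写入 deny,才允许写 gid_map,否则写入会被拒绝
,具体原理可以看官方文档,需要增加函数来设置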
/*
 * Writes `str` to /proc/<child_pid>/setgroups.
 *
 * child_pid: process whose setgroups file is written.
 * str:       text to write; the caller in this file passes "deny".
 *
 * A missing file (ENOENT) is silently accepted; every other failure is
 * reported on stderr. Nothing is returned.
 */
static void
proc_setgroups_write(pid_t child_pid, const char *str)
{
    char setgroups_path[256];
    int fd;
    int n;
    size_t len;
    ssize_t written;

    /* Bound the format by the real buffer size, not by PATH_MAX. */
    n = snprintf(setgroups_path, sizeof setgroups_path, "/proc/%ld/setgroups",
                 (long) child_pid);
    if (n < 0 || (size_t) n >= sizeof setgroups_path) {
        fprintf(stderr, "ERROR: setgroups path for pid %ld is too long\n",
                (long) child_pid);
        return;
    }

    fd = open(setgroups_path, O_RDWR);
    if (fd == -1) {
        /* We may be on a system that doesn't support
           /proc/PID/setgroups. In that case, the file won't exist,
           and the system won't impose the restrictions that Linux 3.19
           added. That's fine: we don't need to do anything in order
           to permit 'gid_map' to be updated.
           However, if the error from open() was something other than
           the ENOENT error that is expected for that case, let the
           user know. */
        if (errno != ENOENT)
            fprintf(stderr, "ERROR: open %s: %s\n", setgroups_path,
                    strerror(errno));
        return;
    }

    len = strlen(str);
    written = write(fd, str, len);
    if (written == -1)
        fprintf(stderr, "ERROR: write %s: %s\n", setgroups_path,
                strerror(errno));
    else if ((size_t) written != len)
        fprintf(stderr, "ERROR: write %s: short write\n", setgroups_path);

    if (close(fd) == -1)
        fprintf(stderr, "ERROR: close %s: %s\n", setgroups_path,
                strerror(errno));
}
然后还要修改 container_main
,在设置用户映射前增加 proc_setgroups_write
/*
 * Entry point of the cloned child: sets a hostname, prints the process
 * capabilities, denies setgroups, writes the uid/gid maps for itself and
 * then replaces the process with the shell named in container_args.
 *
 * arg: unused.
 * Returns 1 if execv() fails; does not return otherwise.
 */
int container_main(void* arg)
{
    (void)arg;

    printf("Container - inside the container!\n");

    /*
     * Set the hostname. The length argument (10) is shorter than the
     * 12-character literal, so the name actually stored is "my-contain",
     * which is what the shell prompt in the sample session shows.
     */
    if (sethostname("my-container", 10) == -1) {
        perror("sethostname");
    }

    /* Both libcap results are allocated by the library and must be
       released with cap_free(). */
    cap_t caps = cap_get_proc();
    char *caps_text = (caps != NULL) ? cap_to_text(caps, NULL) : NULL;
    printf("capabilities: %s\n", (caps_text != NULL) ? caps_text : "(unavailable)");
    if (caps_text != NULL) {
        cap_free(caps_text);
    }
    if (caps != NULL) {
        cap_free(caps);
    }

    /* "deny" has to be written to setgroups before gid_map; with this call
       in place the sample run reports eGID 0. */
    proc_setgroups_write(getpid(), "deny");

    /* Map ID 0 inside the namespace to ID 0 outside it, one ID each. */
    set_uid_map(getpid(), 0, 0, 1);
    set_gid_map(getpid(), 0, 0, 1);
    printf("eUID = %ld; eGID = %ld, child_pid=%ld\n", (long)geteuid(), (long)getegid(), (long)getpid());

    /* execv() discards unwritten stdio buffers, so push them out first. */
    fflush(stdout);

    /* Run a shell directly so the isolation can be inspected by hand. */
    execv(container_args[0], container_args);

    /* Only reached when execv() failed; report why. */
    perror("execv");
    printf("Something's wrong!\n");
    return 1;
}
再次编译运行后查看结果符合预期
root@iZhp36ik63t96xhzjh00ujZ:~# gcc -Wl,--no-as-needed -lcap user.c && ./a.out
Parent - start a container!
Container - inside the container!
capabilities: = cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,cap_block_suspend,cap_audit_read+ep
/proc/3618/uid_map
0 0 1
/proc/3618/gid_map
0 0 1
eUID = 0; eGID = 0, child_pid=3618
root@my-contain:~# id
uid=0(root) gid=0(root) groups=0(root)
root@my-contain:~# exit
exit
Parent - container stopped!
root@iZhp36ik63t96xhzjh00ujZ:~#
普通用户
刚才一直测试 root 用户,现在来将一个普通用户映射为容器中的 root,先来给系统添加普通用户, 然后查看普通用户的 id
root@iZhp36ik63t96xhzjh00ujZ:~# useradd -d /home/dongzerun -m dongzerun -s /bin/bash
root@iZhp36ik63t96xhzjh00ujZ:~# cat /etc/passwd | grep dongzerun
dongzerun:x:1001:1001::/home/dongzerun:/bin/bash
再次修改程序,将宿主机中的 0 修改为 1001
/* Map uid/gid 0 inside the namespace to host uid/gid 1001 (the dongzerun
   account created above), one ID each. */
set_uid_map(getpid(), 0, 1001, 1);
set_gid_map(getpid(), 0, 1001, 1);
再次编译运行,符合预期
dongzerun@iZhp36ik63t96xhzjh00ujZ:~$ gcc -Wl,--no-as-needed -lcap user.c && ./a.out
Parent - start a container!
Container - inside the container!
capabilities: = cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,cap_block_suspend,cap_audit_read+ep
/proc/3658/uid_map
0 1001 1
/proc/3658/gid_map
0 1001 1
eUID = 0; eGID = 0, child_pid=3658
root@my-contain:~# id
uid=0(root) gid=0(root) groups=0(root)
root@my-contain:~# reboot
root@my-contain:~#
最后尝试了一下 reboot
,肯定不会成功:容器里的 root 只在这个 user namespace 内部拥有 capabilities,对宿主机来说它仍然是普通用户 1001
小结
user namespace 比较特殊:在所有 ns 中它需要最先创建,其它几种 ns 也都依赖于它。官方文档东西很多,值得通读几次。
网友评论