TL;DR 最近想看 docker
相关的实现,自然涉及底层 namespace
, 所以边做实验边看源码,感兴趣的先看耗子叔的文章
概览
先看下 Namespaces in operation,主要有以下几种
- Mount namespaces, 参数是
CLONE_NEWNS
,由于是第一个 namespaces 实现,这个参数名比较特殊。用于隔离文件系统挂载 - UTS namespaces, 参数是
CLONE_NEWUTS
,隔离 hostname 和 domain name - IPC namespaces, 参数是
CLONE_NEWIPC
,隔离 ipc, 信号量,共享内存等等 - PID namespaces,参数是
CLONE_NEWPID
,隔离 pid - NETWORK namespaces, 参数是
CLONE_NEWNET
,隔离网络 - USER namespaces, 参数是
CLONE_NEWUSER
,隔离 uid, gid 等等
可以通过 /proc/pid/ns
来查看进程拥有哪些 ns, id 相同代表属于同一个 ns
root@iZhp36ik63t96xhzjh00ujZ:~# ls -l /proc/$$/ns
total 0
lrwxrwxrwx 1 root root 0 Oct 5 15:06 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 root root 0 Oct 5 15:06 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 root root 0 Oct 5 15:06 mnt -> 'mnt:[4026531840]'
lrwxrwxrwx 1 root root 0 Oct 5 15:06 net -> 'net:[4026531993]'
lrwxrwxrwx 1 root root 0 Oct 5 15:06 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Oct 5 15:06 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Oct 5 15:06 user -> 'user:[4026531837]'
lrwxrwxrwx 1 root root 0 Oct 5 15:06 uts -> 'uts:[4026531838]'
测试案例
下面的代码大部分来自耗子叔的文章,唯一区别是调用 mount
将根变成私有
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mount.h>
/* Define a stack for clone() to use; the stack size is 1M */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];
/* NULL-terminated argv passed to execv() in container_main(); element 0 is also the program path. */
char* const container_args[] = {
"/bin/bash",
NULL
};
/*
 * Entry point of the cloned child (the "container").
 *
 * Recursively marks every mount under "/" as private, so that mounts made
 * inside the new mount namespace do not propagate to other namespaces, then
 * replaces the process image with the shell named in container_args.
 *
 * arg is unused.
 *
 * Returns 1 if mount() or execv() fails; on success execv() never returns.
 */
int container_main(void* arg)
{
    (void)arg;

    printf("Container - inside the container!\n");

    /*
     * Without a private root the demo silently loses its isolation, so a
     * failure here is reported instead of being ignored.
     */
    if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL) == -1) {
        perror("mount");
        return 1;
    }

    /* Exec a shell directly so we can inspect whether resources are isolated. */
    execv(container_args[0], container_args);

    /* Only reached when execv() failed; report errno before anything can clobber it. */
    perror("execv");
    printf("Something's wrong!\n");
    return 1;
}
/*
 * Start the "container": clone a child into a new mount namespace
 * (CLONE_NEWNS), run container_main() in it on container_stack, and wait
 * for that child to exit.
 *
 * Returns 0 once the child has been reaped, 1 if clone() or waitpid() fails.
 */
int main(void)
{
    printf("Parent - start a container!\n");

    /*
     * clone() takes the child's entry function and its stack. The pointer
     * passed is the END of the buffer because the stack grows downwards.
     */
    int container_pid = clone(container_main, container_stack+STACK_SIZE, SIGCHLD|CLONE_NEWNS, NULL);
    if (container_pid == -1) {
        /*
         * Without this check waitpid(-1, ...) would be called and the
         * program would claim a container had run and stopped.
         */
        perror("clone");
        return 1;
    }

    /* Wait for the child process to finish. */
    if (waitpid(container_pid, NULL, 0) == -1) {
        perror("waitpid");
        return 1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}
运行后进入容器中,随便 mount
一个目录,我的例子是 /dev/vda1
挂到 /mnt
~# mount /dev/vda1 /mnt
然后再执行 mount
查看当前挂载了哪些目录
~# mount
......
systemd-1 on /proc/sys/fs/binfmt_misc type autofs (rw,relatime,fd=25,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=12886)
/dev/vda1 on /mnt type ext4 (rw,relatime,errors=remount-ro,data=ordered)
~# ls -l /proc/$$/ns | grep mnt
lrwxrwxrwx 1 root root 0 Oct 6 15:36 mnt -> mnt:[4026532235]
再打开另一个终端执行 mount
发现并没有 /mnt
这一列,并且 mount ns id 也是不同的
~# mount
......
configfs on /sys/kernel/config type configfs (rw,relatime)
fusectl on /sys/fs/fuse/connections type fusectl (rw,relatime)
tmpfs on /run/user/0 type tmpfs (rw,nosuid,nodev,relatime,size=403944k,mode=700)
~# ls -l /proc/$$/ns | grep mnt
lrwxrwxrwx 1 root root 0 Oct 6 15:36 mnt -> mnt:[4026531840]
这个例子有一点小问题,就是 PID 没有隔离,/proc
目录还是和宿主机一样的,所以能看到其它无用进程,再次尝试 clone
时添加 CLONE_NEWPID
int container_pid = clone(container_main, container_stack+STACK_SIZE, SIGCHLD|CLONE_NEWNS|CLONE_NEWPID, NULL);
并且在 mount
后再添加重新挂载 /proc
目录
mount("proc", "/proc", "proc", MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL);
再次编译运行测试程序,并且发现 /bin/bash
的 pid 是 1
~# gcc clone.c && ./a.out
Parent - start a container!
Container - inside the container!
~# ps aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1 0.0 0.0 23324 3648 pts/2 S 16:15 0:00 /bin/bash
root 11 0.0 0.0 39084 3264 pts/2 R+ 16:15 0:00 ps aux
核心结构体
做完测试,我们看下源码实现,先从核心结构体看起
struct task_struct {
......
/* Namespaces: */
struct nsproxy *nsproxy;
......
}
每个进程结构体 task_struct
都有个 nsproxy
字段
/*
 * Collection of namespace pointers that struct task_struct refers to through
 * its nsproxy field. One instance can be shared: copy_namespaces() reuses the
 * existing nsproxy (after get_nsproxy()) when no CLONE_NEW* flag is set.
 *
 * There is no user namespace pointer here; copy_namespaces() obtains it
 * separately via task_cred_xxx(tsk, user_ns).
 */
struct nsproxy {
/* Reference count for this nsproxy. */
atomic_t count;
struct uts_namespace *uts_ns;
struct ipc_namespace *ipc_ns;
/* Mount namespace; filled in by copy_mnt_ns() in create_new_namespaces(). */
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
struct cgroup_namespace *cgroup_ns;
};
可以看到,nsproxy
里有所有的不同 namespaces
的指针,count
字段用于引用计数
创建 ns
内核 _do_fork
时,会调用 copy_namespaces
生成 nsproxy
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
CLONE_NEWCGROUP)))) {
get_nsproxy(old_ns);
return 0;
}
......
new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
tsk->nsproxy = new_ns;
return 0;
}
首先判断 flags 里如果没有创建任何 ns 的参数,那么调用 get_nsproxy
将原 nsproxy->count
计数加一后直接返回。否则调用 create_new_namespaces
根据 flags 按需创建新的 ns
static struct nsproxy *create_new_namespaces(unsigned long flags,
struct task_struct *tsk, struct user_namespace *user_ns,
struct fs_struct *new_fs)
{
struct nsproxy *new_nsp;
int err;
new_nsp = create_nsproxy();
if (!new_nsp)
return ERR_PTR(-ENOMEM);
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
}
......
return new_nsp;
......
这段代码比较好理解,首先创建 nsproxy
结构体,引用计数置为 1,再分别调用 copy_xxxx_ns
创建不同的 ns
mount ns 结构体
/*
 * One mount namespace: the root of its mount tree plus bookkeeping.
 * Allocated by alloc_mnt_ns() and populated in copy_mnt_ns().
 */
struct mnt_namespace {
/* Reference count; copy_mnt_ns() calls get_mnt_ns() when the namespace is shared instead of copied. */
atomic_t count;
/* Fields common to all namespace types (see struct ns_common). */
struct ns_common ns;
/* Root mount of this namespace's tree; set from copy_tree()'s result in copy_mnt_ns(). */
struct mount * root;
/* List head joined with the mounts' mnt_list entries (see copy_mnt_ns() and commit_tree()). */
struct list_head list;
/* NOTE(review): presumably the user namespace that owns this mount namespace - confirm. */
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
unsigned int mounts; /* # of mounts in the namespace */
/* Added to mounts and reset to 0 by commit_tree(). */
unsigned int pending_mounts;
} __randomize_layout;
其中 count
是当前 mount namespaces
引用计数,ns_common
存储一些公用字段。root
是根目录挂载点
/*
 * Fields shared by every namespace type; embedded as the `ns` member of
 * struct mnt_namespace.
 */
struct ns_common {
atomic_long_t stashed;
/* Operations table for this namespace's type (e.g. mntns_operations). */
const struct proc_ns_operations *ops;
/* NOTE(review): presumably the id shown in the /proc/<pid>/ns links - confirm. */
unsigned int inum;
};
struct proc_ns_operations {
const char *name;
const char *real_ns_name;
int type;
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
struct user_namespace *(*owner)(struct ns_common *ns);
struct ns_common *(*get_parent)(struct ns_common *ns);
} __randomize_layout;
/*
 * Operations table for the mount namespace: named "mnt", typed CLONE_NEWNS,
 * and wired to the mntns_* callbacks. real_ns_name and get_parent are not
 * listed, so they are left zero-initialized (NULL).
 */
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
.type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
.owner = mntns_owner,
};
ns_common
结构体最关心的就是 proc_ns_operations
回调结构体,里面有不同 ns 的抽象操作。
copy_mnt_ns 创建
/*
 * Produce the mount namespace for a task that is being created.
 *
 * Without CLONE_NEWNS in flags, takes a reference on ns via get_mnt_ns() and
 * returns ns itself. With CLONE_NEWNS, allocates a new mnt_namespace, copies
 * the mount tree rooted at ns->root into it and, when new_fs is non-NULL,
 * re-points new_fs->root.mnt and new_fs->pwd.mnt at the matching copied
 * mounts.
 *
 * Returns the namespace to use, or an error pointer propagated from
 * alloc_mnt_ns() or copy_tree().
 */
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
struct mount *p, *q;
struct mount *old;
struct mount *new;
int copy_flags;
BUG_ON(!ns);
/* Common case: no new mount namespace requested, share the existing one. */
if (likely(!(flags & CLONE_NEWNS))) {
get_mnt_ns(ns);
return ns;
}
old = ns->root;
new_ns = alloc_mnt_ns(user_ns, false); // allocate the struct and initialize its basic fields
if (IS_ERR(new_ns))
return new_ns;
namespace_lock();
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
/* A different user namespace adds CL_SHARED_TO_SLAVE to the copy flags. */
if (user_ns != ns->user_ns)
copy_flags |= CL_SHARED_TO_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
/* Copy failed: drop the lock and free the namespace allocated above. */
namespace_unlock();
free_mnt_ns(new_ns);
return ERR_CAST(new);
}
/* Only when crossing user namespaces: lock_mnt_tree() on the copy, under the mount hash lock. */
if (user_ns != ns->user_ns) {
lock_mount_hash();
lock_mnt_tree(new);
unlock_mount_hash();
}
/* The copied tree becomes the namespace root; join new_ns->list with the root's mnt_list. */
new_ns->root = new;
list_add_tail(&new_ns->list, &new->mnt_list);
/*
 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
 * as belonging to new namespace. We have already acquired a private
 * fs_struct, so tsk->fs->lock is not needed.
 */
/* p walks the old tree and q walks the copy; both advance with next_mnt(). */
p = old;
q = new;
while (p) {
q->mnt_ns = new_ns;
new_ns->mounts++;
if (new_fs) {
/* If new_fs still points at the old mount, switch it to the copy and remember the old one. */
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
rootmnt = &p->mnt;
}
if (&p->mnt == new_fs->pwd.mnt) {
new_fs->pwd.mnt = mntget(&q->mnt);
pwdmnt = &p->mnt;
}
}
p = next_mnt(p, old);
q = next_mnt(q, new);
if (!q)
break;
/* Advance p until its mnt_root matches q's again - presumably copy_tree() can skip some old mounts; TODO confirm. */
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(p, old);
}
namespace_unlock();
/* After dropping the namespace lock, mntput() the old root/pwd mounts that new_fs was switched away from. */
if (rootmnt)
mntput(rootmnt);
if (pwdmnt)
mntput(pwdmnt);
return new_ns;
}
- 首先调用
alloc_mnt_ns
创建mnt_namespace
结构体,并且初始化一些基本字段。比如ns.ops
安装mntns_operations
回调函数,计数count
初始化为 1,初始化 List 等等 -
copy_tree
复制父进程的 root vfsmount 挂载拓扑,然后赋给 root 字段并把 root 连到new_ns->list
尾部 - 最后遍历所有挂载点,将
new_ns->mounts
计数加一,判断并设置新进程的pwd
和root
挂载点
mount 命令与 mount ns 交互
简单来说,mount namespace
就是用来隔离挂载点的,不同 ns 的修改不会影响其它。暂时不考虑 Shared subtrees
, 一般我们挂载文件系统时,用 mount
命令,来看一下系统调用的实现
/*
 * mount system call entry point. Forwards all five user-supplied arguments
 * unchanged to ksys_mount() and returns its result.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
return ksys_mount(dev_name, dir_name, type, flags, data);
}
int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
unsigned long flags, void __user *data)
{
......
ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
......
}
经过一些组装,最后调用 do_mount
long do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
struct path path;
unsigned int mnt_flags = 0, sb_flags;
int retval = 0;
......
/* ... and get the mountpoint */
retval = user_path(dir_name, &path);
if (retval)
return retval;
......
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
retval = do_reconfigure_mnt(&path, mnt_flags);
else if (flags & MS_REMOUNT)
retval = do_remount(&path, flags, sb_flags, mnt_flags,
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&path, dev_name, flags & MS_REC);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
retval = do_move_mount_old(&path, dev_name);
else
retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
dev_name, data_page);
......
}
这段代码中间忽略 flags
的判断与生成。首先是生成 struct path
结构体,即挂载点目录,这块代码有点复杂,不看了,path
结构体包含 dentry
目录项和 vfsmount
目标挂载点。有的时候我们是做绑定挂载(bind mount,即 MS_BIND,由 do_loopback 处理)
,有的是移动挂载点,有的是重新 remount
,我们只看 do_new_mount
如何处理新挂载
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
struct fs_context *fc;
const char *subtype = NULL;
int err = 0;
if (!fstype)
return -EINVAL;
type = get_fs_type(fstype);
if (!type)
return -ENODEV;
......
fc = fs_context_for_mount(type, sb_flags);
put_filesystem(type);
if (IS_ERR(fc))
return PTR_ERR(fc);
if (subtype)
err = vfs_parse_fs_string(fc, "subtype",
subtype, strlen(subtype));
if (!err && name)
err = vfs_parse_fs_string(fc, "source", name, strlen(name));
if (!err)
err = parse_monolithic_mount_data(fc, data);
if (!err)
err = vfs_get_tree(fc);
if (!err)
err = do_new_mount_fc(fc, path, mnt_flags);
put_fs_context(fc);
return err;
}
这里有个重要的结构体 fs_context
, 用于封装创建或重新配置 superblock
的上下文。
-
get_fs_type
获取待挂载的文件系统类型,比如proc
,ext3
,xfs
等等,用于多态处理 -
fs_context_for_mount
根据文件系统类型,来创建相应的fs_context
, 深入代码会发现,调用init_fs_context
初始化,主要是设置回调结构体legacy_fs_context_ops
- 如果有
subtype
, 调用vfs_parse_fs_string
初始化子类型 - 调用
vfs_parse_fs_string
初始化待挂载设备参数 -
parse_monolithic_mount_data
初始化待挂载的 kv 参数,暂时不看 -
vfs_get_tree
实际上调用具体文件系统的mount
函数,比如ext4_mount
生成待挂载设备的dentry
根目录项和superblock
,并赋值给fc->root
- 最后
do_new_mount_fc
做真正的挂载操作,并更新 mount namespace
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
unsigned int mnt_flags)
{
struct vfsmount *mnt;
struct super_block *sb = fc->root->d_sb;
int error;
......
mnt = vfs_create_mount(fc);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
if (error < 0)
mntput(mnt);
return error;
}
- 调用
vfs_create_mount
创建待挂载设备的vfsmount
及mount
结构体,各种初始化,设置superblock
及root dentry
- 最后调用
do_add_mount
将待挂载设备的vfsmount
挂到对应path
的 mount namespace tree 下面
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
struct mountpoint *mp;
struct mount *parent;
mp = lock_mount(path);
......
parent = real_mount(path->mnt);
......
err = graft_tree(newmnt, parent, mp);
......
}
-
lock_mount
获取挂载点的mountpoint
- 获取挂载点的
parent mount
,注意,这个parent
实际上就是在目标mount namespace
下的 - 调用
graft_tree
,将newmount
设备挂载到mp
上,这块最后工作的是attach_recursive_mnt
static int attach_recursive_mnt(struct mount *source_mnt,
struct mount *dest_mnt,
struct mountpoint *dest_mp,
struct path *parent_path)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct mountpoint *smp;
struct mount *child, *p;
struct hlist_node *n;
int err;
/* Preallocate a mountpoint in case the new mounts need
* to be tucked under other mounts.
*/
smp = get_mountpoint(source_mnt->mnt.mnt_root);
......
if (parent_path) {
detach_mnt(source_mnt, parent_path);
attach_mnt(source_mnt, dest_mnt, dest_mp);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
/* move from anon - the caller will destroy */
list_del_init(&source_mnt->mnt_ns->list);
}
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
commit_tree(source_mnt);
}
......
return 0;
}
这个函数是将 source_mnt
挂载到 dest_mp
,另外还处理了 MOVE
情况,也就是说如果 parent
不为空,那么要先从 parent
上 detach, 再 attach 到 dest_mp
, 本次只看新挂载的情况,也不看 shared 的情况
-
source_mnt->mnt_ns
如果待挂载设备有mnt_ns
,那么清除掉,因为要设置成dest_mp
的 - 调用
mnt_set_mountpoint
,设置挂载相关字段。所谓挂载,就是待挂载设备source_mnt.mnt_parent
字段指向dest_mp
,并将source_mnt
连到dest_mp.m_list
链表上 - 调用
commit_tree
更新source_mnt
的mount namespace
。遍历mnt_list
,如果source_mnt
还有child mount
, 递归更新mount namespace
/*
 * Make mnt, together with the mounts linked on its mnt_list, part of its
 * parent's mount namespace, then attach mnt to the parent.
 *
 * mnt->mnt_parent must already be set and must not be mnt itself (BUG_ON).
 */
static void commit_tree(struct mount *mnt)
{
struct mount *parent = mnt->mnt_parent;
struct mount *m;
LIST_HEAD(head);
/* Target namespace is the one the parent mount belongs to. */
struct mnt_namespace *n = parent->mnt_ns;
BUG_ON(parent == mnt);
/* Insert a temporary list head next to mnt so the loop below visits mnt and everything on its mnt_list. */
list_add_tail(&head, &mnt->mnt_list);
list_for_each_entry(m, &head, mnt_list)
m->mnt_ns = n;
/* Move the whole run of mounts to the tail of the namespace's list. */
list_splice(&head, n->list.prev);
/* Fold the pending mount count into the namespace's mount count. */
n->mounts += n->pending_mounts;
n->pending_mounts = 0;
__attach_mnt(mnt, parent);
touch_mnt_namespace(n);
}
小结
这一块还需要大量 vfs 相关知识,以后还需要多理解多看。感觉 namespace
就像族谱一样的树形结构,docker
隔离时相当于重新定制了一份族谱,只有一个人,那么他就是祖先。
网友评论