TL;DR This post covers the ipc namespace, which isolates System V IPC resources: shared memory, message queues, and semaphores. This namespace sees little use in practice, but it is still worth understanding how it works.
Test example
The code again comes from 耗子叔's article, this time with CLONE_NEWIPC added (creating an ipc namespace requires CAP_SYS_ADMIN, so run the example as root).
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mount.h>
/* Define a stack for clone to use; stack size is 1 MB */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];
char* const container_args[] = {
"/bin/bash",
NULL
};
int container_main(void* arg)
{
printf("Container - inside the container!\n");
/* Exec a shell directly so we can observe whether the resources in this process's namespace are isolated */
execv(container_args[0], container_args);
printf("Something's wrong!\n");
return 1;
}
int main()
{
printf("Parent - start a container!\n");
/* Call clone, passing the child's entry function and a stack; we pass the end of the buffer because the stack grows downward */
int container_pid = clone(container_main, container_stack+STACK_SIZE, SIGCHLD|CLONE_NEWIPC, NULL);
/* Wait for the child process to exit */
waitpid(container_pid, NULL, 0);
printf("Parent - container stopped!\n");
return 0;
}
First, create a message queue on the host:
root@iZhp36ik63t96xhzjh00ujZ:~# ipcmk -Q
Message queue id: 0
root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q
------ Message Queues --------
key msqid owner perms used-bytes messages
0xa59bafa6 0 root 644 0 0
Then compile and run it; inside the container the message queue is not visible:
root@iZhp36ik63t96xhzjh00ujZ:~# gcc ipc.c && ./a.out
Parent - start a container!
Container - inside the container!
root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q
------ Message Queues --------
key msqid owner perms used-bytes messages
root@iZhp36ik63t96xhzjh00ujZ:~# exit
Core structure
The structure definition is easy to follow, but the layout is not great: the msg, shm, and queue fields are all flattened directly into ipc_namespace, when each group really ought to be factored out into its own struct.
struct ipc_namespace {
refcount_t count;
struct ipc_ids ids[3];
int sem_ctls[4];
int used_sems;
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
atomic_t msg_bytes;
atomic_t msg_hdrs;
size_t shm_ctlmax;
size_t shm_ctlall;
unsigned long shm_tot;
int shm_ctlmni;
/*
* Defines whether IPC_RMID is forced for _all_ shm segments regardless
* of shmctl()
*/
int shm_rmid_forced;
struct notifier_block ipcns_nb;
/* The kern_mount of the mqueuefs sb. We take a ref on it */
struct vfsmount *mq_mnt;
/* # queues in this ns, protected by mq_lock */
unsigned int mq_queues_count;
/* next fields are set through sysctl */
unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */
unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */
unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */
unsigned int mq_msg_default;
unsigned int mq_msgsize_default;
/* user_ns which owns the ipc ns */
struct user_namespace *user_ns;
struct ucounts *ucounts;
struct ns_common ns;
} __randomize_layout;
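As a purely hypothetical sketch of that refactor (this is not actual kernel code), the msg fields could be grouped behind a single member:
/* Hypothetical regrouping -- not in the kernel: gather the msg
 * tunables and counters into their own struct instead of flattening
 * them into ipc_namespace. */
struct msg_ns_state {
	unsigned int ctlmax;
	unsigned int ctlmnb;
	unsigned int ctlmni;
	atomic_t     bytes;
	atomic_t     hdrs;
};
/* ipc_namespace would then simply embed: struct msg_ns_state msg; */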
Creating an ipc ns
static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
struct ipc_namespace *old_ns)
{
struct ipc_namespace *ns;
struct ucounts *ucounts;
int err;
err = -ENOSPC;
ucounts = inc_ipc_namespaces(user_ns);
if (!ucounts)
goto fail;
err = -ENOMEM;
ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
if (ns == NULL)
goto fail_dec;
err = ns_alloc_inum(&ns->ns);
if (err)
goto fail_free;
ns->ns.ops = &ipcns_operations;
refcount_set(&ns->count, 1);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
err = mq_init_ns(ns);
if (err)
goto fail_put;
sem_init_ns(ns);
msg_init_ns(ns);
shm_init_ns(ns);
return ns;
......
}
This code is also easy to follow: allocate the ipc_namespace, then initialize sem, msg, and shm respectively.
void sem_init_ns(struct ipc_namespace *ns)
{
ns->sc_semmsl = SEMMSL;
ns->sc_semmns = SEMMNS;
ns->sc_semopm = SEMOPM;
ns->sc_semmni = SEMMNI;
ns->used_sems = 0;
ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}
void msg_init_ns(struct ipc_namespace *ns)
{
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;
atomic_set(&ns->msg_bytes, 0);
atomic_set(&ns->msg_hdrs, 0);
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
}
void shm_init_ns(struct ipc_namespace *ns)
{
ns->shm_ctlmax = SHMMAX;
ns->shm_ctlall = SHMALL;
ns->shm_ctlmni = SHMMNI;
ns->shm_rmid_forced = 0;
ns->shm_tot = 0;
ipc_init_ids(&shm_ids(ns));
}
These are the three init functions. The code is simple: set the various limits, and finally initialize the ids storage.
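The msg limits also surface as sysctls, so they are easy to inspect from userspace. A minimal sketch (the paths are the standard procfs sysctl files; the defaults noted in the comments are the usual kernel values):
#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	show("/proc/sys/kernel/msgmax"); /* MSGMAX, usually 8192 */
	show("/proc/sys/kernel/msgmnb"); /* MSGMNB, usually 16384 */
	show("/proc/sys/kernel/msgmni"); /* MSGMNI, usually 32000 */
	return 0;
}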
ipcmk: creating a queue
First, look at the system calls involved:
root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcmk -Q
......
gettid() = 26557
msgget(0x6d8a4a32, IPC_CREAT|0644) = 32769
......
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 3), ...}) = 0
write(1, "Message queue id: 32769\n", 24Message queue id: 32769
) = 24
close(1) = 0
close(2) = 0
exit_group(0) = ?
+++ exited with 0 +++
So the shared queue is created by the msgget syscall.
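For reference, a minimal userspace sketch of what ipcmk -Q effectively does (the key below just mirrors the one in the strace output; ipcmk generates a random key):
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int main(void)
{
	/* The same call seen in the strace above: create (or open) a queue. */
	int id = msgget(0x6d8a4a32, IPC_CREAT | 0644);
	if (id < 0) {
		perror("msgget");
		return 1;
	}
	printf("Message queue id: %d\n", id);
	return 0;
}
On the kernel side, the syscall is defined as follows: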
SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
{
return ksys_msgget(key, msgflg);
}
long ksys_msgget(key_t key, int msgflg)
{
struct ipc_namespace *ns;
static const struct ipc_ops msg_ops = {
.getnew = newque,
.associate = security_msg_queue_associate,
};
struct ipc_params msg_params;
ns = current->nsproxy->ipc_ns;
msg_params.key = key;
msg_params.flg = msgflg;
return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
}
Here msg_ops.getnew is the callback that creates a new queue, and ns is the current process's ipc namespace, which holds the ids of all IPC objects visible to it.
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
const struct ipc_ops *ops, struct ipc_params *params)
{
if (params->key == IPC_PRIVATE)
return ipcget_new(ns, ids, ops, params);
else
return ipcget_public(ns, ids, ops, params);
}
static int newque(struct ipc_namespace *ns, struct ipc_params *params)
{
struct msg_queue *msq;
int retval;
key_t key = params->key;
int msgflg = params->flg;
msq = kvmalloc(sizeof(*msq), GFP_KERNEL);
if (unlikely(!msq))
return -ENOMEM;
msq->q_perm.mode = msgflg & S_IRWXUGO;
msq->q_perm.key = key;
msq->q_perm.security = NULL;
retval = security_msg_queue_alloc(&msq->q_perm);
if (retval) {
kvfree(msq);
return retval;
}
msq->q_stime = msq->q_rtime = 0;
msq->q_ctime = ktime_get_real_seconds();
msq->q_cbytes = msq->q_qnum = 0;
msq->q_qbytes = ns->msg_ctlmnb;
msq->q_lspid = msq->q_lrpid = NULL;
INIT_LIST_HEAD(&msq->q_messages);
INIT_LIST_HEAD(&msq->q_receivers);
INIT_LIST_HEAD(&msq->q_senders);
/* ipc_addid() locks msq upon success. */
retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
if (retval < 0) {
ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
return retval;
}
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
return msq->q_perm.id;
}
Both ipcget_new and ipcget_public end up calling newque to create the actual queue; the difference is that the public path must first check whether a queue with the same key already exists. newque allocates and initializes the msg_queue, and finally calls ipc_addid to register it in the ipc ns.
int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
{
kuid_t euid;
kgid_t egid;
int idx, err;
/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
if (limit > ipc_mni)
limit = ipc_mni;
if (ids->in_use >= limit)
return -ENOSPC;
idr_preload(GFP_KERNEL);
spin_lock_init(&new->lock);
rcu_read_lock();
spin_lock(&new->lock);
current_euid_egid(&euid, &egid);
new->cuid = new->uid = euid;
new->gid = new->cgid = egid;
new->deleted = false;
idx = ipc_idr_alloc(ids, new);
idr_preload_end();
if (idx >= 0 && new->key != IPC_PRIVATE) {
err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
ipc_kht_params);
if (err < 0) {
idr_remove(&ids->ipcs_idr, idx);
idx = err;
}
}
if (idx < 0) {
new->deleted = true;
spin_unlock(&new->lock);
rcu_read_unlock();
return idx;
}
ids->in_use++;
if (idx > ids->max_idx)
ids->max_idx = idx;
return idx;
}
The logic of ipc_addid is fairly simple: it adds the newly created object to the ids structure of the ipc ns, an IDR backed by a radix tree (we won't dig into its internals here). In addition, if the key is not private, the object is also inserted into a hash table for fast lookup and duplicate detection.
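A quick userspace sketch confirms this dedup behavior (the key value is arbitrary): calling msgget twice with the same key returns the same id, while IPC_PRIVATE always creates a fresh queue.
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int main(void)
{
	int a = msgget(0x1234, IPC_CREAT | 0644);
	int b = msgget(0x1234, IPC_CREAT | 0644);      /* found via key_ht: same id */
	int p = msgget(IPC_PRIVATE, IPC_CREAT | 0644); /* never hashed: a new queue */
	printf("a=%d b=%d private=%d\n", a, b, p);
	return 0;
}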
ipcs: listing queues
Again, start by looking at the system calls:
root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcs -q
......
openat(AT_FDCWD, "/proc/sysvipc/msg", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
......
Trimming the irrelevant output, we can see that ipcs simply reads the proc file /proc/sysvipc/msg.
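A minimal sketch that dumps the same file ipcs parses:
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/sysvipc/msg", "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout); /* the header plus one line per queue */
	fclose(f);
	return 0;
}
The file itself is registered when the kernel initializes: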
void __init msg_init(void)
{
msg_init_ns(&init_ipc_ns);
ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
}
First, let's see what initialization work ipc_init_proc_interface does:
void __init ipc_init_proc_interface(const char *path, const char *header,
int ids, int (*show)(struct seq_file *, void *))
{
struct proc_dir_entry *pde;
struct ipc_proc_iface *iface;
iface = kmalloc(sizeof(*iface), GFP_KERNEL);
if (!iface)
return;
iface->path = path;
iface->header = header;
iface->ids = ids;
iface->show = show;
pde = proc_create_data(path,
S_IRUGO, /* world readable */
NULL, /* parent dir */
&sysvipc_proc_fops,
iface);
if (!pde)
kfree(iface);
}
The most important thing here is that sysvipc_msg_proc_show is installed as the show callback. Every operation on a file goes through a corresponding file_operations callback structure; for ipc that is sysvipc_proc_fops.
static const struct file_operations sysvipc_proc_fops = {
.open = sysvipc_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = sysvipc_proc_release,
};
static int sysvipc_proc_open(struct inode *inode, struct file *file)
{
struct ipc_proc_iter *iter;
iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter));
if (!iter)
return -ENOMEM;
iter->iface = PDE_DATA(inode);
iter->ns = get_ipc_ns(current->nsproxy->ipc_ns);
iter->pid_ns = get_pid_ns(task_active_pid_ns(current));
return 0;
}
The crucial point is right here: file_operations.open is set to sysvipc_proc_open, which ties reading the file to what ipcs displays, and calls get_ipc_ns to capture the current process's ipc ns. Now let's look at the implementation of sysvipc_msg_proc_show:
static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
{
struct pid_namespace *pid_ns = ipc_seq_pid_ns(s);
struct user_namespace *user_ns = seq_user_ns(s);
struct kern_ipc_perm *ipcp = it;
struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
seq_printf(s,
"%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n",
msq->q_perm.key,
msq->q_perm.id,
msq->q_perm.mode,
msq->q_cbytes,
msq->q_qnum,
pid_nr_ns(msq->q_lspid, pid_ns),
pid_nr_ns(msq->q_lrpid, pid_ns),
from_kuid_munged(user_ns, msq->q_perm.uid),
from_kgid_munged(user_ns, msq->q_perm.gid),
from_kuid_munged(user_ns, msq->q_perm.cuid),
from_kgid_munged(user_ns, msq->q_perm.cgid),
msq->q_stime,
msq->q_rtime,
msq->q_ctime);
return 0;
}
This too is simple and easy to understand: the msg_queue is recovered from the it argument via container_of on its embedded q_perm, the iterator already carries the ipc ns captured at open time, and the fields are just printed out.
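The container_of step deserves a second look: given only the embedded q_perm pointer, it recovers the enclosing msg_queue. A self-contained userspace sketch of the same pattern (the struct names here are made up for illustration):
#include <stdio.h>
#include <stddef.h>

/* The same definition the kernel uses, simplified. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct perm { int id; };
struct queue {
	long bytes;
	struct perm q_perm; /* embedded member, like kern_ipc_perm */
};

int main(void)
{
	struct queue q = { .bytes = 42, .q_perm = { .id = 7 } };
	struct perm *p = &q.q_perm; /* all the callback receives */
	struct queue *back = container_of(p, struct queue, q_perm);
	printf("bytes=%ld id=%d\n", back->bytes, back->q_perm.id);
	return 0;
}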
Summary
Looking at them now, both the ipc and uts namespaces are quite simple, and their implementations are not complicated either.