TL;DR This post covers the ipc namespace, which isolates System V IPC resources: shared memory, message queues, and semaphores. This namespace sees little use in practice, but it is still worth understanding how it works.
Test example
The code again comes from 耗子叔's article, this time with CLONE_NEWIPC added (creating an ipc namespace requires CAP_SYS_ADMIN, so run the example as root).
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mount.h>
/* Define a stack for clone to use; stack size is 1 MB */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];
char* const container_args[] = {
"/bin/bash",
NULL
};
int container_main(void* arg)
{
printf("Container - inside the container!\n");
/* Exec a shell directly so we can observe whether the resources in this process's namespace are isolated */
execv(container_args[0], container_args);
printf("Something's wrong!\n");
return 1;
}
int main()
{
printf("Parent - start a container!\n");
/* Call clone, passing the child's entry function and a stack; we pass the end of the buffer because the stack grows downward */
int container_pid = clone(container_main, container_stack+STACK_SIZE, SIGCHLD|CLONE_NEWIPC, NULL);
/* Wait for the child process to exit */
waitpid(container_pid, NULL, 0);
printf("Parent - container stopped!\n");
return 0;
}
First, create a message queue on the host:
root@iZhp36ik63t96xhzjh00ujZ:~# ipcmk -Q
Message queue id: 0
root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q
------ Message Queues --------
key msqid owner perms used-bytes messages
0xa59bafa6 0 root 644 0 0
Then compile and run it; inside the container the message queue is not visible:
root@iZhp36ik63t96xhzjh00ujZ:~# gcc ipc.c && ./a.out
Parent - start a container!
Container - inside the container!
root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q
------ Message Queues --------
key msqid owner perms used-bytes messages
root@iZhp36ik63t96xhzjh00ujZ:~# exit
Core structure
The structure definition is easy to follow, but the layout is not great: the msg, shm, and queue fields are all flattened directly into ipc_namespace, when each group really ought to be factored out into its own struct.
struct ipc_namespace {
refcount_t count;
struct ipc_ids ids[3];
int sem_ctls[4];
int used_sems;
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
atomic_t msg_bytes;
atomic_t msg_hdrs;
size_t shm_ctlmax;
size_t shm_ctlall;
unsigned long shm_tot;
int shm_ctlmni;
/*
* Defines whether IPC_RMID is forced for _all_ shm segments regardless
* of shmctl()
*/
int shm_rmid_forced;
struct notifier_block ipcns_nb;
/* The kern_mount of the mqueuefs sb. We take a ref on it */
struct vfsmount *mq_mnt;
/* # queues in this ns, protected by mq_lock */
unsigned int mq_queues_count;
/* next fields are set through sysctl */
unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */
unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */
unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */
unsigned int mq_msg_default;
unsigned int mq_msgsize_default;
/* user_ns which owns the ipc ns */
struct user_namespace *user_ns;
struct ucounts *ucounts;
struct ns_common ns;
} __randomize_layout;
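As a purely hypothetical sketch of that refactor (this is not actual kernel code), the msg fields could be grouped behind a single member:
/* Hypothetical regrouping -- not in the kernel: gather the msg
 * tunables and counters into their own struct instead of flattening
 * them into ipc_namespace. */
struct msg_ns_state {
	unsigned int ctlmax;
	unsigned int ctlmnb;
	unsigned int ctlmni;
	atomic_t     bytes;
	atomic_t     hdrs;
};
/* ipc_namespace would then simply embed: struct msg_ns_state msg; */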
Creating an ipc ns
static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
struct ipc_namespace *old_ns)
{
struct ipc_namespace *ns;
struct ucounts *ucounts;
int err;
err = -ENOSPC;
ucounts = inc_ipc_namespaces(user_ns);
if (!ucounts)
goto fail;
err = -ENOMEM;
ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
if (ns == NULL)
goto fail_dec;
err = ns_alloc_inum(&ns->ns);
if (err)
goto fail_free;
ns->ns.ops = &ipcns_operations;
refcount_set(&ns->count, 1);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
err = mq_init_ns(ns);
if (err)
goto fail_put;
sem_init_ns(ns);
msg_init_ns(ns);
shm_init_ns(ns);
return ns;
......
}
This code is also easy to follow: allocate the ipc_namespace, then initialize sem, msg, and shm respectively.
void sem_init_ns(struct ipc_namespace *ns)
{
ns->sc_semmsl = SEMMSL;
ns->sc_semmns = SEMMNS;
ns->sc_semopm = SEMOPM;
ns->sc_semmni = SEMMNI;
ns->used_sems = 0;
ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}
void msg_init_ns(struct ipc_namespace *ns)
{
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;
atomic_set(&ns->msg_bytes, 0);
atomic_set(&ns->msg_hdrs, 0);
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
}
void shm_init_ns(struct ipc_namespace *ns)
{
ns->shm_ctlmax = SHMMAX;
ns->shm_ctlall = SHMALL;
ns->shm_ctlmni = SHMMNI;
ns->shm_rmid_forced = 0;
ns->shm_tot = 0;
ipc_init_ids(&shm_ids(ns));
}
These are the three init functions. The code is simple: set the various limits, and finally initialize the ids storage.
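The msg limits also surface as sysctls, so they are easy to inspect from userspace. A minimal sketch (the paths are the standard procfs sysctl files; the defaults noted in the comments are the usual kernel values):
#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	show("/proc/sys/kernel/msgmax"); /* MSGMAX, usually 8192 */
	show("/proc/sys/kernel/msgmnb"); /* MSGMNB, usually 16384 */
	show("/proc/sys/kernel/msgmni"); /* MSGMNI, usually 32000 */
	return 0;
}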
ipcmk: creating a queue
First, look at the system calls involved:
root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcmk -Q
......
gettid() = 26557
msgget(0x6d8a4a32, IPC_CREAT|0644) = 32769
......
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 3), ...}) = 0
write(1, "Message queue id: 32769\n", 24Message queue id: 32769
) = 24
close(1) = 0
close(2) = 0
exit_group(0) = ?
+++ exited with 0 +++
So the shared queue is created by the msgget syscall.
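For reference, a minimal userspace sketch of what ipcmk -Q effectively does (the key below just mirrors the one in the strace output; ipcmk generates a random key):
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int main(void)
{
	/* The same call seen in the strace above: create (or open) a queue. */
	int id = msgget(0x6d8a4a32, IPC_CREAT | 0644);
	if (id < 0) {
		perror("msgget");
		return 1;
	}
	printf("Message queue id: %d\n", id);
	return 0;
}
On the kernel side, the syscall is defined as follows: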
SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
{
return ksys_msgget(key, msgflg);
}
long ksys_msgget(key_t key, int msgflg)
{
struct ipc_namespace *ns;
static const struct ipc_ops msg_ops = {
.getnew = newque,
.associate = security_msg_queue_associate,
};
struct ipc_params msg_params;
ns = current->nsproxy->ipc_ns;
msg_params.key = key;
msg_params.flg = msgflg;
return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
}
Here msg_ops.getnew is the callback that creates a new queue, and ns is the current process's ipc namespace, which holds the ids of all IPC objects visible to it.
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
const struct ipc_ops *ops, struct ipc_params *params)
{
if (params->key == IPC_PRIVATE)
return ipcget_new(ns, ids, ops, params);
else
return ipcget_public(ns, ids, ops, params);
}
static int newque(struct ipc_namespace *ns, struct ipc_params *params)
{
struct msg_queue *msq;
int retval;
key_t key = params->key;
int msgflg = params->flg;
msq = kvmalloc(sizeof(*msq), GFP_KERNEL);
if (unlikely(!msq))
return -ENOMEM;
msq->q_perm.mode = msgflg & S_IRWXUGO;
msq->q_perm.key = key;
msq->q_perm.security = NULL;
retval = security_msg_queue_alloc(&msq->q_perm);
if (retval) {
kvfree(msq);
return retval;
}
msq->q_stime = msq->q_rtime = 0;
msq->q_ctime = ktime_get_real_seconds();
msq->q_cbytes = msq->q_qnum = 0;
msq->q_qbytes = ns->msg_ctlmnb;
msq->q_lspid = msq->q_lrpid = NULL;
INIT_LIST_HEAD(&msq->q_messages);
INIT_LIST_HEAD(&msq->q_receivers);
INIT_LIST_HEAD(&msq->q_senders);
/* ipc_addid() locks msq upon success. */
retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
if (retval < 0) {
ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
return retval;
}
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
return msq->q_perm.id;
}
Both ipcget_new and ipcget_public end up calling newque to create the actual queue; the difference is that the public path must first check whether a queue with the same key already exists. newque allocates and initializes the msg_queue, and finally calls ipc_addid to register it in the ipc ns.
int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
{
kuid_t euid;
kgid_t egid;
int idx, err;
/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
if (limit > ipc_mni)
limit = ipc_mni;
if (ids->in_use >= limit)
return -ENOSPC;
idr_preload(GFP_KERNEL);
spin_lock_init(&new->lock);
rcu_read_lock();
spin_lock(&new->lock);
current_euid_egid(&euid, &egid);
new->cuid = new->uid = euid;
new->gid = new->cgid = egid;
new->deleted = false;
idx = ipc_idr_alloc(ids, new);
idr_preload_end();
if (idx >= 0 && new->key != IPC_PRIVATE) {
err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
ipc_kht_params);
if (err < 0) {
idr_remove(&ids->ipcs_idr, idx);
idx = err;
}
}
if (idx < 0) {
new->deleted = true;
spin_unlock(&new->lock);
rcu_read_unlock();
return idx;
}
ids->in_use++;
if (idx > ids->max_idx)
ids->max_idx = idx;
return idx;
}
The logic of ipc_addid is fairly simple: it adds the newly created object to the ids structure of the ipc ns, an IDR backed by a radix tree (we won't dig into its internals here). In addition, if the key is not private, the object is also inserted into a hash table for fast lookup and duplicate detection.
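A quick userspace sketch confirms this dedup behavior (the key value is arbitrary): calling msgget twice with the same key returns the same id, while IPC_PRIVATE always creates a fresh queue.
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int main(void)
{
	int a = msgget(0x1234, IPC_CREAT | 0644);
	int b = msgget(0x1234, IPC_CREAT | 0644);      /* found via key_ht: same id */
	int p = msgget(IPC_PRIVATE, IPC_CREAT | 0644); /* never hashed: a new queue */
	printf("a=%d b=%d private=%d\n", a, b, p);
	return 0;
}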
ipcs: listing queues
Again, start by looking at the system calls:
root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcs -q
......
openat(AT_FDCWD, "/proc/sysvipc/msg", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
......
Trimming the irrelevant output, we can see that ipcs simply reads the proc file /proc/sysvipc/msg.
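A minimal sketch that dumps the same file ipcs parses:
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/sysvipc/msg", "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout); /* the header plus one line per queue */
	fclose(f);
	return 0;
}
The file itself is registered when the kernel initializes: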
void __init msg_init(void)
{
msg_init_ns(&init_ipc_ns);
ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
}
First, let's see what initialization work ipc_init_proc_interface does:
void __init ipc_init_proc_interface(const char *path, const char *header,
int ids, int (*show)(struct seq_file *, void *))
{
struct proc_dir_entry *pde;
struct ipc_proc_iface *iface;
iface = kmalloc(sizeof(*iface), GFP_KERNEL);
if (!iface)
return;
iface->path = path;
iface->header = header;
iface->ids = ids;
iface->show = show;
pde = proc_create_data(path,
S_IRUGO, /* world readable */
NULL, /* parent dir */
&sysvipc_proc_fops,
iface);
if (!pde)
kfree(iface);
}
The most important thing here is that sysvipc_msg_proc_show is installed as the show callback. Every operation on a file goes through a corresponding file_operations callback structure; for ipc that is sysvipc_proc_fops.
static const struct file_operations sysvipc_proc_fops = {
.open = sysvipc_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = sysvipc_proc_release,
};
static int sysvipc_proc_open(struct inode *inode, struct file *file)
{
struct ipc_proc_iter *iter;
iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter));
if (!iter)
return -ENOMEM;
iter->iface = PDE_DATA(inode);
iter->ns = get_ipc_ns(current->nsproxy->ipc_ns);
iter->pid_ns = get_pid_ns(task_active_pid_ns(current));
return 0;
}
The crucial point is right here: file_operations.open is set to sysvipc_proc_open, which ties reading the file to what ipcs displays, and calls get_ipc_ns to capture the current process's ipc ns. Now let's look at the implementation of sysvipc_msg_proc_show:
static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
{
struct pid_namespace *pid_ns = ipc_seq_pid_ns(s);
struct user_namespace *user_ns = seq_user_ns(s);
struct kern_ipc_perm *ipcp = it;
struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
seq_printf(s,
"%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n",
msq->q_perm.key,
msq->q_perm.id,
msq->q_perm.mode,
msq->q_cbytes,
msq->q_qnum,
pid_nr_ns(msq->q_lspid, pid_ns),
pid_nr_ns(msq->q_lrpid, pid_ns),
from_kuid_munged(user_ns, msq->q_perm.uid),
from_kgid_munged(user_ns, msq->q_perm.gid),
from_kuid_munged(user_ns, msq->q_perm.cuid),
from_kgid_munged(user_ns, msq->q_perm.cgid),
msq->q_stime,
msq->q_rtime,
msq->q_ctime);
return 0;
}
This too is simple and easy to understand: the msg_queue is recovered from the it argument via container_of on its embedded q_perm, the iterator already carries the ipc ns captured at open time, and the fields are just printed out.
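The container_of step deserves a second look: given only the embedded q_perm pointer, it recovers the enclosing msg_queue. A self-contained userspace sketch of the same pattern (the struct names here are made up for illustration):
#include <stdio.h>
#include <stddef.h>

/* The same definition the kernel uses, simplified. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct perm { int id; };
struct queue {
	long bytes;
	struct perm q_perm; /* embedded member, like kern_ipc_perm */
};

int main(void)
{
	struct queue q = { .bytes = 42, .q_perm = { .id = 7 } };
	struct perm *p = &q.q_perm; /* all the callback receives */
	struct queue *back = container_of(p, struct queue, q_perm);
	printf("bytes=%ld id=%d\n", back->bytes, back->q_perm.id);
	return 0;
}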
Summary
Looking at them now, both the ipc and uts namespaces are quite simple, and their implementations are not complicated either.