美文网首页
namespaces 学习笔记3:ipc ns 源码实现

namespaces 学习笔记3:ipc ns 源码实现

作者: 董泽润 | 来源:发表于2019-10-09 15:35 被阅读0次

    TL;DR 这篇讲 ipc namespace,用于隔离共享内存,队列,信号量。虽然这块用的很少,还是要弄清楚原理。

    测试例子

    代码同样来自耗子叔的文章,添加 CLONE_NEWIPC

    #define _GNU_SOURCE
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <stdio.h>
    #include <sched.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/mount.h>
    
    /* Stack handed to clone() for the child; 1 MiB in size. */
    #define STACK_SIZE (1024 * 1024)
    static char container_stack[STACK_SIZE];
    
    /* argv passed to execv() by container_main(): /bin/bash with no extra arguments. */
    char* const container_args[] = {
        "/bin/bash",
        NULL
    };
    
    /*
     * Entry point of the cloned child: announce itself, then replace the
     * process image with the shell named in container_args so that the
     * resources visible inside the new namespace can be inspected by hand.
     *
     * arg: opaque pointer forwarded by clone(); not used here.
     *
     * Returns 1 when execv() fails; execv() does not return on success.
     */
    int container_main(void* arg)
    {
        (void)arg;
    
        printf("Container - inside the container!\n");
        execv(container_args[0], container_args);
    
        /* Only reached if execv() failed: report errno before it is clobbered. */
        perror("execv");
        printf("Something's wrong!\n");
        return 1;
    }
    
    /*
     * Start container_main() in a child that lives in a new IPC namespace
     * (CLONE_NEWIPC) and wait for it to exit.
     *
     * Returns 0 once the child has been reaped, 1 if clone() or waitpid() fails.
     */
    int main(void)
    {
        pid_t container_pid;
    
        printf("Parent - start a container!\n");
        /*
         * clone() is given the function to run and the TOP of the stack
         * buffer, because the stack grows downward.
         */
        container_pid = clone(container_main, container_stack + STACK_SIZE,
                              SIGCHLD | CLONE_NEWIPC, NULL);
        if (container_pid == -1) {
            /* e.g. EPERM when the caller may not create a new namespace */
            perror("clone");
            return 1;
        }
    
        /* Wait for the child to finish. */
        if (waitpid(container_pid, NULL, 0) == -1) {
            perror("waitpid");
            return 1;
        }
        printf("Parent - container stopped!\n");
        return 0;
    }
    

    首先在宿主机上创建 message queue

    root@iZhp36ik63t96xhzjh00ujZ:~# ipcmk -Q
    Message queue id: 0
    root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q
    
    ------ Message Queues --------
    key        msqid      owner      perms      used-bytes   messages
    0xa59bafa6 0          root       644        0            0
    

    然后编译运行,启动后发现在容器中看不到宿主机上创建的消息队列

    root@iZhp36ik63t96xhzjh00ujZ:~# gcc ipc.c && ./a.out
    Parent - start a container!
    Container - inside the container!
    root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q
    
    ------ Message Queues --------
    key        msqid      owner      perms      used-bytes   messages
    
    root@iZhp36ik63t96xhzjh00ujZ:~# exit
    

    核心结构体

    这个结构体定义比较好理解,但是不够好:sem, msg, shm, mqueue 的字段全平铺在 ipc_namespace 里了,应该各自抽出单独的结构体。

    /*
     * State kept for one IPC namespace: the id tables plus the limits and
     * counters for semaphores, message queues and shared memory, and the
     * mqueue settings.
     */
    struct ipc_namespace {
        /* Reference count; create_ipc_ns() sets it to 1 for a new namespace. */
        refcount_t  count;
        /*
         * One id table per IPC kind. sem_init_ns()/msg_init_ns() index it
         * with IPC_SEM_IDS/IPC_MSG_IDS; the remaining slot is presumably the
         * one shm_ids() resolves to -- TODO confirm.
         */
        struct ipc_ids  ids[3];
    
        /*
         * Semaphore limits. NOTE(review): sem_init_ns() assigns sc_semmsl,
         * sc_semmns, sc_semopm and sc_semmni, which presumably alias these
         * four entries -- confirm against the sc_* definitions.
         */
        int     sem_ctls[4];
        int     used_sems;
    
        /* Message queue limits, followed by two counters zeroed in msg_init_ns(). */
        unsigned int    msg_ctlmax;
        unsigned int    msg_ctlmnb;
        unsigned int    msg_ctlmni;
        atomic_t    msg_bytes;
        atomic_t    msg_hdrs;
    
        /* Shared memory settings; all initialised by shm_init_ns(). */
        size_t      shm_ctlmax;
        size_t      shm_ctlall;
        unsigned long   shm_tot;
        int     shm_ctlmni;
        /*
         * Defines whether IPC_RMID is forced for _all_ shm segments regardless
         * of shmctl()
         */
        int     shm_rmid_forced;
    
        struct notifier_block ipcns_nb;
    
        /* The kern_mount of the mqueuefs sb.  We take a ref on it */
        struct vfsmount *mq_mnt;
    
        /* # queues in this ns, protected by mq_lock */
        unsigned int    mq_queues_count;
    
        /* next fields are set through sysctl */
        unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
        unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
        unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
        unsigned int    mq_msg_default;
        unsigned int    mq_msgsize_default;
    
        /* user_ns which owns the ipc ns */
        struct user_namespace *user_ns;
        /* Accounting handle obtained from inc_ipc_namespaces() in create_ipc_ns(). */
        struct ucounts *ucounts;
    
        /* Generic namespace part; create_ipc_ns() assigns its inum and ops. */
        struct ns_common ns;
    } __randomize_layout;
    

    创建 ipc ns

    /*
     * Allocate and initialise a new IPC namespace owned by user_ns.
     *
     * Returns the new namespace on success. The error labels (fail, fail_dec,
     * fail_free, fail_put) are elided in this excerpt, so the exact failure
     * return is not shown here; err is preset to -ENOSPC / -ENOMEM before the
     * corresponding step. old_ns is not referenced in the visible part.
     */
    static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
                           struct ipc_namespace *old_ns)
    {
        struct ipc_namespace *ns;
        struct ucounts *ucounts;
        int err;
    
        /* Account the new namespace against user_ns first; NULL means refused. */
        err = -ENOSPC;
        ucounts = inc_ipc_namespaces(user_ns);
        if (!ucounts)
            goto fail;
    
        /* Zeroed allocation, so fields not set below start out as 0/NULL. */
        err = -ENOMEM;
        ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
        if (ns == NULL)
            goto fail_dec;
    
        /* Give the namespace an inode number and hook up its operations. */
        err = ns_alloc_inum(&ns->ns);
        if (err)
            goto fail_free;
        ns->ns.ops = &ipcns_operations;
    
        /* The caller receives the initial reference. */
        refcount_set(&ns->count, 1);
        ns->user_ns = get_user_ns(user_ns);
        ns->ucounts = ucounts;
    
        /* mq_init_ns() is the only per-subsystem init here that can fail. */
        err = mq_init_ns(ns);
        if (err)
            goto fail_put;
    
        sem_init_ns(ns);
        msg_init_ns(ns);
        shm_init_ns(ns);
    
        return ns;
        ......
    }
    

    这段代码也好理解,创建 ipc_namespace,然后分别初始化 sem, msg, shm

    /*
     * Initialise the semaphore part of an IPC namespace: load the four limits
     * from the SEMMSL/SEMMNS/SEMOPM/SEMMNI constants, reset the usage count
     * and initialise the semaphore id table.
     */
    void sem_init_ns(struct ipc_namespace *ns)
    {
        /* NOTE(review): sc_* are not members of the struct as shown; presumably macros over sem_ctls[] -- confirm. */
        ns->sc_semmsl = SEMMSL;
        ns->sc_semmns = SEMMNS;
        ns->sc_semopm = SEMOPM;
        ns->sc_semmni = SEMMNI;
        ns->used_sems = 0;
        ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
    }
    
    /*
     * Initialise the message queue part of an IPC namespace: load the limits
     * from MSGMAX/MSGMNB/MSGMNI, zero the two atomic counters and initialise
     * the message queue id table.
     */
    void msg_init_ns(struct ipc_namespace *ns)
    {
        ns->msg_ctlmax = MSGMAX;
        ns->msg_ctlmnb = MSGMNB;
        ns->msg_ctlmni = MSGMNI;
    
        atomic_set(&ns->msg_bytes, 0);
        atomic_set(&ns->msg_hdrs, 0);
        ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
    }
    
    /*
     * Initialise the shared memory part of an IPC namespace: load the limits
     * from SHMMAX/SHMALL/SHMMNI, clear shm_rmid_forced and shm_tot, and
     * initialise the shared memory id table.
     */
    void shm_init_ns(struct ipc_namespace *ns)
    {
        ns->shm_ctlmax = SHMMAX;
        ns->shm_ctlall = SHMALL;
        ns->shm_ctlmni = SHMMNI;
        ns->shm_rmid_forced = 0;
        ns->shm_tot = 0;
        /* shm_ids(ns) presumably resolves to the shm slot of ns->ids -- TODO confirm. */
        ipc_init_ids(&shm_ids(ns));
    }
    

    这是三个初始化函数,代码很简单:把各项限制设置为默认值,把计数清零,最后再初始化存储的 ids

    ipcmk 创建队列

    先看下系统调用情况

    root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcmk -Q
    ......
    gettid()                                = 26557
    msgget(0x6d8a4a32, IPC_CREAT|0644)      = 32769
    ......
    fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 3), ...}) = 0
    write(1, "Message queue id: 32769\n", 24Message queue id: 32769
    ) = 24
    close(1)                                = 0
    close(2)                                = 0
    exit_group(0)                           = ?
    +++ exited with 0 +++
    

    原来是调用 msgget 来创建消息队列的。

    /* msgget(2) entry point: forwards key and msgflg unchanged to ksys_msgget(). */
    SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
    {
        return ksys_msgget(key, msgflg);
    }
    
    /*
     * Look up or create a message queue for key in the calling task's IPC
     * namespace.
     *
     * key:    queue key supplied by the caller.
     * msgflg: flags supplied by the caller, passed on through msg_params.flg.
     *
     * Returns whatever ipcget() returns.
     */
    long ksys_msgget(key_t key, int msgflg)
    {
        struct ipc_namespace *ns;
        /* Callbacks handed to ipcget(); newque() is the one that creates a queue. */
        static const struct ipc_ops msg_ops = {
            .getnew = newque,
            .associate = security_msg_queue_associate,
        };
        struct ipc_params msg_params;
    
        /* The namespace comes from the current task, which is what isolates callers. */
        ns = current->nsproxy->ipc_ns;
    
        msg_params.key = key;
        msg_params.flg = msgflg;
    
        return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
    }
    

    这里 msg_ops.getnew 是创建新队列的回调函数,ns 获取当前进程的 ipc namespace,里面含有所有可见 ipc 的 ids

    /*
     * Dispatch an IPC "get" request: keys equal to IPC_PRIVATE go to
     * ipcget_new(), every other key goes to ipcget_public(). All arguments
     * are forwarded unchanged and the callee's result is returned.
     */
    int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
                const struct ipc_ops *ops, struct ipc_params *params)
    {
        if (params->key == IPC_PRIVATE)
            return ipcget_new(ns, ids, ops, params);
        else
            return ipcget_public(ns, ids, ops, params);
    }
    
    /*
     * Create a new message queue in ns (the .getnew callback of msg_ops).
     *
     * ns:     IPC namespace the queue is added to.
     * params: supplies the key and the flags; only the S_IRWXUGO bits of the
     *         flags are stored as the queue mode.
     *
     * Returns the id of the new queue, -ENOMEM if the allocation fails, or
     * the error reported by security_msg_queue_alloc() / ipc_addid().
     */
    static int newque(struct ipc_namespace *ns, struct ipc_params *params)
    {
        struct msg_queue *msq;
        int retval;
        key_t key = params->key;
        int msgflg = params->flg;
    
        msq = kvmalloc(sizeof(*msq), GFP_KERNEL);
        if (unlikely(!msq))
            return -ENOMEM;
    
        msq->q_perm.mode = msgflg & S_IRWXUGO;
        msq->q_perm.key = key;
    
        msq->q_perm.security = NULL;
        retval = security_msg_queue_alloc(&msq->q_perm);
        if (retval) {
            /* Not yet visible anywhere, so it can be freed directly. */
            kvfree(msq);
            return retval;
        }
    
        /* Fresh queue: no traffic yet; byte capacity taken from the namespace limit. */
        msq->q_stime = msq->q_rtime = 0;
        msq->q_ctime = ktime_get_real_seconds();
        msq->q_cbytes = msq->q_qnum = 0;
        msq->q_qbytes = ns->msg_ctlmnb;
        msq->q_lspid = msq->q_lrpid = NULL;
        INIT_LIST_HEAD(&msq->q_messages);
        INIT_LIST_HEAD(&msq->q_receivers);
        INIT_LIST_HEAD(&msq->q_senders);
    
        /* ipc_addid() locks msq upon success. */
        retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
        if (retval < 0) {
            /* ipc_addid() initialised the refcount, so drop it with msg_rcu_free as the release callback. */
            ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
            return retval;
        }
    
        /* Release the object lock and the RCU read lock taken inside ipc_addid(). */
        ipc_unlock_object(&msq->q_perm);
        rcu_read_unlock();
    
        return msq->q_perm.id;
    }
    

    无论是 ipcget_new 还是 ipcget_public,需要新建队列时都会通过 ops->getnew 回调调用 newque 创建真正的队列,区别就在于公共的要先按 key 做去重校验。newque 分配并初始化 msg_queue,最后调用 ipc_addid 添加到 ipc ns

    /*
     * Register a new IPC object in an id table.
     *
     * ids:   id table to add to.
     * new:   permission/header part of the new object; its refcount, lock,
     *        owner/creator ids and deleted flag are initialised here.
     * limit: maximum number of entries allowed; clamped to ipc_mni.
     *
     * Returns the allocated index (>= 0) on success. In that case the function
     * returns with new->lock held and inside an RCU read-side section; the
     * caller must release both. Returns -ENOSPC when the table is already at
     * the limit (no locks taken), or the negative error from the id
     * allocation / key hash insertion (locks already released).
     */
    int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
    {
        kuid_t euid;
        kgid_t egid;
        int idx, err;
    
        /* 1) Initialize the refcount so that ipc_rcu_putref works */
        refcount_set(&new->refcount, 1);
    
        if (limit > ipc_mni)
            limit = ipc_mni;
    
        if (ids->in_use >= limit)
            return -ENOSPC;
    
        idr_preload(GFP_KERNEL);
    
        /* Lock the object before it is inserted into the table. */
        spin_lock_init(&new->lock);
        rcu_read_lock();
        spin_lock(&new->lock);
    
        /* Owner and creator both start out as the caller's effective uid/gid. */
        current_euid_egid(&euid, &egid);
        new->cuid = new->uid = euid;
        new->gid = new->cgid = egid;
    
        new->deleted = false;
    
        idx = ipc_idr_alloc(ids, new);
        idr_preload_end();
    
        /* Objects with a real key are also inserted into the key hash table. */
        if (idx >= 0 && new->key != IPC_PRIVATE) {
            err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
                             ipc_kht_params);
            if (err < 0) {
                /* Undo the idr insertion and fall into the failure path below. */
                idr_remove(&ids->ipcs_idr, idx);
                idx = err;
            }
        }
        if (idx < 0) {
            new->deleted = true;
            spin_unlock(&new->lock);
            rcu_read_unlock();
            return idx;
        }
    
        /* Success: update the bookkeeping; the locks stay held for the caller. */
        ids->in_use++;
        if (idx > ids->max_idx)
            ids->max_idx = idx;
        return idx;
    }
    

    ipc_addid 逻辑比较简单,就是将刚创建的对象添加到 ipc ns 的 ids 结构中,这个结构是 idr(底层基于 radix tree),具体原理就不看了。另外如果 key 不是 IPC_PRIVATE(即不是私有的),还要添加到 hash 表中,用于按 key 快速查找去重。

    ipcs 查看队列

    还是先看系统调用是什么

    root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcs -q
    ......
    openat(AT_FDCWD, "/proc/sysvipc/msg", O_RDONLY) = 3
    fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
    ......
    

    省去部分无关信息,原来是直接读的内存文件 /proc/sysvipc/msg,其实这个文件的注册是在 os 初始化时设置的。

    /*
     * Boot-time setup for message queues: initialise the message queue state
     * of init_ipc_ns and register the "sysvipc/msg" proc file with its header
     * line and sysvipc_msg_proc_show() as the per-entry printer.
     */
    void __init msg_init(void)
    {
        msg_init_ns(&init_ipc_ns);
    
        ipc_init_proc_interface("sysvipc/msg",
                    "       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n",
                    IPC_MSG_IDS, sysvipc_msg_proc_show);
    }
    

    先看看 ipc_init_proc_interface 做了哪些初始化工作

    /*
     * Register one sysvipc proc file.
     *
     * path:   proc path to create (e.g. "sysvipc/msg").
     * header: header line stored for the file.
     * ids:    which id table the file covers (e.g. IPC_MSG_IDS).
     * show:   callback that prints a single entry.
     *
     * Failures are silent: if the allocation or proc_create_data() fails, the
     * function returns without creating the file.
     */
    void __init ipc_init_proc_interface(const char *path, const char *header,
            int ids, int (*show)(struct seq_file *, void *))
    {
        struct proc_dir_entry *pde;
        struct ipc_proc_iface *iface;
    
        iface = kmalloc(sizeof(*iface), GFP_KERNEL);
        if (!iface)
            return;
        iface->path = path;
        iface->header   = header;
        iface->ids  = ids;
        iface->show = show;
    
        /* iface becomes the entry's private data; sysvipc_proc_open() reads it back via PDE_DATA(). */
        pde = proc_create_data(path,
                       S_IRUGO,        /* world readable */
                       NULL,           /* parent dir */
                       &sysvipc_proc_fops,
                       iface);
        /* Registration failed, so nothing references iface: free it. */
        if (!pde)
            kfree(iface);
    }
    

    这里最重要的就是将 sysvipc_msg_proc_show 设置成 show 回调。任一文件的操作,都有对应 file_operations 的回调结构体,ipc 对应着 sysvipc_proc_fops.

    /*
     * File operations installed on every sysvipc proc file: custom
     * open/release handlers, with read and llseek delegated to the seq_file
     * helpers.
     */
    static const struct file_operations sysvipc_proc_fops = {
        .open    = sysvipc_proc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = sysvipc_proc_release,
    };
    
    /*
     * open() handler for sysvipc proc files: set up a seq_file with a private
     * iterator and record in it the proc interface descriptor plus the IPC
     * and PID namespaces of the task opening the file.
     *
     * Returns 0 on success, -ENOMEM if the iterator cannot be allocated.
     */
    static int sysvipc_proc_open(struct inode *inode, struct file *file)
    {
        struct ipc_proc_iter *iter;
    
        iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter));
        if (!iter)
            return -ENOMEM;
    
        /* Descriptor stored by ipc_init_proc_interface() as the entry's data. */
        iter->iface = PDE_DATA(inode);
        /* Namespaces are captured from the opener at open time. */
        iter->ns    = get_ipc_ns(current->nsproxy->ipc_ns);
        iter->pid_ns = get_pid_ns(task_active_pid_ns(current));
    
        return 0;
    }
    

    最重要的其实在这里:file_operations.open 被设置成 sysvipc_proc_open,打开文件时调用 get_ipc_ns 取得当前进程的 ipc ns 并保存到 iter->ns 中,这样就把文件读操作与当前 ipc ns 关联起来了。再来看看 sysvipc_msg_proc_show 的实现

    /*
     * Print one message queue as a line of the sysvipc msg proc file.
     *
     * s:  seq_file being written; also the source of the PID and user
     *     namespaces used to translate ids for the reader.
     * it: the entry to print, a struct kern_ipc_perm embedded in a msg_queue.
     *
     * Always returns 0.
     */
    static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
    {
        struct pid_namespace *pid_ns = ipc_seq_pid_ns(s);
        struct user_namespace *user_ns = seq_user_ns(s);
        struct kern_ipc_perm *ipcp = it;
        /* Recover the enclosing msg_queue from its q_perm member. */
        struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
    
        /* pids are translated into pid_ns, uids/gids into user_ns before printing. */
        seq_printf(s,
               "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n",
               msq->q_perm.key,
               msq->q_perm.id,
               msq->q_perm.mode,
               msq->q_cbytes,
               msq->q_qnum,
               pid_nr_ns(msq->q_lspid, pid_ns),
               pid_nr_ns(msq->q_lrpid, pid_ns),
               from_kuid_munged(user_ns, msq->q_perm.uid),
               from_kgid_munged(user_ns, msq->q_perm.gid),
               from_kuid_munged(user_ns, msq->q_perm.cuid),
               from_kgid_munged(user_ns, msq->q_perm.cgid),
               msq->q_stime,
               msq->q_rtime,
               msq->q_ctime);
    
        return 0;
    }
    

    也很简单,容易理解:参数 it 是一个 kern_ipc_perm 指针,通过 container_of 得到所属的 msg_queue;被遍历的是 open 时保存下来的那个 ipc ns 中的队列,最后用 seq_printf 把各字段输出到 seq_file 即可。

    小结

    现在看 ipc 和 uts 都比较简单,实现起来也不复杂。

    相关文章

      网友评论

          本文标题:namespaces 学习笔记3:ipc ns 源码实现

          本文链接:https://www.haomeiwen.com/subject/cgwwpctx.html