美文网首页
namespaces 学习笔记3:ipc ns 源码实现

namespaces 学习笔记3:ipc ns 源码实现

作者: 董泽润 | 来源:发表于2019-10-09 15:35 被阅读0次

TL;DR 这篇讲 ipc namespace,用于隔离共享内存,队列,信号量。虽然这块用的很少,还是要弄清楚原理。

测试例子

代码同样来自耗子叔的文章,添加 CLONE_NEWIPC

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mount.h>

/* Stack handed to clone() for the child process; 1 MiB in size. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/*
 * NULL-terminated argument vector for the program exec'd by
 * container_main(); element 0 doubles as the path passed to execv().
 */
char* const container_args[] = {
    "/bin/bash",
    NULL
};

/*
 * Entry point of the cloned child.
 *
 * Announces itself, then replaces the process image with the program named
 * by container_args[0]. `arg` is not used. Returns 1 only when execv()
 * failed, since a successful execv() never returns.
 */
int container_main(void* arg)
{
    char* const *argv = container_args;

    printf("Container - inside the container!\n");

    /* Run a shell directly so the isolation of this process can be inspected. */
    execv(argv[0], argv);

    /* Reaching this point means execv() could not start the program. */
    printf("Something's wrong!\n");
    return 1;
}

/*
 * Start container_main() in a new IPC namespace and wait for it to exit.
 *
 * Returns 0 when the child was created and reaped, 1 when clone() or
 * waitpid() failed (the reason is reported on stderr via perror()).
 */
int main(void)
{
    printf("Parent - start a container!\n");

    /*
     * clone() takes the child's entry function and a stack for it. The
     * pointer passed is the END of the buffer because the stack grows
     * towards lower addresses.
     */
    int container_pid = clone(container_main, container_stack + STACK_SIZE,
                              SIGCHLD | CLONE_NEWIPC, NULL);
    if (container_pid == -1) {
        /*
         * Without this check a failed clone() (e.g. EPERM when the caller
         * lacks CAP_SYS_ADMIN for CLONE_NEWIPC) would fall through to
         * waitpid(-1, ...) and the program would claim success.
         */
        perror("clone");
        return 1;
    }

    /* Wait for the child process to finish. */
    if (waitpid(container_pid, NULL, 0) == -1) {
        perror("waitpid");
        return 1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}

首先在宿主机上创建 message queue

root@iZhp36ik63t96xhzjh00ujZ:~# ipcmk -Q
Message queue id: 0
root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q

------ Message Queues --------
key        msqid      owner      perms      used-bytes   messages
0xa59bafa6 0          root       644        0            0

然后编译运行,启动后发现在容器中看不到宿主机上创建的消息队列

root@iZhp36ik63t96xhzjh00ujZ:~# gcc ipc.c && ./a.out
Parent - start a container!
Container - inside the container!
root@iZhp36ik63t96xhzjh00ujZ:~# ipcs -q

------ Message Queues --------
key        msqid      owner      perms      used-bytes   messages

root@iZhp36ik63t96xhzjh00ujZ:~# exit

核心结构体

这个结构体定义比较好理解,但是不够好,sem, msg, shm, mqueue 的字段全平铺在 ipc_namespace 里了,应该抽出各自定义结构体。

/*
 * Per-namespace IPC state: the id tables, limits and counters for the
 * semaphore, message-queue, shared-memory and mqueue fields below.
 * ksys_msgget() resolves the instance to use through
 * current->nsproxy->ipc_ns.
 */
struct ipc_namespace {
    /* Reference count; create_ipc_ns() initialises it to 1. */
    refcount_t  count;
    /*
     * Id tables, one per IPC type. sem_init_ns()/msg_init_ns() index it
     * with IPC_SEM_IDS / IPC_MSG_IDS; shm_init_ns() reaches its entry via
     * shm_ids(ns).
     */
    struct ipc_ids  ids[3];

    /*
     * Semaphore limits. NOTE(review): presumably the storage behind the
     * sc_semmsl/sc_semmns/sc_semopm/sc_semmni names set in sem_init_ns()
     * - confirm against the header that defines them.
     */
    int     sem_ctls[4];
    /* Reset to 0 by sem_init_ns(). */
    int     used_sems;

    /* Message-queue limits; msg_init_ns() sets them to MSGMAX/MSGMNB/MSGMNI. */
    unsigned int    msg_ctlmax;
    unsigned int    msg_ctlmnb;
    unsigned int    msg_ctlmni;
    /* Message-queue counters; zeroed by msg_init_ns(). */
    atomic_t    msg_bytes;
    atomic_t    msg_hdrs;

    /* Shared-memory limits; shm_init_ns() sets them to SHMMAX/SHMALL/SHMMNI. */
    size_t      shm_ctlmax;
    size_t      shm_ctlall;
    /* Reset to 0 by shm_init_ns(). */
    unsigned long   shm_tot;
    int     shm_ctlmni;
    /*
     * Defines whether IPC_RMID is forced for _all_ shm segments regardless
     * of shmctl()
     */
    int     shm_rmid_forced;

    struct notifier_block ipcns_nb;

    /* The kern_mount of the mqueuefs sb.  We take a ref on it */
    struct vfsmount *mq_mnt;

    /* # queues in this ns, protected by mq_lock */
    unsigned int    mq_queues_count;

    /* next fields are set through sysctl */
    unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
    unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
    unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
    unsigned int    mq_msg_default;
    unsigned int    mq_msgsize_default;

    /* user_ns which owns the ipc ns */
    struct user_namespace *user_ns;
    /* Value returned by inc_ipc_namespaces() in create_ipc_ns(). */
    struct ucounts *ucounts;

    /* Common namespace part; its inum and ops are set in create_ipc_ns(). */
    struct ns_common ns;
} __randomize_layout;

创建 ipc ns

/*
 * Allocate and initialise a new IPC namespace owned by @user_ns.
 *
 * On success returns the new namespace with a reference count of 1.
 * The error paths (labels fail, fail_dec, fail_free, fail_put) are elided
 * from this excerpt; `err` carries the code they are expected to report.
 * @old_ns is not used in the visible part of the function.
 */
static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
                       struct ipc_namespace *old_ns)
{
    struct ipc_namespace *ns;
    struct ucounts *ucounts;
    int err;

    /* A NULL result from inc_ipc_namespaces() is reported as -ENOSPC. */
    err = -ENOSPC;
    ucounts = inc_ipc_namespaces(user_ns);
    if (!ucounts)
        goto fail;

    /* Zeroed allocation, so fields not set below start out as 0/NULL. */
    err = -ENOMEM;
    ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
    if (ns == NULL)
        goto fail_dec;

    /* Assign the namespace's inode number and its operations table. */
    err = ns_alloc_inum(&ns->ns);
    if (err)
        goto fail_free;
    ns->ns.ops = &ipcns_operations;

    refcount_set(&ns->count, 1);
    ns->user_ns = get_user_ns(user_ns);
    ns->ucounts = ucounts;

    /* mq_init_ns() is the only per-subsystem initialiser that can fail here. */
    err = mq_init_ns(ns);
    if (err)
        goto fail_put;

    /* Set the default limits and id tables for sem, msg and shm. */
    sem_init_ns(ns);
    msg_init_ns(ns);
    shm_init_ns(ns);

    return ns;
    ......
}

这段代码也好理解,创建 ipc_namespace,然后分别初始化 sem, msg, shm

/*
 * Give @ns its initial semaphore state: the four sc_sem* limits take their
 * compile-time defaults, the used-semaphore count is cleared and the
 * semaphore id table is initialised.
 */
void sem_init_ns(struct ipc_namespace *ns)
{
    struct ipc_ids *table = &ns->ids[IPC_SEM_IDS];

    /* Nothing is in use in a fresh namespace. */
    ns->used_sems = 0;

    /* Default limits. */
    ns->sc_semmni = SEMMNI;
    ns->sc_semopm = SEMOPM;
    ns->sc_semmns = SEMMNS;
    ns->sc_semmsl = SEMMSL;

    ipc_init_ids(table);
}

/*
 * Give @ns its initial message-queue state: default limits, zeroed byte and
 * header counters, and an initialised message-queue id table.
 */
void msg_init_ns(struct ipc_namespace *ns)
{
    struct ipc_ids *table = &ns->ids[IPC_MSG_IDS];

    /* Counters start from zero in a fresh namespace. */
    atomic_set(&ns->msg_hdrs, 0);
    atomic_set(&ns->msg_bytes, 0);

    /* Default limits. */
    ns->msg_ctlmni = MSGMNI;
    ns->msg_ctlmnb = MSGMNB;
    ns->msg_ctlmax = MSGMAX;

    ipc_init_ids(table);
}

/*
 * Give @ns its initial shared-memory state: default limits, forced-RMID
 * switched off, the shm_tot counter cleared and the shm id table
 * initialised.
 */
void shm_init_ns(struct ipc_namespace *ns)
{
    struct ipc_ids *table = &shm_ids(ns);

    /* Counter and policy flag start cleared. */
    ns->shm_tot = 0;
    ns->shm_rmid_forced = 0;

    /* Default limits. */
    ns->shm_ctlmni = SHMMNI;
    ns->shm_ctlall = SHMALL;
    ns->shm_ctlmax = SHMMAX;

    ipc_init_ids(table);
}

这是三个初始化函数,代码很简单,把各项限制设置为默认值、把计数器清零,最后再初始化存储的 ids

ipcmk 创建队列

先看下系统调用情况

root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcmk -Q
......
gettid()                                = 26557
msgget(0x6d8a4a32, IPC_CREAT|0644)      = 32769
......
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 3), ...}) = 0
write(1, "Message queue id: 32769\n", 24Message queue id: 32769
) = 24
close(1)                                = 0
close(2)                                = 0
exit_group(0)                           = ?
+++ exited with 0 +++

原来是调用 msgget 来生成的共享队列。

/*
 * msgget system-call entry point: passes @key and @msgflg through to
 * ksys_msgget() unchanged and returns its result.
 */
SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
{
    return ksys_msgget(key, msgflg);
}

/*
 * Look up or create a message queue for @key in the calling task's IPC
 * namespace.
 *
 * The request is packaged into an ipc_params and handed to ipcget() together
 * with the message-queue callbacks (newque to create, the LSM hook to
 * associate). Returns whatever ipcget() returns.
 */
long ksys_msgget(key_t key, int msgflg)
{
    static const struct ipc_ops msg_ops = {
        .getnew = newque,
        .associate = security_msg_queue_associate,
    };
    /* The namespace is taken from the caller, so only its queues are seen. */
    struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
    struct ipc_params request;

    request.flg = msgflg;
    request.key = key;

    return ipcget(ipc_ns, &msg_ids(ipc_ns), &msg_ops, &request);
}

这里 msg_ops.getnew 是创建新队列的回调函数,ns 获取当前进程的 ipc namespace,里面含有所有可见 ipc 的 ids

/*
 * Common "get" entry point shared by the IPC types.
 *
 * Requests with a key other than IPC_PRIVATE are routed to ipcget_public();
 * IPC_PRIVATE requests go to ipcget_new(). The result of the chosen helper
 * is returned unchanged.
 */
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
            const struct ipc_ops *ops, struct ipc_params *params)
{
    if (params->key != IPC_PRIVATE)
        return ipcget_public(ns, ids, ops, params);

    return ipcget_new(ns, ids, ops, params);
}
/*
 * Create a new message queue in @ns from the key and flags in @params.
 *
 * Installed as the .getnew callback in ksys_msgget(). Returns the id of the
 * new queue on success, -ENOMEM when the allocation fails, or the error
 * returned by security_msg_queue_alloc() / ipc_addid().
 */
static int newque(struct ipc_namespace *ns, struct ipc_params *params)
{
    struct msg_queue *msq;
    int retval;
    key_t key = params->key;
    int msgflg = params->flg;

    msq = kvmalloc(sizeof(*msq), GFP_KERNEL);
    if (unlikely(!msq))
        return -ENOMEM;

    /* Only the permission bits of the flags become the queue's mode. */
    msq->q_perm.mode = msgflg & S_IRWXUGO;
    msq->q_perm.key = key;

    /* Nothing has been published yet, so a plain kvfree() undoes the allocation. */
    msq->q_perm.security = NULL;
    retval = security_msg_queue_alloc(&msq->q_perm);
    if (retval) {
        kvfree(msq);
        return retval;
    }

    /* Fresh queue: no send/receive yet, empty, capacity from the namespace's limit. */
    msq->q_stime = msq->q_rtime = 0;
    msq->q_ctime = ktime_get_real_seconds();
    msq->q_cbytes = msq->q_qnum = 0;
    msq->q_qbytes = ns->msg_ctlmnb;
    msq->q_lspid = msq->q_lrpid = NULL;
    INIT_LIST_HEAD(&msq->q_messages);
    INIT_LIST_HEAD(&msq->q_receivers);
    INIT_LIST_HEAD(&msq->q_senders);

    /* ipc_addid() locks msq upon success. */
    retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
    if (retval < 0) {
        /*
         * ipc_addid() has already set the refcount to 1, so release the
         * object through the refcount, with msg_rcu_free as the callback.
         */
        ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
        return retval;
    }

    /* Release the object lock and the RCU read lock taken inside ipc_addid(). */
    ipc_unlock_object(&msq->q_perm);
    rcu_read_unlock();

    return msq->q_perm.id;
}

无论是 ipcget_new 还是 ipcget_public,需要新建队列时最终都会通过 ops->getnew 回调到 newque 创建真正的队列;区别在于 ipcget_public 要先按 key 查找做去重校验,只有 key 不存在(且带有 IPC_CREAT)时才会新建。newque 分配并初始化 msg_queue,最后调用 ipc_addid 添加到 ipc ns

/*
 * Add the IPC object @new to the id table @ids, allowing at most @limit
 * objects in the table.
 *
 * Returns the allocated index (>= 0) on success; in that case the function
 * returns with new->lock held and the RCU read lock taken, and the caller
 * must release both. Returns -ENOSPC when the table is already at its limit
 * (before any lock is taken), or the negative error from the id allocation /
 * key-hash insertion, in which case both locks have been released again.
 */
int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
{
    kuid_t euid;
    kgid_t egid;
    int idx, err;

    /* 1) Initialize the refcount so that ipc_rcu_putref works */
    refcount_set(&new->refcount, 1);

    /* The caller's limit is capped at ipc_mni. */
    if (limit > ipc_mni)
        limit = ipc_mni;

    if (ids->in_use >= limit)
        return -ENOSPC;

    /*
     * NOTE(review): presumably preloads so that the id allocation under the
     * spinlock below does not need to sleep - confirm against the idr docs.
     */
    idr_preload(GFP_KERNEL);

    spin_lock_init(&new->lock);
    rcu_read_lock();
    spin_lock(&new->lock);

    /* The creator and the initial owner are both the caller's effective ids. */
    current_euid_egid(&euid, &egid);
    new->cuid = new->uid = euid;
    new->gid = new->cgid = egid;

    new->deleted = false;

    idx = ipc_idr_alloc(ids, new);
    idr_preload_end();

    /* Objects with a real key are also entered into the key hash table. */
    if (idx >= 0 && new->key != IPC_PRIVATE) {
        err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
                         ipc_kht_params);
        if (err < 0) {
            /* Undo the id allocation and report the hash-insert error instead. */
            idr_remove(&ids->ipcs_idr, idx);
            idx = err;
        }
    }
    if (idx < 0) {
        /* Failure: mark the object deleted and drop both locks before returning. */
        new->deleted = true;
        spin_unlock(&new->lock);
        rcu_read_unlock();
        return idx;
    }

    /* Success: update the table's usage count and highest index seen. */
    ids->in_use++;
    if (idx > ids->max_idx)
        ids->max_idx = idx;
    return idx;
}

ipc_addid 逻辑比较简单,就是将刚创建的对象添加到 ipc ns 的 ids 结构中,这个结构是 idr(ids->ipcs_idr,底层基于 radix tree),具体原理就不看了,另外如果 key 不是 IPC_PRIVATE,还要添加到 hash 表 key_ht 中,用于按 key 快速查找去重。

ipcs 查看队列

还是先看系统调用是什么

root@iZhp36ik63t96xhzjh00ujZ:~# strace ipcs -q
......
openat(AT_FDCWD, "/proc/sysvipc/msg", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
......

省去部分无关信息,原来是直接读的内存文件 /proc/sysvipc/msg,其实这个文件的注册是在 os 初始化时设置的。

/*
 * Boot-time setup for message queues: initialise the message-queue state of
 * init_ipc_ns and register the "sysvipc/msg" proc entry, with its column
 * header and sysvipc_msg_proc_show as the per-row callback.
 */
void __init msg_init(void)
{
    const char *header =
        "       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n";

    msg_init_ns(&init_ipc_ns);

    ipc_init_proc_interface("sysvipc/msg", header,
                IPC_MSG_IDS, sysvipc_msg_proc_show);
}

先看看 ipc_init_proc_interface 做了哪些初始化工作

/*
 * Register a world-readable proc entry at @path for one IPC type.
 *
 * @header, @ids and @show are stored in a freshly allocated ipc_proc_iface
 * that is attached to the proc entry as its private data. If either the
 * allocation or the registration fails the function returns silently; in the
 * latter case the descriptor is freed again.
 */
void __init ipc_init_proc_interface(const char *path, const char *header,
        int ids, int (*show)(struct seq_file *, void *))
{
    struct ipc_proc_iface *iface = kmalloc(sizeof(*iface), GFP_KERNEL);

    if (!iface)
        return;

    iface->show = show;
    iface->ids  = ids;
    iface->header   = header;
    iface->path = path;

    /* Mode S_IRUGO (world readable), no parent directory. */
    if (proc_create_data(path, S_IRUGO, NULL, &sysvipc_proc_fops, iface))
        return;

    /* Registration failed: nothing references the descriptor, so release it. */
    kfree(iface);
}

这里最重要的就是将 sysvipc_msg_proc_show 设置成 show 回调。任一文件的操作,都有对应 file_operations 的回调结构体,ipc 对应着 sysvipc_proc_fops.

/*
 * File operations for the proc entries registered by
 * ipc_init_proc_interface(). Open and release use the sysvipc-specific
 * handlers; read and llseek are the generic seq_file implementations.
 */
static const struct file_operations sysvipc_proc_fops = {
    .open    = sysvipc_proc_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = sysvipc_proc_release,
};

static int sysvipc_proc_open(struct inode *inode, struct file *file)
{
    struct ipc_proc_iter *iter;

    iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter));
    if (!iter)
        return -ENOMEM;

    iter->iface = PDE_DATA(inode);
    iter->ns    = get_ipc_ns(current->nsproxy->ipc_ns);
    iter->pid_ns = get_pid_ns(task_active_pid_ns(current));

    return 0;
}

重点其实在这里:file_operations.open 被设置成 sysvipc_proc_open,打开 /proc/sysvipc/msg 时它会调用 get_ipc_ns 取得当前进程的 ipc ns 并保存到 iter->ns 中,这样就把文件读操作与当前 ns 下的 ipc 对象关联起来了。再来看看 sysvipc_msg_proc_show 的实现

static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
{
    struct pid_namespace *pid_ns = ipc_seq_pid_ns(s);
    struct user_namespace *user_ns = seq_user_ns(s);
    struct kern_ipc_perm *ipcp = it;
    struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);

    seq_printf(s,
           "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n",
           msq->q_perm.key,
           msq->q_perm.id,
           msq->q_perm.mode,
           msq->q_cbytes,
           msq->q_qnum,
           pid_nr_ns(msq->q_lspid, pid_ns),
           pid_nr_ns(msq->q_lrpid, pid_ns),
           from_kuid_munged(user_ns, msq->q_perm.uid),
           from_kgid_munged(user_ns, msq->q_perm.gid),
           from_kuid_munged(user_ns, msq->q_perm.cuid),
           from_kgid_munged(user_ns, msq->q_perm.cgid),
           msq->q_stime,
           msq->q_rtime,
           msq->q_ctime);

    return 0;
}

也很简单,容易理解:参数 it 就是当前遍历到的 kern_ipc_perm,通过 container_of 取得它所属的 msg_queue;pid 和 uid/gid 会分别按 seq_file 对应的 pid ns 和 user ns 做转换,最后用 seq_printf 按格式输出即可。

小结

现在看 ipc 和 uts 都比较简单,实现起来也不复杂。

相关文章

网友评论

      本文标题:namespaces 学习笔记3:ipc ns 源码实现

      本文链接:https://www.haomeiwen.com/subject/cgwwpctx.html