Socket Creation
On Linux, almost everything is built on top of the file system, and networking is no exception: creating a socket also hands back a file descriptor. This article digs into the socket system call to see what resources the kernel has to prepare when we create a socket.
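Before diving into the kernel, here is a minimal user-space sketch (not taken from the kernel source) of the call whose implementation we are about to trace. The returned descriptor behaves like any other fd:

#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        /* AF_INET + SOCK_STREAM + protocol 0: the kernel picks TCP for us. */
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd >= 0)
                close(fd);      /* a socket fd is closed like any other file */
        return 0;
}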
Framework
As usual, let's start with the big picture of socket creation. The call chain is as follows:
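socket()                                 (user space)
 -> SYSCALL_DEFINE3(socket, ...)         (kernel entry)
     -> __sys_socket()
         -> sock_create()
             -> __sock_create()
                 -> pf->create()         (inet_create() for AF_INET)
         -> sock_map_fd()
             -> sock_alloc_file()
             -> fd_install()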
The corresponding kernel handler
We will not spend space here on how, under protected mode, a user-space process traps into the kernel through a system call. Suffice it to say that Linux system APIs are defined in the kernel, most of them through the SYSCALL_DEFINE family of macros. The kernel-side counterpart of the socket function is:
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
        return __sys_socket(family, type, protocol);
}
This definition matches the user API exactly — same name, same parameters — but it is just a thin wrapper around __sys_socket:
int __sys_socket(int family, int type, int protocol)
{
        int retval;
        struct socket *sock;
        int flags;

        /* Check the SOCK_* constants for consistency. */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        flags = type & ~SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;
        type &= SOCK_TYPE_MASK;

        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

        retval = sock_create(family, type, protocol, &sock);
        if (retval < 0)
                return retval;

        return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}
After a batch of argument sanity checks, __sys_socket calls sock_create and then sock_map_fd. We will look at these two functions in turn.
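Those flag bits deserve a quick demonstration. User space may OR SOCK_NONBLOCK and SOCK_CLOEXEC into the type argument, and __sys_socket separates them again via SOCK_TYPE_MASK, exactly as the masking above shows. A minimal sketch (assumes Linux, where these flags exist):

#define _GNU_SOURCE             /* exposes SOCK_NONBLOCK / SOCK_CLOEXEC in glibc */
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        /* The low bits carry the real type (SOCK_STREAM); the high bits carry
         * creation flags that __sys_socket strips with type & ~SOCK_TYPE_MASK
         * before handing the clean type to sock_create(). */
        int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);

        if (fd >= 0)
                close(fd);
        return 0;
}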
sock_create
sock_create is itself just a thin wrapper around __sock_create:
int __sock_create(struct net *net, int family, int type, int protocol,
                  struct socket **res, int kern)
{
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;

        //......

        /*
         * Allocate the socket and allow the family to set things up. if
         * the protocol is 0, the family is instructed to select an appropriate
         * default.
         */
        sock = sock_alloc();
        if (!sock) {
                net_warn_ratelimited("socket: no more sockets\n");
                return -ENFILE; /* Not exactly a match, but its the
                                   closest posix thing */
        }

        sock->type = type;

        rcu_read_lock();
        pf = rcu_dereference(net_families[family]); /* net_families entries are registered in inet_init */
        err = -EAFNOSUPPORT;
        if (!pf)
                goto out_release;

        /*
         * We will call the ->create function, that possibly is in a loadable
         * module, so we have to bump that loadable module refcnt first.
         */
        if (!try_module_get(pf->owner))
                goto out_release;

        /* Now protected by module ref count */
        rcu_read_unlock();

        err = pf->create(net, sock, protocol, kern); /* dispatch to the create function of the requested domain */
        if (err < 0)
                goto out_module_put;

        /*
         * Now to bump the refcnt of the [loadable] module that owns this
         * socket at sock_release time we decrement its refcnt.
         */
        if (!try_module_get(sock->ops->owner))
                goto out_module_busy;

        /*
         * Now that we're done with the ->create function, the [loadable]
         * module can have its refcnt decremented
         */
        module_put(pf->owner);

        err = security_socket_post_create(sock, family, type, protocol, kern);
        if (err)
                goto out_sock_release;

        *res = sock;
        return 0;

        //...... (error-handling labels elided)
}
__sock_create uses the domain argument that the user passed to socket to index the net_families array and retrieve the struct net_proto_family registered during protocol-stack initialization, then invokes pf->create(net, sock, protocol, kern). For AF_INET, that is inet_create:
static const struct net_proto_family inet_family_ops = {
        .family = PF_INET,
        .create = inet_create,
        .owner  = THIS_MODULE,
};
static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
        struct sock *sk;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        int try_loading_module = 0;
        int err;

        if (protocol < 0 || protocol >= IPPROTO_MAX)
                return -EINVAL;

        sock->state = SS_UNCONNECTED;

        /* Look for the requested type/protocol pair. */
lookup_protocol:
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { /* look up the protocol by the type and protocol arguments */
                err = 0;
                /* Check the non-wild match. */
                if (protocol == answer->protocol) {
                        if (protocol != IPPROTO_IP)
                                break;
                } else {
                        /* Check for the two wild cases. */
                        if (IPPROTO_IP == protocol) {
                                protocol = answer->protocol;
                                break;
                        }
                        if (IPPROTO_IP == answer->protocol)
                                break;
                }
                err = -EPROTONOSUPPORT;
        }

        err = -EPERM;
        if (sock->type == SOCK_RAW && !kern &&
            !ns_capable(net->user_ns, CAP_NET_RAW))
                goto out_rcu_unlock;

        sock->ops = answer->ops;        /* register the proto_ops */
        answer_prot = answer->prot;     /* register the proto */
        answer_flags = answer->flags;
        rcu_read_unlock();

        WARN_ON(!answer_prot->slab);

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
        if (!sk)
                goto out;

        err = 0;
        if (INET_PROTOSW_REUSE & answer_flags)
                sk->sk_reuse = SK_CAN_REUSE;

        inet = inet_sk(sk);
        inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

        inet->nodefrag = 0;

        if (SOCK_RAW == sock->type) {
                inet->inet_num = protocol;
                if (IPPROTO_RAW == protocol)
                        inet->hdrincl = 1;
        }

        if (net->ipv4.sysctl_ip_no_pmtu_disc)
                inet->pmtudisc = IP_PMTUDISC_DONT;
        else
                inet->pmtudisc = IP_PMTUDISC_WANT;

        inet->inet_id = 0;

        sock_init_data(sock, sk);

        sk->sk_destruct    = inet_sock_destruct;
        sk->sk_protocol    = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

        //......

        sk_refcnt_debug_inc(sk);

        if (inet->inet_num) {
                /* It assumes that any protocol which allows
                 * the user to assign a number at socket
                 * creation time automatically
                 * shares.
                 */
                inet->inet_sport = htons(inet->inet_num);
                /* Add to protocol hash chains. */
                err = sk->sk_prot->hash(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }

        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }

        if (!kern) {
                err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }
out:
        return err;
out_rcu_unlock:
        rcu_read_unlock();
        goto out;
}
inet_create does five main pieces of work:
- sock->state = SS_UNCONNECTED; — set the initial socket state;
- look up the entry matching the type/protocol pair in the global inetsw array (initialized in inet_init). The most important pieces found there are the struct proto and struct proto_ops, which handle layer-4 work and socket-level work respectively;
- call sk_alloc to allocate a struct sock, and point its proto pointer at what step 2 found;
- initialize the inet_sock and call sock_init_data. struct inet_sock is a superset of struct sock (see the definition of inet_sock in include/net/inet_sock.h, and the sketch after this list). This establishes the one-to-one relationship between the socket and the sock, each holding a pointer to the other;
- finally, call the init function registered in the proto: err = sk->sk_prot->init(sk). For TCP this pointer is tcp_v4_init_sock, which registers ipv4_specific, a set of layer-4 protocol handlers, on the sock; we will analyze that later.
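The "superset" in step 4 is plain C struct embedding: struct inet_sock begins with a struct sock, so the kernel converts between the two with a cast. A simplified sketch of the idea (the real definitions in include/net/inet_sock.h carry many more fields):

/* Simplified -- illustrative fields only. */
struct inet_sock {
        struct sock     sk;             /* must be first: an inet_sock pointer
                                         * doubles as a sock pointer */
        __be32          inet_saddr;     /* local address */
        __be16          inet_sport;     /* local port */
        /* ...... */
};

static inline struct inet_sock *inet_sk(const struct sock *sk)
{
        return (struct inet_sock *)sk;  /* safe only because sk is the first member */
}

This is why inet_create can call inet = inet_sk(sk) right after sk_alloc and fill in IP-level fields on the same object.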
sock_map_fd
With sock_create covered, let's move on to sock_map_fd:
static int sock_map_fd(struct socket *sock, int flags)
{
        struct file *newfile;
        int fd = get_unused_fd_flags(flags);
        if (unlikely(fd < 0)) {
                sock_release(sock);
                return fd;
        }

        newfile = sock_alloc_file(sock, flags, NULL);
        if (likely(!IS_ERR(newfile))) {
                fd_install(fd, newfile);
                return fd;
        }

        put_unused_fd(fd);
        return PTR_ERR(newfile);
}
First an unused file descriptor is allocated — the very fd that socket ultimately returns to user space. Then sock_alloc_file allocates a struct file and initializes it from the socket. Finally fd_install binds the fd to the file, so that when user space operates on the descriptor, the kernel can locate the underlying file object. The key step is newfile = sock_alloc_file(sock, flags, NULL); — let's analyze that function:
/*
 * Obtains the first available file descriptor and sets it up for use.
 *
 * These functions create file structures and maps them to fd space
 * of the current process. On success it returns file descriptor
 * and file struct implicitly stored in sock->file.
 * Note that another thread may close file descriptor before we return
 * from this function. We use the fact that now we do not refer
 * to socket after mapping. If one day we will need it, this
 * function will increment ref. count on file by 1.
 *
 * In any case returned fd MAY BE not valid!
 * This race condition is unavoidable
 * with shared fd spaces, we cannot solve it inside kernel,
 * but we take care of internal coherence yet.
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
        struct qstr name = { .name = "" };
        struct path path;
        struct file *file;

        if (dname) {
                name.name = dname;
                name.len = strlen(name.name);
        } else if (sock->sk) {
                name.name = sock->sk->sk_prot_creator->name;
                name.len = strlen(name.name);
        }
        path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
        if (unlikely(!path.dentry)) {
                sock_release(sock);
                return ERR_PTR(-ENOMEM);
        }
        path.mnt = mntget(sock_mnt);

        d_instantiate(path.dentry, SOCK_INODE(sock));

        file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
                          &socket_file_ops);
        if (IS_ERR(file)) {
                /* drop dentry, keep inode for a bit */
                ihold(d_inode(path.dentry));
                path_put(&path);
                /* ... and now kill it properly */
                sock_release(sock);
                return file;
        }

        sock->file = file;
        file->f_flags = O_RDWR | (flags & O_NONBLOCK);
        file->private_data = sock;      /* save the socket in file->private_data */
        return file;
}
It does two main things:
- calls alloc_file, which allocates a struct file and registers socket_file_ops on it. Like any other set of file operations, socket_file_ops is a group of generic functions; they ultimately reach the real object's handlers through file->private_data;
- stores the socket in file->private_data.
From then on, when user space reads or writes the socket through the read and write system calls, control reaches the sock_read_iter and sock_write_iter functions registered in socket_file_ops.
static const struct file_operations socket_file_ops = {
        .owner          = THIS_MODULE,
        .llseek         = no_llseek,
        .read_iter      = sock_read_iter,
        .write_iter     = sock_write_iter,
        .poll           = sock_poll,
        .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_sock_ioctl,
#endif
        .mmap           = sock_mmap,
        .release        = sock_close,
        .fasync         = sock_fasync,
        .sendpage       = sock_sendpage,
        .splice_write   = generic_splice_sendpage,
        .splice_read    = sock_splice_read,
};
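As a consequence of this dispatch table, ordinary file I/O works on sockets. A hypothetical user-space snippet (server address and error handling simplified, purely for illustration):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(80) };
        char buf[256];
        int fd = socket(AF_INET, SOCK_STREAM, 0);       /* the path traced above */

        inet_pton(AF_INET, "93.184.216.34", &addr.sin_addr);    /* illustrative server */
        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
                const char *req = "GET / HTTP/1.0\r\n\r\n";
                write(fd, req, strlen(req));    /* file API -> sock_write_iter */
                read(fd, buf, sizeof(buf));     /* file API -> sock_read_iter  */
        }
        close(fd);                              /* -> sock_close -> sock_release */
        return 0;
}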
Summary
As a quick check, let's verify that sock_read_iter really does end up in the handlers registered on the sock.
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;              /* the file being operated on */
        struct socket *sock = file->private_data;       /* recover the socket from it */
        struct msghdr msg = {.msg_iter = *to,
                             .msg_iocb = iocb};
        ssize_t res;

        if (file->f_flags & O_NONBLOCK)
                msg.msg_flags = MSG_DONTWAIT;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        if (!iov_iter_count(to))        /* Match SYS5 behaviour */
                return 0;

        res = sock_recvmsg(sock, &msg, msg.msg_flags);  /* hand the socket to sock_recvmsg */
        *to = msg.msg_iter;
        return res;
}
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
        int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);

        return err ?: sock_recvmsg_nosec(sock, msg, flags);
}
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                     int flags)
{
        return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);        /* the final dispatch */
}
The call path above makes it plain: everything funnels into sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags); — the struct proto_ops registered in inet_create. For a stream socket that is inet_recvmsg, which in turn invokes sk->sk_prot->recvmsg, i.e. the struct proto registered there. So if the user created the socket with int fd = socket(AF_INET, SOCK_STREAM, 0), the call lands in tcp_recvmsg.
struct proto tcp_prot = {
        .name           = "TCP",
        .owner          = THIS_MODULE,
        .close          = tcp_close,
        .pre_connect    = tcp_v4_pre_connect,
        //......
        .recvmsg        = tcp_recvmsg,
        .sendmsg        = tcp_sendmsg,
        .sendpage       = tcp_sendpage,
        //......
};
And that, roughly, is the work the kernel does when a socket is created. What it leaves behind is the web of data structures sketched below:
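(A reconstructed sketch of the relationships established above; arrows denote pointers, with the AF_INET/SOCK_STREAM names shown.)

user fd
 -> struct file
      .f_op         -> socket_file_ops
      .private_data -> struct socket
                         .ops  -> struct proto_ops      (inet_stream_ops for TCP)
                         .file -> (back to the struct file)
                         .sk   -> struct sock           (embedded in struct inet_sock)
                                    .sk_socket -> (back to the struct socket)
                                    .sk_prot   -> struct proto      (tcp_prot for TCP)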