美文网首页
6. 套接字创建

6. 套接字创建

作者: 霜晨月_ScY | 来源:发表于2019-04-02 16:38 被阅读0次

    套接字创建

    在Linux下, 几乎所有的东西都是基于文件系统的,网络也同样如此。当我们创建一个套接字时,获取到的也是一个文件描述符。本文探究的是socket系统调用,当我们创建套接字时,内核需要为其准备哪些资源。

    框架

    老套路,先放一个socket创建的大框架,调用链如下:

    socket调用链.png

    内核对应处理函数

    这里就不花篇幅讨论,保护模式下,用户态进程如何通过系统调用陷入内核态的。总之,Linux系统API在内核中定义,大多是通过SYSCALL_DEFINE宏定义的。socket函数的内核对应函数如下:

    /*
     * socket(2) system call entry point. All of the work is delegated to
     * __sys_socket(); its result is returned unchanged.
     */
    SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
    {
        int result = __sys_socket(family, type, protocol);

        return result;
    }
    

    这个函数定义和API对应得非常好,名字和参数都一样。但它只是简单地包装了__sys_socket函数:

    /*
     * Kernel-side implementation of socket(2).
     *
     * @type carries both the socket type (the bits covered by SOCK_TYPE_MASK)
     * and creation flags (everything else). Any flag other than SOCK_CLOEXEC
     * and SOCK_NONBLOCK is rejected with -EINVAL. The socket is then created
     * with sock_create() and handed to sock_map_fd().
     *
     * Returns the result of sock_map_fd(), or the negative value reported by
     * sock_create() when creation fails.
     */
    int __sys_socket(int family, int type, int protocol)
    {
        struct socket *sock;
        int sock_flags;
        int err;

        /* Compile-time sanity checks on the SOCK_* constants. */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        /* Separate the creation flags from the socket type proper. */
        sock_flags = type & ~SOCK_TYPE_MASK;
        type &= SOCK_TYPE_MASK;

        if (sock_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
            return -EINVAL;

        /* Where the two constants differ, express SOCK_NONBLOCK as O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (sock_flags & SOCK_NONBLOCK)) {
            sock_flags &= ~SOCK_NONBLOCK;
            sock_flags |= O_NONBLOCK;
        }

        err = sock_create(family, type, protocol, &sock);
        if (err < 0)
            return err;

        return sock_map_fd(sock, sock_flags & (O_CLOEXEC | O_NONBLOCK));
    }
    

    __sys_socket经过一堆参数合法性检测,最后调用了sock_create和sock_map_fd。下面就分别看下这两个函数。

    sock_create

    sock_create只是简单的包装了__sock_create:

    /*
     * Allocate a struct socket and let the address family selected by @family
     * initialise it through its registered ->create() hook.
     *
     * On success the new socket is stored in *@res and 0 is returned.
     * -ENFILE is returned when sock_alloc() fails. The remaining failure paths
     * jump to cleanup labels with err set to -EAFNOSUPPORT (no family
     * registered, or its module reference could not be taken) or to the value
     * returned by ->create() / security_socket_post_create().
     *
     * NOTE(review): this is an excerpt. The code elided at the "......" marker
     * and the targets of the goto statements (out_release, out_module_put,
     * out_module_busy, out_sock_release) are not shown here.
     */
    int __sock_create(struct net *net, int family, int type, int protocol,
                 struct socket **res, int kern)
    {
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;
        /* ...... (code elided in this excerpt) */
        
        /*
         *  Allocate the socket and allow the family to set things up. if
         *  the protocol is 0, the family is instructed to select an appropriate
         *  default.
         */
        sock = sock_alloc();
        if (!sock) {
            net_warn_ratelimited("socket: no more sockets\n");
            return -ENFILE; /* Not exactly a match, but its the
                       closest posix thing */
        }
    
        sock->type = type;
    
        rcu_read_lock();
        pf = rcu_dereference(net_families[family]); /* author's note: net_families is registered from inet_init */
        err = -EAFNOSUPPORT;
        if (!pf)
            goto out_release;
    
        /*
         * We will call the ->create function, that possibly is in a loadable
         * module, so we have to bump that loadable module refcnt first.
         */
        if (!try_module_get(pf->owner))
            goto out_release;
    
        /* Now protected by module ref count */
        rcu_read_unlock();
    
        err = pf->create(net, sock, protocol, kern);    /* invoke the create hook of the family chosen by the domain argument */
        if (err < 0)
            goto out_module_put;
    
        /*
         * Now to bump the refcnt of the [loadable] module that owns this
         * socket at sock_release time we decrement its refcnt.
         */
        if (!try_module_get(sock->ops->owner))
            goto out_module_busy;
    
        /*
         * Now that we're done with the ->create function, the [loadable]
         * module can have its refcnt decremented
         */
        module_put(pf->owner);
        err = security_socket_post_create(sock, family, type, protocol, kern);
        if (err)
            goto out_sock_release;
        *res = sock;
    
        return 0;
    }
    

    __sock_create函数通过用户传递给socket的domain参数,在net_families数组中索引到协议栈初始化时注册的struct net_proto_family,然后调用pf->create(net, sock, protocol, kern)。对于AF_INET参数,调用的是inet_create:

    /*
     * Address-family descriptor for PF_INET. Its ->create hook is
     * inet_create, and the owning module is the one this object is built into.
     */
    static const struct net_proto_family inet_family_ops = {
        .family = PF_INET,
        .create = inet_create,
        .owner  = THIS_MODULE,
    };
    
    static int inet_create(struct net *net, struct socket *sock, int protocol,
                   int kern)
    {
        struct sock *sk;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        int try_loading_module = 0;
        int err;
    
        if (protocol < 0 || protocol >= IPPROTO_MAX)
            return -EINVAL;
    
        sock->state = SS_UNCONNECTED;
    
        /* Look for the requested type/protocol pair. */
    lookup_protocol:
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {    //通过type,和protocol参数查找协议
            err = 0;
            /* Check the non-wild match. */
            if (protocol == answer->protocol) {
                if (protocol != IPPROTO_IP)
                    break;
            } else {
                /* Check for the two wild cases. */
                if (IPPROTO_IP == protocol) {
                    protocol = answer->protocol;
                    break;
                }
                if (IPPROTO_IP == answer->protocol)
                    break;
            }
            err = -EPROTONOSUPPORT;
        }
        err = -EPERM;
        if (sock->type == SOCK_RAW && !kern &&
            !ns_capable(net->user_ns, CAP_NET_RAW))
            goto out_rcu_unlock;
    
        sock->ops = answer->ops;    //注册proto_ops
        answer_prot = answer->prot; //注册proto
        answer_flags = answer->flags;
        rcu_read_unlock();
    
        WARN_ON(!answer_prot->slab);
    
        err = -ENOBUFS;
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
        if (!sk)
            goto out;
    
        err = 0;
        if (INET_PROTOSW_REUSE & answer_flags)
            sk->sk_reuse = SK_CAN_REUSE;
    
        inet = inet_sk(sk);
        inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
    
        inet->nodefrag = 0;
    
        if (SOCK_RAW == sock->type) {
            inet->inet_num = protocol;
            if (IPPROTO_RAW == protocol)
                inet->hdrincl = 1;
        }
    
        if (net->ipv4.sysctl_ip_no_pmtu_disc)
            inet->pmtudisc = IP_PMTUDISC_DONT;
        else
            inet->pmtudisc = IP_PMTUDISC_WANT;
    
        inet->inet_id = 0;
    
        sock_init_data(sock, sk);
    
        sk->sk_destruct    = inet_sock_destruct;
        sk->sk_protocol    = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
        //......
        sk_refcnt_debug_inc(sk);
    
        if (inet->inet_num) {
            /* It assumes that any protocol which allows
             * the user to assign a number at socket
             * creation time automatically
             * shares.
             */
            inet->inet_sport = htons(inet->inet_num);
            /* Add to protocol hash chains. */
            err = sk->sk_prot->hash(sk);
            if (err) {
                sk_common_release(sk);
                goto out;
            }
        }
    
        if (sk->sk_prot->init) {
            err = sk->sk_prot->init(sk);
            if (err) {
                sk_common_release(sk);
                goto out;
            }
        }
    
        if (!kern) {
            err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
            if (err) {
                sk_common_release(sk);
                goto out;
            }
        }
    out:
        return err;
    out_rcu_unlock:
        rcu_read_unlock();
        goto out;
    }
    

    inet_create函数主要有如下五步工作:

    1. sock->state = SS_UNCONNECTED;//socket 状态设置;
    2. 查找全局数组inetsw(在inet_init函数中初始化)中对应的协议操作集合,最重要的是struct proto和struct proto_ops,分别用于处理四层和socket相关的内容;
    3. 调用sk_alloc,分配一个struct sock。并将proto类型的指针指向第二步获得的内容;
    4. struct inet_sock是struct sock的超集,具体参见include/net/inet_sock.h中inet_sock的定义。初始化inet_sock,调用sock_init_data,形成socket和sock一一对应的关系,相互有指针指向对方;
    5. 最后调用proto中注册的init函数,err = sk->sk_prot->init(sk) 。如果对应于TCP,其函数指针指向tcp_v4_init_sock。在tcp_v4_init_sock中会向sock注册一组四层协议相关的处理函数ipv4_specific,这个后面分析。

    sock_map_fd

    sock_create分析完了接着看sock_map_fd

    /*
     * Bind @sock to a newly reserved file descriptor.
     *
     * Reserves an unused descriptor, wraps @sock in a struct file via
     * sock_alloc_file() and installs that file under the descriptor.
     *
     * Returns the descriptor on success. If no descriptor can be reserved the
     * socket is released and the negative value from get_unused_fd_flags() is
     * returned; if sock_alloc_file() fails the reserved descriptor is given
     * back and the error encoded in the returned pointer is returned.
     */
    static int sock_map_fd(struct socket *sock, int flags)
    {
        struct file *filp;
        int fd;

        fd = get_unused_fd_flags(flags);
        if (unlikely(fd < 0)) {
            sock_release(sock);
            return fd;
        }

        filp = sock_alloc_file(sock, flags, NULL);
        if (unlikely(IS_ERR(filp))) {
            /* No file to install: hand the reserved descriptor back. */
            put_unused_fd(fd);
            return PTR_ERR(filp);
        }

        fd_install(fd, filp);
        return fd;
    }
    

    首先分配一个未使用的文件描述符,也就是用户态socket最后返回的那个fd。然后通过sock_alloc_file分配并用socket结构体初始化一个file。最后fd_install将fd和file关联起来,这样当用户态操作文件描述符时,内核才能知道具体的file对象。这其中最重要的一步是newfile = sock_alloc_file(sock, flags, NULL);。我们来分析下这个函数:

    /*
     *  Obtains the first available file descriptor and sets it up for use.
     *
     *  These functions create file structures and maps them to fd space
     *  of the current process. On success it returns file descriptor
     *  and file struct implicitly stored in sock->file.
     *  Note that another thread may close file descriptor before we return
     *  from this function. We use the fact that now we do not refer
     *  to socket after mapping. If one day we will need it, this
     *  function will increment ref. count on file by 1.
     *
     *  In any case returned fd MAY BE not valid!
     *  This race condition is unavoidable
     *  with shared fd spaces, we cannot solve it inside kernel,
     *  but we take care of internal coherence yet.
     *
     *  sock_alloc_file() itself builds the struct file for @sock: it
     *  allocates a pseudo dentry on sock_mnt, instantiates it with the
     *  socket's inode, and allocates a read/write file using
     *  socket_file_ops. On success the file and the socket point at each
     *  other (sock->file and file->private_data) and the file is returned.
     *  On failure the socket is released here and an error pointer is
     *  returned (ERR_PTR(-ENOMEM) when the dentry cannot be allocated,
     *  otherwise the error pointer produced by alloc_file()).
     */
    
    struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
    {
        struct qstr name = { .name = "" };
        struct path path;
        struct file *file;
    
        /*
         * Dentry name: @dname when supplied, otherwise the name of the
         * protocol that created sock->sk, otherwise the empty string.
         */
        if (dname) {
            name.name = dname;
            name.len = strlen(name.name);
        } else if (sock->sk) {
            name.name = sock->sk->sk_prot_creator->name;
            name.len = strlen(name.name);
        }
        path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
        if (unlikely(!path.dentry)) {
            sock_release(sock);
            return ERR_PTR(-ENOMEM);
        }
        path.mnt = mntget(sock_mnt);
    
        d_instantiate(path.dentry, SOCK_INODE(sock));
    
        file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
              &socket_file_ops);
        if (IS_ERR(file)) {
            /* drop dentry, keep inode for a bit */
            ihold(d_inode(path.dentry));
            path_put(&path);
            /* ... and now kill it properly */
            sock_release(sock);
            return file;
        }
    
        sock->file = file;
        /* Only O_NONBLOCK is carried over from @flags into f_flags. */
        file->f_flags = O_RDWR | (flags & O_NONBLOCK);
        file->private_data = sock;      /* store sock in file->private_data */
        return file;
    }
    

    主要做了两件事情:

    1. 调用alloc_file函数,分配file并向其注册socket_file_ops。socket_file_ops就像其他文件操作一样,是一组通用函数。它们最终会通过file->private_data调用注册对象真正的操作函数。
    2. 将sock注册到file->private_data。

    这之后,当用户通过read、write系统调用来读写套接字时,便会调用到socket_file_ops注册的sock_read_iter、sock_write_iter函数。

    /*
     * File operations table for socket-backed files; this is the table that
     * sock_alloc_file() passes to alloc_file(). Seeking is routed to
     * no_llseek; the remaining entries point at sock_* handlers.
     */
    static const struct file_operations socket_file_ops = {
        .owner =    THIS_MODULE,
        .llseek =   no_llseek,
        .read_iter =    sock_read_iter,
        .write_iter =   sock_write_iter,
        .poll =     sock_poll,
        .unlocked_ioctl = sock_ioctl,
    /* The compat ioctl handler is only present when CONFIG_COMPAT is set. */
    #ifdef CONFIG_COMPAT
        .compat_ioctl = compat_sock_ioctl,
    #endif
        .mmap =     sock_mmap,
        .release =  sock_close,
        .fasync =   sock_fasync,
        .sendpage = sock_sendpage,
        .splice_write = generic_splice_sendpage,
        .splice_read =  sock_splice_read,
    };
    

    总结

    我们简单验证下,sock_read_iter最后是不是调用了注册到sock的proto中的函数:

    /*
     * ->read_iter handler for socket files.
     *
     * Recovers the struct socket from the file's private_data, wraps the
     * destination iterator in a msghdr and forwards the request to
     * sock_recvmsg(). MSG_DONTWAIT is passed when the file has O_NONBLOCK set.
     *
     * Returns -ESPIPE for a non-zero file position, 0 when the destination is
     * empty, otherwise the result of sock_recvmsg(). The iterator state left
     * in the msghdr is copied back into @to after the receive.
     */
    static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
    {
        struct file *filp = iocb->ki_filp;          /* file being read */
        struct socket *sock = filp->private_data;   /* socket stored on that file */
        struct msghdr msg = {
            .msg_iter = *to,
            .msg_iocb = iocb,
        };
        ssize_t nread;

        /* Sockets have no file position. */
        if (iocb->ki_pos != 0)
            return -ESPIPE;

        /* Match SYS5 behaviour */
        if (!iov_iter_count(to))
            return 0;

        if (filp->f_flags & O_NONBLOCK)
            msg.msg_flags = MSG_DONTWAIT;

        nread = sock_recvmsg(sock, &msg, msg.msg_flags);
        *to = msg.msg_iter;
        return nread;
    }
    
    /*
     * Receive a message on @sock after consulting the security hook.
     *
     * A non-zero result from security_socket_recvmsg() is returned as is;
     * otherwise the call is forwarded to sock_recvmsg_nosec().
     */
    int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
    {
        int rc;

        rc = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
        if (rc)
            return rc;

        return sock_recvmsg_nosec(sock, msg, flags);
    }
    
    /*
     * Receive a message on @sock without a security check in this function:
     * dispatches straight to the recvmsg operation installed in sock->ops,
     * passing the amount of data left in @msg as the length.
     */
    static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                         int flags)
    {
        const size_t len = msg_data_left(msg);

        return sock->ops->recvmsg(sock, msg, len, flags);   /* final dispatch */
    }
    

    从上面的调用,可以很清楚地发现,最后就是简单地调用sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);。sock->ops是在inet_create函数中注册到socket的struct proto_ops,它的recvmsg再通过sk->sk_prot调用注册到sock的struct proto中的函数。如果用户在创建socket时用的是int fd = socket(AF_INET, SOCK_STREAM, 0),最终就会调用到tcp_recvmsg:

    /*
     * Protocol operations table named "TCP". Only some of its entries are
     * reproduced in this excerpt; the receive path is served by tcp_recvmsg
     * and the send paths by tcp_sendmsg / tcp_sendpage.
     */
    struct proto tcp_prot = {
        .name           = "TCP",
        .owner          = THIS_MODULE,
        .close          = tcp_close,
        .pre_connect        = tcp_v4_pre_connect,
        /* ...... (entries elided in this excerpt) */
        .recvmsg        = tcp_recvmsg,
        .sendmsg        = tcp_sendmsg,
        .sendpage       = tcp_sendpage,
        /* ...... (entries elided in this excerpt) */
    };
    

    好了,到这里,我们大致知道了,socket创建时,内核所做的工作。最后创建了如下一个数据结构:

    last.png

    相关文章

      网友评论

          本文标题:6. 套接字创建

          本文链接:https://www.haomeiwen.com/subject/egbgbqtx.html