Socket Creation
On Linux, almost everything is built on top of the file system, and networking is no exception: creating a socket also hands back a file descriptor. This article digs into the socket system call to see what resources the kernel has to prepare when we create a socket.
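Before diving into the kernel, here is a minimal user-space sketch (not taken from the kernel source) of the call whose implementation we are about to trace. The returned descriptor behaves like any other fd:

#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        /* AF_INET + SOCK_STREAM + protocol 0: the kernel picks TCP for us. */
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd >= 0)
                close(fd);      /* a socket fd is closed like any other file */
        return 0;
}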
Framework
As usual, let's start with the big picture of socket creation. The call chain is as follows:
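socket()                                 (user space)
 -> SYSCALL_DEFINE3(socket, ...)         (kernel entry)
     -> __sys_socket()
         -> sock_create()
             -> __sock_create()
                 -> pf->create()         (inet_create() for AF_INET)
         -> sock_map_fd()
             -> sock_alloc_file()
             -> fd_install()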
The corresponding kernel handler
We will not spend space here on how, under protected mode, a user-space process traps into the kernel through a system call. Suffice it to say that Linux system APIs are defined in the kernel, most of them through the SYSCALL_DEFINE family of macros. The kernel-side counterpart of the socket function is:
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
        return __sys_socket(family, type, protocol);
}
This definition matches the user API exactly — same name, same parameters — but it is just a thin wrapper around __sys_socket:
int __sys_socket(int family, int type, int protocol)
{
        int retval;
        struct socket *sock;
        int flags;

        /* Check the SOCK_* constants for consistency. */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        flags = type & ~SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;
        type &= SOCK_TYPE_MASK;

        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

        retval = sock_create(family, type, protocol, &sock);
        if (retval < 0)
                return retval;

        return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}
After a batch of argument sanity checks, __sys_socket calls sock_create and then sock_map_fd. We will look at these two functions in turn.
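Those flag bits deserve a quick demonstration. User space may OR SOCK_NONBLOCK and SOCK_CLOEXEC into the type argument, and __sys_socket separates them again via SOCK_TYPE_MASK, exactly as the masking above shows. A minimal sketch (assumes Linux, where these flags exist):

#define _GNU_SOURCE             /* exposes SOCK_NONBLOCK / SOCK_CLOEXEC in glibc */
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        /* The low bits carry the real type (SOCK_STREAM); the high bits carry
         * creation flags that __sys_socket strips with type & ~SOCK_TYPE_MASK
         * before handing the clean type to sock_create(). */
        int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);

        if (fd >= 0)
                close(fd);
        return 0;
}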
sock_create
sock_create is itself just a thin wrapper around __sock_create:
int __sock_create(struct net *net, int family, int type, int protocol,
                  struct socket **res, int kern)
{
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;

        //......

        /*
         * Allocate the socket and allow the family to set things up. if
         * the protocol is 0, the family is instructed to select an appropriate
         * default.
         */
        sock = sock_alloc();
        if (!sock) {
                net_warn_ratelimited("socket: no more sockets\n");
                return -ENFILE; /* Not exactly a match, but its the
                                   closest posix thing */
        }

        sock->type = type;

        rcu_read_lock();
        pf = rcu_dereference(net_families[family]); /* net_families entries are registered in inet_init */
        err = -EAFNOSUPPORT;
        if (!pf)
                goto out_release;

        /*
         * We will call the ->create function, that possibly is in a loadable
         * module, so we have to bump that loadable module refcnt first.
         */
        if (!try_module_get(pf->owner))
                goto out_release;

        /* Now protected by module ref count */
        rcu_read_unlock();

        err = pf->create(net, sock, protocol, kern); /* dispatch to the create function of the requested domain */
        if (err < 0)
                goto out_module_put;

        /*
         * Now to bump the refcnt of the [loadable] module that owns this
         * socket at sock_release time we decrement its refcnt.
         */
        if (!try_module_get(sock->ops->owner))
                goto out_module_busy;

        /*
         * Now that we're done with the ->create function, the [loadable]
         * module can have its refcnt decremented
         */
        module_put(pf->owner);

        err = security_socket_post_create(sock, family, type, protocol, kern);
        if (err)
                goto out_sock_release;

        *res = sock;
        return 0;

        //...... (error-handling labels elided)
}
__sock_create uses the domain argument that the user passed to socket to index the net_families array and retrieve the struct net_proto_family registered during protocol-stack initialization, then invokes pf->create(net, sock, protocol, kern). For AF_INET, that is inet_create:
static const struct net_proto_family inet_family_ops = {
        .family = PF_INET,
        .create = inet_create,
        .owner  = THIS_MODULE,
};
static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
        struct sock *sk;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        int try_loading_module = 0;
        int err;

        if (protocol < 0 || protocol >= IPPROTO_MAX)
                return -EINVAL;

        sock->state = SS_UNCONNECTED;

        /* Look for the requested type/protocol pair. */
lookup_protocol:
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { /* look up the protocol by the type and protocol arguments */
                err = 0;
                /* Check the non-wild match. */
                if (protocol == answer->protocol) {
                        if (protocol != IPPROTO_IP)
                                break;
                } else {
                        /* Check for the two wild cases. */
                        if (IPPROTO_IP == protocol) {
                                protocol = answer->protocol;
                                break;
                        }
                        if (IPPROTO_IP == answer->protocol)
                                break;
                }
                err = -EPROTONOSUPPORT;
        }

        err = -EPERM;
        if (sock->type == SOCK_RAW && !kern &&
            !ns_capable(net->user_ns, CAP_NET_RAW))
                goto out_rcu_unlock;

        sock->ops = answer->ops;        /* register the proto_ops */
        answer_prot = answer->prot;     /* register the proto */
        answer_flags = answer->flags;
        rcu_read_unlock();

        WARN_ON(!answer_prot->slab);

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
        if (!sk)
                goto out;

        err = 0;
        if (INET_PROTOSW_REUSE & answer_flags)
                sk->sk_reuse = SK_CAN_REUSE;

        inet = inet_sk(sk);
        inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

        inet->nodefrag = 0;

        if (SOCK_RAW == sock->type) {
                inet->inet_num = protocol;
                if (IPPROTO_RAW == protocol)
                        inet->hdrincl = 1;
        }

        if (net->ipv4.sysctl_ip_no_pmtu_disc)
                inet->pmtudisc = IP_PMTUDISC_DONT;
        else
                inet->pmtudisc = IP_PMTUDISC_WANT;

        inet->inet_id = 0;

        sock_init_data(sock, sk);

        sk->sk_destruct    = inet_sock_destruct;
        sk->sk_protocol    = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

        //......

        sk_refcnt_debug_inc(sk);

        if (inet->inet_num) {
                /* It assumes that any protocol which allows
                 * the user to assign a number at socket
                 * creation time automatically
                 * shares.
                 */
                inet->inet_sport = htons(inet->inet_num);
                /* Add to protocol hash chains. */
                err = sk->sk_prot->hash(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }

        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }

        if (!kern) {
                err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }
out:
        return err;
out_rcu_unlock:
        rcu_read_unlock();
        goto out;
}
inet_create does five main pieces of work:
- sock->state = SS_UNCONNECTED; — set the initial socket state;
- look up the entry matching the type/protocol pair in the global inetsw array (initialized in inet_init). The most important pieces found there are the struct proto and struct proto_ops, which handle layer-4 work and socket-level work respectively;
- call sk_alloc to allocate a struct sock, and point its proto pointer at what step 2 found;
- initialize the inet_sock and call sock_init_data. struct inet_sock is a superset of struct sock (see the definition of inet_sock in include/net/inet_sock.h, and the sketch after this list). This establishes the one-to-one relationship between the socket and the sock, each holding a pointer to the other;
- finally, call the init function registered in the proto: err = sk->sk_prot->init(sk). For TCP this pointer is tcp_v4_init_sock, which registers ipv4_specific, a set of layer-4 protocol handlers, on the sock; we will analyze that later.
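The "superset" in step 4 is plain C struct embedding: struct inet_sock begins with a struct sock, so the kernel converts between the two with a cast. A simplified sketch of the idea (the real definitions in include/net/inet_sock.h carry many more fields):

/* Simplified -- illustrative fields only. */
struct inet_sock {
        struct sock     sk;             /* must be first: an inet_sock pointer
                                         * doubles as a sock pointer */
        __be32          inet_saddr;     /* local address */
        __be16          inet_sport;     /* local port */
        /* ...... */
};

static inline struct inet_sock *inet_sk(const struct sock *sk)
{
        return (struct inet_sock *)sk;  /* safe only because sk is the first member */
}

This is why inet_create can call inet = inet_sk(sk) right after sk_alloc and fill in IP-level fields on the same object.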
sock_map_fd
With sock_create covered, let's move on to sock_map_fd:
static int sock_map_fd(struct socket *sock, int flags)
{
        struct file *newfile;
        int fd = get_unused_fd_flags(flags);
        if (unlikely(fd < 0)) {
                sock_release(sock);
                return fd;
        }

        newfile = sock_alloc_file(sock, flags, NULL);
        if (likely(!IS_ERR(newfile))) {
                fd_install(fd, newfile);
                return fd;
        }

        put_unused_fd(fd);
        return PTR_ERR(newfile);
}
First an unused file descriptor is allocated — the very fd that socket ultimately returns to user space. Then sock_alloc_file allocates a struct file and initializes it from the socket. Finally fd_install binds the fd to the file, so that when user space operates on the descriptor, the kernel can locate the underlying file object. The key step is newfile = sock_alloc_file(sock, flags, NULL); — let's analyze that function:
/*
 * Obtains the first available file descriptor and sets it up for use.
 *
 * These functions create file structures and maps them to fd space
 * of the current process. On success it returns file descriptor
 * and file struct implicitly stored in sock->file.
 * Note that another thread may close file descriptor before we return
 * from this function. We use the fact that now we do not refer
 * to socket after mapping. If one day we will need it, this
 * function will increment ref. count on file by 1.
 *
 * In any case returned fd MAY BE not valid!
 * This race condition is unavoidable
 * with shared fd spaces, we cannot solve it inside kernel,
 * but we take care of internal coherence yet.
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
        struct qstr name = { .name = "" };
        struct path path;
        struct file *file;

        if (dname) {
                name.name = dname;
                name.len = strlen(name.name);
        } else if (sock->sk) {
                name.name = sock->sk->sk_prot_creator->name;
                name.len = strlen(name.name);
        }
        path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
        if (unlikely(!path.dentry)) {
                sock_release(sock);
                return ERR_PTR(-ENOMEM);
        }
        path.mnt = mntget(sock_mnt);

        d_instantiate(path.dentry, SOCK_INODE(sock));

        file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
                          &socket_file_ops);
        if (IS_ERR(file)) {
                /* drop dentry, keep inode for a bit */
                ihold(d_inode(path.dentry));
                path_put(&path);
                /* ... and now kill it properly */
                sock_release(sock);
                return file;
        }

        sock->file = file;
        file->f_flags = O_RDWR | (flags & O_NONBLOCK);
        file->private_data = sock;      /* save the socket in file->private_data */
        return file;
}
It does two main things:
- calls alloc_file, which allocates a struct file and registers socket_file_ops on it. Like any other set of file operations, socket_file_ops is a group of generic functions; they ultimately reach the real object's handlers through file->private_data;
- stores the socket in file->private_data.
From then on, when user space reads or writes the socket through the read and write system calls, control reaches the sock_read_iter and sock_write_iter functions registered in socket_file_ops.
static const struct file_operations socket_file_ops = {
        .owner          = THIS_MODULE,
        .llseek         = no_llseek,
        .read_iter      = sock_read_iter,
        .write_iter     = sock_write_iter,
        .poll           = sock_poll,
        .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_sock_ioctl,
#endif
        .mmap           = sock_mmap,
        .release        = sock_close,
        .fasync         = sock_fasync,
        .sendpage       = sock_sendpage,
        .splice_write   = generic_splice_sendpage,
        .splice_read    = sock_splice_read,
};
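As a consequence of this dispatch table, ordinary file I/O works on sockets. A hypothetical user-space snippet (server address and error handling simplified, purely for illustration):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(80) };
        char buf[256];
        int fd = socket(AF_INET, SOCK_STREAM, 0);       /* the path traced above */

        inet_pton(AF_INET, "93.184.216.34", &addr.sin_addr);    /* illustrative server */
        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
                const char *req = "GET / HTTP/1.0\r\n\r\n";
                write(fd, req, strlen(req));    /* file API -> sock_write_iter */
                read(fd, buf, sizeof(buf));     /* file API -> sock_read_iter  */
        }
        close(fd);                              /* -> sock_close -> sock_release */
        return 0;
}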
Summary
As a quick check, let's verify that sock_read_iter really does end up in the handlers registered on the sock.
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;              /* the file being operated on */
        struct socket *sock = file->private_data;       /* recover the socket from it */
        struct msghdr msg = {.msg_iter = *to,
                             .msg_iocb = iocb};
        ssize_t res;

        if (file->f_flags & O_NONBLOCK)
                msg.msg_flags = MSG_DONTWAIT;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        if (!iov_iter_count(to))        /* Match SYS5 behaviour */
                return 0;

        res = sock_recvmsg(sock, &msg, msg.msg_flags);  /* hand the socket to sock_recvmsg */
        *to = msg.msg_iter;
        return res;
}
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
        int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);

        return err ?: sock_recvmsg_nosec(sock, msg, flags);
}
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                     int flags)
{
        return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);        /* the final dispatch */
}
The call path above makes it plain: everything funnels into sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags); — the struct proto_ops registered in inet_create. For a stream socket that is inet_recvmsg, which in turn invokes sk->sk_prot->recvmsg, i.e. the struct proto registered there. So if the user created the socket with int fd = socket(AF_INET, SOCK_STREAM, 0), the call lands in tcp_recvmsg.
struct proto tcp_prot = {
        .name           = "TCP",
        .owner          = THIS_MODULE,
        .close          = tcp_close,
        .pre_connect    = tcp_v4_pre_connect,
        //......
        .recvmsg        = tcp_recvmsg,
        .sendmsg        = tcp_sendmsg,
        .sendpage       = tcp_sendpage,
        //......
};
And that, roughly, is the work the kernel does when a socket is created. What it leaves behind is the web of data structures sketched below:
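(A reconstructed sketch of the relationships established above; arrows denote pointers, with the AF_INET/SOCK_STREAM names shown.)

user fd
 -> struct file
      .f_op         -> socket_file_ops
      .private_data -> struct socket
                         .ops  -> struct proto_ops      (inet_stream_ops for TCP)
                         .file -> (back to the struct file)
                         .sk   -> struct sock           (embedded in struct inet_sock)
                                    .sk_socket -> (back to the struct socket)
                                    .sk_prot   -> struct proto      (tcp_prot for TCP)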