先看[1]中的两段代码:
ret = sock_create_kern(&init_net, AF_INET, SOCK_STREAM, 0, &sock);
ret = sock->ops->connect(sock, (struct sockaddr *)&s_addr, sizeof(s_addr), 0);
sock是怎么创建的?sock->ops中有多少操作?
/* Excerpt (body heavily elided): IPv4 stack initialization. The line shown
 * registers the PF_INET family so that every AF_INET socket creation is
 * dispatched to inet_create() via inet_family_ops below. */
int __init inet_init(void)
{
(void)sock_register(&inet_family_ops);
}
//af_inet.c
/* Registration record for the PF_INET family: __sock_create() looks this
 * up in the family table and invokes .create (inet_create) for every
 * AF_INET socket. */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
/* .owner = THIS_MODULE,*/
};
/*
 * Create a kernel-space socket (no file descriptor, not charged to any
 * user process); delegates to __sock_create() with kern = 1.
 *
 * NOTE(review): the call site quoted at the top of the article passes
 * &init_net as the first argument — that is the kernel >= 4.2 signature.
 * The previously quoted 4-argument variant (pre-4.2) hard-coded &init_net
 * internally. Quote the modern signature so it matches the call above.
 */
int sock_create_kern(struct net *net, int family, int type, int protocol,
		     struct socket **res)
{
	return __sock_create(net, family, type, protocol, res, 1);
}
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
/* Excerpt: the lookup of `pf` in the registered family table is elided;
 * the line shown hands socket setup to the family's .create hook. */
err = pf->create(net, sock, protocol, kern);
}
// For AF_INET, pf->create here points to the inet_create() function.
接下来就需要分析inet_create(af_inet.c)的处理流程。
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
/* Excerpt: `answer` (selection from inetsw by type/protocol is elided)
 * supplies both the proto_ops table and the struct proto for this socket. */
sock->ops = answer->ops;
answer_prot = answer->prot;
/* sk_alloc() records answer_prot as the new sock's sk->sk_prot. */
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
sock_init_data(sock, sk);
}
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
/* Excerpt: the one line the article cares about — the transport's
 * struct proto (e.g. tcp_prot) is attached to the newly created sock. */
sk->sk_prot = sk->sk_prot_creator = prot;
}
type和protocol是应用层调用socket()函数时传入的参数。其中type对应三种数据包的处理方式:SOCK_STREAM(流式,对应TCP)、SOCK_DGRAM(数据报,对应UDP)、SOCK_RAW(原始socket)。代码中的answer指针,就是根据type和protocol选择了inetsw_array中的某一项。由上面的代码可知:sock->ops = answer->ops。
/* IPv4 "socket switch" table: inet_create() selects one entry by
 * (type, protocol) and copies its .ops / .prot into the new socket. */
static struct inet_protosw inetsw_array[] =
{
/* TCP: stream sockets. */
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
/* UDP: datagram sockets. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
/* ICMP via datagram sockets (unprivileged ping). */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
/* Raw sockets; IPPROTO_IP acts as a wildcard protocol match. */
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
以tcp为例,这里的ops指针中内容是:
/* proto_ops for AF_INET stream (TCP) sockets; this is what sock->ops
 * points at after inet_create(), so sock->ops->connect below resolves
 * to inet_stream_connect(). */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
/*.owner = THIS_MODULE,*/
.release = inet_release,
.bind = inet_bind,
/* The entry exercised by the article's example. */
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
// .poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
// .mmap = sock_no_mmap,
.sendpage = inet_sendpage,
// .splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
回到文首提到的sock->ops->connect操作,就是执行了inet_stream_connect。在上面提到sk_alloc函数, sk->sk_prot指向的就是answer->prot。
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
/* Excerpt: thin wrapper (socket locking elided) around the worker below. */
err = __inet_stream_connect(sock, uaddr, addr_len, flags);
}
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
/* Excerpt: dispatches to the transport's connect hook; for TCP,
 * sk->sk_prot is tcp_prot, so this calls tcp_v4_connect(). */
err = sk->sk_prot->connect(sk, uaddr, addr_len);
}
如果是tcp协议,answer->prot就是指向了tcp_prot。结构体tcp_prot(tcp_ipv4.c)中的内容:
/* Transport-level operations for TCP (tcp_ipv4.c); attached to the sock
 * as sk->sk_prot by sk_alloc() via answer->prot. */
struct proto tcp_prot = {
.name = "TCP",
//.owner = THIS_MODULE,
.close = tcp_close,
/* The entry reached from __inet_stream_connect() above. */
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.mtu_reduced = tcp_v4_mtu_reduced,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
// .slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
sk->sk_prot->connect实际执行的就是tcp_v4_connect函数。
博客[5]继续分析了tcp_v4_connect之后的操作,比如源端口的分配。端口的分配由函数inet_hash_connect完成,其选取原则如博客[6]所述:
如果用户已经绑定了端口,就使用绑定的端口。
如果用户没有绑定端口,则让系统自动选取,策略如下:
1. 获取端口的取值区间,以及区间内端口的个数。
2. 根据初始偏移量,从端口区间内的某个端口开始,遍历整个区间:
2.1 如果端口是保留的,直接跳过。
2.2 如果端口已经被使用了:
2.2.1 不允许复用已经被bind()占用的端口;
2.2.2 检查端口是否能被重用,可以的话就重用此端口。
2.3 如果端口没有被使用过,就选择此端口。
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
/* Excerpt: route lookup toward the destination address. */
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
orig_sport, orig_dport, sk);
/* Pick (and hash) the local source port — see the selection rules above. */
err = inet_hash_connect(&tcp_death_row, sk);
}
/* Wrapper that supplies the per-connection port offset and the TCP-specific
 * establishment-check / hash callbacks to the generic worker below. */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
__inet_check_established, __inet_hash_nolisten);
}
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16, struct inet_timewait_sock **),
int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
/* snum == 0 means the user did not bind() a port: auto-select one. */
if (!snum) {
//port allocation (auto-selection loop, elided)
}
/* Bind the chosen port to the socket's bind bucket. */
inet_bind_hash(sk, tb, port);
}
reference:
[1] 一个简单的内核Socket Client例子
[2] 2.1 Socket系统调用
[3]第一个Linux网络设备驱动——最简虚拟网卡virnet
[4] Linux串口网卡(一)——通用虚拟网卡的实现
[5] Linux TCP/IP 协议栈之 Socket的实现分析(Connect客户端发起连接请求)
[6] TCP连接建立系列 — 客户端的端口选取和重用
[7] 路由表
网友评论