TL;DR 本文想从源码角度解析正常的 tcp 关闭流程,代码也些冗长,可能还有些理解不到的地方,欢迎指正
我们都知道 tcp 关闭连接时是四次握手,但是抓包时会发现有些场景是三次握手,那从源码(kernel 5.0)角度是如何实现的呢?另外每一步都需要用户 (业务层) 感知嘛?
tcp 关闭四次握手流程
上图是我们常见的流程,无论面试还是书本上都是这样讲的
- 主动关闭一方发起
FIN
包,然后状态变成FIN-WAIT-1
- 被动方收到后,返回
ACK
, 并把被动方状态变成CLOSED-WAIT
- 主动方收到
ACK
后,状态置成FIN-WAIT2
- 被动方再发起一次关闭流程,发起
FIN
, 并把状态设置成LAST-ACK
- 主动方收到后,返回
ACK
, 并将状态置成TIME-WAIT
并等待 2MSL 时间后关闭 - 被动方收到
ACK
后直接关闭
但实际上抓包,可能只有三次握手,也就是第 2 步时被动方返回 ACK
顺便带了一个 FIN
, 为什么呢?
另外我们看 rfc793, 还有一种情况是
server
和 client
同时发送 FIN
,主动方处于 FIN-WAIT-1
时如果收到了 FIN
那么主动方将处于 CLOSING
状态,最后同时进入 TIME-WAIT
状态,等待 2MSL 超时结束后关闭清除
主动关闭连接: 1 发送FIN
业务调用 tcp_close
来触发关闭连接,当然了最重要的就是发 FIN
包。
void tcp_close(struct sock *sk, long timeout)
{
......
} else if (tcp_close_state(sk)) {
tcp_send_fin(sk);
}
......
}
现在高并发服务都用的 epoll
其中 tcp_close_state
会触发 tcp 状态机迁移,然后调用 tcp_send_fin
构建 FIN
包发送到网卡
static const unsigned char new_state[16] = {
/* current state: new state: action: */
[0 /* (Invalid) */] = TCP_CLOSE,
[TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
[TCP_SYN_SENT] = TCP_CLOSE,
[TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
[TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
[TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
[TCP_TIME_WAIT] = TCP_CLOSE,
[TCP_CLOSE] = TCP_CLOSE,
[TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
[TCP_LAST_ACK] = TCP_LAST_ACK,
[TCP_LISTEN] = TCP_CLOSE,
[TCP_CLOSING] = TCP_CLOSING,
[TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
};
void tcp_send_fin(struct sock *sk)
{
struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
......
if (!tskb && tcp_under_memory_pressure(sk))
tskb = skb_rb_last(&sk->tcp_rtx_queue);
if (tskb) {
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
if (tcp_write_queue_empty(sk)) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
* if FIN had been sent. This is because retransmit path
* does not change tp->snd_nxt.
*/
tp->snd_nxt++;
return;
}
} else {
skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
if (unlikely(!skb))
return;
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
-
tcp_send_fin
首先从sk->sk_write_queue
拿队尾skb
包,如果有的话,说明队列里还有未发送的数据,直接将tcp_flags
增加TCPHDR_FIN
标记,然后再次判断队列,如果为空说明发出去了。 - 如果队尾拿不到,那么从内核中分配一个
skb
包,并且加到队列tcp_queue_skb
- 最后调用
__tcp_push_pending_frames
发送数据,这里有两点要注意,这里只是发到协议栈下一层,并且关闭了nagle
表示立该发送
被动关闭连接: 1 接收FIN
NAPI这里有一个大背景,现代网络收发协义栈一般都用 NAPI
, 将从物理链路到应用层收包的过程,分成了两个部份:1. 硬件网卡接收数据,写到 DMA
内存,网卡硬中断 IRQ 处理函数 napi_schedule
触发 softirq
来唤醒 NAPI
2. 关闭硬中断,开启轮循模式读数据到 buffer
中,并在合适的时候 wake
阻塞的线程或触发 epoll
. 对应到网络协义栈四层接收数据的函数是 tcp_v4_rcv
int tcp_v4_rcv(struct sk_buff *skb)
{
......
if (!sock_owned_by_user(sk)) {
skb_to_free = sk->sk_rx_skb_cache;
sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
} else {
if (tcp_add_backlog(sk, skb))
goto discard_and_relse;
skb_to_free = NULL;
}
......
}
tcp_v4_rcv
是 tcp 协义中自下而上的入口,由于此时状态是 ESTABLISHED
,忽略其它状态处理内容。sock_owned_by_user
表示当前是否有进程正在使用此 socket
,如果正在使用,那么为防止冲突要调用 tcp_add_backlog
加到 backlog 队列中,否则调用 tcp_v4_do_rcv
处理。
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk->sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
!dst->ops->check(dst, 0)) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
}
tcp_rcv_established(sk, skb);
return 0;
}
......
}
这里忽略其它所有无关代码,只看 TCP_ESTABLISHED
时的处理函数 tcp_rcv_established
,这个函数比较特殊,分为 fast path
和 slow path
,查阅文档发现接收 FIN
不会进入 fast path
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int len = skb->len;
......
slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
goto discard;
/*
* Standard slow path.
*/
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return;
......
这里面和关闭连接相关的就只有 tcp_data_queue
,最后 tcp_ack_snd_check
会检查如果有必要会发送 ack
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool fragstolen;
int eaten;
......
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
/* Ok. In sequence. In window. */
queue_and_out:
if (skb_queue_len(&sk->sk_receive_queue) == 0)
sk_forced_mem_schedule(sk, skb->truesize);
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
}
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC5681. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
tcp_fast_path_check(sk);
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
tcp_data_ready(sk);
return;
}
......
}
省去不相关内容,主要是调用 tcp_queue_rcv
将 skb
入队尾,然后调用最关键的 tcp_fin
处理 FIN
, 最后调用 tcp_data_ready
通知用户
void tcp_fin(struct sock *sk)
{
......
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
inet_csk_enter_pingpong_mode(sk);
break;
......
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
else
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
可以看到 tcp_fin
做了三件事:
- 设置
sk->sk_shutdown
关闭单工读端 -
tcp_set_state
将状态设置成TCP_CLOSE_WAIT
- 发送
POLL_IN
通知,注意这里单工关闭连接,只发送POLL_IN
被动关闭连接: 2 处理FIN
此时应用收到 epoll
通知处理, 应用会 read
或 recv
读数据,此时会调用 tcp_recvmsg
, 返回由内核 copy 到用户空间数据大小
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len)
{
......
do {
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
if (tp->urg_data && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
break;
}
}
/* Next get a buffer. */
last = skb_peek_tail(&sk->sk_receive_queue);
skb_queue_walk(&sk->sk_receive_queue, skb) {
last = skb;
/* Now that we have two receive queues this
* shouldn't happen.
*/
if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
"TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
flags))
break;
offset = *seq - TCP_SKB_CB(skb)->seq;
if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
}
if (offset < skb->len)
goto found_ok_skb;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK),
"TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
}
/* Well, if we have backlog, try to process it now yet. */
if (copied >= target && !sk->sk_backlog.tail)
break;
if (copied) {
if (sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
!timeo ||
signal_pending(current))
break;
} else {
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
copied = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
/* This occurs when user tries to read
* from never connected socket.
*/
copied = -ENOTCONN;
break;
}
if (!timeo) {
copied = -EAGAIN;
break;
}
if (signal_pending(current)) {
copied = sock_intr_errno(timeo);
break;
}
}
tcp_cleanup_rbuf(sk, copied);
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
} else {
sk_wait_data(sk, &timeo, last);
}
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
current->comm,
task_pid_nr(current));
peek_seq = tp->copied_seq;
}
continue;
found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
/* Do we have urgent data here? */
if (tp->urg_data) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
++*seq;
urg_hole++;
offset++;
used--;
if (!used)
goto skip_copy;
}
} else
used = urg_offset;
}
}
if (!(flags & MSG_TRUNC)) {
err = skb_copy_datagram_msg(skb, offset, msg, used);
if (err) {
/* Exception. Bailout! */
if (!copied)
copied = -EFAULT;
break;
}
}
......
found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags & MSG_PEEK))
sk_eat_skb(sk, skb);
break;
} while (len > 0);
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
......
return copied;
......
}
这个函数比较长,理解起来有些吃力,重点是 for
循环用来不断的读数据,如果 copy 大于 0,那么业务正常处理,调用 skb_copy_datagram_msg
将数据拷贝到用户空间,等再次调用 tcp_recvmsg
时 skb_queue_walk
遇到了 FIN
包,break 跳出循环,tcp_cleanup_rbuf
发送 ACK
, 并返回 0.
根据 man 2 recv
可得知,recv
返回值为 0 表示对端关闭,此时被动方就要调用 close
来关闭连接。
主动关闭连接: 3 接收ACK
此时主动关闭方,收到了 ACK
,状态变更为 FIN-WAIT2
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
......
if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
return 0;
......
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
......
/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
if (sk->sk_state == TCP_SYN_RECV)
return 1; /* send one RST */
tcp_send_challenge_ack(sk, skb);
goto discard;
}
switch (sk->sk_state) {
......
case TCP_FIN_WAIT1: {
int tmo;
if (req)
tcp_rcv_synrecv_state_fastopen(sk);
if (tp->snd_una != tp->write_seq)
break;
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
sk_dst_confirm(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
/* Wake up lingering close() */
sk->sk_state_change(sk);
break;
}
if (tp->linger2 < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
/* Receive out of order FIN after close() */
if (tp->syn_fastopen && th->fin)
tcp_fastopen_active_disable(sk);
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
* and not so rare event. We still can lose it now,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
}
break;
}
......
}
tcp_rcv_state_process
调用 tcp_ack
处理 ACK
, 然后判断状态,当前是 TCP_FIN_WAIT1
,变成 TCP_FIN_WAIT2
,并设置 sk->sk_shutdown |= SEND_SHUTDOWN
被动关闭连接: 3 发送FIN
此时被动方处于 CLOSE_WAIT
状态,业务方调用 close
发送 FIN
包
void tcp_close(struct sock *sk, long timeout)
{
......
} else if (tcp_close_state(sk)) {
tcp_send_fin(sk);
}
......
}
由 tcp_close_state
得知,状态变更为 TCP_LAST_ACK
,并调用 tcp_send_fin
发送 FIN
,仔细阅读 tcp_send_fin
代码会发现,如果当前 sk->sk_write_queue
有未发送的 skb
(上一步的 ACK 包),那么直接置 FIN
标记,而不是单独生成新序号的包,这里解释了为什么有时tcp关闭是三次握手
主动关闭连接: 4 接收 FIN
此时主动方处于 TCP_FIN_WAIT2
, 接收被动方传来的 FIN
进入 TIME_WAIT
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
......
if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
return 0;
同样的 tcp_v4_do_rcv
收包,然后调用 tcp_rcv_state_process
处理
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
......
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
/* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
}
/* Fall through */
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
break;
}
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
}
if (!queued) {
discard:
tcp_drop(sk, skb);
}
return 0;
}
此时 switch case
语法会 fall through
到 tcp_data_queue
函数
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
......
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
/* Ok. In sequence. In window. */
queue_and_out:
if (skb_queue_len(&sk->sk_receive_queue) == 0)
sk_forced_mem_schedule(sk, skb->truesize);
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
}
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC5681. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
tcp_fast_path_check(sk);
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
tcp_data_ready(sk);
return;
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case. Force an immediate ack. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
tcp_drop(sk, skb);
return;
}
......
}
此时 tcp_fin
处理状态机
void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
inet_csk_enter_pingpong_mode(sk);
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
/* Received a retransmission of the FIN, do
* nothing.
*/
break;
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
*/
tcp_send_ack(sk);
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_send_ack(sk);
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
default:
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
pr_err("%s: Impossible, sk->sk_state=%d\n",
__func__, sk->sk_state);
break;
}
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
else
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
}
由于当前状态是 TCP_FIN_WAIT2
,所以先调用 tcp_send_ack
发送 ACK
,然后 tcp_time_wait(sk, TCP_TIME_WAIT, 0)
设置最后的状态为 TIME_WAIT
, 并设置 2MSL 的超时定时器和回调函数 tw_timer_handler
,但是细节很多
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
tw = inet_twsk_alloc(sk, tcp_death_row, state);
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
struct inet_sock *inet = inet_sk(sk);
tw->tw_transparent = inet->transparent;
tw->tw_mark = sk->sk_mark;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
tcptw->tw_ts_offset = tp->tsoffset;
tcptw->tw_last_oow_ack_time = 0;
......
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
timeo = rto;
if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN;
/* tw_timer is pinned, so we need to make sure BH are disabled
* in following section, otherwise timer handler could run before
* we complete the initialization.
*/
local_bh_disable();
inet_twsk_schedule(tw, timeo);
/* Linkage updates.
* Note that access to tw after this point is illegal.
*/
inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
local_bh_enable();
}
tcp_update_metrics(sk);
tcp_done(sk);
}
inet_twsk_alloc
函数创建了一个专用于 TIME_WAIT
的 socket
,叫做 inet_timewait_sock
,在这个函数里面设置当前 sk
的信息,还有超时回调等等,而原 socket
调用 tcp_done
后会被销毁
被动关闭连接: 4 接收最后一次 ACK
此时处于 LAST_ACK
状态,tcp_v4_do_rcv
调用 tcp_rcv_state_process
处理包
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
......
case TCP_LAST_ACK:
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
goto discard;
}
break;
}
......
}
可以看到 switch
语句,直接调用 tcp_done
将状态置为 TCP_CLOSE
,至此被动一方完成连接关闭。
小结
只是分析了大体逻辑,很多细节并没有深入,比如 ack 机制和其它 tcp 特性
网友评论