This post continues my earliest blog entry on JavaEye: http://simohayha.iteye.com/blog/614258

First, note that Linux defines five congestion states, as follows:

enum tcp_ca_state {
        TCP_CA_Open = 0,
#define TCPF_CA_Open (1<<TCP_CA_Open)
        TCP_CA_Disorder = 1,
#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
        TCP_CA_CWR = 2,
#define TCPF_CA_CWR (1<<TCP_CA_CWR)
        TCP_CA_Recovery = 3,
#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
        TCP_CA_Loss = 4
#define TCPF_CA_Loss (1<<TCP_CA_Loss)
};

TCP_CA_Open is the initial state: no congestion has been detected.

TCP_CA_Disorder: as the name suggests, entered when congestion is first suspected because a SACK or a duplicate ACK has been received.

TCP_CA_CWR: entered when the congestion window has been reduced in response to a congestion notification event such as ECN, ICMP, or local device congestion.

TCP_CA_Recovery: entered when fast retransmit starts, i.e. CWND is being reduced while the presumably lost segments are retransmitted.

TCP_CA_Loss: entered on a retransmission timeout or when SACKs are reneged by the receiver; in either case segments are considered lost.
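
These states live inside the kernel, but userspace can observe them: struct tcp_info, returned by getsockopt(TCP_INFO), carries the current state in its tcpi_ca_state field, using the values above. A minimal sketch (my own illustration, not part of the kernel code discussed below):

/* Print the congestion state of a connected TCP socket.
 * tcpi_ca_state holds one of the five enum tcp_ca_state values. */
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static const char *ca_state_name(unsigned int s)
{
        static const char *names[] = { "Open", "Disorder", "CWR", "Recovery", "Loss" };
        return s < 5 ? names[s] : "?";
}

int print_ca_state(int fd) /* fd must be a connected TCP socket */
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
                return -1;
        printf("ca_state = %u (%s)\n", (unsigned int)info.tcpi_ca_state,
               ca_state_name(info.tcpi_ca_state));
        return 0;
}

On an idle, healthy connection this normally prints "Open"; during loss recovery you can catch "Recovery" or "Loss".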

Before diving into the code, let's look at the most important data structure, tcp_sock. This is the core structure that all of the congestion-control code below operates on, and its comments are quite detailed.

struct tcp_sock {
        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock inet_conn;
        u16 tcp_header_len;      /* Bytes of tcp header to send */
        u16 xmit_size_goal_segs; /* Goal for segmenting output packets */

        /*
         * Header prediction flags
         * 0x5?10 << 16 + snd_wnd in net byte order
         */
        __be32 pred_flags;

        /*
         * RFC793 variables by their proper names. This means you can
         * read the code and the spec side by side (and laugh ...)
         * See RFC793 and RFC1122. The RFC writes these in capitals.
         */
        u32 rcv_nxt;    /* What we want to receive next */
        u32 copied_seq; /* Head of yet unread data */
        u32 rcv_wup;    /* rcv_nxt on last window update sent */
        u32 snd_nxt;    /* Next sequence we send */

        u32 snd_una;    /* First byte we want an ack for */
        u32 snd_sml;    /* Last byte of the most recently transmitted small packet */
        u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
        u32 lsndtime;   /* timestamp of last sent data packet (for restart window) */

        /* Data for direct copy to user */
        struct {
                struct sk_buff_head prequeue;
                struct task_struct *task;
                struct iovec *iov;
                int memory;
                int len;
#ifdef CONFIG_NET_DMA
                /* members for async copy */
                struct dma_chan *dma_chan;
                int wakeup;
                struct dma_pinned_list *pinned_list;
                dma_cookie_t dma_cookie;
#endif
        } ucopy;

        u32 snd_wl1;    /* Sequence for window update */
        u32 snd_wnd;    /* The window we expect to receive */
        u32 max_window; /* Maximal window ever seen from peer */
        u32 mss_cache;  /* Cached effective mss, not including SACKS */

        u32 window_clamp;  /* Maximal window to advertise */
        u32 rcv_ssthresh;  /* Current window clamp */
        u32 frto_highmark; /* snd_nxt when RTO occurred */
        u16 advmss;        /* Advertised MSS */
        u8 frto_counter;   /* Number of new acks after RTO */
        u8 nonagle     : 4, /* Disable Nagle algorithm? */
           thin_lto    : 1, /* Use linear timeouts for thin streams */
           thin_dupack : 1, /* Fast retransmit on first dupack */
           repair      : 1,
           unused      : 1;
        u8 repair_queue;
        u8 do_early_retrans:1,      /* Enable RFC5827 early-retransmit */
           early_retrans_delayed:1; /* Delayed ER timer installed */

        /* RTT measurement */
        u32 srtt;     /* smoothed round trip time << 3 */
        u32 mdev;     /* medium deviation */
        u32 mdev_max; /* maximal mdev for the last rtt period */
        u32 rttvar;   /* smoothed mdev_max */
        u32 rtt_seq;  /* sequence number to update rttvar */

        u32 packets_out; /* Packets which are "in flight" */
        u32 retrans_out; /* Retransmitted packets out */

        u16 urg_data;  /* Saved octet of OOB data and control flags */
        u8 ecn_flags;  /* ECN status bits. */
        u8 reordering; /* Packet reordering metric. */
        u32 snd_up;    /* Urgent pointer */

        u8 keepalive_probes; /* num of allowed keep alive probes */

        /*
         * Options received (usually on last packet, some only on SYN packets).
         */
        struct tcp_options_received rx_opt;

        /*
         * Slow start and congestion control (see also Nagle, and Karn & Partridge)
         */
        u32 snd_ssthresh;   /* Slow start size threshold */
        u32 snd_cwnd;       /* Sending congestion window */
        u32 snd_cwnd_cnt;   /* Linear increase counter */
        u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
        u32 snd_cwnd_used;
        u32 snd_cwnd_stamp;
        u32 prior_cwnd;     /* Congestion window at start of Recovery. */
        u32 prr_delivered;  /* Number of newly delivered packets to
                             * receiver in Recovery. */
        u32 prr_out;        /* Total number of pkts sent during Recovery. */

        u32 rcv_wnd;     /* Current receiver window */
        u32 write_seq;   /* Tail(+1) of data held in tcp send buffer */
        u32 pushed_seq;  /* Last pushed seq, required to talk to windows */
        u32 lost_out;    /* Lost packets */
        u32 sacked_out;  /* SACK'd packets */
        u32 fackets_out; /* FACK'd packets */
        u32 tso_deferred;
        u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */

        /* from STCP, retrans queue hinting */
        struct sk_buff *lost_skb_hint;
        struct sk_buff *scoreboard_skb_hint;
        struct sk_buff *retransmit_skb_hint;

        struct sk_buff_head out_of_order_queue; /* Out of order segments go here */

        /* SACKs data, these 2 need to be together (see tcp_options_write) */
        struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
        struct tcp_sack_block selective_acks[4]; /* The SACKS themselves */

        struct tcp_sack_block recv_sack_cache[4];

        struct sk_buff *highest_sack; /* skb just after the highest
                                       * skb with SACKed bit set
                                       * (validity guaranteed only if
                                       * sacked_out > 0) */

        int lost_cnt_hint;
        u32 retransmit_high;  /* L-bits may be on up to this seqno */
        u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */

        u32 prior_ssthresh; /* ssthresh saved at recovery start */
        u32 high_seq;       /* snd_nxt at onset of congestion */

        u32 retrans_stamp; /* Timestamp of the last retransmit,
                            * also used in SYN-SENT to remember stamp of
                            * the first SYN. */
        u32 undo_marker;   /* tracking retrans started here. */
        int undo_retrans;  /* number of undoable retransmissions. */
        u32 total_retrans; /* Total retransmits for entire connection */

        u32 urg_seq; /* Seq of received urgent pointer */
        unsigned int keepalive_time;  /* time before keep alive takes place */
        unsigned int keepalive_intvl; /* time interval between keep alive probes */

        int linger2;

        /* Receiver side RTT estimation */
        struct {
                u32 rtt;
                u32 seq;
                u32 time;
        } rcv_rtt_est;

        /* Receiver queue space */
        struct {
                int space;
                u32 seq;
                u32 time;
        } rcvq_space;

        /* TCP-specific MTU probe information. */
        struct {
                u32 probe_seq_start;
                u32 probe_seq_end;
        } mtu_probe;

#ifdef CONFIG_TCP_MD5SIG
        /* TCP AF-Specific parts; only used by MD5 Signature support so far */
        const struct tcp_sock_af_ops *af_specific;

        /* TCP MD5 Signature Option information */
        struct tcp_md5sig_info __rcu *md5sig_info;
#endif

        /* When the cookie options are generated and exchanged, then this
         * object holds a reference to them (cookie_values->kref). Also
         * contains related tcp_cookie_transactions fields.
         */
        struct tcp_cookie_values *cookie_values;
};

The central function here is tcp_fastretrans_alert. It is entered on:

  • each incoming ACK, if the state is not "Open"
  • when the arriving ACK is unusual, namely:
    • SACK
    • duplicate ACK
    • ECN ECE

Now let's see how these states transition. I analyzed tcp_fastretrans_alert in an earlier blog post, so here I only walk through the functions involved in the state transitions.

State transitions come in two parts: entering certain states, and getting back out of them. Let's start with the default handling, i.e. assume we are in the Open state and an unusual ACK arrives; here is how the code handles it.

Note that classic TCP Reno emulates SACK on top of fast retransmit (duplicate-ACK counting), so when SACK is disabled the SACK bookkeeping below has to be emulated.
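
For reference, the emulation itself is tiny. Sketched below from the same-era tcp_input.c (details can vary slightly between kernel versions): each duplicate ACK is accounted as if one more segment had been SACKed, and the count is dropped once snd_una advances.

/* Emulate SACKs for a SACKless (Reno) connection: one dupACK means
 * one more segment is assumed to have reached the receiver. */
static void tcp_add_reno_sack(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        tp->sacked_out++;
        tcp_check_reno_reordering(sk, 0);
        tcp_verify_left_out(tp);
}

/* snd_una advanced: the emulated SACK information is stale, drop it. */
static void tcp_reset_reno_sack(struct tcp_sock *tp)
{
        tp->sacked_out = 0;
}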

switch (icsk->icsk_ca_state) {
................
/* Loss is undone; fall through to processing in Open state. */
// If we reach this point the state can be Disorder, Open, CWR or Loss.
default:
        // With SACK disabled (Reno), emulate the SACK accounting.
        if (tcp_is_reno(tp)) {
                if (flag & FLAG_SND_UNA_ADVANCED)
                        tcp_reset_reno_sack(tp);
                if (is_dupack)
                        tcp_add_reno_sack(sk);
        }

        // Undo an earlier cwnd reduction if D-SACK proves it unnecessary.
        if (icsk->icsk_ca_state <= TCP_CA_Disorder)
                tcp_try_undo_dsack(sk);

        // Do we need to enter the Recovery state?
        if (!tcp_time_to_recover(sk, flag)) {
                // If not, check whether to move to CWR or Disorder instead.
                tcp_try_to_open(sk, flag);
                return;
        }

        /* MTU probe failure: don't reduce cwnd */
        if (icsk->icsk_ca_state < TCP_CA_CWR &&
            icsk->icsk_mtup.probe_size &&
            tp->snd_una == tp->mtu_probe.probe_seq_start) {
                tcp_mtup_probe_failed(sk);
                /* Restores the reduction we did in tcp_mtup_probe() */
                tp->snd_cwnd++;
                tcp_simple_retransmit(sk);
                return;
        }

        // Finally, enter the Recovery state.
        /* Otherwise enter Recovery state */
        tcp_enter_recovery(sk, (flag & FLAG_ECE));
        fast_rexmit = 1;
}

Three of the functions above deserve a closer look: tcp_time_to_recover, tcp_try_to_open and tcp_enter_recovery.

First, tcp_time_to_recover, which decides whether to enter the Recovery state.

Before the code, two basic concepts. The reordering metric (tp->reordering) is, once more than one SACK block has been seen, the largest distance observed between SACKed segments: for example, if the first SACK acknowledges sequence 7 and the second sequence 2, reordering becomes 6. fackets_out (the FACK count) is derived from the highest SACKed sequence number.
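
tcp_dupack_heuristics, used below, condenses this bookkeeping into one number: with FACK enabled it is fackets_out, otherwise the (real or emulated) SACK count plus one. A sketch from the same-era tcp_input.c (treat the details as approximate):

/* How many segments are known (or assumed) to have left the network. */
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
        return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}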

static bool tcp_time_to_recover(struct sock *sk, int flag)
{
        struct tcp_sock *tp = tcp_sk(sk);
        __u32 packets_out;

        /* Do not perform any recovery during F-RTO algorithm */
        if (tp->frto_counter)
                return false;

        /* Trick#1: The loss is proven. */
        if (tp->lost_out)
                return true;

        /* Not-A-Trick#2 : Classic rule... */
        // If the dupACK/SACK heuristic exceeds the reordering metric,
        // the segment at the head of the reordering window must be lost,
        // so enter the Recovery state.
        if (tcp_dupack_heuristics(tp) > tp->reordering)
                return true;

        /* Trick#3 : when we use RFC2988 timer restart, fast
         * retransmit can be triggered by timeout of queue head.
         */
        // The head of the write queue timed out (the retransmit timer is
        // reset on every (re)transmission), so enter Recovery.
        if (tcp_is_fack(tp) && tcp_head_timedout(sk))
                return true;

        /* Trick#4: It is still not OK... But will it be useful to delay
         * recovery more?
         */
        packets_out = tp->packets_out;
        // Almost everything in flight has been SACKed and we cannot
        // transmit anything new, so waiting longer cannot help.
        if (packets_out <= tp->reordering &&
            tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
            !tcp_may_send_now(sk)) {
                /* We have nothing to send. This connection is limited
                 * either by receiver window or by application.
                 */
                return true;
        }

        /* If a thin stream is detected, retransmit after first
         * received dupack. Employ only if SACK is supported in order
         * to avoid possible corner-case series of spurious retransmissions
         * Use only if there are no unsent data.
         */
        // Thin-stream handling.
        if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
            tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
            tcp_is_sack(tp) && !tcp_send_head(sk))
                return true;

        /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
         * retransmissions due to small network reorderings, we implement
         * Mitigation A.3 in the RFC and delay the retransmission for a short
         * interval if appropriate.
         */
        // RFC5827 early retransmit.
        if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
            (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
            !tcp_may_send_now(sk))
                return !tcp_pause_early_retransmit(sk, flag);

        // No recovery needed.
        return false;
}

Next, tcp_try_to_open. The name is somewhat misleading: its real job is to decide whether to enter the CWR or Disorder state.

static void tcp_try_to_open(struct sock *sk, int flag)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_verify_left_out(tp);

        if (!tp->frto_counter && !tcp_any_retrans_done(sk))
                tp->retrans_stamp = 0;

        // An ECN ECE was received: enter the CWR state.
        if (flag & FLAG_ECE)
                tcp_enter_cwr(sk, 1);

        if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
                // Check whether to enter Disorder; otherwise go to Open.
                tcp_try_keep_open(sk);
                // If we are not in Open, moderate the congestion window.
                if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
                        tcp_moderate_cwnd(tp);
        } else {
                // In CWR: bring the congestion window down.
                tcp_cwnd_down(sk, flag);
        }
}

Next, tcp_try_keep_open, which decides whether to enter the Disorder state. The condition is simple: if there are SACKed or lost segments, or any segment has been retransmitted, enter Disorder; otherwise stay in (or return to) Open.
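
tcp_left_out itself is just the sum of the SACKed and lost counters; a sketch of the helper from the same-era include/net/tcp.h (approximate):

/* Segments considered to have "left the network": SACKed by the
 * receiver or marked lost by the sender. */
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
        return tp->sacked_out + tp->lost_out;
}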

static void tcp_try_keep_open(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        // Default to the Open state.
        int state = TCP_CA_Open;

        // Enter Disorder if anything is SACKed, lost or retransmitted.
        if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
                state = TCP_CA_Disorder;

        if (inet_csk(sk)->icsk_ca_state != state) {
                tcp_set_ca_state(sk, state);
                tp->high_seq = tp->snd_nxt;
        }
}

Next comes tcp_enter_recovery, which moves the connection into the Recovery state and sets the related fields:

high_seq: snd_nxt at the moment Recovery is entered.

undo_marker: snd_una at the moment Recovery is entered.

undo_retrans: the number of retransmitted segments at the moment Recovery is entered.

static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int mib_idx;

        if (tcp_is_reno(tp))
                mib_idx = LINUX_MIB_TCPRENORECOVERY;
        else
                mib_idx = LINUX_MIB_TCPSACKRECOVERY;

        NET_INC_STATS_BH(sock_net(sk), mib_idx);

        // Record where Recovery started.
        tp->high_seq = tp->snd_nxt;
        tp->prior_ssthresh = 0;
        tp->undo_marker = tp->snd_una;
        tp->undo_retrans = tp->retrans_out;

        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
                if (!ece_ack)
                        // Save the current ssthresh so it can be restored later.
                        tp->prior_ssthresh = tcp_current_ssthresh(sk);
                // Let the congestion-control module recompute the slow-start threshold.
                tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
                TCP_ECN_queue_cwr(tp);
        }

        tp->bytes_acked = 0;
        tp->snd_cwnd_cnt = 0;
        // Save the congestion window.
        tp->prior_cwnd = tp->snd_cwnd;
        tp->prr_delivered = 0;
        tp->prr_out = 0;
        // Enter the Recovery state.
        tcp_set_ca_state(sk, TCP_CA_Recovery);
}

Next, tcp_enter_cwr, which enters the CWR state. While reading it, compare the fields it sets with the ones used for the Recovery state above.

void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);

        tp->prior_ssthresh = 0;
        tp->bytes_acked = 0;
        if (icsk->icsk_ca_state < TCP_CA_CWR) {
                tp->undo_marker = 0;
                if (set_ssthresh)
                        tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
                tp->snd_cwnd = min(tp->snd_cwnd,
                                   tcp_packets_in_flight(tp) + 1U);
                tp->snd_cwnd_cnt = 0;
                // Record the highest sequence sent so far.
                tp->high_seq = tp->snd_nxt;
                tp->snd_cwnd_stamp = tcp_time_stamp;
                TCP_ECN_queue_cwr(tp);
                // Enter the CWR state.
                tcp_set_ca_state(sk, TCP_CA_CWR);
        }
}

The code above saves quite a few values; when are they used? Let's first return to the fragment we skipped earlier, tcp_try_undo_dsack.

This function checks whether the cwnd reduction should be rolled back; the decision rests on undo_marker and undo_retrans.

undo_retrans == 0 means that either nothing was retransmitted, or every retransmission has since been D-SACKed; the data therefore arrived safely, and the cwnd reduction should naturally be undone.

static void tcp_try_undo_dsack(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tp->undo_marker && !tp->undo_retrans) {
                DBGUNDO(sk, "D-SACK");
                // Restore the congestion window and the slow-start threshold.
                tcp_undo_cwr(sk, true);
                tp->undo_marker = 0;
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
        }
}

The core of the above is tcp_undo_cwr, which restores the send congestion window and the slow-start threshold:

static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tp->prior_ssthresh) {
                const struct inet_connection_sock *icsk = inet_csk(sk);

                if (icsk->icsk_ca_ops->undo_cwnd)
                        tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
                else
                        tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);

                if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
                        // Restore the previously saved threshold.
                        tp->snd_ssthresh = tp->prior_ssthresh;
                        TCP_ECN_withdraw_cwr(tp);
                }
        } else {
                tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
        }
        tp->snd_cwnd_stamp = tcp_time_stamp;
}
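
A concrete example with hypothetical numbers: suppose prior_ssthresh = 20 was saved on entry and recovery halved snd_ssthresh to 10. If the congestion-control module provides no undo_cwnd hook, the undo sets snd_cwnd = max(snd_cwnd, 10 << 1) = 20, and, with undo_ssthresh true, restores snd_ssthresh to 20.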

Now let's look at how duplicate and partial ACKs are handled while in the Recovery state:

case TCP_CA_Recovery:
        // A duplicate ACK arrived (snd_una did not advance).
        if (!(flag & FLAG_SND_UNA_ADVANCED)) {
                // For Reno, update the emulated SACK count.
                if (tcp_is_reno(tp) && is_dupack)
                        tcp_add_reno_sack(sk);
        } else
                // A partial ACK arrived; check whether the earlier
                // reductions can be undone.
                do_lost = tcp_try_undo_partial(sk, pkts_acked);
        break;

Next, tcp_try_undo_partial, which checks whether a received partial ACK permits an undo; its return value says whether some segments should still be marked as lost.

Note tcp_may_undo here: it decides whether the congestion state was entered spuriously. If so it returns true, and the erroneous state is then rolled back. Its test is similar to tcp_try_undo_dsack above.
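
A sketch of that test, reconstructed from the same-era tcp_input.c (treat as approximate): an undo is possible only while undo_marker is set, and either no retransmission is left unaccounted for, or TCP timestamps prove the peer was answering the original transmissions (tcp_packet_delayed).

/* Was the cwnd reduction provably unnecessary? */
static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
        return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}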

static int tcp_try_undo_partial(struct sock *sk, int acked)
{
        struct tcp_sock *tp = tcp_sk(sk);
        /* Partial ACK arrived. Force Hoe's retransmit. */
        int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);

        // Check whether the reductions should be undone.
        if (tcp_may_undo(tp)) {
                /* Plain luck! Hole is filled with delayed
                 * packet, rather than with a retransmit.
                 */
                if (!tcp_any_retrans_done(sk))
                        tp->retrans_stamp = 0;

                // Update the reordering metric.
                tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);

                DBGUNDO(sk, "Hoe");
                // Undo the cwnd reduction.
                tcp_undo_cwr(sk, false);
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);

                /* So... Do not make Hoe's retransmit yet.
                 * If the first packet was delayed, the rest
                 * ones are most probably delayed as well.
                 */
                failed = 0;
        }
        return failed;
}

Next, the Loss state, i.e. handling duplicate and partial ACKs while in Loss. This mirrors the Recovery handling above, so only a brief description:

case TCP_CA_Loss:
        if (flag & FLAG_DATA_ACKED)
                icsk->icsk_retransmits = 0;
        if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
                tcp_reset_reno_sack(tp);
        // Check whether the Loss state can be undone.
        if (!tcp_try_undo_loss(sk)) {
                // No undo: moderate the congestion window ...
                tcp_moderate_cwnd(tp);
                // ... and retransmit.
                tcp_xmit_retransmit_queue(sk);
                return;
        }
        if (icsk->icsk_ca_state != TCP_CA_Open)
                return;

Finally, the exit checks for the congestion states. They run once the first unacknowledged byte (snd_una) reaches or passes high_seq (the snd_nxt recorded when congestion set in). At that point every segment that was outstanding when congestion occurred, including any lost ones, has been delivered, so we may be able to leave the congestion state.
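
In tcp_fastretrans_alert this check sits just before the per-state switch; a sketch of the surrounding guard (abridged from the same-era source, details approximate):

/* D. Check state exit conditions. State can be terminated
 * when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
        tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
        switch (icsk->icsk_ca_state) {
        /* ... the per-state handling shown below ... */
        }
}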

case TCP_CA_Loss:
        icsk->icsk_retransmits = 0;
        // Try to undo the congestion state.
        if (tcp_try_undo_recovery(sk))
                return;
        break;

case TCP_CA_CWR:
        /* CWR is to be held until something *above* high_seq
         * is ACKed, for the CWR bit to reach the receiver. */
        if (tp->snd_una != tp->high_seq) {
                tcp_complete_cwr(sk);
                // Back to the Open state.
                tcp_set_ca_state(sk, TCP_CA_Open);
        }
        break;

case TCP_CA_Recovery:
        if (tcp_is_reno(tp))
                tcp_reset_reno_sack(tp);
        // Try to undo the congestion state.
        if (tcp_try_undo_recovery(sk))
                return;
        // Complete the window reduction.
        tcp_complete_cwr(sk);
        break;
}

Last, tcp_try_undo_recovery, the function that actually backs out of the congestion state:

static bool tcp_try_undo_recovery(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        // Should the state be undone?
        if (tcp_may_undo(tp)) {
                int mib_idx;

                /* Happy end! We did not retransmit anything
                 * or our original transmission succeeded.
                 */
                DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
                // Undo the cwnd/ssthresh changes.
                tcp_undo_cwr(sk, true);
                if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
                        mib_idx = LINUX_MIB_TCPLOSSUNDO;
                else
                        mib_idx = LINUX_MIB_TCPFULLUNDO;

                NET_INC_STATS_BH(sock_net(sk), mib_idx);
                tp->undo_marker = 0;
        }
        if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
                /* Hold old state until something *above* high_seq
                 * is ACKed. For Reno it is MUST to prevent false
                 * fast retransmits (RFC2582). SACK TCP is safe. */
                tcp_moderate_cwnd(tp);
                return true;
        }
        // Set the state back to Open.
        tcp_set_ca_state(sk, TCP_CA_Open);
        return false;
}