1 |
/* |
2 |
* MPTCP implementation - Sending side |
3 |
* |
4 |
* Initial Design & Implementation: |
5 |
* Sébastien Barré <sebastien.barre@uclouvain.be> |
6 |
* |
7 |
* Current Maintainer & Author: |
8 |
* Christoph Paasch <christoph.paasch@uclouvain.be> |
9 |
* |
10 |
* Additional authors: |
11 |
* Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> |
12 |
* Gregory Detal <gregory.detal@uclouvain.be> |
13 |
* Fabien Duchêne <fabien.duchene@uclouvain.be> |
14 |
* Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> |
15 |
* Lavkesh Lahngir <lavkesh51@gmail.com> |
16 |
* Andreas Ripke <ripke@neclab.eu> |
17 |
* Vlad Dogaru <vlad.dogaru@intel.com> |
18 |
* Octavian Purdila <octavian.purdila@intel.com> |
19 |
* John Ronan <jronan@tssg.org> |
20 |
* Catalin Nicutar <catalin.nicutar@gmail.com> |
21 |
* Brandon Heller <brandonh@stanford.edu> |
22 |
* |
23 |
* |
24 |
* This program is free software; you can redistribute it and/or |
25 |
* modify it under the terms of the GNU General Public License |
26 |
* as published by the Free Software Foundation; either version |
27 |
* 2 of the License, or (at your option) any later version. |
28 |
*/ |
29 |
|
30 |
#include <linux/kconfig.h> |
31 |
#include <linux/skbuff.h> |
32 |
#include <linux/tcp.h> |
33 |
|
34 |
#include <net/mptcp.h> |
35 |
#include <net/mptcp_v4.h> |
36 |
#include <net/mptcp_v6.h> |
37 |
#include <net/sock.h> |
38 |
|
39 |
/* Is the sub-socket sk available to send the skb? */ |
40 |
static int mptcp_is_available(struct sock *sk, struct sk_buff *skb, |
41 |
unsigned int *mss) |
42 |
{ |
43 |
struct tcp_sock *tp = tcp_sk(sk); |
44 |
unsigned int mss_now; |
45 |
|
46 |
/* Set of states for which we are allowed to send data */ |
47 |
if (!mptcp_sk_can_send(sk)) |
48 |
return 0; |
49 |
|
50 |
/* We do not send data on this subflow unless it is |
51 |
* fully established, i.e. the 4th ack has been received. |
52 |
*/ |
53 |
if (tp->mptcp->pre_established) |
54 |
return 0; |
55 |
|
56 |
if (tp->pf || |
57 |
(tp->mpcb->noneligible & mptcp_pi_to_flag(tp->mptcp->path_index))) |
58 |
return 0; |
59 |
|
60 |
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { |
61 |
/* If SACK is disabled, and we got a loss, TCP does not exit |
62 |
* the loss-state until something above high_seq has been acked. |
63 |
* (see tcp_try_undo_recovery) |
64 |
* |
65 |
* high_seq is the snd_nxt at the moment of the RTO. As soon |
66 |
* as we have an RTO, we won't push data on the subflow. |
67 |
* Thus, snd_una can never go beyond high_seq. |
68 |
*/ |
69 |
if (!tcp_is_reno(tp)) |
70 |
return 0; |
71 |
else if (tp->snd_una != tp->high_seq) |
72 |
return 0; |
73 |
} |
74 |
|
75 |
if (!tp->mptcp->fully_established) { |
76 |
/* Make sure that we send in-order data */ |
77 |
if (skb && tp->mptcp->second_packet && |
78 |
tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) |
79 |
return 0; |
80 |
} |
81 |
|
82 |
if (!tcp_cwnd_test(tp, skb)) |
83 |
return 0; |
84 |
|
85 |
mss_now = tcp_current_mss(sk); |
86 |
/* Don't send on this subflow if we bypass the allowed send-window at |
87 |
* the per-subflow level. Similar to tcp_snd_wnd_test, but manually |
88 |
* calculated end_seq (because at this point end_seq is still at |
89 |
* the meta-level). |
90 |
*/ |
91 |
if (skb && after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp))) |
92 |
return 0; |
93 |
|
94 |
if (mss) |
95 |
*mss = mss_now; |
96 |
|
97 |
return 1; |
98 |
} |
99 |
|
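/* Illustrative sketch (not part of the original code): the per-subflow
 * send-window check above works on the subflow's write_seq because the
 * skb still carries meta-level sequence numbers at this point. Assuming,
 * for example:
 *
 *	tp->write_seq   = 1000
 *	skb->len        = 3000, mss_now = 1400
 *	tcp_wnd_end(tp) = 2000
 *
 * then write_seq + min(len, mss) = 1000 + 1400 = 2400 lies beyond the
 * subflow's window end, so the subflow is reported as unavailable even
 * though its congestion window might still have room.
 */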
100 |
/* Are we not allowed to reinject this skb on tp? */ |
101 |
static int mptcp_dont_reinject_skb(struct tcp_sock *tp, struct sk_buff *skb) |
102 |
{ |
103 |
/* If the skb has already been enqueued in this sk, try to find |
104 |
* another one. |
105 |
* An exception is a DATA_FIN without data. These ones are not |
106 |
* reinjected at the subflow-level as they do not consume |
107 |
* subflow-sequence-number space. |
108 |
*/ |
109 |
return skb && |
110 |
/* We either have a data_fin with data or not a data_fin */ |
111 |
((mptcp_is_data_fin(skb) && TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq > 1) || |
112 |
!mptcp_is_data_fin(skb)) && |
113 |
/* Has the skb already been enqueued into this subsocket? */ |
114 |
mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; |
115 |
} |
116 |
|
117 |
/* This is the scheduler. This function decides on which flow to send |
118 |
* a given MSS. If all subflows are found to be busy, NULL is returned. |
119 |
* The flow is selected based on the shortest RTT. |
120 |
* If all paths have full cong windows, we simply return NULL. |
121 |
* |
122 |
* Additionally, this function is aware of the backup-subflows. |
123 |
*/ |
124 |
static struct sock *get_available_subflow(struct sock *meta_sk, |
125 |
struct sk_buff *skb, |
126 |
unsigned int *mss_now) |
127 |
{ |
128 |
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
129 |
struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL; |
130 |
unsigned int mss = 0, mss_lowprio = 0, mss_backup = 0; |
131 |
u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff; |
132 |
int cnt_backups = 0; |
133 |
|
134 |
/* if there is only one subflow, bypass the scheduling function */ |
135 |
if (mpcb->cnt_subflows == 1) { |
136 |
bestsk = (struct sock *)mpcb->connection_list; |
137 |
if (!mptcp_is_available(bestsk, skb, mss_now)) |
138 |
bestsk = NULL; |
139 |
return bestsk; |
140 |
} |
141 |
|
142 |
/* Answer data_fin on same subflow!!! */ |
143 |
if (meta_sk->sk_shutdown & RCV_SHUTDOWN && |
144 |
skb && mptcp_is_data_fin(skb)) { |
145 |
mptcp_for_each_sk(mpcb, sk) { |
146 |
if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && |
147 |
mptcp_is_available(sk, skb, mss_now)) |
148 |
return sk; |
149 |
} |
150 |
} |
151 |
|
152 |
/* First, find the best subflow */ |
153 |
mptcp_for_each_sk(mpcb, sk) { |
154 |
struct tcp_sock *tp = tcp_sk(sk); |
155 |
int this_mss; |
156 |
|
157 |
if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) |
158 |
cnt_backups++; |
159 |
|
160 |
if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && |
161 |
tp->srtt < lowprio_min_time_to_peer) { |
162 |
|
163 |
if (!mptcp_is_available(sk, skb, &this_mss)) |
164 |
continue; |
165 |
|
166 |
if (mptcp_dont_reinject_skb(tp, skb)) { |
167 |
mss_backup = this_mss; |
168 |
backupsk = sk; |
169 |
continue; |
170 |
} |
171 |
|
172 |
lowprio_min_time_to_peer = tp->srtt; |
173 |
lowpriosk = sk; |
174 |
mss_lowprio = this_mss; |
175 |
} else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && |
176 |
tp->srtt < min_time_to_peer) { |
177 |
if (!mptcp_is_available(sk, skb, &this_mss)) |
178 |
continue; |
179 |
|
180 |
if (mptcp_dont_reinject_skb(tp, skb)) { |
181 |
mss_backup = this_mss; |
182 |
backupsk = sk; |
183 |
continue; |
184 |
} |
185 |
|
186 |
min_time_to_peer = tp->srtt; |
187 |
bestsk = sk; |
188 |
mss = this_mss; |
189 |
} |
190 |
} |
191 |
|
192 |
if (mpcb->cnt_established == cnt_backups && lowpriosk) { |
193 |
mss = mss_lowprio; |
194 |
sk = lowpriosk; |
195 |
} else if (bestsk) { |
196 |
sk = bestsk; |
197 |
} else if (backupsk) { |
198 |
/* It has been sent on all subflows once - let's give it a |
199 |
* chance again by restarting its pathmask. |
200 |
*/ |
201 |
if (skb) |
202 |
TCP_SKB_CB(skb)->path_mask = 0; |
203 |
mss = mss_backup; |
204 |
sk = backupsk; |
205 |
} |
206 |
|
207 |
if (mss_now) |
208 |
*mss_now = mss; |
209 |
|
210 |
return sk; |
211 |
} |
212 |
|
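/* Scheduling summary (illustrative, derived from the code above): the
 * scheduler prefers, in order,
 *
 *	1. the lowest-RTT non-backup subflow that is available and has not
 *	   yet carried this skb,
 *	2. the lowest-RTT backup subflow under the same conditions, but
 *	   only if every established subflow is a backup,
 *	3. any available subflow the skb has already been sent on; in that
 *	   case the path_mask is reset so the skb may be reinjected.
 *
 * For example (hypothetical values), with subflow A: srtt 40ms marked as
 * backup and subflow B: srtt 120ms regular, B is chosen as long as it is
 * available, because regular subflows win over backups whenever at least
 * one non-backup subflow is established.
 */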
213 |
static struct mp_dss *mptcp_skb_find_dss(const struct sk_buff *skb) |
214 |
{ |
215 |
if (!mptcp_is_data_seq(skb)) |
216 |
return NULL; |
217 |
|
218 |
return (struct mp_dss *)(skb->data - (MPTCP_SUB_LEN_DSS_ALIGN + |
219 |
MPTCP_SUB_LEN_ACK_ALIGN + |
220 |
MPTCP_SUB_LEN_SEQ_ALIGN)); |
221 |
} |
222 |
|
223 |
/* get the data-seq and end-data-seq and store them again in the |
224 |
* tcp_skb_cb |
225 |
*/ |
226 |
static int mptcp_reconstruct_mapping(struct sk_buff *skb, struct sk_buff *orig_skb) |
227 |
{ |
228 |
struct mp_dss *mpdss = mptcp_skb_find_dss(orig_skb); |
229 |
u32 *p32; |
230 |
u16 *p16; |
231 |
|
232 |
if (!mpdss || !mpdss->M) |
233 |
return 1; |
234 |
|
235 |
/* Move the pointer to the data-seq */ |
236 |
p32 = (u32 *)mpdss; |
237 |
p32++; |
238 |
if (mpdss->A) { |
239 |
p32++; |
240 |
if (mpdss->a) |
241 |
p32++; |
242 |
} |
243 |
|
244 |
TCP_SKB_CB(skb)->seq = ntohl(*p32); |
245 |
|
246 |
/* Get the data_len to calculate the end_data_seq */ |
247 |
p32++; |
248 |
p32++; |
249 |
p16 = (u16 *)p32; |
250 |
TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq; |
251 |
|
252 |
return 0; |
253 |
} |
254 |
|
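/* DSS layout sketch (illustrative, following the MPTCP draft/RFC 6824):
 * for the common case written by mptcp_skb_entail below (A=1, a=0, M=1,
 * m=0) the option sits in the headroom right before skb->data as
 *
 *	byte  0: kind, length
 *	byte  2: subtype + flags (F|m|M|a|A)
 *	byte  4: data_ack          (4 bytes, skipped via "if (mpdss->A)")
 *	byte  8: data_seq          (read into TCP_SKB_CB(skb)->seq)
 *	byte 12: subflow_seq       (4 bytes, skipped)
 *	byte 16: data-level length (2 bytes, gives end_seq - seq)
 *
 * With an 8-byte data_ack (mpdss->a set) the walk above skips one more
 * 32-bit word before reaching data_seq.
 */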
255 |
/* Similar to __pskb_copy and sk_stream_alloc_skb. */ |
256 |
static struct sk_buff *mptcp_pskb_copy(struct sk_buff *skb) |
257 |
{ |
258 |
struct sk_buff *n; |
259 |
/* The TCP header must be at least 32-bit aligned. */ |
260 |
int size = ALIGN(skb_headlen(skb), 4); |
261 |
|
262 |
n = alloc_skb_fclone(size + MAX_TCP_HEADER, GFP_ATOMIC); |
263 |
if (!n) |
264 |
return NULL; |
265 |
|
266 |
/* Set the data pointer */ |
267 |
skb_reserve(n, MAX_TCP_HEADER); |
268 |
/* Set the tail pointer and length */ |
269 |
skb_put(n, skb_headlen(skb)); |
270 |
/* Copy the bytes */ |
271 |
skb_copy_from_linear_data(skb, n->data, n->len); |
272 |
|
273 |
n->truesize += skb->data_len; |
274 |
n->data_len = skb->data_len; |
275 |
n->len = skb->len; |
276 |
|
277 |
if (skb_shinfo(skb)->nr_frags) { |
278 |
int i; |
279 |
|
280 |
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { |
281 |
if (skb_copy_ubufs(skb, GFP_ATOMIC)) { |
282 |
kfree_skb(n); |
283 |
n = NULL; |
284 |
goto out; |
285 |
} |
286 |
} |
287 |
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
288 |
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; |
289 |
skb_frag_ref(skb, i); |
290 |
} |
291 |
skb_shinfo(n)->nr_frags = i; |
292 |
} |
293 |
|
294 |
if (skb_has_frag_list(skb)) { |
295 |
skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; |
296 |
skb_clone_fraglist(n); |
297 |
} |
298 |
|
299 |
copy_skb_header(n, skb); |
300 |
out: |
301 |
return n; |
302 |
} |
303 |
|
304 |
/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are |
305 |
* coming from the meta-retransmit-timer |
306 |
*/ |
307 |
static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk, |
308 |
struct sock *sk, int clone_it) |
309 |
{ |
310 |
struct sk_buff *skb, *skb1; |
311 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
312 |
struct mptcp_cb *mpcb = meta_tp->mpcb; |
313 |
u32 seq, end_seq; |
314 |
|
315 |
if (clone_it) { |
316 |
/* pskb_copy is necessary here, because the TCP/IP-headers |
317 |
* will be changed when it's going to be reinjected on another |
318 |
* subflow. |
319 |
*/ |
320 |
skb = mptcp_pskb_copy(orig_skb); |
321 |
} else { |
322 |
__skb_unlink(orig_skb, &sk->sk_write_queue); |
323 |
sock_set_flag(sk, SOCK_QUEUE_SHRUNK); |
324 |
sk->sk_wmem_queued -= orig_skb->truesize; |
325 |
sk_mem_uncharge(sk, orig_skb->truesize); |
326 |
skb = orig_skb; |
327 |
} |
328 |
if (unlikely(!skb)) |
329 |
return; |
330 |
|
331 |
if (sk && mptcp_reconstruct_mapping(skb, orig_skb)) { |
332 |
__kfree_skb(skb); |
333 |
return; |
334 |
} |
335 |
|
336 |
skb->sk = meta_sk; |
337 |
|
338 |
/* If it reached already the destination, we don't have to reinject it */ |
339 |
if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { |
340 |
__kfree_skb(skb); |
341 |
return; |
342 |
} |
343 |
|
344 |
/* Only reinject segments that are fully covered by the mapping */ |
345 |
if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) != |
346 |
TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { |
347 |
u32 seq = TCP_SKB_CB(skb)->seq; |
348 |
u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
349 |
|
350 |
__kfree_skb(skb); |
351 |
|
352 |
/* Ok, now we have to look for the full mapping in the meta |
353 |
* send-queue :S |
354 |
*/ |
355 |
tcp_for_write_queue(skb, meta_sk) { |
356 |
/* Not yet at the mapping? */ |
357 |
if (before(TCP_SKB_CB(skb)->seq, seq)) |
358 |
continue; |
359 |
/* We have passed by the mapping */ |
360 |
if (after(TCP_SKB_CB(skb)->end_seq, end_seq)) |
361 |
return; |
362 |
|
363 |
__mptcp_reinject_data(skb, meta_sk, NULL, 1); |
364 |
} |
365 |
return; |
366 |
} |
367 |
|
368 |
/* If it's empty, just add */ |
369 |
if (skb_queue_empty(&mpcb->reinject_queue)) { |
370 |
skb_queue_head(&mpcb->reinject_queue, skb); |
371 |
return; |
372 |
} |
373 |
|
374 |
/* Find the place to insert skb - or we may even 'drop' it, as the |
375 |
* data is already covered by other skb's in the reinject-queue. |
376 |
* |
377 |
* This is inspired by code from tcp_data_queue. |
378 |
*/ |
379 |
|
380 |
skb1 = skb_peek_tail(&mpcb->reinject_queue); |
381 |
seq = TCP_SKB_CB(skb)->seq; |
382 |
while (1) { |
383 |
if (!after(TCP_SKB_CB(skb1)->seq, seq)) |
384 |
break; |
385 |
if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) { |
386 |
skb1 = NULL; |
387 |
break; |
388 |
} |
389 |
skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); |
390 |
} |
391 |
|
392 |
/* Does the skb overlap the previous one? */ |
393 |
end_seq = TCP_SKB_CB(skb)->end_seq; |
394 |
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { |
395 |
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { |
396 |
/* All the bits are present. Don't reinject */ |
397 |
__kfree_skb(skb); |
398 |
return; |
399 |
} |
400 |
if (seq == TCP_SKB_CB(skb1)->seq) { |
401 |
if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) |
402 |
skb1 = NULL; |
403 |
else |
404 |
skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); |
405 |
} |
406 |
} |
407 |
if (!skb1) |
408 |
__skb_queue_head(&mpcb->reinject_queue, skb); |
409 |
else |
410 |
__skb_queue_after(&mpcb->reinject_queue, skb1, skb); |
411 |
|
412 |
/* And clean segments covered by new one as whole. */ |
413 |
while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) { |
414 |
skb1 = skb_queue_next(&mpcb->reinject_queue, skb); |
415 |
|
416 |
if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) |
417 |
break; |
418 |
|
419 |
__skb_unlink(skb1, &mpcb->reinject_queue); |
420 |
__kfree_skb(skb1); |
421 |
} |
422 |
return; |
423 |
} |
424 |
|
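/* Reinject-queue insertion, worked example (illustrative): suppose the
 * queue already holds the data-level ranges [100,200) and [200,300) and a
 * segment covering [150,250) is reinjected.  The backwards scan stops at
 * [100,200) (the last segment whose seq is not after 150); since 250 lies
 * beyond its end_seq the new skb is not dropped, it is linked after
 * [100,200), and the forward clean-up pass then frees [200,300) because
 * its start falls before the new end_seq.  A segment covering [120,180)
 * instead would be freed immediately as fully covered.
 */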
425 |
/* Inserts data into the reinject queue */ |
426 |
void mptcp_reinject_data(struct sock *sk, int clone_it) |
427 |
{ |
428 |
struct sk_buff *skb_it, *tmp; |
429 |
struct tcp_sock *tp = tcp_sk(sk); |
430 |
struct sock *meta_sk = tp->meta_sk; |
431 |
|
432 |
/* It has already been closed - there is really no point in reinjecting */ |
433 |
if (meta_sk->sk_state == TCP_CLOSE) |
434 |
return; |
435 |
|
436 |
skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { |
437 |
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); |
438 |
/* Subflow syn's and fin's are not reinjected. |
439 |
* |
440 |
* Neither are empty subflow-fins carrying a data-fin. |
441 |
* They are reinjected below (without the subflow-fin-flag). |
442 |
*/ |
443 |
if (tcb->tcp_flags & TCPHDR_SYN || |
444 |
(tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || |
445 |
(tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) |
446 |
continue; |
447 |
|
448 |
__mptcp_reinject_data(skb_it, meta_sk, sk, clone_it); |
449 |
} |
450 |
|
451 |
skb_it = tcp_write_queue_tail(meta_sk); |
452 |
/* If sk has sent the empty data-fin, we have to reinject it too. */ |
453 |
if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && |
454 |
TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) { |
455 |
__mptcp_reinject_data(skb_it, meta_sk, NULL, 1); |
456 |
} |
457 |
|
458 |
mptcp_push_pending_frames(meta_sk); |
459 |
|
460 |
tp->pf = 1; |
461 |
} |
462 |
|
463 |
|
464 |
static void mptcp_combine_dfin(struct sk_buff *skb, struct sock *meta_sk, |
465 |
struct sock *subsk) |
466 |
{ |
467 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
468 |
struct mptcp_cb *mpcb = meta_tp->mpcb; |
469 |
struct sock *sk_it; |
470 |
int all_empty = 1, all_acked; |
471 |
|
472 |
/* In infinite mapping we always try to combine */ |
473 |
if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) { |
474 |
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; |
475 |
return; |
476 |
} |
477 |
|
478 |
/* Don't combine, if they didn't combine - otherwise we end up in |
479 |
* TIME_WAIT, even if our app is smart enough to avoid it |
480 |
*/ |
481 |
if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { |
482 |
if (!mpcb->dfin_combined) |
483 |
return; |
484 |
} |
485 |
|
486 |
/* If no other subflow has data to send, we can combine */ |
487 |
mptcp_for_each_sk(mpcb, sk_it) { |
488 |
if (!mptcp_sk_can_send(sk_it)) |
489 |
continue; |
490 |
|
491 |
if (!tcp_write_queue_empty(sk_it)) |
492 |
all_empty = 0; |
493 |
} |
494 |
|
495 |
/* If all data has been DATA_ACKed, we can combine. |
496 |
* -1, because the data_fin consumed one byte |
497 |
*/ |
498 |
all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1)); |
499 |
|
500 |
if ((all_empty || all_acked) && tcp_close_state(subsk)) |
501 |
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; |
502 |
} |
503 |
|
504 |
static struct sk_buff *mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, |
505 |
int reinject) |
506 |
{ |
507 |
__be32 *ptr; |
508 |
__u16 data_len; |
509 |
struct mp_dss *mdss; |
510 |
struct tcp_sock *tp = tcp_sk(sk); |
511 |
struct sock *meta_sk = mptcp_meta_sk(sk); |
512 |
struct mptcp_cb *mpcb = tp->mpcb; |
513 |
struct tcp_skb_cb *tcb; |
514 |
struct sk_buff *subskb = NULL; |
515 |
|
516 |
if (!reinject) |
517 |
TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? |
518 |
MPTCPHDR_SEQ64_INDEX : 0); |
519 |
|
520 |
subskb = mptcp_pskb_copy(skb); |
521 |
if (!subskb) |
522 |
return NULL; |
523 |
|
524 |
TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); |
525 |
|
526 |
if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) && |
527 |
skb->ip_summed == CHECKSUM_PARTIAL) { |
528 |
subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0); |
529 |
subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE; |
530 |
} |
531 |
|
532 |
/* The subskb is going in the subflow send-queue. Its path-mask |
533 |
* is not needed anymore and MUST be set to 0, as the path-mask |
534 |
* is a union with inet_skb_param. |
535 |
*/ |
536 |
tcb = TCP_SKB_CB(subskb); |
537 |
tcb->path_mask = 0; |
538 |
|
539 |
if (mptcp_is_data_fin(subskb)) |
540 |
mptcp_combine_dfin(subskb, meta_sk, sk); |
541 |
|
542 |
if (tp->mpcb->infinite_mapping_snd) |
543 |
goto no_data_seq; |
544 |
|
545 |
if (tp->mpcb->send_infinite_mapping && |
546 |
!before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) { |
547 |
tp->mptcp->fully_established = 1; |
548 |
tp->mpcb->infinite_mapping_snd = 1; |
549 |
tp->mptcp->infinite_cutoff_seq = tp->write_seq; |
550 |
tcb->mptcp_flags |= MPTCPHDR_INF; |
551 |
data_len = 0; |
552 |
} else { |
553 |
data_len = tcb->end_seq - tcb->seq; |
554 |
} |
555 |
|
556 |
/**** Write MPTCP DSS-option to the packet. ****/ |
557 |
ptr = (__be32 *)(subskb->data - (MPTCP_SUB_LEN_DSS_ALIGN + |
558 |
MPTCP_SUB_LEN_ACK_ALIGN + |
559 |
MPTCP_SUB_LEN_SEQ_ALIGN)); |
560 |
|
561 |
/* Then we start writing it from the start */ |
562 |
mdss = (struct mp_dss *)ptr; |
563 |
|
564 |
mdss->kind = TCPOPT_MPTCP; |
565 |
mdss->sub = MPTCP_SUB_DSS; |
566 |
mdss->rsv1 = 0; |
567 |
mdss->rsv2 = 0; |
568 |
mdss->F = (mptcp_is_data_fin(subskb) ? 1 : 0); |
569 |
mdss->m = 0; |
570 |
mdss->M = 1; |
571 |
mdss->a = 0; |
572 |
mdss->A = 1; |
573 |
mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); |
574 |
|
575 |
ptr++; |
576 |
ptr++; /* data_ack will be set in mptcp_options_write */ |
577 |
*ptr++ = htonl(tcb->seq); /* data_seq */ |
578 |
|
579 |
/* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */ |
580 |
if (mptcp_is_data_fin(subskb) && subskb->len == 0) |
581 |
*ptr++ = 0; /* subseq */ |
582 |
else |
583 |
*ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */ |
584 |
|
585 |
if (tp->mpcb->dss_csum && data_len) { |
586 |
__be16 *p16 = (__be16 *)ptr; |
587 |
__be32 hdseq = mptcp_get_highorder_sndbits(subskb, tp->mpcb); |
588 |
__wsum csum; |
589 |
*ptr = htonl(((data_len) << 16) | |
590 |
(TCPOPT_EOL << 8) | |
591 |
(TCPOPT_EOL)); |
592 |
|
593 |
csum = csum_partial(ptr - 2, 12, subskb->csum); |
594 |
p16++; |
595 |
*p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum)); |
596 |
} else { |
597 |
*ptr++ = htonl(((data_len) << 16) | |
598 |
(TCPOPT_NOP << 8) | |
599 |
(TCPOPT_NOP)); |
600 |
} |
601 |
|
602 |
no_data_seq: |
603 |
tcb->seq = tp->write_seq; |
604 |
tcb->sacked = 0; /* reset the sacked field: from the point of view |
605 |
* of this subflow, we are sending a brand new |
606 |
* segment */ |
607 |
/* Take into account seg len */ |
608 |
tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0); |
609 |
tcb->end_seq = tp->write_seq; |
610 |
|
611 |
/* If it's a non-payload DATA_FIN (also no subflow-fin), the |
612 |
* segment is not part of the subflow but on a meta-only-level |
613 |
*/ |
614 |
if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) { |
615 |
tcp_add_write_queue_tail(sk, subskb); |
616 |
sk->sk_wmem_queued += subskb->truesize; |
617 |
sk_mem_charge(sk, subskb->truesize); |
618 |
} |
619 |
|
620 |
return subskb; |
621 |
} |
622 |
|
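/* Option words written above (illustrative): for a regular data segment
 * without DSS checksumming the 32-bit words placed in front of the
 * subskb's payload end up as
 *
 *	word 0: kind | len | subtype/flags   (F/m/M/a/A as set above)
 *	word 1: (data_ack, filled in later by mptcp_options_write)
 *	word 2: data_seq    = meta-level tcb->seq
 *	word 3: subflow_seq = write_seq - snt_isn (0 for a pure DATA_FIN)
 *	word 4: data-level length | TCPOPT_NOP | TCPOPT_NOP
 *
 * When dss_csum is enabled, word 4 instead carries the length followed by
 * the checksum computed over the DSS pseudo-header.
 */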
623 |
static void mptcp_sub_event_new_data_sent(struct sock *sk, |
624 |
struct sk_buff *subskb, |
625 |
struct sk_buff *skb) |
626 |
{ |
627 |
/* If it's a non-payload DATA_FIN (also no subflow-fin), the |
628 |
* segment is not part of the subflow but on a meta-only-level |
629 |
* |
630 |
* We free it, because it has been queued nowhere. |
631 |
*/ |
632 |
if (!mptcp_is_data_fin(subskb) || |
633 |
(TCP_SKB_CB(subskb)->end_seq != TCP_SKB_CB(subskb)->seq)) { |
634 |
tcp_event_new_data_sent(sk, subskb); |
635 |
tcp_sk(sk)->mptcp->second_packet = 1; |
636 |
tcp_sk(sk)->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; |
637 |
} else { |
638 |
kfree_skb(subskb); |
639 |
} |
640 |
} |
641 |
|
642 |
/* Handle the packets and sockets after a tcp_transmit_skb failed */ |
643 |
static void mptcp_transmit_skb_failed(struct sock *sk, struct sk_buff *skb, |
644 |
struct sk_buff *subskb, int reinject) |
645 |
{ |
646 |
struct tcp_sock *tp = tcp_sk(sk); |
647 |
struct mptcp_cb *mpcb = tp->mpcb; |
648 |
|
649 |
/* No work to do if we are in infinite mapping mode. |
650 |
* There is only one subflow left and we cannot send this segment on |
651 |
* another subflow. |
652 |
*/ |
653 |
if (mpcb->infinite_mapping_snd) |
654 |
return; |
655 |
|
656 |
TCP_SKB_CB(skb)->path_mask &= ~mptcp_pi_to_flag(tp->mptcp->path_index); |
657 |
|
658 |
if (TCP_SKB_CB(subskb)->tcp_flags & TCPHDR_FIN) { |
659 |
/* If it is a subflow-fin we must leave it on the |
660 |
* subflow-send-queue, so that the probe-timer |
661 |
* can retransmit it. |
662 |
*/ |
663 |
if (!tp->packets_out && !inet_csk(sk)->icsk_pending) |
664 |
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
665 |
inet_csk(sk)->icsk_rto, TCP_RTO_MAX); |
666 |
} else if (mptcp_is_data_fin(subskb) && |
667 |
TCP_SKB_CB(subskb)->end_seq == TCP_SKB_CB(subskb)->seq) { |
668 |
/* An empty data-fin has not been enqueued on the subflow |
669 |
* and thus we free it. |
670 |
*/ |
671 |
|
672 |
kfree_skb(subskb); |
673 |
} else { |
674 |
/* In all other cases we remove it from the sub-queue. |
675 |
* Other subflows may send it, or the probe-timer will |
676 |
* handle it. |
677 |
*/ |
678 |
tcp_advance_send_head(sk, subskb); |
679 |
|
680 |
/* tcp_add_write_queue_tail initialized highest_sack. We have |
681 |
* to reset it, if necessary. |
682 |
*/ |
683 |
if (tp->highest_sack == subskb) |
684 |
tp->highest_sack = NULL; |
685 |
|
686 |
tcp_unlink_write_queue(subskb, sk); |
687 |
tp->write_seq -= subskb->len; |
688 |
sk_wmem_free_skb(sk, subskb); |
689 |
} |
690 |
} |
691 |
|
692 |
/* Function to create two new TCP segments. Shrinks the given segment |
693 |
* to the specified size and appends a new segment with the rest of the |
694 |
* packet to the list. This won't be called frequently, I hope. |
695 |
* Remember, these are still headerless SKBs at this point. |
696 |
*/ |
697 |
int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, |
698 |
unsigned int mss_now, int reinject) |
699 |
{ |
700 |
struct tcp_sock *tp = tcp_sk(sk); |
701 |
struct sk_buff *buff; |
702 |
int nsize, old_factor; |
703 |
int nlen; |
704 |
u8 flags; |
705 |
int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + |
706 |
MPTCP_SUB_LEN_SEQ_ALIGN; |
707 |
char dss[dsslen]; |
708 |
|
709 |
if (WARN_ON(len > skb->len)) |
710 |
return -EINVAL; |
711 |
|
712 |
/* DSS-option must be recovered afterwards. */ |
713 |
if (!is_meta_sk(sk)) |
714 |
memcpy(dss, skb->data - dsslen, dsslen); |
715 |
|
716 |
nsize = skb_headlen(skb) - len; |
717 |
if (nsize < 0) |
718 |
nsize = 0; |
719 |
|
720 |
if (skb_cloned(skb) && |
721 |
skb_is_nonlinear(skb)) { |
722 |
if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) |
723 |
return -ENOMEM; |
724 |
/* Recover dss-option */ |
725 |
if (!is_meta_sk(sk)) |
726 |
memcpy(skb->data - dsslen, dss, dsslen); |
727 |
} |
728 |
|
729 |
/* Get a new skb... force flag on. */ |
730 |
buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); |
731 |
if (buff == NULL) |
732 |
return -ENOMEM; /* We'll just try again later. */ |
733 |
|
734 |
/* See below - if reinject == 1, the buff will be added to the reinject- |
735 |
* queue, which is currently not part of the memory-accounting. |
736 |
*/ |
737 |
if (reinject != 1) { |
738 |
sk->sk_wmem_queued += buff->truesize; |
739 |
sk_mem_charge(sk, buff->truesize); |
740 |
} |
741 |
nlen = skb->len - len - nsize; |
742 |
buff->truesize += nlen; |
743 |
skb->truesize -= nlen; |
744 |
|
745 |
/* Correct the sequence numbers. */ |
746 |
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; |
747 |
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; |
748 |
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; |
749 |
|
750 |
/* PSH and FIN should only be set in the second packet. */ |
751 |
flags = TCP_SKB_CB(skb)->tcp_flags; |
752 |
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); |
753 |
TCP_SKB_CB(buff)->tcp_flags = flags; |
754 |
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; |
755 |
|
756 |
flags = TCP_SKB_CB(skb)->mptcp_flags; |
757 |
TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); |
758 |
TCP_SKB_CB(buff)->mptcp_flags = flags; |
759 |
|
760 |
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { |
761 |
/* Copy and checksum data tail into the new buffer. */ |
762 |
buff->csum = csum_partial_copy_nocheck(skb->data + len, |
763 |
skb_put(buff, nsize), |
764 |
nsize, 0); |
765 |
|
766 |
skb_trim(skb, len); |
767 |
|
768 |
skb->csum = csum_block_sub(skb->csum, buff->csum, len); |
769 |
} else { |
770 |
skb->ip_summed = CHECKSUM_PARTIAL; |
771 |
skb_split(skb, buff, len); |
772 |
} |
773 |
|
774 |
/* We lost the dss-option when creating buff - put it back! */ |
775 |
if (!is_meta_sk(sk)) |
776 |
memcpy(buff->data - dsslen, dss, dsslen); |
777 |
|
778 |
buff->ip_summed = skb->ip_summed; |
779 |
|
780 |
/* Looks stupid, but our code really uses the 'when' field of |
781 |
* skbs, which it never sent before. --ANK |
782 |
*/ |
783 |
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; |
784 |
buff->tstamp = skb->tstamp; |
785 |
|
786 |
old_factor = tcp_skb_pcount(skb); |
787 |
|
788 |
/* Fix up tso_factor for both original and new SKB. */ |
789 |
tcp_set_skb_tso_segs(sk, skb, mss_now); |
790 |
tcp_set_skb_tso_segs(sk, buff, mss_now); |
791 |
|
792 |
/* If this packet has been sent out already, we must |
793 |
* adjust the various packet counters. |
794 |
*/ |
795 |
if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) { |
796 |
int diff = old_factor - tcp_skb_pcount(skb) - |
797 |
tcp_skb_pcount(buff); |
798 |
|
799 |
if (diff) |
800 |
tcp_adjust_pcount(sk, skb, diff); |
801 |
} |
802 |
|
803 |
/* Link BUFF into the send queue. */ |
804 |
skb_header_release(buff); |
805 |
if (reinject == 1) |
806 |
__skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff); |
807 |
else |
808 |
tcp_insert_write_queue_after(skb, buff, sk); |
809 |
|
810 |
return 0; |
811 |
} |
812 |
|
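/* Illustrative note: the mapping option lives in the headroom in front of
 * skb->data, so it is saved into the local dss[] buffer before anything
 * can reallocate the head and is copied back both into the original skb
 * and into buff.  A typical caller is mptcp_write_wakeup() below, which
 * trims a segment to the offered window, e.g. splitting a 2800-byte skb
 * at len = 1400 so that only the first MSS-sized part is pushed out while
 * [seq+1400, end_seq) stays queued (or on the reinject-queue when
 * reinject == 1).
 */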
813 |
int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, |
814 |
unsigned int mss_now, gfp_t gfp, int reinject) |
815 |
{ |
816 |
struct sk_buff *buff; |
817 |
int nlen = skb->len - len, old_factor; |
818 |
u8 flags; |
819 |
int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + |
820 |
MPTCP_SUB_LEN_SEQ_ALIGN; |
821 |
|
822 |
/* All of a TSO frame must be composed of paged data. */ |
823 |
if (skb->len != skb->data_len) |
824 |
return mptcp_fragment(sk, skb, len, mss_now, reinject); |
825 |
|
826 |
buff = sk_stream_alloc_skb(sk, 0, gfp); |
827 |
if (unlikely(buff == NULL)) |
828 |
return -ENOMEM; |
829 |
|
830 |
/* See below - if reinject == 1, the buff will be added to the reinject- |
831 |
* queue, which is currently not part of the memory-accounting. |
832 |
*/ |
833 |
if (reinject != 1) { |
834 |
sk->sk_wmem_queued += buff->truesize; |
835 |
sk_mem_charge(sk, buff->truesize); |
836 |
} |
837 |
buff->truesize += nlen; |
838 |
skb->truesize -= nlen; |
839 |
|
840 |
/* Correct the sequence numbers. */ |
841 |
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; |
842 |
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; |
843 |
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; |
844 |
|
845 |
/* PSH and FIN should only be set in the second packet. */ |
846 |
flags = TCP_SKB_CB(skb)->tcp_flags; |
847 |
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); |
848 |
TCP_SKB_CB(buff)->tcp_flags = flags; |
849 |
|
850 |
flags = TCP_SKB_CB(skb)->mptcp_flags; |
851 |
TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); |
852 |
TCP_SKB_CB(buff)->mptcp_flags = flags; |
853 |
|
854 |
/* This packet was never sent out yet, so no SACK bits. */ |
855 |
TCP_SKB_CB(buff)->sacked = 0; |
856 |
|
857 |
buff->ip_summed = CHECKSUM_PARTIAL; |
858 |
skb->ip_summed = CHECKSUM_PARTIAL; |
859 |
skb_split(skb, buff, len); |
860 |
|
861 |
/* We lost the dss-option when creating buff - put it back! */ |
862 |
if (!is_meta_sk(sk)) |
863 |
memcpy(buff->data - dsslen, skb->data - dsslen, dsslen); |
864 |
|
865 |
old_factor = tcp_skb_pcount(skb); |
866 |
|
867 |
/* Fix up tso_factor for both original and new SKB. */ |
868 |
tcp_set_skb_tso_segs(sk, skb, mss_now); |
869 |
tcp_set_skb_tso_segs(sk, buff, mss_now); |
870 |
|
871 |
/* If this packet has been sent out already, we must |
872 |
* adjust the various packet counters. |
873 |
*/ |
874 |
if (!before(tcp_sk(sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) { |
875 |
int diff = old_factor - tcp_skb_pcount(skb) - |
876 |
tcp_skb_pcount(buff); |
877 |
|
878 |
if (diff) |
879 |
tcp_adjust_pcount(sk, skb, diff); |
880 |
} |
881 |
|
882 |
/* Link BUFF into the send queue. */ |
883 |
skb_header_release(buff); |
884 |
if (reinject == 1) |
885 |
__skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff); |
886 |
else |
887 |
tcp_insert_write_queue_after(skb, buff, sk); |
888 |
|
889 |
return 0; |
890 |
} |
891 |
|
892 |
/* Inspired by tcp_write_wakeup */ |
893 |
int mptcp_write_wakeup(struct sock *meta_sk) |
894 |
{ |
895 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
896 |
struct sk_buff *skb, *subskb; |
897 |
|
898 |
skb = tcp_send_head(meta_sk); |
899 |
if (skb && |
900 |
before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) { |
901 |
int err; |
902 |
unsigned int mss; |
903 |
unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; |
904 |
struct sock *subsk = get_available_subflow(meta_sk, skb, &mss); |
905 |
if (!subsk) |
906 |
return -1; |
907 |
|
908 |
if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) |
909 |
meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; |
910 |
|
911 |
/* We are probing the opening of a window |
912 |
* but the window size is != 0 |
913 |
* must have been a result of SWS avoidance (sender) |
914 |
*/ |
915 |
if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || |
916 |
skb->len > mss) { |
917 |
seg_size = min(seg_size, mss); |
918 |
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; |
919 |
if (mptcp_fragment(meta_sk, skb, seg_size, mss, 0)) |
920 |
return -1; |
921 |
} else if (!tcp_skb_pcount(skb)) { |
922 |
tcp_set_skb_tso_segs(meta_sk, skb, mss); |
923 |
} |
924 |
|
925 |
subskb = mptcp_skb_entail(subsk, skb, 0); |
926 |
if (!subskb) |
927 |
return -1; |
928 |
|
929 |
TCP_SKB_CB(subskb)->tcp_flags |= TCPHDR_PSH; |
930 |
TCP_SKB_CB(skb)->when = tcp_time_stamp; |
931 |
TCP_SKB_CB(subskb)->when = tcp_time_stamp; |
932 |
err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC); |
933 |
if (unlikely(err)) { |
934 |
mptcp_transmit_skb_failed(subsk, skb, subskb, 0); |
935 |
return err; |
936 |
} |
937 |
|
938 |
mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq - |
939 |
TCP_SKB_CB(skb)->seq); |
940 |
tcp_event_new_data_sent(meta_sk, skb); |
941 |
mptcp_sub_event_new_data_sent(subsk, subskb, skb); |
942 |
|
943 |
return 0; |
944 |
} else { |
945 |
struct sock *sk_it; |
946 |
int ans = 0; |
947 |
|
948 |
if (between(meta_tp->snd_up, meta_tp->snd_una + 1, |
949 |
meta_tp->snd_una + 0xFFFF)) { |
950 |
mptcp_for_each_sk(meta_tp->mpcb, sk_it) { |
951 |
if (mptcp_sk_can_send_ack(sk_it)) |
952 |
tcp_xmit_probe_skb(sk_it, 1); |
953 |
} |
954 |
} |
955 |
|
956 |
/* At least one of the tcp_xmit_probe_skb's has to succeed */ |
957 |
mptcp_for_each_sk(meta_tp->mpcb, sk_it) { |
958 |
int ret; |
959 |
|
960 |
if (!mptcp_sk_can_send_ack(sk_it)) |
961 |
continue; |
962 |
|
963 |
ret = tcp_xmit_probe_skb(sk_it, 0); |
964 |
if (unlikely(ret > 0)) |
965 |
ans = ret; |
966 |
} |
967 |
return ans; |
968 |
} |
969 |
} |
970 |
|
971 |
static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb) |
972 |
{ |
973 |
struct sk_buff *skb_it; |
974 |
|
975 |
skb_it = tcp_write_queue_head(meta_sk); |
976 |
|
977 |
tcp_for_write_queue_from(skb_it, meta_sk) { |
978 |
if (skb_it == tcp_send_head(meta_sk)) |
979 |
break; |
980 |
|
981 |
if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) { |
982 |
TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask; |
983 |
break; |
984 |
} |
985 |
} |
986 |
} |
987 |
|
988 |
static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) |
989 |
{ |
990 |
struct sock *meta_sk; |
991 |
struct tcp_sock *tp = tcp_sk(sk), *tp_it; |
992 |
struct sk_buff *skb_head; |
993 |
|
994 |
if (tp->mpcb->cnt_subflows == 1) |
995 |
return NULL; |
996 |
|
997 |
meta_sk = mptcp_meta_sk(sk); |
998 |
skb_head = tcp_write_queue_head(meta_sk); |
999 |
|
1000 |
if (!skb_head || skb_head == tcp_send_head(meta_sk)) |
1001 |
return NULL; |
1002 |
|
1003 |
/* If penalization is optional (coming from mptcp_next_segment()) and |
1004 |
* we are not send-buffer-limited, we do not penalize. The retransmission |
1005 |
* is just an optimization to fix the idle-time due to the delay before |
1006 |
* we wake up the application. |
1007 |
*/ |
1008 |
if (!penal && sk_stream_memory_free(meta_sk)) |
1009 |
goto retrans; |
1010 |
|
1011 |
/* Half the cwnd of the slow flow */ |
1012 |
mptcp_for_each_tp(tp->mpcb, tp_it) { |
1013 |
if (tp_it != tp && |
1014 |
TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { |
1015 |
/* Only update once per rtt of that subflow */ |
1016 |
if (tcp_time_stamp - tp_it->mptcp->last_rbuf_opti < tp_it->srtt >> 3) |
1017 |
break; |
1018 |
|
1019 |
if (tp->srtt < tp_it->srtt && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { |
1020 |
tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); |
1021 |
if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH) |
1022 |
tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); |
1023 |
|
1024 |
tp_it->mptcp->last_rbuf_opti = tcp_time_stamp; |
1025 |
} |
1026 |
break; |
1027 |
} |
1028 |
} |
1029 |
|
1030 |
retrans: |
1031 |
|
1032 |
/* Segment not yet injected into this path? Take it!!! */ |
1033 |
if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { |
1034 |
int do_retrans = 0; |
1035 |
mptcp_for_each_tp(tp->mpcb, tp_it) { |
1036 |
if (tp_it != tp && |
1037 |
TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { |
1038 |
if (tp_it->snd_cwnd <= 4) { |
1039 |
do_retrans = 1; |
1040 |
break; |
1041 |
} |
1042 |
|
1043 |
if (4 * tp->srtt >= tp_it->srtt) { |
1044 |
do_retrans = 0; |
1045 |
break; |
1046 |
} else { |
1047 |
do_retrans = 1; |
1048 |
} |
1049 |
} |
1050 |
} |
1051 |
|
1052 |
if (do_retrans) |
1053 |
return skb_head; |
1054 |
} |
1055 |
return NULL; |
1056 |
} |
1057 |
|
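/* Worked example (illustrative): assume this subflow has srtt = 10 ms and
 * the flow that originally carried the head-of-queue segment has
 * srtt = 60 ms with snd_cwnd = 10.  Since 4 * 10 ms < 60 ms the slow path
 * is considered harmful for the receive-buffer, do_retrans is set and the
 * head segment is handed to this faster subflow; if additionally the
 * caller asked for penalization, the slow flow's cwnd and ssthresh are
 * halved (at most roughly once per rtt, see last_rbuf_opti).  With
 * srtt = 20 ms instead, 4 * 20 ms >= 60 ms and no opportunistic
 * retransmission happens.
 */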
1058 |
int mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle, |
1059 |
int push_one, gfp_t gfp) |
1060 |
{ |
1061 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp; |
1062 |
struct sock *subsk; |
1063 |
struct mptcp_cb *mpcb = meta_tp->mpcb; |
1064 |
struct sk_buff *skb; |
1065 |
unsigned int tso_segs, sent_pkts; |
1066 |
int cwnd_quota; |
1067 |
int result; |
1068 |
int reinject = 0; |
1069 |
|
1070 |
sent_pkts = 0; |
1071 |
|
1072 |
/* Currently mtu-probing is not done in MPTCP */ |
1073 |
if (!push_one && 0) { |
1074 |
/* Do MTU probing. */ |
1075 |
result = tcp_mtu_probe(meta_sk); |
1076 |
if (!result) |
1077 |
return 0; |
1078 |
else if (result > 0) |
1079 |
sent_pkts = 1; |
1080 |
} |
1081 |
|
1082 |
while ((skb = mptcp_next_segment(meta_sk, &reinject))) { |
1083 |
unsigned int limit; |
1084 |
struct sk_buff *subskb = NULL; |
1085 |
u32 noneligible = mpcb->noneligible; |
1086 |
|
1087 |
if (reinject == 1) { |
1088 |
if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { |
1089 |
/* Segment already reached the peer, take the next one */ |
1090 |
__skb_unlink(skb, &mpcb->reinject_queue); |
1091 |
__kfree_skb(skb); |
1092 |
continue; |
1093 |
} |
1094 |
|
1095 |
/* Reinjection and it is coming from a subflow? We need |
1096 |
* to find out the path-mask from the meta-write-queue |
1097 |
* to properly select a subflow. |
1098 |
*/ |
1099 |
if (!TCP_SKB_CB(skb)->path_mask) |
1100 |
mptcp_find_and_set_pathmask(meta_sk, skb); |
1101 |
} |
1102 |
|
1103 |
subflow: |
1104 |
subsk = get_available_subflow(meta_sk, skb, &mss_now); |
1105 |
if (!subsk) |
1106 |
break; |
1107 |
subtp = tcp_sk(subsk); |
1108 |
|
1109 |
/* Since all subsocks are locked before calling the scheduler, |
1110 |
* the tcp_send_head should not change. |
1111 |
*/ |
1112 |
BUG_ON(!reinject && tcp_send_head(meta_sk) != skb); |
1113 |
retry: |
1114 |
/* If the segment was cloned (e.g. a meta retransmission), |
1115 |
* the header must be expanded/copied so that there is no |
1116 |
* corruption of TSO information. |
1117 |
*/ |
1118 |
if (skb_cloned(skb) && skb_is_nonlinear(skb) && |
1119 |
unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) |
1120 |
break; |
1121 |
|
1122 |
tcp_set_skb_tso_segs(meta_sk, skb, mss_now); |
1123 |
tso_segs = tcp_skb_pcount(skb); |
1124 |
BUG_ON(!tso_segs); |
1125 |
|
1126 |
cwnd_quota = tcp_cwnd_test(subtp, skb); |
1127 |
if (!cwnd_quota) { |
1128 |
/* May happen, if at the first selection we circumvented |
1129 |
* the test due to a DATA_FIN (and got rejected at |
1130 |
* tcp_snd_wnd_test), but the reinjected segment is not |
1131 |
* a DATA_FIN. |
1132 |
*/ |
1133 |
BUG_ON(reinject != -1); |
1134 |
break; |
1135 |
} |
1136 |
|
1137 |
if (!reinject && unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) { |
1138 |
skb = mptcp_rcv_buf_optimization(subsk, 1); |
1139 |
if (skb) { |
1140 |
reinject = -1; |
1141 |
goto retry; |
1142 |
} |
1143 |
break; |
1144 |
} |
1145 |
|
1146 |
if (tso_segs == 1) { |
1147 |
if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now, |
1148 |
(tcp_skb_is_last(meta_sk, skb) ? |
1149 |
nonagle : TCP_NAGLE_PUSH)))) |
1150 |
break; |
1151 |
} else { |
1152 |
/* Do not try to defer the transmission of a reinjected |
1153 |
* segment. Send it directly. |
1154 |
* If it is not possible to send the TSO segment on the |
1155 |
* best subflow right now, try to look for another subflow. |
1156 |
* If there is no subflow available defer the segment to avoid |
1157 |
* the call to mptso_fragment. |
1158 |
*/ |
1159 |
if (!push_one && !reinject && tcp_tso_should_defer(subsk, skb)) { |
1160 |
mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); |
1161 |
goto subflow; |
1162 |
} |
1163 |
} |
1164 |
|
1165 |
/* TSQ : sk_wmem_alloc accounts skb truesize, |
1166 |
* including skb overhead. But that's OK. |
1167 |
*/ |
1168 |
if (atomic_read(&subsk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { |
1169 |
set_bit(TSQ_THROTTLED, &subtp->tsq_flags); |
1170 |
mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); |
1171 |
continue; |
1172 |
} |
1173 |
|
1174 |
limit = mss_now; |
1175 |
if (tso_segs > 1 && !tcp_urg_mode(meta_tp)) |
1176 |
limit = tcp_mss_split_point(subsk, skb, mss_now, |
1177 |
min_t(unsigned int, |
1178 |
cwnd_quota, |
1179 |
subsk->sk_gso_max_segs)); |
1180 |
|
1181 |
if (skb->len > limit && |
1182 |
unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, gfp, reinject))) |
1183 |
break; |
1184 |
|
1185 |
subskb = mptcp_skb_entail(subsk, skb, reinject); |
1186 |
if (!subskb) |
1187 |
break; |
1188 |
|
1189 |
mpcb->noneligible = noneligible; |
1190 |
TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1191 |
TCP_SKB_CB(subskb)->when = tcp_time_stamp; |
1192 |
if (unlikely(tcp_transmit_skb(subsk, subskb, 1, gfp))) { |
1193 |
mptcp_transmit_skb_failed(subsk, skb, subskb, reinject); |
1194 |
mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); |
1195 |
continue; |
1196 |
} |
1197 |
|
1198 |
if (!reinject) { |
1199 |
mptcp_check_sndseq_wrap(meta_tp, |
1200 |
TCP_SKB_CB(skb)->end_seq - |
1201 |
TCP_SKB_CB(skb)->seq); |
1202 |
tcp_event_new_data_sent(meta_sk, skb); |
1203 |
} |
1204 |
|
1205 |
tcp_minshall_update(meta_tp, mss_now, skb); |
1206 |
sent_pkts += tcp_skb_pcount(skb); |
1207 |
tcp_sk(subsk)->mptcp->sent_pkts += tcp_skb_pcount(skb); |
1208 |
|
1209 |
mptcp_sub_event_new_data_sent(subsk, subskb, skb); |
1210 |
|
1211 |
if (reinject > 0) { |
1212 |
__skb_unlink(skb, &mpcb->reinject_queue); |
1213 |
kfree_skb(skb); |
1214 |
} |
1215 |
|
1216 |
if (push_one) |
1217 |
break; |
1218 |
} |
1219 |
|
1220 |
mpcb->noneligible = 0; |
1221 |
|
1222 |
if (likely(sent_pkts)) { |
1223 |
mptcp_for_each_sk(mpcb, subsk) { |
1224 |
subtp = tcp_sk(subsk); |
1225 |
if (subtp->mptcp->sent_pkts) { |
1226 |
if (tcp_in_cwnd_reduction(subsk)) |
1227 |
subtp->prr_out += subtp->mptcp->sent_pkts; |
1228 |
tcp_cwnd_validate(subsk); |
1229 |
subtp->mptcp->sent_pkts = 0; |
1230 |
} |
1231 |
} |
1232 |
return 0; |
1233 |
} |
1234 |
|
1235 |
return !meta_tp->packets_out && tcp_send_head(meta_sk); |
1236 |
} |
1237 |
|
1238 |
void mptcp_write_space(struct sock *sk) |
1239 |
{ |
1240 |
mptcp_push_pending_frames(mptcp_meta_sk(sk)); |
1241 |
} |
1242 |
|
1243 |
u32 __mptcp_select_window(struct sock *sk) |
1244 |
{ |
1245 |
struct inet_connection_sock *icsk = inet_csk(sk); |
1246 |
struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); |
1247 |
int mss, free_space, full_space, window; |
1248 |
|
1249 |
/* MSS for the peer's data. Previous versions used mss_clamp |
1250 |
* here. I don't know if the value based on our guesses |
1251 |
* of peer's MSS is better for the performance. It's more correct |
1252 |
* but may be worse for the performance because of rcv_mss |
1253 |
* fluctuations. --SAW 1998/11/1 |
1254 |
*/ |
1255 |
mss = icsk->icsk_ack.rcv_mss; |
1256 |
free_space = tcp_space(sk); |
1257 |
full_space = min_t(int, meta_tp->window_clamp, |
1258 |
tcp_full_space(sk)); |
1259 |
|
1260 |
if (mss > full_space) |
1261 |
mss = full_space; |
1262 |
|
1263 |
if (free_space < (full_space >> 1)) { |
1264 |
icsk->icsk_ack.quick = 0; |
1265 |
|
1266 |
if (tcp_memory_pressure) |
1267 |
/* TODO this has to be adapted when we support different |
1268 |
* MSS's among the subflows. |
1269 |
*/ |
1270 |
meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh, |
1271 |
4U * meta_tp->advmss); |
1272 |
|
1273 |
if (free_space < mss) |
1274 |
return 0; |
1275 |
} |
1276 |
|
1277 |
if (free_space > meta_tp->rcv_ssthresh) |
1278 |
free_space = meta_tp->rcv_ssthresh; |
1279 |
|
1280 |
/* Don't do rounding if we are using window scaling, since the |
1281 |
* scaled window will not line up with the MSS boundary anyway. |
1282 |
*/ |
1283 |
window = meta_tp->rcv_wnd; |
1284 |
if (tp->rx_opt.rcv_wscale) { |
1285 |
window = free_space; |
1286 |
|
1287 |
/* Advertise enough space so that it won't get scaled away. |
1288 |
* Important case: prevent zero window announcement if |
1289 |
* 1<<rcv_wscale > mss. |
1290 |
*/ |
1291 |
if (((window >> tp->rx_opt.rcv_wscale) << tp-> |
1292 |
rx_opt.rcv_wscale) != window) |
1293 |
window = (((window >> tp->rx_opt.rcv_wscale) + 1) |
1294 |
<< tp->rx_opt.rcv_wscale); |
1295 |
} else { |
1296 |
/* Get the largest window that is a nice multiple of mss. |
1297 |
* Window clamp already applied above. |
1298 |
* If our current window offering is within 1 mss of the |
1299 |
* free space we just keep it. This prevents the divide |
1300 |
* and multiply from happening most of the time. |
1301 |
* We also don't do any window rounding when the free space |
1302 |
* is too small. |
1303 |
*/ |
1304 |
if (window <= free_space - mss || window > free_space) |
1305 |
window = (free_space / mss) * mss; |
1306 |
else if (mss == full_space && |
1307 |
free_space > window + (full_space >> 1)) |
1308 |
window = free_space; |
1309 |
} |
1310 |
|
1311 |
return window; |
1312 |
} |
1313 |
|
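/* Rounding example (illustrative): with window scaling active and
 * rcv_wscale = 7, a free_space of 10000 bytes cannot be advertised
 * exactly: (10000 >> 7) << 7 = 9984 != 10000, so the window is rounded
 * up to ((10000 >> 7) + 1) << 7 = 10112 to avoid having the scaled
 * announcement silently shrink (or hit zero when 1 << rcv_wscale > mss).
 * Without scaling, a free_space of 10000 with mss = 1460 is instead
 * rounded down to a multiple of the MSS, (10000 / 1460) * 1460 = 8760,
 * unless the currently offered window is already within one MSS of it.
 */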
1314 |
static void mptcp_set_nonce(struct sock *sk) |
1315 |
{ |
1316 |
struct tcp_sock *tp = tcp_sk(sk); |
1317 |
struct inet_sock *inet = inet_sk(sk); |
1318 |
|
1319 |
if (sk->sk_family == AF_INET) |
1320 |
tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(inet->inet_saddr, |
1321 |
inet->inet_daddr, |
1322 |
inet->inet_sport, |
1323 |
inet->inet_dport, |
1324 |
tp->write_seq); |
1325 |
#if IS_ENABLED(CONFIG_IPV6) |
1326 |
else |
1327 |
tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(inet6_sk(sk)->saddr.s6_addr32, |
1328 |
inet6_sk(sk)->daddr.s6_addr32, |
1329 |
inet->inet_sport, |
1330 |
inet->inet_dport, |
1331 |
tp->write_seq); |
1332 |
#endif |
1333 |
|
1334 |
tp->mptcp->nonce_set = 1; |
1335 |
} |
1336 |
|
1337 |
void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts, |
1338 |
unsigned *remaining) |
1339 |
{ |
1340 |
struct tcp_sock *tp = tcp_sk(sk); |
1341 |
|
1342 |
opts->options |= OPTION_MPTCP; |
1343 |
if (is_master_tp(tp)) { |
1344 |
opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN; |
1345 |
*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; |
1346 |
opts->mp_capable.sender_key = tp->mptcp_loc_key; |
1347 |
opts->dss_csum = sysctl_mptcp_checksum; |
1348 |
|
1349 |
/* We arrive here either when sending a SYN or a |
1350 |
* SYN+ACK when in SYN_SENT state (that is, tcp_synack_options |
1351 |
* is only called for syn+ack replied by a server, while this |
1352 |
* function is called when SYNs are sent by both parties and |
1353 |
* are crossed) |
1354 |
* Due to this possibility, a slave subsocket may arrive here, |
1355 |
* and does not need to set the dataseq options, since |
1356 |
* there is no data in the segment |
1357 |
*/ |
1358 |
} else { |
1359 |
struct mptcp_cb *mpcb = tp->mpcb; |
1360 |
|
1361 |
opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN; |
1362 |
*remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN; |
1363 |
opts->mp_join_syns.token = mpcb->mptcp_rem_token; |
1364 |
opts->addr_id = mptcp_get_loc_addrid(mpcb, sk); |
1365 |
|
1366 |
if (!tp->mptcp->nonce_set) |
1367 |
mptcp_set_nonce(sk); |
1368 |
|
1369 |
opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce; |
1370 |
} |
1371 |
} |
1372 |
|
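/* Illustrative sizes (per the MPTCP draft/RFC 6824, assuming the usual
 * constants): an MP_CAPABLE SYN consumes 12 bytes of option space
 * (kind, len, subtype/version/flags and the 64-bit sender key), and an
 * MP_JOIN SYN likewise 12 bytes (kind, len, subtype/backup-bit, addr_id,
 * 32-bit receiver token, 32-bit sender nonce) - which is what the two
 * MPTCP_SUB_LEN_*_SYN_ALIGN adjustments of *remaining above account for.
 */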
1373 |
void mptcp_synack_options(struct request_sock *req, |
1374 |
struct tcp_out_options *opts, unsigned *remaining) |
1375 |
{ |
1376 |
struct mptcp_request_sock *mtreq; |
1377 |
mtreq = mptcp_rsk(req); |
1378 |
|
1379 |
opts->options |= OPTION_MPTCP; |
1380 |
/* MPCB not yet set - thus it's a new MPTCP-session */ |
1381 |
if (!mtreq->mpcb) { |
1382 |
opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK; |
1383 |
*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; |
1384 |
opts->mp_capable.sender_key = mtreq->mptcp_loc_key; |
1385 |
opts->dss_csum = sysctl_mptcp_checksum || mtreq->dss_csum; |
1386 |
} else { |
1387 |
struct inet_request_sock *ireq = inet_rsk(req); |
1388 |
int i; |
1389 |
|
1390 |
opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK; |
1391 |
opts->mp_join_syns.sender_truncated_mac = |
1392 |
mtreq->mptcp_hash_tmac; |
1393 |
opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce; |
1394 |
opts->addr_id = 0; |
1395 |
|
1396 |
/* Finding Address ID */ |
1397 |
if (req->rsk_ops->family == AF_INET) |
1398 |
mptcp_for_each_bit_set(mtreq->mpcb->loc4_bits, i) { |
1399 |
struct mptcp_loc4 *addr = |
1400 |
&mtreq->mpcb->locaddr4[i]; |
1401 |
if (addr->addr.s_addr == ireq->loc_addr) |
1402 |
opts->addr_id = addr->id; |
1403 |
} |
1404 |
#if IS_ENABLED(CONFIG_IPV6) |
1405 |
else /* IPv6 */ |
1406 |
mptcp_for_each_bit_set(mtreq->mpcb->loc6_bits, i) { |
1407 |
struct mptcp_loc6 *addr = |
1408 |
&mtreq->mpcb->locaddr6[i]; |
1409 |
if (ipv6_addr_equal(&addr->addr, |
1410 |
&inet6_rsk(req)->loc_addr)) |
1411 |
opts->addr_id = addr->id; |
1412 |
} |
1413 |
#endif /* CONFIG_IPV6 */ |
1414 |
*remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN; |
1415 |
} |
1416 |
} |
1417 |
|
1418 |
void mptcp_established_options(struct sock *sk, struct sk_buff *skb, |
1419 |
struct tcp_out_options *opts, unsigned *size) |
1420 |
{ |
1421 |
struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); |
1422 |
struct mptcp_cb *mpcb = tp->mpcb; |
1423 |
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; |
1424 |
|
1425 |
/* In fallback mp_fail-mode, we have to repeat it until the fallback |
1426 |
* has been done by the sender |
1427 |
*/ |
1428 |
if (unlikely(tp->mptcp->send_mp_fail)) { |
1429 |
opts->options |= OPTION_MPTCP; |
1430 |
opts->mptcp_options |= OPTION_MP_FAIL; |
1431 |
opts->data_ack = (__u32)(mpcb->csum_cutoff_seq >> 32); |
1432 |
opts->data_seq = (__u32)mpcb->csum_cutoff_seq; |
1433 |
*size += MPTCP_SUB_LEN_FAIL; |
1434 |
return; |
1435 |
} |
1436 |
|
1437 |
if (unlikely(tp->send_mp_fclose)) { |
1438 |
opts->options |= OPTION_MPTCP; |
1439 |
opts->mptcp_options |= OPTION_MP_FCLOSE; |
1440 |
opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; |
1441 |
*size += MPTCP_SUB_LEN_FCLOSE_ALIGN; |
1442 |
return; |
1443 |
} |
1444 |
|
1445 |
/* 1. If we are the sender of the infinite-mapping, we need the |
1446 |
* MPTCPHDR_INF-flag, because a retransmission of the |
1447 |
* infinite-announcement still needs the mptcp-option. |
1448 |
* |
1449 |
* We need infinite_cutoff_seq, because retransmissions from before |
1450 |
* the infinite-cutoff-moment still need the MPTCP-signalling to stay |
1451 |
* consistent. |
1452 |
* |
1453 |
* 2. If we are the receiver of the infinite-mapping, we always skip |
1454 |
* mptcp-options, because acknowledgments from before the |
1455 |
* infinite-mapping point have already been sent out. |
1456 |
* |
1457 |
* I know, the whole infinite-mapping stuff is ugly... |
1458 |
* |
1459 |
* TODO: Handle wrapped data-sequence numbers |
1460 |
* (even if it's very unlikely) |
1461 |
*/ |
1462 |
if (unlikely(mpcb->infinite_mapping_snd) && |
1463 |
tp->mptcp->fully_established && |
1464 |
((mpcb->send_infinite_mapping && tcb && |
1465 |
!(tcb->mptcp_flags & MPTCPHDR_INF) && |
1466 |
!before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) || |
1467 |
!mpcb->send_infinite_mapping)) |
1468 |
return; |
1469 |
|
1470 |
if (unlikely(tp->mptcp->include_mpc)) { |
1471 |
opts->options |= OPTION_MPTCP; |
1472 |
opts->mptcp_options |= OPTION_MP_CAPABLE | |
1473 |
OPTION_TYPE_ACK; |
1474 |
*size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN; |
1475 |
opts->mp_capable.sender_key = mpcb->mptcp_loc_key; |
1476 |
opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; |
1477 |
opts->dss_csum = mpcb->dss_csum; |
1478 |
|
1479 |
if (skb) |
1480 |
tp->mptcp->include_mpc = 0; |
1481 |
} |
1482 |
if (unlikely(tp->mptcp->pre_established)) { |
1483 |
opts->options |= OPTION_MPTCP; |
1484 |
opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK; |
1485 |
*size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN; |
1486 |
} |
1487 |
|
1488 |
if (!tp->mptcp_add_addr_ack && !tp->mptcp->include_mpc && |
1489 |
!tp->mptcp->pre_established) { |
1490 |
opts->options |= OPTION_MPTCP; |
1491 |
opts->mptcp_options |= OPTION_DATA_ACK; |
1492 |
/* If !skb, we come from tcp_current_mss and thus we always |
1493 |
* assume that the DSS-option will be set for the data-packet. |
1494 |
*/ |
1495 |
if (skb && !mptcp_is_data_seq(skb)) { |
1496 |
opts->data_ack = meta_tp->rcv_nxt; |
1497 |
|
1498 |
*size += MPTCP_SUB_LEN_ACK_ALIGN; |
1499 |
} else { |
1500 |
opts->data_ack = meta_tp->rcv_nxt; |
1501 |
|
1502 |
/* Doesn't matter whether the csum is included or not. It will be |
1503 |
* either 10 or 12, and thus aligned = 12 |
1504 |
*/ |
1505 |
*size += MPTCP_SUB_LEN_ACK_ALIGN + |
1506 |
MPTCP_SUB_LEN_SEQ_ALIGN; |
1507 |
} |
1508 |
|
1509 |
*size += MPTCP_SUB_LEN_DSS_ALIGN; |
1510 |
} |
1511 |
|
1512 |
if (unlikely(tp->mptcp->add_addr4) && |
1513 |
MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) { |
1514 |
int ind = mptcp_find_free_index(~(tp->mptcp->add_addr4)); |
1515 |
opts->options |= OPTION_MPTCP; |
1516 |
opts->mptcp_options |= OPTION_ADD_ADDR; |
1517 |
opts->addr4 = &mpcb->locaddr4[ind]; |
1518 |
if (skb) |
1519 |
tp->mptcp->add_addr4 &= ~(1 << ind); |
1520 |
*size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; |
1521 |
} else if (unlikely(tp->mptcp->add_addr6) && |
1522 |
MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) { |
1523 |
int ind = mptcp_find_free_index(~(tp->mptcp->add_addr6)); |
1524 |
opts->options |= OPTION_MPTCP; |
1525 |
opts->mptcp_options |= OPTION_ADD_ADDR; |
1526 |
opts->addr6 = &mpcb->locaddr6[ind]; |
1527 |
if (skb) |
1528 |
tp->mptcp->add_addr6 &= ~(1 << ind); |
1529 |
*size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; |
1530 |
} else if (unlikely(mpcb->remove_addrs) && |
1531 |
MAX_TCP_OPTION_SPACE - *size >= |
1532 |
mptcp_sub_len_remove_addr_align(mpcb->remove_addrs)) { |
1533 |
opts->options |= OPTION_MPTCP; |
1534 |
opts->mptcp_options |= OPTION_REMOVE_ADDR; |
1535 |
opts->remove_addrs = mpcb->remove_addrs; |
1536 |
*size += mptcp_sub_len_remove_addr_align(opts->remove_addrs); |
1537 |
if (skb) |
1538 |
mpcb->remove_addrs = 0; |
1539 |
} else if (!(opts->mptcp_options & OPTION_MP_CAPABLE) && |
1540 |
!(opts->mptcp_options & OPTION_MP_JOIN) && |
1541 |
((unlikely(tp->mptcp->add_addr6) && |
1542 |
MAX_TCP_OPTION_SPACE - *size <= |
1543 |
MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) || |
1544 |
(unlikely(tp->mptcp->add_addr4) && |
1545 |
MAX_TCP_OPTION_SPACE - *size >= |
1546 |
MPTCP_SUB_LEN_ADD_ADDR4_ALIGN))) { |
1547 |
mptcp_debug("no space for add addr. unsent IPv4: %#x,IPv6: %#x\n", |
1548 |
tp->mptcp->add_addr4, tp->mptcp->add_addr6); |
1549 |
tp->mptcp_add_addr_ack = 1; |
1550 |
tcp_send_ack(sk); |
1551 |
tp->mptcp_add_addr_ack = 0; |
1552 |
} |
1553 |
|
1554 |
if (unlikely(tp->mptcp->send_mp_prio) && |
1555 |
MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) { |
1556 |
opts->options |= OPTION_MPTCP; |
1557 |
opts->mptcp_options |= OPTION_MP_PRIO; |
1558 |
if (skb) |
1559 |
tp->mptcp->send_mp_prio = 0; |
1560 |
*size += MPTCP_SUB_LEN_PRIO_ALIGN; |
1561 |
} |
1562 |
|
1563 |
return; |
1564 |
} |
1565 |
|
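/* Size accounting example (illustrative, assuming the usual option
 * lengths): a pure DATA_ACK adds MPTCP_SUB_LEN_DSS_ALIGN (4) +
 * MPTCP_SUB_LEN_ACK_ALIGN (4) = 8 bytes to the TCP option space, while a
 * data segment that also carries a mapping adds a further
 * MPTCP_SUB_LEN_SEQ_ALIGN (12) bytes, i.e. 20 bytes in total - the
 * "either 10 or 12, and thus aligned = 12" remark above refers to the
 * mapping part with and without the DSS checksum.
 */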
1566 |
void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
1567 |
struct tcp_out_options *opts, |
1568 |
struct sk_buff *skb) |
1569 |
{ |
1570 |
if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) { |
1571 |
struct mp_capable *mpc = (struct mp_capable *)ptr; |
1572 |
|
1573 |
mpc->kind = TCPOPT_MPTCP; |
1574 |
|
1575 |
if ((OPTION_TYPE_SYN & opts->mptcp_options) || |
1576 |
(OPTION_TYPE_SYNACK & opts->mptcp_options)) { |
1577 |
mpc->sender_key = opts->mp_capable.sender_key; |
1578 |
mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; |
1579 |
ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; |
1580 |
} else if (OPTION_TYPE_ACK & opts->mptcp_options) { |
1581 |
mpc->sender_key = opts->mp_capable.sender_key; |
1582 |
mpc->receiver_key = opts->mp_capable.receiver_key; |
1583 |
mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK; |
1584 |
ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2; |
1585 |
} |
1586 |
|
1587 |
mpc->sub = MPTCP_SUB_CAPABLE; |
1588 |
mpc->ver = 0; |
1589 |
mpc->a = opts->dss_csum ? 1 : 0; |
1590 |
mpc->b = 0; |
1591 |
mpc->rsv = 0; |
1592 |
mpc->h = 1; |
1593 |
} |
1594 |
|
1595 |
if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) { |
1596 |
struct mp_join *mpj = (struct mp_join *)ptr; |
1597 |
|
1598 |
mpj->kind = TCPOPT_MPTCP; |
1599 |
mpj->sub = MPTCP_SUB_JOIN; |
1600 |
mpj->rsv = 0; |
1601 |
mpj->addr_id = opts->addr_id; |
1602 |
|
1603 |
if (OPTION_TYPE_SYN & opts->mptcp_options) { |
1604 |
mpj->len = MPTCP_SUB_LEN_JOIN_SYN; |
1605 |
mpj->u.syn.token = opts->mp_join_syns.token; |
1606 |
mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce; |
1607 |
mpj->b = tp->mptcp->low_prio; |
1608 |
ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2; |
1609 |
} else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { |
1610 |
mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK; |
1611 |
mpj->u.synack.mac = |
1612 |
opts->mp_join_syns.sender_truncated_mac; |
1613 |
mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce; |
1614 |
mpj->b = tp->mptcp->low_prio; |
1615 |
ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2; |
1616 |
} else if (OPTION_TYPE_ACK & opts->mptcp_options) { |
1617 |
mpj->len = MPTCP_SUB_LEN_JOIN_ACK; |
1618 |
memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20); |
1619 |
ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2; |
1620 |
} |
1621 |
} |
1622 |
if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) { |
1623 |
struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; |
1624 |
|
1625 |
mpadd->kind = TCPOPT_MPTCP; |
1626 |
if (opts->addr4) { |
1627 |
mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4; |
1628 |
mpadd->sub = MPTCP_SUB_ADD_ADDR; |
1629 |
mpadd->ipver = 4; |
1630 |
mpadd->addr_id = opts->addr4->id; |
1631 |
mpadd->u.v4.addr = opts->addr4->addr; |
1632 |
ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2; |
1633 |
} else if (opts->addr6) { |
1634 |
mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6; |
1635 |
mpadd->sub = MPTCP_SUB_ADD_ADDR; |
1636 |
mpadd->ipver = 6; |
1637 |
mpadd->addr_id = opts->addr6->id; |
1638 |
memcpy(&mpadd->u.v6.addr, &opts->addr6->addr, |
1639 |
sizeof(mpadd->u.v6.addr)); |
1640 |
ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2; |
1641 |
} else { |
1642 |
BUG(); |
1643 |
} |
1644 |
} |
1645 |
if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) { |
1646 |
struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; |
1647 |
u8 *addrs_id; |
1648 |
int id, len, len_align; |
1649 |
|
1650 |
len = mptcp_sub_len_remove_addr(opts->remove_addrs); |
1651 |
len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs); |
1652 |
|
1653 |
mprem->kind = TCPOPT_MPTCP; |
1654 |
mprem->len = len; |
1655 |
mprem->sub = MPTCP_SUB_REMOVE_ADDR; |
1656 |
mprem->rsv = 0; |
1657 |
addrs_id = &mprem->addrs_id; |
1658 |
|
1659 |
mptcp_for_each_bit_set(opts->remove_addrs, id) |
1660 |
*(addrs_id++) = id; |
1661 |
|
1662 |
/* Fill the rest with NOP's */ |
1663 |
if (len_align > len) { |
1664 |
int i; |
1665 |
for (i = 0; i < len_align - len; i++) |
1666 |
*(addrs_id++) = TCPOPT_NOP; |
1667 |
} |
1668 |
|
1669 |
ptr += len_align >> 2; |
1670 |
} |
1671 |
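/* MP_FAIL: the 64-bit data sequence number is built from the 32-bit |
 * data_ack (upper half) and data_seq (lower half) prepared by the |
 * option-selection code. |
 */ |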
if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) { |
1672 |
struct mp_fail *mpfail = (struct mp_fail *)ptr; |
1673 |
|
1674 |
mpfail->kind = TCPOPT_MPTCP; |
1675 |
mpfail->len = MPTCP_SUB_LEN_FAIL; |
1676 |
mpfail->sub = MPTCP_SUB_FAIL; |
1677 |
mpfail->rsv1 = 0; |
1678 |
mpfail->rsv2 = 0; |
1679 |
mpfail->data_seq = htonll(((u64)opts->data_ack << 32) | opts->data_seq); |
1680 |
|
1681 |
ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2; |
1682 |
} |
1683 |
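/* MP_FASTCLOSE: echo the peer's key to abruptly close the whole |
 * MPTCP connection. |
 */ |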
if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) { |
1684 |
struct mp_fclose *mpfclose = (struct mp_fclose *)ptr; |
1685 |
|
1686 |
mpfclose->kind = TCPOPT_MPTCP; |
1687 |
mpfclose->len = MPTCP_SUB_LEN_FCLOSE; |
1688 |
mpfclose->sub = MPTCP_SUB_FCLOSE; |
1689 |
mpfclose->rsv1 = 0; |
1690 |
mpfclose->rsv2 = 0; |
1691 |
mpfclose->key = opts->mp_capable.receiver_key; |
1692 |
|
1693 |
ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2; |
1694 |
} |
1695 |
|
1696 |
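/* DATA_ACK: if the skb carries no DSS-mapping, emit a minimal DSS option |
 * holding only the data-level ACK. Otherwise the DSS option has already |
 * been written together with the mapping and only its data_ack field is |
 * updated in place below. |
 */ |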
if (OPTION_DATA_ACK & opts->mptcp_options) { |
1697 |
if (!mptcp_is_data_seq(skb)) { |
1698 |
struct mp_dss *mdss = (struct mp_dss *)ptr; |
1699 |
|
1700 |
mdss->kind = TCPOPT_MPTCP; |
1701 |
mdss->sub = MPTCP_SUB_DSS; |
1702 |
mdss->rsv1 = 0; |
1703 |
mdss->rsv2 = 0; |
1704 |
mdss->F = 0; |
1705 |
mdss->m = 0; |
1706 |
mdss->M = 0; |
1707 |
mdss->a = 0; |
1708 |
mdss->A = 1; |
1709 |
mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); |
1710 |
|
1711 |
ptr++; |
1712 |
*ptr++ = htonl(opts->data_ack); |
1713 |
} else { |
1714 |
/**** Just update the data_ack ****/ |
1715 |
|
1716 |
/* Get pointer to data_ack-field. MPTCP is always at |
1717 |
* the end of the TCP-options. |
1718 |
*/ |
1719 |
/* TODO if we allow sending 64-bit dseq's we have to change "16" */ |
1720 |
__be32 *dack = (__be32 *)(skb->data + (tcp_hdr(skb)->doff << 2) - 16); |
1721 |
|
1722 |
*dack = htonl(opts->data_ack); |
1723 |
} |
1724 |
} |
1725 |
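/* MP_PRIO: announce the current backup-priority of this subflow. No |
 * address-id is sent, so the field is padded with a NOP. |
 */ |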
if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) { |
1726 |
struct mp_prio *mpprio = (struct mp_prio *)ptr; |
1727 |
|
1728 |
mpprio->kind = TCPOPT_MPTCP; |
1729 |
mpprio->len = MPTCP_SUB_LEN_PRIO; |
1730 |
mpprio->sub = MPTCP_SUB_PRIO; |
1731 |
mpprio->rsv = 0; |
1732 |
mpprio->b = tp->mptcp->low_prio; |
1733 |
mpprio->addr_id = TCPOPT_NOP; |
1734 |
|
1735 |
ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2; |
1736 |
} |
1737 |
} |
1738 |
|
1739 |
/* Returns the next segment to be sent from the mptcp meta-queue. |
1740 |
* (chooses the reinject queue if any segment is waiting in it; otherwise |
1741 |
* chooses the normal write queue). |
1742 |
* Sets *@reinject to 1 if the returned segment comes from the |
1743 |
* reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, |
1744 |
* and sets it to -1 if it is a meta-level retransmission to optimize the |
1745 |
* receive-buffer. |
1746 |
*/ |
1747 |
struct sk_buff *mptcp_next_segment(struct sock *meta_sk, int *reinject) |
1748 |
{ |
1749 |
struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
1750 |
struct sk_buff *skb = NULL; |
1751 |
if (reinject) |
1752 |
*reinject = 0; |
1753 |
|
1754 |
/* If we are in fallback-mode, just take from the meta-send-queue */ |
1755 |
if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) |
1756 |
return tcp_send_head(meta_sk); |
1757 |
|
1758 |
skb = skb_peek(&mpcb->reinject_queue); |
1759 |
|
1760 |
if (skb) { |
1761 |
if (reinject) |
1762 |
*reinject = 1; |
1763 |
} else { |
1764 |
skb = tcp_send_head(meta_sk); |
1765 |
|
1766 |
if (!skb && meta_sk->sk_write_pending && |
1767 |
sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { |
1768 |
struct sock *subsk = get_available_subflow(meta_sk, NULL, NULL); |
1769 |
if (!subsk) |
1770 |
return NULL; |
1771 |
|
1772 |
skb = mptcp_rcv_buf_optimization(subsk, 0); |
1773 |
if (skb && reinject) |
1774 |
*reinject = -1; |
1775 |
} |
1776 |
} |
1777 |
return skb; |
1778 |
} |
1779 |
|
1780 |
/* Sends the DATA_FIN */ |
1781 |
void mptcp_send_fin(struct sock *meta_sk) |
1782 |
{ |
1783 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
1784 |
struct sk_buff *skb = tcp_write_queue_tail(meta_sk); |
1785 |
int mss_now; |
1786 |
|
1787 |
if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) |
1788 |
meta_tp->mpcb->passive_close = 1; |
1789 |
|
1790 |
/* Optimization, tack on the FIN if we have a queue of |
1791 |
* unsent frames. But be careful about outgoing SACKS |
1792 |
* and IP options. |
1793 |
*/ |
1794 |
mss_now = mptcp_current_mss(meta_sk); |
1795 |
|
1796 |
if (tcp_send_head(meta_sk) != NULL) { |
1797 |
TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; |
1798 |
TCP_SKB_CB(skb)->end_seq++; |
1799 |
meta_tp->write_seq++; |
1800 |
} else { |
1801 |
/* Socket is locked, keep trying until memory is available. */ |
1802 |
for (;;) { |
1803 |
skb = alloc_skb_fclone(MAX_TCP_HEADER, |
1804 |
meta_sk->sk_allocation); |
1805 |
if (skb) |
1806 |
break; |
1807 |
yield(); |
1808 |
} |
1809 |
/* Reserve space for headers and prepare control bits. */ |
1810 |
skb_reserve(skb, MAX_TCP_HEADER); |
1811 |
|
1812 |
tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); |
1813 |
TCP_SKB_CB(skb)->end_seq++; |
1814 |
TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN | MPTCPHDR_SEQ; |
1815 |
tcp_queue_skb(meta_sk, skb); |
1816 |
} |
1817 |
__tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); |
1818 |
} |
1819 |
|
1820 |
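/* Send an MP_FASTCLOSE on one selected subflow and reset all the other |
 * subflows, tearing down the whole MPTCP connection. |
 */ |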
void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority) |
1821 |
{ |
1822 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
1823 |
struct mptcp_cb *mpcb = meta_tp->mpcb; |
1824 |
struct sock *sk = NULL, *sk_it = NULL, *tmpsk; |
1825 |
|
1826 |
if (!mpcb->cnt_subflows) |
1827 |
return; |
1828 |
|
1829 |
/* First - select a socket */ |
1830 |
|
1831 |
/* Socket already selected? */ |
1832 |
mptcp_for_each_sk(mpcb, sk_it) { |
1833 |
if (tcp_sk(sk_it)->send_mp_fclose) { |
1834 |
sk = sk_it; |
1835 |
goto found; |
1836 |
} |
1837 |
} |
1838 |
|
1839 |
sk = mptcp_select_ack_sock(meta_sk, 0); |
1840 |
/* May happen if no subflow is in an appropriate state */ |
1841 |
if (!sk) |
1842 |
return; |
1843 |
|
1844 |
/* We are in infinite mode - just send a reset */ |
1845 |
if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) { |
1846 |
tcp_send_active_reset(sk, priority); |
1847 |
return; |
1848 |
} |
1849 |
|
1850 |
tcp_sk(sk)->send_mp_fclose = 1; |
1851 |
|
1852 |
/** Reset all other subflows */ |
1853 |
|
1854 |
found: |
1855 |
/* tcp_done must be handled with bh disabled */ |
1856 |
if (!in_serving_softirq()) |
1857 |
local_bh_disable(); |
1858 |
mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { |
1859 |
if (tcp_sk(sk_it)->send_mp_fclose) |
1860 |
continue; |
1861 |
|
1862 |
sk_it->sk_err = ECONNRESET; |
1863 |
if (tcp_need_reset(sk_it->sk_state)) |
1864 |
tcp_send_active_reset(sk_it, GFP_ATOMIC); |
1865 |
mptcp_sub_force_close(sk_it); |
1866 |
} |
1867 |
if (!in_serving_softirq()) |
1868 |
local_bh_enable(); |
1869 |
|
1870 |
tcp_send_ack(sk); |
1871 |
|
1872 |
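/* Arm the meta retransmit-timer so that the MP_FASTCLOSE is retransmitted |
 * if it gets lost (see the send_mp_fclose path in mptcp_retransmit_timer). |
 */ |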
if (!meta_tp->send_mp_fclose) { |
1873 |
struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); |
1874 |
|
1875 |
meta_icsk->icsk_rto = min(inet_csk(sk)->icsk_rto, TCP_RTO_MAX); |
1876 |
inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, |
1877 |
meta_icsk->icsk_rto, TCP_RTO_MAX); |
1878 |
} |
1879 |
|
1880 |
meta_tp->send_mp_fclose = 1; |
1881 |
} |
1882 |
|
1883 |
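/* Retransmit a pure ACK on a subflow that is not yet fully established |
 * (e.g. the third ACK of the MP_JOIN handshake). Resets the subflow once |
 * the write-timeout is exceeded. |
 */ |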
void mptcp_ack_retransmit_timer(struct sock *sk) |
1884 |
{ |
1885 |
struct sk_buff *skb; |
1886 |
struct tcp_sock *tp = tcp_sk(sk); |
1887 |
struct inet_connection_sock *icsk = inet_csk(sk); |
1888 |
|
1889 |
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) |
1890 |
goto out; /* Routing failure or similar */ |
1891 |
|
1892 |
if (!tp->retrans_stamp) |
1893 |
tp->retrans_stamp = tcp_time_stamp ? : 1; |
1894 |
|
1895 |
if (tcp_write_timeout(sk)) { |
1896 |
tp->mptcp->pre_established = 0; |
1897 |
sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); |
1898 |
tcp_send_active_reset(sk, GFP_ATOMIC); |
1899 |
goto out; |
1900 |
} |
1901 |
|
1902 |
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
1903 |
if (skb == NULL) { |
1904 |
sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, |
1905 |
jiffies + icsk->icsk_rto); |
1906 |
return; |
1907 |
} |
1908 |
|
1909 |
/* Reserve space for headers and prepare control bits */ |
1910 |
skb_reserve(skb, MAX_TCP_HEADER); |
1911 |
tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK); |
1912 |
|
1913 |
TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1914 |
if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) { |
1915 |
/* Retransmission failed because of local congestion, |
1916 |
* do not backoff. |
1917 |
*/ |
1918 |
if (!icsk->icsk_retransmits) |
1919 |
icsk->icsk_retransmits = 1; |
1920 |
sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, |
1921 |
jiffies + icsk->icsk_rto); |
1922 |
return; |
1923 |
} |
1924 |
|
1925 |
|
1926 |
icsk->icsk_retransmits++; |
1927 |
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); |
1928 |
sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, |
1929 |
jiffies + icsk->icsk_rto); |
1930 |
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) { |
1931 |
__sk_dst_reset(sk); |
1932 |
} |
1933 |
|
1934 |
out:; |
1935 |
} |
1936 |
|
1937 |
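/* Timer callback for the subflow's ACK-retransmission timer. Defers the |
 * work by rearming the timer if the meta-socket is currently locked by |
 * the user. |
 */ |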
void mptcp_ack_handler(unsigned long data) |
1938 |
{ |
1939 |
struct sock *sk = (struct sock *)data; |
1940 |
struct sock *meta_sk = mptcp_meta_sk(sk); |
1941 |
|
1942 |
bh_lock_sock(meta_sk); |
1943 |
if (sock_owned_by_user(meta_sk)) { |
1944 |
/* Try again later */ |
1945 |
sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer, |
1946 |
jiffies + (HZ / 20)); |
1947 |
goto out_unlock; |
1948 |
} |
1949 |
|
1950 |
if (sk->sk_state == TCP_CLOSE) |
1951 |
goto out_unlock; |
1952 |
|
1953 |
mptcp_ack_retransmit_timer(sk); |
1954 |
|
1955 |
sk_mem_reclaim(sk); |
1956 |
|
1957 |
out_unlock: |
1958 |
bh_unlock_sock(meta_sk); |
1959 |
sock_put(sk); |
1960 |
} |
1961 |
|
1962 |
/* Similar to tcp_retransmit_skb |
1963 |
* |
1964 |
* The diff is that we handle the retransmission-stats (retrans_stamp) at the |
1965 |
* meta-level. |
1966 |
*/ |
1967 |
int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) |
1968 |
{ |
1969 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
1970 |
struct sock *subsk; |
1971 |
struct sk_buff *subskb; |
1972 |
unsigned int limit, tso_segs, mss_now; |
1973 |
int err = -1, oldpcount; |
1974 |
|
1975 |
/* Do not send more than we queued. 1/4 is reserved for possible |
1976 |
* copying overhead: fragmentation, tunneling, mangling etc. |
1977 |
* |
1978 |
* This is a meta-retransmission thus we check on the meta-socket. |
1979 |
*/ |
1980 |
if (atomic_read(&meta_sk->sk_wmem_alloc) > |
1981 |
min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { |
1982 |
return -EAGAIN; |
1983 |
} |
1984 |
|
1985 |
/* We need to make sure that the retransmitted segment can be sent on a |
1986 |
* subflow right now. If it is too big, it needs to be fragmented. |
1987 |
*/ |
1988 |
subsk = get_available_subflow(meta_sk, skb, &mss_now); |
1989 |
if (!subsk) { |
1990 |
/* We want to increase icsk_retransmits, thus return 0, so that |
1991 |
* mptcp_retransmit_timer enters the desired branch. |
1992 |
*/ |
1993 |
err = 0; |
1994 |
goto failed; |
1995 |
} |
1996 |
|
1997 |
/* If the segment was cloned (e.g. a meta retransmission), the header |
1998 |
* must be expanded/copied so that there is no corruption of TSO |
1999 |
* information. |
2000 |
*/ |
2001 |
if (skb_cloned(skb) && skb_is_nonlinear(skb) && |
2002 |
unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) { |
2003 |
err = ENOMEM; |
2004 |
goto failed; |
2005 |
} |
2006 |
|
2007 |
oldpcount = tcp_skb_pcount(skb); |
2008 |
tcp_set_skb_tso_segs(meta_sk, skb, mss_now); |
2009 |
tso_segs = tcp_skb_pcount(skb); |
2010 |
BUG_ON(!tso_segs); |
2011 |
|
2012 |
/* The MSS might have changed and so the number of segments. We |
2013 |
* need to account for this change. |
2014 |
*/ |
2015 |
if (unlikely(oldpcount != tso_segs)) |
2016 |
tcp_adjust_pcount(meta_sk, skb, oldpcount - tso_segs); |
2017 |
|
2018 |
limit = mss_now; |
2019 |
if (tso_segs > 1 && !tcp_urg_mode(meta_tp)) |
2020 |
limit = tcp_mss_split_point(subsk, skb, mss_now, |
2021 |
min_t(unsigned int, |
2022 |
tcp_cwnd_test(tcp_sk(subsk), skb), |
2023 |
subsk->sk_gso_max_segs)); |
2024 |
|
2025 |
if (skb->len > limit && |
2026 |
unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, |
2027 |
GFP_ATOMIC, 0))) |
2028 |
goto failed; |
2029 |
|
2030 |
subskb = mptcp_skb_entail(subsk, skb, -1); |
2031 |
if (!subskb) |
2032 |
goto failed; |
2033 |
|
2034 |
TCP_SKB_CB(skb)->when = tcp_time_stamp; |
2035 |
TCP_SKB_CB(subskb)->when = tcp_time_stamp; |
2036 |
err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC); |
2037 |
if (!err) { |
2038 |
/* Update global TCP statistics. */ |
2039 |
TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS); |
2040 |
|
2041 |
/* Diff to tcp_retransmit_skb */ |
2042 |
|
2043 |
/* Save stamp of the first retransmit. */ |
2044 |
if (!meta_tp->retrans_stamp) |
2045 |
meta_tp->retrans_stamp = TCP_SKB_CB(subskb)->when; |
2046 |
mptcp_sub_event_new_data_sent(subsk, subskb, skb); |
2047 |
} else { |
2048 |
mptcp_transmit_skb_failed(subsk, skb, subskb, 0); |
2049 |
} |
2050 |
|
2051 |
failed: |
2052 |
return err; |
2053 |
} |
2054 |
|
2055 |
/* Similar to tcp_retransmit_timer |
2056 |
* |
2057 |
* The diff is that we have to handle retransmissions of the FAST_CLOSE-message |
2058 |
* and that we don't have an srtt estimation at the meta-level. |
2059 |
*/ |
2060 |
void mptcp_retransmit_timer(struct sock *meta_sk) |
2061 |
{ |
2062 |
struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
2063 |
struct mptcp_cb *mpcb = meta_tp->mpcb; |
2064 |
struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); |
2065 |
int err; |
2066 |
|
2067 |
if (unlikely(meta_tp->send_mp_fclose)) |
2068 |
goto send_mp_fclose; |
2069 |
|
2070 |
/* In fallback, retransmission is handled at the subflow-level */ |
2071 |
if (!meta_tp->packets_out || mpcb->infinite_mapping_snd || |
2072 |
mpcb->send_infinite_mapping) |
2073 |
return; |
2074 |
|
2075 |
WARN_ON(tcp_write_queue_empty(meta_sk)); |
2076 |
|
2077 |
if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) && |
2078 |
!((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { |
2079 |
/* Receiver dastardly shrinks window. Our retransmits |
2080 |
* become zero probes, but we should not timeout this |
2081 |
* connection. If the socket is an orphan, time it out, |
2082 |
* we cannot allow such beasts to hang infinitely. |
2083 |
*/ |
2084 |
struct inet_sock *meta_inet = inet_sk(meta_sk); |
2085 |
if (meta_sk->sk_family == AF_INET) { |
2086 |
LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", |
2087 |
&meta_inet->inet_daddr, |
2088 |
ntohs(meta_inet->inet_dport), |
2089 |
meta_inet->inet_num, meta_tp->snd_una, |
2090 |
meta_tp->snd_nxt); |
2091 |
} |
2092 |
#if IS_ENABLED(CONFIG_IPV6) |
2093 |
else if (meta_sk->sk_family == AF_INET6) { |
2094 |
struct ipv6_pinfo *np = inet6_sk(meta_sk); |
2095 |
LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", |
2096 |
&np->daddr, ntohs(meta_inet->inet_dport), |
2097 |
meta_inet->inet_num, meta_tp->snd_una, |
2098 |
meta_tp->snd_nxt); |
2099 |
} |
2100 |
#endif |
2101 |
if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) { |
2102 |
tcp_write_err(meta_sk); |
2103 |
return; |
2104 |
} |
2105 |
|
2106 |
mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); |
2107 |
goto out_reset_timer; |
2108 |
} |
2109 |
|
2110 |
if (tcp_write_timeout(meta_sk)) |
2111 |
return; |
2112 |
|
2113 |
if (meta_icsk->icsk_retransmits == 0) |
2114 |
NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS); |
2115 |
|
2116 |
meta_icsk->icsk_ca_state = TCP_CA_Loss; |
2117 |
|
2118 |
err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); |
2119 |
if (err > 0) { |
2120 |
/* Retransmission failed because of local congestion, |
2121 |
* do not backoff. |
2122 |
*/ |
2123 |
if (!meta_icsk->icsk_retransmits) |
2124 |
meta_icsk->icsk_retransmits = 1; |
2125 |
inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, |
2126 |
min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), |
2127 |
TCP_RTO_MAX); |
2128 |
return; |
2129 |
} |
2130 |
|
2131 |
out_backoff: |
2132 |
/* Increase the timeout each time we retransmit. Note that |
2133 |
* we do not increase the rtt estimate. rto is initialized |
2134 |
* from rtt, but increases here. Jacobson (SIGCOMM 88) suggests |
2135 |
* that doubling rto each time is the least we can get away with. |
2136 |
* In KA9Q, Karn uses this for the first few times, and then |
2137 |
* goes to quadratic. netBSD doubles, but only goes up to *64, |
2138 |
* and clamps at 1 to 64 sec afterwards. Note that 120 sec is |
2139 |
* defined in the protocol as the maximum possible RTT. I guess |
2140 |
* we'll have to use something other than TCP to talk to the |
2141 |
* University of Mars. |
2142 |
* |
2143 |
* PAWS allows us longer timeouts and large windows, so once |
2144 |
* implemented ftp to mars will work nicely. We will have to fix |
2145 |
* the 120 second clamps though! |
2146 |
*/ |
2147 |
meta_icsk->icsk_backoff++; |
2148 |
meta_icsk->icsk_retransmits++; |
2149 |
|
2150 |
out_reset_timer: |
2151 |
/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is |
2152 |
* used to reset timer, set to 0. Recalculate 'icsk_rto' as this |
2153 |
* might be increased if the stream oscillates between thin and thick, |
2154 |
* thus the old value might already be too high compared to the value |
2155 |
* set by 'tcp_set_rto' in tcp_input.c which resets the rto without |
2156 |
* backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating |
2157 |
* exponential backoff behaviour, to avoid continuing to hammer |
2158 |
* linear-timeout retransmissions into a black hole |
2159 |
*/ |
2160 |
if (meta_sk->sk_state == TCP_ESTABLISHED && |
2161 |
(meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && |
2162 |
tcp_stream_is_thin(meta_tp) && |
2163 |
meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { |
2164 |
meta_icsk->icsk_backoff = 0; |
2165 |
/* We cannot do the same as in tcp_write_timer because the |
2166 |
* srtt is not set here. |
2167 |
*/ |
2168 |
mptcp_set_rto(meta_sk); |
2169 |
} else { |
2170 |
/* Use normal (exponential) backoff */ |
2171 |
meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX); |
2172 |
} |
2173 |
inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX); |
2174 |
|
2175 |
return; |
2176 |
|
2177 |
send_mp_fclose: |
2178 |
/* MUST do this before tcp_write_timeout, because retrans_stamp may have |
2179 |
* been set to 0 in another part while we are retransmitting |
2180 |
* MP_FASTCLOSE. Then, we would crash, because retransmits_timed_out |
2181 |
* accesses the meta-write-queue. |
2182 |
* |
2183 |
* We make sure that the timestamp is != 0. |
2184 |
*/ |
2185 |
if (!meta_tp->retrans_stamp) |
2186 |
meta_tp->retrans_stamp = tcp_time_stamp ? : 1; |
2187 |
|
2188 |
if (tcp_write_timeout(meta_sk)) |
2189 |
return; |
2190 |
|
2191 |
mptcp_send_active_reset(meta_sk, GFP_ATOMIC); |
2192 |
|
2193 |
goto out_backoff; |
2194 |
} |
2195 |
|
2196 |
/* Modify values to an mptcp-level for the initial window of new subflows */ |
2197 |
void mptcp_select_initial_window(int *__space, __u32 *window_clamp, |
2198 |
const struct sock *sk) |
2199 |
{ |
2200 |
struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; |
2201 |
|
2202 |
*window_clamp = mpcb->orig_window_clamp; |
2203 |
*__space = tcp_win_from_space(mpcb->orig_sk_rcvbuf); |
2204 |
} |
2205 |
|
2206 |
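/* The MSS usable at the MPTCP-level: the smallest current MSS among all |
 * subflows that are allowed to send. |
 */ |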
unsigned int mptcp_current_mss(struct sock *meta_sk) |
2207 |
{ |
2208 |
unsigned int mss = 0; |
2209 |
struct sock *sk; |
2210 |
|
2211 |
mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { |
2212 |
int this_mss; |
2213 |
|
2214 |
if (!mptcp_sk_can_send(sk)) |
2215 |
continue; |
2216 |
|
2217 |
this_mss = tcp_current_mss(sk); |
2218 |
if (!mss || this_mss < mss) |
2219 |
mss = this_mss; |
2220 |
} |
2221 |
|
2222 |
/* If no subflow is available, we take a default-mss from the |
2223 |
* meta-socket. |
2224 |
*/ |
2225 |
return !mss ? tcp_current_mss(meta_sk) : mss; |
2226 |
} |
2227 |
|
2228 |
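/* Size hint for skb allocation at the meta-level: the smallest subflow |
 * mss_cache, adjusted for scatter-gather and GSO capability. |
 */ |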
int mptcp_select_size(const struct sock *meta_sk, bool sg) |
2229 |
{ |
2230 |
int mss = 0; /* We look for the smallest MSS */ |
2231 |
struct sock *sk; |
2232 |
|
2233 |
mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { |
2234 |
int this_mss; |
2235 |
|
2236 |
if (!mptcp_sk_can_send(sk)) |
2237 |
continue; |
2238 |
|
2239 |
this_mss = tcp_sk(sk)->mss_cache; |
2240 |
if (!mss || this_mss < mss) |
2241 |
mss = this_mss; |
2242 |
} |
2243 |
|
2244 |
if (sg) { |
2245 |
if (mptcp_sk_can_gso(meta_sk)) { |
2246 |
mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); |
2247 |
} else { |
2248 |
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); |
2249 |
|
2250 |
if (mss >= pgbreak && |
2251 |
mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) |
2252 |
mss = pgbreak; |
2253 |
} |
2254 |
} |
2255 |
|
2256 |
return !mss ? tcp_sk(meta_sk)->mss_cache : mss; |
2257 |
} |
2258 |
|
2259 |
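/* Scale snd_cwnd by the ratio of the largest subflow RTT to this socket's |
 * srtt (16-bit fixed point) to estimate how many packets may be in flight, |
 * with a floor of reordering + 1. |
 */ |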
int mptcp_check_snd_buf(const struct tcp_sock *tp) |
2260 |
{ |
2261 |
struct sock *sk; |
2262 |
u32 rtt_max = tp->srtt; |
2263 |
u64 bw_est; |
2264 |
|
2265 |
if (!tp->srtt) |
2266 |
return tp->reordering + 1; |
2267 |
|
2268 |
mptcp_for_each_sk(tp->mpcb, sk) { |
2269 |
if (!mptcp_sk_can_send(sk)) |
2270 |
continue; |
2271 |
|
2272 |
if (rtt_max < tcp_sk(sk)->srtt) |
2273 |
rtt_max = tcp_sk(sk)->srtt; |
2274 |
} |
2275 |
|
2276 |
bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16, |
2277 |
(u64)tp->srtt); |
2278 |
|
2279 |
return max_t(unsigned int, (u32)(bw_est >> 16), |
2280 |
tp->reordering + 1); |
2281 |
|
2282 |
} |
2283 |
|
2284 |
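/* The xmit size-goal at the MPTCP-level: the smallest per-subflow goal |
 * among the subflows that can send, but never below mss_now. |
 */ |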
unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now, |
2285 |
int large_allowed) |
2286 |
{ |
2287 |
struct sock *sk; |
2288 |
u32 xmit_size_goal = 0; |
2289 |
|
2290 |
if (large_allowed && mptcp_sk_can_gso(meta_sk)) { |
2291 |
mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { |
2292 |
int this_size_goal; |
2293 |
|
2294 |
if (!mptcp_sk_can_send(sk)) |
2295 |
continue; |
2296 |
|
2297 |
this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1); |
2298 |
if (!xmit_size_goal || this_size_goal < xmit_size_goal) |
2299 |
xmit_size_goal = this_size_goal; |
2300 |
} |
2301 |
} |
2302 |
|
2303 |
return max(xmit_size_goal, mss_now); |
2304 |
} |
2305 |
|
2306 |
/* Similar to tcp_trim_head - but we correctly copy the DSS-option */ |
2307 |
int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) |
2308 |
{ |
2309 |
int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + |
2310 |
MPTCP_SUB_LEN_SEQ_ALIGN; |
2311 |
char dss[dsslen]; |
2312 |
|
2313 |
/* DSS-option must be recovered afterwards. */ |
2314 |
memcpy(dss, skb->data - dsslen, dsslen); |
2315 |
|
2316 |
if (skb_cloned(skb)) { |
2317 |
/* pskb_expand_head will delete our DSS-option. We have to copy |
2318 |
* it back if pskb_expand_head succeeds. |
2319 |
*/ |
2320 |
|
2321 |
if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) |
2322 |
return -ENOMEM; |
2323 |
|
2324 |
memcpy(skb->data - dsslen, dss, dsslen); |
2325 |
} |
2326 |
|
2327 |
__pskb_trim_head(skb, len); |
2328 |
|
2329 |
/* Put the DSS-option back in our header */ |
2330 |
memcpy(skb->data - dsslen, dss, dsslen); |
2331 |
|
2332 |
TCP_SKB_CB(skb)->seq += len; |
2333 |
skb->ip_summed = CHECKSUM_PARTIAL; |
2334 |
|
2335 |
skb->truesize -= len; |
2336 |
sk->sk_wmem_queued -= len; |
2337 |
sk_mem_uncharge(sk, len); |
2338 |
sock_set_flag(sk, SOCK_QUEUE_SHRUNK); |
2339 |
|
2340 |
/* Any change of skb->len requires recalculation of tso factor. */ |
2341 |
if (tcp_skb_pcount(skb) > 1) |
2342 |
tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); |
2343 |
|
2344 |
return 0; |
2345 |
} |