Patch attached to Gentoo bug 477786.

(-)a/drivers/infiniband/hw/cxgb4/cm.c (-1 / +1 lines)
Lines 2921-2927:
 	 */
 	memset(&tmp_opt, 0, sizeof(tmp_opt));
 	tcp_clear_options(&tmp_opt);
-	tcp_parse_options(skb, &tmp_opt, 0, NULL);
+	tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
 
 	req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
 	memset(req, 0, sizeof(*req));
(-)a/drivers/virtio/virtio_ring.c (+2 lines)
Lines 173-178:
 	head = vq->free_head;
 	vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
 	vq->vring.desc[head].addr = virt_to_phys(desc);
+	/* kmemleak gives a false positive, as it's hidden by virt_to_phys */
+	kmemleak_ignore(desc);
 	vq->vring.desc[head].len = i * sizeof(struct vring_desc);
 
 	/* Update free pointer */
(-)a/include/linux/tcp.h (+74 lines)
Lines 72-77:
 	u32	end_seq;
 };
 
+struct tcp_out_options {
+	u16	options;	/* bit field of OPTION_* */
+	u8	ws;		/* window scale, 0 to disable */
+	u8	num_sack_blocks;/* number of SACK blocks to include */
+	u8	hash_size;	/* bytes in hash_location */
+	u16	mss;		/* 0 to disable */
+	__u8	*hash_location;	/* temporary pointer, overloaded */
+	__u32	tsval, tsecr;	/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
+#ifdef CONFIG_MPTCP
+	u16	mptcp_options;	/* bit field of MPTCP related OPTION_* */
+	__sum16	dss_csum;	/* Overloaded field: dss-checksum required
+				 * (for SYN-packets)? Or dss-csum itself */
+
+	__u32	data_seq;	/* data sequence number, for MPTCP */
+	__u32	data_ack;	/* data ack, for MPTCP */
+
+	union {
+		struct {
+			__u64	sender_key;	/* sender's key for mptcp */
+			__u64	receiver_key;	/* receiver's key for mptcp */
+		} mp_capable;
+
+		struct {
+			__u64	sender_truncated_mac;
+			__u32	sender_nonce;
+					/* random number of the sender */
+			__u32	token;	/* token for mptcp */
+		} mp_join_syns;
+	};
+
+	struct mptcp_loc4 *addr4;/* v4 addresses for MPTCP */
+	struct mptcp_loc6 *addr6;/* v6 addresses for MPTCP */
+
+	u16	remove_addrs;	/* list of address id */
+	u8	addr_id;	/* address id */
+#endif /* CONFIG_MPTCP */
+};
+
 /*These are used to set the sack_ok field in struct tcp_options_received */
 #define TCP_SACK_SEEN     (1 << 0)   /*1 = peer is SACK capable, */
 #define TCP_FACK_ENABLED  (1 << 1)   /*1 = FACK is enabled locally*/
Lines 95-100:
 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
 };
 
+struct mptcp_cb;
+struct mptcp_tcp_sock;
+
 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
 	rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
Lines 124-129:
 						  * FastOpen it's the seq#
 						  * after data-in-SYN.
 						  */
+	u8				saw_mpc:1;
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
Lines 320-325:
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
 	struct request_sock *fastopen_rsk;
+
+
+	struct mptcp_cb		*mpcb;
+	struct sock		*meta_sk;
+	/* We keep these flags even if CONFIG_MPTCP is not checked, because
+	 * it allows checking MPTCP capability just by checking the mpc flag,
+	 * rather than adding ifdefs everywhere.
+	 */
+	u16     mpc:1,          /* Other end is multipath capable */
+		inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
+		send_mp_fclose:1,
+		request_mptcp:1, /* Did we send out an MP_CAPABLE?
+				  * (this speeds up mptcp_doit() in tcp_recvmsg)
+				  */
+		pf:1, /* Potentially Failed state: when this flag is set, we
+		       * stop using the subflow
+		       */
+		mp_killed:1, /* Killed with a tcp_done in mptcp? */
+		mptcp_add_addr_ack:1,	/* Tell tcp_send_ack to return in case
+					 * alloc_skb fails. */
+		was_meta_sk:1,	/* This was a meta sk (in case of reuse) */
+		close_it:1,	/* Must close socket in mptcp_data_ready? */
+		closing:1;
+	struct mptcp_tcp_sock *mptcp;
+#ifdef CONFIG_MPTCP
+	struct hlist_nulls_node tk_table;
+	u32		mptcp_loc_token;
+	u64		mptcp_loc_key;
+#endif /* CONFIG_MPTCP */
 };
 
 enum tsq_flags {
Lines 331-336:
 	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
 				    * tcp_v{4|6}_mtu_reduced()
 				    */
+	MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
Lines 349-354:
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
+	struct mptcp_tw		  *mptcp_tw;
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
(-)a/include/net/inet6_connection_sock.h (+2 lines)
Lines 25-30:
 extern int inet6_csk_bind_conflict(const struct sock *sk,
 				   const struct inet_bind_bucket *tb, bool relax);
 
+extern u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
+			   const u32 rnd, const u32 synq_hsize);
 extern struct dst_entry* inet6_csk_route_req(struct sock *sk,
 					     struct flowi6 *fl6,
 					     const struct request_sock *req);
(-)a/include/net/inet_common.h (+6 lines)
Lines 1-6:
 #ifndef _INET_COMMON_H
 #define _INET_COMMON_H
 
+#include <net/sock.h>
+
 extern const struct proto_ops inet_stream_ops;
 extern const struct proto_ops inet_dgram_ops;
 
Lines 13-18:
 struct sockaddr;
 struct socket;
 
+extern int inet_create(struct net *net, struct socket *sock, int protocol,
+		       int kern);
+extern int inet6_create(struct net *net, struct socket *sock, int protocol,
+			int kern);
 extern int inet_release(struct socket *sock);
 extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			       int addr_len, int flags);
(-)a/include/net/inet_connection_sock.h (+2 lines)
Lines 243-248:
 
 extern struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
 
+extern u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
+			  const u32 synq_hsize);
 extern struct request_sock *inet_csk_search_req(const struct sock *sk,
 						struct request_sock ***prevp,
 						const __be16 rport,
(-)a/include/net/mptcp.h (+1417 lines)
Line 0 (new file):
1
/*
2
 *	MPTCP implementation
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#ifndef _MPTCP_H
31
#define _MPTCP_H
32
33
#include <linux/inetdevice.h>
34
#include <linux/ipv6.h>
35
#include <linux/list.h>
36
#include <linux/net.h>
37
#include <linux/skbuff.h>
38
#include <linux/socket.h>
39
#include <linux/tcp.h>
40
#include <linux/kernel.h>
41
42
#include <asm/byteorder.h>
43
#include <asm/unaligned.h>
44
#include <crypto/hash.h>
45
#include <net/mptcp_pm.h>
46
#include <net/tcp.h>
47
48
#if defined(__LITTLE_ENDIAN_BITFIELD)
49
	#define ntohll(x)  be64_to_cpu(x)
50
	#define htonll(x)  cpu_to_be64(x)
51
#elif defined(__BIG_ENDIAN_BITFIELD)
52
	#define ntohll(x) (x)
53
	#define htonll(x) (x)
54
#endif
55
56
/* is seq1 < seq2 ? */
57
static inline int before64(const u64 seq1, const u64 seq2)
58
{
59
	return (s64)(seq1 - seq2) < 0;
60
}
61
62
/* is seq1 > seq2 ? */
63
#define after64(seq1, seq2)	before64(seq2, seq1)
64
65
struct mptcp_request_sock {
66
	struct tcp_request_sock		req;
67
	struct mptcp_cb			*mpcb;
68
	/* Collision list in the tuple hashtable. We need to find
69
	 * the req sock when receiving the third msg of the 3-way handshake,
70
	 * since that one does not contain the token. If this makes
71
	 * the request sock too long, we can use kmalloc'ed specific entries for
72
	 * that tuple hashtable. At the moment, though, I extend the
73
	 * request_sock.
74
	 */
75
	struct list_head		collide_tuple;
76
	struct hlist_nulls_node		collide_tk;
77
	u32				mptcp_rem_nonce;
78
	u32				mptcp_loc_token;
79
	u64				mptcp_loc_key;
80
	u64				mptcp_rem_key;
81
	u64				mptcp_hash_tmac;
82
	u32				mptcp_loc_nonce;
83
	__u8				rem_id; /* Address-id in the MP_JOIN */
84
	u8				dss_csum:1,
85
					low_prio:1;
86
};
87
88
static inline
89
struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
90
{
91
	return (struct mptcp_request_sock *)req;
92
}
93
94
static inline
95
struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
96
{
97
	return (struct request_sock *)req;
98
}
99
100
struct mptcp_options_received {
101
	u16	saw_mpc:1,
102
		dss_csum:1,
103
		drop_me:1,
104
105
		is_mp_join:1,
106
		join_ack:1,
107
108
		saw_low_prio:2, /* 0x1 - low-prio set for this subflow
109
				 * 0x2 - low-prio set for another subflow
110
				 */
111
		low_prio:1,
112
113
		saw_add_addr:2, /* Saw at least one add_addr option:
114
				 * 0x1: IPv4 - 0x2: IPv6
115
				 */
116
		more_add_addr:1, /* Saw one more add-addr. */
117
118
		saw_rem_addr:1, /* Saw at least one rem_addr option */
119
		more_rem_addr:1, /* Saw one more rem-addr. */
120
121
		mp_fail:1,
122
		mp_fclose:1;
123
	u8	rem_id;		/* Address-id in the MP_JOIN */
124
	u8	prio_addr_id;	/* Address-id in the MP_PRIO */
125
126
	const unsigned char *add_addr_ptr; /* Pointer to add-address option */
127
	const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
128
129
	u32	data_ack;
130
	u32	data_seq;
131
	u16	data_len;
132
133
	u32	mptcp_rem_token;/* Remote token */
134
135
	/* Key inside the option (from mp_capable or fast_close) */
136
	u64	mptcp_key;
137
138
	u32	mptcp_recv_nonce;
139
	u64	mptcp_recv_tmac;
140
	u8	mptcp_recv_mac[20];
141
};
142
143
struct mptcp_tcp_sock {
144
	struct tcp_sock	*next;		/* Next subflow socket */
145
	struct sock	*next_cb;
146
	struct mptcp_options_received rx_opt;
147
148
	 /* Those three fields record the current mapping */
149
	u64	map_data_seq;
150
	u32	map_subseq;
151
	u16	map_data_len;
152
	u16	slave_sk:1,
153
		nonce_set:1, /* Is the nonce set? (in order to support 0-nonce) */
154
		fully_established:1,
155
		establish_increased:1,
156
		second_packet:1,
157
		attached:1,
158
		send_mp_fail:1,
159
		include_mpc:1,
160
		mapping_present:1,
161
		map_data_fin:1,
162
		low_prio:1, /* use this socket as backup */
163
		rcv_low_prio:1, /* Peer sent low-prio option to us */
164
		send_mp_prio:1, /* Trigger to send mp_prio on this socket */
165
		pre_established:1; /* State between sending 3rd ACK and
166
				    * receiving the fourth ack of new subflows.
167
				    */
168
169
	/* isn: needed to translate abs to relative subflow seqnums */
170
	u32	snt_isn;
171
	u32	rcv_isn;
172
	u32	last_data_seq;
173
	u8	path_index;
174
	u8	add_addr4; /* bit-field of addrs not yet sent to our peer */
175
	u8	add_addr6;
176
	u8	rem_id;
177
178
	u32	last_rbuf_opti;	/* Timestamp of last rbuf optimization */
179
	unsigned int sent_pkts;
180
181
	struct sk_buff  *shortcut_ofoqueue; /* Shortcut to the current modified
182
					     * skb in the ofo-queue.
183
					     */
184
185
	int	init_rcv_wnd;
186
	u32	infinite_cutoff_seq;
187
	struct delayed_work work;
188
	u32	mptcp_loc_nonce;
189
	struct tcp_sock *tp; /* Where is my daddy? */
190
	u32	last_end_data_seq;
191
192
	/* MP_JOIN subflow: timer for retransmitting the 3rd ack */
193
	struct timer_list mptcp_ack_timer;
194
195
	/* HMAC of the third ack */
196
	char sender_mac[20];
197
};
198
199
struct mptcp_tw {
200
	struct list_head list;
201
	u64 loc_key;
202
	u64 rcv_nxt;
203
	struct mptcp_cb __rcu *mpcb;
204
	u8 meta_tw:1,
205
	   in_list:1;
206
};
207
208
struct mptcp_cb {
209
	struct sock *meta_sk;
210
211
	/* list of sockets in this multipath connection */
212
	struct tcp_sock *connection_list;
213
	/* list of sockets that need a call to release_cb */
214
	struct sock *callback_list;
215
216
	spinlock_t	 tw_lock;
217
	struct list_head tw_list;
218
	unsigned char	 mptw_state;
219
220
	atomic_t	refcnt;
221
222
	/* High-order bits of 64-bit sequence numbers */
223
	u32 snd_high_order[2];
224
	u32 rcv_high_order[2];
225
226
	u16	send_infinite_mapping:1,
227
		in_time_wait:1,
228
		list_rcvd:1, /* XXX TO REMOVE */
229
		dss_csum:1,
230
		server_side:1,
231
		infinite_mapping_rcv:1,
232
		infinite_mapping_snd:1,
233
		dfin_combined:1,   /* Was the DFIN combined with subflow-fin? */
234
		passive_close:1,
235
		snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
236
		rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
237
238
	/* socket count in this connection */
239
	u8 cnt_subflows;
240
	u8 cnt_established;
241
242
	u32 noneligible;	/* Path mask of temporarily non
243
				 * eligible subflows by the scheduler
244
				 */
245
246
	struct sk_buff_head reinject_queue;
247
248
	u16 remove_addrs;
249
250
	u8 dfin_path_index;
251
	/* Worker struct for subflow establishment */
252
	struct work_struct subflow_work;
253
	struct delayed_work subflow_retry_work;
254
	/* Worker to handle interface/address changes if socket is owned */
255
	struct work_struct address_work;
256
	/* Mutex needed, because otherwise mptcp_close will complain that the
257
	 * socket is owned by the user.
258
	 * E.g., mptcp_sub_close_wq is taking the meta-lock.
259
	 */
260
	struct mutex mutex;
261
262
	/* Master socket, also part of the connection_list, this
263
	 * socket is the one that the application sees.
264
	 */
265
	struct sock *master_sk;
266
267
	u64	csum_cutoff_seq;
268
269
	__u64	mptcp_loc_key;
270
	__u32	mptcp_loc_token;
271
	__u64	mptcp_rem_key;
272
	__u32	mptcp_rem_token;
273
274
	/* Create a new subflow - necessary because the meta-sk may be IPv4, but
275
	 * the new subflow can be IPv6
276
	 */
277
	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
278
				      struct request_sock *req,
279
				      struct dst_entry *dst);
280
281
	/* Local addresses */
282
	struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
283
	u8 loc4_bits; /* Bitfield indicating which of the above addrs are set */
284
	u8 next_v4_index;
285
286
	struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
287
	u8 loc6_bits;
288
	u8 next_v6_index;
289
290
	/* Remove addresses */
291
	struct mptcp_rem4 remaddr4[MPTCP_MAX_ADDR];
292
	u8 rem4_bits;
293
294
	struct mptcp_rem6 remaddr6[MPTCP_MAX_ADDR];
295
	u8 rem6_bits;
296
297
	u32 path_index_bits;
298
	/* Next pi to pick up in case a new path becomes available */
299
	u8 next_path_index;
300
301
	/* Original snd/rcvbuf of the initial subflow.
302
	 * Used for the new subflows on the server-side to allow correct
303
	 * autotuning
304
	 */
305
	int orig_sk_rcvbuf;
306
	int orig_sk_sndbuf;
307
	u32 orig_window_clamp;
308
};
309
310
static inline int mptcp_pi_to_flag(int pi)
311
{
312
	return 1 << (pi - 1);
313
}
314
315
#define MPTCP_SUB_CAPABLE			0
316
#define MPTCP_SUB_LEN_CAPABLE_SYN		12
317
#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN		12
318
#define MPTCP_SUB_LEN_CAPABLE_ACK		20
319
#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN		20
320
321
#define MPTCP_SUB_JOIN			1
322
#define MPTCP_SUB_LEN_JOIN_SYN		12
323
#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN	12
324
#define MPTCP_SUB_LEN_JOIN_SYNACK	16
325
#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN	16
326
#define MPTCP_SUB_LEN_JOIN_ACK		24
327
#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN	24
328
329
#define MPTCP_SUB_DSS		2
330
#define MPTCP_SUB_LEN_DSS	4
331
#define MPTCP_SUB_LEN_DSS_ALIGN	4
332
333
/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
334
 * as they are part of the DSS-option.
335
 * To get the total length, just add the different options together.
336
 */
337
#define MPTCP_SUB_LEN_SEQ	10
338
#define MPTCP_SUB_LEN_SEQ_CSUM	12
339
#define MPTCP_SUB_LEN_SEQ_ALIGN	12
340
341
#define MPTCP_SUB_LEN_SEQ_64		14
342
#define MPTCP_SUB_LEN_SEQ_CSUM_64	16
343
#define MPTCP_SUB_LEN_SEQ_64_ALIGN	16
344
345
#define MPTCP_SUB_LEN_ACK	4
346
#define MPTCP_SUB_LEN_ACK_ALIGN	4
347
348
#define MPTCP_SUB_LEN_ACK_64		8
349
#define MPTCP_SUB_LEN_ACK_64_ALIGN	8
350
351
/* This is the "default" option-length we will send out most often.
352
 * MPTCP DSS-header
353
 * 32-bit data sequence number
354
 * 32-bit data ack
355
 *
356
 * It is necessary to calculate the effective MSS we will be using when
357
 * sending data.
358
 */
359
#define MPTCP_SUB_LEN_DSM_ALIGN  (MPTCP_SUB_LEN_DSS_ALIGN +		\
360
				  MPTCP_SUB_LEN_SEQ_ALIGN +		\
361
				  MPTCP_SUB_LEN_ACK_ALIGN)
362
363
#define MPTCP_SUB_ADD_ADDR		3
364
#define MPTCP_SUB_LEN_ADD_ADDR4		8
365
#define MPTCP_SUB_LEN_ADD_ADDR6		20
366
#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN	8
367
#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN	20
368
369
#define MPTCP_SUB_REMOVE_ADDR	4
370
#define MPTCP_SUB_LEN_REMOVE_ADDR	4
371
372
#define MPTCP_SUB_PRIO		5
373
#define MPTCP_SUB_LEN_PRIO	3
374
#define MPTCP_SUB_LEN_PRIO_ADDR	4
375
#define MPTCP_SUB_LEN_PRIO_ALIGN	4
376
377
#define MPTCP_SUB_FAIL		6
378
#define MPTCP_SUB_LEN_FAIL	12
379
#define MPTCP_SUB_LEN_FAIL_ALIGN	12
380
381
#define MPTCP_SUB_FCLOSE	7
382
#define MPTCP_SUB_LEN_FCLOSE	12
383
#define MPTCP_SUB_LEN_FCLOSE_ALIGN	12
384
385
386
#define OPTION_MPTCP		(1 << 5)
387
388
#ifdef CONFIG_MPTCP
389
390
/* MPTCP options */
391
#define OPTION_TYPE_SYN		(1 << 0)
392
#define OPTION_TYPE_SYNACK	(1 << 1)
393
#define OPTION_TYPE_ACK		(1 << 2)
394
#define OPTION_MP_CAPABLE	(1 << 3)
395
#define OPTION_DATA_ACK		(1 << 4)
396
#define OPTION_ADD_ADDR		(1 << 5)
397
#define OPTION_MP_JOIN		(1 << 6)
398
#define OPTION_MP_FAIL		(1 << 7)
399
#define OPTION_MP_FCLOSE	(1 << 8)
400
#define OPTION_REMOVE_ADDR	(1 << 9)
401
#define OPTION_MP_PRIO		(1 << 10)
402
403
/* Used for checking if the mptcp initialization has been successful */
404
extern bool mptcp_init_failed;
405
406
struct mptcp_option {
407
	__u8	kind;
408
	__u8	len;
409
#if defined(__LITTLE_ENDIAN_BITFIELD)
410
	__u8	ver:4,
411
		sub:4;
412
#elif defined(__BIG_ENDIAN_BITFIELD)
413
	__u8	sub:4,
414
		ver:4;
415
#else
416
#error	"Adjust your <asm/byteorder.h> defines"
417
#endif
418
};
419
420
struct mp_capable {
421
	__u8	kind;
422
	__u8	len;
423
#if defined(__LITTLE_ENDIAN_BITFIELD)
424
	__u8	ver:4,
425
		sub:4;
426
	__u8	h:1,
427
		rsv:5,
428
		b:1,
429
		a:1;
430
#elif defined(__BIG_ENDIAN_BITFIELD)
431
	__u8	sub:4,
432
		ver:4;
433
	__u8	a:1,
434
		b:1,
435
		rsv:5,
436
		h:1;
437
#else
438
#error	"Adjust your <asm/byteorder.h> defines"
439
#endif
440
	__u64	sender_key;
441
	__u64	receiver_key;
442
} __attribute__((__packed__));
443
444
struct mp_join {
445
	__u8	kind;
446
	__u8	len;
447
#if defined(__LITTLE_ENDIAN_BITFIELD)
448
	__u8	b:1,
449
		rsv:3,
450
		sub:4;
451
#elif defined(__BIG_ENDIAN_BITFIELD)
452
	__u8	sub:4,
453
		rsv:3,
454
		b:1;
455
#else
456
#error	"Adjust your <asm/byteorder.h> defines"
457
#endif
458
	__u8	addr_id;
459
	union {
460
		struct {
461
			u32	token;
462
			u32	nonce;
463
		} syn;
464
		struct {
465
			__u64	mac;
466
			u32	nonce;
467
		} synack;
468
		struct {
469
			__u8	mac[20];
470
		} ack;
471
	} u;
472
} __attribute__((__packed__));
473
474
struct mp_dss {
475
	__u8	kind;
476
	__u8	len;
477
#if defined(__LITTLE_ENDIAN_BITFIELD)
478
	__u16	rsv1:4,
479
		sub:4,
480
		A:1,
481
		a:1,
482
		M:1,
483
		m:1,
484
		F:1,
485
		rsv2:3;
486
#elif defined(__BIG_ENDIAN_BITFIELD)
487
	__u16	sub:4,
488
		rsv1:4,
489
		rsv2:3,
490
		F:1,
491
		m:1,
492
		M:1,
493
		a:1,
494
		A:1;
495
#else
496
#error	"Adjust your <asm/byteorder.h> defines"
497
#endif
498
};
499
500
struct mp_add_addr {
501
	__u8	kind;
502
	__u8	len;
503
#if defined(__LITTLE_ENDIAN_BITFIELD)
504
	__u8	ipver:4,
505
		sub:4;
506
#elif defined(__BIG_ENDIAN_BITFIELD)
507
	__u8	sub:4,
508
		ipver:4;
509
#else
510
#error	"Adjust your <asm/byteorder.h> defines"
511
#endif
512
	__u8	addr_id;
513
	union {
514
		struct {
515
			struct in_addr	addr;
516
			__be16		port;
517
		} v4;
518
		struct {
519
			struct in6_addr	addr;
520
			__be16		port;
521
		} v6;
522
	} u;
523
} __attribute__((__packed__));
524
525
struct mp_remove_addr {
526
	__u8	kind;
527
	__u8	len;
528
#if defined(__LITTLE_ENDIAN_BITFIELD)
529
	__u8	rsv:4,
530
		sub:4;
531
#elif defined(__BIG_ENDIAN_BITFIELD)
532
	__u8	sub:4,
533
		rsv:4;
534
#else
535
#error "Adjust your <asm/byteorder.h> defines"
536
#endif
537
	/* list of addr_id */
538
	__u8	addrs_id;
539
};
540
541
struct mp_fail {
542
	__u8	kind;
543
	__u8	len;
544
#if defined(__LITTLE_ENDIAN_BITFIELD)
545
	__u16	rsv1:4,
546
		sub:4,
547
		rsv2:8;
548
#elif defined(__BIG_ENDIAN_BITFIELD)
549
	__u16	sub:4,
550
		rsv1:4,
551
		rsv2:8;
552
#else
553
#error	"Adjust your <asm/byteorder.h> defines"
554
#endif
555
	__be64	data_seq;
556
} __attribute__((__packed__));
557
558
struct mp_fclose {
559
	__u8	kind;
560
	__u8	len;
561
#if defined(__LITTLE_ENDIAN_BITFIELD)
562
	__u16	rsv1:4,
563
		sub:4,
564
		rsv2:8;
565
#elif defined(__BIG_ENDIAN_BITFIELD)
566
	__u16	sub:4,
567
		rsv1:4,
568
		rsv2:8;
569
#else
570
#error	"Adjust your <asm/byteorder.h> defines"
571
#endif
572
	__u64	key;
573
} __attribute__((__packed__));
574
575
struct mp_prio {
576
	__u8	kind;
577
	__u8	len;
578
#if defined(__LITTLE_ENDIAN_BITFIELD)
579
	__u8	b:1,
580
		rsv:3,
581
		sub:4;
582
#elif defined(__BIG_ENDIAN_BITFIELD)
583
	__u8	sub:4,
584
		rsv:3,
585
		b:1;
586
#else
587
#error	"Adjust your <asm/byteorder.h> defines"
588
#endif
589
	__u8	addr_id;
590
} __attribute__((__packed__));
591
592
static inline int mptcp_sub_len_remove_addr(u16 bitfield)
593
{
594
	unsigned int c;
595
	for (c = 0; bitfield; c++)
596
		bitfield &= bitfield - 1;
597
	return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
598
}
599
600
static inline int mptcp_sub_len_remove_addr_align(u16 bitfield)
601
{
602
	return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
603
}
604
605
static inline int mptcp_sub_len_dss(struct mp_dss *m, int csum)
606
{
607
	return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
608
}
609
610
/* Default MSS for MPTCP
611
 * All subflows will be using that MSS. If any subflow has a lower MSS, it is
612
 * just not used. */
613
#define MPTCP_MSS 1400
614
#define MPTCP_SYN_RETRIES 3
615
extern int sysctl_mptcp_ndiffports;
616
extern int sysctl_mptcp_enabled;
617
extern int sysctl_mptcp_checksum;
618
extern int sysctl_mptcp_debug;
619
extern int sysctl_mptcp_syn_retries;
620
621
extern struct workqueue_struct *mptcp_wq;
622
623
#define mptcp_debug(fmt, args...)					\
624
	do {								\
625
		if (unlikely(sysctl_mptcp_debug))			\
626
			pr_err(__FILE__ ": " fmt, ##args);	\
627
	} while (0)
628
629
/* Iterates over all subflows */
630
#define mptcp_for_each_tp(mpcb, tp)					\
631
	for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
632
633
#define mptcp_for_each_sk(mpcb, sk)					\
634
	for ((sk) = (struct sock *)(mpcb)->connection_list;		\
635
	     sk;							\
636
	     sk = (struct sock *)tcp_sk(sk)->mptcp->next)
637
638
#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)			\
639
	for (__sk = (struct sock *)(__mpcb)->connection_list,		\
640
	     __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
641
	     __sk;							\
642
	     __sk = __temp,						\
643
	     __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
644
645
/* Iterates over all bit set to 1 in a bitset */
646
#define mptcp_for_each_bit_set(b, i)					\
647
	for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
648
649
#define mptcp_for_each_bit_unset(b, i)					\
650
	mptcp_for_each_bit_set(~b, i)
651
652
extern struct lock_class_key meta_key;
653
extern struct lock_class_key meta_slock_key;
654
extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
655
656
/* This is needed to ensure that two subsequent key-generation result in
657
 * different keys if the IPs and ports are the same.
658
 */
659
extern u32 mptcp_key_seed;
660
661
void mptcp_data_ready(struct sock *sk, int bytes);
662
void mptcp_write_space(struct sock *sk);
663
664
void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb,
665
			      struct sock *sk);
666
void mptcp_ofo_queue(struct sock *meta_sk);
667
void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
668
void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
669
int mptcp_alloc_mpcb(struct sock *master_sk, __u64 remote_key, u32 window);
670
int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 rem_id, gfp_t flags);
671
void mptcp_del_sock(struct sock *sk);
672
void mptcp_update_metasocket(struct sock *sock, struct sock *meta_sk);
673
void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
674
void mptcp_update_sndbuf(struct mptcp_cb *mpcb);
675
struct sk_buff *mptcp_next_segment(struct sock *sk, int *reinject);
676
void mptcp_send_fin(struct sock *meta_sk);
677
void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
678
int mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
679
		     int push_one, gfp_t gfp);
680
void mptcp_parse_options(const uint8_t *ptr, int opsize,
681
			 struct tcp_options_received *opt_rx,
682
			 struct mptcp_options_received *mopt,
683
			 const struct sk_buff *skb);
684
void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts,
685
		       unsigned *remaining);
686
void mptcp_synack_options(struct request_sock *req,
687
			  struct tcp_out_options *opts,
688
			  unsigned *remaining);
689
void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
690
			       struct tcp_out_options *opts, unsigned *size);
691
void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
692
			 struct tcp_out_options *opts,
693
			 struct sk_buff *skb);
694
void mptcp_close(struct sock *meta_sk, long timeout);
695
int mptcp_doit(struct sock *sk);
696
int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
697
int mptcp_check_req_master(struct sock *sk, struct sock *child,
698
			   struct request_sock *req,
699
			   struct request_sock **prev,
700
			   struct mptcp_options_received *mopt);
701
struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
702
				   struct request_sock *req,
703
				   struct request_sock **prev,
704
				   struct mptcp_options_received *mopt);
705
u32 __mptcp_select_window(struct sock *sk);
706
void mptcp_select_initial_window(int *__space, __u32 *window_clamp,
707
			         const struct sock *sk);
708
unsigned int mptcp_current_mss(struct sock *meta_sk);
709
int mptcp_select_size(const struct sock *meta_sk, bool sg);
710
void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
711
void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
712
		     u32 *hash_out);
713
void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk);
714
void mptcp_fin(struct sock *meta_sk);
715
void mptcp_retransmit_timer(struct sock *meta_sk);
716
int mptcp_write_wakeup(struct sock *meta_sk);
717
void mptcp_sub_close_wq(struct work_struct *work);
718
void mptcp_sub_close(struct sock *sk, unsigned long delay);
719
struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied);
720
void mptcp_fallback_meta_sk(struct sock *meta_sk);
721
int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
722
struct sock *mptcp_sk_clone(const struct sock *sk, int family, const gfp_t priority);
723
void mptcp_ack_handler(unsigned long);
724
void mptcp_set_keepalive(struct sock *sk, int val);
725
int mptcp_check_rtt(const struct tcp_sock *tp, int time);
726
int mptcp_check_snd_buf(const struct tcp_sock *tp);
727
int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb);
728
void __init mptcp_init(void);
729
int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
730
int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
731
		   unsigned int mss_now, int reinject);
732
int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
733
		   unsigned int mss_now, gfp_t gfp, int reinject);
734
void mptcp_destroy_sock(struct sock *sk);
735
int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
736
				    struct sk_buff *skb,
737
				    struct mptcp_options_received *mopt);
738
unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now,
739
				  int large_allowed);
740
int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw);
741
void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
742
void mptcp_update_tw_socks(const struct tcp_sock *tp, int state);
743
int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
744
void mptcp_tsq_flags(struct sock *sk);
745
void mptcp_tsq_sub_deferred(struct sock *meta_sk);
746
747
static inline bool mptcp_can_sendpage(struct sock *sk)
748
{
749
	struct sock *sk_it;
750
751
	if (tcp_sk(sk)->mpcb->dss_csum)
752
		return false;
753
754
	mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
755
		if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
756
		    !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
757
			return false;
758
	}
759
760
	return true;
761
}
762
763
static inline void mptcp_push_pending_frames(struct sock *meta_sk)
764
{
765
	if (mptcp_next_segment(meta_sk, NULL)) {
766
		struct tcp_sock *tp = tcp_sk(meta_sk);
767
768
		/* We don't care about the MSS, because it will be set in
769
		 * mptcp_write_xmit.
770
		 */
771
		__tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
772
	}
773
}
774
775
static inline void mptcp_sub_force_close(struct sock *sk)
776
{
777
	/* The below tcp_done may have freed the socket, if he is already dead.
778
	 * Thus, we are not allowed to access it afterwards. That's why
779
	 * we have to store the dead-state in this local variable.
780
	 */
781
	int sock_is_dead = sock_flag(sk, SOCK_DEAD);
782
783
	tcp_sk(sk)->mp_killed = 1;
784
785
	if (sk->sk_state != TCP_CLOSE)
786
		tcp_done(sk);
787
788
	if (!sock_is_dead)
789
		mptcp_sub_close(sk, 0);
790
}
791
792
static inline void mptcp_send_reset(struct sock *sk)
793
{
794
	tcp_send_active_reset(sk, GFP_ATOMIC);
795
	mptcp_sub_force_close(sk);
796
}
797
798
static inline int mptcp_is_data_seq(const struct sk_buff *skb)
799
{
800
	return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
801
}
802
803
static inline int mptcp_is_data_fin(const struct sk_buff *skb)
804
{
805
	return mptcp_is_data_seq(skb) &&
806
	       (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN);
807
}
808
809
/* Is it a data-fin while in infinite mapping mode?
810
 * In infinite mode, a subflow-fin is in fact a data-fin.
811
 */
812
static inline int mptcp_is_data_fin2(const struct sk_buff *skb,
813
				     const struct tcp_sock *tp)
814
{
815
	return mptcp_is_data_fin(skb) ||
816
	       (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
817
}
818
819
static inline void mptcp_skb_entail_init(const struct tcp_sock *tp,
820
					 struct sk_buff *skb)
821
{
822
	if (tp->mpc)
823
		TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_SEQ;
824
}
825
826
static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
827
{
828
	u64 data_seq_high = (u32)(data_seq >> 32);
829
830
	if (mpcb->rcv_high_order[0] == data_seq_high)
831
		return 0;
832
	else if (mpcb->rcv_high_order[1] == data_seq_high)
833
		return MPTCPHDR_SEQ64_INDEX;
834
	else
835
		return MPTCPHDR_SEQ64_OFO;
836
}
837
838
/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
839
 * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
840
 */
841
static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
842
					    u32 *data_seq,
843
					    struct mptcp_cb *mpcb)
844
{
845
	__u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
846
847
	if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
848
		u64 data_seq64 = get_unaligned_be64(ptr);
849
850
		if (mpcb)
851
			TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
852
853
		*data_seq = (u32)data_seq64 ;
854
		ptr++;
855
	} else {
856
		*data_seq = get_unaligned_be32(ptr);
857
	}
858
859
	return ptr;
860
}
861
862
static inline struct sock *mptcp_meta_sk(const struct sock *sk)
863
{
864
	return tcp_sk(sk)->meta_sk;
865
}
866
867
static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
868
{
869
	return tcp_sk(tp->meta_sk);
870
}
871
872
static inline int is_meta_tp(const struct tcp_sock *tp)
873
{
874
	return tp->mpcb && mptcp_meta_tp(tp) == tp;
875
}
876
877
static inline int is_meta_sk(const struct sock *sk)
878
{
879
	return sk->sk_type == SOCK_STREAM  && sk->sk_protocol == IPPROTO_TCP &&
880
	       tcp_sk(sk)->mpc && mptcp_meta_sk(sk) == sk;
881
}
882
883
static inline int is_master_tp(const struct tcp_sock *tp)
884
{
885
	return !tp->mpc || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
886
}
887
888
static inline void mptcp_hash_request_remove(struct request_sock *req)
889
{
890
	int in_softirq = 0;
891
892
	if (list_empty(&mptcp_rsk(req)->collide_tuple))
893
		return;
894
895
	if (in_softirq()) {
896
		spin_lock(&mptcp_reqsk_hlock);
897
		in_softirq = 1;
898
	} else {
899
		spin_lock_bh(&mptcp_reqsk_hlock);
900
	}
901
902
	list_del(&mptcp_rsk(req)->collide_tuple);
903
904
	if (in_softirq)
905
		spin_unlock(&mptcp_reqsk_hlock);
906
	else
907
		spin_unlock_bh(&mptcp_reqsk_hlock);
908
}
909
910
static inline void mptcp_reqsk_destructor(struct request_sock *req)
911
{
912
	if (!mptcp_rsk(req)->mpcb) {
913
		if (hlist_nulls_unhashed(&mptcp_rsk(req)->collide_tk))
914
			return;
915
916
		if (in_softirq()) {
917
			mptcp_reqsk_remove_tk(req);
918
		} else {
919
			rcu_read_lock_bh();
920
			spin_lock(&mptcp_tk_hashlock);
921
			hlist_nulls_del_rcu(&mptcp_rsk(req)->collide_tk);
922
			spin_unlock(&mptcp_tk_hashlock);
923
			rcu_read_unlock_bh();
924
		}
925
	} else {
926
		mptcp_hash_request_remove(req);
927
	}
928
}
929
930
static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
931
{
932
	mopt->saw_mpc = 0;
933
	mopt->dss_csum = 0;
934
	mopt->drop_me = 0;
935
936
	mopt->is_mp_join = 0;
937
	mopt->join_ack = 0;
938
939
	mopt->saw_low_prio = 0;
940
	mopt->low_prio = 0;
941
942
	mopt->saw_add_addr = 0;
943
	mopt->more_add_addr = 0;
944
945
	mopt->saw_rem_addr = 0;
946
	mopt->more_rem_addr = 0;
947
948
	mopt->mp_fail = 0;
949
	mopt->mp_fclose = 0;
950
}
951
952
static inline void mptcp_reset_mopt(struct tcp_sock *tp)
953
{
954
	struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
955
956
	mopt->saw_low_prio = 0;
957
	mopt->saw_add_addr = 0;
958
	mopt->more_add_addr = 0;
959
	mopt->saw_rem_addr = 0;
960
	mopt->more_rem_addr = 0;
961
	mopt->join_ack = 0;
962
	mopt->mp_fail = 0;
963
	mopt->mp_fclose = 0;
964
}
965
966
static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
967
						 const struct mptcp_cb *mpcb)
968
{
969
	return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
970
			MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
971
}
972
973
static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
974
					u32 data_seq_32)
975
{
976
	return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
977
}
978
979
static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
980
{
981
	struct mptcp_cb *mpcb = meta_tp->mpcb;
982
	return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
983
				     meta_tp->rcv_nxt);
984
}
985
986
static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
987
{
988
	if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
989
		struct mptcp_cb *mpcb = meta_tp->mpcb;
990
		mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
991
		mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
992
	}
993
}
994
995
static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
996
					   u32 old_rcv_nxt)
997
{
998
	if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
999
		struct mptcp_cb *mpcb = meta_tp->mpcb;
1000
		mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
1001
		mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
1002
	}
1003
}
1004
1005
static inline int mptcp_sk_can_send(const struct sock *sk)
1006
{
1007
	return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT);
1008
}
1009
1010
static inline int mptcp_sk_can_recv(const struct sock *sk)
1011
{
1012
	return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCP_FIN_WAIT1 | TCP_FIN_WAIT2);
1013
}
1014
1015
static inline int mptcp_sk_can_send_ack(const struct sock *sk)
1016
{
1017
	return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
1018
					TCPF_CLOSE | TCPF_LISTEN));
1019
}
1020
1021
/* Only support GSO if all subflows supports it */
1022
static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
1023
{
1024
	struct sock *sk;
1025
1026
	if (tcp_sk(meta_sk)->mpcb->dss_csum)
1027
		return 0;
1028
1029
	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1030
		if (!mptcp_sk_can_send(sk))
1031
			continue;
1032
		if (!sk_can_gso(sk))
1033
			return false;
1034
	}
1035
	return true;
1036
}
1037
1038
static inline bool mptcp_can_sg(const struct sock *meta_sk)
1039
{
1040
	struct sock *sk;
1041
1042
	if (tcp_sk(meta_sk)->mpcb->dss_csum)
1043
		return 0;
1044
1045
	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1046
		if (!mptcp_sk_can_send(sk))
1047
			continue;
1048
		if (!(sk->sk_route_caps & NETIF_F_SG))
1049
			return false;
1050
	}
1051
	return true;
1052
}
1053
1054
/* Adding a new subflow to the rcv-buffer space. We make a simple addition,
1055
 * to give some space to allow traffic on the new subflow. Autotuning will
1056
 * increase it further later on.
1057
 */
1058
static inline void mptcp_init_buffer_space(struct sock *sk)
1059
{
1060
	struct sock *meta_sk = mptcp_meta_sk(sk);
1061
	int space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
1062
1063
	if (space > meta_sk->sk_rcvbuf) {
1064
		tcp_sk(meta_sk)->window_clamp += tcp_sk(sk)->window_clamp;
1065
		tcp_sk(meta_sk)->rcv_ssthresh += tcp_sk(sk)->rcv_ssthresh;
1066
		meta_sk->sk_rcvbuf = space;
1067
	}
1068
}
1069
1070
static inline void mptcp_set_rto(struct sock *sk)
1071
{
1072
	struct tcp_sock *tp = tcp_sk(sk);
1073
	struct sock *sk_it;
1074
	struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
1075
	__u32 max_rto = 0;
1076
1077
	if (!tp->mpc)
1078
		return;
1079
1080
	/* We are in recovery-phase on the MPTCP-level. Do not update the
1081
	 * RTO, because this would kill exponential backoff.
1082
	 */
1083
	if (micsk->icsk_retransmits)
1084
		return;
1085
1086
	mptcp_for_each_sk(tp->mpcb, sk_it) {
1087
		if (mptcp_sk_can_send(sk_it) &&
1088
		    inet_csk(sk_it)->icsk_rto > max_rto)
1089
			max_rto = inet_csk(sk_it)->icsk_rto;
1090
	}
1091
	if (max_rto) {
1092
		micsk->icsk_rto = max_rto << 1;
1093
1094
		/* A successfull rto-measurement - reset backoff counter */
1095
		micsk->icsk_backoff = 0;
1096
	}
1097
}
1098
1099
static inline int mptcp_sysctl_syn_retries(void)
1100
{
1101
	return sysctl_mptcp_syn_retries;
1102
}
1103
1104
static inline void mptcp_sub_close_passive(struct sock *sk)
1105
{
1106
	struct sock *meta_sk = mptcp_meta_sk(sk);
1107
	struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
1108
1109
	/* Only close, if the app did a send-shutdown (passive close), and we
1110
	 * received the data-ack of the data-fin.
1111
	 */
1112
	if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
1113
		mptcp_sub_close(sk, 0);
1114
}
1115
1116
static inline int mptcp_fallback_infinite(struct sock *sk, int flag)
1117
{
1118
	struct tcp_sock *tp = tcp_sk(sk);
1119
1120
	/* If data has been acknowleged on the meta-level, fully_established
1121
	 * will have been set before and thus we will not fall back to infinite
1122
	 * mapping.
1123
	 */
1124
	if (likely(tp->mptcp->fully_established))
1125
		return 0;
1126
1127
	if (!(flag & MPTCP_FLAG_DATA_ACKED))
1128
		return 0;
1129
1130
	/* Don't fallback twice ;) */
1131
	if (tp->mpcb->infinite_mapping_snd)
1132
		return 0;
1133
1134
	pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
1135
	       __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
1136
	       &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
1137
	       __builtin_return_address(0));
1138
	if (!is_master_tp(tp))
1139
		return MPTCP_FLAG_SEND_RESET;
1140
1141
	tp->mpcb->infinite_mapping_snd = 1;
1142
	tp->mpcb->infinite_mapping_rcv = 1;
1143
	tp->mptcp->fully_established = 1;
1144
1145
	return 0;
1146
}
1147
1148
/* Find the first free index in the bitfield */
1149
static inline int __mptcp_find_free_index(u8 bitfield, int j, u8 base)
1150
{
1151
	int i;
1152
	mptcp_for_each_bit_unset(bitfield >> base, i) {
1153
		/* We wrapped at the bitfield - try from 0 on */
1154
		if (i + base >= sizeof(bitfield) * 8) {
1155
			mptcp_for_each_bit_unset(bitfield, i) {
1156
				if (i != j)
1157
					return i;
1158
			}
1159
			goto exit;
1160
		}
1161
		if (i + base != j)
1162
			return i + base;
1163
	}
1164
exit:
1165
	return -1;
1166
}
1167
1168
static inline int mptcp_find_free_index(u8 bitfield)
1169
{
1170
	return __mptcp_find_free_index(bitfield, -1, 0);
1171
}
1172
1173
/* Find the first index whose bit in the bit-field == 0 */
1174
static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
1175
{
1176
	u8 base = mpcb->next_path_index;
1177
	int i;
1178
1179
	/* Start at 1, because 0 is reserved for the meta-sk */
1180
	mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
1181
		if (i + base < 1)
1182
			continue;
1183
		if (i + base >= sizeof(mpcb->path_index_bits) * 8)
1184
			break;
1185
		i += base;
1186
		mpcb->path_index_bits |= (1 << i);
1187
		mpcb->next_path_index = i + 1;
1188
		return i;
1189
	}
1190
	mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
1191
		if (i < 1)
1192
			continue;
1193
		mpcb->path_index_bits |= (1 << i);
1194
		mpcb->next_path_index = i + 1;
1195
		return i;
1196
	}
1197
1198
	return 0;
1199
}
1200
1201
static inline int mptcp_v6_is_v4_mapped(struct sock *sk)
1202
{
1203
	return sk->sk_family == AF_INET6 &&
1204
	       ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
1205
}
1206
#else /* CONFIG_MPTCP */
1207
#define mptcp_debug(fmt, args...)	\
1208
	do {				\
1209
	} while (0)
1210
1211
/* Without MPTCP, we just do one iteration
1212
 * over the only socket available. This assumes that
1213
 * the sk/tp arg is the socket in that case.
1214
 */
1215
#define mptcp_for_each_sk(mpcb, sk)
1216
#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
1217
1218
static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
1219
					    u32 *data_seq,
1220
					    struct mptcp_cb *mpcb)
1221
{
1222
	return 0;
1223
}
1224
static inline int mptcp_is_data_fin(const struct sk_buff *skb)
1225
{
1226
	return 0;
1227
}
1228
static inline int mptcp_is_data_seq(const struct sk_buff *skb)
1229
{
1230
	return 0;
1231
}
1232
static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1233
{
1234
	return NULL;
1235
}
1236
static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1237
{
1238
	return NULL;
1239
}
1240
static inline int is_meta_sk(const struct sock *sk)
1241
{
1242
	return 0;
1243
}
1244
static inline int is_master_tp(const struct tcp_sock *tp)
1245
{
1246
	return 0;
1247
}
1248
static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
1249
static inline void mptcp_cleanup_rbuf(const struct sock *meta_sk, int copied) {}
1250
static inline void mptcp_del_sock(const struct sock *sk) {}
1251
static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
1252
static inline void mptcp_init_buffer_space(const struct sock *sk) {}
1253
static inline void mptcp_update_sndbuf(const struct mptcp_cb *mpcb) {}
1254
static inline void mptcp_skb_entail_init(const struct tcp_sock *tp,
1255
					 const struct sk_buff *skb) {}
1256
static inline struct sk_buff *mptcp_next_segment(const struct sock *sk,
1257
						 const int *reinject)
1258
{
1259
	return NULL;
1260
}
1261
static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
1262
					    const struct sock *sk) {}
1263
static inline void mptcp_retransmit_timer(const struct sock *meta_sk) {}
1264
static inline int mptcp_write_wakeup(struct sock *meta_sk)
1265
{
1266
	return 0;
1267
}
1268
static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
1269
static inline void mptcp_set_rto(const struct sock *sk) {}
1270
static inline void mptcp_send_fin(const struct sock *meta_sk) {}
1271
static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
1272
				       const struct tcp_options_received *opt_rx,
1273
				       const struct mptcp_options_received *mopt,
1274
				       const struct sk_buff *skb) {}
1275
static inline void mptcp_syn_options(struct sock *sk,
1276
				     struct tcp_out_options *opts,
1277
				     unsigned *remaining) {}
1278
static inline void mptcp_synack_options(struct request_sock *req,
1279
					struct tcp_out_options *opts,
1280
					unsigned *remaining) {}
1281
1282
static inline void mptcp_established_options(struct sock *sk,
1283
					     struct sk_buff *skb,
1284
					     struct tcp_out_options *opts,
1285
					     unsigned *size) {}
1286
static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1287
				       struct tcp_out_options *opts,
1288
				       struct sk_buff *skb) {}
1289
static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
1290
static inline int mptcp_doit(struct sock *sk)
1291
{
1292
	return 0;
1293
}
1294
static inline int mptcp_check_req_master(const struct sock *sk,
1295
					 const struct sock *child,
1296
					 struct request_sock *req,
1297
					 struct request_sock **prev,
1298
					 const struct mptcp_options_received *mopt)
1299
{
1300
	return 1;
1301
}
1302
static inline struct sock *mptcp_check_req_child(struct sock *sk,
1303
						 struct sock *child,
1304
						 struct request_sock *req,
1305
						 struct request_sock **prev,
1306
						 struct mptcp_options_received *mopt)
1307
{
1308
	return NULL;
1309
}
1310
static inline u32 __mptcp_select_window(const struct sock *sk)
1311
{
1312
	return 0;
1313
}
1314
static inline void mptcp_select_initial_window(int *__space,
1315
					       __u32 *window_clamp,
1316
					       const struct sock *sk) {}
1317
static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
1318
{
1319
	return 0;
1320
}
1321
static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
1322
{
1323
	return 0;
1324
}
1325
static inline void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) {}
1326
static inline void mptcp_sub_close_passive(struct sock *sk) {}
1327
static inline int mptcp_fallback_infinite(const struct sock *sk, int flag)
1328
{
1329
	return 0;
1330
}
1331
static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
1332
static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
1333
{
1334
	return 0;
1335
}
1336
static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
1337
{
1338
	return 0;
1339
}
1340
static inline int mptcp_sysctl_syn_retries(void)
1341
{
1342
	return 0;
1343
}
1344
static inline void mptcp_send_reset(const struct sock *sk) {}
1345
static inline void mptcp_send_active_reset(struct sock *meta_sk,
1346
					   gfp_t priority) {}
1347
static inline int mptcp_write_xmit(struct sock *sk, unsigned int mss_now,
1348
				   int nonagle, int push_one, gfp_t gfp)
1349
{
1350
	return 0;
1351
}
1352
static inline struct sock *mptcp_sk_clone(const struct sock *sk,
1353
					  int family, int priority)
1354
{
1355
	return NULL;
1356
}
1357
static inline void mptcp_set_keepalive(struct sock *sk, int val) {}
1358
static inline int mptcp_handle_options(struct sock *sk,
1359
				       const struct tcphdr *th,
1360
				       struct sk_buff *skb)
1361
{
1362
	return 0;
1363
}
1364
static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
1365
static inline void  __init mptcp_init(void) {}
1366
static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1367
{
1368
	return 0;
1369
}
1370
static inline int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1371
				 unsigned int mss_now, int reinject)
1372
{
1373
	return 0;
1374
}
1375
static inline int mptso_fragment(struct sock *sk, struct sk_buff *skb,
1376
				 unsigned int len, unsigned int mss_now,
1377
				 gfp_t gfp, int reinject)
1378
{
1379
	return 0;
1380
}
1381
static inline bool mptcp_sk_can_gso(const struct sock *sk)
1382
{
1383
	return false;
1384
}
1385
static inline bool mptcp_can_sg(const struct sock *meta_sk)
1386
{
1387
	return false;
1388
}
1389
static inline unsigned int mptcp_xmit_size_goal(struct sock *meta_sk,
1390
						u32 mss_now, int large_allowed)
1391
{
1392
	return 0;
1393
}
1394
static inline void mptcp_destroy_sock(struct sock *sk) {}
1395
static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
1396
						  struct sock **skptr,
1397
						  struct sk_buff *skb,
1398
						  struct mptcp_options_received *mopt)
1399
{
1400
	return 0;
1401
}
1402
static inline bool mptcp_can_sendpage(struct sock *sk)
1403
{
1404
	return false;
1405
}
1406
static inline int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw)
1407
{
1408
	return 0;
1409
}
1410
static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
1411
static inline void mptcp_update_tw_socks(const struct tcp_sock *tp, int state) {}
1412
static inline void mptcp_tsq_flags(struct sock *sk, int bit) {}
1413
static inline void mptcp_tsq_flags(struct sock *sk) {}
1414
static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
1415
#endif /* CONFIG_MPTCP */
1416
1417
#endif /* _MPTCP_H */
(-)a/include/net/mptcp_pm.h (+133 lines)
Line 0 (new file):
1
/*
2
 *	MPTCP implementation
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#ifndef _MPTCP_PM_H
31
#define _MPTCP_PM_H
32
33
#include <linux/in.h>
34
#include <linux/in6.h>
35
#include <linux/jhash.h>
36
#include <linux/list.h>
37
#include <linux/skbuff.h>
38
#include <linux/spinlock_types.h>
39
#include <linux/types.h>
40
41
#include <net/request_sock.h>
42
#include <net/sock.h>
43
#include <net/tcp.h>
44
45
/* Max number of local or remote addresses we can store.
46
 * When changing, see the bitfield below in mptcp_loc4/6. */
47
#define MPTCP_MAX_ADDR	8
48
49
#define MPTCP_SUBFLOW_RETRY_DELAY	1000
50
51
struct mptcp_loc4 {
52
	u8		id;
53
	u8		low_prio:1;
54
	__be16		port;
55
	struct in_addr	addr;
56
};
57
58
struct mptcp_rem4 {
59
	u8		id;
60
	u8		bitfield;
61
	u8		retry_bitfield;
62
	__be16		port;
63
	struct in_addr	addr;
64
};
65
66
struct mptcp_loc6 {
67
	u8		id;
68
	u8		low_prio:1;
69
	__be16		port;
70
	struct in6_addr	addr;
71
};
72
73
struct mptcp_rem6 {
74
	u8		id;
75
	u8		bitfield;
76
	u8		retry_bitfield;
77
	__be16		port;
78
	struct in6_addr	addr;
79
};
80
81
struct mptcp_cb;
82
#ifdef CONFIG_MPTCP
83
84
#define MPTCP_HASH_SIZE                1024
85
86
/* This second hashtable is needed to retrieve request socks
87
 * created as a result of a join request. While the SYN contains
88
 * the token, the final ack does not, so we need a separate hashtable
89
 * to retrieve the mpcb.
90
 */
91
extern struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
92
extern spinlock_t mptcp_reqsk_hlock;	/* hashtable protection */
93
94
/* Lock, protecting the two hash-tables that hold the token. Namely,
95
 * mptcp_reqsk_tk_htb and tk_hashtable
96
 */
97
extern spinlock_t mptcp_tk_hashlock;	/* hashtable protection */
98
99
void mptcp_create_subflows(struct sock *meta_sk);
100
void mptcp_create_subflow_worker(struct work_struct *work);
101
void mptcp_retry_subflow_worker(struct work_struct *work);
102
struct mp_join *mptcp_find_join(struct sk_buff *skb);
103
u8 mptcp_get_loc_addrid(struct mptcp_cb *mpcb, struct sock *sk);
104
void __mptcp_hash_insert(struct tcp_sock *meta_tp, u32 token);
105
void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
106
void mptcp_hash_remove(struct tcp_sock *meta_tp);
107
struct sock *mptcp_hash_find(struct net *net, u32 token);
108
int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
109
int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt,
110
			struct tcp_options_received *tmp_opt, struct net *net);
111
void mptcp_reqsk_remove_tk(struct request_sock *reqsk);
112
void mptcp_reqsk_new_mptcp(struct request_sock *req,
113
			   const struct tcp_options_received *rx_opt,
114
			   const struct mptcp_options_received *mopt,
115
			   const struct sk_buff *skb);
116
void mptcp_connect_init(struct sock *sk);
117
void mptcp_set_addresses(struct sock *meta_sk);
118
int mptcp_check_req(struct sk_buff *skb, struct net *net);
119
void mptcp_address_worker(struct work_struct *work);
120
int mptcp_pm_addr_event_handler(unsigned long event, void *ptr, int family);
121
int mptcp_pm_init(void);
122
void mptcp_pm_undo(void);
123
124
#else /* CONFIG_MPTCP */
125
static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
126
					 const struct tcp_options_received *rx_opt,
127
					 const struct mptcp_options_received *mopt,
128
					 const struct sk_buff *skb)
129
{}
130
static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
131
#endif /* CONFIG_MPTCP */
132
133
#endif /*_MPTCP_PM_H*/
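The header above only declares the token hash-table API. As a rough illustration (not part of the patch), a caller in the patched tree might combine those declarations as below; the helper names are hypothetical, and the assumption that __mptcp_hash_insert() runs under mptcp_tk_hashlock is inferred from the naming convention rather than stated in this header.

#include <net/mptcp_pm.h>

/* Hypothetical sketch: store a meta-socket under its token, then look the
 * token up again when a later MP_JOIN SYN arrives.  Assumes CONFIG_MPTCP=y.
 */
static void example_token_insert(struct tcp_sock *meta_tp, u32 token)
{
	spin_lock_bh(&mptcp_tk_hashlock);	/* assumed locking rule */
	__mptcp_hash_insert(meta_tp, token);
	spin_unlock_bh(&mptcp_tk_hashlock);
}

static struct sock *example_token_lookup(struct net *net, u32 token)
{
	/* mptcp_hash_find() is declared above; its refcounting rules are
	 * not visible here, so treat the returned pointer as a sketch only.
	 */
	return mptcp_hash_find(net, token);
}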
(-)a/include/net/mptcp_v4.h (+81 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#ifndef MPTCP_V4_H_
31
#define MPTCP_V4_H_
32
33
34
#include <linux/in.h>
35
#include <linux/skbuff.h>
36
#include <net/mptcp.h>
37
#include <net/mptcp_pm.h>
38
#include <net/request_sock.h>
39
#include <net/sock.h>
40
41
extern struct request_sock_ops mptcp_request_sock_ops;
42
extern struct proto mptcp_prot;
43
44
#ifdef CONFIG_MPTCP
45
46
int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
47
int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id);
48
int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr,
49
			  __be16 port, u8 id);
50
void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr);
51
struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
52
				 const __be32 laddr, const struct net *net);
53
int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
54
			   struct mptcp_rem4 *rem);
55
void mptcp_pm_addr4_event_handler(struct in_ifaddr *ifa, unsigned long event,
56
				  struct mptcp_cb *mpcb);
57
int mptcp_pm_v4_init(void);
58
void mptcp_pm_v4_undo(void);
59
void mptcp_v4_send_add_addr(int loc_id, struct mptcp_cb *mpcb);
60
u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
61
		       u32 seq);
62
u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
63
64
#else
65
66
static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
67
				  const struct sk_buff *skb)
68
{
69
	return 0;
70
}
71
72
static inline int mptcp_v4_send_synack(const struct sock *meta_sk,
73
				       const struct request_sock *req,
74
				       const struct request_values *rvp)
75
{
76
	return 0;
77
}
78
79
#endif /* CONFIG_MPTCP */
80
81
#endif /* MPTCP_V4_H_ */
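A minimal sketch, assuming the patched tree, of how a path manager could plumb a newly learned remote IPv4 address into the control block and then open a subflow towards it, using only the declarations above; example_add_v4_peer() and its error handling are hypothetical.

#include <net/mptcp_v4.h>

/* Hypothetical helper: remember a remote address, then add a subflow. */
static int example_add_v4_peer(struct sock *meta_sk, struct mptcp_cb *mpcb,
			       const struct mptcp_loc4 *loc,
			       struct mptcp_rem4 *rem)
{
	int err;

	/* Record the remote address/port under its MPTCP address id. */
	err = mptcp_v4_add_raddress(mpcb, &rem->addr, rem->port, rem->id);
	if (err)
		return err;

	/* Ask the stack to create the additional subflow for this pair. */
	return mptcp_init4_subsockets(meta_sk, loc, rem);
}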
(-)a/include/net/mptcp_v6.h (+87 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#ifndef _MPTCP_V6_H
31
#define _MPTCP_V6_H
32
33
#include <linux/in6.h>
34
#include <net/if_inet6.h>
35
36
#include <net/mptcp.h>
37
#include <net/mptcp_pm.h>
38
39
extern struct request_sock_ops mptcp6_request_sock_ops;
40
extern struct proto mptcpv6_prot;
41
42
struct mptcp6_request_sock {
43
	struct mptcp_request_sock	mptcp6rsk_tcp;
44
	struct inet6_request_sock	mptcp6rsk_inet6;
45
};
46
47
#ifdef CONFIG_MPTCP
48
49
/*
50
 * Used to wait for DAD to finish. If rtr_solicit_delay is set, we use it
51
 * instead
52
 */
53
#define MPTCP_IPV6_DEFAULT_DAD_WAIT (HZ/10)
54
55
int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
56
int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id);
57
int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr,
58
			  __be16 port, u8 id);
59
void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
60
				const struct in6_addr *daddr);
61
struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
62
				 const struct in6_addr *laddr, const struct net *net);
63
int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
64
			   struct mptcp_rem6 *rem);
65
void mptcp_pm_addr6_event_handler(struct inet6_ifaddr *ifa, unsigned long event,
66
				  struct mptcp_cb *mpcb);
67
int mptcp_pm_v6_init(void);
68
void mptcp_pm_v6_undo(void);
69
void mptcp_v6_send_add_addr(int loc_id, struct mptcp_cb *mpcb);
70
struct sock *mptcp_v6v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
71
				      struct request_sock *req,
72
				      struct dst_entry *dst);
73
__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
74
			 __be16 sport, __be16 dport, u32 seq);
75
u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
76
		     __be16 sport, __be16 dport);
77
78
#else /* CONFIG_MPTCP */
79
80
static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
81
{
82
	return 0;
83
}
84
85
#endif /* CONFIG_MPTCP */
86
87
#endif /* _MPTCP_V6_H */
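The IPv6 header mirrors the IPv4 one; the only point worth illustrating separately is that the nonce/key helpers take raw __be32 words instead of struct in6_addr. The wrapper below is a hypothetical sketch of feeding those signatures, nothing more.

#include <linux/in6.h>
#include <net/mptcp_v6.h>

/* Hypothetical wrapper: derive the MP_JOIN nonce for an IPv6 four-tuple.
 * s6_addr32 exposes the address as four __be32 words, which matches the
 * const __be32 * parameters declared above.
 */
static u32 example_v6_nonce(const struct in6_addr *saddr,
			    const struct in6_addr *daddr,
			    __be16 sport, __be16 dport, u32 seq)
{
	return mptcp_v6_get_nonce(saddr->s6_addr32, daddr->s6_addr32,
				  sport, dport, seq);
}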
(-)a/include/net/request_sock.h (-1 / +2 lines)
Lines 163-169 Link Here
163
};
163
};
164
164
165
extern int reqsk_queue_alloc(struct request_sock_queue *queue,
165
extern int reqsk_queue_alloc(struct request_sock_queue *queue,
166
			     unsigned int nr_table_entries);
166
			     unsigned int nr_table_entries,
167
			     gfp_t flags);
167
168
168
extern void __reqsk_queue_destroy(struct request_sock_queue *queue);
169
extern void __reqsk_queue_destroy(struct request_sock_queue *queue);
169
extern void reqsk_queue_destroy(struct request_sock_queue *queue);
170
extern void reqsk_queue_destroy(struct request_sock_queue *queue);
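The extra gfp_t argument lets MPTCP allocate a listen queue from contexts where GFP_KERNEL is not appropriate. The fragment below only restates the new calling convention; existing callers (see the inet_connection_sock.c hunk further down) pass GFP_KERNEL and keep their old behaviour. example_listen_alloc() is a hypothetical wrapper, not code from the patch.

#include <linux/gfp.h>
#include <net/request_sock.h>

/* Old call: reqsk_queue_alloc(queue, nr_table_entries);
 * New call: the allocation flags are explicit.
 */
static int example_listen_alloc(struct request_sock_queue *queue,
				unsigned int nr_table_entries)
{
	return reqsk_queue_alloc(queue, nr_table_entries, GFP_KERNEL);
}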
(-)a/include/net/sock.h (+10 lines)
Lines 866-871 Link Here
866
866
867
extern int sk_wait_data(struct sock *sk, long *timeo);
867
extern int sk_wait_data(struct sock *sk, long *timeo);
868
868
869
/* START - needed for MPTCP */
870
extern void sock_def_error_report(struct sock *sk);
871
extern struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
872
				  int family);
873
extern void sock_lock_init(struct sock *sk);
874
875
extern struct lock_class_key af_callback_keys[AF_MAX];
876
extern char *const af_family_clock_key_strings[AF_MAX+1];
877
/* END - needed for MPTCP */
878
869
struct request_sock_ops;
879
struct request_sock_ops;
870
struct timewait_sock_ops;
880
struct timewait_sock_ops;
871
struct inet_hashinfo;
881
struct inet_hashinfo;
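This hunk only drops static/inline qualifiers so the MPTCP code can reuse core socket helpers for its meta-socket. As a hypothetical illustration of why the exports exist (not code from the patch), a meta-socket setup path could look roughly like this:

#include <net/sock.h>
#include <net/tcp.h>

/* Hypothetical sketch: allocate a bare TCP socket and give it a properly
 * classed lock, using the two helpers this hunk exports for MPTCP.
 */
static struct sock *example_meta_alloc(gfp_t priority)
{
	struct sock *sk = sk_prot_alloc(&tcp_prot, priority, AF_INET);

	if (sk)
		sock_lock_init(sk);
	return sk;
}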
(-)a/include/net/tcp.h (-4 / +159 lines)
Lines 176-181 Link Here
176
#define TCPOPT_SACK             5       /* SACK Block */
176
#define TCPOPT_SACK             5       /* SACK Block */
177
#define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
177
#define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
178
#define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
178
#define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
179
#define TCPOPT_MPTCP		30
179
#define TCPOPT_EXP		254	/* Experimental */
180
#define TCPOPT_EXP		254	/* Experimental */
180
/* Magic number to be after the option value for sharing TCP
181
/* Magic number to be after the option value for sharing TCP
181
 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
182
 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
Lines 238-243 Link Here
238
 */
239
 */
239
#define	TFO_SERVER_ALWAYS	0x1000
240
#define	TFO_SERVER_ALWAYS	0x1000
240
241
242
/* Flags from tcp_input.c for tcp_ack */
243
#define FLAG_DATA               0x01 /* Incoming frame contained data.          */
244
#define FLAG_WIN_UPDATE         0x02 /* Incoming ACK was a window update.       */
245
#define FLAG_DATA_ACKED         0x04 /* This ACK acknowledged new data.         */
246
#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted.  */
247
#define FLAG_SYN_ACKED          0x10 /* This ACK acknowledged SYN.              */
248
#define FLAG_DATA_SACKED        0x20 /* New SACK.                               */
249
#define FLAG_ECE                0x40 /* ECE in this ACK                         */
250
#define FLAG_SLOWPATH           0x100 /* Do not skip RFC checks for window update.*/
251
#define FLAG_ORIG_SACK_ACKED    0x200 /* Never retransmitted data are (s)acked  */
252
#define FLAG_SND_UNA_ADVANCED   0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
253
#define FLAG_DSACKING_ACK       0x800 /* SACK blocks contained D-SACK info */
254
#define FLAG_SACK_RENEGING      0x2000 /* snd_una advanced to a sacked seq */
255
#define FLAG_UPDATE_TS_RECENT   0x4000 /* tcp_replace_ts_recent() */
256
#define MPTCP_FLAG_SEND_RESET	0x8000
257
#define MPTCP_FLAG_DATA_ACKED	0x10000
258
259
#define FLAG_ACKED              (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
260
#define FLAG_NOT_DUP            (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
261
#define FLAG_CA_ALERT           (FLAG_DATA_SACKED|FLAG_ECE)
262
#define FLAG_FORWARD_PROGRESS   (FLAG_ACKED|FLAG_DATA_SACKED)
263
241
extern struct inet_timewait_death_row tcp_death_row;
264
extern struct inet_timewait_death_row tcp_death_row;
242
265
243
/* sysctl variables for tcp */
266
/* sysctl variables for tcp */
Lines 350-355 Link Here
350
#define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
373
#define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
351
#define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
374
#define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
352
375
376
/**** START - Exports needed for MPTCP ****/
377
extern const struct inet_connection_sock_af_ops ipv4_specific;
378
extern const struct inet_connection_sock_af_ops ipv6_specific;
379
extern const struct inet_connection_sock_af_ops ipv6_mapped;
380
381
struct mptcp_options_received;
382
383
extern int tcp_close_state(struct sock *sk);
384
extern void tcp_push(struct sock *sk, int flags, int mss_now,
385
			    int nonagle);
386
extern int tcp_xmit_probe_skb(struct sock *sk, int urgent);
387
extern void tcp_cwnd_validate(struct sock *sk);
388
extern void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
389
extern int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
390
			    gfp_t gfp_mask);
391
extern unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
392
					unsigned int mss_now, unsigned int cwnd);
393
extern bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb);
394
extern bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
395
			   unsigned int cur_mss, int nonagle);
396
extern bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
397
			     unsigned int cur_mss);
398
extern unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
399
extern int tcp_mtu_probe(struct sock *sk);
400
extern int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
401
			     unsigned int mss_now);
402
extern void __pskb_trim_head(struct sk_buff *skb, int len);
403
extern void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
404
extern void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
405
extern void tcp_reset(struct sock *sk);
406
extern bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
407
				  const u32 ack_seq, const u32 nwin);
408
extern bool tcp_urg_mode(const struct tcp_sock *tp);
409
extern void tcp_ack_probe(struct sock *sk);
410
extern void tcp_rearm_rto(struct sock *sk);
411
extern int tcp_write_timeout(struct sock *sk);
412
extern bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
413
				  unsigned int timeout, bool syn_set);
414
extern void tcp_write_err(struct sock *sk);
415
extern void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
416
extern void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
417
				 unsigned int mss_now);
418
419
extern int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
420
extern void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
421
				  struct request_sock *req);
422
extern __u32 tcp_v4_init_sequence(const struct sk_buff *skb);
423
extern int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
424
			      struct request_sock *req, u16 queue_mapping,
425
			      bool nocache);
426
extern void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
427
extern struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
428
extern struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
429
extern void tcp_v4_reqsk_destructor(struct request_sock *req);
430
431
extern int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
432
extern void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
433
				  struct request_sock *req);
434
extern __u32 tcp_v6_init_sequence(const struct sk_buff *skb);
435
extern int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
436
			      struct flowi6 *fl6, struct request_sock *req,
437
			      u16 queue_mapping);
438
extern void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
439
extern int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
440
extern int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
441
extern void tcp_v6_destroy_sock(struct sock *sk);
442
void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
443
extern void tcp_v6_hash(struct sock *sk);
444
extern struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb);
445
extern struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
446
					 struct request_sock *req,
447
					 struct dst_entry *dst);
448
extern void tcp_v6_reqsk_destructor(struct request_sock *req);
449
450
extern void sock_valbool_flag(struct sock *sk, int bit, int valbool);
451
extern unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
452
				       int large_allowed);
453
extern u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
454
455
extern void skb_clone_fraglist(struct sk_buff *skb);
456
extern void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
457
458
extern void inet_twsk_free(struct inet_timewait_sock *tw);
459
/* These states need RST on ABORT according to RFC793 */
460
static inline bool tcp_need_reset(int state)
461
{
462
	return (1 << state) &
463
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
464
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
465
}
466
467
extern bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
468
				   int hlen);
469
extern int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
470
				      bool *fragstolen);
471
extern bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
472
			     struct sk_buff *from, bool *fragstolen);
473
/**** END - Exports needed for MPTCP ****/
474
353
extern void tcp_init_mem(struct net *net);
475
extern void tcp_init_mem(struct net *net);
354
476
355
extern void tcp_tasklet_init(void);
477
extern void tcp_tasklet_init(void);
Lines 448-453 Link Here
448
		       size_t len, int nonblock, int flags, int *addr_len);
570
		       size_t len, int nonblock, int flags, int *addr_len);
449
extern void tcp_parse_options(const struct sk_buff *skb,
571
extern void tcp_parse_options(const struct sk_buff *skb,
450
			      struct tcp_options_received *opt_rx,
572
			      struct tcp_options_received *opt_rx,
573
			      struct mptcp_options_received *mopt,
451
			      int estab, struct tcp_fastopen_cookie *foc);
574
			      int estab, struct tcp_fastopen_cookie *foc);
452
extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
575
extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
453
576
Lines 681-686 Link Here
681
#define TCPHDR_ECE 0x40
804
#define TCPHDR_ECE 0x40
682
#define TCPHDR_CWR 0x80
805
#define TCPHDR_CWR 0x80
683
806
807
/* MPTCP flags */
808
#define MPTCPHDR_ACK		0x01
809
#define MPTCPHDR_SEQ		0x02
810
#define MPTCPHDR_FIN		0x04
811
#define MPTCPHDR_INF		0x08
812
#define MPTCPHDR_SEQ64_SET	0x10 /* Did we receive a 64-bit seq number */
813
#define MPTCPHDR_SEQ64_OFO	0x20 /* Is it not in our circular array? */
814
#define MPTCPHDR_SEQ64_INDEX	0x40 /* Index of seq in mpcb->snd_high_order */
815
#define MPTCPHDR_DSS_CSUM	0x80
816
817
/* It is impossible for all 8 bits of mptcp_flags to be set to 1 with the flags above.
818
 * Thus, defining MPTCPHDR_JOIN as 0xFF is safe.
819
 */
820
#define MPTCPHDR_JOIN		0xFF
821
684
/* This is what the send packet queuing engine uses to pass
822
/* This is what the send packet queuing engine uses to pass
685
 * TCP per-packet control information to the transmission code.
823
 * TCP per-packet control information to the transmission code.
686
 * We also store the host-order sequence numbers in here too.
824
 * We also store the host-order sequence numbers in here too.
Lines 689-702 Link Here
689
 */
827
 */
690
struct tcp_skb_cb {
828
struct tcp_skb_cb {
691
	union {
829
	union {
692
		struct inet_skb_parm	h4;
830
		union {
831
			struct inet_skb_parm	h4;
693
#if IS_ENABLED(CONFIG_IPV6)
832
#if IS_ENABLED(CONFIG_IPV6)
694
		struct inet6_skb_parm	h6;
833
			struct inet6_skb_parm	h6;
834
#endif
835
		} header;	/* For incoming frames		*/
836
#ifdef CONFIG_MPTCP
837
		__u32 path_mask; /* path indices that tried to send this skb */
695
#endif
838
#endif
696
	} header;	/* For incoming frames		*/
839
	};
697
	__u32		seq;		/* Starting sequence number	*/
840
	__u32		seq;		/* Starting sequence number	*/
698
	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
841
	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
699
	__u32		when;		/* used to compute rtt's	*/
842
	__u32		when;		/* used to compute rtt's	*/
843
#ifdef CONFIG_MPTCP
844
	__u8		mptcp_flags;	/* flags for the MPTCP layer    */
845
	__u8		dss_off;	/* Number of 4-byte words until
846
					 * seq-number */
847
#endif
700
	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
848
	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
701
849
702
	__u8		sacked;		/* State flags for SACK/FACK.	*/
850
	__u8		sacked;		/* State flags for SACK/FACK.	*/
Lines 1050-1056 Link Here
1050
extern void tcp_select_initial_window(int __space, __u32 mss,
1198
extern void tcp_select_initial_window(int __space, __u32 mss,
1051
				      __u32 *rcv_wnd, __u32 *window_clamp,
1199
				      __u32 *rcv_wnd, __u32 *window_clamp,
1052
				      int wscale_ok, __u8 *rcv_wscale,
1200
				      int wscale_ok, __u8 *rcv_wscale,
1053
				      __u32 init_rcv_wnd);
1201
				      __u32 init_rcv_wnd, const struct sock *sk);
1054
1202
1055
static inline int tcp_win_from_space(int space)
1203
static inline int tcp_win_from_space(int space)
1056
{
1204
{
Lines 1062-1073 Link Here
1062
/* Note: caller must be prepared to deal with negative returns */ 
1210
/* Note: caller must be prepared to deal with negative returns */ 
1063
static inline int tcp_space(const struct sock *sk)
1211
static inline int tcp_space(const struct sock *sk)
1064
{
1212
{
1213
	if (tcp_sk(sk)->mpc)
1214
		sk = tcp_sk(sk)->meta_sk;
1215
1065
	return tcp_win_from_space(sk->sk_rcvbuf -
1216
	return tcp_win_from_space(sk->sk_rcvbuf -
1066
				  atomic_read(&sk->sk_rmem_alloc));
1217
				  atomic_read(&sk->sk_rmem_alloc));
1067
} 
1218
} 
1068
1219
1069
static inline int tcp_full_space(const struct sock *sk)
1220
static inline int tcp_full_space(const struct sock *sk)
1070
{
1221
{
1222
	if (tcp_sk(sk)->mpc)
1223
		sk = tcp_sk(sk)->meta_sk;
1224
1071
	return tcp_win_from_space(sk->sk_rcvbuf); 
1225
	return tcp_win_from_space(sk->sk_rcvbuf); 
1072
}
1226
}
1073
1227
Lines 1082-1087 Link Here
1082
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
1236
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
1083
	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
1237
	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
1084
	tcp_rsk(req)->snt_synack = 0;
1238
	tcp_rsk(req)->snt_synack = 0;
1239
	tcp_rsk(req)->saw_mpc = 0;
1085
	req->mss = rx_opt->mss_clamp;
1240
	req->mss = rx_opt->mss_clamp;
1086
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
1241
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
1087
	ireq->tstamp_ok = rx_opt->tstamp_ok;
1242
	ireq->tstamp_ok = rx_opt->tstamp_ok;
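The most intrusive change in this header is the extra struct mptcp_options_received * argument to tcp_parse_options(). Plain-TCP callers simply pass NULL there, as the syncookies.c hunk later in this patch does; the snippet below is a sketch of that convention (example_parse() is hypothetical).

#include <linux/string.h>
#include <net/tcp.h>

/* Sketch of the updated calling convention for plain TCP. */
static void example_parse(const struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;

	memset(&tmp_opt, 0, sizeof(tmp_opt));
	/* NULL mopt, estab = 0, no fastopen cookie: pre-patch behaviour. */
	tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
}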
(-)a/include/uapi/linux/if.h (+3 lines)
Lines 53-58 Link Here
53
53
54
#define IFF_ECHO	0x40000		/* echo sent packets		*/
54
#define IFF_ECHO	0x40000		/* echo sent packets		*/
55
55
56
#define IFF_NOMULTIPATH	0x80000		/* Disable for MPTCP 		*/
57
#define IFF_MPBACKUP	0x100000	/* Use as backup path for MPTCP */
58
56
#define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
59
#define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
57
		IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
60
		IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
58
61
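IFF_NOMULTIPATH and IFF_MPBACKUP are ordinary dev->flags bits (the net/core/dev.c hunk below makes them settable), so in-kernel consumers test them the usual way. A hypothetical check for a path manager walking net devices, assuming the patched tree:

#include <linux/netdevice.h>

/* Hypothetical predicate for a path manager collecting local addresses. */
static bool example_dev_usable_for_mptcp(const struct net_device *dev)
{
	/* Skip interfaces the administrator excluded from MPTCP. */
	if (dev->flags & IFF_NOMULTIPATH)
		return false;

	/* IFF_MPBACKUP devices remain usable, but only as backup paths. */
	return true;
}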
(-)a/net/core/dev.c (-1 / +1 lines)
Lines 4801-4807 Link Here
4801
4801
4802
	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4802
	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4803
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4803
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4804
			       IFF_AUTOMEDIA)) |
4804
			       IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
4805
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4805
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4806
				    IFF_ALLMULTI));
4806
				    IFF_ALLMULTI));
4807
4807
(-)a/net/core/request_sock.c (-3 / +6 lines)
Lines 38-44 Link Here
38
EXPORT_SYMBOL(sysctl_max_syn_backlog);
38
EXPORT_SYMBOL(sysctl_max_syn_backlog);
39
39
40
int reqsk_queue_alloc(struct request_sock_queue *queue,
40
int reqsk_queue_alloc(struct request_sock_queue *queue,
41
		      unsigned int nr_table_entries)
41
		      unsigned int nr_table_entries,
42
		      gfp_t flags)
42
{
43
{
43
	size_t lopt_size = sizeof(struct listen_sock);
44
	size_t lopt_size = sizeof(struct listen_sock);
44
	struct listen_sock *lopt;
45
	struct listen_sock *lopt;
Lines 48-56 Link Here
48
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
49
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
49
	lopt_size += nr_table_entries * sizeof(struct request_sock *);
50
	lopt_size += nr_table_entries * sizeof(struct request_sock *);
50
	if (lopt_size > PAGE_SIZE)
51
	if (lopt_size > PAGE_SIZE)
51
		lopt = vzalloc(lopt_size);
52
		lopt = __vmalloc(lopt_size,
53
			flags | __GFP_HIGHMEM | __GFP_ZERO,
54
			PAGE_KERNEL);
52
	else
55
	else
53
		lopt = kzalloc(lopt_size, GFP_KERNEL);
56
		lopt = kzalloc(lopt_size, flags);
54
	if (lopt == NULL)
57
	if (lopt == NULL)
55
		return -ENOMEM;
58
		return -ENOMEM;
56
59
(-)a/net/core/skbuff.c (-2 / +2 lines)
Lines 487-493 Link Here
487
	skb_drop_list(&skb_shinfo(skb)->frag_list);
487
	skb_drop_list(&skb_shinfo(skb)->frag_list);
488
}
488
}
489
489
490
static void skb_clone_fraglist(struct sk_buff *skb)
490
void skb_clone_fraglist(struct sk_buff *skb)
491
{
491
{
492
	struct sk_buff *list;
492
	struct sk_buff *list;
493
493
Lines 913-919 Link Here
913
	skb->inner_mac_header += off;
913
	skb->inner_mac_header += off;
914
}
914
}
915
915
916
static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
916
void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
917
{
917
{
918
	__copy_skb_header(new, old);
918
	__copy_skb_header(new, old);
919
919
(-)a/net/core/sock.c (-6 / +6 lines)
Lines 230-236 Link Here
230
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
230
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
231
  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
231
  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
232
};
232
};
233
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
233
char *const af_family_clock_key_strings[AF_MAX+1] = {
234
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
234
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
235
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
235
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
236
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
236
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
Lines 251-257 Link Here
251
 * sk_callback_lock locking rules are per-address-family,
251
 * sk_callback_lock locking rules are per-address-family,
252
 * so split the lock classes by using a per-AF key:
252
 * so split the lock classes by using a per-AF key:
253
 */
253
 */
254
static struct lock_class_key af_callback_keys[AF_MAX];
254
struct lock_class_key af_callback_keys[AF_MAX];
255
255
256
/* Take into consideration the size of the struct sk_buff overhead in the
256
/* Take into consideration the size of the struct sk_buff overhead in the
257
 * determination of these values, since that is non-constant across
257
 * determination of these values, since that is non-constant across
Lines 607-613 Link Here
607
	return ret;
607
	return ret;
608
}
608
}
609
609
610
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
610
void sock_valbool_flag(struct sock *sk, int bit, int valbool)
611
{
611
{
612
	if (valbool)
612
	if (valbool)
613
		sock_set_flag(sk, bit);
613
		sock_set_flag(sk, bit);
Lines 1195-1201 Link Here
1195
 *
1195
 *
1196
 * (We also register the sk_lock with the lock validator.)
1196
 * (We also register the sk_lock with the lock validator.)
1197
 */
1197
 */
1198
static inline void sock_lock_init(struct sock *sk)
1198
void sock_lock_init(struct sock *sk)
1199
{
1199
{
1200
	sock_lock_init_class_and_name(sk,
1200
	sock_lock_init_class_and_name(sk,
1201
			af_family_slock_key_strings[sk->sk_family],
1201
			af_family_slock_key_strings[sk->sk_family],
Lines 1243-1249 Link Here
1243
}
1243
}
1244
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1244
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1245
1245
1246
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1246
struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1247
		int family)
1247
		int family)
1248
{
1248
{
1249
	struct sock *sk;
1249
	struct sock *sk;
Lines 2163-2169 Link Here
2163
	rcu_read_unlock();
2163
	rcu_read_unlock();
2164
}
2164
}
2165
2165
2166
static void sock_def_error_report(struct sock *sk)
2166
void sock_def_error_report(struct sock *sk)
2167
{
2167
{
2168
	struct socket_wq *wq;
2168
	struct socket_wq *wq;
2169
2169
(-)a/net/ipv4/af_inet.c (-2 / +22 lines)
Lines 104-109 Link Here
104
#include <net/ip_fib.h>
104
#include <net/ip_fib.h>
105
#include <net/inet_connection_sock.h>
105
#include <net/inet_connection_sock.h>
106
#include <net/tcp.h>
106
#include <net/tcp.h>
107
#include <net/mptcp.h>
107
#include <net/udp.h>
108
#include <net/udp.h>
108
#include <net/udplite.h>
109
#include <net/udplite.h>
109
#include <net/ping.h>
110
#include <net/ping.h>
Lines 272-279 Link Here
272
 *	Create an inet socket.
273
 *	Create an inet socket.
273
 */
274
 */
274
275
275
static int inet_create(struct net *net, struct socket *sock, int protocol,
276
int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
276
		       int kern)
277
{
277
{
278
	struct sock *sk;
278
	struct sock *sk;
279
	struct inet_protosw *answer;
279
	struct inet_protosw *answer;
Lines 709-714 Link Here
709
	lock_sock(sk2);
709
	lock_sock(sk2);
710
710
711
	sock_rps_record_flow(sk2);
711
	sock_rps_record_flow(sk2);
712
713
	if (sk2->sk_protocol == IPPROTO_TCP && tcp_sk(sk2)->mpc) {
714
		struct sock *sk_it;
715
716
		mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
717
				sock_rps_record_flow(sk_it);
718
719
		if (tcp_sk(sk2)->mpcb->master_sk) {
720
			sk_it = tcp_sk(sk2)->mpcb->master_sk;
721
722
			write_lock_bh(&sk_it->sk_callback_lock);
723
			sk_it->sk_wq = newsock->wq;
724
			sk_it->sk_socket = newsock;
725
			write_unlock_bh(&sk_it->sk_callback_lock);
726
		}
727
	}
728
712
	WARN_ON(!((1 << sk2->sk_state) &
729
	WARN_ON(!((1 << sk2->sk_state) &
713
		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
730
		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
714
		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
731
		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
Lines 1753-1758 Link Here
1753
1770
1754
	ip_init();
1771
	ip_init();
1755
1772
1773
	/* We must initialize MPTCP before TCP. */
1774
	mptcp_init();
1775
1756
	tcp_v4_init();
1776
	tcp_v4_init();
1757
1777
1758
	/* Setup TCP slab cache for open requests. */
1778
	/* Setup TCP slab cache for open requests. */
(-)a/net/ipv4/inet_connection_sock.c (-4 / +15 lines)
Lines 23-28 Link Here
23
#include <net/route.h>
23
#include <net/route.h>
24
#include <net/tcp_states.h>
24
#include <net/tcp_states.h>
25
#include <net/xfrm.h>
25
#include <net/xfrm.h>
26
#include <net/mptcp.h>
26
27
27
#ifdef INET_CSK_DEBUG
28
#ifdef INET_CSK_DEBUG
28
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
29
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
Lines 477-484 Link Here
477
}
478
}
478
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
479
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
479
480
480
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
481
u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
481
				 const u32 rnd, const u32 synq_hsize)
482
		   const u32 synq_hsize)
482
{
483
{
483
	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
484
	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
484
}
485
}
Lines 675-681 Link Here
675
				 const struct request_sock *req,
676
				 const struct request_sock *req,
676
				 const gfp_t priority)
677
				 const gfp_t priority)
677
{
678
{
678
	struct sock *newsk = sk_clone_lock(sk, priority);
679
	struct sock *newsk;
680
681
	if (sk->sk_protocol == IPPROTO_TCP && tcp_sk(sk)->mpc)
682
		newsk = mptcp_sk_clone(sk, req->rsk_ops->family, priority);
683
	else
684
		newsk = sk_clone_lock(sk, priority);
679
685
680
	if (newsk != NULL) {
686
	if (newsk != NULL) {
681
		struct inet_connection_sock *newicsk = inet_csk(newsk);
687
		struct inet_connection_sock *newicsk = inet_csk(newsk);
Lines 752-758 Link Here
752
{
758
{
753
	struct inet_sock *inet = inet_sk(sk);
759
	struct inet_sock *inet = inet_sk(sk);
754
	struct inet_connection_sock *icsk = inet_csk(sk);
760
	struct inet_connection_sock *icsk = inet_csk(sk);
755
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
761
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
762
			GFP_KERNEL);
756
763
757
	if (rc != 0)
764
	if (rc != 0)
758
		return rc;
765
		return rc;
Lines 813-818 Link Here
813
820
814
		acc_req = req->dl_next;
821
		acc_req = req->dl_next;
815
822
823
		if (is_meta_sk(child))
824
			mutex_lock(&tcp_sk(child)->mpcb->mutex);
816
		local_bh_disable();
825
		local_bh_disable();
817
		bh_lock_sock(child);
826
		bh_lock_sock(child);
818
		WARN_ON(sock_owned_by_user(child));
827
		WARN_ON(sock_owned_by_user(child));
Lines 841-846 Link Here
841
850
842
		bh_unlock_sock(child);
851
		bh_unlock_sock(child);
843
		local_bh_enable();
852
		local_bh_enable();
853
		if (is_meta_sk(child))
854
			mutex_unlock(&tcp_sk(child)->mpcb->mutex);
844
		sock_put(child);
855
		sock_put(child);
845
856
846
		sk_acceptq_removed(sk);
857
		sk_acceptq_removed(sk);
(-)a/net/ipv4/inet_timewait_sock.c (-1 / +1 lines)
Lines 99-105 Link Here
99
	}
99
	}
100
}
100
}
101
101
102
static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
102
void inet_twsk_free(struct inet_timewait_sock *tw)
103
{
103
{
104
	struct module *owner = tw->tw_prot->owner;
104
	struct module *owner = tw->tw_prot->owner;
105
	twsk_destructor((struct sock *)tw);
105
	twsk_destructor((struct sock *)tw);
(-)a/net/ipv4/Kconfig (+23 lines)
Lines 572-577 Link Here
572
	For further details see:
572
	For further details see:
573
	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
573
	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
574
574
575
config TCP_CONG_COUPLED
576
	tristate "MPTCP COUPLED CONGESTION CONTROL"
577
	depends on MPTCP
578
	default n
579
	---help---
580
	MultiPath TCP Coupled Congestion Control
581
	To enable it, just put 'coupled' in tcp_congestion_control
582
583
config TCP_CONG_OLIA
584
	tristate "MPTCP Opportunistic Linked Increase"
585
	depends on MPTCP
586
	default n
587
	---help---
588
	MultiPath TCP Opportunistic Linked Increase Congestion Control
589
	To enable it, just put 'olia' in tcp_congestion_control
590
575
choice
591
choice
576
	prompt "Default TCP congestion control"
592
	prompt "Default TCP congestion control"
577
	default DEFAULT_CUBIC
593
	default DEFAULT_CUBIC
Lines 600-605 Link Here
600
	config DEFAULT_WESTWOOD
616
	config DEFAULT_WESTWOOD
601
		bool "Westwood" if TCP_CONG_WESTWOOD=y
617
		bool "Westwood" if TCP_CONG_WESTWOOD=y
602
618
619
	config DEFAULT_COUPLED
620
		bool "Coupled" if TCP_CONG_COUPLED=y
621
622
	config DEFAULT_OLIA
623
		bool "Olia" if TCP_CONG_OLIA=y
624
603
	config DEFAULT_RENO
625
	config DEFAULT_RENO
604
		bool "Reno"
626
		bool "Reno"
605
627
Lines 621-626 Link Here
621
	default "vegas" if DEFAULT_VEGAS
643
	default "vegas" if DEFAULT_VEGAS
622
	default "westwood" if DEFAULT_WESTWOOD
644
	default "westwood" if DEFAULT_WESTWOOD
623
	default "veno" if DEFAULT_VENO
645
	default "veno" if DEFAULT_VENO
646
	default "coupled" if DEFAULT_COUPLED
624
	default "reno" if DEFAULT_RENO
647
	default "reno" if DEFAULT_RENO
625
	default "cubic"
648
	default "cubic"
626
649
(-)a/net/ipv4/syncookies.c (-2 / +2 lines)
Lines 293-299 Link Here
293
293
294
	/* check for timestamp cookie support */
294
	/* check for timestamp cookie support */
295
	memset(&tcp_opt, 0, sizeof(tcp_opt));
295
	memset(&tcp_opt, 0, sizeof(tcp_opt));
296
	tcp_parse_options(skb, &tcp_opt, 0, NULL);
296
	tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
297
297
298
	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
298
	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
299
		goto out;
299
		goto out;
Lines 366-372 Link Here
366
	tcp_select_initial_window(tcp_full_space(sk), req->mss,
366
	tcp_select_initial_window(tcp_full_space(sk), req->mss,
367
				  &req->rcv_wnd, &req->window_clamp,
367
				  &req->rcv_wnd, &req->window_clamp,
368
				  ireq->wscale_ok, &rcv_wscale,
368
				  ireq->wscale_ok, &rcv_wscale,
369
				  dst_metric(&rt->dst, RTAX_INITRWND));
369
				  dst_metric(&rt->dst, RTAX_INITRWND), sk);
370
370
371
	ireq->rcv_wscale  = rcv_wscale;
371
	ireq->rcv_wscale  = rcv_wscale;
372
372
(-)a/net/ipv4/tcp.c (-22 / +147 lines)
Lines 271-276 Link Here
271
271
272
#include <net/icmp.h>
272
#include <net/icmp.h>
273
#include <net/inet_common.h>
273
#include <net/inet_common.h>
274
#include <net/mptcp.h>
274
#include <net/tcp.h>
275
#include <net/tcp.h>
275
#include <net/xfrm.h>
276
#include <net/xfrm.h>
276
#include <net/ip.h>
277
#include <net/ip.h>
Lines 605-610 Link Here
605
	tcb->seq     = tcb->end_seq = tp->write_seq;
606
	tcb->seq     = tcb->end_seq = tp->write_seq;
606
	tcb->tcp_flags = TCPHDR_ACK;
607
	tcb->tcp_flags = TCPHDR_ACK;
607
	tcb->sacked  = 0;
608
	tcb->sacked  = 0;
609
	mptcp_skb_entail_init(tp, skb);
608
	skb_header_release(skb);
610
	skb_header_release(skb);
609
	tcp_add_write_queue_tail(sk, skb);
611
	tcp_add_write_queue_tail(sk, skb);
610
	sk->sk_wmem_queued += skb->truesize;
612
	sk->sk_wmem_queued += skb->truesize;
Lines 619-625 Link Here
619
		tp->snd_up = tp->write_seq;
621
		tp->snd_up = tp->write_seq;
620
}
622
}
621
623
622
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
624
void tcp_push(struct sock *sk, int flags, int mss_now,
623
			    int nonagle)
625
			    int nonagle)
624
{
626
{
625
	if (tcp_send_head(sk)) {
627
	if (tcp_send_head(sk)) {
Lines 685-690 Link Here
685
	int ret;
687
	int ret;
686
688
687
	sock_rps_record_flow(sk);
689
	sock_rps_record_flow(sk);
690
691
#ifdef CONFIG_MPTCP
692
	if (tcp_sk(sk)->mpc) {
693
		struct sock *sk_it;
694
		mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
695
				sock_rps_record_flow(sk_it);
696
	}
697
#endif
688
	/*
698
	/*
689
	 * We can't seek on a socket input
699
	 * We can't seek on a socket input
690
	 */
700
	 */
Lines 780-787 Link Here
780
	return NULL;
790
	return NULL;
781
}
791
}
782
792
783
static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
793
unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
784
				       int large_allowed)
785
{
794
{
786
	struct tcp_sock *tp = tcp_sk(sk);
795
	struct tcp_sock *tp = tcp_sk(sk);
787
	u32 xmit_size_goal, old_size_goal;
796
	u32 xmit_size_goal, old_size_goal;
Lines 821-828 Link Here
821
{
830
{
822
	int mss_now;
831
	int mss_now;
823
832
824
	mss_now = tcp_current_mss(sk);
833
	if (tcp_sk(sk)->mpc) {
825
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
834
		mss_now = mptcp_current_mss(sk);
835
		*size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
836
	} else {
837
		mss_now = tcp_current_mss(sk);
838
		*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
839
	}
826
840
827
	return mss_now;
841
	return mss_now;
828
}
842
}
Lines 846-851 Link Here
846
			goto out_err;
860
			goto out_err;
847
	}
861
	}
848
862
863
	if (tp->mpc) {
864
		struct sock *sk_it;
865
866
		/* We must check this with socket-lock hold because we iterate
867
		 * over the subflows.
868
		 */
869
		if (!mptcp_can_sendpage(sk)) {
870
			ssize_t ret;
871
872
			release_sock(sk);
873
			ret = sock_no_sendpage(sk->sk_socket, page, offset,
874
					       size, flags);
875
			lock_sock(sk);
876
			return ret;
877
		}
878
879
		mptcp_for_each_sk(tp->mpcb, sk_it)
880
			sock_rps_record_flow(sk_it);
881
	}
882
849
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
883
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
850
884
851
	mss_now = tcp_send_mss(sk, &size_goal, flags);
885
	mss_now = tcp_send_mss(sk, &size_goal, flags);
Lines 949-956 Link Here
949
{
983
{
950
	ssize_t res;
984
	ssize_t res;
951
985
952
	if (!(sk->sk_route_caps & NETIF_F_SG) ||
986
	/* If MPTCP is enabled, we check it later after establishment */
953
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
987
	if (!tcp_sk(sk)->mpc && (!(sk->sk_route_caps & NETIF_F_SG) ||
988
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
954
		return sock_no_sendpage(sk->sk_socket, page, offset, size,
989
		return sock_no_sendpage(sk->sk_socket, page, offset, size,
955
					flags);
990
					flags);
956
991
Lines 966-971 Link Here
966
	const struct tcp_sock *tp = tcp_sk(sk);
1001
	const struct tcp_sock *tp = tcp_sk(sk);
967
	int tmp = tp->mss_cache;
1002
	int tmp = tp->mss_cache;
968
1003
1004
	if (tp->mpc)
1005
		return mptcp_select_size(sk, sg);
1006
969
	if (sg) {
1007
	if (sg) {
970
		if (sk_can_gso(sk)) {
1008
		if (sk_can_gso(sk)) {
971
			/* Small frames wont use a full page:
1009
			/* Small frames wont use a full page:
Lines 1051-1056 Link Here
1051
			goto do_error;
1089
			goto do_error;
1052
	}
1090
	}
1053
1091
1092
	if (tp->mpc) {
1093
		struct sock *sk_it;
1094
		mptcp_for_each_sk(tp->mpcb, sk_it)
1095
			sock_rps_record_flow(sk_it);
1096
	}
1097
1054
	if (unlikely(tp->repair)) {
1098
	if (unlikely(tp->repair)) {
1055
		if (tp->repair_queue == TCP_RECV_QUEUE) {
1099
		if (tp->repair_queue == TCP_RECV_QUEUE) {
1056
			copied = tcp_send_rcvq(sk, msg, size);
1100
			copied = tcp_send_rcvq(sk, msg, size);
Lines 1078-1084 Link Here
1078
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1122
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1079
		goto out_err;
1123
		goto out_err;
1080
1124
1081
	sg = !!(sk->sk_route_caps & NETIF_F_SG);
1125
	if (tp->mpc)
1126
		sg = mptcp_can_sg(sk);
1127
	else
1128
		sg = !!(sk->sk_route_caps & NETIF_F_SG);
1082
1129
1083
	while (--iovlen >= 0) {
1130
	while (--iovlen >= 0) {
1084
		size_t seglen = iov->iov_len;
1131
		size_t seglen = iov->iov_len;
Lines 1129-1136 Link Here
1129
1176
1130
				/*
1177
				/*
1131
				 * Check whether we can use HW checksum.
1178
				 * Check whether we can use HW checksum.
1179
				 *
1180
				 * If dss-csum is enabled, we do not do hw-csum.
1181
				 * In case of non-mptcp we check the
1182
				 * device-capabilities.
1183
				 * In case of mptcp, hw-csum's will be handled
1184
				 * later in mptcp_write_xmit.
1132
				 */
1185
				 */
1133
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1186
				if (((tp->mpc && !tp->mpcb->dss_csum) || !tp->mpc) &&
1187
				    (tp->mpc || sk->sk_route_caps & NETIF_F_ALL_CSUM))
1134
					skb->ip_summed = CHECKSUM_PARTIAL;
1188
					skb->ip_summed = CHECKSUM_PARTIAL;
1135
1189
1136
				skb_entail(sk, skb);
1190
				skb_entail(sk, skb);
Lines 1330-1335 Link Here
1330
1384
1331
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1385
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1332
1386
1387
	if (is_meta_sk(sk)) {
1388
		mptcp_cleanup_rbuf(sk, copied);
1389
		return;
1390
	}
1391
1333
	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1392
	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1334
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1393
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1335
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1394
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
Lines 1567-1572 Link Here
1567
1626
1568
	lock_sock(sk);
1627
	lock_sock(sk);
1569
1628
1629
#ifdef CONFIG_MPTCP
1630
	if (tp->mpc) {
1631
		struct sock *sk_it;
1632
		mptcp_for_each_sk(tp->mpcb, sk_it)
1633
			sock_rps_record_flow(sk_it);
1634
	}
1635
#endif
1636
1570
	err = -ENOTCONN;
1637
	err = -ENOTCONN;
1571
	if (sk->sk_state == TCP_LISTEN)
1638
	if (sk->sk_state == TCP_LISTEN)
1572
		goto out;
1639
		goto out;
Lines 2014-2020 Link Here
2014
  /* TCP_CLOSING	*/ TCP_CLOSING,
2081
  /* TCP_CLOSING	*/ TCP_CLOSING,
2015
};
2082
};
2016
2083
2017
static int tcp_close_state(struct sock *sk)
2084
int tcp_close_state(struct sock *sk)
2018
{
2085
{
2019
	int next = (int)new_state[sk->sk_state];
2086
	int next = (int)new_state[sk->sk_state];
2020
	int ns = next & TCP_STATE_MASK;
2087
	int ns = next & TCP_STATE_MASK;
Lines 2043-2050 Link Here
2043
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2110
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2044
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2111
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2045
		/* Clear out any half completed packets.  FIN if needed. */
2112
		/* Clear out any half completed packets.  FIN if needed. */
2046
		if (tcp_close_state(sk))
2113
		if (tcp_close_state(sk)) {
2047
			tcp_send_fin(sk);
2114
			if (!is_meta_sk(sk))
2115
				tcp_send_fin(sk);
2116
			else
2117
				mptcp_send_fin(sk);
2118
		}
2048
	}
2119
	}
2049
}
2120
}
2050
EXPORT_SYMBOL(tcp_shutdown);
2121
EXPORT_SYMBOL(tcp_shutdown);
Lines 2069-2074 Link Here
2069
	int data_was_unread = 0;
2140
	int data_was_unread = 0;
2070
	int state;
2141
	int state;
2071
2142
2143
	if (is_meta_sk(sk)) {
2144
		mptcp_close(sk, timeout);
2145
		return;
2146
	}
2147
2072
	lock_sock(sk);
2148
	lock_sock(sk);
2073
	sk->sk_shutdown = SHUTDOWN_MASK;
2149
	sk->sk_shutdown = SHUTDOWN_MASK;
2074
2150
Lines 2235-2249 Link Here
2235
}
2311
}
2236
EXPORT_SYMBOL(tcp_close);
2312
EXPORT_SYMBOL(tcp_close);
2237
2313
2238
/* These states need RST on ABORT according to RFC793 */
2239
2240
static inline bool tcp_need_reset(int state)
2241
{
2242
	return (1 << state) &
2243
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2244
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2245
}
2246
2247
int tcp_disconnect(struct sock *sk, int flags)
2314
int tcp_disconnect(struct sock *sk, int flags)
2248
{
2315
{
2249
	struct inet_sock *inet = inet_sk(sk);
2316
	struct inet_sock *inet = inet_sk(sk);
Lines 2284-2289 Link Here
2284
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2351
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2285
		inet_reset_saddr(sk);
2352
		inet_reset_saddr(sk);
2286
2353
2354
#ifdef CONFIG_MPTCP
2355
	if (is_meta_sk(sk)) {
2356
		struct sock *subsk, *tmpsk;
2357
		struct tcp_sock *tp = tcp_sk(sk);
2358
2359
		__skb_queue_purge(&tp->mpcb->reinject_queue);
2360
2361
		if (tp->inside_tk_table) {
2362
			mptcp_hash_remove_bh(tp);
2363
			reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
2364
		}
2365
2366
		local_bh_disable();
2367
		mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
2368
			/* The socket will get removed from the subsocket-list
2369
			 * and made non-mptcp by setting mpc to 0.
2370
			 *
2371
			 * This is necessary, because tcp_disconnect assumes
2372
			 * that the connection is completely dead afterwards.
2373
			 * Thus we need to do a mptcp_del_sock. Due to this call
2374
			 * we have to make it non-mptcp.
2375
			 *
2376
			 * We have to lock the socket, because we set mpc to 0.
2377
			 * An incoming packet would take the subsocket's lock
2378
			 * and go on into the receive-path.
2379
			 * This would be a race.
2380
			 */
2381
2382
			bh_lock_sock(subsk);
2383
			mptcp_del_sock(subsk);
2384
			tcp_sk(subsk)->mpc = 0;
2385
			mptcp_sub_force_close(subsk);
2386
			bh_unlock_sock(subsk);
2387
		}
2388
		local_bh_enable();
2389
2390
		tp->was_meta_sk = 1;
2391
		tp->mpc = 0;
2392
	} else {
2393
		if (tp->inside_tk_table)
2394
			mptcp_hash_remove_bh(tp);
2395
	}
2396
#endif
2397
2287
	sk->sk_shutdown = 0;
2398
	sk->sk_shutdown = 0;
2288
	sock_reset_flag(sk, SOCK_DONE);
2399
	sock_reset_flag(sk, SOCK_DONE);
2289
	tp->srtt = 0;
2400
	tp->srtt = 0;
Lines 2543-2548 Link Here
2543
					elapsed = tp->keepalive_time - elapsed;
2654
					elapsed = tp->keepalive_time - elapsed;
2544
				else
2655
				else
2545
					elapsed = 0;
2656
					elapsed = 0;
2657
				if (tp->mpc) {
2658
					struct sock *sk_it = sk;
2659
					mptcp_for_each_sk(tp->mpcb, sk_it)
2660
						if (!(1 << sk->sk_state & (TCPF_CLOSE | TCPF_LISTEN)))
2661
							inet_csk_reset_keepalive_timer(sk_it, elapsed);
2662
					break;
2663
				}
2546
				inet_csk_reset_keepalive_timer(sk, elapsed);
2664
				inet_csk_reset_keepalive_timer(sk, elapsed);
2547
			}
2665
			}
2548
		}
2666
		}
Lines 3040-3051 Link Here
3040
void tcp_done(struct sock *sk)
3158
void tcp_done(struct sock *sk)
3041
{
3159
{
3042
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3160
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3161
	struct tcp_sock *tp = tcp_sk(sk);
3043
3162
3044
	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3163
	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3045
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3164
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3046
3165
3166
	WARN_ON(sk->sk_state == TCP_CLOSE);
3047
	tcp_set_state(sk, TCP_CLOSE);
3167
	tcp_set_state(sk, TCP_CLOSE);
3048
	tcp_clear_xmit_timers(sk);
3168
3169
	/* If it is a meta-sk sending mp_fclose we have to maintain the
3170
	 * rexmit-timer for retransmitting the MP_FCLOSE */
3171
	if (!tp->mpc || !is_meta_sk(sk) || !tp->send_mp_fclose)
3172
		tcp_clear_xmit_timers(sk);
3173
3049
	if (req != NULL)
3174
	if (req != NULL)
3050
		reqsk_fastopen_remove(sk, req, false);
3175
		reqsk_fastopen_remove(sk, req, false);
3051
3176
(-)a/net/ipv4/tcp_input.c (-62 / +258 lines)
Lines 74-79 Link Here
74
#include <linux/ipsec.h>
74
#include <linux/ipsec.h>
75
#include <asm/unaligned.h>
75
#include <asm/unaligned.h>
76
#include <net/netdma.h>
76
#include <net/netdma.h>
77
#include <net/mptcp.h>
78
#include <net/mptcp_v4.h>
79
#include <net/mptcp_v6.h>
77
80
78
int sysctl_tcp_timestamps __read_mostly = 1;
81
int sysctl_tcp_timestamps __read_mostly = 1;
79
int sysctl_tcp_window_scaling __read_mostly = 1;
82
int sysctl_tcp_window_scaling __read_mostly = 1;
Lines 99-123 Link Here
99
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
102
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
100
int sysctl_tcp_early_retrans __read_mostly = 3;
103
int sysctl_tcp_early_retrans __read_mostly = 3;
101
104
102
#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
103
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
104
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
105
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
106
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
107
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
108
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
109
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
110
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
111
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
112
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
113
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
114
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
115
116
#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
117
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
118
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
119
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
120
121
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
105
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
122
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
106
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
123
107
Lines 322-331 Link Here
322
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
306
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
323
{
307
{
324
	struct tcp_sock *tp = tcp_sk(sk);
308
	struct tcp_sock *tp = tcp_sk(sk);
309
	struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
310
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
325
311
326
	/* Check #1 */
312
	/* Check #1 */
327
	if (tp->rcv_ssthresh < tp->window_clamp &&
313
	if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
328
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
314
	    (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
329
	    !sk_under_memory_pressure(sk)) {
315
	    !sk_under_memory_pressure(sk)) {
330
		int incr;
316
		int incr;
331
317
Lines 333-346 Link Here
333
		 * will fit to rcvbuf in future.
319
		 * will fit to rcvbuf in future.
334
		 */
320
		 */
335
		if (tcp_win_from_space(skb->truesize) <= skb->len)
321
		if (tcp_win_from_space(skb->truesize) <= skb->len)
336
			incr = 2 * tp->advmss;
322
			incr = 2 * meta_tp->advmss;
337
		else
323
		else
338
			incr = __tcp_grow_window(sk, skb);
324
			incr = __tcp_grow_window(meta_sk, skb);
339
325
340
		if (incr) {
326
		if (incr) {
341
			incr = max_t(int, incr, 2 * skb->len);
327
			incr = max_t(int, incr, 2 * skb->len);
342
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
328
			meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
343
					       tp->window_clamp);
329
					            meta_tp->window_clamp);
344
			inet_csk(sk)->icsk_ack.quick |= 1;
330
			inet_csk(sk)->icsk_ack.quick |= 1;
345
		}
331
		}
346
	}
332
	}
Lines 393-398 Link Here
393
379
394
	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
380
	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
395
	tp->snd_cwnd_stamp = tcp_time_stamp;
381
	tp->snd_cwnd_stamp = tcp_time_stamp;
382
383
	if (tp->mpc) {
384
		mptcp_init_buffer_space(sk);
385
		mptcp_update_sndbuf(tp->mpcb);
386
	}
396
}
387
}
397
388
398
/* 5. Recalculate window clamp after socket hit its memory bounds. */
389
/* 5. Recalculate window clamp after socket hit its memory bounds. */
Lines 518-524 Link Here
518
		goto new_measure;
509
		goto new_measure;
519
510
520
	time = tcp_time_stamp - tp->rcvq_space.time;
511
	time = tcp_time_stamp - tp->rcvq_space.time;
521
	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
512
	if (tp->mpc) {
513
		if (mptcp_check_rtt(tp, time))
514
			return;
515
	} else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
522
		return;
516
		return;
523
517
524
	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
518
	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
Lines 716-721 Link Here
716
	 * guarantees that rto is higher.
710
	 * guarantees that rto is higher.
717
	 */
711
	 */
718
	tcp_bound_rto(sk);
712
	tcp_bound_rto(sk);
713
	mptcp_set_rto(sk);
719
}
714
}
720
715
721
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
716
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
Lines 2914-2920 Link Here
2914
}
2909
}
2915
2910
2916
/* If we get here, the whole TSO packet has not been acked. */
2911
/* If we get here, the whole TSO packet has not been acked. */
2917
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
2912
u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
2918
{
2913
{
2919
	struct tcp_sock *tp = tcp_sk(sk);
2914
	struct tcp_sock *tp = tcp_sk(sk);
2920
	u32 packets_acked;
2915
	u32 packets_acked;
Lines 3009-3014 Link Here
3009
		 */
3004
		 */
3010
		if (!(scb->tcp_flags & TCPHDR_SYN)) {
3005
		if (!(scb->tcp_flags & TCPHDR_SYN)) {
3011
			flag |= FLAG_DATA_ACKED;
3006
			flag |= FLAG_DATA_ACKED;
3007
			if (tp->mpc && mptcp_is_data_seq(skb))
3008
				flag |= MPTCP_FLAG_DATA_ACKED;
3012
		} else {
3009
		} else {
3013
			flag |= FLAG_SYN_ACKED;
3010
			flag |= FLAG_SYN_ACKED;
3014
			tp->retrans_stamp = 0;
3011
			tp->retrans_stamp = 0;
Lines 3018-3023 Link Here
3018
			break;
3015
			break;
3019
3016
3020
		tcp_unlink_write_queue(skb, sk);
3017
		tcp_unlink_write_queue(skb, sk);
3018
3021
		sk_wmem_free_skb(sk, skb);
3019
		sk_wmem_free_skb(sk, skb);
3022
		if (skb == tp->retransmit_skb_hint)
3020
		if (skb == tp->retransmit_skb_hint)
3023
			tp->retransmit_skb_hint = NULL;
3021
			tp->retransmit_skb_hint = NULL;
Lines 3104-3110 Link Here
3104
	return flag;
3102
	return flag;
3105
}
3103
}
3106
3104
3107
static void tcp_ack_probe(struct sock *sk)
3105
void tcp_ack_probe(struct sock *sk)
3108
{
3106
{
3109
	const struct tcp_sock *tp = tcp_sk(sk);
3107
	const struct tcp_sock *tp = tcp_sk(sk);
3110
	struct inet_connection_sock *icsk = inet_csk(sk);
3108
	struct inet_connection_sock *icsk = inet_csk(sk);
Lines 3140-3148 Link Here
3140
/* Check that window update is acceptable.
3138
/* Check that window update is acceptable.
3141
 * The function assumes that snd_una<=ack<=snd_next.
3139
 * The function assumes that snd_una<=ack<=snd_next.
3142
 */
3140
 */
3143
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3141
bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
3144
					const u32 ack, const u32 ack_seq,
3142
			   const u32 ack_seq, const u32 nwin)
3145
					const u32 nwin)
3146
{
3143
{
3147
	return	after(ack, tp->snd_una) ||
3144
	return	after(ack, tp->snd_una) ||
3148
		after(ack_seq, tp->snd_wl1) ||
3145
		after(ack_seq, tp->snd_wl1) ||
Lines 3261-3267 Link Here
3261
}
3258
}
3262
3259
3263
/* This routine deals with incoming acks, but not outgoing ones. */
3260
/* This routine deals with incoming acks, but not outgoing ones. */
3264
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3261
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3265
{
3262
{
3266
	struct inet_connection_sock *icsk = inet_csk(sk);
3263
	struct inet_connection_sock *icsk = inet_csk(sk);
3267
	struct tcp_sock *tp = tcp_sk(sk);
3264
	struct tcp_sock *tp = tcp_sk(sk);
Lines 3350-3355 Link Here
3350
	/* See if we can take anything off of the retransmit queue. */
3347
	/* See if we can take anything off of the retransmit queue. */
3351
	acked = tp->packets_out;
3348
	acked = tp->packets_out;
3352
	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3349
	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3350
3351
	if (tp->mpc) {
3352
		flag |= mptcp_fallback_infinite(sk, flag);
3353
3354
		if (flag & MPTCP_FLAG_SEND_RESET) {
3355
			pr_err("%s resetting flow\n", __func__);
3356
			mptcp_send_reset(sk);
3357
			goto invalid_ack;
3358
		}
3359
3360
		mptcp_clean_rtx_infinite(skb, sk);
3361
	}
3362
3353
	acked -= tp->packets_out;
3363
	acked -= tp->packets_out;
3354
3364
3355
	if (tcp_ack_is_dubious(sk, flag)) {
3365
	if (tcp_ack_is_dubious(sk, flag)) {
Lines 3416-3423 Link Here
3416
 * the fast version below fails.
3426
 * the fast version below fails.
3417
 */
3427
 */
3418
void tcp_parse_options(const struct sk_buff *skb,
3428
void tcp_parse_options(const struct sk_buff *skb,
3419
		       struct tcp_options_received *opt_rx, int estab,
3429
		       struct tcp_options_received *opt_rx,
3420
		       struct tcp_fastopen_cookie *foc)
3430
		       struct mptcp_options_received *mopt,
3431
		       int estab, struct tcp_fastopen_cookie *foc)
3421
{
3432
{
3422
	const unsigned char *ptr;
3433
	const unsigned char *ptr;
3423
	const struct tcphdr *th = tcp_hdr(skb);
3434
	const struct tcphdr *th = tcp_hdr(skb);
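
The hunk above widens tcp_parse_options() with a struct mptcp_options_received pointer, and the later call sites pass either the subflow's own rx_opt (once tp->mpc is set) or a stack-local scratch structure initialised by mptcp_init_mp_opt(). The userspace sketch below only models that calling convention; the trimmed struct layouts and the parser stub are assumptions for illustration, not the patch's definitions.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct mptcp_options_received {        /* assumed, heavily trimmed */
	bool saw_mpc;
	unsigned int data_seq;
	unsigned int data_ack;
};

struct tcp_options_received { int saw_tstamp; };

struct mptcp_tcp_sock { struct mptcp_options_received rx_opt; };

struct tcp_sock {
	int mpc;                        /* connection is MPTCP-capable */
	struct tcp_options_received rx_opt;
	struct mptcp_tcp_sock *mptcp;   /* only valid when mpc != 0 */
};

static void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
{
	memset(mopt, 0, sizeof(*mopt));
}

/* Stand-in for the extended tcp_parse_options(): the extra mopt argument
 * receives any MPTCP sub-options found in the header. */
static void tcp_parse_options_model(struct tcp_options_received *opt_rx,
				    struct mptcp_options_received *mopt,
				    int estab)
{
	(void)opt_rx; (void)estab;
	if (mopt)
		mopt->saw_mpc = true;   /* pretend an MP_CAPABLE was present */
}

static void rcv_synsent_model(struct tcp_sock *tp)
{
	struct mptcp_options_received mopt;

	mptcp_init_mp_opt(&mopt);
	/* Same shape as the patched call sites: an established MPTCP subflow
	 * parses into its own rx_opt, otherwise a stack-local scratch area. */
	tcp_parse_options_model(&tp->rx_opt,
				tp->mpc ? &tp->mptcp->rx_opt : &mopt, 0);
	if (!tp->mpc && mopt.saw_mpc)
		printf("peer offered MP_CAPABLE\n");
}

int main(void)
{
	struct tcp_sock tp = { .mpc = 0 };

	rcv_synsent_model(&tp);
	return 0;
}
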
Lines 3500-3505 Link Here
3500
				 */
3511
				 */
3501
				break;
3512
				break;
3502
#endif
3513
#endif
3514
			case TCPOPT_MPTCP:
3515
				mptcp_parse_options(ptr - 2, opsize, opt_rx,
3516
						    mopt, skb);
3517
				break;
3503
			case TCPOPT_EXP:
3518
			case TCPOPT_EXP:
3504
				/* Fast Open option shares code 254 using a
3519
				/* Fast Open option shares code 254 using a
3505
				 * 16 bits magic number. It's valid only in
3520
				 * 16 bits magic number. It's valid only in
Lines 3561-3568 Link Here
3561
		if (tcp_parse_aligned_timestamp(tp, th))
3576
		if (tcp_parse_aligned_timestamp(tp, th))
3562
			return true;
3577
			return true;
3563
	}
3578
	}
3564
3579
	tcp_parse_options(skb, &tp->rx_opt, tp->mpc ? &tp->mptcp->rx_opt : NULL,
3565
	tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3580
			  1, NULL);
3566
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3581
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3567
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3582
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3568
3583
Lines 3732-3737 Link Here
3732
	case TCP_ESTABLISHED:
3747
	case TCP_ESTABLISHED:
3733
		/* Move to CLOSE_WAIT */
3748
		/* Move to CLOSE_WAIT */
3734
		tcp_set_state(sk, TCP_CLOSE_WAIT);
3749
		tcp_set_state(sk, TCP_CLOSE_WAIT);
3750
		if (tp->mpc)
3751
			mptcp_sub_close_passive(sk);
3735
		dst = __sk_dst_get(sk);
3752
		dst = __sk_dst_get(sk);
3736
		if (!dst || !dst_metric(dst, RTAX_QUICKACK))
3753
		if (!dst || !dst_metric(dst, RTAX_QUICKACK))
3737
			inet_csk(sk)->icsk_ack.pingpong = 1;
3754
			inet_csk(sk)->icsk_ack.pingpong = 1;
Lines 3756-3761 Link Here
3756
		tcp_set_state(sk, TCP_CLOSING);
3773
		tcp_set_state(sk, TCP_CLOSING);
3757
		break;
3774
		break;
3758
	case TCP_FIN_WAIT2:
3775
	case TCP_FIN_WAIT2:
3776
		if (tp->mpc) {
3777
			/* The socket will get closed by mptcp_data_ready.
3778
			 * We first have to process all data-sequences.
3779
			 */
3780
			tp->close_it = 1;
3781
			break;
3782
		}
3759
		/* Received a FIN -- send ACK and enter TIME_WAIT. */
3783
		/* Received a FIN -- send ACK and enter TIME_WAIT. */
3760
		tcp_send_ack(sk);
3784
		tcp_send_ack(sk);
3761
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3785
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
Lines 3780-3785 Link Here
3780
	if (!sock_flag(sk, SOCK_DEAD)) {
3804
	if (!sock_flag(sk, SOCK_DEAD)) {
3781
		sk->sk_state_change(sk);
3805
		sk->sk_state_change(sk);
3782
3806
3807
		/* Don't wake up MPTCP-subflows */
3808
		if (tp->mpc)
3809
			return;
3810
3783
		/* Do not send POLL_HUP for half duplex close. */
3811
		/* Do not send POLL_HUP for half duplex close. */
3784
		if (sk->sk_shutdown == SHUTDOWN_MASK ||
3812
		if (sk->sk_shutdown == SHUTDOWN_MASK ||
3785
		    sk->sk_state == TCP_CLOSE)
3813
		    sk->sk_state == TCP_CLOSE)
Lines 3977-3983 Link Here
3977
			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4005
			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
3978
		}
4006
		}
3979
4007
3980
		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4008
		/* In case of MPTCP, the segment may be empty if it's a
4009
		 * non-data DATA_FIN. (see beginning of tcp_data_queue)
4010
		 */
4011
		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
4012
		    !(tp->mpc && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
3981
			SOCK_DEBUG(sk, "ofo packet was already received\n");
4013
			SOCK_DEBUG(sk, "ofo packet was already received\n");
3982
			__skb_unlink(skb, &tp->out_of_order_queue);
4014
			__skb_unlink(skb, &tp->out_of_order_queue);
3983
			__kfree_skb(skb);
4015
			__kfree_skb(skb);
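
The widened condition above keeps a zero-length out-of-order segment when MPTCP is active, since such a segment can still carry a DATA_FIN in its option space even though end_seq does not advance past rcv_nxt. A standalone sketch of that predicate, using the same wrap-around comparison, with invented function and parameter names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same wrap-around comparison the kernel's after() macro performs. */
static bool after32(uint32_t a, uint32_t b) { return (int32_t)(a - b) > 0; }

/* Decide whether an out-of-order segment [seq, end_seq) can be dropped as
 * "already received".  With MPTCP a zero-length segment may still carry a
 * DATA_FIN in its options, so it must survive even if end_seq <= rcv_nxt.
 * mpc stands in for tp->mpc. */
static bool ofo_segment_already_received(uint32_t seq, uint32_t end_seq,
					 uint32_t rcv_nxt, bool mpc)
{
	bool is_empty = (end_seq == seq);

	return !after32(end_seq, rcv_nxt) && !(mpc && is_empty);
}

int main(void)
{
	/* An empty DATA_FIN at an old sequence number is kept when MPTCP is on. */
	printf("%d\n", ofo_segment_already_received(1000, 1000, 2000, true));  /* 0 */
	printf("%d\n", ofo_segment_already_received(1000, 1000, 2000, false)); /* 1 */
	return 0;
}
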
Lines 4001-4006 Link Here
4001
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4033
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4002
				 unsigned int size)
4034
				 unsigned int size)
4003
{
4035
{
4036
	if (tcp_sk(sk)->mpc)
4037
		sk = mptcp_meta_sk(sk);
4038
4004
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4039
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4005
	    !sk_rmem_schedule(sk, skb, size)) {
4040
	    !sk_rmem_schedule(sk, skb, size)) {
4006
4041
Lines 4031-4045 Link Here
4031
 * Better try to coalesce them right now to avoid future collapses.
4066
 * Better try to coalesce them right now to avoid future collapses.
4032
 * Returns true if caller should free @from instead of queueing it
4067
 * Returns true if caller should free @from instead of queueing it
4033
 */
4068
 */
4034
static bool tcp_try_coalesce(struct sock *sk,
4069
bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
4035
			     struct sk_buff *to,
4070
		      bool *fragstolen)
4036
			     struct sk_buff *from,
4037
			     bool *fragstolen)
4038
{
4071
{
4039
	int delta;
4072
	int delta;
4040
4073
4041
	*fragstolen = false;
4074
	*fragstolen = false;
4042
4075
4076
	if (tcp_sk(sk)->mpc && !is_meta_sk(sk))
4077
		return false;
4078
4043
	if (tcp_hdr(from)->fin)
4079
	if (tcp_hdr(from)->fin)
4044
		return false;
4080
		return false;
4045
4081
Lines 4128-4134 Link Here
4128
4164
4129
	/* Do skb overlap to previous one? */
4165
	/* Do skb overlap to previous one? */
4130
	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4166
	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4131
		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4167
		/* MPTCP allows non-data data-fin to be in the ofo-queue */
4168
		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
4169
		    !(tp->mpc && end_seq == seq)) {
4132
			/* All the bits are present. Drop. */
4170
			/* All the bits are present. Drop. */
4133
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4171
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4134
			__kfree_skb(skb);
4172
			__kfree_skb(skb);
Lines 4166-4171 Link Here
4166
					 end_seq);
4204
					 end_seq);
4167
			break;
4205
			break;
4168
		}
4206
		}
4207
		/* MPTCP allows non-data data-fin to be in the ofo-queue */
4208
		if (tp->mpc && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
4209
			continue;
4169
		__skb_unlink(skb1, &tp->out_of_order_queue);
4210
		__skb_unlink(skb1, &tp->out_of_order_queue);
4170
		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4211
		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4171
				 TCP_SKB_CB(skb1)->end_seq);
4212
				 TCP_SKB_CB(skb1)->end_seq);
Lines 4181-4188 Link Here
4181
		skb_set_owner_r(skb, sk);
4222
		skb_set_owner_r(skb, sk);
4182
}
4223
}
4183
4224
4184
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4225
int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4185
		  bool *fragstolen)
4226
			       bool *fragstolen)
4186
{
4227
{
4187
	int eaten;
4228
	int eaten;
4188
	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4229
	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
Lines 4244-4250 Link Here
4244
	int eaten = -1;
4285
	int eaten = -1;
4245
	bool fragstolen = false;
4286
	bool fragstolen = false;
4246
4287
4247
	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
4288
	/* If no data is present, but a data_fin is in the options, we still
4289
	 * have to call mptcp_queue_skb later on. */
4290
	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
4291
	    !(tp->mpc && mptcp_is_data_fin(skb)))
4248
		goto drop;
4292
		goto drop;
4249
4293
4250
	skb_dst_drop(skb);
4294
	skb_dst_drop(skb);
Lines 4290-4296 Link Here
4290
			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4334
			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4291
		}
4335
		}
4292
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4336
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4293
		if (skb->len)
4337
		if (skb->len || mptcp_is_data_fin(skb))
4294
			tcp_event_data_recv(sk, skb);
4338
			tcp_event_data_recv(sk, skb);
4295
		if (th->fin)
4339
		if (th->fin)
4296
			tcp_fin(sk);
4340
			tcp_fin(sk);
Lines 4312-4318 Link Here
4312
4356
4313
		if (eaten > 0)
4357
		if (eaten > 0)
4314
			kfree_skb_partial(skb, fragstolen);
4358
			kfree_skb_partial(skb, fragstolen);
4315
		if (!sock_flag(sk, SOCK_DEAD))
4359
		if (!sock_flag(sk, SOCK_DEAD) || tp->mpc)
4360
			/* MPTCP: we always have to call data_ready, because
4361
			 * we may be about to receive a data-fin, which still
4362
			 * must get queued.
4363
			 */
4316
			sk->sk_data_ready(sk, 0);
4364
			sk->sk_data_ready(sk, 0);
4317
		return;
4365
		return;
4318
	}
4366
	}
Lines 4386-4391 Link Here
4386
	struct sk_buff *skb, *n;
4434
	struct sk_buff *skb, *n;
4387
	bool end_of_skbs;
4435
	bool end_of_skbs;
4388
4436
4437
	if (tcp_sk(sk)->mpc)
4438
		return;
4439
4389
	/* First, check that queue is collapsible and find
4440
	/* First, check that queue is collapsible and find
4390
	 * the point where collapsing can be useful. */
4441
	 * the point where collapsing can be useful. */
4391
	skb = head;
4442
	skb = head;
Lines 4491-4497 Link Here
4491
	struct sk_buff *head;
4542
	struct sk_buff *head;
4492
	u32 start, end;
4543
	u32 start, end;
4493
4544
4494
	if (skb == NULL)
4545
	if (skb == NULL || tp->mpc)
4495
		return;
4546
		return;
4496
4547
4497
	start = TCP_SKB_CB(skb)->seq;
4548
	start = TCP_SKB_CB(skb)->seq;
Lines 4536-4541 Link Here
4536
	struct tcp_sock *tp = tcp_sk(sk);
4587
	struct tcp_sock *tp = tcp_sk(sk);
4537
	bool res = false;
4588
	bool res = false;
4538
4589
4590
	if (is_meta_sk(sk)) {
4591
		if (!skb_queue_empty(&tp->out_of_order_queue)) {
4592
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
4593
			mptcp_purge_ofo_queue(tp);
4594
4595
			/* No sack at the mptcp-level */
4596
			sk_mem_reclaim(sk);
4597
			res = 1;
4598
		}
4599
		return res;
4600
	}
4601
4539
	if (!skb_queue_empty(&tp->out_of_order_queue)) {
4602
	if (!skb_queue_empty(&tp->out_of_order_queue)) {
4540
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
4603
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
4541
		__skb_queue_purge(&tp->out_of_order_queue);
4604
		__skb_queue_purge(&tp->out_of_order_queue);
Lines 4644-4652 Link Here
4644
		return false;
4707
		return false;
4645
4708
4646
	/* If we filled the congestion window, do not expand.  */
4709
	/* If we filled the congestion window, do not expand.  */
4647
	if (tp->packets_out >= tp->snd_cwnd)
4710
	if (!tp->mpc && tp->packets_out >= tp->snd_cwnd)
4648
		return false;
4711
		return false;
4649
4712
4713
#ifdef CONFIG_MPTCP
4714
	if (tp->mpc) {
4715
		struct sock *sk_it;
4716
		int cnt_backups = 0;
4717
		int backup_available = 0;
4718
4719
		/* For MPTCP we look for a subsocket that could send data.
4720
		 * If we found one, then we update the send-buffer.
4721
		 */
4722
		mptcp_for_each_sk(tp->mpcb, sk_it) {
4723
			struct tcp_sock *tp_it = tcp_sk(sk_it);
4724
4725
			if (!mptcp_sk_can_send(sk_it))
4726
				continue;
4727
4728
			/* Backup-flows have to be counted - if there is no other
4729
			 * subflow we take the backup-flow into account. */
4730
			if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
4731
				cnt_backups++;
4732
			}
4733
4734
			if (tp_it->packets_out < tp_it->snd_cwnd) {
4735
				if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
4736
					backup_available = 1;
4737
					continue;
4738
				}
4739
				return 1;
4740
			}
4741
		}
4742
4743
		/* Backup-flow is available for sending - update send-buffer */
4744
		if (tp->mpcb->cnt_established == cnt_backups && backup_available)
4745
			return 1;
4746
		return 0;
4747
	}
4748
#endif
4749
4650
	return true;
4750
	return true;
4651
}
4751
}
4652
4752
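
The CONFIG_MPTCP branch above decides whether the send buffer may keep growing by walking every subflow: one non-backup subflow with congestion-window headroom is enough, and backup subflows only count when all established subflows are backups. The userspace model below reproduces that rule with a plain array standing in for mptcp_for_each_sk(); the struct and its fields are simplified stand-ins, not the kernel structures.

#include <stdbool.h>
#include <stdio.h>

struct subflow {
	bool can_send;          /* mptcp_sk_can_send() */
	bool backup;            /* rcv_low_prio || low_prio */
	unsigned int packets_out;
	unsigned int snd_cwnd;
};

/* True when the send buffer may still grow: some non-backup subflow has
 * cwnd headroom, or every established subflow is a backup and at least one
 * backup has headroom - the same rule as the added CONFIG_MPTCP branch. */
static bool mptcp_sndbuf_can_grow(const struct subflow *sub, int n,
				  int cnt_established)
{
	int cnt_backups = 0, backup_available = 0;

	for (int i = 0; i < n; i++) {
		if (!sub[i].can_send)
			continue;
		if (sub[i].backup)
			cnt_backups++;
		if (sub[i].packets_out < sub[i].snd_cwnd) {
			if (sub[i].backup) {
				backup_available = 1;
				continue;
			}
			return true;
		}
	}
	return cnt_established == cnt_backups && backup_available;
}

int main(void)
{
	struct subflow two[2] = {
		{ true, true,  1, 10 },   /* backup with headroom */
		{ true, false, 10, 10 },  /* regular flow, cwnd-limited */
	};
	struct subflow one[1] = {
		{ true, true,  1, 10 },   /* only a backup subflow is left */
	};

	printf("%d\n", mptcp_sndbuf_can_grow(two, 2, 2)); /* 0: not all established are backups */
	printf("%d\n", mptcp_sndbuf_can_grow(one, 1, 1)); /* 1: backup-only connection may grow */
	return 0;
}
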
Lines 4659-4675 Link Here
4659
static void tcp_new_space(struct sock *sk)
4759
static void tcp_new_space(struct sock *sk)
4660
{
4760
{
4661
	struct tcp_sock *tp = tcp_sk(sk);
4761
	struct tcp_sock *tp = tcp_sk(sk);
4762
	struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
4662
4763
4663
	if (tcp_should_expand_sndbuf(sk)) {
4764
	if (tcp_should_expand_sndbuf(meta_sk)) {
4664
		int sndmem = SKB_TRUESIZE(max_t(u32,
4765
		int sndmem = SKB_TRUESIZE(max_t(u32,
4665
						tp->rx_opt.mss_clamp,
4766
						tp->rx_opt.mss_clamp,
4666
						tp->mss_cache) +
4767
						tp->mss_cache) +
4667
					  MAX_TCP_HEADER);
4768
					  MAX_TCP_HEADER);
4668
		int demanded = max_t(unsigned int, tp->snd_cwnd,
4769
		int demanded;
4669
				     tp->reordering + 1);
4770
4771
		if (tp->mpc)
4772
			demanded = mptcp_check_snd_buf(tp);
4773
		else
4774
			demanded = max_t(unsigned int, tp->snd_cwnd,
4775
					 tp->reordering + 1);
4776
4777
		/* MPTCP: After this, sndmem is the new contribution of the
4778
		 * current subflow to the aggregate sndbuf
4779
		 */
4670
		sndmem *= 2 * demanded;
4780
		sndmem *= 2 * demanded;
4671
		if (sndmem > sk->sk_sndbuf)
4781
		if (sndmem > sk->sk_sndbuf) {
4782
			int old_sndbuf = sk->sk_sndbuf;
4672
			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
4783
			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
4784
			/* MPTCP: ok, the subflow sndbuf has grown, reflect
4785
			 * this in the aggregate buffer.
4786
			 */
4787
			if (tp->mpc && old_sndbuf != sk->sk_sndbuf)
4788
				mptcp_update_sndbuf(tp->mpcb);
4789
		}
4673
		tp->snd_cwnd_stamp = tcp_time_stamp;
4790
		tp->snd_cwnd_stamp = tcp_time_stamp;
4674
	}
4791
	}
4675
4792
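
tcp_new_space() now sizes the subflow's contribution from the larger of mss_clamp and mss_cache, scales it by twice the "demanded" packet count (mptcp_check_snd_buf() on an MPTCP subflow, snd_cwnd/reordering otherwise), and refreshes the aggregate buffer through mptcp_update_sndbuf() only when the subflow sndbuf actually changed. A rough model of that arithmetic is below; SKB_TRUESIZE() is approximated by a fixed overhead and every constant is a made-up stand-in.

#include <stdio.h>

#define MAX_TCP_HEADER   320            /* stand-in for the kernel constant */
#define SKB_OVERHEAD     512            /* rough stand-in for SKB_TRUESIZE() slack */
#define WMEM_MAX         (4 * 1024 * 1024)

static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }
static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* Returns the new subflow sndbuf; *grew tells the caller whether the
 * aggregate (meta-level) buffer should be refreshed, mirroring the
 * old_sndbuf != sk->sk_sndbuf check added by the patch. */
static unsigned int new_space_model(unsigned int cur_sndbuf,
				    unsigned int mss_clamp, unsigned int mss_cache,
				    unsigned int demanded, int *grew)
{
	unsigned int sndmem = max_u(mss_clamp, mss_cache) + MAX_TCP_HEADER + SKB_OVERHEAD;

	sndmem *= 2 * demanded;
	*grew = 0;
	if (sndmem > cur_sndbuf) {
		unsigned int next = min_u(sndmem, WMEM_MAX);

		*grew = (next != cur_sndbuf);
		return next;
	}
	return cur_sndbuf;
}

int main(void)
{
	int grew;
	/* "demanded" would come from mptcp_check_snd_buf() on an MPTCP subflow. */
	unsigned int sndbuf = new_space_model(16384, 1460, 1400, 40, &grew);

	printf("sndbuf=%u grew=%d\n", sndbuf, grew);
	return 0;
}
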
Lines 4680-4687 Link Here
4680
{
4797
{
4681
	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
4798
	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
4682
		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4799
		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4683
		if (sk->sk_socket &&
4800
		if (tcp_sk(sk)->mpc ||
4684
		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
4801
		    (sk->sk_socket &&
4802
			test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
4685
			tcp_new_space(sk);
4803
			tcp_new_space(sk);
4686
	}
4804
	}
4687
}
4805
}
Lines 4806-4811 Link Here
4806
{
4924
{
4807
	struct tcp_sock *tp = tcp_sk(sk);
4925
	struct tcp_sock *tp = tcp_sk(sk);
4808
4926
4927
	/* MPTCP urgent data is not yet supported */
4928
	if (tp->mpc)
4929
		return;
4930
4809
	/* Check if we get a new urgent pointer - normally not. */
4931
	/* Check if we get a new urgent pointer - normally not. */
4810
	if (th->urg)
4932
	if (th->urg)
4811
		tcp_check_urg(sk, th);
4933
		tcp_check_urg(sk, th);
Lines 4818-4825 Link Here
4818
		/* Is the urgent pointer pointing into this packet? */
4940
		/* Is the urgent pointer pointing into this packet? */
4819
		if (ptr < skb->len) {
4941
		if (ptr < skb->len) {
4820
			u8 tmp;
4942
			u8 tmp;
4943
4821
			if (skb_copy_bits(skb, ptr, &tmp, 1))
4944
			if (skb_copy_bits(skb, ptr, &tmp, 1))
4822
				BUG();
4945
				BUG();
4946
4823
			tp->urg_data = TCP_URG_VALID | tmp;
4947
			tp->urg_data = TCP_URG_VALID | tmp;
4824
			if (!sock_flag(sk, SOCK_DEAD))
4948
			if (!sock_flag(sk, SOCK_DEAD))
4825
				sk->sk_data_ready(sk, 0);
4949
				sk->sk_data_ready(sk, 0);
Lines 4873-4880 Link Here
4873
}
4997
}
4874
4998
4875
#ifdef CONFIG_NET_DMA
4999
#ifdef CONFIG_NET_DMA
4876
static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
5000
bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
4877
				  int hlen)
4878
{
5001
{
4879
	struct tcp_sock *tp = tcp_sk(sk);
5002
	struct tcp_sock *tp = tcp_sk(sk);
4880
	int chunk = skb->len - hlen;
5003
	int chunk = skb->len - hlen;
Lines 4983-4991 Link Here
4983
		goto discard;
5106
		goto discard;
4984
	}
5107
	}
4985
5108
5109
	/* If valid: post process the received MPTCP options. */
5110
	if (tp->mpc && mptcp_handle_options(sk, th, skb))
5111
		goto discard;
5112
4986
	return true;
5113
	return true;
4987
5114
4988
discard:
5115
discard:
5116
	if (tp->mpc)
5117
		mptcp_reset_mopt(tp);
4989
	__kfree_skb(skb);
5118
	__kfree_skb(skb);
4990
	return false;
5119
	return false;
4991
}
5120
}
Lines 5037-5042 Link Here
5037
5166
5038
	tp->rx_opt.saw_tstamp = 0;
5167
	tp->rx_opt.saw_tstamp = 0;
5039
5168
5169
	/* MPTCP: force slowpath. */
5170
	if (tp->mpc)
5171
		goto slow_path;
5172
5040
	/*	pred_flags is 0xS?10 << 16 + snd_wnd
5173
	/*	pred_flags is 0xS?10 << 16 + snd_wnd
5041
	 *	if header_prediction is to be made
5174
	 *	if header_prediction is to be made
5042
	 *	'S' will always be tp->tcp_header_len >> 2
5175
	 *	'S' will always be tp->tcp_header_len >> 2
Lines 5282-5288 Link Here
5282
		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
5415
		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
5283
		tcp_clear_options(&opt);
5416
		tcp_clear_options(&opt);
5284
		opt.user_mss = opt.mss_clamp = 0;
5417
		opt.user_mss = opt.mss_clamp = 0;
5285
		tcp_parse_options(synack, &opt, 0, NULL);
5418
		tcp_parse_options(synack, &opt, NULL, 0, NULL);
5286
		mss = opt.mss_clamp;
5419
		mss = opt.mss_clamp;
5287
	}
5420
	}
5288
5421
Lines 5317-5324 Link Here
5317
	struct tcp_sock *tp = tcp_sk(sk);
5450
	struct tcp_sock *tp = tcp_sk(sk);
5318
	struct tcp_fastopen_cookie foc = { .len = -1 };
5451
	struct tcp_fastopen_cookie foc = { .len = -1 };
5319
	int saved_clamp = tp->rx_opt.mss_clamp;
5452
	int saved_clamp = tp->rx_opt.mss_clamp;
5453
	struct mptcp_options_received mopt;
5454
	mptcp_init_mp_opt(&mopt);
5320
5455
5321
	tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
5456
	tcp_parse_options(skb, &tp->rx_opt,
5457
			  tp->mpc ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
5322
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
5458
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
5323
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5459
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5324
5460
Lines 5365-5370 Link Here
5365
		if (!th->syn)
5501
		if (!th->syn)
5366
			goto discard_and_undo;
5502
			goto discard_and_undo;
5367
5503
5504
		if (tp->request_mptcp || tp->mpc) {
5505
			int ret;
5506
			ret = mptcp_rcv_synsent_state_process(sk, &sk,
5507
							      skb, &mopt);
5508
5509
			/* May have changed if we support MPTCP */
5510
			tp = tcp_sk(sk);
5511
			icsk = inet_csk(sk);
5512
5513
			if (ret == 1)
5514
				goto reset_and_undo;
5515
			if (ret == 2)
5516
				goto discard;
5517
		}
5518
5368
		/* rfc793:
5519
		/* rfc793:
5369
		 *   "If the SYN bit is on ...
5520
		 *   "If the SYN bit is on ...
5370
		 *    are acceptable then ...
5521
		 *    are acceptable then ...
Lines 5377-5382 Link Here
5377
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5528
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5378
		tcp_ack(sk, skb, FLAG_SLOWPATH);
5529
		tcp_ack(sk, skb, FLAG_SLOWPATH);
5379
5530
5531
		if (tp->mpc && !is_master_tp(tp)) {
5532
			/* Timer for repeating the ACK until an answer
5533
			 * arrives. Used only when establishing an additional
5534
			 * subflow inside of an MPTCP connection.
5535
			 */
5536
			sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
5537
				       jiffies + icsk->icsk_rto);
5538
		}
5539
5380
		/* Ok.. it's good. Set up sequence numbers and
5540
		/* Ok.. it's good. Set up sequence numbers and
5381
		 * move to established.
5541
		 * move to established.
5382
		 */
5542
		 */
Lines 5403-5408 Link Here
5403
			tp->tcp_header_len = sizeof(struct tcphdr);
5563
			tp->tcp_header_len = sizeof(struct tcphdr);
5404
		}
5564
		}
5405
5565
5566
		if (tp->mpc) {
5567
			tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
5568
			tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
5569
		}
5570
5406
		if (tcp_is_sack(tp) && sysctl_tcp_fack)
5571
		if (tcp_is_sack(tp) && sysctl_tcp_fack)
5407
			tcp_enable_fack(tp);
5572
			tcp_enable_fack(tp);
5408
5573
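
Once the connection is MPTCP, this path (and the matching ones later in the file) grows tcp_header_len and shrinks advmss by MPTCP_SUB_LEN_DSM_ALIGN, because every data segment must now carry an aligned DSS mapping in its option space. The arithmetic sketch below shows the effect on the usable payload per segment; the 12-byte timestamp and 20-byte DSS figures are assumed example sizes, not the patch's constants.

#include <stdio.h>

/* Assumed example sizes (not the patch's constants). */
#define TCP_BASE_HDR          20
#define TSTAMP_ALIGNED        12
#define DSS_MAPPING_ALIGNED   20

/* Payload bytes left per segment once the options every segment must carry
 * are accounted for - the same effect as the advmss adjustments above. */
static int effective_advmss(int link_mss, int tstamp_ok, int mptcp)
{
	int advmss = link_mss;

	if (tstamp_ok)
		advmss -= TSTAMP_ALIGNED;
	if (mptcp)
		advmss -= DSS_MAPPING_ALIGNED;
	return advmss;
}

int main(void)
{
	printf("plain TCP: %d\n", effective_advmss(1460, 1, 0)); /* 1448 */
	printf("MPTCP:     %d\n", effective_advmss(1460, 1, 1)); /* 1428 */
	return 0;
}
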
Lines 5423-5429 Link Here
5423
		    tcp_rcv_fastopen_synack(sk, skb, &foc))
5588
		    tcp_rcv_fastopen_synack(sk, skb, &foc))
5424
			return -1;
5589
			return -1;
5425
5590
5426
		if (sk->sk_write_pending ||
5591
		/* With MPTCP we cannot send data on the third ack due to the
5592
		 * lack of option-space */
5593
		if ((sk->sk_write_pending && !tp->mpc) ||
5427
		    icsk->icsk_accept_queue.rskq_defer_accept ||
5594
		    icsk->icsk_accept_queue.rskq_defer_accept ||
5428
		    icsk->icsk_ack.pingpong) {
5595
		    icsk->icsk_ack.pingpong) {
5429
			/* Save one ACK. Data will be ready after
5596
			/* Save one ACK. Data will be ready after
Lines 5465-5470 Link Here
5465
	    tcp_paws_reject(&tp->rx_opt, 0))
5632
	    tcp_paws_reject(&tp->rx_opt, 0))
5466
		goto discard_and_undo;
5633
		goto discard_and_undo;
5467
5634
5635
	/* TODO - check this here for MPTCP */
5468
	if (th->syn) {
5636
	if (th->syn) {
5469
		/* We see SYN without ACK. It is attempt of
5637
		/* We see SYN without ACK. It is attempt of
5470
		 * simultaneous connect with crossed SYNs.
5638
		 * simultaneous connect with crossed SYNs.
Lines 5481-5486 Link Here
5481
			tp->tcp_header_len = sizeof(struct tcphdr);
5649
			tp->tcp_header_len = sizeof(struct tcphdr);
5482
		}
5650
		}
5483
5651
5652
		if (tp->mpc) {
5653
			tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
5654
			tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
5655
		}
5656
5484
		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
5657
		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
5485
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
5658
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
5486
5659
Lines 5589-5594 Link Here
5589
5762
5590
	case TCP_SYN_SENT:
5763
	case TCP_SYN_SENT:
5591
		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
5764
		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
5765
		if (is_meta_sk(sk)) {
5766
			sk = tcp_sk(sk)->mpcb->master_sk;
5767
			tp = tcp_sk(sk);
5768
		}
5592
		if (queued >= 0)
5769
		if (queued >= 0)
5593
			return queued;
5770
			return queued;
5594
5771
Lines 5596-5601 Link Here
5596
		tcp_urg(sk, skb, th);
5773
		tcp_urg(sk, skb, th);
5597
		__kfree_skb(skb);
5774
		__kfree_skb(skb);
5598
		tcp_data_snd_check(sk);
5775
		tcp_data_snd_check(sk);
5776
		if (tp->mpc && is_master_tp(tp))
5777
			bh_unlock_sock(sk);
5599
		return 0;
5778
		return 0;
5600
	}
5779
	}
5601
5780
Lines 5657-5662 Link Here
5657
5836
5658
		if (tp->rx_opt.tstamp_ok)
5837
		if (tp->rx_opt.tstamp_ok)
5659
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5838
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5839
		if (tp->mpc)
5840
			tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
5660
5841
5661
		if (req) {
5842
		if (req) {
5662
			/* Re-arm the timer because data may have been sent out.
5843
			/* Re-arm the timer because data may have been sent out.
Lines 5676-5681 Link Here
5676
5857
5677
		tcp_initialize_rcv_mss(sk);
5858
		tcp_initialize_rcv_mss(sk);
5678
		tcp_fast_path_on(tp);
5859
		tcp_fast_path_on(tp);
5860
		
5861
		/* Send an ACK when establishing a new
5862
		 * MPTCP subflow, i.e. using an MP_JOIN
5863
		 * subtype.
5864
		 */
5865
		if (tp->mpc && !is_master_tp(tp))
5866
			tcp_send_ack(sk);
5867
5679
		break;
5868
		break;
5680
5869
5681
	case TCP_FIN_WAIT1: {
5870
	case TCP_FIN_WAIT1: {
Lines 5714-5719 Link Here
5714
			/* Wake up lingering close() */
5903
			/* Wake up lingering close() */
5715
			sk->sk_state_change(sk);
5904
			sk->sk_state_change(sk);
5716
			break;
5905
			break;
5906
		case TCP_CLOSE:
5907
			if (tp->mp_killed)
5908
				goto discard;
5717
		}
5909
		}
5718
5910
5719
		if (tp->linger2 < 0 ||
5911
		if (tp->linger2 < 0 ||
Lines 5727-5733 Link Here
5727
		tmo = tcp_fin_time(sk);
5919
		tmo = tcp_fin_time(sk);
5728
		if (tmo > TCP_TIMEWAIT_LEN) {
5920
		if (tmo > TCP_TIMEWAIT_LEN) {
5729
			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5921
			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5730
		} else if (th->fin || sock_owned_by_user(sk)) {
5922
		} else if (th->fin || mptcp_is_data_fin(skb) ||
5923
			   sock_owned_by_user(sk)) {
5731
			/* Bad case. We could lose such FIN otherwise.
5924
			/* Bad case. We could lose such FIN otherwise.
5732
			 * It is not a big problem, but it looks confusing
5925
			 * It is not a big problem, but it looks confusing
5733
			 * and not so rare event. We still can lose it now,
5926
			 * and not so rare event. We still can lose it now,
Lines 5776-5782 Link Here
5776
		 */
5969
		 */
5777
		if (sk->sk_shutdown & RCV_SHUTDOWN) {
5970
		if (sk->sk_shutdown & RCV_SHUTDOWN) {
5778
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5971
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5779
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
5972
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
5973
			    !tp->mpc) {
5974
				/* In case of mptcp, the reset is handled by
5975
				 * mptcp_rcv_state_process */
5780
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5976
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5781
				tcp_reset(sk);
5977
				tcp_reset(sk);
5782
				return 1;
5978
				return 1;
(-)a/net/ipv4/tcp_ipv4.c (-46 / +169 lines)
Lines 67-72 Link Here
67
#include <net/icmp.h>
67
#include <net/icmp.h>
68
#include <net/inet_hashtables.h>
68
#include <net/inet_hashtables.h>
69
#include <net/tcp.h>
69
#include <net/tcp.h>
70
#include <net/mptcp.h>
71
#include <net/mptcp_v4.h>
70
#include <net/transp_v6.h>
72
#include <net/transp_v6.h>
71
#include <net/ipv6.h>
73
#include <net/ipv6.h>
72
#include <net/inet_common.h>
74
#include <net/inet_common.h>
Lines 99-105 Link Here
99
struct inet_hashinfo tcp_hashinfo;
101
struct inet_hashinfo tcp_hashinfo;
100
EXPORT_SYMBOL(tcp_hashinfo);
102
EXPORT_SYMBOL(tcp_hashinfo);
101
103
102
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
104
__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
103
{
105
{
104
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
105
					  ip_hdr(skb)->saddr,
107
					  ip_hdr(skb)->saddr,
Lines 333-339 Link Here
333
	struct inet_sock *inet;
335
	struct inet_sock *inet;
334
	const int type = icmp_hdr(icmp_skb)->type;
336
	const int type = icmp_hdr(icmp_skb)->type;
335
	const int code = icmp_hdr(icmp_skb)->code;
337
	const int code = icmp_hdr(icmp_skb)->code;
336
	struct sock *sk;
338
	struct sock *sk, *meta_sk;
337
	struct sk_buff *skb;
339
	struct sk_buff *skb;
338
	struct request_sock *req;
340
	struct request_sock *req;
339
	__u32 seq;
341
	__u32 seq;
Lines 357-369 Link Here
357
		return;
359
		return;
358
	}
360
	}
359
361
360
	bh_lock_sock(sk);
362
	tp = tcp_sk(sk);
363
	if (tp->mpc)
364
		meta_sk = mptcp_meta_sk(sk);
365
	else
366
		meta_sk = sk;
367
368
	bh_lock_sock(meta_sk);
361
	/* If too many ICMPs get dropped on busy
369
	/* If too many ICMPs get dropped on busy
362
	 * servers this needs to be solved differently.
370
	 * servers this needs to be solved differently.
363
	 * We do take care of PMTU discovery (RFC1191) special case :
371
	 * We do take care of PMTU discovery (RFC1191) special case :
364
	 * we can receive locally generated ICMP messages while socket is held.
372
	 * we can receive locally generated ICMP messages while socket is held.
365
	 */
373
	 */
366
	if (sock_owned_by_user(sk)) {
374
	if (sock_owned_by_user(meta_sk)) {
367
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
375
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
368
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
376
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
	}
377
	}
Lines 376-382 Link Here
376
	}
384
	}
377
385
378
	icsk = inet_csk(sk);
386
	icsk = inet_csk(sk);
379
	tp = tcp_sk(sk);
380
	req = tp->fastopen_rsk;
387
	req = tp->fastopen_rsk;
381
	seq = ntohl(th->seq);
388
	seq = ntohl(th->seq);
382
	if (sk->sk_state != TCP_LISTEN &&
389
	if (sk->sk_state != TCP_LISTEN &&
Lines 410-420 Link Here
410
				goto out;
417
				goto out;
411
418
412
			tp->mtu_info = info;
419
			tp->mtu_info = info;
413
			if (!sock_owned_by_user(sk)) {
420
			if (!sock_owned_by_user(meta_sk)) {
414
				tcp_v4_mtu_reduced(sk);
421
				tcp_v4_mtu_reduced(sk);
415
			} else {
422
			} else {
416
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
423
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
417
					sock_hold(sk);
424
					sock_hold(sk);
425
				if (tp->mpc)
426
					mptcp_tsq_flags(sk);
418
			}
427
			}
419
			goto out;
428
			goto out;
420
		}
429
		}
Lines 430-436 Link Here
430
439
431
		/* XXX (TFO) - revisit the following logic for TFO */
440
		/* XXX (TFO) - revisit the following logic for TFO */
432
441
433
		if (sock_owned_by_user(sk))
442
		if (sock_owned_by_user(meta_sk))
434
			break;
443
			break;
435
444
436
		icsk->icsk_backoff--;
445
		icsk->icsk_backoff--;
Lines 472-478 Link Here
472
	switch (sk->sk_state) {
481
	switch (sk->sk_state) {
473
		struct request_sock *req, **prev;
482
		struct request_sock *req, **prev;
474
	case TCP_LISTEN:
483
	case TCP_LISTEN:
475
		if (sock_owned_by_user(sk))
484
		if (sock_owned_by_user(meta_sk))
476
			goto out;
485
			goto out;
477
486
478
		req = inet_csk_search_req(sk, &prev, th->dest,
487
		req = inet_csk_search_req(sk, &prev, th->dest,
Lines 505-511 Link Here
505
			       It can f.e. if SYNs crossed,
514
			       It can f.e. if SYNs crossed,
506
			       or Fast Open.
515
			       or Fast Open.
507
			     */
516
			     */
508
		if (!sock_owned_by_user(sk)) {
517
		if (!sock_owned_by_user(meta_sk)) {
509
			sk->sk_err = err;
518
			sk->sk_err = err;
510
519
511
			sk->sk_error_report(sk);
520
			sk->sk_error_report(sk);
Lines 534-540 Link Here
534
	 */
543
	 */
535
544
536
	inet = inet_sk(sk);
545
	inet = inet_sk(sk);
537
	if (!sock_owned_by_user(sk) && inet->recverr) {
546
	if (!sock_owned_by_user(meta_sk) && inet->recverr) {
538
		sk->sk_err = err;
547
		sk->sk_err = err;
539
		sk->sk_error_report(sk);
548
		sk->sk_error_report(sk);
540
	} else	{ /* Only an error on timeout */
549
	} else	{ /* Only an error on timeout */
Lines 542-548 Link Here
542
	}
551
	}
543
552
544
out:
553
out:
545
	bh_unlock_sock(sk);
554
	bh_unlock_sock(meta_sk);
546
	sock_put(sk);
555
	sock_put(sk);
547
}
556
}
548
557
Lines 584-590 Link Here
584
 *	Exception: precedence violation. We do not implement it in any case.
593
 *	Exception: precedence violation. We do not implement it in any case.
585
 */
594
 */
586
595
587
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
596
void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
588
{
597
{
589
	const struct tcphdr *th = tcp_hdr(skb);
598
	const struct tcphdr *th = tcp_hdr(skb);
590
	struct {
599
	struct {
Lines 708-717 Link Here
708
   outside socket context is ugly, certainly. What can I do?
717
   outside socket context is ugly, certainly. What can I do?
709
 */
718
 */
710
719
711
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
720
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
712
			    u32 win, u32 tsval, u32 tsecr, int oif,
721
			    u32 win, u32 tsval, u32 tsecr, int oif,
713
			    struct tcp_md5sig_key *key,
722
			    struct tcp_md5sig_key *key,
714
			    int reply_flags, u8 tos)
723
			    int reply_flags, u8 tos, int mptcp)
715
{
724
{
716
	const struct tcphdr *th = tcp_hdr(skb);
725
	const struct tcphdr *th = tcp_hdr(skb);
717
	struct {
726
	struct {
Lines 720-725 Link Here
720
#ifdef CONFIG_TCP_MD5SIG
729
#ifdef CONFIG_TCP_MD5SIG
721
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
730
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
722
#endif
731
#endif
732
#ifdef CONFIG_MPTCP
733
			   + ((MPTCP_SUB_LEN_DSS >> 2) +
734
			      (MPTCP_SUB_LEN_ACK >> 2))
735
#endif
723
			];
736
			];
724
	} rep;
737
	} rep;
725
	struct ip_reply_arg arg;
738
	struct ip_reply_arg arg;
Lines 764-769 Link Here
764
				    ip_hdr(skb)->daddr, &rep.th);
777
				    ip_hdr(skb)->daddr, &rep.th);
765
	}
778
	}
766
#endif
779
#endif
780
#ifdef CONFIG_MPTCP
781
	if (mptcp) {
782
		int offset = (tsecr) ? 3 : 0;
783
		/* Construction of 32-bit data_ack */
784
		rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
785
					  ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
786
					  (0x20 << 8) |
787
					  (0x01));
788
		rep.opt[offset] = htonl(data_ack);
789
790
		arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
791
		rep.th.doff = arg.iov[0].iov_len / 4;
792
	}
793
#endif /* CONFIG_MPTCP */
794
767
	arg.flags = reply_flags;
795
	arg.flags = reply_flags;
768
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
796
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
769
				      ip_hdr(skb)->saddr, /* XXX */
797
				      ip_hdr(skb)->saddr, /* XXX */
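
The CONFIG_MPTCP block above appends a minimal DSS sub-option carrying only a 32-bit DATA_ACK to the stack-built reply and enlarges iov_len and the header's doff to match. The standalone sketch below reproduces the two 32-bit words being written, reading the 0x20 third byte as the DSS subtype nibble and 0x01 as the DATA_ACK-present flag; the option kind and the 4+4 byte sub-option lengths are assumptions stated in the code.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define TCPOPT_MPTCP       30   /* MPTCP option kind (assumed: the IANA value) */
/* Assumed sub-option sizes: 4-byte DSS header + 4-byte DATA_ACK. */
#define SUB_LEN_DSS         4
#define SUB_LEN_ACK         4

/* Builds the two 32-bit words the hunk writes into rep.opt[]:
 * word 0: kind | total length | DSS subtype (0x2 in the high nibble) |
 *         flags (0x01 = DATA_ACK present),
 * word 1: the 32-bit DATA_ACK itself. */
static void build_data_ack_option(uint32_t opt[2], uint32_t data_ack)
{
	opt[0] = htonl((TCPOPT_MPTCP << 24) |
		       ((SUB_LEN_DSS + SUB_LEN_ACK) << 16) |
		       (0x20 << 8) |
		       0x01);
	opt[1] = htonl(data_ack);
}

int main(void)
{
	uint32_t opt[2];

	build_data_ack_option(opt, 0x11223344u);
	printf("%08x %08x\n", ntohl(opt[0]), ntohl(opt[1]));
	/* prints: 1e082001 11223344 */
	return 0;
}
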
Lines 782-817 Link Here
782
{
810
{
783
	struct inet_timewait_sock *tw = inet_twsk(sk);
811
	struct inet_timewait_sock *tw = inet_twsk(sk);
784
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
812
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
813
	u32 data_ack = 0;
814
	int mptcp = 0;
815
816
	if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
817
		data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
818
		mptcp = 1;
819
	}
785
820
786
	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
821
	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
822
			data_ack,
787
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
823
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
788
			tcp_time_stamp + tcptw->tw_ts_offset,
824
			tcp_time_stamp + tcptw->tw_ts_offset,
789
			tcptw->tw_ts_recent,
825
			tcptw->tw_ts_recent,
790
			tw->tw_bound_dev_if,
826
			tw->tw_bound_dev_if,
791
			tcp_twsk_md5_key(tcptw),
827
			tcp_twsk_md5_key(tcptw),
792
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
828
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
793
			tw->tw_tos
829
			tw->tw_tos, mptcp
794
			);
830
			);
795
831
796
	inet_twsk_put(tw);
832
	inet_twsk_put(tw);
797
}
833
}
798
834
799
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
835
void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
800
				  struct request_sock *req)
836
			   struct request_sock *req)
801
{
837
{
802
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
838
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
803
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
839
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
804
	 */
840
	 */
805
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
841
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
806
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
842
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
807
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
843
			tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
808
			tcp_time_stamp,
844
			tcp_time_stamp,
809
			req->ts_recent,
845
			req->ts_recent,
810
			0,
846
			0,
811
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
847
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
812
					  AF_INET),
848
					  AF_INET),
813
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
849
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
814
			ip_hdr(skb)->tos);
850
			ip_hdr(skb)->tos, 0);
815
}
851
}
816
852
817
/*
853
/*
Lines 819-828 Link Here
819
 *	This still operates on a request_sock only, not on a big
855
 *	This still operates on a request_sock only, not on a big
820
 *	socket.
856
 *	socket.
821
 */
857
 */
822
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
858
int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
823
			      struct request_sock *req,
859
		       struct request_sock *req, u16 queue_mapping,
824
			      u16 queue_mapping,
860
		       bool nocache)
825
			      bool nocache)
826
{
861
{
827
	const struct inet_request_sock *ireq = inet_rsk(req);
862
	const struct inet_request_sock *ireq = inet_rsk(req);
828
	struct flowi4 fl4;
863
	struct flowi4 fl4;
Lines 850-856 Link Here
850
	return err;
885
	return err;
851
}
886
}
852
887
853
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
888
int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
854
{
889
{
855
	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
890
	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
856
891
Lines 862-868 Link Here
862
/*
897
/*
863
 *	IPv4 request_sock destructor.
898
 *	IPv4 request_sock destructor.
864
 */
899
 */
865
static void tcp_v4_reqsk_destructor(struct request_sock *req)
900
void tcp_v4_reqsk_destructor(struct request_sock *req)
866
{
901
{
867
	kfree(inet_rsk(req)->opt);
902
	kfree(inet_rsk(req)->opt);
868
}
903
}
Lines 902-908 Link Here
902
/*
937
/*
903
 * Save and compile IPv4 options into the request_sock if needed.
938
 * Save and compile IPv4 options into the request_sock if needed.
904
 */
939
 */
905
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
940
struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
906
{
941
{
907
	const struct ip_options *opt = &(IPCB(skb)->opt);
942
	const struct ip_options *opt = &(IPCB(skb)->opt);
908
	struct ip_options_rcu *dopt = NULL;
943
	struct ip_options_rcu *dopt = NULL;
Lines 1440-1445 Link Here
1440
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1475
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1441
{
1476
{
1442
	struct tcp_options_received tmp_opt;
1477
	struct tcp_options_received tmp_opt;
1478
	struct mptcp_options_received mopt;
1443
	struct request_sock *req;
1479
	struct request_sock *req;
1444
	struct inet_request_sock *ireq;
1480
	struct inet_request_sock *ireq;
1445
	struct tcp_sock *tp = tcp_sk(sk);
1481
	struct tcp_sock *tp = tcp_sk(sk);
Lines 1454-1459 Link Here
1454
	struct sk_buff *skb_synack;
1490
	struct sk_buff *skb_synack;
1455
	int do_fastopen;
1491
	int do_fastopen;
1456
1492
1493
	tcp_clear_options(&tmp_opt);
1494
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1495
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1496
	mptcp_init_mp_opt(&mopt);
1497
	tcp_parse_options(skb, &tmp_opt, &mopt, 0, want_cookie ? NULL : &foc);
1498
1499
#ifdef CONFIG_MPTCP
1500
	/* MPTCP structures not initialized, so clear MPTCP fields */
1501
	if  (mptcp_init_failed)
1502
		mptcp_init_mp_opt(&mopt);
1503
1504
	if (mopt.is_mp_join)
1505
		return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk));
1506
	if (mopt.drop_me)
1507
		goto drop;
1508
#endif
1457
	/* Never answer to SYNs send to broadcast or multicast */
1509
	/* Never answer to SYNs send to broadcast or multicast */
1458
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1510
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1459
		goto drop;
1511
		goto drop;
Lines 1478-1484 Link Here
1478
		goto drop;
1530
		goto drop;
1479
	}
1531
	}
1480
1532
1481
	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1533
#ifdef CONFIG_MPTCP
1534
	if (mopt.saw_mpc) {
1535
		req = inet_reqsk_alloc(&mptcp_request_sock_ops);
1536
1537
		if (!req)
1538
			goto drop;
1539
1540
		mptcp_rsk(req)->mpcb = NULL;
1541
		mptcp_rsk(req)->dss_csum = mopt.dss_csum;
1542
		mptcp_rsk(req)->collide_tk.pprev = NULL;
1543
	} else
1544
#endif
1545
		req = inet_reqsk_alloc(&tcp_request_sock_ops);
1546
1482
	if (!req)
1547
	if (!req)
1483
		goto drop;
1548
		goto drop;
1484
1549
Lines 1486-1502 Link Here
1486
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1551
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1487
#endif
1552
#endif
1488
1553
1489
	tcp_clear_options(&tmp_opt);
1490
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1491
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1492
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1493
1494
	if (want_cookie && !tmp_opt.saw_tstamp)
1554
	if (want_cookie && !tmp_opt.saw_tstamp)
1495
		tcp_clear_options(&tmp_opt);
1555
		tcp_clear_options(&tmp_opt);
1496
1556
1497
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1557
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1498
	tcp_openreq_init(req, &tmp_opt, skb);
1558
	tcp_openreq_init(req, &tmp_opt, skb);
1499
1559
1560
	if (mopt.saw_mpc)
1561
		mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb);
1562
1500
	ireq = inet_rsk(req);
1563
	ireq = inet_rsk(req);
1501
	ireq->loc_addr = daddr;
1564
	ireq->loc_addr = daddr;
1502
	ireq->rmt_addr = saddr;
1565
	ireq->rmt_addr = saddr;
Lines 1711-1717 Link Here
1711
}
1774
}
1712
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1775
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1713
1776
1714
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1777
struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1715
{
1778
{
1716
	struct tcphdr *th = tcp_hdr(skb);
1779
	struct tcphdr *th = tcp_hdr(skb);
1717
	const struct iphdr *iph = ip_hdr(skb);
1780
	const struct iphdr *iph = ip_hdr(skb);
Lines 1728-1735 Link Here
1728
1791
1729
	if (nsk) {
1792
	if (nsk) {
1730
		if (nsk->sk_state != TCP_TIME_WAIT) {
1793
		if (nsk->sk_state != TCP_TIME_WAIT) {
1794
			/* Don't lock again the meta-sk. It has been locked
1795
			 * before mptcp_v4_do_rcv.
1796
			 */
1797
			if (tcp_sk(nsk)->mpc && !is_meta_sk(sk))
1798
				bh_lock_sock(mptcp_meta_sk(nsk));
1731
			bh_lock_sock(nsk);
1799
			bh_lock_sock(nsk);
1800
1732
			return nsk;
1801
			return nsk;
1802
1733
		}
1803
		}
1734
		inet_twsk_put(inet_twsk(nsk));
1804
		inet_twsk_put(inet_twsk(nsk));
1735
		return NULL;
1805
		return NULL;
Lines 1786-1791 Link Here
1786
		goto discard;
1856
		goto discard;
1787
#endif
1857
#endif
1788
1858
1859
	if (is_meta_sk(sk))
1860
		return mptcp_v4_do_rcv(sk, skb);
1861
1789
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1862
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1790
		struct dst_entry *dst = sk->sk_rx_dst;
1863
		struct dst_entry *dst = sk->sk_rx_dst;
1791
1864
Lines 1920-1926 Link Here
1920
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1993
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1921
		wake_up_interruptible_sync_poll(sk_sleep(sk),
1994
		wake_up_interruptible_sync_poll(sk_sleep(sk),
1922
					   POLLIN | POLLRDNORM | POLLRDBAND);
1995
					   POLLIN | POLLRDNORM | POLLRDBAND);
1923
		if (!inet_csk_ack_scheduled(sk))
1996
		if (!inet_csk_ack_scheduled(sk) && !tp->mpc)
1924
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1997
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1925
						  (3 * tcp_rto_min(sk)) / 4,
1998
						  (3 * tcp_rto_min(sk)) / 4,
1926
						  TCP_RTO_MAX);
1999
						  TCP_RTO_MAX);
Lines 1937-1943 Link Here
1937
{
2010
{
1938
	const struct iphdr *iph;
2011
	const struct iphdr *iph;
1939
	const struct tcphdr *th;
2012
	const struct tcphdr *th;
1940
	struct sock *sk;
2013
	struct sock *sk, *meta_sk = NULL;
1941
	int ret;
2014
	int ret;
1942
	struct net *net = dev_net(skb->dev);
2015
	struct net *net = dev_net(skb->dev);
1943
2016
Lines 1970-1987 Link Here
1970
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2043
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1971
				    skb->len - th->doff * 4);
2044
				    skb->len - th->doff * 4);
1972
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2045
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2046
#ifdef CONFIG_MPTCP
2047
	TCP_SKB_CB(skb)->mptcp_flags = 0;
2048
	TCP_SKB_CB(skb)->dss_off = 0;
2049
#endif
1973
	TCP_SKB_CB(skb)->when	 = 0;
2050
	TCP_SKB_CB(skb)->when	 = 0;
1974
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2051
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1975
	TCP_SKB_CB(skb)->sacked	 = 0;
2052
	TCP_SKB_CB(skb)->sacked	 = 0;
1976
2053
1977
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2054
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1978
	if (!sk)
1979
		goto no_tcp_socket;
1980
2055
1981
process:
2056
process:
1982
	if (sk->sk_state == TCP_TIME_WAIT)
2057
	if (sk && sk->sk_state == TCP_TIME_WAIT)
1983
		goto do_time_wait;
2058
		goto do_time_wait;
1984
2059
2060
#ifdef CONFIG_MPTCP
2061
	if (!sk && th->syn && !th->ack) {
2062
		int ret = mptcp_lookup_join(skb, NULL);
2063
2064
		if (ret < 0) {
2065
			tcp_v4_send_reset(NULL, skb);
2066
			goto discard_it;
2067
		} else if (ret > 0) {
2068
			return 0;
2069
		}
2070
	}
2071
2072
	/* Is there a pending request sock for this segment ? */
2073
	if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
2074
		if (sk)
2075
			sock_put(sk);
2076
		return 0;
2077
	}
2078
#endif
2079
	if (!sk)
2080
		goto no_tcp_socket;
2081
1985
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2082
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1986
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2083
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1987
		goto discard_and_relse;
2084
		goto discard_and_relse;
Lines 1997-2007 Link Here
1997
	sk_mark_napi_id(sk, skb);
2094
	sk_mark_napi_id(sk, skb);
1998
	skb->dev = NULL;
2095
	skb->dev = NULL;
1999
2096
2000
	bh_lock_sock_nested(sk);
2097
	if (tcp_sk(sk)->mpc) {
2098
		meta_sk = mptcp_meta_sk(sk);
2099
2100
		bh_lock_sock_nested(meta_sk);
2101
		skb->sk = sk;
2102
	} else {
2103
		meta_sk = sk;
2104
		bh_lock_sock_nested(sk);
2105
	}
2106
2001
	ret = 0;
2107
	ret = 0;
2002
	if (!sock_owned_by_user(sk)) {
2108
	if (!sock_owned_by_user(meta_sk)) {
2003
#ifdef CONFIG_NET_DMA
2109
#ifdef CONFIG_NET_DMA
2004
		struct tcp_sock *tp = tcp_sk(sk);
2110
		struct tcp_sock *tp = tcp_sk(meta_sk);
2005
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2111
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2006
			tp->ucopy.dma_chan = net_dma_find_channel();
2112
			tp->ucopy.dma_chan = net_dma_find_channel();
2007
		if (tp->ucopy.dma_chan)
2113
		if (tp->ucopy.dma_chan)
Lines 2009-2024 Link Here
2009
		else
2115
		else
2010
#endif
2116
#endif
2011
		{
2117
		{
2012
			if (!tcp_prequeue(sk, skb))
2118
			if (!tcp_prequeue(meta_sk, skb))
2013
				ret = tcp_v4_do_rcv(sk, skb);
2119
				ret = tcp_v4_do_rcv(sk, skb);
2014
		}
2120
		}
2015
	} else if (unlikely(sk_add_backlog(sk, skb,
2121
	} else if (unlikely(sk_add_backlog(meta_sk, skb,
2016
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
2122
					   meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
2017
		bh_unlock_sock(sk);
2123
		bh_unlock_sock(meta_sk);
2018
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2124
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2019
		goto discard_and_relse;
2125
		goto discard_and_relse;
2020
	}
2126
	}
2021
	bh_unlock_sock(sk);
2127
	bh_unlock_sock(meta_sk);
2022
2128
2023
	sock_put(sk);
2129
	sock_put(sk);
2024
2130
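
From here on the receive path serialises on the connection-level socket: an MPTCP subflow locks mptcp_meta_sk(sk), tests sock_owned_by_user() on it, and backlogs against the meta socket's combined rcvbuf and sndbuf, while plain TCP keeps using the subflow socket. The pthread sketch below is only an analogy for that "pick which lock to take" discipline, not kernel code.

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-in: each "socket" has its own lock, and MPTCP subflows
 * additionally point at a shared meta socket. */
struct model_sock {
	pthread_mutex_t lock;
	struct model_sock *meta;   /* NULL for plain TCP */
	int backlog;
};

static struct model_sock *lock_target(struct model_sock *sk)
{
	/* Same decision as the hunk: serialise on the meta socket when the
	 * subflow belongs to an MPTCP connection. */
	return sk->meta ? sk->meta : sk;
}

static void rcv_one_segment(struct model_sock *sk, int owned_by_user)
{
	struct model_sock *target = lock_target(sk);

	pthread_mutex_lock(&target->lock);
	if (!owned_by_user)
		printf("process segment under the %s lock\n",
		       target == sk ? "subflow" : "meta");
	else
		target->backlog++;   /* models sk_add_backlog() on the meta */
	pthread_mutex_unlock(&target->lock);
}

int main(void)
{
	struct model_sock meta    = { PTHREAD_MUTEX_INITIALIZER, NULL,  0 };
	struct model_sock subflow = { PTHREAD_MUTEX_INITIALIZER, &meta, 0 };

	rcv_one_segment(&subflow, 0);
	rcv_one_segment(&subflow, 1);
	printf("meta backlog: %d\n", meta.backlog);
	return 0;
}
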
Lines 2073-2078 Link Here
2073
			sk = sk2;
2179
			sk = sk2;
2074
			goto process;
2180
			goto process;
2075
		}
2181
		}
2182
#ifdef CONFIG_MPTCP
2183
		if (th->syn && !th->ack) {
2184
			int ret = mptcp_lookup_join(skb, inet_twsk(sk));
2185
2186
			if (ret < 0) {
2187
				tcp_v4_send_reset(NULL, skb);
2188
				goto discard_it;
2189
			} else if (ret > 0) {
2190
				return 0;
2191
			}
2192
		}
2193
#endif
2076
		/* Fall through to ACK */
2194
		/* Fall through to ACK */
2077
	}
2195
	}
2078
	case TCP_TW_ACK:
2196
	case TCP_TW_ACK:
Lines 2155-2160 Link Here
2155
2273
2156
	tcp_cleanup_congestion_control(sk);
2274
	tcp_cleanup_congestion_control(sk);
2157
2275
2276
	if (tp->mpc)
2277
		mptcp_destroy_sock(sk);
2278
	if (tp->inside_tk_table)
2279
		mptcp_hash_remove(tp);
2280
2158
	/* Cleanup up the write buffer. */
2281
	/* Cleanup up the write buffer. */
2159
	tcp_write_queue_purge(sk);
2282
	tcp_write_queue_purge(sk);
2160
2283
(-)a/net/ipv4/tcp_minisocks.c (-9 / +101 lines)
Lines 18-28 Link Here
18
 *		Jorge Cwik, <jorge@laser.satlink.net>
18
 *		Jorge Cwik, <jorge@laser.satlink.net>
19
 */
19
 */
20
20
21
#include <linux/kconfig.h>
21
#include <linux/mm.h>
22
#include <linux/mm.h>
22
#include <linux/module.h>
23
#include <linux/module.h>
23
#include <linux/slab.h>
24
#include <linux/slab.h>
24
#include <linux/sysctl.h>
25
#include <linux/sysctl.h>
25
#include <linux/workqueue.h>
26
#include <linux/workqueue.h>
27
#include <net/mptcp.h>
26
#include <net/tcp.h>
28
#include <net/tcp.h>
27
#include <net/inet_common.h>
29
#include <net/inet_common.h>
28
#include <net/xfrm.h>
30
#include <net/xfrm.h>
Lines 95-104 Link Here
95
	struct tcp_options_received tmp_opt;
97
	struct tcp_options_received tmp_opt;
96
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
98
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
97
	bool paws_reject = false;
99
	bool paws_reject = false;
100
	struct mptcp_options_received mopt;
98
101
99
	tmp_opt.saw_tstamp = 0;
102
	tmp_opt.saw_tstamp = 0;
100
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
103
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
101
		tcp_parse_options(skb, &tmp_opt, 0, NULL);
104
		mptcp_init_mp_opt(&mopt);
105
106
		tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
102
107
103
		if (tmp_opt.saw_tstamp) {
108
		if (tmp_opt.saw_tstamp) {
104
			tmp_opt.rcv_tsecr	-= tcptw->tw_ts_offset;
109
			tmp_opt.rcv_tsecr	-= tcptw->tw_ts_offset;
Lines 106-111 Link Here
106
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
111
			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
107
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
112
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
108
		}
113
		}
114
115
		if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
116
			if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
117
				goto kill_with_rst;
118
		}
109
	}
119
	}
110
120
111
	if (tw->tw_substate == TCP_FIN_WAIT2) {
121
	if (tw->tw_substate == TCP_FIN_WAIT2) {
Lines 128-133 Link Here
128
		if (!th->ack ||
138
		if (!th->ack ||
129
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
139
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
130
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
140
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
141
			/* If mptcp_is_data_fin() returns true, we are sure that
142
			 * mopt has been initialized - otherwise it would not
143
			 * be a DATA_FIN.
144
			 */
145
			if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
146
			    mptcp_is_data_fin(skb) &&
147
			    TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
148
			    mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
149
				return TCP_TW_ACK;
150
131
			inet_twsk_put(tw);
151
			inet_twsk_put(tw);
132
			return TCP_TW_SUCCESS;
152
			return TCP_TW_SUCCESS;
133
		}
153
		}
Lines 159-164 Link Here
159
		else
179
		else
160
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
180
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
161
					   TCP_TIMEWAIT_LEN);
181
					   TCP_TIMEWAIT_LEN);
182
162
		return TCP_TW_ACK;
183
		return TCP_TW_ACK;
163
	}
184
	}
164
185
Lines 270-275 Link Here
270
	const struct tcp_sock *tp = tcp_sk(sk);
291
	const struct tcp_sock *tp = tcp_sk(sk);
271
	bool recycle_ok = false;
292
	bool recycle_ok = false;
272
293
294
	if (is_meta_sk(sk)) {
295
		mptcp_update_tw_socks(tp, state);
296
		goto tcp_done;
297
	}
298
273
	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
299
	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
274
		recycle_ok = tcp_remember_stamp(sk);
300
		recycle_ok = tcp_remember_stamp(sk);
275
301
Lines 290-295 Link Here
290
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
316
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
291
		tcptw->tw_ts_offset	= tp->tsoffset;
317
		tcptw->tw_ts_offset	= tp->tsoffset;
292
318
319
		if (tp->mpc) {
320
			if (mptcp_time_wait(sk, tcptw)) {
321
				inet_twsk_free(tw);
322
				goto exit;
323
			}
324
		} else {
325
			tcptw->mptcp_tw = NULL;
326
		}
327
293
#if IS_ENABLED(CONFIG_IPV6)
328
#if IS_ENABLED(CONFIG_IPV6)
294
		if (tw->tw_family == PF_INET6) {
329
		if (tw->tw_family == PF_INET6) {
295
			struct ipv6_pinfo *np = inet6_sk(sk);
330
			struct ipv6_pinfo *np = inet6_sk(sk);
Lines 349-363 Link Here
349
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
384
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
350
	}
385
	}
351
386
387
exit:
352
	tcp_update_metrics(sk);
388
	tcp_update_metrics(sk);
389
tcp_done:
353
	tcp_done(sk);
390
	tcp_done(sk);
354
}
391
}
355
392
356
void tcp_twsk_destructor(struct sock *sk)
393
void tcp_twsk_destructor(struct sock *sk)
357
{
394
{
358
#ifdef CONFIG_TCP_MD5SIG
359
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
395
	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
360
396
397
	if (twsk->mptcp_tw)
398
		mptcp_twsk_destructor(twsk);
399
#ifdef CONFIG_TCP_MD5SIG
361
	if (twsk->tw_md5_key)
400
	if (twsk->tw_md5_key)
362
		kfree_rcu(twsk->tw_md5_key, rcu);
401
		kfree_rcu(twsk->tw_md5_key, rcu);
363
#endif
402
#endif
Lines 394-399 Link Here
394
433
395
		newtp->snd_sml = newtp->snd_una =
434
		newtp->snd_sml = newtp->snd_una =
396
		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
435
		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
436
#ifdef CONFIG_MPTCP
437
		memset(&newtp->rcvq_space, 0, sizeof(newtp->rcvq_space));
438
#endif
397
439
398
		tcp_prequeue_init(newtp);
440
		tcp_prequeue_init(newtp);
399
		INIT_LIST_HEAD(&newtp->tsq_node);
441
		INIT_LIST_HEAD(&newtp->tsq_node);
Lines 468-473 Link Here
468
			newtp->rx_opt.ts_recent_stamp = 0;
510
			newtp->rx_opt.ts_recent_stamp = 0;
469
			newtp->tcp_header_len = sizeof(struct tcphdr);
511
			newtp->tcp_header_len = sizeof(struct tcphdr);
470
		}
512
		}
513
		if (treq->saw_mpc)
514
			newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
471
		newtp->tsoffset = 0;
515
		newtp->tsoffset = 0;
472
#ifdef CONFIG_TCP_MD5SIG
516
#ifdef CONFIG_TCP_MD5SIG
473
		newtp->md5sig_info = NULL;	/*XXX*/
517
		newtp->md5sig_info = NULL;	/*XXX*/
Lines 504-519 Link Here
504
			   bool fastopen)
548
			   bool fastopen)
505
{
549
{
506
	struct tcp_options_received tmp_opt;
550
	struct tcp_options_received tmp_opt;
551
	struct mptcp_options_received mopt;
507
	struct sock *child;
552
	struct sock *child;
508
	const struct tcphdr *th = tcp_hdr(skb);
553
	const struct tcphdr *th = tcp_hdr(skb);
509
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
554
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
510
	bool paws_reject = false;
555
	bool paws_reject = false;
511
556
512
	BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
557
	BUG_ON(!tcp_sk(sk)->mpc && fastopen == (sk->sk_state == TCP_LISTEN));
513
558
514
	tmp_opt.saw_tstamp = 0;
559
	tmp_opt.saw_tstamp = 0;
560
561
	mptcp_init_mp_opt(&mopt);
562
515
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
563
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
516
		tcp_parse_options(skb, &tmp_opt, 0, NULL);
564
		tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
517
565
518
		if (tmp_opt.saw_tstamp) {
566
		if (tmp_opt.saw_tstamp) {
519
			tmp_opt.ts_recent = req->ts_recent;
567
			tmp_opt.ts_recent = req->ts_recent;
Lines 552-558 Link Here
552
		 *
600
		 *
553
		 * Reset timer after retransmitting SYNACK, similar to
601
		 * Reset timer after retransmitting SYNACK, similar to
554
		 * the idea of fast retransmit in recovery.
602
		 * the idea of fast retransmit in recovery.
603
		 *
604
		 * Fall back to TCP if MP_CAPABLE is not set.
555
		 */
605
		 */
606
607
		if (tcp_rsk(req)->saw_mpc && !mopt.saw_mpc)
608
			tcp_rsk(req)->saw_mpc = false;
609
610
556
		if (!inet_rtx_syn_ack(sk, req))
611
		if (!inet_rtx_syn_ack(sk, req))
557
			req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
612
			req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
558
					   TCP_RTO_MAX) + jiffies;
613
					   TCP_RTO_MAX) + jiffies;
Lines 680-686 Link Here
680
735
681
	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
736
	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
682
	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
737
	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
683
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
738
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1 &&
739
	    /* TODO MPTCP:
740
	     * We do this here, because otherwise options sent in the third ack,
741
	     * or duplicate fourth ack will get lost. Options like MP_PRIO, ADD_ADDR,...
742
	     *
743
	     * We could store them in request_sock, but this would mean that we
744
	     * have to put tcp_options_received and mptcp_options_received in there,
745
	     * increasing considerably the size of the request-sock.
746
	     *
747
	     * As soon as we have reworked the request-sock MPTCP-fields and
748
	     * created a mptcp_request_sock structure, we can handle options
749
	     * correctly there without increasing request_sock.
750
	     */
751
	    !tcp_rsk(req)->saw_mpc) {
684
		inet_rsk(req)->acked = 1;
752
		inet_rsk(req)->acked = 1;
685
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
753
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
686
		return NULL;
754
		return NULL;
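
The hunk above widens the TCP_DEFER_ACCEPT check: a bare third ACK is only dropped for a plain-TCP request, because on an MP_CAPABLE request the options carried by that ACK (MP_PRIO, ADD_ADDR, ...) would otherwise be lost, as the TODO comment explains. A minimal standalone sketch of that predicate, using hypothetical stand-in fields rather than the kernel's request_sock, looks like this:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the request-sock state tested in the hunk above. */
struct req_state {
	int	num_timeout;	/* SYN-ACK retransmissions so far */
	int	defer_accept;	/* TCP_DEFER_ACCEPT threshold */
	bool	bare_ack;	/* the ACK carries no payload */
	bool	saw_mpc;	/* MP_CAPABLE was present on the SYN */
};

/* Drop the bare ACK only for plain TCP; an MPTCP request must process it. */
static bool defer_accept_drop(const struct req_state *req)
{
	return req->num_timeout < req->defer_accept &&
	       req->bare_ack &&
	       !req->saw_mpc;
}

int main(void)
{
	struct req_state plain = { 0, 3, true, false };
	struct req_state mptcp = { 0, 3, true, true };

	printf("plain TCP: drop=%d\n", defer_accept_drop(&plain));
	printf("MPTCP:     drop=%d\n", defer_accept_drop(&mptcp));
	return 0;
}
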
Lines 692-701 Link Here
692
	 * ESTABLISHED STATE. If it will be dropped after
760
	 * ESTABLISHED STATE. If it will be dropped after
693
	 * socket is created, wait for troubles.
761
	 * socket is created, wait for troubles.
694
	 */
762
	 */
695
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
763
#if defined(CONFIG_MPTCP)
764
	if (tcp_sk(sk)->mpc)
765
		/* MPTCP: We call the mptcp-specific syn_recv_sock */
766
		child = tcp_sk(sk)->mpcb->syn_recv_sock(sk, skb, req, NULL);
767
	else
768
#endif
769
		child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
770
				req, NULL);
771
696
	if (child == NULL)
772
	if (child == NULL)
697
		goto listen_overflow;
773
		goto listen_overflow;
698
774
775
	if (!is_meta_sk(sk)) {
776
		int ret = mptcp_check_req_master(sk, child, req, prev, &mopt);
777
		if (ret < 0)
778
			goto listen_overflow;
779
780
		/* MPTCP-supported */
781
		if (!ret)
782
			return tcp_sk(child)->mpcb->master_sk;
783
	} else {
784
		return mptcp_check_req_child(sk, child, req, prev, &mopt);
785
	}
699
	inet_csk_reqsk_queue_unlink(sk, req, prev);
786
	inet_csk_reqsk_queue_unlink(sk, req, prev);
700
	inet_csk_reqsk_queue_removed(sk, req);
787
	inet_csk_reqsk_queue_removed(sk, req);
701
788
Lines 745-752 Link Here
745
{
832
{
746
	int ret = 0;
833
	int ret = 0;
747
	int state = child->sk_state;
834
	int state = child->sk_state;
835
	struct sock *meta_sk = tcp_sk(child)->mpc ? mptcp_meta_sk(child) : child;
748
836
749
	if (!sock_owned_by_user(child)) {
837
	if (!sock_owned_by_user(meta_sk)) {
750
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
838
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
751
					    skb->len);
839
					    skb->len);
752
		/* Wakeup parent, send SIGIO */
840
		/* Wakeup parent, send SIGIO */
Lines 757-766 Link Here
757
		 * in main socket hash table and lock on listening
845
		 * in main socket hash table and lock on listening
758
		 * socket does not protect us more.
846
		 * socket does not protect us more.
759
		 */
847
		 */
760
		__sk_add_backlog(child, skb);
848
		if (tcp_sk(child)->mpc)
849
			skb->sk = child;
850
		__sk_add_backlog(meta_sk, skb);
761
	}
851
	}
762
852
763
	bh_unlock_sock(child);
853
	if (tcp_sk(child)->mpc)
854
		bh_unlock_sock(child);
855
	bh_unlock_sock(meta_sk);
764
	sock_put(child);
856
	sock_put(child);
765
	return ret;
857
	return ret;
766
}
858
}
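
The tcp_child_process() hunk above is one instance of a pattern this patch applies repeatedly (it shows up again in the delack and write timers further down): every subflow event is serialized against the MPTCP meta-socket, so both the sock_owned_by_user() test and the backlog move from the subflow to the meta. The toy model below is only an illustration of that rule; the types and helpers are stand-ins, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

struct toy_sock {
	bool		 owned_by_user;	/* models sock_owned_by_user() */
	bool		 is_mptcp;	/* models tp->mpc */
	struct toy_sock	*meta;		/* models mptcp_meta_sk() */
};

static struct toy_sock *lock_target(struct toy_sock *sk)
{
	/* Subflows take their locking decisions on the meta-socket. */
	return sk->is_mptcp ? sk->meta : sk;
}

static void handle_event(struct toy_sock *sk)
{
	struct toy_sock *meta_sk = lock_target(sk);

	if (!meta_sk->owned_by_user)
		printf("process now under the meta lock\n");
	else
		printf("queue on the meta backlog, defer to release_cb\n");
}

int main(void)
{
	struct toy_sock meta = { .owned_by_user = true };
	struct toy_sock sub  = { .is_mptcp = true, .meta = &meta };

	handle_event(&sub);		/* owner busy: deferred on the meta */
	meta.owned_by_user = false;
	handle_event(&sub);		/* meta free: handled immediately */
	return 0;
}
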
(-)a/net/ipv4/tcp_output.c (-78 / +225 lines)
Lines 36-41 Link Here
36
36
37
#define pr_fmt(fmt) "TCP: " fmt
37
#define pr_fmt(fmt) "TCP: " fmt
38
38
39
#include <net/mptcp.h>
40
#include <net/ipv6.h>
39
#include <net/tcp.h>
41
#include <net/tcp.h>
40
42
41
#include <linux/compiler.h>
43
#include <linux/compiler.h>
Lines 69-75 Link Here
69
			   int push_one, gfp_t gfp);
71
			   int push_one, gfp_t gfp);
70
72
71
/* Account for new data that has been sent to the network. */
73
/* Account for new data that has been sent to the network. */
72
static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
74
void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
73
{
75
{
74
	struct inet_connection_sock *icsk = inet_csk(sk);
76
	struct inet_connection_sock *icsk = inet_csk(sk);
75
	struct tcp_sock *tp = tcp_sk(sk);
77
	struct tcp_sock *tp = tcp_sk(sk);
Lines 208-216 Link Here
208
void tcp_select_initial_window(int __space, __u32 mss,
210
void tcp_select_initial_window(int __space, __u32 mss,
209
			       __u32 *rcv_wnd, __u32 *window_clamp,
211
			       __u32 *rcv_wnd, __u32 *window_clamp,
210
			       int wscale_ok, __u8 *rcv_wscale,
212
			       int wscale_ok, __u8 *rcv_wscale,
211
			       __u32 init_rcv_wnd)
213
			       __u32 init_rcv_wnd, const struct sock *sk)
212
{
214
{
213
	unsigned int space = (__space < 0 ? 0 : __space);
215
	unsigned int space;
216
217
	if (tcp_sk(sk)->mpc)
218
		mptcp_select_initial_window(&__space, window_clamp, sk);
219
220
	space = (__space < 0 ? 0 : __space);
214
221
215
	/* If no clamp set the clamp to the max possible scaled window */
222
	/* If no clamp set the clamp to the max possible scaled window */
216
	if (*window_clamp == 0)
223
	if (*window_clamp == 0)
Lines 266-272 Link Here
266
static u16 tcp_select_window(struct sock *sk)
273
static u16 tcp_select_window(struct sock *sk)
267
{
274
{
268
	struct tcp_sock *tp = tcp_sk(sk);
275
	struct tcp_sock *tp = tcp_sk(sk);
269
	u32 cur_win = tcp_receive_window(tp);
276
	/* The window must never shrink at the meta-level. At the subflow we
277
	 * have to allow this. Otherwise we may announce a window too large
278
	 * for the current meta-level sk_rcvbuf.
279
	 */
280
	u32 cur_win = tcp_receive_window(tp->mpc ? tcp_sk(mptcp_meta_sk(sk)) : tp);
270
	u32 new_win = __tcp_select_window(sk);
281
	u32 new_win = __tcp_select_window(sk);
271
282
272
	/* Never shrink the offered window */
283
	/* Never shrink the offered window */
Lines 280-285 Link Here
280
		 */
291
		 */
281
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
292
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
282
	}
293
	}
294
295
	if (tp->mpc) {
296
		mptcp_meta_tp(tp)->rcv_wnd = new_win;
297
		mptcp_meta_tp(tp)->rcv_wup = mptcp_meta_tp(tp)->rcv_nxt;
298
	}
299
283
	tp->rcv_wnd = new_win;
300
	tp->rcv_wnd = new_win;
284
	tp->rcv_wup = tp->rcv_nxt;
301
	tp->rcv_wup = tp->rcv_nxt;
285
302
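
The tcp_select_window() hunks above evaluate the current window against the meta-socket when tp->mpc is set and keep the long-standing rule that the offered window may never shrink; the chosen value is then mirrored into the meta-level rcv_wnd and rcv_wup. A simplified, self-contained sketch of the shrink check itself (window scaling handled as in the context lines; this is an illustration, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Round x up to a multiple of a (a must be a power of two), like ALIGN(). */
static uint32_t align_up(uint32_t x, uint32_t a)
{
	return (x + a - 1) & ~(a - 1);
}

/* Never shrink the offered window: if the freshly computed window is smaller
 * than the one currently advertised, keep advertising the current one,
 * rounded to the window-scale granularity. */
static uint32_t select_window(uint32_t cur_win, uint32_t new_win, int rcv_wscale)
{
	if (new_win < cur_win)
		new_win = align_up(cur_win, 1U << rcv_wscale);
	return new_win;
}

int main(void)
{
	/* wscale 7 means the advertised value has a 128-byte granularity */
	printf("%u\n", select_window(64000, 50000, 7));	/* stays at 64000 */
	printf("%u\n", select_window(64000, 70000, 7));	/* grows to 70000 */
	return 0;
}
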
Lines 358-364 Link Here
358
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
375
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
359
 * auto increment end seqno.
376
 * auto increment end seqno.
360
 */
377
 */
361
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
378
void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
362
{
379
{
363
	skb->ip_summed = CHECKSUM_PARTIAL;
380
	skb->ip_summed = CHECKSUM_PARTIAL;
364
	skb->csum = 0;
381
	skb->csum = 0;
Lines 376-382 Link Here
376
	TCP_SKB_CB(skb)->end_seq = seq;
393
	TCP_SKB_CB(skb)->end_seq = seq;
377
}
394
}
378
395
379
static inline bool tcp_urg_mode(const struct tcp_sock *tp)
396
bool tcp_urg_mode(const struct tcp_sock *tp)
380
{
397
{
381
	return tp->snd_una != tp->snd_up;
398
	return tp->snd_una != tp->snd_up;
382
}
399
}
Lines 386-402 Link Here
386
#define OPTION_MD5		(1 << 2)
403
#define OPTION_MD5		(1 << 2)
387
#define OPTION_WSCALE		(1 << 3)
404
#define OPTION_WSCALE		(1 << 3)
388
#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
405
#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
389
406
/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
390
struct tcp_out_options {
391
	u16 options;		/* bit field of OPTION_* */
392
	u16 mss;		/* 0 to disable */
393
	u8 ws;			/* window scale, 0 to disable */
394
	u8 num_sack_blocks;	/* number of SACK blocks to include */
395
	u8 hash_size;		/* bytes in hash_location */
396
	__u8 *hash_location;	/* temporary pointer, overloaded */
397
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
398
	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
399
};
400
407
401
/* Write previously computed TCP options to the packet.
408
/* Write previously computed TCP options to the packet.
402
 *
409
 *
Lines 412-418 Link Here
412
 * (but it may well be that other scenarios fail similarly).
419
 * (but it may well be that other scenarios fail similarly).
413
 */
420
 */
414
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
421
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
415
			      struct tcp_out_options *opts)
422
			      struct tcp_out_options *opts, struct sk_buff *skb)
416
{
423
{
417
	u16 options = opts->options;	/* mungable copy */
424
	u16 options = opts->options;	/* mungable copy */
418
425
Lines 495-500 Link Here
495
		}
502
		}
496
		ptr += (foc->len + 3) >> 2;
503
		ptr += (foc->len + 3) >> 2;
497
	}
504
	}
505
506
	if (unlikely(OPTION_MPTCP & opts->options))
507
		mptcp_options_write(ptr, tp, opts, skb);
498
}
508
}
499
509
500
/* Compute TCP options for SYN packets. This is not the final
510
/* Compute TCP options for SYN packets. This is not the final
Lines 546-551 Link Here
546
		if (unlikely(!(OPTION_TS & opts->options)))
556
		if (unlikely(!(OPTION_TS & opts->options)))
547
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
557
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
548
	}
558
	}
559
	if (tp->request_mptcp || tp->mpc)
560
		mptcp_syn_options(sk, opts, &remaining);
549
561
550
	if (fastopen && fastopen->cookie.len >= 0) {
562
	if (fastopen && fastopen->cookie.len >= 0) {
551
		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
563
		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
Lines 619-624 Link Here
619
		}
631
		}
620
	}
632
	}
621
633
634
	if (tcp_rsk(req)->saw_mpc)
635
		mptcp_synack_options(req, opts, &remaining);
636
622
	return MAX_TCP_OPTION_SPACE - remaining;
637
	return MAX_TCP_OPTION_SPACE - remaining;
623
}
638
}
624
639
Lines 650-665 Link Here
650
		opts->tsecr = tp->rx_opt.ts_recent;
665
		opts->tsecr = tp->rx_opt.ts_recent;
651
		size += TCPOLEN_TSTAMP_ALIGNED;
666
		size += TCPOLEN_TSTAMP_ALIGNED;
652
	}
667
	}
668
	if (tp->mpc)
669
		mptcp_established_options(sk, skb, opts, &size);
653
670
654
	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
671
	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
655
	if (unlikely(eff_sacks)) {
672
	if (unlikely(eff_sacks)) {
656
		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
673
		const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
657
		opts->num_sack_blocks =
674
		if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
658
			min_t(unsigned int, eff_sacks,
675
			opts->num_sack_blocks = 0;
659
			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
676
		else
660
			      TCPOLEN_SACK_PERBLOCK);
677
			opts->num_sack_blocks =
661
		size += TCPOLEN_SACK_BASE_ALIGNED +
678
			    min_t(unsigned int, eff_sacks,
662
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
679
				  (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
680
				  TCPOLEN_SACK_PERBLOCK);
681
		if (opts->num_sack_blocks)
682
			size += TCPOLEN_SACK_BASE_ALIGNED +
683
			    opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
663
	}
684
	}
664
685
665
	return size;
686
	return size;
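
The tcp_established_options() hunk above reworks the SACK sizing so it copes with MPTCP options having already consumed part of the 40-byte TCP option space: if the remaining budget cannot even hold the SACK option header, no SACK blocks are advertised and nothing is added to the computed size. The standalone sketch below mirrors that guard; the option lengths are the usual TCP values, hard-coded here only for the sake of a runnable example.

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE		40	/* bytes of TCP option space */
#define TCPOLEN_SACK_BASE_ALIGNED	4
#define TCPOLEN_SACK_PERBLOCK		8

/* How many SACK blocks still fit after the other options took their share. */
static unsigned int sack_blocks_that_fit(unsigned int size_so_far,
					 unsigned int eff_sacks)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE - size_so_far;
	unsigned int blocks;

	if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
		return 0;

	blocks = (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
	return blocks < eff_sacks ? blocks : eff_sacks;
}

int main(void)
{
	/* 12 bytes of timestamps only: room for three SACK blocks */
	printf("%u\n", sack_blocks_that_fit(12, 4));
	/* timestamps plus a large MPTCP option: no room left for SACK */
	printf("%u\n", sack_blocks_that_fit(38, 4));
	return 0;
}
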
Lines 706-712 Link Here
706
	unsigned long flags;
727
	unsigned long flags;
707
	struct list_head *q, *n;
728
	struct list_head *q, *n;
708
	struct tcp_sock *tp;
729
	struct tcp_sock *tp;
709
	struct sock *sk;
730
	struct sock *sk, *meta_sk;
710
731
711
	local_irq_save(flags);
732
	local_irq_save(flags);
712
	list_splice_init(&tsq->head, &list);
733
	list_splice_init(&tsq->head, &list);
Lines 717-731 Link Here
717
		list_del(&tp->tsq_node);
738
		list_del(&tp->tsq_node);
718
739
719
		sk = (struct sock *)tp;
740
		sk = (struct sock *)tp;
720
		bh_lock_sock(sk);
741
		meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
742
		bh_lock_sock(meta_sk);
721
743
722
		if (!sock_owned_by_user(sk)) {
744
		if (!sock_owned_by_user(meta_sk)) {
723
			tcp_tsq_handler(sk);
745
			tcp_tsq_handler(sk);
746
			if (tp->mpc)
747
				tcp_tsq_handler(meta_sk);
724
		} else {
748
		} else {
725
			/* defer the work to tcp_release_cb() */
749
			/* defer the work to tcp_release_cb() */
726
			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
750
			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
751
752
			/* For MPTCP, we set the tsq-bit on the meta, and the
753
			 * subflow as we don't know if the limitation happened
754
			 * while inside mptcp_write_xmit or during tcp_write_xmit.
755
			 */
756
			if (tp->mpc) {
757
				set_bit(TCP_TSQ_DEFERRED, &tcp_sk(meta_sk)->tsq_flags);
758
				mptcp_tsq_flags(sk);
759
			}
727
		}
760
		}
728
		bh_unlock_sock(sk);
761
		bh_unlock_sock(meta_sk);
729
762
730
		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
763
		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
731
		sk_free(sk);
764
		sk_free(sk);
Lines 735-741 Link Here
735
#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
768
#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
736
			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
769
			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
737
			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
770
			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
738
			  (1UL << TCP_MTU_REDUCED_DEFERRED))
771
			  (1UL << TCP_MTU_REDUCED_DEFERRED) |	\
772
			  (1UL << MPTCP_SUB_DEFERRED))
773
739
/**
774
/**
740
 * tcp_release_cb - tcp release_sock() callback
775
 * tcp_release_cb - tcp release_sock() callback
741
 * @sk: socket
776
 * @sk: socket
Lines 771-776 Link Here
771
		sk->sk_prot->mtu_reduced(sk);
806
		sk->sk_prot->mtu_reduced(sk);
772
		__sock_put(sk);
807
		__sock_put(sk);
773
	}
808
	}
809
	if (flags & (1UL << MPTCP_SUB_DEFERRED))
810
		mptcp_tsq_sub_deferred(sk);
774
}
811
}
775
EXPORT_SYMBOL(tcp_release_cb);
812
EXPORT_SYMBOL(tcp_release_cb);
776
813
Lines 830-837 Link Here
830
 * We are working here with either a clone of the original
867
 * We are working here with either a clone of the original
831
 * SKB, or a fresh unique copy made by the retransmit engine.
868
 * SKB, or a fresh unique copy made by the retransmit engine.
832
 */
869
 */
833
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
870
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
834
			    gfp_t gfp_mask)
871
		        gfp_t gfp_mask)
835
{
872
{
836
	const struct inet_connection_sock *icsk = inet_csk(sk);
873
	const struct inet_connection_sock *icsk = inet_csk(sk);
837
	struct inet_sock *inet;
874
	struct inet_sock *inet;
Lines 851-856 Link Here
851
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
888
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
852
		__net_timestamp(skb);
889
		__net_timestamp(skb);
853
890
891
	tp = tcp_sk(sk);
892
854
	if (likely(clone_it)) {
893
	if (likely(clone_it)) {
855
		const struct sk_buff *fclone = skb + 1;
894
		const struct sk_buff *fclone = skb + 1;
856
895
Lines 859-874 Link Here
859
			NET_INC_STATS_BH(sock_net(sk),
898
			NET_INC_STATS_BH(sock_net(sk),
860
					 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
899
					 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
861
900
862
		if (unlikely(skb_cloned(skb)))
901
		if (unlikely(skb_cloned(skb))) {
863
			skb = pskb_copy(skb, gfp_mask);
902
			struct sk_buff *newskb;
864
		else
903
			if (mptcp_is_data_seq(skb))
904
				skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN +
905
					      MPTCP_SUB_LEN_ACK_ALIGN +
906
					      MPTCP_SUB_LEN_SEQ_ALIGN);
907
908
			newskb = pskb_copy(skb, gfp_mask);
909
910
			if (mptcp_is_data_seq(skb)) {
911
				skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN +
912
					      MPTCP_SUB_LEN_ACK_ALIGN +
913
					      MPTCP_SUB_LEN_SEQ_ALIGN);
914
				if (newskb)
915
					skb_pull(newskb, MPTCP_SUB_LEN_DSS_ALIGN +
916
							 MPTCP_SUB_LEN_ACK_ALIGN +
917
							 MPTCP_SUB_LEN_SEQ_ALIGN);
918
			}
919
			skb = newskb;
920
		} else {
865
			skb = skb_clone(skb, gfp_mask);
921
			skb = skb_clone(skb, gfp_mask);
922
		}
866
		if (unlikely(!skb))
923
		if (unlikely(!skb))
867
			return -ENOBUFS;
924
			return -ENOBUFS;
868
	}
925
	}
869
926
870
	inet = inet_sk(sk);
927
	inet = inet_sk(sk);
871
	tp = tcp_sk(sk);
872
	tcb = TCP_SKB_CB(skb);
928
	tcb = TCP_SKB_CB(skb);
873
	memset(&opts, 0, sizeof(opts));
929
	memset(&opts, 0, sizeof(opts));
874
930
Lines 927-933 Link Here
927
		}
983
		}
928
	}
984
	}
929
985
930
	tcp_options_write((__be32 *)(th + 1), tp, &opts);
986
	tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
931
	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
987
	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
932
		TCP_ECN_send(sk, skb, tcp_header_size);
988
		TCP_ECN_send(sk, skb, tcp_header_size);
933
989
Lines 966-972 Link Here
966
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
1022
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
967
 * otherwise socket can stall.
1023
 * otherwise socket can stall.
968
 */
1024
 */
969
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1025
void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
970
{
1026
{
971
	struct tcp_sock *tp = tcp_sk(sk);
1027
	struct tcp_sock *tp = tcp_sk(sk);
972
1028
Lines 979-989 Link Here
979
}
1035
}
980
1036
981
/* Initialize TSO segments for a packet. */
1037
/* Initialize TSO segments for a packet. */
982
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
1038
void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
983
				 unsigned int mss_now)
1039
			  unsigned int mss_now)
984
{
1040
{
985
	if (skb->len <= mss_now || !sk_can_gso(sk) ||
1041
	if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
986
	    skb->ip_summed == CHECKSUM_NONE) {
1042
	    (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
987
		/* Avoid the costly divide in the normal
1043
		/* Avoid the costly divide in the normal
988
		 * non-TSO case.
1044
		 * non-TSO case.
989
		 */
1045
		 */
Lines 1015-1021 Link Here
1015
/* Pcount in the middle of the write queue got changed, we need to do various
1071
/* Pcount in the middle of the write queue got changed, we need to do various
1016
 * tweaks to fix counters
1072
 * tweaks to fix counters
1017
 */
1073
 */
1018
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1074
void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1019
{
1075
{
1020
	struct tcp_sock *tp = tcp_sk(sk);
1076
	struct tcp_sock *tp = tcp_sk(sk);
1021
1077
Lines 1056-1061 Link Here
1056
	int nlen;
1112
	int nlen;
1057
	u8 flags;
1113
	u8 flags;
1058
1114
1115
	if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb))
1116
		mptcp_fragment(sk, skb, len, mss_now, 0);
1117
1059
	if (WARN_ON(len > skb->len))
1118
	if (WARN_ON(len > skb->len))
1060
		return -EINVAL;
1119
		return -EINVAL;
1061
1120
Lines 1140-1146 Link Here
1140
 * eventually). The difference is that pulled data not copied, but
1199
 * eventually). The difference is that pulled data not copied, but
1141
 * immediately discarded.
1200
 * immediately discarded.
1142
 */
1201
 */
1143
static void __pskb_trim_head(struct sk_buff *skb, int len)
1202
void __pskb_trim_head(struct sk_buff *skb, int len)
1144
{
1203
{
1145
	int i, k, eat;
1204
	int i, k, eat;
1146
1205
Lines 1179-1184 Link Here
1179
/* Remove acked data from a packet in the transmit queue. */
1238
/* Remove acked data from a packet in the transmit queue. */
1180
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1239
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1181
{
1240
{
1241
	if (tcp_sk(sk)->mpc && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
1242
		return mptcp_trim_head(sk, skb, len);
1243
1182
	if (skb_unclone(skb, GFP_ATOMIC))
1244
	if (skb_unclone(skb, GFP_ATOMIC))
1183
		return -ENOMEM;
1245
		return -ENOMEM;
1184
1246
Lines 1196-1201 Link Here
1196
	if (tcp_skb_pcount(skb) > 1)
1258
	if (tcp_skb_pcount(skb) > 1)
1197
		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
1259
		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
1198
1260
1261
#ifdef CONFIG_MPTCP
1262
	/* Some data got acked - we assume that the seq-number reached the dest.
1263
	 * Anyway, our MPTCP-option has been trimmed above - we lost it here.
1264
	 * Only remove the SEQ if the call does not come from a meta retransmit.
1265
	 */
1266
	if (tcp_sk(sk)->mpc && !is_meta_sk(sk))
1267
		TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
1268
#endif
1269
1199
	return 0;
1270
	return 0;
1200
}
1271
}
1201
1272
Lines 1355-1361 Link Here
1355
}
1426
}
1356
1427
1357
/* Congestion window validation. (RFC2861) */
1428
/* Congestion window validation. (RFC2861) */
1358
static void tcp_cwnd_validate(struct sock *sk)
1429
void tcp_cwnd_validate(struct sock *sk)
1359
{
1430
{
1360
	struct tcp_sock *tp = tcp_sk(sk);
1431
	struct tcp_sock *tp = tcp_sk(sk);
1361
1432
Lines 1386-1401 Link Here
1386
 * modulo only when the receiver window alone is the limiting factor or
1457
 * modulo only when the receiver window alone is the limiting factor or
1387
 * when we would be allowed to send the split-due-to-Nagle skb fully.
1458
 * when we would be allowed to send the split-due-to-Nagle skb fully.
1388
 */
1459
 */
1389
static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
1460
unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
1390
					unsigned int mss_now, unsigned int max_segs)
1461
				 unsigned int mss_now, unsigned int max_segs)
1391
{
1462
{
1392
	const struct tcp_sock *tp = tcp_sk(sk);
1463
	const struct tcp_sock *tp = tcp_sk(sk);
1464
	const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
1393
	u32 needed, window, max_len;
1465
	u32 needed, window, max_len;
1394
1466
1395
	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1467
	if (!tp->mpc)
1468
		window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1469
	else
1470
		/* We need to evaluate the available space in the sending window
1471
		 * at the subflow level. However, the subflow seq has not yet
1472
		 * been set. Nevertheless we know that the caller will set it to
1473
		 * write_seq.
1474
		 */
1475
		window = tcp_wnd_end(tp) - tp->write_seq;
1396
	max_len = mss_now * max_segs;
1476
	max_len = mss_now * max_segs;
1397
1477
1398
	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1478
	if (likely(max_len <= window && skb != tcp_write_queue_tail(meta_sk)))
1399
		return max_len;
1479
		return max_len;
1400
1480
1401
	needed = min(skb->len, window);
1481
	needed = min(skb->len, window);
Lines 1409-1421 Link Here
1409
/* Can at least one segment of SKB be sent right now, according to the
1489
/* Can at least one segment of SKB be sent right now, according to the
1410
 * congestion window rules?  If so, return how many segments are allowed.
1490
 * congestion window rules?  If so, return how many segments are allowed.
1411
 */
1491
 */
1412
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1492
unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1413
					 const struct sk_buff *skb)
1493
			   const struct sk_buff *skb)
1414
{
1494
{
1415
	u32 in_flight, cwnd;
1495
	u32 in_flight, cwnd;
1416
1496
1417
	/* Don't be strict about the congestion window for the final FIN.  */
1497
	/* Don't be strict about the congestion window for the final FIN.  */
1418
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
1498
	if (skb &&
1499
	    ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) &&
1419
	    tcp_skb_pcount(skb) == 1)
1500
	    tcp_skb_pcount(skb) == 1)
1420
		return 1;
1501
		return 1;
1421
1502
Lines 1431-1438 Link Here
1431
 * This must be invoked the first time we consider transmitting
1512
 * This must be invoked the first time we consider transmitting
1432
 * SKB onto the wire.
1513
 * SKB onto the wire.
1433
 */
1514
 */
1434
static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
1515
int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
1435
			     unsigned int mss_now)
1516
		      unsigned int mss_now)
1436
{
1517
{
1437
	int tso_segs = tcp_skb_pcount(skb);
1518
	int tso_segs = tcp_skb_pcount(skb);
1438
1519
Lines 1469-1476 Link Here
1469
/* Return true if the Nagle test allows this packet to be
1550
/* Return true if the Nagle test allows this packet to be
1470
 * sent now.
1551
 * sent now.
1471
 */
1552
 */
1472
static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1553
bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1473
				  unsigned int cur_mss, int nonagle)
1554
		    unsigned int cur_mss, int nonagle)
1474
{
1555
{
1475
	/* Nagle rule does not apply to frames, which sit in the middle of the
1556
	/* Nagle rule does not apply to frames, which sit in the middle of the
1476
	 * write_queue (they have no chances to get new data).
1557
	 * write_queue (they have no chances to get new data).
Lines 1482-1488 Link Here
1482
		return true;
1563
		return true;
1483
1564
1484
	/* Don't use the nagle rule for urgent data (or for the final FIN). */
1565
	/* Don't use the nagle rule for urgent data (or for the final FIN). */
1485
	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1566
	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
1567
	    mptcp_is_data_fin(skb))
1486
		return true;
1568
		return true;
1487
1569
1488
	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
1570
	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
Lines 1492-1500 Link Here
1492
}
1574
}
1493
1575
1494
/* Does at least the first segment of SKB fit into the send window? */
1576
/* Does at least the first segment of SKB fit into the send window? */
1495
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1577
bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1496
			     const struct sk_buff *skb,
1578
		      unsigned int cur_mss)
1497
			     unsigned int cur_mss)
1498
{
1579
{
1499
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1580
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1500
1581
Lines 1552-1557 Link Here
1552
	int nlen = skb->len - len;
1633
	int nlen = skb->len - len;
1553
	u8 flags;
1634
	u8 flags;
1554
1635
1636
	if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb))
1637
		mptso_fragment(sk, skb, len, mss_now, gfp, 0);
1638
1555
	/* All of a TSO frame must be composed of paged data.  */
1639
	/* All of a TSO frame must be composed of paged data.  */
1556
	if (skb->len != skb->data_len)
1640
	if (skb->len != skb->data_len)
1557
		return tcp_fragment(sk, skb, len, mss_now);
1641
		return tcp_fragment(sk, skb, len, mss_now);
Lines 1597-1625 Link Here
1597
 *
1681
 *
1598
 * This algorithm is from John Heffner.
1682
 * This algorithm is from John Heffner.
1599
 */
1683
 */
1600
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1684
bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1601
{
1685
{
1602
	struct tcp_sock *tp = tcp_sk(sk);
1686
	struct tcp_sock *tp = tcp_sk(sk);
1687
	struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
1688
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1603
	const struct inet_connection_sock *icsk = inet_csk(sk);
1689
	const struct inet_connection_sock *icsk = inet_csk(sk);
1604
	u32 send_win, cong_win, limit, in_flight;
1690
	u32 send_win, cong_win, limit, in_flight;
1605
	int win_divisor;
1691
	int win_divisor;
1606
1692
1607
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1693
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
1608
		goto send_now;
1694
		goto send_now;
1609
1695
1610
	if (icsk->icsk_ca_state != TCP_CA_Open)
1696
	if (icsk->icsk_ca_state != TCP_CA_Open)
1611
		goto send_now;
1697
		goto send_now;
1612
1698
1613
	/* Defer for less than two clock ticks. */
1699
	/* Defer for less than two clock ticks. */
1614
	if (tp->tso_deferred &&
1700
	if (meta_tp->tso_deferred &&
1615
	    (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
1701
	    (((u32)jiffies << 1) >> 1) - (meta_tp->tso_deferred >> 1) > 1)
1616
		goto send_now;
1702
		goto send_now;
1617
1703
1618
	in_flight = tcp_packets_in_flight(tp);
1704
	in_flight = tcp_packets_in_flight(tp);
1619
1705
1620
	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
1706
	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
1621
1707
1622
	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1708
	if (!tp->mpc)
1709
		send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1710
	else
1711
		/* We need to evaluate the available space in the sending window
1712
		 * at the subflow level. However, the subflow seq has not yet
1713
		 * been set. Nevertheless we know that the caller will set it to
1714
		 * write_seq.
1715
		 */
1716
		send_win = tcp_wnd_end(tp) - tp->write_seq;
1623
1717
1624
	/* From in_flight test above, we know that cwnd > in_flight.  */
1718
	/* From in_flight test above, we know that cwnd > in_flight.  */
1625
	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
1719
	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
Lines 1632-1638 Link Here
1632
		goto send_now;
1726
		goto send_now;
1633
1727
1634
	/* Middle in queue won't get any more data, full sendable already? */
1728
	/* Middle in queue won't get any more data, full sendable already? */
1635
	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1729
	if ((skb != tcp_write_queue_tail(meta_sk)) && (limit >= skb->len))
1636
		goto send_now;
1730
		goto send_now;
1637
1731
1638
	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1732
	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
Lines 1658-1670 Link Here
1658
	/* Ok, it looks like it is advisable to defer.
1752
	/* Ok, it looks like it is advisable to defer.
1659
	 * Do not rearm the timer if already set to not break TCP ACK clocking.
1753
	 * Do not rearm the timer if already set to not break TCP ACK clocking.
1660
	 */
1754
	 */
1661
	if (!tp->tso_deferred)
1755
	if (!meta_tp->tso_deferred)
1662
		tp->tso_deferred = 1 | (jiffies << 1);
1756
		meta_tp->tso_deferred = 1 | (jiffies << 1);
1663
1757
1664
	return true;
1758
	return true;
1665
1759
1666
send_now:
1760
send_now:
1667
	tp->tso_deferred = 0;
1761
	meta_tp->tso_deferred = 0;
1668
	return false;
1762
	return false;
1669
}
1763
}
1670
1764
Lines 1677-1683 Link Here
1677
 *         1 if a probe was sent,
1771
 *         1 if a probe was sent,
1678
 *         -1 otherwise
1772
 *         -1 otherwise
1679
 */
1773
 */
1680
static int tcp_mtu_probe(struct sock *sk)
1774
int tcp_mtu_probe(struct sock *sk)
1681
{
1775
{
1682
	struct tcp_sock *tp = tcp_sk(sk);
1776
	struct tcp_sock *tp = tcp_sk(sk);
1683
	struct inet_connection_sock *icsk = inet_csk(sk);
1777
	struct inet_connection_sock *icsk = inet_csk(sk);
Lines 1822-1827 Link Here
1822
	int cwnd_quota;
1916
	int cwnd_quota;
1823
	int result;
1917
	int result;
1824
1918
1919
	if (is_meta_sk(sk))
1920
		return mptcp_write_xmit(sk, mss_now, nonagle, push_one, gfp);
1921
1825
	sent_pkts = 0;
1922
	sent_pkts = 0;
1826
1923
1827
	if (!push_one) {
1924
	if (!push_one) {
Lines 2128-2133 Link Here
2128
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
2225
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
2129
	int window;
2226
	int window;
2130
2227
2228
	if (tp->mpc)
2229
		return __mptcp_select_window(sk);
2230
2131
	if (mss > full_space)
2231
	if (mss > full_space)
2132
		mss = full_space;
2232
		mss = full_space;
2133
2233
Lines 2258-2263 Link Here
2258
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2358
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2259
		return;
2359
		return;
2260
2360
2361
	/* Currently not supported for MPTCP - but it should be possible */
2362
	if (tp->mpc)
2363
		return;
2364
2261
	tcp_for_write_queue_from_safe(skb, tmp, sk) {
2365
	tcp_for_write_queue_from_safe(skb, tmp, sk) {
2262
		if (!tcp_can_collapse(sk, skb))
2366
		if (!tcp_can_collapse(sk, skb))
2263
			break;
2367
			break;
Lines 2367-2374 Link Here
2367
	 */
2471
	 */
2368
	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
2472
	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
2369
		     skb_headroom(skb) >= 0xFFFF)) {
2473
		     skb_headroom(skb) >= 0xFFFF)) {
2370
		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
2474
		struct sk_buff *nskb;
2371
						   GFP_ATOMIC);
2475
2476
		if (mptcp_is_data_seq(skb))
2477
			skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN +
2478
				      MPTCP_SUB_LEN_ACK_ALIGN +
2479
				      MPTCP_SUB_LEN_SEQ_ALIGN);
2480
2481
		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
2482
2483
		if (mptcp_is_data_seq(skb)) {
2484
			skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN +
2485
				      MPTCP_SUB_LEN_ACK_ALIGN +
2486
				      MPTCP_SUB_LEN_SEQ_ALIGN);
2487
			if (nskb)
2488
				skb_pull(nskb, MPTCP_SUB_LEN_DSS_ALIGN +
2489
					       MPTCP_SUB_LEN_ACK_ALIGN +
2490
					       MPTCP_SUB_LEN_SEQ_ALIGN);
2491
		}
2372
		return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2492
		return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2373
			      -ENOBUFS;
2493
			      -ENOBUFS;
2374
	} else {
2494
	} else {
Lines 2593-2598 Link Here
2593
{
2713
{
2594
	struct sk_buff *skb;
2714
	struct sk_buff *skb;
2595
2715
2716
	if (is_meta_sk(sk)) {
2717
		mptcp_send_active_reset(sk, priority);
2718
		return;
2719
	}
2720
2596
	/* NOTE: No TCP options attached and we never retransmit this. */
2721
	/* NOTE: No TCP options attached and we never retransmit this. */
2597
	skb = alloc_skb(MAX_TCP_HEADER, priority);
2722
	skb = alloc_skb(MAX_TCP_HEADER, priority);
2598
	if (!skb) {
2723
	if (!skb) {
Lines 2695-2708 Link Here
2695
		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2820
		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2696
			req->window_clamp = tcp_full_space(sk);
2821
			req->window_clamp = tcp_full_space(sk);
2697
2822
2698
		/* tcp_full_space because it is guaranteed to be the first packet */
2699
		tcp_select_initial_window(tcp_full_space(sk),
2823
		tcp_select_initial_window(tcp_full_space(sk),
2700
			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2824
			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
2825
			(tcp_rsk(req)->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
2701
			&req->rcv_wnd,
2826
			&req->rcv_wnd,
2702
			&req->window_clamp,
2827
			&req->window_clamp,
2703
			ireq->wscale_ok,
2828
			ireq->wscale_ok,
2704
			&rcv_wscale,
2829
			&rcv_wscale,
2705
			dst_metric(dst, RTAX_INITRWND));
2830
			dst_metric(dst, RTAX_INITRWND), sk);
2706
		ireq->rcv_wscale = rcv_wscale;
2831
		ireq->rcv_wscale = rcv_wscale;
2707
	}
2832
	}
2708
2833
Lines 2738-2744 Link Here
2738
2863
2739
	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2864
	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2740
	th->window = htons(min(req->rcv_wnd, 65535U));
2865
	th->window = htons(min(req->rcv_wnd, 65535U));
2741
	tcp_options_write((__be32 *)(th + 1), tp, &opts);
2866
	tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
2742
	th->doff = (tcp_header_size >> 2);
2867
	th->doff = (tcp_header_size >> 2);
2743
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
2868
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
2744
2869
Lines 2798-2804 Link Here
2798
				  &tp->window_clamp,
2923
				  &tp->window_clamp,
2799
				  sysctl_tcp_window_scaling,
2924
				  sysctl_tcp_window_scaling,
2800
				  &rcv_wscale,
2925
				  &rcv_wscale,
2801
				  dst_metric(dst, RTAX_INITRWND));
2926
				  dst_metric(dst, RTAX_INITRWND), sk);
2802
2927
2803
	tp->rx_opt.rcv_wscale = rcv_wscale;
2928
	tp->rx_opt.rcv_wscale = rcv_wscale;
2804
	tp->rcv_ssthresh = tp->rcv_wnd;
2929
	tp->rcv_ssthresh = tp->rcv_wnd;
Lines 2822-2827 Link Here
2822
	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2947
	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2823
	inet_csk(sk)->icsk_retransmits = 0;
2948
	inet_csk(sk)->icsk_retransmits = 0;
2824
	tcp_clear_retrans(tp);
2949
	tcp_clear_retrans(tp);
2950
2951
#ifdef CONFIG_MPTCP
2952
	if (mptcp_doit(sk)) {
2953
		if (is_master_tp(tp)) {
2954
			tp->request_mptcp = 1;
2955
			mptcp_connect_init(sk);
2956
		} else {
2957
			tp->mptcp->snt_isn = tp->write_seq;
2958
			tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
2959
		}
2960
	}
2961
#endif
2825
}
2962
}
2826
2963
2827
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
2964
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
Lines 3044-3049 Link Here
3044
	 */
3181
	 */
3045
	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3182
	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3046
	if (buff == NULL) {
3183
	if (buff == NULL) {
3184
3185
		/* MPTCP: We don't send a delayed ack if we are sending an mptcp
3186
		 * ADD_ADDR ack to avoid sending multiple ADD_ADDR acks for the
3187
		 * same address. */
3188
		if (tcp_sk(sk)->mptcp_add_addr_ack == 1)
3189
			return;
3190
3047
		inet_csk_schedule_ack(sk);
3191
		inet_csk_schedule_ack(sk);
3048
		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3192
		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3049
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3193
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
Lines 3071-3077 Link Here
3071
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
3215
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
3072
 * out-of-date with SND.UNA-1 to probe window.
3216
 * out-of-date with SND.UNA-1 to probe window.
3073
 */
3217
 */
3074
static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3218
int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3075
{
3219
{
3076
	struct tcp_sock *tp = tcp_sk(sk);
3220
	struct tcp_sock *tp = tcp_sk(sk);
3077
	struct sk_buff *skb;
3221
	struct sk_buff *skb;
Lines 3110-3115 Link Here
3110
	if (sk->sk_state == TCP_CLOSE)
3254
	if (sk->sk_state == TCP_CLOSE)
3111
		return -1;
3255
		return -1;
3112
3256
3257
	if (is_meta_sk(sk))
3258
		return mptcp_write_wakeup(sk);
3259
3113
	if ((skb = tcp_send_head(sk)) != NULL &&
3260
	if ((skb = tcp_send_head(sk)) != NULL &&
3114
	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3261
	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3115
		int err;
3262
		int err;
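
Several hunks in this file, and the timer changes that follow in tcp_timer.c, rely on the deferred-work mechanism of tcp_release_cb(): a timer or tasklet that finds the (meta-)socket owned by the user sets a bit in tsq_flags, and the patch adds one more such bit, MPTCP_SUB_DEFERRED, to TCP_DEFERRED_ALL. The sketch below only models the dispatch idea; the bit positions and names are illustrative, not the kernel's definitions.

#include <stdio.h>

enum {
	TSQ_DEFERRED		= 1 << 0,
	WRITE_TIMER_DEFERRED	= 1 << 1,
	DELACK_TIMER_DEFERRED	= 1 << 2,
	MTU_REDUCED_DEFERRED	= 1 << 3,
	MPTCP_SUB_DEFERRED	= 1 << 4,	/* the bit this patch adds */
};

#define DEFERRED_ALL (TSQ_DEFERRED | WRITE_TIMER_DEFERRED | \
		      DELACK_TIMER_DEFERRED | MTU_REDUCED_DEFERRED | \
		      MPTCP_SUB_DEFERRED)

/* Run whatever was deferred while the socket lock was held by the user.
 * The kernel clears the flags atomically; a plain read-modify-write is
 * enough for this illustration. */
static void release_cb(unsigned int *tsq_flags)
{
	unsigned int flags = *tsq_flags & DEFERRED_ALL;

	*tsq_flags &= ~flags;

	if (flags & TSQ_DEFERRED)
		printf("run the small-queues handler\n");
	if (flags & MPTCP_SUB_DEFERRED)
		printf("run the deferred MPTCP subflow work\n");
}

int main(void)
{
	unsigned int tsq_flags = TSQ_DEFERRED | MPTCP_SUB_DEFERRED;

	release_cb(&tsq_flags);
	return 0;
}
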
(-)a/net/ipv4/tcp_timer.c (-23 / +47 lines)
Lines 20-25 Link Here
20
20
21
#include <linux/module.h>
21
#include <linux/module.h>
22
#include <linux/gfp.h>
22
#include <linux/gfp.h>
23
#include <net/mptcp.h>
23
#include <net/tcp.h>
24
#include <net/tcp.h>
24
25
25
int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
26
int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
Lines 32-38 Link Here
32
int sysctl_tcp_orphan_retries __read_mostly;
33
int sysctl_tcp_orphan_retries __read_mostly;
33
int sysctl_tcp_thin_linear_timeouts __read_mostly;
34
int sysctl_tcp_thin_linear_timeouts __read_mostly;
34
35
35
static void tcp_write_err(struct sock *sk)
36
void tcp_write_err(struct sock *sk)
36
{
37
{
37
	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
38
	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
38
	sk->sk_error_report(sk);
39
	sk->sk_error_report(sk);
Lines 124-133 Link Here
124
 * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
125
 * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
125
 * syn_set flag is set.
126
 * syn_set flag is set.
126
 */
127
 */
127
static bool retransmits_timed_out(struct sock *sk,
128
bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
128
				  unsigned int boundary,
129
			   unsigned int timeout, bool syn_set)
129
				  unsigned int timeout,
130
				  bool syn_set)
131
{
130
{
132
	unsigned int linear_backoff_thresh, start_ts;
131
	unsigned int linear_backoff_thresh, start_ts;
133
	unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
132
	unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
Lines 153-159 Link Here
153
}
152
}
154
153
155
/* A write timeout has occurred. Process the after effects. */
154
/* A write timeout has occurred. Process the after effects. */
156
static int tcp_write_timeout(struct sock *sk)
155
int tcp_write_timeout(struct sock *sk)
157
{
156
{
158
	struct inet_connection_sock *icsk = inet_csk(sk);
157
	struct inet_connection_sock *icsk = inet_csk(sk);
159
	int retry_until;
158
	int retry_until;
Lines 164-169 Link Here
164
			dst_negative_advice(sk);
163
			dst_negative_advice(sk);
165
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
164
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
166
		syn_set = true;
165
		syn_set = true;
166
		/* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
167
		if (tcp_sk(sk)->request_mptcp &&
168
		    icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
169
			tcp_sk(sk)->request_mptcp = 0;
167
	} else {
170
	} else {
168
		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
171
		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
169
			/* Black hole detection */
172
			/* Black hole detection */
Lines 244-261 Link Here
244
static void tcp_delack_timer(unsigned long data)
247
static void tcp_delack_timer(unsigned long data)
245
{
248
{
246
	struct sock *sk = (struct sock *)data;
249
	struct sock *sk = (struct sock *)data;
250
	struct tcp_sock *tp = tcp_sk(sk);
251
	struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
247
252
248
	bh_lock_sock(sk);
253
	bh_lock_sock(meta_sk);
249
	if (!sock_owned_by_user(sk)) {
254
	if (!sock_owned_by_user(meta_sk)) {
250
		tcp_delack_timer_handler(sk);
255
		tcp_delack_timer_handler(sk);
251
	} else {
256
	} else {
252
		inet_csk(sk)->icsk_ack.blocked = 1;
257
		inet_csk(sk)->icsk_ack.blocked = 1;
253
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
258
		NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
254
		/* deleguate our work to tcp_release_cb() */
259
		/* deleguate our work to tcp_release_cb() */
255
		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
260
		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
256
			sock_hold(sk);
261
			sock_hold(sk);
262
		if (tp->mpc)
263
			mptcp_tsq_flags(sk);
257
	}
264
	}
258
	bh_unlock_sock(sk);
265
	bh_unlock_sock(meta_sk);
259
	sock_put(sk);
266
	sock_put(sk);
260
}
267
}
261
268
Lines 418-423 Link Here
418
425
419
	tcp_enter_loss(sk, 0);
426
	tcp_enter_loss(sk, 0);
420
427
428
	if (tp->mpc)
429
		mptcp_reinject_data(sk, 1);
430
421
	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
431
	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
422
		/* Retransmission failed because of local congestion,
432
		/* Retransmission failed because of local congestion,
423
		 * do not backoff.
433
		 * do not backoff.
Lines 468-473 Link Here
468
		/* Use normal (exponential) backoff */
478
		/* Use normal (exponential) backoff */
469
		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
479
		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
470
	}
480
	}
481
	mptcp_set_rto(sk);
471
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
482
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
472
	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
483
	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
473
		__sk_dst_reset(sk);
484
		__sk_dst_reset(sk);
Lines 499-505 Link Here
499
		break;
510
		break;
500
	case ICSK_TIME_RETRANS:
511
	case ICSK_TIME_RETRANS:
501
		icsk->icsk_pending = 0;
512
		icsk->icsk_pending = 0;
502
		tcp_retransmit_timer(sk);
513
		if (is_meta_sk(sk))
514
			mptcp_retransmit_timer(sk);
515
		else
516
			tcp_retransmit_timer(sk);
503
		break;
517
		break;
504
	case ICSK_TIME_PROBE0:
518
	case ICSK_TIME_PROBE0:
505
		icsk->icsk_pending = 0;
519
		icsk->icsk_pending = 0;
Lines 514-529 Link Here
514
static void tcp_write_timer(unsigned long data)
528
static void tcp_write_timer(unsigned long data)
515
{
529
{
516
	struct sock *sk = (struct sock *)data;
530
	struct sock *sk = (struct sock *)data;
531
	struct sock *meta_sk = tcp_sk(sk)->mpc ? mptcp_meta_sk(sk) : sk;
517
532
518
	bh_lock_sock(sk);
533
	bh_lock_sock(meta_sk);
519
	if (!sock_owned_by_user(sk)) {
534
	if (!sock_owned_by_user(meta_sk)) {
520
		tcp_write_timer_handler(sk);
535
		tcp_write_timer_handler(sk);
521
	} else {
536
	} else {
522
		/* deleguate our work to tcp_release_cb() */
537
		/* deleguate our work to tcp_release_cb() */
523
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
538
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
524
			sock_hold(sk);
539
			sock_hold(sk);
540
		if (tcp_sk(sk)->mpc)
541
			mptcp_tsq_flags(sk);
525
	}
542
	}
526
	bh_unlock_sock(sk);
543
	bh_unlock_sock(meta_sk);
527
	sock_put(sk);
544
	sock_put(sk);
528
}
545
}
529
546
Lines 548-553 Link Here
548
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
565
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
549
		return;
566
		return;
550
567
568
	if (is_meta_sk(sk)) {
569
		mptcp_set_keepalive(sk, val);
570
		return;
571
	}
572
551
	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
573
	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
552
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
574
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
553
	else if (!val)
575
	else if (!val)
Lines 560-580 Link Here
560
	struct sock *sk = (struct sock *) data;
582
	struct sock *sk = (struct sock *) data;
561
	struct inet_connection_sock *icsk = inet_csk(sk);
583
	struct inet_connection_sock *icsk = inet_csk(sk);
562
	struct tcp_sock *tp = tcp_sk(sk);
584
	struct tcp_sock *tp = tcp_sk(sk);
585
	struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
563
	u32 elapsed;
586
	u32 elapsed;
564
587
565
	/* Only process if socket is not in use. */
588
	/* Only process if socket is not in use. */
566
	bh_lock_sock(sk);
589
	bh_lock_sock(meta_sk);
567
	if (sock_owned_by_user(sk)) {
590
	if (sock_owned_by_user(meta_sk)) {
568
		/* Try again later. */
591
		/* Try again later. */
569
		inet_csk_reset_keepalive_timer (sk, HZ/20);
592
		inet_csk_reset_keepalive_timer (sk, HZ/20);
570
		goto out;
593
		goto out;
571
	}
594
	}
572
595
573
	if (sk->sk_state == TCP_LISTEN) {
574
		tcp_synack_timer(sk);
575
		goto out;
576
	}
577
578
	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
596
	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
579
		if (tp->linger2 >= 0) {
597
		if (tp->linger2 >= 0) {
580
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
598
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
Lines 588-594 Link Here
588
		goto death;
606
		goto death;
589
	}
607
	}
590
608
591
	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
609
	if (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) {
610
		tcp_synack_timer(sk);
611
		goto out;
612
	}
613
614
	/* MPTCP: Keepalive timers are handled at the subflow level */
615
	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE || is_meta_sk(sk))
592
		goto out;
616
		goto out;
593
617
594
	elapsed = keepalive_time_when(tp);
618
	elapsed = keepalive_time_when(tp);
Lines 636-642 Link Here
636
	tcp_done(sk);
660
	tcp_done(sk);
637
661
638
out:
662
out:
639
	bh_unlock_sock(sk);
663
	bh_unlock_sock(meta_sk);
640
	sock_put(sk);
664
	sock_put(sk);
641
}
665
}
642
666
(-)a/net/ipv6/af_inet6.c (-2 / +1 lines)
Lines 96-103 Link Here
96
	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
96
	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
97
}
97
}
98
98
99
static int inet6_create(struct net *net, struct socket *sock, int protocol,
99
int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
100
			int kern)
101
{
100
{
102
	struct inet_sock *inet;
101
	struct inet_sock *inet;
103
	struct ipv6_pinfo *np;
102
	struct ipv6_pinfo *np;
(-)a/net/ipv6/inet6_connection_sock.c (-2 / +2 lines)
Lines 96-103 Link Here
96
/*
96
/*
97
 * request_sock (formerly open request) hash tables.
97
 * request_sock (formerly open request) hash tables.
98
 */
98
 */
99
static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
99
u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
100
			   const u32 rnd, const u32 synq_hsize)
100
		    const u32 rnd, const u32 synq_hsize)
101
{
101
{
102
	u32 c;
102
	u32 c;
103
103
(-)a/net/ipv6/syncookies.c (-2 / +2 lines)
Lines 176-182 Link Here
176
176
177
	/* check for timestamp cookie support */
177
	/* check for timestamp cookie support */
178
	memset(&tcp_opt, 0, sizeof(tcp_opt));
178
	memset(&tcp_opt, 0, sizeof(tcp_opt));
179
	tcp_parse_options(skb, &tcp_opt, 0, NULL);
179
	tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
180
180
181
	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
181
	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
182
		goto out;
182
		goto out;
Lines 252-258 Link Here
252
	tcp_select_initial_window(tcp_full_space(sk), req->mss,
252
	tcp_select_initial_window(tcp_full_space(sk), req->mss,
253
				  &req->rcv_wnd, &req->window_clamp,
253
				  &req->rcv_wnd, &req->window_clamp,
254
				  ireq->wscale_ok, &rcv_wscale,
254
				  ireq->wscale_ok, &rcv_wscale,
255
				  dst_metric(dst, RTAX_INITRWND));
255
				  dst_metric(dst, RTAX_INITRWND), sk);
256
256
257
	ireq->rcv_wscale = rcv_wscale;
257
	ireq->rcv_wscale = rcv_wscale;
258
258
(-)a/net/ipv6/tcp_ipv6.c (-70 / +179 lines)
Lines 64-69 Link Here
64
#include <net/secure_seq.h>
64
#include <net/secure_seq.h>
65
#include <net/tcp_memcontrol.h>
65
#include <net/tcp_memcontrol.h>
66
#include <net/busy_poll.h>
66
#include <net/busy_poll.h>
67
#include <net/mptcp.h>
68
#include <net/mptcp_v6.h>
67
69
68
#include <asm/uaccess.h>
70
#include <asm/uaccess.h>
69
71
Lines 73-86 Link Here
73
#include <linux/crypto.h>
75
#include <linux/crypto.h>
74
#include <linux/scatterlist.h>
76
#include <linux/scatterlist.h>
75
77
76
static void	tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
77
static void	tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
78
				      struct request_sock *req);
79
80
static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
81
82
static const struct inet_connection_sock_af_ops ipv6_mapped;
83
static const struct inet_connection_sock_af_ops ipv6_specific;
84
#ifdef CONFIG_TCP_MD5SIG
78
#ifdef CONFIG_TCP_MD5SIG
85
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
79
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
86
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
80
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
Lines 92-98 Link Here
92
}
86
}
93
#endif
87
#endif
94
88
95
static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
89
void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
96
{
90
{
97
	struct dst_entry *dst = skb_dst(skb);
91
	struct dst_entry *dst = skb_dst(skb);
98
	const struct rt6_info *rt = (const struct rt6_info *)dst;
92
	const struct rt6_info *rt = (const struct rt6_info *)dst;
Lines 104-110 Link Here
104
		inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
98
		inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
105
}
99
}
106
100
107
static void tcp_v6_hash(struct sock *sk)
101
void tcp_v6_hash(struct sock *sk)
108
{
102
{
109
	if (sk->sk_state != TCP_CLOSE) {
103
	if (sk->sk_state != TCP_CLOSE) {
110
		if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
104
		if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
Lines 117-123 Link Here
117
	}
111
	}
118
}
112
}
119
113
120
static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
114
__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
121
{
115
{
122
	return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
116
	return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
123
					    ipv6_hdr(skb)->saddr.s6_addr32,
117
					    ipv6_hdr(skb)->saddr.s6_addr32,
Lines 125-131 Link Here
125
					    tcp_hdr(skb)->source);
119
					    tcp_hdr(skb)->source);
126
}
120
}
127
121
128
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
122
int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
129
			  int addr_len)
123
			  int addr_len)
130
{
124
{
131
	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
125
	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
Lines 340-346 Link Here
340
	const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data;
334
	const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data;
341
	const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
335
	const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
342
	struct ipv6_pinfo *np;
336
	struct ipv6_pinfo *np;
343
	struct sock *sk;
337
	struct sock *sk, *meta_sk;
344
	int err;
338
	int err;
345
	struct tcp_sock *tp;
339
	struct tcp_sock *tp;
346
	__u32 seq;
340
	__u32 seq;
Lines 360-367 Link Here
360
		return;
354
		return;
361
	}
355
	}
362
356
363
	bh_lock_sock(sk);
357
	tp = tcp_sk(sk);
364
	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
358
	if (tp->mpc)
359
		meta_sk = mptcp_meta_sk(sk);
360
	else
361
		meta_sk = sk;
362
363
	bh_lock_sock(meta_sk);
364
	if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
365
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
365
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
366
366
367
	if (sk->sk_state == TCP_CLOSE)
367
	if (sk->sk_state == TCP_CLOSE)
Lines 372-378 Link Here
372
		goto out;
372
		goto out;
373
	}
373
	}
374
374
375
	tp = tcp_sk(sk);
376
	seq = ntohl(th->seq);
375
	seq = ntohl(th->seq);
377
	if (sk->sk_state != TCP_LISTEN &&
376
	if (sk->sk_state != TCP_LISTEN &&
378
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
377
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
Lines 399-409 Link Here
399
			goto out;
398
			goto out;
400
399
401
		tp->mtu_info = ntohl(info);
400
		tp->mtu_info = ntohl(info);
402
		if (!sock_owned_by_user(sk))
401
		if (!sock_owned_by_user(meta_sk))
403
			tcp_v6_mtu_reduced(sk);
402
			tcp_v6_mtu_reduced(sk);
404
		else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
403
		else {
404
			if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
405
					   &tp->tsq_flags))
405
					   &tp->tsq_flags))
406
			sock_hold(sk);
406
				sock_hold(sk);
407
			if (tp->mpc)
408
				mptcp_tsq_flags(sk);
409
		}
407
		goto out;
410
		goto out;
408
	}
411
	}
409
412
Lines 413-419 Link Here
413
	switch (sk->sk_state) {
416
	switch (sk->sk_state) {
414
		struct request_sock *req, **prev;
417
		struct request_sock *req, **prev;
415
	case TCP_LISTEN:
418
	case TCP_LISTEN:
416
		if (sock_owned_by_user(sk))
419
		if (sock_owned_by_user(meta_sk))
417
			goto out;
420
			goto out;
418
421
419
		req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
422
		req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
Lines 438-444 Link Here
438
	case TCP_SYN_SENT:
441
	case TCP_SYN_SENT:
439
	case TCP_SYN_RECV:  /* Cannot happen.
442
	case TCP_SYN_RECV:  /* Cannot happen.
440
			       It can, it SYNs are crossed. --ANK */
443
			       It can, it SYNs are crossed. --ANK */
441
		if (!sock_owned_by_user(sk)) {
444
		if (!sock_owned_by_user(meta_sk)) {
442
			sk->sk_err = err;
445
			sk->sk_err = err;
443
			sk->sk_error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
446
			sk->sk_error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
444
447
Lines 448-469 Link Here
448
		goto out;
451
		goto out;
449
	}
452
	}
450
453
451
	if (!sock_owned_by_user(sk) && np->recverr) {
454
	if (!sock_owned_by_user(meta_sk) && np->recverr) {
452
		sk->sk_err = err;
455
		sk->sk_err = err;
453
		sk->sk_error_report(sk);
456
		sk->sk_error_report(sk);
454
	} else
457
	} else
455
		sk->sk_err_soft = err;
458
		sk->sk_err_soft = err;
456
459
457
out:
460
out:
458
	bh_unlock_sock(sk);
461
	bh_unlock_sock(meta_sk);
459
	sock_put(sk);
462
	sock_put(sk);
460
}
463
}
461
464
462
465
463
static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
466
int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
464
			      struct flowi6 *fl6,
467
		       struct flowi6 *fl6,
465
			      struct request_sock *req,
468
		       struct request_sock *req,
466
			      u16 queue_mapping)
469
		       u16 queue_mapping)
467
{
470
{
468
	struct inet6_request_sock *treq = inet6_rsk(req);
471
	struct inet6_request_sock *treq = inet6_rsk(req);
469
	struct ipv6_pinfo *np = inet6_sk(sk);
472
	struct ipv6_pinfo *np = inet6_sk(sk);
Lines 489-495 Link Here
489
	return err;
492
	return err;
490
}
493
}
491
494
492
static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
495
int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
493
{
496
{
494
	struct flowi6 fl6;
497
	struct flowi6 fl6;
495
	int res;
498
	int res;
Lines 500-506 Link Here
500
	return res;
503
	return res;
501
}
504
}
502
505
503
static void tcp_v6_reqsk_destructor(struct request_sock *req)
506
void tcp_v6_reqsk_destructor(struct request_sock *req)
504
{
507
{
505
	kfree_skb(inet6_rsk(req)->pktopts);
508
	kfree_skb(inet6_rsk(req)->pktopts);
506
}
509
}
Lines 719-727 Link Here
719
};
722
};
720
#endif
723
#endif
721
724
722
static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
725
static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
723
				 u32 tsval, u32 tsecr,
726
				 u32 data_ack, u32 win, u32 tsval, u32 tsecr,
724
				 struct tcp_md5sig_key *key, int rst, u8 tclass)
727
				 struct tcp_md5sig_key *key, int rst, u8 tclass, int mptcp)
725
{
728
{
726
	const struct tcphdr *th = tcp_hdr(skb);
729
	const struct tcphdr *th = tcp_hdr(skb);
727
	struct tcphdr *t1;
730
	struct tcphdr *t1;
Lines 739-745 Link Here
739
	if (key)
742
	if (key)
740
		tot_len += TCPOLEN_MD5SIG_ALIGNED;
743
		tot_len += TCPOLEN_MD5SIG_ALIGNED;
741
#endif
744
#endif
742
745
#ifdef CONFIG_MPTCP
746
	if (mptcp)
747
		tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
748
#endif
743
	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
749
	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
744
			 GFP_ATOMIC);
750
			 GFP_ATOMIC);
745
	if (buff == NULL)
751
	if (buff == NULL)
Lines 777-782 Link Here
777
		tcp_v6_md5_hash_hdr((__u8 *)topt, key,
783
		tcp_v6_md5_hash_hdr((__u8 *)topt, key,
778
				    &ipv6_hdr(skb)->saddr,
784
				    &ipv6_hdr(skb)->saddr,
779
				    &ipv6_hdr(skb)->daddr, t1);
785
				    &ipv6_hdr(skb)->daddr, t1);
786
		topt += 4;
787
	}
788
#endif
789
#ifdef CONFIG_MPTCP
790
	if (mptcp) {
791
		/* Construction of 32-bit data_ack */
792
		*topt++ = htonl((TCPOPT_MPTCP << 24) |
793
				((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
794
				(0x20 << 8) |
795
				(0x01));
796
		*topt++ = htonl(data_ack);
780
	}
797
	}
781
#endif
798
#endif
782
799
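The option word assembled in the MPTCP branch above follows the DSS option layout: byte 0 is the option kind (TCPOPT_MPTCP), byte 1 the option length (MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK), byte 2 carries the DSS subtype in its upper nibble (0x20), and byte 3 the 0x01 flag announcing that a 4-octet data ACK follows, which is the htonl(data_ack) word written immediately afterwards.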
Lines 813-819 Link Here
813
	kfree_skb(buff);
830
	kfree_skb(buff);
814
}
831
}
815
832
816
static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
833
void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
817
{
834
{
818
	const struct tcphdr *th = tcp_hdr(skb);
835
	const struct tcphdr *th = tcp_hdr(skb);
819
	u32 seq = 0, ack_seq = 0;
836
	u32 seq = 0, ack_seq = 0;
Lines 868-874 Link Here
868
		ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
885
		ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
869
			  (th->doff << 2);
886
			  (th->doff << 2);
870
887
871
	tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, 0);
888
	tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, key, 1, 0, 0);
872
889
873
#ifdef CONFIG_TCP_MD5SIG
890
#ifdef CONFIG_TCP_MD5SIG
874
release_sk1:
891
release_sk1:
Lines 879-915 Link Here
879
#endif
896
#endif
880
}
897
}
881
898
882
static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
899
static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
883
			    u32 win, u32 tsval, u32 tsecr,
900
			    u32 win, u32 tsval, u32 tsecr,
884
			    struct tcp_md5sig_key *key, u8 tclass)
901
			    struct tcp_md5sig_key *key, u8 tclass, int mptcp)
885
{
902
{
886
	tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, key, 0, tclass);
903
	tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, key, 0, tclass, mptcp);
887
}
904
}
888
905
889
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
906
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
890
{
907
{
891
	struct inet_timewait_sock *tw = inet_twsk(sk);
908
	struct inet_timewait_sock *tw = inet_twsk(sk);
892
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
909
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
910
	u32 data_ack = 0;
911
	int mptcp = 0;
893
912
913
	if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
914
		data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
915
		mptcp = 1;
916
	}
894
	tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917
	tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
918
			data_ack,
895
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
919
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
896
			tcp_time_stamp + tcptw->tw_ts_offset,
920
			tcp_time_stamp + tcptw->tw_ts_offset,
897
			tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw),
921
			tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw),
898
			tw->tw_tclass);
922
			tw->tw_tclass, mptcp);
899
923
900
	inet_twsk_put(tw);
924
	inet_twsk_put(tw);
901
}
925
}
902
926
903
static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
927
void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
904
				  struct request_sock *req)
928
			   struct request_sock *req)
905
{
929
{
906
	tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1,
930
	tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1,
907
			req->rcv_wnd, tcp_time_stamp, req->ts_recent,
931
			0, req->rcv_wnd, tcp_time_stamp, req->ts_recent,
908
			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0);
932
			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0);
909
}
933
}
910
934
911
935
912
static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
936
struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
913
{
937
{
914
	struct request_sock *req, **prev;
938
	struct request_sock *req, **prev;
915
	const struct tcphdr *th = tcp_hdr(skb);
939
	const struct tcphdr *th = tcp_hdr(skb);
Lines 928-934 Link Here
928
952
929
	if (nsk) {
953
	if (nsk) {
930
		if (nsk->sk_state != TCP_TIME_WAIT) {
954
		if (nsk->sk_state != TCP_TIME_WAIT) {
955
			/* Don't lock the meta-sk again. It has been locked
956
			 * before mptcp_v6_do_rcv.
957
			 */
958
			if (tcp_sk(nsk)->mpc && !is_meta_sk(sk))
959
				bh_lock_sock(mptcp_meta_sk(nsk));
931
			bh_lock_sock(nsk);
960
			bh_lock_sock(nsk);
961
932
			return nsk;
962
			return nsk;
933
		}
963
		}
934
		inet_twsk_put(inet_twsk(nsk));
964
		inet_twsk_put(inet_twsk(nsk));
Lines 948-953 Link Here
948
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
978
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
949
{
979
{
950
	struct tcp_options_received tmp_opt;
980
	struct tcp_options_received tmp_opt;
981
	struct mptcp_options_received mopt;
951
	struct request_sock *req;
982
	struct request_sock *req;
952
	struct inet6_request_sock *treq;
983
	struct inet6_request_sock *treq;
953
	struct ipv6_pinfo *np = inet6_sk(sk);
984
	struct ipv6_pinfo *np = inet6_sk(sk);
Lines 960-965 Link Here
960
	if (skb->protocol == htons(ETH_P_IP))
991
	if (skb->protocol == htons(ETH_P_IP))
961
		return tcp_v4_conn_request(sk, skb);
992
		return tcp_v4_conn_request(sk, skb);
962
993
994
	tcp_clear_options(&tmp_opt);
995
	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
996
	tmp_opt.user_mss = tp->rx_opt.user_mss;
997
	mptcp_init_mp_opt(&mopt);
998
	tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
999
1000
#ifdef CONFIG_MPTCP
1001
	/* MPTCP structures not initialized, so clear the parsed MPTCP options again */
1002
	if (mptcp_init_failed)
1003
		mptcp_init_mp_opt(&mopt);
1004
1005
	if (mopt.is_mp_join)
1006
		return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk));
1007
	if (mopt.drop_me)
1008
		goto drop;
1009
#endif
1010
963
	if (!ipv6_unicast_destination(skb))
1011
	if (!ipv6_unicast_destination(skb))
964
		goto drop;
1012
		goto drop;
965
1013
Lines 974-980 Link Here
974
		goto drop;
1022
		goto drop;
975
	}
1023
	}
976
1024
977
	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
1025
#ifdef CONFIG_MPTCP
1026
	if (mopt.saw_mpc) {
1027
		req = inet6_reqsk_alloc(&mptcp6_request_sock_ops);
1028
1029
		if (req == NULL)
1030
			goto drop;
1031
1032
		mptcp_rsk(req)->mpcb = NULL;
1033
		mptcp_rsk(req)->dss_csum = mopt.dss_csum;
1034
		mptcp_rsk(req)->collide_tk.pprev = NULL;
1035
	} else
1036
#endif
1037
		req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
1038
978
	if (req == NULL)
1039
	if (req == NULL)
979
		goto drop;
1040
		goto drop;
980
1041
Lines 982-998 Link Here
982
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
1043
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
983
#endif
1044
#endif
984
1045
985
	tcp_clear_options(&tmp_opt);
986
	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
987
	tmp_opt.user_mss = tp->rx_opt.user_mss;
988
	tcp_parse_options(skb, &tmp_opt, 0, NULL);
989
990
	if (want_cookie && !tmp_opt.saw_tstamp)
1046
	if (want_cookie && !tmp_opt.saw_tstamp)
991
		tcp_clear_options(&tmp_opt);
1047
		tcp_clear_options(&tmp_opt);
992
1048
993
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1049
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
994
	tcp_openreq_init(req, &tmp_opt, skb);
1050
	tcp_openreq_init(req, &tmp_opt, skb);
995
1051
1052
	if (mopt.saw_mpc)
1053
		mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb);
1054
996
	treq = inet6_rsk(req);
1055
	treq = inet6_rsk(req);
997
	treq->rmt_addr = ipv6_hdr(skb)->saddr;
1056
	treq->rmt_addr = ipv6_hdr(skb)->saddr;
998
	treq->loc_addr = ipv6_hdr(skb)->daddr;
1057
	treq->loc_addr = ipv6_hdr(skb)->daddr;
Lines 1081-1089 Link Here
1081
	return 0; /* don't send reset */
1140
	return 0; /* don't send reset */
1082
}
1141
}
1083
1142
1084
static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1143
struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1085
					  struct request_sock *req,
1144
				  struct request_sock *req,
1086
					  struct dst_entry *dst)
1145
				  struct dst_entry *dst)
1087
{
1146
{
1088
	struct inet6_request_sock *treq;
1147
	struct inet6_request_sock *treq;
1089
	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
1148
	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
Lines 1303-1309 Link Here
1303
 * This is because we cannot sleep with the original spinlock
1362
 * This is because we cannot sleep with the original spinlock
1304
 * held.
1363
 * held.
1305
 */
1364
 */
1306
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
1365
int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
1307
{
1366
{
1308
	struct ipv6_pinfo *np = inet6_sk(sk);
1367
	struct ipv6_pinfo *np = inet6_sk(sk);
1309
	struct tcp_sock *tp;
1368
	struct tcp_sock *tp;
Lines 1325-1330 Link Here
1325
		goto discard;
1384
		goto discard;
1326
#endif
1385
#endif
1327
1386
1387
	if (is_meta_sk(sk))
1388
		return mptcp_v6_do_rcv(sk, skb);
1389
1328
	if (sk_filter(sk, skb))
1390
	if (sk_filter(sk, skb))
1329
		goto discard;
1391
		goto discard;
1330
1392
Lines 1445-1451 Link Here
1445
{
1507
{
1446
	const struct tcphdr *th;
1508
	const struct tcphdr *th;
1447
	const struct ipv6hdr *hdr;
1509
	const struct ipv6hdr *hdr;
1448
	struct sock *sk;
1510
	struct sock *sk, *meta_sk = NULL;
1449
	int ret;
1511
	int ret;
1450
	struct net *net = dev_net(skb->dev);
1512
	struct net *net = dev_net(skb->dev);
1451
1513
Lines 1476-1493 Link Here
1476
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1538
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1477
				    skb->len - th->doff*4);
1539
				    skb->len - th->doff*4);
1478
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1540
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1541
#ifdef CONFIG_MPTCP
1542
	TCP_SKB_CB(skb)->mptcp_flags = 0;
1543
	TCP_SKB_CB(skb)->dss_off = 0;
1544
#endif
1479
	TCP_SKB_CB(skb)->when = 0;
1545
	TCP_SKB_CB(skb)->when = 0;
1480
	TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
1546
	TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
1481
	TCP_SKB_CB(skb)->sacked = 0;
1547
	TCP_SKB_CB(skb)->sacked = 0;
1482
1548
1483
	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1549
	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1484
	if (!sk)
1485
		goto no_tcp_socket;
1486
1550
1487
process:
1551
process:
1488
	if (sk->sk_state == TCP_TIME_WAIT)
1552
	if (sk && sk->sk_state == TCP_TIME_WAIT)
1489
		goto do_time_wait;
1553
		goto do_time_wait;
1490
1554
1555
#ifdef CONFIG_MPTCP
1556
	if (!sk && th->syn && !th->ack) {
1557
		int ret = mptcp_lookup_join(skb, NULL);
1558
1559
		if (ret < 0) {
1560
			tcp_v6_send_reset(NULL, skb);
1561
			goto discard_it;
1562
		} else if (ret > 0) {
1563
			return 0;
1564
		}
1565
	}
1566
1567
	/* Is there a pending request sock for this segment ? */
1568
	if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
1569
		if (sk)
1570
			sock_put(sk);
1571
		return 0;
1572
	}
1573
#endif
1574
1575
	if (!sk)
1576
		goto no_tcp_socket;
1577
1491
	if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
1578
	if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
1492
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1579
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1493
		goto discard_and_relse;
1580
		goto discard_and_relse;
Lines 1502-1512 Link Here
1502
	sk_mark_napi_id(sk, skb);
1589
	sk_mark_napi_id(sk, skb);
1503
	skb->dev = NULL;
1590
	skb->dev = NULL;
1504
1591
1505
	bh_lock_sock_nested(sk);
1592
	if (tcp_sk(sk)->mpc) {
1593
		meta_sk = mptcp_meta_sk(sk);
1594
1595
		bh_lock_sock_nested(meta_sk);
1596
		skb->sk = sk;
1597
	} else {
1598
		meta_sk = sk;
1599
		bh_lock_sock_nested(sk);
1600
	}
1601
1506
	ret = 0;
1602
	ret = 0;
1507
	if (!sock_owned_by_user(sk)) {
1603
	if (!sock_owned_by_user(meta_sk)) {
1508
#ifdef CONFIG_NET_DMA
1604
#ifdef CONFIG_NET_DMA
1509
		struct tcp_sock *tp = tcp_sk(sk);
1605
		struct tcp_sock *tp = tcp_sk(meta_sk);
1510
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1606
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1511
			tp->ucopy.dma_chan = net_dma_find_channel();
1607
			tp->ucopy.dma_chan = net_dma_find_channel();
1512
		if (tp->ucopy.dma_chan)
1608
		if (tp->ucopy.dma_chan)
Lines 1514-1529 Link Here
1514
		else
1610
		else
1515
#endif
1611
#endif
1516
		{
1612
		{
1517
			if (!tcp_prequeue(sk, skb))
1613
			if (!tcp_prequeue(meta_sk, skb))
1518
				ret = tcp_v6_do_rcv(sk, skb);
1614
				ret = tcp_v6_do_rcv(sk, skb);
1519
		}
1615
		}
1520
	} else if (unlikely(sk_add_backlog(sk, skb,
1616
	} else if (unlikely(sk_add_backlog(meta_sk, skb,
1521
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1617
					   meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
1522
		bh_unlock_sock(sk);
1618
		bh_unlock_sock(meta_sk);
1523
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1619
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1524
		goto discard_and_relse;
1620
		goto discard_and_relse;
1525
	}
1621
	}
1526
	bh_unlock_sock(sk);
1622
1623
	bh_unlock_sock(meta_sk);
1527
1624
1528
	sock_put(sk);
1625
	sock_put(sk);
1529
	return ret ? -1 : 0;
1626
	return ret ? -1 : 0;
Lines 1580-1585 Link Here
1580
			sk = sk2;
1677
			sk = sk2;
1581
			goto process;
1678
			goto process;
1582
		}
1679
		}
1680
#ifdef CONFIG_MPTCP
1681
		if (th->syn && !th->ack) {
1682
			int ret = mptcp_lookup_join(skb, inet_twsk(sk));
1683
1684
			if (ret < 0) {
1685
				tcp_v6_send_reset(NULL, skb);
1686
				goto discard_it;
1687
			} else if (ret > 0) {
1688
				return 0;
1689
			}
1690
		}
1691
#endif
1583
		/* Fall through to ACK */
1692
		/* Fall through to ACK */
1584
	}
1693
	}
1585
	case TCP_TW_ACK:
1694
	case TCP_TW_ACK:
Lines 1629-1641 Link Here
1629
	}
1738
	}
1630
}
1739
}
1631
1740
1632
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
1741
struct timewait_sock_ops tcp6_timewait_sock_ops = {
1633
	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
1742
	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
1634
	.twsk_unique	= tcp_twsk_unique,
1743
	.twsk_unique	= tcp_twsk_unique,
1635
	.twsk_destructor= tcp_twsk_destructor,
1744
	.twsk_destructor= tcp_twsk_destructor,
1636
};
1745
};
1637
1746
1638
static const struct inet_connection_sock_af_ops ipv6_specific = {
1747
const struct inet_connection_sock_af_ops ipv6_specific = {
1639
	.queue_xmit	   = inet6_csk_xmit,
1748
	.queue_xmit	   = inet6_csk_xmit,
1640
	.send_check	   = tcp_v6_send_check,
1749
	.send_check	   = tcp_v6_send_check,
1641
	.rebuild_header	   = inet6_sk_rebuild_header,
1750
	.rebuild_header	   = inet6_sk_rebuild_header,
Lines 1667-1673 Link Here
1667
 *	TCP over IPv4 via INET6 API
1776
 *	TCP over IPv4 via INET6 API
1668
 */
1777
 */
1669
1778
1670
static const struct inet_connection_sock_af_ops ipv6_mapped = {
1779
const struct inet_connection_sock_af_ops ipv6_mapped = {
1671
	.queue_xmit	   = ip_queue_xmit,
1780
	.queue_xmit	   = ip_queue_xmit,
1672
	.send_check	   = tcp_v4_send_check,
1781
	.send_check	   = tcp_v4_send_check,
1673
	.rebuild_header	   = inet_sk_rebuild_header,
1782
	.rebuild_header	   = inet_sk_rebuild_header,
Lines 1712-1718 Link Here
1712
	return 0;
1821
	return 0;
1713
}
1822
}
1714
1823
1715
static void tcp_v6_destroy_sock(struct sock *sk)
1824
void tcp_v6_destroy_sock(struct sock *sk)
1716
{
1825
{
1717
	tcp_v4_destroy_sock(sk);
1826
	tcp_v4_destroy_sock(sk);
1718
	inet6_destroy_sock(sk);
1827
	inet6_destroy_sock(sk);
(-)a/net/Kconfig (+1 lines)
Lines 79-84 Link Here
79
source "net/ipv4/Kconfig"
79
source "net/ipv4/Kconfig"
80
source "net/ipv6/Kconfig"
80
source "net/ipv6/Kconfig"
81
source "net/netlabel/Kconfig"
81
source "net/netlabel/Kconfig"
82
source "net/mptcp/Kconfig"
82
83
83
endif # if INET
84
endif # if INET
84
85
(-)a/net/Makefile (+1 lines)
Lines 20-25 Link Here
20
obj-$(CONFIG_XFRM)		+= xfrm/
20
obj-$(CONFIG_XFRM)		+= xfrm/
21
obj-$(CONFIG_UNIX)		+= unix/
21
obj-$(CONFIG_UNIX)		+= unix/
22
obj-$(CONFIG_NET)		+= ipv6/
22
obj-$(CONFIG_NET)		+= ipv6/
23
obj-$(CONFIG_MPTCP)		+= mptcp/
23
obj-$(CONFIG_PACKET)		+= packet/
24
obj-$(CONFIG_PACKET)		+= packet/
24
obj-$(CONFIG_NET_KEY)		+= key/
25
obj-$(CONFIG_NET_KEY)		+= key/
25
obj-$(CONFIG_BRIDGE)		+= bridge/
26
obj-$(CONFIG_BRIDGE)		+= bridge/
(-)a/net/mptcp/Kconfig (+9 lines)
Line 0 Link Here
1
#
2
# MPTCP configuration
3
#
4
config MPTCP
5
        bool "MPTCP protocol"
6
        depends on !SYN_COOKIES && !TCP_MD5SIG && (IPV6=y || IPV6=n)
7
        ---help---
8
          This replaces the normal TCP stack with a Multipath TCP stack,
9
          able to use several paths at once.
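The depends line above means MPTCP can only be selected when SYN cookies and TCP MD5 signatures are compiled out and IPv6 is either built in or disabled; the (IPV6=y || IPV6=n) idiom rules out IPV6=m. A minimal .config fragment satisfying it (illustrative only) would be:

CONFIG_MPTCP=y
# CONFIG_SYN_COOKIES is not set
# CONFIG_TCP_MD5SIG is not set
CONFIG_IPV6=y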
(-)a/net/mptcp/Makefile (+15 lines)
Line 0 Link Here
1
#
2
## Makefile for MultiPath TCP support code.
3
#
4
#
5
6
obj-$(CONFIG_MPTCP) += mptcp.o
7
8
mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
9
	   mptcp_output.o mptcp_input.o
10
11
obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
12
obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
13
14
mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
15
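In the last rule above, $(subst m,y,$(CONFIG_IPV6)) folds a modular IPv6 setting into y, so mptcp_ipv6.o is linked into the single mptcp.o object whenever IPv6 support is configured at all; with IPv6 unset the rule expands to the inert mptcp- list and is dropped by kbuild. Illustrative expansion:

CONFIG_IPV6=y      ->  mptcp-y += mptcp_ipv6.o
CONFIG_IPV6=m      ->  mptcp-y += mptcp_ipv6.o
CONFIG_IPV6 unset  ->  mptcp-  += mptcp_ipv6.o   (ignored)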
(-)a/net/mptcp/mptcp_coupled.c (+273 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - Coupled Congestion Control
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
#include <net/tcp.h>
30
#include <net/mptcp.h>
31
32
#include <linux/module.h>
33
34
/* Scaling is done in the numerator with alpha_scale_num and in the denominator
35
 * with alpha_scale_den.
36
 *
37
 * To downscale, we just need to use alpha_scale.
38
 *
39
 * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
40
 */
41
static int alpha_scale_den = 10;
42
static int alpha_scale_num = 32;
43
static int alpha_scale = 12;
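/* Since these values are used as shift counts (see mptcp_ccc_scale() below),
 * the relation quoted in the comment above holds in powers of two:
 *
 *	2^alpha_scale_num / (2^alpha_scale_den)^2 = 2^32 / 2^20 = 2^12 = 2^alpha_scale
 */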
44
45
struct mptcp_ccc {
46
	u64	alpha;
47
	bool	forced_update;
48
};
49
50
static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
51
{
52
	return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt;
53
}
54
55
static inline u64 mptcp_get_alpha(struct sock *meta_sk)
56
{
57
	struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
58
	return mptcp_ccc->alpha;
59
}
60
61
static inline void mptcp_set_alpha(struct sock *meta_sk, u64 alpha)
62
{
63
	struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
64
	mptcp_ccc->alpha = alpha;
65
}
66
67
static inline u64 mptcp_ccc_scale(u32 val, int scale)
68
{
69
	return (u64) val << scale;
70
}
71
72
static inline bool mptcp_get_forced(struct sock *meta_sk)
73
{
74
	struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
75
	return mptcp_ccc->forced_update;
76
}
77
78
static inline void mptcp_set_forced(struct sock *meta_sk, bool force)
79
{
80
	struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
81
	mptcp_ccc->forced_update = force;
82
}
83
84
static void mptcp_ccc_recalc_alpha(struct sock *sk)
85
{
86
	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
87
	struct sock *sub_sk;
88
	int best_cwnd = 0, best_rtt = 0, can_send = 0;
89
	u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
90
91
	if (!mpcb)
92
		return;
93
94
	/* Only one subflow left - fall back to normal reno-behavior
95
	 * (set alpha to 1) */
96
	if (mpcb->cnt_established <= 1)
97
		goto exit;
98
99
	/* Do regular alpha-calculation for multiple subflows */
100
101
	/* Find the max numerator of the alpha-calculation */
102
	mptcp_for_each_sk(mpcb, sub_sk) {
103
		struct tcp_sock *sub_tp = tcp_sk(sub_sk);
104
		u64 tmp;
105
106
		if (!mptcp_ccc_sk_can_send(sub_sk))
107
			continue;
108
109
		can_send++;
110
111
		/* We need to look for the path that provides the max-value.
112
		 * Integer-overflow is not possible here, because
113
		 * tmp will be in u64.
114
		 */
115
		tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
116
				alpha_scale_num), sub_tp->srtt * sub_tp->srtt);
117
118
		if (tmp >= max_numerator) {
119
			max_numerator = tmp;
120
			best_cwnd = sub_tp->snd_cwnd;
121
			best_rtt = sub_tp->srtt;
122
		}
123
	}
124
125
	/* No subflow is able to send - we don't care anymore */
126
	if (unlikely(!can_send))
127
		goto exit;
128
129
	/* Calculate the denominator */
130
	mptcp_for_each_sk(mpcb, sub_sk) {
131
		struct tcp_sock *sub_tp = tcp_sk(sub_sk);
132
133
		if (!mptcp_ccc_sk_can_send(sub_sk))
134
			continue;
135
136
		sum_denominator += div_u64(
137
				mptcp_ccc_scale(sub_tp->snd_cwnd,
138
						alpha_scale_den) * best_rtt,
139
						sub_tp->srtt);
140
	}
141
	sum_denominator *= sum_denominator;
142
	if (unlikely(!sum_denominator)) {
143
		pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
144
		       __func__, mpcb->cnt_established);
145
		mptcp_for_each_sk(mpcb, sub_sk) {
146
			struct tcp_sock *sub_tp = tcp_sk(sub_sk);
147
			pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u",
148
			       __func__, sub_tp->mptcp->path_index,
149
			       sub_sk->sk_state, sub_tp->srtt,
150
			       sub_tp->snd_cwnd);
151
		}
152
	}
153
154
	alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
155
156
	if (unlikely(!alpha))
157
		alpha = 1;
158
159
exit:
160
	mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
161
}
162
163
static void mptcp_ccc_init(struct sock *sk)
164
{
165
	if (tcp_sk(sk)->mpc) {
166
		mptcp_set_forced(mptcp_meta_sk(sk), 0);
167
		mptcp_set_alpha(mptcp_meta_sk(sk), 1);
168
	}
169
	/* If we are not doing MPTCP, behave like reno: return */
170
}
171
172
static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
173
{
174
	if (event == CA_EVENT_LOSS)
175
		mptcp_ccc_recalc_alpha(sk);
176
}
177
178
static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
179
{
180
	if (!tcp_sk(sk)->mpc)
181
		return;
182
183
	mptcp_set_forced(mptcp_meta_sk(sk), 1);
184
}
185
186
static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
187
{
188
	struct tcp_sock *tp = tcp_sk(sk);
189
	struct mptcp_cb *mpcb = tp->mpcb;
190
	int snd_cwnd;
191
192
	if (!tp->mpc) {
193
		tcp_reno_cong_avoid(sk, ack, in_flight);
194
		return;
195
	}
196
197
	if (!tcp_is_cwnd_limited(sk, in_flight))
198
		return;
199
200
	if (tp->snd_cwnd <= tp->snd_ssthresh) {
201
		/* In "safe" area, increase. */
202
		tcp_slow_start(tp);
203
		mptcp_ccc_recalc_alpha(sk);
204
		return;
205
	}
206
207
	if (mptcp_get_forced(mptcp_meta_sk(sk))) {
208
		mptcp_ccc_recalc_alpha(sk);
209
		mptcp_set_forced(mptcp_meta_sk(sk), 0);
210
	}
211
212
	if (mpcb->cnt_established > 1) {
213
		u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
214
215
		/* This may happen, if at the initialization, the mpcb
216
		 * was not yet attached to the sock, and thus
217
		 * initializing alpha failed.
218
		 */
219
		if (unlikely(!alpha))
220
			alpha = 1;
221
222
		snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale),
223
						alpha);
224
225
		/* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
226
		 * Thus, we select here the max value. */
227
		if (snd_cwnd < tp->snd_cwnd)
228
			snd_cwnd = tp->snd_cwnd;
229
	} else {
230
		snd_cwnd = tp->snd_cwnd;
231
	}
232
233
	if (tp->snd_cwnd_cnt >= snd_cwnd) {
234
		if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
235
			tp->snd_cwnd++;
236
			mptcp_ccc_recalc_alpha(sk);
237
		}
238
239
		tp->snd_cwnd_cnt = 0;
240
	} else {
241
		tp->snd_cwnd_cnt++;
242
	}
243
}
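/* A compact reading of the increase rule implemented above (alpha is the value
 * stored by mptcp_ccc_recalc_alpha(), already scaled by 2^alpha_scale): with
 * more than one established subflow the window grows by one segment only once
 *
 *	snd_cwnd_cnt >= max(2^alpha_scale / alpha, snd_cwnd)
 *
 * while with a single subflow the threshold is simply snd_cwnd, i.e. plain
 * reno growth.
 */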
244
245
static struct tcp_congestion_ops mptcp_ccc = {
246
	.init		= mptcp_ccc_init,
247
	.ssthresh	= tcp_reno_ssthresh,
248
	.cong_avoid	= mptcp_ccc_cong_avoid,
249
	.cwnd_event	= mptcp_ccc_cwnd_event,
250
	.set_state	= mptcp_ccc_set_state,
251
	.min_cwnd	= tcp_reno_min_cwnd,
252
	.owner		= THIS_MODULE,
253
	.name		= "coupled",
254
};
255
256
static int __init mptcp_ccc_register(void)
257
{
258
	BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
259
	return tcp_register_congestion_control(&mptcp_ccc);
260
}
261
262
static void __exit mptcp_ccc_unregister(void)
263
{
264
	tcp_unregister_congestion_control(&mptcp_ccc);
265
}
266
267
module_init(mptcp_ccc_register);
268
module_exit(mptcp_ccc_unregister);
269
270
MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
271
MODULE_LICENSE("GPL");
272
MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
273
MODULE_VERSION("0.1");
(-)a/net/mptcp/mptcp_ctrl.c (+1891 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - MPTCP-control
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#include <net/inet_common.h>
31
#include <net/inet6_hashtables.h>
32
#include <net/ipv6.h>
33
#include <net/ip6_checksum.h>
34
#include <net/mptcp.h>
35
#include <net/mptcp_v4.h>
36
#include <net/mptcp_v6.h>
37
#include <net/sock.h>
38
#include <net/tcp.h>
39
#include <net/tcp_states.h>
40
#include <net/transp_v6.h>
41
#include <net/xfrm.h>
42
43
#include <linux/cryptohash.h>
44
#include <linux/kconfig.h>
45
#include <linux/module.h>
46
#include <linux/list.h>
47
#include <linux/jhash.h>
48
#include <linux/tcp.h>
49
#include <linux/net.h>
50
#include <linux/in.h>
51
#include <linux/random.h>
52
#include <linux/inetdevice.h>
53
#include <linux/workqueue.h>
54
#include <linux/atomic.h>
55
#ifdef CONFIG_SYSCTL
56
#include <linux/sysctl.h>
57
#endif
58
59
static struct kmem_cache *mptcp_sock_cache __read_mostly;
60
static struct kmem_cache *mptcp_cb_cache __read_mostly;
61
static struct kmem_cache *mptcp_tw_cache __read_mostly;
62
63
int sysctl_mptcp_ndiffports __read_mostly = 1;
64
int sysctl_mptcp_enabled __read_mostly = 1;
65
int sysctl_mptcp_checksum __read_mostly = 1;
66
int sysctl_mptcp_debug __read_mostly;
67
EXPORT_SYMBOL(sysctl_mptcp_debug);
68
int sysctl_mptcp_syn_retries __read_mostly = MPTCP_SYN_RETRIES;
69
70
bool mptcp_init_failed __read_mostly;
71
72
#ifdef CONFIG_SYSCTL
73
static struct ctl_table mptcp_table[] = {
74
	{
75
		.procname = "mptcp_ndiffports",
76
		.data = &sysctl_mptcp_ndiffports,
77
		.maxlen = sizeof(int),
78
		.mode = 0644,
79
		.proc_handler = &proc_dointvec
80
	},
81
	{
82
		.procname = "mptcp_enabled",
83
		.data = &sysctl_mptcp_enabled,
84
		.maxlen = sizeof(int),
85
		.mode = 0644,
86
		.proc_handler = &proc_dointvec
87
	},
88
	{
89
		.procname = "mptcp_checksum",
90
		.data = &sysctl_mptcp_checksum,
91
		.maxlen = sizeof(int),
92
		.mode = 0644,
93
		.proc_handler = &proc_dointvec
94
	},
95
	{
96
		.procname = "mptcp_debug",
97
		.data = &sysctl_mptcp_debug,
98
		.maxlen = sizeof(int),
99
		.mode = 0644,
100
		.proc_handler = &proc_dointvec
101
	},
102
	{
103
		.procname = "mptcp_syn_retries",
104
		.data = &sysctl_mptcp_syn_retries,
105
		.maxlen = sizeof(int),
106
		.mode = 0644,
107
		.proc_handler = &proc_dointvec
108
	},
109
	{ }
110
};
111
#endif
112
113
static struct sock *mptcp_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
114
					struct request_sock *req,
115
					struct dst_entry *dst)
116
{
117
#if IS_ENABLED(CONFIG_IPV6)
118
	if (sk->sk_family == AF_INET6)
119
		return tcp_v6_syn_recv_sock(sk, skb, req, dst);
120
121
	/* sk->sk_family == AF_INET */
122
	if (req->rsk_ops->family == AF_INET6)
123
		return mptcp_v6v4_syn_recv_sock(sk, skb, req, dst);
124
#endif
125
126
	/* sk->sk_family == AF_INET && req->rsk_ops->family == AF_INET */
127
	return tcp_v4_syn_recv_sock(sk, skb, req, dst);
128
}
129
130
struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied)
131
{
132
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
133
	struct sock *sk, *subsk = NULL;
134
	u32 max_data_seq = 0;
135
	/* max_data_seq is initialized only to silence a compiler warning.
136
	 * The real initialization is tracked by max_data_seq_set
137
	 */
138
	short max_data_seq_set = 0;
139
	u32 min_time = 0xffffffff;
140
141
	/* How do we select the subflow to send the window-update on?
142
	 *
143
	 * 1. It has to be in a state where it can send an ack.
144
	 * 2. It has to be one of those subflows that recently
145
	 *    contributed to the received stream
146
	 *    (this guarantees a working subflow)
147
	 *    a) its latest data_seq received is after the original
148
	 *       copied_seq.
149
	 *       We select the one with the lowest rtt, so that the
150
	 *       window-update reaches our peer the fastest.
151
	 *    b) if no subflow has this kind of data_seq (e.g., very
152
	 *       strange meta-level retransmissions going on), we take
153
	 *       the subflow who last sent the highest data_seq.
154
	 */
155
	mptcp_for_each_sk(meta_tp->mpcb, sk) {
156
		struct tcp_sock *tp = tcp_sk(sk);
157
158
		if (!mptcp_sk_can_send_ack(sk))
159
			continue;
160
161
		/* Select among those who contributed to the
162
		 * current receive-queue.
163
		 */
164
		if (copied && after(tp->mptcp->last_data_seq, meta_tp->copied_seq - copied)) {
165
			if (tp->srtt < min_time) {
166
				min_time = tp->srtt;
167
				subsk = sk;
168
				max_data_seq_set = 0;
169
			}
170
			continue;
171
		}
172
173
		if (!subsk && !max_data_seq_set) {
174
			max_data_seq = tp->mptcp->last_data_seq;
175
			max_data_seq_set = 1;
176
			subsk = sk;
177
		}
178
179
		/* Otherwise, take the one with the highest data_seq */
180
		if ((!subsk || max_data_seq_set) &&
181
		    after(tp->mptcp->last_data_seq, max_data_seq)) {
182
			max_data_seq = tp->mptcp->last_data_seq;
183
			subsk = sk;
184
		}
185
	}
186
187
	if (!subsk) {
188
		mptcp_debug("%s subsk is null, copied %d, cseq %u\n", __func__,
189
			    copied, meta_tp->copied_seq);
190
		mptcp_for_each_sk(meta_tp->mpcb, sk) {
191
			struct tcp_sock *tp = tcp_sk(sk);
192
			mptcp_debug("%s pi %d state %u last_dseq %u\n",
193
				    __func__, tp->mptcp->path_index, sk->sk_state,
194
				    tp->mptcp->last_data_seq);
195
		}
196
	}
197
198
	return subsk;
199
}
200
201
static void mptcp_sock_def_error_report(struct sock *sk)
202
{
203
	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
204
205
	if (!sock_flag(sk, SOCK_DEAD))
206
		mptcp_sub_close(sk, 0);
207
208
	if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
209
	    mpcb->send_infinite_mapping) {
210
		struct sock *meta_sk = mptcp_meta_sk(sk);
211
212
		meta_sk->sk_err = sk->sk_err;
213
		meta_sk->sk_err_soft = sk->sk_err_soft;
214
215
		if (!sock_flag(meta_sk, SOCK_DEAD))
216
			meta_sk->sk_error_report(meta_sk);
217
218
		tcp_done(meta_sk);
219
	}
220
221
	sk->sk_err = 0;
222
	return;
223
}
224
225
static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
226
{
227
	if (atomic_dec_and_test(&mpcb->refcnt))
228
		kmem_cache_free(mptcp_cb_cache, mpcb);
229
}
230
231
static void mptcp_sock_destruct(struct sock *sk)
232
{
233
	struct sock *cb_sk, *prev = NULL;
234
	struct tcp_sock *tp = tcp_sk(sk);
235
236
	inet_sock_destruct(sk);
237
238
	cb_sk = tp->mpcb->callback_list;
239
	while (cb_sk) {
240
		if (cb_sk == sk) {
241
			if (prev)
242
				tcp_sk(prev)->mptcp->next_cb = tcp_sk(cb_sk)->mptcp->next_cb;
243
			else
244
				tp->mpcb->callback_list = tcp_sk(cb_sk)->mptcp->next_cb;
245
246
			tcp_sk(cb_sk)->mptcp->next_cb = NULL;
247
			cb_sk->sk_prot->release_cb(cb_sk);
248
			break;
249
		}
250
251
		prev = cb_sk;
252
		cb_sk = tcp_sk(cb_sk)->mptcp->next_cb;
253
	}
254
255
	kmem_cache_free(mptcp_sock_cache, tp->mptcp);
256
	tp->mptcp = NULL;
257
258
	if (!is_meta_sk(sk) && !tp->was_meta_sk) {
259
		/* Taken when mpcb pointer was set */
260
		sock_put(mptcp_meta_sk(sk));
261
		mptcp_mpcb_put(tp->mpcb);
262
	} else {
263
		struct mptcp_cb *mpcb = tp->mpcb;
264
		struct mptcp_tw *mptw;
265
266
		/* The mpcb is disappearing - we can make the final
267
		 * update to the rcv_nxt of the time-wait-sock and remove
268
		 * its reference to the mpcb.
269
		 */
270
		spin_lock_bh(&mpcb->tw_lock);
271
		list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
272
			list_del_rcu(&mptw->list);
273
			mptw->in_list = 0;
274
			mptcp_mpcb_put(mpcb);
275
			rcu_assign_pointer(mptw->mpcb, NULL);
276
		}
277
		spin_unlock_bh(&mpcb->tw_lock);
278
279
		mptcp_mpcb_put(mpcb);
280
281
		mptcp_debug("%s destroying meta-sk\n", __func__);
282
	}
283
}
284
285
void mptcp_destroy_sock(struct sock *sk)
286
{
287
	if (is_meta_sk(sk)) {
288
		struct sock *sk_it, *tmpsk;
289
290
		__skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
291
		mptcp_purge_ofo_queue(tcp_sk(sk));
292
293
		/* We have to close all remaining subflows. Normally, they
294
		 * should all be about to get closed. But, if the kernel is
295
		 * forcing a closure (e.g., tcp_write_err), the subflows might
296
		 * not have been closed properly (as we are waiting for the
297
		 * DATA_ACK of the DATA_FIN).
298
		 */
299
		mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
300
			/* Already did call tcp_close - waiting for graceful
301
			 * closure.
302
			 */
303
			if (tcp_sk(sk_it)->closing)
304
				continue;
305
306
			/* Allow the delayed work first to prevent time-wait state */
307
			if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
308
				continue;
309
310
			mptcp_sub_close(sk_it, 0);
311
		}
312
	} else {
313
		mptcp_del_sock(sk);
314
	}
315
}
316
317
static void mptcp_set_state(struct sock *sk)
318
{
319
	struct sock *meta_sk = mptcp_meta_sk(sk);
320
321
	/* Meta is not yet established - wake up the application */
322
	if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
323
	    sk->sk_state == TCP_ESTABLISHED) {
324
		tcp_set_state(meta_sk, TCP_ESTABLISHED);
325
326
		if (!sock_flag(meta_sk, SOCK_DEAD)) {
327
			meta_sk->sk_state_change(meta_sk);
328
			sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
329
		}
330
	}
331
332
	if (sk->sk_state == TCP_ESTABLISHED) {
333
		tcp_sk(sk)->mptcp->establish_increased = 1;
334
		tcp_sk(sk)->mpcb->cnt_established++;
335
	}
336
}
337
338
void mptcp_set_keepalive(struct sock *sk, int val)
339
{
340
	struct sock *sk_it;
341
342
	mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
343
		tcp_set_keepalive(sk_it, val);
344
		sock_valbool_flag(sk, SOCK_KEEPOPEN, val);
345
	}
346
}
347
348
u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
349
u32 mptcp_key_seed = 0;
350
351
void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
352
{
353
	u32 workspace[SHA_WORKSPACE_WORDS];
354
	u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
355
	u8 input[64];
356
	int i;
357
358
	memset(workspace, 0, sizeof(workspace));
359
360
	/* Initialize input with appropriate padding */
361
	memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
362
						   * is explicitly set too */
363
	memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
364
	input[8] = 0x80; /* Padding: First bit after message = 1 */
365
	input[63] = 0x40; /* Padding: Length of the message = 64 bits */
366
367
	sha_init(mptcp_hashed_key);
368
	sha_transform(mptcp_hashed_key, input, workspace);
369
370
	for (i = 0; i < 5; i++)
371
		mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
372
373
	if (token)
374
		*token = mptcp_hashed_key[0];
375
	if (idsn)
376
		*idsn = *((u64 *)&mptcp_hashed_key[3]);
377
}
378
379
void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
380
		       u32 *hash_out)
381
{
382
	u32 workspace[SHA_WORKSPACE_WORDS];
383
	u8 input[128]; /* 2 512-bit blocks */
384
	int i;
385
386
	memset(workspace, 0, sizeof(workspace));
387
388
	/* Generate key xored with ipad */
389
	memset(input, 0x36, 64);
390
	for (i = 0; i < 8; i++)
391
		input[i] ^= key_1[i];
392
	for (i = 0; i < 8; i++)
393
		input[i + 8] ^= key_2[i];
394
395
	memcpy(&input[64], rand_1, 4);
396
	memcpy(&input[68], rand_2, 4);
397
	input[72] = 0x80; /* Padding: First bit after message = 1 */
398
	memset(&input[73], 0, 53);
399
400
	/* Padding: Length of the message = 512 + 64 bits */
401
	input[126] = 0x02;
402
	input[127] = 0x40;
403
404
	sha_init(hash_out);
405
	sha_transform(hash_out, input, workspace);
406
	memset(workspace, 0, sizeof(workspace));
407
408
	sha_transform(hash_out, &input[64], workspace);
409
	memset(workspace, 0, sizeof(workspace));
410
411
	for (i = 0; i < 5; i++)
412
		hash_out[i] = cpu_to_be32(hash_out[i]);
413
414
	/* Prepare second part of hmac */
415
	memset(input, 0x5C, 64);
416
	for (i = 0; i < 8; i++)
417
		input[i] ^= key_1[i];
418
	for (i = 0; i < 8; i++)
419
		input[i + 8] ^= key_2[i];
420
421
	memcpy(&input[64], hash_out, 20);
422
	input[84] = 0x80;
423
	memset(&input[85], 0, 41);
424
425
	/* Padding: Length of the message = 512 + 160 bits */
426
	input[126] = 0x02;
427
	input[127] = 0xA0;
428
429
	sha_init(hash_out);
430
	sha_transform(hash_out, input, workspace);
431
	memset(workspace, 0, sizeof(workspace));
432
433
	sha_transform(hash_out, &input[64], workspace);
434
435
	for (i = 0; i < 5; i++)
436
		hash_out[i] = cpu_to_be32(hash_out[i]);
437
}
438
439
static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
440
{
441
	/* Socket-options handled by mptcp_inherit_sk while creating the meta-sk.
442
	 * ======
443
	 * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
444
	 * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
445
	 * TCP_NODELAY, TCP_CORK
446
	 *
447
	 * Socket-options handled in this function here
448
	 * ======
449
	 * SO_KEEPALIVE
450
	 * TCP_KEEP*
451
	 * TCP_DEFER_ACCEPT
452
	 *
453
	 * Socket-options on the todo-list
454
	 * ======
455
	 * SO_BINDTODEVICE - should probably prevent creation of new subsocks
456
	 *		     across other devices. - what about the api-draft?
457
	 * SO_DEBUG
458
	 * SO_REUSEADDR - probably we don't care about this
459
	 * SO_DONTROUTE, SO_BROADCAST
460
	 * SO_OOBINLINE
461
	 * SO_LINGER
462
	 * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
463
	 * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
464
	 * SO_RXQ_OVFL
465
	 * TCP_COOKIE_TRANSACTIONS
466
	 * TCP_MAXSEG
467
	 * TCP_THIN_* - Handled by mptcp_inherit_sk, but we need to support this
468
	 *		in mptcp_retransmit_timer. AND we need to check what is
469
	 *		about the subsockets.
470
	 * TCP_LINGER2
471
	 * TCP_WINDOW_CLAMP
472
	 * TCP_USER_TIMEOUT
473
	 * TCP_MD5SIG
474
	 *
475
	 * Socket-options of no concern for the meta-socket (but for the subsocket)
476
	 * ======
477
	 * SO_PRIORITY
478
	 * SO_MARK
479
	 * TCP_CONGESTION
480
	 * TCP_SYNCNT
481
	 * TCP_QUICKACK
482
	 */
483
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
484
485
	/****** KEEPALIVE-handler ******/
486
487
	/* Keepalive-timer has been started already, but it is handled at the
488
	 * subflow level.
489
	 */
490
	if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
491
		inet_csk_delete_keepalive_timer(meta_sk);
492
		inet_csk_reset_keepalive_timer(master_sk, keepalive_time_when(meta_tp));
493
	}
494
495
	/****** DEFER_ACCEPT-handler ******/
496
497
	/* DEFER_ACCEPT is not of concern for new subflows - we always accept
498
	 * them
499
	 */
500
	inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
501
}
502
503
static void mptcp_sub_inherit_sockopts(struct sock *meta_sk, struct sock *sub_sk)
504
{
505
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
506
	/* Keepalive is handled at the subflow-level */
507
	if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
508
		inet_csk_reset_keepalive_timer(sub_sk, keepalive_time_when(meta_tp));
509
		sock_valbool_flag(sub_sk, SOCK_KEEPOPEN, keepalive_time_when(meta_tp));
510
	}
511
512
	/* IP_TOS also goes to the subflow. */
513
	if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
514
		inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
515
		sub_sk->sk_priority = meta_sk->sk_priority;
516
		sk_dst_reset(sub_sk);
517
	}
518
519
	/* Inherit SO_REUSEADDR */
520
	sub_sk->sk_reuse = meta_sk->sk_reuse;
521
}
522
523
int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
524
{
525
	/* skb->sk may be NULL if we receive a packet immediately after the
526
	 * SYN/ACK + MP_CAPABLE.
527
	 */
528
	struct sock *sk = skb->sk ? skb->sk : meta_sk;
529
	int ret = 0;
530
531
	if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
532
		kfree_skb(skb);
533
		return 0;
534
	}
535
536
	if (sk->sk_family == AF_INET)
537
		ret = tcp_v4_do_rcv(sk, skb);
538
#if IS_ENABLED(CONFIG_IPV6)
539
	else
540
		ret = tcp_v6_do_rcv(sk, skb);
541
#endif
542
543
	sock_put(sk);
544
	return ret;
545
}
546
547
struct lock_class_key meta_key;
548
struct lock_class_key meta_slock_key;
549
550
/* Code heavily inspired from sk_clone() */
551
static int mptcp_inherit_sk(const struct sock *sk, struct sock *newsk,
552
			    int family, const gfp_t flags)
553
{
554
	struct sk_filter *filter;
555
	struct proto *prot = newsk->sk_prot;
556
	const struct inet_connection_sock_af_ops *af_ops = inet_csk(newsk)->icsk_af_ops;
557
#ifdef CONFIG_SECURITY_NETWORK
558
	void *sptr = newsk->sk_security;
559
#endif
560
561
	if (sk->sk_family == AF_INET) {
562
		memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin));
563
		memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end,
564
		       sizeof(struct tcp_sock) - offsetof(struct sock, sk_dontcopy_end));
565
	} else {
566
		memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin));
567
		memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end,
568
		       sizeof(struct tcp6_sock) - offsetof(struct sock, sk_dontcopy_end));
569
	}
570
571
#ifdef CONFIG_SECURITY_NETWORK
572
	newsk->sk_security = sptr;
573
	security_sk_clone(sk, newsk);
574
#endif
575
576
	/* Has been changed by sock_copy above - we may need an IPv6-socket */
577
	newsk->sk_family = family;
578
	newsk->sk_prot = prot;
579
	newsk->sk_prot_creator = prot;
580
	inet_csk(newsk)->icsk_af_ops = af_ops;
581
582
	/* We don't yet have the mptcp pointer. Thus we still need inet_sock_destruct */
583
	newsk->sk_destruct = inet_sock_destruct;
584
585
	/* SANITY */
586
	get_net(sock_net(newsk));
587
	sk_node_init(&newsk->sk_node);
588
	sock_lock_init_class_and_name(newsk, "slock-AF_INET-MPTCP",
589
				      &meta_slock_key, "sk_lock-AF_INET-MPTCP",
590
				      &meta_key);
591
592
	/* Unlocks are in:
593
	 *
594
	 * 1. If we are creating the master-sk
595
	 *	* on client-side in tcp_rcv_state_process, "case TCP_SYN_SENT"
596
	 *	* on server-side in tcp_child_process
597
	 * 2. If we are creating another subsock
598
	 *	* Also in tcp_child_process
599
	 */
600
	bh_lock_sock(newsk);
601
	newsk->sk_backlog.head = NULL;
602
	newsk->sk_backlog.tail = NULL;
603
	newsk->sk_backlog.len = 0;
604
605
	atomic_set(&newsk->sk_rmem_alloc, 0);
606
	atomic_set(&newsk->sk_wmem_alloc, 1);
607
	atomic_set(&newsk->sk_omem_alloc, 0);
608
609
	skb_queue_head_init(&newsk->sk_receive_queue);
610
	skb_queue_head_init(&newsk->sk_write_queue);
611
#ifdef CONFIG_NET_DMA
612
	skb_queue_head_init(&newsk->sk_async_wait_queue);
613
#endif
614
615
	spin_lock_init(&newsk->sk_dst_lock);
616
	rwlock_init(&newsk->sk_callback_lock);
617
	lockdep_set_class_and_name(&newsk->sk_callback_lock,
618
				   af_callback_keys + newsk->sk_family,
619
				   af_family_clock_key_strings[newsk->sk_family]);
620
	newsk->sk_dst_cache	= NULL;
621
	newsk->sk_rx_dst	= NULL;
622
	newsk->sk_wmem_queued	= 0;
623
	newsk->sk_forward_alloc = 0;
624
	newsk->sk_send_head	= NULL;
625
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
626
627
	tcp_sk(newsk)->mptcp = NULL;
628
629
	sock_reset_flag(newsk, SOCK_DONE);
630
	skb_queue_head_init(&newsk->sk_error_queue);
631
632
	filter = rcu_dereference_protected(newsk->sk_filter, 1);
633
	if (filter != NULL)
634
		sk_filter_charge(newsk, filter);
635
636
	if (unlikely(xfrm_sk_clone_policy(newsk))) {
637
		/* It is still raw copy of parent, so invalidate
638
		 * destructor and make plain sk_free()
639
		 */
640
		newsk->sk_destruct = NULL;
641
		bh_unlock_sock(newsk);
642
		sk_free(newsk);
643
		newsk = NULL;
644
		return -ENOMEM;
645
	}
646
647
	newsk->sk_err	   = 0;
648
	newsk->sk_priority = 0;
649
	/* Before updating sk_refcnt, we must commit prior changes to memory
650
	 * (Documentation/RCU/rculist_nulls.txt for details)
651
	 */
652
	smp_wmb();
653
	atomic_set(&newsk->sk_refcnt, 2);
654
655
	/* Increment the counter in the same struct proto as the master
656
	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
657
	 * is the same as sk->sk_prot->socks, as this field was copied
658
	 * with memcpy).
659
	 *
660
	 * This _changes_ the previous behaviour, where
661
	 * tcp_create_openreq_child always was incrementing the
662
	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
663
	 * to be taken into account in all callers. -acme
664
	 */
665
	sk_refcnt_debug_inc(newsk);
666
	sk_set_socket(newsk, NULL);
667
	newsk->sk_wq = NULL;
668
669
	if (newsk->sk_prot->sockets_allocated)
670
		percpu_counter_inc(newsk->sk_prot->sockets_allocated);
671
672
	if (sock_flag(newsk, SOCK_TIMESTAMP) ||
673
	    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
674
		net_enable_timestamp();
675
676
	return 0;
677
}
678
679
int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
680
{
681
	struct mptcp_cb *mpcb;
682
	struct sock *master_sk;
683
	struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
684
	struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
685
	struct sk_buff *skb, *tmp;
686
	u64 idsn;
687
688
	master_sk = sk_prot_alloc(meta_sk->sk_prot, GFP_ATOMIC | __GFP_ZERO,
689
				  meta_sk->sk_family);
690
	if (!master_sk)
691
		return -ENOBUFS;
692
693
	master_tp = tcp_sk(master_sk);
694
	master_icsk = inet_csk(master_sk);
695
696
	/* Need to set this here - it is needed by mptcp_inherit_sk */
697
	master_sk->sk_prot = meta_sk->sk_prot;
698
	master_sk->sk_prot_creator = meta_sk->sk_prot;
699
	master_icsk->icsk_af_ops = meta_icsk->icsk_af_ops;
700
701
	mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
702
	if (!mpcb) {
703
		sk_free(master_sk);
704
		return -ENOBUFS;
705
	}
706
707
	/* master_sk inherits from meta_sk */
708
	if (mptcp_inherit_sk(meta_sk, master_sk, meta_sk->sk_family, GFP_ATOMIC)) {
709
		kmem_cache_free(mptcp_cb_cache, mpcb);
710
		return -ENOBUFS;
711
	}
712
713
#if IS_ENABLED(CONFIG_IPV6)
714
	if (meta_icsk->icsk_af_ops == &ipv6_mapped) {
715
		struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
716
717
		inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
718
719
		newnp = inet6_sk(master_sk);
720
		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
721
722
		newnp->ipv6_mc_list = NULL;
723
		newnp->ipv6_ac_list = NULL;
724
		newnp->ipv6_fl_list = NULL;
725
		newnp->opt = NULL;
726
		newnp->pktoptions = NULL;
727
		(void)xchg(&newnp->rxpmtu, NULL);
728
	} else if (meta_sk->sk_family == AF_INET6) {
729
		struct ipv6_pinfo *newnp;
730
731
		/* Meta is IPv4. Initialize pinet6 for the master-sk. */
732
		inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
733
734
		newnp = inet6_sk(master_sk);
735
736
		newnp->hop_limit	= -1;
737
		newnp->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
738
		newnp->mc_loop	= 1;
739
		newnp->pmtudisc	= IPV6_PMTUDISC_WANT;
740
		newnp->ipv6only	= sock_net(master_sk)->ipv6.sysctl.bindv6only;
741
	}
742
#endif
743
744
	meta_tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, GFP_ATOMIC);
745
	if (!meta_tp->mptcp) {
746
		kmem_cache_free(mptcp_cb_cache, mpcb);
747
		sk_free(master_sk);
748
		return -ENOBUFS;
749
	}
750
751
	/* Store the keys and generate the peer's token */
752
	mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
753
	mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
754
755
	/* Generate Initial data-sequence-numbers */
756
	mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
757
	idsn = ntohll(idsn) + 1;
758
	mpcb->snd_high_order[0] = idsn >> 32;
759
	mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
760
761
	meta_tp->write_seq = (u32)idsn;
762
	meta_tp->snd_sml = meta_tp->write_seq;
763
	meta_tp->snd_una = meta_tp->write_seq;
764
	meta_tp->snd_nxt = meta_tp->write_seq;
765
	meta_tp->pushed_seq = meta_tp->write_seq;
766
	meta_tp->snd_up = meta_tp->write_seq;
767
768
	mpcb->mptcp_rem_key = remote_key;
769
	mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
770
	idsn = ntohll(idsn) + 1;
771
	mpcb->rcv_high_order[0] = idsn >> 32;
772
	mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
773
	meta_tp->copied_seq = (u32) idsn;
774
	meta_tp->rcv_nxt = (u32) idsn;
775
	meta_tp->rcv_wup = (u32) idsn;
776
777
	meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
778
	meta_tp->snd_wnd = window;
779
	meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
780
781
	meta_tp->packets_out = 0;
782
	meta_tp->mptcp->snt_isn = meta_tp->write_seq; /* Initial data-sequence-number */
783
	meta_icsk->icsk_probes_out = 0;
784
785
	/* Set mptcp-pointers */
786
	master_tp->mpcb = mpcb;
787
	master_tp->meta_sk = meta_sk;
788
	meta_tp->mpcb = mpcb;
789
	meta_tp->meta_sk = meta_sk;
790
	mpcb->meta_sk = meta_sk;
791
	mpcb->master_sk = master_sk;
792
793
	meta_tp->mpc = 1;
794
	meta_tp->mptcp->attached = 0;
795
	meta_tp->was_meta_sk = 0;
796
797
	/* Initialize the queues */
798
	skb_queue_head_init(&mpcb->reinject_queue);
799
	skb_queue_head_init(&master_tp->out_of_order_queue);
800
	tcp_prequeue_init(master_tp);
801
	INIT_LIST_HEAD(&master_tp->tsq_node);
802
803
	master_tp->tsq_flags = 0;
804
805
	/* Copy the write-queue from the meta down to the master.
806
	 * This is necessary to get the SYN to the master-write-queue.
807
	 * No other data can be queued, before tcp_sendmsg waits for the
808
	 * connection to finish.
809
	 */
810
	skb_queue_walk_safe(&meta_sk->sk_write_queue, skb, tmp) {
811
		skb_unlink(skb, &meta_sk->sk_write_queue);
812
		skb_queue_tail(&master_sk->sk_write_queue, skb);
813
814
		master_sk->sk_wmem_queued += skb->truesize;
815
		sk_mem_charge(master_sk, skb->truesize);
816
	}
817
818
	meta_sk->sk_wmem_queued = 0;
819
	meta_sk->sk_forward_alloc = 0;
820
821
	mutex_init(&mpcb->mutex);
822
823
	/* Initialize workqueue-struct */
824
	INIT_WORK(&mpcb->subflow_work, mptcp_create_subflow_worker);
825
	INIT_DELAYED_WORK(&mpcb->subflow_retry_work, mptcp_retry_subflow_worker);
826
	INIT_WORK(&mpcb->address_work, mptcp_address_worker);
827
828
	/* Init the accept_queue structure, we support a queue of 32 pending
829
	 * connections, it does not need to be huge, since we only store  here
830
	 * pending subflow creations.
831
	 */
832
	if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
833
		inet_put_port(master_sk);
834
		kmem_cache_free(mptcp_sock_cache, meta_tp->mptcp);
835
		kmem_cache_free(mptcp_cb_cache, mpcb);
836
		sk_free(master_sk);
837
		meta_tp->mpc = 0;
838
		return -ENOMEM;
839
	}
840
841
	/* Redefine function-pointers as the meta-sk is now fully ready */
842
	meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
843
	meta_sk->sk_destruct = mptcp_sock_destruct;
844
	mpcb->syn_recv_sock = mptcp_syn_recv_sock;
845
846
	/* Meta-level retransmit timer */
847
	meta_icsk->icsk_rto *= 2; /* Double of initial - rto */
848
849
	tcp_init_xmit_timers(master_sk);
850
	/* Has been set for sending out the SYN */
851
	inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
852
853
	if (!meta_tp->inside_tk_table) {
854
		/* Adding the meta_tp in the token hashtable - coming from server-side */
855
		rcu_read_lock();
856
		spin_lock(&mptcp_tk_hashlock);
857
858
		__mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
859
860
		spin_unlock(&mptcp_tk_hashlock);
861
		rcu_read_unlock();
862
	}
863
	master_tp->inside_tk_table = 0;
864
865
	/* Init time-wait stuff */
866
	INIT_LIST_HEAD(&mpcb->tw_list);
867
	spin_lock_init(&mpcb->tw_lock);
868
869
	mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
870
871
	mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
872
	mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
873
	mpcb->orig_window_clamp = meta_tp->window_clamp;
874
875
	/* The meta is directly linked - set refcnt to 1 */
876
	atomic_set(&mpcb->refcnt, 1);
877
878
	mptcp_debug("%s: created mpcb with token %#x\n",
879
		    __func__, mpcb->mptcp_loc_token);
880
881
	return 0;
882
}
883
884
struct sock *mptcp_sk_clone(const struct sock *sk, int family,
885
			    const gfp_t priority)
886
{
887
	struct sock *newsk = NULL;
888
889
	if (family == AF_INET && sk->sk_family == AF_INET) {
890
		newsk = sk_prot_alloc(&tcp_prot, priority, family);
891
		if (!newsk)
892
			return NULL;
893
894
		/* Set these pointers - they are needed by mptcp_inherit_sk */
895
		newsk->sk_prot = &tcp_prot;
896
		newsk->sk_prot_creator = &tcp_prot;
897
		inet_csk(newsk)->icsk_af_ops = &ipv4_specific;
898
		newsk->sk_family = AF_INET;
899
	}
900
#if IS_ENABLED(CONFIG_IPV6)
901
	else {
902
		newsk = sk_prot_alloc(&tcpv6_prot, priority, family);
903
		if (!newsk)
904
			return NULL;
905
906
		newsk->sk_prot = &tcpv6_prot;
907
		newsk->sk_prot_creator = &tcpv6_prot;
908
		if (family == AF_INET)
909
			inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
910
		else
911
			inet_csk(newsk)->icsk_af_ops = &ipv6_specific;
912
		newsk->sk_family = AF_INET6;
913
	}
914
#endif
915
916
	if (mptcp_inherit_sk(sk, newsk, family, priority))
917
		return NULL;
918
919
	return newsk;
920
}
921
922
void mptcp_fallback_meta_sk(struct sock *meta_sk)
923
{
924
	kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
925
	kmem_cache_free(mptcp_sock_cache, tcp_sk(meta_sk)->mptcp);
926
	kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
927
}
928
929
int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 rem_id,
930
		   gfp_t flags)
931
{
932
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
933
	struct tcp_sock *tp = tcp_sk(sk);
934
935
	tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
936
	if (!tp->mptcp)
937
		return -ENOMEM;
938
939
	tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
940
	/* No more space for more subflows? */
941
	if (!tp->mptcp->path_index) {
942
		kmem_cache_free(mptcp_sock_cache, tp->mptcp);
943
		return -EPERM;
944
	}
945
946
	tp->mptcp->tp = tp;
947
	tp->mpcb = mpcb;
948
	tp->meta_sk = meta_sk;
949
	tp->mpc = 1;
950
	tp->mptcp->rem_id = rem_id;
951
	tp->mptcp->last_rbuf_opti = tcp_time_stamp;
952
953
	/* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
954
	 * included in mptcp_del_sock(), because the mpcb must remain alive
955
	 * until the last subsocket is completely destroyed.
956
	 */
957
	sock_hold(meta_sk);
958
	atomic_inc(&mpcb->refcnt);
959
960
	tp->mptcp->next = mpcb->connection_list;
961
	mpcb->connection_list = tp;
962
	tp->mptcp->attached = 1;
963
964
	mpcb->cnt_subflows++;
965
	atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
966
		   &meta_sk->sk_rmem_alloc);
967
968
	mptcp_sub_inherit_sockopts(meta_sk, sk);
969
	INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
970
971
	/* As we successfully allocated the mptcp_tcp_sock, we have to
972
	 * change the function-pointers here (for sk_destruct to work correctly)
973
	 */
974
	sk->sk_error_report = mptcp_sock_def_error_report;
975
	sk->sk_data_ready = mptcp_data_ready;
976
	sk->sk_write_space = mptcp_write_space;
977
	sk->sk_state_change = mptcp_set_state;
978
	sk->sk_destruct = mptcp_sock_destruct;
979
980
	if (sk->sk_family == AF_INET)
981
		mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
982
			    __func__ , mpcb->mptcp_loc_token,
983
			    tp->mptcp->path_index,
984
			    &((struct inet_sock *)tp)->inet_saddr,
985
			    ntohs(((struct inet_sock *)tp)->inet_sport),
986
			    &((struct inet_sock *)tp)->inet_daddr,
987
			    ntohs(((struct inet_sock *)tp)->inet_dport),
988
			    mpcb->cnt_subflows);
989
	else
990
		mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
991
			    __func__ , mpcb->mptcp_loc_token,
992
			    tp->mptcp->path_index, &inet6_sk(sk)->saddr,
993
			    ntohs(((struct inet_sock *)tp)->inet_sport),
994
			    &inet6_sk(sk)->daddr,
995
			    ntohs(((struct inet_sock *)tp)->inet_dport),
996
			    mpcb->cnt_subflows);
997
998
	return 0;
999
}
1000
1001
void mptcp_del_sock(struct sock *sk)
1002
{
1003
	struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
1004
	struct mptcp_cb *mpcb;
1005
1006
	if (!tp->mptcp || !tp->mptcp->attached)
1007
		return;
1008
1009
	mpcb = tp->mpcb;
1010
	tp_prev = mpcb->connection_list;
1011
1012
	mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
1013
		    __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
1014
		    sk->sk_state, is_meta_sk(sk));
1015
1016
	if (tp_prev == tp) {
1017
		mpcb->connection_list = tp->mptcp->next;
1018
	} else {
1019
		for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
1020
			if (tp_prev->mptcp->next == tp) {
1021
				tp_prev->mptcp->next = tp->mptcp->next;
1022
				break;
1023
			}
1024
		}
1025
	}
1026
	mpcb->cnt_subflows--;
1027
	if (tp->mptcp->establish_increased)
1028
		mpcb->cnt_established--;
1029
1030
	tp->mptcp->next = NULL;
1031
	tp->mptcp->attached = 0;
1032
	mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
1033
1034
	if (!skb_queue_empty(&sk->sk_write_queue))
1035
		mptcp_reinject_data(sk, 0);
1036
1037
	if (is_master_tp(tp))
1038
		mpcb->master_sk = NULL;
1039
	else if (tp->mptcp->pre_established)
1040
		sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
1041
1042
	rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
1043
}
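
For illustration: mpcb->connection_list is a singly-linked list chained through tp->mptcp->next, and removal simply walks it and unlinks the matching entry. A free-standing sketch of that unlink step, with hypothetical struct and function names:

#include <stddef.h>

struct subflow {
	struct subflow *next;
	int id;
};

/* Unlink @tp from the singly-linked list headed at *head, as
 * mptcp_del_sock() does for mpcb->connection_list.
 */
static void subflow_unlink(struct subflow **head, struct subflow *tp)
{
	struct subflow *prev = *head;

	if (prev == tp) {
		*head = tp->next;
	} else {
		for (; prev && prev->next; prev = prev->next) {
			if (prev->next == tp) {
				prev->next = tp->next;
				break;
			}
		}
	}
	tp->next = NULL;
}
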
1044
1045
/* Updates the metasocket ULID/port data, based on the given sock.
1046
 * The argument sock must be the sock accessible to the application.
1047
 * In this function, we update the meta socket info, based on the changes
1048
 * in the application socket (bind, address allocation, ...)
1049
 */
1050
void mptcp_update_metasocket(struct sock *sk, struct sock *meta_sk)
1051
{
1052
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
1053
1054
	switch (sk->sk_family) {
1055
#if IS_ENABLED(CONFIG_IPV6)
1056
	case AF_INET6:
1057
		/* If the socket is v4 mapped, we continue with v4 operations */
1058
		if (!mptcp_v6_is_v4_mapped(sk)) {
1059
			mpcb->locaddr6[0].addr = inet6_sk(sk)->saddr;
1060
			mpcb->locaddr6[0].id = 0;
1061
			mpcb->locaddr6[0].port = 0;
1062
			mpcb->locaddr6[0].low_prio = 0;
1063
			mpcb->loc6_bits |= 1;
1064
			mpcb->next_v6_index = 1;
1065
1066
			mptcp_v6_add_raddress(mpcb,
1067
					      &inet6_sk(sk)->daddr, 0, 0);
1068
			mptcp_v6_set_init_addr_bit(mpcb, &inet6_sk(sk)->daddr);
1069
			break;
1070
		}
1071
#endif
1072
	case AF_INET:
1073
		mpcb->locaddr4[0].addr.s_addr = inet_sk(sk)->inet_saddr;
1074
		mpcb->locaddr4[0].id = 0;
1075
		mpcb->locaddr4[0].port = 0;
1076
		mpcb->locaddr4[0].low_prio = 0;
1077
		mpcb->loc4_bits |= 1;
1078
		mpcb->next_v4_index = 1;
1079
1080
		mptcp_v4_add_raddress(mpcb,
1081
				      (struct in_addr *)&inet_sk(sk)->inet_daddr,
1082
				      0, 0);
1083
		mptcp_v4_set_init_addr_bit(mpcb, inet_sk(sk)->inet_daddr);
1084
		break;
1085
	}
1086
1087
	mptcp_set_addresses(meta_sk);
1088
1089
	switch (sk->sk_family) {
1090
	case AF_INET:
1091
		tcp_sk(sk)->mptcp->low_prio = mpcb->locaddr4[0].low_prio;
1092
		break;
1093
#if IS_ENABLED(CONFIG_IPV6)
1094
	case AF_INET6:
1095
		tcp_sk(sk)->mptcp->low_prio = mpcb->locaddr6[0].low_prio;
1096
		break;
1097
#endif
1098
	}
1099
1100
	tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
1101
}
1102
1103
/* Clean up the receive buffer for full frames taken by the user,
1104
 * then send an ACK if necessary.  COPIED is the number of bytes
1105
 * tcp_recvmsg has given to the user so far, it speeds up the
1106
 * calculation of whether or not we must ACK for the sake of
1107
 * a window update.
1108
 */
1109
void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
1110
{
1111
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1112
	struct sock *sk;
1113
	__u32 rcv_window_now = 0;
1114
1115
	if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
1116
		rcv_window_now = tcp_receive_window(meta_tp);
1117
1118
		if (2 * rcv_window_now > meta_tp->window_clamp)
1119
			rcv_window_now = 0;
1120
	}
1121
1122
	mptcp_for_each_sk(meta_tp->mpcb, sk) {
1123
		struct tcp_sock *tp = tcp_sk(sk);
1124
		const struct inet_connection_sock *icsk = inet_csk(sk);
1125
1126
		if (!mptcp_sk_can_send_ack(sk))
1127
			continue;
1128
1129
		if (!inet_csk_ack_scheduled(sk))
1130
			goto second_part;
1131
		/* Delayed ACKs frequently hit locked sockets during bulk
1132
		 * receive.
1133
		 */
1134
		if (icsk->icsk_ack.blocked ||
1135
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1136
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1137
		    /* If this read emptied read buffer, we send ACK, if
1138
		     * connection is not bidirectional, user drained
1139
		     * receive buffer and there was a small segment
1140
		     * in queue.
1141
		     */
1142
		    (copied > 0 &&
1143
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1144
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1145
		       !icsk->icsk_ack.pingpong)) &&
1146
		     !atomic_read(&meta_sk->sk_rmem_alloc))) {
1147
			tcp_send_ack(sk);
1148
			continue;
1149
		}
1150
1151
second_part:
1152
		/* This here is the second part of tcp_cleanup_rbuf */
1153
		if (rcv_window_now) {
1154
			__u32 new_window = __tcp_select_window(sk);
1155
1156
			/* Send ACK now, if this read freed lots of space
1157
			 * in our buffer. Certainly, new_window is new window.
1158
			 * We can advertise it now, if it is not less than
1159
			 * current one.
1160
			 * "Lots" means "at least twice" here.
1161
			 */
1162
			if (new_window && new_window >= 2 * rcv_window_now)
1163
				tcp_send_ack(sk);
1164
		}
1165
	}
1166
}
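
The second part above mirrors tcp_cleanup_rbuf(): a pure window update is only worth an ACK when the newly selectable window is at least double the one currently offered. A free-standing sketch of that test (hypothetical helper, plain unsigned ints instead of socket state):

#include <stdbool.h>

/* Advertise a new window only if it at least doubles the one
 * currently offered to the peer.
 */
static bool worth_sending_window_update(unsigned int rcv_window_now,
					unsigned int new_window)
{
	return new_window && new_window >= 2 * rcv_window_now;
}
/* Example: rcv_window_now = 8192, new_window = 20000 -> true;
 *          rcv_window_now = 8192, new_window = 12000 -> false.
 */
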
1167
1168
static int mptcp_sub_send_fin(struct sock *sk)
1169
{
1170
	struct tcp_sock *tp = tcp_sk(sk);
1171
	struct sk_buff *skb = tcp_write_queue_tail(sk);
1172
	int mss_now;
1173
1174
	/* Optimization, tack on the FIN if we have a queue of
1175
	 * unsent frames.  But be careful about outgoing SACKS
1176
	 * and IP options.
1177
	 */
1178
	mss_now = tcp_current_mss(sk);
1179
1180
	if (tcp_send_head(sk) != NULL) {
1181
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
1182
		TCP_SKB_CB(skb)->end_seq++;
1183
		tp->write_seq++;
1184
	} else {
1185
		skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
1186
		if (!skb)
1187
			return 1;
1188
1189
		/* Reserve space for headers and prepare control bits. */
1190
		skb_reserve(skb, MAX_TCP_HEADER);
1191
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
1192
		tcp_init_nondata_skb(skb, tp->write_seq,
1193
				     TCPHDR_ACK | TCPHDR_FIN);
1194
		tcp_queue_skb(sk, skb);
1195
	}
1196
	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
1197
1198
	return 0;
1199
}
1200
1201
void mptcp_sub_close_wq(struct work_struct *work)
1202
{
1203
	struct mptcp_tcp_sock *mptcp = container_of(work, struct mptcp_tcp_sock, work.work);
1204
	struct tcp_sock *tp = mptcp->tp;
1205
	struct sock *sk = (struct sock *)tp;
1206
	struct sock *meta_sk = mptcp_meta_sk(sk);
1207
1208
	mutex_lock(&tp->mpcb->mutex);
1209
	lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
1210
1211
	if (sock_flag(sk, SOCK_DEAD))
1212
		goto exit;
1213
1214
	/* We come from tcp_disconnect. We are sure that meta_sk is set */
1215
	if (!tp->mpc) {
1216
		tp->closing = 1;
1217
		sock_rps_reset_flow(sk);
1218
		tcp_close(sk, 0);
1219
		goto exit;
1220
	}
1221
1222
	if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
1223
		tp->closing = 1;
1224
		sock_rps_reset_flow(sk);
1225
		tcp_close(sk, 0);
1226
	} else if (tcp_close_state(sk)) {
1227
		sk->sk_shutdown |= SEND_SHUTDOWN;
1228
		tcp_send_fin(sk);
1229
	}
1230
1231
exit:
1232
	release_sock(meta_sk);
1233
	mutex_unlock(&tp->mpcb->mutex);
1234
	sock_put(sk);
1235
}
1236
1237
void mptcp_sub_close(struct sock *sk, unsigned long delay)
1238
{
1239
	struct tcp_sock *tp = tcp_sk(sk);
1240
	struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
1241
1242
	/* We are already closing - e.g., call from sock_def_error_report upon
1243
	 * tcp_disconnect in tcp_close.
1244
	 */
1245
	if (tp->closing)
1246
		return;
1247
1248
	/* Work already scheduled? */
1249
	if (work_pending(&work->work)) {
1250
		/* Work present - who will be first? */
1251
		if (jiffies + delay > work->timer.expires)
1252
			return;
1253
1254
		/* Try canceling - if it fails, work will be executed soon */
1255
		if (!cancel_delayed_work(work))
1256
			return;
1257
		sock_put(sk);
1258
	}
1259
1260
	if (!delay) {
1261
		unsigned char old_state = sk->sk_state;
1262
1263
		/* If we are in user-context we can directly do the closing
1264
		 * procedure. No need to schedule a work-queue.
1265
		 */
1266
		if (!in_softirq()) {
1267
			if (sock_flag(sk, SOCK_DEAD))
1268
				return;
1269
1270
			if (!tp->mpc) {
1271
				tp->closing = 1;
1272
				sock_rps_reset_flow(sk);
1273
				tcp_close(sk, 0);
1274
				return;
1275
			}
1276
1277
			if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
1278
			    sk->sk_state == TCP_CLOSE) {
1279
				tp->closing = 1;
1280
				sock_rps_reset_flow(sk);
1281
				tcp_close(sk, 0);
1282
			} else if (tcp_close_state(sk)) {
1283
				sk->sk_shutdown |= SEND_SHUTDOWN;
1284
				tcp_send_fin(sk);
1285
			}
1286
1287
			return;
1288
		}
1289
1290
		/* We directly send the FIN, because it may take a long time
1291
		 * until the work-queue gets scheduled...
1292
		 *
1293
		 * If mptcp_sub_send_fin returns 1, it failed and thus we reset
1294
		 * the old state so that tcp_close will finally send the fin
1295
		 * in user-context.
1296
		 */
1297
		if (!sk->sk_err && old_state != TCP_CLOSE &&
1298
		    tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
1299
			if (old_state == TCP_ESTABLISHED)
1300
				TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1301
			sk->sk_state = old_state;
1302
		}
1303
	}
1304
1305
	sock_hold(sk);
1306
	queue_delayed_work(mptcp_wq, work, delay);
1307
}
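
The rescheduling rule above is earliest-deadline-wins: a pending delayed close is only replaced when the new deadline would fire sooner. A minimal sketch with plain integers standing in for jiffies and the delayed work (hypothetical helper):

#include <stdbool.h>

/* Return true if a close scheduled for now + delay should replace a
 * close already pending at @pending_expiry (earliest deadline wins).
 */
static bool should_reschedule(unsigned long now, unsigned long delay,
			      bool work_pending, unsigned long pending_expiry)
{
	if (!work_pending)
		return true;
	return now + delay <= pending_expiry;
}
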
1308
1309
/* Update the mpcb send window, based on the contributions
1310
 * of each subflow
1311
 */
1312
void mptcp_update_sndbuf(struct mptcp_cb *mpcb)
1313
{
1314
	struct sock *meta_sk = mpcb->meta_sk, *sk;
1315
	int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
1316
	mptcp_for_each_sk(mpcb, sk) {
1317
		if (!mptcp_sk_can_send(sk))
1318
			continue;
1319
1320
		new_sndbuf += sk->sk_sndbuf;
1321
1322
		if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
1323
			new_sndbuf = sysctl_tcp_wmem[2];
1324
			break;
1325
		}
1326
	}
1327
	meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
1328
1329
	/* The subflow's call to sk_write_space in tcp_new_space ends up in
1330
	 * mptcp_write_space.
1331
	 * It has nothing to do with waking up the application.
1332
	 * So, we do it here.
1333
	 */
1334
	if (old_sndbuf != meta_sk->sk_sndbuf)
1335
		meta_sk->sk_write_space(meta_sk);
1336
}
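
mptcp_update_sndbuf() sums the subflows' send buffers, clamps the result at the tcp_wmem maximum and never shrinks the meta buffer. The same logic as a free-standing sketch, with an array in place of the subflow list and wmem_max standing in for sysctl_tcp_wmem[2]:

/* Sum per-subflow send buffers, clamp to wmem_max and never shrink. */
static int aggregate_sndbuf(const int *sub_sndbuf, int n_subflows,
			    int cur_meta_sndbuf, int wmem_max)
{
	int new_sndbuf = 0, i;

	for (i = 0; i < n_subflows; i++) {
		new_sndbuf += sub_sndbuf[i];
		if (new_sndbuf > wmem_max || new_sndbuf < 0) { /* overflow guard */
			new_sndbuf = wmem_max;
			break;
		}
	}
	return new_sndbuf > cur_meta_sndbuf ? new_sndbuf : cur_meta_sndbuf;
}
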
1337
1338
void mptcp_close(struct sock *meta_sk, long timeout)
1339
{
1340
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1341
	struct sock *sk_it, *tmpsk;
1342
	struct mptcp_cb *mpcb = meta_tp->mpcb;
1343
	struct sk_buff *skb;
1344
	int data_was_unread = 0;
1345
	int state;
1346
1347
	mptcp_debug("%s: Close of meta_sk with tok %#x\n",
1348
		    __func__, mpcb->mptcp_loc_token);
1349
1350
	mutex_lock(&mpcb->mutex);
1351
	lock_sock(meta_sk);
1352
1353
	if (meta_tp->inside_tk_table) {
1354
		/* Detach the mpcb from the token hashtable */
1355
		mptcp_hash_remove_bh(meta_tp);
1356
		reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
1357
	}
1358
1359
	meta_sk->sk_shutdown = SHUTDOWN_MASK;
1360
	/* We need to flush the recv. buffs.  We do this only on the
1361
	 * descriptor close, not protocol-sourced closes, because the
1362
	 * reader process may not have drained the data yet!
1363
	 */
1364
	while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
1365
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1366
			  tcp_hdr(skb)->fin;
1367
		data_was_unread += len;
1368
		__kfree_skb(skb);
1369
	}
1370
1371
	sk_mem_reclaim(meta_sk);
1372
1373
	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
1374
	if (meta_sk->sk_state == TCP_CLOSE) {
1375
		mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
1376
			mptcp_sub_close(sk_it, 0);
1377
		goto adjudge_to_death;
1378
	}
1379
1380
	if (data_was_unread) {
1381
		/* Unread data was tossed, zap the connection. */
1382
		NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
1383
		tcp_set_state(meta_sk, TCP_CLOSE);
1384
		tcp_send_active_reset(meta_sk, meta_sk->sk_allocation);
1385
	} else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
1386
		/* Check zero linger _after_ checking for unread data. */
1387
		meta_sk->sk_prot->disconnect(meta_sk, 0);
1388
		NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
1389
	} else if (tcp_close_state(meta_sk)) {
1390
		mptcp_send_fin(meta_sk);
1391
	} else if (meta_tp->snd_una == meta_tp->write_seq) {
1392
		/* The DATA_FIN has been sent and acknowledged
1393
		 * (e.g., by sk_shutdown). Close all the other subflows
1394
		 */
1395
		mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
1396
			unsigned long delay = 0;
1397
			/* If we are the passive closer, don't trigger
1398
			 * subflow-fin until the subflow has been finned
1399
			 * by the peer. - thus we add a delay
1400
			 */
1401
			if (mpcb->passive_close &&
1402
			    sk_it->sk_state == TCP_ESTABLISHED)
1403
				delay = inet_csk(sk_it)->icsk_rto << 3;
1404
1405
			mptcp_sub_close(sk_it, delay);
1406
		}
1407
	}
1408
1409
	sk_stream_wait_close(meta_sk, timeout);
1410
1411
adjudge_to_death:
1412
	state = meta_sk->sk_state;
1413
	sock_hold(meta_sk);
1414
	sock_orphan(meta_sk);
1415
1416
	/* socket will be freed after mptcp_close - we have to prevent
1417
	 * access from the subflows.
1418
	 */
1419
	mptcp_for_each_sk(mpcb, sk_it) {
1420
		/* Similar to sock_orphan, but we don't set it DEAD, because
1421
		 * the callbacks are still set and must be called.
1422
		 */
1423
		write_lock_bh(&sk_it->sk_callback_lock);
1424
		sk_set_socket(sk_it, NULL);
1425
		sk_it->sk_wq  = NULL;
1426
		write_unlock_bh(&sk_it->sk_callback_lock);
1427
	}
1428
1429
	/* It is the last release_sock in its life. It will remove backlog. */
1430
	release_sock(meta_sk);
1431
1432
	/* Now socket is owned by kernel and we acquire BH lock
1433
	 * to finish close. No need to check for user refs.
1434
	 */
1435
	local_bh_disable();
1436
	bh_lock_sock(meta_sk);
1437
	WARN_ON(sock_owned_by_user(meta_sk));
1438
1439
	percpu_counter_inc(meta_sk->sk_prot->orphan_count);
1440
1441
	/* Have we already been destroyed by a softirq or backlog? */
1442
	if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
1443
		goto out;
1444
1445
	/*	This is a (useful) BSD violating of the RFC. There is a
1446
	 *	problem with TCP as specified in that the other end could
1447
	 *	keep a socket open forever with no application left this end.
1448
	 *	We use a 3 minute timeout (about the same as BSD) then kill
1449
	 *	our end. If they send after that then tough - BUT: long enough
1450
	 *	that we won't make the old 4*rto = almost no time - whoops
1451
	 *	reset mistake.
1452
	 *
1453
	 *	Nope, it was not mistake. It is really desired behaviour
1454
	 *	f.e. on http servers, when such sockets are useless, but
1455
	 *	consume significant resources. Let's do it with special
1456
	 *	linger2	option.					--ANK
1457
	 */
1458
1459
	if (meta_sk->sk_state == TCP_FIN_WAIT2) {
1460
		if (meta_tp->linger2 < 0) {
1461
			tcp_set_state(meta_sk, TCP_CLOSE);
1462
			tcp_send_active_reset(meta_sk, GFP_ATOMIC);
1463
			NET_INC_STATS_BH(sock_net(meta_sk),
1464
					 LINUX_MIB_TCPABORTONLINGER);
1465
		} else {
1466
			const int tmo = tcp_fin_time(meta_sk);
1467
1468
			if (tmo > TCP_TIMEWAIT_LEN) {
1469
				inet_csk_reset_keepalive_timer(meta_sk,
1470
							       tmo - TCP_TIMEWAIT_LEN);
1471
			} else {
1472
				tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
1473
				goto out;
1474
			}
1475
		}
1476
	}
1477
	if (meta_sk->sk_state != TCP_CLOSE) {
1478
		sk_mem_reclaim(meta_sk);
1479
		if (tcp_too_many_orphans(meta_sk, 0)) {
1480
			if (net_ratelimit())
1481
				pr_info("MPTCP: too many of orphaned sockets\n");
1482
			tcp_set_state(meta_sk, TCP_CLOSE);
1483
			tcp_send_active_reset(meta_sk, GFP_ATOMIC);
1484
			NET_INC_STATS_BH(sock_net(meta_sk),
1485
					 LINUX_MIB_TCPABORTONMEMORY);
1486
		}
1487
	}
1488
1489
1490
	if (meta_sk->sk_state == TCP_CLOSE)
1491
		inet_csk_destroy_sock(meta_sk);
1492
	/* Otherwise, socket is reprieved until protocol close. */
1493
1494
out:
1495
	bh_unlock_sock(meta_sk);
1496
	local_bh_enable();
1497
	mutex_unlock(&mpcb->mutex);
1498
	sock_put(meta_sk); /* Taken by sock_hold */
1499
}
1500
1501
/* Returns 1 if we should enable MPTCP for that socket. */
1502
int mptcp_doit(struct sock *sk)
1503
{
1504
	/* Do not allow MPTCP enabling if the MPTCP initialization failed */
1505
	if (mptcp_init_failed)
1506
		return 0;
1507
1508
	/* Socket may already be established (e.g., called from tcp_recvmsg) */
1509
	if (tcp_sk(sk)->mpc || tcp_sk(sk)->request_mptcp)
1510
		return 1;
1511
1512
	if (!sysctl_mptcp_enabled)
1513
		return 0;
1514
1515
	/* Don't do mptcp over loopback or local addresses */
1516
	if (sk->sk_family == AF_INET &&
1517
	    (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
1518
	     ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
1519
		return 0;
1520
	if (sk->sk_family == AF_INET6 &&
1521
	    (ipv6_addr_loopback(&inet6_sk(sk)->daddr) ||
1522
	     ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
1523
		return 0;
1524
	if (mptcp_v6_is_v4_mapped(sk) &&
1525
	    ipv4_is_loopback(inet_sk(sk)->inet_saddr))
1526
		return 0;
1527
1528
	return 1;
1529
}
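
ipv4_is_loopback(), used above, only checks for the 127.0.0.0/8 prefix. A minimal userspace equivalent (hypothetical helper name):

#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

/* True for addresses in 127.0.0.0/8, like ipv4_is_loopback(). */
static bool is_ipv4_loopback(uint32_t addr_be)	/* network byte order */
{
	return (ntohl(addr_be) & 0xff000000u) == 0x7f000000u;
}
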
1530
1531
int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
1532
{
1533
	struct tcp_sock *master_tp;
1534
	struct sock *master_sk;
1535
1536
	if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
1537
		goto err_alloc_mpcb;
1538
1539
	master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
1540
	master_tp = tcp_sk(master_sk);
1541
1542
	if (mptcp_add_sock(meta_sk, master_sk, 0, GFP_ATOMIC))
1543
		goto err_add_sock;
1544
1545
	if (__inet_inherit_port(meta_sk, master_sk) < 0)
1546
		goto err_add_sock;
1547
1548
	meta_sk->sk_prot->unhash(meta_sk);
1549
1550
	if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
1551
		__inet_hash_nolisten(master_sk, NULL);
1552
#if IS_ENABLED(CONFIG_IPV6)
1553
	else
1554
		__inet6_hash(master_sk, NULL);
1555
#endif
1556
1557
	master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
1558
1559
	return 0;
1560
1561
err_add_sock:
1562
	mptcp_fallback_meta_sk(meta_sk);
1563
1564
	inet_csk_prepare_forced_close(master_sk);
1565
	tcp_done(master_sk);
1566
	inet_csk_prepare_forced_close(meta_sk);
1567
	tcp_done(meta_sk);
1568
1569
err_alloc_mpcb:
1570
	return -ENOBUFS;
1571
}
1572
1573
int mptcp_check_req_master(struct sock *sk, struct sock *child,
1574
			   struct request_sock *req,
1575
			   struct request_sock **prev,
1576
			   struct mptcp_options_received *mopt)
1577
{
1578
	struct tcp_sock *child_tp = tcp_sk(child);
1579
	struct sock *meta_sk = child;
1580
	struct mptcp_cb *mpcb;
1581
	struct mptcp_request_sock *mtreq;
1582
1583
	if (!tcp_rsk(req)->saw_mpc)
1584
		return 1;
1585
1586
	/* Just set these values to pass them to mptcp_alloc_mpcb */
1587
	mtreq = mptcp_rsk(req);
1588
	child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
1589
	child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
1590
1591
	if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
1592
				   child_tp->snd_wnd))
1593
		return -ENOBUFS;
1594
1595
	child = tcp_sk(child)->mpcb->master_sk;
1596
	child_tp = tcp_sk(child);
1597
	mpcb = child_tp->mpcb;
1598
1599
	child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
1600
	child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
1601
1602
	mpcb->dss_csum = mtreq->dss_csum;
1603
	mpcb->server_side = 1;
1604
1605
	/* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
1606
	mptcp_update_metasocket(child, meta_sk);
1607
1608
	/* Needs to be done here additionally, because when accepting a
1609
	 * new connection we pass by __reqsk_free and not reqsk_free.
1610
	 */
1611
	mptcp_reqsk_remove_tk(req);
1612
1613
	 /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
1614
	sock_put(meta_sk);
1615
1616
	inet_csk_reqsk_queue_unlink(sk, req, prev);
1617
	inet_csk_reqsk_queue_removed(sk, req);
1618
	inet_csk_reqsk_queue_add(sk, req, meta_sk);
1619
1620
	return 0;
1621
}
1622
1623
struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
1624
				   struct request_sock *req,
1625
				   struct request_sock **prev,
1626
				   struct mptcp_options_received *mopt)
1627
{
1628
	struct tcp_sock *child_tp = tcp_sk(child);
1629
	struct mptcp_request_sock *mtreq = mptcp_rsk(req);
1630
	struct mptcp_cb *mpcb = mtreq->mpcb;
1631
	u8 hash_mac_check[20];
1632
1633
	child_tp->inside_tk_table = 0;
1634
1635
	if (!mopt->join_ack)
1636
		goto teardown;
1637
1638
	mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
1639
			(u8 *)&mpcb->mptcp_loc_key,
1640
			(u8 *)&mtreq->mptcp_rem_nonce,
1641
			(u8 *)&mtreq->mptcp_loc_nonce,
1642
			(u32 *)hash_mac_check);
1643
1644
	if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20))
1645
		goto teardown;
1646
1647
	/* Point it to the same struct socket and wq as the meta_sk */
1648
	sk_set_socket(child, meta_sk->sk_socket);
1649
	child->sk_wq = meta_sk->sk_wq;
1650
1651
	if (mptcp_add_sock(meta_sk, child, mtreq->rem_id, GFP_ATOMIC)) {
1652
		child_tp->mpc = 0; /* Has been inherited, but now
1653
				    * child_tp->mptcp is NULL
1654
				    */
1655
		/* TODO when we support acking the third ack for new subflows,
1656
		 * we should silently discard this third ack, by returning NULL.
1657
		 *
1658
		 * Maybe, at the retransmission we will have enough memory to
1659
		 * fully add the socket to the meta-sk.
1660
		 */
1661
		goto teardown;
1662
	}
1663
1664
	/* The child is a clone of the meta socket, we must now reset
1665
	 * some of the fields
1666
	 */
1667
	child_tp->mptcp->rcv_low_prio = mtreq->low_prio;
1668
1669
	/* We should allow proper increase of the snd/rcv-buffers. Thus, we
1670
	 * use the original values instead of the bloated up ones from the
1671
	 * clone.
1672
	 */
1673
	child->sk_sndbuf = mpcb->orig_sk_sndbuf;
1674
	child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
1675
1676
	child_tp->mptcp->slave_sk = 1;
1677
	child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
1678
	child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
1679
	child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
1680
1681
	child_tp->tsq_flags = 0;
1682
1683
	/* Subflows do not use the accept queue, as they
1684
	 * are attached immediately to the mpcb.
1685
	 */
1686
	inet_csk_reqsk_queue_drop(meta_sk, req, prev);
1687
	return child;
1688
1689
teardown:
1690
	/* Drop this request - sock creation failed. */
1691
	inet_csk_reqsk_queue_drop(meta_sk, req, prev);
1692
	inet_csk_prepare_forced_close(child);
1693
	tcp_done(child);
1694
	return meta_sk;
1695
}
1696
1697
int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw)
1698
{
1699
	struct mptcp_tw *mptw;
1700
	struct tcp_sock *tp = tcp_sk(sk);
1701
	struct mptcp_cb *mpcb = tp->mpcb;
1702
1703
	/* Alloc MPTCP-tw-sock */
1704
	mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
1705
	if (!mptw)
1706
		return -ENOBUFS;
1707
1708
	atomic_inc(&mpcb->refcnt);
1709
1710
	tw->mptcp_tw = mptw;
1711
	mptw->loc_key = mpcb->mptcp_loc_key;
1712
	mptw->meta_tw = mpcb->in_time_wait;
1713
	if (mptw->meta_tw) {
1714
		mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
1715
		if (mpcb->mptw_state != TCP_TIME_WAIT)
1716
			mptw->rcv_nxt++;
1717
	}
1718
	rcu_assign_pointer(mptw->mpcb, mpcb);
1719
1720
	spin_lock(&mpcb->tw_lock);
1721
	list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
1722
	mptw->in_list = 1;
1723
	spin_unlock(&mpcb->tw_lock);
1724
1725
	return 0;
1726
}
1727
1728
void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
1729
{
1730
	struct mptcp_cb *mpcb;
1731
1732
	rcu_read_lock();
1733
	mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
1734
1735
	/* If we are still holding a ref to the mpcb, we have to remove ourself
1736
	 * from the list and drop the ref properly.
1737
	 */
1738
	if (mpcb && atomic_inc_not_zero(&mpcb->refcnt)) {
1739
		spin_lock(&mpcb->tw_lock);
1740
		if (tw->mptcp_tw->in_list) {
1741
			list_del_rcu(&tw->mptcp_tw->list);
1742
			tw->mptcp_tw->in_list = 0;
1743
		}
1744
		spin_unlock(&mpcb->tw_lock);
1745
1746
		/* Twice, because we increased it above */
1747
		mptcp_mpcb_put(mpcb);
1748
		mptcp_mpcb_put(mpcb);
1749
	}
1750
1751
	rcu_read_unlock();
1752
1753
	kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
1754
}
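
The atomic_inc_not_zero() call above is the usual "take a reference only while the object is still live" idiom. A self-contained sketch of the same idiom with C11 atomics (illustrative only, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

/* Grab a reference only if the count has not already dropped to zero,
 * mirroring atomic_inc_not_zero() in the destructor above.
 */
static bool get_ref_not_zero(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;	/* reference taken */
	}
	return false;			/* object already dying */
}
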
1755
1756
/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
1757
 * data-fin.
1758
 */
1759
void mptcp_update_tw_socks(const struct tcp_sock *tp, int state)
1760
{
1761
	struct mptcp_tw *mptw;
1762
1763
	/* Used for sockets that go into tw after the meta
1764
	 * (see mptcp_time_wait())
1765
	 */
1766
	tp->mpcb->in_time_wait = 1;
1767
	tp->mpcb->mptw_state = state;
1768
1769
	/* Update the time-wait-sock's information */
1770
	rcu_read_lock_bh();
1771
	list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
1772
		mptw->meta_tw = 1;
1773
		mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
1774
1775
		/* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
1776
		 * pretend as if the DATA_FIN has already reached us, that way
1777
		 * the checks in tcp_timewait_state_process will pass when the
1778
		 * DATA_FIN comes in.
1779
		 */
1780
		if (state != TCP_TIME_WAIT)
1781
			mptw->rcv_nxt++;
1782
	}
1783
	rcu_read_unlock_bh();
1784
}
1785
1786
void mptcp_tsq_flags(struct sock *sk)
1787
{
1788
	struct tcp_sock *tp = tcp_sk(sk);
1789
	struct sock *meta_sk = mptcp_meta_sk(sk);
1790
1791
	/* It will be handled as a regular deferred-call */
1792
	if (is_meta_sk(sk))
1793
		return;
1794
1795
	if (!tp->mptcp->next_cb) {
1796
		tp->mptcp->next_cb = tp->mpcb->callback_list;
1797
		tp->mpcb->callback_list = sk;
1798
		/* We need to hold it here, as the sock_hold is not assured
1799
		 * by the release_sock as it is done in regular TCP.
1800
		 *
1801
		 * The subsocket may get inet_csk_destroy'd while it is inside
1802
		 * the callback_list.
1803
		 */
1804
		sock_hold(sk);
1805
	}
1806
1807
	if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags))
1808
		sock_hold(meta_sk);
1809
}
1810
1811
void mptcp_tsq_sub_deferred(struct sock *meta_sk)
1812
{
1813
	struct sock *sk;
1814
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1815
1816
	BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
1817
1818
	__sock_put(meta_sk);
1819
	while ((sk = meta_tp->mpcb->callback_list) != NULL) {
1820
		meta_tp->mpcb->callback_list = tcp_sk(sk)->mptcp->next_cb;
1821
		tcp_sk(sk)->mptcp->next_cb = NULL;
1822
		sk->sk_prot->release_cb(sk);
1823
		/* Final sock_put (cf. mptcp_tsq_flags()) */
1824
		sock_put(sk);
1825
	}
1826
}
1827
1828
struct workqueue_struct *mptcp_wq;
1829
1830
/* General initialization of mptcp */
1831
void __init mptcp_init(void)
1832
{
1833
#ifdef CONFIG_SYSCTL
1834
	struct ctl_table_header *mptcp_sysctl;
1835
#endif
1836
1837
	mptcp_sock_cache = kmem_cache_create("mptcp_sock",
1838
					     sizeof(struct mptcp_tcp_sock),
1839
					     0, SLAB_HWCACHE_ALIGN,
1840
					     NULL);
1841
	if (!mptcp_sock_cache)
1842
		goto mptcp_sock_cache_failed;
1843
1844
	mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
1845
					   0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
1846
					   NULL);
1847
	if (!mptcp_cb_cache)
1848
		goto mptcp_cb_cache_failed;
1849
1850
	mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
1851
					   0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
1852
					   NULL);
1853
	if (!mptcp_tw_cache)
1854
		goto mptcp_tw_cache_failed;
1855
1856
	get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
1857
1858
	mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
1859
	if (!mptcp_wq)
1860
		goto alloc_workqueue_failed;
1861
1862
	if (mptcp_pm_init())
1863
		goto mptcp_pm_failed;
1864
1865
#ifdef CONFIG_SYSCTL
1866
	mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
1867
	if (!mptcp_sysctl)
1868
		goto register_sysctl_failed;
1869
#endif
1870
1871
	pr_info("MPTCP: Stable release v0.87.3");
1872
1873
	mptcp_init_failed = false;
1874
1875
	return;
1876
1877
#ifdef CONFIG_SYSCTL
1878
register_sysctl_failed:
1879
	mptcp_pm_undo();
1880
#endif
1881
mptcp_pm_failed:
1882
	destroy_workqueue(mptcp_wq);
1883
alloc_workqueue_failed:
1884
	kmem_cache_destroy(mptcp_tw_cache);
1885
mptcp_tw_cache_failed:
1886
	kmem_cache_destroy(mptcp_cb_cache);
1887
mptcp_cb_cache_failed:
1888
	kmem_cache_destroy(mptcp_sock_cache);
1889
mptcp_sock_cache_failed:
1890
	mptcp_init_failed = true;
1891
}
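
mptcp_init() above uses the common goto error-unwind ladder: each setup step gets a label, and a failure frees exactly the steps that already succeeded, in reverse order. A userspace sketch of the same shape, with malloc/free standing in for the cache and workqueue setup:

#include <stdlib.h>

static int init_three_resources(void **a, void **b, void **c)
{
	*a = malloc(64);
	if (!*a)
		goto err_a;
	*b = malloc(64);
	if (!*b)
		goto err_b;
	*c = malloc(64);
	if (!*c)
		goto err_c;
	return 0;		/* every step succeeded */

err_c:
	free(*b);		/* unwind in reverse order of setup */
err_b:
	free(*a);
err_a:
	return -1;
}
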
(-)a/net/mptcp/mptcp_input.c (+1904 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - Sending side
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#include <asm/unaligned.h>
31
32
#include <net/mptcp.h>
33
#include <net/mptcp_v4.h>
34
#include <net/mptcp_v6.h>
35
36
#include <linux/kconfig.h>
37
38
static inline void mptcp_become_fully_estab(struct sock *sk)
39
{
40
	tcp_sk(sk)->mptcp->fully_established = 1;
41
42
	if (is_master_tp(tcp_sk(sk)))
43
		mptcp_create_subflows(mptcp_meta_sk(sk));
44
}
45
46
/* Similar to tcp_tso_acked without any memory accounting */
47
static inline int mptcp_tso_acked_reinject(struct sock *sk, struct sk_buff *skb)
48
{
49
	struct tcp_sock *tp = tcp_sk(sk);
50
	u32 packets_acked, len;
51
52
	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
53
54
	packets_acked = tcp_skb_pcount(skb);
55
56
	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
57
		return 0;
58
59
	len = tp->snd_una - TCP_SKB_CB(skb)->seq;
60
	__pskb_trim_head(skb, len);
61
62
	TCP_SKB_CB(skb)->seq += len;
63
	skb->ip_summed = CHECKSUM_PARTIAL;
64
	skb->truesize	     -= len;
65
66
	/* Any change of skb->len requires recalculation of tso factor. */
67
	if (tcp_skb_pcount(skb) > 1)
68
		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
69
	packets_acked -= tcp_skb_pcount(skb);
70
71
	if (packets_acked) {
72
		BUG_ON(tcp_skb_pcount(skb) == 0);
73
		BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
74
	}
75
76
	return packets_acked;
77
}
78
79
/**
80
 * Cleans the meta-socket retransmission queue and the reinject-queue.
81
 * @sk must be the metasocket.
82
 */
83
static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
84
{
85
	struct sk_buff *skb, *tmp;
86
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
87
	struct mptcp_cb *mpcb = meta_tp->mpcb;
88
	bool acked = false;
89
	u32 acked_pcount;
90
91
	while ((skb = tcp_write_queue_head(meta_sk)) &&
92
	       skb != tcp_send_head(meta_sk)) {
93
		bool fully_acked = true;
94
95
		if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
96
			if (tcp_skb_pcount(skb) == 1 ||
97
			    !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
98
				break;
99
100
			acked_pcount = tcp_tso_acked(meta_sk, skb);
101
			if (!acked_pcount)
102
				break;
103
104
			fully_acked = false;
105
		} else {
106
			acked_pcount = tcp_skb_pcount(skb);
107
		}
108
109
		acked = true;
110
		meta_tp->packets_out -= acked_pcount;
111
		meta_tp->retrans_stamp = 0;
112
113
		if (!fully_acked)
114
			break;
115
116
		tcp_unlink_write_queue(skb, meta_sk);
117
118
		if (mptcp_is_data_fin(skb)) {
119
			struct sock *sk_it;
120
121
			/* DATA_FIN has been acknowledged - now we can close
122
			 * the subflows
123
			 */
124
			mptcp_for_each_sk(mpcb, sk_it) {
125
				unsigned long delay = 0;
126
127
				/* If we are the passive closer, don't trigger
128
				 * subflow-fin until the subflow has been finned
129
				 * by the peer - thus we add a delay.
130
				 */
131
				if (mpcb->passive_close &&
132
				    sk_it->sk_state == TCP_ESTABLISHED)
133
					delay = inet_csk(sk_it)->icsk_rto << 3;
134
135
				mptcp_sub_close(sk_it, delay);
136
			}
137
		}
138
		sk_wmem_free_skb(meta_sk, skb);
139
	}
140
	/* Remove acknowledged data from the reinject queue */
141
	skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
142
		if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
143
			if (tcp_skb_pcount(skb) == 1 ||
144
			    !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
145
				break;
146
147
			mptcp_tso_acked_reinject(meta_sk, skb);
148
			break;
149
		}
150
151
		__skb_unlink(skb, &mpcb->reinject_queue);
152
		__kfree_skb(skb);
153
	}
154
155
	if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
156
		meta_tp->snd_up = meta_tp->snd_una;
157
158
	if (acked) {
159
		tcp_rearm_rto(meta_sk);
160
		/* Normally this is done in tcp_try_undo_loss - but MPTCP
161
		 * does not call this function.
162
		 */
163
		inet_csk(meta_sk)->icsk_retransmits = 0;
164
	}
165
}
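
mptcp_clean_rtx_queue() is a cumulative-ACK sweep: every segment whose end sequence is at or below snd_una leaves the queue, and a partially acked head is trimmed. A simplified sketch of the fully-acked part over an array of segments (plain comparisons, so no sequence wrap-around handling):

struct seg {
	unsigned int seq;
	unsigned int end_seq;
};

/* Drop fully acknowledged segments: all with end_seq <= snd_una.
 * Returns how many segments at the front of @seg can be freed.
 */
static int clean_acked(const struct seg *seg, int nseg, unsigned int snd_una)
{
	int i;

	for (i = 0; i < nseg; i++) {
		if (snd_una < seg[i].end_seq)
			break;	/* first (partially) unacked segment */
	}
	return i;
}
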
166
167
/* Inspired by tcp_rcv_state_process */
168
static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
169
				   const struct sk_buff *skb, u32 data_seq,
170
				   u16 data_len)
171
{
172
	struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
173
	struct tcphdr *th = tcp_hdr(skb);
174
175
	/* State-machine handling if a FIN has been enqueued and it has
176
	 * been acked (snd_una == write_seq) - it's important that this
177
	 * here is after sk_wmem_free_skb because otherwise
178
	 * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
179
	 */
180
	switch (meta_sk->sk_state) {
181
	case TCP_FIN_WAIT1:
182
		if (meta_tp->snd_una == meta_tp->write_seq) {
183
			struct dst_entry *dst = __sk_dst_get(meta_sk);
184
185
			tcp_set_state(meta_sk, TCP_FIN_WAIT2);
186
			meta_sk->sk_shutdown |= SEND_SHUTDOWN;
187
188
			dst = __sk_dst_get(sk);
189
			if (dst)
190
				dst_confirm(dst);
191
192
			if (!sock_flag(meta_sk, SOCK_DEAD)) {
193
				/* Wake up lingering close() */
194
				meta_sk->sk_state_change(meta_sk);
195
			} else {
196
				int tmo;
197
198
				if (meta_tp->linger2 < 0 ||
199
				    (data_len &&
200
				     after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
201
					   meta_tp->rcv_nxt))) {
202
					mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
203
					tcp_done(meta_sk);
204
					NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
205
					return 1;
206
				}
207
208
				tmo = tcp_fin_time(meta_sk);
209
				if (tmo > TCP_TIMEWAIT_LEN) {
210
					inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
211
				} else if (mptcp_is_data_fin2(skb, tp) ||
212
					   sock_owned_by_user(meta_sk)) {
213
					/* Bad case. We could lose such FIN otherwise.
214
					 * It is not a big problem, but it looks confusing
215
					 * and not so rare event. We still can lose it now,
216
					 * if it spins in bh_lock_sock(), but it is really
217
					 * marginal case.
218
					 */
219
					inet_csk_reset_keepalive_timer(meta_sk, tmo);
220
				} else {
221
					tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
222
				}
223
			}
224
		}
225
		break;
226
	case TCP_CLOSING:
227
	case TCP_LAST_ACK:
228
		if (meta_tp->snd_una == meta_tp->write_seq) {
229
			tcp_done(meta_sk);
230
			return 1;
231
		}
232
		break;
233
	}
234
235
	/* step 7: process the segment text */
236
	switch (meta_sk->sk_state) {
237
	case TCP_FIN_WAIT1:
238
	case TCP_FIN_WAIT2:
239
		/* RFC 793 says to queue data in these states,
240
		 * RFC 1122 says we MUST send a reset.
241
		 * BSD 4.4 also does reset.
242
		 */
243
		if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
244
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
245
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
246
			    !mptcp_is_data_fin2(skb, tp)) {
247
				NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
248
249
				mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
250
			}
251
		}
252
		break;
253
	}
254
255
	return 0;
256
}
257
258
/**
259
 * @return:
260
 *  i) 1: Everything's fine.
261
 *  ii) -1: A reset has been sent on the subflow - csum-failure
262
 *  iii) 0: csum-failure but no reset sent, because it's the last subflow.
263
 *	 Last packet should not be destroyed by the caller because it has
264
 *	 been done here.
265
 */
266
static int mptcp_verif_dss_csum(struct sock *sk)
267
{
268
	struct tcp_sock *tp = tcp_sk(sk);
269
	struct sk_buff *tmp, *tmp1, *last = NULL;
270
	__wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
271
	int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
272
	int iter = 0;
273
274
	skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
275
		unsigned int csum_len;
276
277
		if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
278
			/* Mapping ends in the middle of the packet -
279
			 * csum only these bytes
280
			 */
281
			csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
282
		else
283
			csum_len = tmp->len;
284
285
		offset = 0;
286
		if (overflowed) {
287
			char first_word[4];
288
			first_word[0] = 0;
289
			first_word[1] = 0;
290
			first_word[2] = 0;
291
			first_word[3] = *(tmp->data);
292
			csum_tcp = csum_partial(first_word, 4, csum_tcp);
293
			offset = 1;
294
			csum_len--;
295
			overflowed = 0;
296
		}
297
298
		csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
299
300
		/* Was it on an odd-length? Then we have to merge the next byte
301
		 * correctly (see above)
302
		 */
303
		if (csum_len != (csum_len & (~1)))
304
			overflowed = 1;
305
306
		if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
307
			__be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
308
309
			/* If a 64-bit dss is present, we increase the offset
310
			 * by 4 bytes, as the high-order 64-bits will be added
311
			 * in the final csum_partial-call.
312
			 */
313
			u32 offset = skb_transport_offset(tmp) +
314
				     TCP_SKB_CB(tmp)->dss_off;
315
			if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
316
				offset += 4;
317
318
			csum_tcp = skb_checksum(tmp, offset,
319
						MPTCP_SUB_LEN_SEQ_CSUM,
320
						csum_tcp);
321
322
			csum_tcp = csum_partial(&data_seq,
323
						sizeof(data_seq), csum_tcp);
324
325
			dss_csum_added = 1; /* Just do it once */
326
		}
327
		last = tmp;
328
		iter++;
329
330
		if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
331
		    !before(TCP_SKB_CB(tmp1)->seq,
332
			    tp->mptcp->map_subseq + tp->mptcp->map_data_len))
333
			break;
334
	}
335
336
	/* Now, checksum must be 0 */
337
	if (unlikely(csum_fold(csum_tcp))) {
338
		pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
339
			    __func__, csum_fold(csum_tcp),
340
			    TCP_SKB_CB(last)->seq, dss_csum_added, overflowed,
341
			    iter);
342
343
		tp->mptcp->send_mp_fail = 1;
344
345
		/* map_data_seq is the data-seq number of the
346
		 * mapping we are currently checking
347
		 */
348
		tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
349
350
		if (tp->mpcb->cnt_subflows > 1) {
351
			mptcp_send_reset(sk);
352
			ans = -1;
353
		} else {
354
			tp->mpcb->send_infinite_mapping = 1;
355
356
			/* Need to purge the rcv-queue as it's no more valid */
357
			while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
358
				tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
359
				kfree_skb(tmp);
360
			}
361
362
			ans = 0;
363
		}
364
	}
365
366
	return ans;
367
}
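
The "overflowed" byte handling above is the carry needed when a ones'-complement Internet checksum is computed piecewise over chunks that may end in the middle of a 16-bit word. A self-contained sketch of that idea, independent of skb_checksum()/csum_partial() (hypothetical helper names):

#include <stdint.h>
#include <stddef.h>

struct csum_state {
	uint32_t sum;		/* running 32-bit accumulator */
	int have_odd;		/* a dangling high-order byte is pending */
	uint8_t odd;
};

/* Feed one chunk of the stream; chunks may end on odd byte boundaries. */
static void csum_feed(struct csum_state *st, const uint8_t *p, size_t len)
{
	size_t i = 0;

	if (st->have_odd && len) {
		/* Pair the carried byte (high) with this chunk's first byte (low). */
		st->sum += ((uint32_t)st->odd << 8) | p[0];
		st->have_odd = 0;
		i = 1;
	}
	for (; i + 1 < len; i += 2)
		st->sum += ((uint32_t)p[i] << 8) | p[i + 1];
	if (i < len) {
		st->odd = p[i];
		st->have_odd = 1;
	}
}

static uint16_t csum_final(struct csum_state *st)
{
	uint32_t sum = st->sum;

	if (st->have_odd)
		sum += (uint32_t)st->odd << 8;	/* last byte padded with zero */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
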
368
369
static inline void mptcp_prepare_skb(struct sk_buff *skb, struct sk_buff *next,
370
				     struct sock *sk)
371
{
372
	struct tcp_sock *tp = tcp_sk(sk);
373
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
374
	/* Adapt data-seq's to the packet itself. We kinda transform the
375
	 * dss-mapping to a per-packet granularity. This is necessary to
376
	 * correctly handle overlapping mappings coming from different
377
	 * subflows. Otherwise it would be a complete mess.
378
	 */
379
	tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
380
	tcb->end_seq = tcb->seq + skb->len;
381
382
	/* If cur is the last one in the rcv-queue (or the last one for this
383
	 * mapping), and data_fin is enqueued, the end_data_seq is +1.
384
	 */
385
	if (skb_queue_is_last(&sk->sk_receive_queue, skb) ||
386
	    after(TCP_SKB_CB(next)->end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
387
		tcb->end_seq += tp->mptcp->map_data_fin;
388
389
		/* We manually set the fin-flag if it is a data-fin. For easy
390
		 * processing in tcp_recvmsg.
391
		 */
392
		if (mptcp_is_data_fin2(skb, tp))
393
			tcp_hdr(skb)->fin = 1;
394
		else
395
			tcp_hdr(skb)->fin = 0;
396
	} else {
397
		/* We may have a subflow-fin with data but without data-fin */
398
		tcp_hdr(skb)->fin = 0;
399
	}
400
}
401
402
/**
403
 * @return: 1 if the segment has been eaten and can be suppressed,
404
 *          otherwise 0.
405
 */
406
static inline int mptcp_direct_copy(struct sk_buff *skb, struct sock *meta_sk)
407
{
408
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
409
	int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
410
	int eaten = 0;
411
412
	__set_current_state(TASK_RUNNING);
413
414
	local_bh_enable();
415
	if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
416
		meta_tp->ucopy.len -= chunk;
417
		meta_tp->copied_seq += chunk;
418
		eaten = (chunk == skb->len);
419
		tcp_rcv_space_adjust(meta_sk);
420
	}
421
	local_bh_disable();
422
	return eaten;
423
}
424
425
static inline void mptcp_reset_mapping(struct tcp_sock *tp)
426
{
427
	tp->mptcp->map_data_len = 0;
428
	tp->mptcp->map_data_seq = 0;
429
	tp->mptcp->map_subseq = 0;
430
	tp->mptcp->map_data_fin = 0;
431
	tp->mptcp->mapping_present = 0;
432
}
433
434
/* The DSS-mapping received on the sk only covers the second half of the skb
435
 * (cut at seq). We trim the head from the skb.
436
 * Data will be freed upon kfree().
437
 *
438
 * Inspired by tcp_trim_head().
439
 */
440
static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
441
{
442
	int len = seq - TCP_SKB_CB(skb)->seq;
443
	u32 new_seq = TCP_SKB_CB(skb)->seq + len;
444
445
	if (len < skb_headlen(skb))
446
		__skb_pull(skb, len);
447
	else
448
		__pskb_trim_head(skb, len - skb_headlen(skb));
449
450
	TCP_SKB_CB(skb)->seq = htonl(new_seq);
451
452
	skb->truesize -= len;
453
	atomic_sub(len, &sk->sk_rmem_alloc);
454
	sk_mem_uncharge(sk, len);
455
}
456
457
/* The DSS-mapping received on the sk only covers the first half of the skb
458
 * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
459
 * as further packets may resolve the mapping of the second half of data.
460
 *
461
 * Inspired by tcp_fragment().
462
 */
463
static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
464
{
465
	struct sk_buff *buff;
466
	int nsize;
467
	int nlen, len;
468
469
	len = seq - TCP_SKB_CB(skb)->seq;
470
	nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
471
	if (nsize < 0)
472
		nsize = 0;
473
474
	/* Get a new skb... force flag on. */
475
	buff = alloc_skb(nsize, GFP_ATOMIC);
476
	if (buff == NULL)
477
		return -ENOMEM;
478
479
	skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
480
	skb_reset_transport_header(buff);
481
482
	tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
483
	tcp_hdr(skb)->fin = 0;
484
485
	/* We absolutely need to call skb_set_owner_r before refreshing the
486
	 * truesize of buff, otherwise the moved data will account twice.
487
	 */
488
	skb_set_owner_r(buff, sk);
489
	nlen = skb->len - len - nsize;
490
	buff->truesize += nlen;
491
	skb->truesize -= nlen;
492
493
	/* Correct the sequence numbers. */
494
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
495
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
496
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
497
498
	skb_split(skb, buff, len);
499
500
	__skb_queue_after(&sk->sk_receive_queue, skb, buff);
501
502
	return 0;
503
}
504
505
/* @return: 0  everything is fine. Just continue processing
506
 *	    1  subflow is broken stop everything
507
 *	    -1 this packet was broken - continue with the next one.
508
 */
509
static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
510
{
511
	struct tcp_sock *tp = tcp_sk(sk);
512
513
	/* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
514
	if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
515
	    !tp->mpcb->infinite_mapping_rcv) {
516
		/* Remove a pure subflow-fin from the queue and increase
517
		 * copied_seq.
518
		 */
519
		tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
520
		__skb_unlink(skb, &sk->sk_receive_queue);
521
		__kfree_skb(skb);
522
		return -1;
523
	}
524
525
	/* If we are not yet fully established and do not know the mapping for
526
	 * this segment, this path has to fallback to infinite or be torn down.
527
	 */
528
	if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
529
	    !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
530
		pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
531
		       __func__, tp->mpcb->mptcp_loc_token,
532
		       tp->mptcp->path_index, __builtin_return_address(0),
533
		       TCP_SKB_CB(skb)->seq);
534
535
		if (!is_master_tp(tp)) {
536
			mptcp_send_reset(sk);
537
			return 1;
538
		}
539
540
		tp->mpcb->infinite_mapping_snd = 1;
541
		tp->mpcb->infinite_mapping_rcv = 1;
542
		tp->mptcp->fully_established = 1;
543
	}
544
545
	/* Receiver-side becomes fully established when a whole rcv-window has
546
	 * been received without the need to fall back due to the previous
547
	 * condition. */
548
	if (!tp->mptcp->fully_established) {
549
		tp->mptcp->init_rcv_wnd -= skb->len;
550
		if (tp->mptcp->init_rcv_wnd < 0)
551
			mptcp_become_fully_estab(sk);
552
	}
553
554
	return 0;
555
}
556
557
/* @return: 0  everything is fine. Just continue processing
558
 *	    1  subflow is broken stop everything
559
 *	    -1 this packet was broken - continue with the next one.
560
 */
561
static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
562
{
563
	struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
564
	struct mptcp_cb *mpcb = tp->mpcb;
565
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
566
	u32 *ptr;
567
	u32 data_seq, sub_seq, data_len, tcp_end_seq;
568
569
	/* If we are in infinite-mapping-mode, the subflow is guaranteed to be
570
	 * in-order at the data-level. Thus data-seq-numbers can be inferred
571
	 * from what is expected at the data-level.
572
	 */
573
	if (mpcb->infinite_mapping_rcv) {
574
		tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
575
		tp->mptcp->map_subseq = tcb->seq;
576
		tp->mptcp->map_data_len = skb->len;
577
		tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
578
		tp->mptcp->mapping_present = 1;
579
		return 0;
580
	}
581
582
	/* No mapping here? Exit - it is either already set or still on its way */
583
	if (!mptcp_is_data_seq(skb)) {
584
		/* Too many packets without a mapping - this subflow is broken */
585
		if (!tp->mptcp->mapping_present &&
586
		    tp->rcv_nxt - tp->copied_seq > 65536) {
587
			mptcp_send_reset(sk);
588
			return 1;
589
		}
590
591
		return 0;
592
	}
593
594
	ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
595
	ptr++;
596
	sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
597
	ptr++;
598
	data_len = get_unaligned_be16(ptr);
599
600
	/* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
601
	 * The draft sets it to 0, but we really would like to have the
602
	 * real value, to have an easy handling afterwards here in this
603
	 * function.
604
	 */
605
	if (mptcp_is_data_fin(skb) && skb->len == 0)
606
		sub_seq = TCP_SKB_CB(skb)->seq;
607
608
	/* If there is already a mapping - we check if it maps with the current
609
	 * one. If not - we reset.
610
	 */
611
	if (tp->mptcp->mapping_present &&
612
	    (data_seq != (u32)tp->mptcp->map_data_seq ||
613
	     sub_seq != tp->mptcp->map_subseq ||
614
	     data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
615
	     mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
616
		/* Mapping in packet is different from what we want */
617
		pr_err("%s Mappings do not match!\n", __func__);
618
		pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
619
		       __func__, data_seq, (u32)tp->mptcp->map_data_seq,
620
		       sub_seq, tp->mptcp->map_subseq, data_len,
621
		       tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
622
		       tp->mptcp->map_data_fin);
623
		mptcp_send_reset(sk);
624
		return 1;
625
	}
626
627
	/* If the previous check was good, the current mapping is valid and we exit. */
628
	if (tp->mptcp->mapping_present)
629
		return 0;
630
631
	/* Mapping not yet set on this subflow - we set it here! */
632
633
	if (!data_len) {
634
		mpcb->infinite_mapping_rcv = 1;
635
		tp->mptcp->fully_established = 1;
636
		/* We need to repeat mp_fail's until the sender fell
637
		 * back to infinite-mapping - here we stop repeating it.
638
		 */
639
		tp->mptcp->send_mp_fail = 0;
640
641
		/* We have to fixup data_len - it must be the same as skb->len */
642
		data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
643
		sub_seq = tcb->seq;
644
645
		/* TODO kill all other subflows than this one */
646
		/* data_seq and so on are set correctly */
647
648
		/* At this point, the meta-ofo-queue has to be emptied,
649
		 * as the following data is guaranteed to be in-order at
650
		 * the data and subflow-level
651
		 */
652
		mptcp_purge_ofo_queue(meta_tp);
653
	}
654
655
	/* We are sending mp-fail's and thus are in fallback mode.
656
	 * Ignore packets which do not announce the fallback and still
657
	 * want to provide a mapping.
658
	 */
659
	if (tp->mptcp->send_mp_fail) {
660
		tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
661
		__skb_unlink(skb, &sk->sk_receive_queue);
662
		__kfree_skb(skb);
663
		return -1;
664
	}
665
666
	/* FIN increased the mapping-length by 1 */
667
	if (mptcp_is_data_fin(skb))
668
		data_len--;
669
670
	/* Subflow-sequences of packet must be
671
	 * (at least partially) part of the DSS-mapping's
672
	 * subflow-sequence-space.
673
	 *
674
	 * Basically the mapping is not valid, if either of the
675
	 * following conditions is true:
676
	 *
677
	 * 1. It's not a data_fin and
678
	 *    MPTCP-sub_seq >= TCP-end_seq
679
	 *
680
	 * 2. It's a data_fin and TCP-end_seq > TCP-seq and
681
	 *    MPTCP-sub_seq >= TCP-end_seq
682
	 *
683
	 * The previous two can be merged into:
684
	 *    TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
685
	 *    Because if it's not a data-fin, TCP-end_seq > TCP-seq
686
	 *
687
	 * 3. It's a data_fin and skb->len == 0 and
688
	 *    MPTCP-sub_seq > TCP-end_seq
689
	 *
690
	 * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
691
	 *    MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
692
	 *
693
	 * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
694
	 */
695
696
	/* subflow-fin is not part of the mapping - ignore it here ! */
697
	tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
698
	if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
699
	    (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
700
	    (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
701
	    before(sub_seq, tp->copied_seq)) {
702
		/* The packet's subflow-sequences differ from what is in the
703
		 * packet's dss-mapping. The peer is misbehaving - reset
704
		 */
705
		pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
706
		       "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u"
707
		       "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
708
		       skb->len, data_len, tp->copied_seq);
709
		mptcp_send_reset(sk);
710
		return 1;
711
	}
712
713
	/* Does the DSS have 64-bit seqnums? */
714
	if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
715
		/* Wrapped around? */
716
		if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
717
			tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
718
		} else {
719
			/* Else, access the default high-order bits */
720
			tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
721
		}
722
	} else {
723
		tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
724
725
		if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
726
			/* We make sure that the data_seq is invalid.
727
			 * It will be dropped later.
728
			 */
729
			tp->mptcp->map_data_seq += 0xFFFFFFFF;
730
			tp->mptcp->map_data_seq += 0xFFFFFFFF;
731
		}
732
	}
733
734
	tp->mptcp->map_data_len = data_len;
735
	tp->mptcp->map_subseq = sub_seq;
736
	tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
737
	tp->mptcp->mapping_present = 1;
738
739
	return 0;
740
}
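
mptcp_detect_mapping() reads the DSS fields as unaligned big-endian values: a 32-bit data sequence number, a 32-bit relative subflow sequence number and a 16-bit data-level length. A free-standing sketch of that extraction (hypothetical helpers; the real option additionally carries flags and an optional checksum):

#include <stdint.h>

/* Read big-endian fields from a possibly unaligned buffer, as the
 * DSS parsing above does with get_unaligned_be32()/be16().
 */
static uint32_t read_be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static uint16_t read_be16(const uint8_t *p)
{
	return (uint16_t)(((uint16_t)p[0] << 8) | p[1]);
}

/* Illustrative layout only: 32-bit data_seq, 32-bit relative subflow
 * seq, 16-bit data-level length, as consumed above.
 */
static void parse_dss_mapping(const uint8_t *opt, uint32_t rcv_isn,
			      uint32_t *data_seq, uint32_t *sub_seq,
			      uint16_t *data_len)
{
	*data_seq = read_be32(opt);
	*sub_seq  = read_be32(opt + 4) + rcv_isn;	/* relative -> absolute */
	*data_len = read_be16(opt + 8);
}
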
741
742
/* Similar to tcp_sequence(...) */
743
static inline int mptcp_sequence(const struct tcp_sock *meta_tp,
744
				 u64 data_seq, u64 end_data_seq)
745
{
746
	struct mptcp_cb *mpcb = meta_tp->mpcb;
747
	u64 rcv_wup64;
748
749
	/* Wrap-around? */
750
	if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
751
		rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
752
				meta_tp->rcv_wup;
753
	} else {
754
		rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
755
						  meta_tp->rcv_wup);
756
	}
757
758
	return	!before64(end_data_seq, rcv_wup64) &&
759
		!after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
760
}
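
mptcp_sequence() relies on wrap-safe 64-bit comparisons; before64()/after64() follow the usual signed-difference idiom known from TCP's before()/after(). A free-standing sketch of the comparison helpers and the window test (hypothetical names):

#include <stdint.h>
#include <stdbool.h>

/* Wrap-safe 64-bit sequence comparisons in the spirit of
 * before64()/after64() used above.
 */
static bool seq64_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

static bool seq64_after(uint64_t a, uint64_t b)
{
	return seq64_before(b, a);
}

/* A data-level segment [seq, end_seq] is acceptable if it does not end
 * before rcv_wup and does not start beyond the right edge of the window.
 */
static bool seq64_in_window(uint64_t seq, uint64_t end_seq,
			    uint64_t rcv_wup, uint64_t rcv_nxt, uint32_t window)
{
	return !seq64_before(end_seq, rcv_wup) &&
	       !seq64_after(seq, rcv_nxt + window);
}
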
761
762
/* @return: 0  everything is fine. Just continue processing
763
 *	    -1 this packet was broken - continue with the next one.
764
 */
765
static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
766
{
767
	struct tcp_sock *tp = tcp_sk(sk);
768
	struct sk_buff *tmp, *tmp1;
769
	u32 tcp_end_seq;
770
771
	if (!tp->mptcp->mapping_present)
772
		return 0;
773
774
	/* either, the new skb gave us the mapping and the first segment
775
	 * in the sub-rcv-queue has to be trimmed ...
776
	 */
777
	tmp = skb_peek(&sk->sk_receive_queue);
778
	if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
779
	    after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
780
		mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
781
782
	/* ... or the new skb (tail) has to be split at the end. */
783
	tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
784
	if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
785
		u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
786
		if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
787
			/* TODO : maybe handle this here better.
788
			 * We now just force meta-retransmission.
789
			 */
790
			tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
791
			__skb_unlink(skb, &sk->sk_receive_queue);
792
			__kfree_skb(skb);
793
			return -1;
794
		}
795
	}
796
797
	/* Now, remove old sk_buff's from the receive-queue.
798
	 * This may happen if the mapping has been lost for these segments and
799
	 * the next mapping has already been received.
800
	 */
801
	if (tp->mptcp->mapping_present &&
802
	    before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
803
		skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
804
			if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
805
				break;
806
807
			tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
808
			__skb_unlink(tmp1, &sk->sk_receive_queue);
809
810
			/* Impossible that we could free skb here, because its
811
			 * mapping is known to be valid from previous checks
812
			 */
813
			__kfree_skb(tmp1);
814
		}
815
	}
816
817
	return 0;
818
}
819
820
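Editor's note: mptcp_validate_mapping() above trims the head of the first queued segment and splits the tail of the new one so the subflow payload lines up exactly with the DSS mapping [map_subseq, map_subseq + map_data_len). A small sketch of that boundary arithmetic follows; it is not part of the patch and uses plain integers instead of skb/tcp_sock state.

#include <stdint.h>

struct dss_map { uint32_t map_subseq; uint32_t map_data_len; };

/* Bytes to drop from a segment starting at 'seq' so it begins at the mapping */
static uint32_t head_trim(const struct dss_map *m, uint32_t seq)
{
	return (int32_t)(m->map_subseq - seq) > 0 ? m->map_subseq - seq : 0;
}

/* Subflow sequence at which a segment ending at 'end_seq' must be split */
static uint32_t tail_split_at(const struct dss_map *m, uint32_t end_seq)
{
	uint32_t map_end = m->map_subseq + m->map_data_len;

	return (int32_t)(end_seq - map_end) > 0 ? map_end : end_seq;
}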
/* @return: 0  everything is fine. Just continue processing
821
 *	    1  subflow is broken stop everything
822
 *	    -1 this mapping has been put in the meta-receive-queue
823
 *	    -2 this mapping has been eaten by the application
824
 */
825
static int mptcp_queue_skb(struct sock *sk)
826
{
827
	struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
828
	struct sock *meta_sk = mptcp_meta_sk(sk);
829
	struct mptcp_cb *mpcb = tp->mpcb;
830
	struct sk_buff *tmp, *tmp1;
831
	u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
832
	bool data_queued = false;
833
834
	/* Have we not yet received the full mapping? */
835
	if (!tp->mptcp->mapping_present ||
836
	    before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
837
		return 0;
838
839
	/* Is this an overlapping mapping? rcv_nxt >= end_data_seq
840
	 * OR
841
	 * This mapping is out of window
842
	 */
843
	if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
844
	    !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
845
			    tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
846
		skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
847
			__skb_unlink(tmp1, &sk->sk_receive_queue);
848
			tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
849
			__kfree_skb(tmp1);
850
851
			if (!skb_queue_empty(&sk->sk_receive_queue) &&
852
			    !before(TCP_SKB_CB(tmp)->seq,
853
				    tp->mptcp->map_subseq + tp->mptcp->map_data_len))
854
				break;
855
		}
856
857
		mptcp_reset_mapping(tp);
858
859
		return -1;
860
	}
861
862
	/* Record it, because we want to send our data_fin on the same path */
863
	if (tp->mptcp->map_data_fin) {
864
		mpcb->dfin_path_index = tp->mptcp->path_index;
865
		mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
866
	}
867
868
	/* Verify the checksum */
869
	if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
870
		int ret = mptcp_verif_dss_csum(sk);
871
872
		if (ret <= 0) {
873
			mptcp_reset_mapping(tp);
874
			return 1;
875
		}
876
	}
877
878
	if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
879
		/* Seg's have to go to the meta-ofo-queue */
880
		skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
881
			tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
882
			mptcp_prepare_skb(tmp1, tmp, sk);
883
			__skb_unlink(tmp1, &sk->sk_receive_queue);
884
			/* MUST be done here, because fragstolen may be true later.
885
			 * Then, kfree_skb_partial will not account the memory.
886
			 */
887
			skb_orphan(tmp1);
888
889
			if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
890
				mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
891
			else
892
				__kfree_skb(tmp1);
893
894
			if (!skb_queue_empty(&sk->sk_receive_queue) &&
895
			    !before(TCP_SKB_CB(tmp)->seq,
896
				    tp->mptcp->map_subseq + tp->mptcp->map_data_len))
897
				break;
898
899
		}
900
	} else {
901
		/* Ready for the meta-rcv-queue */
902
		skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
903
			int eaten = 0;
904
			int copied_early = 0;
905
			bool fragstolen = false;
906
			u32 old_rcv_nxt = meta_tp->rcv_nxt;
907
908
			tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
909
			mptcp_prepare_skb(tmp1, tmp, sk);
910
			__skb_unlink(tmp1, &sk->sk_receive_queue);
911
			/* MUST be done here, because fragstolen may be true.
912
			 * Then, kfree_skb_partial will not account the memory.
913
			 */
914
			skb_orphan(tmp1);
915
916
			/* This segment has already been received */
917
			if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
918
				__kfree_skb(tmp1);
919
				goto next;
920
			}
921
922
#ifdef CONFIG_NET_DMA
923
			if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt  &&
924
			    meta_tp->ucopy.task == current &&
925
			    meta_tp->copied_seq == meta_tp->rcv_nxt &&
926
			    tmp1->len <= meta_tp->ucopy.len &&
927
			    sock_owned_by_user(meta_sk) &&
928
			    tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
929
				copied_early = 1;
930
				eaten = 1;
931
			}
932
#endif
933
934
			/* Is direct copy possible ? */
935
			if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
936
			    meta_tp->ucopy.task == current &&
937
			    meta_tp->copied_seq == meta_tp->rcv_nxt &&
938
			    meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
939
			    !copied_early)
940
				eaten = mptcp_direct_copy(tmp1, meta_sk);
941
942
			if (mpcb->in_time_wait) /* In time-wait, do not receive data */
943
				eaten = 1;
944
945
			if (!eaten)
946
				eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
947
948
			meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
949
			mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
950
951
			if (copied_early)
952
				tcp_cleanup_rbuf(meta_sk, tmp1->len);
953
954
			if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
955
				mptcp_fin(meta_sk);
956
957
			/* Check if this fills a gap in the ofo queue */
958
			if (!skb_queue_empty(&meta_tp->out_of_order_queue))
959
				mptcp_ofo_queue(meta_sk);
960
961
#ifdef CONFIG_NET_DMA
962
			if (copied_early)
963
				__skb_queue_tail(&meta_sk->sk_async_wait_queue,
964
						 tmp1);
965
			else
966
#endif
967
			if (eaten)
968
				kfree_skb_partial(tmp1, fragstolen);
969
970
			data_queued = true;
971
next:
972
			if (!skb_queue_empty(&sk->sk_receive_queue) &&
973
			    !before(TCP_SKB_CB(tmp)->seq,
974
				    tp->mptcp->map_subseq + tp->mptcp->map_data_len))
975
				break;
976
		}
977
	}
978
979
	inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
980
	tp->mptcp->last_data_seq = tp->mptcp->map_data_seq;
981
	mptcp_reset_mapping(tp);
982
983
	return data_queued ? -1 : -2;
984
}
985
986
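Editor's note: mptcp_queue_skb() above dispatches a completely received mapping in one of three ways: drop it when it is overlapping or out of window, move it to the meta out-of-order queue when a gap precedes it, or push it straight into the meta receive queue. A compact sketch of that classification, not part of the patch, with before64() spelled out under the usual signed-difference assumption:

#include <stdint.h>

static inline int before64(uint64_t s1, uint64_t s2)
{
	return (int64_t)(s1 - s2) < 0;	/* modular 64-bit "smaller than" */
}

enum queue_target { DROP_OLD_OR_OOW, META_OFO_QUEUE, META_RCV_QUEUE };

static enum queue_target classify(uint64_t rcv_nxt64, uint64_t map_seq64,
				  uint32_t map_len, int in_window)
{
	if (!before64(rcv_nxt64, map_seq64 + map_len) || !in_window)
		return DROP_OLD_OR_OOW;		/* overlapping or out of window */
	if (before64(rcv_nxt64, map_seq64))
		return META_OFO_QUEUE;		/* gap in front of the mapping */
	return META_RCV_QUEUE;			/* in order */
}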
void mptcp_data_ready(struct sock *sk, int bytes)
987
{
988
	struct sock *meta_sk = mptcp_meta_sk(sk);
989
	struct sk_buff *skb, *tmp;
990
	int queued = 0;
991
992
	/* If the meta is already closed, there is no point in pushing data */
993
	if (meta_sk->sk_state == TCP_CLOSE && !tcp_sk(sk)->mpcb->in_time_wait) {
994
		skb_queue_purge(&sk->sk_receive_queue);
995
		tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
996
		goto exit;
997
	}
998
999
restart:
1000
	/* Iterate over all segments, detect their mapping (if we don't have
1001
	 * one yet), validate them and push everything one level higher.
1002
	 */
1003
	skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
1004
		int ret;
1005
		/* Pre-validation - e.g., early fallback */
1006
		ret = mptcp_prevalidate_skb(sk, skb);
1007
		if (ret < 0)
1008
			goto restart;
1009
		else if (ret > 0)
1010
			break;
1011
1012
		/* Set the current mapping */
1013
		ret = mptcp_detect_mapping(sk, skb);
1014
		if (ret < 0)
1015
			goto restart;
1016
		else if (ret > 0)
1017
			break;
1018
1019
		/* Validation */
1020
		if (mptcp_validate_mapping(sk, skb) < 0)
1021
			goto restart;
1022
1023
		/* Push a level higher */
1024
		ret = mptcp_queue_skb(sk);
1025
		if (ret < 0) {
1026
			if (ret == -1)
1027
				queued = ret;
1028
			goto restart;
1029
		} else if (ret == 0) {
1030
			continue;
1031
		} else { /* ret == 1 */
1032
			break;
1033
		}
1034
	}
1035
1036
exit:
1037
	if (tcp_sk(sk)->close_it) {
1038
		tcp_send_ack(sk);
1039
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
1040
	}
1041
1042
	if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
1043
		meta_sk->sk_data_ready(meta_sk, 0);
1044
}
1045
1046
/**
1047
 * Equivalent of tcp_fin() for MPTCP
1048
 * Can be called only when the FIN is validly part of the data
1049
 * sequence number space, i.e. not while there are still holes in front of it.
1050
 */
1051
void mptcp_fin(struct sock *meta_sk)
1052
{
1053
	struct sock *sk = NULL, *sk_it;
1054
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1055
	struct mptcp_cb *mpcb = meta_tp->mpcb;
1056
1057
	mptcp_for_each_sk(mpcb, sk_it) {
1058
		if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
1059
			sk = sk_it;
1060
			break;
1061
		}
1062
	}
1063
1064
	if (!sk || sk->sk_state == TCP_CLOSE)
1065
		sk = mptcp_select_ack_sock(meta_sk, 0);
1066
1067
	inet_csk_schedule_ack(sk);
1068
1069
	meta_sk->sk_shutdown |= RCV_SHUTDOWN;
1070
	sock_set_flag(meta_sk, SOCK_DONE);
1071
1072
	switch (meta_sk->sk_state) {
1073
	case TCP_SYN_RECV:
1074
	case TCP_ESTABLISHED:
1075
		/* Move to CLOSE_WAIT */
1076
		tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
1077
		inet_csk(sk)->icsk_ack.pingpong = 1;
1078
		break;
1079
1080
	case TCP_CLOSE_WAIT:
1081
	case TCP_CLOSING:
1082
		/* Received a retransmission of the FIN, do
1083
		 * nothing.
1084
		 */
1085
		break;
1086
	case TCP_LAST_ACK:
1087
		/* RFC793: Remain in the LAST-ACK state. */
1088
		break;
1089
1090
	case TCP_FIN_WAIT1:
1091
		/* This case occurs when a simultaneous close
1092
		 * happens, we must ack the received FIN and
1093
		 * enter the CLOSING state.
1094
		 */
1095
		tcp_send_ack(sk);
1096
		tcp_set_state(meta_sk, TCP_CLOSING);
1097
		break;
1098
	case TCP_FIN_WAIT2:
1099
		/* Received a FIN -- send ACK and enter TIME_WAIT. */
1100
		tcp_send_ack(sk);
1101
		tcp_time_wait(meta_sk, TCP_TIME_WAIT, 0);
1102
		break;
1103
	default:
1104
		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
1105
		 * cases we should never reach this piece of code.
1106
		 */
1107
		pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
1108
		       meta_sk->sk_state);
1109
		break;
1110
	}
1111
1112
	/* It _is_ possible that we have something out-of-order _after_ FIN.
1113
	 * Probably, we should reset in this case. For now, drop them.
1114
	 */
1115
	mptcp_purge_ofo_queue(meta_tp);
1116
	sk_mem_reclaim(meta_sk);
1117
1118
	if (!sock_flag(meta_sk, SOCK_DEAD)) {
1119
		meta_sk->sk_state_change(meta_sk);
1120
1121
		/* Do not send POLL_HUP for half duplex close. */
1122
		if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
1123
		    meta_sk->sk_state == TCP_CLOSE)
1124
			sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
1125
		else
1126
			sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
1127
	}
1128
1129
	return;
1130
}
1131
1132
static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
1133
{
1134
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1135
	struct sk_buff *skb;
1136
1137
	if (!meta_tp->packets_out)
1138
		return;
1139
1140
	tcp_for_write_queue(skb, meta_sk) {
1141
		if (skb == tcp_send_head(meta_sk))
1142
			break;
1143
1144
		if (mptcp_retransmit_skb(meta_sk, skb))
1145
			return;
1146
1147
		if (skb == tcp_write_queue_head(meta_sk))
1148
			inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
1149
						  inet_csk(meta_sk)->icsk_rto,
1150
						  TCP_RTO_MAX);
1151
	}
1152
}
1153
1154
/* Handle the DATA_ACK */
1155
static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
1156
{
1157
	struct sock *meta_sk = mptcp_meta_sk(sk);
1158
	struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
1159
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
1160
	u32 prior_snd_una = meta_tp->snd_una;
1161
	int prior_packets;
1162
	u32 nwin, data_ack, data_seq;
1163
	u16 data_len = 0;
1164
1165
	/* A valid packet came in - subflow is operational again */
1166
	tp->pf = 0;
1167
1168
	/* Even if there is no data-ack, we stop retransmitting - unless this
1169
	 * is a SYN/ACK, in which case it is just a retransmission.
1170
	 */
1171
	if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
1172
		tp->mptcp->pre_established = 0;
1173
		sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
1174
	}
1175
1176
	/* If we are in infinite mapping mode, rx_opt.data_ack has been
1177
	 * set by mptcp_clean_rtx_infinite.
1178
	 */
1179
	if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
1180
		goto exit;
1181
1182
	data_ack = tp->mptcp->rx_opt.data_ack;
1183
1184
	if (unlikely(!tp->mptcp->fully_established) &&
1185
	    (data_ack != meta_tp->mptcp->snt_isn ||
1186
	    tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq))
1187
		/* As soon as data has been data-acked,
1188
		 * or a subflow-data-ack (not acking syn - thus snt_isn + 1)
1189
		 * includes a data-ack, we are fully established
1190
		 */
1191
		mptcp_become_fully_estab(sk);
1192
1193
	/* Get the data_seq */
1194
	if (mptcp_is_data_seq(skb)) {
1195
		data_seq = tp->mptcp->rx_opt.data_seq;
1196
		data_len = tp->mptcp->rx_opt.data_len;
1197
	} else {
1198
		data_seq = meta_tp->snd_wl1;
1199
	}
1200
1201
	/* If the ack is older than previous acks
1202
	 * then we can probably ignore it.
1203
	 */
1204
	if (before(data_ack, prior_snd_una))
1205
		goto exit;
1206
1207
	/* If the ack includes data we haven't sent yet, discard
1208
	 * this segment (RFC793 Section 3.9).
1209
	 */
1210
	if (after(data_ack, meta_tp->snd_nxt))
1211
		goto exit;
1212
1213
	/*** Now, update the window  - inspired by tcp_ack_update_window ***/
1214
	nwin = ntohs(tcp_hdr(skb)->window);
1215
1216
	if (likely(!tcp_hdr(skb)->syn))
1217
		nwin <<= tp->rx_opt.snd_wscale;
1218
1219
	if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
1220
		tcp_update_wl(meta_tp, data_seq);
1221
1222
		/* Draft v09, Section 3.3.5:
1223
		 * [...] It should only update its local receive window values
1224
		 * when the largest sequence number allowed (i.e.  DATA_ACK +
1225
		 * receive window) increases. [...]
1226
		 */
1227
		if (meta_tp->snd_wnd != nwin &&
1228
		    !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
1229
			meta_tp->snd_wnd = nwin;
1230
1231
			if (nwin > meta_tp->max_window)
1232
				meta_tp->max_window = nwin;
1233
		}
1234
	}
1235
	/*** Done, update the window ***/
1236
1237
	/* We passed data and got it acked, remove any soft error
1238
	 * log. Something worked...
1239
	 */
1240
	sk->sk_err_soft = 0;
1241
	inet_csk(meta_sk)->icsk_probes_out = 0;
1242
	meta_tp->rcv_tstamp = tcp_time_stamp;
1243
	prior_packets = meta_tp->packets_out;
1244
	if (!prior_packets)
1245
		goto no_queue;
1246
1247
	meta_tp->snd_una = data_ack;
1248
1249
	mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
1250
1251
	/* We are in loss-state, and something got acked, retransmit the whole
1252
	 * queue now!
1253
	 */
1254
	if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
1255
	    after(data_ack, prior_snd_una)) {
1256
		mptcp_xmit_retransmit_queue(meta_sk);
1257
		inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
1258
	}
1259
1260
	/* Simplified version of tcp_new_space, because the snd-buffer
1261
	 * is handled by all the subflows.
1262
	 */
1263
	if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
1264
		sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
1265
		if (meta_sk->sk_socket &&
1266
		    test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
1267
			meta_sk->sk_write_space(meta_sk);
1268
	}
1269
1270
	if (meta_sk->sk_state != TCP_ESTABLISHED)
1271
		mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len);
1272
1273
exit:
1274
	mptcp_push_pending_frames(meta_sk);
1275
1276
	return;
1277
1278
no_queue:
1279
	if (tcp_send_head(meta_sk))
1280
		tcp_ack_probe(meta_sk);
1281
1282
	mptcp_push_pending_frames(meta_sk);
1283
1284
	return;
1285
}
1286
1287
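Editor's note: the window update in mptcp_data_ack() above follows the rule quoted from draft v09, Section 3.3.5 - the advertised window is only adopted when the right edge (DATA_ACK + window) does not move backwards. A minimal sketch of that rule, not part of the patch; wnd_end() is a stand-in for tcp_wnd_end() and the struct fields mirror the meta tcp_sock state.

#include <stdint.h>

struct meta_snd_state { uint32_t snd_una; uint32_t snd_wnd; uint32_t max_window; };

static inline uint32_t wnd_end(const struct meta_snd_state *m)
{
	return m->snd_una + m->snd_wnd;	/* current right edge of the send window */
}

static void maybe_update_window(struct meta_snd_state *m, uint32_t data_ack,
				uint32_t nwin)
{
	/* !before(data_ack + nwin, wnd_end) <=> the right edge does not shrink */
	if (m->snd_wnd != nwin &&
	    (int32_t)((data_ack + nwin) - wnd_end(m)) >= 0) {
		m->snd_wnd = nwin;
		if (nwin > m->max_window)
			m->max_window = nwin;
	}
}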
void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk)
1288
{
1289
	struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
1290
1291
	if (!tp->mpcb->infinite_mapping_snd)
1292
		return;
1293
1294
	/* The difference between the two write_seq's represents the offset between
1295
	 * the data-sequence and the subflow-sequence space. As we are in
1296
	 * infinite-mapping mode, this offset is constant.
1297
	 *
1298
	 * Thus, from this difference we can infer the meta snd_una.
1299
	 */
1300
	tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
1301
				     tp->snd_una;
1302
1303
	mptcp_data_ack(sk, skb);
1304
}
1305
1306
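Editor's note: mptcp_clean_rtx_infinite() above relies on the fact that, under an infinite mapping, the offset between the data-sequence and subflow-sequence spaces is constant, so the meta-level acknowledgement can be derived from subflow state alone. A one-function sketch of that arithmetic, not part of the patch:

#include <stdint.h>

static uint32_t infer_meta_data_ack(uint32_t meta_snd_nxt,
				    uint32_t sub_snd_nxt, uint32_t sub_snd_una)
{
	/* constant offset between data-sequence and subflow-sequence space */
	uint32_t offset = meta_snd_nxt - sub_snd_nxt;

	return sub_snd_una + offset;	/* the meta-level snd_una / data_ack */
}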
/* Static functions used by mptcp_parse_options() */
1307
1308
static inline int mptcp_rem_raddress(struct mptcp_cb *mpcb, u8 rem_id)
1309
{
1310
	if (mptcp_v4_rem_raddress(mpcb, rem_id) < 0) {
1311
#if IS_ENABLED(CONFIG_IPV6)
1312
		if (mptcp_v6_rem_raddress(mpcb, rem_id) < 0)
1313
			return -1;
1314
#else
1315
		return -1;
1316
#endif /* CONFIG_IPV6 */
1317
	}
1318
	return 0;
1319
}
1320
1321
static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
1322
{
1323
	struct sock *sk_it, *tmpsk;
1324
1325
	mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
1326
		if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
1327
			mptcp_reinject_data(sk_it, 0);
1328
			sk_it->sk_err = ECONNRESET;
1329
			if (tcp_need_reset(sk_it->sk_state))
1330
				tcp_send_active_reset(sk_it, GFP_ATOMIC);
1331
			mptcp_sub_force_close(sk_it);
1332
		}
1333
	}
1334
}
1335
1336
void mptcp_parse_options(const uint8_t *ptr, int opsize,
1337
			 struct tcp_options_received *opt_rx,
1338
			 struct mptcp_options_received *mopt,
1339
			 const struct sk_buff *skb)
1340
{
1341
	struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
1342
1343
	/* A mopt is only provided for mp-capable sockets - nothing to do otherwise. */
1344
	if (!mopt)
1345
		return;
1346
1347
	switch (mp_opt->sub) {
1348
	case MPTCP_SUB_CAPABLE:
1349
	{
1350
		struct mp_capable *mpcapable = (struct mp_capable *)ptr;
1351
1352
		if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
1353
		    opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
1354
			mptcp_debug("%s: mp_capable: bad option size %d\n",
1355
				    __func__, opsize);
1356
			break;
1357
		}
1358
1359
		if (!sysctl_mptcp_enabled)
1360
			break;
1361
1362
		/* We only support MPTCP version 0 */
1363
		if (mpcapable->ver != 0)
1364
			break;
1365
1366
		/* MPTCP-RFC 6824:
1367
		 * "If receiving a message with the 'B' flag set to 1, and this
1368
		 * is not understood, then this SYN MUST be silently ignored;
1369
		 */
1370
		if (mpcapable->b) {
1371
			mopt->drop_me = 1;
1372
			break;
1373
		}
1374
1375
		/* MPTCP-RFC 6824:
1376
		 * "An implementation that only supports this method MUST set
1377
		 *  bit "H" to 1, and bits "C" through "G" to 0."
1378
		 */
1379
		if (!mpcapable->h)
1380
			break;
1381
1382
		mopt->saw_mpc = 1;
1383
		mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
1384
1385
		if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
1386
			mopt->mptcp_key = mpcapable->sender_key;
1387
1388
		break;
1389
	}
1390
	case MPTCP_SUB_JOIN:
1391
	{
1392
		struct mp_join *mpjoin = (struct mp_join *)ptr;
1393
1394
		if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
1395
		    opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
1396
		    opsize != MPTCP_SUB_LEN_JOIN_ACK) {
1397
			mptcp_debug("%s: mp_join: bad option size %d\n",
1398
				    __func__, opsize);
1399
			break;
1400
		}
1401
1402
		switch (opsize) {
1403
		case MPTCP_SUB_LEN_JOIN_SYN:
1404
			mopt->is_mp_join = 1;
1405
			mopt->low_prio = mpjoin->b;
1406
			mopt->rem_id = mpjoin->addr_id;
1407
			mopt->mptcp_rem_token = mpjoin->u.syn.token;
1408
			mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
1409
			break;
1410
		case MPTCP_SUB_LEN_JOIN_SYNACK:
1411
			mopt->low_prio = mpjoin->b;
1412
			mopt->rem_id = mpjoin->addr_id;
1413
			mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
1414
			mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
1415
			break;
1416
		case MPTCP_SUB_LEN_JOIN_ACK:
1417
			mopt->join_ack = 1;
1418
			memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
1419
			break;
1420
		}
1421
		break;
1422
	}
1423
	case MPTCP_SUB_DSS:
1424
	{
1425
		struct mp_dss *mdss = (struct mp_dss *)ptr;
1426
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
1427
1428
		/* We check opsize for the csum and non-csum case. We do this
1429
		 * because the draft says that the csum SHOULD be ignored if
1430
		 * it has not been negotiated in the MP_CAPABLE but is still
1431
		 * present in the data.
1432
		 *
1433
		 * It will get ignored later in mptcp_queue_skb.
1434
		 */
1435
		if (opsize != mptcp_sub_len_dss(mdss, 0) &&
1436
		    opsize != mptcp_sub_len_dss(mdss, 1)) {
1437
			mptcp_debug("%s: mp_dss: bad option size %d\n",
1438
				    __func__, opsize);
1439
			break;
1440
		}
1441
1442
		ptr += 4;
1443
1444
		if (mdss->A) {
1445
			tcb->mptcp_flags |= MPTCPHDR_ACK;
1446
1447
			if (mdss->a) {
1448
				mopt->data_ack = (u32) get_unaligned_be64(ptr);
1449
				ptr += MPTCP_SUB_LEN_ACK_64;
1450
			} else {
1451
				mopt->data_ack = get_unaligned_be32(ptr);
1452
				ptr += MPTCP_SUB_LEN_ACK;
1453
			}
1454
		}
1455
1456
		tcb->dss_off = (ptr - skb_transport_header(skb));
1457
1458
		if (mdss->M) {
1459
			if (mdss->m) {
1460
				u64 data_seq64 = get_unaligned_be64(ptr);
1461
1462
				tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
1463
				mopt->data_seq = (u32) data_seq64;
1464
1465
				ptr += 12; /* 64-bit dseq + subseq */
1466
			} else {
1467
				mopt->data_seq = get_unaligned_be32(ptr);
1468
				ptr += 8; /* 32-bit dseq + subseq */
1469
			}
1470
			mopt->data_len = get_unaligned_be16(ptr);
1471
1472
			tcb->mptcp_flags |= MPTCPHDR_SEQ;
1473
1474
			/* Is a check-sum present? */
1475
			if (opsize == mptcp_sub_len_dss(mdss, 1))
1476
				tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
1477
1478
			/* DATA_FIN only possible with DSS-mapping */
1479
			if (mdss->F)
1480
				tcb->mptcp_flags |= MPTCPHDR_FIN;
1481
		}
1482
1483
		break;
1484
	}
1485
	case MPTCP_SUB_ADD_ADDR:
1486
	{
1487
#if IS_ENABLED(CONFIG_IPV6)
1488
		struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
1489
1490
		if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
1491
		     opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
1492
		    (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
1493
		     opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
1494
#else
1495
		if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
1496
		    opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
1497
#endif /* CONFIG_IPV6 */
1498
			mptcp_debug("%s: mp_add_addr: bad option size %d\n",
1499
				    __func__, opsize);
1500
			break;
1501
		}
1502
1503
		/* We have to manually parse the options if we got two of them. */
1504
		if (mopt->saw_add_addr) {
1505
			mopt->more_add_addr = 1;
1506
			break;
1507
		}
1508
		mopt->saw_add_addr = 1;
1509
		mopt->add_addr_ptr = ptr;
1510
		break;
1511
	}
1512
	case MPTCP_SUB_REMOVE_ADDR:
1513
		if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
1514
			mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
1515
				    __func__, opsize);
1516
			break;
1517
		}
1518
1519
		if (mopt->saw_rem_addr) {
1520
			mopt->more_rem_addr = 1;
1521
			break;
1522
		}
1523
		mopt->saw_rem_addr = 1;
1524
		mopt->rem_addr_ptr = ptr;
1525
		break;
1526
	case MPTCP_SUB_PRIO:
1527
	{
1528
		struct mp_prio *mpprio = (struct mp_prio *)ptr;
1529
1530
		if (opsize != MPTCP_SUB_LEN_PRIO &&
1531
		    opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
1532
			mptcp_debug("%s: mp_prio: bad option size %d\n",
1533
				    __func__, opsize);
1534
			break;
1535
		}
1536
1537
		mopt->saw_low_prio = 1;
1538
		mopt->low_prio = mpprio->b;
1539
1540
		if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
1541
			mopt->saw_low_prio = 2;
1542
			mopt->prio_addr_id = mpprio->addr_id;
1543
		}
1544
		break;
1545
	}
1546
	case MPTCP_SUB_FAIL:
1547
		if (opsize != MPTCP_SUB_LEN_FAIL) {
1548
			mptcp_debug("%s: mp_fail: bad option size %d\n",
1549
				    __func__, opsize);
1550
			break;
1551
		}
1552
		mopt->mp_fail = 1;
1553
		break;
1554
	case MPTCP_SUB_FCLOSE:
1555
		if (opsize != MPTCP_SUB_LEN_FCLOSE) {
1556
			mptcp_debug("%s: mp_fclose: bad option size %d\n",
1557
				    __func__, opsize);
1558
			break;
1559
		}
1560
1561
		mopt->mp_fclose = 1;
1562
		mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
1563
1564
		break;
1565
	default:
1566
		mptcp_debug("%s: Received unkown subtype: %d\n",
1567
			    __func__, mp_opt->sub);
1568
		break;
1569
	}
1570
}
1571
1572
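Editor's note: the DSS branch of mptcp_parse_options() above steps through the option by 4, 8 or 12 bytes depending on the A/a/M/m flag bits, which is also what the two mptcp_sub_len_dss() length checks encode. The following sketch shows how the expected option length follows from those flags (field sizes per RFC 6824, Section 3.3); it is an illustration only and not part of the patch.

struct dss_flags { unsigned A:1, a:1, M:1, m:1, F:1; };

static int dss_expected_len(struct dss_flags f, int with_csum)
{
	int len = 4;			/* kind, length, subtype and flags */

	if (f.A)
		len += f.a ? 8 : 4;	/* DATA_ACK, 64 or 32 bit */
	if (f.M) {
		len += f.m ? 8 : 4;	/* data sequence number, 64 or 32 bit */
		len += 4;		/* subflow sequence number */
		len += 2;		/* data-level length */
		if (with_csum)
			len += 2;	/* DSS checksum */
	}
	return len;
}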
int mptcp_check_rtt(const struct tcp_sock *tp, int time)
1573
{
1574
	struct mptcp_cb *mpcb = tp->mpcb;
1575
	struct sock *sk;
1576
	u32 rtt_max = 0;
1577
1578
	/* In MPTCP, we take the max delay across all flows,
1579
	 * in order to take into account meta-reordering buffers.
1580
	 */
1581
	mptcp_for_each_sk(mpcb, sk) {
1582
		if (!mptcp_sk_can_recv(sk))
1583
			continue;
1584
1585
		if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
1586
			rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
1587
	}
1588
	if (time < (rtt_max >> 3) || !rtt_max)
1589
		return 1;
1590
1591
	return 0;
1592
}
1593
1594
static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
1595
{
1596
	struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
1597
1598
	if (mpadd->ipver == 4) {
1599
		__be16 port = 0;
1600
		if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
1601
			port  = mpadd->u.v4.port;
1602
1603
		mptcp_v4_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v4.addr, port,
1604
				      mpadd->addr_id);
1605
#if IS_ENABLED(CONFIG_IPV6)
1606
	} else if (mpadd->ipver == 6) {
1607
		__be16 port = 0;
1608
		if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
1609
			port  = mpadd->u.v6.port;
1610
1611
		mptcp_v6_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v6.addr, port,
1612
				      mpadd->addr_id);
1613
#endif /* CONFIG_IPV6 */
1614
	}
1615
}
1616
1617
static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
1618
{
1619
	struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
1620
	int i;
1621
	u8 rem_id;
1622
1623
	for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
1624
		rem_id = (&mprem->addrs_id)[i];
1625
		if (!mptcp_rem_raddress(tcp_sk(sk)->mpcb, rem_id))
1626
			mptcp_send_reset_rem_id(tcp_sk(sk)->mpcb, rem_id);
1627
	}
1628
}
1629
1630
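Editor's note: mptcp_handle_rem_addr() above derives the number of address IDs carried in a REMOVE_ADDR option from the option length, since the IDs directly follow a fixed header. A tiny sketch of that count, not part of the patch; the 4-byte minimum length (i.e. MPTCP_SUB_LEN_REMOVE_ADDR) is an assumption.

/* e.g. a 4-byte option carries one ID, a 6-byte option carries three */
static inline int rem_addr_id_count(int option_len, int min_len_with_one_id)
{
	return option_len - min_len_with_one_id + 1;
}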
static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
1631
{
1632
	struct tcphdr *th = tcp_hdr(skb);
1633
	unsigned char *ptr;
1634
	int length = (th->doff * 4) - sizeof(struct tcphdr);
1635
1636
	/* Jump through the options to check whether ADD_ADDR is there */
1637
	ptr = (unsigned char *)(th + 1);
1638
	while (length > 0) {
1639
		int opcode = *ptr++;
1640
		int opsize;
1641
1642
		switch (opcode) {
1643
		case TCPOPT_EOL:
1644
			return;
1645
		case TCPOPT_NOP:
1646
			length--;
1647
			continue;
1648
		default:
1649
			opsize = *ptr++;
1650
			if (opsize < 2)
1651
				return;
1652
			if (opsize > length)
1653
				return;  /* don't parse partial options */
1654
			if (opcode == TCPOPT_MPTCP &&
1655
			    ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
1656
#if IS_ENABLED(CONFIG_IPV6)
1657
				struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
1658
				if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
1659
				     opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
1660
				    (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
1661
				     opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
1662
#else
1663
				if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
1664
				    opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
1665
#endif /* CONFIG_IPV6 */
1666
					goto cont;
1667
1668
				mptcp_handle_add_addr(ptr, sk);
1669
			}
1670
			if (opcode == TCPOPT_MPTCP &&
1671
			    ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
1672
				if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
1673
					goto cont;
1674
1675
				mptcp_handle_rem_addr(ptr, sk);
1676
			}
1677
cont:
1678
			ptr += opsize - 2;
1679
			length -= opsize;
1680
		}
1681
	}
1682
	return;
1683
}
1684
1685
static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
1686
{
1687
	struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
1688
	struct sock *meta_sk = mptcp_meta_sk(sk);
1689
	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
1690
1691
	if (unlikely(mptcp->rx_opt.mp_fail)) {
1692
		mptcp->rx_opt.mp_fail = 0;
1693
1694
		if (!th->rst && !mpcb->infinite_mapping_snd) {
1695
			struct sock *sk_it;
1696
1697
			mpcb->send_infinite_mapping = 1;
1698
			/* We resend everything that has not been acknowledged */
1699
			meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
1700
1701
			/* We artificially restart the whole send-queue. Thus,
1702
			 * it is as if no packets are in flight
1703
			 */
1704
			tcp_sk(meta_sk)->packets_out = 0;
1705
1706
			/* If the snd_nxt already wrapped around, we have to
1707
			 * undo the wrapping, as we are restarting from snd_una
1708
			 * on.
1709
			 */
1710
			if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
1711
				mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
1712
				mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
1713
			}
1714
			tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
1715
1716
			/* Trigger a sending on the meta. */
1717
			mptcp_push_pending_frames(meta_sk);
1718
1719
			mptcp_for_each_sk(mpcb, sk_it) {
1720
				if (sk != sk_it)
1721
					mptcp_sub_force_close(sk_it);
1722
			}
1723
		}
1724
1725
		return 0;
1726
	}
1727
1728
	if (unlikely(mptcp->rx_opt.mp_fclose)) {
1729
		struct sock *sk_it, *tmpsk;
1730
1731
		mptcp->rx_opt.mp_fclose = 0;
1732
		if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
1733
			return 0;
1734
1735
		if (tcp_need_reset(sk->sk_state))
1736
			tcp_send_active_reset(sk, GFP_ATOMIC);
1737
1738
		mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
1739
			mptcp_sub_force_close(sk_it);
1740
1741
		tcp_reset(meta_sk);
1742
1743
		return 1;
1744
	}
1745
1746
	return 0;
1747
}
1748
1749
static inline void mptcp_path_array_check(struct sock *meta_sk)
1750
{
1751
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
1752
1753
	if (unlikely(mpcb->list_rcvd)) {
1754
		mpcb->list_rcvd = 0;
1755
		mptcp_create_subflows(meta_sk);
1756
	}
1757
}
1758
1759
int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb)
1760
{
1761
	struct tcp_sock *tp = tcp_sk(sk);
1762
	struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
1763
1764
	if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
1765
		return 0;
1766
1767
	if (mptcp_mp_fail_rcvd(sk, th))
1768
		return 1;
1769
1770
	/* RFC 6824, Section 3.3:
1771
	 * If a checksum is not present when its use has been negotiated, the
1772
	 * receiver MUST close the subflow with a RST as it is considered broken.
1773
	 */
1774
	if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
1775
	    !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
1776
		if (tcp_need_reset(sk->sk_state))
1777
			tcp_send_active_reset(sk, GFP_ATOMIC);
1778
1779
		mptcp_sub_force_close(sk);
1780
		return 1;
1781
	}
1782
1783
	/* We have to acknowledge retransmissions of the third
1784
	 * ack.
1785
	 */
1786
	if (mopt->join_ack) {
1787
		tcp_send_delayed_ack(sk);
1788
		mopt->join_ack = 0;
1789
	}
1790
1791
	if (mopt->saw_add_addr || mopt->saw_rem_addr) {
1792
		if (mopt->more_add_addr || mopt->more_rem_addr) {
1793
			mptcp_parse_addropt(skb, sk);
1794
		} else {
1795
			if (mopt->saw_add_addr)
1796
				mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
1797
			if (mopt->saw_rem_addr)
1798
				mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
1799
		}
1800
1801
		mopt->more_add_addr = 0;
1802
		mopt->saw_add_addr = 0;
1803
		mopt->more_rem_addr = 0;
1804
		mopt->saw_rem_addr = 0;
1805
	}
1806
	if (mopt->saw_low_prio) {
1807
		if (mopt->saw_low_prio == 1) {
1808
			tp->mptcp->rcv_low_prio = mopt->low_prio;
1809
		} else {
1810
			struct sock *sk_it;
1811
			mptcp_for_each_sk(tp->mpcb, sk_it) {
1812
				struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
1813
				if (mptcp->rem_id == mopt->prio_addr_id)
1814
					mptcp->rcv_low_prio = mopt->low_prio;
1815
			}
1816
		}
1817
		mopt->saw_low_prio = 0;
1818
	}
1819
1820
	mptcp_data_ack(sk, skb);
1821
1822
	mptcp_path_array_check(mptcp_meta_sk(sk));
1823
	/* Socket may have been mp_killed by a REMOVE_ADDR */
1824
	if (tp->mp_killed)
1825
		return 1;
1826
1827
	return 0;
1828
}
1829
1830
/* The skptr is needed because, if we become MPTCP-capable, we have to switch
1831
 * from meta-socket to master-socket.
1832
 *
1833
 * @return: 1 - we want to reset this connection
1834
 * 	    2 - we want to discard the received syn/ack
1835
 * 	    0 - everything is fine - continue
1836
 */
1837
int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
1838
				    struct sk_buff *skb,
1839
				    struct mptcp_options_received *mopt)
1840
{
1841
	struct tcp_sock *tp = tcp_sk(sk);
1842
1843
	if (tp->mpc) {
1844
		u8 hash_mac_check[20];
1845
		struct mptcp_cb *mpcb = tp->mpcb;
1846
1847
		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
1848
				(u8 *)&mpcb->mptcp_loc_key,
1849
				(u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
1850
				(u8 *)&tp->mptcp->mptcp_loc_nonce,
1851
				(u32 *)hash_mac_check);
1852
		if (memcmp(hash_mac_check,
1853
			   (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
1854
			mptcp_sub_force_close(sk);
1855
			return 1;
1856
		}
1857
1858
		/* Set this flag in order to postpone data sending
1859
		 * until the 4th ack arrives.
1860
		 */
1861
		tp->mptcp->pre_established = 1;
1862
		tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
1863
1864
		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
1865
				(u8 *)&mpcb->mptcp_rem_key,
1866
				(u8 *)&tp->mptcp->mptcp_loc_nonce,
1867
				(u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
1868
				(u32 *)&tp->mptcp->sender_mac[0]);
1869
1870
	} else if (mopt->saw_mpc) {
1871
		if (mptcp_create_master_sk(sk, mopt->mptcp_key,
1872
					   ntohs(tcp_hdr(skb)->window)))
1873
			return 2;
1874
1875
		sk = tcp_sk(sk)->mpcb->master_sk;
1876
		*skptr = sk;
1877
		tp = tcp_sk(sk);
1878
1879
		/* snd_nxt - 1, because it has been incremented
1880
		 * by tcp_connect for the SYN
1881
		 */
1882
		tp->mptcp->snt_isn = tp->snd_nxt - 1;
1883
		tp->mpcb->dss_csum = mopt->dss_csum;
1884
		tp->mptcp->include_mpc = 1;
1885
1886
		sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
1887
		sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
1888
1889
		mptcp_update_metasocket(sk, mptcp_meta_sk(sk));
1890
1891
		 /* Drop the extra hold taken in mptcp_inherit_sk (sk_refcnt was initialized to 2) */
1892
		sock_put(sk);
1893
	} else {
1894
		tp->request_mptcp = 0;
1895
1896
		if (tp->inside_tk_table)
1897
			mptcp_hash_remove(tp);
1898
	}
1899
1900
	if (tp->mpc)
1901
		tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
1902
1903
	return 0;
1904
}
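Editor's note: the MP_JOIN handling in mptcp_rcv_synsent_state_process() above verifies the truncated HMAC carried in the SYN/ACK - the full 160-bit HMAC-SHA1 is recomputed locally over the peers' keys and nonces (via mptcp_hmac_sha1) and only its first 64 bits are compared against the received value. A self-contained sketch of that final comparison step, not part of the patch:

#include <stdint.h>
#include <string.h>

/* full_mac is the locally computed 20-byte HMAC-SHA1; only its leading
 * 64 bits travel in the MP_JOIN SYN/ACK as the truncated MAC.
 */
static int truncated_mac_ok(const uint8_t full_mac[20],
			    uint64_t received_trunc_mac)
{
	return memcmp(full_mac, &received_trunc_mac, 8) == 0;
}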
(-)a/net/mptcp/mptcp_ipv4.c (+728 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - IPv4-specific functions
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#include <linux/export.h>
31
#include <linux/ip.h>
32
#include <linux/list.h>
33
#include <linux/skbuff.h>
34
#include <linux/spinlock.h>
35
#include <linux/tcp.h>
36
37
#include <net/inet_common.h>
38
#include <net/inet_connection_sock.h>
39
#include <net/mptcp.h>
40
#include <net/mptcp_pm.h>
41
#include <net/mptcp_v4.h>
42
#include <net/mptcp_v6.h>
43
#include <net/request_sock.h>
44
#include <net/tcp.h>
45
46
u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
47
		       u32 seq)
48
{
49
	u32 hash[MD5_DIGEST_WORDS];
50
51
	hash[0] = (__force u32)saddr;
52
	hash[1] = (__force u32)daddr;
53
	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
54
	hash[3] = seq;
55
56
	md5_transform(hash, mptcp_secret);
57
58
	return hash[0];
59
}
60
61
u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
62
{
63
	u32 hash[MD5_DIGEST_WORDS];
64
65
	hash[0] = (__force u32)saddr;
66
	hash[1] = (__force u32)daddr;
67
	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
68
	hash[3] = mptcp_key_seed++;
69
70
	md5_transform(hash, mptcp_secret);
71
72
	return *((u64 *)hash);
73
}
74
75
76
static void mptcp_v4_reqsk_destructor(struct request_sock *req)
77
{
78
	mptcp_reqsk_destructor(req);
79
80
	tcp_v4_reqsk_destructor(req);
81
}
82
83
/* Similar to tcp_request_sock_ops */
84
struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
85
	.family		=	PF_INET,
86
	.obj_size	=	sizeof(struct mptcp_request_sock),
87
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
88
	.send_ack	=	tcp_v4_reqsk_send_ack,
89
	.destructor	=	mptcp_v4_reqsk_destructor,
90
	.send_reset	=	tcp_v4_send_reset,
91
	.syn_ack_timeout =	tcp_syn_ack_timeout,
92
};
93
94
static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
95
					  struct request_sock *req,
96
					  unsigned long timeout)
97
{
98
	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr,
99
				     inet_rsk(req)->rmt_port,
100
				     0, MPTCP_HASH_SIZE);
101
102
	inet_csk_reqsk_queue_hash_add(meta_sk, req, timeout);
103
104
	spin_lock(&mptcp_reqsk_hlock);
105
	list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h]);
106
	spin_unlock(&mptcp_reqsk_hlock);
107
}
108
109
/* Similar to tcp_v4_conn_request */
110
static void mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
111
{
112
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
113
	struct tcp_options_received tmp_opt;
114
	struct mptcp_options_received mopt;
115
	struct request_sock *req;
116
	struct inet_request_sock *ireq;
117
	struct mptcp_request_sock *mtreq;
118
	struct dst_entry *dst = NULL;
119
	u8 mptcp_hash_mac[20];
120
	__be32 saddr = ip_hdr(skb)->saddr;
121
	__be32 daddr = ip_hdr(skb)->daddr;
122
	__u32 isn = TCP_SKB_CB(skb)->when;
123
	int want_cookie = 0;
124
125
	tcp_clear_options(&tmp_opt);
126
	mptcp_init_mp_opt(&mopt);
127
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
128
	tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss;
129
	tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
130
131
	req = inet_reqsk_alloc(&mptcp_request_sock_ops);
132
	if (!req)
133
		return;
134
135
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
136
	tcp_openreq_init(req, &tmp_opt, skb);
137
138
	ireq = inet_rsk(req);
139
	ireq->loc_addr = daddr;
140
	ireq->rmt_addr = saddr;
141
	ireq->no_srccheck = inet_sk(meta_sk)->transparent;
142
	ireq->opt = tcp_v4_save_options(skb);
143
144
	if (security_inet_conn_request(meta_sk, skb, req))
145
		goto drop_and_free;
146
147
	if (!want_cookie || tmp_opt.tstamp_ok)
148
		TCP_ECN_create_request(req, skb, sock_net(meta_sk));
149
150
	if (!isn) {
151
		struct flowi4 fl4;
152
153
		/* VJ's idea. We save last timestamp seen
154
		 * from the destination in peer table, when entering
155
		 * state TIME-WAIT, and check against it before
156
		 * accepting new connection request.
157
		 *
158
		 * If "isn" is not zero, this request hit alive
159
		 * timewait bucket, so that all the necessary checks
160
		 * are made in the function processing timewait state.
161
		 */
162
		if (tmp_opt.saw_tstamp &&
163
		    tcp_death_row.sysctl_tw_recycle &&
164
		    (dst = inet_csk_route_req(meta_sk, &fl4, req)) != NULL &&
165
		    fl4.daddr == saddr) {
166
			if (!tcp_peer_is_proven(req, dst, true)) {
167
				NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED);
168
				goto drop_and_release;
169
			}
170
		}
171
		/* Kill the following clause, if you dislike this way. */
172
		else if (!sysctl_tcp_syncookies &&
173
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) <
174
			  (sysctl_max_syn_backlog >> 2)) &&
175
			 !tcp_peer_is_proven(req, dst, false)) {
176
			/* Without syncookies last quarter of
177
			 * backlog is filled with destinations,
178
			 * proven to be alive.
179
			 * It means that we continue to communicate
180
			 * to destinations, already remembered
181
			 * to the moment of synflood.
182
			 */
183
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
184
				       &saddr, ntohs(tcp_hdr(skb)->source));
185
			goto drop_and_release;
186
		}
187
188
		isn = tcp_v4_init_sequence(skb);
189
	}
190
	tcp_rsk(req)->snt_isn = isn;
191
	tcp_rsk(req)->snt_synack = tcp_time_stamp;
192
193
	mtreq = mptcp_rsk(req);
194
	mtreq->mpcb = mpcb;
195
	INIT_LIST_HEAD(&mtreq->collide_tuple);
196
	mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
197
	mtreq->mptcp_rem_key = mpcb->mptcp_rem_key;
198
	mtreq->mptcp_loc_key = mpcb->mptcp_loc_key;
199
	mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(saddr, daddr,
200
						    tcp_hdr(skb)->source,
201
						    tcp_hdr(skb)->dest, isn);
202
	mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key,
203
			(u8 *)&mtreq->mptcp_rem_key,
204
			(u8 *)&mtreq->mptcp_loc_nonce,
205
			(u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
206
	mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
207
	mtreq->rem_id = mopt.rem_id;
208
	mtreq->low_prio = mopt.low_prio;
209
	tcp_rsk(req)->saw_mpc = 1;
210
211
	if (tcp_v4_send_synack(meta_sk, dst, req, skb_get_queue_mapping(skb), want_cookie))
212
		goto drop_and_free;
213
214
	/* Adding to request queue in metasocket */
215
	mptcp_v4_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT);
216
217
	return;
218
219
drop_and_release:
220
	dst_release(dst);
221
drop_and_free:
222
	reqsk_free(req);
223
	return;
224
}
225
226
int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
227
{
228
	int i;
229
230
	for (i = 0; i < MPTCP_MAX_ADDR; i++) {
231
		if (!((1 << i) & mpcb->rem4_bits))
232
			continue;
233
234
		if (mpcb->remaddr4[i].id == id) {
235
			/* remove address from bitfield */
236
			mpcb->rem4_bits &= ~(1 << i);
237
238
			return 0;
239
		}
240
	}
241
242
	return -1;
243
}
244
245
/* Based on function tcp_v4_conn_request (tcp_ipv4.c)
246
 * Returns -1 if there is no space anymore to store an additional
247
 * address
248
 */
249
int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr,
250
			  __be16 port, u8 id)
251
{
252
	int i;
253
	struct mptcp_rem4 *rem4;
254
255
	mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
256
		rem4 = &mpcb->remaddr4[i];
257
258
		/* Address is already in the list --- continue */
259
		if (rem4->id == id &&
260
		    rem4->addr.s_addr == addr->s_addr && rem4->port == port)
261
			return 0;
262
263
		/* This may happen when the peer is behind a NAT. It is
264
		 * trying to JOIN, and thus sends the JOIN with a certain ID.
265
		 * However, the src_addr of the IP packet has been changed. We
266
		 * update the address in the list, because this is the address
267
		 * as our box sees it.
268
		 */
269
		if (rem4->id == id && rem4->addr.s_addr != addr->s_addr) {
270
			/* update the address */
271
			mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
272
				    __func__, &rem4->addr.s_addr,
273
				    &addr->s_addr, id);
274
			rem4->addr.s_addr = addr->s_addr;
275
			rem4->port = port;
276
			mpcb->list_rcvd = 1;
277
			return 0;
278
		}
279
	}
280
281
	i = mptcp_find_free_index(mpcb->rem4_bits);
282
	/* Do we already have the maximum number of local/remote addresses? */
283
	if (i < 0) {
284
		mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
285
			    __func__, MPTCP_MAX_ADDR, &addr->s_addr);
286
		return -1;
287
	}
288
289
	rem4 = &mpcb->remaddr4[i];
290
291
	/* Address is not known yet, store it */
292
	rem4->addr.s_addr = addr->s_addr;
293
	rem4->port = port;
294
	rem4->bitfield = 0;
295
	rem4->retry_bitfield = 0;
296
	rem4->id = id;
297
	mpcb->list_rcvd = 1;
298
	mpcb->rem4_bits |= (1 << i);
299
300
	return 0;
301
}
302
303
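Editor's note: mptcp_v4_add_raddress() above stores remote addresses in a small fixed array whose occupied slots are tracked in the rem4_bits bitmask, so finding a slot is a find-first-zero over that mask. A minimal sketch of that allocation, not part of the patch; MAX_ADDR mirrors MPTCP_MAX_ADDR and find_free_index() stands in for mptcp_find_free_index().

#define MAX_ADDR 8

static int find_free_index(unsigned char bits)
{
	int i;

	for (i = 0; i < MAX_ADDR; i++) {
		if (!(bits & (1 << i)))
			return i;	/* first unused slot */
	}
	return -1;			/* table full */
}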
/* Sets the bitfield of the remote-address field
304
 * local address is not set as it will disappear with the global address-list
305
 */
306
void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr)
307
{
308
	int i;
309
310
	mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
311
		if (mpcb->remaddr4[i].addr.s_addr == daddr) {
312
			/* It's the initial flow - thus local index == 0 */
313
			mpcb->remaddr4[i].bitfield |= 1;
314
			return;
315
		}
316
	}
317
}
318
319
/* We only process join requests here. (either the SYN or the final ACK) */
320
int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
321
{
322
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
323
	struct sock *child, *rsk = NULL;
324
	int ret;
325
326
	if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
327
		struct tcphdr *th = tcp_hdr(skb);
328
		const struct iphdr *iph = ip_hdr(skb);
329
		struct sock *sk;
330
331
		sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
332
					     iph->saddr, th->source, iph->daddr,
333
					     th->dest, inet_iif(skb));
334
335
		if (!sk) {
336
			kfree_skb(skb);
337
			return 0;
338
		}
339
		if (is_meta_sk(sk)) {
340
			WARN("%s Did not find a sub-sk - did found the meta!\n", __func__);
341
			kfree_skb(skb);
342
			sock_put(sk);
343
			return 0;
344
		}
345
346
		if (sk->sk_state == TCP_TIME_WAIT) {
347
			inet_twsk_put(inet_twsk(sk));
348
			kfree_skb(skb);
349
			return 0;
350
		}
351
352
		ret = tcp_v4_do_rcv(sk, skb);
353
		sock_put(sk);
354
355
		return ret;
356
	}
357
	TCP_SKB_CB(skb)->mptcp_flags = 0;
358
359
	/* Has been removed from the tk-table. Thus, no new subflows.
360
	 *
361
	 * Check for close-state is necessary, because we may have been closed
362
	 * without passing by mptcp_close().
363
	 *
364
	 * When falling back, no new subflows are allowed either.
365
	 */
366
	if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
367
	    mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
368
		goto reset_and_discard;
369
370
	child = tcp_v4_hnd_req(meta_sk, skb);
371
372
	if (!child)
373
		goto discard;
374
375
	if (child != meta_sk) {
376
		sock_rps_save_rxhash(child, skb);
377
		/* We don't call tcp_child_process here, because we already
378
		 * hold the meta-sk-lock and are sure that it is not owned
379
		 * by the user.
380
		 */
381
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
382
		bh_unlock_sock(child);
383
		sock_put(child);
384
		if (ret) {
385
			rsk = child;
386
			goto reset_and_discard;
387
		}
388
	} else {
389
		if (tcp_hdr(skb)->syn) {
390
			struct mp_join *join_opt = mptcp_find_join(skb);
391
			/* Currently we make two calls to mptcp_find_join(). This
392
			 * can probably be optimized.
393
			 */
394
			if (mptcp_v4_add_raddress(mpcb,
395
						  (struct in_addr *)&ip_hdr(skb)->saddr,
396
						  0,
397
						  join_opt->addr_id) < 0)
398
				goto reset_and_discard;
399
			mpcb->list_rcvd = 0;
400
401
			mptcp_v4_join_request(meta_sk, skb);
402
			goto discard;
403
		}
404
		goto reset_and_discard;
405
	}
406
	return 0;
407
408
reset_and_discard:
409
	tcp_v4_send_reset(rsk, skb);
410
discard:
411
	kfree_skb(skb);
412
	return 0;
413
}
414
415
/* After this, the ref count of the meta_sk associated with the request_sock
416
 * is incremented. Thus it is the responsibility of the caller
417
 * to call sock_put() when the reference is not needed anymore.
418
 */
419
struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
420
				 const __be32 laddr, const struct net *net)
421
{
422
	struct mptcp_request_sock *mtreq;
423
	struct sock *meta_sk = NULL;
424
425
	spin_lock(&mptcp_reqsk_hlock);
426
	list_for_each_entry(mtreq,
427
			    &mptcp_reqsk_htb[inet_synq_hash(raddr, rport, 0,
428
							    MPTCP_HASH_SIZE)],
429
			    collide_tuple) {
430
		struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
431
		meta_sk = mtreq->mpcb->meta_sk;
432
433
		if (ireq->rmt_port == rport &&
434
		    ireq->rmt_addr == raddr &&
435
		    ireq->loc_addr == laddr &&
436
		    rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
437
		    net_eq(net, sock_net(meta_sk)))
438
			break;
439
		meta_sk = NULL;
440
	}
441
442
	if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
443
		meta_sk = NULL;
444
	spin_unlock(&mptcp_reqsk_hlock);
445
446
	return meta_sk;
447
}
448
449
/* Create a new IPv4 subflow.
450
 *
451
 * We are in user-context and meta-sock-lock is hold.
452
 */
453
int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
454
			   struct mptcp_rem4 *rem)
455
{
456
	struct tcp_sock *tp;
457
	struct sock *sk;
458
	struct sockaddr_in loc_in, rem_in;
459
	struct socket sock;
460
	int ulid_size = 0, ret;
461
462
	/* Don't try again - even if it fails */
463
	rem->bitfield |= (1 << loc->id);
464
465
	/** First, create and prepare the new socket */
466
467
	sock.type = meta_sk->sk_socket->type;
468
	sock.state = SS_UNCONNECTED;
469
	sock.wq = meta_sk->sk_socket->wq;
470
	sock.file = meta_sk->sk_socket->file;
471
	sock.ops = NULL;
472
473
	ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
474
	if (unlikely(ret < 0)) {
475
		mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
476
		return ret;
477
	}
478
479
	sk = sock.sk;
480
	tp = tcp_sk(sk);
481
482
	/* All subsockets need the MPTCP-lock-class */
483
	lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
484
	lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
485
486
	if (mptcp_add_sock(meta_sk, sk, rem->id, GFP_KERNEL))
487
		goto error;
488
489
	tp->mptcp->slave_sk = 1;
490
	tp->mptcp->low_prio = loc->low_prio;
491
492
	/* Initializing the timer for an MPTCP subflow */
493
	setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
494
495
	/** Then, connect the socket to the peer */
496
497
	ulid_size = sizeof(struct sockaddr_in);
498
	loc_in.sin_family = AF_INET;
499
	rem_in.sin_family = AF_INET;
500
	loc_in.sin_port = 0;
501
	if (rem->port)
502
		rem_in.sin_port = rem->port;
503
	else
504
		rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
505
	loc_in.sin_addr = loc->addr;
506
	rem_in.sin_addr = rem->addr;
507
508
	ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
509
	if (ret < 0) {
510
		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
511
			    __func__, ret);
512
		goto error;
513
	}
514
515
	mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
516
		    __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
517
		    tp->mptcp->path_index, &loc_in.sin_addr,
518
		    ntohs(loc_in.sin_port), &rem_in.sin_addr,
519
		    ntohs(rem_in.sin_port));
520
521
	ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
522
				ulid_size, O_NONBLOCK);
523
	if (ret < 0 && ret != -EINPROGRESS) {
524
		mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
525
			    __func__, ret);
526
		goto error;
527
	}
528
529
	sk_set_socket(sk, meta_sk->sk_socket);
530
	sk->sk_wq = meta_sk->sk_wq;
531
532
	return 0;
533
534
error:
535
	/* May happen if mptcp_add_sock fails first */
536
	if (!tp->mpc) {
537
		tcp_close(sk, 0);
538
	} else {
539
		local_bh_disable();
540
		mptcp_sub_force_close(sk);
541
		local_bh_enable();
542
	}
543
	return ret;
544
}
545
546
/****** IPv4-Address event handler ******/
547
548
/* React on IP-addr add/rem-events */
549
static int mptcp_pm_inetaddr_event(struct notifier_block *this,
550
				   unsigned long event, void *ptr)
551
{
552
	return mptcp_pm_addr_event_handler(event, ptr, AF_INET);
553
}
554
555
/* React on ifup/down-events */
556
static int mptcp_pm_netdev_event(struct notifier_block *this,
557
				 unsigned long event, void *ptr)
558
{
559
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
560
	struct in_device *in_dev;
561
562
	if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
563
	      event == NETDEV_CHANGE))
564
		return NOTIFY_DONE;
565
566
	/* Iterate over the addresses of the interface, and only then go over
567
	 * the mpcb's to modify them - that way tk_hash_lock is taken for a
568
	 * shorter time at each iteration, instead of being held from the
569
	 * beginning to the end.
570
	 */
571
	rcu_read_lock();
572
	in_dev = __in_dev_get_rtnl(dev);
573
574
	if (in_dev) {
575
		for_primary_ifa(in_dev) {
576
			mptcp_pm_inetaddr_event(NULL, event, ifa);
577
		} endfor_ifa(in_dev);
578
	}
579
580
	rcu_read_unlock();
581
	return NOTIFY_DONE;
582
}
583
584
void mptcp_pm_addr4_event_handler(struct in_ifaddr *ifa, unsigned long event,
585
				  struct mptcp_cb *mpcb)
586
{
587
	int i;
588
	struct sock *sk, *tmpsk;
589
590
	if (ifa->ifa_scope > RT_SCOPE_LINK)
591
		return;
592
593
	/* Look for the address among the local addresses */
594
	mptcp_for_each_bit_set(mpcb->loc4_bits, i) {
595
		if (mpcb->locaddr4[i].addr.s_addr == ifa->ifa_local)
596
			goto found;
597
	}
598
599
	/* Not yet in address-list */
600
	if ((event == NETDEV_UP || event == NETDEV_CHANGE) &&
601
	    netif_running(ifa->ifa_dev->dev) &&
602
	    !(ifa->ifa_dev->dev->flags & IFF_NOMULTIPATH)) {
603
		i = __mptcp_find_free_index(mpcb->loc4_bits, 0, mpcb->next_v4_index);
604
		if (i < 0) {
605
			mptcp_debug("MPTCP_PM: NETDEV_UP Reached max number of local IPv4 addresses: %d\n",
606
				    MPTCP_MAX_ADDR);
607
			return;
608
		}
609
610
		/* update this mpcb */
611
		mpcb->locaddr4[i].addr.s_addr = ifa->ifa_local;
612
		mpcb->locaddr4[i].id = i;
613
		mpcb->loc4_bits |= (1 << i);
614
		mpcb->next_v4_index = i + 1;
615
		/* re-send addresses */
616
		mptcp_v4_send_add_addr(i, mpcb);
617
		/* re-evaluate paths */
618
		mptcp_create_subflows(mpcb->meta_sk);
619
	}
620
	return;
621
found:
622
	/* Address already in list. Reactivate/Deactivate the
623
	 * concerned paths.
624
	 */
625
	mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
626
		struct tcp_sock *tp = tcp_sk(sk);
627
		if (sk->sk_family != AF_INET ||
628
		    inet_sk(sk)->inet_saddr != ifa->ifa_local)
629
			continue;
630
631
		if (event == NETDEV_DOWN ||
632
		    (ifa->ifa_dev->dev->flags & IFF_NOMULTIPATH)) {
633
			mptcp_reinject_data(sk, 0);
634
			mptcp_sub_force_close(sk);
635
		} else if (event == NETDEV_CHANGE) {
636
			int new_low_prio = (ifa->ifa_dev->dev->flags & IFF_MPBACKUP) ?
637
						1 : 0;
638
			if (new_low_prio != tp->mptcp->low_prio)
639
				tp->mptcp->send_mp_prio = 1;
640
			tp->mptcp->low_prio = new_low_prio;
641
		}
642
	}
643
644
	if (event == NETDEV_DOWN ||
645
	    (ifa->ifa_dev->dev->flags & IFF_NOMULTIPATH)) {
646
		mpcb->loc4_bits &= ~(1 << i);
647
648
		/* Force sending directly the REMOVE_ADDR option */
649
		mpcb->remove_addrs |= (1 << mpcb->locaddr4[i].id);
650
		sk = mptcp_select_ack_sock(mpcb->meta_sk, 0);
651
		if (sk)
652
			tcp_send_ack(sk);
653
654
		mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
655
			mpcb->remaddr4[i].bitfield &= mpcb->loc4_bits;
656
			mpcb->remaddr4[i].retry_bitfield &= mpcb->loc4_bits;
657
		}
658
	}
659
}
660
661
/* Send ADD_ADDR for loc_id on all available subflows */
662
void mptcp_v4_send_add_addr(int loc_id, struct mptcp_cb *mpcb)
663
{
664
	struct tcp_sock *tp;
665
666
	mptcp_for_each_tp(mpcb, tp)
667
		tp->mptcp->add_addr4 |= (1 << loc_id);
668
}
669
670
static struct notifier_block mptcp_pm_inetaddr_notifier = {
671
		.notifier_call = mptcp_pm_inetaddr_event,
672
};
673
674
static struct notifier_block mptcp_pm_netdev_notifier = {
675
		.notifier_call = mptcp_pm_netdev_event,
676
};
677
678
/****** End of IPv4-Address event handler ******/
679
680
/* General initialization of IPv4 for MPTCP */
681
int mptcp_pm_v4_init(void)
682
{
683
	int ret;
684
	struct request_sock_ops *ops = &mptcp_request_sock_ops;
685
686
	ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
687
	if (ops->slab_name == NULL) {
688
		ret = -ENOMEM;
689
		goto out;
690
	}
691
692
	ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
693
				      SLAB_HWCACHE_ALIGN, NULL);
694
695
	if (ops->slab == NULL) {
696
		ret =  -ENOMEM;
697
		goto err_reqsk_create;
698
	}
699
700
	ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
701
	if (ret)
702
		goto err_reg_inetaddr;
703
	ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
704
	if (ret)
705
		goto err_reg_netdev;
706
707
out:
708
	return ret;
709
710
err_reg_netdev:
711
	unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
712
err_reg_inetaddr:
713
	kmem_cache_destroy(ops->slab);
714
err_reqsk_create:
715
	kfree(ops->slab_name);
716
	ops->slab_name = NULL;
717
	goto out;
718
}
719
720
void mptcp_pm_v4_undo(void)
721
{
722
	unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
723
	unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
724
	kmem_cache_destroy(mptcp_request_sock_ops.slab);
725
	kfree(mptcp_request_sock_ops.slab_name);
726
}
727
728
(-)a/net/mptcp/mptcp_ipv6.c (+1010 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - IPv6-specific functions
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer:
8
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#include <linux/export.h>
31
#include <linux/in6.h>
32
#include <linux/kernel.h>
33
34
#include <net/addrconf.h>
35
#include <net/flow.h>
36
#include <net/inet6_connection_sock.h>
37
#include <net/inet6_hashtables.h>
38
#include <net/inet_common.h>
39
#include <net/ipv6.h>
40
#include <net/ip6_checksum.h>
41
#include <net/ip6_route.h>
42
#include <net/mptcp.h>
43
#include <net/mptcp_pm.h>
44
#include <net/mptcp_v6.h>
45
#include <net/tcp.h>
46
#include <net/transp_v6.h>
47
48
static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req,
49
				  u16 queue_mapping);
50
51
__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
52
			 __be16 sport, __be16 dport, u32 seq)
53
{
54
	u32 secret[MD5_MESSAGE_BYTES / 4];
55
	u32 hash[MD5_DIGEST_WORDS];
56
	u32 i;
57
58
	memcpy(hash, saddr, 16);
59
	for (i = 0; i < 4; i++)
60
		secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
61
	secret[4] = mptcp_secret[4] +
62
		    (((__force u16)sport << 16) + (__force u16)dport);
63
	secret[5] = seq;
64
	for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
65
		secret[i] = mptcp_secret[i];
66
67
	md5_transform(hash, secret);
68
69
	return hash[0];
70
}
71
72
u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
73
		     __be16 sport, __be16 dport)
74
{
75
	u32 secret[MD5_MESSAGE_BYTES / 4];
76
	u32 hash[MD5_DIGEST_WORDS];
77
	u32 i;
78
79
	memcpy(hash, saddr, 16);
80
	for (i = 0; i < 4; i++)
81
		secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
82
	secret[4] = mptcp_secret[4] +
83
		    (((__force u16)sport << 16) + (__force u16)dport);
84
	secret[5] = mptcp_key_seed++;
85
	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
86
		secret[i] = mptcp_secret[i];
87
88
	md5_transform(hash, secret);
89
90
	return *((u64 *)hash);
91
}
92
93
static void mptcp_v6_reqsk_destructor(struct request_sock *req)
94
{
95
	mptcp_reqsk_destructor(req);
96
97
	tcp_v6_reqsk_destructor(req);
98
}
99
100
/* Similar to tcp_v6_rtx_synack */
101
static int mptcp_v6_rtx_synack(struct sock *meta_sk, struct request_sock *req)
102
{
103
	if (meta_sk->sk_family == AF_INET6)
104
		return tcp_v6_rtx_synack(meta_sk, req);
105
106
	TCP_INC_STATS_BH(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
107
	return mptcp_v6v4_send_synack(meta_sk, req, 0);
108
}
109
110
/* Similar to tcp6_request_sock_ops */
111
struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
112
	.family		=	AF_INET6,
113
	.obj_size	=	sizeof(struct mptcp6_request_sock),
114
	.rtx_syn_ack	=	mptcp_v6_rtx_synack,
115
	.send_ack	=	tcp_v6_reqsk_send_ack,
116
	.destructor	=	mptcp_v6_reqsk_destructor,
117
	.send_reset	=	tcp_v6_send_reset,
118
	.syn_ack_timeout =	tcp_syn_ack_timeout,
119
};
120
121
static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
122
					  struct request_sock *req,
123
					  unsigned long timeout)
124
{
125
	const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
126
				      inet_rsk(req)->rmt_port,
127
				      0, MPTCP_HASH_SIZE);
128
129
	inet6_csk_reqsk_queue_hash_add(meta_sk, req, timeout);
130
131
	spin_lock(&mptcp_reqsk_hlock);
132
	list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h]);
133
	spin_unlock(&mptcp_reqsk_hlock);
134
}
135
136
/* Similar to tcp_v6_send_synack
137
 *
138
 * The meta-socket is IPv4, but a new subsocket is IPv6
139
 */
140
static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req,
141
				  u16 queue_mapping)
142
{
143
	struct inet6_request_sock *treq = inet6_rsk(req);
144
	struct sk_buff *skb;
145
	struct flowi6 fl6;
146
	struct dst_entry *dst;
147
	int err;
148
149
	memset(&fl6, 0, sizeof(fl6));
150
	fl6.flowi6_proto = IPPROTO_TCP;
151
	fl6.daddr = treq->rmt_addr;
152
	fl6.saddr = treq->loc_addr;
153
	fl6.flowlabel = 0;
154
	fl6.flowi6_oif = treq->iif;
155
	fl6.flowi6_mark = meta_sk->sk_mark;
156
	fl6.fl6_dport = inet_rsk(req)->rmt_port;
157
	fl6.fl6_sport = inet_rsk(req)->loc_port;
158
	security_req_classify_flow(req, flowi6_to_flowi(&fl6));
159
160
	dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL, false);
161
	if (IS_ERR(dst)) {
162
		err = PTR_ERR(dst);
163
		return err;
164
	}
165
	skb = tcp_make_synack(meta_sk, dst, req, NULL);
166
	err = -ENOMEM;
167
	if (skb) {
168
		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
169
170
		fl6.daddr = treq->rmt_addr;
171
		skb_set_queue_mapping(skb, queue_mapping);
172
		err = ip6_xmit(meta_sk, skb, &fl6, NULL, 0);
173
		err = net_xmit_eval(err);
174
	}
175
176
	return err;
177
}
178
179
/* Similar to tcp_v6_syn_recv_sock
180
 *
181
 * The meta-socket is IPv4, but a new subsocket is IPv6
182
 */
183
struct sock *mptcp_v6v4_syn_recv_sock(struct sock *meta_sk, struct sk_buff *skb,
184
				      struct request_sock *req,
185
				      struct dst_entry *dst)
186
{
187
	struct inet6_request_sock *treq;
188
	struct ipv6_pinfo *newnp;
189
	struct tcp6_sock *newtcp6sk;
190
	struct inet_sock *newinet;
191
	struct tcp_sock *newtp;
192
	struct sock *newsk;
193
194
	treq = inet6_rsk(req);
195
196
	if (sk_acceptq_is_full(meta_sk))
197
		goto out_overflow;
198
199
	if (!dst) {
200
		/* This code is similar to inet6_csk_route_req, but as we
201
		 * don't have a np-pointer in the meta, we have to do it
202
		 * manually.
203
		 */
204
		struct flowi6 fl6;
205
206
		memset(&fl6, 0, sizeof(fl6));
207
		fl6.flowi6_proto = IPPROTO_TCP;
208
		fl6.daddr = treq->rmt_addr;
209
		fl6.saddr = treq->loc_addr;
210
		fl6.flowi6_oif = meta_sk->sk_bound_dev_if;
211
		fl6.flowi6_mark = meta_sk->sk_mark;
212
		fl6.fl6_dport = inet_rsk(req)->rmt_port;
213
		fl6.fl6_sport = inet_rsk(req)->loc_port;
214
		security_req_classify_flow(req, flowi6_to_flowi(&fl6));
215
216
		dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL, false);
217
		if (IS_ERR(dst))
218
			goto out;
219
	}
220
221
	newsk = tcp_create_openreq_child(meta_sk, req, skb);
222
	if (newsk == NULL)
223
		goto out_nonewsk;
224
225
	newtcp6sk = (struct tcp6_sock *)newsk;
226
	inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
227
228
	/*
229
	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
230
	 * count here, tcp_create_openreq_child now does this for us, see the
231
	 * comment in that function for the gory details. -acme
232
	 */
233
234
	newsk->sk_gso_type = SKB_GSO_TCPV6;
235
	__ip6_dst_store(newsk, dst, NULL, NULL);
236
	inet6_sk_rx_dst_set(newsk, skb);
237
238
	newtp = tcp_sk(newsk);
239
	newinet = inet_sk(newsk);
240
	newnp = inet6_sk(newsk);
241
242
	newnp->daddr = treq->rmt_addr;
243
	newnp->saddr = treq->loc_addr;
244
	newnp->rcv_saddr = treq->loc_addr;
245
	newsk->sk_bound_dev_if = treq->iif;
246
247
	/* Now IPv6 options...
248
249
	   First: no IPv4 options.
250
	 */
251
	newinet->inet_opt = NULL;
252
	newnp->ipv6_ac_list = NULL;
253
	newnp->ipv6_fl_list = NULL;
254
	newnp->rxopt.all = 0;
255
256
	/* Clone pktoptions received with SYN */
257
	newnp->pktoptions = NULL;
258
	if (treq->pktopts != NULL) {
259
		newnp->pktoptions = skb_clone(treq->pktopts,
260
					      sk_gfp_atomic(meta_sk, GFP_ATOMIC));
261
		consume_skb(treq->pktopts);
262
		treq->pktopts = NULL;
263
		if (newnp->pktoptions)
264
			skb_set_owner_r(newnp->pktoptions, newsk);
265
	}
266
	newnp->opt	  = NULL;
267
	newnp->mcast_oif  = inet6_iif(skb);
268
	newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
269
	newnp->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb));
270
271
	/* Initialization copied from inet6_create - normally this should have
272
	 * been handled by the memcpy as in tcp_v6_syn_recv_sock
273
	 */
274
	newnp->hop_limit  = -1;
275
	newnp->mc_loop	  = 1;
276
	newnp->pmtudisc	  = IPV6_PMTUDISC_WANT;
277
	(void)xchg(&newnp->rxpmtu, NULL);
278
279
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
280
281
	tcp_mtup_init(newsk);
282
	tcp_sync_mss(newsk, dst_mtu(dst));
283
	newtp->advmss = dst_metric_advmss(dst);
284
	if (tcp_sk(meta_sk)->rx_opt.user_mss &&
285
	    tcp_sk(meta_sk)->rx_opt.user_mss < newtp->advmss)
286
		newtp->advmss = tcp_sk(meta_sk)->rx_opt.user_mss;
287
288
	tcp_initialize_rcv_mss(newsk);
289
	tcp_synack_rtt_meas(newsk, req);
290
	newtp->total_retrans = req->num_retrans;
291
292
	newinet->inet_daddr = LOOPBACK4_IPV6;
293
	newinet->inet_saddr = LOOPBACK4_IPV6;
294
	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
295
296
	if (__inet_inherit_port(meta_sk, newsk) < 0) {
297
		inet_csk_prepare_forced_close(newsk);
298
		tcp_done(newsk);
299
		goto out;
300
	}
301
	__inet6_hash(newsk, NULL);
302
303
	return newsk;
304
305
out_overflow:
306
	NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENOVERFLOWS);
307
out_nonewsk:
308
	dst_release(dst);
309
out:
310
	NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENDROPS);
311
	return NULL;
312
}
313
314
/* Similar to tcp_v6_conn_request */
315
static void mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
316
{
317
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
318
	struct tcp_options_received tmp_opt;
319
	struct mptcp_options_received mopt;
320
	struct ipv6_pinfo *np = inet6_sk(meta_sk);
321
	struct request_sock *req;
322
	struct inet6_request_sock *treq;
323
	struct mptcp_request_sock *mtreq;
324
	u8 mptcp_hash_mac[20];
325
	__u32 isn = TCP_SKB_CB(skb)->when;
326
	struct dst_entry *dst = NULL;
327
	struct flowi6 fl6;
328
	int want_cookie = 0;
329
330
	tcp_clear_options(&tmp_opt);
331
	mptcp_init_mp_opt(&mopt);
332
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
333
	tmp_opt.user_mss  = tcp_sk(meta_sk)->rx_opt.user_mss;
334
	tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
335
336
	req = inet6_reqsk_alloc(&mptcp6_request_sock_ops);
337
	if (!req)
338
		return;
339
340
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
341
	tcp_openreq_init(req, &tmp_opt, skb);
342
343
	treq = inet6_rsk(req);
344
	treq->rmt_addr = ipv6_hdr(skb)->saddr;
345
	treq->loc_addr = ipv6_hdr(skb)->daddr;
346
347
	if (!want_cookie || tmp_opt.tstamp_ok)
348
		TCP_ECN_create_request(req, skb, sock_net(meta_sk));
349
350
	treq->iif = meta_sk->sk_bound_dev_if;
351
352
	/* So that link locals have meaning */
353
	if (!meta_sk->sk_bound_dev_if &&
354
	    ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
355
		treq->iif = inet6_iif(skb);
356
357
	if (!isn) {
358
		if (meta_sk->sk_family == AF_INET6 &&
359
		    (ipv6_opt_accepted(meta_sk, skb) ||
360
		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
361
		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)) {
362
			atomic_inc(&skb->users);
363
			treq->pktopts = skb;
364
		}
365
366
		/* VJ's idea. We save last timestamp seen
367
		 * from the destination in peer table, when entering
368
		 * state TIME-WAIT, and check against it before
369
		 * accepting new connection request.
370
		 *
371
		 * If "isn" is not zero, this request hit alive
372
		 * timewait bucket, so that all the necessary checks
373
		 * are made in the function processing timewait state.
374
		 */
375
		if (tmp_opt.saw_tstamp &&
376
		    tcp_death_row.sysctl_tw_recycle &&
377
		    (dst = inet6_csk_route_req(meta_sk, &fl6, req)) != NULL) {
378
			if (!tcp_peer_is_proven(req, dst, true)) {
379
				NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED);
380
				goto drop_and_release;
381
			}
382
		}
383
		/* Kill the following clause, if you dislike this way. */
384
		else if (!sysctl_tcp_syncookies &&
385
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) <
386
			  (sysctl_max_syn_backlog >> 2)) &&
387
			 !tcp_peer_is_proven(req, dst, false)) {
388
			/* Without syncookies last quarter of
389
			 * backlog is filled with destinations,
390
			 * proven to be alive.
391
			 * It means that we continue to communicate
392
			 * to destinations, already remembered
393
			 * to the moment of synflood.
394
			 */
395
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
396
				       &treq->rmt_addr, ntohs(tcp_hdr(skb)->source));
397
			goto drop_and_release;
398
		}
399
400
		isn = tcp_v6_init_sequence(skb);
401
	}
402
403
	tcp_rsk(req)->snt_isn = isn;
404
	tcp_rsk(req)->snt_synack = tcp_time_stamp;
405
406
	mtreq = mptcp_rsk(req);
407
	mtreq->mpcb = mpcb;
408
	INIT_LIST_HEAD(&mtreq->collide_tuple);
409
	mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
410
	mtreq->mptcp_rem_key = mpcb->mptcp_rem_key;
411
	mtreq->mptcp_loc_key = mpcb->mptcp_loc_key;
412
	mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->daddr.s6_addr32,
413
						    ipv6_hdr(skb)->saddr.s6_addr32,
414
						    tcp_hdr(skb)->dest,
415
						    tcp_hdr(skb)->source, isn);
416
	mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key,
417
			(u8 *)&mtreq->mptcp_rem_key,
418
			(u8 *)&mtreq->mptcp_loc_nonce,
419
			(u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
420
	mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
421
	mtreq->rem_id = mopt.rem_id;
422
	mtreq->low_prio = mopt.low_prio;
423
	tcp_rsk(req)->saw_mpc = 1;
424
425
	if (meta_sk->sk_family == AF_INET6) {
426
		if (tcp_v6_send_synack(meta_sk, dst, &fl6, req,
427
				       skb_get_queue_mapping(skb)))
428
			goto drop_and_free;
429
	} else {
430
		if (mptcp_v6v4_send_synack(meta_sk, req, skb_get_queue_mapping(skb)))
431
			goto drop_and_free;
432
	}
433
434
	/* Adding to request queue in metasocket */
435
	mptcp_v6_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT);
436
437
	return;
438
439
drop_and_release:
440
	dst_release(dst);
441
drop_and_free:
442
	reqsk_free(req);
443
	return;
444
}
445
446
int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id)
447
{
448
	int i;
449
450
	for (i = 0; i < MPTCP_MAX_ADDR; i++) {
451
		if (!((1 << i) & mpcb->rem6_bits))
452
			continue;
453
454
		if (mpcb->remaddr6[i].id == id) {
455
			/* remove address from bitfield */
456
			mpcb->rem6_bits &= ~(1 << i);
457
458
			return 0;
459
		}
460
	}
461
462
	return -1;
463
}
464
465
/* Returns -1 if there is no more space to store an additional
466
 * address
467
 */
468
int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr,
469
			  __be16 port, u8 id)
470
{
471
	int i;
472
	struct mptcp_rem6 *rem6;
473
474
	mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
475
		rem6 = &mpcb->remaddr6[i];
476
477
		/* Address is already in the list --- continue */
478
		if (rem6->id == id &&
479
		    ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
480
			return 0;
481
482
		/* This can happen when the peer is behind a NAT. It is
483
		 * trying to JOIN, thus sending the JOIN with a certain ID.
484
		 * However, the src_addr of the IP packet has been changed. We
485
		 * update the addr in the list, because this is the address as
486
		 * our box sees it.
487
		 */
488
		if (rem6->id == id) {
489
			/* update the address */
490
			mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
491
				    __func__, &rem6->addr, addr, id);
492
			rem6->addr = *addr;
493
			rem6->port = port;
494
			mpcb->list_rcvd = 1;
495
			return 0;
496
		}
497
	}
498
499
	i = mptcp_find_free_index(mpcb->rem6_bits);
500
	/* Do we already have the maximum number of local/remote addresses? */
501
	if (i < 0) {
502
		mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
503
			    __func__, MPTCP_MAX_ADDR, addr);
504
		return -1;
505
	}
506
507
	rem6 = &mpcb->remaddr6[i];
508
509
	/* Address is not known yet, store it */
510
	rem6->addr = *addr;
511
	rem6->port = port;
512
	rem6->bitfield = 0;
513
	rem6->retry_bitfield = 0;
514
	rem6->id = id;
515
	mpcb->list_rcvd = 1;
516
	mpcb->rem6_bits |= (1 << i);
517
518
	return 0;
519
}
520
521
/* Sets the bitfield of the remote-address field
522
 * local address is not set as it will disappear with the global address-list
523
 */
524
void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
525
				const struct in6_addr *daddr)
526
{
527
	int i;
528
	mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
529
		if (ipv6_addr_equal(&mpcb->remaddr6[i].addr, daddr)) {
530
			/* It's the initial flow - thus local index == 0 */
531
			mpcb->remaddr6[i].bitfield |= 1;
532
			return;
533
		}
534
	}
535
}
536
537
int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
538
{
539
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
540
	struct sock *child, *rsk = NULL;
541
	int ret;
542
543
	if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
544
		struct tcphdr *th = tcp_hdr(skb);
545
		const struct ipv6hdr *ip6h = ipv6_hdr(skb);
546
		struct sock *sk;
547
548
		sk = __inet6_lookup_established(sock_net(meta_sk),
549
						&tcp_hashinfo,
550
						&ip6h->saddr, th->source,
551
						&ip6h->daddr, ntohs(th->dest),
552
						inet6_iif(skb));
553
554
		if (!sk) {
555
			kfree_skb(skb);
556
			return 0;
557
		}
558
		if (is_meta_sk(sk)) {
559
			WARN("%s Did not find a sub-sk!\n", __func__);
560
			kfree_skb(skb);
561
			sock_put(sk);
562
			return 0;
563
		}
564
565
		if (sk->sk_state == TCP_TIME_WAIT) {
566
			inet_twsk_put(inet_twsk(sk));
567
			kfree_skb(skb);
568
			return 0;
569
		}
570
571
		ret = tcp_v6_do_rcv(sk, skb);
572
		sock_put(sk);
573
574
		return ret;
575
	}
576
	TCP_SKB_CB(skb)->mptcp_flags = 0;
577
578
	/* Has been removed from the tk-table. Thus, no new subflows.
579
	 *
580
	 * Check for close-state is necessary, because we may have been closed
581
	 * without passing by mptcp_close().
582
	 *
583
	 * When falling back, no new subflows are allowed either.
584
	 */
585
	if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
586
	    mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
587
		goto reset_and_discard;
588
589
	child = tcp_v6_hnd_req(meta_sk, skb);
590
591
	if (!child)
592
		goto discard;
593
594
	if (child != meta_sk) {
595
		sock_rps_save_rxhash(child, skb);
596
		/* We don't call tcp_child_process here, because we already
597
		 * hold the meta-sk lock and are sure that it is not owned
598
		 * by the user.
599
		 */
600
		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
601
		bh_unlock_sock(child);
602
		sock_put(child);
603
		if (ret) {
604
			rsk = child;
605
			goto reset_and_discard;
606
		}
607
	} else {
608
		if (tcp_hdr(skb)->syn) {
609
			struct mp_join *join_opt = mptcp_find_join(skb);
610
			/* Currently we make two calls to mptcp_find_join(). This
611
			 * can probably be optimized. */
612
			if (mptcp_v6_add_raddress(mpcb,
613
						  (struct in6_addr *)&ipv6_hdr(skb)->saddr,
614
						  0,
615
						  join_opt->addr_id) < 0)
616
				goto reset_and_discard;
617
			mpcb->list_rcvd = 0;
618
619
			mptcp_v6_join_request(meta_sk, skb);
620
			goto discard;
621
		}
622
		goto reset_and_discard;
623
	}
624
	return 0;
625
626
reset_and_discard:
627
	tcp_v6_send_reset(rsk, skb);
628
discard:
629
	kfree_skb(skb);
630
	return 0;
631
}
632
633
/* After this, the ref count of the meta_sk associated with the request_sock
634
 * is incremented. Thus it is the responsibility of the caller
635
 * to call sock_put() when the reference is not needed anymore.
636
 */
637
struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
638
				 const struct in6_addr *laddr, const struct net *net)
639
{
640
	struct mptcp_request_sock *mtreq;
641
	struct sock *meta_sk = NULL;
642
643
	spin_lock(&mptcp_reqsk_hlock);
644
	list_for_each_entry(mtreq,
645
			    &mptcp_reqsk_htb[inet6_synq_hash(raddr, rport, 0,
646
							     MPTCP_HASH_SIZE)],
647
			    collide_tuple) {
648
		struct inet6_request_sock *treq = inet6_rsk(rev_mptcp_rsk(mtreq));
649
		meta_sk = mtreq->mpcb->meta_sk;
650
651
		if (inet_rsk(rev_mptcp_rsk(mtreq))->rmt_port == rport &&
652
		    rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
653
		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
654
		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
655
		    net_eq(net, sock_net(meta_sk)))
656
			break;
657
		meta_sk = NULL;
658
	}
659
660
	if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
661
		meta_sk = NULL;
662
	spin_unlock(&mptcp_reqsk_hlock);
663
664
	return meta_sk;
665
}
666
667
/* Create a new IPv6 subflow.
668
 *
669
 *	We are in user-context and the meta-sock lock is held.
670
 */
671
int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
672
			   struct mptcp_rem6 *rem)
673
{
674
	struct tcp_sock *tp;
675
	struct sock *sk;
676
	struct sockaddr_in6 loc_in, rem_in;
677
	struct socket sock;
678
	int ulid_size = 0, ret;
679
680
	/* Don't try again - even if it fails.
681
	 * There is a special case as the IPv6 address of the initial subflow
682
	 * has an id = 0. The other ones have id's in the range [8, 16[.
683
	 */
684
	rem->bitfield |= (1 << (loc->id - min_t(u8, loc->id, MPTCP_MAX_ADDR)));
685
686
	/** First, create and prepare the new socket */
687
688
	sock.type = meta_sk->sk_socket->type;
689
	sock.state = SS_UNCONNECTED;
690
	sock.wq = meta_sk->sk_socket->wq;
691
	sock.file = meta_sk->sk_socket->file;
692
	sock.ops = NULL;
693
694
	ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
695
	if (unlikely(ret < 0)) {
696
		mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
697
		return ret;
698
	}
699
700
	sk = sock.sk;
701
	tp = tcp_sk(sk);
702
703
	/* All subsockets need the MPTCP-lock-class */
704
	lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
705
	lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
706
707
	if (mptcp_add_sock(meta_sk, sk, rem->id, GFP_KERNEL))
708
		goto error;
709
710
	tp->mptcp->slave_sk = 1;
711
	tp->mptcp->low_prio = loc->low_prio;
712
713
	/* Initializing the timer for an MPTCP subflow */
714
	setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
715
716
	/** Then, connect the socket to the peer */
717
718
	ulid_size = sizeof(struct sockaddr_in6);
719
	loc_in.sin6_family = AF_INET6;
720
	rem_in.sin6_family = AF_INET6;
721
	loc_in.sin6_port = 0;
722
	if (rem->port)
723
		rem_in.sin6_port = rem->port;
724
	else
725
		rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
726
	loc_in.sin6_addr = loc->addr;
727
	rem_in.sin6_addr = rem->addr;
728
729
	ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
730
	if (ret < 0) {
731
		mptcp_debug("%s: MPTCP subsocket bind()failed, error %d\n",
732
			    __func__, ret);
733
		goto error;
734
	}
735
736
	mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
737
		    __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
738
		    tp->mptcp->path_index, &loc_in.sin6_addr,
739
		    ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
740
		    ntohs(rem_in.sin6_port));
741
742
	ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
743
				ulid_size, O_NONBLOCK);
744
	if (ret < 0 && ret != -EINPROGRESS) {
745
		mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
746
			    __func__, ret);
747
		goto error;
748
	}
749
750
	sk_set_socket(sk, meta_sk->sk_socket);
751
	sk->sk_wq = meta_sk->sk_wq;
752
753
	return 0;
754
755
error:
756
	/* May happen if mptcp_add_sock fails first */
757
	if (!tp->mpc) {
758
		tcp_close(sk, 0);
759
	} else {
760
		local_bh_disable();
761
		mptcp_sub_force_close(sk);
762
		local_bh_enable();
763
	}
764
	return ret;
765
}
766
767
struct mptcp_dad_data {
768
	struct timer_list timer;
769
	struct inet6_ifaddr *ifa;
770
};
771
772
static int mptcp_ipv6_is_in_dad_state(struct inet6_ifaddr *ifa)
773
{
774
	return ((ifa->flags & IFA_F_TENTATIVE) &&
775
		ifa->state == INET6_IFADDR_STATE_DAD);
776
}
777
778
static void mptcp_dad_callback(unsigned long arg);
779
static int mptcp_pm_inet6_addr_event(struct notifier_block *this,
780
				     unsigned long event, void *ptr);
781
782
static inline void mptcp_dad_init_timer(struct mptcp_dad_data *data,
783
					struct inet6_ifaddr *ifa)
784
{
785
	data->ifa = ifa;
786
	data->timer.data = (unsigned long)data;
787
	data->timer.function = mptcp_dad_callback;
788
	if (ifa->idev->cnf.rtr_solicit_delay)
789
		data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
790
	else
791
		data->timer.expires = jiffies + MPTCP_IPV6_DEFAULT_DAD_WAIT;
792
}
793
794
static void mptcp_dad_callback(unsigned long arg)
795
{
796
	struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
797
798
	if (mptcp_ipv6_is_in_dad_state(data->ifa)) {
799
		mptcp_dad_init_timer(data, data->ifa);
800
		add_timer(&data->timer);
801
	} else {
802
		mptcp_pm_inet6_addr_event(NULL, NETDEV_UP, data->ifa);
803
		in6_ifa_put(data->ifa);
804
		kfree(data);
805
	}
806
}
807
808
static inline void mptcp_dad_setup_timer(struct inet6_ifaddr *ifa)
809
{
810
	struct mptcp_dad_data *data;
811
812
	data = kmalloc(sizeof(*data), GFP_ATOMIC);
813
814
	if (!data)
815
		return;
816
817
	init_timer(&data->timer);
818
	mptcp_dad_init_timer(data, ifa);
819
	add_timer(&data->timer);
820
	in6_ifa_hold(ifa);
821
}
822
823
/* React on IPv6-addr add/rem-events */
824
static int mptcp_pm_inet6_addr_event(struct notifier_block *this,
825
				     unsigned long event, void *ptr)
826
{
827
	if (mptcp_ipv6_is_in_dad_state((struct inet6_ifaddr *)ptr)) {
828
		mptcp_dad_setup_timer((struct inet6_ifaddr *)ptr);
829
		return NOTIFY_DONE;
830
	} else {
831
		return mptcp_pm_addr_event_handler(event, ptr, AF_INET6);
832
	}
833
}
834
835
/* React on ifup/down-events */
836
static int mptcp_pm_v6_netdev_event(struct notifier_block *this,
837
		unsigned long event, void *ptr)
838
{
839
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
840
	struct inet6_dev *in6_dev = NULL;
841
842
	if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
843
	      event == NETDEV_CHANGE))
844
		return NOTIFY_DONE;
845
846
	/* Iterate over the addresses of the interface, then we go over the
847
	 * mpcb's to modify them - that way we take tk_hash_lock for a shorter
848
	 * time at each iteration, instead of holding it from the
849
	 * beginning till the end.
850
	 */
851
	rcu_read_lock();
852
	in6_dev = __in6_dev_get(dev);
853
854
	if (in6_dev) {
855
		struct inet6_ifaddr *ifa6;
856
		list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
857
				mptcp_pm_inet6_addr_event(NULL, event, ifa6);
858
	}
859
860
	rcu_read_unlock();
861
	return NOTIFY_DONE;
862
}
863
864
void mptcp_pm_addr6_event_handler(struct inet6_ifaddr *ifa, unsigned long event,
865
				  struct mptcp_cb *mpcb)
866
{
867
	int i;
868
	struct sock *sk, *tmpsk;
869
	int addr_type = ipv6_addr_type(&ifa->addr);
870
871
	/* Checks on interface and address-type */
872
	if (ifa->scope > RT_SCOPE_LINK ||
873
	    addr_type == IPV6_ADDR_ANY ||
874
	    (addr_type & IPV6_ADDR_LOOPBACK) ||
875
	    (addr_type & IPV6_ADDR_LINKLOCAL))
876
		return;
877
878
	/* Look for the address among the local addresses */
879
	mptcp_for_each_bit_set(mpcb->loc6_bits, i) {
880
		if (ipv6_addr_equal(&mpcb->locaddr6[i].addr, &ifa->addr))
881
			goto found;
882
	}
883
884
	/* Not yet in address-list */
885
	if ((event == NETDEV_UP || event == NETDEV_CHANGE) &&
886
	    netif_running(ifa->idev->dev) &&
887
	    !(ifa->idev->dev->flags & IFF_NOMULTIPATH)) {
888
		i = __mptcp_find_free_index(mpcb->loc6_bits, 0, mpcb->next_v6_index);
889
		if (i < 0) {
890
			mptcp_debug("MPTCP_PM: NETDEV_UP Reached max number of local IPv6 addresses: %d\n",
891
				    MPTCP_MAX_ADDR);
892
			return;
893
		}
894
895
		/* update this mpcb */
896
		mpcb->locaddr6[i].addr = ifa->addr;
897
		mpcb->locaddr6[i].id = i + MPTCP_MAX_ADDR;
898
		mpcb->loc6_bits |= (1 << i);
899
		mpcb->next_v6_index = i + 1;
900
		/* re-send addresses */
901
		mptcp_v6_send_add_addr(i, mpcb);
902
		/* re-evaluate paths */
903
		mptcp_create_subflows(mpcb->meta_sk);
904
	}
905
	return;
906
found:
907
	/* Address already in list. Reactivate/Deactivate the
908
	 * concerned paths. */
909
	mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
910
		struct tcp_sock *tp = tcp_sk(sk);
911
		if (sk->sk_family != AF_INET6 ||
912
		    !ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa->addr))
913
			continue;
914
915
		if (event == NETDEV_DOWN ||
916
		    (ifa->idev->dev->flags & IFF_NOMULTIPATH)) {
917
			mptcp_reinject_data(sk, 0);
918
			mptcp_sub_force_close(sk);
919
		} else if (event == NETDEV_CHANGE) {
920
			int new_low_prio = (ifa->idev->dev->flags & IFF_MPBACKUP) ?
921
						1 : 0;
922
			if (new_low_prio != tp->mptcp->low_prio)
923
				tp->mptcp->send_mp_prio = 1;
924
			tp->mptcp->low_prio = new_low_prio;
925
		}
926
	}
927
928
	if (event == NETDEV_DOWN ||
929
	    (ifa->idev->dev->flags & IFF_NOMULTIPATH)) {
930
		mpcb->loc6_bits &= ~(1 << i);
931
932
		/* Force sending directly the REMOVE_ADDR option */
933
		mpcb->remove_addrs |= (1 << mpcb->locaddr6[i].id);
934
		sk = mptcp_select_ack_sock(mpcb->meta_sk, 0);
935
		if (sk)
936
			tcp_send_ack(sk);
937
938
		mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
939
			mpcb->remaddr6[i].bitfield &= mpcb->loc6_bits;
940
			mpcb->remaddr6[i].retry_bitfield &= mpcb->loc6_bits;
941
		}
942
	}
943
}
944
945
/* Send ADD_ADDR for loc_id on all available subflows */
946
void mptcp_v6_send_add_addr(int loc_id, struct mptcp_cb *mpcb)
947
{
948
	struct tcp_sock *tp;
949
950
	mptcp_for_each_tp(mpcb, tp)
951
		tp->mptcp->add_addr6 |= (1 << loc_id);
952
}
953
954
955
static struct notifier_block mptcp_pm_inet6_addr_notifier = {
956
		.notifier_call = mptcp_pm_inet6_addr_event,
957
};
958
959
static struct notifier_block mptcp_pm_v6_netdev_notifier = {
960
		.notifier_call = mptcp_pm_v6_netdev_event,
961
};
962
963
/****** End of IPv6-Address event handler ******/
964
965
int mptcp_pm_v6_init(void)
966
{
967
	int ret;
968
	struct request_sock_ops *ops = &mptcp6_request_sock_ops;
969
970
	ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
971
	if (ops->slab_name == NULL) {
972
		ret = -ENOMEM;
973
		goto out;
974
	}
975
976
	ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
977
				      SLAB_HWCACHE_ALIGN, NULL);
978
979
	if (ops->slab == NULL) {
980
		ret =  -ENOMEM;
981
		goto err_reqsk_create;
982
	}
983
984
	ret = register_inet6addr_notifier(&mptcp_pm_inet6_addr_notifier);
985
	if (ret)
986
		goto err_reg_inet6addr;
987
	ret = register_netdevice_notifier(&mptcp_pm_v6_netdev_notifier);
988
	if (ret)
989
		goto err_reg_netdev6;
990
991
out:
992
	return ret;
993
994
err_reg_netdev6:
995
	unregister_inet6addr_notifier(&mptcp_pm_inet6_addr_notifier);
996
err_reg_inet6addr:
997
	kmem_cache_destroy(ops->slab);
998
err_reqsk_create:
999
	kfree(ops->slab_name);
1000
	ops->slab_name = NULL;
1001
	goto out;
1002
}
1003
1004
void mptcp_pm_v6_undo(void)
1005
{
1006
	kmem_cache_destroy(mptcp6_request_sock_ops.slab);
1007
	kfree(mptcp6_request_sock_ops.slab_name);
1008
	unregister_inet6addr_notifier(&mptcp_pm_inet6_addr_notifier);
1009
	unregister_netdevice_notifier(&mptcp_pm_v6_netdev_notifier);
1010
}
(-)a/net/mptcp/mptcp_ofo_queue.c (+278 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - Fast algorithm for MPTCP meta-reordering
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *	This program is free software; you can redistribute it and/or
24
 *      modify it under the terms of the GNU General Public License
25
 *      as published by the Free Software Foundation; either version
26
 *      2 of the License, or (at your option) any later version.
27
 */
28
29
#include <linux/skbuff.h>
30
#include <linux/slab.h>
31
#include <net/tcp.h>
32
#include <net/mptcp.h>
33
34
static void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
35
				   const struct sk_buff *skb)
36
{
37
	struct tcp_sock *tp;
38
39
	mptcp_for_each_tp(mpcb, tp) {
40
		if (tp->mptcp->shortcut_ofoqueue == skb) {
41
			tp->mptcp->shortcut_ofoqueue = NULL;
42
			return;
43
		}
44
	}
45
}
46
47
/* Does 'skb' fit after 'here' in the queue 'head'?
48
 * If yes, we queue it and return 1
49
 */
50
static int mptcp_ofo_queue_after(struct sk_buff_head *head,
51
				 struct sk_buff *skb, struct sk_buff *here,
52
				 struct tcp_sock *tp)
53
{
54
	struct sock *meta_sk = tp->meta_sk;
55
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
56
	u32 seq = TCP_SKB_CB(skb)->seq;
57
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
58
59
	/* We want to queue skb after here, thus seq >= end_seq */
60
	if (before(seq, TCP_SKB_CB(here)->end_seq))
61
		return 0;
62
63
	if (seq == TCP_SKB_CB(here)->end_seq) {
64
		bool fragstolen = false;
65
66
		if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
67
			__skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
68
			return 1;
69
		} else {
70
			kfree_skb_partial(skb, fragstolen);
71
			return -1;
72
		}
73
	}
74
75
	/* If here is the last one, we can always queue it */
76
	if (skb_queue_is_last(head, here)) {
77
		__skb_queue_after(head, here, skb);
78
		return 1;
79
	} else {
80
		struct sk_buff *skb1 = skb_queue_next(head, here);
81
		/* It's not the last one, but does it fit between 'here' and
82
		 * the one after 'here'? Thus, does end_seq <= after_here->seq
83
		 */
84
		if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
85
			__skb_queue_after(head, here, skb);
86
			return 1;
87
		}
88
	}
89
90
	return 0;
91
}
92
93
static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
94
			 struct sk_buff_head *head, struct tcp_sock *tp)
95
{
96
	struct sock *meta_sk = tp->meta_sk;
97
	struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
98
	struct mptcp_cb *mpcb = meta_tp->mpcb;
99
	struct sk_buff *skb1, *best_shortcut = NULL;
100
	u32 seq = TCP_SKB_CB(skb)->seq;
101
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
102
	u32 distance = 0xffffffff;
103
104
	/* First, check the tp's shortcut */
105
	if (!shortcut) {
106
		if (skb_queue_empty(head)) {
107
			__skb_queue_head(head, skb);
108
			goto end;
109
		}
110
	} else {
111
		int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
112
		/* Is the tp's shortcut a hit? If yes, we insert. */
113
114
		if (ret) {
115
			skb = (ret > 0) ? skb : NULL;
116
			goto end;
117
		}
118
	}
119
120
	/* Check the shortcuts of the other subsockets. */
121
	mptcp_for_each_tp(mpcb, tp_it) {
122
		shortcut = tp_it->mptcp->shortcut_ofoqueue;
123
		/* Can we queue it here? If yes, do so! */
124
		if (shortcut) {
125
			int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
126
127
			if (ret) {
128
				skb = (ret > 0) ? skb : NULL;
129
				goto end;
130
			}
131
		}
132
133
		/* Could not queue it, check if we are close.
134
		 * We are looking for a shortcut, close enough to seq to
135
		 * set skb1 prematurely and thus improve the subsequent lookup,
136
		 * which tries to find a skb1 so that skb1->seq <= seq.
137
		 *
138
		 * So, here we only take shortcuts, whose shortcut->seq > seq,
139
		 * and minimize the distance between shortcut->seq and seq and
140
		 * set best_shortcut to this one with the minimal distance.
141
		 *
142
		 * That way, the subsequent while-loop is shortest.
143
		 */
144
		if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
145
			/* Are we closer than the current best shortcut? */
146
			if ((u32)(seq - TCP_SKB_CB(shortcut)->seq) < distance) {
147
				distance = (u32)(seq - TCP_SKB_CB(shortcut)->seq);
148
				best_shortcut = shortcut;
149
			}
150
		}
151
	}
152
153
	if (best_shortcut)
154
		skb1 = best_shortcut;
155
	else
156
		skb1 = skb_peek_tail(head);
157
158
	if (seq == TCP_SKB_CB(skb1)->end_seq) {
159
		bool fragstolen = false;
160
161
		if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
162
			__skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
163
		} else {
164
			kfree_skb_partial(skb, fragstolen);
165
			skb = NULL;
166
		}
167
168
		goto end;
169
	}
170
171
	/* Find the insertion point, starting from best_shortcut if available.
172
	 *
173
	 * Inspired by tcp_data_queue_ofo.
174
	 */
175
	while (1) {
176
		/* skb1->seq <= seq */
177
		if (!after(TCP_SKB_CB(skb1)->seq, seq))
178
			break;
179
		if (skb_queue_is_first(head, skb1)) {
180
			skb1 = NULL;
181
			break;
182
		}
183
		skb1 = skb_queue_prev(head, skb1);
184
	}
185
186
	/* Does skb overlap the previous one? */
187
	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
188
		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
189
			/* All the bits are present. */
190
			__kfree_skb(skb);
191
			skb = NULL;
192
			goto end;
193
		}
194
		if (seq == TCP_SKB_CB(skb1)->seq) {
195
			if (skb_queue_is_first(head, skb1))
196
				skb1 = NULL;
197
			else
198
				skb1 = skb_queue_prev(head, skb1);
199
		}
200
	}
201
	if (!skb1)
202
		__skb_queue_head(head, skb);
203
	else
204
		__skb_queue_after(head, skb1, skb);
205
206
	/* And clean segments covered by new one as whole. */
207
	while (!skb_queue_is_last(head, skb)) {
208
		skb1 = skb_queue_next(head, skb);
209
210
		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
211
			break;
212
213
		__skb_unlink(skb1, head);
214
		mptcp_remove_shortcuts(mpcb, skb1);
215
		__kfree_skb(skb1);
216
	}
217
218
end:
219
	if (skb) {
220
		skb_set_owner_r(skb, meta_sk);
221
		tp->mptcp->shortcut_ofoqueue = skb;
222
	}
223
224
	return;
225
}
226
227
/**
 * mptcp_add_meta_ofo_queue - add an skb to the meta-level out-of-order queue
228
 * @sk: the subflow that received this skb.
229
 */
230
void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb,
231
			      struct sock *sk)
232
{
233
	struct tcp_sock *tp = tcp_sk(sk);
234
235
	try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
236
		     &tcp_sk(meta_sk)->out_of_order_queue, tp);
237
}
238
239
void mptcp_ofo_queue(struct sock *meta_sk)
240
{
241
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
242
	struct sk_buff *skb;
243
244
	while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
245
		u32 old_rcv_nxt = meta_tp->rcv_nxt;
246
		if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
247
			break;
248
249
		if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
250
			__skb_unlink(skb, &meta_tp->out_of_order_queue);
251
			mptcp_remove_shortcuts(meta_tp->mpcb, skb);
252
			__kfree_skb(skb);
253
			continue;
254
		}
255
256
		__skb_unlink(skb, &meta_tp->out_of_order_queue);
257
		mptcp_remove_shortcuts(meta_tp->mpcb, skb);
258
259
		__skb_queue_tail(&meta_sk->sk_receive_queue, skb);
260
		meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
261
		mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
262
263
		if (tcp_hdr(skb)->fin)
264
			mptcp_fin(meta_sk);
265
	}
266
}
267
268
void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
269
{
270
	struct sk_buff_head *head = &meta_tp->out_of_order_queue;
271
	struct sk_buff *skb, *tmp;
272
273
	skb_queue_walk_safe(head, skb, tmp) {
274
		__skb_unlink(skb, head);
275
		mptcp_remove_shortcuts(meta_tp->mpcb, skb);
276
		kfree_skb(skb);
277
	}
278
}
(-)a/net/mptcp/mptcp_olia.c (+314 lines)
Line 0 Link Here
1
/*
2
 * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
3
 *
4
 * Algorithm design:
5
 * Ramin Khalili <ramin.khalili@epfl.ch>
6
 * Nicolas Gast <nicolas.gast@epfl.ch>
7
 * Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch>
8
 *
9
 * Implementation:
10
 * Ramin Khalili <ramin.khalili@epfl.ch>
11
 *
12
 * Ported to the official MPTCP-kernel:
13
 * Christoph Paasch <christoph.paasch@uclouvain.be>
14
 *
15
 * This program is free software; you can redistribute it and/or
16
 * modify it under the terms of the GNU General Public License
17
 * as published by the Free Software Foundation; either version
18
 * 2 of the License, or (at your option) any later version.
19
 */
20
21
22
#include <net/tcp.h>
23
#include <net/mptcp.h>
24
25
#include <linux/module.h>
26
27
static int scale = 10;
28
29
struct mptcp_olia {
30
	u32	mptcp_loss1;
31
	u32	mptcp_loss2;
32
	u32	mptcp_loss3;
33
	int	epsilon_num;
34
	u32	epsilon_den;
35
	int	mptcp_snd_cwnd_cnt;
36
};
37
38
static inline int mptcp_olia_sk_can_send(const struct sock *sk)
39
{
40
	return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt;
41
}
42
43
static inline u64 mptcp_olia_scale(u64 val, int scale)
44
{
45
	return (u64) val << scale;
46
}
47
48
/* Take care of the artificial inflation (see RFC 5681)
49
 * of cwnd during fast-retransmit phase
50
 */
51
static u32 mptcp_get_crt_cwnd(struct sock *sk)
52
{
53
	struct inet_connection_sock *icsk = inet_csk(sk);
54
55
	if (icsk->icsk_ca_state == TCP_CA_Recovery)
56
		return tcp_sk(sk)->snd_ssthresh;
57
	else
58
		return tcp_sk(sk)->snd_cwnd;
59
}
60
61
/* return the denominator of the first term of the increasing term */
62
static u64 mptcp_get_rate(struct mptcp_cb *mpcb, u32 path_rtt)
63
{
64
	struct sock *sk;
65
	u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
66
67
	mptcp_for_each_sk(mpcb, sk) {
68
		struct tcp_sock *tp = tcp_sk(sk);
69
		u64 scaled_num;
70
		u32 tmp_cwnd;
71
72
		if (!mptcp_olia_sk_can_send(sk))
73
			continue;
74
75
		tmp_cwnd = mptcp_get_crt_cwnd(sk);
76
		scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
77
		rate += div_u64(scaled_num , tp->srtt);
78
	}
79
	rate *= rate;
80
	return rate;
81
}
82
83
/* find the maximum cwnd, used to find set M */
84
static u32 mptcp_get_max_cwnd(struct mptcp_cb *mpcb)
85
{
86
	struct sock *sk;
87
	u32 best_cwnd = 0;
88
89
	mptcp_for_each_sk(mpcb, sk) {
90
		u32 tmp_cwnd;
91
92
		if (!mptcp_olia_sk_can_send(sk))
93
			continue;
94
95
		tmp_cwnd = mptcp_get_crt_cwnd(sk);
96
		if (tmp_cwnd > best_cwnd)
97
			best_cwnd = tmp_cwnd;
98
	}
99
	return best_cwnd;
100
}
101
102
static void mptcp_get_epsilon(struct mptcp_cb *mpcb)
103
{
104
	struct mptcp_olia *ca;
105
	struct tcp_sock *tp;
106
	struct sock *sk;
107
	u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
108
	u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
109
	u8 M = 0, B_not_M = 0;
110
111
	/* TODO - integrate this in the following loop - we just want to iterate once */
112
113
	max_cwnd = mptcp_get_max_cwnd(mpcb);
114
115
	/* find the best path */
116
	mptcp_for_each_sk(mpcb, sk) {
117
		tp = tcp_sk(sk);
118
		ca = inet_csk_ca(sk);
119
120
		if (!mptcp_olia_sk_can_send(sk))
121
			continue;
122
123
		tmp_rtt = tp->srtt * tp->srtt;
124
		/* TODO - check here and rename variables */
125
		tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
126
			      ca->mptcp_loss2 - ca->mptcp_loss1);
127
128
		tmp_cwnd = mptcp_get_crt_cwnd(sk);
129
		if (tmp_int * best_rtt >= best_int * tmp_rtt) {
130
			best_rtt = tmp_rtt;
131
			best_int = tmp_int;
132
			best_cwnd = tmp_cwnd;
133
		}
134
	}
135
136
	/* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
137
	/* find the size of M and B_not_M */
138
	mptcp_for_each_sk(mpcb, sk) {
139
		tp = tcp_sk(sk);
140
		ca = inet_csk_ca(sk);
141
142
		if (!mptcp_olia_sk_can_send(sk))
143
			continue;
144
145
		tmp_cwnd = mptcp_get_crt_cwnd(sk);
146
		if (tmp_cwnd == max_cwnd) {
147
			M++;
148
		} else {
149
			tmp_rtt = tp->srtt * tp->srtt;
150
			tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
151
				      ca->mptcp_loss2 - ca->mptcp_loss1);
152
153
			if (tmp_int * best_rtt == best_int * tmp_rtt)
154
				B_not_M++;
155
		}
156
	}
157
158
	/* check if the path is in M or B_not_M and set the value of epsilon accordingly */
159
	mptcp_for_each_sk(mpcb, sk) {
160
		tp = tcp_sk(sk);
161
		ca = inet_csk_ca(sk);
162
163
		if (!mptcp_olia_sk_can_send(sk))
164
			continue;
165
166
		if (B_not_M == 0) {
167
			ca->epsilon_num = 0;
168
			ca->epsilon_den = 1;
169
		} else {
170
			tmp_rtt = tp->srtt * tp->srtt;
171
			tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
172
				      ca->mptcp_loss2 - ca->mptcp_loss1);
173
			tmp_cwnd = mptcp_get_crt_cwnd(sk);
174
175
			if (tmp_cwnd < max_cwnd &&
176
			    tmp_int * best_rtt == best_int * tmp_rtt){
177
				ca->epsilon_num = 1;
178
				ca->epsilon_den = mpcb->cnt_established * B_not_M;
179
			} else if (tmp_cwnd == max_cwnd) {
180
				ca->epsilon_num = -1;
181
				ca->epsilon_den = mpcb->cnt_established  * M;
182
			} else {
183
				ca->epsilon_num = 0;
184
				ca->epsilon_den = 1;
185
			}
186
		}
187
	}
188
189
}
190
191
/* setting the initial values */
192
static void mptcp_olia_init(struct sock *sk)
193
{
194
	struct tcp_sock *tp = tcp_sk(sk);
195
	struct mptcp_olia *ca = inet_csk_ca(sk);
196
197
	if (tp->mpc) {
198
		ca->mptcp_loss1 = tp->snd_una;
199
		ca->mptcp_loss2 = tp->snd_una;
200
		ca->mptcp_loss3 = tp->snd_una;
201
		ca->mptcp_snd_cwnd_cnt = 0;
202
		ca->epsilon_num = 0;
203
		ca->epsilon_den = 1;
204
	}
205
}
206
207
/* updating inter-loss distance and ssthresh */
208
static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
209
{
210
	if (!tcp_sk(sk)->mpc)
211
		return;
212
213
	if (new_state == TCP_CA_Loss ||
214
	    new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
215
		struct mptcp_olia *ca = inet_csk_ca(sk);
216
217
		if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
218
		    !inet_csk(sk)->icsk_retransmits) {
219
			ca->mptcp_loss1 = ca->mptcp_loss2;
220
			ca->mptcp_loss2 = ca->mptcp_loss3;
221
		}
222
	}
223
224
}
225
226
/* main algorithm */
227
static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
228
{
229
	struct tcp_sock *tp = tcp_sk(sk);
230
	struct mptcp_olia *ca = inet_csk_ca(sk);
231
	struct mptcp_cb *mpcb = tp->mpcb;
232
233
	u64 inc_num, inc_den, rate, cwnd_scaled;
234
235
	if (!tp->mpc) {
236
		tcp_reno_cong_avoid(sk, ack, in_flight);
237
		return;
238
	}
239
240
	ca->mptcp_loss3 = tp->snd_una;
241
242
	if (!tcp_is_cwnd_limited(sk, in_flight))
243
		return;
244
245
	/* slow start if it is in the safe area */
246
	if (tp->snd_cwnd <= tp->snd_ssthresh) {
247
		tcp_slow_start(tp);
248
		return;
249
	}
250
251
	mptcp_get_epsilon(mpcb);
252
	rate = mptcp_get_rate(mpcb, tp->srtt);
253
	cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
254
	inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
255
256
	/* calculate the increasing term; scaling is used to reduce the rounding effect */
257
	if (ca->epsilon_num == -1) {
258
		if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
259
			inc_num = rate - ca->epsilon_den *
260
				cwnd_scaled * cwnd_scaled;
261
			ca->mptcp_snd_cwnd_cnt -= div64_u64(
262
			    mptcp_olia_scale(inc_num , scale) , inc_den);
263
		} else {
264
			inc_num = ca->epsilon_den *
265
			    cwnd_scaled * cwnd_scaled - rate;
266
			ca->mptcp_snd_cwnd_cnt += div64_u64(
267
			    mptcp_olia_scale(inc_num , scale) , inc_den);
268
		}
269
	} else {
270
		inc_num = ca->epsilon_num * rate +
271
		    ca->epsilon_den * cwnd_scaled * cwnd_scaled;
272
		ca->mptcp_snd_cwnd_cnt += div64_u64(
273
		    mptcp_olia_scale(inc_num , scale) , inc_den);
274
	}
275
276
277
	if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
278
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
279
			tp->snd_cwnd++;
280
		ca->mptcp_snd_cwnd_cnt = 0;
281
	} else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
282
		tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
283
		ca->mptcp_snd_cwnd_cnt = 0;
284
	}
285
}
286
287
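Stripping away the fixed-point arithmetic, the per-ACK increment that mptcp_olia_cong_avoid() accumulates in mptcp_snd_cwnd_cnt appears to correspond to the OLIA increase rule; a sketch in LaTeX notation, where w_r and rtt_r stand for snd_cwnd and srtt of subflow r and \epsilon_r for the ratio epsilon_num/epsilon_den computed by mptcp_get_epsilon():

\[
  w_r \leftarrow w_r + \frac{w_r / \mathrm{rtt}_r^2}{\Bigl(\sum_p w_p / \mathrm{rtt}_p\Bigr)^2} + \frac{\epsilon_r}{w_r}
  \qquad \text{per ACK received on subflow } r .
\]

mptcp_get_rate() supplies the squared sum in the first denominator (scaled by rtt_r^2 and 2^(2*scale)), and snd_cwnd itself only moves by one full segment once the scaled counter crosses +/- (1 << scale).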
static struct tcp_congestion_ops mptcp_olia = {
288
	.init		= mptcp_olia_init,
289
	.ssthresh	= tcp_reno_ssthresh,
290
	.cong_avoid	= mptcp_olia_cong_avoid,
291
	.set_state	= mptcp_olia_set_state,
292
	.min_cwnd	= tcp_reno_min_cwnd,
293
	.owner		= THIS_MODULE,
294
	.name		= "olia",
295
};
296
297
static int __init mptcp_olia_register(void)
298
{
299
	BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
300
	return tcp_register_congestion_control(&mptcp_olia);
301
}
302
303
static void __exit mptcp_olia_unregister(void)
304
{
305
	tcp_unregister_congestion_control(&mptcp_olia);
306
}
307
308
module_init(mptcp_olia_register);
309
module_exit(mptcp_olia_unregister);
310
311
MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
312
MODULE_LICENSE("GPL");
313
MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
314
MODULE_VERSION("0.1");
(-)a/net/mptcp/mptcp_output.c (+2345 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - Sending side
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#include <linux/kconfig.h>
31
#include <linux/skbuff.h>
32
#include <linux/tcp.h>
33
34
#include <net/mptcp.h>
35
#include <net/mptcp_v4.h>
36
#include <net/mptcp_v6.h>
37
#include <net/sock.h>
38
39
/* Is the sub-socket sk available to send the skb? */
40
static int mptcp_is_available(struct sock *sk, struct sk_buff *skb,
41
			      unsigned int *mss)
42
{
43
	struct tcp_sock *tp = tcp_sk(sk);
44
	unsigned int mss_now;
45
46
	/* Set of states for which we are allowed to send data */
47
	if (!mptcp_sk_can_send(sk))
48
		return 0;
49
50
	/* We do not send data on this subflow unless it is
51
	 * fully established, i.e. the 4th ack has been received.
52
	 */
53
	if (tp->mptcp->pre_established)
54
		return 0;
55
56
	if (tp->pf ||
57
	    (tp->mpcb->noneligible & mptcp_pi_to_flag(tp->mptcp->path_index)))
58
		return 0;
59
60
	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
61
		/* If SACK is disabled, and we got a loss, TCP does not exit
62
		 * the loss-state until something above high_seq has been acked.
63
		 * (see tcp_try_undo_recovery)
64
		 *
65
		 * high_seq is the snd_nxt at the moment of the RTO. As soon
66
		 * as we have an RTO, we won't push data on the subflow.
67
		 * Thus, snd_una can never go beyond high_seq.
68
		 */
69
		if (!tcp_is_reno(tp))
70
			return 0;
71
		else if (tp->snd_una != tp->high_seq)
72
			return 0;
73
	}
74
75
	if (!tp->mptcp->fully_established) {
76
		/* Make sure that we send in-order data */
77
		if (skb && tp->mptcp->second_packet &&
78
		    tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
79
			return 0;
80
	}
81
82
	if (!tcp_cwnd_test(tp, skb))
83
		return 0;
84
85
	mss_now = tcp_current_mss(sk);
86
	/* Don't send on this subflow if we bypass the allowed send-window at
87
	 * the per-subflow level. Similar to tcp_snd_wnd_test, but manually
88
	 * calculated end_seq (because at this point end_seq is still at
89
	 * the meta-level).
90
	 */
91
	if (skb && after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
92
		return 0;
93
94
	if (mss)
95
		*mss = mss_now;
96
97
	return 1;
98
}
99
100
/* Are we not allowed to reinject this skb on tp? */
101
static int mptcp_dont_reinject_skb(struct tcp_sock *tp, struct sk_buff *skb)
102
{
103
	/* If the skb has already been enqueued in this sk, try to find
104
	 * another one.
105
	 * An exception is a DATA_FIN without data. These ones are not
106
	 * reinjected at the subflow-level as they do not consume
107
	 * subflow-sequence-number space.
108
	 */
109
	return skb &&
110
		/* We either have a data_fin with data or not a data_fin */
111
		((mptcp_is_data_fin(skb) && TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq  > 1) ||
112
		!mptcp_is_data_fin(skb)) &&
113
		/* Has the skb already been enqueued into this subsocket? */
114
		mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
115
}
116
117
/* This is the scheduler. This function decides on which flow to send
118
 * a given MSS. If all subflows are found to be busy, NULL is returned.
119
 * The flow is selected based on the shortest RTT.
120
 * If all paths have full cong windows, we simply return NULL.
121
 *
122
 * Additionally, this function is aware of the backup-subflows.
123
 */
124
static struct sock *get_available_subflow(struct sock *meta_sk,
125
					  struct sk_buff *skb,
126
					  unsigned int *mss_now)
127
{
128
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
129
	struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
130
	unsigned int mss = 0, mss_lowprio = 0, mss_backup = 0;
131
	u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
132
	int cnt_backups = 0;
133
134
	/* if there is only one subflow, bypass the scheduling function */
135
	if (mpcb->cnt_subflows == 1) {
136
		bestsk = (struct sock *)mpcb->connection_list;
137
		if (!mptcp_is_available(bestsk, skb, mss_now))
138
			bestsk = NULL;
139
		return bestsk;
140
	}
141
142
	/* Answer data_fin on same subflow!!! */
143
	if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
144
	    skb && mptcp_is_data_fin(skb)) {
145
		mptcp_for_each_sk(mpcb, sk) {
146
			if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
147
			    mptcp_is_available(sk, skb, mss_now))
148
				return sk;
149
		}
150
	}
151
152
	/* First, find the best subflow */
153
	mptcp_for_each_sk(mpcb, sk) {
154
		struct tcp_sock *tp = tcp_sk(sk);
155
		int this_mss;
156
157
		if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
158
			cnt_backups++;
159
160
		if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
161
		    tp->srtt < lowprio_min_time_to_peer) {
162
163
			if (!mptcp_is_available(sk, skb, &this_mss))
164
				continue;
165
166
			if (mptcp_dont_reinject_skb(tp, skb)) {
167
				mss_backup = this_mss;
168
				backupsk = sk;
169
				continue;
170
			}
171
172
			lowprio_min_time_to_peer = tp->srtt;
173
			lowpriosk = sk;
174
			mss_lowprio = this_mss;
175
		} else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
176
			   tp->srtt < min_time_to_peer) {
177
			if (!mptcp_is_available(sk, skb, &this_mss))
178
				continue;
179
180
			if (mptcp_dont_reinject_skb(tp, skb)) {
181
				mss_backup = this_mss;
182
				backupsk = sk;
183
				continue;
184
			}
185
186
			min_time_to_peer = tp->srtt;
187
			bestsk = sk;
188
			mss = this_mss;
189
		}
190
	}
191
192
	if (mpcb->cnt_established == cnt_backups && lowpriosk) {
193
		mss = mss_lowprio;
194
		sk = lowpriosk;
195
	} else if (bestsk) {
196
		sk = bestsk;
197
	} else if (backupsk) {
198
		/* It has been sent on all subflows once - let's give it a
199
		 * chance again by restarting its pathmask.
200
		 */
201
		if (skb)
202
			TCP_SKB_CB(skb)->path_mask = 0;
203
		mss = mss_backup;
204
		sk = backupsk;
205
	}
206
207
	if (mss_now)
208
		*mss_now = mss;
209
210
	return sk;
211
}
212
213
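The branching in get_available_subflow() above condenses to the following selection order (a simplified restatement of the code, using only names that appear in it):

/* Selection order implemented by get_available_subflow():
 *  1. With a single subflow, return it iff mptcp_is_available() agrees.
 *  2. A DATA_FIN is preferably answered on the subflow whose path_index
 *     equals mpcb->dfin_path_index.
 *  3. Otherwise pick the available non-backup subflow with the smallest
 *     srtt that has not yet carried this skb.
 *  4. If every established subflow is a backup (cnt_established ==
 *     cnt_backups), pick the lowest-srtt available backup instead.
 *  5. As a last resort, reuse a subflow that already carried the skb and
 *     clear TCP_SKB_CB(skb)->path_mask so it may be retried everywhere.
 */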
static struct mp_dss *mptcp_skb_find_dss(const struct sk_buff *skb)
214
{
215
	if (!mptcp_is_data_seq(skb))
216
		return NULL;
217
218
	return (struct mp_dss *)(skb->data - (MPTCP_SUB_LEN_DSS_ALIGN +
219
					      MPTCP_SUB_LEN_ACK_ALIGN +
220
					      MPTCP_SUB_LEN_SEQ_ALIGN));
221
}
222
223
/* get the data-seq and end-data-seq and store them again in the
224
 * tcp_skb_cb
225
 */
226
static int mptcp_reconstruct_mapping(struct sk_buff *skb, struct sk_buff *orig_skb)
227
{
228
	struct mp_dss *mpdss = mptcp_skb_find_dss(orig_skb);
229
	u32 *p32;
230
	u16 *p16;
231
232
	if (!mpdss || !mpdss->M)
233
		return 1;
234
235
	/* Move the pointer to the data-seq */
236
	p32 = (u32 *)mpdss;
237
	p32++;
238
	if (mpdss->A) {
239
		p32++;
240
		if (mpdss->a)
241
			p32++;
242
	}
243
244
	TCP_SKB_CB(skb)->seq = ntohl(*p32);
245
246
	/* Get the data_len to calculate the end_data_seq */
247
	p32++;
248
	p32++;
249
	p16 = (u16 *)p32;
250
	TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
251
252
	return 0;
253
}
254
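
For reviewers, a stand-alone sketch (not part of the patch) of the pointer walk done above: skip the 4-byte DSS header, skip the data_ack (4 bytes when 'A' is set and 'a' is clear), read the 32-bit data-seq, skip it and the subflow-seq, then read the 16-bit data-level length. The buffer layout and offsets are assumptions for that common 32-bit case.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned char opt[20];
	uint32_t seq, end_seq, tmp32;
	uint16_t len16;
	size_t off = 4;			/* skip kind/len/subtype/flags */

	/* Build a fake DSS option with 32-bit data_ack and data_seq */
	memset(opt, 0, sizeof(opt));
	tmp32 = htonl(0);	memcpy(opt + 4, &tmp32, 4);	/* data-ack (A=1, a=0) */
	tmp32 = htonl(1000);	memcpy(opt + 8, &tmp32, 4);	/* data-seq */
	tmp32 = htonl(1);	memcpy(opt + 12, &tmp32, 4);	/* subflow-seq */
	len16 = htons(1400);	memcpy(opt + 16, &len16, 2);	/* data-level length */

	/* Same walk as mptcp_reconstruct_mapping(), using offsets */
	off += 4;				/* skip the 32-bit data-ack */
	memcpy(&tmp32, opt + off, 4);
	seq = ntohl(tmp32);			/* -> TCP_SKB_CB(skb)->seq */
	off += 8;				/* skip data-seq and subflow-seq */
	memcpy(&len16, opt + off, 2);
	end_seq = seq + ntohs(len16);		/* -> TCP_SKB_CB(skb)->end_seq */

	printf("data_seq=%u end_data_seq=%u\n",
	       (unsigned)seq, (unsigned)end_seq);	/* 1000 2400 */
	return 0;
}
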
255
/* Similar to __pskb_copy and sk_stream_alloc_skb. */
256
static struct sk_buff *mptcp_pskb_copy(struct sk_buff *skb)
257
{
258
	struct sk_buff *n;
259
	/* The TCP header must be at least 32-bit aligned.  */
260
	int size = ALIGN(skb_headlen(skb), 4);
261
262
	n = alloc_skb_fclone(size + MAX_TCP_HEADER, GFP_ATOMIC);
263
	if (!n)
264
		return NULL;
265
266
	/* Set the data pointer */
267
	skb_reserve(n, MAX_TCP_HEADER);
268
	/* Set the tail pointer and length */
269
	skb_put(n, skb_headlen(skb));
270
	/* Copy the bytes */
271
	skb_copy_from_linear_data(skb, n->data, n->len);
272
273
	n->truesize += skb->data_len;
274
	n->data_len  = skb->data_len;
275
	n->len	     = skb->len;
276
277
	if (skb_shinfo(skb)->nr_frags) {
278
		int i;
279
280
		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
281
			if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
282
				kfree_skb(n);
283
				n = NULL;
284
				goto out;
285
			}
286
		}
287
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
288
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
289
			skb_frag_ref(skb, i);
290
		}
291
		skb_shinfo(n)->nr_frags = i;
292
	}
293
294
	if (skb_has_frag_list(skb)) {
295
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
296
		skb_clone_fraglist(n);
297
	}
298
299
	copy_skb_header(n, skb);
300
out:
301
	return n;
302
}
303
304
/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
305
 * coming from the meta-retransmit-timer
306
 */
307
static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
308
				  struct sock *sk, int clone_it)
309
{
310
	struct sk_buff *skb, *skb1;
311
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
312
	struct mptcp_cb *mpcb = meta_tp->mpcb;
313
	u32 seq, end_seq;
314
315
	if (clone_it) {
316
		/* pskb_copy is necessary here, because the TCP/IP-headers
317
		 * will be changed when it's going to be reinjected on another
318
		 * subflow.
319
		 */
320
		skb = mptcp_pskb_copy(orig_skb);
321
	} else {
322
		__skb_unlink(orig_skb, &sk->sk_write_queue);
323
		sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
324
		sk->sk_wmem_queued -= orig_skb->truesize;
325
		sk_mem_uncharge(sk, orig_skb->truesize);
326
		skb = orig_skb;
327
	}
328
	if (unlikely(!skb))
329
		return;
330
331
	if (sk && mptcp_reconstruct_mapping(skb, orig_skb)) {
332
		__kfree_skb(skb);
333
		return;
334
	}
335
336
	skb->sk = meta_sk;
337
338
	/* If it has already reached the destination, we don't have to reinject it */
339
	if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
340
		__kfree_skb(skb);
341
		return;
342
	}
343
344
	/* Only reinject segments that are fully covered by the mapping */
345
	if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
346
	    TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
347
		u32 seq = TCP_SKB_CB(skb)->seq;
348
		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
349
350
		__kfree_skb(skb);
351
352
		/* Ok, now we have to look for the full mapping in the meta
353
		 * send-queue.
354
		 */
355
		tcp_for_write_queue(skb, meta_sk) {
356
			/* Not yet at the mapping? */
357
			if (before(TCP_SKB_CB(skb)->seq, seq))
358
				continue;
359
			/* We have passed by the mapping */
360
			if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
361
				return;
362
363
			__mptcp_reinject_data(skb, meta_sk, NULL, 1);
364
		}
365
		return;
366
	}
367
368
	/* If it's empty, just add */
369
	if (skb_queue_empty(&mpcb->reinject_queue)) {
370
		skb_queue_head(&mpcb->reinject_queue, skb);
371
		return;
372
	}
373
374
	/* Find place to insert skb - or even we can 'drop' it, as the
375
	 * data is already covered by other skb's in the reinject-queue.
376
	 *
377
	 * This is inspired by code from tcp_data_queue.
378
	 */
379
380
	skb1 = skb_peek_tail(&mpcb->reinject_queue);
381
	seq = TCP_SKB_CB(skb)->seq;
382
	while (1) {
383
		if (!after(TCP_SKB_CB(skb1)->seq, seq))
384
			break;
385
		if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
386
			skb1 = NULL;
387
			break;
388
		}
389
		skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
390
	}
391
392
	/* Does the skb overlap the previous one? */
393
	end_seq = TCP_SKB_CB(skb)->end_seq;
394
	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
395
		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
396
			/* All the bits are present. Don't reinject */
397
			__kfree_skb(skb);
398
			return;
399
		}
400
		if (seq == TCP_SKB_CB(skb1)->seq) {
401
			if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
402
				skb1 = NULL;
403
			else
404
				skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
405
		}
406
	}
407
	if (!skb1)
408
		__skb_queue_head(&mpcb->reinject_queue, skb);
409
	else
410
		__skb_queue_after(&mpcb->reinject_queue, skb1, skb);
411
412
	/* And clean segments covered by new one as whole. */
413
	while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
414
		skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
415
416
		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
417
			break;
418
419
		__skb_unlink(skb1, &mpcb->reinject_queue);
420
		__kfree_skb(skb1);
421
	}
422
	return;
423
}
424
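
A simplified stand-alone model (not kernel code) of the reinject-queue rules above: a new range is dropped when an existing entry already covers it, otherwise it is inserted in sequence order and any following entries that it fully covers are removed. The real code also unlinks partially covered trailing skbs; that detail is omitted here for brevity.

#include <stdio.h>

struct range { unsigned int seq, end_seq; };

static int insert_range(struct range *q, int n, struct range r)
{
	int i = 0, j;

	while (i < n && q[i].seq <= r.seq)
		i++;
	/* Covered by the previous entry? Then do not reinject at all. */
	if (i > 0 && r.seq >= q[i - 1].seq && r.end_seq <= q[i - 1].end_seq)
		return n;
	/* Make room and insert in sequence order */
	for (j = n; j > i; j--)
		q[j] = q[j - 1];
	q[i] = r;
	n++;
	/* Remove following entries fully covered by the new one */
	while (i + 1 < n && q[i + 1].end_seq <= r.end_seq) {
		for (j = i + 1; j < n - 1; j++)
			q[j] = q[j + 1];
		n--;
	}
	return n;
}

int main(void)
{
	struct range q[8] = { { 0, 100 }, { 200, 300 } };
	int i, n = 2;

	n = insert_range(q, n, (struct range){ 50, 80 });	/* covered: dropped */
	n = insert_range(q, n, (struct range){ 150, 350 });	/* absorbs 200-300 */
	for (i = 0; i < n; i++)
		printf("[%u,%u) ", q[i].seq, q[i].end_seq);
	printf("\n");	/* [0,100) [150,350) */
	return 0;
}
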
425
/* Inserts data into the reinject queue */
426
void mptcp_reinject_data(struct sock *sk, int clone_it)
427
{
428
	struct sk_buff *skb_it, *tmp;
429
	struct tcp_sock *tp = tcp_sk(sk);
430
	struct sock *meta_sk = tp->meta_sk;
431
432
	/* It has already been closed - there is really no point in reinjecting */
433
	if (meta_sk->sk_state == TCP_CLOSE)
434
		return;
435
436
	skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
437
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
438
		/* Subflow SYNs and FINs are not reinjected.
439
		 *
440
		 * Neither are empty subflow-FINs that carry a data-fin;
441
		 * they are reinjected below (without the subflow-FIN flag).
442
		 */
443
		if (tcb->tcp_flags & TCPHDR_SYN ||
444
		    (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
445
		    (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
446
			continue;
447
448
		__mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
449
	}
450
451
	skb_it = tcp_write_queue_tail(meta_sk);
452
	/* If sk has sent the empty data-fin, we have to reinject it too. */
453
	if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
454
	    TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
455
		__mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
456
	}
457
458
	mptcp_push_pending_frames(meta_sk);
459
460
	tp->pf = 1;
461
}
462
463
464
static void mptcp_combine_dfin(struct sk_buff *skb, struct sock *meta_sk,
465
			       struct sock *subsk)
466
{
467
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
468
	struct mptcp_cb *mpcb = meta_tp->mpcb;
469
	struct sock *sk_it;
470
	int all_empty = 1, all_acked;
471
472
	/* In infinite mapping we always try to combine */
473
	if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
474
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
475
		return;
476
	}
477
478
	/* Don't combine if they didn't combine - otherwise we end up in
479
	 * TIME_WAIT, even if our app is smart enough to avoid it
480
	 */
481
	if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
482
		if (!mpcb->dfin_combined)
483
			return;
484
	}
485
486
	/* If no other subflow has data to send, we can combine */
487
	mptcp_for_each_sk(mpcb, sk_it) {
488
		if (!mptcp_sk_can_send(sk_it))
489
			continue;
490
491
		if (!tcp_write_queue_empty(sk_it))
492
			all_empty = 0;
493
	}
494
495
	/* If all data has been DATA_ACKed, we can combine.
496
	 * -1, because the data_fin consumed one byte
497
	 */
498
	all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
499
500
	if ((all_empty || all_acked) && tcp_close_state(subsk))
501
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
502
}
503
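
For review purposes, a stand-alone sketch (not kernel code) of the combining condition evaluated above; the infinite-mapping shortcut and the dfin_combined check are omitted.

#include <stdio.h>

static int can_combine(int all_queues_empty,
		       unsigned int snd_una, unsigned int write_seq)
{
	/* -1 because the data_fin itself consumes one byte of data-seq space */
	int all_acked = (snd_una == write_seq - 1);

	return all_queues_empty || all_acked;
}

int main(void)
{
	printf("%d\n", can_combine(1, 500, 600));	/* 1: all queues empty */
	printf("%d\n", can_combine(0, 599, 600));	/* 1: all DATA_ACKed */
	printf("%d\n", can_combine(0, 500, 600));	/* 0: don't combine */
	return 0;
}
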
504
static struct sk_buff *mptcp_skb_entail(struct sock *sk, struct sk_buff *skb,
505
					int reinject)
506
{
507
	__be32 *ptr;
508
	__u16 data_len;
509
	struct mp_dss *mdss;
510
	struct tcp_sock *tp = tcp_sk(sk);
511
	struct sock *meta_sk = mptcp_meta_sk(sk);
512
	struct mptcp_cb *mpcb = tp->mpcb;
513
	struct tcp_skb_cb *tcb;
514
	struct sk_buff *subskb = NULL;
515
516
	if (!reinject)
517
		TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
518
						  MPTCPHDR_SEQ64_INDEX : 0);
519
520
	subskb = mptcp_pskb_copy(skb);
521
	if (!subskb)
522
		return NULL;
523
524
	TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
525
526
	if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
527
	    skb->ip_summed == CHECKSUM_PARTIAL) {
528
		subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
529
		subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
530
	}
531
532
	/* The subskb is going into the subflow send-queue. Its path-mask
533
	 * is not needed anymore and MUST be set to 0, as the path-mask
534
	 * is a union with inet_skb_param.
535
	 */
536
	tcb = TCP_SKB_CB(subskb);
537
	tcb->path_mask = 0;
538
539
	if (mptcp_is_data_fin(subskb))
540
		mptcp_combine_dfin(subskb, meta_sk, sk);
541
542
	if (tp->mpcb->infinite_mapping_snd)
543
		goto no_data_seq;
544
545
	if (tp->mpcb->send_infinite_mapping &&
546
	    !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
547
		tp->mptcp->fully_established = 1;
548
		tp->mpcb->infinite_mapping_snd = 1;
549
		tp->mptcp->infinite_cutoff_seq = tp->write_seq;
550
		tcb->mptcp_flags |= MPTCPHDR_INF;
551
		data_len = 0;
552
	} else {
553
		data_len = tcb->end_seq - tcb->seq;
554
	}
555
556
	/**** Write MPTCP DSS-option to the packet. ****/
557
	ptr = (__be32 *)(subskb->data - (MPTCP_SUB_LEN_DSS_ALIGN +
558
				      MPTCP_SUB_LEN_ACK_ALIGN +
559
				      MPTCP_SUB_LEN_SEQ_ALIGN));
560
561
	/* Then we write the option, starting from its beginning */
562
	mdss = (struct mp_dss *)ptr;
563
564
	mdss->kind = TCPOPT_MPTCP;
565
	mdss->sub = MPTCP_SUB_DSS;
566
	mdss->rsv1 = 0;
567
	mdss->rsv2 = 0;
568
	mdss->F = (mptcp_is_data_fin(subskb) ? 1 : 0);
569
	mdss->m = 0;
570
	mdss->M = 1;
571
	mdss->a = 0;
572
	mdss->A = 1;
573
	mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
574
575
	ptr++;
576
	ptr++; /* data_ack will be set in mptcp_options_write */
577
	*ptr++ = htonl(tcb->seq); /* data_seq */
578
579
	/* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
580
	if (mptcp_is_data_fin(subskb) && subskb->len == 0)
581
		*ptr++ = 0; /* subseq */
582
	else
583
		*ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
584
585
	if (tp->mpcb->dss_csum && data_len) {
586
		__be16 *p16 = (__be16 *)ptr;
587
		__be32 hdseq = mptcp_get_highorder_sndbits(subskb, tp->mpcb);
588
		__wsum csum;
589
		*ptr = htonl(((data_len) << 16) |
590
				(TCPOPT_EOL << 8) |
591
				(TCPOPT_EOL));
592
593
		csum = csum_partial(ptr - 2, 12, subskb->csum);
594
		p16++;
595
		*p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
596
	} else {
597
		*ptr++ = htonl(((data_len) << 16) |
598
				(TCPOPT_NOP << 8) |
599
				(TCPOPT_NOP));
600
	}
601
602
no_data_seq:
603
	tcb->seq = tp->write_seq;
604
	tcb->sacked = 0; /* reset the sacked field: from the point of view
605
			  * of this subflow, we are sending a brand new
606
			  * segment */
607
	/* Take into account seg len */
608
	tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
609
	tcb->end_seq = tp->write_seq;
610
611
	/* If it's a non-payload DATA_FIN (also no subflow-fin), the
612
	 * segment is not part of the subflow but on a meta-only-level
613
	 */
614
	if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
615
		tcp_add_write_queue_tail(sk, subskb);
616
		sk->sk_wmem_queued += subskb->truesize;
617
		sk_mem_charge(sk, subskb->truesize);
618
	}
619
620
	return subskb;
621
}
622
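
For illustration only, the sketch below builds the same 20-byte DSS tail in a user-space buffer, assuming no DSS checksum: a header word (kind, length 18, subtype, flags with M and A set), a data_ack placeholder that mptcp_options_write() fills in later, the data-seq, the relative subflow-seq, and the data-level length padded with two TCP NOPs. The numeric kind and flag values are assumptions made for the example, not authoritative wire constants.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP	1
#define TCPOPT_MPTCP	30	/* assumed option kind used by the patch */
#define MPTCP_SUB_DSS	2

int main(void)
{
	uint32_t opt[5];
	unsigned char *p = (unsigned char *)opt;
	uint32_t data_seq = 1000, sub_seq = 1;
	uint16_t data_len = 1400;
	int i;

	/* Header word: kind, len 18 (the two trailing NOPs are padding, not
	 * part of the option), subtype in the upper nibble, flags M=1, A=1
	 */
	opt[0] = htonl((TCPOPT_MPTCP << 24) | (18 << 16) |
		       (MPTCP_SUB_DSS << 12) | 0x05);
	opt[1] = htonl(0);		/* data_ack, filled in later */
	opt[2] = htonl(data_seq);	/* data-level sequence number */
	opt[3] = htonl(sub_seq);	/* relative subflow sequence number */
	opt[4] = htonl((data_len << 16) | (TCPOPT_NOP << 8) | TCPOPT_NOP);

	for (i = 0; i < 20; i++)
		printf("%02x%s", p[i], (i % 4 == 3) ? "\n" : " ");
	return 0;
}
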
623
static void mptcp_sub_event_new_data_sent(struct sock *sk,
624
					  struct sk_buff *subskb,
625
					  struct sk_buff *skb)
626
{
627
	/* If it's a non-payload DATA_FIN (also no subflow-fin), the
628
	 * segment is not part of the subflow but on a meta-only-level
629
	 *
630
	 * We free it, because it has been queued nowhere.
631
	 */
632
	if (!mptcp_is_data_fin(subskb) ||
633
	    (TCP_SKB_CB(subskb)->end_seq != TCP_SKB_CB(subskb)->seq)) {
634
		tcp_event_new_data_sent(sk, subskb);
635
		tcp_sk(sk)->mptcp->second_packet = 1;
636
		tcp_sk(sk)->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
637
	} else {
638
		kfree_skb(subskb);
639
	}
640
}
641
642
/* Handle the packets and sockets after a tcp_transmit_skb failed */
643
static void mptcp_transmit_skb_failed(struct sock *sk, struct sk_buff *skb,
644
				      struct sk_buff *subskb, int reinject)
645
{
646
	struct tcp_sock *tp = tcp_sk(sk);
647
	struct mptcp_cb *mpcb = tp->mpcb;
648
649
	/* No work to do if we are in infinite mapping mode:
650
	 * There is only one subflow left and we cannot send this segment on
651
	 * another subflow.
652
	 */
653
	if (mpcb->infinite_mapping_snd)
654
		return;
655
656
	TCP_SKB_CB(skb)->path_mask &= ~mptcp_pi_to_flag(tp->mptcp->path_index);
657
658
	if (TCP_SKB_CB(subskb)->tcp_flags & TCPHDR_FIN) {
659
		/* If it is a subflow-fin we must leave it on the
660
		 * subflow-send-queue, so that the probe-timer
661
		 * can retransmit it.
662
		 */
663
		if (!tp->packets_out && !inet_csk(sk)->icsk_pending)
664
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
665
						  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
666
	} else if (mptcp_is_data_fin(subskb) &&
667
		   TCP_SKB_CB(subskb)->end_seq == TCP_SKB_CB(subskb)->seq) {
668
		/* An empty data-fin has not been enqueued on the subflow
669
		 * and thus we free it.
670
		 */
671
672
		kfree_skb(subskb);
673
	} else {
674
		/* In all other cases we remove it from the sub-queue.
675
		 * Other subflows may send it, or the probe-timer will
676
		 * handle it.
677
		 */
678
		tcp_advance_send_head(sk, subskb);
679
680
		/* tcp_add_write_queue_tail initialized highest_sack. We have
681
		 * to reset it, if necessary.
682
		 */
683
		if (tp->highest_sack == subskb)
684
			tp->highest_sack = NULL;
685
686
		tcp_unlink_write_queue(subskb, sk);
687
		tp->write_seq -= subskb->len;
688
		sk_wmem_free_skb(sk, subskb);
689
	}
690
}
691
692
/* Function to create two new TCP segments.  Shrinks the given segment
693
 * to the specified size and appends a new segment with the rest of the
694
 * packet to the list.  This won't be called frequently, I hope.
695
 * Remember, these are still headerless SKBs at this point.
696
 */
697
int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
698
		   unsigned int mss_now, int reinject)
699
{
700
	struct tcp_sock *tp = tcp_sk(sk);
701
	struct sk_buff *buff;
702
	int nsize, old_factor;
703
	int nlen;
704
	u8 flags;
705
	int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
706
		     MPTCP_SUB_LEN_SEQ_ALIGN;
707
	char dss[dsslen];
708
709
	if (WARN_ON(len > skb->len))
710
		return -EINVAL;
711
712
	/* DSS-option must be recovered afterwards. */
713
	if (!is_meta_sk(sk))
714
		memcpy(dss, skb->data - dsslen, dsslen);
715
716
	nsize = skb_headlen(skb) - len;
717
	if (nsize < 0)
718
		nsize = 0;
719
720
	if (skb_cloned(skb) &&
721
	    skb_is_nonlinear(skb)) {
722
		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
723
			return -ENOMEM;
724
		/* Recover dss-option */
725
		if (!is_meta_sk(sk))
726
			memcpy(skb->data - dsslen, dss, dsslen);
727
	}
728
729
	/* Get a new skb... force flag on. */
730
	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
731
	if (buff == NULL)
732
		return -ENOMEM; /* We'll just try again later. */
733
734
	/* See below - if reinject == 1, the buff will be added to the reinject-
735
	 * queue, which is currently not part of the memory-accounting.
736
	 */
737
	if (reinject != 1) {
738
		sk->sk_wmem_queued += buff->truesize;
739
		sk_mem_charge(sk, buff->truesize);
740
	}
741
	nlen = skb->len - len - nsize;
742
	buff->truesize += nlen;
743
	skb->truesize -= nlen;
744
745
	/* Correct the sequence numbers. */
746
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
747
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
748
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
749
750
	/* PSH and FIN should only be set in the second packet. */
751
	flags = TCP_SKB_CB(skb)->tcp_flags;
752
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
753
	TCP_SKB_CB(buff)->tcp_flags = flags;
754
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
755
756
	flags = TCP_SKB_CB(skb)->mptcp_flags;
757
	TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
758
	TCP_SKB_CB(buff)->mptcp_flags = flags;
759
760
	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
761
		/* Copy and checksum data tail into the new buffer. */
762
		buff->csum = csum_partial_copy_nocheck(skb->data + len,
763
						       skb_put(buff, nsize),
764
						       nsize, 0);
765
766
		skb_trim(skb, len);
767
768
		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
769
	} else {
770
		skb->ip_summed = CHECKSUM_PARTIAL;
771
		skb_split(skb, buff, len);
772
	}
773
774
	/* We lost the dss-option when creating buff - put it back! */
775
	if (!is_meta_sk(sk))
776
		memcpy(buff->data - dsslen, dss, dsslen);
777
778
	buff->ip_summed = skb->ip_summed;
779
780
	/* Looks stupid, but our code really uses the 'when' field of
781
	 * skbs which it never sent before. --ANK
782
	 */
783
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
784
	buff->tstamp = skb->tstamp;
785
786
	old_factor = tcp_skb_pcount(skb);
787
788
	/* Fix up tso_factor for both original and new SKB.  */
789
	tcp_set_skb_tso_segs(sk, skb, mss_now);
790
	tcp_set_skb_tso_segs(sk, buff, mss_now);
791
792
	/* If this packet has been sent out already, we must
793
	 * adjust the various packet counters.
794
	 */
795
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) {
796
		int diff = old_factor - tcp_skb_pcount(skb) -
797
			tcp_skb_pcount(buff);
798
799
		if (diff)
800
			tcp_adjust_pcount(sk, skb, diff);
801
	}
802
803
	/* Link BUFF into the send queue. */
804
	skb_header_release(buff);
805
	if (reinject == 1)
806
		__skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff);
807
	else
808
		tcp_insert_write_queue_after(skb, buff, sk);
809
810
	return 0;
811
}
812
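
A stand-alone sketch (not kernel code) of the sequence and flag bookkeeping above; memory accounting, checksum handling and the DSS recovery are left out.

#include <stdio.h>

struct seg { unsigned int seq, end_seq; unsigned int flags; };

#define FLAG_FIN 0x1
#define FLAG_PSH 0x8

int main(void)
{
	struct seg skb = { 1000, 2000, FLAG_PSH | FLAG_FIN }, buff;
	unsigned int len = 600;		/* bytes kept in the first segment */

	/* Second half starts at the split point and takes over end_seq */
	buff.seq = skb.seq + len;
	buff.end_seq = skb.end_seq;
	skb.end_seq = buff.seq;

	buff.flags = skb.flags;			/* second part keeps PSH/FIN */
	skb.flags &= ~(FLAG_FIN | FLAG_PSH);	/* first part loses them */

	printf("skb  [%u,%u) flags=%#x\n", skb.seq, skb.end_seq, skb.flags);
	printf("buff [%u,%u) flags=%#x\n", buff.seq, buff.end_seq, buff.flags);
	return 0;
}
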
813
int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
814
		   unsigned int mss_now, gfp_t gfp, int reinject)
815
{
816
	struct sk_buff *buff;
817
	int nlen = skb->len - len, old_factor;
818
	u8 flags;
819
	int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
820
		     MPTCP_SUB_LEN_SEQ_ALIGN;
821
822
	/* All of a TSO frame must be composed of paged data.  */
823
	if (skb->len != skb->data_len)
824
		return mptcp_fragment(sk, skb, len, mss_now, reinject);
825
826
	buff = sk_stream_alloc_skb(sk, 0, gfp);
827
	if (unlikely(buff == NULL))
828
		return -ENOMEM;
829
830
	/* See below - if reinject == 1, the buff will be added to the reinject-
831
	 * queue, which is currently not part of the memory-accounting.
832
	 */
833
	if (reinject != 1) {
834
		sk->sk_wmem_queued += buff->truesize;
835
		sk_mem_charge(sk, buff->truesize);
836
	}
837
	buff->truesize += nlen;
838
	skb->truesize -= nlen;
839
840
	/* Correct the sequence numbers. */
841
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
842
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
843
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
844
845
	/* PSH and FIN should only be set in the second packet. */
846
	flags = TCP_SKB_CB(skb)->tcp_flags;
847
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
848
	TCP_SKB_CB(buff)->tcp_flags = flags;
849
850
	flags = TCP_SKB_CB(skb)->mptcp_flags;
851
	TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
852
	TCP_SKB_CB(buff)->mptcp_flags = flags;
853
854
	/* This packet was never sent out yet, so no SACK bits. */
855
	TCP_SKB_CB(buff)->sacked = 0;
856
857
	buff->ip_summed = CHECKSUM_PARTIAL;
858
	skb->ip_summed = CHECKSUM_PARTIAL;
859
	skb_split(skb, buff, len);
860
861
	/* We lost the dss-option when creating buff - put it back! */
862
	if (!is_meta_sk(sk))
863
		memcpy(buff->data - dsslen, skb->data - dsslen, dsslen);
864
865
	old_factor = tcp_skb_pcount(skb);
866
867
	/* Fix up tso_factor for both original and new SKB.  */
868
	tcp_set_skb_tso_segs(sk, skb, mss_now);
869
	tcp_set_skb_tso_segs(sk, buff, mss_now);
870
871
	/* If this packet has been sent out already, we must
872
	 * adjust the various packet counters.
873
	 */
874
	if (!before(tcp_sk(sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) {
875
		int diff = old_factor - tcp_skb_pcount(skb) -
876
			tcp_skb_pcount(buff);
877
878
		if (diff)
879
			tcp_adjust_pcount(sk, skb, diff);
880
	}
881
882
	/* Link BUFF into the send queue. */
883
	skb_header_release(buff);
884
	if (reinject == 1)
885
		__skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff);
886
	else
887
		tcp_insert_write_queue_after(skb, buff, sk);
888
889
	return 0;
890
}
891
892
/* Inspired by tcp_write_wakeup */
893
int mptcp_write_wakeup(struct sock *meta_sk)
894
{
895
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
896
	struct sk_buff *skb, *subskb;
897
898
	skb = tcp_send_head(meta_sk);
899
	if (skb &&
900
	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
901
		int err;
902
		unsigned int mss;
903
		unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
904
		struct sock *subsk = get_available_subflow(meta_sk, skb, &mss);
905
		if (!subsk)
906
			return -1;
907
908
		if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
909
			meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
910
911
		/* We are probing the opening of a window
912
		 * but the window size is != 0 - it
913
		 * must have been a result of SWS avoidance (sender)
914
		 */
915
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
916
		    skb->len > mss) {
917
			seg_size = min(seg_size, mss);
918
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
919
			if (mptcp_fragment(meta_sk, skb, seg_size, mss, 0))
920
				return -1;
921
		} else if (!tcp_skb_pcount(skb)) {
922
			tcp_set_skb_tso_segs(meta_sk, skb, mss);
923
		}
924
925
		subskb = mptcp_skb_entail(subsk, skb, 0);
926
		if (!subskb)
927
			return -1;
928
929
		TCP_SKB_CB(subskb)->tcp_flags |= TCPHDR_PSH;
930
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
931
		TCP_SKB_CB(subskb)->when = tcp_time_stamp;
932
		err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC);
933
		if (unlikely(err)) {
934
			mptcp_transmit_skb_failed(subsk, skb, subskb, 0);
935
			return err;
936
		}
937
938
		mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
939
						 TCP_SKB_CB(skb)->seq);
940
		tcp_event_new_data_sent(meta_sk, skb);
941
		mptcp_sub_event_new_data_sent(subsk, subskb, skb);
942
943
		return 0;
944
	} else {
945
		struct sock *sk_it;
946
		int ans = 0;
947
948
		if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
949
			    meta_tp->snd_una + 0xFFFF)) {
950
			mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
951
				if (mptcp_sk_can_send_ack(sk_it))
952
					tcp_xmit_probe_skb(sk_it, 1);
953
			}
954
		}
955
956
		/* At least one of the tcp_xmit_probe_skb's has to succeed */
957
		mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
958
			int ret;
959
960
			if (!mptcp_sk_can_send_ack(sk_it))
961
				continue;
962
963
			ret = tcp_xmit_probe_skb(sk_it, 0);
964
			if (unlikely(ret > 0))
965
				ans = ret;
966
		}
967
		return ans;
968
	}
969
}
970
971
static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb)
972
{
973
	struct sk_buff *skb_it;
974
975
	skb_it = tcp_write_queue_head(meta_sk);
976
977
	tcp_for_write_queue_from(skb_it, meta_sk) {
978
		if (skb_it == tcp_send_head(meta_sk))
979
			break;
980
981
		if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
982
			TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
983
			break;
984
		}
985
	}
986
}
987
988
static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
989
{
990
	struct sock *meta_sk;
991
	struct tcp_sock *tp = tcp_sk(sk), *tp_it;
992
	struct sk_buff *skb_head;
993
994
	if (tp->mpcb->cnt_subflows == 1)
995
		return NULL;
996
997
	meta_sk = mptcp_meta_sk(sk);
998
	skb_head = tcp_write_queue_head(meta_sk);
999
1000
	if (!skb_head || skb_head == tcp_send_head(meta_sk))
1001
		return NULL;
1002
1003
	/* If penalization is optional (coming from mptcp_next_segment()) and
1004
	 * we are not send-buffer-limited, we do not penalize. The retransmission
1005
	 * is just an optimization to fix the idle time caused by the delay before
1006
	 * we wake up the application.
1007
	 */
1008
	if (!penal && sk_stream_memory_free(meta_sk))
1009
		goto retrans;
1010
1011
	/* Half the cwnd of the slow flow */
1012
	mptcp_for_each_tp(tp->mpcb, tp_it) {
1013
		if (tp_it != tp &&
1014
		    TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
1015
			/* Only update once per subflow-RTT */
1016
			if (tcp_time_stamp - tp_it->mptcp->last_rbuf_opti < tp_it->srtt >> 3)
1017
				break;
1018
1019
			if (tp->srtt < tp_it->srtt && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
1020
				tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
1021
				if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
1022
					tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
1023
1024
				tp_it->mptcp->last_rbuf_opti = tcp_time_stamp;
1025
			}
1026
			break;
1027
		}
1028
	}
1029
1030
retrans:
1031
1032
	/* Segment not yet injected into this path? Take it. */
1033
	if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
1034
		int do_retrans = 0;
1035
		mptcp_for_each_tp(tp->mpcb, tp_it) {
1036
			if (tp_it != tp &&
1037
			    TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
1038
				if (tp_it->snd_cwnd <= 4) {
1039
					do_retrans = 1;
1040
					break;
1041
				}
1042
1043
				if (4 * tp->srtt >= tp_it->srtt) {
1044
					do_retrans = 0;
1045
					break;
1046
				} else {
1047
					do_retrans = 1;
1048
				}
1049
			}
1050
		}
1051
1052
		if (do_retrans)
1053
			return skb_head;
1054
	}
1055
	return NULL;
1056
}
1057
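
To make the retransmission condition above easier to review, here is a stand-alone sketch (not kernel code) of that decision; the cwnd/ssthresh halving and the once-per-RTT rate limit are left out.

#include <stdio.h>

static int should_retrans(unsigned int our_srtt,
			  unsigned int other_srtt, unsigned int other_cwnd)
{
	if (other_cwnd <= 4)
		return 1;		/* slow flow can barely make progress */
	if (4 * our_srtt >= other_srtt)
		return 0;		/* RTTs too close - not worth it */
	return 1;			/* we are >4x faster: retransmit here */
}

int main(void)
{
	printf("%d\n", should_retrans(10, 100, 20));	/* 1 */
	printf("%d\n", should_retrans(10, 30, 20));	/* 0 */
	printf("%d\n", should_retrans(10, 30, 3));	/* 1 */
	return 0;
}
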
1058
int mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
1059
		     int push_one, gfp_t gfp)
1060
{
1061
	struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
1062
	struct sock *subsk;
1063
	struct mptcp_cb *mpcb = meta_tp->mpcb;
1064
	struct sk_buff *skb;
1065
	unsigned int tso_segs, sent_pkts;
1066
	int cwnd_quota;
1067
	int result;
1068
	int reinject = 0;
1069
1070
	sent_pkts = 0;
1071
1072
	/* Currently mtu-probing is not done in MPTCP */
1073
	if (!push_one && 0) {
1074
		/* Do MTU probing. */
1075
		result = tcp_mtu_probe(meta_sk);
1076
		if (!result)
1077
			return 0;
1078
		else if (result > 0)
1079
			sent_pkts = 1;
1080
	}
1081
1082
	while ((skb = mptcp_next_segment(meta_sk, &reinject))) {
1083
		unsigned int limit;
1084
		struct sk_buff *subskb = NULL;
1085
		u32 noneligible = mpcb->noneligible;
1086
1087
		if (reinject == 1) {
1088
			if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
1089
				/* Segment already reached the peer, take the next one */
1090
				__skb_unlink(skb, &mpcb->reinject_queue);
1091
				__kfree_skb(skb);
1092
				continue;
1093
			}
1094
1095
			/* Reinjection and it is coming from a subflow? We need
1096
			 * to find out the path-mask from the meta-write-queue
1097
			 * to properly select a subflow.
1098
			 */
1099
			if (!TCP_SKB_CB(skb)->path_mask)
1100
				mptcp_find_and_set_pathmask(meta_sk, skb);
1101
		}
1102
1103
subflow:
1104
		subsk = get_available_subflow(meta_sk, skb, &mss_now);
1105
		if (!subsk)
1106
			break;
1107
		subtp = tcp_sk(subsk);
1108
1109
		/* Since all subsocks are locked before calling the scheduler,
1110
		 * the tcp_send_head should not change.
1111
		 */
1112
		BUG_ON(!reinject && tcp_send_head(meta_sk) != skb);
1113
retry:
1114
		/* If the segment was cloned (e.g. a meta retransmission),
1115
		 * the header must be expanded/copied so that there is no
1116
		 * corruption of TSO information.
1117
		 */
1118
		if (skb_cloned(skb) && skb_is_nonlinear(skb) &&
1119
		    unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1120
			break;
1121
1122
		tcp_set_skb_tso_segs(meta_sk, skb, mss_now);
1123
		tso_segs = tcp_skb_pcount(skb);
1124
		BUG_ON(!tso_segs);
1125
1126
		cwnd_quota = tcp_cwnd_test(subtp, skb);
1127
		if (!cwnd_quota) {
1128
			/* May happen, if at the first selection we circumvented
1129
			 * the test due to a DATA_FIN (and got rejected at
1130
			 * tcp_snd_wnd_test), but the reinjected segment is not
1131
			 * a DATA_FIN.
1132
			 */
1133
			BUG_ON(reinject != -1);
1134
			break;
1135
		}
1136
1137
		if (!reinject && unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) {
1138
			skb = mptcp_rcv_buf_optimization(subsk, 1);
1139
			if (skb) {
1140
				reinject = -1;
1141
				goto retry;
1142
			}
1143
			break;
1144
		}
1145
1146
		if (tso_segs == 1) {
1147
			if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
1148
						     (tcp_skb_is_last(meta_sk, skb) ?
1149
						      nonagle : TCP_NAGLE_PUSH))))
1150
				break;
1151
		} else {
1152
			/* Do not try to defer the transmission of a reinjected
1153
			 * segment. Send it directly.
1154
			 * If it is not possible to send the TSO segment on the
1155
			 * best subflow right now try to look for another subflow.
1156
			 * If there is no subflow available defer the segment to avoid
1157
			 * the call to mptso_fragment.
1158
			 */
1159
			if (!push_one && !reinject && tcp_tso_should_defer(subsk, skb)) {
1160
				mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
1161
				goto subflow;
1162
			}
1163
		}
1164
1165
		/* TSQ : sk_wmem_alloc accounts skb truesize,
1166
		 * including skb overhead. But that's OK.
1167
		 */
1168
		if (atomic_read(&subsk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
1169
			set_bit(TSQ_THROTTLED, &subtp->tsq_flags);
1170
			mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
1171
			continue;
1172
		}
1173
1174
		limit = mss_now;
1175
		if (tso_segs > 1 && !tcp_urg_mode(meta_tp))
1176
			limit = tcp_mss_split_point(subsk, skb, mss_now,
1177
						    min_t(unsigned int,
1178
							  cwnd_quota,
1179
							  subsk->sk_gso_max_segs));
1180
1181
		if (skb->len > limit &&
1182
		    unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, gfp, reinject)))
1183
			break;
1184
1185
		subskb = mptcp_skb_entail(subsk, skb, reinject);
1186
		if (!subskb)
1187
			break;
1188
1189
		mpcb->noneligible = noneligible;
1190
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
1191
		TCP_SKB_CB(subskb)->when = tcp_time_stamp;
1192
		if (unlikely(tcp_transmit_skb(subsk, subskb, 1, gfp))) {
1193
			mptcp_transmit_skb_failed(subsk, skb, subskb, reinject);
1194
			mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
1195
			continue;
1196
		}
1197
1198
		if (!reinject) {
1199
			mptcp_check_sndseq_wrap(meta_tp,
1200
						TCP_SKB_CB(skb)->end_seq -
1201
						TCP_SKB_CB(skb)->seq);
1202
			tcp_event_new_data_sent(meta_sk, skb);
1203
		}
1204
1205
		tcp_minshall_update(meta_tp, mss_now, skb);
1206
		sent_pkts += tcp_skb_pcount(skb);
1207
		tcp_sk(subsk)->mptcp->sent_pkts += tcp_skb_pcount(skb);
1208
1209
		mptcp_sub_event_new_data_sent(subsk, subskb, skb);
1210
1211
		if (reinject > 0) {
1212
			__skb_unlink(skb, &mpcb->reinject_queue);
1213
			kfree_skb(skb);
1214
		}
1215
1216
		if (push_one)
1217
			break;
1218
	}
1219
1220
	mpcb->noneligible = 0;
1221
1222
	if (likely(sent_pkts)) {
1223
		mptcp_for_each_sk(mpcb, subsk) {
1224
			subtp = tcp_sk(subsk);
1225
			if (subtp->mptcp->sent_pkts) {
1226
				if (tcp_in_cwnd_reduction(subsk))
1227
					subtp->prr_out += subtp->mptcp->sent_pkts;
1228
				tcp_cwnd_validate(subsk);
1229
				subtp->mptcp->sent_pkts = 0;
1230
			}
1231
		}
1232
		return 0;
1233
	}
1234
1235
	return !meta_tp->packets_out && tcp_send_head(meta_sk);
1236
}
1237
1238
void mptcp_write_space(struct sock *sk)
1239
{
1240
	mptcp_push_pending_frames(mptcp_meta_sk(sk));
1241
}
1242
1243
u32 __mptcp_select_window(struct sock *sk)
1244
{
1245
	struct inet_connection_sock *icsk = inet_csk(sk);
1246
	struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
1247
	int mss, free_space, full_space, window;
1248
1249
	/* MSS for the peer's data.  Previous versions used mss_clamp
1250
	 * here.  I don't know if the value based on our guesses
1251
	 * of peer's MSS is better for the performance.  It's more correct
1252
	 * but may be worse for the performance because of rcv_mss
1253
	 * fluctuations.  --SAW  1998/11/1
1254
	 */
1255
	mss = icsk->icsk_ack.rcv_mss;
1256
	free_space = tcp_space(sk);
1257
	full_space = min_t(int, meta_tp->window_clamp,
1258
			tcp_full_space(sk));
1259
1260
	if (mss > full_space)
1261
		mss = full_space;
1262
1263
	if (free_space < (full_space >> 1)) {
1264
		icsk->icsk_ack.quick = 0;
1265
1266
		if (tcp_memory_pressure)
1267
			/* TODO this has to be adapted when we support different
1268
			 * MSS's among the subflows.
1269
			 */
1270
			meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
1271
						    4U * meta_tp->advmss);
1272
1273
		if (free_space < mss)
1274
			return 0;
1275
	}
1276
1277
	if (free_space > meta_tp->rcv_ssthresh)
1278
		free_space = meta_tp->rcv_ssthresh;
1279
1280
	/* Don't do rounding if we are using window scaling, since the
1281
	 * scaled window will not line up with the MSS boundary anyway.
1282
	 */
1283
	window = meta_tp->rcv_wnd;
1284
	if (tp->rx_opt.rcv_wscale) {
1285
		window = free_space;
1286
1287
		/* Advertise enough space so that it won't get scaled away.
1288
		 * Important case: prevent zero window announcement if
1289
		 * 1<<rcv_wscale > mss.
1290
		 */
1291
		if (((window >> tp->rx_opt.rcv_wscale) << tp->
1292
		     rx_opt.rcv_wscale) != window)
1293
			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
1294
				  << tp->rx_opt.rcv_wscale);
1295
	} else {
1296
		/* Get the largest window that is a nice multiple of mss.
1297
		 * Window clamp already applied above.
1298
		 * If our current window offering is within 1 mss of the
1299
		 * free space we just keep it. This prevents the divide
1300
		 * and multiply from happening most of the time.
1301
		 * We also don't do any window rounding when the free space
1302
		 * is too small.
1303
		 */
1304
		if (window <= free_space - mss || window > free_space)
1305
			window = (free_space / mss) * mss;
1306
		else if (mss == full_space &&
1307
			 free_space > window + (full_space >> 1))
1308
			window = free_space;
1309
	}
1310
1311
	return window;
1312
}
1313
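
For reference, a stand-alone sketch (not kernel code) of the rounding applied in the non-window-scaling branch above; the special mss == full_space case is omitted.

#include <stdio.h>

static int round_window(int window, int free_space, int mss)
{
	/* Keep the current offer if it is within one MSS of the free space,
	 * otherwise round the free space down to a multiple of the MSS.
	 */
	if (window <= free_space - mss || window > free_space)
		window = (free_space / mss) * mss;
	return window;
}

int main(void)
{
	printf("%d\n", round_window(1000, 10000, 1460));	/* 8760 */
	printf("%d\n", round_window(9000, 10000, 1460));	/* kept: 9000 */
	return 0;
}
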
1314
static void mptcp_set_nonce(struct sock *sk)
1315
{
1316
	struct tcp_sock *tp = tcp_sk(sk);
1317
	struct inet_sock *inet = inet_sk(sk);
1318
1319
	if (sk->sk_family == AF_INET)
1320
		tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(inet->inet_saddr,
1321
								inet->inet_daddr,
1322
								inet->inet_sport,
1323
								inet->inet_dport,
1324
								tp->write_seq);
1325
#if IS_ENABLED(CONFIG_IPV6)
1326
	else
1327
		tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(inet6_sk(sk)->saddr.s6_addr32,
1328
				     	     	     	        inet6_sk(sk)->daddr.s6_addr32,
1329
				     	     	     	        inet->inet_sport,
1330
				     	     	     	        inet->inet_dport,
1331
				     	     	     	        tp->write_seq);
1332
#endif
1333
1334
	tp->mptcp->nonce_set = 1;
1335
}
1336
1337
void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts,
1338
		       unsigned *remaining)
1339
{
1340
	struct tcp_sock *tp = tcp_sk(sk);
1341
1342
	opts->options |= OPTION_MPTCP;
1343
	if (is_master_tp(tp)) {
1344
		opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
1345
		*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
1346
		opts->mp_capable.sender_key = tp->mptcp_loc_key;
1347
		opts->dss_csum = sysctl_mptcp_checksum;
1348
1349
		/* We arrive here either when sending a SYN, or when sending a
1350
		 * SYN+ACK while in SYN_SENT state (tcp_synack_options is only
1351
		 * called for a SYN+ACK replied by a server, whereas this
1352
		 * function is called when SYNs are sent by both parties and
1353
		 * cross each other).
1354
		 * Because of this, a slave subsocket may arrive here and does
1355
		 * not need to set the data-sequence options, since there is
1356
		 * no data in the segment
1357
		 */
1358
	} else {
1359
		struct mptcp_cb *mpcb = tp->mpcb;
1360
1361
		opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
1362
		*remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
1363
		opts->mp_join_syns.token = mpcb->mptcp_rem_token;
1364
		opts->addr_id = mptcp_get_loc_addrid(mpcb, sk);
1365
1366
		if (!tp->mptcp->nonce_set)
1367
			mptcp_set_nonce(sk);
1368
1369
		opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
1370
	}
1371
}
1372
1373
void mptcp_synack_options(struct request_sock *req,
1374
			  struct tcp_out_options *opts, unsigned *remaining)
1375
{
1376
	struct mptcp_request_sock *mtreq;
1377
	mtreq = mptcp_rsk(req);
1378
1379
	opts->options |= OPTION_MPTCP;
1380
	/* MPCB not yet set - thus it's a new MPTCP-session */
1381
	if (!mtreq->mpcb) {
1382
		opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
1383
		*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
1384
		opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
1385
		opts->dss_csum = sysctl_mptcp_checksum || mtreq->dss_csum;
1386
	} else {
1387
		struct inet_request_sock *ireq = inet_rsk(req);
1388
		int i;
1389
1390
		opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
1391
		opts->mp_join_syns.sender_truncated_mac =
1392
				mtreq->mptcp_hash_tmac;
1393
		opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
1394
		opts->addr_id = 0;
1395
1396
		/* Finding Address ID */
1397
		if (req->rsk_ops->family == AF_INET)
1398
			mptcp_for_each_bit_set(mtreq->mpcb->loc4_bits, i) {
1399
				struct mptcp_loc4 *addr =
1400
						&mtreq->mpcb->locaddr4[i];
1401
				if (addr->addr.s_addr == ireq->loc_addr)
1402
					opts->addr_id = addr->id;
1403
			}
1404
#if IS_ENABLED(CONFIG_IPV6)
1405
		else /* IPv6 */
1406
			mptcp_for_each_bit_set(mtreq->mpcb->loc6_bits, i) {
1407
				struct mptcp_loc6 *addr =
1408
						&mtreq->mpcb->locaddr6[i];
1409
				if (ipv6_addr_equal(&addr->addr,
1410
						    &inet6_rsk(req)->loc_addr))
1411
					opts->addr_id = addr->id;
1412
			}
1413
#endif /* CONFIG_IPV6 */
1414
		*remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
1415
	}
1416
}
1417
1418
void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
1419
			       struct tcp_out_options *opts, unsigned *size)
1420
{
1421
	struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
1422
	struct mptcp_cb *mpcb = tp->mpcb;
1423
	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
1424
1425
	/* In fallback mp_fail-mode, we have to repeat it until the fallback
1426
	 * has been done by the sender
1427
	 */
1428
	if (unlikely(tp->mptcp->send_mp_fail)) {
1429
		opts->options |= OPTION_MPTCP;
1430
		opts->mptcp_options |= OPTION_MP_FAIL;
1431
		opts->data_ack = (__u32)(mpcb->csum_cutoff_seq >> 32);
1432
		opts->data_seq = (__u32)mpcb->csum_cutoff_seq;
1433
		*size += MPTCP_SUB_LEN_FAIL;
1434
		return;
1435
	}
1436
1437
	if (unlikely(tp->send_mp_fclose)) {
1438
		opts->options |= OPTION_MPTCP;
1439
		opts->mptcp_options |= OPTION_MP_FCLOSE;
1440
		opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
1441
		*size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
1442
		return;
1443
	}
1444
1445
	/* 1. If we are the sender of the infinite-mapping, we need the
1446
	 *    MPTCPHDR_INF-flag, because a retransmission of the
1447
	 *    infinite-announcement still needs the mptcp-option.
1448
	 *
1449
	 *    We need infinite_cutoff_seq, because retransmissions from before
1450
	 *    the infinite-cutoff-moment still need the MPTCP-signalling to stay
1451
	 *    consistent.
1452
	 *
1453
	 * 2. If we are the receiver of the infinite-mapping, we always skip
1454
	 *    mptcp-options, because acknowledgments from before the
1455
	 *    infinite-mapping point have already been sent out.
1456
	 *
1457
	 * I know, the whole infinite-mapping stuff is ugly...
1458
	 *
1459
	 * TODO: Handle wrapped data-sequence numbers
1460
	 *       (even if it's very unlikely)
1461
	 */
1462
	if (unlikely(mpcb->infinite_mapping_snd) &&
1463
	    tp->mptcp->fully_established &&
1464
	    ((mpcb->send_infinite_mapping && tcb &&
1465
	      !(tcb->mptcp_flags & MPTCPHDR_INF) &&
1466
	      !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
1467
	     !mpcb->send_infinite_mapping))
1468
		return;
1469
1470
	if (unlikely(tp->mptcp->include_mpc)) {
1471
		opts->options |= OPTION_MPTCP;
1472
		opts->mptcp_options |= OPTION_MP_CAPABLE |
1473
				       OPTION_TYPE_ACK;
1474
		*size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
1475
		opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
1476
		opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
1477
		opts->dss_csum = mpcb->dss_csum;
1478
1479
		if (skb)
1480
			tp->mptcp->include_mpc = 0;
1481
	}
1482
	if (unlikely(tp->mptcp->pre_established)) {
1483
		opts->options |= OPTION_MPTCP;
1484
		opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
1485
		*size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
1486
	}
1487
1488
	if (!tp->mptcp_add_addr_ack && !tp->mptcp->include_mpc &&
1489
	    !tp->mptcp->pre_established) {
1490
		opts->options |= OPTION_MPTCP;
1491
		opts->mptcp_options |= OPTION_DATA_ACK;
1492
		/* If !skb, we come from tcp_current_mss and thus we always
1493
		 * assume that the DSS-option will be set for the data-packet.
1494
		 */
1495
		if (skb && !mptcp_is_data_seq(skb)) {
1496
			opts->data_ack = meta_tp->rcv_nxt;
1497
1498
			*size += MPTCP_SUB_LEN_ACK_ALIGN;
1499
		} else {
1500
			opts->data_ack = meta_tp->rcv_nxt;
1501
1502
			/* Doesn't matter whether the csum is included or not. It will be
1503
			 * either 10 or 12, and thus aligned = 12
1504
			 */
1505
			*size += MPTCP_SUB_LEN_ACK_ALIGN +
1506
				 MPTCP_SUB_LEN_SEQ_ALIGN;
1507
		}
1508
1509
		*size += MPTCP_SUB_LEN_DSS_ALIGN;
1510
	}
1511
1512
	if (unlikely(tp->mptcp->add_addr4) &&
1513
	    MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
1514
		int ind = mptcp_find_free_index(~(tp->mptcp->add_addr4));
1515
		opts->options |= OPTION_MPTCP;
1516
		opts->mptcp_options |= OPTION_ADD_ADDR;
1517
		opts->addr4 = &mpcb->locaddr4[ind];
1518
		if (skb)
1519
			tp->mptcp->add_addr4 &= ~(1 << ind);
1520
		*size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
1521
	} else if (unlikely(tp->mptcp->add_addr6) &&
1522
		   MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
1523
		int ind = mptcp_find_free_index(~(tp->mptcp->add_addr6));
1524
		opts->options |= OPTION_MPTCP;
1525
		opts->mptcp_options |= OPTION_ADD_ADDR;
1526
		opts->addr6 = &mpcb->locaddr6[ind];
1527
		if (skb)
1528
			tp->mptcp->add_addr6 &= ~(1 << ind);
1529
		*size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
1530
	} else if (unlikely(mpcb->remove_addrs) &&
1531
		   MAX_TCP_OPTION_SPACE - *size >=
1532
		   mptcp_sub_len_remove_addr_align(mpcb->remove_addrs)) {
1533
		opts->options |= OPTION_MPTCP;
1534
		opts->mptcp_options |= OPTION_REMOVE_ADDR;
1535
		opts->remove_addrs = mpcb->remove_addrs;
1536
		*size += mptcp_sub_len_remove_addr_align(opts->remove_addrs);
1537
		if (skb)
1538
			mpcb->remove_addrs = 0;
1539
	} else if (!(opts->mptcp_options & OPTION_MP_CAPABLE) &&
1540
		   !(opts->mptcp_options & OPTION_MP_JOIN) &&
1541
		   ((unlikely(tp->mptcp->add_addr6) &&
1542
		     MAX_TCP_OPTION_SPACE - *size <=
1543
		     MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) ||
1544
		    (unlikely(tp->mptcp->add_addr4) &&
1545
		     MAX_TCP_OPTION_SPACE - *size >=
1546
		     MPTCP_SUB_LEN_ADD_ADDR4_ALIGN))) {
1547
		mptcp_debug("no space for add addr. unsent IPv4: %#x,IPv6: %#x\n",
1548
			    tp->mptcp->add_addr4, tp->mptcp->add_addr6);
1549
		tp->mptcp_add_addr_ack = 1;
1550
		tcp_send_ack(sk);
1551
		tp->mptcp_add_addr_ack = 0;
1552
	}
1553
1554
	if (unlikely(tp->mptcp->send_mp_prio) &&
1555
	    MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
1556
		opts->options |= OPTION_MPTCP;
1557
		opts->mptcp_options |= OPTION_MP_PRIO;
1558
		if (skb)
1559
			tp->mptcp->send_mp_prio = 0;
1560
		*size += MPTCP_SUB_LEN_PRIO_ALIGN;
1561
	}
1562
1563
	return;
1564
}
1565
1566
void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1567
			 struct tcp_out_options *opts,
1568
			 struct sk_buff *skb)
1569
{
1570
	if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
1571
		struct mp_capable *mpc = (struct mp_capable *)ptr;
1572
1573
		mpc->kind = TCPOPT_MPTCP;
1574
1575
		if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
1576
		    (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
1577
			mpc->sender_key = opts->mp_capable.sender_key;
1578
			mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
1579
			ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
1580
		} else if (OPTION_TYPE_ACK & opts->mptcp_options) {
1581
			mpc->sender_key = opts->mp_capable.sender_key;
1582
			mpc->receiver_key = opts->mp_capable.receiver_key;
1583
			mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
1584
			ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
1585
		}
1586
1587
		mpc->sub = MPTCP_SUB_CAPABLE;
1588
		mpc->ver = 0;
1589
		mpc->a = opts->dss_csum ? 1 : 0;
1590
		mpc->b = 0;
1591
		mpc->rsv = 0;
1592
		mpc->h = 1;
1593
	}
1594
1595
	if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
1596
		struct mp_join *mpj = (struct mp_join *)ptr;
1597
1598
		mpj->kind = TCPOPT_MPTCP;
1599
		mpj->sub = MPTCP_SUB_JOIN;
1600
		mpj->rsv = 0;
1601
		mpj->addr_id = opts->addr_id;
1602
1603
		if (OPTION_TYPE_SYN & opts->mptcp_options) {
1604
			mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
1605
			mpj->u.syn.token = opts->mp_join_syns.token;
1606
			mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
1607
			mpj->b = tp->mptcp->low_prio;
1608
			ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
1609
		} else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
1610
			mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
1611
			mpj->u.synack.mac =
1612
				opts->mp_join_syns.sender_truncated_mac;
1613
			mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
1614
			mpj->b = tp->mptcp->low_prio;
1615
			ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
1616
		} else if (OPTION_TYPE_ACK & opts->mptcp_options) {
1617
			mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
1618
			memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
1619
			ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
1620
		}
1621
	}
1622
	if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
1623
		struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
1624
1625
		mpadd->kind = TCPOPT_MPTCP;
1626
		if (opts->addr4) {
1627
			mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
1628
			mpadd->sub = MPTCP_SUB_ADD_ADDR;
1629
			mpadd->ipver = 4;
1630
			mpadd->addr_id = opts->addr4->id;
1631
			mpadd->u.v4.addr = opts->addr4->addr;
1632
			ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
1633
		} else if (opts->addr6) {
1634
			mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
1635
			mpadd->sub = MPTCP_SUB_ADD_ADDR;
1636
			mpadd->ipver = 6;
1637
			mpadd->addr_id = opts->addr6->id;
1638
			memcpy(&mpadd->u.v6.addr, &opts->addr6->addr,
1639
			       sizeof(mpadd->u.v6.addr));
1640
			ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
1641
		} else {
1642
			BUG();
1643
		}
1644
	}
1645
	if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
1646
		struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
1647
		u8 *addrs_id;
1648
		int id, len, len_align;
1649
1650
		len = mptcp_sub_len_remove_addr(opts->remove_addrs);
1651
		len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
1652
1653
		mprem->kind = TCPOPT_MPTCP;
1654
		mprem->len = len;
1655
		mprem->sub = MPTCP_SUB_REMOVE_ADDR;
1656
		mprem->rsv = 0;
1657
		addrs_id = &mprem->addrs_id;
1658
1659
		mptcp_for_each_bit_set(opts->remove_addrs, id)
1660
			*(addrs_id++) = id;
1661
1662
		/* Fill the rest with NOP's */
1663
		if (len_align > len) {
1664
			int i;
1665
			for (i = 0; i < len_align - len; i++)
1666
				*(addrs_id++) = TCPOPT_NOP;
1667
		}
1668
1669
		ptr += len_align >> 2;
1670
	}
1671
	if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
1672
		struct mp_fail *mpfail = (struct mp_fail *)ptr;
1673
1674
		mpfail->kind = TCPOPT_MPTCP;
1675
		mpfail->len = MPTCP_SUB_LEN_FAIL;
1676
		mpfail->sub = MPTCP_SUB_FAIL;
1677
		mpfail->rsv1 = 0;
1678
		mpfail->rsv2 = 0;
1679
		mpfail->data_seq = htonll(((u64)opts->data_ack << 32) | opts->data_seq);
1680
1681
		ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
1682
	}
1683
	if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
1684
		struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
1685
1686
		mpfclose->kind = TCPOPT_MPTCP;
1687
		mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
1688
		mpfclose->sub = MPTCP_SUB_FCLOSE;
1689
		mpfclose->rsv1 = 0;
1690
		mpfclose->rsv2 = 0;
1691
		mpfclose->key = opts->mp_capable.receiver_key;
1692
1693
		ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
1694
	}
1695
1696
	if (OPTION_DATA_ACK & opts->mptcp_options) {
1697
		if (!mptcp_is_data_seq(skb)) {
1698
			struct mp_dss *mdss = (struct mp_dss *)ptr;
1699
1700
			mdss->kind = TCPOPT_MPTCP;
1701
			mdss->sub = MPTCP_SUB_DSS;
1702
			mdss->rsv1 = 0;
1703
			mdss->rsv2 = 0;
1704
			mdss->F = 0;
1705
			mdss->m = 0;
1706
			mdss->M = 0;
1707
			mdss->a = 0;
1708
			mdss->A = 1;
1709
			mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
1710
1711
			ptr++;
1712
			*ptr++ = htonl(opts->data_ack);
1713
		} else {
1714
			/**** Just update the data_ack ****/
1715
1716
			/* Get pointer to data_ack-field. MPTCP is always at
1717
			 * the end of the TCP-options.
1718
			 */
1719
			/* TODO if we allow sending 64-bit dseq's we have to change "16" */
1720
			__be32 *dack = (__be32 *)(skb->data + (tcp_hdr(skb)->doff << 2) - 16);
1721
1722
			*dack = htonl(opts->data_ack);
1723
		}
1724
	}
1725
	if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
1726
		struct mp_prio *mpprio = (struct mp_prio *)ptr;
1727
1728
		mpprio->kind = TCPOPT_MPTCP;
1729
		mpprio->len = MPTCP_SUB_LEN_PRIO;
1730
		mpprio->sub = MPTCP_SUB_PRIO;
1731
		mpprio->rsv = 0;
1732
		mpprio->b = tp->mptcp->low_prio;
1733
		mpprio->addr_id = TCPOPT_NOP;
1734
1735
		ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
1736
	}
1737
}
1738
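
As a reviewer aid, the stand-alone sketch below (not kernel code) reproduces the REMOVE_ADDR sizing used above, assuming a wire format of kind, length and subtype/reserved (3 bytes) followed by one byte per removed address id, padded with NOPs to the next 32-bit boundary; under that assumption it should match what mptcp_sub_len_remove_addr() and its _align variant compute.

#include <stdio.h>

int main(void)
{
	unsigned int remove_addrs = 0x2a;	/* address ids 1, 3 and 5 */
	int id, len = 3, len_align, pad;

	for (id = 0; id < 8; id++)
		if (remove_addrs & (1u << id))
			len++;			/* one byte per removed id */

	len_align = (len + 3) & ~3;		/* align to 4 bytes */
	pad = len_align - len;			/* bytes filled with TCPOPT_NOP */

	printf("len=%d aligned=%d nop_padding=%d\n", len, len_align, pad);
	/* prints: len=6 aligned=8 nop_padding=2 */
	return 0;
}
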
1739
/* Returns the next segment to be sent from the mptcp meta-queue.
1740
 * (chooses the reinject queue if any segment is waiting in it, otherwise,
1741
 * chooses the normal write queue).
1742
 * Sets *@reinject to 1 if the returned segment comes from the
1743
 * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
1744
 * and sets it to -1 if it is a meta-level retransmission to optimize the
1745
 * receive-buffer.
1746
 */
1747
struct sk_buff *mptcp_next_segment(struct sock *meta_sk, int *reinject)
1748
{
1749
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
1750
	struct sk_buff *skb = NULL;
1751
	if (reinject)
1752
		*reinject = 0;
1753
1754
	/* If we are in fallback-mode, just take from the meta-send-queue */
1755
	if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
1756
		return tcp_send_head(meta_sk);
1757
1758
	skb = skb_peek(&mpcb->reinject_queue);
1759
1760
	if (skb) {
1761
		if (reinject)
1762
			*reinject = 1;
1763
	} else {
1764
		skb = tcp_send_head(meta_sk);
1765
1766
		if (!skb && meta_sk->sk_write_pending &&
1767
		    sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
1768
			struct sock *subsk = get_available_subflow(meta_sk, NULL, NULL);
1769
			if (!subsk)
1770
				return NULL;
1771
1772
			skb = mptcp_rcv_buf_optimization(subsk, 0);
1773
			if (skb && reinject)
1774
				*reinject = -1;
1775
		}
1776
	}
1777
	return skb;
1778
}
1779
1780
/* Sends the datafin */
1781
void mptcp_send_fin(struct sock *meta_sk)
1782
{
1783
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1784
	struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
1785
	int mss_now;
1786
1787
	if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
1788
		meta_tp->mpcb->passive_close = 1;
1789
1790
	/* Optimization, tack on the FIN if we have a queue of
1791
	 * unsent frames.  But be careful about outgoing SACKS
1792
	 * and IP options.
1793
	 */
1794
	mss_now = mptcp_current_mss(meta_sk);
1795
1796
	if (tcp_send_head(meta_sk) != NULL) {
1797
		TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
1798
		TCP_SKB_CB(skb)->end_seq++;
1799
		meta_tp->write_seq++;
1800
	} else {
1801
		/* Socket is locked, keep trying until memory is available. */
1802
		for (;;) {
1803
			skb = alloc_skb_fclone(MAX_TCP_HEADER,
1804
					       meta_sk->sk_allocation);
1805
			if (skb)
1806
				break;
1807
			yield();
1808
		}
1809
		/* Reserve space for headers and prepare control bits. */
1810
		skb_reserve(skb, MAX_TCP_HEADER);
1811
1812
		tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
1813
		TCP_SKB_CB(skb)->end_seq++;
1814
		TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN | MPTCPHDR_SEQ;
1815
		tcp_queue_skb(meta_sk, skb);
1816
	}
1817
	__tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
1818
}
1819
1820
void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
1821
{
1822
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1823
	struct mptcp_cb *mpcb = meta_tp->mpcb;
1824
	struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
1825
1826
	if (!mpcb->cnt_subflows)
1827
		return;
1828
1829
	/* First - select a socket */
1830
1831
	/* Socket already selected? */
1832
	mptcp_for_each_sk(mpcb, sk_it) {
1833
		if (tcp_sk(sk_it)->send_mp_fclose) {
1834
			sk = sk_it;
1835
			goto found;
1836
		}
1837
	}
1838
1839
	sk = mptcp_select_ack_sock(meta_sk, 0);
1840
	/* May happen if no subflow is in an appropriate state */
1841
	if (!sk)
1842
		return;
1843
1844
	/* We are in infinite mode - just send a reset */
1845
	if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
1846
		tcp_send_active_reset(sk, priority);
1847
		return;
1848
	}
1849
1850
	tcp_sk(sk)->send_mp_fclose = 1;
1851
1852
	/** Reset all other subflows */
1853
1854
found:
1855
	/* tcp_done must be handled with bh disabled */
1856
	if (!in_serving_softirq())
1857
		local_bh_disable();
1858
	mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
1859
		if (tcp_sk(sk_it)->send_mp_fclose)
1860
			continue;
1861
1862
		sk_it->sk_err = ECONNRESET;
1863
		if (tcp_need_reset(sk_it->sk_state))
1864
			tcp_send_active_reset(sk_it, GFP_ATOMIC);
1865
		mptcp_sub_force_close(sk_it);
1866
	}
1867
	if (!in_serving_softirq())
1868
		local_bh_enable();
1869
1870
	tcp_send_ack(sk);
1871
1872
	if (!meta_tp->send_mp_fclose) {
1873
		struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
1874
1875
		meta_icsk->icsk_rto = min(inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
1876
		inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
1877
					  meta_icsk->icsk_rto, TCP_RTO_MAX);
1878
	}
1879
1880
	meta_tp->send_mp_fclose = 1;
1881
}
1882
1883
void mptcp_ack_retransmit_timer(struct sock *sk)
1884
{
1885
	struct sk_buff *skb;
1886
	struct tcp_sock *tp = tcp_sk(sk);
1887
	struct inet_connection_sock *icsk = inet_csk(sk);
1888
1889
	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1890
		goto out; /* Routing failure or similar */
1891
1892
	if (!tp->retrans_stamp)
1893
		tp->retrans_stamp = tcp_time_stamp ? : 1;
1894
1895
	if (tcp_write_timeout(sk)) {
1896
		tp->mptcp->pre_established = 0;
1897
		sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
1898
		tcp_send_active_reset(sk, GFP_ATOMIC);
1899
		goto out;
1900
	}
1901
1902
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1903
	if (skb == NULL) {
1904
		sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
1905
			       jiffies + icsk->icsk_rto);
1906
		return;
1907
	}
1908
1909
	/* Reserve space for headers and prepare control bits */
1910
	skb_reserve(skb, MAX_TCP_HEADER);
1911
	tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
1912
1913
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
1914
	if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
1915
		/* Retransmission failed because of local congestion,
1916
		 * do not backoff.
1917
		 */
1918
		if (!icsk->icsk_retransmits)
1919
			icsk->icsk_retransmits = 1;
1920
		sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
1921
			       jiffies + icsk->icsk_rto);
1922
		return;
1923
	}
1924
1925
1926
	icsk->icsk_retransmits++;
1927
	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
1928
	sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
1929
		       jiffies + icsk->icsk_rto);
1930
	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) {
1931
		__sk_dst_reset(sk);
1932
	}
1933
1934
out:;
1935
}
1936
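
A stand-alone sketch (not kernel code) of the backoff applied above: the RTO doubles on every retransmission of the third ACK and is capped at TCP_RTO_MAX; the initial value and the cap below are example numbers, not the kernel's.

#include <stdio.h>

int main(void)
{
	unsigned int rto = 200;			/* initial RTO in ms (assumed) */
	const unsigned int rto_max = 120000;	/* stand-in for TCP_RTO_MAX */
	int attempt;

	for (attempt = 1; attempt <= 12; attempt++) {
		rto = rto << 1;			/* exponential backoff */
		if (rto > rto_max)
			rto = rto_max;		/* capped */
		printf("retransmit %2d: next timer in %u ms\n", attempt, rto);
	}
	return 0;
}
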
1937
void mptcp_ack_handler(unsigned long data)
1938
{
1939
	struct sock *sk = (struct sock *)data;
1940
	struct sock *meta_sk = mptcp_meta_sk(sk);
1941
1942
	bh_lock_sock(meta_sk);
1943
	if (sock_owned_by_user(meta_sk)) {
1944
		/* Try again later */
1945
		sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
1946
			       jiffies + (HZ / 20));
1947
		goto out_unlock;
1948
	}
1949
1950
	if (sk->sk_state == TCP_CLOSE)
1951
		goto out_unlock;
1952
1953
	mptcp_ack_retransmit_timer(sk);
1954
1955
	sk_mem_reclaim(sk);
1956
1957
out_unlock:
1958
	bh_unlock_sock(meta_sk);
1959
	sock_put(sk);
1960
}
1961
1962
/* Similar to tcp_retransmit_skb
1963
 *
1964
 * The diff is that we handle the retransmission-stats (retrans_stamp) at the
1965
 * meta-level.
1966
 */
1967
int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
1968
{
1969
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
1970
	struct sock *subsk;
1971
	struct sk_buff *subskb;
1972
	unsigned int limit, tso_segs, mss_now;
1973
	int err = -1, oldpcount;
1974
1975
	/* Do not send more than we queued. 1/4 is reserved for possible
1976
	 * copying overhead: fragmentation, tunneling, mangling etc.
1977
	 *
1978
	 * This is a meta-retransmission thus we check on the meta-socket.
1979
	 */
1980
	if (atomic_read(&meta_sk->sk_wmem_alloc) >
1981
	    min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
1982
		return -EAGAIN;
1983
	}
1984
1985
	/* We need to make sure that the retransmitted segment can be sent on a
1986
	 * subflow right now. If it is too big, it needs to be fragmented.
1987
	 */
1988
	subsk = get_available_subflow(meta_sk, skb, &mss_now);
1989
	if (!subsk) {
1990
		/* We want to increase icsk_retransmits, thus return 0, so that
1991
		 * mptcp_retransmit_timer enters the desired branch.
1992
		 */
1993
		err = 0;
1994
		goto failed;
1995
	}
1996
1997
	/* If the segment was cloned (e.g. a meta retransmission), the header
1998
	 * must be expanded/copied so that there is no corruption of TSO
1999
	 * information.
2000
	 */
2001
	if (skb_cloned(skb) && skb_is_nonlinear(skb) &&
2002
	    unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) {
2003
		err = -ENOMEM;
2004
		goto failed;
2005
	}
2006
2007
	oldpcount = tcp_skb_pcount(skb);
2008
	tcp_set_skb_tso_segs(meta_sk, skb, mss_now);
2009
	tso_segs = tcp_skb_pcount(skb);
2010
	BUG_ON(!tso_segs);
2011
2012
	/* The MSS might have changed and so the number of segments. We
2013
	 * need to account for this change.
2014
	 */
2015
	if (unlikely(oldpcount != tso_segs))
2016
		tcp_adjust_pcount(meta_sk, skb, oldpcount - tso_segs);
2017
2018
	limit = mss_now;
2019
	if (tso_segs > 1 && !tcp_urg_mode(meta_tp))
2020
		limit = tcp_mss_split_point(subsk, skb, mss_now,
2021
					    min_t(unsigned int,
2022
						  tcp_cwnd_test(tcp_sk(subsk), skb),
2023
						  subsk->sk_gso_max_segs));
2024
2025
	if (skb->len > limit &&
2026
	    unlikely(mptso_fragment(meta_sk, skb, limit, mss_now,
2027
				    GFP_ATOMIC, 0)))
2028
		goto failed;
2029
2030
	subskb = mptcp_skb_entail(subsk, skb, -1);
2031
	if (!subskb)
2032
		goto failed;
2033
2034
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2035
	TCP_SKB_CB(subskb)->when = tcp_time_stamp;
2036
	err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC);
2037
	if (!err) {
2038
		/* Update global TCP statistics. */
2039
		TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
2040
2041
		/* Diff to tcp_retransmit_skb */
2042
2043
		/* Save stamp of the first retransmit. */
2044
		if (!meta_tp->retrans_stamp)
2045
			meta_tp->retrans_stamp = TCP_SKB_CB(subskb)->when;
2046
		mptcp_sub_event_new_data_sent(subsk, subskb, skb);
2047
	} else {
2048
		mptcp_transmit_skb_failed(subsk, skb, subskb, 0);
2049
	}
2050
2051
failed:
2052
	return err;
2053
}
2054
2055
/* Similar to tcp_retransmit_timer
2056
 *
2057
 * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
2058
 * and that we don't have an srtt estimation at the meta-level.
2059
 */
2060
void mptcp_retransmit_timer(struct sock *meta_sk)
2061
{
2062
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
2063
	struct mptcp_cb *mpcb = meta_tp->mpcb;
2064
	struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
2065
	int err;
2066
2067
	if (unlikely(meta_tp->send_mp_fclose))
2068
		goto send_mp_fclose;
2069
2070
	/* In fallback, retransmission is handled at the subflow-level */
2071
	if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
2072
	    mpcb->send_infinite_mapping)
2073
		return;
2074
2075
	WARN_ON(tcp_write_queue_empty(meta_sk));
2076
2077
	if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
2078
	    !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
2079
		/* Receiver dastardly shrinks window. Our retransmits
2080
		 * become zero probes, but we should not timeout this
2081
		 * connection. If the socket is an orphan, time it out,
2082
		 * we cannot allow such beasts to hang infinitely.
2083
		 */
2084
		struct inet_sock *meta_inet = inet_sk(meta_sk);
2085
		if (meta_sk->sk_family == AF_INET) {
2086
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
2087
				       &meta_inet->inet_daddr,
2088
				       ntohs(meta_inet->inet_dport),
2089
				       meta_inet->inet_num, meta_tp->snd_una,
2090
				       meta_tp->snd_nxt);
2091
		}
2092
#if IS_ENABLED(CONFIG_IPV6)
2093
		else if (meta_sk->sk_family == AF_INET6) {
2094
			struct ipv6_pinfo *np = inet6_sk(meta_sk);
2095
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
2096
				       &np->daddr, ntohs(meta_inet->inet_dport),
2097
				       meta_inet->inet_num, meta_tp->snd_una,
2098
				       meta_tp->snd_nxt);
2099
		}
2100
#endif
2101
		if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
2102
			tcp_write_err(meta_sk);
2103
			return;
2104
		}
2105
2106
		mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
2107
		goto out_reset_timer;
2108
	}
2109
2110
	if (tcp_write_timeout(meta_sk))
2111
		return;
2112
2113
	if (meta_icsk->icsk_retransmits == 0)
2114
		NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
2115
2116
	meta_icsk->icsk_ca_state = TCP_CA_Loss;
2117
2118
	err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
2119
	if (err > 0) {
2120
		/* Retransmission failed because of local congestion,
2121
		 * do not backoff.
2122
		 */
2123
		if (!meta_icsk->icsk_retransmits)
2124
			meta_icsk->icsk_retransmits = 1;
2125
		inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
2126
					  min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
2127
					  TCP_RTO_MAX);
2128
		return;
2129
	}
2130
2131
out_backoff:
2132
	/* Increase the timeout each time we retransmit.  Note that
2133
	 * we do not increase the rtt estimate.  rto is initialized
2134
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
2135
	 * that doubling rto each time is the least we can get away with.
2136
	 * In KA9Q, Karn uses this for the first few times, and then
2137
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
2138
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
2139
	 * defined in the protocol as the maximum possible RTT.  I guess
2140
	 * we'll have to use something other than TCP to talk to the
2141
	 * University of Mars.
2142
	 *
2143
	 * PAWS allows us longer timeouts and large windows, so once
2144
	 * implemented ftp to mars will work nicely. We will have to fix
2145
	 * the 120 second clamps though!
2146
	 */
2147
	meta_icsk->icsk_backoff++;
2148
	meta_icsk->icsk_retransmits++;
2149
2150
out_reset_timer:
2151
	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
2152
	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
2153
	 * might be increased if the stream oscillates between thin and thick,
2154
	 * thus the old value might already be too high compared to the value
2155
	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
2156
	 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
2157
	 * exponential backoff behaviour to avoid continuing to hammer
2158
	 * linear-timeout retransmissions into a black hole
2159
	 */
2160
	if (meta_sk->sk_state == TCP_ESTABLISHED &&
2161
	    (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
2162
	    tcp_stream_is_thin(meta_tp) &&
2163
	    meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
2164
		meta_icsk->icsk_backoff = 0;
2165
		/* We cannot do the same as in tcp_write_timer because the
2166
		 * srtt is not set here.
2167
		 */
2168
		mptcp_set_rto(meta_sk);
2169
	} else {
2170
		/* Use normal (exponential) backoff */
2171
		meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
2172
	}
2173
	inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
2174
2175
	return;
2176
2177
send_mp_fclose:
2178
	/* MUST do this before tcp_write_timeout, because retrans_stamp may have
2179
	 * been set to 0 in another part while we are retransmitting
2180
	 * MP_FASTCLOSE. Then, we would crash, because retransmits_timed_out
2181
	 * accesses the meta-write-queue.
2182
	 *
2183
	 * We make sure that the timestamp is != 0.
2184
	 */
2185
	if (!meta_tp->retrans_stamp)
2186
		meta_tp->retrans_stamp = tcp_time_stamp ? : 1;
2187
2188
	if (tcp_write_timeout(meta_sk))
2189
		return;
2190
2191
	mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
2192
2193
	goto out_backoff;
2194
}
2195
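The retransmit timer above doubles icsk_rto on each expiry and clamps it at TCP_RTO_MAX, except that thin established streams keep linear timeouts for roughly the first TCP_THIN_LINEAR_RETRIES attempts before backing off. A minimal userspace sketch of that schedule, not part of the patch, using made-up millisecond constants rather than the kernel's jiffies/sysctl values:

#include <stdio.h>

/* Illustrative values only; the kernel derives these from HZ and sysctls. */
#define RTO_MAX_MS		120000
#define THIN_LINEAR_RETRIES	6

/* RTO to arm after `retransmits` expiries, starting from base_ms. */
static unsigned int next_rto_ms(unsigned int base_ms, unsigned int retransmits,
				int stream_is_thin)
{
	unsigned int rto = base_ms;
	unsigned int i;

	for (i = 0; i < retransmits; i++) {
		/* Thin streams keep a linear timeout for the first few tries. */
		if (stream_is_thin && i < THIN_LINEAR_RETRIES)
			continue;
		rto = rto * 2 > RTO_MAX_MS ? RTO_MAX_MS : rto * 2;
	}
	return rto;
}

int main(void)
{
	unsigned int i;

	for (i = 0; i <= 8; i++)
		printf("retransmit %u: normal=%u ms, thin=%u ms\n", i,
		       next_rto_ms(1000, i, 0), next_rto_ms(1000, i, 1));
	return 0;
}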
2196
/* Modify values to an mptcp-level for the initial window of new subflows */
2197
void mptcp_select_initial_window(int *__space, __u32 *window_clamp,
2198
				 const struct sock *sk)
2199
{
2200
	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
2201
2202
	*window_clamp = mpcb->orig_window_clamp;
2203
	*__space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
2204
}
2205
2206
unsigned int mptcp_current_mss(struct sock *meta_sk)
2207
{
2208
	unsigned int mss = 0;
2209
	struct sock *sk;
2210
2211
	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
2212
		int this_mss;
2213
2214
		if (!mptcp_sk_can_send(sk))
2215
			continue;
2216
2217
		this_mss = tcp_current_mss(sk);
2218
		if (!mss || this_mss < mss)
2219
			mss = this_mss;
2220
	}
2221
2222
	/* If no subflow is available, we take a default-mss from the
2223
	 * meta-socket.
2224
	 */
2225
	return !mss ? tcp_current_mss(meta_sk) : mss;
2226
}
2227
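mptcp_current_mss() above, like mptcp_select_size() and mptcp_xmit_size_goal() further down, reduces a per-subflow value to the minimum over the subflows that may currently send, and falls back to the meta-socket's value when none qualifies. A stripped-down sketch of that pattern with a hypothetical struct subflow, not the kernel's types:

#include <stdio.h>
#include <stddef.h>

struct subflow {
	unsigned int mss;	/* per-subflow value, e.g. its current MSS */
	int can_send;		/* 1 if the subflow may transmit right now */
};

/* Smallest MSS among sendable subflows; fall back to meta_mss otherwise. */
static unsigned int min_subflow_mss(const struct subflow *sub, size_t n,
				    unsigned int meta_mss)
{
	unsigned int mss = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (!sub[i].can_send)
			continue;
		if (!mss || sub[i].mss < mss)
			mss = sub[i].mss;
	}
	return mss ? mss : meta_mss;
}

int main(void)
{
	struct subflow subs[] = { { 1400, 1 }, { 1200, 1 }, { 900, 0 } };

	printf("mss = %u\n", min_subflow_mss(subs, 3, 1460));	/* prints 1200 */
	return 0;
}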
2228
int mptcp_select_size(const struct sock *meta_sk, bool sg)
2229
{
2230
	int mss = 0; /* We look for the smallest MSS */
2231
	struct sock *sk;
2232
2233
	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
2234
		int this_mss;
2235
2236
		if (!mptcp_sk_can_send(sk))
2237
			continue;
2238
2239
		this_mss = tcp_sk(sk)->mss_cache;
2240
		if (!mss || this_mss < mss)
2241
			mss = this_mss;
2242
	}
2243
2244
	if (sg) {
2245
		if (mptcp_sk_can_gso(meta_sk)) {
2246
			mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
2247
		} else {
2248
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
2249
2250
			if (mss >= pgbreak &&
2251
			    mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
2252
				mss = pgbreak;
2253
		}
2254
	}
2255
2256
	return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
2257
}
2258
2259
int mptcp_check_snd_buf(const struct tcp_sock *tp)
2260
{
2261
	struct sock *sk;
2262
	u32 rtt_max = tp->srtt;
2263
	u64 bw_est;
2264
2265
	if (!tp->srtt)
2266
		return tp->reordering + 1;
2267
2268
	mptcp_for_each_sk(tp->mpcb, sk) {
2269
		if (!mptcp_sk_can_send(sk))
2270
			continue;
2271
2272
		if (rtt_max < tcp_sk(sk)->srtt)
2273
			rtt_max = tcp_sk(sk)->srtt;
2274
	}
2275
2276
	bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
2277
				(u64)tp->srtt);
2278
2279
	return max_t(unsigned int, (u32)(bw_est >> 16),
2280
			tp->reordering + 1);
2281
2282
}
2283
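mptcp_check_snd_buf() above sizes the send buffer from the worst-case subflow RTT using 16.16 fixed-point arithmetic: bw_est = (snd_cwnd * rtt_max << 16) / srtt, shifted back down and floored at reordering + 1. A plain-C illustration of the arithmetic with made-up numbers (the units only need to match between srtt and rtt_max):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t snd_cwnd = 10;		/* packets */
	uint64_t srtt = 40;		/* own smoothed RTT, arbitrary ticks */
	uint64_t rtt_max = 120;		/* slowest subflow's RTT, same ticks */
	unsigned int reordering = 3;

	/* 16.16 fixed point: scale cwnd by rtt_max/srtt without floats. */
	uint64_t bw_est = (snd_cwnd * rtt_max << 16) / srtt;
	unsigned int needed = (unsigned int)(bw_est >> 16);

	if (needed < reordering + 1)
		needed = reordering + 1;

	/* 10 * 120/40 = 30 packets of send-buffer headroom. */
	printf("buffer estimate: %u packets\n", needed);
	return 0;
}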
2284
unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now,
2285
				  int large_allowed)
2286
{
2287
	struct sock *sk;
2288
	u32 xmit_size_goal = 0;
2289
2290
	if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
2291
		mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
2292
			int this_size_goal;
2293
2294
			if (!mptcp_sk_can_send(sk))
2295
				continue;
2296
2297
			this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
2298
			if (!xmit_size_goal || this_size_goal < xmit_size_goal)
2299
				xmit_size_goal = this_size_goal;
2300
		}
2301
	}
2302
2303
	return max(xmit_size_goal, mss_now);
2304
}
2305
2306
/* Similar to tcp_trim_head - but we correctly copy the DSS-option */
2307
int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
2308
{
2309
	int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
2310
		     MPTCP_SUB_LEN_SEQ_ALIGN;
2311
	char dss[dsslen];
2312
2313
	/* DSS-option must be recovered afterwards. */
2314
	memcpy(dss, skb->data - dsslen, dsslen);
2315
2316
	if (skb_cloned(skb)) {
2317
		/* pskb_expand_head will delete our DSS-option. We have to copy
2318
		 * it back if pskb_expand_head succeeds.
2319
		 */
2320
2321
		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
2322
			return -ENOMEM;
2323
2324
		memcpy(skb->data - dsslen, dss, dsslen);
2325
	}
2326
2327
	__pskb_trim_head(skb, len);
2328
2329
	/* Put the DSS-option back in our header */
2330
	memcpy(skb->data - dsslen, dss, dsslen);
2331
2332
	TCP_SKB_CB(skb)->seq += len;
2333
	skb->ip_summed = CHECKSUM_PARTIAL;
2334
2335
	skb->truesize	     -= len;
2336
	sk->sk_wmem_queued   -= len;
2337
	sk_mem_uncharge(sk, len);
2338
	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
2339
2340
	/* Any change of skb->len requires recalculation of tso factor. */
2341
	if (tcp_skb_pcount(skb) > 1)
2342
		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
2343
2344
	return 0;
2345
}
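mptcp_trim_head() above saves the dsslen bytes that sit just in front of skb->data, because pskb_expand_head() may discard them, and copies them back after the reallocation and again after trimming. A generic save/modify/restore sketch over a plain buffer, with a hypothetical HDR_LEN standing in for the aligned DSS length and no skb involved:

#include <stdio.h>
#include <string.h>

#define HDR_LEN 12	/* stand-in for the aligned DSS option length */

int main(void)
{
	/* A "header" region followed by payload, like option bytes before skb->data. */
	char buf[64] = "DSS-OPTION!!payload-bytes-go-here";
	char saved[HDR_LEN];

	/* Save the header bytes before an operation that may clobber them. */
	memcpy(saved, buf, HDR_LEN);

	/* ... some operation rewrites the buffer head (here: simulated) ... */
	memset(buf, 0, HDR_LEN);

	/* Restore the header so later code still finds the option in place. */
	memcpy(buf, saved, HDR_LEN);

	printf("%.12s\n", buf);	/* prints DSS-OPTION!! */
	return 0;
}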
(-)a/net/mptcp/mptcp_pm.c (+1194 lines)
Line 0 Link Here
1
/*
2
 *	MPTCP implementation - MPTCP-subflow-management
3
 *
4
 *	Initial Design & Implementation:
5
 *	Sébastien Barré <sebastien.barre@uclouvain.be>
6
 *
7
 *	Current Maintainer & Author:
8
 *	Christoph Paasch <christoph.paasch@uclouvain.be>
9
 *
10
 *	Additional authors:
11
 *	Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
12
 *	Gregory Detal <gregory.detal@uclouvain.be>
13
 *	Fabien Duchêne <fabien.duchene@uclouvain.be>
14
 *	Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
15
 *	Lavkesh Lahngir <lavkesh51@gmail.com>
16
 *	Andreas Ripke <ripke@neclab.eu>
17
 *	Vlad Dogaru <vlad.dogaru@intel.com>
18
 *	Octavian Purdila <octavian.purdila@intel.com>
19
 *	John Ronan <jronan@tssg.org>
20
 *	Catalin Nicutar <catalin.nicutar@gmail.com>
21
 *	Brandon Heller <brandonh@stanford.edu>
22
 *
23
 *
24
 *	This program is free software; you can redistribute it and/or
25
 *      modify it under the terms of the GNU General Public License
26
 *      as published by the Free Software Foundation; either version
27
 *      2 of the License, or (at your option) any later version.
28
 */
29
30
#include <linux/kconfig.h>
31
#include <linux/module.h>
32
#include <linux/netdevice.h>
33
#include <linux/inetdevice.h>
34
#include <linux/list.h>
35
#include <linux/tcp.h>
36
#include <linux/workqueue.h>
37
#include <linux/proc_fs.h>	/* Needed by proc_net_fops_create */
38
#include <net/inet_sock.h>
39
#include <net/tcp.h>
40
#include <net/mptcp.h>
41
#include <net/mptcp_v4.h>
42
#include <net/mptcp_pm.h>
43
#if IS_ENABLED(CONFIG_IPV6)
44
#include <net/if_inet6.h>
45
#include <net/ipv6.h>
46
#include <net/ip6_checksum.h>
47
#include <net/inet6_connection_sock.h>
48
#include <net/mptcp_v6.h>
49
#include <net/addrconf.h>
50
#endif
51
52
static inline u32 mptcp_hash_tk(u32 token)
53
{
54
	return token % MPTCP_HASH_SIZE;
55
}
56
57
static struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
58
59
/* This second hashtable is needed to retrieve request socks
60
 * created as a result of a join request. While the SYN contains
61
 * the token, the final ack does not, so we need a separate hashtable
62
 * to retrieve the mpcb.
63
 */
64
struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
65
spinlock_t mptcp_reqsk_hlock;	/* hashtable protection */
66
67
/* The following hash table is used to avoid collision of token */
68
static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
69
spinlock_t mptcp_tk_hashlock;	/* hashtable protection */
70
71
static int mptcp_reqsk_find_tk(u32 token)
72
{
73
	u32 hash = mptcp_hash_tk(token);
74
	struct mptcp_request_sock *mtreqsk;
75
	const struct hlist_nulls_node *node;
76
77
	hlist_nulls_for_each_entry_rcu(mtreqsk, node,
78
				       &mptcp_reqsk_tk_htb[hash], collide_tk) {
79
		if (token == mtreqsk->mptcp_loc_token)
80
			return 1;
81
	}
82
	return 0;
83
}
84
85
static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, u32 token)
86
{
87
	u32 hash = mptcp_hash_tk(token);
88
89
	hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->collide_tk,
90
				 &mptcp_reqsk_tk_htb[hash]);
91
}
92
93
void mptcp_reqsk_remove_tk(struct request_sock *reqsk)
94
{
95
	rcu_read_lock();
96
	spin_lock(&mptcp_tk_hashlock);
97
	hlist_nulls_del_rcu(&mptcp_rsk(reqsk)->collide_tk);
98
	spin_unlock(&mptcp_tk_hashlock);
99
	rcu_read_unlock();
100
}
101
102
void __mptcp_hash_insert(struct tcp_sock *meta_tp, u32 token)
103
{
104
	u32 hash = mptcp_hash_tk(token);
105
	hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
106
	meta_tp->inside_tk_table = 1;
107
}
108
109
static int mptcp_find_token(u32 token)
110
{
111
	u32 hash = mptcp_hash_tk(token);
112
	struct tcp_sock *meta_tp;
113
	const struct hlist_nulls_node *node;
114
115
	hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
116
		if (token == meta_tp->mptcp_loc_token)
117
			return 1;
118
	}
119
	return 0;
120
}
121
122
static void mptcp_set_key_reqsk(struct request_sock *req,
123
				const struct sk_buff *skb)
124
{
125
	struct inet_request_sock *ireq = inet_rsk(req);
126
	struct mptcp_request_sock *mtreq = mptcp_rsk(req);
127
128
	if (skb->protocol == htons(ETH_P_IP)) {
129
		mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
130
						        ip_hdr(skb)->daddr,
131
						        ireq->loc_port,
132
						        ireq->rmt_port);
133
#if IS_ENABLED(CONFIG_IPV6)
134
	} else {
135
		mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
136
							ipv6_hdr(skb)->daddr.s6_addr32,
137
							ireq->loc_port,
138
							ireq->rmt_port);
139
#endif
140
	}
141
142
	mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
143
}
144
145
/* New MPTCP-connection request, prepare a new token for the meta-socket that
146
 * will be created in mptcp_check_req_master(), and store the received token.
147
 */
148
void mptcp_reqsk_new_mptcp(struct request_sock *req,
149
			   const struct tcp_options_received *rx_opt,
150
			   const struct mptcp_options_received *mopt,
151
			   const struct sk_buff *skb)
152
{
153
	struct mptcp_request_sock *mtreq = mptcp_rsk(req);
154
155
	tcp_rsk(req)->saw_mpc = 1;
156
157
	rcu_read_lock();
158
	spin_lock(&mptcp_tk_hashlock);
159
	do {
160
		mptcp_set_key_reqsk(req, skb);
161
	} while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
162
		 mptcp_find_token(mtreq->mptcp_loc_token));
163
164
	mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
165
	spin_unlock(&mptcp_tk_hashlock);
166
	rcu_read_unlock();
167
	mtreq->mptcp_rem_key = mopt->mptcp_key;
168
}
169
170
static void mptcp_set_key_sk(struct sock *sk)
171
{
172
	struct tcp_sock *tp = tcp_sk(sk);
173
	struct inet_sock *isk = inet_sk(sk);
174
175
	if (sk->sk_family == AF_INET)
176
		tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
177
						     isk->inet_daddr,
178
						     isk->inet_sport,
179
						     isk->inet_dport);
180
#if IS_ENABLED(CONFIG_IPV6)
181
	else
182
		tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
183
						     inet6_sk(sk)->daddr.s6_addr32,
184
						     isk->inet_sport,
185
						     isk->inet_dport);
186
#endif
187
188
	mptcp_key_sha1(tp->mptcp_loc_key,
189
		       &tp->mptcp_loc_token, NULL);
190
}
191
192
void mptcp_connect_init(struct sock *sk)
193
{
194
	struct tcp_sock *tp = tcp_sk(sk);
195
196
	rcu_read_lock_bh();
197
	spin_lock(&mptcp_tk_hashlock);
198
	do {
199
		mptcp_set_key_sk(sk);
200
	} while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
201
		 mptcp_find_token(tp->mptcp_loc_token));
202
203
	__mptcp_hash_insert(tp, tp->mptcp_loc_token);
204
	spin_unlock(&mptcp_tk_hashlock);
205
	rcu_read_unlock_bh();
206
}
207
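mptcp_connect_init() above, like mptcp_reqsk_new_mptcp() earlier, re-derives the local key and token in a do/while loop until the token collides with neither the request-socket table nor the established-token table, all while holding mptcp_tk_hashlock. A toy sketch of that regenerate-until-unique pattern; rand() stands in for the SHA-1-derived token and a flat array stands in for both hash tables:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define MAX_TOKENS 64

static uint32_t used[MAX_TOKENS];	/* toy stand-in for the token tables */
static int nr_used;

static int token_in_use(uint32_t token)
{
	for (int i = 0; i < nr_used; i++)
		if (used[i] == token)
			return 1;
	return 0;
}

/* Draw fresh tokens until one is unused, then record it. */
static uint32_t pick_unique_token(void)
{
	uint32_t token;

	if (nr_used == MAX_TOKENS)
		return 0;	/* toy table is full; the real tables have no such cap */

	do {
		token = (uint32_t)rand() + 1u;	/* rand() stands in for the keyed hash */
	} while (token_in_use(token));

	used[nr_used++] = token;
	return token;
}

int main(void)
{
	srand(1);
	printf("token: %08x\n", (unsigned int)pick_unique_token());
	printf("token: %08x\n", (unsigned int)pick_unique_token());
	return 0;
}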
208
/**
209
 * This function increments the refcount of the mpcb struct.
210
 * It is the responsibility of the caller to decrement when releasing
211
 * the structure.
212
 */
213
struct sock *mptcp_hash_find(struct net *net, u32 token)
214
{
215
	u32 hash = mptcp_hash_tk(token);
216
	struct tcp_sock *meta_tp;
217
	struct sock *meta_sk = NULL;
218
	struct hlist_nulls_node *node;
219
220
	rcu_read_lock();
221
	hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
222
				       tk_table) {
223
		meta_sk = (struct sock *)meta_tp;
224
		if (token == meta_tp->mptcp_loc_token &&
225
		    net_eq(net, sock_net(meta_sk)) &&
226
		    atomic_inc_not_zero(&meta_sk->sk_refcnt))
227
			break;
228
		meta_sk = NULL;
229
	}
230
	rcu_read_unlock();
231
	return meta_sk;
232
}
233
234
void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
235
{
236
	/* remove from the token hashtable */
237
	rcu_read_lock_bh();
238
	spin_lock(&mptcp_tk_hashlock);
239
	hlist_nulls_del_rcu(&meta_tp->tk_table);
240
	meta_tp->inside_tk_table = 0;
241
	spin_unlock(&mptcp_tk_hashlock);
242
	rcu_read_unlock_bh();
243
}
244
245
void mptcp_hash_remove(struct tcp_sock *meta_tp)
246
{
247
	rcu_read_lock();
248
	spin_lock(&mptcp_tk_hashlock);
249
	hlist_nulls_del_rcu(&meta_tp->tk_table);
250
	meta_tp->inside_tk_table = 0;
251
	spin_unlock(&mptcp_tk_hashlock);
252
	rcu_read_unlock();
253
}
254
255
u8 mptcp_get_loc_addrid(struct mptcp_cb *mpcb, struct sock *sk)
256
{
257
	int i;
258
259
	if (sk->sk_family == AF_INET) {
260
		mptcp_for_each_bit_set(mpcb->loc4_bits, i) {
261
			if (mpcb->locaddr4[i].addr.s_addr ==
262
					inet_sk(sk)->inet_saddr)
263
				return mpcb->locaddr4[i].id;
264
		}
265
266
		mptcp_debug("%s %pI4 not locally found\n", __func__,
267
			    &inet_sk(sk)->inet_saddr);
268
		BUG();
269
	}
270
#if IS_ENABLED(CONFIG_IPV6)
271
	if (sk->sk_family == AF_INET6) {
272
		mptcp_for_each_bit_set(mpcb->loc6_bits, i) {
273
			if (ipv6_addr_equal(&mpcb->locaddr6[i].addr,
274
					    &inet6_sk(sk)->saddr))
275
				return mpcb->locaddr6[i].id;
276
		}
277
278
		mptcp_debug("%s %pI6 not locally found\n", __func__,
279
			    &inet6_sk(sk)->saddr);
280
		BUG();
281
	}
282
#endif /* CONFIG_IPV6 */
283
284
	BUG();
285
	return 0;
286
}
287
288
void mptcp_set_addresses(struct sock *meta_sk)
289
{
290
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
291
	struct net *netns = sock_net(meta_sk);
292
	struct net_device *dev;
293
294
	/* if multiple ports are requested, we work with the main address
295
	 * and play only with the ports
296
	 */
297
	if (sysctl_mptcp_ndiffports > 1)
298
		return;
299
300
	rcu_read_lock();
301
	read_lock_bh(&dev_base_lock);
302
303
	for_each_netdev(netns, dev) {
304
		if (netif_running(dev)) {
305
			struct in_device *in_dev = __in_dev_get_rcu(dev);
306
			struct in_ifaddr *ifa;
307
			__be32 ifa_address;
308
#if IS_ENABLED(CONFIG_IPV6)
309
			struct inet6_dev *in6_dev = __in6_dev_get(dev);
310
			struct inet6_ifaddr *ifa6;
311
#endif
312
313
			if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH))
314
				continue;
315
316
			if (!in_dev)
317
				goto cont_ipv6;
318
319
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
320
				int i;
321
				ifa_address = ifa->ifa_local;
322
323
				if (ifa->ifa_scope == RT_SCOPE_HOST)
324
					continue;
325
326
				if ((meta_sk->sk_family == AF_INET ||
327
				     mptcp_v6_is_v4_mapped(meta_sk)) &&
328
				    inet_sk(meta_sk)->inet_saddr == ifa_address) {
329
					mpcb->locaddr4[0].low_prio = dev->flags &
330
								IFF_MPBACKUP ? 1 : 0;
331
					continue;
332
				}
333
334
				i = __mptcp_find_free_index(mpcb->loc4_bits, -1,
335
							    mpcb->next_v4_index);
336
				if (i < 0) {
337
					mptcp_debug("%s: At max num of local addresses: %d --- not adding address: %pI4\n",
338
						    __func__, MPTCP_MAX_ADDR,
339
						    &ifa_address);
340
					goto out;
341
				}
342
				mpcb->locaddr4[i].addr.s_addr = ifa_address;
343
				mpcb->locaddr4[i].port = 0;
344
				mpcb->locaddr4[i].id = i;
345
				mpcb->locaddr4[i].low_prio = (dev->flags & IFF_MPBACKUP) ?
346
								1 : 0;
347
				mpcb->loc4_bits |= (1 << i);
348
				mpcb->next_v4_index = i + 1;
349
				mptcp_v4_send_add_addr(i, mpcb);
350
			}
351
352
cont_ipv6:
353
; /* This ; is necessary to fix build-errors when IPv6 is disabled */
354
#if IS_ENABLED(CONFIG_IPV6)
355
			if (!in6_dev)
356
				continue;
357
358
			list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) {
359
				int addr_type = ipv6_addr_type(&ifa6->addr);
360
				int i;
361
362
				if (addr_type == IPV6_ADDR_ANY ||
363
				    addr_type & IPV6_ADDR_LOOPBACK ||
364
				    addr_type & IPV6_ADDR_LINKLOCAL)
365
					continue;
366
367
				if (meta_sk->sk_family == AF_INET6 &&
368
				    ipv6_addr_equal(&inet6_sk(meta_sk)->saddr,
369
						    &(ifa6->addr))) {
370
					mpcb->locaddr6[0].low_prio = dev->flags &
371
								IFF_MPBACKUP ? 1 : 0;
372
					continue;
373
				}
374
375
				i = __mptcp_find_free_index(mpcb->loc6_bits, -1,
376
							    mpcb->next_v6_index);
377
				if (i < 0) {
378
					mptcp_debug("%s: At max num of local addresses: %d --- not adding address: %pI6\n",
379
						    __func__, MPTCP_MAX_ADDR,
380
						    &ifa6->addr);
381
					goto out;
382
				}
383
384
				mpcb->locaddr6[i].addr = ifa6->addr;
385
				mpcb->locaddr6[i].port = 0;
386
				mpcb->locaddr6[i].id = i + MPTCP_MAX_ADDR;
387
				mpcb->locaddr6[i].low_prio = (dev->flags & IFF_MPBACKUP) ?
388
								1 : 0;
389
				mpcb->loc6_bits |= (1 << i);
390
				mpcb->next_v6_index = i + 1;
391
				mptcp_v6_send_add_addr(i, mpcb);
392
			}
393
#endif
394
		}
395
	}
396
397
out:
398
	read_unlock_bh(&dev_base_lock);
399
	rcu_read_unlock();
400
}
401
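mptcp_set_addresses() above stores local addresses in fixed arrays and tracks occupied slots in the loc4_bits/loc6_bits bitfields, allocating the next free index before announcing an address and clearing the bit again when the address worker removes it. A minimal sketch of that bitfield bookkeeping with hypothetical names, not the kernel helpers:

#include <stdio.h>

#define MAX_SLOTS 8

/* Return the first clear bit in `bits`, or -1 when all slots are taken. */
static int find_free_index(unsigned char bits)
{
	for (int i = 0; i < MAX_SLOTS; i++)
		if (!(bits & (1u << i)))
			return i;
	return -1;
}

int main(void)
{
	unsigned char loc_bits = 0;
	unsigned int addrs[MAX_SLOTS];

	for (unsigned int a = 0xc0a80001; a <= 0xc0a80003; a++) {
		int i = find_free_index(loc_bits);

		if (i < 0) {
			printf("at max number of local addresses\n");
			break;
		}
		addrs[i] = a;		/* store the address */
		loc_bits |= 1u << i;	/* mark the slot as used */
		printf("slot %d -> %08x (bits now %02x)\n", i, addrs[i], loc_bits);
	}

	/* Removing an address simply clears its bit, as the worker does. */
	loc_bits &= ~(1u << 1);
	printf("after removal, bits %02x\n", loc_bits);
	return 0;
}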
402
int mptcp_check_req(struct sk_buff *skb, struct net *net)
403
{
404
	struct tcphdr *th = tcp_hdr(skb);
405
	struct sock *meta_sk = NULL;
406
407
	/* MPTCP structures not initialized */
408
	if (mptcp_init_failed)
409
		return 0;
410
411
	if (skb->protocol == htons(ETH_P_IP))
412
		meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
413
					      ip_hdr(skb)->daddr, net);
414
#if IS_ENABLED(CONFIG_IPV6)
415
	else /* IPv6 */
416
		meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
417
					      &ipv6_hdr(skb)->daddr, net);
418
#endif /* CONFIG_IPV6 */
419
420
	if (!meta_sk)
421
		return 0;
422
423
	TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
424
425
	bh_lock_sock_nested(meta_sk);
426
	if (sock_owned_by_user(meta_sk)) {
427
		skb->sk = meta_sk;
428
		if (unlikely(sk_add_backlog(meta_sk, skb,
429
					    meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
430
			bh_unlock_sock(meta_sk);
431
			NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
432
			sock_put(meta_sk); /* Taken by mptcp_search_req */
433
			kfree_skb(skb);
434
			return 1;
435
		}
436
	} else if (skb->protocol == htons(ETH_P_IP)) {
437
		tcp_v4_do_rcv(meta_sk, skb);
438
#if IS_ENABLED(CONFIG_IPV6)
439
	} else { /* IPv6 */
440
		tcp_v6_do_rcv(meta_sk, skb);
441
#endif /* CONFIG_IPV6 */
442
	}
443
	bh_unlock_sock(meta_sk);
444
	sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
445
	return 1;
446
}
447
448
struct mp_join *mptcp_find_join(struct sk_buff *skb)
449
{
450
	struct tcphdr *th = tcp_hdr(skb);
451
	unsigned char *ptr;
452
	int length = (th->doff * 4) - sizeof(struct tcphdr);
453
454
	/* Jump through the options to check whether JOIN is there */
455
	ptr = (unsigned char *)(th + 1);
456
	while (length > 0) {
457
		int opcode = *ptr++;
458
		int opsize;
459
460
		switch (opcode) {
461
		case TCPOPT_EOL:
462
			return NULL;
463
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
464
			length--;
465
			continue;
466
		default:
467
			opsize = *ptr++;
468
			if (opsize < 2)	/* "silly options" */
469
				return NULL;
470
			if (opsize > length)
471
				return NULL;  /* don't parse partial options */
472
			if (opcode == TCPOPT_MPTCP &&
473
			    ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
474
				return (struct mp_join *)(ptr - 2);
475
			}
476
			ptr += opsize - 2;
477
			length -= opsize;
478
		}
479
	}
480
	return NULL;
481
}
482
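mptcp_find_join() above walks the TCP option block with the usual kind/length encoding: EOL ends parsing, NOP consumes one byte, and every other option carries a length byte that must be at least 2 and must not run past the option space. A standalone sketch of the same walk over a raw byte array; the option kinds used in main() and the extra length-byte bounds check are illustrative, not the kernel structures:

#include <stdio.h>
#include <stddef.h>

#define OPT_EOL 0
#define OPT_NOP 1

/* Return a pointer to the option with kind `want`, or NULL if absent/malformed. */
static const unsigned char *find_tcp_option(const unsigned char *ptr, int length,
					    unsigned char want)
{
	while (length > 0) {
		unsigned char opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case OPT_EOL:
			return NULL;
		case OPT_NOP:
			length--;
			continue;
		default:
			if (length < 2)
				return NULL;
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return NULL;	/* silly or truncated option */
			if (opcode == want)
				return ptr - 2;
			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return NULL;
}

int main(void)
{
	/* NOP, NOP, a kind-30 option of length 4, then EOL padding. */
	const unsigned char opts[] = { 1, 1, 30, 4, 0x10, 0xab, 0, 0 };
	const unsigned char *hit = find_tcp_option(opts, sizeof(opts), 30);

	if (hit)
		printf("found option at offset %td\n", hit - opts);
	else
		printf("not found\n");
	return 0;
}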
483
int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
484
{
485
	struct mptcp_cb *mpcb;
486
	struct sock *meta_sk;
487
	u32 token;
488
	struct mp_join *join_opt = mptcp_find_join(skb);
489
	if (!join_opt)
490
		return 0;
491
492
	/* MPTCP structures were not initialized, so return error */
493
	if (mptcp_init_failed)
494
		return -1;
495
496
	token = join_opt->u.syn.token;
497
	meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
498
	if (!meta_sk) {
499
		mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
500
		return -1;
501
	}
502
503
	mpcb = tcp_sk(meta_sk)->mpcb;
504
	if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
505
		/* We are in fallback-mode on the reception-side -
506
		 * no new subflows!
507
		 */
508
		sock_put(meta_sk); /* Taken by mptcp_hash_find */
509
		return -1;
510
	}
511
512
	/* Coming from time-wait-sock processing in tcp_v4_rcv.
513
	 * We have to deschedule it before continuing, because otherwise
514
	 * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
515
	 */
516
	if (tw) {
517
		inet_twsk_deschedule(tw, &tcp_death_row);
518
		inet_twsk_put(tw);
519
	}
520
521
	TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
522
	/* OK, this is a new syn/join, let's create a new open request and
523
	 * send syn+ack
524
	 */
525
	bh_lock_sock_nested(meta_sk);
526
	if (sock_owned_by_user(meta_sk)) {
527
		skb->sk = meta_sk;
528
		if (unlikely(sk_add_backlog(meta_sk, skb,
529
					    meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
530
			bh_unlock_sock(meta_sk);
531
			NET_INC_STATS_BH(sock_net(meta_sk),
532
					 LINUX_MIB_TCPBACKLOGDROP);
533
			sock_put(meta_sk); /* Taken by mptcp_hash_find */
534
			kfree_skb(skb);
535
			return 1;
536
		}
537
	} else if (skb->protocol == htons(ETH_P_IP)) {
538
		tcp_v4_do_rcv(meta_sk, skb);
539
#if IS_ENABLED(CONFIG_IPV6)
540
	} else {
541
		tcp_v6_do_rcv(meta_sk, skb);
542
#endif /* CONFIG_IPV6 */
543
	}
544
	bh_unlock_sock(meta_sk);
545
	sock_put(meta_sk); /* Taken by mptcp_hash_find */
546
	return 1;
547
}
548
549
int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt,
550
			struct tcp_options_received *tmp_opt, struct net *net)
551
{
552
	struct sock *meta_sk;
553
	u32 token;
554
555
	token = mopt->mptcp_rem_token;
556
	meta_sk = mptcp_hash_find(net, token);
557
	if (!meta_sk) {
558
		mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
559
		return -1;
560
	}
561
562
	TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
563
564
	/* OK, this is a new syn/join, let's create a new open request and
565
	 * send syn+ack
566
	 */
567
	bh_lock_sock(meta_sk);
568
569
	/* This check is also done in mptcp_vX_do_rcv. But, there we cannot
570
	 * call tcp_vX_send_reset, because we hold already two socket-locks.
571
	 * (the listener and the meta from above)
572
	 *
573
	 * And the send-reset will try to take yet another one (ip_send_reply).
574
	 * Thus, we propagate the reset up to tcp_rcv_state_process.
575
	 */
576
	if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
577
	    tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
578
	    meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
579
		bh_unlock_sock(meta_sk);
580
		sock_put(meta_sk); /* Taken by mptcp_hash_find */
581
		return -1;
582
	}
583
584
	if (sock_owned_by_user(meta_sk)) {
585
		skb->sk = meta_sk;
586
		if (unlikely(sk_add_backlog(meta_sk, skb,
587
					    meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
588
			NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
589
		else
590
			/* Must make sure that upper layers won't free the
591
			 * skb if it is added to the backlog-queue.
592
			 */
593
			skb_get(skb);
594
	} else {
595
		/* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
596
		 * the skb will finally be freed by tcp_v4_do_rcv (where we are
597
		 * coming from)
598
		 */
599
		skb_get(skb);
600
		if (skb->protocol == htons(ETH_P_IP)) {
601
			tcp_v4_do_rcv(meta_sk, skb);
602
#if IS_ENABLED(CONFIG_IPV6)
603
		} else { /* IPv6 */
604
			tcp_v6_do_rcv(meta_sk, skb);
605
#endif /* CONFIG_IPV6 */
606
		}
607
	}
608
609
	bh_unlock_sock(meta_sk);
610
	sock_put(meta_sk); /* Taken by mptcp_hash_find */
611
	return 0;
612
}
613
614
void mptcp_retry_subflow_worker(struct work_struct *work)
615
{
616
	struct delayed_work *delayed_work =
617
		container_of(work, struct delayed_work, work);
618
	struct mptcp_cb *mpcb =
619
		container_of(delayed_work, struct mptcp_cb, subflow_retry_work);
620
	struct sock *meta_sk = mpcb->meta_sk;
621
	int iter = 0, i;
622
623
next_subflow:
624
	if (iter) {
625
		release_sock(meta_sk);
626
		mutex_unlock(&mpcb->mutex);
627
628
		yield();
629
	}
630
	mutex_lock(&mpcb->mutex);
631
	lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
632
633
	iter++;
634
635
	if (sock_flag(meta_sk, SOCK_DEAD))
636
		goto exit;
637
638
	mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
639
		struct mptcp_rem4 *rem = &mpcb->remaddr4[i];
640
		/* Do we need to retry establishing a subflow? */
641
		if (rem->retry_bitfield) {
642
			int i = mptcp_find_free_index(~rem->retry_bitfield);
643
			mptcp_init4_subsockets(meta_sk, &mpcb->locaddr4[i], rem);
644
			rem->retry_bitfield &= ~(1 << mpcb->locaddr4[i].id);
645
			goto next_subflow;
646
		}
647
	}
648
649
#if IS_ENABLED(CONFIG_IPV6)
650
	mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
651
		struct mptcp_rem6 *rem = &mpcb->remaddr6[i];
652
653
		/* Do we need to retry establishing a subflow? */
654
		if (rem->retry_bitfield) {
655
			int i = mptcp_find_free_index(~rem->retry_bitfield);
656
			mptcp_init6_subsockets(meta_sk, &mpcb->locaddr6[i], rem);
657
			rem->retry_bitfield &= ~(1 << mpcb->locaddr6[i].id);
658
			goto next_subflow;
659
		}
660
	}
661
#endif
662
663
exit:
664
	release_sock(meta_sk);
665
	mutex_unlock(&mpcb->mutex);
666
	sock_put(meta_sk);
667
}
668
669
/**
670
 * Create all new subflows by calling mptcp_initX_subsockets
671
 *
672
 * This function uses a goto next_subflow, to allow releasing the lock between
673
 * new subflows and giving other processes a chance to do some work on the
674
 * socket and potentially finishing the communication.
675
 **/
676
void mptcp_create_subflow_worker(struct work_struct *work)
677
{
678
	struct mptcp_cb *mpcb = container_of(work, struct mptcp_cb, subflow_work);
679
	struct sock *meta_sk = mpcb->meta_sk;
680
	int iter = 0, retry = 0;
681
	int i;
682
683
next_subflow:
684
	if (iter) {
685
		release_sock(meta_sk);
686
		mutex_unlock(&mpcb->mutex);
687
688
		yield();
689
	}
690
	mutex_lock(&mpcb->mutex);
691
	lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
692
693
	iter++;
694
695
	if (sock_flag(meta_sk, SOCK_DEAD))
696
		goto exit;
697
698
	if (sysctl_mptcp_ndiffports > iter &&
699
	    sysctl_mptcp_ndiffports > mpcb->cnt_subflows) {
700
		if (meta_sk->sk_family == AF_INET ||
701
		    mptcp_v6_is_v4_mapped(meta_sk)) {
702
			mptcp_init4_subsockets(meta_sk, &mpcb->locaddr4[0],
703
					       &mpcb->remaddr4[0]);
704
		} else {
705
#if IS_ENABLED(CONFIG_IPV6)
706
			mptcp_init6_subsockets(meta_sk, &mpcb->locaddr6[0],
707
					       &mpcb->remaddr6[0]);
708
#endif
709
		}
710
		goto next_subflow;
711
	}
712
	if (sysctl_mptcp_ndiffports > 1 &&
713
	    sysctl_mptcp_ndiffports == mpcb->cnt_subflows)
714
		goto exit;
715
716
	mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
717
		struct mptcp_rem4 *rem;
718
		u8 remaining_bits;
719
720
		rem = &mpcb->remaddr4[i];
721
		remaining_bits = ~(rem->bitfield) & mpcb->loc4_bits;
722
723
		/* Are there still combinations to handle? */
724
		if (remaining_bits) {
725
			int i = mptcp_find_free_index(~remaining_bits);
726
			/* If a route is not yet available then retry once */
727
			if (mptcp_init4_subsockets(meta_sk, &mpcb->locaddr4[i],
728
						   rem) == -ENETUNREACH)
729
				retry = rem->retry_bitfield |=
730
					(1 << mpcb->locaddr4[i].id);
731
			goto next_subflow;
732
		}
733
	}
734
735
#if IS_ENABLED(CONFIG_IPV6)
736
	mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
737
		struct mptcp_rem6 *rem;
738
		u8 remaining_bits;
739
740
		rem = &mpcb->remaddr6[i];
741
		remaining_bits = ~(rem->bitfield) & mpcb->loc6_bits;
742
743
		/* Are there still combinations to handle? */
744
		if (remaining_bits) {
745
			int i = mptcp_find_free_index(~remaining_bits);
746
			/* If a route is not yet available then retry once */
747
			if (mptcp_init6_subsockets(meta_sk, &mpcb->locaddr6[i],
748
						   rem) == -ENETUNREACH)
749
				retry = rem->retry_bitfield |=
750
					(1 << mpcb->locaddr6[i].id);
751
			goto next_subflow;
752
		}
753
	}
754
#endif
755
756
	if (retry && !delayed_work_pending(&mpcb->subflow_retry_work)) {
757
		sock_hold(meta_sk);
758
		queue_delayed_work(mptcp_wq, &mpcb->subflow_retry_work,
759
				   msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
760
	}
761
762
exit:
763
	release_sock(meta_sk);
764
	mutex_unlock(&mpcb->mutex);
765
	sock_put(meta_sk);
766
}
767
768
void mptcp_create_subflows(struct sock *meta_sk)
769
{
770
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
771
772
	if ((mpcb->master_sk &&
773
	     !tcp_sk(mpcb->master_sk)->mptcp->fully_established) ||
774
	    mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
775
	    mpcb->send_infinite_mapping ||
776
	    mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
777
		return;
778
779
	if (!work_pending(&mpcb->subflow_work)) {
780
		sock_hold(meta_sk);
781
		queue_work(mptcp_wq, &mpcb->subflow_work);
782
	}
783
}
784
785
void mptcp_address_worker(struct work_struct *work)
786
{
787
	struct mptcp_cb *mpcb = container_of(work, struct mptcp_cb, address_work);
788
	struct sock *meta_sk = mpcb->meta_sk, *sk, *tmpsk;
789
	struct net *netns = sock_net(meta_sk);
790
	struct net_device *dev;
791
	int i;
792
793
	mutex_lock(&mpcb->mutex);
794
	lock_sock(meta_sk);
795
796
	if (sock_flag(meta_sk, SOCK_DEAD))
797
		goto exit;
798
799
	/* The following is meant to run with bh disabled */
800
	local_bh_disable();
801
802
	/* First, we iterate over the interfaces to find addresses not yet
803
	 * in our local list.
804
	 */
805
806
	rcu_read_lock();
807
	read_lock_bh(&dev_base_lock);
808
809
	for_each_netdev(netns, dev) {
810
		struct in_device *in_dev = __in_dev_get_rcu(dev);
811
		struct in_ifaddr *ifa;
812
#if IS_ENABLED(CONFIG_IPV6)
813
		struct inet6_dev *in6_dev = __in6_dev_get(dev);
814
		struct inet6_ifaddr *ifa6;
815
#endif
816
817
		if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH))
818
			continue;
819
820
		if (!in_dev)
821
			goto cont_ipv6;
822
823
		for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
824
			unsigned long event;
825
826
			if (!netif_running(in_dev->dev)) {
827
				event = NETDEV_DOWN;
828
			} else {
829
				/* If it's up, it may have changed or come up.
830
				 * We set NETDEV_CHANGE, to take the good
831
				 * code-path in mptcp_pm_addr4_event_handler
832
				 */
833
				event = NETDEV_CHANGE;
834
			}
835
836
			mptcp_pm_addr4_event_handler(ifa, event, mpcb);
837
		}
838
cont_ipv6:
839
; /* This ; is necessary to fix build-errors when IPv6 is disabled */
840
#if IS_ENABLED(CONFIG_IPV6)
841
		if (!in6_dev)
842
			continue;
843
844
		read_lock(&in6_dev->lock);
845
		list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) {
846
			unsigned long event;
847
848
			if (!netif_running(in_dev->dev)) {
849
				event = NETDEV_DOWN;
850
			} else {
851
				/* If it's up, it may have been changed or came up.
852
				 * We set NETDEV_CHANGE, to take the good
853
				 * code-path in mptcp_pm_addr4_event_handler
854
				 */
855
				event = NETDEV_CHANGE;
856
			}
857
858
			mptcp_pm_addr6_event_handler(ifa6, event, mpcb);
859
		}
860
		read_unlock(&in6_dev->lock);
861
#endif
862
	}
863
864
	/* Second, we iterate over our local addresses and check if they
865
	 * still exist in the interface-list.
866
	 */
867
868
	/* MPCB-Local IPv4 Addresses */
869
	mptcp_for_each_bit_set(mpcb->loc4_bits, i) {
870
		int j;
871
872
		for_each_netdev(netns, dev) {
873
			struct in_device *in_dev = __in_dev_get_rcu(dev);
874
			struct in_ifaddr *ifa;
875
876
			if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH) ||
877
			    !in_dev)
878
				continue;
879
880
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
881
				if (ifa->ifa_address == mpcb->locaddr4[i].addr.s_addr &&
882
				    netif_running(dev))
883
					goto next_loc_addr;
884
			}
885
		}
886
887
		/* We did not find the address or the interface became NOMULTIPATH.
888
		 * We thus have to remove it.
889
		 */
890
891
		/* Look for the socket and remove it */
892
		mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
893
			if (sk->sk_family != AF_INET ||
894
			    inet_sk(sk)->inet_saddr != mpcb->locaddr4[i].addr.s_addr)
895
				continue;
896
897
			mptcp_reinject_data(sk, 0);
898
			mptcp_sub_force_close(sk);
899
		}
900
901
		/* Now, remove the address from the local ones */
902
		mpcb->loc4_bits &= ~(1 << i);
903
904
		mpcb->remove_addrs |= (1 << mpcb->locaddr4[i].id);
905
		sk = mptcp_select_ack_sock(meta_sk, 0);
906
		if (sk)
907
			tcp_send_ack(sk);
908
909
		mptcp_for_each_bit_set(mpcb->rem4_bits, j)
910
			mpcb->remaddr4[j].bitfield &= mpcb->loc4_bits;
911
912
next_loc_addr:
913
		continue; /* necessary here due to the previous label */
914
	}
915
916
#if IS_ENABLED(CONFIG_IPV6)
917
	/* MPCB-Local IPv6 Addresses */
918
	mptcp_for_each_bit_set(mpcb->loc6_bits, i) {
919
		int j;
920
921
		for_each_netdev(netns, dev) {
922
			struct inet6_dev *in6_dev = __in6_dev_get(dev);
923
			struct inet6_ifaddr *ifa6;
924
925
			if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH) ||
926
			    !in6_dev)
927
				continue;
928
929
			read_lock(&in6_dev->lock);
930
			list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) {
931
				if (ipv6_addr_equal(&mpcb->locaddr6[i].addr, &ifa6->addr) &&
932
				    netif_running(dev)) {
933
					read_unlock(&in6_dev->lock);
934
					goto next_loc6_addr;
935
				}
936
			}
937
			read_unlock(&in6_dev->lock);
938
		}
939
940
		/* We did not find the address or the interface became NOMULTIPATH.
941
		 * We thus have to remove it.
942
		 */
943
944
		/* Look for the socket and remove it */
945
		mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
946
			if (sk->sk_family != AF_INET6 ||
947
			    !ipv6_addr_equal(&inet6_sk(sk)->saddr, &mpcb->locaddr6[i].addr))
948
				continue;
949
950
			mptcp_reinject_data(sk, 0);
951
			mptcp_sub_force_close(sk);
952
		}
953
954
		/* Now, remove the address from the local ones */
955
		mpcb->loc6_bits &= ~(1 << i);
956
957
		/* Force sending directly the REMOVE_ADDR option */
958
		mpcb->remove_addrs |= (1 << mpcb->locaddr6[i].id);
959
		sk = mptcp_select_ack_sock(meta_sk, 0);
960
		if (sk)
961
			tcp_send_ack(sk);
962
963
		mptcp_for_each_bit_set(mpcb->rem6_bits, j)
964
			mpcb->remaddr6[j].bitfield &= mpcb->loc6_bits;
965
966
next_loc6_addr:
967
		continue; /* necessary here due to the previous label */
968
	}
969
#endif
970
971
	read_unlock_bh(&dev_base_lock);
972
	rcu_read_unlock();
973
974
	local_bh_enable();
975
exit:
976
	release_sock(meta_sk);
977
	mutex_unlock(&mpcb->mutex);
978
	sock_put(meta_sk);
979
}
980
981
static void mptcp_address_create_worker(struct mptcp_cb *mpcb)
982
{
983
	if (!work_pending(&mpcb->address_work)) {
984
		sock_hold(mpcb->meta_sk);
985
		queue_work(mptcp_wq, &mpcb->address_work);
986
	}
987
}
988
989
/**
990
 * React on IPv4+IPv6-addr add/rem-events
991
 */
992
int mptcp_pm_addr_event_handler(unsigned long event, void *ptr, int family)
993
{
994
	struct tcp_sock *meta_tp;
995
	int i;
996
997
	if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
998
	      event == NETDEV_CHANGE))
999
		return NOTIFY_DONE;
1000
1001
	if (sysctl_mptcp_ndiffports > 1)
1002
		return NOTIFY_DONE;
1003
1004
	/* Now we iterate over the mpcb's */
1005
	for (i = 0; i < MPTCP_HASH_SIZE; i++) {
1006
		struct hlist_nulls_node *node;
1007
		rcu_read_lock_bh();
1008
		hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
1009
					       tk_table) {
1010
			struct mptcp_cb *mpcb = meta_tp->mpcb;
1011
			struct sock *meta_sk = (struct sock *)meta_tp;
1012
1013
			if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
1014
				continue;
1015
1016
			if (!meta_tp->mpc || !is_meta_sk(meta_sk) ||
1017
			    mpcb->infinite_mapping_snd ||
1018
			    mpcb->infinite_mapping_rcv ||
1019
			    mpcb->send_infinite_mapping) {
1020
				sock_put(meta_sk);
1021
				continue;
1022
			}
1023
1024
			bh_lock_sock(meta_sk);
1025
			if (sock_owned_by_user(meta_sk)) {
1026
				mptcp_address_create_worker(mpcb);
1027
			} else {
1028
				if (family == AF_INET)
1029
					mptcp_pm_addr4_event_handler(
1030
							(struct in_ifaddr *)ptr, event, mpcb);
1031
#if IS_ENABLED(CONFIG_IPV6)
1032
				else
1033
					mptcp_pm_addr6_event_handler(
1034
							(struct inet6_ifaddr *)ptr, event, mpcb);
1035
#endif
1036
			}
1037
1038
			bh_unlock_sock(meta_sk);
1039
			sock_put(meta_sk);
1040
		}
1041
		rcu_read_unlock_bh();
1042
	}
1043
	return NOTIFY_DONE;
1044
}
1045
1046
#ifdef CONFIG_PROC_FS
1047
1048
/* Output /proc/net/mptcp */
1049
static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
1050
{
1051
	struct tcp_sock *meta_tp;
1052
	struct net *net = seq->private;
1053
	int i, n = 0;
1054
1055
	seq_printf(seq, "  sl  loc_tok  rem_tok  v6 "
1056
		   "local_address                         "
1057
		   "remote_address                        "
1058
		   "st ns tx_queue rx_queue inode");
1059
	seq_putc(seq, '\n');
1060
1061
	for (i = 0; i < MPTCP_HASH_SIZE; i++) {
1062
		struct hlist_nulls_node *node;
1063
		rcu_read_lock_bh();
1064
		hlist_nulls_for_each_entry_rcu(meta_tp, node,
1065
					       &tk_hashtable[i], tk_table) {
1066
			struct mptcp_cb *mpcb = meta_tp->mpcb;
1067
			struct sock *meta_sk = (struct sock *)meta_tp;
1068
			struct inet_sock *isk = inet_sk(meta_sk);
1069
1070
			if (!meta_tp->mpc || !net_eq(net, sock_net(meta_sk)))
1071
				continue;
1072
1073
			seq_printf(seq, "%4d: %04X %04X ", n++,
1074
				   mpcb->mptcp_loc_token,
1075
				   mpcb->mptcp_rem_token);
1076
			if (meta_sk->sk_family == AF_INET ||
1077
			    mptcp_v6_is_v4_mapped(meta_sk)) {
1078
				seq_printf(seq, " 0 %08X:%04X                         %08X:%04X                        ",
1079
					   isk->inet_saddr,
1080
					   ntohs(isk->inet_sport),
1081
					   isk->inet_daddr,
1082
					   ntohs(isk->inet_dport));
1083
#if IS_ENABLED(CONFIG_IPV6)
1084
			} else if (meta_sk->sk_family == AF_INET6) {
1085
				struct in6_addr *src = &isk->pinet6->saddr;
1086
				struct in6_addr *dst = &isk->pinet6->daddr;
1087
				seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
1088
					   src->s6_addr32[0], src->s6_addr32[1],
1089
					   src->s6_addr32[2], src->s6_addr32[3],
1090
					   ntohs(isk->inet_sport),
1091
					   dst->s6_addr32[0], dst->s6_addr32[1],
1092
					   dst->s6_addr32[2], dst->s6_addr32[3],
1093
					   ntohs(isk->inet_dport));
1094
#endif
1095
			}
1096
			seq_printf(seq, " %02X %02X %08X:%08X %lu",
1097
				   meta_sk->sk_state, mpcb->cnt_subflows,
1098
				   meta_tp->write_seq - meta_tp->snd_una,
1099
				   max_t(int, meta_tp->rcv_nxt -
1100
					 meta_tp->copied_seq, 0),
1101
				   sock_i_ino(meta_sk));
1102
			seq_putc(seq, '\n');
1103
		}
1104
		rcu_read_unlock_bh();
1105
	}
1106
1107
	return 0;
1108
}
1109
1110
static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
1111
{
1112
	return single_open_net(inode, file, mptcp_pm_seq_show);
1113
}
1114
1115
static const struct file_operations mptcp_pm_seq_fops = {
1116
	.owner = THIS_MODULE,
1117
	.open = mptcp_pm_seq_open,
1118
	.read = seq_read,
1119
	.llseek = seq_lseek,
1120
	.release = single_release_net,
1121
};
1122
1123
static int mptcp_pm_proc_init_net(struct net *net)
1124
{
1125
	if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
1126
		return -ENOMEM;
1127
1128
	return 0;
1129
}
1130
1131
static void mptcp_pm_proc_exit_net(struct net *net)
1132
{
1133
	remove_proc_entry("mptcp", net->proc_net);
1134
}
1135
1136
static struct pernet_operations mptcp_pm_proc_ops = {
1137
	.init = mptcp_pm_proc_init_net,
1138
	.exit = mptcp_pm_proc_exit_net,
1139
};
1140
#endif
1141
1142
/* General initialization of MPTCP_PM */
1143
int mptcp_pm_init(void)
1144
{
1145
	int i, ret;
1146
	for (i = 0; i < MPTCP_HASH_SIZE; i++) {
1147
		INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
1148
		INIT_LIST_HEAD(&mptcp_reqsk_htb[i]);
1149
		INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
1150
	}
1151
1152
	spin_lock_init(&mptcp_reqsk_hlock);
1153
	spin_lock_init(&mptcp_tk_hashlock);
1154
1155
#ifdef CONFIG_PROC_FS
1156
	ret = register_pernet_subsys(&mptcp_pm_proc_ops);
1157
	if (ret)
1158
		goto out;
1159
#endif
1160
1161
#if IS_ENABLED(CONFIG_IPV6)
1162
	ret = mptcp_pm_v6_init();
1163
	if (ret)
1164
		goto mptcp_pm_v6_failed;
1165
#endif
1166
	ret = mptcp_pm_v4_init();
1167
	if (ret)
1168
		goto mptcp_pm_v4_failed;
1169
1170
out:
1171
	return ret;
1172
1173
mptcp_pm_v4_failed:
1174
#if IS_ENABLED(CONFIG_IPV6)
1175
	mptcp_pm_v6_undo();
1176
1177
mptcp_pm_v6_failed:
1178
#endif
1179
#ifdef CONFIG_PROC_FS
1180
	unregister_pernet_subsys(&mptcp_pm_proc_ops);
1181
#endif
1182
	goto out;
1183
}
1184
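mptcp_pm_init() above unwinds a partially completed initialization with labels in reverse registration order: a failing step jumps to the label that undoes everything set up before it, and the common `out:` path returns the final status. A compact userspace sketch of the same control flow with hypothetical init/undo steps:

#include <stdio.h>

static int init_a(void) { puts("init A"); return 0; }
static void undo_a(void) { puts("undo A"); }
static int init_b(void) { puts("init B"); return 0; }
static void undo_b(void) { puts("undo B"); }
static int init_c(void) { puts("init C"); return -1; /* simulate failure */ }

static int subsystem_init(void)
{
	int ret;

	ret = init_a();
	if (ret)
		goto out;

	ret = init_b();
	if (ret)
		goto b_failed;

	ret = init_c();
	if (ret)
		goto c_failed;

out:
	return ret;

c_failed:
	undo_b();	/* undo in reverse order of registration */
b_failed:
	undo_a();
	goto out;
}

int main(void)
{
	printf("subsystem_init() = %d\n", subsystem_init());
	return 0;
}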
1185
void mptcp_pm_undo(void)
1186
{
1187
#if IS_ENABLED(CONFIG_IPV6)
1188
	mptcp_pm_v6_undo();
1189
#endif
1190
	mptcp_pm_v4_undo();
1191
#ifdef CONFIG_PROC_FS
1192
	unregister_pernet_subsys(&mptcp_pm_proc_ops);
1193
#endif
1194
}
