Gentoo's Bugzilla – Attachment 360162 Details for Bug 477786
sys-kernel/gentoo-sources - add support for MultiPath TCP (mptcp)
[patch] mptcp_against_3.11.patch

Description: mptcp_against_3.11.patch
Filename:    mptcp_patch_against_3.11.patch
MIME Type:   text/plain
Creator:     David Heidelberg (okias)
Created:     2013-10-05 17:10:55 UTC
Size:        450.06 KB
Flags:       patch, obsolete
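
Two orientation notes before the diff itself, both drawn from the patch below. First, it widens the tcp_parse_options() signature with a struct mptcp_options_received * argument, so callers that do not care about MPTCP (such as the cxgb4 driver in the first hunk) simply pass NULL. Second, MPTCP's 64-bit data-sequence space is ordered with a signed-difference comparison (before64()/after64() in include/net/mptcp.h), the same trick classic TCP uses for 32-bit sequence numbers. The following standalone sketch is not part of the patch: it re-implements that comparison in userspace C, with a hypothetical main() harness added for illustration, to show why the ordering stays correct across wraparound.

#include <stdio.h>
#include <stdint.h>

/* Same trick as the patch's before64() in include/net/mptcp.h:
 * interpret the unsigned difference as a signed value, so the
 * ordering is preserved even when the sequence space wraps around.
 */
static inline int before64(const uint64_t seq1, const uint64_t seq2)
{
	return (int64_t)(seq1 - seq2) < 0;
}

/* is seq1 > seq2 ? */
#define after64(seq1, seq2) before64(seq2, seq1)

int main(void)
{
	/* The ordinary case: 100 precedes 200. */
	printf("before64(100, 200)      = %d\n", before64(100, 200));

	/* Wraparound: UINT64_MAX precedes 5, because the unsigned
	 * difference reinterpreted as signed is -6, i.e. negative.
	 */
	printf("before64(UINT64_MAX, 5) = %d\n", before64(UINT64_MAX, 5));
	printf("after64(5, UINT64_MAX)  = %d\n", after64(5, UINT64_MAX));
	return 0;
}

Both prints in the wraparound case yield 1, which is why the patch can use these helpers on data-sequence numbers without ever normalizing them. The patch content follows, quoted line by line as Bugzilla renders it.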
>diff -Naur a/linux-3.11/drivers/infiniband/hw/cxgb4/cm.c b/linux-3.11/drivers/infiniband/hw/cxgb4/cm.c >--- a/linux-3.11/drivers/infiniband/hw/cxgb4/cm.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/drivers/infiniband/hw/cxgb4/cm.c 2013-10-05 18:34:48.288377034 +0200 >@@ -2921,7 +2921,7 @@ > */ > memset(&tmp_opt, 0, sizeof(tmp_opt)); > tcp_clear_options(&tmp_opt); >- tcp_parse_options(skb, &tmp_opt, 0, NULL); >+ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL); > > req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); > memset(req, 0, sizeof(*req)); >diff -Naur a/linux-3.11/include/linux/tcp.h b/linux-3.11/include/linux/tcp.h >--- a/linux-3.11/include/linux/tcp.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/linux/tcp.h 2013-10-05 18:34:48.542373841 +0200 >@@ -72,6 +72,45 @@ > u32 end_seq; > }; > >+struct tcp_out_options { >+ u16 options; /* bit field of OPTION_* */ >+ u8 ws; /* window scale, 0 to disable */ >+ u8 num_sack_blocks;/* number of SACK blocks to include */ >+ u8 hash_size; /* bytes in hash_location */ >+ u16 mss; /* 0 to disable */ >+ __u8 *hash_location; /* temporary pointer, overloaded */ >+ __u32 tsval, tsecr; /* need to include OPTION_TS */ >+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ >+#ifdef CONFIG_MPTCP >+ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */ >+ __sum16 dss_csum; /* Overloaded field: dss-checksum required >+ * (for SYN-packets)? Or dss-csum itself */ >+ >+ __u32 data_seq; /* data sequence number, for MPTCP */ >+ __u32 data_ack; /* data ack, for MPTCP */ >+ >+ union { >+ struct { >+ __u64 sender_key; /* sender's key for mptcp */ >+ __u64 receiver_key; /* receiver's key for mptcp */ >+ } mp_capable; >+ >+ struct { >+ __u64 sender_truncated_mac; >+ __u32 sender_nonce; >+ /* random number of the sender */ >+ __u32 token; /* token for mptcp */ >+ } mp_join_syns; >+ }; >+ >+ struct mptcp_loc4 *addr4;/* v4 addresses for MPTCP */ >+ struct mptcp_loc6 *addr6;/* v6 addresses for MPTCP */ >+ >+ u16 remove_addrs; /* list of address id */ >+ u8 addr_id; /* address id */ >+#endif /* CONFIG_MPTCP */ >+}; >+ > /*These are used to set the sack_ok field in struct tcp_options_received */ > #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ > #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ >@@ -95,6 +134,9 @@ > u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ > }; > >+struct mptcp_cb; >+struct mptcp_tcp_sock; >+ > static inline void tcp_clear_options(struct tcp_options_received *rx_opt) > { > rx_opt->tstamp_ok = rx_opt->sack_ok = 0; >@@ -124,6 +166,7 @@ > * FastOpen it's the seq# > * after data-in-SYN. > */ >+ u8 saw_mpc:1; > }; > > static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) >@@ -320,6 +363,35 @@ > * socket. Used to retransmit SYNACKs etc. > */ > struct request_sock *fastopen_rsk; >+ >+ >+ struct mptcp_cb *mpcb; >+ struct sock *meta_sk; >+ /* We keep these flags even if CONFIG_MPTCP is not checked, because >+ * it allows checking MPTCP capability just by checking the mpc flag, >+ * rather than adding ifdefs everywhere. >+ */ >+ u16 mpc:1, /* Other end is multipath capable */ >+ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */ >+ send_mp_fclose:1, >+ request_mptcp:1, /* Did we send out an MP_CAPABLE? >+ * (this speeds up mptcp_doit() in tcp_recvmsg) >+ */ >+ pf:1, /* Potentially Failed state: when this flag is set, we >+ * stop using the subflow >+ */ >+ mp_killed:1, /* Killed with a tcp_done in mptcp? 
*/ >+ mptcp_add_addr_ack:1, /* Tell tcp_send_ack to return in case >+ * alloc_skb fails. */ >+ was_meta_sk:1, /* This was a meta sk (in case of reuse) */ >+ close_it:1, /* Must close socket in mptcp_data_ready? */ >+ closing:1; >+ struct mptcp_tcp_sock *mptcp; >+#ifdef CONFIG_MPTCP >+ struct hlist_nulls_node tk_table; >+ u32 mptcp_loc_token; >+ u64 mptcp_loc_key; >+#endif /* CONFIG_MPTCP */ > }; > > enum tsq_flags { >@@ -349,6 +421,7 @@ > #ifdef CONFIG_TCP_MD5SIG > struct tcp_md5sig_key *tw_md5_key; > #endif >+ struct mptcp_tw *mptcp_tw; > }; > > static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) >diff -Naur a/linux-3.11/include/net/inet6_connection_sock.h b/linux-3.11/include/net/inet6_connection_sock.h >--- a/linux-3.11/include/net/inet6_connection_sock.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/net/inet6_connection_sock.h 2013-10-05 18:34:48.544373816 +0200 >@@ -25,6 +25,8 @@ > extern int inet6_csk_bind_conflict(const struct sock *sk, > const struct inet_bind_bucket *tb, bool relax); > >+extern u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, >+ const u32 rnd, const u32 synq_hsize); > extern struct dst_entry* inet6_csk_route_req(struct sock *sk, > struct flowi6 *fl6, > const struct request_sock *req); >diff -Naur a/linux-3.11/include/net/inet_common.h b/linux-3.11/include/net/inet_common.h >--- a/linux-3.11/include/net/inet_common.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/net/inet_common.h 2013-10-05 18:34:48.545373803 +0200 >@@ -1,6 +1,8 @@ > #ifndef _INET_COMMON_H > #define _INET_COMMON_H > >+#include <net/sock.h> >+ > extern const struct proto_ops inet_stream_ops; > extern const struct proto_ops inet_dgram_ops; > >@@ -13,6 +15,10 @@ > struct sockaddr; > struct socket; > >+extern int inet_create(struct net *net, struct socket *sock, int protocol, >+ int kern); >+extern int inet6_create(struct net *net, struct socket *sock, int protocol, >+ int kern); > extern int inet_release(struct socket *sock); > extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, > int addr_len, int flags); >diff -Naur a/linux-3.11/include/net/inet_connection_sock.h b/linux-3.11/include/net/inet_connection_sock.h >--- a/linux-3.11/include/net/inet_connection_sock.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/net/inet_connection_sock.h 2013-10-05 18:34:48.545373803 +0200 >@@ -243,6 +243,8 @@ > > extern struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); > >+extern u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, >+ const u32 synq_hsize); > extern struct request_sock *inet_csk_search_req(const struct sock *sk, > struct request_sock ***prevp, > const __be16 rport, >diff -Naur a/linux-3.11/include/net/mptcp.h b/linux-3.11/include/net/mptcp.h >--- a/linux-3.11/include/net/mptcp.h 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/include/net/mptcp.h 2013-10-05 18:34:48.680372106 +0200 >@@ -0,0 +1,1403 @@ >+/* >+ * MPTCP implementation >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru 
<vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. >+ */ >+ >+#ifndef _MPTCP_H >+#define _MPTCP_H >+ >+#include <linux/inetdevice.h> >+#include <linux/ipv6.h> >+#include <linux/list.h> >+#include <linux/net.h> >+#include <linux/skbuff.h> >+#include <linux/socket.h> >+#include <linux/tcp.h> >+#include <linux/kernel.h> >+ >+#include <asm/byteorder.h> >+#include <asm/unaligned.h> >+#include <crypto/hash.h> >+#include <net/mptcp_pm.h> >+#include <net/tcp.h> >+ >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ #define ntohll(x) be64_to_cpu(x) >+ #define htonll(x) cpu_to_be64(x) >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ #define ntohll(x) (x) >+ #define htonll(x) (x) >+#endif >+ >+/* is seq1 < seq2 ? */ >+static inline int before64(const u64 seq1, const u64 seq2) >+{ >+ return (s64)(seq1 - seq2) < 0; >+} >+ >+/* is seq1 > seq2 ? */ >+#define after64(seq1, seq2) before64(seq2, seq1) >+ >+struct mptcp_request_sock { >+ struct tcp_request_sock req; >+ struct mptcp_cb *mpcb; >+ /* Collision list in the tuple hashtable. We need to find >+ * the req sock when receiving the third msg of the 3-way handshake, >+ * since that one does not contain the token. If this makes >+ * the request sock too long, we can use kmalloc'ed specific entries for >+ * that tuple hashtable. At the moment, though, I extend the >+ * request_sock. >+ */ >+ struct list_head collide_tuple; >+ struct hlist_nulls_node collide_tk; >+ u32 mptcp_rem_nonce; >+ u32 mptcp_loc_token; >+ u64 mptcp_loc_key; >+ u64 mptcp_rem_key; >+ u64 mptcp_hash_tmac; >+ u32 mptcp_loc_nonce; >+ __u8 rem_id; /* Address-id in the MP_JOIN */ >+ u8 dss_csum:1, >+ low_prio:1; >+}; >+ >+static inline >+struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req) >+{ >+ return (struct mptcp_request_sock *)req; >+} >+ >+static inline >+struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req) >+{ >+ return (struct request_sock *)req; >+} >+ >+struct mptcp_options_received { >+ u16 saw_mpc:1, >+ dss_csum:1, >+ drop_me:1, >+ >+ is_mp_join:1, >+ join_ack:1, >+ >+ saw_low_prio:2, /* 0x1 - low-prio set for this subflow >+ * 0x2 - low-prio set for another subflow >+ */ >+ low_prio:1, >+ >+ saw_add_addr:2, /* Saw at least one add_addr option: >+ * 0x1: IPv4 - 0x2: IPv6 >+ */ >+ more_add_addr:1, /* Saw one more add-addr. */ >+ >+ saw_rem_addr:1, /* Saw at least one rem_addr option */ >+ more_rem_addr:1, /* Saw one more rem-addr. 
*/ >+ >+ mp_fail:1, >+ mp_fclose:1; >+ u8 rem_id; /* Address-id in the MP_JOIN */ >+ u8 prio_addr_id; /* Address-id in the MP_PRIO */ >+ >+ const unsigned char *add_addr_ptr; /* Pointer to add-address option */ >+ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */ >+ >+ u32 data_ack; >+ u32 data_seq; >+ u16 data_len; >+ >+ u32 mptcp_rem_token;/* Remote token */ >+ >+ /* Key inside the option (from mp_capable or fast_close) */ >+ u64 mptcp_key; >+ >+ u32 mptcp_recv_nonce; >+ u64 mptcp_recv_tmac; >+ u8 mptcp_recv_mac[20]; >+}; >+ >+struct mptcp_tcp_sock { >+ struct tcp_sock *next; /* Next subflow socket */ >+ struct mptcp_options_received rx_opt; >+ >+ /* Those three fields record the current mapping */ >+ u64 map_data_seq; >+ u32 map_subseq; >+ u16 map_data_len; >+ u16 slave_sk:1, >+ nonce_set:1, /* Is the nonce set? (in order to support 0-nonce) */ >+ fully_established:1, >+ establish_increased:1, >+ second_packet:1, >+ attached:1, >+ send_mp_fail:1, >+ include_mpc:1, >+ mapping_present:1, >+ map_data_fin:1, >+ low_prio:1, /* use this socket as backup */ >+ rcv_low_prio:1, /* Peer sent low-prio option to us */ >+ send_mp_prio:1, /* Trigger to send mp_prio on this socket */ >+ pre_established:1; /* State between sending 3rd ACK and >+ * receiving the fourth ack of new subflows. >+ */ >+ >+ /* isn: needed to translate abs to relative subflow seqnums */ >+ u32 snt_isn; >+ u32 rcv_isn; >+ u32 last_data_seq; >+ u8 path_index; >+ u8 add_addr4; /* bit-field of addrs not yet sent to our peer */ >+ u8 add_addr6; >+ u8 rem_id; >+ >+ u32 last_rbuf_opti; /* Timestamp of last rbuf optimization */ >+ unsigned int sent_pkts; >+ >+ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified >+ * skb in the ofo-queue. >+ */ >+ >+ int init_rcv_wnd; >+ u32 infinite_cutoff_seq; >+ struct delayed_work work; >+ u32 mptcp_loc_nonce; >+ struct tcp_sock *tp; /* Where is my daddy? */ >+ u32 last_end_data_seq; >+ >+ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */ >+ struct timer_list mptcp_ack_timer; >+ >+ /* HMAC of the third ack */ >+ char sender_mac[20]; >+}; >+ >+struct mptcp_tw { >+ struct list_head list; >+ u64 loc_key; >+ u64 rcv_nxt; >+ struct mptcp_cb __rcu *mpcb; >+ u8 meta_tw:1, >+ in_list:1; >+}; >+ >+struct mptcp_cb { >+ struct sock *meta_sk; >+ >+ /* list of sockets in this multipath connection */ >+ struct tcp_sock *connection_list; >+ >+ spinlock_t tw_lock; >+ struct list_head tw_list; >+ unsigned char mptw_state; >+ >+ atomic_t refcnt; >+ >+ /* High-order bits of 64-bit sequence numbers */ >+ u32 snd_high_order[2]; >+ u32 rcv_high_order[2]; >+ >+ u16 send_infinite_mapping:1, >+ in_time_wait:1, >+ list_rcvd:1, /* XXX TO REMOVE */ >+ dss_csum:1, >+ server_side:1, >+ infinite_mapping_rcv:1, >+ infinite_mapping_snd:1, >+ dfin_combined:1, /* Was the DFIN combined with subflow-fin? 
*/ >+ passive_close:1, >+ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */ >+ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */ >+ >+ /* socket count in this connection */ >+ u8 cnt_subflows; >+ u8 cnt_established; >+ >+ u32 noneligible; /* Path mask of temporarily non >+ * eligible subflows by the scheduler >+ */ >+ >+ struct sk_buff_head reinject_queue; >+ >+ u16 remove_addrs; >+ >+ u8 dfin_path_index; >+ /* Worker struct for subflow establishment */ >+ struct work_struct subflow_work; >+ struct delayed_work subflow_retry_work; >+ /* Worker to handle interface/address changes if socket is owned */ >+ struct work_struct address_work; >+ /* Mutex needed, because otherwise mptcp_close will complain that the >+ * socket is owned by the user. >+ * E.g., mptcp_sub_close_wq is taking the meta-lock. >+ */ >+ struct mutex mutex; >+ >+ /* Master socket, also part of the connection_list, this >+ * socket is the one that the application sees. >+ */ >+ struct sock *master_sk; >+ >+ u64 csum_cutoff_seq; >+ >+ __u64 mptcp_loc_key; >+ __u32 mptcp_loc_token; >+ __u64 mptcp_rem_key; >+ __u32 mptcp_rem_token; >+ >+ /* Create a new subflow - necessary because the meta-sk may be IPv4, but >+ * the new subflow can be IPv6 >+ */ >+ struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req, >+ struct dst_entry *dst); >+ >+ /* Local addresses */ >+ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; >+ u8 loc4_bits; /* Bitfield indicating which of the above addrs are set */ >+ u8 next_v4_index; >+ >+ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; >+ u8 loc6_bits; >+ u8 next_v6_index; >+ >+ /* Remove addresses */ >+ struct mptcp_rem4 remaddr4[MPTCP_MAX_ADDR]; >+ u8 rem4_bits; >+ >+ struct mptcp_rem6 remaddr6[MPTCP_MAX_ADDR]; >+ u8 rem6_bits; >+ >+ u32 path_index_bits; >+ /* Next pi to pick up in case a new path becomes available */ >+ u8 next_path_index; >+ >+ /* Original snd/rcvbuf of the initial subflow. >+ * Used for the new subflows on the server-side to allow correct >+ * autotuning >+ */ >+ int orig_sk_rcvbuf; >+ int orig_sk_sndbuf; >+ u32 orig_window_clamp; >+}; >+ >+static inline int mptcp_pi_to_flag(int pi) >+{ >+ return 1 << (pi - 1); >+} >+ >+#define MPTCP_SUB_CAPABLE 0 >+#define MPTCP_SUB_LEN_CAPABLE_SYN 12 >+#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12 >+#define MPTCP_SUB_LEN_CAPABLE_ACK 20 >+#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20 >+ >+#define MPTCP_SUB_JOIN 1 >+#define MPTCP_SUB_LEN_JOIN_SYN 12 >+#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12 >+#define MPTCP_SUB_LEN_JOIN_SYNACK 16 >+#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16 >+#define MPTCP_SUB_LEN_JOIN_ACK 24 >+#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24 >+ >+#define MPTCP_SUB_DSS 2 >+#define MPTCP_SUB_LEN_DSS 4 >+#define MPTCP_SUB_LEN_DSS_ALIGN 4 >+ >+/* Lengths for seq and ack are the ones without the generic MPTCP-option header, >+ * as they are part of the DSS-option. >+ * To get the total length, just add the different options together. >+ */ >+#define MPTCP_SUB_LEN_SEQ 10 >+#define MPTCP_SUB_LEN_SEQ_CSUM 12 >+#define MPTCP_SUB_LEN_SEQ_ALIGN 12 >+ >+#define MPTCP_SUB_LEN_SEQ_64 14 >+#define MPTCP_SUB_LEN_SEQ_CSUM_64 16 >+#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16 >+ >+#define MPTCP_SUB_LEN_ACK 4 >+#define MPTCP_SUB_LEN_ACK_ALIGN 4 >+ >+#define MPTCP_SUB_LEN_ACK_64 8 >+#define MPTCP_SUB_LEN_ACK_64_ALIGN 8 >+ >+/* This is the "default" option-length we will send out most often. 
>+ * MPTCP DSS-header >+ * 32-bit data sequence number >+ * 32-bit data ack >+ * >+ * It is necessary to calculate the effective MSS we will be using when >+ * sending data. >+ */ >+#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \ >+ MPTCP_SUB_LEN_SEQ_ALIGN + \ >+ MPTCP_SUB_LEN_ACK_ALIGN) >+ >+#define MPTCP_SUB_ADD_ADDR 3 >+#define MPTCP_SUB_LEN_ADD_ADDR4 8 >+#define MPTCP_SUB_LEN_ADD_ADDR6 20 >+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8 >+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20 >+ >+#define MPTCP_SUB_REMOVE_ADDR 4 >+#define MPTCP_SUB_LEN_REMOVE_ADDR 4 >+ >+#define MPTCP_SUB_PRIO 5 >+#define MPTCP_SUB_LEN_PRIO 3 >+#define MPTCP_SUB_LEN_PRIO_ADDR 4 >+#define MPTCP_SUB_LEN_PRIO_ALIGN 4 >+ >+#define MPTCP_SUB_FAIL 6 >+#define MPTCP_SUB_LEN_FAIL 12 >+#define MPTCP_SUB_LEN_FAIL_ALIGN 12 >+ >+#define MPTCP_SUB_FCLOSE 7 >+#define MPTCP_SUB_LEN_FCLOSE 12 >+#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12 >+ >+ >+#define OPTION_MPTCP (1 << 5) >+ >+#ifdef CONFIG_MPTCP >+ >+/* MPTCP options */ >+#define OPTION_TYPE_SYN (1 << 0) >+#define OPTION_TYPE_SYNACK (1 << 1) >+#define OPTION_TYPE_ACK (1 << 2) >+#define OPTION_MP_CAPABLE (1 << 3) >+#define OPTION_DATA_ACK (1 << 4) >+#define OPTION_ADD_ADDR (1 << 5) >+#define OPTION_MP_JOIN (1 << 6) >+#define OPTION_MP_FAIL (1 << 7) >+#define OPTION_MP_FCLOSE (1 << 8) >+#define OPTION_REMOVE_ADDR (1 << 9) >+#define OPTION_MP_PRIO (1 << 10) >+ >+/* Used for checking if the mptcp initialization has been successful */ >+extern bool mptcp_init_failed; >+ >+struct mptcp_option { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u8 ver:4, >+ sub:4; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u8 sub:4, >+ ver:4; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+}; >+ >+struct mp_capable { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u8 ver:4, >+ sub:4; >+ __u8 h:1, >+ rsv:5, >+ b:1, >+ a:1; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u8 sub:4, >+ ver:4; >+ __u8 a:1, >+ b:1, >+ rsv:5, >+ h:1; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+ __u64 sender_key; >+ __u64 receiver_key; >+} __attribute__((__packed__)); >+ >+struct mp_join { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u8 b:1, >+ rsv:3, >+ sub:4; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u8 sub:4, >+ rsv:3, >+ b:1; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+ __u8 addr_id; >+ union { >+ struct { >+ u32 token; >+ u32 nonce; >+ } syn; >+ struct { >+ __u64 mac; >+ u32 nonce; >+ } synack; >+ struct { >+ __u8 mac[20]; >+ } ack; >+ } u; >+} __attribute__((__packed__)); >+ >+struct mp_dss { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u16 rsv1:4, >+ sub:4, >+ A:1, >+ a:1, >+ M:1, >+ m:1, >+ F:1, >+ rsv2:3; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u16 sub:4, >+ rsv1:4, >+ rsv2:3, >+ F:1, >+ m:1, >+ M:1, >+ a:1, >+ A:1; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+}; >+ >+struct mp_add_addr { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u8 ipver:4, >+ sub:4; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u8 sub:4, >+ ipver:4; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+ __u8 addr_id; >+ union { >+ struct { >+ struct in_addr addr; >+ __be16 port; >+ } v4; >+ struct { >+ struct in6_addr addr; >+ __be16 port; >+ } v6; >+ } u; >+} __attribute__((__packed__)); >+ >+struct mp_remove_addr { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u8 rsv:4, >+ sub:4; >+#elif 
defined(__BIG_ENDIAN_BITFIELD) >+ __u8 sub:4, >+ rsv:4; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+ /* list of addr_id */ >+ __u8 addrs_id; >+}; >+ >+struct mp_fail { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u16 rsv1:4, >+ sub:4, >+ rsv2:8; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u16 sub:4, >+ rsv1:4, >+ rsv2:8; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+ __be64 data_seq; >+} __attribute__((__packed__)); >+ >+struct mp_fclose { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u16 rsv1:4, >+ sub:4, >+ rsv2:8; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u16 sub:4, >+ rsv1:4, >+ rsv2:8; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+ __u64 key; >+} __attribute__((__packed__)); >+ >+struct mp_prio { >+ __u8 kind; >+ __u8 len; >+#if defined(__LITTLE_ENDIAN_BITFIELD) >+ __u8 b:1, >+ rsv:3, >+ sub:4; >+#elif defined(__BIG_ENDIAN_BITFIELD) >+ __u8 sub:4, >+ rsv:3, >+ b:1; >+#else >+#error "Adjust your <asm/byteorder.h> defines" >+#endif >+ __u8 addr_id; >+} __attribute__((__packed__)); >+ >+static inline int mptcp_sub_len_remove_addr(u16 bitfield) >+{ >+ unsigned int c; >+ for (c = 0; bitfield; c++) >+ bitfield &= bitfield - 1; >+ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1; >+} >+ >+static inline int mptcp_sub_len_remove_addr_align(u16 bitfield) >+{ >+ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4); >+} >+ >+static inline int mptcp_sub_len_dss(struct mp_dss *m, int csum) >+{ >+ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2); >+} >+ >+/* Default MSS for MPTCP >+ * All subflows will be using that MSS. If any subflow has a lower MSS, it is >+ * just not used. */ >+#define MPTCP_MSS 1400 >+#define MPTCP_SYN_RETRIES 3 >+extern int sysctl_mptcp_ndiffports; >+extern int sysctl_mptcp_enabled; >+extern int sysctl_mptcp_checksum; >+extern int sysctl_mptcp_debug; >+extern int sysctl_mptcp_syn_retries; >+ >+extern struct workqueue_struct *mptcp_wq; >+ >+#define mptcp_debug(fmt, args...) \ >+ do { \ >+ if (unlikely(sysctl_mptcp_debug)) \ >+ pr_err(__FILE__ ": " fmt, ##args); \ >+ } while (0) >+ >+/* Iterates over all subflows */ >+#define mptcp_for_each_tp(mpcb, tp) \ >+ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next) >+ >+#define mptcp_for_each_sk(mpcb, sk) \ >+ for ((sk) = (struct sock *)(mpcb)->connection_list; \ >+ sk; \ >+ sk = (struct sock *)tcp_sk(sk)->mptcp->next) >+ >+#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \ >+ for (__sk = (struct sock *)(__mpcb)->connection_list, \ >+ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \ >+ __sk; \ >+ __sk = __temp, \ >+ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL) >+ >+/* Iterates over all bit set to 1 in a bitset */ >+#define mptcp_for_each_bit_set(b, i) \ >+ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1) >+ >+#define mptcp_for_each_bit_unset(b, i) \ >+ mptcp_for_each_bit_set(~b, i) >+ >+extern struct lock_class_key meta_key; >+extern struct lock_class_key meta_slock_key; >+extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4]; >+ >+/* This is needed to ensure that two subsequent key-generation result in >+ * different keys if the IPs and ports are the same. 
>+ */ >+extern u32 mptcp_key_seed; >+ >+void mptcp_data_ready(struct sock *sk, int bytes); >+void mptcp_write_space(struct sock *sk); >+ >+void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb, >+ struct sock *sk); >+void mptcp_ofo_queue(struct sock *meta_sk); >+void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp); >+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied); >+int mptcp_alloc_mpcb(struct sock *master_sk, __u64 remote_key, u32 window); >+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 rem_id, gfp_t flags); >+void mptcp_del_sock(struct sock *sk); >+void mptcp_update_metasocket(struct sock *sock, struct sock *meta_sk); >+void mptcp_reinject_data(struct sock *orig_sk, int clone_it); >+void mptcp_update_sndbuf(struct mptcp_cb *mpcb); >+struct sk_buff *mptcp_next_segment(struct sock *sk, int *reinject); >+void mptcp_send_fin(struct sock *meta_sk); >+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority); >+int mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, >+ int push_one, gfp_t gfp); >+void mptcp_parse_options(const uint8_t *ptr, int opsize, >+ struct tcp_options_received *opt_rx, >+ struct mptcp_options_received *mopt, >+ const struct sk_buff *skb); >+void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts, >+ unsigned *remaining); >+void mptcp_synack_options(struct request_sock *req, >+ struct tcp_out_options *opts, >+ unsigned *remaining); >+void mptcp_established_options(struct sock *sk, struct sk_buff *skb, >+ struct tcp_out_options *opts, unsigned *size); >+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, >+ struct tcp_out_options *opts, >+ struct sk_buff *skb); >+void mptcp_close(struct sock *meta_sk, long timeout); >+int mptcp_doit(struct sock *sk); >+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window); >+int mptcp_check_req_master(struct sock *sk, struct sock *child, >+ struct request_sock *req, >+ struct request_sock **prev, >+ struct mptcp_options_received *mopt); >+struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child, >+ struct request_sock *req, >+ struct request_sock **prev, >+ struct mptcp_options_received *mopt); >+u32 __mptcp_select_window(struct sock *sk); >+void mptcp_select_initial_window(int *__space, __u32 *window_clamp, >+ const struct sock *sk); >+unsigned int mptcp_current_mss(struct sock *meta_sk); >+int mptcp_select_size(const struct sock *meta_sk, bool sg); >+void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn); >+void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2, >+ u32 *hash_out); >+void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk); >+void mptcp_fin(struct sock *meta_sk); >+void mptcp_retransmit_timer(struct sock *meta_sk); >+int mptcp_write_wakeup(struct sock *meta_sk); >+void mptcp_sub_close_wq(struct work_struct *work); >+void mptcp_sub_close(struct sock *sk, unsigned long delay); >+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied); >+void mptcp_fallback_meta_sk(struct sock *meta_sk); >+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb); >+struct sock *mptcp_sk_clone(const struct sock *sk, int family, const gfp_t priority); >+void mptcp_ack_handler(unsigned long); >+void mptcp_set_keepalive(struct sock *sk, int val); >+int mptcp_check_rtt(const struct tcp_sock *tp, int time); >+int mptcp_check_snd_buf(const struct tcp_sock *tp); >+int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb); >+void __init 
mptcp_init(void); >+int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len); >+int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, >+ unsigned int mss_now, int reinject); >+int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, >+ unsigned int mss_now, gfp_t gfp, int reinject); >+void mptcp_destroy_sock(struct sock *sk); >+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, >+ struct sk_buff *skb, >+ struct mptcp_options_received *mopt); >+unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now, >+ int large_allowed); >+int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw); >+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw); >+void mptcp_update_tw_socks(const struct tcp_sock *tp, int state); >+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb); >+ >+static inline bool mptcp_can_sendpage(struct sock *sk) >+{ >+ struct sock *sk_it; >+ >+ if (tcp_sk(sk)->mpcb->dss_csum) >+ return false; >+ >+ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) { >+ if (!(sk_it->sk_route_caps & NETIF_F_SG) || >+ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM)) >+ return false; >+ } >+ >+ return true; >+} >+ >+static inline void mptcp_push_pending_frames(struct sock *meta_sk) >+{ >+ if (mptcp_next_segment(meta_sk, NULL)) { >+ struct tcp_sock *tp = tcp_sk(meta_sk); >+ >+ /* We don't care about the MSS, because it will be set in >+ * mptcp_write_xmit. >+ */ >+ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle); >+ } >+} >+ >+static inline void mptcp_sub_force_close(struct sock *sk) >+{ >+ /* The below tcp_done may have freed the socket, if he is already dead. >+ * Thus, we are not allowed to access it afterwards. That's why >+ * we have to store the dead-state in this local variable. >+ */ >+ int sock_is_dead = sock_flag(sk, SOCK_DEAD); >+ >+ tcp_sk(sk)->mp_killed = 1; >+ >+ if (sk->sk_state != TCP_CLOSE) >+ tcp_done(sk); >+ >+ if (!sock_is_dead) >+ mptcp_sub_close(sk, 0); >+} >+ >+static inline void mptcp_send_reset(struct sock *sk) >+{ >+ tcp_send_active_reset(sk, GFP_ATOMIC); >+ mptcp_sub_force_close(sk); >+} >+ >+static inline int mptcp_is_data_seq(const struct sk_buff *skb) >+{ >+ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ; >+} >+ >+static inline int mptcp_is_data_fin(const struct sk_buff *skb) >+{ >+ return mptcp_is_data_seq(skb) && >+ (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN); >+} >+ >+/* Is it a data-fin while in infinite mapping mode? >+ * In infinite mode, a subflow-fin is in fact a data-fin. >+ */ >+static inline int mptcp_is_data_fin2(const struct sk_buff *skb, >+ const struct tcp_sock *tp) >+{ >+ return mptcp_is_data_fin(skb) || >+ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin); >+} >+ >+static inline void mptcp_skb_entail_init(const struct tcp_sock *tp, >+ struct sk_buff *skb) >+{ >+ if (tp->mpc) >+ TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_SEQ; >+} >+ >+static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb) >+{ >+ u64 data_seq_high = (u32)(data_seq >> 32); >+ >+ if (mpcb->rcv_high_order[0] == data_seq_high) >+ return 0; >+ else if (mpcb->rcv_high_order[1] == data_seq_high) >+ return MPTCPHDR_SEQ64_INDEX; >+ else >+ return MPTCPHDR_SEQ64_OFO; >+} >+ >+/* Sets the data_seq and returns pointer to the in-skb field of the data_seq. >+ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits. 
>+ */ >+static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, >+ u32 *data_seq, >+ struct mptcp_cb *mpcb) >+{ >+ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off); >+ >+ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) { >+ u64 data_seq64 = get_unaligned_be64(ptr); >+ >+ if (mpcb) >+ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb); >+ >+ *data_seq = (u32)data_seq64 ; >+ ptr++; >+ } else { >+ *data_seq = get_unaligned_be32(ptr); >+ } >+ >+ return ptr; >+} >+ >+static inline struct sock *mptcp_meta_sk(const struct sock *sk) >+{ >+ return tcp_sk(sk)->meta_sk; >+} >+ >+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) >+{ >+ return tcp_sk(tp->meta_sk); >+} >+ >+static inline int is_meta_tp(const struct tcp_sock *tp) >+{ >+ return tp->mpcb && mptcp_meta_tp(tp) == tp; >+} >+ >+static inline int is_meta_sk(const struct sock *sk) >+{ >+ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP && >+ tcp_sk(sk)->mpc && mptcp_meta_sk(sk) == sk; >+} >+ >+static inline int is_master_tp(const struct tcp_sock *tp) >+{ >+ return !tp->mpc || (!tp->mptcp->slave_sk && !is_meta_tp(tp)); >+} >+ >+static inline void mptcp_hash_request_remove(struct request_sock *req) >+{ >+ int in_softirq = 0; >+ >+ if (list_empty(&mptcp_rsk(req)->collide_tuple)) >+ return; >+ >+ if (in_softirq()) { >+ spin_lock(&mptcp_reqsk_hlock); >+ in_softirq = 1; >+ } else { >+ spin_lock_bh(&mptcp_reqsk_hlock); >+ } >+ >+ list_del(&mptcp_rsk(req)->collide_tuple); >+ >+ if (in_softirq) >+ spin_unlock(&mptcp_reqsk_hlock); >+ else >+ spin_unlock_bh(&mptcp_reqsk_hlock); >+} >+ >+static inline void mptcp_reqsk_destructor(struct request_sock *req) >+{ >+ if (!mptcp_rsk(req)->mpcb) { >+ if (hlist_nulls_unhashed(&mptcp_rsk(req)->collide_tk)) >+ return; >+ >+ if (in_softirq()) { >+ mptcp_reqsk_remove_tk(req); >+ } else { >+ rcu_read_lock_bh(); >+ spin_lock(&mptcp_tk_hashlock); >+ hlist_nulls_del_rcu(&mptcp_rsk(req)->collide_tk); >+ spin_unlock(&mptcp_tk_hashlock); >+ rcu_read_unlock_bh(); >+ } >+ } else { >+ mptcp_hash_request_remove(req); >+ } >+} >+ >+static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt) >+{ >+ mopt->saw_mpc = 0; >+ mopt->dss_csum = 0; >+ mopt->drop_me = 0; >+ >+ mopt->is_mp_join = 0; >+ mopt->join_ack = 0; >+ >+ mopt->saw_low_prio = 0; >+ mopt->low_prio = 0; >+ >+ mopt->saw_add_addr = 0; >+ mopt->more_add_addr = 0; >+ >+ mopt->saw_rem_addr = 0; >+ mopt->more_rem_addr = 0; >+ >+ mopt->mp_fail = 0; >+ mopt->mp_fclose = 0; >+} >+ >+static inline void mptcp_reset_mopt(struct tcp_sock *tp) >+{ >+ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; >+ >+ mopt->saw_low_prio = 0; >+ mopt->saw_add_addr = 0; >+ mopt->more_add_addr = 0; >+ mopt->saw_rem_addr = 0; >+ mopt->more_rem_addr = 0; >+ mopt->join_ack = 0; >+ mopt->mp_fail = 0; >+ mopt->mp_fclose = 0; >+} >+ >+static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, >+ const struct mptcp_cb *mpcb) >+{ >+ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags & >+ MPTCPHDR_SEQ64_INDEX) ? 
1 : 0]); >+} >+ >+static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, >+ u32 data_seq_32) >+{ >+ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32; >+} >+ >+static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp) >+{ >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, >+ meta_tp->rcv_nxt); >+} >+ >+static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc) >+{ >+ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) { >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; >+ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2; >+ } >+} >+ >+static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, >+ u32 old_rcv_nxt) >+{ >+ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) { >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2; >+ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1; >+ } >+} >+ >+static inline int mptcp_sk_can_send(const struct sock *sk) >+{ >+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT); >+} >+ >+static inline int mptcp_sk_can_recv(const struct sock *sk) >+{ >+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCP_FIN_WAIT1 | TCP_FIN_WAIT2); >+} >+ >+static inline int mptcp_sk_can_send_ack(const struct sock *sk) >+{ >+ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | >+ TCPF_CLOSE | TCPF_LISTEN)); >+} >+ >+/* Only support GSO if all subflows supports it */ >+static inline bool mptcp_sk_can_gso(const struct sock *meta_sk) >+{ >+ struct sock *sk; >+ >+ if (tcp_sk(meta_sk)->mpcb->dss_csum) >+ return 0; >+ >+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) >+ if (!sk_can_gso(sk)) >+ return false; >+ return true; >+} >+ >+static inline bool mptcp_can_sg(const struct sock *meta_sk) >+{ >+ struct sock *sk; >+ >+ if (tcp_sk(meta_sk)->mpcb->dss_csum) >+ return 0; >+ >+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) >+ if (!(sk->sk_route_caps & NETIF_F_SG)) >+ return false; >+ return true; >+} >+ >+/* Adding a new subflow to the rcv-buffer space. We make a simple addition, >+ * to give some space to allow traffic on the new subflow. Autotuning will >+ * increase it further later on. >+ */ >+static inline void mptcp_init_buffer_space(struct sock *sk) >+{ >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ int space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]); >+ >+ if (space > meta_sk->sk_rcvbuf) { >+ tcp_sk(meta_sk)->window_clamp += tcp_sk(sk)->window_clamp; >+ tcp_sk(meta_sk)->rcv_ssthresh += tcp_sk(sk)->rcv_ssthresh; >+ meta_sk->sk_rcvbuf = space; >+ } >+} >+ >+static inline void mptcp_set_rto(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct sock *sk_it; >+ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk)); >+ __u32 max_rto = 0; >+ >+ if (!tp->mpc) >+ return; >+ >+ /* We are in recovery-phase on the MPTCP-level. Do not update the >+ * RTO, because this would kill exponential backoff. 
>+ */ >+ if (micsk->icsk_retransmits) >+ return; >+ >+ mptcp_for_each_sk(tp->mpcb, sk_it) { >+ if (mptcp_sk_can_send(sk_it) && >+ inet_csk(sk_it)->icsk_rto > max_rto) >+ max_rto = inet_csk(sk_it)->icsk_rto; >+ } >+ if (max_rto) { >+ micsk->icsk_rto = max_rto << 1; >+ >+ /* A successfull rto-measurement - reset backoff counter */ >+ micsk->icsk_backoff = 0; >+ } >+} >+ >+static inline int mptcp_sysctl_syn_retries(void) >+{ >+ return sysctl_mptcp_syn_retries; >+} >+ >+static inline void mptcp_sub_close_passive(struct sock *sk) >+{ >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk); >+ >+ /* Only close, if the app did a send-shutdown (passive close), and we >+ * received the data-ack of the data-fin. >+ */ >+ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq) >+ mptcp_sub_close(sk, 0); >+} >+ >+static inline int mptcp_fallback_infinite(struct sock *sk, int flag) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ /* If data has been acknowleged on the meta-level, fully_established >+ * will have been set before and thus we will not fall back to infinite >+ * mapping. >+ */ >+ if (likely(tp->mptcp->fully_established)) >+ return 0; >+ >+ if (!(flag & MPTCP_FLAG_DATA_ACKED)) >+ return 0; >+ >+ /* Don't fallback twice ;) */ >+ if (tp->mpcb->infinite_mapping_snd) >+ return 0; >+ >+ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n", >+ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index, >+ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr, >+ __builtin_return_address(0)); >+ if (!is_master_tp(tp)) >+ return MPTCP_FLAG_SEND_RESET; >+ >+ tp->mpcb->infinite_mapping_snd = 1; >+ tp->mpcb->infinite_mapping_rcv = 1; >+ tp->mptcp->fully_established = 1; >+ >+ return 0; >+} >+ >+/* Find the first free index in the bitfield */ >+static inline int __mptcp_find_free_index(u8 bitfield, int j, u8 base) >+{ >+ int i; >+ mptcp_for_each_bit_unset(bitfield >> base, i) { >+ /* We wrapped at the bitfield - try from 0 on */ >+ if (i + base >= sizeof(bitfield) * 8) { >+ mptcp_for_each_bit_unset(bitfield, i) { >+ if (i != j) >+ return i; >+ } >+ goto exit; >+ } >+ if (i + base != j) >+ return i + base; >+ } >+exit: >+ return -1; >+} >+ >+static inline int mptcp_find_free_index(u8 bitfield) >+{ >+ return __mptcp_find_free_index(bitfield, -1, 0); >+} >+ >+/* Find the first index whose bit in the bit-field == 0 */ >+static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb) >+{ >+ u8 base = mpcb->next_path_index; >+ int i; >+ >+ /* Start at 1, because 0 is reserved for the meta-sk */ >+ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) { >+ if (i + base < 1) >+ continue; >+ if (i + base >= sizeof(mpcb->path_index_bits) * 8) >+ break; >+ i += base; >+ mpcb->path_index_bits |= (1 << i); >+ mpcb->next_path_index = i + 1; >+ return i; >+ } >+ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) { >+ if (i < 1) >+ continue; >+ mpcb->path_index_bits |= (1 << i); >+ mpcb->next_path_index = i + 1; >+ return i; >+ } >+ >+ return 0; >+} >+ >+static inline int mptcp_v6_is_v4_mapped(struct sock *sk) >+{ >+ return sk->sk_family == AF_INET6 && >+ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED; >+} >+#else /* CONFIG_MPTCP */ >+#define mptcp_debug(fmt, args...) \ >+ do { \ >+ } while (0) >+ >+/* Without MPTCP, we just do one iteration >+ * over the only socket available. This assumes that >+ * the sk/tp arg is the socket in that case. 
>+ */ >+#define mptcp_for_each_sk(mpcb, sk) >+#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) >+ >+static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, >+ u32 *data_seq, >+ struct mptcp_cb *mpcb) >+{ >+ return 0; >+} >+static inline int mptcp_is_data_fin(const struct sk_buff *skb) >+{ >+ return 0; >+} >+static inline int mptcp_is_data_seq(const struct sk_buff *skb) >+{ >+ return 0; >+} >+static inline struct sock *mptcp_meta_sk(const struct sock *sk) >+{ >+ return NULL; >+} >+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) >+{ >+ return NULL; >+} >+static inline int is_meta_sk(const struct sock *sk) >+{ >+ return 0; >+} >+static inline int is_master_tp(const struct tcp_sock *tp) >+{ >+ return 0; >+} >+static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {} >+static inline void mptcp_cleanup_rbuf(const struct sock *meta_sk, int copied) {} >+static inline void mptcp_del_sock(const struct sock *sk) {} >+static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {} >+static inline void mptcp_init_buffer_space(const struct sock *sk) {} >+static inline void mptcp_update_sndbuf(const struct mptcp_cb *mpcb) {} >+static inline void mptcp_skb_entail_init(const struct tcp_sock *tp, >+ const struct sk_buff *skb) {} >+static inline struct sk_buff *mptcp_next_segment(const struct sock *sk, >+ const int *reinject) >+{ >+ return NULL; >+} >+static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb, >+ const struct sock *sk) {} >+static inline void mptcp_retransmit_timer(const struct sock *meta_sk) {} >+static inline int mptcp_write_wakeup(struct sock *meta_sk) >+{ >+ return 0; >+} >+static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {} >+static inline void mptcp_set_rto(const struct sock *sk) {} >+static inline void mptcp_send_fin(const struct sock *meta_sk) {} >+static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize, >+ const struct tcp_options_received *opt_rx, >+ const struct mptcp_options_received *mopt, >+ const struct sk_buff *skb) {} >+static inline void mptcp_syn_options(struct sock *sk, >+ struct tcp_out_options *opts, >+ unsigned *remaining) {} >+static inline void mptcp_synack_options(struct request_sock *req, >+ struct tcp_out_options *opts, >+ unsigned *remaining) {} >+ >+static inline void mptcp_established_options(struct sock *sk, >+ struct sk_buff *skb, >+ struct tcp_out_options *opts, >+ unsigned *size) {} >+static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, >+ struct tcp_out_options *opts, >+ struct sk_buff *skb) {} >+static inline void mptcp_close(struct sock *meta_sk, long timeout) {} >+static inline int mptcp_doit(struct sock *sk) >+{ >+ return 0; >+} >+static inline int mptcp_check_req_master(const struct sock *sk, >+ const struct sock *child, >+ struct request_sock *req, >+ struct request_sock **prev, >+ const struct mptcp_options_received *mopt) >+{ >+ return 1; >+} >+static inline struct sock *mptcp_check_req_child(struct sock *sk, >+ struct sock *child, >+ struct request_sock *req, >+ struct request_sock **prev, >+ struct mptcp_options_received *mopt) >+{ >+ return NULL; >+} >+static inline u32 __mptcp_select_window(const struct sock *sk) >+{ >+ return 0; >+} >+static inline void mptcp_select_initial_window(int *__space, >+ __u32 *window_clamp, >+ const struct sock *sk) {} >+static inline unsigned int mptcp_current_mss(struct sock *meta_sk) >+{ >+ return 0; >+} >+static inline int mptcp_select_size(const struct sock *meta_sk, 
bool sg) >+{ >+ return 0; >+} >+static inline void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) {} >+static inline void mptcp_sub_close_passive(struct sock *sk) {} >+static inline int mptcp_fallback_infinite(const struct sock *sk, int flag) >+{ >+ return 0; >+} >+static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {} >+static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time) >+{ >+ return 0; >+} >+static inline int mptcp_check_snd_buf(const struct tcp_sock *tp) >+{ >+ return 0; >+} >+static inline int mptcp_sysctl_syn_retries(void) >+{ >+ return 0; >+} >+static inline void mptcp_send_reset(const struct sock *sk) {} >+static inline void mptcp_send_active_reset(struct sock *meta_sk, >+ gfp_t priority) {} >+static inline int mptcp_write_xmit(struct sock *sk, unsigned int mss_now, >+ int nonagle, int push_one, gfp_t gfp) >+{ >+ return 0; >+} >+static inline struct sock *mptcp_sk_clone(const struct sock *sk, >+ int family, int priority) >+{ >+ return NULL; >+} >+static inline void mptcp_set_keepalive(struct sock *sk, int val) {} >+static inline int mptcp_handle_options(struct sock *sk, >+ const struct tcphdr *th, >+ struct sk_buff *skb) >+{ >+ return 0; >+} >+static inline void mptcp_reset_mopt(struct tcp_sock *tp) {} >+static inline void __init mptcp_init(void) {} >+static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) >+{ >+ return 0; >+} >+static inline int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, >+ unsigned int mss_now, int reinject) >+{ >+ return 0; >+} >+static inline int mptso_fragment(struct sock *sk, struct sk_buff *skb, >+ unsigned int len, unsigned int mss_now, >+ gfp_t gfp, int reinject) >+{ >+ return 0; >+} >+static inline bool mptcp_sk_can_gso(const struct sock *sk) >+{ >+ return false; >+} >+static inline bool mptcp_can_sg(const struct sock *meta_sk) >+{ >+ return false; >+} >+static inline unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, >+ u32 mss_now, int large_allowed) >+{ >+ return 0; >+} >+static inline void mptcp_destroy_sock(struct sock *sk) {} >+static inline int mptcp_rcv_synsent_state_process(struct sock *sk, >+ struct sock **skptr, >+ struct sk_buff *skb, >+ struct mptcp_options_received *mopt) >+{ >+ return 0; >+} >+static inline bool mptcp_can_sendpage(struct sock *sk) >+{ >+ return false; >+} >+static inline int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw) >+{ >+ return 0; >+} >+static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {} >+static inline void mptcp_update_tw_socks(const struct tcp_sock *tp, int state) {} >+#endif /* CONFIG_MPTCP */ >+ >+#endif /* _MPTCP_H */ >diff -Naur a/linux-3.11/include/net/mptcp_pm.h b/linux-3.11/include/net/mptcp_pm.h >--- a/linux-3.11/include/net/mptcp_pm.h 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/include/net/mptcp_pm.h 2013-10-05 18:34:48.681372093 +0200 >@@ -0,0 +1,133 @@ >+/* >+ * MPTCP implementation >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila 
<octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. >+ */ >+ >+#ifndef _MPTCP_PM_H >+#define _MPTCP_PM_H >+ >+#include <linux/in.h> >+#include <linux/in6.h> >+#include <linux/jhash.h> >+#include <linux/list.h> >+#include <linux/skbuff.h> >+#include <linux/spinlock_types.h> >+#include <linux/types.h> >+ >+#include <net/request_sock.h> >+#include <net/sock.h> >+#include <net/tcp.h> >+ >+/* Max number of local or remote addresses we can store. >+ * When changing, see the bitfield below in mptcp_loc4/6. */ >+#define MPTCP_MAX_ADDR 8 >+ >+#define MPTCP_SUBFLOW_RETRY_DELAY 1000 >+ >+struct mptcp_loc4 { >+ u8 id; >+ u8 low_prio:1; >+ __be16 port; >+ struct in_addr addr; >+}; >+ >+struct mptcp_rem4 { >+ u8 id; >+ u8 bitfield; >+ u8 retry_bitfield; >+ __be16 port; >+ struct in_addr addr; >+}; >+ >+struct mptcp_loc6 { >+ u8 id; >+ u8 low_prio:1; >+ __be16 port; >+ struct in6_addr addr; >+}; >+ >+struct mptcp_rem6 { >+ u8 id; >+ u8 bitfield; >+ u8 retry_bitfield; >+ __be16 port; >+ struct in6_addr addr; >+}; >+ >+struct mptcp_cb; >+#ifdef CONFIG_MPTCP >+ >+#define MPTCP_HASH_SIZE 1024 >+ >+/* This second hashtable is needed to retrieve request socks >+ * created as a result of a join request. While the SYN contains >+ * the token, the final ack does not, so we need a separate hashtable >+ * to retrieve the mpcb. >+ */ >+extern struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE]; >+extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */ >+ >+/* Lock, protecting the two hash-tables that hold the token. 
Namely, >+ * mptcp_reqsk_tk_htb and tk_hashtable >+ */ >+extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */ >+ >+void mptcp_create_subflows(struct sock *meta_sk); >+void mptcp_create_subflow_worker(struct work_struct *work); >+void mptcp_retry_subflow_worker(struct work_struct *work); >+struct mp_join *mptcp_find_join(struct sk_buff *skb); >+u8 mptcp_get_loc_addrid(struct mptcp_cb *mpcb, struct sock *sk); >+void __mptcp_hash_insert(struct tcp_sock *meta_tp, u32 token); >+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp); >+void mptcp_hash_remove(struct tcp_sock *meta_tp); >+struct sock *mptcp_hash_find(struct net *net, u32 token); >+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw); >+int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt, >+ struct tcp_options_received *tmp_opt, struct net *net); >+void mptcp_reqsk_remove_tk(struct request_sock *reqsk); >+void mptcp_reqsk_new_mptcp(struct request_sock *req, >+ const struct tcp_options_received *rx_opt, >+ const struct mptcp_options_received *mopt, >+ const struct sk_buff *skb); >+void mptcp_connect_init(struct sock *sk); >+void mptcp_set_addresses(struct sock *meta_sk); >+int mptcp_check_req(struct sk_buff *skb, struct net *net); >+void mptcp_address_worker(struct work_struct *work); >+int mptcp_pm_addr_event_handler(unsigned long event, void *ptr, int family); >+int mptcp_pm_init(void); >+void mptcp_pm_undo(void); >+ >+#else /* CONFIG_MPTCP */ >+static inline void mptcp_reqsk_new_mptcp(struct request_sock *req, >+ const struct tcp_options_received *rx_opt, >+ const struct mptcp_options_received *mopt, >+ const struct sk_buff *skb) >+{} >+static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {} >+#endif /* CONFIG_MPTCP */ >+ >+#endif /*_MPTCP_PM_H*/ >diff -Naur a/linux-3.11/include/net/mptcp_v4.h b/linux-3.11/include/net/mptcp_v4.h >--- a/linux-3.11/include/net/mptcp_v4.h 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/include/net/mptcp_v4.h 2013-10-05 18:34:48.681372093 +0200 >@@ -0,0 +1,81 @@ >+/* >+ * MPTCP implementation >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. 
>+ */ >+ >+#ifndef MPTCP_V4_H_ >+#define MPTCP_V4_H_ >+ >+ >+#include <linux/in.h> >+#include <linux/skbuff.h> >+#include <net/mptcp.h> >+#include <net/mptcp_pm.h> >+#include <net/request_sock.h> >+#include <net/sock.h> >+ >+extern struct request_sock_ops mptcp_request_sock_ops; >+extern struct proto mptcp_prot; >+ >+#ifdef CONFIG_MPTCP >+ >+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb); >+int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id); >+int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr, >+ __be16 port, u8 id); >+void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr); >+struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr, >+ const __be32 laddr, const struct net *net); >+int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, >+ struct mptcp_rem4 *rem); >+void mptcp_pm_addr4_event_handler(struct in_ifaddr *ifa, unsigned long event, >+ struct mptcp_cb *mpcb); >+int mptcp_pm_v4_init(void); >+void mptcp_pm_v4_undo(void); >+void mptcp_v4_send_add_addr(int loc_id, struct mptcp_cb *mpcb); >+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, >+ u32 seq); >+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); >+ >+#else >+ >+static inline int mptcp_v4_do_rcv(const struct sock *meta_sk, >+ const struct sk_buff *skb) >+{ >+ return 0; >+} >+ >+static inline int mptcp_v4_send_synack(const struct sock *meta_sk, >+ const struct request_sock *req, >+ const struct request_values *rvp) >+{ >+ return 0; >+} >+ >+#endif /* CONFIG_MPTCP */ >+ >+#endif /* MPTCP_V4_H_ */ >diff -Naur a/linux-3.11/include/net/mptcp_v6.h b/linux-3.11/include/net/mptcp_v6.h >--- a/linux-3.11/include/net/mptcp_v6.h 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/include/net/mptcp_v6.h 2013-10-05 18:34:48.681372093 +0200 >@@ -0,0 +1,87 @@ >+/* >+ * MPTCP implementation >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. >+ */ >+ >+#ifndef _MPTCP_V6_H >+#define _MPTCP_V6_H >+ >+#include <linux/in6.h> >+#include <net/if_inet6.h> >+ >+#include <net/mptcp.h> >+#include <net/mptcp_pm.h> >+ >+extern struct request_sock_ops mptcp6_request_sock_ops; >+extern struct proto mptcpv6_prot; >+ >+struct mptcp6_request_sock { >+ struct mptcp_request_sock mptcp6rsk_tcp; >+ struct inet6_request_sock mptcp6rsk_inet6; >+}; >+ >+#ifdef CONFIG_MPTCP >+ >+/* >+ * Used to wait for DAD to finish. 
If rtr_solicit_delay is set, we use it >+ * instead >+ */ >+#define MPTCP_IPV6_DEFAULT_DAD_WAIT (HZ/10) >+ >+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb); >+int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id); >+int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr, >+ __be16 port, u8 id); >+void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb, >+ const struct in6_addr *daddr); >+struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr, >+ const struct in6_addr *laddr, const struct net *net); >+int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, >+ struct mptcp_rem6 *rem); >+void mptcp_pm_addr6_event_handler(struct inet6_ifaddr *ifa, unsigned long event, >+ struct mptcp_cb *mpcb); >+int mptcp_pm_v6_init(void); >+void mptcp_pm_v6_undo(void); >+void mptcp_v6_send_add_addr(int loc_id, struct mptcp_cb *mpcb); >+struct sock *mptcp_v6v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req, >+ struct dst_entry *dst); >+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, >+ __be16 sport, __be16 dport, u32 seq); >+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, >+ __be16 sport, __be16 dport); >+ >+#else /* CONFIG_MPTCP */ >+ >+static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) >+{ >+ return 0; >+} >+ >+#endif /* CONFIG_MPTCP */ >+ >+#endif /* _MPTCP_V6_H */ >diff -Naur a/linux-3.11/include/net/request_sock.h b/linux-3.11/include/net/request_sock.h >--- a/linux-3.11/include/net/request_sock.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/net/request_sock.h 2013-10-05 18:34:48.682372081 +0200 >@@ -163,7 +163,8 @@ > }; > > extern int reqsk_queue_alloc(struct request_sock_queue *queue, >- unsigned int nr_table_entries); >+ unsigned int nr_table_entries, >+ gfp_t flags); > > extern void __reqsk_queue_destroy(struct request_sock_queue *queue); > extern void reqsk_queue_destroy(struct request_sock_queue *queue); >diff -Naur a/linux-3.11/include/net/sock.h b/linux-3.11/include/net/sock.h >--- a/linux-3.11/include/net/sock.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/net/sock.h 2013-10-05 18:34:49.043367542 +0200 >@@ -866,6 +866,16 @@ > > extern int sk_wait_data(struct sock *sk, long *timeo); > >+/* START - needed for MPTCP */ >+extern void sock_def_error_report(struct sock *sk); >+extern struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, >+ int family); >+extern void sock_lock_init(struct sock *sk); >+ >+extern struct lock_class_key af_callback_keys[AF_MAX]; >+extern char *const af_family_clock_key_strings[AF_MAX+1]; >+/* END - needed for MPTCP */ >+ > struct request_sock_ops; > struct timewait_sock_ops; > struct inet_hashinfo; >diff -Naur a/linux-3.11/include/net/tcp.h b/linux-3.11/include/net/tcp.h >--- a/linux-3.11/include/net/tcp.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/net/tcp.h 2013-10-05 18:34:49.045367517 +0200 >@@ -176,6 +176,7 @@ > #define TCPOPT_SACK 5 /* SACK Block */ > #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ > #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ >+#define TCPOPT_MPTCP 30 > #define TCPOPT_EXP 254 /* Experimental */ > /* Magic number to be after the option value for sharing TCP > * experimental options. 
See draft-ietf-tcpm-experimental-options-00.txt >@@ -238,6 +239,28 @@ > */ > #define TFO_SERVER_ALWAYS 0x1000 > >+/* Flags from tcp_input.c for tcp_ack */ >+#define FLAG_DATA 0x01 /* Incoming frame contained data. */ >+#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ >+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ >+#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ >+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ >+#define FLAG_DATA_SACKED 0x20 /* New SACK. */ >+#define FLAG_ECE 0x40 /* ECE in this ACK */ >+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ >+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ >+#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ >+#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ >+#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ >+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ >+#define MPTCP_FLAG_SEND_RESET 0x8000 >+#define MPTCP_FLAG_DATA_ACKED 0x10000 >+ >+#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) >+#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) >+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) >+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) >+ > extern struct inet_timewait_death_row tcp_death_row; > > /* sysctl variables for tcp */ >@@ -350,6 +373,105 @@ > #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val) > #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) > >+/**** START - Exports needed for MPTCP ****/ >+extern const struct inet_connection_sock_af_ops ipv4_specific; >+extern const struct inet_connection_sock_af_ops ipv6_specific; >+extern const struct inet_connection_sock_af_ops ipv6_mapped; >+ >+struct mptcp_options_received; >+ >+extern int tcp_close_state(struct sock *sk); >+extern void tcp_push(struct sock *sk, int flags, int mss_now, >+ int nonagle); >+extern int tcp_xmit_probe_skb(struct sock *sk, int urgent); >+extern void tcp_cwnd_validate(struct sock *sk); >+extern void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb); >+extern int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, >+ gfp_t gfp_mask); >+extern unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, >+ unsigned int mss_now, unsigned int cwnd); >+extern bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb); >+extern bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, >+ unsigned int cur_mss, int nonagle); >+extern bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, >+ unsigned int cur_mss); >+extern unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb); >+extern int tcp_mtu_probe(struct sock *sk); >+extern int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, >+ unsigned int mss_now); >+extern void __pskb_trim_head(struct sk_buff *skb, int len); >+extern void tcp_queue_skb(struct sock *sk, struct sk_buff *skb); >+extern void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags); >+extern void tcp_reset(struct sock *sk); >+extern bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, >+ const u32 ack_seq, const u32 nwin); >+extern bool tcp_urg_mode(const struct tcp_sock *tp); >+extern void tcp_ack_probe(struct sock *sk); >+extern void 
tcp_rearm_rto(struct sock *sk); >+extern int tcp_write_timeout(struct sock *sk); >+extern bool retransmits_timed_out(struct sock *sk, unsigned int boundary, >+ unsigned int timeout, bool syn_set); >+extern void tcp_write_err(struct sock *sk); >+extern void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr); >+extern void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, >+ unsigned int mss_now); >+ >+extern int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req); >+extern void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req); >+extern __u32 tcp_v4_init_sequence(const struct sk_buff *skb); >+extern int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, >+ struct request_sock *req, u16 queue_mapping, >+ bool nocache); >+extern void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb); >+extern struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb); >+extern struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb); >+extern void tcp_v4_reqsk_destructor(struct request_sock *req); >+ >+extern int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req); >+extern void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req); >+extern __u32 tcp_v6_init_sequence(const struct sk_buff *skb); >+extern int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, >+ struct flowi6 *fl6, struct request_sock *req, >+ u16 queue_mapping); >+extern void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); >+extern int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); >+extern int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); >+extern void tcp_v6_destroy_sock(struct sock *sk); >+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); >+extern void tcp_v6_hash(struct sock *sk); >+extern struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb); >+extern struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req, >+ struct dst_entry *dst); >+extern void tcp_v6_reqsk_destructor(struct request_sock *req); >+ >+extern void sock_valbool_flag(struct sock *sk, int bit, int valbool); >+extern unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, >+ int large_allowed); >+extern u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb); >+ >+extern void skb_clone_fraglist(struct sk_buff *skb); >+extern void copy_skb_header(struct sk_buff *new, const struct sk_buff *old); >+ >+extern void inet_twsk_free(struct inet_timewait_sock *tw); >+/* These states need RST on ABORT according to RFC793 */ >+static inline bool tcp_need_reset(int state) >+{ >+ return (1 << state) & >+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | >+ TCPF_FIN_WAIT2 | TCPF_SYN_RECV); >+} >+ >+extern bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, >+ int hlen); >+extern int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, >+ bool *fragstolen); >+extern bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, >+ struct sk_buff *from, bool *fragstolen); >+/**** END - Exports needed for MPTCP ****/ >+ > extern void tcp_init_mem(struct net *net); > > extern void tcp_tasklet_init(void); >@@ -448,6 +570,7 @@ > size_t len, int nonblock, int flags, int *addr_len); > extern void tcp_parse_options(const struct sk_buff *skb, > struct tcp_options_received *opt_rx, >+ struct mptcp_options_received *mopt, > int estab, struct tcp_fastopen_cookie *foc); > extern const 
u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
>
>@@ -681,6 +804,21 @@
> #define TCPHDR_ECE 0x40
> #define TCPHDR_CWR 0x80
>
>+/* MPTCP flags */
>+#define MPTCPHDR_ACK 0x01
>+#define MPTCPHDR_SEQ 0x02
>+#define MPTCPHDR_FIN 0x04
>+#define MPTCPHDR_INF 0x08
>+#define MPTCPHDR_SEQ64_SET 0x10 /* Did we receive a 64-bit seq number? */
>+#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
>+#define MPTCPHDR_SEQ64_INDEX 0x40 /* Index of seq in mpcb->snd_high_order */
>+#define MPTCPHDR_DSS_CSUM 0x80
>+
>+/* It is impossible that all 8 bits of mptcp_flags are set to 1 with the above
>+ * flags. Thus, defining MPTCPHDR_JOIN as 0xFF is safe.
>+ */
>+#define MPTCPHDR_JOIN 0xFF
>+
> /* This is what the send packet queuing engine uses to pass
> * TCP per-packet control information to the transmission code.
> * We also store the host-order sequence numbers in here too.
>@@ -689,14 +827,24 @@
> */
> struct tcp_skb_cb {
> union {
>- struct inet_skb_parm h4;
>+ union {
>+ struct inet_skb_parm h4;
> #if IS_ENABLED(CONFIG_IPV6)
>- struct inet6_skb_parm h6;
>+ struct inet6_skb_parm h6;
>+#endif
>+ } header; /* For incoming frames */
>+#ifdef CONFIG_MPTCP
>+ __u32 path_mask; /* path indices that tried to send this skb */
> #endif
>- } header; /* For incoming frames */
>+ };
> __u32 seq; /* Starting sequence number */
> __u32 end_seq; /* SEQ + FIN + SYN + datalen */
> __u32 when; /* used to compute rtt's */
>+#ifdef CONFIG_MPTCP
>+ __u8 mptcp_flags; /* flags for the MPTCP layer */
>+ __u8 dss_off; /* Number of 4-byte words until
>+ * seq-number */
>+#endif
> __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
>
> __u8 sacked; /* State flags for SACK/FACK. */
>@@ -1050,7 +1198,7 @@
> extern void tcp_select_initial_window(int __space, __u32 mss,
> __u32 *rcv_wnd, __u32 *window_clamp,
> int wscale_ok, __u8 *rcv_wscale,
>- __u32 init_rcv_wnd);
>+ __u32 init_rcv_wnd, const struct sock *sk);
>
> static inline int tcp_win_from_space(int space)
> {
>@@ -1062,12 +1210,18 @@
> /* Note: caller must be prepared to deal with negative returns */
> static inline int tcp_space(const struct sock *sk)
> {
>+ if (tcp_sk(sk)->mpc)
>+ sk = tcp_sk(sk)->meta_sk;
>+
> return tcp_win_from_space(sk->sk_rcvbuf -
> atomic_read(&sk->sk_rmem_alloc));
> }
>
> static inline int tcp_full_space(const struct sock *sk)
> {
>+ if (tcp_sk(sk)->mpc)
>+ sk = tcp_sk(sk)->meta_sk;
>+
> return tcp_win_from_space(sk->sk_rcvbuf);
> }
>
>@@ -1082,6 +1236,7 @@
> tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
> tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
> tcp_rsk(req)->snt_synack = 0;
>+ tcp_rsk(req)->saw_mpc = 0;
> req->mss = rx_opt->mss_clamp;
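>+ /* saw_mpc only records whether the SYN carried MP_CAPABLE. It starts
>+ * out cleared here and is expected to be set during MPTCP request-sock
>+ * setup once option parsing has actually seen the option -- roughly:
>+ *
>+ *	if (mopt.saw_mpc)
>+ *		mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb);
>+ *
>+ * (illustrative sketch taken from the tcp_v4_conn_request() hunk
>+ * further down; the IPv6 path is assumed to do the equivalent)
>+ */
> req->ts_recent = rx_opt->saw_tstamp ?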
rx_opt->rcv_tsval : 0; > ireq->tstamp_ok = rx_opt->tstamp_ok; >diff -Naur a/linux-3.11/include/uapi/linux/if.h b/linux-3.11/include/uapi/linux/if.h >--- a/linux-3.11/include/uapi/linux/if.h 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/include/uapi/linux/if.h 2013-10-05 18:34:49.045367517 +0200 >@@ -53,6 +53,9 @@ > > #define IFF_ECHO 0x40000 /* echo sent packets */ > >+#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */ >+#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */ >+ > #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\ > IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) > >diff -Naur a/linux-3.11/net/core/dev.c b/linux-3.11/net/core/dev.c >--- a/linux-3.11/net/core/dev.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/core/dev.c 2013-10-05 18:34:49.048367479 +0200 >@@ -4801,7 +4801,7 @@ > > dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | > IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | >- IFF_AUTOMEDIA)) | >+ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) | > (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | > IFF_ALLMULTI)); > >diff -Naur a/linux-3.11/net/core/request_sock.c b/linux-3.11/net/core/request_sock.c >--- a/linux-3.11/net/core/request_sock.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/core/request_sock.c 2013-10-05 18:34:49.049367467 +0200 >@@ -38,7 +38,8 @@ > EXPORT_SYMBOL(sysctl_max_syn_backlog); > > int reqsk_queue_alloc(struct request_sock_queue *queue, >- unsigned int nr_table_entries) >+ unsigned int nr_table_entries, >+ gfp_t flags) > { > size_t lopt_size = sizeof(struct listen_sock); > struct listen_sock *lopt; >@@ -48,9 +49,11 @@ > nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); > lopt_size += nr_table_entries * sizeof(struct request_sock *); > if (lopt_size > PAGE_SIZE) >- lopt = vzalloc(lopt_size); >+ lopt = __vmalloc(lopt_size, >+ flags | __GFP_HIGHMEM | __GFP_ZERO, >+ PAGE_KERNEL); > else >- lopt = kzalloc(lopt_size, GFP_KERNEL); >+ lopt = kzalloc(lopt_size, flags); > if (lopt == NULL) > return -ENOMEM; > >diff -Naur a/linux-3.11/net/core/skbuff.c b/linux-3.11/net/core/skbuff.c >--- a/linux-3.11/net/core/skbuff.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/core/skbuff.c 2013-10-05 18:34:49.051367442 +0200 >@@ -487,7 +487,7 @@ > skb_drop_list(&skb_shinfo(skb)->frag_list); > } > >-static void skb_clone_fraglist(struct sk_buff *skb) >+void skb_clone_fraglist(struct sk_buff *skb) > { > struct sk_buff *list; > >@@ -913,7 +913,7 @@ > skb->inner_mac_header += off; > } > >-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) >+void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) > { > __copy_skb_header(new, old); > >diff -Naur a/linux-3.11/net/core/sock.c b/linux-3.11/net/core/sock.c >--- a/linux-3.11/net/core/sock.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/core/sock.c 2013-10-05 18:34:49.052367429 +0200 >@@ -230,7 +230,7 @@ > "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , > "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" > }; >-static const char *const af_family_clock_key_strings[AF_MAX+1] = { >+char *const af_family_clock_key_strings[AF_MAX+1] = { > "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , > "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", > "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , >@@ -251,7 +251,7 @@ > * sk_callback_lock locking rules are per-address-family, > * so split the lock classes by using a per-AF key: > */ >-static 
struct lock_class_key af_callback_keys[AF_MAX]; >+struct lock_class_key af_callback_keys[AF_MAX]; > > /* Take into consideration the size of the struct sk_buff overhead in the > * determination of these values, since that is non-constant across >@@ -607,7 +607,7 @@ > return ret; > } > >-static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) >+void sock_valbool_flag(struct sock *sk, int bit, int valbool) > { > if (valbool) > sock_set_flag(sk, bit); >@@ -1195,7 +1195,7 @@ > * > * (We also register the sk_lock with the lock validator.) > */ >-static inline void sock_lock_init(struct sock *sk) >+void sock_lock_init(struct sock *sk) > { > sock_lock_init_class_and_name(sk, > af_family_slock_key_strings[sk->sk_family], >@@ -1243,7 +1243,7 @@ > } > EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); > >-static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, >+struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, > int family) > { > struct sock *sk; >@@ -2163,7 +2163,7 @@ > rcu_read_unlock(); > } > >-static void sock_def_error_report(struct sock *sk) >+void sock_def_error_report(struct sock *sk) > { > struct socket_wq *wq; > >diff -Naur a/linux-3.11/net/ipv4/af_inet.c b/linux-3.11/net/ipv4/af_inet.c >--- a/linux-3.11/net/ipv4/af_inet.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/af_inet.c 2013-10-05 18:34:49.053367417 +0200 >@@ -104,6 +104,7 @@ > #include <net/ip_fib.h> > #include <net/inet_connection_sock.h> > #include <net/tcp.h> >+#include <net/mptcp.h> > #include <net/udp.h> > #include <net/udplite.h> > #include <net/ping.h> >@@ -274,8 +275,7 @@ > * Create an inet socket. > */ > >-static int inet_create(struct net *net, struct socket *sock, int protocol, >- int kern) >+int inet_create(struct net *net, struct socket *sock, int protocol, int kern) > { > struct sock *sk; > struct inet_protosw *answer; >@@ -711,6 +711,23 @@ > lock_sock(sk2); > > sock_rps_record_flow(sk2); >+ >+ if (sk2->sk_protocol == IPPROTO_TCP && tcp_sk(sk2)->mpc) { >+ struct sock *sk_it; >+ >+ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it) >+ sock_rps_record_flow(sk_it); >+ >+ if (tcp_sk(sk2)->mpcb->master_sk) { >+ sk_it = tcp_sk(sk2)->mpcb->master_sk; >+ >+ write_lock_bh(&sk_it->sk_callback_lock); >+ sk_it->sk_wq = newsock->wq; >+ sk_it->sk_socket = newsock; >+ write_unlock_bh(&sk_it->sk_callback_lock); >+ } >+ } >+ > WARN_ON(!((1 << sk2->sk_state) & > (TCPF_ESTABLISHED | TCPF_SYN_RECV | > TCPF_CLOSE_WAIT | TCPF_CLOSE))); >@@ -1755,6 +1772,9 @@ > > ip_init(); > >+ /* We must initialize MPTCP before TCP. */ >+ mptcp_init(); >+ > tcp_v4_init(); > > /* Setup TCP slab cache for open requests. 
*/ >diff -Naur a/linux-3.11/net/ipv4/inet_connection_sock.c b/linux-3.11/net/ipv4/inet_connection_sock.c >--- a/linux-3.11/net/ipv4/inet_connection_sock.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/inet_connection_sock.c 2013-10-05 18:34:49.054367404 +0200 >@@ -23,6 +23,7 @@ > #include <net/route.h> > #include <net/tcp_states.h> > #include <net/xfrm.h> >+#include <net/mptcp.h> > > #ifdef INET_CSK_DEBUG > const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; >@@ -477,8 +478,8 @@ > } > EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); > >-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, >- const u32 rnd, const u32 synq_hsize) >+u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, >+ const u32 synq_hsize) > { > return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); > } >@@ -675,7 +676,12 @@ > const struct request_sock *req, > const gfp_t priority) > { >- struct sock *newsk = sk_clone_lock(sk, priority); >+ struct sock *newsk; >+ >+ if (sk->sk_protocol == IPPROTO_TCP && tcp_sk(sk)->mpc) >+ newsk = mptcp_sk_clone(sk, req->rsk_ops->family, priority); >+ else >+ newsk = sk_clone_lock(sk, priority); > > if (newsk != NULL) { > struct inet_connection_sock *newicsk = inet_csk(newsk); >@@ -752,7 +758,8 @@ > { > struct inet_sock *inet = inet_sk(sk); > struct inet_connection_sock *icsk = inet_csk(sk); >- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); >+ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries, >+ GFP_KERNEL); > > if (rc != 0) > return rc; >@@ -813,6 +820,8 @@ > > acc_req = req->dl_next; > >+ if (is_meta_sk(child)) >+ mutex_lock(&tcp_sk(child)->mpcb->mutex); > local_bh_disable(); > bh_lock_sock(child); > WARN_ON(sock_owned_by_user(child)); >@@ -841,6 +850,8 @@ > > bh_unlock_sock(child); > local_bh_enable(); >+ if (is_meta_sk(child)) >+ mutex_unlock(&tcp_sk(child)->mpcb->mutex); > sock_put(child); > > sk_acceptq_removed(sk); >diff -Naur a/linux-3.11/net/ipv4/inet_timewait_sock.c b/linux-3.11/net/ipv4/inet_timewait_sock.c >--- a/linux-3.11/net/ipv4/inet_timewait_sock.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/inet_timewait_sock.c 2013-10-05 18:34:49.055367391 +0200 >@@ -99,7 +99,7 @@ > } > } > >-static noinline void inet_twsk_free(struct inet_timewait_sock *tw) >+void inet_twsk_free(struct inet_timewait_sock *tw) > { > struct module *owner = tw->tw_prot->owner; > twsk_destructor((struct sock *)tw); >diff -Naur a/linux-3.11/net/ipv4/Kconfig b/linux-3.11/net/ipv4/Kconfig >--- a/linux-3.11/net/ipv4/Kconfig 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/Kconfig 2013-10-05 18:34:49.056367379 +0200 >@@ -572,6 +572,22 @@ > For further details see: > http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html > >+config TCP_CONG_COUPLED >+ tristate "MPTCP COUPLED CONGESTION CONTROL" >+ depends on MPTCP >+ default n >+ ---help--- >+ MultiPath TCP Coupled Congestion Control >+ To enable it, just put 'coupled' in tcp_congestion_control >+ >+config TCP_CONG_OLIA >+ tristate "MPTCP Opportunistic Linked Increase" >+ depends on MPTCP >+ default n >+ ---help--- >+ MultiPath TCP Opportunistic Linked Increase Congestion Control >+ To enable it, just put 'olia' in tcp_congestion_control >+ > choice > prompt "Default TCP congestion control" > default DEFAULT_CUBIC >@@ -600,6 +616,12 @@ > config DEFAULT_WESTWOOD > bool "Westwood" if TCP_CONG_WESTWOOD=y > >+ config DEFAULT_COUPLED >+ bool "Coupled" if TCP_CONG_COUPLED=y >+ >+ 
config DEFAULT_OLIA
>+ bool "Olia" if TCP_CONG_OLIA=y
>+
> config DEFAULT_RENO
> bool "Reno"
>
>@@ -621,6 +643,7 @@
> default "vegas" if DEFAULT_VEGAS
> default "westwood" if DEFAULT_WESTWOOD
> default "veno" if DEFAULT_VENO
>+ default "coupled" if DEFAULT_COUPLED
> default "reno" if DEFAULT_RENO
> default "cubic"
>
>diff -Naur a/linux-3.11/net/ipv4/syncookies.c b/linux-3.11/net/ipv4/syncookies.c
>--- a/linux-3.11/net/ipv4/syncookies.c 2013-09-02 22:46:10.000000000 +0200
>+++ b/linux-3.11/net/ipv4/syncookies.c 2013-10-05 18:34:49.057367366 +0200
>@@ -293,7 +293,7 @@
>
> /* check for timestamp cookie support */
> memset(&tcp_opt, 0, sizeof(tcp_opt));
>- tcp_parse_options(skb, &tcp_opt, 0, NULL);
>+ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
>
> if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
> goto out;
>@@ -366,7 +366,7 @@
> tcp_select_initial_window(tcp_full_space(sk), req->mss,
> &req->rcv_wnd, &req->window_clamp,
> ireq->wscale_ok, &rcv_wscale,
>- dst_metric(&rt->dst, RTAX_INITRWND));
>+ dst_metric(&rt->dst, RTAX_INITRWND), sk);
>
> ireq->rcv_wscale = rcv_wscale;
>
>diff -Naur a/linux-3.11/net/ipv4/tcp.c b/linux-3.11/net/ipv4/tcp.c
>--- a/linux-3.11/net/ipv4/tcp.c 2013-09-02 22:46:10.000000000 +0200
>+++ b/linux-3.11/net/ipv4/tcp.c 2013-10-05 18:34:49.069367215 +0200
>@@ -271,6 +271,7 @@
>
> #include <net/icmp.h>
> #include <net/inet_common.h>
>+#include <net/mptcp.h>
> #include <net/tcp.h>
> #include <net/xfrm.h>
> #include <net/ip.h>
>@@ -605,6 +606,7 @@
> tcb->seq = tcb->end_seq = tp->write_seq;
> tcb->tcp_flags = TCPHDR_ACK;
> tcb->sacked = 0;
>+ mptcp_skb_entail_init(tp, skb);
> skb_header_release(skb);
> tcp_add_write_queue_tail(sk, skb);
> sk->sk_wmem_queued += skb->truesize;
>@@ -619,7 +621,7 @@
> tp->snd_up = tp->write_seq;
> }
>
>-static inline void tcp_push(struct sock *sk, int flags, int mss_now,
>+void tcp_push(struct sock *sk, int flags, int mss_now,
> int nonagle)
> {
> if (tcp_send_head(sk)) {
>@@ -685,6 +687,14 @@
> int ret;
>
> sock_rps_record_flow(sk);
>+
>+#ifdef CONFIG_MPTCP
>+ if (tcp_sk(sk)->mpc) {
>+ struct sock *sk_it;
>+ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
>+ sock_rps_record_flow(sk_it);
>+ }
>+#endif
> /*
> * We can't seek on a socket input
> */
>@@ -780,8 +790,7 @@
> return NULL;
> }
>
>-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
>- int large_allowed)
>+unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> u32 xmit_size_goal, old_size_goal;
>@@ -821,8 +830,13 @@
> {
> int mss_now;
>
>- mss_now = tcp_current_mss(sk);
>- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
>+ if (tcp_sk(sk)->mpc) {
>+ mss_now = mptcp_current_mss(sk);
>+ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
>+ } else {
>+ mss_now = tcp_current_mss(sk);
>+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
>+ }
>
> return mss_now;
> }
>@@ -846,6 +860,26 @@
> goto out_err;
> }
>
>+ if (tp->mpc) {
>+ struct sock *sk_it;
>+
>+ /* We must check this with the socket lock held, because we iterate
>+ * over the subflows.
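>+ * (A subflow that is added after establishment may lack NETIF_F_SG or
>+ * checksum offload, which is presumably why the sendpage capability
>+ * check is repeated here per connection instead of relying on the
>+ * route caps that were checked in tcp_sendpage().)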
>+ */ >+ if (!mptcp_can_sendpage(sk)) { >+ ssize_t ret; >+ >+ release_sock(sk); >+ ret = sock_no_sendpage(sk->sk_socket, page, offset, >+ size, flags); >+ lock_sock(sk); >+ return ret; >+ } >+ >+ mptcp_for_each_sk(tp->mpcb, sk_it) >+ sock_rps_record_flow(sk_it); >+ } >+ > clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); > > mss_now = tcp_send_mss(sk, &size_goal, flags); >@@ -949,8 +983,9 @@ > { > ssize_t res; > >- if (!(sk->sk_route_caps & NETIF_F_SG) || >- !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) >+ /* If MPTCP is enabled, we check it later after establishment */ >+ if (!tcp_sk(sk)->mpc && (!(sk->sk_route_caps & NETIF_F_SG) || >+ !(sk->sk_route_caps & NETIF_F_ALL_CSUM))) > return sock_no_sendpage(sk->sk_socket, page, offset, size, > flags); > >@@ -966,6 +1001,9 @@ > const struct tcp_sock *tp = tcp_sk(sk); > int tmp = tp->mss_cache; > >+ if (tp->mpc) >+ return mptcp_select_size(sk, sg); >+ > if (sg) { > if (sk_can_gso(sk)) { > /* Small frames wont use a full page: >@@ -1051,6 +1089,12 @@ > goto do_error; > } > >+ if (tp->mpc) { >+ struct sock *sk_it; >+ mptcp_for_each_sk(tp->mpcb, sk_it) >+ sock_rps_record_flow(sk_it); >+ } >+ > if (unlikely(tp->repair)) { > if (tp->repair_queue == TCP_RECV_QUEUE) { > copied = tcp_send_rcvq(sk, msg, size); >@@ -1078,7 +1122,10 @@ > if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) > goto out_err; > >- sg = !!(sk->sk_route_caps & NETIF_F_SG); >+ if (tp->mpc) >+ sg = mptcp_can_sg(sk); >+ else >+ sg = !!(sk->sk_route_caps & NETIF_F_SG); > > while (--iovlen >= 0) { > size_t seglen = iov->iov_len; >@@ -1129,8 +1176,15 @@ > > /* > * Check whether we can use HW checksum. >+ * >+ * If dss-csum is enabled, we do not do hw-csum. >+ * In case of non-mptcp we check the >+ * device-capabilities. >+ * In case of mptcp, hw-csum's will be handled >+ * later in mptcp_write_xmit. > */ >- if (sk->sk_route_caps & NETIF_F_ALL_CSUM) >+ if (((tp->mpc && !tp->mpcb->dss_csum) || !tp->mpc) && >+ (tp->mpc || sk->sk_route_caps & NETIF_F_ALL_CSUM)) > skb->ip_summed = CHECKSUM_PARTIAL; > > skb_entail(sk, skb); >@@ -1330,6 +1384,11 @@ > > struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); > >+ if (is_meta_sk(sk)) { >+ mptcp_cleanup_rbuf(sk, copied); >+ return; >+ } >+ > WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), > "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", > tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); >@@ -1567,6 +1626,14 @@ > > lock_sock(sk); > >+#ifdef CONFIG_MPTCP >+ if (tp->mpc) { >+ struct sock *sk_it; >+ mptcp_for_each_sk(tp->mpcb, sk_it) >+ sock_rps_record_flow(sk_it); >+ } >+#endif >+ > err = -ENOTCONN; > if (sk->sk_state == TCP_LISTEN) > goto out; >@@ -2014,7 +2081,7 @@ > /* TCP_CLOSING */ TCP_CLOSING, > }; > >-static int tcp_close_state(struct sock *sk) >+int tcp_close_state(struct sock *sk) > { > int next = (int)new_state[sk->sk_state]; > int ns = next & TCP_STATE_MASK; >@@ -2043,8 +2110,12 @@ > (TCPF_ESTABLISHED | TCPF_SYN_SENT | > TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { > /* Clear out any half completed packets. FIN if needed. 
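>+ * For an MPTCP meta-socket the connection-level shutdown is signalled
>+ * by a DATA_FIN instead of a plain subflow FIN, hence the
>+ * mptcp_send_fin() branch below.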
*/
>- if (tcp_close_state(sk))
>- tcp_send_fin(sk);
>+ if (tcp_close_state(sk)) {
>+ if (!is_meta_sk(sk))
>+ tcp_send_fin(sk);
>+ else
>+ mptcp_send_fin(sk);
>+ }
> }
> }
> EXPORT_SYMBOL(tcp_shutdown);
>@@ -2069,6 +2140,11 @@
> int data_was_unread = 0;
> int state;
>
>+ if (is_meta_sk(sk)) {
>+ mptcp_close(sk, timeout);
>+ return;
>+ }
>+
> lock_sock(sk);
> sk->sk_shutdown = SHUTDOWN_MASK;
>
>@@ -2235,15 +2311,6 @@
> }
> EXPORT_SYMBOL(tcp_close);
>
>-/* These states need RST on ABORT according to RFC793 */
>-
>-static inline bool tcp_need_reset(int state)
>-{
>- return (1 << state) &
>- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
>- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
>-}
>-
> int tcp_disconnect(struct sock *sk, int flags)
> {
> struct inet_sock *inet = inet_sk(sk);
>@@ -2284,6 +2351,50 @@
> if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
> inet_reset_saddr(sk);
>
>+#ifdef CONFIG_MPTCP
>+ if (is_meta_sk(sk)) {
>+ struct sock *subsk, *tmpsk;
>+ struct tcp_sock *tp = tcp_sk(sk);
>+
>+ __skb_queue_purge(&tp->mpcb->reinject_queue);
>+
>+ if (tp->inside_tk_table) {
>+ mptcp_hash_remove_bh(tp);
>+ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
>+ }
>+
>+ local_bh_disable();
>+ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
>+ /* The socket will get removed from the subsocket-list
>+ * and made non-mptcp by setting mpc to 0.
>+ *
>+ * This is necessary because tcp_disconnect assumes
>+ * that the connection is completely dead afterwards.
>+ * Thus we need to do a mptcp_del_sock. Due to this call
>+ * we have to make it non-mptcp.
>+ *
>+ * We have to lock the socket, because we set mpc to 0.
>+ * An incoming packet would take the subsocket's lock
>+ * and go on into the receive-path.
>+ * This would be a race.
>+ */
>+
>+ bh_lock_sock(subsk);
>+ mptcp_del_sock(subsk);
>+ tcp_sk(subsk)->mpc = 0;
>+ mptcp_sub_force_close(subsk);
>+ bh_unlock_sock(subsk);
>+ }
>+ local_bh_enable();
>+
>+ tp->was_meta_sk = 1;
>+ tp->mpc = 0;
>+ } else {
>+ if (tp->inside_tk_table)
>+ mptcp_hash_remove_bh(tp);
>+ }
>+#endif
>+
> sk->sk_shutdown = 0;
> sock_reset_flag(sk, SOCK_DONE);
> tp->srtt = 0;
>@@ -2542,6 +2653,13 @@
> elapsed = tp->keepalive_time - elapsed;
> else
> elapsed = 0;
>+ if (tp->mpc) {
>+ struct sock *sk_it = sk;
>+ mptcp_for_each_sk(tp->mpcb, sk_it)
>+ if (!(1 << sk_it->sk_state & (TCPF_CLOSE | TCPF_LISTEN)))
>+ inet_csk_reset_keepalive_timer(sk_it, elapsed);
>+ break;
>+ }
> inet_csk_reset_keepalive_timer(sk, elapsed);
> }
> }
>@@ -3039,12 +3157,19 @@
> void tcp_done(struct sock *sk)
> {
> struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
>+ struct tcp_sock *tp = tcp_sk(sk);
>
> if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
> TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
>
>+ WARN_ON(sk->sk_state == TCP_CLOSE);
> tcp_set_state(sk, TCP_CLOSE);
>- tcp_clear_xmit_timers(sk);
>+
>+ /* If it is a meta-sk sending an MP_FCLOSE, we have to maintain the
>+ * rexmit-timer for retransmitting the MP_FCLOSE. */
>+ if (!tp->mpc || !is_meta_sk(sk) || !tp->send_mp_fclose)
>+ tcp_clear_xmit_timers(sk);
>+
> if (req != NULL)
> reqsk_fastopen_remove(sk, req, false);
>
>diff -Naur a/linux-3.11/net/ipv4/tcp_input.c b/linux-3.11/net/ipv4/tcp_input.c
>--- a/linux-3.11/net/ipv4/tcp_input.c 2013-09-02 22:46:10.000000000 +0200
>+++ b/linux-3.11/net/ipv4/tcp_input.c 2013-10-05 18:51:20.648901373 +0200
>@@ -74,6 +74,9 @@
> #include <linux/ipsec.h>
> #include <asm/unaligned.h>
> #include <net/netdma.h>
>+#include <net/mptcp.h>
>+#include <net/mptcp_v4.h>
>+#include 
<net/mptcp_v6.h> > > int sysctl_tcp_timestamps __read_mostly = 1; > int sysctl_tcp_window_scaling __read_mostly = 1; >@@ -99,25 +102,6 @@ > int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; > int sysctl_tcp_early_retrans __read_mostly = 3; > >-#define FLAG_DATA 0x01 /* Incoming frame contained data. */ >-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ >-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ >-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ >-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ >-#define FLAG_DATA_SACKED 0x20 /* New SACK. */ >-#define FLAG_ECE 0x40 /* ECE in this ACK */ >-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ >-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ >-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ >-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ >-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ >-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ >- >-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) >-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) >-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) >-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) >- > #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) > #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) > >@@ -322,10 +306,12 @@ > static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) > { > struct tcp_sock *tp = tcp_sk(sk); >+ struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); > > /* Check #1 */ >- if (tp->rcv_ssthresh < tp->window_clamp && >- (int)tp->rcv_ssthresh < tcp_space(sk) && >+ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp && >+ (int)meta_tp->rcv_ssthresh < tcp_space(sk) && > !sk_under_memory_pressure(sk)) { > int incr; > >@@ -333,14 +319,14 @@ > * will fit to rcvbuf in future. > */ > if (tcp_win_from_space(skb->truesize) <= skb->len) >- incr = 2 * tp->advmss; >+ incr = 2 * meta_tp->advmss; > else >- incr = __tcp_grow_window(sk, skb); >+ incr = __tcp_grow_window(meta_sk, skb); > > if (incr) { > incr = max_t(int, incr, 2 * skb->len); >- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, >- tp->window_clamp); >+ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr, >+ meta_tp->window_clamp); > inet_csk(sk)->icsk_ack.quick |= 1; > } > } >@@ -393,6 +379,11 @@ > > tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); > tp->snd_cwnd_stamp = tcp_time_stamp; >+ >+ if (tp->mpc) { >+ mptcp_init_buffer_space(sk); >+ mptcp_update_sndbuf(tp->mpcb); >+ } > } > > /* 5. Recalculate window clamp after socket hit its memory bounds. */ >@@ -518,7 +509,10 @@ > goto new_measure; > > time = tcp_time_stamp - tp->rcvq_space.time; >- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) >+ if (tp->mpc) { >+ if (mptcp_check_rtt(tp, time)) >+ return; >+ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) > return; > > space = 2 * (tp->copied_seq - tp->rcvq_space.seq); >@@ -716,6 +710,7 @@ > * guarantees that rto is higher. > */ > tcp_bound_rto(sk); >+ mptcp_set_rto(sk); > } > > __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) >@@ -2914,7 +2909,7 @@ > } > > /* If we get here, the whole TSO packet has not been acked. 
*/ >-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) >+u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) > { > struct tcp_sock *tp = tcp_sk(sk); > u32 packets_acked; >@@ -3009,6 +3004,8 @@ > */ > if (!(scb->tcp_flags & TCPHDR_SYN)) { > flag |= FLAG_DATA_ACKED; >+ if (tp->mpc && mptcp_is_data_seq(skb)) >+ flag |= MPTCP_FLAG_DATA_ACKED; > } else { > flag |= FLAG_SYN_ACKED; > tp->retrans_stamp = 0; >@@ -3018,6 +3015,7 @@ > break; > > tcp_unlink_write_queue(skb, sk); >+ > sk_wmem_free_skb(sk, skb); > if (skb == tp->retransmit_skb_hint) > tp->retransmit_skb_hint = NULL; >@@ -3104,7 +3102,7 @@ > return flag; > } > >-static void tcp_ack_probe(struct sock *sk) >+void tcp_ack_probe(struct sock *sk) > { > const struct tcp_sock *tp = tcp_sk(sk); > struct inet_connection_sock *icsk = inet_csk(sk); >@@ -3140,9 +3138,8 @@ > /* Check that window update is acceptable. > * The function assumes that snd_una<=ack<=snd_next. > */ >-static inline bool tcp_may_update_window(const struct tcp_sock *tp, >- const u32 ack, const u32 ack_seq, >- const u32 nwin) >+bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, >+ const u32 ack_seq, const u32 nwin) > { > return after(ack, tp->snd_una) || > after(ack_seq, tp->snd_wl1) || >@@ -3261,7 +3258,7 @@ > } > > /* This routine deals with incoming acks, but not outgoing ones. */ >-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) >+static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) > { > struct inet_connection_sock *icsk = inet_csk(sk); > struct tcp_sock *tp = tcp_sk(sk); >@@ -3350,6 +3347,19 @@ > /* See if we can take anything off of the retransmit queue. */ > acked = tp->packets_out; > flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); >+ >+ if (tp->mpc) { >+ flag |= mptcp_fallback_infinite(sk, flag); >+ >+ if (flag & MPTCP_FLAG_SEND_RESET) { >+ pr_err("%s resetting flow\n", __func__); >+ mptcp_send_reset(sk); >+ goto invalid_ack; >+ } >+ >+ mptcp_clean_rtx_infinite(skb, sk); >+ } >+ > acked -= tp->packets_out; > > if (tcp_ack_is_dubious(sk, flag)) { >@@ -3416,8 +3426,9 @@ > * the fast version below fails. > */ > void tcp_parse_options(const struct sk_buff *skb, >- struct tcp_options_received *opt_rx, int estab, >- struct tcp_fastopen_cookie *foc) >+ struct tcp_options_received *opt_rx, >+ struct mptcp_options_received *mopt, >+ int estab, struct tcp_fastopen_cookie *foc) > { > const unsigned char *ptr; > const struct tcphdr *th = tcp_hdr(skb); >@@ -3500,6 +3511,10 @@ > */ > break; > #endif >+ case TCPOPT_MPTCP: >+ mptcp_parse_options(ptr - 2, opsize, opt_rx, >+ mopt, skb); >+ break; > case TCPOPT_EXP: > /* Fast Open option shares code 254 using a > * 16 bits magic number. It's valid only in >@@ -3561,8 +3576,8 @@ > if (tcp_parse_aligned_timestamp(tp, th)) > return true; > } >- >- tcp_parse_options(skb, &tp->rx_opt, 1, NULL); >+ tcp_parse_options(skb, &tp->rx_opt, tp->mpc ? &tp->mptcp->rx_opt : NULL, >+ 1, NULL); > if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) > tp->rx_opt.rcv_tsecr -= tp->tsoffset; > >@@ -3732,6 +3747,8 @@ > case TCP_ESTABLISHED: > /* Move to CLOSE_WAIT */ > tcp_set_state(sk, TCP_CLOSE_WAIT); >+ if (tp->mpc) >+ mptcp_sub_close_passive(sk); > dst = __sk_dst_get(sk); > if (!dst || !dst_metric(dst, RTAX_QUICKACK)) > inet_csk(sk)->icsk_ack.pingpong = 1; >@@ -3756,6 +3773,13 @@ > tcp_set_state(sk, TCP_CLOSING); > break; > case TCP_FIN_WAIT2: >+ if (tp->mpc) { >+ /* The socket will get closed by mptcp_data_ready. >+ * We first have to process all data-sequences. 
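>+ * close_it presumably defers the FIN_WAIT2 -> TIME_WAIT transition
>+ * until the receive path has handed every pending data-sequence to
>+ * the meta-socket.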
>+ */ >+ tp->close_it = 1; >+ break; >+ } > /* Received a FIN -- send ACK and enter TIME_WAIT. */ > tcp_send_ack(sk); > tcp_time_wait(sk, TCP_TIME_WAIT, 0); >@@ -3780,6 +3804,10 @@ > if (!sock_flag(sk, SOCK_DEAD)) { > sk->sk_state_change(sk); > >+ /* Don't wake up MPTCP-subflows */ >+ if (tp->mpc) >+ return; >+ > /* Do not send POLL_HUP for half duplex close. */ > if (sk->sk_shutdown == SHUTDOWN_MASK || > sk->sk_state == TCP_CLOSE) >@@ -3977,7 +4005,11 @@ > tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); > } > >- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { >+ /* In case of MPTCP, the segment may be empty if it's a >+ * non-data DATA_FIN. (see beginning of tcp_data_queue) >+ */ >+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) && >+ !(tp->mpc && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) { > SOCK_DEBUG(sk, "ofo packet was already received\n"); > __skb_unlink(skb, &tp->out_of_order_queue); > __kfree_skb(skb); >@@ -4001,6 +4033,9 @@ > static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, > unsigned int size) > { >+ if (tcp_sk(sk)->mpc) >+ sk = mptcp_meta_sk(sk); >+ > if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || > !sk_rmem_schedule(sk, skb, size)) { > >@@ -4031,15 +4066,16 @@ > * Better try to coalesce them right now to avoid future collapses. > * Returns true if caller should free @from instead of queueing it > */ >-static bool tcp_try_coalesce(struct sock *sk, >- struct sk_buff *to, >- struct sk_buff *from, >- bool *fragstolen) >+bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, >+ bool *fragstolen) > { > int delta; > > *fragstolen = false; > >+ if (tcp_sk(sk)->mpc && !is_meta_sk(sk)) >+ return false; >+ > if (tcp_hdr(from)->fin) > return false; > >@@ -4128,7 +4164,9 @@ > > /* Do skb overlap to previous one? */ > if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { >- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { >+ /* MPTCP allows non-data data-fin to be in the ofo-queue */ >+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) && >+ !(tp->mpc && end_seq == seq)) { > /* All the bits are present. Drop. */ > NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); > __kfree_skb(skb); >@@ -4166,6 +4204,9 @@ > end_seq); > break; > } >+ /* MPTCP allows non-data data-fin to be in the ofo-queue */ >+ if (tp->mpc && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) >+ continue; > __skb_unlink(skb1, &tp->out_of_order_queue); > tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, > TCP_SKB_CB(skb1)->end_seq); >@@ -4181,8 +4222,8 @@ > skb_set_owner_r(skb, sk); > } > >-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, >- bool *fragstolen) >+int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, >+ bool *fragstolen) > { > int eaten; > struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); >@@ -4244,7 +4285,10 @@ > int eaten = -1; > bool fragstolen = false; > >- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) >+ /* If no data is present, but a data_fin is in the options, we still >+ * have to call mptcp_queue_skb later on. 
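>+ * (At the subflow level such a DATA_FIN travels on a segment with
>+ * seq == end_seq, so the usual empty-segment drop would lose it.)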
*/ >+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && >+ !(tp->mpc && mptcp_is_data_fin(skb))) > goto drop; > > skb_dst_drop(skb); >@@ -4290,7 +4334,7 @@ > eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); > } > tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; >- if (skb->len) >+ if (skb->len || mptcp_is_data_fin(skb)) > tcp_event_data_recv(sk, skb); > if (th->fin) > tcp_fin(sk); >@@ -4312,7 +4356,11 @@ > > if (eaten > 0) > kfree_skb_partial(skb, fragstolen); >- if (!sock_flag(sk, SOCK_DEAD)) >+ if (!sock_flag(sk, SOCK_DEAD) || tp->mpc) >+ /* MPTCP: we always have to call data_ready, because >+ * we may be about to receive a data-fin, which still >+ * must get queued. >+ */ > sk->sk_data_ready(sk, 0); > return; > } >@@ -4386,6 +4434,9 @@ > struct sk_buff *skb, *n; > bool end_of_skbs; > >+ if (tcp_sk(sk)->mpc) >+ return; >+ > /* First, check that queue is collapsible and find > * the point where collapsing can be useful. */ > skb = head; >@@ -4491,7 +4542,7 @@ > struct sk_buff *head; > u32 start, end; > >- if (skb == NULL) >+ if (skb == NULL || tp->mpc) > return; > > start = TCP_SKB_CB(skb)->seq; >@@ -4536,6 +4587,18 @@ > struct tcp_sock *tp = tcp_sk(sk); > bool res = false; > >+ if (is_meta_sk(sk)) { >+ if (!skb_queue_empty(&tp->out_of_order_queue)) { >+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); >+ mptcp_purge_ofo_queue(tp); >+ >+ /* No sack at the mptcp-level */ >+ sk_mem_reclaim(sk); >+ res = 1; >+ } >+ return res; >+ } >+ > if (!skb_queue_empty(&tp->out_of_order_queue)) { > NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); > __skb_queue_purge(&tp->out_of_order_queue); >@@ -4644,9 +4707,46 @@ > return false; > > /* If we filled the congestion window, do not expand. */ >- if (tp->packets_out >= tp->snd_cwnd) >+ if (!tp->mpc && tp->packets_out >= tp->snd_cwnd) > return false; > >+#ifdef CONFIG_MPTCP >+ if (tp->mpc) { >+ struct sock *sk_it; >+ int cnt_backups = 0; >+ int backup_available = 0; >+ >+ /* For MPTCP we look for a subsocket that could send data. >+ * If we found one, then we update the send-buffer. >+ */ >+ mptcp_for_each_sk(tp->mpcb, sk_it) { >+ struct tcp_sock *tp_it = tcp_sk(sk_it); >+ >+ if (!mptcp_sk_can_send(sk_it)) >+ continue; >+ >+ /* Backup-flows have to be counted - if there is no other >+ * subflow we take the backup-flow into account. */ >+ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) { >+ cnt_backups++; >+ } >+ >+ if (tp_it->packets_out < tp_it->snd_cwnd) { >+ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) { >+ backup_available = 1; >+ continue; >+ } >+ return 1; >+ } >+ } >+ >+ /* Backup-flow is available for sending - update send-buffer */ >+ if (tp->mpcb->cnt_established == cnt_backups && backup_available) >+ return 1; >+ return 0; >+ } >+#endif >+ > return true; > } > >@@ -4659,17 +4759,34 @@ > static void tcp_new_space(struct sock *sk) > { > struct tcp_sock *tp = tcp_sk(sk); >+ struct sock *meta_sk = tp->mpc ? 
mptcp_meta_sk(sk) : sk; > >- if (tcp_should_expand_sndbuf(sk)) { >+ if (tcp_should_expand_sndbuf(meta_sk)) { > int sndmem = SKB_TRUESIZE(max_t(u32, > tp->rx_opt.mss_clamp, > tp->mss_cache) + > MAX_TCP_HEADER); >- int demanded = max_t(unsigned int, tp->snd_cwnd, >- tp->reordering + 1); >+ int demanded; >+ >+ if (tp->mpc) >+ demanded = mptcp_check_snd_buf(tp); >+ else >+ demanded = max_t(unsigned int, tp->snd_cwnd, >+ tp->reordering + 1); >+ >+ /* MPTCP: After this, sndmem is the new contribution of the >+ * current subflow to the aggregate sndbuf >+ */ > sndmem *= 2 * demanded; >- if (sndmem > sk->sk_sndbuf) >+ if (sndmem > sk->sk_sndbuf) { >+ int old_sndbuf = sk->sk_sndbuf; > sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); >+ /* MPTCP: ok, the subflow sndbuf has grown, reflect >+ * this in the aggregate buffer. >+ */ >+ if (tp->mpc && old_sndbuf != sk->sk_sndbuf) >+ mptcp_update_sndbuf(tp->mpcb); >+ } > tp->snd_cwnd_stamp = tcp_time_stamp; > } > >@@ -4680,8 +4797,9 @@ > { > if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { > sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); >- if (sk->sk_socket && >- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) >+ if (tcp_sk(sk)->mpc || >+ (sk->sk_socket && >+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))) > tcp_new_space(sk); > } > } >@@ -4806,6 +4924,10 @@ > { > struct tcp_sock *tp = tcp_sk(sk); > >+ /* MPTCP urgent data is not yet supported */ >+ if (tp->mpc) >+ return; >+ > /* Check if we get a new urgent pointer - normally not. */ > if (th->urg) > tcp_check_urg(sk, th); >@@ -4818,8 +4940,10 @@ > /* Is the urgent pointer pointing into this packet? */ > if (ptr < skb->len) { > u8 tmp; >+ > if (skb_copy_bits(skb, ptr, &tmp, 1)) > BUG(); >+ > tp->urg_data = TCP_URG_VALID | tmp; > if (!sock_flag(sk, SOCK_DEAD)) > sk->sk_data_ready(sk, 0); >@@ -4873,8 +4997,7 @@ > } > > #ifdef CONFIG_NET_DMA >-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, >- int hlen) >+bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) > { > struct tcp_sock *tp = tcp_sk(sk); > int chunk = skb->len - hlen; >@@ -4983,9 +5106,15 @@ > goto discard; > } > >+ /* If valid: post process the received MPTCP options. */ >+ if (tp->mpc && mptcp_handle_options(sk, th, skb)) >+ goto discard; >+ > return true; > > discard: >+ if (tp->mpc) >+ mptcp_reset_mopt(tp); > __kfree_skb(skb); > return false; > } >@@ -5037,6 +5166,10 @@ > > tp->rx_opt.saw_tstamp = 0; > >+ /* MPTCP: force slowpath. */ >+ if (tp->mpc) >+ goto slow_path; >+ > /* pred_flags is 0xS?10 << 16 + snd_wnd > * if header_prediction is to be made > * 'S' will always be tp->tcp_header_len >> 2 >@@ -5282,7 +5415,7 @@ > /* Get original SYNACK MSS value if user MSS sets mss_clamp */ > tcp_clear_options(&opt); > opt.user_mss = opt.mss_clamp = 0; >- tcp_parse_options(synack, &opt, 0, NULL); >+ tcp_parse_options(synack, &opt, NULL, 0, NULL); > mss = opt.mss_clamp; > } > >@@ -5317,8 +5450,11 @@ > struct tcp_sock *tp = tcp_sk(sk); > struct tcp_fastopen_cookie foc = { .len = -1 }; > int saved_clamp = tp->rx_opt.mss_clamp; >+ struct mptcp_options_received mopt; >+ mptcp_init_mp_opt(&mopt); > >- tcp_parse_options(skb, &tp->rx_opt, 0, &foc); >+ tcp_parse_options(skb, &tp->rx_opt, >+ tp->mpc ? 
&tp->mptcp->rx_opt : &mopt, 0, &foc); > if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) > tp->rx_opt.rcv_tsecr -= tp->tsoffset; > >@@ -5365,6 +5501,21 @@ > if (!th->syn) > goto discard_and_undo; > >+ if (tp->request_mptcp || tp->mpc) { >+ int ret; >+ ret = mptcp_rcv_synsent_state_process(sk, &sk, >+ skb, &mopt); >+ >+ /* May have changed if we support MPTCP */ >+ tp = tcp_sk(sk); >+ icsk = inet_csk(sk); >+ >+ if (ret == 1) >+ goto reset_and_undo; >+ if (ret == 2) >+ goto discard; >+ } >+ > /* rfc793: > * "If the SYN bit is on ... > * are acceptable then ... >@@ -5377,6 +5528,15 @@ > tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); > tcp_ack(sk, skb, FLAG_SLOWPATH); > >+ if (tp->mpc && !is_master_tp(tp)) { >+ /* Timer for repeating the ACK until an answer >+ * arrives. Used only when establishing an additional >+ * subflow inside of an MPTCP connection. >+ */ >+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, >+ jiffies + icsk->icsk_rto); >+ } >+ > /* Ok.. it's good. Set up sequence numbers and > * move to established. > */ >@@ -5403,6 +5563,11 @@ > tp->tcp_header_len = sizeof(struct tcphdr); > } > >+ if (tp->mpc) { >+ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; >+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; >+ } >+ > if (tcp_is_sack(tp) && sysctl_tcp_fack) > tcp_enable_fack(tp); > >@@ -5423,7 +5588,9 @@ > tcp_rcv_fastopen_synack(sk, skb, &foc)) > return -1; > >- if (sk->sk_write_pending || >+ /* With MPTCP we cannot send data on the third ack due to the >+ * lack of option-space */ >+ if ((sk->sk_write_pending && !tp->mpc) || > icsk->icsk_accept_queue.rskq_defer_accept || > icsk->icsk_ack.pingpong) { > /* Save one ACK. Data will be ready after >@@ -5465,6 +5632,7 @@ > tcp_paws_reject(&tp->rx_opt, 0)) > goto discard_and_undo; > >+ /* TODO - check this here for MPTCP */ > if (th->syn) { > /* We see SYN without ACK. It is attempt of > * simultaneous connect with crossed SYNs. >@@ -5481,6 +5649,11 @@ > tp->tcp_header_len = sizeof(struct tcphdr); > } > >+ if (tp->mpc) { >+ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; >+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; >+ } >+ > tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; > tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; > >@@ -5589,6 +5762,10 @@ > > case TCP_SYN_SENT: > queued = tcp_rcv_synsent_state_process(sk, skb, th, len); >+ if (is_meta_sk(sk)) { >+ sk = tcp_sk(sk)->mpcb->master_sk; >+ tp = tcp_sk(sk); >+ } > if (queued >= 0) > return queued; > >@@ -5596,6 +5773,8 @@ > tcp_urg(sk, skb, th); > __kfree_skb(skb); > tcp_data_snd_check(sk); >+ if (tp->mpc && is_master_tp(tp)) >+ bh_unlock_sock(sk); > return 0; > } > >@@ -5657,6 +5836,8 @@ > > if (tp->rx_opt.tstamp_ok) > tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; >+ if (tp->mpc) >+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; > > if (req) { > /* Re-arm the timer because data may have been sent out. >@@ -5676,6 +5857,14 @@ > > tcp_initialize_rcv_mss(sk); > tcp_fast_path_on(tp); >+ >+ /* Send an ACK when establishing a new >+ * MPTCP subflow, i.e. using an MP_JOIN >+ * subtype. 
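>+ * The new subflow carries no data yet, so without this explicit ACK
>+ * the peer would have nothing that completes its MP_JOIN handshake
>+ * and would keep retransmitting the SYN/ACK.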
>+ */ >+ if (tp->mpc && !is_master_tp(tp)) >+ tcp_send_ack(sk); >+ > break; > > case TCP_FIN_WAIT1: { >@@ -5714,6 +5903,9 @@ > /* Wake up lingering close() */ > sk->sk_state_change(sk); > break; >+ case TCP_CLOSE: >+ if (tp->mp_killed) >+ goto discard; > } > > if (tp->linger2 < 0 || >@@ -5727,7 +5919,8 @@ > tmo = tcp_fin_time(sk); > if (tmo > TCP_TIMEWAIT_LEN) { > inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); >- } else if (th->fin || sock_owned_by_user(sk)) { >+ } else if (th->fin || mptcp_is_data_fin(skb) || >+ sock_owned_by_user(sk)) { > /* Bad case. We could lose such FIN otherwise. > * It is not a big problem, but it looks confusing > * and not so rare event. We still can lose it now, >@@ -5776,7 +5969,10 @@ > */ > if (sk->sk_shutdown & RCV_SHUTDOWN) { > if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && >- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { >+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && >+ !tp->mpc) { >+ /* In case of mptcp, the reset is handled by >+ * mptcp_rcv_state_process */ > NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); > tcp_reset(sk); > return 1; >diff -Naur a/linux-3.11/net/ipv4/tcp_ipv4.c b/linux-3.11/net/ipv4/tcp_ipv4.c >--- a/linux-3.11/net/ipv4/tcp_ipv4.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/tcp_ipv4.c 2013-10-05 18:34:49.078367102 +0200 >@@ -67,6 +67,8 @@ > #include <net/icmp.h> > #include <net/inet_hashtables.h> > #include <net/tcp.h> >+#include <net/mptcp.h> >+#include <net/mptcp_v4.h> > #include <net/transp_v6.h> > #include <net/ipv6.h> > #include <net/inet_common.h> >@@ -99,7 +101,7 @@ > struct inet_hashinfo tcp_hashinfo; > EXPORT_SYMBOL(tcp_hashinfo); > >-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) >+__u32 tcp_v4_init_sequence(const struct sk_buff *skb) > { > return secure_tcp_sequence_number(ip_hdr(skb)->daddr, > ip_hdr(skb)->saddr, >@@ -333,7 +335,7 @@ > struct inet_sock *inet; > const int type = icmp_hdr(icmp_skb)->type; > const int code = icmp_hdr(icmp_skb)->code; >- struct sock *sk; >+ struct sock *sk, *meta_sk; > struct sk_buff *skb; > struct request_sock *req; > __u32 seq; >@@ -357,13 +359,19 @@ > return; > } > >- bh_lock_sock(sk); >+ tp = tcp_sk(sk); >+ if (tp->mpc) >+ meta_sk = mptcp_meta_sk(sk); >+ else >+ meta_sk = sk; >+ >+ bh_lock_sock(meta_sk); > /* If too many ICMPs get dropped on busy > * servers this needs to be solved differently. > * We do take care of PMTU discovery (RFC1191) special case : > * we can receive locally generated ICMP messages while socket is held. 
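>+ * For MPTCP, user-ownership is tracked on the meta-socket, which is
>+ * why every sock_owned_by_user() check below is made against meta_sk.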
> */ >- if (sock_owned_by_user(sk)) { >+ if (sock_owned_by_user(meta_sk)) { > if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) > NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); > } >@@ -376,7 +384,6 @@ > } > > icsk = inet_csk(sk); >- tp = tcp_sk(sk); > req = tp->fastopen_rsk; > seq = ntohl(th->seq); > if (sk->sk_state != TCP_LISTEN && >@@ -410,11 +417,14 @@ > goto out; > > tp->mtu_info = info; >- if (!sock_owned_by_user(sk)) { >+ if (!sock_owned_by_user(meta_sk)) { > tcp_v4_mtu_reduced(sk); > } else { > if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) > sock_hold(sk); >+ if (tp->mpc && >+ !test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tcp_sk(meta_sk)->tsq_flags)) >+ sock_hold(meta_sk); > } > goto out; > } >@@ -430,7 +440,7 @@ > > /* XXX (TFO) - revisit the following logic for TFO */ > >- if (sock_owned_by_user(sk)) >+ if (sock_owned_by_user(meta_sk)) > break; > > icsk->icsk_backoff--; >@@ -472,7 +482,7 @@ > switch (sk->sk_state) { > struct request_sock *req, **prev; > case TCP_LISTEN: >- if (sock_owned_by_user(sk)) >+ if (sock_owned_by_user(meta_sk)) > goto out; > > req = inet_csk_search_req(sk, &prev, th->dest, >@@ -505,7 +515,7 @@ > It can f.e. if SYNs crossed, > or Fast Open. > */ >- if (!sock_owned_by_user(sk)) { >+ if (!sock_owned_by_user(meta_sk)) { > sk->sk_err = err; > > sk->sk_error_report(sk); >@@ -534,7 +544,7 @@ > */ > > inet = inet_sk(sk); >- if (!sock_owned_by_user(sk) && inet->recverr) { >+ if (!sock_owned_by_user(meta_sk) && inet->recverr) { > sk->sk_err = err; > sk->sk_error_report(sk); > } else { /* Only an error on timeout */ >@@ -542,7 +552,7 @@ > } > > out: >- bh_unlock_sock(sk); >+ bh_unlock_sock(meta_sk); > sock_put(sk); > } > >@@ -584,7 +594,7 @@ > * Exception: precedence violation. We do not implement it in any case. > */ > >-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) >+void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) > { > const struct tcphdr *th = tcp_hdr(skb); > struct { >@@ -708,10 +718,10 @@ > outside socket context is ugly, certainly. What can I do? > */ > >-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, >+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, > u32 win, u32 tsval, u32 tsecr, int oif, > struct tcp_md5sig_key *key, >- int reply_flags, u8 tos) >+ int reply_flags, u8 tos, int mptcp) > { > const struct tcphdr *th = tcp_hdr(skb); > struct { >@@ -720,6 +730,10 @@ > #ifdef CONFIG_TCP_MD5SIG > + (TCPOLEN_MD5SIG_ALIGNED >> 2) > #endif >+#ifdef CONFIG_MPTCP >+ + ((MPTCP_SUB_LEN_DSS >> 2) + >+ (MPTCP_SUB_LEN_ACK >> 2)) >+#endif > ]; > } rep; > struct ip_reply_arg arg; >@@ -764,6 +778,21 @@ > ip_hdr(skb)->daddr, &rep.th); > } > #endif >+#ifdef CONFIG_MPTCP >+ if (mptcp) { >+ int offset = (tsecr) ? 
3 : 0; >+ /* Construction of 32-bit data_ack */ >+ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) | >+ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | >+ (0x20 << 8) | >+ (0x01)); >+ rep.opt[offset] = htonl(data_ack); >+ >+ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; >+ rep.th.doff = arg.iov[0].iov_len / 4; >+ } >+#endif /* CONFIG_MPTCP */ >+ > arg.flags = reply_flags; > arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, > ip_hdr(skb)->saddr, /* XXX */ >@@ -782,36 +811,44 @@ > { > struct inet_timewait_sock *tw = inet_twsk(sk); > struct tcp_timewait_sock *tcptw = tcp_twsk(sk); >+ u32 data_ack = 0; >+ int mptcp = 0; >+ >+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) { >+ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; >+ mptcp = 1; >+ } > > tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, >+ data_ack, > tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, > tcp_time_stamp + tcptw->tw_ts_offset, > tcptw->tw_ts_recent, > tw->tw_bound_dev_if, > tcp_twsk_md5_key(tcptw), > tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, >- tw->tw_tos >+ tw->tw_tos, mptcp > ); > > inet_twsk_put(tw); > } > >-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, >- struct request_sock *req) >+void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req) > { > /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV > * sk->sk_state == TCP_SYN_RECV -> for Fast Open. > */ > tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? > tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, >- tcp_rsk(req)->rcv_nxt, req->rcv_wnd, >+ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd, > tcp_time_stamp, > req->ts_recent, > 0, > tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, > AF_INET), > inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, >- ip_hdr(skb)->tos); >+ ip_hdr(skb)->tos, 0); > } > > /* >@@ -819,10 +856,9 @@ > * This still operates on a request_sock only, not on a big > * socket. > */ >-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, >- struct request_sock *req, >- u16 queue_mapping, >- bool nocache) >+int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, >+ struct request_sock *req, u16 queue_mapping, >+ bool nocache) > { > const struct inet_request_sock *ireq = inet_rsk(req); > struct flowi4 fl4; >@@ -850,7 +886,7 @@ > return err; > } > >-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) >+int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) > { > int res = tcp_v4_send_synack(sk, NULL, req, 0, false); > >@@ -862,7 +898,7 @@ > /* > * IPv4 request_sock destructor. > */ >-static void tcp_v4_reqsk_destructor(struct request_sock *req) >+void tcp_v4_reqsk_destructor(struct request_sock *req) > { > kfree(inet_rsk(req)->opt); > } >@@ -902,7 +938,7 @@ > /* > * Save and compile IPv4 options into the request_sock if needed. 
> */ >-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) >+struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) > { > const struct ip_options *opt = &(IPCB(skb)->opt); > struct ip_options_rcu *dopt = NULL; >@@ -1440,6 +1476,7 @@ > int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) > { > struct tcp_options_received tmp_opt; >+ struct mptcp_options_received mopt; > struct request_sock *req; > struct inet_request_sock *ireq; > struct tcp_sock *tp = tcp_sk(sk); >@@ -1454,6 +1491,22 @@ > struct sk_buff *skb_synack; > int do_fastopen; > >+ tcp_clear_options(&tmp_opt); >+ tmp_opt.mss_clamp = TCP_MSS_DEFAULT; >+ tmp_opt.user_mss = tp->rx_opt.user_mss; >+ mptcp_init_mp_opt(&mopt); >+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, want_cookie ? NULL : &foc); >+ >+#ifdef CONFIG_MPTCP >+ /* MPTCP structures not initialized, so clear MPTCP fields */ >+ if (mptcp_init_failed) >+ mptcp_init_mp_opt(&mopt); >+ >+ if (mopt.is_mp_join) >+ return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk)); >+ if (mopt.drop_me) >+ goto drop; >+#endif > /* Never answer to SYNs send to broadcast or multicast */ > if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) > goto drop; >@@ -1478,7 +1531,20 @@ > goto drop; > } > >- req = inet_reqsk_alloc(&tcp_request_sock_ops); >+#ifdef CONFIG_MPTCP >+ if (mopt.saw_mpc) { >+ req = inet_reqsk_alloc(&mptcp_request_sock_ops); >+ >+ if (!req) >+ goto drop; >+ >+ mptcp_rsk(req)->mpcb = NULL; >+ mptcp_rsk(req)->dss_csum = mopt.dss_csum; >+ mptcp_rsk(req)->collide_tk.pprev = NULL; >+ } else >+#endif >+ req = inet_reqsk_alloc(&tcp_request_sock_ops); >+ > if (!req) > goto drop; > >@@ -1486,17 +1552,15 @@ > tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; > #endif > >- tcp_clear_options(&tmp_opt); >- tmp_opt.mss_clamp = TCP_MSS_DEFAULT; >- tmp_opt.user_mss = tp->rx_opt.user_mss; >- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); >- > if (want_cookie && !tmp_opt.saw_tstamp) > tcp_clear_options(&tmp_opt); > > tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; > tcp_openreq_init(req, &tmp_opt, skb); > >+ if (mopt.saw_mpc) >+ mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb); >+ > ireq = inet_rsk(req); > ireq->loc_addr = daddr; > ireq->rmt_addr = saddr; >@@ -1711,7 +1775,7 @@ > } > EXPORT_SYMBOL(tcp_v4_syn_recv_sock); > >-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) >+struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) > { > struct tcphdr *th = tcp_hdr(skb); > const struct iphdr *iph = ip_hdr(skb); >@@ -1728,8 +1792,15 @@ > > if (nsk) { > if (nsk->sk_state != TCP_TIME_WAIT) { >+ /* Don't lock again the meta-sk. It has been locked >+ * before mptcp_v4_do_rcv. 
>+ */ >+ if (tcp_sk(nsk)->mpc && !is_meta_sk(sk)) >+ bh_lock_sock(mptcp_meta_sk(nsk)); > bh_lock_sock(nsk); >+ > return nsk; >+ > } > inet_twsk_put(inet_twsk(nsk)); > return NULL; >@@ -1786,6 +1857,9 @@ > goto discard; > #endif > >+ if (is_meta_sk(sk)) >+ return mptcp_v4_do_rcv(sk, skb); >+ > if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ > struct dst_entry *dst = sk->sk_rx_dst; > >@@ -1920,7 +1994,7 @@ > } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { > wake_up_interruptible_sync_poll(sk_sleep(sk), > POLLIN | POLLRDNORM | POLLRDBAND); >- if (!inet_csk_ack_scheduled(sk)) >+ if (!inet_csk_ack_scheduled(sk) && !tp->mpc) > inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, > (3 * tcp_rto_min(sk)) / 4, > TCP_RTO_MAX); >@@ -1937,7 +2011,7 @@ > { > const struct iphdr *iph; > const struct tcphdr *th; >- struct sock *sk; >+ struct sock *sk, *meta_sk = NULL; > int ret; > struct net *net = dev_net(skb->dev); > >@@ -1970,18 +2044,42 @@ > TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + > skb->len - th->doff * 4); > TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); >+#ifdef CONFIG_MPTCP >+ TCP_SKB_CB(skb)->mptcp_flags = 0; >+ TCP_SKB_CB(skb)->dss_off = 0; >+#endif > TCP_SKB_CB(skb)->when = 0; > TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); > TCP_SKB_CB(skb)->sacked = 0; > > sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); >- if (!sk) >- goto no_tcp_socket; > > process: >- if (sk->sk_state == TCP_TIME_WAIT) >+ if (sk && sk->sk_state == TCP_TIME_WAIT) > goto do_time_wait; > >+#ifdef CONFIG_MPTCP >+ if (!sk && th->syn && !th->ack) { >+ int ret = mptcp_lookup_join(skb, NULL); >+ >+ if (ret < 0) { >+ tcp_v4_send_reset(NULL, skb); >+ goto discard_it; >+ } else if (ret > 0) { >+ return 0; >+ } >+ } >+ >+ /* Is there a pending request sock for this segment ? 
*/ >+ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) { >+ if (sk) >+ sock_put(sk); >+ return 0; >+ } >+#endif >+ if (!sk) >+ goto no_tcp_socket; >+ > if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { > NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); > goto discard_and_relse; >@@ -1997,11 +2095,20 @@ > sk_mark_napi_id(sk, skb); > skb->dev = NULL; > >- bh_lock_sock_nested(sk); >+ if (tcp_sk(sk)->mpc) { >+ meta_sk = mptcp_meta_sk(sk); >+ >+ bh_lock_sock_nested(meta_sk); >+ skb->sk = sk; >+ } else { >+ meta_sk = sk; >+ bh_lock_sock_nested(sk); >+ } >+ > ret = 0; >- if (!sock_owned_by_user(sk)) { >+ if (!sock_owned_by_user(meta_sk)) { > #ifdef CONFIG_NET_DMA >- struct tcp_sock *tp = tcp_sk(sk); >+ struct tcp_sock *tp = tcp_sk(meta_sk); > if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) > tp->ucopy.dma_chan = net_dma_find_channel(); > if (tp->ucopy.dma_chan) >@@ -2009,16 +2116,16 @@ > else > #endif > { >- if (!tcp_prequeue(sk, skb)) >+ if (!tcp_prequeue(meta_sk, skb)) > ret = tcp_v4_do_rcv(sk, skb); > } >- } else if (unlikely(sk_add_backlog(sk, skb, >- sk->sk_rcvbuf + sk->sk_sndbuf))) { >- bh_unlock_sock(sk); >+ } else if (unlikely(sk_add_backlog(meta_sk, skb, >+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { >+ bh_unlock_sock(meta_sk); > NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); > goto discard_and_relse; > } >- bh_unlock_sock(sk); >+ bh_unlock_sock(meta_sk); > > sock_put(sk); > >@@ -2073,6 +2180,18 @@ > sk = sk2; > goto process; > } >+#ifdef CONFIG_MPTCP >+ if (th->syn && !th->ack) { >+ int ret = mptcp_lookup_join(skb, inet_twsk(sk)); >+ >+ if (ret < 0) { >+ tcp_v4_send_reset(NULL, skb); >+ goto discard_it; >+ } else if (ret > 0) { >+ return 0; >+ } >+ } >+#endif > /* Fall through to ACK */ > } > case TCP_TW_ACK: >@@ -2155,6 +2274,11 @@ > > tcp_cleanup_congestion_control(sk); > >+ if (tp->mpc) >+ mptcp_destroy_sock(sk); >+ if (tp->inside_tk_table) >+ mptcp_hash_remove(tp); >+ > /* Cleanup up the write buffer. 
*/ > tcp_write_queue_purge(sk); > >diff -Naur a/linux-3.11/net/ipv4/tcp_minisocks.c b/linux-3.11/net/ipv4/tcp_minisocks.c >--- a/linux-3.11/net/ipv4/tcp_minisocks.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/tcp_minisocks.c 2013-10-05 18:53:38.836164123 +0200 >@@ -18,11 +18,13 @@ > * Jorge Cwik, <jorge@laser.satlink.net> > */ > >+#include <linux/kconfig.h> > #include <linux/mm.h> > #include <linux/module.h> > #include <linux/slab.h> > #include <linux/sysctl.h> > #include <linux/workqueue.h> >+#include <net/mptcp.h> > #include <net/tcp.h> > #include <net/inet_common.h> > #include <net/xfrm.h> >@@ -95,10 +97,13 @@ > struct tcp_options_received tmp_opt; > struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); > bool paws_reject = false; >+ struct mptcp_options_received mopt; > > tmp_opt.saw_tstamp = 0; > if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { >- tcp_parse_options(skb, &tmp_opt, 0, NULL); >+ mptcp_init_mp_opt(&mopt); >+ >+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); > > if (tmp_opt.saw_tstamp) { > tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; >@@ -106,6 +111,11 @@ > tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; > paws_reject = tcp_paws_reject(&tmp_opt, th->rst); > } >+ >+ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) { >+ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key) >+ goto kill_with_rst; >+ } > } > > if (tw->tw_substate == TCP_FIN_WAIT2) { >@@ -128,6 +138,16 @@ > if (!th->ack || > !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || > TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { >+ /* If mptcp_is_data_fin() returns true, we are sure that >+ * mopt has been initialized - otherwise it would not >+ * be a DATA_FIN. >+ */ >+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw && >+ mptcp_is_data_fin(skb) && >+ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && >+ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt) >+ return TCP_TW_ACK; >+ > inet_twsk_put(tw); > return TCP_TW_SUCCESS; > } >@@ -159,6 +179,7 @@ > else > inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, > TCP_TIMEWAIT_LEN); >+ > return TCP_TW_ACK; > } > >@@ -270,6 +291,11 @@ > const struct tcp_sock *tp = tcp_sk(sk); > bool recycle_ok = false; > >+ if (is_meta_sk(sk)) { >+ mptcp_update_tw_socks(tp, state); >+ goto tcp_done; >+ } >+ > if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) > recycle_ok = tcp_remember_stamp(sk); > >@@ -290,6 +316,15 @@ > tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; > tcptw->tw_ts_offset = tp->tsoffset; > >+ if (tp->mpc) { >+ if (mptcp_time_wait(sk, tcptw)) { >+ inet_twsk_free(tw); >+ goto exit; >+ } >+ } else { >+ tcptw->mptcp_tw = NULL; >+ } >+ > #if IS_ENABLED(CONFIG_IPV6) > if (tw->tw_family == PF_INET6) { > struct ipv6_pinfo *np = inet6_sk(sk); >@@ -349,15 +384,18 @@ > NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); > } > >+exit: > tcp_update_metrics(sk); >+tcp_done: > tcp_done(sk); > } > > void tcp_twsk_destructor(struct sock *sk) > { >-#ifdef CONFIG_TCP_MD5SIG > struct tcp_timewait_sock *twsk = tcp_twsk(sk); >- >+ if (twsk->mptcp_tw) >+ mptcp_twsk_destructor(twsk); >+#ifdef CONFIG_TCP_MD5SIG > if (twsk->tw_md5_key) > kfree_rcu(twsk->tw_md5_key, rcu); > #endif >@@ -394,6 +432,9 @@ > > newtp->snd_sml = newtp->snd_una = > newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; >+#ifdef CONFIG_MPTCP >+ memset(&newtp->rcvq_space, 0, sizeof(newtp->rcvq_space)); >+#endif > > tcp_prequeue_init(newtp); > INIT_LIST_HEAD(&newtp->tsq_node); >@@ -468,6 +509,8 @@ > newtp->rx_opt.ts_recent_stamp = 0; > 
newtp->tcp_header_len = sizeof(struct tcphdr); > } >+ if (treq->saw_mpc) >+ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; > newtp->tsoffset = 0; > #ifdef CONFIG_TCP_MD5SIG > newtp->md5sig_info = NULL; /*XXX*/ >@@ -504,16 +547,20 @@ > bool fastopen) > { > struct tcp_options_received tmp_opt; >+ struct mptcp_options_received mopt; > struct sock *child; > const struct tcphdr *th = tcp_hdr(skb); > __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); > bool paws_reject = false; > >- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); >+ BUG_ON(!tcp_sk(sk)->mpc && fastopen == (sk->sk_state == TCP_LISTEN)); > > tmp_opt.saw_tstamp = 0; >+ >+ mptcp_init_mp_opt(&mopt); >+ > if (th->doff > (sizeof(struct tcphdr)>>2)) { >- tcp_parse_options(skb, &tmp_opt, 0, NULL); >+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); > > if (tmp_opt.saw_tstamp) { > tmp_opt.ts_recent = req->ts_recent; >@@ -552,7 +599,14 @@ > * > * Reset timer after retransmitting SYNACK, similar to > * the idea of fast retransmit in recovery. >+ * >+ * Fall back to TCP if MP_CAPABLE is not set. > */ >+ >+ if (tcp_rsk(req)->saw_mpc && !mopt.saw_mpc) >+ tcp_rsk(req)->saw_mpc = false; >+ >+ > if (!inet_rtx_syn_ack(sk, req)) > req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, > TCP_RTO_MAX) + jiffies; >@@ -680,7 +734,20 @@ > > /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ > if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && >- TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { >+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1 && >+ /* TODO MPTCP: >+ * We do this here, because otherwise options sent in the third ack, >+ * or duplicate fourth ack will get lost. Options like MP_PRIO, ADD_ADDR,... >+ * >+ * We could store them in request_sock, but this would mean that we >+ * have to put tcp_options_received and mptcp_options_received in there, >+ * increasing considerably the size of the request-sock. >+ * >+ * As soon as we have reworked the request-sock MPTCP-fields and >+ * created an mptcp_request_sock structure, we can handle options >+ * correctly there without increasing request_sock. >+ */ >+ !tcp_rsk(req)->saw_mpc) { > inet_rsk(req)->acked = 1; > NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); > return NULL; >@@ -692,10 +759,29 @@ > * ESTABLISHED STATE. If it will be dropped after > * socket is created, wait for troubles. > */ >- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); >+#if defined(CONFIG_MPTCP) >+ if (tcp_sk(sk)->mpc) >+ /* MPTCP: We call the mptcp-specific syn_recv_sock */ >+ child = tcp_sk(sk)->mpcb->syn_recv_sock(sk, skb, req, NULL); >+ else >+#endif >+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, >+ req, NULL); >+ > if (child == NULL) > goto listen_overflow; > >+ if (!is_meta_sk(sk)) { >+ int ret = mptcp_check_req_master(sk, child, req, prev, &mopt); >+ if (ret < 0) >+ goto listen_overflow; >+ >+ /* MPTCP-supported */ >+ if (!ret) >+ return tcp_sk(child)->mpcb->master_sk; >+ } else { >+ return mptcp_check_req_child(sk, child, req, prev, &mopt); >+ } > inet_csk_reqsk_queue_unlink(sk, req, prev); > inet_csk_reqsk_queue_removed(sk, req); > >@@ -745,8 +831,9 @@ > { > int ret = 0; > int state = child->sk_state; >+ struct sock *meta_sk = tcp_sk(child)->mpc ?
mptcp_meta_sk(child) : child; > >- if (!sock_owned_by_user(child)) { >+ if (!sock_owned_by_user(meta_sk)) { > ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), > skb->len); > /* Wakeup parent, send SIGIO */ >@@ -757,10 +844,14 @@ > * in main socket hash table and lock on listening > * socket does not protect us more. > */ >- __sk_add_backlog(child, skb); >+ if (tcp_sk(child)->mpc) >+ skb->sk = child; >+ __sk_add_backlog(meta_sk, skb); > } > >- bh_unlock_sock(child); >+ if (tcp_sk(child)->mpc) >+ bh_unlock_sock(child); >+ bh_unlock_sock(meta_sk); > sock_put(child); > return ret; > } >diff -Naur a/linux-3.11/net/ipv4/tcp_output.c b/linux-3.11/net/ipv4/tcp_output.c >--- a/linux-3.11/net/ipv4/tcp_output.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/tcp_output.c 2013-10-05 18:34:49.084367027 +0200 >@@ -36,6 +36,8 @@ > > #define pr_fmt(fmt) "TCP: " fmt > >+#include <net/mptcp.h> >+#include <net/ipv6.h> > #include <net/tcp.h> > > #include <linux/compiler.h> >@@ -69,7 +71,7 @@ > int push_one, gfp_t gfp); > > /* Account for new data that has been sent to the network. */ >-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) >+void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) > { > struct inet_connection_sock *icsk = inet_csk(sk); > struct tcp_sock *tp = tcp_sk(sk); >@@ -208,9 +210,14 @@ > void tcp_select_initial_window(int __space, __u32 mss, > __u32 *rcv_wnd, __u32 *window_clamp, > int wscale_ok, __u8 *rcv_wscale, >- __u32 init_rcv_wnd) >+ __u32 init_rcv_wnd, const struct sock *sk) > { >- unsigned int space = (__space < 0 ? 0 : __space); >+ unsigned int space; >+ >+ if (tcp_sk(sk)->mpc) >+ mptcp_select_initial_window(&__space, window_clamp, sk); >+ >+ space = (__space < 0 ? 0 : __space); > > /* If no clamp set the clamp to the max possible scaled window */ > if (*window_clamp == 0) >@@ -266,7 +273,11 @@ > static u16 tcp_select_window(struct sock *sk) > { > struct tcp_sock *tp = tcp_sk(sk); >- u32 cur_win = tcp_receive_window(tp); >+ /* The window must never shrink at the meta-level. At the subflow we >+ * have to allow this. Otherwise we may announce a window too large >+ * for the current meta-level sk_rcvbuf. >+ */ >+ u32 cur_win = tcp_receive_window(tp->mpc ? tcp_sk(mptcp_meta_sk(sk)) : tp); > u32 new_win = __tcp_select_window(sk); > > /* Never shrink the offered window */ >@@ -280,6 +291,12 @@ > */ > new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); > } >+ >+ if (tp->mpc) { >+ mptcp_meta_tp(tp)->rcv_wnd = new_win; >+ mptcp_meta_tp(tp)->rcv_wup = mptcp_meta_tp(tp)->rcv_nxt; >+ } >+ > tp->rcv_wnd = new_win; > tp->rcv_wup = tp->rcv_nxt; > >@@ -358,7 +375,7 @@ > /* Constructs common control bits of non-data skb. If SYN/FIN is present, > * auto increment end seqno. 
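The tcp_select_window() change above reads the currently offered window from the meta-level receive state and mirrors the result back into it, so the data-level window never shrinks even when an individual subflow could tolerate a smaller one. A userspace sketch of that rule, not from the patch; field names mimic the kernel's and the helpers are illustrative:

#include <stdio.h>
#include <stdint.h>

struct rcv_state { uint32_t rcv_wnd, rcv_wup, rcv_nxt; };

/* Models tcp_receive_window(): what we previously advertised and have left. */
static uint32_t receive_window(const struct rcv_state *s)
{
        int64_t win = (int64_t)s->rcv_wup + s->rcv_wnd - s->rcv_nxt;

        return win < 0 ? 0 : (uint32_t)win;
}

static uint16_t select_window(struct rcv_state *meta, struct rcv_state *sub,
                              uint32_t new_win, unsigned int wscale)
{
        uint32_t cur_win = receive_window(meta);   /* meta level, not subflow */

        if (new_win < cur_win)  /* never shrink the offered window */
                new_win = (cur_win + (1u << wscale) - 1) & ~((1u << wscale) - 1);

        meta->rcv_wnd = sub->rcv_wnd = new_win;    /* mirror into both levels */
        meta->rcv_wup = meta->rcv_nxt;
        sub->rcv_wup = sub->rcv_nxt;
        return (uint16_t)(new_win >> wscale);
}

int main(void)
{
        struct rcv_state meta = { 65535, 1000, 2000 }, sub = meta;

        /* A smaller candidate window is rounded back up, never shrunk. */
        printf("advertise %u (scaled)\n", select_window(&meta, &sub, 32768, 7));
        return 0;
}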
> */ >-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) >+void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) > { > skb->ip_summed = CHECKSUM_PARTIAL; > skb->csum = 0; >@@ -376,7 +393,7 @@ > TCP_SKB_CB(skb)->end_seq = seq; > } > >-static inline bool tcp_urg_mode(const struct tcp_sock *tp) >+bool tcp_urg_mode(const struct tcp_sock *tp) > { > return tp->snd_una != tp->snd_up; > } >@@ -386,17 +403,7 @@ > #define OPTION_MD5 (1 << 2) > #define OPTION_WSCALE (1 << 3) > #define OPTION_FAST_OPEN_COOKIE (1 << 8) >- >-struct tcp_out_options { >- u16 options; /* bit field of OPTION_* */ >- u16 mss; /* 0 to disable */ >- u8 ws; /* window scale, 0 to disable */ >- u8 num_sack_blocks; /* number of SACK blocks to include */ >- u8 hash_size; /* bytes in hash_location */ >- __u8 *hash_location; /* temporary pointer, overloaded */ >- __u32 tsval, tsecr; /* need to include OPTION_TS */ >- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ >-}; >+/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */ > > /* Write previously computed TCP options to the packet. > * >@@ -412,7 +419,7 @@ > * (but it may well be that other scenarios fail similarly). > */ > static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, >- struct tcp_out_options *opts) >+ struct tcp_out_options *opts, struct sk_buff *skb) > { > u16 options = opts->options; /* mungable copy */ > >@@ -495,6 +502,9 @@ > } > ptr += (foc->len + 3) >> 2; > } >+ >+ if (unlikely(OPTION_MPTCP & opts->options)) >+ mptcp_options_write(ptr, tp, opts, skb); > } > > /* Compute TCP options for SYN packets. This is not the final >@@ -546,6 +556,8 @@ > if (unlikely(!(OPTION_TS & opts->options))) > remaining -= TCPOLEN_SACKPERM_ALIGNED; > } >+ if (tp->request_mptcp || tp->mpc) >+ mptcp_syn_options(sk, opts, &remaining); > > if (fastopen && fastopen->cookie.len >= 0) { > u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; >@@ -619,6 +631,9 @@ > } > } > >+ if (tcp_rsk(req)->saw_mpc) >+ mptcp_synack_options(req, opts, &remaining); >+ > return MAX_TCP_OPTION_SPACE - remaining; > } > >@@ -650,16 +665,22 @@ > opts->tsecr = tp->rx_opt.ts_recent; > size += TCPOLEN_TSTAMP_ALIGNED; > } >+ if (tp->mpc) >+ mptcp_established_options(sk, skb, opts, &size); > > eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; > if (unlikely(eff_sacks)) { >- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; >- opts->num_sack_blocks = >- min_t(unsigned int, eff_sacks, >- (remaining - TCPOLEN_SACK_BASE_ALIGNED) / >- TCPOLEN_SACK_PERBLOCK); >- size += TCPOLEN_SACK_BASE_ALIGNED + >- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; >+ const unsigned remaining = MAX_TCP_OPTION_SPACE - size; >+ if (remaining < TCPOLEN_SACK_BASE_ALIGNED) >+ opts->num_sack_blocks = 0; >+ else >+ opts->num_sack_blocks = >+ min_t(unsigned int, eff_sacks, >+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) / >+ TCPOLEN_SACK_PERBLOCK); >+ if (opts->num_sack_blocks) >+ size += TCPOLEN_SACK_BASE_ALIGNED + >+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; > } > > return size; >@@ -736,6 +757,32 @@ > (1UL << TCP_WRITE_TIMER_DEFERRED) | \ > (1UL << TCP_DELACK_TIMER_DEFERRED) | \ > (1UL << TCP_MTU_REDUCED_DEFERRED)) >+ >+static void mptcp_release_cb(struct sock *meta_sk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sock *sk; >+ unsigned long flags, nflags; >+ >+ /* perform an atomic operation only if at least one flag is set */ >+ do { >+ flags = meta_tp->tsq_flags; >+ if (!(flags & TCP_DEFERRED_ALL)) >+ return; >+ nflags = flags 
& ~TCP_DEFERRED_ALL; >+ } while (cmpxchg(&meta_tp->tsq_flags, flags, nflags) != flags); >+ >+ if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) >+ __sock_put(meta_sk); >+ if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) >+ __sock_put(meta_sk); >+ if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) >+ __sock_put(meta_sk); >+ >+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) >+ sk->sk_prot->release_cb(sk); >+} >+ > /** > * tcp_release_cb - tcp release_sock() callback > * @sk: socket >@@ -748,6 +795,11 @@ > struct tcp_sock *tp = tcp_sk(sk); > unsigned long flags, nflags; > >+ if (is_meta_sk(sk)) { >+ mptcp_release_cb(sk); >+ return; >+ } >+ > /* perform an atomic operation only if at least one flag is set */ > do { > flags = tp->tsq_flags; >@@ -830,8 +882,8 @@ > * We are working here with either a clone of the original > * SKB, or a fresh unique copy made by the retransmit engine. > */ >-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, >- gfp_t gfp_mask) >+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, >+ gfp_t gfp_mask) > { > const struct inet_connection_sock *icsk = inet_csk(sk); > struct inet_sock *inet; >@@ -851,6 +903,8 @@ > if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) > __net_timestamp(skb); > >+ tp = tcp_sk(sk); >+ > if (likely(clone_it)) { > const struct sk_buff *fclone = skb + 1; > >@@ -859,16 +913,33 @@ > NET_INC_STATS_BH(sock_net(sk), > LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); > >- if (unlikely(skb_cloned(skb))) >- skb = pskb_copy(skb, gfp_mask); >- else >+ if (unlikely(skb_cloned(skb))) { >+ struct sk_buff *newskb; >+ if (mptcp_is_data_seq(skb)) >+ skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN); >+ >+ newskb = pskb_copy(skb, gfp_mask); >+ >+ if (mptcp_is_data_seq(skb)) { >+ skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN); >+ if (newskb) >+ skb_pull(newskb, MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN); >+ } >+ skb = newskb; >+ } else { > skb = skb_clone(skb, gfp_mask); >+ } > if (unlikely(!skb)) > return -ENOBUFS; > } > > inet = inet_sk(sk); >- tp = tcp_sk(sk); > tcb = TCP_SKB_CB(skb); > memset(&opts, 0, sizeof(opts)); > >@@ -927,7 +998,7 @@ > } > } > >- tcp_options_write((__be32 *)(th + 1), tp, &opts); >+ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); > if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) > TCP_ECN_send(sk, skb, tcp_header_size); > >@@ -966,7 +1037,7 @@ > * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, > * otherwise socket can stall. > */ >-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) >+void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) > { > struct tcp_sock *tp = tcp_sk(sk); > >@@ -979,11 +1050,11 @@ > } > > /* Initialize TSO segments for a packet. */ >-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, >- unsigned int mss_now) >+void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, >+ unsigned int mss_now) > { >- if (skb->len <= mss_now || !sk_can_gso(sk) || >- skb->ip_summed == CHECKSUM_NONE) { >+ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) || >+ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) { > /* Avoid the costly divide in the normal > * non-TSO case. 
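mptcp_release_cb() above claims every deferred-work bit in one cmpxchg loop before acting on them, drops one socket reference per claimed flag, and then forwards release_cb to each subflow. The claim-and-clear pattern can be modeled in userspace with C11 atomics; the bit values here are stand-ins for the kernel's TCP_*_DEFERRED enum, not its actual numbers:

#include <stdio.h>
#include <stdatomic.h>

#define WRITE_TIMER_DEFERRED    (1UL << 0)   /* stand-ins for the kernel's */
#define DELACK_TIMER_DEFERRED   (1UL << 1)   /* TCP_*_DEFERRED bit numbers */
#define MTU_REDUCED_DEFERRED    (1UL << 2)
#define DEFERRED_ALL (WRITE_TIMER_DEFERRED | DELACK_TIMER_DEFERRED | \
                      MTU_REDUCED_DEFERRED)

/* Atomically take all pending deferred-work bits; the caller owns whatever
 * comes back and must drop the references the flag setters took. */
static unsigned long claim_deferred(atomic_ulong *tsq_flags)
{
        unsigned long flags = atomic_load(tsq_flags);

        do {
                if (!(flags & DEFERRED_ALL))
                        return 0;
                /* On failure, compare_exchange reloads 'flags' with the
                 * current value, so the loop re-evaluates automatically. */
        } while (!atomic_compare_exchange_weak(tsq_flags, &flags,
                                               flags & ~DEFERRED_ALL));
        return flags & DEFERRED_ALL;
}

int main(void)
{
        atomic_ulong tsq = DELACK_TIMER_DEFERRED | MTU_REDUCED_DEFERRED;

        printf("claimed %#lx, remaining %#lx\n",
               claim_deferred(&tsq), atomic_load(&tsq));
        return 0;
}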
> */ >@@ -1015,7 +1086,7 @@ > /* Pcount in the middle of the write queue got changed, we need to do various > * tweaks to fix counters > */ >-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) >+void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) > { > struct tcp_sock *tp = tcp_sk(sk); > >@@ -1056,6 +1127,9 @@ > int nlen; > u8 flags; > >+ if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb)) >+ mptcp_fragment(sk, skb, len, mss_now, 0); >+ > if (WARN_ON(len > skb->len)) > return -EINVAL; > >@@ -1140,7 +1214,7 @@ > * eventually). The difference is that pulled data not copied, but > * immediately discarded. > */ >-static void __pskb_trim_head(struct sk_buff *skb, int len) >+void __pskb_trim_head(struct sk_buff *skb, int len) > { > int i, k, eat; > >@@ -1179,6 +1253,9 @@ > /* Remove acked data from a packet in the transmit queue. */ > int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) > { >+ if (tcp_sk(sk)->mpc && !is_meta_sk(sk) && mptcp_is_data_seq(skb)) >+ return mptcp_trim_head(sk, skb, len); >+ > if (skb_unclone(skb, GFP_ATOMIC)) > return -ENOMEM; > >@@ -1196,6 +1273,15 @@ > if (tcp_skb_pcount(skb) > 1) > tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); > >+#ifdef CONFIG_MPTCP >+ /* Some data got acked - we assume that the seq-number reached the dest. >+ * Anyway, our MPTCP-option has been trimmed above - we lost it here. >+ * Only remove the SEQ if the call does not come from a meta retransmit. >+ */ >+ if (tcp_sk(sk)->mpc && !is_meta_sk(sk)) >+ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ; >+#endif >+ > return 0; > } > >@@ -1355,7 +1441,7 @@ > } > > /* Congestion window validation. (RFC2861) */ >-static void tcp_cwnd_validate(struct sock *sk) >+void tcp_cwnd_validate(struct sock *sk) > { > struct tcp_sock *tp = tcp_sk(sk); > >@@ -1386,16 +1472,25 @@ > * modulo only when the receiver window alone is the limiting factor or > * when we would be allowed to send the split-due-to-Nagle skb fully. > */ >-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, >- unsigned int mss_now, unsigned int max_segs) >+unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, >+ unsigned int mss_now, unsigned int max_segs) > { > const struct tcp_sock *tp = tcp_sk(sk); >+ const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; > u32 needed, window, max_len; > >- window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; >+ if (!tp->mpc) >+ window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; >+ else >+ /* We need to evaluate the available space in the sending window >+ * at the subflow level. However, the subflow seq has not yet >+ * been set. Nevertheless we know that the caller will set it to >+ * write_seq. >+ */ >+ window = tcp_wnd_end(tp) - tp->write_seq; > max_len = mss_now * max_segs; > >- if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) >+ if (likely(max_len <= window && skb != tcp_write_queue_tail(meta_sk))) > return max_len; > > needed = min(skb->len, window); >@@ -1409,13 +1504,14 @@ > /* Can at least one segment of SKB be sent right now, according to the > * congestion window rules? If so, return how many segments are allowed. > */ >-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, >- const struct sk_buff *skb) >+unsigned int tcp_cwnd_test(const struct tcp_sock *tp, >+ const struct sk_buff *skb) > { > u32 in_flight, cwnd; > > /* Don't be strict about the congestion window for the final FIN. 
*/ >- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && >+ if (skb && >+ ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) && > tcp_skb_pcount(skb) == 1) > return 1; > >@@ -1431,8 +1527,8 @@ > * This must be invoked the first time we consider transmitting > * SKB onto the wire. > */ >-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, >- unsigned int mss_now) >+int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, >+ unsigned int mss_now) > { > int tso_segs = tcp_skb_pcount(skb); > >@@ -1469,8 +1565,8 @@ > /* Return true if the Nagle test allows this packet to be > * sent now. > */ >-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, >- unsigned int cur_mss, int nonagle) >+bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, >+ unsigned int cur_mss, int nonagle) > { > /* Nagle rule does not apply to frames, which sit in the middle of the > * write_queue (they have no chances to get new data). >@@ -1482,7 +1578,8 @@ > return true; > > /* Don't use the nagle rule for urgent data (or for the final FIN). */ >- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) >+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || >+ mptcp_is_data_fin(skb)) > return true; > > if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) >@@ -1492,9 +1589,8 @@ > } > > /* Does at least the first segment of SKB fit into the send window? */ >-static bool tcp_snd_wnd_test(const struct tcp_sock *tp, >- const struct sk_buff *skb, >- unsigned int cur_mss) >+bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, >+ unsigned int cur_mss) > { > u32 end_seq = TCP_SKB_CB(skb)->end_seq; > >@@ -1552,6 +1648,9 @@ > int nlen = skb->len - len; > u8 flags; > >+ if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb)) >+ mptso_fragment(sk, skb, len, mss_now, gfp, 0); >+ > /* All of a TSO frame must be composed of paged data. */ > if (skb->len != skb->data_len) > return tcp_fragment(sk, skb, len, mss_now); >@@ -1597,29 +1696,39 @@ > * > * This algorithm is from John Heffner. > */ >-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) >+bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) > { > struct tcp_sock *tp = tcp_sk(sk); >+ struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); > const struct inet_connection_sock *icsk = inet_csk(sk); > u32 send_win, cong_win, limit, in_flight; > int win_divisor; > >- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) >+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) > goto send_now; > > if (icsk->icsk_ca_state != TCP_CA_Open) > goto send_now; > > /* Defer for less than two clock ticks. */ >- if (tp->tso_deferred && >- (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) >+ if (meta_tp->tso_deferred && >+ (((u32)jiffies << 1) >> 1) - (meta_tp->tso_deferred >> 1) > 1) > goto send_now; > > in_flight = tcp_packets_in_flight(tp); > > BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); > >- send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; >+ if (!tp->mpc) >+ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; >+ else >+ /* We need to evaluate the available space in the sending window >+ * at the subflow level. However, the subflow seq has not yet >+ * been set. Nevertheless we know that the caller will set it to >+ * write_seq. 
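tcp_tso_should_defer() above moves its tso_deferred bookkeeping from the subflow to the meta socket but keeps the same encoding: bit 0 flags an active deferral and the remaining bits hold a jiffies stamp, compared with a shifted subtraction so a stored value is never mistaken for "unset". A sketch of that encoding:

#include <stdio.h>
#include <stdint.h>

static uint32_t tso_defer_mark(uint32_t jiffies)
{
        return 1u | (jiffies << 1);     /* never 0, so "set" stays testable */
}

/* Defer for at most about two clock ticks, as the kernel comment says. */
static int tso_defer_expired(uint32_t tso_deferred, uint32_t jiffies)
{
        return tso_deferred &&
               ((jiffies << 1) >> 1) - (tso_deferred >> 1) > 1;
}

int main(void)
{
        uint32_t j = 1000;
        uint32_t mark = tso_defer_mark(j);

        printf("expired after 1 tick: %d, after 2 ticks: %d\n",
               tso_defer_expired(mark, j + 1), tso_defer_expired(mark, j + 2));
        return 0;
}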
>+ */ >+ send_win = tcp_wnd_end(tp) - tp->write_seq; > > /* From in_flight test above, we know that cwnd > in_flight. */ > cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; >@@ -1632,7 +1741,7 @@ > goto send_now; > > /* Middle in queue won't get any more data, full sendable already? */ >- if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) >+ if ((skb != tcp_write_queue_tail(meta_sk)) && (limit >= skb->len)) > goto send_now; > > win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); >@@ -1658,13 +1767,13 @@ > /* Ok, it looks like it is advisable to defer. > * Do not rearm the timer if already set to not break TCP ACK clocking. > */ >- if (!tp->tso_deferred) >- tp->tso_deferred = 1 | (jiffies << 1); >+ if (!meta_tp->tso_deferred) >+ meta_tp->tso_deferred = 1 | (jiffies << 1); > > return true; > > send_now: >- tp->tso_deferred = 0; >+ meta_tp->tso_deferred = 0; > return false; > } > >@@ -1677,7 +1786,7 @@ > * 1 if a probe was sent, > * -1 otherwise > */ >-static int tcp_mtu_probe(struct sock *sk) >+int tcp_mtu_probe(struct sock *sk) > { > struct tcp_sock *tp = tcp_sk(sk); > struct inet_connection_sock *icsk = inet_csk(sk); >@@ -1822,6 +1931,9 @@ > int cwnd_quota; > int result; > >+ if (is_meta_sk(sk)) >+ return mptcp_write_xmit(sk, mss_now, nonagle, push_one, gfp); >+ > sent_pkts = 0; > > if (!push_one) { >@@ -2128,6 +2240,9 @@ > int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); > int window; > >+ if (tp->mpc) >+ return __mptcp_select_window(sk); >+ > if (mss > full_space) > mss = full_space; > >@@ -2258,6 +2373,10 @@ > if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) > return; > >+ /* Currently not supported for MPTCP - but it should be possible */ >+ if (tp->mpc) >+ return; >+ > tcp_for_write_queue_from_safe(skb, tmp, sk) { > if (!tcp_can_collapse(sk, skb)) > break; >@@ -2367,8 +2486,24 @@ > */ > if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || > skb_headroom(skb) >= 0xFFFF)) { >- struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, >- GFP_ATOMIC); >+ struct sk_buff *nskb; >+ >+ if (mptcp_is_data_seq(skb)) >+ skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN); >+ >+ nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); >+ >+ if (mptcp_is_data_seq(skb)) { >+ skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN); >+ if (nskb) >+ skb_pull(nskb, MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN); >+ } > return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : > -ENOBUFS; > } else { >@@ -2593,6 +2728,11 @@ > { > struct sk_buff *skb; > >+ if (is_meta_sk(sk)) { >+ mptcp_send_active_reset(sk, priority); >+ return; >+ } >+ > /* NOTE: No TCP options attached and we never retransmit this. */ > skb = alloc_skb(MAX_TCP_HEADER, priority); > if (!skb) { >@@ -2695,14 +2835,14 @@ > (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) > req->window_clamp = tcp_full_space(sk); > >- /* tcp_full_space because it is guaranteed to be the first packet */ > tcp_select_initial_window(tcp_full_space(sk), >- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), >+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) - >+ (tcp_rsk(req)->saw_mpc ? 
MPTCP_SUB_LEN_DSM_ALIGN : 0), > &req->rcv_wnd, > &req->window_clamp, > ireq->wscale_ok, > &rcv_wscale, >- dst_metric(dst, RTAX_INITRWND)); >+ dst_metric(dst, RTAX_INITRWND), sk); > ireq->rcv_wscale = rcv_wscale; > } > >@@ -2738,7 +2878,7 @@ > > /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ > th->window = htons(min(req->rcv_wnd, 65535U)); >- tcp_options_write((__be32 *)(th + 1), tp, &opts); >+ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); > th->doff = (tcp_header_size >> 2); > TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); > >@@ -2798,7 +2938,7 @@ > &tp->window_clamp, > sysctl_tcp_window_scaling, > &rcv_wscale, >- dst_metric(dst, RTAX_INITRWND)); >+ dst_metric(dst, RTAX_INITRWND), sk); > > tp->rx_opt.rcv_wscale = rcv_wscale; > tp->rcv_ssthresh = tp->rcv_wnd; >@@ -2822,6 +2962,18 @@ > inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; > inet_csk(sk)->icsk_retransmits = 0; > tcp_clear_retrans(tp); >+ >+#ifdef CONFIG_MPTCP >+ if (mptcp_doit(sk)) { >+ if (is_master_tp(tp)) { >+ tp->request_mptcp = 1; >+ mptcp_connect_init(sk); >+ } else { >+ tp->mptcp->snt_isn = tp->write_seq; >+ tp->mptcp->init_rcv_wnd = tp->rcv_wnd; >+ } >+ } >+#endif > } > > static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) >@@ -3044,6 +3196,13 @@ > */ > buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); > if (buff == NULL) { >+ >+ /* MPTCP: We don't send a delayed ack if we are sending an mptcp >+ * ADD_ADDR ack to avoid sending multiple ADD_ADDR acks for the >+ * same address. */ >+ if (tcp_sk(sk)->mptcp_add_addr_ack == 1) >+ return; >+ > inet_csk_schedule_ack(sk); > inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; > inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, >@@ -3071,7 +3230,7 @@ > * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is > * out-of-date with SND.UNA-1 to probe window. > */ >-static int tcp_xmit_probe_skb(struct sock *sk, int urgent) >+int tcp_xmit_probe_skb(struct sock *sk, int urgent) > { > struct tcp_sock *tp = tcp_sk(sk); > struct sk_buff *skb; >@@ -3110,6 +3269,9 @@ > if (sk->sk_state == TCP_CLOSE) > return -1; > >+ if (is_meta_sk(sk)) >+ return mptcp_write_wakeup(sk); >+ > if ((skb = tcp_send_head(sk)) != NULL && > before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { > int err; >diff -Naur a/linux-3.11/net/ipv4/tcp_timer.c b/linux-3.11/net/ipv4/tcp_timer.c >--- a/linux-3.11/net/ipv4/tcp_timer.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv4/tcp_timer.c 2013-10-05 18:34:49.085367014 +0200 >@@ -20,6 +20,7 @@ > > #include <linux/module.h> > #include <linux/gfp.h> >+#include <net/mptcp.h> > #include <net/tcp.h> > > int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; >@@ -32,7 +33,7 @@ > int sysctl_tcp_orphan_retries __read_mostly; > int sysctl_tcp_thin_linear_timeouts __read_mostly; > >-static void tcp_write_err(struct sock *sk) >+void tcp_write_err(struct sock *sk) > { > sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; > sk->sk_error_report(sk); >@@ -124,10 +125,8 @@ > * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if > * syn_set flag is set. > */ >-static bool retransmits_timed_out(struct sock *sk, >- unsigned int boundary, >- unsigned int timeout, >- bool syn_set) >+bool retransmits_timed_out(struct sock *sk, unsigned int boundary, >+ unsigned int timeout, bool syn_set) > { > unsigned int linear_backoff_thresh, start_ts; > unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; >@@ -153,7 +152,7 @@ > } > > /* A write timeout has occurred. Process the after effects. 
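retransmits_timed_out(), whose declaration the hunk above un-statics, compares elapsed time against a budget built from exponentially doubling RTOs capped at TCP_RTO_MAX. The sketch below reconstructs that budget from the upstream 3.11 helper; treat it as an illustration, with units simplified to whole seconds:

#include <stdio.h>
#include <stdint.h>

#define TCP_RTO_MAX 120u                /* seconds here, for readability */

static unsigned int ilog2_u32(uint32_t v)
{
        unsigned int n = 0;

        while (v >>= 1)
                n++;
        return n;
}

/* Time budget for 'boundary' retransmissions at initial RTO rto_base:
 * RTOs double until they would exceed TCP_RTO_MAX, then stay capped. */
static uint32_t retrans_budget(unsigned int boundary, uint32_t rto_base)
{
        unsigned int thresh = ilog2_u32(TCP_RTO_MAX / rto_base);

        if (boundary <= thresh)
                return ((2u << boundary) - 1) * rto_base;
        return ((2u << thresh) - 1) * rto_base +
               (boundary - thresh) * TCP_RTO_MAX;
}

int main(void)
{
        printf("%u s until timeout\n", retrans_budget(15, 1));
        return 0;
}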
*/ >-static int tcp_write_timeout(struct sock *sk) >+int tcp_write_timeout(struct sock *sk) > { > struct inet_connection_sock *icsk = inet_csk(sk); > int retry_until; >@@ -164,6 +163,10 @@ > dst_negative_advice(sk); > retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; > syn_set = true; >+ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */ >+ if (tcp_sk(sk)->request_mptcp && >+ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries()) >+ tcp_sk(sk)->request_mptcp = 0; > } else { > if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { > /* Black hole detection */ >@@ -244,18 +247,22 @@ > static void tcp_delack_timer(unsigned long data) > { > struct sock *sk = (struct sock *)data; >+ struct sock *meta_sk = tcp_sk(sk)->mpc ? mptcp_meta_sk(sk) : sk; > >- bh_lock_sock(sk); >- if (!sock_owned_by_user(sk)) { >+ bh_lock_sock(meta_sk); >+ if (!sock_owned_by_user(meta_sk)) { > tcp_delack_timer_handler(sk); > } else { > inet_csk(sk)->icsk_ack.blocked = 1; >- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED); > /* deleguate our work to tcp_release_cb() */ > if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) > sock_hold(sk); >+ if (tcp_sk(sk)->mpc && >+ !test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(meta_sk)->tsq_flags)) >+ sock_hold(meta_sk); > } >- bh_unlock_sock(sk); >+ bh_unlock_sock(meta_sk); > sock_put(sk); > } > >@@ -418,6 +425,9 @@ > > tcp_enter_loss(sk, 0); > >+ if (tp->mpc) >+ mptcp_reinject_data(sk, 1); >+ > if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { > /* Retransmission failed because of local congestion, > * do not backoff. >@@ -468,6 +478,7 @@ > /* Use normal (exponential) backoff */ > icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); > } >+ mptcp_set_rto(sk); > inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); > if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) > __sk_dst_reset(sk); >@@ -499,7 +510,10 @@ > break; > case ICSK_TIME_RETRANS: > icsk->icsk_pending = 0; >- tcp_retransmit_timer(sk); >+ if (is_meta_sk(sk)) >+ mptcp_retransmit_timer(sk); >+ else >+ tcp_retransmit_timer(sk); > break; > case ICSK_TIME_PROBE0: > icsk->icsk_pending = 0; >@@ -514,16 +528,20 @@ > static void tcp_write_timer(unsigned long data) > { > struct sock *sk = (struct sock *)data; >+ struct sock *meta_sk = tcp_sk(sk)->mpc ? mptcp_meta_sk(sk) : sk; > >- bh_lock_sock(sk); >- if (!sock_owned_by_user(sk)) { >+ bh_lock_sock(meta_sk); >+ if (!sock_owned_by_user(meta_sk)) { > tcp_write_timer_handler(sk); > } else { > /* deleguate our work to tcp_release_cb() */ > if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) > sock_hold(sk); >+ if (tcp_sk(sk)->mpc && >+ !test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(meta_sk)->tsq_flags)) >+ sock_hold(meta_sk); > } >- bh_unlock_sock(sk); >+ bh_unlock_sock(meta_sk); > sock_put(sk); > } > >@@ -548,6 +566,11 @@ > if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > return; > >+ if (is_meta_sk(sk)) { >+ mptcp_set_keepalive(sk, val); >+ return; >+ } >+ > if (val && !sock_flag(sk, SOCK_KEEPOPEN)) > inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); > else if (!val) >@@ -560,21 +583,17 @@ > struct sock *sk = (struct sock *) data; > struct inet_connection_sock *icsk = inet_csk(sk); > struct tcp_sock *tp = tcp_sk(sk); >+ struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; > u32 elapsed; > > /* Only process if socket is not in use. 
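The tcp_write_timeout() hunk above adds the MPTCP fallback on SYN timeout: once the SYN carrying MP_CAPABLE has been retransmitted mptcp_sysctl_syn_retries() times without an answer, request_mptcp is cleared and every later SYN goes out as plain TCP. A toy model of that decision; the retry limit here merely stands in for the sysctl:

#include <stdio.h>

int main(void)
{
        int request_mptcp = 1;
        const int mptcp_syn_retries = 3;        /* stands in for the sysctl */

        for (int retransmits = 0; retransmits < 6; retransmits++) {
                if (request_mptcp && retransmits >= mptcp_syn_retries)
                        request_mptcp = 0;      /* next SYN is plain TCP */
                printf("SYN #%d: %s\n", retransmits + 1,
                       request_mptcp ? "with MP_CAPABLE" : "plain TCP");
        }
        return 0;
}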
*/ >- bh_lock_sock(sk); >- if (sock_owned_by_user(sk)) { >+ bh_lock_sock(meta_sk); >+ if (sock_owned_by_user(meta_sk)) { > /* Try again later. */ > inet_csk_reset_keepalive_timer (sk, HZ/20); > goto out; > } > >- if (sk->sk_state == TCP_LISTEN) { >- tcp_synack_timer(sk); >- goto out; >- } >- > if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { > if (tp->linger2 >= 0) { > const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; >@@ -588,7 +607,13 @@ > goto death; > } > >- if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) >+ if (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) { >+ tcp_synack_timer(sk); >+ goto out; >+ } >+ >+ /* MPTCP: Keepalive timers are handled at the subflow level */ >+ if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE || is_meta_sk(sk)) > goto out; > > elapsed = keepalive_time_when(tp); >@@ -636,7 +661,7 @@ > tcp_done(sk); > > out: >- bh_unlock_sock(sk); >+ bh_unlock_sock(meta_sk); > sock_put(sk); > } > >diff -Naur a/linux-3.11/net/ipv6/af_inet6.c b/linux-3.11/net/ipv6/af_inet6.c >--- a/linux-3.11/net/ipv6/af_inet6.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv6/af_inet6.c 2013-10-05 18:34:49.259364827 +0200 >@@ -96,8 +96,7 @@ > return (struct ipv6_pinfo *)(((u8 *)sk) + offset); > } > >-static int inet6_create(struct net *net, struct socket *sock, int protocol, >- int kern) >+int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) > { > struct inet_sock *inet; > struct ipv6_pinfo *np; >diff -Naur a/linux-3.11/net/ipv6/inet6_connection_sock.c b/linux-3.11/net/ipv6/inet6_connection_sock.c >--- a/linux-3.11/net/ipv6/inet6_connection_sock.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv6/inet6_connection_sock.c 2013-10-05 18:34:49.259364827 +0200 >@@ -96,8 +96,8 @@ > /* > * request_sock (formerly open request) hash tables. 
> */ >-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, >- const u32 rnd, const u32 synq_hsize) >+u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, >+ const u32 rnd, const u32 synq_hsize) > { > u32 c; > >diff -Naur a/linux-3.11/net/ipv6/syncookies.c b/linux-3.11/net/ipv6/syncookies.c >--- a/linux-3.11/net/ipv6/syncookies.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv6/syncookies.c 2013-10-05 18:34:49.260364814 +0200 >@@ -176,7 +176,7 @@ > > /* check for timestamp cookie support */ > memset(&tcp_opt, 0, sizeof(tcp_opt)); >- tcp_parse_options(skb, &tcp_opt, 0, NULL); >+ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL); > > if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) > goto out; >@@ -252,7 +252,7 @@ > tcp_select_initial_window(tcp_full_space(sk), req->mss, > &req->rcv_wnd, &req->window_clamp, > ireq->wscale_ok, &rcv_wscale, >- dst_metric(dst, RTAX_INITRWND)); >+ dst_metric(dst, RTAX_INITRWND), sk); > > ireq->rcv_wscale = rcv_wscale; > >diff -Naur a/linux-3.11/net/ipv6/tcp_ipv6.c b/linux-3.11/net/ipv6/tcp_ipv6.c >--- a/linux-3.11/net/ipv6/tcp_ipv6.c 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/ipv6/tcp_ipv6.c 2013-10-05 18:54:36.672437023 +0200 >@@ -64,6 +64,9 @@ > #include <net/secure_seq.h> > #include <net/tcp_memcontrol.h> > #include <net/busy_poll.h> >+#include <net/mptcp.h> >+#include <net/mptcp_v6.h> >+ > > #include <asm/uaccess.h> > >@@ -73,14 +76,6 @@ > #include <linux/crypto.h> > #include <linux/scatterlist.h> > >-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); >-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, >- struct request_sock *req); >- >-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); >- >-static const struct inet_connection_sock_af_ops ipv6_mapped; >-static const struct inet_connection_sock_af_ops ipv6_specific; > #ifdef CONFIG_TCP_MD5SIG > static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; > static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; >@@ -92,7 +87,7 @@ > } > #endif > >-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) >+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) > { > struct dst_entry *dst = skb_dst(skb); > const struct rt6_info *rt = (const struct rt6_info *)dst; >@@ -104,7 +99,7 @@ > inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; > } > >-static void tcp_v6_hash(struct sock *sk) >+void tcp_v6_hash(struct sock *sk) > { > if (sk->sk_state != TCP_CLOSE) { > if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { >@@ -117,7 +112,7 @@ > } > } > >-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) >+__u32 tcp_v6_init_sequence(const struct sk_buff *skb) > { > return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, > ipv6_hdr(skb)->saddr.s6_addr32, >@@ -125,7 +120,7 @@ > tcp_hdr(skb)->source); > } > >-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, >+int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, > int addr_len) > { > struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; >@@ -340,7 +335,7 @@ > const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data; > const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); > struct ipv6_pinfo *np; >- struct sock *sk; >+ struct sock *sk, *meta_sk; > int err; > struct tcp_sock *tp; > __u32 seq; >@@ -360,8 +355,14 @@ > return; > } > >- bh_lock_sock(sk); >- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) >+ tp = tcp_sk(sk); >+ if 
(tp->mpc) >+ meta_sk = mptcp_meta_sk(sk); >+ else >+ meta_sk = sk; >+ >+ bh_lock_sock(meta_sk); >+ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG) > NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); > > if (sk->sk_state == TCP_CLOSE) >@@ -372,7 +373,6 @@ > goto out; > } > >- tp = tcp_sk(sk); > seq = ntohl(th->seq); > if (sk->sk_state != TCP_LISTEN && > !between(seq, tp->snd_una, tp->snd_nxt)) { >@@ -399,11 +399,17 @@ > goto out; > > tp->mtu_info = ntohl(info); >- if (!sock_owned_by_user(sk)) >+ if (!sock_owned_by_user(meta_sk)) > tcp_v6_mtu_reduced(sk); >- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, >+ else { >+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, > &tp->tsq_flags)) >- sock_hold(sk); >+ sock_hold(sk); >+ if (tp->mpc && >+ !test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, >+ &tcp_sk(meta_sk)->tsq_flags)) >+ sock_hold(meta_sk); >+ } > goto out; > } > >@@ -413,7 +419,7 @@ > switch (sk->sk_state) { > struct request_sock *req, **prev; > case TCP_LISTEN: >- if (sock_owned_by_user(sk)) >+ if (sock_owned_by_user(meta_sk)) > goto out; > > req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, >@@ -438,7 +444,7 @@ > case TCP_SYN_SENT: > case TCP_SYN_RECV: /* Cannot happen. > It can, it SYNs are crossed. --ANK */ >- if (!sock_owned_by_user(sk)) { >+ if (!sock_owned_by_user(meta_sk)) { > sk->sk_err = err; > sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ > >@@ -448,22 +454,22 @@ > goto out; > } > >- if (!sock_owned_by_user(sk) && np->recverr) { >+ if (!sock_owned_by_user(meta_sk) && np->recverr) { > sk->sk_err = err; > sk->sk_error_report(sk); > } else > sk->sk_err_soft = err; > > out: >- bh_unlock_sock(sk); >+ bh_unlock_sock(meta_sk); > sock_put(sk); > } > > >-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, >- struct flowi6 *fl6, >- struct request_sock *req, >- u16 queue_mapping) >+int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, >+ struct flowi6 *fl6, >+ struct request_sock *req, >+ u16 queue_mapping) > { > struct inet6_request_sock *treq = inet6_rsk(req); > struct ipv6_pinfo *np = inet6_sk(sk); >@@ -489,7 +495,7 @@ > return err; > } > >-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) >+int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) > { > struct flowi6 fl6; > int res; >@@ -500,7 +506,7 @@ > return res; > } > >-static void tcp_v6_reqsk_destructor(struct request_sock *req) >+void tcp_v6_reqsk_destructor(struct request_sock *req) > { > kfree_skb(inet6_rsk(req)->pktopts); > } >@@ -719,9 +725,9 @@ > }; > #endif > >-static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, >- u32 tsval, u32 tsecr, >- struct tcp_md5sig_key *key, int rst, u8 tclass) >+static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, >+ u32 data_ack, u32 win, u32 tsval, u32 tsecr, >+ struct tcp_md5sig_key *key, int rst, u8 tclass, int mptcp) > { > const struct tcphdr *th = tcp_hdr(skb); > struct tcphdr *t1; >@@ -739,7 +745,10 @@ > if (key) > tot_len += TCPOLEN_MD5SIG_ALIGNED; > #endif >- >+#ifdef CONFIG_MPTCP >+ if (mptcp) >+ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; >+#endif > buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, > GFP_ATOMIC); > if (buff == NULL) >@@ -777,6 +786,17 @@ > tcp_v6_md5_hash_hdr((__u8 *)topt, key, > &ipv6_hdr(skb)->saddr, > &ipv6_hdr(skb)->daddr, t1); >+ topt += 4; >+ } >+#endif >+#ifdef CONFIG_MPTCP >+ if (mptcp) { >+ /* Construction of 32-bit data_ack */ >+ *topt++ = htonl((TCPOPT_MPTCP << 24) | 
>+ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | >+ (0x20 << 8) | >+ (0x01)); >+ *topt++ = htonl(data_ack); > } > #endif > >@@ -813,7 +833,7 @@ > kfree_skb(buff); > } > >-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) >+void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) > { > const struct tcphdr *th = tcp_hdr(skb); > u32 seq = 0, ack_seq = 0; >@@ -868,7 +888,7 @@ > ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - > (th->doff << 2); > >- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, 0); >+ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, key, 1, 0, 0); > > #ifdef CONFIG_TCP_MD5SIG > release_sk1: >@@ -879,37 +899,44 @@ > #endif > } > >-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, >+static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, > u32 win, u32 tsval, u32 tsecr, >- struct tcp_md5sig_key *key, u8 tclass) >+ struct tcp_md5sig_key *key, u8 tclass, int mptcp) > { >- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, key, 0, tclass); >+ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, key, 0, tclass, mptcp); > } > > static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) > { > struct inet_timewait_sock *tw = inet_twsk(sk); > struct tcp_timewait_sock *tcptw = tcp_twsk(sk); >+ u32 data_ack = 0; >+ int mptcp = 0; > >+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) { >+ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; >+ mptcp = 1; >+ } > tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, >+ data_ack, > tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, > tcp_time_stamp + tcptw->tw_ts_offset, > tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw), >- tw->tw_tclass); >+ tw->tw_tclass, mptcp); > > inet_twsk_put(tw); > } > >-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, >- struct request_sock *req) >+void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req) > { > tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, >- req->rcv_wnd, tcp_time_stamp, req->ts_recent, >- tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0); >+ 0, req->rcv_wnd, tcp_time_stamp, req->ts_recent, >+ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); > } > > >-static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) >+struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) > { > struct request_sock *req, **prev; > const struct tcphdr *th = tcp_hdr(skb); >@@ -928,7 +955,13 @@ > > if (nsk) { > if (nsk->sk_state != TCP_TIME_WAIT) { >+ /* Don't lock again the meta-sk. It has been locked >+ * before mptcp_v6_do_rcv. 
>+ */ >+ if (tcp_sk(nsk)->mpc && !is_meta_sk(sk)) >+ bh_lock_sock(mptcp_meta_sk(nsk)); > bh_lock_sock(nsk); >+ > return nsk; > } > inet_twsk_put(inet_twsk(nsk)); >@@ -948,6 +981,7 @@ > static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) > { > struct tcp_options_received tmp_opt; >+ struct mptcp_options_received mopt; > struct request_sock *req; > struct inet6_request_sock *treq; > struct ipv6_pinfo *np = inet6_sk(sk); >@@ -960,6 +994,23 @@ > if (skb->protocol == htons(ETH_P_IP)) > return tcp_v4_conn_request(sk, skb); > >+ tcp_clear_options(&tmp_opt); >+ tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); >+ tmp_opt.user_mss = tp->rx_opt.user_mss; >+ mptcp_init_mp_opt(&mopt); >+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); >+ >+#ifdef CONFIG_MPTCP >+ /*MPTCP structures not initialized, so return error */ >+ if (mptcp_init_failed) >+ mptcp_init_mp_opt(&mopt); >+ >+ if (mopt.is_mp_join) >+ return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk)); >+ if (mopt.drop_me) >+ goto drop; >+#endif >+ > if (!ipv6_unicast_destination(skb)) > goto drop; > >@@ -974,7 +1025,20 @@ > goto drop; > } > >- req = inet6_reqsk_alloc(&tcp6_request_sock_ops); >+#ifdef CONFIG_MPTCP >+ if (mopt.saw_mpc) { >+ req = inet6_reqsk_alloc(&mptcp6_request_sock_ops); >+ >+ if (req == NULL) >+ goto drop; >+ >+ mptcp_rsk(req)->mpcb = NULL; >+ mptcp_rsk(req)->dss_csum = mopt.dss_csum; >+ mptcp_rsk(req)->collide_tk.pprev = NULL; >+ } else >+#endif >+ req = inet6_reqsk_alloc(&tcp6_request_sock_ops); >+ > if (req == NULL) > goto drop; > >@@ -982,17 +1046,15 @@ > tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; > #endif > >- tcp_clear_options(&tmp_opt); >- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); >- tmp_opt.user_mss = tp->rx_opt.user_mss; >- tcp_parse_options(skb, &tmp_opt, 0, NULL); >- > if (want_cookie && !tmp_opt.saw_tstamp) > tcp_clear_options(&tmp_opt); > > tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; > tcp_openreq_init(req, &tmp_opt, skb); > >+ if (mopt.saw_mpc) >+ mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb); >+ > treq = inet6_rsk(req); > treq->rmt_addr = ipv6_hdr(skb)->saddr; > treq->loc_addr = ipv6_hdr(skb)->daddr; >@@ -1081,9 +1143,9 @@ > return 0; /* don't send reset */ > } > >-static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, >- struct request_sock *req, >- struct dst_entry *dst) >+struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req, >+ struct dst_entry *dst) > { > struct inet6_request_sock *treq; > struct ipv6_pinfo *newnp, *np = inet6_sk(sk); >@@ -1303,7 +1365,7 @@ > * This is because we cannot sleep with the original spinlock > * held. 
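Both tcp_v4_conn_request() and the tcp_v6_conn_request() hunk above now parse TCP and MPTCP options before allocating the request sock, because the MP_CAPABLE result decides whether to allocate from mptcp(6)_request_sock_ops or the plain TCP class. A loose sketch of that flow; struct names and sizes are illustrative only, not the kernel's:

#include <stdio.h>
#include <stdlib.h>

struct reqsk_ops { const char *name; size_t obj_size; };

/* Sizes are made up; the real classes are defined elsewhere in the patch. */
static const struct reqsk_ops tcp_ops   = { "tcp_request_sock",   64 };
static const struct reqsk_ops mptcp_ops = { "mptcp_request_sock", 96 };

static void *reqsk_alloc(const struct reqsk_ops *ops)
{
        printf("allocating %zu bytes for %s\n", ops->obj_size, ops->name);
        return calloc(1, ops->obj_size);
}

int main(void)
{
        int saw_mpc = 1;        /* outcome of parsing the SYN's options */
        void *req = reqsk_alloc(saw_mpc ? &mptcp_ops : &tcp_ops);

        free(req);
        return 0;
}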
> */ >-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) >+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) > { > struct ipv6_pinfo *np = inet6_sk(sk); > struct tcp_sock *tp; >@@ -1325,6 +1387,9 @@ > goto discard; > #endif > >+ if (is_meta_sk(sk)) >+ return mptcp_v6_do_rcv(sk, skb); >+ > if (sk_filter(sk, skb)) > goto discard; > >@@ -1445,7 +1510,7 @@ > { > const struct tcphdr *th; > const struct ipv6hdr *hdr; >- struct sock *sk; >+ struct sock *sk, *meta_sk = NULL; > int ret; > struct net *net = dev_net(skb->dev); > >@@ -1476,18 +1541,43 @@ > TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + > skb->len - th->doff*4); > TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); >+#ifdef CONFIG_MPTCP >+ TCP_SKB_CB(skb)->mptcp_flags = 0; >+ TCP_SKB_CB(skb)->dss_off = 0; >+#endif > TCP_SKB_CB(skb)->when = 0; > TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); > TCP_SKB_CB(skb)->sacked = 0; > > sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); >- if (!sk) >- goto no_tcp_socket; > > process: >- if (sk->sk_state == TCP_TIME_WAIT) >+ if (sk && sk->sk_state == TCP_TIME_WAIT) > goto do_time_wait; > >+#ifdef CONFIG_MPTCP >+ if (!sk && th->syn && !th->ack) { >+ int ret = mptcp_lookup_join(skb, NULL); >+ >+ if (ret < 0) { >+ tcp_v6_send_reset(NULL, skb); >+ goto discard_it; >+ } else if (ret > 0) { >+ return 0; >+ } >+ } >+ >+ /* Is there a pending request sock for this segment ? */ >+ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) { >+ if (sk) >+ sock_put(sk); >+ return 0; >+ } >+#endif >+ >+ if (!sk) >+ goto no_tcp_socket; >+ > if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { > NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); > goto discard_and_relse; >@@ -1502,11 +1592,20 @@ > sk_mark_napi_id(sk, skb); > skb->dev = NULL; > >- bh_lock_sock_nested(sk); >+ if (tcp_sk(sk)->mpc) { >+ meta_sk = mptcp_meta_sk(sk); >+ >+ bh_lock_sock_nested(meta_sk); >+ skb->sk = sk; >+ } else { >+ meta_sk = sk; >+ bh_lock_sock_nested(sk); >+ } >+ > ret = 0; >- if (!sock_owned_by_user(sk)) { >+ if (!sock_owned_by_user(meta_sk)) { > #ifdef CONFIG_NET_DMA >- struct tcp_sock *tp = tcp_sk(sk); >+ struct tcp_sock *tp = tcp_sk(meta_sk); > if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) > tp->ucopy.dma_chan = net_dma_find_channel(); > if (tp->ucopy.dma_chan) >@@ -1514,16 +1613,17 @@ > else > #endif > { >- if (!tcp_prequeue(sk, skb)) >+ if (!tcp_prequeue(meta_sk, skb)) > ret = tcp_v6_do_rcv(sk, skb); > } >- } else if (unlikely(sk_add_backlog(sk, skb, >- sk->sk_rcvbuf + sk->sk_sndbuf))) { >- bh_unlock_sock(sk); >+ } else if (unlikely(sk_add_backlog(meta_sk, skb, >+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { >+ bh_unlock_sock(meta_sk); > NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); > goto discard_and_relse; > } >- bh_unlock_sock(sk); >+ >+ bh_unlock_sock(meta_sk); > > sock_put(sk); > return ret ? 
-1 : 0; >@@ -1580,6 +1680,18 @@ > sk = sk2; > goto process; > } >+#ifdef CONFIG_MPTCP >+ if (th->syn && !th->ack) { >+ int ret = mptcp_lookup_join(skb, inet_twsk(sk)); >+ >+ if (ret < 0) { >+ tcp_v6_send_reset(NULL, skb); >+ goto discard_it; >+ } else if (ret > 0) { >+ return 0; >+ } >+ } >+#endif > /* Fall through to ACK */ > } > case TCP_TW_ACK: >@@ -1629,13 +1741,13 @@ > } > } > >-static struct timewait_sock_ops tcp6_timewait_sock_ops = { >+struct timewait_sock_ops tcp6_timewait_sock_ops = { > .twsk_obj_size = sizeof(struct tcp6_timewait_sock), > .twsk_unique = tcp_twsk_unique, > .twsk_destructor= tcp_twsk_destructor, > }; > >-static const struct inet_connection_sock_af_ops ipv6_specific = { >+const struct inet_connection_sock_af_ops ipv6_specific = { > .queue_xmit = inet6_csk_xmit, > .send_check = tcp_v6_send_check, > .rebuild_header = inet6_sk_rebuild_header, >@@ -1667,7 +1779,7 @@ > * TCP over IPv4 via INET6 API > */ > >-static const struct inet_connection_sock_af_ops ipv6_mapped = { >+const struct inet_connection_sock_af_ops ipv6_mapped = { > .queue_xmit = ip_queue_xmit, > .send_check = tcp_v4_send_check, > .rebuild_header = inet_sk_rebuild_header, >@@ -1712,7 +1824,7 @@ > return 0; > } > >-static void tcp_v6_destroy_sock(struct sock *sk) >+void tcp_v6_destroy_sock(struct sock *sk) > { > tcp_v4_destroy_sock(sk); > inet6_destroy_sock(sk); >diff -Naur a/linux-3.11/net/Kconfig b/linux-3.11/net/Kconfig >--- a/linux-3.11/net/Kconfig 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/Kconfig 2013-10-05 18:34:49.263364777 +0200 >@@ -79,6 +79,7 @@ > source "net/ipv4/Kconfig" > source "net/ipv6/Kconfig" > source "net/netlabel/Kconfig" >+source "net/mptcp/Kconfig" > > endif # if INET > >diff -Naur a/linux-3.11/net/Makefile b/linux-3.11/net/Makefile >--- a/linux-3.11/net/Makefile 2013-09-02 22:46:10.000000000 +0200 >+++ b/linux-3.11/net/Makefile 2013-10-05 18:34:49.263364777 +0200 >@@ -20,6 +20,7 @@ > obj-$(CONFIG_XFRM) += xfrm/ > obj-$(CONFIG_UNIX) += unix/ > obj-$(CONFIG_NET) += ipv6/ >+obj-$(CONFIG_MPTCP) += mptcp/ > obj-$(CONFIG_PACKET) += packet/ > obj-$(CONFIG_NET_KEY) += key/ > obj-$(CONFIG_BRIDGE) += bridge/ >diff -Naur a/linux-3.11/net/mptcp/Kconfig b/linux-3.11/net/mptcp/Kconfig >--- a/linux-3.11/net/mptcp/Kconfig 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/Kconfig 2013-10-05 18:34:49.264364764 +0200 >@@ -0,0 +1,9 @@ >+# >+# MPTCP configuration >+# >+config MPTCP >+ bool "MPTCP protocol" >+ depends on !SYN_COOKIES && !TCP_MD5SIG && (IPV6=y || IPV6=n) >+ ---help--- >+ This replaces the normal TCP stack with a Multipath TCP stack, >+ able to use several paths at once. >diff -Naur a/linux-3.11/net/mptcp/Makefile b/linux-3.11/net/mptcp/Makefile >--- a/linux-3.11/net/mptcp/Makefile 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/Makefile 2013-10-05 18:34:49.264364764 +0200 >@@ -0,0 +1,15 @@ >+# >+## Makefile for MultiPath TCP support code. 
>+# >+# >+ >+obj-$(CONFIG_MPTCP) += mptcp.o >+ >+mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \ >+ mptcp_output.o mptcp_input.o >+ >+obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o >+obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o >+ >+mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o >+ >diff -Naur a/linux-3.11/net/mptcp/mptcp_coupled.c b/linux-3.11/net/mptcp/mptcp_coupled.c >--- a/linux-3.11/net/mptcp/mptcp_coupled.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_coupled.c 2013-10-05 18:34:49.265364751 +0200 >@@ -0,0 +1,273 @@ >+/* >+ * MPTCP implementation - Coupled Congestion Control >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. >+ */ >+#include <net/tcp.h> >+#include <net/mptcp.h> >+ >+#include <linux/module.h> >+ >+/* Scaling is done in the numerator with alpha_scale_num and in the denominator >+ * with alpha_scale_den. >+ * >+ * To downscale, we just need to use alpha_scale. 
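A note on the scaling scheme this comment describes: alpha_scale_num (32) and alpha_scale_den (10), defined just below, are bit-shift amounts, so alpha_scale = 32 - 2*10 = 12 is exactly alpha_scale_num / (alpha_scale_den ^ 2) expressed in the exponent domain. A standalone illustrative check, not part of the patch:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            /* All three constants from mptcp_coupled.c are shift amounts,
             * so num / den^2 becomes a subtraction of exponents. */
            const int alpha_scale_num = 32, alpha_scale_den = 10;
            const int alpha_scale = alpha_scale_num - 2 * alpha_scale_den;

            assert(alpha_scale == 12);
            /* 2^32 / (2^10)^2 == 2^12 == 4096 */
            printf("%llu\n", (1ULL << alpha_scale_num) >> (2 * alpha_scale_den));
            return 0;
    }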
>+ * >+ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2) >+ */ >+static int alpha_scale_den = 10; >+static int alpha_scale_num = 32; >+static int alpha_scale = 12; >+ >+struct mptcp_ccc { >+ u64 alpha; >+ bool forced_update; >+}; >+ >+static inline int mptcp_ccc_sk_can_send(const struct sock *sk) >+{ >+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt; >+} >+ >+static inline u64 mptcp_get_alpha(struct sock *meta_sk) >+{ >+ struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); >+ return mptcp_ccc->alpha; >+} >+ >+static inline void mptcp_set_alpha(struct sock *meta_sk, u64 alpha) >+{ >+ struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); >+ mptcp_ccc->alpha = alpha; >+} >+ >+static inline u64 mptcp_ccc_scale(u32 val, int scale) >+{ >+ return (u64) val << scale; >+} >+ >+static inline bool mptcp_get_forced(struct sock *meta_sk) >+{ >+ struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); >+ return mptcp_ccc->forced_update; >+} >+ >+static inline void mptcp_set_forced(struct sock *meta_sk, bool force) >+{ >+ struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); >+ mptcp_ccc->forced_update = force; >+} >+ >+static void mptcp_ccc_recalc_alpha(struct sock *sk) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; >+ struct sock *sub_sk; >+ int best_cwnd = 0, best_rtt = 0, can_send = 0; >+ u64 max_numerator = 0, sum_denominator = 0, alpha = 1; >+ >+ if (!mpcb) >+ return; >+ >+ /* Only one subflow left - fall back to normal reno-behavior >+ * (set alpha to 1) */ >+ if (mpcb->cnt_established <= 1) >+ goto exit; >+ >+ /* Do regular alpha-calculation for multiple subflows */ >+ >+ /* Find the max numerator of the alpha-calculation */ >+ mptcp_for_each_sk(mpcb, sub_sk) { >+ struct tcp_sock *sub_tp = tcp_sk(sub_sk); >+ u64 tmp; >+ >+ if (!mptcp_ccc_sk_can_send(sub_sk)) >+ continue; >+ >+ can_send++; >+ >+ /* We need to look for the path, that provides the max-value. >+ * Integer-overflow is not possible here, because >+ * tmp will be in u64. 
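Concretely, the loop this comment introduces tracks the subflow that maximizes cwnd_i / srtt_i^2 in 64-bit fixed point. A small userspace sketch of the same search, with made-up sample values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Made-up sample subflows */
            const uint32_t cwnd[3] = { 10, 40, 25 };
            const uint32_t srtt[3] = { 50, 200, 80 };
            uint64_t max_num = 0;
            int i, best = -1;

            for (i = 0; i < 3; i++) {
                    /* mptcp_ccc_scale(cwnd, 32) / srtt^2, as in the patch */
                    uint64_t tmp = ((uint64_t)cwnd[i] << 32) /
                                   ((uint64_t)srtt[i] * srtt[i]);

                    if (tmp >= max_num) {
                            max_num = tmp;
                            best = i;
                    }
            }
            printf("best subflow: %d\n", best);     /* 0 for these numbers */
            return 0;
    }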
>+ */ >+ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd, >+ alpha_scale_num), sub_tp->srtt * sub_tp->srtt); >+ >+ if (tmp >= max_numerator) { >+ max_numerator = tmp; >+ best_cwnd = sub_tp->snd_cwnd; >+ best_rtt = sub_tp->srtt; >+ } >+ } >+ >+ /* No subflow is able to send - we don't care anymore */ >+ if (unlikely(!can_send)) >+ goto exit; >+ >+ /* Calculate the denominator */ >+ mptcp_for_each_sk(mpcb, sub_sk) { >+ struct tcp_sock *sub_tp = tcp_sk(sub_sk); >+ >+ if (!mptcp_ccc_sk_can_send(sub_sk)) >+ continue; >+ >+ sum_denominator += div_u64( >+ mptcp_ccc_scale(sub_tp->snd_cwnd, >+ alpha_scale_den) * best_rtt, >+ sub_tp->srtt); >+ } >+ sum_denominator *= sum_denominator; >+ if (unlikely(!sum_denominator)) { >+ pr_err("%s: sum_denominator == 0, cnt_established:%d\n", >+ __func__, mpcb->cnt_established); >+ mptcp_for_each_sk(mpcb, sub_sk) { >+ struct tcp_sock *sub_tp = tcp_sk(sub_sk); >+ pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u", >+ __func__, sub_tp->mptcp->path_index, >+ sub_sk->sk_state, sub_tp->srtt, >+ sub_tp->snd_cwnd); >+ } >+ } >+ >+ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator); >+ >+ if (unlikely(!alpha)) >+ alpha = 1; >+ >+exit: >+ mptcp_set_alpha(mptcp_meta_sk(sk), alpha); >+} >+ >+static void mptcp_ccc_init(struct sock *sk) >+{ >+ if (tcp_sk(sk)->mpc) { >+ mptcp_set_forced(mptcp_meta_sk(sk), 0); >+ mptcp_set_alpha(mptcp_meta_sk(sk), 1); >+ } >+ /* If we do not mptcp, behave like reno: return */ >+} >+ >+static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event) >+{ >+ if (event == CA_EVENT_LOSS) >+ mptcp_ccc_recalc_alpha(sk); >+} >+ >+static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state) >+{ >+ if (!tcp_sk(sk)->mpc) >+ return; >+ >+ mptcp_set_forced(mptcp_meta_sk(sk), 1); >+} >+ >+static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct mptcp_cb *mpcb = tp->mpcb; >+ int snd_cwnd; >+ >+ if (!tp->mpc) { >+ tcp_reno_cong_avoid(sk, ack, in_flight); >+ return; >+ } >+ >+ if (!tcp_is_cwnd_limited(sk, in_flight)) >+ return; >+ >+ if (tp->snd_cwnd <= tp->snd_ssthresh) { >+ /* In "safe" area, increase. */ >+ tcp_slow_start(tp); >+ mptcp_ccc_recalc_alpha(sk); >+ return; >+ } >+ >+ if (mptcp_get_forced(mptcp_meta_sk(sk))) { >+ mptcp_ccc_recalc_alpha(sk); >+ mptcp_set_forced(mptcp_meta_sk(sk), 0); >+ } >+ >+ if (mpcb->cnt_established > 1) { >+ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk)); >+ >+ /* This may happen, if at the initialization, the mpcb >+ * was not yet attached to the sock, and thus >+ * initializing alpha failed. >+ */ >+ if (unlikely(!alpha)) >+ alpha = 1; >+ >+ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale), >+ alpha); >+ >+ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd) >+ * Thus, we select here the max value. 
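The max() rule stated at the end of this comment is what keeps the coupled increase Reno-friendly: the virtual window alpha_scale / alpha can slow a subflow down, but never lets it grow faster than 1/cwnd per ACK. An illustrative sketch of that capping step (fixed-point convention as above, values invented):

    #include <stdint.h>
    #include <stdio.h>

    /* The coupled increase uses a virtual window of alpha_scale/alpha,
     * but never less than the subflow's own cwnd. alpha is fixed point
     * with alpha_scale = 12, matching the patch. */
    static uint32_t increase_threshold(uint64_t alpha, uint32_t cwnd)
    {
            uint32_t snd_cwnd = (uint32_t)((1ULL << 12) / (alpha ? alpha : 1));

            return snd_cwnd < cwnd ? cwnd : snd_cwnd;
    }

    int main(void)
    {
            /* alpha == 1<<12 means "behave like a single Reno flow" */
            printf("%u\n", increase_threshold(1ULL << 12, 10)); /* 10 */
            /* alpha well below 1<<12: larger threshold, damped growth */
            printf("%u\n", increase_threshold(64, 10));         /* 64 */
            return 0;
    }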
*/ >+ if (snd_cwnd < tp->snd_cwnd) >+ snd_cwnd = tp->snd_cwnd; >+ } else { >+ snd_cwnd = tp->snd_cwnd; >+ } >+ >+ if (tp->snd_cwnd_cnt >= snd_cwnd) { >+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) { >+ tp->snd_cwnd++; >+ mptcp_ccc_recalc_alpha(sk); >+ } >+ >+ tp->snd_cwnd_cnt = 0; >+ } else { >+ tp->snd_cwnd_cnt++; >+ } >+} >+ >+static struct tcp_congestion_ops mptcp_ccc = { >+ .init = mptcp_ccc_init, >+ .ssthresh = tcp_reno_ssthresh, >+ .cong_avoid = mptcp_ccc_cong_avoid, >+ .cwnd_event = mptcp_ccc_cwnd_event, >+ .set_state = mptcp_ccc_set_state, >+ .min_cwnd = tcp_reno_min_cwnd, >+ .owner = THIS_MODULE, >+ .name = "coupled", >+}; >+ >+static int __init mptcp_ccc_register(void) >+{ >+ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE); >+ return tcp_register_congestion_control(&mptcp_ccc); >+} >+ >+static void __exit mptcp_ccc_unregister(void) >+{ >+ tcp_unregister_congestion_control(&mptcp_ccc); >+} >+ >+module_init(mptcp_ccc_register); >+module_exit(mptcp_ccc_unregister); >+ >+MODULE_AUTHOR("Christoph Paasch, Sébastien Barré"); >+MODULE_LICENSE("GPL"); >+MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL"); >+MODULE_VERSION("0.1"); >diff -Naur a/linux-3.11/net/mptcp/mptcp_ctrl.c b/linux-3.11/net/mptcp/mptcp_ctrl.c >--- a/linux-3.11/net/mptcp/mptcp_ctrl.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_ctrl.c 2013-10-05 18:34:49.267364726 +0200 >@@ -0,0 +1,1824 @@ >+/* >+ * MPTCP implementation - MPTCP-control >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. 
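The registration at the end of mptcp_coupled.c relies on the congestion-control private area embedded in the connection sock, hence the BUILD_BUG_ON size guard before tcp_register_congestion_control(). A userspace analogue of that compile-time check (the 16-byte limit below is a stand-in, not the kernel's real ICSK_CA_PRIV_SIZE):

    #include <stdint.h>

    /* Stand-in limit, not the kernel's ICSK_CA_PRIV_SIZE. */
    #define CA_PRIV_SIZE 16

    struct mptcp_ccc {
            uint64_t alpha;
            _Bool forced_update;
    };

    _Static_assert(sizeof(struct mptcp_ccc) <= CA_PRIV_SIZE,
                   "mptcp_ccc must fit in the CA private area");

    int main(void)
    {
            return 0;
    }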
>+ */ >+ >+#include <net/inet_common.h> >+#include <net/inet6_hashtables.h> >+#include <net/ipv6.h> >+#include <net/ip6_checksum.h> >+#include <net/mptcp.h> >+#include <net/mptcp_v4.h> >+#include <net/mptcp_v6.h> >+#include <net/sock.h> >+#include <net/tcp.h> >+#include <net/tcp_states.h> >+#include <net/transp_v6.h> >+#include <net/xfrm.h> >+ >+#include <linux/cryptohash.h> >+#include <linux/kconfig.h> >+#include <linux/module.h> >+#include <linux/list.h> >+#include <linux/jhash.h> >+#include <linux/tcp.h> >+#include <linux/net.h> >+#include <linux/in.h> >+#include <linux/random.h> >+#include <linux/inetdevice.h> >+#include <linux/workqueue.h> >+#include <linux/atomic.h> >+#ifdef CONFIG_SYSCTL >+#include <linux/sysctl.h> >+#endif >+ >+static struct kmem_cache *mptcp_sock_cache __read_mostly; >+static struct kmem_cache *mptcp_cb_cache __read_mostly; >+static struct kmem_cache *mptcp_tw_cache __read_mostly; >+ >+int sysctl_mptcp_ndiffports __read_mostly = 1; >+int sysctl_mptcp_enabled __read_mostly = 1; >+int sysctl_mptcp_checksum __read_mostly = 1; >+int sysctl_mptcp_debug __read_mostly; >+EXPORT_SYMBOL(sysctl_mptcp_debug); >+int sysctl_mptcp_syn_retries __read_mostly = MPTCP_SYN_RETRIES; >+ >+bool mptcp_init_failed __read_mostly; >+ >+#ifdef CONFIG_SYSCTL >+static struct ctl_table mptcp_table[] = { >+ { >+ .procname = "mptcp_ndiffports", >+ .data = &sysctl_mptcp_ndiffports, >+ .maxlen = sizeof(int), >+ .mode = 0644, >+ .proc_handler = &proc_dointvec >+ }, >+ { >+ .procname = "mptcp_enabled", >+ .data = &sysctl_mptcp_enabled, >+ .maxlen = sizeof(int), >+ .mode = 0644, >+ .proc_handler = &proc_dointvec >+ }, >+ { >+ .procname = "mptcp_checksum", >+ .data = &sysctl_mptcp_checksum, >+ .maxlen = sizeof(int), >+ .mode = 0644, >+ .proc_handler = &proc_dointvec >+ }, >+ { >+ .procname = "mptcp_debug", >+ .data = &sysctl_mptcp_debug, >+ .maxlen = sizeof(int), >+ .mode = 0644, >+ .proc_handler = &proc_dointvec >+ }, >+ { >+ .procname = "mptcp_syn_retries", >+ .data = &sysctl_mptcp_syn_retries, >+ .maxlen = sizeof(int), >+ .mode = 0644, >+ .proc_handler = &proc_dointvec >+ }, >+ { } >+}; >+#endif >+ >+static struct sock *mptcp_syn_recv_sock(struct sock *sk, struct sk_buff *skb, >+ struct request_sock *req, >+ struct dst_entry *dst) >+{ >+#if IS_ENABLED(CONFIG_IPV6) >+ if (sk->sk_family == AF_INET6) >+ return tcp_v6_syn_recv_sock(sk, skb, req, dst); >+ >+ /* sk->sk_family == AF_INET */ >+ if (req->rsk_ops->family == AF_INET6) >+ return mptcp_v6v4_syn_recv_sock(sk, skb, req, dst); >+#endif >+ >+ /* sk->sk_family == AF_INET && req->rsk_ops->family == AF_INET */ >+ return tcp_v4_syn_recv_sock(sk, skb, req, dst); >+} >+ >+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sock *sk, *subsk = NULL; >+ u32 max_data_seq = 0; >+ /* max_data_seq initialized to correct compiler-warning. >+ * But the initialization is handled by max_data_seq_set >+ */ >+ short max_data_seq_set = 0; >+ u32 min_time = 0xffffffff; >+ >+ /* How do we select the subflow to send the window-update on? >+ * >+ * 1. He has to be in a state where he can send an ack. >+ * 2. He has to be one of those subflow who recently >+ * contributed to the received stream >+ * (this guarantees a working subflow) >+ * a) its latest data_seq received is after the original >+ * copied_seq. >+ * We select the one with the lowest rtt, so that the >+ * window-update reaches our peer the fastest. 
>+ * b) if no subflow has this kind of data_seq (e.g., very >+ * strange meta-level retransmissions going on), we take >+ * the subflow who last sent the highest data_seq. >+ */ >+ mptcp_for_each_sk(meta_tp->mpcb, sk) { >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ if (!mptcp_sk_can_send_ack(sk)) >+ continue; >+ >+ /* Select among those who contributed to the >+ * current receive-queue. >+ */ >+ if (copied && after(tp->mptcp->last_data_seq, meta_tp->copied_seq - copied)) { >+ if (tp->srtt < min_time) { >+ min_time = tp->srtt; >+ subsk = sk; >+ max_data_seq_set = 0; >+ } >+ continue; >+ } >+ >+ if (!subsk && !max_data_seq_set) { >+ max_data_seq = tp->mptcp->last_data_seq; >+ max_data_seq_set = 1; >+ subsk = sk; >+ } >+ >+ /* Otherwise, take the one with the highest data_seq */ >+ if ((!subsk || max_data_seq_set) && >+ after(tp->mptcp->last_data_seq, max_data_seq)) { >+ max_data_seq = tp->mptcp->last_data_seq; >+ subsk = sk; >+ } >+ } >+ >+ if (!subsk) { >+ mptcp_debug("%s subsk is null, copied %d, cseq %u\n", __func__, >+ copied, meta_tp->copied_seq); >+ mptcp_for_each_sk(meta_tp->mpcb, sk) { >+ struct tcp_sock *tp = tcp_sk(sk); >+ mptcp_debug("%s pi %d state %u last_dseq %u\n", >+ __func__, tp->mptcp->path_index, sk->sk_state, >+ tp->mptcp->last_data_seq); >+ } >+ } >+ >+ return subsk; >+} >+ >+static void mptcp_sock_def_error_report(struct sock *sk) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; >+ >+ if (!sock_flag(sk, SOCK_DEAD)) >+ mptcp_sub_close(sk, 0); >+ >+ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd || >+ mpcb->send_infinite_mapping) { >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ >+ meta_sk->sk_err = sk->sk_err; >+ meta_sk->sk_err_soft = sk->sk_err_soft; >+ >+ if (!sock_flag(meta_sk, SOCK_DEAD)) >+ meta_sk->sk_error_report(meta_sk); >+ >+ tcp_done(meta_sk); >+ } >+ >+ sk->sk_err = 0; >+ return; >+} >+ >+static void mptcp_mpcb_put(struct mptcp_cb *mpcb) >+{ >+ if (atomic_dec_and_test(&mpcb->refcnt)) >+ kmem_cache_free(mptcp_cb_cache, mpcb); >+} >+ >+static void mptcp_sock_destruct(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ inet_sock_destruct(sk); >+ >+ kmem_cache_free(mptcp_sock_cache, tp->mptcp); >+ tp->mptcp = NULL; >+ >+ if (!is_meta_sk(sk) && !tp->was_meta_sk) { >+ /* Taken when mpcb pointer was set */ >+ sock_put(mptcp_meta_sk(sk)); >+ mptcp_mpcb_put(tp->mpcb); >+ } else { >+ struct mptcp_cb *mpcb = tp->mpcb; >+ struct mptcp_tw *mptw; >+ >+ /* The mpcb is disappearing - we can make the final >+ * update to the rcv_nxt of the time-wait-sock and remove >+ * its reference to the mpcb. >+ */ >+ spin_lock_bh(&mpcb->tw_lock); >+ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) { >+ list_del_rcu(&mptw->list); >+ mptw->in_list = 0; >+ mptcp_mpcb_put(mpcb); >+ rcu_assign_pointer(mptw->mpcb, NULL); >+ } >+ spin_unlock_bh(&mpcb->tw_lock); >+ >+ mptcp_mpcb_put(mpcb); >+ >+ mptcp_debug("%s destroying meta-sk\n", __func__); >+ } >+} >+ >+void mptcp_destroy_sock(struct sock *sk) >+{ >+ if (is_meta_sk(sk)) { >+ struct sock *sk_it, *tmpsk; >+ >+ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue); >+ mptcp_purge_ofo_queue(tcp_sk(sk)); >+ >+ /* We have to close all remaining subflows. Normally, they >+ * should all be about to get closed. But, if the kernel is >+ * forcing a closure (e.g., tcp_write_err), the subflows might >+ * not have been closed properly (as we are waiting for the >+ * DATA_ACK of the DATA_FIN). >+ */ >+ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) { >+ /* Already did call tcp_close - waiting for graceful >+ * closure. 
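mptcp_mpcb_put() above is the release side of the mpcb's reference count: the meta-sk, every subflow and any time-wait socks all hold references, and the control block is freed only on the last put. A userspace analogue of that pattern:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct mpcb { atomic_int refcnt; };

    static void mpcb_put(struct mpcb *mpcb)
    {
            /* atomic_dec_and_test() equivalent: free on the 1 -> 0 transition */
            if (atomic_fetch_sub(&mpcb->refcnt, 1) == 1) {
                    free(mpcb);
                    puts("mpcb freed");
            }
    }

    int main(void)
    {
            struct mpcb *m = malloc(sizeof(*m));

            atomic_init(&m->refcnt, 2);     /* e.g. meta-sk + one tw-sock */
            mpcb_put(m);                    /* one holder remains */
            mpcb_put(m);                    /* last put frees */
            return 0;
    }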
>+ */ >+ if (tcp_sk(sk_it)->closing) >+ continue; >+ >+ /* Allow the delayed work first to prevent time-wait state */ >+ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work)) >+ continue; >+ >+ mptcp_sub_close(sk_it, 0); >+ } >+ } else { >+ mptcp_del_sock(sk); >+ } >+} >+ >+static void mptcp_set_state(struct sock *sk) >+{ >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ >+ /* Meta is not yet established - wake up the application */ >+ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) && >+ sk->sk_state == TCP_ESTABLISHED) { >+ tcp_set_state(meta_sk, TCP_ESTABLISHED); >+ >+ if (!sock_flag(meta_sk, SOCK_DEAD)) { >+ meta_sk->sk_state_change(meta_sk); >+ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT); >+ } >+ } >+ >+ if (sk->sk_state == TCP_ESTABLISHED) { >+ tcp_sk(sk)->mptcp->establish_increased = 1; >+ tcp_sk(sk)->mpcb->cnt_established++; >+ } >+} >+ >+void mptcp_set_keepalive(struct sock *sk, int val) >+{ >+ struct sock *sk_it; >+ >+ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) { >+ tcp_set_keepalive(sk_it, val); >+ sock_valbool_flag(sk, SOCK_KEEPOPEN, val); >+ } >+} >+ >+u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; >+u32 mptcp_key_seed = 0; >+ >+void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) >+{ >+ u32 workspace[SHA_WORKSPACE_WORDS]; >+ u32 mptcp_hashed_key[SHA_DIGEST_WORDS]; >+ u8 input[64]; >+ int i; >+ >+ memset(workspace, 0, sizeof(workspace)); >+ >+ /* Initialize input with appropriate padding */ >+ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte >+ * is explicitly set too */ >+ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */ >+ input[8] = 0x80; /* Padding: First bit after message = 1 */ >+ input[63] = 0x40; /* Padding: Length of the message = 64 bits */ >+ >+ sha_init(mptcp_hashed_key); >+ sha_transform(mptcp_hashed_key, input, workspace); >+ >+ for (i = 0; i < 5; i++) >+ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]); >+ >+ if (token) >+ *token = mptcp_hashed_key[0]; >+ if (idsn) >+ *idsn = *((u64 *)&mptcp_hashed_key[3]); >+} >+ >+void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2, >+ u32 *hash_out) >+{ >+ u32 workspace[SHA_WORKSPACE_WORDS]; >+ u8 input[128]; /* 2 512-bit blocks */ >+ int i; >+ >+ memset(workspace, 0, sizeof(workspace)); >+ >+ /* Generate key xored with ipad */ >+ memset(input, 0x36, 64); >+ for (i = 0; i < 8; i++) >+ input[i] ^= key_1[i]; >+ for (i = 0; i < 8; i++) >+ input[i + 8] ^= key_2[i]; >+ >+ memcpy(&input[64], rand_1, 4); >+ memcpy(&input[68], rand_2, 4); >+ input[72] = 0x80; /* Padding: First bit after message = 1 */ >+ memset(&input[73], 0, 53); >+ >+ /* Padding: Length of the message = 512 + 64 bits */ >+ input[126] = 0x02; >+ input[127] = 0x40; >+ >+ sha_init(hash_out); >+ sha_transform(hash_out, input, workspace); >+ memset(workspace, 0, sizeof(workspace)); >+ >+ sha_transform(hash_out, &input[64], workspace); >+ memset(workspace, 0, sizeof(workspace)); >+ >+ for (i = 0; i < 5; i++) >+ hash_out[i] = cpu_to_be32(hash_out[i]); >+ >+ /* Prepare second part of hmac */ >+ memset(input, 0x5C, 64); >+ for (i = 0; i < 8; i++) >+ input[i] ^= key_1[i]; >+ for (i = 0; i < 8; i++) >+ input[i + 8] ^= key_2[i]; >+ >+ memcpy(&input[64], hash_out, 20); >+ input[84] = 0x80; >+ memset(&input[85], 0, 41); >+ >+ /* Padding: Length of the message = 512 + 160 bits */ >+ input[126] = 0x02; >+ input[127] = 0xA0; >+ >+ sha_init(hash_out); >+ sha_transform(hash_out, input, workspace); >+ memset(workspace, 0, sizeof(workspace)); >+ >+ sha_transform(hash_out, &input[64], 
workspace); >+ >+ for (i = 0; i < 5; i++) >+ hash_out[i] = cpu_to_be32(hash_out[i]); >+} >+ >+static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk) >+{ >+ /* Socket-options handled by mptcp_inherit_sk while creating the meta-sk. >+ * ====== >+ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT, >+ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER, >+ * TCP_NODELAY, TCP_CORK >+ * >+ * Socket-options handled in this function here >+ * ====== >+ * SO_KEEPALIVE >+ * TCP_KEEP* >+ * TCP_DEFER_ACCEPT >+ * >+ * Socket-options on the todo-list >+ * ====== >+ * SO_BINDTODEVICE - should probably prevent creation of new subsocks >+ * across other devices. - what about the api-draft? >+ * SO_DEBUG >+ * SO_REUSEADDR - probably we don't care about this >+ * SO_DONTROUTE, SO_BROADCAST >+ * SO_OOBINLINE >+ * SO_LINGER >+ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM >+ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM >+ * SO_RXQ_OVFL >+ * TCP_COOKIE_TRANSACTIONS >+ * TCP_MAXSEG >+ * TCP_THIN_* - Handled by mptcp_inherit_sk, but we need to support this >+ * in mptcp_retransmit_timer. AND we need to check what is >+ * about the subsockets. >+ * TCP_LINGER2 >+ * TCP_WINDOW_CLAMP >+ * TCP_USER_TIMEOUT >+ * TCP_MD5SIG >+ * >+ * Socket-options of no concern for the meta-socket (but for the subsocket) >+ * ====== >+ * SO_PRIORITY >+ * SO_MARK >+ * TCP_CONGESTION >+ * TCP_SYNCNT >+ * TCP_QUICKACK >+ */ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ >+ /****** KEEPALIVE-handler ******/ >+ >+ /* Keepalive-timer has been started already, but it is handled at the >+ * subflow level. >+ */ >+ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { >+ inet_csk_delete_keepalive_timer(meta_sk); >+ inet_csk_reset_keepalive_timer(master_sk, keepalive_time_when(meta_tp)); >+ } >+ >+ /****** DEFER_ACCEPT-handler ******/ >+ >+ /* DEFER_ACCEPT is not of concern for new subflows - we always accept >+ * them >+ */ >+ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0; >+} >+ >+static void mptcp_sub_inherit_sockopts(struct sock *meta_sk, struct sock *sub_sk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ /* Keepalive is handled at the subflow-level */ >+ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { >+ inet_csk_reset_keepalive_timer(sub_sk, keepalive_time_when(meta_tp)); >+ sock_valbool_flag(sub_sk, SOCK_KEEPOPEN, keepalive_time_when(meta_tp)); >+ } >+ >+ /* IP_TOS also goes to the subflow. */ >+ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) { >+ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos; >+ sub_sk->sk_priority = meta_sk->sk_priority; >+ sk_dst_reset(sub_sk); >+ } >+ >+ /* Inheris SO_REUSEADDR */ >+ sub_sk->sk_reuse = meta_sk->sk_reuse; >+} >+ >+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb) >+{ >+ /* skb-sk may be NULL if we receive a packet immediatly after the >+ * SYN/ACK + MP_CAPABLE. >+ */ >+ struct sock *sk = skb->sk ? 
skb->sk : meta_sk; >+ int ret = 0; >+ >+ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { >+ kfree_skb(skb); >+ return 0; >+ } >+ >+ if (sk->sk_family == AF_INET) >+ ret = tcp_v4_do_rcv(sk, skb); >+#if IS_ENABLED(CONFIG_IPV6) >+ else >+ ret = tcp_v6_do_rcv(sk, skb); >+#endif >+ >+ sock_put(sk); >+ return ret; >+} >+ >+struct lock_class_key meta_key; >+struct lock_class_key meta_slock_key; >+ >+/* Code heavily inspired from sk_clone() */ >+static int mptcp_inherit_sk(const struct sock *sk, struct sock *newsk, >+ int family, const gfp_t flags) >+{ >+ struct sk_filter *filter; >+ struct proto *prot = newsk->sk_prot; >+ const struct inet_connection_sock_af_ops *af_ops = inet_csk(newsk)->icsk_af_ops; >+#ifdef CONFIG_SECURITY_NETWORK >+ void *sptr = newsk->sk_security; >+#endif >+ >+ if (sk->sk_family == AF_INET) { >+ memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin)); >+ memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end, >+ sizeof(struct tcp_sock) - offsetof(struct sock, sk_dontcopy_end)); >+ } else { >+ memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin)); >+ memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end, >+ sizeof(struct tcp6_sock) - offsetof(struct sock, sk_dontcopy_end)); >+ } >+ >+#ifdef CONFIG_SECURITY_NETWORK >+ newsk->sk_security = sptr; >+ security_sk_clone(sk, newsk); >+#endif >+ >+ /* Has been changed by sock_copy above - we may need an IPv6-socket */ >+ newsk->sk_family = family; >+ newsk->sk_prot = prot; >+ newsk->sk_prot_creator = prot; >+ inet_csk(newsk)->icsk_af_ops = af_ops; >+ >+ /* We don't yet have the mptcp-point. Thus we still need inet_sock_destruct */ >+ newsk->sk_destruct = inet_sock_destruct; >+ >+ /* SANITY */ >+ get_net(sock_net(newsk)); >+ sk_node_init(&newsk->sk_node); >+ sock_lock_init_class_and_name(newsk, "slock-AF_INET-MPTCP", >+ &meta_slock_key, "sk_lock-AF_INET-MPTCP", >+ &meta_key); >+ >+ /* Unlocks are in: >+ * >+ * 1. If we are creating the master-sk >+ * * on client-side in tcp_rcv_state_process, "case TCP_SYN_SENT" >+ * * on server-side in tcp_child_process >+ * 2. 
If we are creating another subsock >+ * * Also in tcp_child_process >+ */ >+ bh_lock_sock(newsk); >+ newsk->sk_backlog.head = NULL; >+ newsk->sk_backlog.tail = NULL; >+ newsk->sk_backlog.len = 0; >+ >+ atomic_set(&newsk->sk_rmem_alloc, 0); >+ atomic_set(&newsk->sk_wmem_alloc, 1); >+ atomic_set(&newsk->sk_omem_alloc, 0); >+ >+ skb_queue_head_init(&newsk->sk_receive_queue); >+ skb_queue_head_init(&newsk->sk_write_queue); >+#ifdef CONFIG_NET_DMA >+ skb_queue_head_init(&newsk->sk_async_wait_queue); >+#endif >+ >+ spin_lock_init(&newsk->sk_dst_lock); >+ rwlock_init(&newsk->sk_callback_lock); >+ lockdep_set_class_and_name(&newsk->sk_callback_lock, >+ af_callback_keys + newsk->sk_family, >+ af_family_clock_key_strings[newsk->sk_family]); >+ newsk->sk_dst_cache = NULL; >+ newsk->sk_rx_dst = NULL; >+ newsk->sk_wmem_queued = 0; >+ newsk->sk_forward_alloc = 0; >+ newsk->sk_send_head = NULL; >+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; >+ >+ tcp_sk(newsk)->mptcp = NULL; >+ >+ sock_reset_flag(newsk, SOCK_DONE); >+ skb_queue_head_init(&newsk->sk_error_queue); >+ >+ filter = rcu_dereference_protected(newsk->sk_filter, 1); >+ if (filter != NULL) >+ sk_filter_charge(newsk, filter); >+ >+ if (unlikely(xfrm_sk_clone_policy(newsk))) { >+ /* It is still raw copy of parent, so invalidate >+ * destructor and make plain sk_free() >+ */ >+ newsk->sk_destruct = NULL; >+ bh_unlock_sock(newsk); >+ sk_free(newsk); >+ newsk = NULL; >+ return -ENOMEM; >+ } >+ >+ newsk->sk_err = 0; >+ newsk->sk_priority = 0; >+ /* Before updating sk_refcnt, we must commit prior changes to memory >+ * (Documentation/RCU/rculist_nulls.txt for details) >+ */ >+ smp_wmb(); >+ atomic_set(&newsk->sk_refcnt, 2); >+ >+ /* Increment the counter in the same struct proto as the master >+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that >+ * is the same as sk->sk_prot->socks, as this field was copied >+ * with memcpy). >+ * >+ * This _changes_ the previous behaviour, where >+ * tcp_create_openreq_child always was incrementing the >+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have >+ * to be taken into account in all callers. 
-acme >+ */ >+ sk_refcnt_debug_inc(newsk); >+ sk_set_socket(newsk, NULL); >+ newsk->sk_wq = NULL; >+ >+ if (newsk->sk_prot->sockets_allocated) >+ percpu_counter_inc(newsk->sk_prot->sockets_allocated); >+ >+ if (sock_flag(newsk, SOCK_TIMESTAMP) || >+ sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) >+ net_enable_timestamp(); >+ >+ return 0; >+} >+ >+int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window) >+{ >+ struct mptcp_cb *mpcb; >+ struct sock *master_sk; >+ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk); >+ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk); >+ struct sk_buff *skb, *tmp; >+ u64 idsn; >+ >+ master_sk = sk_prot_alloc(meta_sk->sk_prot, GFP_ATOMIC | __GFP_ZERO, >+ meta_sk->sk_family); >+ if (!master_sk) >+ return -ENOBUFS; >+ >+ master_tp = tcp_sk(master_sk); >+ master_icsk = inet_csk(master_sk); >+ >+ /* Need to set this here - it is needed by mptcp_inherit_sk */ >+ master_sk->sk_prot = meta_sk->sk_prot; >+ master_sk->sk_prot_creator = meta_sk->sk_prot; >+ master_icsk->icsk_af_ops = meta_icsk->icsk_af_ops; >+ >+ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC); >+ if (!mpcb) { >+ sk_free(master_sk); >+ return -ENOBUFS; >+ } >+ >+ /* master_sk inherits from meta_sk */ >+ if (mptcp_inherit_sk(meta_sk, master_sk, meta_sk->sk_family, GFP_ATOMIC)) { >+ kmem_cache_free(mptcp_cb_cache, mpcb); >+ return -ENOBUFS; >+ } >+ >+#if IS_ENABLED(CONFIG_IPV6) >+ if (meta_icsk->icsk_af_ops == &ipv6_mapped) { >+ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); >+ >+ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; >+ >+ newnp = inet6_sk(master_sk); >+ memcpy(newnp, np, sizeof(struct ipv6_pinfo)); >+ >+ newnp->ipv6_mc_list = NULL; >+ newnp->ipv6_ac_list = NULL; >+ newnp->ipv6_fl_list = NULL; >+ newnp->opt = NULL; >+ newnp->pktoptions = NULL; >+ (void)xchg(&newnp->rxpmtu, NULL); >+ } else if (meta_sk->sk_family == AF_INET6) { >+ struct ipv6_pinfo *newnp; >+ >+ /* Meta is IPv4. Initialize pinet6 for the master-sk. 
*/ >+ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; >+ >+ newnp = inet6_sk(master_sk); >+ >+ newnp->hop_limit = -1; >+ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS; >+ newnp->mc_loop = 1; >+ newnp->pmtudisc = IPV6_PMTUDISC_WANT; >+ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only; >+ } >+#endif >+ >+ meta_tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, GFP_ATOMIC); >+ if (!meta_tp->mptcp) { >+ kmem_cache_free(mptcp_cb_cache, mpcb); >+ sk_free(master_sk); >+ return -ENOBUFS; >+ } >+ >+ /* Store the keys and generate the peer's token */ >+ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key; >+ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token; >+ >+ /* Generate Initial data-sequence-numbers */ >+ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn); >+ idsn = ntohll(idsn) + 1; >+ mpcb->snd_high_order[0] = idsn >> 32; >+ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1; >+ >+ meta_tp->write_seq = (u32)idsn; >+ meta_tp->snd_sml = meta_tp->write_seq; >+ meta_tp->snd_una = meta_tp->write_seq; >+ meta_tp->snd_nxt = meta_tp->write_seq; >+ meta_tp->pushed_seq = meta_tp->write_seq; >+ meta_tp->snd_up = meta_tp->write_seq; >+ >+ mpcb->mptcp_rem_key = remote_key; >+ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn); >+ idsn = ntohll(idsn) + 1; >+ mpcb->rcv_high_order[0] = idsn >> 32; >+ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1; >+ meta_tp->copied_seq = (u32) idsn; >+ meta_tp->rcv_nxt = (u32) idsn; >+ meta_tp->rcv_wup = (u32) idsn; >+ >+ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1; >+ meta_tp->snd_wnd = window; >+ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */ >+ >+ meta_tp->packets_out = 0; >+ meta_tp->mptcp->snt_isn = meta_tp->write_seq; /* Initial data-sequence-number */ >+ meta_icsk->icsk_probes_out = 0; >+ >+ /* Set mptcp-pointers */ >+ master_tp->mpcb = mpcb; >+ master_tp->meta_sk = meta_sk; >+ meta_tp->mpcb = mpcb; >+ meta_tp->meta_sk = meta_sk; >+ mpcb->meta_sk = meta_sk; >+ mpcb->master_sk = master_sk; >+ >+ meta_tp->mpc = 1; >+ meta_tp->mptcp->attached = 0; >+ meta_tp->was_meta_sk = 0; >+ >+ /* Initialize the queues */ >+ skb_queue_head_init(&mpcb->reinject_queue); >+ skb_queue_head_init(&master_tp->out_of_order_queue); >+ tcp_prequeue_init(master_tp); >+ >+ master_tp->tsq_flags = 0; >+ >+ /* Copy the write-queue from the meta down to the master. >+ * This is necessary to get the SYN to the master-write-queue. >+ * No other data can be queued, before tcp_sendmsg waits for the >+ * connection to finish. >+ */ >+ skb_queue_walk_safe(&meta_sk->sk_write_queue, skb, tmp) { >+ skb_unlink(skb, &meta_sk->sk_write_queue); >+ skb_queue_tail(&master_sk->sk_write_queue, skb); >+ >+ master_sk->sk_wmem_queued += skb->truesize; >+ sk_mem_charge(master_sk, skb->truesize); >+ } >+ >+ meta_sk->sk_wmem_queued = 0; >+ meta_sk->sk_forward_alloc = 0; >+ >+ mutex_init(&mpcb->mutex); >+ >+ /* Initialize workqueue-struct */ >+ INIT_WORK(&mpcb->subflow_work, mptcp_create_subflow_worker); >+ INIT_DELAYED_WORK(&mpcb->subflow_retry_work, mptcp_retry_subflow_worker); >+ INIT_WORK(&mpcb->address_work, mptcp_address_worker); >+ >+ /* Init the accept_queue structure, we support a queue of 32 pending >+ * connections, it does not need to be huge, since we only store here >+ * pending subflow creations. 
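The IDSN/token generation in mptcp_alloc_mpcb() above follows the MPTCP draft: the 32-bit token is the first word of SHA-1(key), and the initial data sequence number is the last 64 bits of the digest plus one, with the upper half kept in snd_high_order[]/rcv_high_order[] so the 32-bit meta-level sequence space can wrap. An illustrative sketch, where digest[] is a made-up value standing in for a real SHA-1 and the byte-order handling (ntohll) is elided:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Pretend this is SHA-1(local key) */
            uint32_t digest[5] = {
                    0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555
            };
            uint32_t token = digest[0];
            uint64_t idsn = (((uint64_t)digest[3] << 32) | digest[4]) + 1;

            /* Upper 32 bits are kept apart (snd_high_order[]) so the 32-bit
             * meta-level sequence numbers can wrap. */
            printf("token %#x idsn %#llx high %#x\n", (unsigned)token,
                   (unsigned long long)idsn, (unsigned)(idsn >> 32));
            return 0;
    }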
>+ */ >+ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) { >+ inet_put_port(master_sk); >+ kmem_cache_free(mptcp_sock_cache, meta_tp->mptcp); >+ kmem_cache_free(mptcp_cb_cache, mpcb); >+ sk_free(master_sk); >+ meta_tp->mpc = 0; >+ return -ENOMEM; >+ } >+ >+ /* Redefine function-pointers as the meta-sk is now fully ready */ >+ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv; >+ meta_sk->sk_destruct = mptcp_sock_destruct; >+ mpcb->syn_recv_sock = mptcp_syn_recv_sock; >+ >+ /* Meta-level retransmit timer */ >+ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */ >+ >+ tcp_init_xmit_timers(master_sk); >+ /* Has been set for sending out the SYN */ >+ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS); >+ >+ if (!meta_tp->inside_tk_table) { >+ /* Adding the meta_tp in the token hashtable - coming from server-side */ >+ rcu_read_lock(); >+ spin_lock(&mptcp_tk_hashlock); >+ >+ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token); >+ >+ spin_unlock(&mptcp_tk_hashlock); >+ rcu_read_unlock(); >+ } >+ master_tp->inside_tk_table = 0; >+ >+ /* Init time-wait stuff */ >+ INIT_LIST_HEAD(&mpcb->tw_list); >+ spin_lock_init(&mpcb->tw_lock); >+ >+ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk); >+ >+ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf; >+ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf; >+ mpcb->orig_window_clamp = meta_tp->window_clamp; >+ >+ /* The meta is directly linked - set refcnt to 1 */ >+ atomic_set(&mpcb->refcnt, 1); >+ >+ mptcp_debug("%s: created mpcb with token %#x\n", >+ __func__, mpcb->mptcp_loc_token); >+ >+ return 0; >+} >+ >+struct sock *mptcp_sk_clone(const struct sock *sk, int family, >+ const gfp_t priority) >+{ >+ struct sock *newsk = NULL; >+ >+ if (family == AF_INET && sk->sk_family == AF_INET) { >+ newsk = sk_prot_alloc(&tcp_prot, priority, family); >+ if (!newsk) >+ return NULL; >+ >+ /* Set these pointers - they are needed by mptcp_inherit_sk */ >+ newsk->sk_prot = &tcp_prot; >+ newsk->sk_prot_creator = &tcp_prot; >+ inet_csk(newsk)->icsk_af_ops = &ipv4_specific; >+ newsk->sk_family = AF_INET; >+ } >+#if IS_ENABLED(CONFIG_IPV6) >+ else { >+ newsk = sk_prot_alloc(&tcpv6_prot, priority, family); >+ if (!newsk) >+ return NULL; >+ >+ newsk->sk_prot = &tcpv6_prot; >+ newsk->sk_prot_creator = &tcpv6_prot; >+ if (family == AF_INET) >+ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; >+ else >+ inet_csk(newsk)->icsk_af_ops = &ipv6_specific; >+ newsk->sk_family = AF_INET6; >+ } >+#endif >+ >+ if (mptcp_inherit_sk(sk, newsk, family, priority)) >+ return NULL; >+ >+ return newsk; >+} >+ >+void mptcp_fallback_meta_sk(struct sock *meta_sk) >+{ >+ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt); >+ kmem_cache_free(mptcp_sock_cache, tcp_sk(meta_sk)->mptcp); >+ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb); >+} >+ >+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 rem_id, >+ gfp_t flags) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags); >+ if (!tp->mptcp) >+ return -ENOMEM; >+ >+ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb); >+ /* No more space for more subflows? */ >+ if (!tp->mptcp->path_index) { >+ kmem_cache_free(mptcp_sock_cache, tp->mptcp); >+ return -EPERM; >+ } >+ >+ tp->mptcp->tp = tp; >+ tp->mpcb = mpcb; >+ tp->meta_sk = meta_sk; >+ tp->mpc = 1; >+ tp->mptcp->rem_id = rem_id; >+ tp->mptcp->last_rbuf_opti = tcp_time_stamp; >+ >+ /* The corresponding sock_put is in mptcp_sock_destruct(). 
It cannot be >+ * included in mptcp_del_sock(), because the mpcb must remain alive >+ * until the last subsocket is completely destroyed. >+ */ >+ sock_hold(meta_sk); >+ atomic_inc(&mpcb->refcnt); >+ >+ tp->mptcp->next = mpcb->connection_list; >+ mpcb->connection_list = tp; >+ tp->mptcp->attached = 1; >+ >+ mpcb->cnt_subflows++; >+ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc), >+ &meta_sk->sk_rmem_alloc); >+ >+ mptcp_sub_inherit_sockopts(meta_sk, sk); >+ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq); >+ >+ /* As we successfully allocated the mptcp_tcp_sock, we have to >+ * change the function-pointers here (for sk_destruct to work correctly) >+ */ >+ sk->sk_error_report = mptcp_sock_def_error_report; >+ sk->sk_data_ready = mptcp_data_ready; >+ sk->sk_write_space = mptcp_write_space; >+ sk->sk_state_change = mptcp_set_state; >+ sk->sk_destruct = mptcp_sock_destruct; >+ >+ if (sk->sk_family == AF_INET) >+ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n", >+ __func__ , mpcb->mptcp_loc_token, >+ tp->mptcp->path_index, >+ &((struct inet_sock *)tp)->inet_saddr, >+ ntohs(((struct inet_sock *)tp)->inet_sport), >+ &((struct inet_sock *)tp)->inet_daddr, >+ ntohs(((struct inet_sock *)tp)->inet_dport), >+ mpcb->cnt_subflows); >+ else >+ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n", >+ __func__ , mpcb->mptcp_loc_token, >+ tp->mptcp->path_index, &inet6_sk(sk)->saddr, >+ ntohs(((struct inet_sock *)tp)->inet_sport), >+ &inet6_sk(sk)->daddr, >+ ntohs(((struct inet_sock *)tp)->inet_dport), >+ mpcb->cnt_subflows); >+ >+ return 0; >+} >+ >+void mptcp_del_sock(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk), *tp_prev; >+ struct mptcp_cb *mpcb; >+ >+ if (!tp->mptcp || !tp->mptcp->attached) >+ return; >+ >+ mpcb = tp->mpcb; >+ tp_prev = mpcb->connection_list; >+ >+ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n", >+ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index, >+ sk->sk_state, is_meta_sk(sk)); >+ >+ if (tp_prev == tp) { >+ mpcb->connection_list = tp->mptcp->next; >+ } else { >+ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) { >+ if (tp_prev->mptcp->next == tp) { >+ tp_prev->mptcp->next = tp->mptcp->next; >+ break; >+ } >+ } >+ } >+ mpcb->cnt_subflows--; >+ if (tp->mptcp->establish_increased) >+ mpcb->cnt_established--; >+ >+ tp->mptcp->next = NULL; >+ tp->mptcp->attached = 0; >+ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index); >+ >+ if (!skb_queue_empty(&sk->sk_write_queue)) >+ mptcp_reinject_data(sk, 0); >+ >+ if (is_master_tp(tp)) >+ mpcb->master_sk = NULL; >+ else if (tp->mptcp->pre_established) >+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); >+ >+ sk->sk_prot->release_cb(sk); >+ >+ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL); >+} >+ >+/* Updates the metasocket ULID/port data, based on the given sock. >+ * The argument sock must be the sock accessible to the application. >+ * In this function, we update the meta socket info, based on the changes >+ * in the application socket (bind, address allocation, ...) 
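mptcp_del_sock() above unlinks a subflow from the mpcb's singly linked connection_list: either the head is popped, or the predecessor is spliced around the victim. A userspace analogue of just that list step:

    #include <stdio.h>

    struct sub { int pi; struct sub *next; };

    static void del_sub(struct sub **list, struct sub *tp)
    {
            if (*list == tp) {
                    *list = tp->next;
            } else {
                    struct sub *prev;

                    for (prev = *list; prev && prev->next; prev = prev->next) {
                            if (prev->next == tp) {
                                    prev->next = tp->next;
                                    break;
                            }
                    }
            }
            tp->next = NULL;
    }

    int main(void)
    {
            struct sub c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
            struct sub *list = &a;
            struct sub *p;

            del_sub(&list, &b);
            for (p = list; p; p = p->next)
                    printf("pi %d\n", p->pi);       /* pi 1, pi 3 */
            return 0;
    }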
>+ */ >+void mptcp_update_metasocket(struct sock *sk, struct sock *meta_sk) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ >+ switch (sk->sk_family) { >+#if IS_ENABLED(CONFIG_IPV6) >+ case AF_INET6: >+ /* If the socket is v4 mapped, we continue with v4 operations */ >+ if (!mptcp_v6_is_v4_mapped(sk)) { >+ mpcb->locaddr6[0].addr = inet6_sk(sk)->saddr; >+ mpcb->locaddr6[0].id = 0; >+ mpcb->locaddr6[0].port = 0; >+ mpcb->locaddr6[0].low_prio = 0; >+ mpcb->loc6_bits |= 1; >+ mpcb->next_v6_index = 1; >+ >+ mptcp_v6_add_raddress(mpcb, >+ &inet6_sk(sk)->daddr, 0, 0); >+ mptcp_v6_set_init_addr_bit(mpcb, &inet6_sk(sk)->daddr); >+ break; >+ } >+#endif >+ case AF_INET: >+ mpcb->locaddr4[0].addr.s_addr = inet_sk(sk)->inet_saddr; >+ mpcb->locaddr4[0].id = 0; >+ mpcb->locaddr4[0].port = 0; >+ mpcb->locaddr4[0].low_prio = 0; >+ mpcb->loc4_bits |= 1; >+ mpcb->next_v4_index = 1; >+ >+ mptcp_v4_add_raddress(mpcb, >+ (struct in_addr *)&inet_sk(sk)->inet_daddr, >+ 0, 0); >+ mptcp_v4_set_init_addr_bit(mpcb, inet_sk(sk)->inet_daddr); >+ break; >+ } >+ >+ mptcp_set_addresses(meta_sk); >+ >+ switch (sk->sk_family) { >+ case AF_INET: >+ tcp_sk(sk)->mptcp->low_prio = mpcb->locaddr4[0].low_prio; >+ break; >+#if IS_ENABLED(CONFIG_IPV6) >+ case AF_INET6: >+ tcp_sk(sk)->mptcp->low_prio = mpcb->locaddr6[0].low_prio; >+ break; >+#endif >+ } >+ >+ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio; >+} >+ >+/* Clean up the receive buffer for full frames taken by the user, >+ * then send an ACK if necessary. COPIED is the number of bytes >+ * tcp_recvmsg has given to the user so far, it speeds up the >+ * calculation of whether or not we must ACK for the sake of >+ * a window update. >+ */ >+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sock *sk; >+ __u32 rcv_window_now = 0; >+ >+ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) { >+ rcv_window_now = tcp_receive_window(meta_tp); >+ >+ if (2 * rcv_window_now > meta_tp->window_clamp) >+ rcv_window_now = 0; >+ } >+ >+ mptcp_for_each_sk(meta_tp->mpcb, sk) { >+ struct tcp_sock *tp = tcp_sk(sk); >+ const struct inet_connection_sock *icsk = inet_csk(sk); >+ >+ if (!mptcp_sk_can_send_ack(sk)) >+ continue; >+ >+ if (!inet_csk_ack_scheduled(sk)) >+ goto second_part; >+ /* Delayed ACKs frequently hit locked sockets during bulk >+ * receive. >+ */ >+ if (icsk->icsk_ack.blocked || >+ /* Once-per-two-segments ACK was not sent by tcp_input.c */ >+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || >+ /* If this read emptied read buffer, we send ACK, if >+ * connection is not bidirectional, user drained >+ * receive buffer and there was a small segment >+ * in queue. >+ */ >+ (copied > 0 && >+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || >+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && >+ !icsk->icsk_ack.pingpong)) && >+ !atomic_read(&meta_sk->sk_rmem_alloc))) { >+ tcp_send_ack(sk); >+ continue; >+ } >+ >+second_part: >+ /* This here is the second part of tcp_cleanup_rbuf */ >+ if (rcv_window_now) { >+ __u32 new_window = __tcp_select_window(sk); >+ >+ /* Send ACK now, if this read freed lots of space >+ * in our buffer. Certainly, new_window is new window. >+ * We can advertise it now, if it is not less than >+ * current one. >+ * "Lots" means "at least twice" here. 
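The per-subflow test in mptcp_cleanup_rbuf() reuses tcp_cleanup_rbuf()'s rule: after the reader has drained the receive queue, a pure window-update ACK is only worth sending when the newly selectable window is at least double what the peer was last told. A minimal sketch of that predicate:

    #include <stdint.h>
    #include <stdio.h>

    static int should_ack(uint32_t new_window, uint32_t rcv_window_now)
    {
            /* "Lots" means "at least twice", as in tcp_cleanup_rbuf() */
            return new_window && new_window >= 2 * rcv_window_now;
    }

    int main(void)
    {
            printf("%d\n", should_ack(64000, 16000));       /* 1: worth an ACK */
            printf("%d\n", should_ack(20000, 16000));       /* 0: not yet */
            return 0;
    }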
>+ */ >+ if (new_window && new_window >= 2 * rcv_window_now) >+ tcp_send_ack(sk); >+ } >+ } >+} >+ >+static int mptcp_sub_send_fin(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct sk_buff *skb = tcp_write_queue_tail(sk); >+ int mss_now; >+ >+ /* Optimization, tack on the FIN if we have a queue of >+ * unsent frames. But be careful about outgoing SACKS >+ * and IP options. >+ */ >+ mss_now = tcp_current_mss(sk); >+ >+ if (tcp_send_head(sk) != NULL) { >+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; >+ TCP_SKB_CB(skb)->end_seq++; >+ tp->write_seq++; >+ } else { >+ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC); >+ if (!skb) >+ return 1; >+ >+ /* Reserve space for headers and prepare control bits. */ >+ skb_reserve(skb, MAX_TCP_HEADER); >+ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ >+ tcp_init_nondata_skb(skb, tp->write_seq, >+ TCPHDR_ACK | TCPHDR_FIN); >+ tcp_queue_skb(sk, skb); >+ } >+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); >+ >+ return 0; >+} >+ >+void mptcp_sub_close_wq(struct work_struct *work) >+{ >+ struct mptcp_tcp_sock *mptcp = container_of(work, struct mptcp_tcp_sock, work.work); >+ struct tcp_sock *tp = mptcp->tp; >+ struct sock *sk = (struct sock *)tp; >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ >+ mutex_lock(&tp->mpcb->mutex); >+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); >+ >+ if (sock_flag(sk, SOCK_DEAD)) >+ goto exit; >+ >+ /* We come from tcp_disconnect. We are sure that meta_sk is set */ >+ if (!tp->mpc) { >+ tp->closing = 1; >+ sock_rps_reset_flow(sk); >+ tcp_close(sk, 0); >+ goto exit; >+ } >+ >+ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) { >+ tp->closing = 1; >+ sock_rps_reset_flow(sk); >+ tcp_close(sk, 0); >+ } else if (tcp_close_state(sk)) { >+ sk->sk_shutdown |= SEND_SHUTDOWN; >+ tcp_send_fin(sk); >+ } >+ >+exit: >+ release_sock(meta_sk); >+ mutex_unlock(&tp->mpcb->mutex); >+ sock_put(sk); >+} >+ >+void mptcp_sub_close(struct sock *sk, unsigned long delay) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct delayed_work *work = &tcp_sk(sk)->mptcp->work; >+ >+ /* We are already closing - e.g., call from sock_def_error_report upon >+ * tcp_disconnect in tcp_close. >+ */ >+ if (tp->closing) >+ return; >+ >+ /* Work already scheduled ? */ >+ if (work_pending(&work->work)) { >+ /* Work present - who will be first ? */ >+ if (jiffies + delay > work->timer.expires) >+ return; >+ >+ /* Try canceling - if it fails, work will be executed soon */ >+ if (!cancel_delayed_work(work)) >+ return; >+ sock_put(sk); >+ } >+ >+ if (!delay) { >+ unsigned char old_state = sk->sk_state; >+ >+ /* If we are in user-context we can directly do the closing >+ * procedure. No need to schedule a work-queue. >+ */ >+ if (!in_softirq()) { >+ if (sock_flag(sk, SOCK_DEAD)) >+ return; >+ >+ if (!tp->mpc) { >+ tp->closing = 1; >+ sock_rps_reset_flow(sk); >+ tcp_close(sk, 0); >+ return; >+ } >+ >+ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK || >+ sk->sk_state == TCP_CLOSE) { >+ tp->closing = 1; >+ sock_rps_reset_flow(sk); >+ tcp_close(sk, 0); >+ } else if (tcp_close_state(sk)) { >+ sk->sk_shutdown |= SEND_SHUTDOWN; >+ tcp_send_fin(sk); >+ } >+ >+ return; >+ } >+ >+ /* We directly send the FIN. Because it may take so a long time, >+ * untile the work-queue will get scheduled... >+ * >+ * If mptcp_sub_send_fin returns 1, it failed and thus we reset >+ * the old state so that tcp_close will finally send the fin >+ * in user-context. 
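The scheduling dance in mptcp_sub_close() above keeps whichever close fires first: a pending work item is only replaced when the new request is earlier and the pending one can still be cancelled. An illustrative sketch, with plain numbers standing in for jiffies and the timer expiry:

    #include <stdio.h>

    static int should_replace(unsigned long now, unsigned long delay,
                              unsigned long pending_expiry)
    {
            /* Mirrors: if (jiffies + delay > work->timer.expires) return; */
            return now + delay <= pending_expiry;
    }

    int main(void)
    {
            printf("%d\n", should_replace(0, 50, 100));     /* 1: earlier, replace */
            printf("%d\n", should_replace(0, 150, 100));    /* 0: keep pending */
            return 0;
    }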
>+ */ >+ if (!sk->sk_err && old_state != TCP_CLOSE && >+ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) { >+ if (old_state == TCP_ESTABLISHED) >+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); >+ sk->sk_state = old_state; >+ } >+ } >+ >+ sock_hold(sk); >+ queue_delayed_work(mptcp_wq, work, delay); >+} >+ >+/* Update the mpcb send window, based on the contributions >+ * of each subflow >+ */ >+void mptcp_update_sndbuf(struct mptcp_cb *mpcb) >+{ >+ struct sock *meta_sk = mpcb->meta_sk, *sk; >+ int new_sndbuf = 0; >+ mptcp_for_each_sk(mpcb, sk) { >+ if (!mptcp_sk_can_send(sk)) >+ continue; >+ >+ new_sndbuf += sk->sk_sndbuf; >+ >+ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) { >+ new_sndbuf = sysctl_tcp_wmem[2]; >+ break; >+ } >+ } >+ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf); >+} >+ >+void mptcp_close(struct sock *meta_sk, long timeout) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sock *sk_it, *tmpsk; >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ struct sk_buff *skb; >+ int data_was_unread = 0; >+ int state; >+ >+ mptcp_debug("%s: Close of meta_sk with tok %#x\n", >+ __func__, mpcb->mptcp_loc_token); >+ >+ mutex_lock(&mpcb->mutex); >+ lock_sock(meta_sk); >+ >+ if (meta_tp->inside_tk_table) { >+ /* Detach the mpcb from the token hashtable */ >+ mptcp_hash_remove_bh(meta_tp); >+ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue); >+ } >+ >+ meta_sk->sk_shutdown = SHUTDOWN_MASK; >+ /* We need to flush the recv. buffs. We do this only on the >+ * descriptor close, not protocol-sourced closes, because the >+ * reader process may not have drained the data yet! >+ */ >+ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) { >+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - >+ tcp_hdr(skb)->fin; >+ data_was_unread += len; >+ __kfree_skb(skb); >+ } >+ >+ sk_mem_reclaim(meta_sk); >+ >+ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ >+ if (meta_sk->sk_state == TCP_CLOSE) { >+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) >+ mptcp_sub_close(sk_it, 0); >+ goto adjudge_to_death; >+ } >+ >+ if (data_was_unread) { >+ /* Unread data was tossed, zap the connection. */ >+ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE); >+ tcp_set_state(meta_sk, TCP_CLOSE); >+ tcp_send_active_reset(meta_sk, meta_sk->sk_allocation); >+ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) { >+ /* Check zero linger _after_ checking for unread data. */ >+ meta_sk->sk_prot->disconnect(meta_sk, 0); >+ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); >+ } else if (tcp_close_state(meta_sk)) { >+ mptcp_send_fin(meta_sk); >+ } else if (meta_tp->snd_una == meta_tp->write_seq) { >+ /* The DATA_FIN has been sent and acknowledged >+ * (e.g., by sk_shutdown). Close all the other subflows >+ */ >+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { >+ unsigned long delay = 0; >+ /* If we are the passive closer, don't trigger >+ * subflow-fin until the subflow has been finned >+ * by the peer. - thus we add a delay >+ */ >+ if (mpcb->passive_close && >+ sk_it->sk_state == TCP_ESTABLISHED) >+ delay = inet_csk(sk_it)->icsk_rto << 3; >+ >+ mptcp_sub_close(sk_it, delay); >+ } >+ } >+ >+ sk_stream_wait_close(meta_sk, timeout); >+ >+adjudge_to_death: >+ state = meta_sk->sk_state; >+ sock_hold(meta_sk); >+ sock_orphan(meta_sk); >+ >+ /* socket will be freed after mptcp_close - we have to prevent >+ * access from the subflows. 
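mptcp_update_sndbuf() above sizes the meta-level send buffer as the sum of the sendable subflows' buffers, clamped to the tcp_wmem maximum and never shrunk below the current value. A userspace sketch, where wmem_max stands in for sysctl_tcp_wmem[2]:

    #include <stdio.h>

    static int meta_sndbuf(const int *bufs, int n, int wmem_max, int cur)
    {
            int sum = 0, i;

            for (i = 0; i < n; i++) {
                    sum += bufs[i];
                    if (sum > wmem_max || sum < 0) {        /* overflow clamps too */
                            sum = wmem_max;
                            break;
                    }
            }
            return sum > cur ? sum : cur;   /* max(min(sum, wmem_max), cur) */
    }

    int main(void)
    {
            int bufs[2] = { 200000, 300000 };

            printf("%d\n", meta_sndbuf(bufs, 2, 4194304, 100000));  /* 500000 */
            return 0;
    }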
>+ */ >+ mptcp_for_each_sk(mpcb, sk_it) { >+ /* Similar to sock_orphan, but we don't set it DEAD, because >+ * the callbacks are still set and must be called. >+ */ >+ write_lock_bh(&sk_it->sk_callback_lock); >+ sk_set_socket(sk_it, NULL); >+ sk_it->sk_wq = NULL; >+ write_unlock_bh(&sk_it->sk_callback_lock); >+ } >+ >+ /* It is the last release_sock in its life. It will remove backlog. */ >+ release_sock(meta_sk); >+ >+ /* Now socket is owned by kernel and we acquire BH lock >+ * to finish close. No need to check for user refs. >+ */ >+ local_bh_disable(); >+ bh_lock_sock(meta_sk); >+ WARN_ON(sock_owned_by_user(meta_sk)); >+ >+ percpu_counter_inc(meta_sk->sk_prot->orphan_count); >+ >+ /* Have we already been destroyed by a softirq or backlog? */ >+ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE) >+ goto out; >+ >+ /* This is a (useful) BSD violating of the RFC. There is a >+ * problem with TCP as specified in that the other end could >+ * keep a socket open forever with no application left this end. >+ * We use a 3 minute timeout (about the same as BSD) then kill >+ * our end. If they send after that then tough - BUT: long enough >+ * that we won't make the old 4*rto = almost no time - whoops >+ * reset mistake. >+ * >+ * Nope, it was not mistake. It is really desired behaviour >+ * f.e. on http servers, when such sockets are useless, but >+ * consume significant resources. Let's do it with special >+ * linger2 option. --ANK >+ */ >+ >+ if (meta_sk->sk_state == TCP_FIN_WAIT2) { >+ if (meta_tp->linger2 < 0) { >+ tcp_set_state(meta_sk, TCP_CLOSE); >+ tcp_send_active_reset(meta_sk, GFP_ATOMIC); >+ NET_INC_STATS_BH(sock_net(meta_sk), >+ LINUX_MIB_TCPABORTONLINGER); >+ } else { >+ const int tmo = tcp_fin_time(meta_sk); >+ >+ if (tmo > TCP_TIMEWAIT_LEN) { >+ inet_csk_reset_keepalive_timer(meta_sk, >+ tmo - TCP_TIMEWAIT_LEN); >+ } else { >+ tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo); >+ goto out; >+ } >+ } >+ } >+ if (meta_sk->sk_state != TCP_CLOSE) { >+ sk_mem_reclaim(meta_sk); >+ if (tcp_too_many_orphans(meta_sk, 0)) { >+ if (net_ratelimit()) >+ pr_info("MPTCP: too many of orphaned sockets\n"); >+ tcp_set_state(meta_sk, TCP_CLOSE); >+ tcp_send_active_reset(meta_sk, GFP_ATOMIC); >+ NET_INC_STATS_BH(sock_net(meta_sk), >+ LINUX_MIB_TCPABORTONMEMORY); >+ } >+ } >+ >+ >+ if (meta_sk->sk_state == TCP_CLOSE) >+ inet_csk_destroy_sock(meta_sk); >+ /* Otherwise, socket is reprieved until protocol close. */ >+ >+out: >+ bh_unlock_sock(meta_sk); >+ local_bh_enable(); >+ mutex_unlock(&mpcb->mutex); >+ sock_put(meta_sk); /* Taken by sock_hold */ >+} >+ >+/* Returns 1 if we should enable MPTCP for that socket. 
*/ >+int mptcp_doit(struct sock *sk) >+{ >+ /* Do not allow MPTCP enabling if the MPTCP initialization failed */ >+ if (mptcp_init_failed) >+ return 0; >+ >+ /* Socket may already be established (e.g., called from tcp_recvmsg) */ >+ if (tcp_sk(sk)->mpc || tcp_sk(sk)->request_mptcp) >+ return 1; >+ >+ if (!sysctl_mptcp_enabled) >+ return 0; >+ >+ /* Don't do mptcp over loopback or local addresses */ >+ if (sk->sk_family == AF_INET && >+ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) || >+ ipv4_is_loopback(inet_sk(sk)->inet_saddr))) >+ return 0; >+ if (sk->sk_family == AF_INET6 && >+ (ipv6_addr_loopback(&inet6_sk(sk)->daddr) || >+ ipv6_addr_loopback(&inet6_sk(sk)->saddr))) >+ return 0; >+ if (mptcp_v6_is_v4_mapped(sk) && >+ ipv4_is_loopback(inet_sk(sk)->inet_saddr)) >+ return 0; >+ >+ return 1; >+} >+ >+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window) >+{ >+ struct tcp_sock *master_tp; >+ struct sock *master_sk; >+ >+ if (mptcp_alloc_mpcb(meta_sk, remote_key, window)) >+ goto err_alloc_mpcb; >+ >+ master_sk = tcp_sk(meta_sk)->mpcb->master_sk; >+ master_tp = tcp_sk(master_sk); >+ >+ if (mptcp_add_sock(meta_sk, master_sk, 0, GFP_ATOMIC)) >+ goto err_add_sock; >+ >+ if (__inet_inherit_port(meta_sk, master_sk) < 0) >+ goto err_add_sock; >+ >+ meta_sk->sk_prot->unhash(meta_sk); >+ >+ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk)) >+ __inet_hash_nolisten(master_sk, NULL); >+#if IS_ENABLED(CONFIG_IPV6) >+ else >+ __inet6_hash(master_sk, NULL); >+#endif >+ >+ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd; >+ >+ return 0; >+ >+err_add_sock: >+ mptcp_fallback_meta_sk(meta_sk); >+ >+ inet_csk_prepare_forced_close(master_sk); >+ tcp_done(master_sk); >+ inet_csk_prepare_forced_close(meta_sk); >+ tcp_done(meta_sk); >+ >+err_alloc_mpcb: >+ return -ENOBUFS; >+} >+ >+int mptcp_check_req_master(struct sock *sk, struct sock *child, >+ struct request_sock *req, >+ struct request_sock **prev, >+ struct mptcp_options_received *mopt) >+{ >+ struct tcp_sock *child_tp = tcp_sk(child); >+ struct sock *meta_sk = child; >+ struct mptcp_cb *mpcb; >+ struct mptcp_request_sock *mtreq; >+ >+ if (!tcp_rsk(req)->saw_mpc) >+ return 1; >+ >+ /* Just set this values to pass them to mptcp_alloc_mpcb */ >+ mtreq = mptcp_rsk(req); >+ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key; >+ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token; >+ >+ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key, >+ child_tp->snd_wnd)) >+ return -ENOBUFS; >+ >+ child = tcp_sk(child)->mpcb->master_sk; >+ child_tp = tcp_sk(child); >+ mpcb = child_tp->mpcb; >+ >+ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; >+ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; >+ >+ mpcb->dss_csum = mtreq->dss_csum; >+ mpcb->server_side = 1; >+ >+ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */ >+ mptcp_update_metasocket(child, meta_sk); >+ >+ /* Needs to be done here additionally, because when accepting a >+ * new connection we pass by __reqsk_free and not reqsk_free. >+ */ >+ mptcp_reqsk_remove_tk(req); >+ >+ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. 
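The loopback checks in mptcp_doit() exist because no second path can ever be added to a 127.0.0.0/8 connection, so such sockets fall back to plain TCP. A userspace analogue of the IPv4 test:

    #include <arpa/inet.h>
    #include <stdio.h>

    static int is_loopback_v4(const char *addr)
    {
            struct in_addr a;

            if (inet_pton(AF_INET, addr, &a) != 1)
                    return 0;
            /* same test as ipv4_is_loopback(): top byte == 127 */
            return (ntohl(a.s_addr) & 0xff000000) == 0x7f000000;
    }

    int main(void)
    {
            printf("%d\n", is_loopback_v4("127.0.0.1"));    /* 1: plain TCP */
            printf("%d\n", is_loopback_v4("192.0.2.1"));    /* 0: MPTCP ok */
            return 0;
    }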
*/ >+ sock_put(meta_sk); >+ >+ inet_csk_reqsk_queue_unlink(sk, req, prev); >+ inet_csk_reqsk_queue_removed(sk, req); >+ inet_csk_reqsk_queue_add(sk, req, meta_sk); >+ >+ return 0; >+} >+ >+struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child, >+ struct request_sock *req, >+ struct request_sock **prev, >+ struct mptcp_options_received *mopt) >+{ >+ struct tcp_sock *child_tp = tcp_sk(child); >+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); >+ struct mptcp_cb *mpcb = mtreq->mpcb; >+ u8 hash_mac_check[20]; >+ >+ child_tp->inside_tk_table = 0; >+ >+ if (!mopt->join_ack) >+ goto teardown; >+ >+ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, >+ (u8 *)&mpcb->mptcp_loc_key, >+ (u8 *)&mtreq->mptcp_rem_nonce, >+ (u8 *)&mtreq->mptcp_loc_nonce, >+ (u32 *)hash_mac_check); >+ >+ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) >+ goto teardown; >+ >+ /* Point it to the same struct socket and wq as the meta_sk */ >+ sk_set_socket(child, meta_sk->sk_socket); >+ child->sk_wq = meta_sk->sk_wq; >+ >+ if (mptcp_add_sock(meta_sk, child, mtreq->rem_id, GFP_ATOMIC)) { >+ child_tp->mpc = 0; /* Has been inherited, but now >+ * child_tp->mptcp is NULL >+ */ >+ /* TODO when we support acking the third ack for new subflows, >+ * we should silently discard this third ack, by returning NULL. >+ * >+ * Maybe, at the retransmission we will have enough memory to >+ * fully add the socket to the meta-sk. >+ */ >+ goto teardown; >+ } >+ >+ /* The child is a clone of the meta socket, we must now reset >+ * some of the fields >+ */ >+ child_tp->mptcp->rcv_low_prio = mtreq->low_prio; >+ >+ /* We should allow proper increase of the snd/rcv-buffers. Thus, we >+ * use the original values instead of the bloated up ones from the >+ * clone. >+ */ >+ child->sk_sndbuf = mpcb->orig_sk_sndbuf; >+ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf; >+ >+ child_tp->mptcp->slave_sk = 1; >+ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; >+ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; >+ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd; >+ >+ child_tp->tsq_flags = 0; >+ >+ /* Subflows do not use the accept queue, as they >+ * are attached immediately to the mpcb. >+ */ >+ inet_csk_reqsk_queue_drop(meta_sk, req, prev); >+ return child; >+ >+teardown: >+ /* Drop this request - sock creation failed. */ >+ inet_csk_reqsk_queue_drop(meta_sk, req, prev); >+ inet_csk_prepare_forced_close(child); >+ tcp_done(child); >+ return meta_sk; >+} >+ >+int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw) >+{ >+ struct mptcp_tw *mptw; >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct mptcp_cb *mpcb = tp->mpcb; >+ >+ /* Alloc MPTCP-tw-sock */ >+ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC); >+ if (!mptw) >+ return -ENOBUFS; >+ >+ atomic_inc(&mpcb->refcnt); >+ >+ tw->mptcp_tw = mptw; >+ mptw->loc_key = mpcb->mptcp_loc_key; >+ mptw->meta_tw = mpcb->in_time_wait; >+ if (mptw->meta_tw) { >+ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); >+ if (mpcb->mptw_state != TCP_TIME_WAIT) >+ mptw->rcv_nxt++; >+ } >+ rcu_assign_pointer(mptw->mpcb, mpcb); >+ >+ spin_lock(&mpcb->tw_lock); >+ list_add_rcu(&mptw->list, &tp->mpcb->tw_list); >+ mptw->in_list = 1; >+ spin_unlock(&mpcb->tw_lock); >+ >+ return 0; >+} >+ >+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) >+{ >+ struct mptcp_cb *mpcb; >+ >+ rcu_read_lock(); >+ mpcb = rcu_dereference(tw->mptcp_tw->mpcb); >+ >+ /* If we are still holding a ref to the mpcb, we have to remove ourself >+ * from the list and drop the ref properly. 
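mptcp_check_req_child() above implements the final step of the MP_JOIN handshake: the third ACK must carry a 160-bit HMAC computed over both keys and both nonces, and a mismatch tears the request down. A sketch of the accept/teardown decision, where compute_hmac() is a stand-in for mptcp_hmac_sha1():

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for mptcp_hmac_sha1() over both keys and both nonces. */
    static void compute_hmac(unsigned char mac[20])
    {
            memset(mac, 0xab, 20);
    }

    int main(void)
    {
            unsigned char expected[20], received[20];

            compute_hmac(expected);
            memcpy(received, expected, 20); /* as carried in the third ACK */

            if (memcmp(expected, received, 20) == 0)
                    puts("MP_JOIN accepted");
            else
                    puts("teardown: bad HMAC");
            return 0;
    }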
>+ */ >+ if (mpcb && atomic_inc_not_zero(&mpcb->refcnt)) { >+ spin_lock(&mpcb->tw_lock); >+ if (tw->mptcp_tw->in_list) { >+ list_del_rcu(&tw->mptcp_tw->list); >+ tw->mptcp_tw->in_list = 0; >+ } >+ spin_unlock(&mpcb->tw_lock); >+ >+ /* Twice, because we increased it above */ >+ mptcp_mpcb_put(mpcb); >+ mptcp_mpcb_put(mpcb); >+ } >+ >+ rcu_read_unlock(); >+ >+ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw); >+} >+ >+/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a >+ * data-fin. >+ */ >+void mptcp_update_tw_socks(const struct tcp_sock *tp, int state) >+{ >+ struct mptcp_tw *mptw; >+ >+ /* Used for sockets that go into tw after the meta >+ * (see mptcp_time_wait()) >+ */ >+ tp->mpcb->in_time_wait = 1; >+ tp->mpcb->mptw_state = state; >+ >+ /* Update the time-wait-sock's information */ >+ rcu_read_lock_bh(); >+ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) { >+ mptw->meta_tw = 1; >+ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp); >+ >+ /* We want to ack a DATA_FIN, but are yet in FIN_WAIT_2 - >+ * pretend as if the DATA_FIN has already reached us, that way >+ * the checks in tcp_timewait_state_process will be good as the >+ * DATA_FIN comes in. >+ */ >+ if (state != TCP_TIME_WAIT) >+ mptw->rcv_nxt++; >+ } >+ rcu_read_unlock_bh(); >+} >+ >+struct workqueue_struct *mptcp_wq; >+ >+/* General initialization of mptcp */ >+void __init mptcp_init(void) >+{ >+#ifdef CONFIG_SYSCTL >+ struct ctl_table_header *mptcp_sysctl; >+#endif >+ >+ mptcp_sock_cache = kmem_cache_create("mptcp_sock", >+ sizeof(struct mptcp_tcp_sock), >+ 0, SLAB_HWCACHE_ALIGN, >+ NULL); >+ if (!mptcp_sock_cache) >+ goto mptcp_sock_cache_failed; >+ >+ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb), >+ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, >+ NULL); >+ if (!mptcp_cb_cache) >+ goto mptcp_cb_cache_failed; >+ >+ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw), >+ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, >+ NULL); >+ if (!mptcp_tw_cache) >+ goto mptcp_tw_cache_failed; >+ >+ get_random_bytes(mptcp_secret, sizeof(mptcp_secret)); >+ >+ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); >+ if (!mptcp_wq) >+ goto alloc_workqueue_failed; >+ >+ if (mptcp_pm_init()) >+ goto mptcp_pm_failed; >+ >+#ifdef CONFIG_SYSCTL >+ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table); >+ if (!mptcp_sysctl) >+ goto register_sysctl_failed; >+#endif >+ >+ pr_info("MPTCP: Stable release v0.87.2"); >+ >+ mptcp_init_failed = false; >+ >+ return; >+ >+#ifdef CONFIG_SYSCTL >+register_sysctl_failed: >+ mptcp_pm_undo(); >+#endif >+mptcp_pm_failed: >+ destroy_workqueue(mptcp_wq); >+alloc_workqueue_failed: >+ kmem_cache_destroy(mptcp_tw_cache); >+mptcp_tw_cache_failed: >+ kmem_cache_destroy(mptcp_cb_cache); >+mptcp_cb_cache_failed: >+ kmem_cache_destroy(mptcp_sock_cache); >+mptcp_sock_cache_failed: >+ mptcp_init_failed = true; >+} >diff -Naur a/linux-3.11/net/mptcp/mptcp_input.c b/linux-3.11/net/mptcp/mptcp_input.c >--- a/linux-3.11/net/mptcp/mptcp_input.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_input.c 2013-10-05 18:34:49.269364701 +0200 >@@ -0,0 +1,1904 @@ >+/* >+ * MPTCP implementation - Sending side >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * 
Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. >+ */ >+ >+#include <asm/unaligned.h> >+ >+#include <net/mptcp.h> >+#include <net/mptcp_v4.h> >+#include <net/mptcp_v6.h> >+ >+#include <linux/kconfig.h> >+ >+static inline void mptcp_become_fully_estab(struct sock *sk) >+{ >+ tcp_sk(sk)->mptcp->fully_established = 1; >+ >+ if (is_master_tp(tcp_sk(sk))) >+ mptcp_create_subflows(mptcp_meta_sk(sk)); >+} >+ >+/* Similar to tcp_tso_acked without any memory accounting */ >+static inline int mptcp_tso_acked_reinject(struct sock *sk, struct sk_buff *skb) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ u32 packets_acked, len; >+ >+ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); >+ >+ packets_acked = tcp_skb_pcount(skb); >+ >+ if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) >+ return 0; >+ >+ len = tp->snd_una - TCP_SKB_CB(skb)->seq; >+ __pskb_trim_head(skb, len); >+ >+ TCP_SKB_CB(skb)->seq += len; >+ skb->ip_summed = CHECKSUM_PARTIAL; >+ skb->truesize -= len; >+ >+ /* Any change of skb->len requires recalculation of tso factor. */ >+ if (tcp_skb_pcount(skb) > 1) >+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); >+ packets_acked -= tcp_skb_pcount(skb); >+ >+ if (packets_acked) { >+ BUG_ON(tcp_skb_pcount(skb) == 0); >+ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); >+ } >+ >+ return packets_acked; >+} >+ >+/** >+ * Cleans the meta-socket retransmission queue and the reinject-queue. >+ * @sk must be the metasocket. >+ */ >+static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una) >+{ >+ struct sk_buff *skb, *tmp; >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ bool acked = false; >+ u32 acked_pcount; >+ >+ while ((skb = tcp_write_queue_head(meta_sk)) && >+ skb != tcp_send_head(meta_sk)) { >+ bool fully_acked = true; >+ >+ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { >+ if (tcp_skb_pcount(skb) == 1 || >+ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) >+ break; >+ >+ acked_pcount = tcp_tso_acked(meta_sk, skb); >+ if (!acked_pcount) >+ break; >+ >+ fully_acked = false; >+ } else { >+ acked_pcount = tcp_skb_pcount(skb); >+ } >+ >+ acked = true; >+ meta_tp->packets_out -= acked_pcount; >+ meta_tp->retrans_stamp = 0; >+ >+ if (!fully_acked) >+ break; >+ >+ tcp_unlink_write_queue(skb, meta_sk); >+ >+ if (mptcp_is_data_fin(skb)) { >+ struct sock *sk_it; >+ >+ /* DATA_FIN has been acknowledged - now we can close >+ * the subflows >+ */ >+ mptcp_for_each_sk(mpcb, sk_it) { >+ unsigned long delay = 0; >+ >+ /* If we are the passive closer, don't trigger >+ * subflow-fin until the subflow has been finned >+ * by the peer - thus we add a delay. 
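>+				 *
>+				 * (The shift below is plain arithmetic:
>+				 * delay = icsk_rto << 3, i.e. eight
>+				 * retransmission timeouts. For example, an
>+				 * icsk_rto equivalent to 200 ms defers the
>+				 * subflow-FIN by roughly 1.6 s, giving the
>+				 * peer's own FIN time to arrive first.)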
>+ */ >+ if (mpcb->passive_close && >+ sk_it->sk_state == TCP_ESTABLISHED) >+ delay = inet_csk(sk_it)->icsk_rto << 3; >+ >+ mptcp_sub_close(sk_it, delay); >+ } >+ } >+ sk_wmem_free_skb(meta_sk, skb); >+ } >+ /* Remove acknowledged data from the reinject queue */ >+ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) { >+ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { >+ if (tcp_skb_pcount(skb) == 1 || >+ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) >+ break; >+ >+ mptcp_tso_acked_reinject(meta_sk, skb); >+ break; >+ } >+ >+ __skb_unlink(skb, &mpcb->reinject_queue); >+ __kfree_skb(skb); >+ } >+ >+ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una))) >+ meta_tp->snd_up = meta_tp->snd_una; >+ >+ if (acked) { >+ tcp_rearm_rto(meta_sk); >+ /* Normally this is done in tcp_try_undo_loss - but MPTCP >+ * does not call this function. >+ */ >+ inet_csk(meta_sk)->icsk_retransmits = 0; >+ } >+} >+ >+/* Inspired by tcp_rcv_state_process */ >+static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk, >+ const struct sk_buff *skb, u32 data_seq, >+ u16 data_len) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); >+ struct tcphdr *th = tcp_hdr(skb); >+ >+ /* State-machine handling if FIN has been enqueued and he has >+ * been acked (snd_una == write_seq) - it's important that this >+ * here is after sk_wmem_free_skb because otherwise >+ * sk_forward_alloc is wrong upon inet_csk_destroy_sock() >+ */ >+ switch (meta_sk->sk_state) { >+ case TCP_FIN_WAIT1: >+ if (meta_tp->snd_una == meta_tp->write_seq) { >+ struct dst_entry *dst = __sk_dst_get(meta_sk); >+ >+ tcp_set_state(meta_sk, TCP_FIN_WAIT2); >+ meta_sk->sk_shutdown |= SEND_SHUTDOWN; >+ >+ dst = __sk_dst_get(sk); >+ if (dst) >+ dst_confirm(dst); >+ >+ if (!sock_flag(meta_sk, SOCK_DEAD)) { >+ /* Wake up lingering close() */ >+ meta_sk->sk_state_change(meta_sk); >+ } else { >+ int tmo; >+ >+ if (meta_tp->linger2 < 0 || >+ (data_len && >+ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0), >+ meta_tp->rcv_nxt))) { >+ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); >+ tcp_done(meta_sk); >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); >+ return 1; >+ } >+ >+ tmo = tcp_fin_time(meta_sk); >+ if (tmo > TCP_TIMEWAIT_LEN) { >+ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN); >+ } else if (mptcp_is_data_fin2(skb, tp) || >+ sock_owned_by_user(meta_sk)) { >+ /* Bad case. We could lose such FIN otherwise. >+ * It is not a big problem, but it looks confusing >+ * and not so rare event. We still can lose it now, >+ * if it spins in bh_lock_sock(), but it is really >+ * marginal case. >+ */ >+ inet_csk_reset_keepalive_timer(meta_sk, tmo); >+ } else { >+ tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo); >+ } >+ } >+ } >+ break; >+ case TCP_CLOSING: >+ case TCP_LAST_ACK: >+ if (meta_tp->snd_una == meta_tp->write_seq) { >+ tcp_done(meta_sk); >+ return 1; >+ } >+ break; >+ } >+ >+ /* step 7: process the segment text */ >+ switch (meta_sk->sk_state) { >+ case TCP_FIN_WAIT1: >+ case TCP_FIN_WAIT2: >+ /* RFC 793 says to queue data in these states, >+ * RFC 1122 says we MUST send a reset. >+ * BSD 4.4 also does reset. 
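>+	 *
>+	 * (A worked example for the FIN_WAIT1 arm above: tcp_fin_time()
>+	 * essentially returns linger2 resp. the tcp_fin_timeout sysctl,
>+	 * floored by a few RTOs; with the usual 60 s default and a
>+	 * TCP_TIMEWAIT_LEN of 60 s, tmo is not larger than
>+	 * TCP_TIMEWAIT_LEN, so the meta-socket goes straight to
>+	 * tcp_time_wait(TCP_FIN_WAIT2, tmo) instead of arming the
>+	 * keepalive timer first.)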
>+ */ >+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { >+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && >+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && >+ !mptcp_is_data_fin2(skb, tp)) { >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); >+ >+ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); >+ } >+ } >+ break; >+ } >+ >+ return 0; >+} >+ >+/** >+ * @return: >+ * i) 1: Everything's fine. >+ * ii) -1: A reset has been sent on the subflow - csum-failure >+ * iii) 0: csum-failure but no reset sent, because it's the last subflow. >+ * Last packet should not be destroyed by the caller because it has >+ * been done here. >+ */ >+static int mptcp_verif_dss_csum(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct sk_buff *tmp, *tmp1, *last = NULL; >+ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */ >+ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0; >+ int iter = 0; >+ >+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) { >+ unsigned int csum_len; >+ >+ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq)) >+ /* Mapping ends in the middle of the packet - >+ * csum only these bytes >+ */ >+ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq; >+ else >+ csum_len = tmp->len; >+ >+ offset = 0; >+ if (overflowed) { >+ char first_word[4]; >+ first_word[0] = 0; >+ first_word[1] = 0; >+ first_word[2] = 0; >+ first_word[3] = *(tmp->data); >+ csum_tcp = csum_partial(first_word, 4, csum_tcp); >+ offset = 1; >+ csum_len--; >+ overflowed = 0; >+ } >+ >+ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp); >+ >+ /* Was it on an odd-length? Then we have to merge the next byte >+ * correctly (see above) >+ */ >+ if (csum_len != (csum_len & (~1))) >+ overflowed = 1; >+ >+ if (mptcp_is_data_seq(tmp) && !dss_csum_added) { >+ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32)); >+ >+ /* If a 64-bit dss is present, we increase the offset >+ * by 4 bytes, as the high-order 64-bits will be added >+ * in the final csum_partial-call. 
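>+			 *
>+			 * (Background on the "overflowed" handling above:
>+			 * the DSS checksum is a 16-bit one's-complement
>+			 * sum, so byte parity matters. When a chunk ends
>+			 * on an odd length, the first byte of the next
>+			 * chunk must still be summed at the odd byte
>+			 * position. A sketch of the same trick:
>+			 *
>+			 *	u8 pad[4] = { 0, 0, 0, next_byte };
>+			 *	sum = csum_partial(pad, 4, sum);
>+			 *	// then checksum the rest from offset 1
>+			 *
>+			 * The three zero bytes keep the parity right and
>+			 * do not change a one's-complement sum.)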
>+ */ >+ u32 offset = skb_transport_offset(tmp) + >+ TCP_SKB_CB(tmp)->dss_off; >+ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET) >+ offset += 4; >+ >+ csum_tcp = skb_checksum(tmp, offset, >+ MPTCP_SUB_LEN_SEQ_CSUM, >+ csum_tcp); >+ >+ csum_tcp = csum_partial(&data_seq, >+ sizeof(data_seq), csum_tcp); >+ >+ dss_csum_added = 1; /* Just do it once */ >+ } >+ last = tmp; >+ iter++; >+ >+ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) && >+ !before(TCP_SKB_CB(tmp1)->seq, >+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) >+ break; >+ } >+ >+ /* Now, checksum must be 0 */ >+ if (unlikely(csum_fold(csum_tcp))) { >+ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n", >+ __func__, csum_fold(csum_tcp), >+ TCP_SKB_CB(last)->seq, dss_csum_added, overflowed, >+ iter); >+ >+ tp->mptcp->send_mp_fail = 1; >+ >+ /* map_data_seq is the data-seq number of the >+ * mapping we are currently checking >+ */ >+ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq; >+ >+ if (tp->mpcb->cnt_subflows > 1) { >+ mptcp_send_reset(sk); >+ ans = -1; >+ } else { >+ tp->mpcb->send_infinite_mapping = 1; >+ >+ /* Need to purge the rcv-queue as it's no more valid */ >+ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { >+ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq; >+ kfree_skb(tmp); >+ } >+ >+ ans = 0; >+ } >+ } >+ >+ return ans; >+} >+ >+static inline void mptcp_prepare_skb(struct sk_buff *skb, struct sk_buff *next, >+ struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); >+ /* Adapt data-seq's to the packet itself. We kinda transform the >+ * dss-mapping to a per-packet granularity. This is necessary to >+ * correctly handle overlapping mappings coming from different >+ * subflows. Otherwise it would be a complete mess. >+ */ >+ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq; >+ tcb->end_seq = tcb->seq + skb->len; >+ >+ /* If cur is the last one in the rcv-queue (or the last one for this >+ * mapping), and data_fin is enqueued, the end_data_seq is +1. >+ */ >+ if (skb_queue_is_last(&sk->sk_receive_queue, skb) || >+ after(TCP_SKB_CB(next)->end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { >+ tcb->end_seq += tp->mptcp->map_data_fin; >+ >+ /* We manually set the fin-flag if it is a data-fin. For easy >+ * processing in tcp_recvmsg. >+ */ >+ if (mptcp_is_data_fin2(skb, tp)) >+ tcp_hdr(skb)->fin = 1; >+ else >+ tcp_hdr(skb)->fin = 0; >+ } else { >+ /* We may have a subflow-fin with data but without data-fin */ >+ tcp_hdr(skb)->fin = 0; >+ } >+} >+ >+/** >+ * @return: 1 if the segment has been eaten and can be suppressed, >+ * otherwise 0. 
>+ */
>+static inline int mptcp_direct_copy(struct sk_buff *skb, struct sock *meta_sk)
>+{
>+	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
>+	int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
>+	int eaten = 0;
>+
>+	__set_current_state(TASK_RUNNING);
>+
>+	local_bh_enable();
>+	if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
>+		meta_tp->ucopy.len -= chunk;
>+		meta_tp->copied_seq += chunk;
>+		eaten = (chunk == skb->len);
>+		tcp_rcv_space_adjust(meta_sk);
>+	}
>+	local_bh_disable();
>+	return eaten;
>+}
>+
>+static inline void mptcp_reset_mapping(struct tcp_sock *tp)
>+{
>+	tp->mptcp->map_data_len = 0;
>+	tp->mptcp->map_data_seq = 0;
>+	tp->mptcp->map_subseq = 0;
>+	tp->mptcp->map_data_fin = 0;
>+	tp->mptcp->mapping_present = 0;
>+}
>+
>+/* The DSS-mapping received on the sk only covers the second half of the skb
>+ * (cut at seq). We trim the head from the skb.
>+ * Data will be freed upon kfree().
>+ *
>+ * Inspired by tcp_trim_head().
>+ */
>+static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
>+{
>+	int len = seq - TCP_SKB_CB(skb)->seq;
>+	u32 new_seq = TCP_SKB_CB(skb)->seq + len;
>+
>+	if (len < skb_headlen(skb))
>+		__skb_pull(skb, len);
>+	else
>+		__pskb_trim_head(skb, len - skb_headlen(skb));
>+
>+	TCP_SKB_CB(skb)->seq = new_seq;
>+
>+	skb->truesize -= len;
>+	atomic_sub(len, &sk->sk_rmem_alloc);
>+	sk_mem_uncharge(sk, len);
>+}
>+
>+/* The DSS-mapping received on the sk only covers the first half of the skb
>+ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
>+ * as further packets may resolve the mapping of the second half of data.
>+ *
>+ * Inspired by tcp_fragment().
>+ */
>+static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
>+{
>+	struct sk_buff *buff;
>+	int nsize;
>+	int nlen, len;
>+
>+	len = seq - TCP_SKB_CB(skb)->seq;
>+	nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
>+	if (nsize < 0)
>+		nsize = 0;
>+
>+	/* Get a new skb... force flag on. */
>+	buff = alloc_skb(nsize, GFP_ATOMIC);
>+	if (buff == NULL)
>+		return -ENOMEM;
>+
>+	skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
>+	skb_reset_transport_header(buff);
>+
>+	tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
>+	tcp_hdr(skb)->fin = 0;
>+
>+	/* We absolutely need to call skb_set_owner_r before refreshing the
>+	 * truesize of buff, otherwise the moved data will account twice.
>+	 */
>+	skb_set_owner_r(buff, sk);
>+	nlen = skb->len - len - nsize;
>+	buff->truesize += nlen;
>+	skb->truesize -= nlen;
>+
>+	/* Correct the sequence numbers. */
>+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
>+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
>+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
>+
>+	skb_split(skb, buff, len);
>+
>+	__skb_queue_after(&sk->sk_receive_queue, skb, buff);
>+
>+	return 0;
>+}
>+
>+/* @return: 0 everything is fine. Just continue processing
>+ *	    1 subflow is broken stop everything
>+ *	    -1 this packet was broken - continue with the next one.
>+ */
>+static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
>+{
>+	struct tcp_sock *tp = tcp_sk(sk);
>+
>+	/* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
>+	if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
>+	    !tp->mpcb->infinite_mapping_rcv) {
>+		/* Remove a pure subflow-fin from the queue and increase
>+		 * copied_seq.
>+ */ >+ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; >+ __skb_unlink(skb, &sk->sk_receive_queue); >+ __kfree_skb(skb); >+ return -1; >+ } >+ >+ /* If we are not yet fully established and do not know the mapping for >+ * this segment, this path has to fallback to infinite or be torn down. >+ */ >+ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) && >+ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) { >+ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n", >+ __func__, tp->mpcb->mptcp_loc_token, >+ tp->mptcp->path_index, __builtin_return_address(0), >+ TCP_SKB_CB(skb)->seq); >+ >+ if (!is_master_tp(tp)) { >+ mptcp_send_reset(sk); >+ return 1; >+ } >+ >+ tp->mpcb->infinite_mapping_snd = 1; >+ tp->mpcb->infinite_mapping_rcv = 1; >+ tp->mptcp->fully_established = 1; >+ } >+ >+ /* Receiver-side becomes fully established when a whole rcv-window has >+ * been received without the need to fallback due to the previous >+ * condition. */ >+ if (!tp->mptcp->fully_established) { >+ tp->mptcp->init_rcv_wnd -= skb->len; >+ if (tp->mptcp->init_rcv_wnd < 0) >+ mptcp_become_fully_estab(sk); >+ } >+ >+ return 0; >+} >+ >+/* @return: 0 everything is fine. Just continue processing >+ * 1 subflow is broken stop everything >+ * -1 this packet was broken - continue with the next one. >+ */ >+static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb) >+{ >+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); >+ struct mptcp_cb *mpcb = tp->mpcb; >+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); >+ u32 *ptr; >+ u32 data_seq, sub_seq, data_len, tcp_end_seq; >+ >+ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be >+ * in-order at the data-level. Thus data-seq-numbers can be inferred >+ * from what is expected at the data-level. >+ */ >+ if (mpcb->infinite_mapping_rcv) { >+ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp); >+ tp->mptcp->map_subseq = tcb->seq; >+ tp->mptcp->map_data_len = skb->len; >+ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin; >+ tp->mptcp->mapping_present = 1; >+ return 0; >+ } >+ >+ /* No mapping here? Exit - it is either already set or still on its way */ >+ if (!mptcp_is_data_seq(skb)) { >+ /* Too many packets without a mapping - this subflow is broken */ >+ if (!tp->mptcp->mapping_present && >+ tp->rcv_nxt - tp->copied_seq > 65536) { >+ mptcp_send_reset(sk); >+ return 1; >+ } >+ >+ return 0; >+ } >+ >+ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb); >+ ptr++; >+ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn; >+ ptr++; >+ data_len = get_unaligned_be16(ptr); >+ >+ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed. >+ * The draft sets it to 0, but we really would like to have the >+ * real value, to have an easy handling afterwards here in this >+ * function. >+ */ >+ if (mptcp_is_data_fin(skb) && skb->len == 0) >+ sub_seq = TCP_SKB_CB(skb)->seq; >+ >+ /* If there is already a mapping - we check if it maps with the current >+ * one. If not - we reset. 
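>+	 *
>+	 * (For reference, the mapping parsed above sits in the DSS option
>+	 * as unaligned big-endian fields right after the optional
>+	 * data-ack:
>+	 *
>+	 *	data_seq (4 or 8 bytes) | sub_seq (4 bytes) | data_len (2 bytes)
>+	 *
>+	 * which is why the code walks a u32 pointer and reads through
>+	 * get_unaligned_be32()/get_unaligned_be16() - TCP options give
>+	 * no alignment guarantee.)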
>+ */
>+	if (tp->mptcp->mapping_present &&
>+	    (data_seq != (u32)tp->mptcp->map_data_seq ||
>+	     sub_seq != tp->mptcp->map_subseq ||
>+	     data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
>+	     mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
>+		/* Mapping in packet is different from what we want */
>+		pr_err("%s Mappings do not match!\n", __func__);
>+		pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
>+		       __func__, data_seq, (u32)tp->mptcp->map_data_seq,
>+		       sub_seq, tp->mptcp->map_subseq, data_len,
>+		       tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
>+		       tp->mptcp->map_data_fin);
>+		mptcp_send_reset(sk);
>+		return 1;
>+	}
>+
>+	/* If the previous check was good, the current mapping is valid and we exit. */
>+	if (tp->mptcp->mapping_present)
>+		return 0;
>+
>+	/* Mapping not yet set on this subflow - we set it here! */
>+
>+	if (!data_len) {
>+		mpcb->infinite_mapping_rcv = 1;
>+		tp->mptcp->fully_established = 1;
>+		/* We need to repeat mp_fail's until the sender fell
>+		 * back to infinite-mapping - here we stop repeating it.
>+		 */
>+		tp->mptcp->send_mp_fail = 0;
>+
>+		/* We have to fixup data_len - it must be the same as skb->len */
>+		data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
>+		sub_seq = tcb->seq;
>+
>+		/* TODO kill all other subflows than this one */
>+		/* data_seq and so on are set correctly */
>+
>+		/* At this point, the meta-ofo-queue has to be emptied,
>+		 * as the following data is guaranteed to be in-order at
>+		 * the data and subflow-level
>+		 */
>+		mptcp_purge_ofo_queue(meta_tp);
>+	}
>+
>+	/* We are sending mp-fail's and thus are in fallback mode.
>+	 * Ignore packets which do not announce the fallback and still
>+	 * want to provide a mapping.
>+	 */
>+	if (tp->mptcp->send_mp_fail) {
>+		tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
>+		__skb_unlink(skb, &sk->sk_receive_queue);
>+		__kfree_skb(skb);
>+		return -1;
>+	}
>+
>+	/* FIN increased the mapping-length by 1 */
>+	if (mptcp_is_data_fin(skb))
>+		data_len--;
>+
>+	/* Subflow-sequences of the packet must
>+	 * (at least partially) be part of the DSS-mapping's
>+	 * subflow-sequence-space.
>+	 *
>+	 * Basically the mapping is not valid, if either of the
>+	 * following conditions is true:
>+	 *
>+	 * 1. It's not a data_fin and
>+	 *    MPTCP-sub_seq >= TCP-end_seq
>+	 *
>+	 * 2. It's a data_fin and TCP-end_seq > TCP-seq and
>+	 *    MPTCP-sub_seq >= TCP-end_seq
>+	 *
>+	 * The previous two can be merged into:
>+	 *    TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
>+	 *    Because if it's not a data-fin, TCP-end_seq > TCP-seq
>+	 *
>+	 * 3. It's a data_fin and skb->len == 0 and
>+	 *    MPTCP-sub_seq > TCP-end_seq
>+	 *
>+	 * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
>+	 *    MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
>+	 *
>+	 * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
>+	 */
>+
>+	/* subflow-fin is not part of the mapping - ignore it here ! */
>+	tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
>+	if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
>+	    (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
>+	    (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
>+	    before(sub_seq, tp->copied_seq)) {
>+		/* Subflow-sequences of the packet differ from what is in the
>+		 * packet's dss-mapping. The peer is misbehaving - reset.
>+		 */
>+		pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
>+		       "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u "
>+		       "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
>+		       skb->len, data_len, tp->copied_seq);
>+		mptcp_send_reset(sk);
>+		return 1;
>+	}
>+
>+	/* Does the DSS have 64-bit seqnums? */
>+	if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
>+		/* Wrapped around? */
>+		if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
>+			tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
>+		} else {
>+			/* Else, access the default high-order bits */
>+			tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
>+		}
>+	} else {
>+		tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
>+
>+		if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
>+			/* We make sure that the data_seq is invalid.
>+			 * It will be dropped later.
>+			 */
>+			tp->mptcp->map_data_seq += 0xFFFFFFFF;
>+			tp->mptcp->map_data_seq += 0xFFFFFFFF;
>+		}
>+	}
>+
>+	tp->mptcp->map_data_len = data_len;
>+	tp->mptcp->map_subseq = sub_seq;
>+	tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
>+	tp->mptcp->mapping_present = 1;
>+
>+	return 0;
>+}
>+
>+/* Similar to tcp_sequence(...) */
>+static inline int mptcp_sequence(const struct tcp_sock *meta_tp,
>+				 u64 data_seq, u64 end_data_seq)
>+{
>+	struct mptcp_cb *mpcb = meta_tp->mpcb;
>+	u64 rcv_wup64;
>+
>+	/* Wrap-around? */
>+	if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
>+		rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
>+				meta_tp->rcv_wup;
>+	} else {
>+		rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
>+						  meta_tp->rcv_wup);
>+	}
>+
>+	return	!before64(end_data_seq, rcv_wup64) &&
>+		!after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
>+}
>+
>+/* @return: 0 everything is fine. Just continue processing
>+ *	    -1 this packet was broken - continue with the next one.
>+ */
>+static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
>+{
>+	struct tcp_sock *tp = tcp_sk(sk);
>+	struct sk_buff *tmp, *tmp1;
>+	u32 tcp_end_seq;
>+
>+	if (!tp->mptcp->mapping_present)
>+		return 0;
>+
>+	/* Either the new skb gave us the mapping and the first segment
>+	 * in the sub-rcv-queue has to be trimmed ...
>+	 */
>+	tmp = skb_peek(&sk->sk_receive_queue);
>+	if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
>+	    after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
>+		mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
>+
>+	/* ... or the new skb (tail) has to be split at the end. */
>+	tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
>+	if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
>+		u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
>+		if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
>+			/* TODO: maybe handle this better here.
>+			 * We now just force meta-retransmission.
>+			 */
>+			tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
>+			__skb_unlink(skb, &sk->sk_receive_queue);
>+			__kfree_skb(skb);
>+			return -1;
>+		}
>+	}
>+
>+	/* Now, remove old sk_buff's from the receive-queue.
>+	 * This may happen if the mapping has been lost for these segments and
>+	 * the next mapping has already been received.
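>+	 *
>+	 * (The 32->64 bit extension used by mptcp_detect_mapping() above
>+	 * can be sketched as follows - hypothetical helper name, not part
>+	 * of this patch:
>+	 *
>+	 *	static u64 dseq_to_64(u32 high_order, u32 dseq)
>+	 *	{
>+	 *		return ((u64)high_order << 32) | dseq;
>+	 *	}
>+	 *
>+	 * Two candidate high-order words are kept per direction; on a
>+	 * 32-bit wrap the index toggles, and a mapping is composed with
>+	 * the word matching the side of the wrap it refers to - that is
>+	 * what the rcv_hiseq_index handling implements.)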
>+ */ >+ if (tp->mptcp->mapping_present && >+ before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) { >+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { >+ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq)) >+ break; >+ >+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; >+ __skb_unlink(tmp1, &sk->sk_receive_queue); >+ >+ /* Impossible that we could free skb here, because his >+ * mapping is known to be valid from previous checks >+ */ >+ __kfree_skb(tmp1); >+ } >+ } >+ >+ return 0; >+} >+ >+/* @return: 0 everything is fine. Just continue processing >+ * 1 subflow is broken stop everything >+ * -1 this mapping has been put in the meta-receive-queue >+ * -2 this mapping has been eaten by the application >+ */ >+static int mptcp_queue_skb(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ struct mptcp_cb *mpcb = tp->mpcb; >+ struct sk_buff *tmp, *tmp1; >+ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp); >+ bool data_queued = false; >+ >+ /* Have we not yet received the full mapping? */ >+ if (!tp->mptcp->mapping_present || >+ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) >+ return 0; >+ >+ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq >+ * OR >+ * This mapping is out of window >+ */ >+ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) || >+ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq, >+ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) { >+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { >+ __skb_unlink(tmp1, &sk->sk_receive_queue); >+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; >+ __kfree_skb(tmp1); >+ >+ if (!skb_queue_empty(&sk->sk_receive_queue) && >+ !before(TCP_SKB_CB(tmp)->seq, >+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) >+ break; >+ } >+ >+ mptcp_reset_mapping(tp); >+ >+ return -1; >+ } >+ >+ /* Record it, because we want to send our data_fin on the same path */ >+ if (tp->mptcp->map_data_fin) { >+ mpcb->dfin_path_index = tp->mptcp->path_index; >+ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN); >+ } >+ >+ /* Verify the checksum */ >+ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) { >+ int ret = mptcp_verif_dss_csum(sk); >+ >+ if (ret <= 0) { >+ mptcp_reset_mapping(tp); >+ return 1; >+ } >+ } >+ >+ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) { >+ /* Seg's have to go to the meta-ofo-queue */ >+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { >+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; >+ mptcp_prepare_skb(tmp1, tmp, sk); >+ __skb_unlink(tmp1, &sk->sk_receive_queue); >+ /* MUST be done here, because fragstolen may be true later. >+ * Then, kfree_skb_partial will not account the memory. 
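>+			 *
>+			 * (The before64()/after64() tests used in this
>+			 * function are the 64-bit analogue of TCP's
>+			 * wrap-safe sequence comparison:
>+			 *
>+			 *	static inline bool before64(u64 seq1, u64 seq2)
>+			 *	{
>+			 *		return (s64)(seq1 - seq2) < 0;
>+			 *	}
>+			 *
>+			 * so "overlapping" and "out of window" stay
>+			 * well-defined even when the data-level sequence
>+			 * space wraps.)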
>+ */ >+ skb_orphan(tmp1); >+ >+ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */ >+ mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk); >+ else >+ __kfree_skb(tmp1); >+ >+ if (!skb_queue_empty(&sk->sk_receive_queue) && >+ !before(TCP_SKB_CB(tmp)->seq, >+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) >+ break; >+ >+ } >+ } else { >+ /* Ready for the meta-rcv-queue */ >+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { >+ int eaten = 0; >+ int copied_early = 0; >+ bool fragstolen = false; >+ u32 old_rcv_nxt = meta_tp->rcv_nxt; >+ >+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; >+ mptcp_prepare_skb(tmp1, tmp, sk); >+ __skb_unlink(tmp1, &sk->sk_receive_queue); >+ /* MUST be done here, because fragstolen may be true. >+ * Then, kfree_skb_partial will not account the memory. >+ */ >+ skb_orphan(tmp1); >+ >+ /* This segment has already been received */ >+ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) { >+ __kfree_skb(tmp1); >+ goto next; >+ } >+ >+#ifdef CONFIG_NET_DMA >+ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt && >+ meta_tp->ucopy.task == current && >+ meta_tp->copied_seq == meta_tp->rcv_nxt && >+ tmp1->len <= meta_tp->ucopy.len && >+ sock_owned_by_user(meta_sk) && >+ tcp_dma_try_early_copy(meta_sk, tmp1, 0)) { >+ copied_early = 1; >+ eaten = 1; >+ } >+#endif >+ >+ /* Is direct copy possible ? */ >+ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt && >+ meta_tp->ucopy.task == current && >+ meta_tp->copied_seq == meta_tp->rcv_nxt && >+ meta_tp->ucopy.len && sock_owned_by_user(meta_sk) && >+ !copied_early) >+ eaten = mptcp_direct_copy(tmp1, meta_sk); >+ >+ if (mpcb->in_time_wait) /* In time-wait, do not receive data */ >+ eaten = 1; >+ >+ if (!eaten) >+ eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen); >+ >+ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq; >+ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); >+ >+ if (copied_early) >+ tcp_cleanup_rbuf(meta_sk, tmp1->len); >+ >+ if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait) >+ mptcp_fin(meta_sk); >+ >+ /* Check if this fills a gap in the ofo queue */ >+ if (!skb_queue_empty(&meta_tp->out_of_order_queue)) >+ mptcp_ofo_queue(meta_sk); >+ >+#ifdef CONFIG_NET_DMA >+ if (copied_early) >+ __skb_queue_tail(&meta_sk->sk_async_wait_queue, >+ tmp1); >+ else >+#endif >+ if (eaten) >+ kfree_skb_partial(tmp1, fragstolen); >+ >+ data_queued = true; >+next: >+ if (!skb_queue_empty(&sk->sk_receive_queue) && >+ !before(TCP_SKB_CB(tmp)->seq, >+ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) >+ break; >+ } >+ } >+ >+ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp; >+ tp->mptcp->last_data_seq = tp->mptcp->map_data_seq; >+ mptcp_reset_mapping(tp); >+ >+ return data_queued ? -1 : -2; >+} >+ >+void mptcp_data_ready(struct sock *sk, int bytes) >+{ >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ struct sk_buff *skb, *tmp; >+ int queued = 0; >+ >+ /* If the meta is already closed, there is no point in pushing data */ >+ if (meta_sk->sk_state == TCP_CLOSE && !tcp_sk(sk)->mpcb->in_time_wait) { >+ skb_queue_purge(&sk->sk_receive_queue); >+ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt; >+ goto exit; >+ } >+ >+restart: >+ /* Iterate over all segments, detect their mapping (if we don't have >+ * one yet), validate them and push everything one level higher. 
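>+	 *
>+	 * The helpers called below share a small return-code convention;
>+	 * written out as a purely illustrative enum (the code itself uses
>+	 * the raw constants):
>+	 *
>+	 *	enum {
>+	 *		MPTCP_SKB_OK = 0,	// processed, take the next one
>+	 *		MPTCP_SKB_BROKEN = 1,	// subflow broken, stop entirely
>+	 *		MPTCP_SKB_EATEN = -1,	// skb consumed, restart the walk
>+	 *	};
>+	 *
>+	 * mptcp_queue_skb() additionally returns -2 when the application
>+	 * ate the data directly, in which case no sk_data_ready() wakeup
>+	 * is needed.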
>+ */ >+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { >+ int ret; >+ /* Pre-validation - e.g., early fallback */ >+ ret = mptcp_prevalidate_skb(sk, skb); >+ if (ret < 0) >+ goto restart; >+ else if (ret > 0) >+ break; >+ >+ /* Set the current mapping */ >+ ret = mptcp_detect_mapping(sk, skb); >+ if (ret < 0) >+ goto restart; >+ else if (ret > 0) >+ break; >+ >+ /* Validation */ >+ if (mptcp_validate_mapping(sk, skb) < 0) >+ goto restart; >+ >+ /* Push a level higher */ >+ ret = mptcp_queue_skb(sk); >+ if (ret < 0) { >+ if (ret == -1) >+ queued = ret; >+ goto restart; >+ } else if (ret == 0) { >+ continue; >+ } else { /* ret == 1 */ >+ break; >+ } >+ } >+ >+exit: >+ if (tcp_sk(sk)->close_it) { >+ tcp_send_ack(sk); >+ tcp_time_wait(sk, TCP_TIME_WAIT, 0); >+ } >+ >+ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD)) >+ meta_sk->sk_data_ready(meta_sk, 0); >+} >+ >+/** >+ * Equivalent of tcp_fin() for MPTCP >+ * Can be called only when the FIN is validly part >+ * of the data seqnum space. Not before when we get holes. >+ */ >+void mptcp_fin(struct sock *meta_sk) >+{ >+ struct sock *sk = NULL, *sk_it; >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ >+ mptcp_for_each_sk(mpcb, sk_it) { >+ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) { >+ sk = sk_it; >+ break; >+ } >+ } >+ >+ if (!sk || sk->sk_state == TCP_CLOSE) >+ sk = mptcp_select_ack_sock(meta_sk, 0); >+ >+ inet_csk_schedule_ack(sk); >+ >+ meta_sk->sk_shutdown |= RCV_SHUTDOWN; >+ sock_set_flag(meta_sk, SOCK_DONE); >+ >+ switch (meta_sk->sk_state) { >+ case TCP_SYN_RECV: >+ case TCP_ESTABLISHED: >+ /* Move to CLOSE_WAIT */ >+ tcp_set_state(meta_sk, TCP_CLOSE_WAIT); >+ inet_csk(sk)->icsk_ack.pingpong = 1; >+ break; >+ >+ case TCP_CLOSE_WAIT: >+ case TCP_CLOSING: >+ /* Received a retransmission of the FIN, do >+ * nothing. >+ */ >+ break; >+ case TCP_LAST_ACK: >+ /* RFC793: Remain in the LAST-ACK state. */ >+ break; >+ >+ case TCP_FIN_WAIT1: >+ /* This case occurs when a simultaneous close >+ * happens, we must ack the received FIN and >+ * enter the CLOSING state. >+ */ >+ tcp_send_ack(sk); >+ tcp_set_state(meta_sk, TCP_CLOSING); >+ break; >+ case TCP_FIN_WAIT2: >+ /* Received a FIN -- send ACK and enter TIME_WAIT. */ >+ tcp_send_ack(sk); >+ tcp_time_wait(meta_sk, TCP_TIME_WAIT, 0); >+ break; >+ default: >+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these >+ * cases we should never reach this piece of code. >+ */ >+ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__, >+ meta_sk->sk_state); >+ break; >+ } >+ >+ /* It _is_ possible, that we have something out-of-order _after_ FIN. >+ * Probably, we should reset in this case. For now drop them. >+ */ >+ mptcp_purge_ofo_queue(meta_tp); >+ sk_mem_reclaim(meta_sk); >+ >+ if (!sock_flag(meta_sk, SOCK_DEAD)) { >+ meta_sk->sk_state_change(meta_sk); >+ >+ /* Do not send POLL_HUP for half duplex close. 
*/ >+ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || >+ meta_sk->sk_state == TCP_CLOSE) >+ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP); >+ else >+ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN); >+ } >+ >+ return; >+} >+ >+static void mptcp_xmit_retransmit_queue(struct sock *meta_sk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sk_buff *skb; >+ >+ if (!meta_tp->packets_out) >+ return; >+ >+ tcp_for_write_queue(skb, meta_sk) { >+ if (skb == tcp_send_head(meta_sk)) >+ break; >+ >+ if (mptcp_retransmit_skb(meta_sk, skb)) >+ return; >+ >+ if (skb == tcp_write_queue_head(meta_sk)) >+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, >+ inet_csk(meta_sk)->icsk_rto, >+ TCP_RTO_MAX); >+ } >+} >+ >+/* Handle the DATA_ACK */ >+static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb) >+{ >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); >+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); >+ u32 prior_snd_una = meta_tp->snd_una; >+ int prior_packets; >+ u32 nwin, data_ack, data_seq; >+ u16 data_len = 0; >+ >+ /* A valid packet came in - subflow is operational again */ >+ tp->pf = 0; >+ >+ /* Even if there is no data-ack, we stop retransmitting. >+ * Except if this is a SYN/ACK. Then it is just a retransmission >+ */ >+ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) { >+ tp->mptcp->pre_established = 0; >+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); >+ } >+ >+ /* If we are in infinite mapping mode, rx_opt.data_ack has been >+ * set by mptcp_clean_rtx_infinite. >+ */ >+ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd) >+ goto exit; >+ >+ data_ack = tp->mptcp->rx_opt.data_ack; >+ >+ if (unlikely(!tp->mptcp->fully_established) && >+ (data_ack != meta_tp->mptcp->snt_isn || >+ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)) >+ /* As soon as data has been data-acked, >+ * or a subflow-data-ack (not acking syn - thus snt_isn + 1) >+ * includes a data-ack, we are fully established >+ */ >+ mptcp_become_fully_estab(sk); >+ >+ /* Get the data_seq */ >+ if (mptcp_is_data_seq(skb)) { >+ data_seq = tp->mptcp->rx_opt.data_seq; >+ data_len = tp->mptcp->rx_opt.data_len; >+ } else { >+ data_seq = meta_tp->snd_wl1; >+ } >+ >+ /* If the ack is older than previous acks >+ * then we can probably ignore it. >+ */ >+ if (before(data_ack, prior_snd_una)) >+ goto exit; >+ >+ /* If the ack includes data we haven't sent yet, discard >+ * this segment (RFC793 Section 3.9). >+ */ >+ if (after(data_ack, meta_tp->snd_nxt)) >+ goto exit; >+ >+ /*** Now, update the window - inspired by tcp_ack_update_window ***/ >+ nwin = ntohs(tcp_hdr(skb)->window); >+ >+ if (likely(!tcp_hdr(skb)->syn)) >+ nwin <<= tp->rx_opt.snd_wscale; >+ >+ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) { >+ tcp_update_wl(meta_tp, data_seq); >+ >+ /* Draft v09, Section 3.3.5: >+ * [...] It should only update its local receive window values >+ * when the largest sequence number allowed (i.e. DATA_ACK + >+ * receive window) increases. [...] >+ */ >+ if (meta_tp->snd_wnd != nwin && >+ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) { >+ meta_tp->snd_wnd = nwin; >+ >+ if (nwin > meta_tp->max_window) >+ meta_tp->max_window = nwin; >+ } >+ } >+ /*** Done, update the window ***/ >+ >+ /* We passed data and got it acked, remove any soft error >+ * log. Something worked... 
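>+	 *
>+	 * (A worked example for the window update above: a raw window
>+	 * field of 1000 with a negotiated send window-scale of 7 yields
>+	 * nwin = 1000 << 7 = 128000 bytes, and per the quoted draft text
>+	 * the meta-level snd_wnd is only taken over when it actually
>+	 * changed and data_ack + nwin does not fall behind the current
>+	 * right edge tcp_wnd_end(meta_tp).)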
>+ */ >+ sk->sk_err_soft = 0; >+ inet_csk(meta_sk)->icsk_probes_out = 0; >+ meta_tp->rcv_tstamp = tcp_time_stamp; >+ prior_packets = meta_tp->packets_out; >+ if (!prior_packets) >+ goto no_queue; >+ >+ meta_tp->snd_una = data_ack; >+ >+ mptcp_clean_rtx_queue(meta_sk, prior_snd_una); >+ >+ /* We are in loss-state, and something got acked, retransmit the whole >+ * queue now! >+ */ >+ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss && >+ after(data_ack, prior_snd_una)) { >+ mptcp_xmit_retransmit_queue(meta_sk); >+ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open; >+ } >+ >+ /* Simplified version of tcp_new_space, because the snd-buffer >+ * is handled by all the subflows. >+ */ >+ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) { >+ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK); >+ if (meta_sk->sk_socket && >+ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) >+ meta_sk->sk_write_space(meta_sk); >+ } >+ >+ if (meta_sk->sk_state != TCP_ESTABLISHED) >+ mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len); >+ >+exit: >+ mptcp_push_pending_frames(meta_sk); >+ >+ return; >+ >+no_queue: >+ if (tcp_send_head(meta_sk)) >+ tcp_ack_probe(meta_sk); >+ >+ mptcp_push_pending_frames(meta_sk); >+ >+ return; >+} >+ >+void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk)); >+ >+ if (!tp->mpcb->infinite_mapping_snd) >+ return; >+ >+ /* The difference between both write_seq's represents the offset between >+ * data-sequence and subflow-sequence. As we are infinite, this must >+ * match. >+ * >+ * Thus, from this difference we can infer the meta snd_una. >+ */ >+ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt + >+ tp->snd_una; >+ >+ mptcp_data_ack(sk, skb); >+} >+ >+/**** static functions used by mptcp_parse_options */ >+ >+static inline int mptcp_rem_raddress(struct mptcp_cb *mpcb, u8 rem_id) >+{ >+ if (mptcp_v4_rem_raddress(mpcb, rem_id) < 0) { >+#if IS_ENABLED(CONFIG_IPV6) >+ if (mptcp_v6_rem_raddress(mpcb, rem_id) < 0) >+ return -1; >+#else >+ return -1; >+#endif /* CONFIG_IPV6 */ >+ } >+ return 0; >+} >+ >+static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id) >+{ >+ struct sock *sk_it, *tmpsk; >+ >+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { >+ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) { >+ mptcp_reinject_data(sk_it, 0); >+ sk_it->sk_err = ECONNRESET; >+ if (tcp_need_reset(sk_it->sk_state)) >+ tcp_send_active_reset(sk_it, GFP_ATOMIC); >+ mptcp_sub_force_close(sk_it); >+ } >+ } >+} >+ >+void mptcp_parse_options(const uint8_t *ptr, int opsize, >+ struct tcp_options_received *opt_rx, >+ struct mptcp_options_received *mopt, >+ const struct sk_buff *skb) >+{ >+ struct mptcp_option *mp_opt = (struct mptcp_option *)ptr; >+ >+ /* If the socket is mp-capable we would have a mopt. 
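>+ *
>+ * (On mptcp_clean_rtx_infinite() above: after the fallback, subflow and
>+ * data sequence space advance in lockstep, so the data-level ack is
>+ * derived as
>+ *
>+ *	data_ack = meta->snd_nxt - (tp->snd_nxt - tp->snd_una)
>+ *
>+ * i.e. the meta write edge minus the subflow's outstanding bytes.)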
*/ >+ if (!mopt) >+ return; >+ >+ switch (mp_opt->sub) { >+ case MPTCP_SUB_CAPABLE: >+ { >+ struct mp_capable *mpcapable = (struct mp_capable *)ptr; >+ >+ if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN && >+ opsize != MPTCP_SUB_LEN_CAPABLE_ACK) { >+ mptcp_debug("%s: mp_capable: bad option size %d\n", >+ __func__, opsize); >+ break; >+ } >+ >+ if (!sysctl_mptcp_enabled) >+ break; >+ >+ /* We only support MPTCP version 0 */ >+ if (mpcapable->ver != 0) >+ break; >+ >+ /* MPTCP-RFC 6824: >+ * "If receiving a message with the 'B' flag set to 1, and this >+ * is not understood, then this SYN MUST be silently ignored; >+ */ >+ if (mpcapable->b) { >+ mopt->drop_me = 1; >+ break; >+ } >+ >+ /* MPTCP-RFC 6824: >+ * "An implementation that only supports this method MUST set >+ * bit "H" to 1, and bits "C" through "G" to 0." >+ */ >+ if (!mpcapable->h) >+ break; >+ >+ mopt->saw_mpc = 1; >+ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a; >+ >+ if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN) >+ mopt->mptcp_key = mpcapable->sender_key; >+ >+ break; >+ } >+ case MPTCP_SUB_JOIN: >+ { >+ struct mp_join *mpjoin = (struct mp_join *)ptr; >+ >+ if (opsize != MPTCP_SUB_LEN_JOIN_SYN && >+ opsize != MPTCP_SUB_LEN_JOIN_SYNACK && >+ opsize != MPTCP_SUB_LEN_JOIN_ACK) { >+ mptcp_debug("%s: mp_join: bad option size %d\n", >+ __func__, opsize); >+ break; >+ } >+ >+ switch (opsize) { >+ case MPTCP_SUB_LEN_JOIN_SYN: >+ mopt->is_mp_join = 1; >+ mopt->low_prio = mpjoin->b; >+ mopt->rem_id = mpjoin->addr_id; >+ mopt->mptcp_rem_token = mpjoin->u.syn.token; >+ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce; >+ break; >+ case MPTCP_SUB_LEN_JOIN_SYNACK: >+ mopt->low_prio = mpjoin->b; >+ mopt->rem_id = mpjoin->addr_id; >+ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac; >+ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce; >+ break; >+ case MPTCP_SUB_LEN_JOIN_ACK: >+ mopt->join_ack = 1; >+ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20); >+ break; >+ } >+ break; >+ } >+ case MPTCP_SUB_DSS: >+ { >+ struct mp_dss *mdss = (struct mp_dss *)ptr; >+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); >+ >+ /* We check opsize for the csum and non-csum case. We do this, >+ * because the draft says that the csum SHOULD be ignored if >+ * it has not been negotiated in the MP_CAPABLE but still is >+ * present in the data. >+ * >+ * It will get ignored later in mptcp_queue_skb. >+ */ >+ if (opsize != mptcp_sub_len_dss(mdss, 0) && >+ opsize != mptcp_sub_len_dss(mdss, 1)) { >+ mptcp_debug("%s: mp_dss: bad option size %d\n", >+ __func__, opsize); >+ break; >+ } >+ >+ ptr += 4; >+ >+ if (mdss->A) { >+ tcb->mptcp_flags |= MPTCPHDR_ACK; >+ >+ if (mdss->a) { >+ mopt->data_ack = (u32) get_unaligned_be64(ptr); >+ ptr += MPTCP_SUB_LEN_ACK_64; >+ } else { >+ mopt->data_ack = get_unaligned_be32(ptr); >+ ptr += MPTCP_SUB_LEN_ACK; >+ } >+ } >+ >+ tcb->dss_off = (ptr - skb_transport_header(skb)); >+ >+ if (mdss->M) { >+ if (mdss->m) { >+ u64 data_seq64 = get_unaligned_be64(ptr); >+ >+ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET; >+ mopt->data_seq = (u32) data_seq64; >+ >+ ptr += 12; /* 64-bit dseq + subseq */ >+ } else { >+ mopt->data_seq = get_unaligned_be32(ptr); >+ ptr += 8; /* 32-bit dseq + subseq */ >+ } >+ mopt->data_len = get_unaligned_be16(ptr); >+ >+ tcb->mptcp_flags |= MPTCPHDR_SEQ; >+ >+ /* Is a check-sum present? 
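>+			 *
>+			 * (All MPTCP options share TCP option kind 30 and
>+			 * are demultiplexed by the 4-bit subtype in the
>+			 * first payload byte - the outer
>+			 * switch (mp_opt->sub) in this function dispatches
>+			 * on exactly that field:
>+			 *
>+			 *	kind (30) | length | subtype:4 | payload
>+			 *
>+			 * so a parser only needs the usual TLV walk plus
>+			 * this one switch.)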
>+			 */
>+			if (opsize == mptcp_sub_len_dss(mdss, 1))
>+				tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
>+
>+			/* DATA_FIN only possible with DSS-mapping */
>+			if (mdss->F)
>+				tcb->mptcp_flags |= MPTCPHDR_FIN;
>+		}
>+
>+		break;
>+	}
>+	case MPTCP_SUB_ADD_ADDR:
>+	{
>+#if IS_ENABLED(CONFIG_IPV6)
>+		struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
>+
>+		if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
>+		     opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
>+		    (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
>+		     opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
>+#else
>+		if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
>+		    opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
>+#endif /* CONFIG_IPV6 */
>+			mptcp_debug("%s: mp_add_addr: bad option size %d\n",
>+				    __func__, opsize);
>+			break;
>+		}
>+
>+		/* We have to manually parse the options if we got two of them. */
>+		if (mopt->saw_add_addr) {
>+			mopt->more_add_addr = 1;
>+			break;
>+		}
>+		mopt->saw_add_addr = 1;
>+		mopt->add_addr_ptr = ptr;
>+		break;
>+	}
>+	case MPTCP_SUB_REMOVE_ADDR:
>+		if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
>+			mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
>+				    __func__, opsize);
>+			break;
>+		}
>+
>+		if (mopt->saw_rem_addr) {
>+			mopt->more_rem_addr = 1;
>+			break;
>+		}
>+		mopt->saw_rem_addr = 1;
>+		mopt->rem_addr_ptr = ptr;
>+		break;
>+	case MPTCP_SUB_PRIO:
>+	{
>+		struct mp_prio *mpprio = (struct mp_prio *)ptr;
>+
>+		if (opsize != MPTCP_SUB_LEN_PRIO &&
>+		    opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
>+			mptcp_debug("%s: mp_prio: bad option size %d\n",
>+				    __func__, opsize);
>+			break;
>+		}
>+
>+		mopt->saw_low_prio = 1;
>+		mopt->low_prio = mpprio->b;
>+
>+		if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
>+			mopt->saw_low_prio = 2;
>+			mopt->prio_addr_id = mpprio->addr_id;
>+		}
>+		break;
>+	}
>+	case MPTCP_SUB_FAIL:
>+		if (opsize != MPTCP_SUB_LEN_FAIL) {
>+			mptcp_debug("%s: mp_fail: bad option size %d\n",
>+				    __func__, opsize);
>+			break;
>+		}
>+		mopt->mp_fail = 1;
>+		break;
>+	case MPTCP_SUB_FCLOSE:
>+		if (opsize != MPTCP_SUB_LEN_FCLOSE) {
>+			mptcp_debug("%s: mp_fclose: bad option size %d\n",
>+				    __func__, opsize);
>+			break;
>+		}
>+
>+		mopt->mp_fclose = 1;
>+		mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
>+
>+		break;
>+	default:
>+		mptcp_debug("%s: Received unknown subtype: %d\n",
>+			    __func__, mp_opt->sub);
>+		break;
>+	}
>+}
>+
>+int mptcp_check_rtt(const struct tcp_sock *tp, int time)
>+{
>+	struct mptcp_cb *mpcb = tp->mpcb;
>+	struct sock *sk;
>+	u32 rtt_max = 0;
>+
>+	/* In MPTCP, we take the max delay across all flows,
>+	 * in order to take into account meta-reordering buffers.
>+ */ >+ mptcp_for_each_sk(mpcb, sk) { >+ if (!mptcp_sk_can_recv(sk)) >+ continue; >+ >+ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt) >+ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt; >+ } >+ if (time < (rtt_max >> 3) || !rtt_max) >+ return 1; >+ >+ return 0; >+} >+ >+static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk) >+{ >+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; >+ >+ if (mpadd->ipver == 4) { >+ __be16 port = 0; >+ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) >+ port = mpadd->u.v4.port; >+ >+ mptcp_v4_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v4.addr, port, >+ mpadd->addr_id); >+#if IS_ENABLED(CONFIG_IPV6) >+ } else if (mpadd->ipver == 6) { >+ __be16 port = 0; >+ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) >+ port = mpadd->u.v6.port; >+ >+ mptcp_v6_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v6.addr, port, >+ mpadd->addr_id); >+#endif /* CONFIG_IPV6 */ >+ } >+} >+ >+static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk) >+{ >+ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; >+ int i; >+ u8 rem_id; >+ >+ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) { >+ rem_id = (&mprem->addrs_id)[i]; >+ if (!mptcp_rem_raddress(tcp_sk(sk)->mpcb, rem_id)) >+ mptcp_send_reset_rem_id(tcp_sk(sk)->mpcb, rem_id); >+ } >+} >+ >+static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk) >+{ >+ struct tcphdr *th = tcp_hdr(skb); >+ unsigned char *ptr; >+ int length = (th->doff * 4) - sizeof(struct tcphdr); >+ >+ /* Jump through the options to check whether ADD_ADDR is there */ >+ ptr = (unsigned char *)(th + 1); >+ while (length > 0) { >+ int opcode = *ptr++; >+ int opsize; >+ >+ switch (opcode) { >+ case TCPOPT_EOL: >+ return; >+ case TCPOPT_NOP: >+ length--; >+ continue; >+ default: >+ opsize = *ptr++; >+ if (opsize < 2) >+ return; >+ if (opsize > length) >+ return; /* don't parse partial options */ >+ if (opcode == TCPOPT_MPTCP && >+ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) { >+#if IS_ENABLED(CONFIG_IPV6) >+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; >+ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 && >+ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) || >+ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 && >+ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) >+#else >+ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 && >+ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) >+#endif /* CONFIG_IPV6 */ >+ goto cont; >+ >+ mptcp_handle_add_addr(ptr, sk); >+ } >+ if (opcode == TCPOPT_MPTCP && >+ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) { >+ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) >+ goto cont; >+ >+ mptcp_handle_rem_addr(ptr, sk); >+ } >+cont: >+ ptr += opsize - 2; >+ length -= opsize; >+ } >+ } >+ return; >+} >+ >+static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th) >+{ >+ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; >+ >+ if (unlikely(mptcp->rx_opt.mp_fail)) { >+ mptcp->rx_opt.mp_fail = 0; >+ >+ if (!th->rst && !mpcb->infinite_mapping_snd) { >+ struct sock *sk_it; >+ >+ mpcb->send_infinite_mapping = 1; >+ /* We resend everything that has not been acknowledged */ >+ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk); >+ >+ /* We artificially restart the whole send-queue. 
Thus, >+ * it is as if no packets are in flight >+ */ >+ tcp_sk(meta_sk)->packets_out = 0; >+ >+ /* If the snd_nxt already wrapped around, we have to >+ * undo the wrapping, as we are restarting from snd_una >+ * on. >+ */ >+ if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) { >+ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2; >+ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; >+ } >+ tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una; >+ >+ /* Trigger a sending on the meta. */ >+ mptcp_push_pending_frames(meta_sk); >+ >+ mptcp_for_each_sk(mpcb, sk_it) { >+ if (sk != sk_it) >+ mptcp_sub_force_close(sk_it); >+ } >+ } >+ >+ return 0; >+ } >+ >+ if (unlikely(mptcp->rx_opt.mp_fclose)) { >+ struct sock *sk_it, *tmpsk; >+ >+ mptcp->rx_opt.mp_fclose = 0; >+ if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key) >+ return 0; >+ >+ if (tcp_need_reset(sk->sk_state)) >+ tcp_send_active_reset(sk, GFP_ATOMIC); >+ >+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) >+ mptcp_sub_force_close(sk_it); >+ >+ tcp_reset(meta_sk); >+ >+ return 1; >+ } >+ >+ return 0; >+} >+ >+static inline void mptcp_path_array_check(struct sock *meta_sk) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ >+ if (unlikely(mpcb->list_rcvd)) { >+ mpcb->list_rcvd = 0; >+ mptcp_create_subflows(meta_sk); >+ } >+} >+ >+int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; >+ >+ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd) >+ return 0; >+ >+ if (mptcp_mp_fail_rcvd(sk, th)) >+ return 1; >+ >+ /* RFC 6824, Section 3.3: >+ * If a checksum is not present when its use has been negotiated, the >+ * receiver MUST close the subflow with a RST as it is considered broken. >+ */ >+ if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum && >+ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) { >+ if (tcp_need_reset(sk->sk_state)) >+ tcp_send_active_reset(sk, GFP_ATOMIC); >+ >+ mptcp_sub_force_close(sk); >+ return 1; >+ } >+ >+ /* We have to acknowledge retransmissions of the third >+ * ack. >+ */ >+ if (mopt->join_ack) { >+ tcp_send_delayed_ack(sk); >+ mopt->join_ack = 0; >+ } >+ >+ if (mopt->saw_add_addr || mopt->saw_rem_addr) { >+ if (mopt->more_add_addr || mopt->more_rem_addr) { >+ mptcp_parse_addropt(skb, sk); >+ } else { >+ if (mopt->saw_add_addr) >+ mptcp_handle_add_addr(mopt->add_addr_ptr, sk); >+ if (mopt->saw_rem_addr) >+ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk); >+ } >+ >+ mopt->more_add_addr = 0; >+ mopt->saw_add_addr = 0; >+ mopt->more_rem_addr = 0; >+ mopt->saw_rem_addr = 0; >+ } >+ if (mopt->saw_low_prio) { >+ if (mopt->saw_low_prio == 1) { >+ tp->mptcp->rcv_low_prio = mopt->low_prio; >+ } else { >+ struct sock *sk_it; >+ mptcp_for_each_sk(tp->mpcb, sk_it) { >+ struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp; >+ if (mptcp->rem_id == mopt->prio_addr_id) >+ mptcp->rcv_low_prio = mopt->low_prio; >+ } >+ } >+ mopt->saw_low_prio = 0; >+ } >+ >+ mptcp_data_ack(sk, skb); >+ >+ mptcp_path_array_check(mptcp_meta_sk(sk)); >+ /* Socket may have been mp_killed by a REMOVE_ADDR */ >+ if (tp->mp_killed) >+ return 1; >+ >+ return 0; >+} >+ >+/* The skptr is needed, because if we become MPTCP-capable, we have to switch >+ * from meta-socket to master-socket. 
>+ * >+ * @return: 1 - we want to reset this connection >+ * 2 - we want to discard the received syn/ack >+ * 0 - everything is fine - continue >+ */ >+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, >+ struct sk_buff *skb, >+ struct mptcp_options_received *mopt) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ if (tp->mpc) { >+ u8 hash_mac_check[20]; >+ struct mptcp_cb *mpcb = tp->mpcb; >+ >+ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, >+ (u8 *)&mpcb->mptcp_loc_key, >+ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, >+ (u8 *)&tp->mptcp->mptcp_loc_nonce, >+ (u32 *)hash_mac_check); >+ if (memcmp(hash_mac_check, >+ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) { >+ mptcp_sub_force_close(sk); >+ return 1; >+ } >+ >+ /* Set this flag in order to postpone data sending >+ * until the 4th ack arrives. >+ */ >+ tp->mptcp->pre_established = 1; >+ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio; >+ >+ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, >+ (u8 *)&mpcb->mptcp_rem_key, >+ (u8 *)&tp->mptcp->mptcp_loc_nonce, >+ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, >+ (u32 *)&tp->mptcp->sender_mac[0]); >+ >+ } else if (mopt->saw_mpc) { >+ if (mptcp_create_master_sk(sk, mopt->mptcp_key, >+ ntohs(tcp_hdr(skb)->window))) >+ return 2; >+ >+ sk = tcp_sk(sk)->mpcb->master_sk; >+ *skptr = sk; >+ tp = tcp_sk(sk); >+ >+ /* snd_nxt - 1, because it has been incremented >+ * by tcp_connect for the SYN >+ */ >+ tp->mptcp->snt_isn = tp->snd_nxt - 1; >+ tp->mpcb->dss_csum = mopt->dss_csum; >+ tp->mptcp->include_mpc = 1; >+ >+ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket); >+ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq; >+ >+ mptcp_update_metasocket(sk, mptcp_meta_sk(sk)); >+ >+ /* hold in mptcp_inherit_sk due to initialization to 2 */ >+ sock_put(sk); >+ } else { >+ tp->request_mptcp = 0; >+ >+ if (tp->inside_tk_table) >+ mptcp_hash_remove(tp); >+ } >+ >+ if (tp->mpc) >+ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq; >+ >+ return 0; >+} >diff -Naur a/linux-3.11/net/mptcp/mptcp_ipv4.c b/linux-3.11/net/mptcp/mptcp_ipv4.c >--- a/linux-3.11/net/mptcp/mptcp_ipv4.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_ipv4.c 2013-10-05 18:34:49.270364689 +0200 >@@ -0,0 +1,726 @@ >+/* >+ * MPTCP implementation - IPv4-specific functions >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. 
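>+ *
>+ * (Overview of the key/nonce helpers at the top of this file: both feed
>+ * the connection 4-tuple plus either the packet's ISN or a private
>+ * counter through md5_transform(), keyed with the boot-time random
>+ * mptcp_secret - much in the spirit of the kernel's ISN/syncookie
>+ * generators. Roughly:
>+ *
>+ *	hash[0..2] = { saddr, daddr, sport << 16 | dport };
>+ *	hash[3]    = seq (nonce) or mptcp_key_seed++ (key);
>+ *	md5_transform(hash, mptcp_secret);
>+ *
>+ * The nonce is hash[0]; the 64-bit key is the first two words.)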
>+ */ >+ >+#include <linux/export.h> >+#include <linux/ip.h> >+#include <linux/list.h> >+#include <linux/skbuff.h> >+#include <linux/spinlock.h> >+#include <linux/tcp.h> >+ >+#include <net/inet_common.h> >+#include <net/inet_connection_sock.h> >+#include <net/mptcp.h> >+#include <net/mptcp_pm.h> >+#include <net/mptcp_v4.h> >+#include <net/mptcp_v6.h> >+#include <net/request_sock.h> >+#include <net/tcp.h> >+ >+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, >+ u32 seq) >+{ >+ u32 hash[MD5_DIGEST_WORDS]; >+ >+ hash[0] = (__force u32)saddr; >+ hash[1] = (__force u32)daddr; >+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport; >+ hash[3] = seq; >+ >+ md5_transform(hash, mptcp_secret); >+ >+ return hash[0]; >+} >+ >+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) >+{ >+ u32 hash[MD5_DIGEST_WORDS]; >+ >+ hash[0] = (__force u32)saddr; >+ hash[1] = (__force u32)daddr; >+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport; >+ hash[3] = mptcp_key_seed++; >+ >+ md5_transform(hash, mptcp_secret); >+ >+ return *((u64 *)hash); >+} >+ >+ >+static void mptcp_v4_reqsk_destructor(struct request_sock *req) >+{ >+ mptcp_reqsk_destructor(req); >+ >+ tcp_v4_reqsk_destructor(req); >+} >+ >+/* Similar to tcp_request_sock_ops */ >+struct request_sock_ops mptcp_request_sock_ops __read_mostly = { >+ .family = PF_INET, >+ .obj_size = sizeof(struct mptcp_request_sock), >+ .rtx_syn_ack = tcp_v4_rtx_synack, >+ .send_ack = tcp_v4_reqsk_send_ack, >+ .destructor = mptcp_v4_reqsk_destructor, >+ .send_reset = tcp_v4_send_reset, >+ .syn_ack_timeout = tcp_syn_ack_timeout, >+}; >+ >+static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk, >+ struct request_sock *req, >+ unsigned long timeout) >+{ >+ const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, >+ inet_rsk(req)->rmt_port, >+ 0, MPTCP_HASH_SIZE); >+ >+ inet_csk_reqsk_queue_hash_add(meta_sk, req, timeout); >+ >+ spin_lock(&mptcp_reqsk_hlock); >+ list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h]); >+ spin_unlock(&mptcp_reqsk_hlock); >+} >+ >+/* Similar to tcp_v4_conn_request */ >+static void mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ struct tcp_options_received tmp_opt; >+ struct mptcp_options_received mopt; >+ struct request_sock *req; >+ struct inet_request_sock *ireq; >+ struct mptcp_request_sock *mtreq; >+ struct dst_entry *dst = NULL; >+ u8 mptcp_hash_mac[20]; >+ __be32 saddr = ip_hdr(skb)->saddr; >+ __be32 daddr = ip_hdr(skb)->daddr; >+ __u32 isn = TCP_SKB_CB(skb)->when; >+ int want_cookie = 0; >+ >+ tcp_clear_options(&tmp_opt); >+ mptcp_init_mp_opt(&mopt); >+ tmp_opt.mss_clamp = TCP_MSS_DEFAULT; >+ tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss; >+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); >+ >+ req = inet_reqsk_alloc(&mptcp_request_sock_ops); >+ if (!req) >+ return; >+ >+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; >+ tcp_openreq_init(req, &tmp_opt, skb); >+ >+ ireq = inet_rsk(req); >+ ireq->loc_addr = daddr; >+ ireq->rmt_addr = saddr; >+ ireq->no_srccheck = inet_sk(meta_sk)->transparent; >+ ireq->opt = tcp_v4_save_options(skb); >+ >+ if (security_inet_conn_request(meta_sk, skb, req)) >+ goto drop_and_free; >+ >+ if (!want_cookie || tmp_opt.tstamp_ok) >+ TCP_ECN_create_request(req, skb, sock_net(meta_sk)); >+ >+ if (!isn) { >+ struct flowi4 fl4; >+ >+ /* VJ's idea. 
We save last timestamp seen >+ * from the destination in peer table, when entering >+ * state TIME-WAIT, and check against it before >+ * accepting new connection request. >+ * >+ * If "isn" is not zero, this request hit alive >+ * timewait bucket, so that all the necessary checks >+ * are made in the function processing timewait state. >+ */ >+ if (tmp_opt.saw_tstamp && >+ tcp_death_row.sysctl_tw_recycle && >+ (dst = inet_csk_route_req(meta_sk, &fl4, req)) != NULL && >+ fl4.daddr == saddr) { >+ if (!tcp_peer_is_proven(req, dst, true)) { >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED); >+ goto drop_and_release; >+ } >+ } >+ /* Kill the following clause, if you dislike this way. */ >+ else if (!sysctl_tcp_syncookies && >+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) < >+ (sysctl_max_syn_backlog >> 2)) && >+ !tcp_peer_is_proven(req, dst, false)) { >+ /* Without syncookies last quarter of >+ * backlog is filled with destinations, >+ * proven to be alive. >+ * It means that we continue to communicate >+ * to destinations, already remembered >+ * to the moment of synflood. >+ */ >+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), >+ &saddr, ntohs(tcp_hdr(skb)->source)); >+ goto drop_and_release; >+ } >+ >+ isn = tcp_v4_init_sequence(skb); >+ } >+ tcp_rsk(req)->snt_isn = isn; >+ tcp_rsk(req)->snt_synack = tcp_time_stamp; >+ >+ mtreq = mptcp_rsk(req); >+ mtreq->mpcb = mpcb; >+ INIT_LIST_HEAD(&mtreq->collide_tuple); >+ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; >+ mtreq->mptcp_rem_key = mpcb->mptcp_rem_key; >+ mtreq->mptcp_loc_key = mpcb->mptcp_loc_key; >+ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(saddr, daddr, >+ tcp_hdr(skb)->source, >+ tcp_hdr(skb)->dest, isn); >+ mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key, >+ (u8 *)&mtreq->mptcp_rem_key, >+ (u8 *)&mtreq->mptcp_loc_nonce, >+ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac); >+ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; >+ mtreq->rem_id = mopt.rem_id; >+ mtreq->low_prio = mopt.low_prio; >+ tcp_rsk(req)->saw_mpc = 1; >+ >+ if (tcp_v4_send_synack(meta_sk, dst, req, skb_get_queue_mapping(skb), want_cookie)) >+ goto drop_and_free; >+ >+ /* Adding to request queue in metasocket */ >+ mptcp_v4_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT); >+ >+ return; >+ >+drop_and_release: >+ dst_release(dst); >+drop_and_free: >+ reqsk_free(req); >+ return; >+} >+ >+int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id) >+{ >+ int i; >+ >+ for (i = 0; i < MPTCP_MAX_ADDR; i++) { >+ if (!((1 << i) & mpcb->rem4_bits)) >+ continue; >+ >+ if (mpcb->remaddr4[i].id == id) { >+ /* remove address from bitfield */ >+ mpcb->rem4_bits &= ~(1 << i); >+ >+ return 0; >+ } >+ } >+ >+ return -1; >+} >+ >+/* Based on function tcp_v4_conn_request (tcp_ipv4.c) >+ * Returns -1 if there is no space anymore to store an additional >+ * address >+ */ >+int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr, >+ __be16 port, u8 id) >+{ >+ int i; >+ struct mptcp_rem4 *rem4; >+ >+ mptcp_for_each_bit_set(mpcb->rem4_bits, i) { >+ rem4 = &mpcb->remaddr4[i]; >+ >+ /* Address is already in the list --- continue */ >+ if (rem4->id == id && >+ rem4->addr.s_addr == addr->s_addr && rem4->port == port) >+ return 0; >+ >+ /* This may be the case, when the peer is behind a NAT. He is >+ * trying to JOIN, thus sending the JOIN with a certain ID. >+ * However the src_addr of the IP-packet has been changed. We >+ * update the addr in the list, because this is the address as >+ * OUR BOX sees it. 
>+	 */
>+		if (rem4->id == id && rem4->addr.s_addr != addr->s_addr) {
>+			/* update the address */
>+			mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
>+				    __func__, &rem4->addr.s_addr,
>+				    &addr->s_addr, id);
>+			rem4->addr.s_addr = addr->s_addr;
>+			rem4->port = port;
>+			mpcb->list_rcvd = 1;
>+			return 0;
>+		}
>+	}
>+
>+	i = mptcp_find_free_index(mpcb->rem4_bits);
>+	/* Do we already have the maximum number of local/remote addresses? */
>+	if (i < 0) {
>+		mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
>+			    __func__, MPTCP_MAX_ADDR, &addr->s_addr);
>+		return -1;
>+	}
>+
>+	rem4 = &mpcb->remaddr4[i];
>+
>+	/* Address is not known yet, store it */
>+	rem4->addr.s_addr = addr->s_addr;
>+	rem4->port = port;
>+	rem4->bitfield = 0;
>+	rem4->retry_bitfield = 0;
>+	rem4->id = id;
>+	mpcb->list_rcvd = 1;
>+	mpcb->rem4_bits |= (1 << i);
>+
>+	return 0;
>+}
>+
>+/* Sets the bitfield of the remote-address field.
>+ * The local address is not set, as it will disappear with the global
>+ * address-list.
>+ */
>+void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr)
>+{
>+	int i;
>+
>+	mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
>+		if (mpcb->remaddr4[i].addr.s_addr == daddr) {
>+			/* It's the initial flow - thus local index == 0 */
>+			mpcb->remaddr4[i].bitfield |= 1;
>+			return;
>+		}
>+	}
>+}
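The remote-address table above (remaddr4, and its IPv6 twin later in the patch) is a fixed-size array whose occupancy is tracked in a small bitmask, so adding, updating and removing an address is pure bit arithmetic. A minimal user-space sketch of the same slot-allocation pattern - the helper below is only assumed to match the contract of mptcp_find_free_index(), it is not the patch's implementation:

	#include <stdio.h>

	#define MPTCP_MAX_ADDR 8

	/* First clear bit in an 8-slot occupancy mask, or -1 if the table
	 * is full - the contract the address-list code above relies on. */
	static int find_free_index(unsigned char bits)
	{
		int i;

		for (i = 0; i < MPTCP_MAX_ADDR; i++)
			if (!(bits & (1 << i)))
				return i;
		return -1;
	}

	int main(void)
	{
		unsigned char rem4_bits = 0x17;	/* slots 0, 1, 2 and 4 occupied */
		int i = find_free_index(rem4_bits);

		if (i >= 0)
			rem4_bits |= (1 << i);	/* claim the slot, as the patch does */
		printf("allocated slot %d, mask now 0x%02x\n", i, rem4_bits);
		return 0;	/* prints: allocated slot 3, mask now 0x1f */
	}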
>+/* We only process join requests here. (either the SYN or the final ACK) */
>+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
>+{
>+	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
>+	struct sock *child, *rsk = NULL;
>+	int ret;
>+
>+	if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
>+		struct tcphdr *th = tcp_hdr(skb);
>+		const struct iphdr *iph = ip_hdr(skb);
>+		struct sock *sk;
>+
>+		sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
>+					     iph->saddr, th->source, iph->daddr,
>+					     th->dest, inet_iif(skb));
>+
>+		if (!sk) {
>+			kfree_skb(skb);
>+			return 0;
>+		}
>+		if (is_meta_sk(sk)) {
>+			WARN("%s Did not find a sub-sk - found the meta instead!\n", __func__);
>+			kfree_skb(skb);
>+			sock_put(sk);
>+			return 0;
>+		}
>+
>+		if (sk->sk_state == TCP_TIME_WAIT) {
>+			inet_twsk_put(inet_twsk(sk));
>+			kfree_skb(skb);
>+			return 0;
>+		}
>+
>+		ret = tcp_v4_do_rcv(sk, skb);
>+		sock_put(sk);
>+
>+		return ret;
>+	}
>+	TCP_SKB_CB(skb)->mptcp_flags = 0;
>+
>+	/* Has been removed from the tk-table. Thus, no new subflows.
>+	 *
>+	 * Check for close-state is necessary, because we may have been closed
>+	 * without passing by mptcp_close().
>+	 *
>+	 * When falling back, no new subflows are allowed either.
>+	 */
>+	if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
>+	    mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
>+		goto reset_and_discard;
>+
>+	child = tcp_v4_hnd_req(meta_sk, skb);
>+
>+	if (!child)
>+		goto discard;
>+
>+	if (child != meta_sk) {
>+		sock_rps_save_rxhash(child, skb);
>+		/* We don't call tcp_child_process here, because we already
>+		 * hold the meta-sk-lock and are sure that it is not owned
>+		 * by the user.
>+		 */
>+		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
>+		bh_unlock_sock(child);
>+		sock_put(child);
>+		if (ret) {
>+			rsk = child;
>+			goto reset_and_discard;
>+		}
>+	} else {
>+		if (tcp_hdr(skb)->syn) {
>+			struct mp_join *join_opt = mptcp_find_join(skb);
>+			/* Currently we make two calls to mptcp_find_join().
>+			 * This can probably be optimized.
>+			 */
>+			if (mptcp_v4_add_raddress(mpcb,
>+						  (struct in_addr *)&ip_hdr(skb)->saddr,
>+						  0,
>+						  join_opt->addr_id) < 0)
>+				goto reset_and_discard;
>+			mpcb->list_rcvd = 0;
>+
>+			mptcp_v4_join_request(meta_sk, skb);
>+			goto discard;
>+		}
>+		goto reset_and_discard;
>+	}
>+	return 0;
>+
>+reset_and_discard:
>+	tcp_v4_send_reset(rsk, skb);
>+discard:
>+	kfree_skb(skb);
>+	return 0;
>+}
>+
>+/* After this, the ref count of the meta_sk associated with the request_sock
>+ * is incremented. Thus it is the responsibility of the caller
>+ * to call sock_put() when the reference is not needed anymore.
>+ */
>+struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
>+				 const __be32 laddr, const struct net *net)
>+{
>+	struct mptcp_request_sock *mtreq;
>+	struct sock *meta_sk = NULL;
>+
>+	spin_lock(&mptcp_reqsk_hlock);
>+	list_for_each_entry(mtreq,
>+			    &mptcp_reqsk_htb[inet_synq_hash(raddr, rport, 0,
>+							    MPTCP_HASH_SIZE)],
>+			    collide_tuple) {
>+		struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
>+		meta_sk = mtreq->mpcb->meta_sk;
>+
>+		if (ireq->rmt_port == rport &&
>+		    ireq->rmt_addr == raddr &&
>+		    ireq->loc_addr == laddr &&
>+		    rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
>+		    net_eq(net, sock_net(meta_sk)))
>+			break;
>+		meta_sk = NULL;
>+	}
>+
>+	if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
>+		meta_sk = NULL;
>+	spin_unlock(&mptcp_reqsk_hlock);
>+
>+	return meta_sk;
>+}
>+
>+/* Create a new IPv4 subflow.
>+ *
>+ * We are in user-context and the meta-sock lock is held.
>+ */
>+int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
>+			   struct mptcp_rem4 *rem)
>+{
>+	struct tcp_sock *tp;
>+	struct sock *sk;
>+	struct sockaddr_in loc_in, rem_in;
>+	struct socket sock;
>+	int ulid_size = 0, ret;
>+
>+	/* Don't try again - even if it fails */
>+	rem->bitfield |= (1 << loc->id);
>+
>+	/** First, create and prepare the new socket */
>+
>+	sock.type = meta_sk->sk_socket->type;
>+	sock.state = SS_UNCONNECTED;
>+	sock.wq = meta_sk->sk_socket->wq;
>+	sock.file = meta_sk->sk_socket->file;
>+	sock.ops = NULL;
>+
>+	ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
>+	if (unlikely(ret < 0)) {
>+		mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
>+		return ret;
>+	}
>+
>+	sk = sock.sk;
>+	tp = tcp_sk(sk);
>+
>+	/* All subsockets need the MPTCP-lock-class */
>+	lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
>+	lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
>+
>+	if (mptcp_add_sock(meta_sk, sk, rem->id, GFP_KERNEL))
>+		goto error;
>+
>+	tp->mptcp->slave_sk = 1;
>+	tp->mptcp->low_prio = loc->low_prio;
>+
>+	/* Initializing the timer for an MPTCP subflow */
>+	setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
>+
>+	/** Then, connect the socket to the peer */
>+
>+	ulid_size = sizeof(struct sockaddr_in);
>+	loc_in.sin_family = AF_INET;
>+	rem_in.sin_family = AF_INET;
>+	loc_in.sin_port = 0;
>+	if (rem->port)
>+		rem_in.sin_port = rem->port;
>+	else
>+		rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
>+	loc_in.sin_addr = loc->addr;
>+	rem_in.sin_addr = rem->addr;
>+
>+	ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
>+	if (ret < 0) {
>+		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
>+			    __func__, ret);
>+		goto error;
>+	}
>+
>+	mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
>+		    __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
>+		    tp->mptcp->path_index, &loc_in.sin_addr,
>+
ntohs(loc_in.sin_port), &rem_in.sin_addr, >+ ntohs(rem_in.sin_port)); >+ >+ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in, >+ ulid_size, O_NONBLOCK); >+ if (ret < 0 && ret != -EINPROGRESS) { >+ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n", >+ __func__, ret); >+ goto error; >+ } >+ >+ sk_set_socket(sk, meta_sk->sk_socket); >+ sk->sk_wq = meta_sk->sk_wq; >+ >+ return 0; >+ >+error: >+ /* May happen if mptcp_add_sock fails first */ >+ if (!tp->mpc) { >+ tcp_close(sk, 0); >+ } else { >+ local_bh_disable(); >+ mptcp_sub_force_close(sk); >+ local_bh_enable(); >+ } >+ return ret; >+} >+ >+/****** IPv4-Address event handler ******/ >+ >+/* React on IP-addr add/rem-events */ >+static int mptcp_pm_inetaddr_event(struct notifier_block *this, >+ unsigned long event, void *ptr) >+{ >+ return mptcp_pm_addr_event_handler(event, ptr, AF_INET); >+} >+ >+/* React on ifup/down-events */ >+static int mptcp_pm_netdev_event(struct notifier_block *this, >+ unsigned long event, void *ptr) >+{ >+ struct net_device *dev = ptr; >+ struct in_device *in_dev; >+ >+ if (!(event == NETDEV_UP || event == NETDEV_DOWN || >+ event == NETDEV_CHANGE)) >+ return NOTIFY_DONE; >+ >+ /* Iterate over the addresses of the interface, then we go over the >+ * mpcb's to modify them - that way we take tk_hash_lock for a shorter >+ * time at each iteration. - otherwise we would need to take it from the >+ * beginning till the end. >+ */ >+ rcu_read_lock(); >+ in_dev = __in_dev_get_rtnl(dev); >+ >+ if (in_dev) { >+ for_primary_ifa(in_dev) { >+ mptcp_pm_inetaddr_event(NULL, event, ifa); >+ } endfor_ifa(in_dev); >+ } >+ >+ rcu_read_unlock(); >+ return NOTIFY_DONE; >+} >+ >+void mptcp_pm_addr4_event_handler(struct in_ifaddr *ifa, unsigned long event, >+ struct mptcp_cb *mpcb) >+{ >+ int i; >+ struct sock *sk, *tmpsk; >+ >+ if (ifa->ifa_scope > RT_SCOPE_LINK) >+ return; >+ >+ /* Look for the address among the local addresses */ >+ mptcp_for_each_bit_set(mpcb->loc4_bits, i) { >+ if (mpcb->locaddr4[i].addr.s_addr == ifa->ifa_local) >+ goto found; >+ } >+ >+ /* Not yet in address-list */ >+ if ((event == NETDEV_UP || event == NETDEV_CHANGE) && >+ netif_running(ifa->ifa_dev->dev) && >+ !(ifa->ifa_dev->dev->flags & IFF_NOMULTIPATH)) { >+ i = __mptcp_find_free_index(mpcb->loc4_bits, 0, mpcb->next_v4_index); >+ if (i < 0) { >+ mptcp_debug("MPTCP_PM: NETDEV_UP Reached max number of local IPv4 addresses: %d\n", >+ MPTCP_MAX_ADDR); >+ return; >+ } >+ >+ /* update this mpcb */ >+ mpcb->locaddr4[i].addr.s_addr = ifa->ifa_local; >+ mpcb->locaddr4[i].id = i; >+ mpcb->loc4_bits |= (1 << i); >+ mpcb->next_v4_index = i + 1; >+ /* re-send addresses */ >+ mptcp_v4_send_add_addr(i, mpcb); >+ /* re-evaluate paths */ >+ mptcp_create_subflows(mpcb->meta_sk); >+ } >+ return; >+found: >+ /* Address already in list. Reactivate/Deactivate the >+ * concerned paths. >+ */ >+ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { >+ struct tcp_sock *tp = tcp_sk(sk); >+ if (sk->sk_family != AF_INET || >+ inet_sk(sk)->inet_saddr != ifa->ifa_local) >+ continue; >+ >+ if (event == NETDEV_DOWN || >+ (ifa->ifa_dev->dev->flags & IFF_NOMULTIPATH)) { >+ mptcp_reinject_data(sk, 0); >+ mptcp_sub_force_close(sk); >+ } else if (event == NETDEV_CHANGE) { >+ int new_low_prio = (ifa->ifa_dev->dev->flags & IFF_MPBACKUP) ? 
>+ 1 : 0; >+ if (new_low_prio != tp->mptcp->low_prio) >+ tp->mptcp->send_mp_prio = 1; >+ tp->mptcp->low_prio = new_low_prio; >+ } >+ } >+ >+ if (event == NETDEV_DOWN || >+ (ifa->ifa_dev->dev->flags & IFF_NOMULTIPATH)) { >+ mpcb->loc4_bits &= ~(1 << i); >+ >+ /* Force sending directly the REMOVE_ADDR option */ >+ mpcb->remove_addrs |= (1 << mpcb->locaddr4[i].id); >+ sk = mptcp_select_ack_sock(mpcb->meta_sk, 0); >+ if (sk) >+ tcp_send_ack(sk); >+ >+ mptcp_for_each_bit_set(mpcb->rem4_bits, i) >+ mpcb->remaddr4[i].bitfield &= mpcb->loc4_bits; >+ } >+} >+ >+/* Send ADD_ADDR for loc_id on all available subflows */ >+void mptcp_v4_send_add_addr(int loc_id, struct mptcp_cb *mpcb) >+{ >+ struct tcp_sock *tp; >+ >+ mptcp_for_each_tp(mpcb, tp) >+ tp->mptcp->add_addr4 |= (1 << loc_id); >+} >+ >+static struct notifier_block mptcp_pm_inetaddr_notifier = { >+ .notifier_call = mptcp_pm_inetaddr_event, >+}; >+ >+static struct notifier_block mptcp_pm_netdev_notifier = { >+ .notifier_call = mptcp_pm_netdev_event, >+}; >+ >+/****** End of IPv4-Address event handler ******/ >+ >+/* General initialization of IPv4 for MPTCP */ >+int mptcp_pm_v4_init(void) >+{ >+ int ret; >+ struct request_sock_ops *ops = &mptcp_request_sock_ops; >+ >+ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP"); >+ if (ops->slab_name == NULL) { >+ ret = -ENOMEM; >+ goto out; >+ } >+ >+ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, >+ SLAB_HWCACHE_ALIGN, NULL); >+ >+ if (ops->slab == NULL) { >+ ret = -ENOMEM; >+ goto err_reqsk_create; >+ } >+ >+ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); >+ if (ret) >+ goto err_reg_inetaddr; >+ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier); >+ if (ret) >+ goto err_reg_netdev; >+ >+out: >+ return ret; >+ >+err_reg_netdev: >+ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); >+err_reg_inetaddr: >+ kmem_cache_destroy(ops->slab); >+err_reqsk_create: >+ kfree(ops->slab_name); >+ ops->slab_name = NULL; >+ goto out; >+} >+ >+void mptcp_pm_v4_undo(void) >+{ >+ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); >+ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); >+ kmem_cache_destroy(mptcp_request_sock_ops.slab); >+ kfree(mptcp_request_sock_ops.slab_name); >+} >+ >+ >diff -Naur a/linux-3.11/net/mptcp/mptcp_ipv6.c b/linux-3.11/net/mptcp/mptcp_ipv6.c >--- a/linux-3.11/net/mptcp/mptcp_ipv6.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_ipv6.c 2013-10-05 18:34:49.271364676 +0200 >@@ -0,0 +1,1008 @@ >+/* >+ * MPTCP implementation - IPv6-specific functions >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your 
option) any later version. >+ */ >+ >+#include <linux/export.h> >+#include <linux/in6.h> >+#include <linux/kernel.h> >+ >+#include <net/addrconf.h> >+#include <net/flow.h> >+#include <net/inet6_connection_sock.h> >+#include <net/inet6_hashtables.h> >+#include <net/inet_common.h> >+#include <net/ipv6.h> >+#include <net/ip6_checksum.h> >+#include <net/ip6_route.h> >+#include <net/mptcp.h> >+#include <net/mptcp_pm.h> >+#include <net/mptcp_v6.h> >+#include <net/tcp.h> >+#include <net/transp_v6.h> >+ >+static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req, >+ u16 queue_mapping); >+ >+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, >+ __be16 sport, __be16 dport, u32 seq) >+{ >+ u32 secret[MD5_MESSAGE_BYTES / 4]; >+ u32 hash[MD5_DIGEST_WORDS]; >+ u32 i; >+ >+ memcpy(hash, saddr, 16); >+ for (i = 0; i < 4; i++) >+ secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; >+ secret[4] = mptcp_secret[4] + >+ (((__force u16)sport << 16) + (__force u16)dport); >+ secret[5] = seq; >+ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++) >+ secret[i] = mptcp_secret[i]; >+ >+ md5_transform(hash, secret); >+ >+ return hash[0]; >+} >+ >+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, >+ __be16 sport, __be16 dport) >+{ >+ u32 secret[MD5_MESSAGE_BYTES / 4]; >+ u32 hash[MD5_DIGEST_WORDS]; >+ u32 i; >+ >+ memcpy(hash, saddr, 16); >+ for (i = 0; i < 4; i++) >+ secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; >+ secret[4] = mptcp_secret[4] + >+ (((__force u16)sport << 16) + (__force u16)dport); >+ secret[5] = mptcp_key_seed++; >+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) >+ secret[i] = mptcp_secret[i]; >+ >+ md5_transform(hash, secret); >+ >+ return *((u64 *)hash); >+} >+ >+static void mptcp_v6_reqsk_destructor(struct request_sock *req) >+{ >+ mptcp_reqsk_destructor(req); >+ >+ tcp_v6_reqsk_destructor(req); >+} >+ >+/* Similar to tcp_v6_rtx_synack */ >+static int mptcp_v6_rtx_synack(struct sock *meta_sk, struct request_sock *req) >+{ >+ if (meta_sk->sk_family == AF_INET6) >+ return tcp_v6_rtx_synack(meta_sk, req); >+ >+ TCP_INC_STATS_BH(sock_net(meta_sk), TCP_MIB_RETRANSSEGS); >+ return mptcp_v6v4_send_synack(meta_sk, req, 0); >+} >+ >+/* Similar to tcp6_request_sock_ops */ >+struct request_sock_ops mptcp6_request_sock_ops __read_mostly = { >+ .family = AF_INET6, >+ .obj_size = sizeof(struct mptcp6_request_sock), >+ .rtx_syn_ack = mptcp_v6_rtx_synack, >+ .send_ack = tcp_v6_reqsk_send_ack, >+ .destructor = mptcp_v6_reqsk_destructor, >+ .send_reset = tcp_v6_send_reset, >+ .syn_ack_timeout = tcp_syn_ack_timeout, >+}; >+ >+static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk, >+ struct request_sock *req, >+ unsigned long timeout) >+{ >+ const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, >+ inet_rsk(req)->rmt_port, >+ 0, MPTCP_HASH_SIZE); >+ >+ inet6_csk_reqsk_queue_hash_add(meta_sk, req, timeout); >+ >+ spin_lock(&mptcp_reqsk_hlock); >+ list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h]); >+ spin_unlock(&mptcp_reqsk_hlock); >+} >+ >+/* Similar to tcp_v6_send_synack >+ * >+ * The meta-socket is IPv4, but a new subsocket is IPv6 >+ */ >+static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req, >+ u16 queue_mapping) >+{ >+ struct inet6_request_sock *treq = inet6_rsk(req); >+ struct sk_buff *skb; >+ struct flowi6 fl6; >+ struct dst_entry *dst; >+ int err; >+ >+ memset(&fl6, 0, sizeof(fl6)); >+ fl6.flowi6_proto = IPPROTO_TCP; >+ fl6.daddr = treq->rmt_addr; >+ fl6.saddr = treq->loc_addr; >+ fl6.flowlabel = 0; >+ 
fl6.flowi6_oif = treq->iif; >+ fl6.flowi6_mark = meta_sk->sk_mark; >+ fl6.fl6_dport = inet_rsk(req)->rmt_port; >+ fl6.fl6_sport = inet_rsk(req)->loc_port; >+ security_req_classify_flow(req, flowi6_to_flowi(&fl6)); >+ >+ dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL, false); >+ if (IS_ERR(dst)) { >+ err = PTR_ERR(dst); >+ return err; >+ } >+ skb = tcp_make_synack(meta_sk, dst, req, NULL); >+ err = -ENOMEM; >+ if (skb) { >+ __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); >+ >+ fl6.daddr = treq->rmt_addr; >+ skb_set_queue_mapping(skb, queue_mapping); >+ err = ip6_xmit(meta_sk, skb, &fl6, NULL, 0); >+ err = net_xmit_eval(err); >+ } >+ >+ return err; >+} >+ >+/* Similar to tcp_v6_syn_recv_sock >+ * >+ * The meta-socket is IPv4, but a new subsocket is IPv6 >+ */ >+struct sock *mptcp_v6v4_syn_recv_sock(struct sock *meta_sk, struct sk_buff *skb, >+ struct request_sock *req, >+ struct dst_entry *dst) >+{ >+ struct inet6_request_sock *treq; >+ struct ipv6_pinfo *newnp; >+ struct tcp6_sock *newtcp6sk; >+ struct inet_sock *newinet; >+ struct tcp_sock *newtp; >+ struct sock *newsk; >+ >+ treq = inet6_rsk(req); >+ >+ if (sk_acceptq_is_full(meta_sk)) >+ goto out_overflow; >+ >+ if (!dst) { >+ /* This code is similar to inet6_csk_route_req, but as we >+ * don't have a np-pointer in the meta, we have to do it >+ * manually. >+ */ >+ struct flowi6 fl6; >+ >+ memset(&fl6, 0, sizeof(fl6)); >+ fl6.flowi6_proto = IPPROTO_TCP; >+ fl6.daddr = treq->rmt_addr; >+ fl6.saddr = treq->loc_addr; >+ fl6.flowi6_oif = meta_sk->sk_bound_dev_if; >+ fl6.flowi6_mark = meta_sk->sk_mark; >+ fl6.fl6_dport = inet_rsk(req)->rmt_port; >+ fl6.fl6_sport = inet_rsk(req)->loc_port; >+ security_req_classify_flow(req, flowi6_to_flowi(&fl6)); >+ >+ dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL, false); >+ if (IS_ERR(dst)) >+ goto out; >+ } >+ >+ newsk = tcp_create_openreq_child(meta_sk, req, skb); >+ if (newsk == NULL) >+ goto out_nonewsk; >+ >+ newtcp6sk = (struct tcp6_sock *)newsk; >+ inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; >+ >+ /* >+ * No need to charge this sock to the relevant IPv6 refcnt debug socks >+ * count here, tcp_create_openreq_child now does this for us, see the >+ * comment in that function for the gory details. -acme >+ */ >+ >+ newsk->sk_gso_type = SKB_GSO_TCPV6; >+ __ip6_dst_store(newsk, dst, NULL, NULL); >+ inet6_sk_rx_dst_set(newsk, skb); >+ >+ newtp = tcp_sk(newsk); >+ newinet = inet_sk(newsk); >+ newnp = inet6_sk(newsk); >+ >+ newnp->daddr = treq->rmt_addr; >+ newnp->saddr = treq->loc_addr; >+ newnp->rcv_saddr = treq->loc_addr; >+ newsk->sk_bound_dev_if = treq->iif; >+ >+ /* Now IPv6 options... >+ >+ First: no IPv4 options. 
>+ */ >+ newinet->inet_opt = NULL; >+ newnp->ipv6_ac_list = NULL; >+ newnp->ipv6_fl_list = NULL; >+ newnp->rxopt.all = 0; >+ >+ /* Clone pktoptions received with SYN */ >+ newnp->pktoptions = NULL; >+ if (treq->pktopts != NULL) { >+ newnp->pktoptions = skb_clone(treq->pktopts, >+ sk_gfp_atomic(meta_sk, GFP_ATOMIC)); >+ consume_skb(treq->pktopts); >+ treq->pktopts = NULL; >+ if (newnp->pktoptions) >+ skb_set_owner_r(newnp->pktoptions, newsk); >+ } >+ newnp->opt = NULL; >+ newnp->mcast_oif = inet6_iif(skb); >+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; >+ newnp->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); >+ >+ /* Initialization copied from inet6_create - normally this should have >+ * been handled by the memcpy as in tcp_v6_syn_recv_sock >+ */ >+ newnp->hop_limit = -1; >+ newnp->mc_loop = 1; >+ newnp->pmtudisc = IPV6_PMTUDISC_WANT; >+ (void)xchg(&newnp->rxpmtu, NULL); >+ >+ inet_csk(newsk)->icsk_ext_hdr_len = 0; >+ >+ tcp_mtup_init(newsk); >+ tcp_sync_mss(newsk, dst_mtu(dst)); >+ newtp->advmss = dst_metric_advmss(dst); >+ if (tcp_sk(meta_sk)->rx_opt.user_mss && >+ tcp_sk(meta_sk)->rx_opt.user_mss < newtp->advmss) >+ newtp->advmss = tcp_sk(meta_sk)->rx_opt.user_mss; >+ >+ tcp_initialize_rcv_mss(newsk); >+ tcp_synack_rtt_meas(newsk, req); >+ newtp->total_retrans = req->num_retrans; >+ >+ newinet->inet_daddr = LOOPBACK4_IPV6; >+ newinet->inet_saddr = LOOPBACK4_IPV6; >+ newinet->inet_rcv_saddr = LOOPBACK4_IPV6; >+ >+ if (__inet_inherit_port(meta_sk, newsk) < 0) { >+ inet_csk_prepare_forced_close(newsk); >+ tcp_done(newsk); >+ goto out; >+ } >+ __inet6_hash(newsk, NULL); >+ >+ return newsk; >+ >+out_overflow: >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENOVERFLOWS); >+out_nonewsk: >+ dst_release(dst); >+out: >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENDROPS); >+ return NULL; >+} >+ >+/* Similar to tcp_v6_conn_request */ >+static void mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ struct tcp_options_received tmp_opt; >+ struct mptcp_options_received mopt; >+ struct ipv6_pinfo *np = inet6_sk(meta_sk); >+ struct request_sock *req; >+ struct inet6_request_sock *treq; >+ struct mptcp_request_sock *mtreq; >+ u8 mptcp_hash_mac[20]; >+ __u32 isn = TCP_SKB_CB(skb)->when; >+ struct dst_entry *dst = NULL; >+ struct flowi6 fl6; >+ int want_cookie = 0; >+ >+ tcp_clear_options(&tmp_opt); >+ mptcp_init_mp_opt(&mopt); >+ tmp_opt.mss_clamp = TCP_MSS_DEFAULT; >+ tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss; >+ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); >+ >+ req = inet6_reqsk_alloc(&mptcp6_request_sock_ops); >+ if (!req) >+ return; >+ >+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; >+ tcp_openreq_init(req, &tmp_opt, skb); >+ >+ treq = inet6_rsk(req); >+ treq->rmt_addr = ipv6_hdr(skb)->saddr; >+ treq->loc_addr = ipv6_hdr(skb)->daddr; >+ >+ if (!want_cookie || tmp_opt.tstamp_ok) >+ TCP_ECN_create_request(req, skb, sock_net(meta_sk)); >+ >+ treq->iif = meta_sk->sk_bound_dev_if; >+ >+ /* So that link locals have meaning */ >+ if (!meta_sk->sk_bound_dev_if && >+ ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) >+ treq->iif = inet6_iif(skb); >+ >+ if (!isn) { >+ if (meta_sk->sk_family == AF_INET6 && >+ (ipv6_opt_accepted(meta_sk, skb) || >+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || >+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)) { >+ atomic_inc(&skb->users); >+ treq->pktopts = skb; >+ } >+ >+ /* VJ's idea. 
We save last timestamp seen >+ * from the destination in peer table, when entering >+ * state TIME-WAIT, and check against it before >+ * accepting new connection request. >+ * >+ * If "isn" is not zero, this request hit alive >+ * timewait bucket, so that all the necessary checks >+ * are made in the function processing timewait state. >+ */ >+ if (tmp_opt.saw_tstamp && >+ tcp_death_row.sysctl_tw_recycle && >+ (dst = inet6_csk_route_req(meta_sk, &fl6, req)) != NULL) { >+ if (!tcp_peer_is_proven(req, dst, true)) { >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED); >+ goto drop_and_release; >+ } >+ } >+ /* Kill the following clause, if you dislike this way. */ >+ else if (!sysctl_tcp_syncookies && >+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) < >+ (sysctl_max_syn_backlog >> 2)) && >+ !tcp_peer_is_proven(req, dst, false)) { >+ /* Without syncookies last quarter of >+ * backlog is filled with destinations, >+ * proven to be alive. >+ * It means that we continue to communicate >+ * to destinations, already remembered >+ * to the moment of synflood. >+ */ >+ LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", >+ &treq->rmt_addr, ntohs(tcp_hdr(skb)->source)); >+ goto drop_and_release; >+ } >+ >+ isn = tcp_v6_init_sequence(skb); >+ } >+ >+ tcp_rsk(req)->snt_isn = isn; >+ tcp_rsk(req)->snt_synack = tcp_time_stamp; >+ >+ mtreq = mptcp_rsk(req); >+ mtreq->mpcb = mpcb; >+ INIT_LIST_HEAD(&mtreq->collide_tuple); >+ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; >+ mtreq->mptcp_rem_key = mpcb->mptcp_rem_key; >+ mtreq->mptcp_loc_key = mpcb->mptcp_loc_key; >+ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->daddr.s6_addr32, >+ ipv6_hdr(skb)->saddr.s6_addr32, >+ tcp_hdr(skb)->dest, >+ tcp_hdr(skb)->source, isn); >+ mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key, >+ (u8 *)&mtreq->mptcp_rem_key, >+ (u8 *)&mtreq->mptcp_loc_nonce, >+ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac); >+ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; >+ mtreq->rem_id = mopt.rem_id; >+ mtreq->low_prio = mopt.low_prio; >+ tcp_rsk(req)->saw_mpc = 1; >+ >+ if (meta_sk->sk_family == AF_INET6) { >+ if (tcp_v6_send_synack(meta_sk, dst, &fl6, req, >+ skb_get_queue_mapping(skb))) >+ goto drop_and_free; >+ } else { >+ if (mptcp_v6v4_send_synack(meta_sk, req, skb_get_queue_mapping(skb))) >+ goto drop_and_free; >+ } >+ >+ /* Adding to request queue in metasocket */ >+ mptcp_v6_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT); >+ >+ return; >+ >+drop_and_release: >+ dst_release(dst); >+drop_and_free: >+ reqsk_free(req); >+ return; >+} >+ >+int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id) >+{ >+ int i; >+ >+ for (i = 0; i < MPTCP_MAX_ADDR; i++) { >+ if (!((1 << i) & mpcb->rem6_bits)) >+ continue; >+ >+ if (mpcb->remaddr6[i].id == id) { >+ /* remove address from bitfield */ >+ mpcb->rem6_bits &= ~(1 << i); >+ >+ return 0; >+ } >+ } >+ >+ return -1; >+} >+ >+/* Returns -1 if there is no space anymore to store an additional >+ * address >+ */ >+int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr, >+ __be16 port, u8 id) >+{ >+ int i; >+ struct mptcp_rem6 *rem6; >+ >+ mptcp_for_each_bit_set(mpcb->rem6_bits, i) { >+ rem6 = &mpcb->remaddr6[i]; >+ >+ /* Address is already in the list --- continue */ >+ if (rem6->id == id && >+ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port) >+ return 0; >+ >+ /* This may be the case, when the peer is behind a NAT. He is >+ * trying to JOIN, thus sending the JOIN with a certain ID. 
>+ * However the src_addr of the IP-packet has been changed. We
>+ * update the addr in the list, because this is the address as
>+ * OUR BOX sees it.
>+ */
>+		if (rem6->id == id) {
>+			/* update the address */
>+			mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
>+				    __func__, &rem6->addr, addr, id);
>+			rem6->addr = *addr;
>+			rem6->port = port;
>+			mpcb->list_rcvd = 1;
>+			return 0;
>+		}
>+	}
>+
>+	i = mptcp_find_free_index(mpcb->rem6_bits);
>+	/* Do we already have the maximum number of local/remote addresses? */
>+	if (i < 0) {
>+		mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
>+			    __func__, MPTCP_MAX_ADDR, addr);
>+		return -1;
>+	}
>+
>+	rem6 = &mpcb->remaddr6[i];
>+
>+	/* Address is not known yet, store it */
>+	rem6->addr = *addr;
>+	rem6->port = port;
>+	rem6->bitfield = 0;
>+	rem6->retry_bitfield = 0;
>+	rem6->id = id;
>+	mpcb->list_rcvd = 1;
>+	mpcb->rem6_bits |= (1 << i);
>+
>+	return 0;
>+}
>+
>+/* Sets the bitfield of the remote-address field.
>+ * The local address is not set, as it will disappear with the global
>+ * address-list.
>+ */
>+void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
>+				const struct in6_addr *daddr)
>+{
>+	int i;
>+	mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
>+		if (ipv6_addr_equal(&mpcb->remaddr6[i].addr, daddr)) {
>+			/* It's the initial flow - thus local index == 0 */
>+			mpcb->remaddr6[i].bitfield |= 1;
>+			return;
>+		}
>+	}
>+}
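For reference while reading the join-request handlers in this file and in mptcp_ipv4.c: both derive mptcp_hash_tmac by running HMAC-SHA1 over the peers' keys and nonces and keeping only the leftmost 64 bits, which is what RFC 6824 carries in the MP_JOIN SYN/ACK. A self-contained sketch of just the truncate-and-compare step (the digest bytes are made up; in the patch they come from mptcp_hmac_sha1()):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Keep the leftmost 64 bits of a 160-bit HMAC-SHA1 digest, mirroring
	 * the patch's *(u64 *)mptcp_hash_mac and its 8-byte memcmp(). */
	static uint64_t truncate_mac(const uint8_t digest[20])
	{
		uint64_t tmac;

		memcpy(&tmac, digest, sizeof(tmac));
		return tmac;
	}

	int main(void)
	{
		/* Stand-in digest - not a real HMAC computation */
		uint8_t digest[20] = { 0xde, 0xad, 0xbe, 0xef, 0x01, 0x02, 0x03, 0x04,
				       0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c,
				       0x0d, 0x0e, 0x0f, 0x10 };
		uint64_t wire_tmac = truncate_mac(digest);

		/* A receiver accepts the JOIN only if the truncated MACs agree */
		printf("MAC valid: %d\n", truncate_mac(digest) == wire_tmac);
		return 0;
	}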
>+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
>+{
>+	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
>+	struct sock *child, *rsk = NULL;
>+	int ret;
>+
>+	if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
>+		struct tcphdr *th = tcp_hdr(skb);
>+		const struct ipv6hdr *ip6h = ipv6_hdr(skb);
>+		struct sock *sk;
>+
>+		sk = __inet6_lookup_established(sock_net(meta_sk),
>+						&tcp_hashinfo,
>+						&ip6h->saddr, th->source,
>+						&ip6h->daddr, ntohs(th->dest),
>+						inet6_iif(skb));
>+
>+		if (!sk) {
>+			kfree_skb(skb);
>+			return 0;
>+		}
>+		if (is_meta_sk(sk)) {
>+			WARN("%s Did not find a sub-sk!\n", __func__);
>+			kfree_skb(skb);
>+			sock_put(sk);
>+			return 0;
>+		}
>+
>+		if (sk->sk_state == TCP_TIME_WAIT) {
>+			inet_twsk_put(inet_twsk(sk));
>+			kfree_skb(skb);
>+			return 0;
>+		}
>+
>+		ret = tcp_v6_do_rcv(sk, skb);
>+		sock_put(sk);
>+
>+		return ret;
>+	}
>+	TCP_SKB_CB(skb)->mptcp_flags = 0;
>+
>+	/* Has been removed from the tk-table. Thus, no new subflows.
>+	 *
>+	 * Check for close-state is necessary, because we may have been closed
>+	 * without passing by mptcp_close().
>+	 *
>+	 * When falling back, no new subflows are allowed either.
>+	 */
>+	if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
>+	    mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
>+		goto reset_and_discard;
>+
>+	child = tcp_v6_hnd_req(meta_sk, skb);
>+
>+	if (!child)
>+		goto discard;
>+
>+	if (child != meta_sk) {
>+		sock_rps_save_rxhash(child, skb);
>+		/* We don't call tcp_child_process here, because we already
>+		 * hold the meta-sk-lock and are sure that it is not owned
>+		 * by the user.
>+		 */
>+		ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
>+		bh_unlock_sock(child);
>+		sock_put(child);
>+		if (ret) {
>+			rsk = child;
>+			goto reset_and_discard;
>+		}
>+	} else {
>+		if (tcp_hdr(skb)->syn) {
>+			struct mp_join *join_opt = mptcp_find_join(skb);
>+			/* Currently we make two calls to mptcp_find_join().
>+			 * This can probably be optimized.
>+			 */
>+			if (mptcp_v6_add_raddress(mpcb,
>+						  (struct in6_addr *)&ipv6_hdr(skb)->saddr,
>+						  0,
>+						  join_opt->addr_id) < 0)
>+				goto reset_and_discard;
>+			mpcb->list_rcvd = 0;
>+
>+			mptcp_v6_join_request(meta_sk, skb);
>+			goto discard;
>+		}
>+		goto reset_and_discard;
>+	}
>+	return 0;
>+
>+reset_and_discard:
>+	tcp_v6_send_reset(rsk, skb);
>+discard:
>+	kfree_skb(skb);
>+	return 0;
>+}
>+
>+/* After this, the ref count of the meta_sk associated with the request_sock
>+ * is incremented. Thus it is the responsibility of the caller
>+ * to call sock_put() when the reference is not needed anymore.
>+ */
>+struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
>+				 const struct in6_addr *laddr, const struct net *net)
>+{
>+	struct mptcp_request_sock *mtreq;
>+	struct sock *meta_sk = NULL;
>+
>+	spin_lock(&mptcp_reqsk_hlock);
>+	list_for_each_entry(mtreq,
>+			    &mptcp_reqsk_htb[inet6_synq_hash(raddr, rport, 0,
>+							     MPTCP_HASH_SIZE)],
>+			    collide_tuple) {
>+		struct inet6_request_sock *treq = inet6_rsk(rev_mptcp_rsk(mtreq));
>+		meta_sk = mtreq->mpcb->meta_sk;
>+
>+		if (inet_rsk(rev_mptcp_rsk(mtreq))->rmt_port == rport &&
>+		    rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
>+		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
>+		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
>+		    net_eq(net, sock_net(meta_sk)))
>+			break;
>+		meta_sk = NULL;
>+	}
>+
>+	if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
>+		meta_sk = NULL;
>+	spin_unlock(&mptcp_reqsk_hlock);
>+
>+	return meta_sk;
>+}
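mptcp_v4_search_req() and mptcp_v6_search_req() end with the same idiom: walk the hash bucket under a spinlock, then hand the meta-socket to the caller only if atomic_inc_not_zero() could still take a reference, so a concurrently dying socket is treated as a lookup miss. A sketch of that try-get pattern with C11 atomics (struct and helper names here are illustrative, not the kernel's):

	#include <stdatomic.h>
	#include <stdio.h>

	struct obj {
		atomic_int refcnt;
	};

	/* Equivalent of the kernel's atomic_inc_not_zero(): only hand out a
	 * reference if the object was still alive when we found it. */
	static struct obj *try_get(struct obj *found)
	{
		int old = atomic_load(&found->refcnt);

		while (old != 0) {
			if (atomic_compare_exchange_weak(&found->refcnt, &old, old + 1))
				return found;	/* reference taken */
		}
		return NULL;		/* object already dying - treat as a miss */
	}

	int main(void)
	{
		struct obj alive = { .refcnt = 1 }, dying = { .refcnt = 0 };

		printf("alive: %p\n", (void *)try_get(&alive));	/* non-NULL */
		printf("dying: %p\n", (void *)try_get(&dying));	/* NULL */
		return 0;
	}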
>+/* Create a new IPv6 subflow.
>+ *
>+ * We are in user-context and the meta-sock lock is held.
>+ */
>+int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
>+			   struct mptcp_rem6 *rem)
>+{
>+	struct tcp_sock *tp;
>+	struct sock *sk;
>+	struct sockaddr_in6 loc_in, rem_in;
>+	struct socket sock;
>+	int ulid_size = 0, ret;
>+
>+	/* Don't try again - even if it fails.
>+	 * There is a special case, as the IPv6 address of the initial subflow
>+	 * has an id = 0. The other ones have ids in the range [8, 16).
>+	 */
>+	rem->bitfield |= (1 << (loc->id - min_t(u8, loc->id, MPTCP_MAX_ADDR)));
>+
>+	/** First, create and prepare the new socket */
>+
>+	sock.type = meta_sk->sk_socket->type;
>+	sock.state = SS_UNCONNECTED;
>+	sock.wq = meta_sk->sk_socket->wq;
>+	sock.file = meta_sk->sk_socket->file;
>+	sock.ops = NULL;
>+
>+	ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
>+	if (unlikely(ret < 0)) {
>+		mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
>+		return ret;
>+	}
>+
>+	sk = sock.sk;
>+	tp = tcp_sk(sk);
>+
>+	/* All subsockets need the MPTCP-lock-class */
>+	lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
>+	lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
>+
>+	if (mptcp_add_sock(meta_sk, sk, rem->id, GFP_KERNEL))
>+		goto error;
>+
>+	tp->mptcp->slave_sk = 1;
>+	tp->mptcp->low_prio = loc->low_prio;
>+
>+	/* Initializing the timer for an MPTCP subflow */
>+	setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
>+
>+	/** Then, connect the socket to the peer */
>+
>+	ulid_size = sizeof(struct sockaddr_in6);
>+	loc_in.sin6_family = AF_INET6;
>+	rem_in.sin6_family = AF_INET6;
>+	loc_in.sin6_port = 0;
>+	if (rem->port)
>+		rem_in.sin6_port = rem->port;
>+	else
>+		rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
>+	loc_in.sin6_addr = loc->addr;
>+	rem_in.sin6_addr = rem->addr;
>+
>+	ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
>+	if (ret < 0) {
>+		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
>+			    __func__, ret);
>+		goto error;
>+	}
>+
>+	mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
>+		    __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
>+		    tp->mptcp->path_index, &loc_in.sin6_addr,
>+		    ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
>+		    ntohs(rem_in.sin6_port));
>+
>+	ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
>+				ulid_size, O_NONBLOCK);
>+	if (ret < 0 && ret != -EINPROGRESS) {
>+		mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
>+			    __func__, ret);
>+		goto error;
>+	}
>+
>+	sk_set_socket(sk, meta_sk->sk_socket);
>+	sk->sk_wq = meta_sk->sk_wq;
>+
>+	return 0;
>+
>+error:
>+	/* May happen if mptcp_add_sock fails first */
>+	if (!tp->mpc) {
>+		tcp_close(sk, 0);
>+	} else {
>+		local_bh_disable();
>+		mptcp_sub_force_close(sk);
>+		local_bh_enable();
>+	}
>+	return ret;
>+}
>+
>+struct mptcp_dad_data {
>+	struct timer_list timer;
>+	struct inet6_ifaddr *ifa;
>+};
>+
>+static int mptcp_ipv6_is_in_dad_state(struct inet6_ifaddr *ifa)
>+{
>+	return ((ifa->flags & IFA_F_TENTATIVE) &&
>+		ifa->state == INET6_IFADDR_STATE_DAD);
>+}
>+
>+static void mptcp_dad_callback(unsigned long arg);
>+static int mptcp_pm_inet6_addr_event(struct notifier_block *this,
>+				     unsigned long event, void *ptr);
>+
>+static inline void mptcp_dad_init_timer(struct mptcp_dad_data *data,
>+					struct inet6_ifaddr *ifa)
>+{
>+	data->ifa = ifa;
>+	data->timer.data = (unsigned long)data;
>+	data->timer.function = mptcp_dad_callback;
>+	if (ifa->idev->cnf.rtr_solicit_delay)
>+		data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
>+	else
>+		data->timer.expires = jiffies + MPTCP_IPV6_DEFAULT_DAD_WAIT;
>+}
>+
>+static void mptcp_dad_callback(unsigned long arg)
>+{
>+	struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
>+
>+	if (mptcp_ipv6_is_in_dad_state(data->ifa)) {
>+		mptcp_dad_init_timer(data, data->ifa);
>+		add_timer(&data->timer);
>+	} else {
>+		mptcp_pm_inet6_addr_event(NULL, NETDEV_UP, data->ifa);
>+
in6_ifa_put(data->ifa); >+ kfree(data); >+ } >+} >+ >+static inline void mptcp_dad_setup_timer(struct inet6_ifaddr *ifa) >+{ >+ struct mptcp_dad_data *data; >+ >+ data = kmalloc(sizeof(*data), GFP_ATOMIC); >+ >+ if (!data) >+ return; >+ >+ init_timer(&data->timer); >+ mptcp_dad_init_timer(data, ifa); >+ add_timer(&data->timer); >+ in6_ifa_hold(ifa); >+} >+ >+/* React on IPv6-addr add/rem-events */ >+static int mptcp_pm_inet6_addr_event(struct notifier_block *this, >+ unsigned long event, void *ptr) >+{ >+ if (mptcp_ipv6_is_in_dad_state((struct inet6_ifaddr *)ptr)) { >+ mptcp_dad_setup_timer((struct inet6_ifaddr *)ptr); >+ return NOTIFY_DONE; >+ } else { >+ return mptcp_pm_addr_event_handler(event, ptr, AF_INET6); >+ } >+} >+ >+/* React on ifup/down-events */ >+static int mptcp_pm_v6_netdev_event(struct notifier_block *this, >+ unsigned long event, void *ptr) >+{ >+ struct net_device *dev = ptr; >+ struct inet6_dev *in6_dev = NULL; >+ >+ if (!(event == NETDEV_UP || event == NETDEV_DOWN || >+ event == NETDEV_CHANGE)) >+ return NOTIFY_DONE; >+ >+ /* Iterate over the addresses of the interface, then we go over the >+ * mpcb's to modify them - that way we take tk_hash_lock for a shorter >+ * time at each iteration. - otherwise we would need to take it from the >+ * beginning till the end. >+ */ >+ rcu_read_lock(); >+ in6_dev = __in6_dev_get(dev); >+ >+ if (in6_dev) { >+ struct inet6_ifaddr *ifa6; >+ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) >+ mptcp_pm_inet6_addr_event(NULL, event, ifa6); >+ } >+ >+ rcu_read_unlock(); >+ return NOTIFY_DONE; >+} >+ >+void mptcp_pm_addr6_event_handler(struct inet6_ifaddr *ifa, unsigned long event, >+ struct mptcp_cb *mpcb) >+{ >+ int i; >+ struct sock *sk, *tmpsk; >+ int addr_type = ipv6_addr_type(&ifa->addr); >+ >+ /* Checks on interface and address-type */ >+ if (ifa->scope > RT_SCOPE_LINK || >+ addr_type == IPV6_ADDR_ANY || >+ (addr_type & IPV6_ADDR_LOOPBACK) || >+ (addr_type & IPV6_ADDR_LINKLOCAL)) >+ return; >+ >+ /* Look for the address among the local addresses */ >+ mptcp_for_each_bit_set(mpcb->loc6_bits, i) { >+ if (ipv6_addr_equal(&mpcb->locaddr6[i].addr, &ifa->addr)) >+ goto found; >+ } >+ >+ /* Not yet in address-list */ >+ if ((event == NETDEV_UP || event == NETDEV_CHANGE) && >+ netif_running(ifa->idev->dev) && >+ !(ifa->idev->dev->flags & IFF_NOMULTIPATH)) { >+ i = __mptcp_find_free_index(mpcb->loc6_bits, 0, mpcb->next_v6_index); >+ if (i < 0) { >+ mptcp_debug("MPTCP_PM: NETDEV_UP Reached max number of local IPv6 addresses: %d\n", >+ MPTCP_MAX_ADDR); >+ return; >+ } >+ >+ /* update this mpcb */ >+ mpcb->locaddr6[i].addr = ifa->addr; >+ mpcb->locaddr6[i].id = i + MPTCP_MAX_ADDR; >+ mpcb->loc6_bits |= (1 << i); >+ mpcb->next_v6_index = i + 1; >+ /* re-send addresses */ >+ mptcp_v6_send_add_addr(i, mpcb); >+ /* re-evaluate paths */ >+ mptcp_create_subflows(mpcb->meta_sk); >+ } >+ return; >+found: >+ /* Address already in list. Reactivate/Deactivate the >+ * concerned paths. */ >+ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { >+ struct tcp_sock *tp = tcp_sk(sk); >+ if (sk->sk_family != AF_INET6 || >+ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa->addr)) >+ continue; >+ >+ if (event == NETDEV_DOWN || >+ (ifa->idev->dev->flags & IFF_NOMULTIPATH)) { >+ mptcp_reinject_data(sk, 0); >+ mptcp_sub_force_close(sk); >+ } else if (event == NETDEV_CHANGE) { >+ int new_low_prio = (ifa->idev->dev->flags & IFF_MPBACKUP) ? 
>+ 1 : 0; >+ if (new_low_prio != tp->mptcp->low_prio) >+ tp->mptcp->send_mp_prio = 1; >+ tp->mptcp->low_prio = new_low_prio; >+ } >+ } >+ >+ if (event == NETDEV_DOWN || >+ (ifa->idev->dev->flags & IFF_NOMULTIPATH)) { >+ mpcb->loc6_bits &= ~(1 << i); >+ >+ /* Force sending directly the REMOVE_ADDR option */ >+ mpcb->remove_addrs |= (1 << mpcb->locaddr6[i].id); >+ sk = mptcp_select_ack_sock(mpcb->meta_sk, 0); >+ if (sk) >+ tcp_send_ack(sk); >+ >+ mptcp_for_each_bit_set(mpcb->rem6_bits, i) >+ mpcb->remaddr6[i].bitfield &= mpcb->loc6_bits; >+ } >+} >+ >+/* Send ADD_ADDR for loc_id on all available subflows */ >+void mptcp_v6_send_add_addr(int loc_id, struct mptcp_cb *mpcb) >+{ >+ struct tcp_sock *tp; >+ >+ mptcp_for_each_tp(mpcb, tp) >+ tp->mptcp->add_addr6 |= (1 << loc_id); >+} >+ >+ >+static struct notifier_block mptcp_pm_inet6_addr_notifier = { >+ .notifier_call = mptcp_pm_inet6_addr_event, >+}; >+ >+static struct notifier_block mptcp_pm_v6_netdev_notifier = { >+ .notifier_call = mptcp_pm_v6_netdev_event, >+}; >+ >+/****** End of IPv6-Address event handler ******/ >+ >+int mptcp_pm_v6_init(void) >+{ >+ int ret; >+ struct request_sock_ops *ops = &mptcp6_request_sock_ops; >+ >+ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6"); >+ if (ops->slab_name == NULL) { >+ ret = -ENOMEM; >+ goto out; >+ } >+ >+ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, >+ SLAB_HWCACHE_ALIGN, NULL); >+ >+ if (ops->slab == NULL) { >+ ret = -ENOMEM; >+ goto err_reqsk_create; >+ } >+ >+ ret = register_inet6addr_notifier(&mptcp_pm_inet6_addr_notifier); >+ if (ret) >+ goto err_reg_inet6addr; >+ ret = register_netdevice_notifier(&mptcp_pm_v6_netdev_notifier); >+ if (ret) >+ goto err_reg_netdev6; >+ >+out: >+ return ret; >+ >+err_reg_netdev6: >+ unregister_inet6addr_notifier(&mptcp_pm_inet6_addr_notifier); >+err_reg_inet6addr: >+ kmem_cache_destroy(ops->slab); >+err_reqsk_create: >+ kfree(ops->slab_name); >+ ops->slab_name = NULL; >+ goto out; >+} >+ >+void mptcp_pm_v6_undo(void) >+{ >+ kmem_cache_destroy(mptcp6_request_sock_ops.slab); >+ kfree(mptcp6_request_sock_ops.slab_name); >+ unregister_inet6addr_notifier(&mptcp_pm_inet6_addr_notifier); >+ unregister_netdevice_notifier(&mptcp_pm_v6_netdev_notifier); >+} >diff -Naur a/linux-3.11/net/mptcp/mptcp_ofo_queue.c b/linux-3.11/net/mptcp/mptcp_ofo_queue.c >--- a/linux-3.11/net/mptcp/mptcp_ofo_queue.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_ofo_queue.c 2013-10-05 18:34:49.272364663 +0200 >@@ -0,0 +1,278 @@ >+/* >+ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the 
License, or (at your option) any later version.
>+ */
>+
>+#include <linux/skbuff.h>
>+#include <linux/slab.h>
>+#include <net/tcp.h>
>+#include <net/mptcp.h>
>+
>+static void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
>+				   const struct sk_buff *skb)
>+{
>+	struct tcp_sock *tp;
>+
>+	mptcp_for_each_tp(mpcb, tp) {
>+		if (tp->mptcp->shortcut_ofoqueue == skb) {
>+			tp->mptcp->shortcut_ofoqueue = NULL;
>+			return;
>+		}
>+	}
>+}
>+
>+/* Does 'skb' fit after 'here' in the queue 'head'?
>+ * If yes, we queue it and return 1.
>+ */
>+static int mptcp_ofo_queue_after(struct sk_buff_head *head,
>+				 struct sk_buff *skb, struct sk_buff *here,
>+				 struct tcp_sock *tp)
>+{
>+	struct sock *meta_sk = tp->meta_sk;
>+	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
>+	u32 seq = TCP_SKB_CB(skb)->seq;
>+	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
>+
>+	/* We want to queue skb after here, thus seq >= here's end_seq */
>+	if (before(seq, TCP_SKB_CB(here)->end_seq))
>+		return 0;
>+
>+	if (seq == TCP_SKB_CB(here)->end_seq) {
>+		bool fragstolen = false;
>+
>+		if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
>+			__skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
>+			return 1;
>+		} else {
>+			kfree_skb_partial(skb, fragstolen);
>+			return -1;
>+		}
>+	}
>+
>+	/* If here is the last one, we can always queue it */
>+	if (skb_queue_is_last(head, here)) {
>+		__skb_queue_after(head, here, skb);
>+		return 1;
>+	} else {
>+		struct sk_buff *skb1 = skb_queue_next(head, here);
>+		/* It's not the last one, but does it fit between 'here' and
>+		 * the one after 'here'? Thus, does end_seq <= after_here->seq
>+		 */
>+		if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
>+			__skb_queue_after(head, here, skb);
>+			return 1;
>+		}
>+	}
>+
>+	return 0;
>+}
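All the ordering tests in mptcp_ofo_queue_after() go through the kernel's before()/after() helpers, which compare 32-bit sequence numbers modulo 2^32 so that wrap-around is handled for free. A stand-alone illustration of why the signed cast does the right thing (a minimal sketch, not the kernel's headers):

	#include <stdint.h>
	#include <stdio.h>

	/* Wrap-safe sequence comparison, same idea as the kernel's before():
	 * subtract modulo 2^32 and let the sign bit decide. */
	static int seq_before(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) < 0;
	}

	int main(void)
	{
		printf("%d\n", seq_before(10, 20));		/* 1 */
		printf("%d\n", seq_before(0xfffffff0u, 0x10));	/* 1: before, across the wrap */
		printf("%d\n", seq_before(0x10, 0xfffffff0u));	/* 0 */
		return 0;
	}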
>+
>+static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
>+			 struct sk_buff_head *head, struct tcp_sock *tp)
>+{
>+	struct sock *meta_sk = tp->meta_sk;
>+	struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
>+	struct mptcp_cb *mpcb = meta_tp->mpcb;
>+	struct sk_buff *skb1, *best_shortcut = NULL;
>+	u32 seq = TCP_SKB_CB(skb)->seq;
>+	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
>+	u32 distance = 0xffffffff;
>+
>+	/* First, check the tp's shortcut */
>+	if (!shortcut) {
>+		if (skb_queue_empty(head)) {
>+			__skb_queue_head(head, skb);
>+			goto end;
>+		}
>+	} else {
>+		int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
>+		/* Is the tp's shortcut a hit? If yes, we insert there. */
>+
>+		if (ret) {
>+			skb = (ret > 0) ? skb : NULL;
>+			goto end;
>+		}
>+	}
>+
>+	/* Check the shortcuts of the other subsockets. */
>+	mptcp_for_each_tp(mpcb, tp_it) {
>+		shortcut = tp_it->mptcp->shortcut_ofoqueue;
>+		/* Can we queue it here? If yes, do so! */
>+		if (shortcut) {
>+			int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
>+
>+			if (ret) {
>+				skb = (ret > 0) ? skb : NULL;
>+				goto end;
>+			}
>+		}
>+
>+		/* Could not queue it, check if we are close.
>+		 * We are looking for a shortcut, close enough to seq to
>+		 * set skb1 prematurely and thus improve the subsequent lookup,
>+		 * which tries to find a skb1 so that skb1->seq <= seq.
>+		 *
>+		 * So, here we only take shortcuts, whose shortcut->seq > seq,
>+		 * and minimize the distance between shortcut->seq and seq and
>+		 * set best_shortcut to this one with the minimal distance.
>+		 *
>+		 * That way, the subsequent while-loop is shortest.
>+		 */
>+		if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
>+			/* Are we closer than the current best shortcut? */
>+			if ((u32)(seq - TCP_SKB_CB(shortcut)->seq) < distance) {
>+				distance = (u32)(seq - TCP_SKB_CB(shortcut)->seq);
>+				best_shortcut = shortcut;
>+			}
>+		}
>+	}
>+
>+	if (best_shortcut)
>+		skb1 = best_shortcut;
>+	else
>+		skb1 = skb_peek_tail(head);
>+
>+	if (seq == TCP_SKB_CB(skb1)->end_seq) {
>+		bool fragstolen = false;
>+
>+		if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
>+			__skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
>+		} else {
>+			kfree_skb_partial(skb, fragstolen);
>+			skb = NULL;
>+		}
>+
>+		goto end;
>+	}
>+
>+	/* Find the insertion point, starting from best_shortcut if available.
>+	 *
>+	 * Inspired from tcp_data_queue_ofo.
>+	 */
>+	while (1) {
>+		/* skb1->seq <= seq */
>+		if (!after(TCP_SKB_CB(skb1)->seq, seq))
>+			break;
>+		if (skb_queue_is_first(head, skb1)) {
>+			skb1 = NULL;
>+			break;
>+		}
>+		skb1 = skb_queue_prev(head, skb1);
>+	}
>+
>+	/* Does skb overlap the previous one? */
>+	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
>+		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
>+			/* All the bits are present. */
>+			__kfree_skb(skb);
>+			skb = NULL;
>+			goto end;
>+		}
>+		if (seq == TCP_SKB_CB(skb1)->seq) {
>+			if (skb_queue_is_first(head, skb1))
>+				skb1 = NULL;
>+			else
>+				skb1 = skb_queue_prev(head, skb1);
>+		}
>+	}
>+	if (!skb1)
>+		__skb_queue_head(head, skb);
>+	else
>+		__skb_queue_after(head, skb1, skb);
>+
>+	/* And clean segments covered by new one as whole. */
>+	while (!skb_queue_is_last(head, skb)) {
>+		skb1 = skb_queue_next(head, skb);
>+
>+		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
>+			break;
>+
>+		__skb_unlink(skb1, head);
>+		mptcp_remove_shortcuts(mpcb, skb1);
>+		__kfree_skb(skb1);
>+	}
>+
>+end:
>+	if (skb) {
>+		skb_set_owner_r(skb, meta_sk);
>+		tp->mptcp->shortcut_ofoqueue = skb;
>+	}
>+
>+	return;
>+}
>+
>+/**
>+ * mptcp_add_meta_ofo_queue - add an out-of-order skb to the meta-level queue
>+ * @meta_sk: the meta-socket whose out-of-order queue is used
>+ * @skb: the out-of-order segment
>+ * @sk: the subflow that received this skb.
>+ */ >+void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb, >+ struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ try_shortcut(tp->mptcp->shortcut_ofoqueue, skb, >+ &tcp_sk(meta_sk)->out_of_order_queue, tp); >+} >+ >+void mptcp_ofo_queue(struct sock *meta_sk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sk_buff *skb; >+ >+ while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) { >+ u32 old_rcv_nxt = meta_tp->rcv_nxt; >+ if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt)) >+ break; >+ >+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) { >+ __skb_unlink(skb, &meta_tp->out_of_order_queue); >+ mptcp_remove_shortcuts(meta_tp->mpcb, skb); >+ __kfree_skb(skb); >+ continue; >+ } >+ >+ __skb_unlink(skb, &meta_tp->out_of_order_queue); >+ mptcp_remove_shortcuts(meta_tp->mpcb, skb); >+ >+ __skb_queue_tail(&meta_sk->sk_receive_queue, skb); >+ meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; >+ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); >+ >+ if (tcp_hdr(skb)->fin) >+ mptcp_fin(meta_sk); >+ } >+} >+ >+void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) >+{ >+ struct sk_buff_head *head = &meta_tp->out_of_order_queue; >+ struct sk_buff *skb, *tmp; >+ >+ skb_queue_walk_safe(head, skb, tmp) { >+ __skb_unlink(skb, head); >+ mptcp_remove_shortcuts(meta_tp->mpcb, skb); >+ kfree_skb(skb); >+ } >+} >diff -Naur a/linux-3.11/net/mptcp/mptcp_olia.c b/linux-3.11/net/mptcp/mptcp_olia.c >--- a/linux-3.11/net/mptcp/mptcp_olia.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_olia.c 2013-10-05 18:34:49.272364663 +0200 >@@ -0,0 +1,314 @@ >+/* >+ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL: >+ * >+ * Algorithm design: >+ * Ramin Khalili <ramin.khalili@epfl.ch> >+ * Nicolas Gast <nicolas.gast@epfl.ch> >+ * Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch> >+ * >+ * Implementation: >+ * Ramin Khalili <ramin.khalili@epfl.ch> >+ * >+ * Ported to the official MPTCP-kernel: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. 
>+ */ >+ >+ >+#include <net/tcp.h> >+#include <net/mptcp.h> >+ >+#include <linux/module.h> >+ >+static int scale = 10; >+ >+struct mptcp_olia { >+ u32 mptcp_loss1; >+ u32 mptcp_loss2; >+ u32 mptcp_loss3; >+ int epsilon_num; >+ u32 epsilon_den; >+ int mptcp_snd_cwnd_cnt; >+}; >+ >+static inline int mptcp_olia_sk_can_send(const struct sock *sk) >+{ >+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt; >+} >+ >+static inline u64 mptcp_olia_scale(u64 val, int scale) >+{ >+ return (u64) val << scale; >+} >+ >+/* take care of artificially inflate (see RFC5681) >+ * of cwnd during fast-retransmit phase >+ */ >+static u32 mptcp_get_crt_cwnd(struct sock *sk) >+{ >+ struct inet_connection_sock *icsk = inet_csk(sk); >+ >+ if (icsk->icsk_ca_state == TCP_CA_Recovery) >+ return tcp_sk(sk)->snd_ssthresh; >+ else >+ return tcp_sk(sk)->snd_cwnd; >+} >+ >+/* return the dominator of the first term of the increasing term */ >+static u64 mptcp_get_rate(struct mptcp_cb *mpcb , u32 path_rtt) >+{ >+ struct sock *sk; >+ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */ >+ >+ mptcp_for_each_sk(mpcb, sk) { >+ struct tcp_sock *tp = tcp_sk(sk); >+ u64 scaled_num; >+ u32 tmp_cwnd; >+ >+ if (!mptcp_olia_sk_can_send(sk)) >+ continue; >+ >+ tmp_cwnd = mptcp_get_crt_cwnd(sk); >+ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt; >+ rate += div_u64(scaled_num , tp->srtt); >+ } >+ rate *= rate; >+ return rate; >+} >+ >+/* find the maximum cwnd, used to find set M */ >+static u32 mptcp_get_max_cwnd(struct mptcp_cb *mpcb) >+{ >+ struct sock *sk; >+ u32 best_cwnd = 0; >+ >+ mptcp_for_each_sk(mpcb, sk) { >+ u32 tmp_cwnd; >+ >+ if (!mptcp_olia_sk_can_send(sk)) >+ continue; >+ >+ tmp_cwnd = mptcp_get_crt_cwnd(sk); >+ if (tmp_cwnd > best_cwnd) >+ best_cwnd = tmp_cwnd; >+ } >+ return best_cwnd; >+} >+ >+static void mptcp_get_epsilon(struct mptcp_cb *mpcb) >+{ >+ struct mptcp_olia *ca; >+ struct tcp_sock *tp; >+ struct sock *sk; >+ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1; >+ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd; >+ u8 M = 0, B_not_M = 0; >+ >+ /* TODO - integrate this in the following loop - we just want to iterate once */ >+ >+ max_cwnd = mptcp_get_max_cwnd(mpcb); >+ >+ /* find the best path */ >+ mptcp_for_each_sk(mpcb, sk) { >+ tp = tcp_sk(sk); >+ ca = inet_csk_ca(sk); >+ >+ if (!mptcp_olia_sk_can_send(sk)) >+ continue; >+ >+ tmp_rtt = tp->srtt * tp->srtt; >+ /* TODO - check here and rename variables */ >+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, >+ ca->mptcp_loss2 - ca->mptcp_loss1); >+ >+ tmp_cwnd = mptcp_get_crt_cwnd(sk); >+ if (tmp_int * best_rtt >= best_int * tmp_rtt) { >+ best_rtt = tmp_rtt; >+ best_int = tmp_int; >+ best_cwnd = tmp_cwnd; >+ } >+ } >+ >+ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */ >+ /* find the size of M and B_not_M */ >+ mptcp_for_each_sk(mpcb, sk) { >+ tp = tcp_sk(sk); >+ ca = inet_csk_ca(sk); >+ >+ if (!mptcp_olia_sk_can_send(sk)) >+ continue; >+ >+ tmp_cwnd = mptcp_get_crt_cwnd(sk); >+ if (tmp_cwnd == max_cwnd) { >+ M++; >+ } else { >+ tmp_rtt = tp->srtt * tp->srtt; >+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, >+ ca->mptcp_loss2 - ca->mptcp_loss1); >+ >+ if (tmp_int * best_rtt == best_int * tmp_rtt) >+ B_not_M++; >+ } >+ } >+ >+ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */ >+ mptcp_for_each_sk(mpcb, sk) { >+ tp = tcp_sk(sk); >+ ca = inet_csk_ca(sk); >+ >+ if (!mptcp_olia_sk_can_send(sk)) >+ continue; >+ >+ if (B_not_M == 0) { >+ ca->epsilon_num = 0; >+ 
ca->epsilon_den = 1; >+ } else { >+ tmp_rtt = tp->srtt * tp->srtt; >+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, >+ ca->mptcp_loss2 - ca->mptcp_loss1); >+ tmp_cwnd = mptcp_get_crt_cwnd(sk); >+ >+ if (tmp_cwnd < max_cwnd && >+ tmp_int * best_rtt == best_int * tmp_rtt){ >+ ca->epsilon_num = 1; >+ ca->epsilon_den = mpcb->cnt_established * B_not_M; >+ } else if (tmp_cwnd == max_cwnd) { >+ ca->epsilon_num = -1; >+ ca->epsilon_den = mpcb->cnt_established * M; >+ } else { >+ ca->epsilon_num = 0; >+ ca->epsilon_den = 1; >+ } >+ } >+ } >+ >+} >+ >+/* setting the initial values */ >+static void mptcp_olia_init(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct mptcp_olia *ca = inet_csk_ca(sk); >+ >+ if (tp->mpc) { >+ ca->mptcp_loss1 = tp->snd_una; >+ ca->mptcp_loss2 = tp->snd_una; >+ ca->mptcp_loss3 = tp->snd_una; >+ ca->mptcp_snd_cwnd_cnt = 0; >+ ca->epsilon_num = 0; >+ ca->epsilon_den = 1; >+ } >+} >+ >+/* updating inter-loss distance and ssthresh */ >+static void mptcp_olia_set_state(struct sock *sk, u8 new_state) >+{ >+ if (!tcp_sk(sk)->mpc) >+ return; >+ >+ if (new_state == TCP_CA_Loss || >+ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) { >+ struct mptcp_olia *ca = inet_csk_ca(sk); >+ >+ if (ca->mptcp_loss3 != ca->mptcp_loss2 && >+ !inet_csk(sk)->icsk_retransmits) { >+ ca->mptcp_loss1 = ca->mptcp_loss2; >+ ca->mptcp_loss2 = ca->mptcp_loss3; >+ } >+ } >+ >+} >+ >+/* main algorithm */ >+static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct mptcp_olia *ca = inet_csk_ca(sk); >+ struct mptcp_cb *mpcb = tp->mpcb; >+ >+ u64 inc_num, inc_den, rate, cwnd_scaled; >+ >+ if (!tp->mpc) { >+ tcp_reno_cong_avoid(sk, ack, in_flight); >+ return; >+ } >+ >+ ca->mptcp_loss3 = tp->snd_una; >+ >+ if (!tcp_is_cwnd_limited(sk, in_flight)) >+ return; >+ >+ /* slow start if it is in the safe area */ >+ if (tp->snd_cwnd <= tp->snd_ssthresh) { >+ tcp_slow_start(tp); >+ return; >+ } >+ >+ mptcp_get_epsilon(mpcb); >+ rate = mptcp_get_rate(mpcb, tp->srtt); >+ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale); >+ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? 
: 1;
>+
>+ /* calculate the increasing term; scaling is used to reduce the rounding effect */
>+ if (ca->epsilon_num == -1) {
>+ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
>+ inc_num = rate - ca->epsilon_den *
>+ cwnd_scaled * cwnd_scaled;
>+ ca->mptcp_snd_cwnd_cnt -= div64_u64(
>+ mptcp_olia_scale(inc_num , scale) , inc_den);
>+ } else {
>+ inc_num = ca->epsilon_den *
>+ cwnd_scaled * cwnd_scaled - rate;
>+ ca->mptcp_snd_cwnd_cnt += div64_u64(
>+ mptcp_olia_scale(inc_num , scale) , inc_den);
>+ }
>+ } else {
>+ inc_num = ca->epsilon_num * rate +
>+ ca->epsilon_den * cwnd_scaled * cwnd_scaled;
>+ ca->mptcp_snd_cwnd_cnt += div64_u64(
>+ mptcp_olia_scale(inc_num , scale) , inc_den);
>+ }
>+
>+
>+ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
>+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
>+ tp->snd_cwnd++;
>+ ca->mptcp_snd_cwnd_cnt = 0;
>+ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
>+ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
>+ ca->mptcp_snd_cwnd_cnt = 0;
>+ }
>+}
>+
>+static struct tcp_congestion_ops mptcp_olia = {
>+ .init = mptcp_olia_init,
>+ .ssthresh = tcp_reno_ssthresh,
>+ .cong_avoid = mptcp_olia_cong_avoid,
>+ .set_state = mptcp_olia_set_state,
>+ .min_cwnd = tcp_reno_min_cwnd,
>+ .owner = THIS_MODULE,
>+ .name = "olia",
>+};
>+
>+static int __init mptcp_olia_register(void)
>+{
>+ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
>+ return tcp_register_congestion_control(&mptcp_olia);
>+}
>+
>+static void __exit mptcp_olia_unregister(void)
>+{
>+ tcp_unregister_congestion_control(&mptcp_olia);
>+}
>+
>+module_init(mptcp_olia_register);
>+module_exit(mptcp_olia_unregister);
>+
>+MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
>+MODULE_LICENSE("GPL");
>+MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
>+MODULE_VERSION("0.1");
>diff -Naur a/linux-3.11/net/mptcp/mptcp_output.c b/linux-3.11/net/mptcp/mptcp_output.c
>--- a/linux-3.11/net/mptcp/mptcp_output.c 1970-01-01 01:00:00.000000000 +0100
>+++ b/linux-3.11/net/mptcp/mptcp_output.c 2013-10-05 18:34:49.275364626 +0200
>@@ -0,0 +1,2334 @@
>+/*
>+ * MPTCP implementation - Sending side
>+ *
>+ * Initial Design & Implementation:
>+ * Sébastien Barré <sebastien.barre@uclouvain.be>
>+ *
>+ * Current Maintainer & Author:
>+ * Christoph Paasch <christoph.paasch@uclouvain.be>
>+ *
>+ * Additional authors:
>+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
>+ * Gregory Detal <gregory.detal@uclouvain.be>
>+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
>+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
>+ * Lavkesh Lahngir <lavkesh51@gmail.com>
>+ * Andreas Ripke <ripke@neclab.eu>
>+ * Vlad Dogaru <vlad.dogaru@intel.com>
>+ * Octavian Purdila <octavian.purdila@intel.com>
>+ * John Ronan <jronan@tssg.org>
>+ * Catalin Nicutar <catalin.nicutar@gmail.com>
>+ * Brandon Heller <brandonh@stanford.edu>
>+ *
>+ *
>+ * This program is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU General Public License
>+ * as published by the Free Software Foundation; either version
>+ * 2 of the License, or (at your option) any later version.
>+ */
>+
>+#include <linux/kconfig.h>
>+#include <linux/skbuff.h>
>+#include <linux/tcp.h>
>+
>+#include <net/mptcp.h>
>+#include <net/mptcp_v4.h>
>+#include <net/mptcp_v6.h>
>+#include <net/sock.h>
>+
>+/* Is the sub-socket sk available to send the skb?
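
A note on the OLIA increase term computed above: it never moves snd_cwnd directly. The per-ACK increment, scaled by 2^scale to limit rounding error, accumulates in mptcp_snd_cwnd_cnt, and the window changes by one full segment only once a whole unit has built up in either direction. A minimal standalone model of that fixed-point accumulator (struct and function names invented for illustration):

    #include <stdint.h>

    #define SCALE 10   /* the same shift the patch uses */

    struct olia_acc {
            int64_t  cnt;        /* scaled, signed accumulator */
            uint32_t cwnd;
            uint32_t cwnd_clamp;
    };

    /* delta is the signed per-ACK increase term, already scaled by
     * 2^SCALE; cwnd moves by one segment per accumulated unit.
     */
    static void olia_acc_update(struct olia_acc *a, int64_t delta)
    {
            a->cnt += delta;

            if (a->cnt >= (1 << SCALE) - 1) {
                    if (a->cwnd < a->cwnd_clamp)
                            a->cwnd++;
                    a->cnt = 0;
            } else if (a->cnt <= -((1 << SCALE) - 1)) {
                    if (a->cwnd > 1)
                            a->cwnd--;
                    a->cnt = 0;
            }
    }
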
*/
>+static int mptcp_is_available(struct sock *sk, struct sk_buff *skb,
>+ unsigned int *mss)
>+{
>+ struct tcp_sock *tp = tcp_sk(sk);
>+ unsigned int mss_now;
>+
>+ /* Set of states for which we are allowed to send data */
>+ if (!mptcp_sk_can_send(sk))
>+ return 0;
>+
>+ /* We do not send data on this subflow unless it is
>+ * fully established, i.e. the 4th ack has been received.
>+ */
>+ if (tp->mptcp->pre_established)
>+ return 0;
>+
>+ if (tp->pf ||
>+ (tp->mpcb->noneligible & mptcp_pi_to_flag(tp->mptcp->path_index)))
>+ return 0;
>+
>+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
>+ /* If SACK is disabled, and we got a loss, TCP does not exit
>+ * the loss-state until something above high_seq has been acked.
>+ * (see tcp_try_undo_recovery)
>+ *
>+ * high_seq is the snd_nxt at the moment of the RTO. As soon
>+ * as we have an RTO, we won't push data on the subflow.
>+ * Thus, snd_una can never go beyond high_seq.
>+ */
>+ if (!tcp_is_reno(tp))
>+ return 0;
>+ else if (tp->snd_una != tp->high_seq)
>+ return 0;
>+ }
>+
>+ if (!tp->mptcp->fully_established) {
>+ /* Make sure that we send in-order data */
>+ if (skb && tp->mptcp->second_packet &&
>+ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
>+ return 0;
>+ }
>+
>+ if (!tcp_cwnd_test(tp, skb))
>+ return 0;
>+
>+ mss_now = tcp_current_mss(sk);
>+ /* Don't send on this subflow if we bypass the allowed send-window at
>+ * the per-subflow level. Similar to tcp_snd_wnd_test, but manually
>+ * calculated end_seq (because here at this point end_seq is still at
>+ * the meta-level).
>+ */
>+ if (skb && after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
>+ return 0;
>+
>+ if (mss)
>+ *mss = mss_now;
>+
>+ return 1;
>+}
>+
>+/* Are we not allowed to reinject this skb on tp? */
>+static int mptcp_dont_reinject_skb(struct tcp_sock *tp, struct sk_buff *skb)
>+{
>+ /* If the skb has already been enqueued in this sk, try to find
>+ * another one.
>+ * An exception is a DATA_FIN without data. These ones are not
>+ * reinjected at the subflow-level as they do not consume
>+ * subflow-sequence-number space.
>+ */
>+ return skb &&
>+ /* We either have a data_fin with data or not a data_fin */
>+ ((mptcp_is_data_fin(skb) && TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq > 1) ||
>+ !mptcp_is_data_fin(skb)) &&
>+ /* Has the skb already been enqueued into this subsocket? */
>+ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
>+}
>+
>+/* This is the scheduler. This function decides on which flow to send
>+ * a given MSS. If all subflows are found to be busy, NULL is returned.
>+ * The flow is selected based on the shortest RTT.
>+ * If all paths have full cong windows, we simply return NULL.
>+ *
>+ * Additionally, this function is aware of the backup-subflows.
>+ */
>+static struct sock *get_available_subflow(struct sock *meta_sk,
>+ struct sk_buff *skb,
>+ unsigned int *mss_now)
>+{
>+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
>+ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
>+ unsigned int mss = 0, mss_lowprio = 0, mss_backup = 0;
>+ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
>+ int cnt_backups = 0;
>+
>+ /* if there is only one subflow, bypass the scheduling function */
>+ if (mpcb->cnt_subflows == 1) {
>+ bestsk = (struct sock *)mpcb->connection_list;
>+ if (!mptcp_is_available(bestsk, skb, mss_now))
>+ bestsk = NULL;
>+ return bestsk;
>+ }
>+
>+ /* Answer data_fin on same subflow!!!
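
mptcp_dont_reinject_skb() above depends on each subflow owning one bit in the per-skb path_mask. A sketch of that bookkeeping, assuming mptcp_pi_to_flag() maps path_index i to bit i - 1 (the helper's definition is not visible in this hunk):

    #include <stdint.h>

    /* assumed mapping: path_index i occupies bit i - 1 */
    static inline uint32_t pi_to_flag(int path_index)
    {
            return 1u << (path_index - 1);
    }

    /* Mirrors the final test in mptcp_dont_reinject_skb(): skip a
     * segment on a subflow once its bit is already set in path_mask.
     */
    static int already_sent_on(uint32_t path_mask, int path_index)
    {
            return (path_mask & pi_to_flag(path_index)) != 0;
    }

    /* e.g. after sending on subflows 1 and 3, path_mask == 0x5 */
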
*/ >+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && >+ skb && mptcp_is_data_fin(skb)) { >+ mptcp_for_each_sk(mpcb, sk) { >+ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && >+ mptcp_is_available(sk, skb, mss_now)) >+ return sk; >+ } >+ } >+ >+ /* First, find the best subflow */ >+ mptcp_for_each_sk(mpcb, sk) { >+ struct tcp_sock *tp = tcp_sk(sk); >+ int this_mss; >+ >+ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) >+ cnt_backups++; >+ >+ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && >+ tp->srtt < lowprio_min_time_to_peer) { >+ >+ if (!mptcp_is_available(sk, skb, &this_mss)) >+ continue; >+ >+ if (mptcp_dont_reinject_skb(tp, skb)) { >+ mss_backup = this_mss; >+ backupsk = sk; >+ continue; >+ } >+ >+ lowprio_min_time_to_peer = tp->srtt; >+ lowpriosk = sk; >+ mss_lowprio = this_mss; >+ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && >+ tp->srtt < min_time_to_peer) { >+ if (!mptcp_is_available(sk, skb, &this_mss)) >+ continue; >+ >+ if (mptcp_dont_reinject_skb(tp, skb)) { >+ mss_backup = this_mss; >+ backupsk = sk; >+ continue; >+ } >+ >+ min_time_to_peer = tp->srtt; >+ bestsk = sk; >+ mss = this_mss; >+ } >+ } >+ >+ if (mpcb->cnt_established == cnt_backups && lowpriosk) { >+ mss = mss_lowprio; >+ sk = lowpriosk; >+ } else if (bestsk) { >+ sk = bestsk; >+ } else if (backupsk){ >+ /* It has been sent on all subflows once - let's give it a >+ * chance again by restarting its pathmask. >+ */ >+ if (skb) >+ TCP_SKB_CB(skb)->path_mask = 0; >+ mss = mss_backup; >+ sk = backupsk; >+ } >+ >+ if (mss_now) >+ *mss_now = mss; >+ >+ return sk; >+} >+ >+static struct mp_dss *mptcp_skb_find_dss(const struct sk_buff *skb) >+{ >+ if (!mptcp_is_data_seq(skb)) >+ return NULL; >+ >+ return (struct mp_dss *)(skb->data - (MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN)); >+} >+ >+/* get the data-seq and end-data-seq and store them again in the >+ * tcp_skb_cb >+ */ >+static int mptcp_reconstruct_mapping(struct sk_buff *skb, struct sk_buff *orig_skb) >+{ >+ struct mp_dss *mpdss = mptcp_skb_find_dss(orig_skb); >+ u32 *p32; >+ u16 *p16; >+ >+ if (!mpdss || !mpdss->M) >+ return 1; >+ >+ /* Move the pointer to the data-seq */ >+ p32 = (u32 *)mpdss; >+ p32++; >+ if (mpdss->A) { >+ p32++; >+ if (mpdss->a) >+ p32++; >+ } >+ >+ TCP_SKB_CB(skb)->seq = ntohl(*p32); >+ >+ /* Get the data_len to calculate the end_data_seq */ >+ p32++; >+ p32++; >+ p16 = (u16 *)p32; >+ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq; >+ >+ return 0; >+} >+ >+/* Similar to __pskb_copy and sk_stream_alloc_skb. */ >+static struct sk_buff *mptcp_pskb_copy(struct sk_buff *skb) >+{ >+ struct sk_buff *n; >+ /* The TCP header must be at least 32-bit aligned. 
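
mptcp_reconstruct_mapping() above recovers the data-level mapping by walking the DSS option sitting just in front of skb->data. A userspace sketch of the same pointer walk for the common 32-bit case, with the layout inferred from the mp_dss usage in this patch (no checksum handling, alignment assumed):

    #include <stdint.h>
    #include <arpa/inet.h>

    struct dss_view {
            uint32_t data_seq;
            uint16_t data_len;
    };

    /* p32 points at the 4-byte DSS header word (kind/len/sub/flags);
     * has_ack says whether a 32-bit data_ack precedes the mapping.
     */
    static void parse_dss(const uint32_t *p32, int has_ack,
                          struct dss_view *out)
    {
            p32++;                  /* skip the header word */
            if (has_ack)
                    p32++;          /* skip the 32-bit data_ack */

            out->data_seq = ntohl(*p32);
            p32 += 2;               /* skip data_seq and subflow seq */
            out->data_len = ntohs(*(const uint16_t *)p32);
    }
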
*/ >+ int size = ALIGN(skb_headlen(skb), 4); >+ >+ n = alloc_skb_fclone(size + MAX_TCP_HEADER, GFP_ATOMIC); >+ if (!n) >+ return NULL; >+ >+ /* Set the data pointer */ >+ skb_reserve(n, MAX_TCP_HEADER); >+ /* Set the tail pointer and length */ >+ skb_put(n, skb_headlen(skb)); >+ /* Copy the bytes */ >+ skb_copy_from_linear_data(skb, n->data, n->len); >+ >+ n->truesize += skb->data_len; >+ n->data_len = skb->data_len; >+ n->len = skb->len; >+ >+ if (skb_shinfo(skb)->nr_frags) { >+ int i; >+ >+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { >+ if (skb_copy_ubufs(skb, GFP_ATOMIC)) { >+ kfree_skb(n); >+ n = NULL; >+ goto out; >+ } >+ } >+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { >+ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; >+ skb_frag_ref(skb, i); >+ } >+ skb_shinfo(n)->nr_frags = i; >+ } >+ >+ if (skb_has_frag_list(skb)) { >+ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; >+ skb_clone_fraglist(n); >+ } >+ >+ copy_skb_header(n, skb); >+out: >+ return n; >+} >+ >+/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are >+ * coming from the meta-retransmit-timer >+ */ >+static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk, >+ struct sock *sk, int clone_it) >+{ >+ struct sk_buff *skb, *skb1; >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ u32 seq, end_seq; >+ >+ if (clone_it) { >+ /* pskb_copy is necessary here, because the TCP/IP-headers >+ * will be changed when it's going to be reinjected on another >+ * subflow. >+ */ >+ skb = mptcp_pskb_copy(orig_skb); >+ } else { >+ __skb_unlink(orig_skb, &sk->sk_write_queue); >+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); >+ sk->sk_wmem_queued -= orig_skb->truesize; >+ sk_mem_uncharge(sk, orig_skb->truesize); >+ skb = orig_skb; >+ } >+ if (unlikely(!skb)) >+ return; >+ >+ if (sk && mptcp_reconstruct_mapping(skb, orig_skb)) { >+ __kfree_skb(skb); >+ return; >+ } >+ >+ skb->sk = meta_sk; >+ >+ /* If it reached already the destination, we don't have to reinject it */ >+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { >+ __kfree_skb(skb); >+ return; >+ } >+ >+ /* Only reinject segments that are fully covered by the mapping */ >+ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) != >+ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { >+ u32 seq = TCP_SKB_CB(skb)->seq; >+ u32 end_seq = TCP_SKB_CB(skb)->end_seq; >+ >+ __kfree_skb(skb); >+ >+ /* Ok, now we have to look for the full mapping in the meta >+ * send-queue :S >+ */ >+ tcp_for_write_queue(skb, meta_sk) { >+ /* Not yet at the mapping? */ >+ if (before(TCP_SKB_CB(skb)->seq, seq)) >+ continue; >+ /* We have passed by the mapping */ >+ if (after(TCP_SKB_CB(skb)->end_seq, end_seq)) >+ return; >+ >+ __mptcp_reinject_data(skb, meta_sk, NULL, 1); >+ } >+ return; >+ } >+ >+ /* If it's empty, just add */ >+ if (skb_queue_empty(&mpcb->reinject_queue)) { >+ skb_queue_head(&mpcb->reinject_queue, skb); >+ return; >+ } >+ >+ /* Find place to insert skb - or even we can 'drop' it, as the >+ * data is already covered by other skb's in the reinject-queue. >+ * >+ * This is inspired by code from tcp_data_queue. >+ */ >+ >+ skb1 = skb_peek_tail(&mpcb->reinject_queue); >+ seq = TCP_SKB_CB(skb)->seq; >+ while (1) { >+ if (!after(TCP_SKB_CB(skb1)->seq, seq)) >+ break; >+ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) { >+ skb1 = NULL; >+ break; >+ } >+ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); >+ } >+ >+ /* Do skb overlap to previous one? 
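
The insertion code around this point keeps the reinject queue sorted by sequence number and free of fully-covered segments, following the same pattern as the ofo-queue code earlier in the patch. A toy, array-based model of the invariant (plain comparisons stand in for the wrap-safe before()/after()):

    #include <stdint.h>

    #define MAXSEG 64   /* toy bound: callers keep n < MAXSEG */

    struct seg { uint32_t seq, end_seq; };

    static int covered_by(struct seg inner, struct seg outer)
    {
            return outer.seq <= inner.seq && inner.end_seq <= outer.end_seq;
    }

    /* Insert s into the sorted array q[0..n); returns the new count.
     * Drops s when an existing segment covers it, and removes any
     * successors that s covers as a whole.
     */
    static int insert_seg(struct seg *q, int n, struct seg s)
    {
            struct seg out[MAXSEG];
            int i = 0, m = 0;

            while (i < n && q[i].seq <= s.seq) {   /* keep predecessors */
                    if (covered_by(s, q[i]))
                            return n;              /* all bits present */
                    out[m++] = q[i++];
            }
            out[m++] = s;                          /* insert new segment */
            while (i < n && covered_by(q[i], s))   /* drop covered ones */
                    i++;
            while (i < n)
                    out[m++] = q[i++];
            for (i = 0; i < m; i++)
                    q[i] = out[i];
            return m;
    }
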
*/ >+ end_seq = TCP_SKB_CB(skb)->end_seq; >+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { >+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { >+ /* All the bits are present. Don't reinject */ >+ __kfree_skb(skb); >+ return; >+ } >+ if (seq == TCP_SKB_CB(skb1)->seq) { >+ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) >+ skb1 = NULL; >+ else >+ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); >+ } >+ } >+ if (!skb1) >+ __skb_queue_head(&mpcb->reinject_queue, skb); >+ else >+ __skb_queue_after(&mpcb->reinject_queue, skb1, skb); >+ >+ /* And clean segments covered by new one as whole. */ >+ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) { >+ skb1 = skb_queue_next(&mpcb->reinject_queue, skb); >+ >+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) >+ break; >+ >+ __skb_unlink(skb1, &mpcb->reinject_queue); >+ __kfree_skb(skb1); >+ } >+ return; >+} >+ >+/* Inserts data into the reinject queue */ >+void mptcp_reinject_data(struct sock *sk, int clone_it) >+{ >+ struct sk_buff *skb_it, *tmp; >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct sock *meta_sk = tp->meta_sk; >+ >+ /* It has already been closed - there is really no point in reinjecting */ >+ if (meta_sk->sk_state == TCP_CLOSE) >+ return; >+ >+ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { >+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); >+ /* Subflow syn's and fin's are not reinjected. >+ * >+ * As well as empty subflow-fins with a data-fin. >+ * They are reinjected below (without the subflow-fin-flag) >+ */ >+ if (tcb->tcp_flags & TCPHDR_SYN || >+ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || >+ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) >+ continue; >+ >+ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it); >+ } >+ >+ skb_it = tcp_write_queue_tail(meta_sk); >+ /* If sk has sent the empty data-fin, we have to reinject it too. */ >+ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && >+ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) { >+ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1); >+ } >+ >+ mptcp_push_pending_frames(meta_sk); >+ >+ tp->pf = 1; >+} >+ >+ >+static void mptcp_combine_dfin(struct sk_buff *skb, struct sock *meta_sk, >+ struct sock *subsk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ struct sock *sk_it; >+ int all_empty = 1, all_acked; >+ >+ /* In infinite mapping we always try to combine */ >+ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) { >+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; >+ return; >+ } >+ >+ /* Don't combine, if they didn't combine - otherwise we end up in >+ * TIME_WAIT, even if our app is smart enough to avoid it >+ */ >+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { >+ if (!mpcb->dfin_combined) >+ return; >+ } >+ >+ /* If no other subflow has data to send, we can combine */ >+ mptcp_for_each_sk(mpcb, sk_it) { >+ if (!mptcp_sk_can_send(sk_it)) >+ continue; >+ >+ if (!tcp_write_queue_empty(sk_it)) >+ all_empty = 0; >+ } >+ >+ /* If all data has been DATA_ACKed, we can combine. 
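
The all-acked test just below relies on the DATA_FIN consuming one byte of data-sequence space, so once the FIN is queued, write_seq points one past the last payload byte plus that extra byte. As a one-function sketch:

    #include <stdint.h>

    /* true when every payload byte below the queued DATA_FIN has been
     * DATA_ACKed; write_seq already counts the FIN's one byte
     */
    static int all_data_acked(uint32_t snd_una, uint32_t write_seq)
    {
            return snd_una == write_seq - 1;
    }
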
>+ * -1, because the data_fin consumed one byte >+ */ >+ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1)); >+ >+ if ((all_empty || all_acked) && tcp_close_state(subsk)) >+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; >+} >+ >+static struct sk_buff *mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, >+ int reinject) >+{ >+ __be32 *ptr; >+ __u16 data_len; >+ struct mp_dss *mdss; >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ struct mptcp_cb *mpcb = tp->mpcb; >+ struct tcp_skb_cb *tcb; >+ struct sk_buff *subskb = NULL; >+ >+ if (!reinject) >+ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? >+ MPTCPHDR_SEQ64_INDEX : 0); >+ >+ subskb = mptcp_pskb_copy(skb); >+ if (!subskb) >+ return NULL; >+ >+ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); >+ >+ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) && >+ skb->ip_summed == CHECKSUM_PARTIAL) { >+ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0); >+ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE; >+ } >+ >+ /* The subskb is going in the subflow send-queue. Its path-mask >+ * is not needed anymore and MUST be set to 0, as the path-mask >+ * is a union with inet_skb_param. >+ */ >+ tcb = TCP_SKB_CB(subskb); >+ tcb->path_mask = 0; >+ >+ if (mptcp_is_data_fin(subskb)) >+ mptcp_combine_dfin(subskb, meta_sk, sk); >+ >+ if (tp->mpcb->infinite_mapping_snd) >+ goto no_data_seq; >+ >+ if (tp->mpcb->send_infinite_mapping && >+ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) { >+ tp->mptcp->fully_established = 1; >+ tp->mpcb->infinite_mapping_snd = 1; >+ tp->mptcp->infinite_cutoff_seq = tp->write_seq; >+ tcb->mptcp_flags |= MPTCPHDR_INF; >+ data_len = 0; >+ } else { >+ data_len = tcb->end_seq - tcb->seq; >+ } >+ >+ /**** Write MPTCP DSS-option to the packet. ****/ >+ ptr = (__be32 *)(subskb->data - (MPTCP_SUB_LEN_DSS_ALIGN + >+ MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN)); >+ >+ /* Then we start writing it from the start */ >+ mdss = (struct mp_dss *)ptr; >+ >+ mdss->kind = TCPOPT_MPTCP; >+ mdss->sub = MPTCP_SUB_DSS; >+ mdss->rsv1 = 0; >+ mdss->rsv2 = 0; >+ mdss->F = (mptcp_is_data_fin(subskb) ? 1 : 0); >+ mdss->m = 0; >+ mdss->M = 1; >+ mdss->a = 0; >+ mdss->A = 1; >+ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); >+ >+ ptr++; >+ ptr++; /* data_ack will be set in mptcp_options_write */ >+ *ptr++ = htonl(tcb->seq); /* data_seq */ >+ >+ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */ >+ if (mptcp_is_data_fin(subskb) && subskb->len == 0) >+ *ptr++ = 0; /* subseq */ >+ else >+ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */ >+ >+ if (tp->mpcb->dss_csum && data_len) { >+ __be16 *p16 = (__be16 *)ptr; >+ __be32 hdseq = mptcp_get_highorder_sndbits(subskb, tp->mpcb); >+ __wsum csum; >+ *ptr = htonl(((data_len) << 16) | >+ (TCPOPT_EOL << 8) | >+ (TCPOPT_EOL)); >+ >+ csum = csum_partial(ptr - 2, 12, subskb->csum); >+ p16++; >+ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum)); >+ } else { >+ *ptr++ = htonl(((data_len) << 16) | >+ (TCPOPT_NOP << 8) | >+ (TCPOPT_NOP)); >+ } >+ >+no_data_seq: >+ tcb->seq = tp->write_seq; >+ tcb->sacked = 0; /* reset the sacked field: from the point of view >+ * of this subflow, we are sending a brand new >+ * segment */ >+ /* Take into account seg len */ >+ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 
1 : 0); >+ tcb->end_seq = tp->write_seq; >+ >+ /* If it's a non-payload DATA_FIN (also no subflow-fin), the >+ * segment is not part of the subflow but on a meta-only-level >+ */ >+ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) { >+ tcp_add_write_queue_tail(sk, subskb); >+ sk->sk_wmem_queued += subskb->truesize; >+ sk_mem_charge(sk, subskb->truesize); >+ } >+ >+ return subskb; >+} >+ >+static void mptcp_sub_event_new_data_sent(struct sock *sk, >+ struct sk_buff *subskb, >+ struct sk_buff *skb) >+{ >+ /* If it's a non-payload DATA_FIN (also no subflow-fin), the >+ * segment is not part of the subflow but on a meta-only-level >+ * >+ * We free it, because it has been queued nowhere. >+ */ >+ if (!mptcp_is_data_fin(subskb) || >+ (TCP_SKB_CB(subskb)->end_seq != TCP_SKB_CB(subskb)->seq)) { >+ tcp_event_new_data_sent(sk, subskb); >+ tcp_sk(sk)->mptcp->second_packet = 1; >+ tcp_sk(sk)->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; >+ } else { >+ kfree_skb(subskb); >+ } >+} >+ >+/* Handle the packets and sockets after a tcp_transmit_skb failed */ >+static void mptcp_transmit_skb_failed(struct sock *sk, struct sk_buff *skb, >+ struct sk_buff *subskb, int reinject) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct mptcp_cb *mpcb = tp->mpcb; >+ >+ /* No work to do if we are in infinite mapping mode >+ * There is only one subflow left and we cannot send this segment on >+ * another subflow. >+ */ >+ if (mpcb->infinite_mapping_snd) >+ return; >+ >+ TCP_SKB_CB(skb)->path_mask &= ~mptcp_pi_to_flag(tp->mptcp->path_index); >+ >+ if (TCP_SKB_CB(subskb)->tcp_flags & TCPHDR_FIN) { >+ /* If it is a subflow-fin we must leave it on the >+ * subflow-send-queue, so that the probe-timer >+ * can retransmit it. >+ */ >+ if (!tp->packets_out && !inet_csk(sk)->icsk_pending) >+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, >+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX); >+ } else if (mptcp_is_data_fin(subskb) && >+ TCP_SKB_CB(subskb)->end_seq == TCP_SKB_CB(subskb)->seq) { >+ /* An empty data-fin has not been enqueued on the subflow >+ * and thus we free it. >+ */ >+ >+ kfree_skb(subskb); >+ } else { >+ /* In all other cases we remove it from the sub-queue. >+ * Other subflows may send it, or the probe-timer will >+ * handle it. >+ */ >+ tcp_advance_send_head(sk, subskb); >+ >+ /* tcp_add_write_queue_tail initialized highest_sack. We have >+ * to reset it, if necessary. >+ */ >+ if (tp->highest_sack == subskb) >+ tp->highest_sack = NULL; >+ >+ tcp_unlink_write_queue(subskb, sk); >+ tp->write_seq -= subskb->len; >+ sk_wmem_free_skb(sk, subskb); >+ } >+} >+ >+/* Function to create two new TCP segments. Shrinks the given segment >+ * to the specified size and appends a new segment with the rest of the >+ * packet to the list. This won't be called frequently, I hope. >+ * Remember, these are still headerless SKBs at this point. >+ */ >+int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, >+ unsigned int mss_now, int reinject) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct sk_buff *buff; >+ int nsize, old_factor; >+ int nlen; >+ u8 flags; >+ int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN; >+ char dss[dsslen]; >+ >+ if (WARN_ON(len > skb->len)) >+ return -EINVAL; >+ >+ /* DSS-option must be recovered afterwards. 
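
mptcp_fragment() below has to carry the DSS bytes that live immediately in front of skb->data across operations that may reallocate the head or create a second skb. The pattern, reduced to a sketch (DSSLEN is a hypothetical stand-in for the sum of the three aligned sub-lengths the patch computes):

    #include <string.h>

    #define DSSLEN 20   /* hypothetical total option length */

    static void save_dss(const unsigned char *data, unsigned char *dss)
    {
            memcpy(dss, data - DSSLEN, DSSLEN);  /* option precedes data */
    }

    static void restore_dss(unsigned char *data, const unsigned char *dss)
    {
            memcpy(data - DSSLEN, dss, DSSLEN);
    }
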
*/ >+ if (!is_meta_sk(sk)) >+ memcpy(dss, skb->data - dsslen, dsslen); >+ >+ nsize = skb_headlen(skb) - len; >+ if (nsize < 0) >+ nsize = 0; >+ >+ if (skb_cloned(skb) && >+ skb_is_nonlinear(skb)) { >+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) >+ return -ENOMEM; >+ /* Recover dss-option */ >+ if (!is_meta_sk(sk)) >+ memcpy(skb->data - dsslen, dss, dsslen); >+ } >+ >+ /* Get a new skb... force flag on. */ >+ buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); >+ if (buff == NULL) >+ return -ENOMEM; /* We'll just try again later. */ >+ >+ /* See below - if reinject == 1, the buff will be added to the reinject- >+ * queue, which is currently not part of the memory-accounting. >+ */ >+ if (reinject != 1) { >+ sk->sk_wmem_queued += buff->truesize; >+ sk_mem_charge(sk, buff->truesize); >+ } >+ nlen = skb->len - len - nsize; >+ buff->truesize += nlen; >+ skb->truesize -= nlen; >+ >+ /* Correct the sequence numbers. */ >+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; >+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; >+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; >+ >+ /* PSH and FIN should only be set in the second packet. */ >+ flags = TCP_SKB_CB(skb)->tcp_flags; >+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); >+ TCP_SKB_CB(buff)->tcp_flags = flags; >+ TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; >+ >+ flags = TCP_SKB_CB(skb)->mptcp_flags; >+ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); >+ TCP_SKB_CB(buff)->mptcp_flags = flags; >+ >+ if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { >+ /* Copy and checksum data tail into the new buffer. */ >+ buff->csum = csum_partial_copy_nocheck(skb->data + len, >+ skb_put(buff, nsize), >+ nsize, 0); >+ >+ skb_trim(skb, len); >+ >+ skb->csum = csum_block_sub(skb->csum, buff->csum, len); >+ } else { >+ skb->ip_summed = CHECKSUM_PARTIAL; >+ skb_split(skb, buff, len); >+ } >+ >+ /* We lost the dss-option when creating buff - put it back! */ >+ if (!is_meta_sk(sk)) >+ memcpy(buff->data - dsslen, dss, dsslen); >+ >+ buff->ip_summed = skb->ip_summed; >+ >+ /* Looks stupid, but our code really uses when of >+ * skbs, which it never sent before. --ANK >+ */ >+ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; >+ buff->tstamp = skb->tstamp; >+ >+ old_factor = tcp_skb_pcount(skb); >+ >+ /* Fix up tso_factor for both original and new SKB. */ >+ tcp_set_skb_tso_segs(sk, skb, mss_now); >+ tcp_set_skb_tso_segs(sk, buff, mss_now); >+ >+ /* If this packet has been sent out already, we must >+ * adjust the various packet counters. >+ */ >+ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) { >+ int diff = old_factor - tcp_skb_pcount(skb) - >+ tcp_skb_pcount(buff); >+ >+ if (diff) >+ tcp_adjust_pcount(sk, skb, diff); >+ } >+ >+ /* Link BUFF into the send queue. */ >+ skb_header_release(buff); >+ if (reinject == 1) >+ __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff); >+ else >+ tcp_insert_write_queue_after(skb, buff, sk); >+ >+ return 0; >+} >+ >+int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, >+ unsigned int mss_now, gfp_t gfp, int reinject) >+{ >+ struct sk_buff *buff; >+ int nlen = skb->len - len, old_factor; >+ u8 flags; >+ int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN; >+ >+ /* All of a TSO frame must be composed of paged data. 
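
The sequence bookkeeping above makes the two halves of a split tile the original [seq, end_seq) range exactly: the tail inherits end_seq, and the head is cut back to where the tail begins. A worked check in standalone C:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t seq = 1000, end_seq = 2400, len = 1000;

            uint32_t buff_seq     = seq + len;   /* tail starts at 2000 */
            uint32_t buff_end_seq = end_seq;     /* tail keeps 2400 */
            uint32_t skb_end_seq  = buff_seq;    /* head is [1000, 2000) */

            assert(skb_end_seq - seq == len);
            assert(buff_end_seq - buff_seq == (end_seq - seq) - len);
            return 0;
    }
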
*/ >+ if (skb->len != skb->data_len) >+ return mptcp_fragment(sk, skb, len, mss_now, reinject); >+ >+ buff = sk_stream_alloc_skb(sk, 0, gfp); >+ if (unlikely(buff == NULL)) >+ return -ENOMEM; >+ >+ /* See below - if reinject == 1, the buff will be added to the reinject- >+ * queue, which is currently not part of the memory-accounting. >+ */ >+ if (reinject != 1) { >+ sk->sk_wmem_queued += buff->truesize; >+ sk_mem_charge(sk, buff->truesize); >+ } >+ buff->truesize += nlen; >+ skb->truesize -= nlen; >+ >+ /* Correct the sequence numbers. */ >+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; >+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; >+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; >+ >+ /* PSH and FIN should only be set in the second packet. */ >+ flags = TCP_SKB_CB(skb)->tcp_flags; >+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); >+ TCP_SKB_CB(buff)->tcp_flags = flags; >+ >+ flags = TCP_SKB_CB(skb)->mptcp_flags; >+ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); >+ TCP_SKB_CB(buff)->mptcp_flags = flags; >+ >+ /* This packet was never sent out yet, so no SACK bits. */ >+ TCP_SKB_CB(buff)->sacked = 0; >+ >+ buff->ip_summed = CHECKSUM_PARTIAL; >+ skb->ip_summed = CHECKSUM_PARTIAL; >+ skb_split(skb, buff, len); >+ >+ /* We lost the dss-option when creating buff - put it back! */ >+ if (!is_meta_sk(sk)) >+ memcpy(buff->data - dsslen, skb->data - dsslen, dsslen); >+ >+ old_factor = tcp_skb_pcount(skb); >+ >+ /* Fix up tso_factor for both original and new SKB. */ >+ tcp_set_skb_tso_segs(sk, skb, mss_now); >+ tcp_set_skb_tso_segs(sk, buff, mss_now); >+ >+ /* If this packet has been sent out already, we must >+ * adjust the various packet counters. >+ */ >+ if (!before(tcp_sk(sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) { >+ int diff = old_factor - tcp_skb_pcount(skb) - >+ tcp_skb_pcount(buff); >+ >+ if (diff) >+ tcp_adjust_pcount(sk, skb, diff); >+ } >+ >+ /* Link BUFF into the send queue. 
*/ >+ skb_header_release(buff); >+ if (reinject == 1) >+ __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff); >+ else >+ tcp_insert_write_queue_after(skb, buff, sk); >+ >+ return 0; >+} >+ >+/* Inspired by tcp_write_wakeup */ >+int mptcp_write_wakeup(struct sock *meta_sk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sk_buff *skb, *subskb; >+ >+ skb = tcp_send_head(meta_sk); >+ if (skb && >+ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) { >+ int err; >+ unsigned int mss; >+ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; >+ struct sock *subsk = get_available_subflow(meta_sk, skb, &mss); >+ if (!subsk) >+ return -1; >+ >+ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) >+ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; >+ >+ /* We are probing the opening of a window >+ * but the window size is != 0 >+ * must have been a result SWS avoidance ( sender ) >+ */ >+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || >+ skb->len > mss) { >+ seg_size = min(seg_size, mss); >+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; >+ if (mptcp_fragment(meta_sk, skb, seg_size, mss, 0)) >+ return -1; >+ } else if (!tcp_skb_pcount(skb)) { >+ tcp_set_skb_tso_segs(meta_sk, skb, mss); >+ } >+ >+ subskb = mptcp_skb_entail(subsk, skb, 0); >+ if (!subskb) >+ return -1; >+ >+ TCP_SKB_CB(subskb)->tcp_flags |= TCPHDR_PSH; >+ TCP_SKB_CB(skb)->when = tcp_time_stamp; >+ TCP_SKB_CB(subskb)->when = tcp_time_stamp; >+ err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC); >+ if (unlikely(err)) { >+ mptcp_transmit_skb_failed(subsk, skb, subskb, 0); >+ return err; >+ } >+ >+ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq - >+ TCP_SKB_CB(skb)->seq); >+ tcp_event_new_data_sent(meta_sk, skb); >+ mptcp_sub_event_new_data_sent(subsk, subskb, skb); >+ >+ return 0; >+ } else { >+ struct sock *sk_it; >+ int ans = 0; >+ >+ if (between(meta_tp->snd_up, meta_tp->snd_una + 1, >+ meta_tp->snd_una + 0xFFFF)) { >+ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { >+ if (mptcp_sk_can_send_ack(sk_it)) >+ tcp_xmit_probe_skb(sk_it, 1); >+ } >+ } >+ >+ /* At least one of the tcp_xmit_probe_skb's has to succeed */ >+ mptcp_for_each_sk(meta_tp->mpcb, sk_it) { >+ int ret; >+ >+ if (!mptcp_sk_can_send_ack(sk_it)) >+ continue; >+ >+ ret = tcp_xmit_probe_skb(sk_it, 0); >+ if (unlikely(ret > 0)) >+ ans = ret; >+ } >+ return ans; >+ } >+} >+ >+static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb) >+{ >+ struct sk_buff *skb_it; >+ >+ skb_it = tcp_write_queue_head(meta_sk); >+ >+ tcp_for_write_queue_from(skb_it, meta_sk) { >+ if (skb_it == tcp_send_head(meta_sk)) >+ break; >+ >+ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) { >+ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask; >+ break; >+ } >+ } >+} >+ >+static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) >+{ >+ struct sock *meta_sk; >+ struct tcp_sock *tp = tcp_sk(sk), *tp_it; >+ struct sk_buff *skb_head; >+ >+ if (tp->mpcb->cnt_subflows == 1) >+ return NULL; >+ >+ meta_sk = mptcp_meta_sk(sk); >+ skb_head = tcp_write_queue_head(meta_sk); >+ >+ if (!skb_head || skb_head == tcp_send_head(meta_sk)) >+ return NULL; >+ >+ /* If penalization is optional (coming from mptcp_next_segment() and >+ * We are not send-buffer-limited we do not penalize. The retransmission >+ * is just an optimization to fix the idle-time due to the delay before >+ * we wake up the application. 
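
The penalisation continuing below halves both the cwnd and the ssthresh of the subflow that still owns the head of the meta queue, so a slow path stops hogging the shared receive buffer while faster paths retransmit its data. A compact model (TCP_INFINITE_SSTHRESH has the kernel's value; the function name is invented):

    #include <stdint.h>

    #define TCP_INFINITE_SSTHRESH 0x7fffffff

    static void penalize(uint32_t *cwnd, uint32_t *ssthresh)
    {
            *cwnd = *cwnd / 2 ? *cwnd / 2 : 1;          /* floor of 1 */
            if (*ssthresh != TCP_INFINITE_SSTHRESH)
                    *ssthresh = *ssthresh / 2 > 2 ?
                                *ssthresh / 2 : 2;      /* floor of 2 */
    }
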
>+ */ >+ if (!penal && sk_stream_memory_free(meta_sk)) >+ goto retrans; >+ >+ /* Half the cwnd of the slow flow */ >+ mptcp_for_each_tp(tp->mpcb, tp_it) { >+ if (tp_it != tp && >+ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { >+ /* Only update every subflow rtt */ >+ if (tcp_time_stamp - tp_it->mptcp->last_rbuf_opti < tp_it->srtt >> 3) >+ break; >+ >+ if (tp->srtt < tp_it->srtt && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { >+ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); >+ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH) >+ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); >+ >+ tp_it->mptcp->last_rbuf_opti = tcp_time_stamp; >+ } >+ break; >+ } >+ } >+ >+retrans: >+ >+ /* Segment not yet injected into this path? Take it!!! */ >+ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { >+ int do_retrans = 0; >+ mptcp_for_each_tp(tp->mpcb, tp_it) { >+ if (tp_it != tp && >+ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { >+ if (tp_it->snd_cwnd <= 4) { >+ do_retrans = 1; >+ break; >+ } >+ >+ if (4 * tp->srtt >= tp_it->srtt) { >+ do_retrans = 0; >+ break; >+ } else { >+ do_retrans = 1; >+ } >+ } >+ } >+ >+ if (do_retrans) >+ return skb_head; >+ } >+ return NULL; >+} >+ >+int mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle, >+ int push_one, gfp_t gfp) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp; >+ struct sock *subsk; >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ struct sk_buff *skb; >+ unsigned int tso_segs, sent_pkts; >+ int cwnd_quota; >+ int result; >+ int reinject = 0; >+ >+ sent_pkts = 0; >+ >+ /* Currently mtu-probing is not done in MPTCP */ >+ if (!push_one && 0) { >+ /* Do MTU probing. */ >+ result = tcp_mtu_probe(meta_sk); >+ if (!result) >+ return 0; >+ else if (result > 0) >+ sent_pkts = 1; >+ } >+ >+ while ((skb = mptcp_next_segment(meta_sk, &reinject))) { >+ unsigned int limit; >+ struct sk_buff *subskb = NULL; >+ u32 noneligible = mpcb->noneligible; >+ >+ if (reinject == 1) { >+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { >+ /* Segment already reached the peer, take the next one */ >+ __skb_unlink(skb, &mpcb->reinject_queue); >+ __kfree_skb(skb); >+ continue; >+ } >+ >+ /* Reinjection and it is coming from a subflow? We need >+ * to find out the path-mask from the meta-write-queue >+ * to properly select a subflow. >+ */ >+ if (!TCP_SKB_CB(skb)->path_mask) >+ mptcp_find_and_set_pathmask(meta_sk, skb); >+ } >+ >+subflow: >+ subsk = get_available_subflow(meta_sk, skb, &mss_now); >+ if (!subsk) >+ break; >+ subtp = tcp_sk(subsk); >+ >+ /* Since all subsocks are locked before calling the scheduler, >+ * the tcp_send_head should not change. >+ */ >+ BUG_ON(!reinject && tcp_send_head(meta_sk) != skb); >+retry: >+ /* If the segment was cloned (e.g. a meta retransmission), >+ * the header must be expanded/copied so that there is no >+ * corruption of TSO information. >+ */ >+ if (skb_cloned(skb) && skb_is_nonlinear(skb) && >+ unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) >+ break; >+ >+ tso_segs = tcp_init_tso_segs(meta_sk, skb, mss_now); >+ BUG_ON(!tso_segs); >+ >+ cwnd_quota = tcp_cwnd_test(subtp, skb); >+ if (!cwnd_quota) { >+ /* May happen, if at the first selection we circumvented >+ * the test due to a DATA_FIN (and got rejected at >+ * tcp_snd_wnd_test), but the reinjected segment is not >+ * a DATA_FIN. 
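
The do_retrans decision above reduces to: retransmit the head segment on this subflow only when the subflow that currently owns it is clearly struggling, either through a tiny window or a much worse RTT. As a standalone predicate (names invented):

    #include <stdint.h>

    static int should_retrans(uint32_t owner_cwnd, uint32_t owner_srtt,
                              uint32_t our_srtt)
    {
            if (owner_cwnd <= 4)              /* barely moving */
                    return 1;
            return 4 * our_srtt < owner_srtt; /* we are over 4x faster */
    }
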
>+ */ >+ BUG_ON(reinject != -1); >+ break; >+ } >+ >+ if (!reinject && unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) { >+ skb = mptcp_rcv_buf_optimization(subsk, 1); >+ if (skb) { >+ reinject = -1; >+ goto retry; >+ } >+ break; >+ } >+ >+ if (tso_segs == 1) { >+ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now, >+ (tcp_skb_is_last(meta_sk, skb) ? >+ nonagle : TCP_NAGLE_PUSH)))) >+ break; >+ } else { >+ /* Do not try to defer the transmission of a reinjected >+ * segment. Send it directly. >+ * If it is not possible to send the TSO segment on the >+ * best subflow right now try to look for another subflow. >+ * If there is no subflow available defer the segment to avoid >+ * the call to mptso_fragment. >+ */ >+ if (!push_one && !reinject && tcp_tso_should_defer(subsk, skb)) { >+ mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); >+ goto subflow; >+ } >+ } >+ >+ limit = mss_now; >+ if (tso_segs > 1 && !tcp_urg_mode(meta_tp)) >+ limit = tcp_mss_split_point(subsk, skb, mss_now, >+ min_t(unsigned int, >+ cwnd_quota, >+ subsk->sk_gso_max_segs)); >+ >+ if (skb->len > limit && >+ unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, gfp, reinject))) >+ break; >+ >+ subskb = mptcp_skb_entail(subsk, skb, reinject); >+ if (!subskb) >+ break; >+ >+ mpcb->noneligible = noneligible; >+ TCP_SKB_CB(skb)->when = tcp_time_stamp; >+ TCP_SKB_CB(subskb)->when = tcp_time_stamp; >+ if (unlikely(tcp_transmit_skb(subsk, subskb, 1, gfp))) { >+ mptcp_transmit_skb_failed(subsk, skb, subskb, reinject); >+ mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); >+ continue; >+ } >+ >+ if (!reinject) { >+ mptcp_check_sndseq_wrap(meta_tp, >+ TCP_SKB_CB(skb)->end_seq - >+ TCP_SKB_CB(skb)->seq); >+ tcp_event_new_data_sent(meta_sk, skb); >+ } >+ >+ tcp_minshall_update(meta_tp, mss_now, skb); >+ sent_pkts += tcp_skb_pcount(skb); >+ tcp_sk(subsk)->mptcp->sent_pkts += tcp_skb_pcount(skb); >+ >+ mptcp_sub_event_new_data_sent(subsk, subskb, skb); >+ >+ if (reinject > 0) { >+ __skb_unlink(skb, &mpcb->reinject_queue); >+ kfree_skb(skb); >+ } >+ >+ if (push_one) >+ break; >+ } >+ >+ mpcb->noneligible = 0; >+ >+ if (likely(sent_pkts)) { >+ mptcp_for_each_sk(mpcb, subsk) { >+ subtp = tcp_sk(subsk); >+ if (subtp->mptcp->sent_pkts) { >+ if (tcp_in_cwnd_reduction(subsk)) >+ subtp->prr_out += subtp->mptcp->sent_pkts; >+ tcp_cwnd_validate(subsk); >+ subtp->mptcp->sent_pkts = 0; >+ } >+ } >+ return 0; >+ } >+ >+ return !meta_tp->packets_out && tcp_send_head(meta_sk); >+} >+ >+void mptcp_write_space(struct sock *sk) >+{ >+ mptcp_push_pending_frames(mptcp_meta_sk(sk)); >+} >+ >+u32 __mptcp_select_window(struct sock *sk) >+{ >+ struct inet_connection_sock *icsk = inet_csk(sk); >+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); >+ int mss, free_space, full_space, window; >+ >+ /* MSS for the peer's data. Previous versions used mss_clamp >+ * here. I don't know if the value based on our guesses >+ * of peer's MSS is better for the performance. It's more correct >+ * but may be worse for the performance because of rcv_mss >+ * fluctuations. --SAW 1998/11/1 >+ */ >+ mss = icsk->icsk_ack.rcv_mss; >+ free_space = tcp_space(sk); >+ full_space = min_t(int, meta_tp->window_clamp, >+ tcp_full_space(sk)); >+ >+ if (mss > full_space) >+ mss = full_space; >+ >+ if (free_space < (full_space >> 1)) { >+ icsk->icsk_ack.quick = 0; >+ >+ if (tcp_memory_pressure) >+ /* TODO this has to be adapted when we support different >+ * MSS's among the subflows. 
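
For orientation, the shape of the transmit loop above, stripped of the kernel plumbing (the names in this comment block are descriptive, not the patch's):

    /*
     *  while ((skb = next_segment(meta, &reinject))) {
     *          sub = pick_subflow(meta, skb);  // lowest-RTT, available
     *          if (!sub)
     *                  break;                  // every path busy or full
     *          if (skb->len > limit)
     *                  fragment(skb, limit);   // fit cwnd/window/TSO
     *          copy = entail(sub, skb);        // add DSS, queue on sub
     *          if (transmit(sub, copy))
     *                  mark_noneligible(sub);  // retry on another path
     *          else
     *                  advance(meta, skb, reinject);
     *  }
     */
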
>+ */ >+ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh, >+ 4U * meta_tp->advmss); >+ >+ if (free_space < mss) >+ return 0; >+ } >+ >+ if (free_space > meta_tp->rcv_ssthresh) >+ free_space = meta_tp->rcv_ssthresh; >+ >+ /* Don't do rounding if we are using window scaling, since the >+ * scaled window will not line up with the MSS boundary anyway. >+ */ >+ window = meta_tp->rcv_wnd; >+ if (tp->rx_opt.rcv_wscale) { >+ window = free_space; >+ >+ /* Advertise enough space so that it won't get scaled away. >+ * Import case: prevent zero window announcement if >+ * 1<<rcv_wscale > mss. >+ */ >+ if (((window >> tp->rx_opt.rcv_wscale) << tp-> >+ rx_opt.rcv_wscale) != window) >+ window = (((window >> tp->rx_opt.rcv_wscale) + 1) >+ << tp->rx_opt.rcv_wscale); >+ } else { >+ /* Get the largest window that is a nice multiple of mss. >+ * Window clamp already applied above. >+ * If our current window offering is within 1 mss of the >+ * free space we just keep it. This prevents the divide >+ * and multiply from happening most of the time. >+ * We also don't do any window rounding when the free space >+ * is too small. >+ */ >+ if (window <= free_space - mss || window > free_space) >+ window = (free_space / mss) * mss; >+ else if (mss == full_space && >+ free_space > window + (full_space >> 1)) >+ window = free_space; >+ } >+ >+ return window; >+} >+ >+static void mptcp_set_nonce(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct inet_sock *inet = inet_sk(sk); >+ >+ if (sk->sk_family == AF_INET) >+ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(inet->inet_saddr, >+ inet->inet_daddr, >+ inet->inet_sport, >+ inet->inet_dport, >+ tp->write_seq); >+#if IS_ENABLED(CONFIG_IPV6) >+ else >+ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(inet6_sk(sk)->saddr.s6_addr32, >+ inet6_sk(sk)->daddr.s6_addr32, >+ inet->inet_sport, >+ inet->inet_dport, >+ tp->write_seq); >+#endif >+ >+ tp->mptcp->nonce_set = 1; >+} >+ >+void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts, >+ unsigned *remaining) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ opts->options |= OPTION_MPTCP; >+ if (is_master_tp(tp)) { >+ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN; >+ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; >+ opts->mp_capable.sender_key = tp->mptcp_loc_key; >+ opts->dss_csum = sysctl_mptcp_checksum; >+ >+ /* We arrive here either when sending a SYN or a >+ * SYN+ACK when in SYN_SENT state (that is, tcp_synack_options >+ * is only called for syn+ack replied by a server, while this >+ * function is called when SYNs are sent by both parties and >+ * are crossed) >+ * Due to this possibility, a slave subsocket may arrive here, >+ * and does not need to set the dataseq options, since >+ * there is no data in the segment >+ */ >+ } else { >+ struct mptcp_cb *mpcb = tp->mpcb; >+ >+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN; >+ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN; >+ opts->mp_join_syns.token = mpcb->mptcp_rem_token; >+ opts->addr_id = mptcp_get_loc_addrid(mpcb, sk); >+ >+ if (!tp->mptcp->nonce_set) >+ mptcp_set_nonce(sk); >+ >+ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce; >+ } >+} >+ >+void mptcp_synack_options(struct request_sock *req, >+ struct tcp_out_options *opts, unsigned *remaining) >+{ >+ struct mptcp_request_sock *mtreq; >+ mtreq = mptcp_rsk(req); >+ >+ opts->options |= OPTION_MPTCP; >+ /* MPCB not yet set - thus it's a new MPTCP-session */ >+ if (!mtreq->mpcb) { >+ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK; >+ *remaining 
-= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
>+ opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
>+ opts->dss_csum = sysctl_mptcp_checksum || mtreq->dss_csum;
>+ } else {
>+ struct inet_request_sock *ireq = inet_rsk(req);
>+ int i;
>+
>+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
>+ opts->mp_join_syns.sender_truncated_mac =
>+ mtreq->mptcp_hash_tmac;
>+ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
>+ opts->addr_id = 0;
>+
>+ /* Finding Address ID */
>+ if (req->rsk_ops->family == AF_INET)
>+ mptcp_for_each_bit_set(mtreq->mpcb->loc4_bits, i) {
>+ struct mptcp_loc4 *addr =
>+ &mtreq->mpcb->locaddr4[i];
>+ if (addr->addr.s_addr == ireq->loc_addr)
>+ opts->addr_id = addr->id;
>+ }
>+#if IS_ENABLED(CONFIG_IPV6)
>+ else /* IPv6 */
>+ mptcp_for_each_bit_set(mtreq->mpcb->loc6_bits, i) {
>+ struct mptcp_loc6 *addr =
>+ &mtreq->mpcb->locaddr6[i];
>+ if (ipv6_addr_equal(&addr->addr,
>+ &inet6_rsk(req)->loc_addr))
>+ opts->addr_id = addr->id;
>+ }
>+#endif /* CONFIG_IPV6 */
>+ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
>+ }
>+}
>+
>+void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
>+ struct tcp_out_options *opts, unsigned *size)
>+{
>+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
>+ struct mptcp_cb *mpcb = tp->mpcb;
>+ struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
>+
>+ /* In fallback mp_fail-mode, we have to repeat it until the fallback
>+ * has been done by the sender
>+ */
>+ if (unlikely(tp->mptcp->send_mp_fail)) {
>+ opts->options |= OPTION_MPTCP;
>+ opts->mptcp_options |= OPTION_MP_FAIL;
>+ opts->data_ack = (__u32)(mpcb->csum_cutoff_seq >> 32);
>+ opts->data_seq = (__u32)mpcb->csum_cutoff_seq;
>+ *size += MPTCP_SUB_LEN_FAIL;
>+ return;
>+ }
>+
>+ if (unlikely(tp->send_mp_fclose)) {
>+ opts->options |= OPTION_MPTCP;
>+ opts->mptcp_options |= OPTION_MP_FCLOSE;
>+ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
>+ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
>+ return;
>+ }
>+
>+ /* 1. If we are the sender of the infinite-mapping, we need the
>+ * MPTCPHDR_INF-flag, because a retransmission of the
>+ * infinite-announcement still needs the mptcp-option.
>+ *
>+ * We need infinite_cutoff_seq, because retransmissions from before
>+ * the infinite-cutoff-moment still need the MPTCP-signalling to stay
>+ * consistent.
>+ *
>+ * 2. If we are the receiver of the infinite-mapping, we always skip
>+ * mptcp-options, because acknowledgments from before the
>+ * infinite-mapping point have already been sent out.
>+ *
>+ * I know, the whole infinite-mapping stuff is ugly...
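
The option writers that follow all draw on one budget: TCP options are capped at MAX_TCP_OPTION_SPACE (40 bytes), and each block checks the remaining room before claiming its aligned length, postponing an ADD_ADDR to a dedicated ACK when it no longer fits. A sketch of that accounting (the helper name is invented):

    #define MAX_TCP_OPTION_SPACE 40   /* as in the kernel */

    /* returns 1 and claims len_aligned bytes if they still fit */
    static int claim_option(unsigned int *size, unsigned int len_aligned)
    {
            if (MAX_TCP_OPTION_SPACE - *size < len_aligned)
                    return 0;   /* postpone, e.g. the ADD_ADDR case */
            *size += len_aligned;
            return 1;
    }
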
>+ * >+ * TODO: Handle wrapped data-sequence numbers >+ * (even if it's very unlikely) >+ */ >+ if (unlikely(mpcb->infinite_mapping_snd) && >+ tp->mptcp->fully_established && >+ ((mpcb->send_infinite_mapping && tcb && >+ !(tcb->mptcp_flags & MPTCPHDR_INF) && >+ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) || >+ !mpcb->send_infinite_mapping)) >+ return; >+ >+ if (unlikely(tp->mptcp->include_mpc)) { >+ opts->options |= OPTION_MPTCP; >+ opts->mptcp_options |= OPTION_MP_CAPABLE | >+ OPTION_TYPE_ACK; >+ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN; >+ opts->mp_capable.sender_key = mpcb->mptcp_loc_key; >+ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; >+ opts->dss_csum = mpcb->dss_csum; >+ >+ if (skb) >+ tp->mptcp->include_mpc = 0; >+ } >+ if (unlikely(tp->mptcp->pre_established)) { >+ opts->options |= OPTION_MPTCP; >+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK; >+ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN; >+ } >+ >+ if (!tp->mptcp_add_addr_ack && !tp->mptcp->include_mpc && >+ !tp->mptcp->pre_established) { >+ opts->options |= OPTION_MPTCP; >+ opts->mptcp_options |= OPTION_DATA_ACK; >+ /* If !skb, we come from tcp_current_mss and thus we always >+ * assume that the DSS-option will be set for the data-packet. >+ */ >+ if (skb && !mptcp_is_data_seq(skb)) { >+ opts->data_ack = meta_tp->rcv_nxt; >+ >+ *size += MPTCP_SUB_LEN_ACK_ALIGN; >+ } else { >+ opts->data_ack = meta_tp->rcv_nxt; >+ >+ /* Doesn't matter, if csum included or not. It will be >+ * either 10 or 12, and thus aligned = 12 >+ */ >+ *size += MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN; >+ } >+ >+ *size += MPTCP_SUB_LEN_DSS_ALIGN; >+ } >+ >+ if (unlikely(tp->mptcp->add_addr4) && >+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) { >+ int ind = mptcp_find_free_index(~(tp->mptcp->add_addr4)); >+ opts->options |= OPTION_MPTCP; >+ opts->mptcp_options |= OPTION_ADD_ADDR; >+ opts->addr4 = &mpcb->locaddr4[ind]; >+ if (skb) >+ tp->mptcp->add_addr4 &= ~(1 << ind); >+ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; >+ } else if (unlikely(tp->mptcp->add_addr6) && >+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) { >+ int ind = mptcp_find_free_index(~(tp->mptcp->add_addr6)); >+ opts->options |= OPTION_MPTCP; >+ opts->mptcp_options |= OPTION_ADD_ADDR; >+ opts->addr6 = &mpcb->locaddr6[ind]; >+ if (skb) >+ tp->mptcp->add_addr6 &= ~(1 << ind); >+ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; >+ } else if (unlikely(mpcb->remove_addrs) && >+ MAX_TCP_OPTION_SPACE - *size >= >+ mptcp_sub_len_remove_addr_align(mpcb->remove_addrs)) { >+ opts->options |= OPTION_MPTCP; >+ opts->mptcp_options |= OPTION_REMOVE_ADDR; >+ opts->remove_addrs = mpcb->remove_addrs; >+ *size += mptcp_sub_len_remove_addr_align(opts->remove_addrs); >+ if (skb) >+ mpcb->remove_addrs = 0; >+ } else if (!(opts->mptcp_options & OPTION_MP_CAPABLE) && >+ !(opts->mptcp_options & OPTION_MP_JOIN) && >+ ((unlikely(tp->mptcp->add_addr6) && >+ MAX_TCP_OPTION_SPACE - *size <= >+ MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) || >+ (unlikely(tp->mptcp->add_addr4) && >+ MAX_TCP_OPTION_SPACE - *size >= >+ MPTCP_SUB_LEN_ADD_ADDR4_ALIGN))) { >+ mptcp_debug("no space for add addr. 
unsent IPv4: %#x,IPv6: %#x\n", >+ tp->mptcp->add_addr4, tp->mptcp->add_addr6); >+ tp->mptcp_add_addr_ack = 1; >+ tcp_send_ack(sk); >+ tp->mptcp_add_addr_ack = 0; >+ } >+ >+ if (unlikely(tp->mptcp->send_mp_prio) && >+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) { >+ opts->options |= OPTION_MPTCP; >+ opts->mptcp_options |= OPTION_MP_PRIO; >+ if (skb) >+ tp->mptcp->send_mp_prio = 0; >+ *size += MPTCP_SUB_LEN_PRIO_ALIGN; >+ } >+ >+ return; >+} >+ >+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, >+ struct tcp_out_options *opts, >+ struct sk_buff *skb) >+{ >+ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) { >+ struct mp_capable *mpc = (struct mp_capable *)ptr; >+ >+ mpc->kind = TCPOPT_MPTCP; >+ >+ if ((OPTION_TYPE_SYN & opts->mptcp_options) || >+ (OPTION_TYPE_SYNACK & opts->mptcp_options)) { >+ mpc->sender_key = opts->mp_capable.sender_key; >+ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; >+ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; >+ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { >+ mpc->sender_key = opts->mp_capable.sender_key; >+ mpc->receiver_key = opts->mp_capable.receiver_key; >+ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK; >+ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2; >+ } >+ >+ mpc->sub = MPTCP_SUB_CAPABLE; >+ mpc->ver = 0; >+ mpc->a = opts->dss_csum ? 1 : 0; >+ mpc->b = 0; >+ mpc->rsv = 0; >+ mpc->h = 1; >+ } >+ >+ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) { >+ struct mp_join *mpj = (struct mp_join *)ptr; >+ >+ mpj->kind = TCPOPT_MPTCP; >+ mpj->sub = MPTCP_SUB_JOIN; >+ mpj->rsv = 0; >+ mpj->addr_id = opts->addr_id; >+ >+ if (OPTION_TYPE_SYN & opts->mptcp_options) { >+ mpj->len = MPTCP_SUB_LEN_JOIN_SYN; >+ mpj->u.syn.token = opts->mp_join_syns.token; >+ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce; >+ mpj->b = tp->mptcp->low_prio; >+ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2; >+ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { >+ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK; >+ mpj->u.synack.mac = >+ opts->mp_join_syns.sender_truncated_mac; >+ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce; >+ mpj->b = tp->mptcp->low_prio; >+ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2; >+ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { >+ mpj->len = MPTCP_SUB_LEN_JOIN_ACK; >+ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20); >+ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2; >+ } >+ } >+ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) { >+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; >+ >+ mpadd->kind = TCPOPT_MPTCP; >+ if (opts->addr4) { >+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4; >+ mpadd->sub = MPTCP_SUB_ADD_ADDR; >+ mpadd->ipver = 4; >+ mpadd->addr_id = opts->addr4->id; >+ mpadd->u.v4.addr = opts->addr4->addr; >+ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2; >+ } else if (opts->addr6) { >+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6; >+ mpadd->sub = MPTCP_SUB_ADD_ADDR; >+ mpadd->ipver = 6; >+ mpadd->addr_id = opts->addr6->id; >+ memcpy(&mpadd->u.v6.addr, &opts->addr6->addr, >+ sizeof(mpadd->u.v6.addr)); >+ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2; >+ } else { >+ BUG(); >+ } >+ } >+ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) { >+ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; >+ u8 *addrs_id; >+ int id, len, len_align; >+ >+ len = mptcp_sub_len_remove_addr(opts->remove_addrs); >+ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs); >+ >+ mprem->kind = TCPOPT_MPTCP; >+ mprem->len = len; >+ mprem->sub = MPTCP_SUB_REMOVE_ADDR; >+ mprem->rsv = 0; >+ addrs_id = &mprem->addrs_id; >+ >+ 
mptcp_for_each_bit_set(opts->remove_addrs, id) >+ *(addrs_id++) = id; >+ >+ /* Fill the rest with NOP's */ >+ if (len_align > len) { >+ int i; >+ for (i = 0; i < len_align - len; i++) >+ *(addrs_id++) = TCPOPT_NOP; >+ } >+ >+ ptr += len_align >> 2; >+ } >+ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) { >+ struct mp_fail *mpfail = (struct mp_fail *)ptr; >+ >+ mpfail->kind = TCPOPT_MPTCP; >+ mpfail->len = MPTCP_SUB_LEN_FAIL; >+ mpfail->sub = MPTCP_SUB_FAIL; >+ mpfail->rsv1 = 0; >+ mpfail->rsv2 = 0; >+ mpfail->data_seq = htonll(((u64)opts->data_ack << 32) | opts->data_seq); >+ >+ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2; >+ } >+ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) { >+ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr; >+ >+ mpfclose->kind = TCPOPT_MPTCP; >+ mpfclose->len = MPTCP_SUB_LEN_FCLOSE; >+ mpfclose->sub = MPTCP_SUB_FCLOSE; >+ mpfclose->rsv1 = 0; >+ mpfclose->rsv2 = 0; >+ mpfclose->key = opts->mp_capable.receiver_key; >+ >+ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2; >+ } >+ >+ if (OPTION_DATA_ACK & opts->mptcp_options) { >+ if (!mptcp_is_data_seq(skb)) { >+ struct mp_dss *mdss = (struct mp_dss *)ptr; >+ >+ mdss->kind = TCPOPT_MPTCP; >+ mdss->sub = MPTCP_SUB_DSS; >+ mdss->rsv1 = 0; >+ mdss->rsv2 = 0; >+ mdss->F = 0; >+ mdss->m = 0; >+ mdss->M = 0; >+ mdss->a = 0; >+ mdss->A = 1; >+ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); >+ >+ ptr++; >+ *ptr++ = htonl(opts->data_ack); >+ } else { >+ /**** Just update the data_ack ****/ >+ >+ /* Get pointer to data_ack-field. MPTCP is always at >+ * the end of the TCP-options. >+ */ >+ /* TODO if we allow sending 64-bit dseq's we have to change "16" */ >+ __be32 *dack = (__be32 *)(skb->data + (tcp_hdr(skb)->doff << 2) - 16); >+ >+ *dack = htonl(opts->data_ack); >+ } >+ } >+ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) { >+ struct mp_prio *mpprio = (struct mp_prio *)ptr; >+ >+ mpprio->kind = TCPOPT_MPTCP; >+ mpprio->len = MPTCP_SUB_LEN_PRIO; >+ mpprio->sub = MPTCP_SUB_PRIO; >+ mpprio->rsv = 0; >+ mpprio->b = tp->mptcp->low_prio; >+ mpprio->addr_id = TCPOPT_NOP; >+ >+ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2; >+ } >+} >+ >+/* Returns the next segment to be sent from the mptcp meta-queue. >+ * (chooses the reinject queue if any segment is waiting in it, otherwise, >+ * chooses the normal write queue). >+ * Sets *@reinject to 1 if the returned segment comes from the >+ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, >+ * and sets it to -1 if it is a meta-level retransmission to optimize the >+ * receive-buffer. 
>+ */ >+struct sk_buff *mptcp_next_segment(struct sock *meta_sk, int *reinject) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ struct sk_buff *skb = NULL; >+ if (reinject) >+ *reinject = 0; >+ >+ /* If we are in fallback-mode, just take from the meta-send-queue */ >+ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) >+ return tcp_send_head(meta_sk); >+ >+ skb = skb_peek(&mpcb->reinject_queue); >+ >+ if (skb) { >+ if (reinject) >+ *reinject = 1; >+ } else { >+ skb = tcp_send_head(meta_sk); >+ >+ if (!skb && meta_sk->sk_write_pending && >+ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { >+ struct sock *subsk = get_available_subflow(meta_sk, NULL, NULL); >+ if (!subsk) >+ return NULL; >+ >+ skb = mptcp_rcv_buf_optimization(subsk, 0); >+ if (skb && reinject) >+ *reinject = -1; >+ } >+ } >+ return skb; >+} >+ >+/* Sends the datafin */ >+void mptcp_send_fin(struct sock *meta_sk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sk_buff *skb = tcp_write_queue_tail(meta_sk); >+ int mss_now; >+ >+ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) >+ meta_tp->mpcb->passive_close = 1; >+ >+ /* Optimization, tack on the FIN if we have a queue of >+ * unsent frames. But be careful about outgoing SACKS >+ * and IP options. >+ */ >+ mss_now = mptcp_current_mss(meta_sk); >+ >+ if (tcp_send_head(meta_sk) != NULL) { >+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; >+ TCP_SKB_CB(skb)->end_seq++; >+ meta_tp->write_seq++; >+ } else { >+ /* Socket is locked, keep trying until memory is available. */ >+ for (;;) { >+ skb = alloc_skb_fclone(MAX_TCP_HEADER, >+ meta_sk->sk_allocation); >+ if (skb) >+ break; >+ yield(); >+ } >+ /* Reserve space for headers and prepare control bits. */ >+ skb_reserve(skb, MAX_TCP_HEADER); >+ >+ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); >+ TCP_SKB_CB(skb)->end_seq++; >+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN | MPTCPHDR_SEQ; >+ tcp_queue_skb(meta_sk, skb); >+ } >+ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); >+} >+ >+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ struct sock *sk = NULL, *sk_it = NULL, *tmpsk; >+ >+ if (!mpcb->cnt_subflows) >+ return; >+ >+ /* First - select a socket */ >+ >+ /* Socket already selected? 
*/ >+ mptcp_for_each_sk(mpcb, sk_it) { >+ if (tcp_sk(sk_it)->send_mp_fclose) { >+ sk = sk_it; >+ goto found; >+ } >+ } >+ >+ sk = mptcp_select_ack_sock(meta_sk, 0); >+ /* May happen if no subflow is in an appropriate state */ >+ if (!sk) >+ return; >+ >+ /* We are in infinite mode - just send a reset */ >+ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) { >+ tcp_send_active_reset(sk, priority); >+ return; >+ } >+ >+ tcp_sk(sk)->send_mp_fclose = 1; >+ >+ /** Reset all other subflows */ >+ >+found: >+ /* tcp_done must be handled with bh disabled */ >+ if (!in_serving_softirq()) >+ local_bh_disable(); >+ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { >+ if (tcp_sk(sk_it)->send_mp_fclose) >+ continue; >+ >+ sk_it->sk_err = ECONNRESET; >+ if (tcp_need_reset(sk_it->sk_state)) >+ tcp_send_active_reset(sk_it, GFP_ATOMIC); >+ mptcp_sub_force_close(sk_it); >+ } >+ if (!in_serving_softirq()) >+ local_bh_enable(); >+ >+ tcp_send_ack(sk); >+ >+ if (!meta_tp->send_mp_fclose) { >+ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); >+ >+ meta_icsk->icsk_rto = min(inet_csk(sk)->icsk_rto, TCP_RTO_MAX); >+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, >+ meta_icsk->icsk_rto, TCP_RTO_MAX); >+ } >+ >+ meta_tp->send_mp_fclose = 1; >+} >+ >+void mptcp_ack_retransmit_timer(struct sock *sk) >+{ >+ struct sk_buff *skb; >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct inet_connection_sock *icsk = inet_csk(sk); >+ >+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) >+ goto out; /* Routing failure or similar */ >+ >+ if (!tp->retrans_stamp) >+ tp->retrans_stamp = tcp_time_stamp ? : 1; >+ >+ if (tcp_write_timeout(sk)) { >+ tp->mptcp->pre_established = 0; >+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); >+ tcp_send_active_reset(sk, GFP_ATOMIC); >+ goto out; >+ } >+ >+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); >+ if (skb == NULL) { >+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, >+ jiffies + icsk->icsk_rto); >+ return; >+ } >+ >+ /* Reserve space for headers and prepare control bits */ >+ skb_reserve(skb, MAX_TCP_HEADER); >+ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK); >+ >+ TCP_SKB_CB(skb)->when = tcp_time_stamp; >+ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) { >+ /* Retransmission failed because of local congestion, >+ * do not backoff. >+ */ >+ if (!icsk->icsk_retransmits) >+ icsk->icsk_retransmits = 1; >+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, >+ jiffies + icsk->icsk_rto); >+ return; >+ } >+ >+ >+ icsk->icsk_retransmits++; >+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); >+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, >+ jiffies + icsk->icsk_rto); >+ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) { >+ __sk_dst_reset(sk); >+ } >+ >+out:; >+} >+ >+void mptcp_ack_handler(unsigned long data) >+{ >+ struct sock *sk = (struct sock *)data; >+ struct sock *meta_sk = mptcp_meta_sk(sk); >+ >+ bh_lock_sock(meta_sk); >+ if (sock_owned_by_user(meta_sk)) { >+ /* Try again later */ >+ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer, >+ jiffies + (HZ / 20)); >+ goto out_unlock; >+ } >+ >+ if (sk->sk_state == TCP_CLOSE) >+ goto out_unlock; >+ >+ mptcp_ack_retransmit_timer(sk); >+ >+ sk_mem_reclaim(sk); >+ >+out_unlock: >+ bh_unlock_sock(meta_sk); >+ sock_put(sk); >+} >+ >+/* Similar to tcp_retransmit_skb >+ * >+ * The diff is that we handle the retransmission-stats (retrans_stamp) at the >+ * meta-level. 
>+ */ >+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct sock *subsk; >+ struct sk_buff *subskb; >+ unsigned int limit, tso_segs, mss_now; >+ int err = -1, oldpcount; >+ >+ /* Do not sent more than we queued. 1/4 is reserved for possible >+ * copying overhead: fragmentation, tunneling, mangling etc. >+ * >+ * This is a meta-retransmission thus we check on the meta-socket. >+ */ >+ if (atomic_read(&meta_sk->sk_wmem_alloc) > >+ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { >+ return -EAGAIN; >+ } >+ >+ /* We need to make sure that the retransmitted segment can be sent on a >+ * subflow right now. If it is too big, it needs to be fragmented. >+ */ >+ subsk = get_available_subflow(meta_sk, skb, &mss_now); >+ if (!subsk) { >+ /* We want to increase icsk_retransmits, thus return 0, so that >+ * mptcp_retransmit_timer enters the desired branch. >+ */ >+ err = 0; >+ goto failed; >+ } >+ >+ /* If the segment was cloned (e.g. a meta retransmission), the header >+ * must be expanded/copied so that there is no corruption of TSO >+ * information. >+ */ >+ if (skb_cloned(skb) && skb_is_nonlinear(skb) && >+ unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) { >+ err = ENOMEM; >+ goto failed; >+ } >+ >+ oldpcount = tcp_skb_pcount(skb); >+ tso_segs = tcp_init_tso_segs(meta_sk, skb, mss_now); >+ BUG_ON(!tso_segs); >+ >+ /* The MSS might have changed and so the number of segments. We >+ * need to account for this change. >+ */ >+ if (unlikely(oldpcount != tso_segs)) >+ tcp_adjust_pcount(meta_sk, skb, oldpcount - tso_segs); >+ >+ limit = mss_now; >+ if (tso_segs > 1 && !tcp_urg_mode(meta_tp)) >+ limit = tcp_mss_split_point(subsk, skb, mss_now, >+ min_t(unsigned int, >+ tcp_cwnd_test(tcp_sk(subsk), skb), >+ subsk->sk_gso_max_segs)); >+ >+ if (skb->len > limit && >+ unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, >+ GFP_ATOMIC, 0))) >+ goto failed; >+ >+ subskb = mptcp_skb_entail(subsk, skb, -1); >+ if (!subskb) >+ goto failed; >+ >+ TCP_SKB_CB(skb)->when = tcp_time_stamp; >+ TCP_SKB_CB(subskb)->when = tcp_time_stamp; >+ err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC); >+ if (!err) { >+ /* Update global TCP statistics. */ >+ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS); >+ >+ /* Diff to tcp_retransmit_skb */ >+ >+ /* Save stamp of the first retransmit. */ >+ if (!meta_tp->retrans_stamp) >+ meta_tp->retrans_stamp = TCP_SKB_CB(subskb)->when; >+ mptcp_sub_event_new_data_sent(subsk, subskb, skb); >+ } else { >+ mptcp_transmit_skb_failed(subsk, skb, subskb, 0); >+ } >+ >+failed: >+ return err; >+} >+ >+/* Similar to tcp_retransmit_timer >+ * >+ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message >+ * and that we don't have an srtt estimation at the meta-level. >+ */ >+void mptcp_retransmit_timer(struct sock *meta_sk) >+{ >+ struct tcp_sock *meta_tp = tcp_sk(meta_sk); >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); >+ int err; >+ >+ if (unlikely(meta_tp->send_mp_fclose)) >+ goto send_mp_fclose; >+ >+ /* In fallback, retransmission is handled at the subflow-level */ >+ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd || >+ mpcb->send_infinite_mapping) >+ return; >+ >+ WARN_ON(tcp_write_queue_empty(meta_sk)); >+ >+ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) && >+ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { >+ /* Receiver dastardly shrinks window. 
Our retransmits >+ * become zero probes, but we should not timeout this >+ * connection. If the socket is an orphan, time it out, >+ * we cannot allow such beasts to hang infinitely. >+ */ >+ struct inet_sock *meta_inet = inet_sk(meta_sk); >+ if (meta_sk->sk_family == AF_INET) { >+ LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", >+ &meta_inet->inet_daddr, >+ ntohs(meta_inet->inet_dport), >+ meta_inet->inet_num, meta_tp->snd_una, >+ meta_tp->snd_nxt); >+ } >+#if IS_ENABLED(CONFIG_IPV6) >+ else if (meta_sk->sk_family == AF_INET6) { >+ struct ipv6_pinfo *np = inet6_sk(meta_sk); >+ LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", >+ &np->daddr, ntohs(meta_inet->inet_dport), >+ meta_inet->inet_num, meta_tp->snd_una, >+ meta_tp->snd_nxt); >+ } >+#endif >+ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) { >+ tcp_write_err(meta_sk); >+ return; >+ } >+ >+ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); >+ goto out_reset_timer; >+ } >+ >+ if (tcp_write_timeout(meta_sk)) >+ return; >+ >+ if (meta_icsk->icsk_retransmits == 0) >+ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS); >+ >+ meta_icsk->icsk_ca_state = TCP_CA_Loss; >+ >+ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); >+ if (err > 0) { >+ /* Retransmission failed because of local congestion, >+ * do not backoff. >+ */ >+ if (!meta_icsk->icsk_retransmits) >+ meta_icsk->icsk_retransmits = 1; >+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, >+ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), >+ TCP_RTO_MAX); >+ return; >+ } >+ >+out_backoff: >+ /* Increase the timeout each time we retransmit. Note that >+ * we do not increase the rtt estimate. rto is initialized >+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests >+ * that doubling rto each time is the least we can get away with. >+ * In KA9Q, Karn uses this for the first few times, and then >+ * goes to quadratic. netBSD doubles, but only goes up to *64, >+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is >+ * defined in the protocol as the maximum possible RTT. I guess >+ * we'll have to use something other than TCP to talk to the >+ * University of Mars. >+ * >+ * PAWS allows us longer timeouts and large windows, so once >+ * implemented ftp to mars will work nicely. We will have to fix >+ * the 120 second clamps though! >+ */ >+ meta_icsk->icsk_backoff++; >+ meta_icsk->icsk_retransmits++; >+ >+out_reset_timer: >+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is >+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this >+ * might be increased if the stream oscillates between thin and thick, >+ * thus the old value might already be too high compared to the value >+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without >+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating >+ * exponential backoff behaviour to avoid continue hammering >+ * linear-timeout retransmissions into a black hole >+ */ >+ if (meta_sk->sk_state == TCP_ESTABLISHED && >+ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && >+ tcp_stream_is_thin(meta_tp) && >+ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { >+ meta_icsk->icsk_backoff = 0; >+ /* We cannot do the same as in tcp_write_timer because the >+ * srtt is not set here. 
>+ */ >+ mptcp_set_rto(meta_sk); >+ } else { >+ /* Use normal (exponential) backoff */ >+ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX); >+ } >+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX); >+ >+ return; >+ >+send_mp_fclose: >+ /* MUST do this before tcp_write_timeout, because retrans_stamp may have >+ * been set to 0 in another part while we are retransmitting >+ * MP_FASTCLOSE. Then, we would crash, because retransmits_timed_out >+ * accesses the meta-write-queue. >+ * >+ * We make sure that the timestamp is != 0. >+ */ >+ if (!meta_tp->retrans_stamp) >+ meta_tp->retrans_stamp = tcp_time_stamp ? : 1; >+ >+ if (tcp_write_timeout(meta_sk)) >+ return; >+ >+ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); >+ >+ goto out_backoff; >+} >+ >+/* Modify values to an mptcp-level for the initial window of new subflows */ >+void mptcp_select_initial_window(int *__space, __u32 *window_clamp, >+ const struct sock *sk) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; >+ >+ *window_clamp = mpcb->orig_window_clamp; >+ *__space = tcp_win_from_space(mpcb->orig_sk_rcvbuf); >+} >+ >+unsigned int mptcp_current_mss(struct sock *meta_sk) >+{ >+ unsigned int mss = 0; >+ struct sock *sk; >+ >+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { >+ int this_mss; >+ >+ if (!mptcp_sk_can_send(sk)) >+ continue; >+ >+ this_mss = tcp_current_mss(sk); >+ if (!mss || this_mss < mss) >+ mss = this_mss; >+ } >+ >+ /* If no subflow is available, we take a default-mss from the >+ * meta-socket. >+ */ >+ return !mss ? tcp_current_mss(meta_sk) : mss; >+} >+ >+int mptcp_select_size(const struct sock *meta_sk, bool sg) >+{ >+ int mss = 0; /* We look for the smallest MSS */ >+ struct sock *sk; >+ >+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { >+ int this_mss; >+ >+ if (!mptcp_sk_can_send(sk)) >+ continue; >+ >+ this_mss = tcp_sk(sk)->mss_cache; >+ if (!mss || this_mss < mss) >+ mss = this_mss; >+ } >+ >+ if (sg) { >+ if (mptcp_sk_can_gso(meta_sk)) { >+ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); >+ } else { >+ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); >+ >+ if (mss >= pgbreak && >+ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) >+ mss = pgbreak; >+ } >+ } >+ >+ return !mss ? 
tcp_sk(meta_sk)->mss_cache : mss; >+} >+ >+int mptcp_check_snd_buf(const struct tcp_sock *tp) >+{ >+ struct sock *sk; >+ u32 rtt_max = tp->srtt; >+ u64 bw_est; >+ >+ if (!tp->srtt) >+ return tp->reordering + 1; >+ >+ mptcp_for_each_sk(tp->mpcb, sk) { >+ if (!mptcp_sk_can_send(sk)) >+ continue; >+ >+ if (rtt_max < tcp_sk(sk)->srtt) >+ rtt_max = tcp_sk(sk)->srtt; >+ } >+ >+ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16, >+ (u64)tp->srtt); >+ >+ return max_t(unsigned int, (u32)(bw_est >> 16), >+ tp->reordering + 1); >+ >+} >+ >+unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now, >+ int large_allowed) >+{ >+ struct sock *sk; >+ u32 xmit_size_goal = 0; >+ >+ if (large_allowed && mptcp_sk_can_gso(meta_sk)) { >+ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { >+ int this_size_goal; >+ >+ if (!mptcp_sk_can_send(sk)) >+ continue; >+ >+ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1); >+ if (!xmit_size_goal || this_size_goal < xmit_size_goal) >+ xmit_size_goal = this_size_goal; >+ } >+ } >+ >+ return max(xmit_size_goal, mss_now); >+} >+ >+/* Similar to tcp_trim_head - but we correctly copy the DSS-option */ >+int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) >+{ >+ int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + >+ MPTCP_SUB_LEN_SEQ_ALIGN; >+ char dss[dsslen]; >+ >+ /* DSS-option must be recovered afterwards. */ >+ memcpy(dss, skb->data - dsslen, dsslen); >+ >+ if (skb_cloned(skb)) { >+ /* pskb_expand_head will delete our DSS-option. We have to copy >+ * it back if pskb_expand_head succeeds. >+ */ >+ >+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) >+ return -ENOMEM; >+ >+ memcpy(skb->data - dsslen, dss, dsslen); >+ } >+ >+ __pskb_trim_head(skb, len); >+ >+ /* Put the DSS-option back in our header */ >+ memcpy(skb->data - dsslen, dss, dsslen); >+ >+ TCP_SKB_CB(skb)->seq += len; >+ skb->ip_summed = CHECKSUM_PARTIAL; >+ >+ skb->truesize -= len; >+ sk->sk_wmem_queued -= len; >+ sk_mem_uncharge(sk, len); >+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); >+ >+ /* Any change of skb->len requires recalculation of tso factor. */ >+ if (tcp_skb_pcount(skb) > 1) >+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); >+ >+ return 0; >+} >diff -Naur a/linux-3.11/net/mptcp/mptcp_pm.c b/linux-3.11/net/mptcp/mptcp_pm.c >--- a/linux-3.11/net/mptcp/mptcp_pm.c 1970-01-01 01:00:00.000000000 +0100 >+++ b/linux-3.11/net/mptcp/mptcp_pm.c 2013-10-05 18:34:49.276364613 +0200 >@@ -0,0 +1,1194 @@ >+/* >+ * MPTCP implementation - MPTCP-subflow-management >+ * >+ * Initial Design & Implementation: >+ * Sébastien Barré <sebastien.barre@uclouvain.be> >+ * >+ * Current Maintainer & Author: >+ * Christoph Paasch <christoph.paasch@uclouvain.be> >+ * >+ * Additional authors: >+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> >+ * Gregory Detal <gregory.detal@uclouvain.be> >+ * Fabien Duchêne <fabien.duchene@uclouvain.be> >+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> >+ * Lavkesh Lahngir <lavkesh51@gmail.com> >+ * Andreas Ripke <ripke@neclab.eu> >+ * Vlad Dogaru <vlad.dogaru@intel.com> >+ * Octavian Purdila <octavian.purdila@intel.com> >+ * John Ronan <jronan@tssg.org> >+ * Catalin Nicutar <catalin.nicutar@gmail.com> >+ * Brandon Heller <brandonh@stanford.edu> >+ * >+ * >+ * This program is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU General Public License >+ * as published by the Free Software Foundation; either version >+ * 2 of the License, or (at your option) any later version. 
>+ */ >+ >+#include <linux/kconfig.h> >+#include <linux/module.h> >+#include <linux/netdevice.h> >+#include <linux/inetdevice.h> >+#include <linux/list.h> >+#include <linux/tcp.h> >+#include <linux/workqueue.h> >+#include <linux/proc_fs.h> /* Needed by proc_net_fops_create */ >+#include <net/inet_sock.h> >+#include <net/tcp.h> >+#include <net/mptcp.h> >+#include <net/mptcp_v4.h> >+#include <net/mptcp_pm.h> >+#if IS_ENABLED(CONFIG_IPV6) >+#include <net/if_inet6.h> >+#include <net/ipv6.h> >+#include <net/ip6_checksum.h> >+#include <net/inet6_connection_sock.h> >+#include <net/mptcp_v6.h> >+#include <net/addrconf.h> >+#endif >+ >+static inline u32 mptcp_hash_tk(u32 token) >+{ >+ return token % MPTCP_HASH_SIZE; >+} >+ >+static struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; >+ >+/* This second hashtable is needed to retrieve request socks >+ * created as a result of a join request. While the SYN contains >+ * the token, the final ack does not, so we need a separate hashtable >+ * to retrieve the mpcb. >+ */ >+struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE]; >+spinlock_t mptcp_reqsk_hlock; /* hashtable protection */ >+ >+/* The following hash table is used to avoid collision of token */ >+static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE]; >+spinlock_t mptcp_tk_hashlock; /* hashtable protection */ >+ >+static int mptcp_reqsk_find_tk(u32 token) >+{ >+ u32 hash = mptcp_hash_tk(token); >+ struct mptcp_request_sock *mtreqsk; >+ const struct hlist_nulls_node *node; >+ >+ hlist_nulls_for_each_entry_rcu(mtreqsk, node, >+ &mptcp_reqsk_tk_htb[hash], collide_tk) { >+ if (token == mtreqsk->mptcp_loc_token) >+ return 1; >+ } >+ return 0; >+} >+ >+static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, u32 token) >+{ >+ u32 hash = mptcp_hash_tk(token); >+ >+ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->collide_tk, >+ &mptcp_reqsk_tk_htb[hash]); >+} >+ >+void mptcp_reqsk_remove_tk(struct request_sock *reqsk) >+{ >+ rcu_read_lock(); >+ spin_lock(&mptcp_tk_hashlock); >+ hlist_nulls_del_rcu(&mptcp_rsk(reqsk)->collide_tk); >+ spin_unlock(&mptcp_tk_hashlock); >+ rcu_read_unlock(); >+} >+ >+void __mptcp_hash_insert(struct tcp_sock *meta_tp, u32 token) >+{ >+ u32 hash = mptcp_hash_tk(token); >+ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]); >+ meta_tp->inside_tk_table = 1; >+} >+ >+static int mptcp_find_token(u32 token) >+{ >+ u32 hash = mptcp_hash_tk(token); >+ struct tcp_sock *meta_tp; >+ const struct hlist_nulls_node *node; >+ >+ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) { >+ if (token == meta_tp->mptcp_loc_token) >+ return 1; >+ } >+ return 0; >+} >+ >+static void mptcp_set_key_reqsk(struct request_sock *req, >+ const struct sk_buff *skb) >+{ >+ struct inet_request_sock *ireq = inet_rsk(req); >+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); >+ >+ if (skb->protocol == htons(ETH_P_IP)) { >+ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr, >+ ip_hdr(skb)->daddr, >+ ireq->loc_port, >+ ireq->rmt_port); >+#if IS_ENABLED(CONFIG_IPV6) >+ } else { >+ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32, >+ ipv6_hdr(skb)->daddr.s6_addr32, >+ ireq->loc_port, >+ ireq->rmt_port); >+#endif >+ } >+ >+ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); >+} >+ >+/* New MPTCP-connection request, prepare a new token for the meta-socket that >+ * will be created in mptcp_check_req_master(), and store the received token. 
>+ */ >+void mptcp_reqsk_new_mptcp(struct request_sock *req, >+ const struct tcp_options_received *rx_opt, >+ const struct mptcp_options_received *mopt, >+ const struct sk_buff *skb) >+{ >+ struct mptcp_request_sock *mtreq = mptcp_rsk(req); >+ >+ tcp_rsk(req)->saw_mpc = 1; >+ >+ rcu_read_lock(); >+ spin_lock(&mptcp_tk_hashlock); >+ do { >+ mptcp_set_key_reqsk(req, skb); >+ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || >+ mptcp_find_token(mtreq->mptcp_loc_token)); >+ >+ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token); >+ spin_unlock(&mptcp_tk_hashlock); >+ rcu_read_unlock(); >+ mtreq->mptcp_rem_key = mopt->mptcp_key; >+} >+ >+static void mptcp_set_key_sk(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ struct inet_sock *isk = inet_sk(sk); >+ >+ if (sk->sk_family == AF_INET) >+ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr, >+ isk->inet_daddr, >+ isk->inet_sport, >+ isk->inet_dport); >+#if IS_ENABLED(CONFIG_IPV6) >+ else >+ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32, >+ inet6_sk(sk)->daddr.s6_addr32, >+ isk->inet_sport, >+ isk->inet_dport); >+#endif >+ >+ mptcp_key_sha1(tp->mptcp_loc_key, >+ &tp->mptcp_loc_token, NULL); >+} >+ >+void mptcp_connect_init(struct sock *sk) >+{ >+ struct tcp_sock *tp = tcp_sk(sk); >+ >+ rcu_read_lock_bh(); >+ spin_lock(&mptcp_tk_hashlock); >+ do { >+ mptcp_set_key_sk(sk); >+ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) || >+ mptcp_find_token(tp->mptcp_loc_token)); >+ >+ __mptcp_hash_insert(tp, tp->mptcp_loc_token); >+ spin_unlock(&mptcp_tk_hashlock); >+ rcu_read_unlock_bh(); >+} >+ >+/** >+ * This function increments the refcount of the mpcb struct. >+ * It is the responsibility of the caller to decrement when releasing >+ * the structure. >+ */ >+struct sock *mptcp_hash_find(struct net *net, u32 token) >+{ >+ u32 hash = mptcp_hash_tk(token); >+ struct tcp_sock *meta_tp; >+ struct sock *meta_sk = NULL; >+ struct hlist_nulls_node *node; >+ >+ rcu_read_lock(); >+ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], >+ tk_table) { >+ meta_sk = (struct sock *)meta_tp; >+ if (token == meta_tp->mptcp_loc_token && >+ net_eq(net, sock_net(meta_sk)) && >+ atomic_inc_not_zero(&meta_sk->sk_refcnt)) >+ break; >+ meta_sk = NULL; >+ } >+ rcu_read_unlock(); >+ return meta_sk; >+} >+ >+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) >+{ >+ /* remove from the token hashtable */ >+ rcu_read_lock_bh(); >+ spin_lock(&mptcp_tk_hashlock); >+ hlist_nulls_del_rcu(&meta_tp->tk_table); >+ meta_tp->inside_tk_table = 0; >+ spin_unlock(&mptcp_tk_hashlock); >+ rcu_read_unlock_bh(); >+} >+ >+void mptcp_hash_remove(struct tcp_sock *meta_tp) >+{ >+ rcu_read_lock(); >+ spin_lock(&mptcp_tk_hashlock); >+ hlist_nulls_del_rcu(&meta_tp->tk_table); >+ meta_tp->inside_tk_table = 0; >+ spin_unlock(&mptcp_tk_hashlock); >+ rcu_read_unlock(); >+} >+ >+u8 mptcp_get_loc_addrid(struct mptcp_cb *mpcb, struct sock *sk) >+{ >+ int i; >+ >+ if (sk->sk_family == AF_INET) { >+ mptcp_for_each_bit_set(mpcb->loc4_bits, i) { >+ if (mpcb->locaddr4[i].addr.s_addr == >+ inet_sk(sk)->inet_saddr) >+ return mpcb->locaddr4[i].id; >+ } >+ >+ mptcp_debug("%s %pI4 not locally found\n", __func__, >+ &inet_sk(sk)->inet_saddr); >+ BUG(); >+ } >+#if IS_ENABLED(CONFIG_IPV6) >+ if (sk->sk_family == AF_INET6) { >+ mptcp_for_each_bit_set(mpcb->loc6_bits, i) { >+ if (ipv6_addr_equal(&mpcb->locaddr6[i].addr, >+ &inet6_sk(sk)->saddr)) >+ return mpcb->locaddr6[i].id; >+ } >+ >+ mptcp_debug("%s %pI6 not locally found\n", __func__, >+ &inet6_sk(sk)->saddr); >+ 
BUG(); >+ } >+#endif /* CONFIG_IPV6 */ >+ >+ BUG(); >+ return 0; >+} >+ >+void mptcp_set_addresses(struct sock *meta_sk) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ struct net *netns = sock_net(meta_sk); >+ struct net_device *dev; >+ >+ /* if multiports is requested, we work with the main address >+ * and play only with the ports >+ */ >+ if (sysctl_mptcp_ndiffports > 1) >+ return; >+ >+ rcu_read_lock(); >+ read_lock_bh(&dev_base_lock); >+ >+ for_each_netdev(netns, dev) { >+ if (netif_running(dev)) { >+ struct in_device *in_dev = __in_dev_get_rcu(dev); >+ struct in_ifaddr *ifa; >+ __be32 ifa_address; >+#if IS_ENABLED(CONFIG_IPV6) >+ struct inet6_dev *in6_dev = __in6_dev_get(dev); >+ struct inet6_ifaddr *ifa6; >+#endif >+ >+ if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH)) >+ continue; >+ >+ if (!in_dev) >+ goto cont_ipv6; >+ >+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { >+ int i; >+ ifa_address = ifa->ifa_local; >+ >+ if (ifa->ifa_scope == RT_SCOPE_HOST) >+ continue; >+ >+ if ((meta_sk->sk_family == AF_INET || >+ mptcp_v6_is_v4_mapped(meta_sk)) && >+ inet_sk(meta_sk)->inet_saddr == ifa_address) { >+ mpcb->locaddr4[0].low_prio = dev->flags & >+ IFF_MPBACKUP ? 1 : 0; >+ continue; >+ } >+ >+ i = __mptcp_find_free_index(mpcb->loc4_bits, -1, >+ mpcb->next_v4_index); >+ if (i < 0) { >+ mptcp_debug("%s: At max num of local addresses: %d --- not adding address: %pI4\n", >+ __func__, MPTCP_MAX_ADDR, >+ &ifa_address); >+ goto out; >+ } >+ mpcb->locaddr4[i].addr.s_addr = ifa_address; >+ mpcb->locaddr4[i].port = 0; >+ mpcb->locaddr4[i].id = i; >+ mpcb->locaddr4[i].low_prio = (dev->flags & IFF_MPBACKUP) ? >+ 1 : 0; >+ mpcb->loc4_bits |= (1 << i); >+ mpcb->next_v4_index = i + 1; >+ mptcp_v4_send_add_addr(i, mpcb); >+ } >+ >+cont_ipv6: >+; /* This ; is necessary to fix build-errors when IPv6 is disabled */ >+#if IS_ENABLED(CONFIG_IPV6) >+ if (!in6_dev) >+ continue; >+ >+ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) { >+ int addr_type = ipv6_addr_type(&ifa6->addr); >+ int i; >+ >+ if (addr_type == IPV6_ADDR_ANY || >+ addr_type & IPV6_ADDR_LOOPBACK || >+ addr_type & IPV6_ADDR_LINKLOCAL) >+ continue; >+ >+ if (meta_sk->sk_family == AF_INET6 && >+ ipv6_addr_equal(&inet6_sk(meta_sk)->saddr, >+ &(ifa6->addr))) { >+ mpcb->locaddr6[0].low_prio = dev->flags & >+ IFF_MPBACKUP ? 1 : 0; >+ continue; >+ } >+ >+ i = __mptcp_find_free_index(mpcb->loc6_bits, -1, >+ mpcb->next_v6_index); >+ if (i < 0) { >+ mptcp_debug("%s: At max num of local addresses: %d --- not adding address: %pI6\n", >+ __func__, MPTCP_MAX_ADDR, >+ &ifa6->addr); >+ goto out; >+ } >+ >+ mpcb->locaddr6[i].addr = ifa6->addr; >+ mpcb->locaddr6[i].port = 0; >+ mpcb->locaddr6[i].id = i + MPTCP_MAX_ADDR; >+ mpcb->locaddr6[i].low_prio = (dev->flags & IFF_MPBACKUP) ? 
>+ 1 : 0; >+ mpcb->loc6_bits |= (1 << i); >+ mpcb->next_v6_index = i + 1; >+ mptcp_v6_send_add_addr(i, mpcb); >+ } >+#endif >+ } >+ } >+ >+out: >+ read_unlock_bh(&dev_base_lock); >+ rcu_read_unlock(); >+} >+ >+int mptcp_check_req(struct sk_buff *skb, struct net *net) >+{ >+ struct tcphdr *th = tcp_hdr(skb); >+ struct sock *meta_sk = NULL; >+ >+ /* MPTCP structures not initialized */ >+ if (mptcp_init_failed) >+ return 0; >+ >+ if (skb->protocol == htons(ETH_P_IP)) >+ meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr, >+ ip_hdr(skb)->daddr, net); >+#if IS_ENABLED(CONFIG_IPV6) >+ else /* IPv6 */ >+ meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr, >+ &ipv6_hdr(skb)->daddr, net); >+#endif /* CONFIG_IPV6 */ >+ >+ if (!meta_sk) >+ return 0; >+ >+ TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN; >+ >+ bh_lock_sock_nested(meta_sk); >+ if (sock_owned_by_user(meta_sk)) { >+ skb->sk = meta_sk; >+ if (unlikely(sk_add_backlog(meta_sk, skb, >+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { >+ bh_unlock_sock(meta_sk); >+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); >+ sock_put(meta_sk); /* Taken by mptcp_search_req */ >+ kfree_skb(skb); >+ return 1; >+ } >+ } else if (skb->protocol == htons(ETH_P_IP)) { >+ tcp_v4_do_rcv(meta_sk, skb); >+#if IS_ENABLED(CONFIG_IPV6) >+ } else { /* IPv6 */ >+ tcp_v6_do_rcv(meta_sk, skb); >+#endif /* CONFIG_IPV6 */ >+ } >+ bh_unlock_sock(meta_sk); >+ sock_put(meta_sk); /* Taken by mptcp_vX_search_req */ >+ return 1; >+} >+ >+struct mp_join *mptcp_find_join(struct sk_buff *skb) >+{ >+ struct tcphdr *th = tcp_hdr(skb); >+ unsigned char *ptr; >+ int length = (th->doff * 4) - sizeof(struct tcphdr); >+ >+ /* Jump through the options to check whether JOIN is there */ >+ ptr = (unsigned char *)(th + 1); >+ while (length > 0) { >+ int opcode = *ptr++; >+ int opsize; >+ >+ switch (opcode) { >+ case TCPOPT_EOL: >+ return NULL; >+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ >+ length--; >+ continue; >+ default: >+ opsize = *ptr++; >+ if (opsize < 2) /* "silly options" */ >+ return NULL; >+ if (opsize > length) >+ return NULL; /* don't parse partial options */ >+ if (opcode == TCPOPT_MPTCP && >+ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) { >+ return (struct mp_join *)(ptr - 2); >+ } >+ ptr += opsize - 2; >+ length -= opsize; >+ } >+ } >+ return NULL; >+} >+ >+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw) >+{ >+ struct mptcp_cb *mpcb; >+ struct sock *meta_sk; >+ u32 token; >+ struct mp_join *join_opt = mptcp_find_join(skb); >+ if (!join_opt) >+ return 0; >+ >+ /* MPTCP structures were not initialized, so return error */ >+ if (mptcp_init_failed) >+ return -1; >+ >+ token = join_opt->u.syn.token; >+ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token); >+ if (!meta_sk) { >+ mptcp_debug("%s:mpcb not found:%x\n", __func__, token); >+ return -1; >+ } >+ >+ mpcb = tcp_sk(meta_sk)->mpcb; >+ if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) { >+ /* We are in fallback-mode on the reception-side - >+ * no new subflows! >+ */ >+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ >+ return -1; >+ } >+ >+ /* Coming from time-wait-sock processing in tcp_v4_rcv. >+ * We have to deschedule it before continuing, because otherwise >+ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req. 
>+ */ >+ if (tw) { >+ inet_twsk_deschedule(tw, &tcp_death_row); >+ inet_twsk_put(tw); >+ } >+ >+ TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN; >+ /* OK, this is a new syn/join, let's create a new open request and >+ * send syn+ack >+ */ >+ bh_lock_sock_nested(meta_sk); >+ if (sock_owned_by_user(meta_sk)) { >+ skb->sk = meta_sk; >+ if (unlikely(sk_add_backlog(meta_sk, skb, >+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { >+ bh_unlock_sock(meta_sk); >+ NET_INC_STATS_BH(sock_net(meta_sk), >+ LINUX_MIB_TCPBACKLOGDROP); >+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ >+ kfree_skb(skb); >+ return 1; >+ } >+ } else if (skb->protocol == htons(ETH_P_IP)) { >+ tcp_v4_do_rcv(meta_sk, skb); >+#if IS_ENABLED(CONFIG_IPV6) >+ } else { >+ tcp_v6_do_rcv(meta_sk, skb); >+#endif /* CONFIG_IPV6 */ >+ } >+ bh_unlock_sock(meta_sk); >+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ >+ return 1; >+} >+ >+int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt, >+ struct tcp_options_received *tmp_opt, struct net *net) >+{ >+ struct sock *meta_sk; >+ u32 token; >+ >+ token = mopt->mptcp_rem_token; >+ meta_sk = mptcp_hash_find(net, token); >+ if (!meta_sk) { >+ mptcp_debug("%s:mpcb not found:%x\n", __func__, token); >+ return -1; >+ } >+ >+ TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN; >+ >+ /* OK, this is a new syn/join, let's create a new open request and >+ * send syn+ack >+ */ >+ bh_lock_sock(meta_sk); >+ >+ /* This check is also done in mptcp_vX_do_rcv. But, there we cannot >+ * call tcp_vX_send_reset, because we hold already two socket-locks. >+ * (the listener and the meta from above) >+ * >+ * And the send-reset will try to take yet another one (ip_send_reply). >+ * Thus, we propagate the reset up to tcp_rcv_state_process. >+ */ >+ if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv || >+ tcp_sk(meta_sk)->mpcb->send_infinite_mapping || >+ meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) { >+ bh_unlock_sock(meta_sk); >+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ >+ return -1; >+ } >+ >+ if (sock_owned_by_user(meta_sk)) { >+ skb->sk = meta_sk; >+ if (unlikely(sk_add_backlog(meta_sk, skb, >+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) >+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); >+ else >+ /* Must make sure that upper layers won't free the >+ * skb if it is added to the backlog-queue. 
>+ */ >+ skb_get(skb); >+ } else { >+ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as >+ * the skb will finally be freed by tcp_v4_do_rcv (where we are >+ * coming from) >+ */ >+ skb_get(skb); >+ if (skb->protocol == htons(ETH_P_IP)) { >+ tcp_v4_do_rcv(meta_sk, skb); >+#if IS_ENABLED(CONFIG_IPV6) >+ } else { /* IPv6 */ >+ tcp_v6_do_rcv(meta_sk, skb); >+#endif /* CONFIG_IPV6 */ >+ } >+ } >+ >+ bh_unlock_sock(meta_sk); >+ sock_put(meta_sk); /* Taken by mptcp_hash_find */ >+ return 0; >+} >+ >+void mptcp_retry_subflow_worker(struct work_struct *work) >+{ >+ struct delayed_work *delayed_work = >+ container_of(work, struct delayed_work, work); >+ struct mptcp_cb *mpcb = >+ container_of(delayed_work, struct mptcp_cb, subflow_retry_work); >+ struct sock *meta_sk = mpcb->meta_sk; >+ int iter = 0, i; >+ >+next_subflow: >+ if (iter) { >+ release_sock(meta_sk); >+ mutex_unlock(&mpcb->mutex); >+ >+ yield(); >+ } >+ mutex_lock(&mpcb->mutex); >+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); >+ >+ iter++; >+ >+ if (sock_flag(meta_sk, SOCK_DEAD)) >+ goto exit; >+ >+ mptcp_for_each_bit_set(mpcb->rem4_bits, i) { >+ struct mptcp_rem4 *rem = &mpcb->remaddr4[i]; >+ /* Do we need to retry establishing a subflow ? */ >+ if (rem->retry_bitfield) { >+ int i = mptcp_find_free_index(~rem->retry_bitfield); >+ mptcp_init4_subsockets(meta_sk, &mpcb->locaddr4[i], rem); >+ rem->retry_bitfield &= ~(1 << mpcb->locaddr4[i].id); >+ goto next_subflow; >+ } >+ } >+ >+#if IS_ENABLED(CONFIG_IPV6) >+ mptcp_for_each_bit_set(mpcb->rem6_bits, i) { >+ struct mptcp_rem6 *rem = &mpcb->remaddr6[i]; >+ >+ /* Do we need to retry establishing a subflow ? */ >+ if (rem->retry_bitfield) { >+ int i = mptcp_find_free_index(~rem->retry_bitfield); >+ mptcp_init6_subsockets(meta_sk, &mpcb->locaddr6[i], rem); >+ rem->retry_bitfield &= ~(1 << mpcb->locaddr6[i].id); >+ goto next_subflow; >+ } >+ } >+#endif >+ >+exit: >+ release_sock(meta_sk); >+ mutex_unlock(&mpcb->mutex); >+ sock_put(meta_sk); >+} >+ >+/** >+ * Create all new subflows, by doing calls to mptcp_initX_subsockets >+ * >+ * This function uses a goto next_subflow, to allow releasing the lock between >+ * new subflows and giving other processes a chance to do some work on the >+ * socket and potentially finishing the communication. >+ **/ >+void mptcp_create_subflow_worker(struct work_struct *work) >+{ >+ struct mptcp_cb *mpcb = container_of(work, struct mptcp_cb, subflow_work); >+ struct sock *meta_sk = mpcb->meta_sk; >+ int iter = 0, retry = 0; >+ int i; >+ >+next_subflow: >+ if (iter) { >+ release_sock(meta_sk); >+ mutex_unlock(&mpcb->mutex); >+ >+ yield(); >+ } >+ mutex_lock(&mpcb->mutex); >+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); >+ >+ iter++; >+ >+ if (sock_flag(meta_sk, SOCK_DEAD)) >+ goto exit; >+ >+ if (sysctl_mptcp_ndiffports > iter && >+ sysctl_mptcp_ndiffports > mpcb->cnt_subflows) { >+ if (meta_sk->sk_family == AF_INET || >+ mptcp_v6_is_v4_mapped(meta_sk)) { >+ mptcp_init4_subsockets(meta_sk, &mpcb->locaddr4[0], >+ &mpcb->remaddr4[0]); >+ } else { >+#if IS_ENABLED(CONFIG_IPV6) >+ mptcp_init6_subsockets(meta_sk, &mpcb->locaddr6[0], >+ &mpcb->remaddr6[0]); >+#endif >+ } >+ goto next_subflow; >+ } >+ if (sysctl_mptcp_ndiffports > 1 && >+ sysctl_mptcp_ndiffports == mpcb->cnt_subflows) >+ goto exit; >+ >+ mptcp_for_each_bit_set(mpcb->rem4_bits, i) { >+ struct mptcp_rem4 *rem; >+ u8 remaining_bits; >+ >+ rem = &mpcb->remaddr4[i]; >+ remaining_bits = ~(rem->bitfield) & mpcb->loc4_bits; >+ >+ /* Are there still combinations to handle? 
*/ >+ if (remaining_bits) { >+ int i = mptcp_find_free_index(~remaining_bits); >+ /* If a route is not yet available then retry once */ >+ if (mptcp_init4_subsockets(meta_sk, &mpcb->locaddr4[i], >+ rem) == -ENETUNREACH) >+ retry = rem->retry_bitfield |= >+ (1 << mpcb->locaddr4[i].id); >+ goto next_subflow; >+ } >+ } >+ >+#if IS_ENABLED(CONFIG_IPV6) >+ mptcp_for_each_bit_set(mpcb->rem6_bits, i) { >+ struct mptcp_rem6 *rem; >+ u8 remaining_bits; >+ >+ rem = &mpcb->remaddr6[i]; >+ remaining_bits = ~(rem->bitfield) & mpcb->loc6_bits; >+ >+ /* Are there still combinations to handle? */ >+ if (remaining_bits) { >+ int i = mptcp_find_free_index(~remaining_bits); >+ /* If a route is not yet available then retry once */ >+ if (mptcp_init6_subsockets(meta_sk, &mpcb->locaddr6[i], >+ rem) == -ENETUNREACH) >+ retry = rem->retry_bitfield |= >+ (1 << mpcb->locaddr6[i].id); >+ goto next_subflow; >+ } >+ } >+#endif >+ >+ if (retry && !delayed_work_pending(&mpcb->subflow_retry_work)) { >+ sock_hold(meta_sk); >+ queue_delayed_work(mptcp_wq, &mpcb->subflow_retry_work, >+ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY)); >+ } >+ >+exit: >+ release_sock(meta_sk); >+ mutex_unlock(&mpcb->mutex); >+ sock_put(meta_sk); >+} >+ >+void mptcp_create_subflows(struct sock *meta_sk) >+{ >+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; >+ >+ if ((mpcb->master_sk && >+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) || >+ mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || >+ mpcb->send_infinite_mapping || >+ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) >+ return; >+ >+ if (!work_pending(&mpcb->subflow_work)) { >+ sock_hold(meta_sk); >+ queue_work(mptcp_wq, &mpcb->subflow_work); >+ } >+} >+ >+void mptcp_address_worker(struct work_struct *work) >+{ >+ struct mptcp_cb *mpcb = container_of(work, struct mptcp_cb, address_work); >+ struct sock *meta_sk = mpcb->meta_sk, *sk, *tmpsk; >+ struct net *netns = sock_net(meta_sk); >+ struct net_device *dev; >+ int i; >+ >+ mutex_lock(&mpcb->mutex); >+ lock_sock(meta_sk); >+ >+ if (sock_flag(meta_sk, SOCK_DEAD)) >+ goto exit; >+ >+ /* The following is meant to run with bh disabled */ >+ local_bh_disable(); >+ >+ /* First, we iterate over the interfaces to find addresses not yet >+ * in our local list. >+ */ >+ >+ rcu_read_lock(); >+ read_lock_bh(&dev_base_lock); >+ >+ for_each_netdev(netns, dev) { >+ struct in_device *in_dev = __in_dev_get_rcu(dev); >+ struct in_ifaddr *ifa; >+#if IS_ENABLED(CONFIG_IPV6) >+ struct inet6_dev *in6_dev = __in6_dev_get(dev); >+ struct inet6_ifaddr *ifa6; >+#endif >+ >+ if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH)) >+ continue; >+ >+ if (!in_dev) >+ goto cont_ipv6; >+ >+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { >+ unsigned long event; >+ >+ if (!netif_running(in_dev->dev)) { >+ event = NETDEV_DOWN; >+ } else { >+ /* If it's up, it may have been changed or came up. >+ * We set NETDEV_CHANGE, to take the good >+ * code-path in mptcp_pm_addr4_event_handler >+ */ >+ event = NETDEV_CHANGE; >+ } >+ >+ mptcp_pm_addr4_event_handler(ifa, event, mpcb); >+ } >+cont_ipv6: >+; /* This ; is necessary to fix build-errors when IPv6 is disabled */ >+#if IS_ENABLED(CONFIG_IPV6) >+ if (!in6_dev) >+ continue; >+ >+ read_lock(&in6_dev->lock); >+ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) { >+ unsigned long event; >+ >+ if (!netif_running(in_dev->dev)) { >+ event = NETDEV_DOWN; >+ } else { >+ /* If it's up, it may have been changed or came up. 
>+ * We set NETDEV_CHANGE, to take the good >+ * code-path in mptcp_pm_addr4_event_handler >+ */ >+ event = NETDEV_CHANGE; >+ } >+ >+ mptcp_pm_addr6_event_handler(ifa6, event, mpcb); >+ } >+ read_unlock(&in6_dev->lock); >+#endif >+ } >+ >+ /* Second, we iterate over our local addresses and check if they >+ * still exist in the interface-list. >+ */ >+ >+ /* MPCB-Local IPv4 Addresses */ >+ mptcp_for_each_bit_set(mpcb->loc4_bits, i) { >+ int j; >+ >+ for_each_netdev(netns, dev) { >+ struct in_device *in_dev = __in_dev_get_rcu(dev); >+ struct in_ifaddr *ifa; >+ >+ if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH) || >+ !in_dev) >+ continue; >+ >+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { >+ if (ifa->ifa_address == mpcb->locaddr4[i].addr.s_addr && >+ netif_running(dev)) >+ goto next_loc_addr; >+ } >+ } >+ >+ /* We did not find the address or the interface became NOMULTIPATH. >+ * We thus have to remove it. >+ */ >+ >+ /* Look for the socket and remove him */ >+ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { >+ if (sk->sk_family != AF_INET || >+ inet_sk(sk)->inet_saddr != mpcb->locaddr4[i].addr.s_addr) >+ continue; >+ >+ mptcp_reinject_data(sk, 0); >+ mptcp_sub_force_close(sk); >+ } >+ >+ /* Now, remove the address from the local ones */ >+ mpcb->loc4_bits &= ~(1 << i); >+ >+ mpcb->remove_addrs |= (1 << mpcb->locaddr4[i].id); >+ sk = mptcp_select_ack_sock(meta_sk, 0); >+ if (sk) >+ tcp_send_ack(sk); >+ >+ mptcp_for_each_bit_set(mpcb->rem4_bits, j) >+ mpcb->remaddr4[j].bitfield &= mpcb->loc4_bits; >+ >+next_loc_addr: >+ continue; /* necessary here due to the previous label */ >+ } >+ >+#if IS_ENABLED(CONFIG_IPV6) >+ /* MPCB-Local IPv6 Addresses */ >+ mptcp_for_each_bit_set(mpcb->loc6_bits, i) { >+ int j; >+ >+ for_each_netdev(netns, dev) { >+ struct inet6_dev *in6_dev = __in6_dev_get(dev); >+ struct inet6_ifaddr *ifa6; >+ >+ if (dev->flags & (IFF_LOOPBACK | IFF_NOMULTIPATH) || >+ !in6_dev) >+ continue; >+ >+ read_lock(&in6_dev->lock); >+ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) { >+ if (ipv6_addr_equal(&mpcb->locaddr6[i].addr, &ifa6->addr) && >+ netif_running(dev)) { >+ read_unlock(&in6_dev->lock); >+ goto next_loc6_addr; >+ } >+ } >+ read_unlock(&in6_dev->lock); >+ } >+ >+ /* We did not find the address or the interface became NOMULTIPATH. >+ * We thus have to remove it. 
>+ */ >+ >+ /* Look for the socket and remove him */ >+ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { >+ if (sk->sk_family != AF_INET6 || >+ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &mpcb->locaddr6[i].addr)) >+ continue; >+ >+ mptcp_reinject_data(sk, 0); >+ mptcp_sub_force_close(sk); >+ } >+ >+ /* Now, remove the address from the local ones */ >+ mpcb->loc6_bits &= ~(1 << i); >+ >+ /* Force sending directly the REMOVE_ADDR option */ >+ mpcb->remove_addrs |= (1 << mpcb->locaddr6[i].id); >+ sk = mptcp_select_ack_sock(meta_sk, 0); >+ if (sk) >+ tcp_send_ack(sk); >+ >+ mptcp_for_each_bit_set(mpcb->rem6_bits, j) >+ mpcb->remaddr6[j].bitfield &= mpcb->loc6_bits; >+ >+next_loc6_addr: >+ continue; /* necessary here due to the previous label */ >+ } >+#endif >+ >+ read_unlock_bh(&dev_base_lock); >+ rcu_read_unlock(); >+ >+ local_bh_enable(); >+exit: >+ release_sock(meta_sk); >+ mutex_unlock(&mpcb->mutex); >+ sock_put(meta_sk); >+} >+ >+static void mptcp_address_create_worker(struct mptcp_cb *mpcb) >+{ >+ if (!work_pending(&mpcb->address_work)) { >+ sock_hold(mpcb->meta_sk); >+ queue_work(mptcp_wq, &mpcb->address_work); >+ } >+} >+ >+/** >+ * React on IPv4+IPv6-addr add/rem-events >+ */ >+int mptcp_pm_addr_event_handler(unsigned long event, void *ptr, int family) >+{ >+ struct tcp_sock *meta_tp; >+ int i; >+ >+ if (!(event == NETDEV_UP || event == NETDEV_DOWN || >+ event == NETDEV_CHANGE)) >+ return NOTIFY_DONE; >+ >+ if (sysctl_mptcp_ndiffports > 1) >+ return NOTIFY_DONE; >+ >+ /* Now we iterate over the mpcb's */ >+ for (i = 0; i < MPTCP_HASH_SIZE; i++) { >+ struct hlist_nulls_node *node; >+ rcu_read_lock_bh(); >+ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i], >+ tk_table) { >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ struct sock *meta_sk = (struct sock *)meta_tp; >+ >+ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) >+ continue; >+ >+ if (!meta_tp->mpc || !is_meta_sk(meta_sk) || >+ mpcb->infinite_mapping_snd || >+ mpcb->infinite_mapping_rcv || >+ mpcb->send_infinite_mapping) { >+ sock_put(meta_sk); >+ continue; >+ } >+ >+ bh_lock_sock(meta_sk); >+ if (sock_owned_by_user(meta_sk)) { >+ mptcp_address_create_worker(mpcb); >+ } else { >+ if (family == AF_INET) >+ mptcp_pm_addr4_event_handler( >+ (struct in_ifaddr *)ptr, event, mpcb); >+#if IS_ENABLED(CONFIG_IPV6) >+ else >+ mptcp_pm_addr6_event_handler( >+ (struct inet6_ifaddr *)ptr, event, mpcb); >+#endif >+ } >+ >+ bh_unlock_sock(meta_sk); >+ sock_put(meta_sk); >+ } >+ rcu_read_unlock_bh(); >+ } >+ return NOTIFY_DONE; >+} >+ >+#ifdef CONFIG_PROC_FS >+ >+/* Output /proc/net/mptcp */ >+static int mptcp_pm_seq_show(struct seq_file *seq, void *v) >+{ >+ struct tcp_sock *meta_tp; >+ struct net *net = seq->private; >+ int i, n = 0; >+ >+ seq_printf(seq, " sl loc_tok rem_tok v6 " >+ "local_address " >+ "remote_address " >+ "st ns tx_queue rx_queue inode"); >+ seq_putc(seq, '\n'); >+ >+ for (i = 0; i < MPTCP_HASH_SIZE; i++) { >+ struct hlist_nulls_node *node; >+ rcu_read_lock_bh(); >+ hlist_nulls_for_each_entry_rcu(meta_tp, node, >+ &tk_hashtable[i], tk_table) { >+ struct mptcp_cb *mpcb = meta_tp->mpcb; >+ struct sock *meta_sk = (struct sock *)meta_tp; >+ struct inet_sock *isk = inet_sk(meta_sk); >+ >+ if (!meta_tp->mpc || !net_eq(net, sock_net(meta_sk))) >+ continue; >+ >+ seq_printf(seq, "%4d: %04X %04X ", n++, >+ mpcb->mptcp_loc_token, >+ mpcb->mptcp_rem_token); >+ if (meta_sk->sk_family == AF_INET || >+ mptcp_v6_is_v4_mapped(meta_sk)) { >+ seq_printf(seq, " 0 %08X:%04X %08X:%04X ", >+ isk->inet_saddr, >+ ntohs(isk->inet_sport), >+ 
isk->inet_daddr, >+ ntohs(isk->inet_dport)); >+#if IS_ENABLED(CONFIG_IPV6) >+ } else if (meta_sk->sk_family == AF_INET6) { >+ struct in6_addr *src = &isk->pinet6->saddr; >+ struct in6_addr *dst = &isk->pinet6->daddr; >+ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X", >+ src->s6_addr32[0], src->s6_addr32[1], >+ src->s6_addr32[2], src->s6_addr32[3], >+ ntohs(isk->inet_sport), >+ dst->s6_addr32[0], dst->s6_addr32[1], >+ dst->s6_addr32[2], dst->s6_addr32[3], >+ ntohs(isk->inet_dport)); >+#endif >+ } >+ seq_printf(seq, " %02X %02X %08X:%08X %lu", >+ meta_sk->sk_state, mpcb->cnt_subflows, >+ meta_tp->write_seq - meta_tp->snd_una, >+ max_t(int, meta_tp->rcv_nxt - >+ meta_tp->copied_seq, 0), >+ sock_i_ino(meta_sk)); >+ seq_putc(seq, '\n'); >+ } >+ rcu_read_unlock_bh(); >+ } >+ >+ return 0; >+} >+ >+static int mptcp_pm_seq_open(struct inode *inode, struct file *file) >+{ >+ return single_open_net(inode, file, mptcp_pm_seq_show); >+} >+ >+static const struct file_operations mptcp_pm_seq_fops = { >+ .owner = THIS_MODULE, >+ .open = mptcp_pm_seq_open, >+ .read = seq_read, >+ .llseek = seq_lseek, >+ .release = single_release_net, >+}; >+ >+static int mptcp_pm_proc_init_net(struct net *net) >+{ >+ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops)) >+ return -ENOMEM; >+ >+ return 0; >+} >+ >+static void mptcp_pm_proc_exit_net(struct net *net) >+{ >+ remove_proc_entry("mptcp", net->proc_net); >+} >+ >+static struct pernet_operations mptcp_pm_proc_ops = { >+ .init = mptcp_pm_proc_init_net, >+ .exit = mptcp_pm_proc_exit_net, >+}; >+#endif >+ >+/* General initialization of MPTCP_PM */ >+int mptcp_pm_init(void) >+{ >+ int i, ret; >+ for (i = 0; i < MPTCP_HASH_SIZE; i++) { >+ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i); >+ INIT_LIST_HEAD(&mptcp_reqsk_htb[i]); >+ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i); >+ } >+ >+ spin_lock_init(&mptcp_reqsk_hlock); >+ spin_lock_init(&mptcp_tk_hashlock); >+ >+#ifdef CONFIG_PROC_FS >+ ret = register_pernet_subsys(&mptcp_pm_proc_ops); >+ if (ret) >+ goto out; >+#endif >+ >+#if IS_ENABLED(CONFIG_IPV6) >+ ret = mptcp_pm_v6_init(); >+ if (ret) >+ goto mptcp_pm_v6_failed; >+#endif >+ ret = mptcp_pm_v4_init(); >+ if (ret) >+ goto mptcp_pm_v4_failed; >+ >+out: >+ return ret; >+ >+mptcp_pm_v4_failed: >+#if IS_ENABLED(CONFIG_IPV6) >+ mptcp_pm_v6_undo(); >+ >+mptcp_pm_v6_failed: >+#endif >+#ifdef CONFIG_PROC_FS >+ unregister_pernet_subsys(&mptcp_pm_proc_ops); >+#endif >+ goto out; >+} >+ >+void mptcp_pm_undo(void) >+{ >+#if IS_ENABLED(CONFIG_IPV6) >+ mptcp_pm_v6_undo(); >+#endif >+ mptcp_pm_v4_undo(); >+#ifdef CONFIG_PROC_FS >+ unregister_pernet_subsys(&mptcp_pm_proc_ops); >+#endif >+}
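
A few illustrative sketches of mechanisms in the patch above. First, the token generation in mptcp_connect_init() and mptcp_reqsk_new_mptcp(): a fresh local key is derived repeatedly until its 32-bit token is unused in both the request-socket and established-socket hashtables (bucket = token % MPTCP_HASH_SIZE), all under mptcp_tk_hashlock. The userspace sketch below mirrors only the loop shape; mix64to32() is a made-up stand-in for the kernel's mptcp_key_sha1(), the one-token-per-bucket table and MPTCP_HASH_SIZE value are likewise illustrative.

	/* Toy userspace model of the collision-avoiding token loop. */
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define MPTCP_HASH_SIZE 1024	/* assumed size, for illustration */

	/* One token per bucket; 0 doubles as the empty-slot marker here. */
	static uint32_t tk_bucket[MPTCP_HASH_SIZE];

	static uint32_t mix64to32(uint64_t key)	/* stand-in for mptcp_key_sha1() */
	{
		key ^= key >> 33;
		key *= 0xff51afd7ed558ccdULL;
		key ^= key >> 33;
		return (uint32_t)key;
	}

	static int token_in_use(uint32_t token)
	{
		return tk_bucket[token % MPTCP_HASH_SIZE] == token;
	}

	int main(void)
	{
		uint64_t loc_key;
		uint32_t loc_token;

		do {	/* same shape as the do/while in mptcp_reqsk_new_mptcp() */
			loc_key = ((uint64_t)rand() << 32) | (uint32_t)rand();
			loc_token = mix64to32(loc_key);
		} while (token_in_use(loc_token));

		tk_bucket[loc_token % MPTCP_HASH_SIZE] = loc_token;
		printf("key=%016llx token=%08x\n",
		       (unsigned long long)loc_key, loc_token);
		return 0;
	}

Retrying the whole key derivation (rather than probing for a free bucket) keeps key and token consistent: the token must always be the hash of the advertised key.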
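Second, the option walk in mptcp_find_join() is ordinary TCP option parsing: skip EOL/NOP, reject "silly" (length < 2) and truncated options, and stop at the first option of the wanted kind. A self-contained sketch, with an illustrative sample buffer (TCPOPT_MPTCP is 30 in the MPTCP implementation; the sub-type check on the option body is elided here):

	#include <stddef.h>
	#include <stdio.h>

	#define TCPOPT_EOL	0
	#define TCPOPT_NOP	1
	#define TCPOPT_MPTCP	30

	static const unsigned char *find_opt(const unsigned char *ptr,
					     int length, int want)
	{
		while (length > 0) {
			int opcode = *ptr++;
			int opsize;

			switch (opcode) {
			case TCPOPT_EOL:
				return NULL;
			case TCPOPT_NOP:	/* one-byte padding */
				length--;
				continue;
			default:
				opsize = *ptr++;
				if (opsize < 2)		/* "silly options" */
					return NULL;
				if (opsize > length)	/* partial option */
					return NULL;
				if (opcode == want)
					return ptr - 2;	/* option start */
				ptr += opsize - 2;
				length -= opsize;
			}
		}
		return NULL;
	}

	int main(void)
	{
		/* NOP, NOP, then a 12-byte MPTCP option (body elided) */
		unsigned char opts[16] = { 1, 1, 30, 12 };
		const unsigned char *mp = find_opt(opts, sizeof(opts),
						   TCPOPT_MPTCP);

		printf("MPTCP option %sfound\n", mp ? "" : "not ");
		return 0;
	}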
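Third, the send-buffer check in mptcp_check_snd_buf() scales a subflow's cwnd to the slowest path: it sizes the queue as snd_cwnd * rtt_max / srtt segments, in 16.16 fixed point so the integer division does not lose precision, with a floor of reordering + 1. A sketch with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t snd_cwnd = 10;		/* segments on this subflow */
		uint32_t srtt = 25;		/* this subflow's smoothed RTT */
		uint32_t rtt_max = 100;		/* slowest subflow's RTT */
		uint32_t reordering = 3;

		uint64_t bw_est = (((uint64_t)snd_cwnd * rtt_max) << 16) / srtt;
		uint32_t segs = (uint32_t)(bw_est >> 16);

		if (segs < reordering + 1)
			segs = reordering + 1;
		printf("buffer for %u segments\n", segs);	/* 40 here */
		return 0;
	}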
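Finally, the timer policy at the end of mptcp_retransmit_timer(): thin ESTABLISHED streams get linear timeouts for up to TCP_THIN_LINEAR_RETRIES attempts (icsk_backoff held at 0, RTO recomputed by mptcp_set_rto(), elided below), after which the RTO doubles and clamps at TCP_RTO_MAX. A toy simulation of the resulting RTO sequence, assuming HZ = 1000 and a 200 ms starting RTO:

	#include <stdio.h>

	#define HZ			1000
	#define TCP_RTO_MAX		(120 * HZ)
	#define TCP_THIN_LINEAR_RETRIES	6

	int main(void)
	{
		unsigned int rto = HZ / 5;	/* assumed initial RTO */
		int thin = 1;			/* pretend tcp_stream_is_thin() */
		int retransmits;

		for (retransmits = 1; retransmits <= 10; retransmits++) {
			if (!(thin && retransmits <= TCP_THIN_LINEAR_RETRIES))
				rto = (rto * 2 > TCP_RTO_MAX) ?
					TCP_RTO_MAX : rto * 2;
			printf("retransmit %2d: rto=%u ms\n", retransmits, rto);
		}
		return 0;
	}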